• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * HTMLparser.c : an HTML 4.0 non-verifying parser
3  *
4  * See Copyright for the status of this software.
5  *
6  * daniel@veillard.com
7  */
8 
9 #define IN_LIBXML
10 #include "libxml.h"
11 #ifdef LIBXML_HTML_ENABLED
12 
13 #include <string.h>
14 #ifdef HAVE_CTYPE_H
15 #include <ctype.h>
16 #endif
17 #ifdef HAVE_STDLIB_H
18 #include <stdlib.h>
19 #endif
20 #ifdef HAVE_SYS_STAT_H
21 #include <sys/stat.h>
22 #endif
23 #ifdef HAVE_FCNTL_H
24 #include <fcntl.h>
25 #endif
26 #ifdef HAVE_UNISTD_H
27 #include <unistd.h>
28 #endif
29 #ifdef LIBXML_ZLIB_ENABLED
30 #include <zlib.h>
31 #endif
32 
33 #include <libxml/xmlmemory.h>
34 #include <libxml/tree.h>
35 #include <libxml/parser.h>
36 #include <libxml/parserInternals.h>
37 #include <libxml/xmlerror.h>
38 #include <libxml/HTMLparser.h>
39 #include <libxml/HTMLtree.h>
40 #include <libxml/entities.h>
41 #include <libxml/encoding.h>
42 #include <libxml/valid.h>
43 #include <libxml/xmlIO.h>
44 #include <libxml/globals.h>
45 #include <libxml/uri.h>
46 
47 #include "buf.h"
48 #include "enc.h"
49 
50 #define HTML_MAX_NAMELEN 1000
51 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
52 #define HTML_PARSER_BUFFER_SIZE 100
53 
54 /* #define DEBUG */
55 /* #define DEBUG_PUSH */
56 
57 static int htmlOmittedDefaultValue = 1;
58 
59 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
60 			     xmlChar end, xmlChar  end2, xmlChar end3);
61 static void htmlParseComment(htmlParserCtxtPtr ctxt);
62 
63 /************************************************************************
64  *									*
65  *		Some factorized error routines				*
66  *									*
67  ************************************************************************/
68 
69 /**
70  * htmlErrMemory:
71  * @ctxt:  an HTML parser context
72  * @extra:  extra information
73  *
74  * Handle a redefinition of attribute error
75  */
76 static void
htmlErrMemory(xmlParserCtxtPtr ctxt,const char * extra)77 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
78 {
79     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
80         (ctxt->instate == XML_PARSER_EOF))
81 	return;
82     if (ctxt != NULL) {
83         ctxt->errNo = XML_ERR_NO_MEMORY;
84         ctxt->instate = XML_PARSER_EOF;
85         ctxt->disableSAX = 1;
86     }
87     if (extra)
88         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
89                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
90                         NULL, NULL, 0, 0,
91                         "Memory allocation failed : %s\n", extra);
92     else
93         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
94                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
95                         NULL, NULL, 0, 0, "Memory allocation failed\n");
96 }
97 
98 /**
99  * htmlParseErr:
100  * @ctxt:  an HTML parser context
101  * @error:  the error number
102  * @msg:  the error message
103  * @str1:  string infor
104  * @str2:  string infor
105  *
106  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
107  */
108 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErr(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,const xmlChar * str1,const xmlChar * str2)109 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
110              const char *msg, const xmlChar *str1, const xmlChar *str2)
111 {
112     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
113         (ctxt->instate == XML_PARSER_EOF))
114 	return;
115     if (ctxt != NULL)
116 	ctxt->errNo = error;
117     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
118                     XML_ERR_ERROR, NULL, 0,
119 		    (const char *) str1, (const char *) str2,
120 		    NULL, 0, 0,
121 		    msg, str1, str2);
122     if (ctxt != NULL)
123 	ctxt->wellFormed = 0;
124 }
125 
126 /**
127  * htmlParseErrInt:
128  * @ctxt:  an HTML parser context
129  * @error:  the error number
130  * @msg:  the error message
131  * @val:  integer info
132  *
133  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
134  */
135 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErrInt(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,int val)136 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
137              const char *msg, int val)
138 {
139     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
140         (ctxt->instate == XML_PARSER_EOF))
141 	return;
142     if (ctxt != NULL)
143 	ctxt->errNo = error;
144     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
145                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
146 		    NULL, val, 0, msg, val);
147     if (ctxt != NULL)
148 	ctxt->wellFormed = 0;
149 }
150 
151 /************************************************************************
152  *									*
153  *	Parser stacks related functions and macros		*
154  *									*
155  ************************************************************************/
156 
157 /**
158  * htmlnamePush:
159  * @ctxt:  an HTML parser context
160  * @value:  the element name
161  *
162  * Pushes a new element name on top of the name stack
163  *
164  * Returns 0 in case of error, the index in the stack otherwise
165  */
166 static int
htmlnamePush(htmlParserCtxtPtr ctxt,const xmlChar * value)167 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
168 {
169     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
170         ctxt->html = 3;
171     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
172         ctxt->html = 10;
173     if (ctxt->nameNr >= ctxt->nameMax) {
174         ctxt->nameMax *= 2;
175         ctxt->nameTab = (const xmlChar * *)
176                          xmlRealloc((xmlChar * *)ctxt->nameTab,
177                                     ctxt->nameMax *
178                                     sizeof(ctxt->nameTab[0]));
179         if (ctxt->nameTab == NULL) {
180             htmlErrMemory(ctxt, NULL);
181             return (0);
182         }
183     }
184     ctxt->nameTab[ctxt->nameNr] = value;
185     ctxt->name = value;
186     return (ctxt->nameNr++);
187 }
188 /**
189  * htmlnamePop:
190  * @ctxt: an HTML parser context
191  *
192  * Pops the top element name from the name stack
193  *
194  * Returns the name just removed
195  */
196 static const xmlChar *
htmlnamePop(htmlParserCtxtPtr ctxt)197 htmlnamePop(htmlParserCtxtPtr ctxt)
198 {
199     const xmlChar *ret;
200 
201     if (ctxt->nameNr <= 0)
202         return (NULL);
203     ctxt->nameNr--;
204     if (ctxt->nameNr < 0)
205         return (NULL);
206     if (ctxt->nameNr > 0)
207         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
208     else
209         ctxt->name = NULL;
210     ret = ctxt->nameTab[ctxt->nameNr];
211     ctxt->nameTab[ctxt->nameNr] = NULL;
212     return (ret);
213 }
214 
215 /**
216  * htmlNodeInfoPush:
217  * @ctxt:  an HTML parser context
218  * @value:  the node info
219  *
220  * Pushes a new element name on top of the node info stack
221  *
222  * Returns 0 in case of error, the index in the stack otherwise
223  */
224 static int
htmlNodeInfoPush(htmlParserCtxtPtr ctxt,htmlParserNodeInfo * value)225 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
226 {
227     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
228         if (ctxt->nodeInfoMax == 0)
229                 ctxt->nodeInfoMax = 5;
230         ctxt->nodeInfoMax *= 2;
231         ctxt->nodeInfoTab = (htmlParserNodeInfo *)
232                          xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
233                                     ctxt->nodeInfoMax *
234                                     sizeof(ctxt->nodeInfoTab[0]));
235         if (ctxt->nodeInfoTab == NULL) {
236             htmlErrMemory(ctxt, NULL);
237             return (0);
238         }
239     }
240     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
241     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
242     return (ctxt->nodeInfoNr++);
243 }
244 
245 /**
246  * htmlNodeInfoPop:
247  * @ctxt:  an HTML parser context
248  *
249  * Pops the top element name from the node info stack
250  *
251  * Returns 0 in case of error, the pointer to NodeInfo otherwise
252  */
253 static htmlParserNodeInfo *
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)254 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
255 {
256     if (ctxt->nodeInfoNr <= 0)
257         return (NULL);
258     ctxt->nodeInfoNr--;
259     if (ctxt->nodeInfoNr < 0)
260         return (NULL);
261     if (ctxt->nodeInfoNr > 0)
262         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
263     else
264         ctxt->nodeInfo = NULL;
265     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
266 }
267 
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. They all expect a variable named ctxt to be in scope.
 *
 * Byte oriented macros, only meaningful when comparing against ASCII:
 *
 *   CUR_PTR  the current input pointer (ctxt->input->cur).
 *   BASE_PTR the start of the input buffer (ctxt->input->base).
 *   CUR      the byte at the current position, as an int.
 *   CURRENT  same value as CUR.
 *   RAW      -1 while ctxt->token is set, the current byte otherwise.
 *   UPPER    the current byte converted with toupper().
 *   NXT(n)   the byte n positions after the current one.
 *   UPP(n)   NXT(n) converted with toupper().
 *   SKIP(n)  advance the input pointer and the column counter by n; must
 *            only be used to skip strings which contain no newline.
 *
 * Character oriented macros:
 *
 *   NEXT       advance by one character using xmlNextChar().
 *   NEXTL(l)   advance by l bytes, updating the line/column counters from
 *              the byte being left and clearing ctxt->token.
 *   CUR_CHAR(l)      htmlCurrentChar() on the context, length stored in l.
 *   CUR_SCHAR(s, l)  xmlStringCurrentChar() on string s, length stored in l.
 *   COPY_BUF(l,b,i,v) append character v, which is l bytes long, to buffer
 *              b at index i and advance i.
 *
 * Buffer management:
 *
 *   SHRINK     call xmlParserInputShrink() when more than 2 * INPUT_CHUNK
 *              bytes were consumed and less than 2 * INPUT_CHUNK remain.
 *   GROW       call xmlParserInputGrow() when not in progressive mode and
 *              less than INPUT_CHUNK bytes remain.
 *   SKIP_BLANKS  htmlSkipBlankChars() on the context.
 *
 * SHRINK and GROW are conditional statements. They are written in the
 * "if (!cond) { } else action" form so that an "else" following them at a
 * call site cannot be captured by the macro's own "if".
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->input->cur += (val), ctxt->input->col += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

#define SHRINK								\
    if (!((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&	\
	  (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))) {	\
    } else								\
	xmlParserInputShrink(ctxt->input)

#define GROW								\
    if (!((ctxt->progressive == 0) &&					\
	  (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))) {	\
    } else								\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l);				\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v)						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l),&(b)[(i)],(v))
348 
349 /**
350  * htmlFindEncoding:
351  * @the HTML parser context
352  *
353  * Ty to find and encoding in the current data available in the input
354  * buffer this is needed to try to switch to the proper encoding when
355  * one face a character error.
356  * That's an heuristic, since it's operating outside of parsing it could
357  * try to use a meta which had been commented out, that's the reason it
358  * should only be used in case of error, not as a default.
359  *
360  * Returns an encoding string or NULL if not found, the string need to
361  *   be freed
362  */
363 static xmlChar *
htmlFindEncoding(xmlParserCtxtPtr ctxt)364 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
365     const xmlChar *start, *cur, *end;
366 
367     if ((ctxt == NULL) || (ctxt->input == NULL) ||
368         (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
369         (ctxt->input->buf->encoder != NULL))
370         return(NULL);
371     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
372         return(NULL);
373 
374     start = ctxt->input->cur;
375     end = ctxt->input->end;
376     /* we also expect the input buffer to be zero terminated */
377     if (*end != 0)
378         return(NULL);
379 
380     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
381     if (cur == NULL)
382         return(NULL);
383     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
384     if (cur == NULL)
385         return(NULL);
386     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
387     if (cur == NULL)
388         return(NULL);
389     cur += 8;
390     start = cur;
391     while (((*cur >= 'A') && (*cur <= 'Z')) ||
392            ((*cur >= 'a') && (*cur <= 'z')) ||
393            ((*cur >= '0') && (*cur <= '9')) ||
394            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
395            cur++;
396     if (cur == start)
397         return(NULL);
398     return(xmlStrndup(start, cur - start));
399 }
400 
401 /**
402  * htmlCurrentChar:
403  * @ctxt:  the HTML parser context
404  * @len:  pointer to the length of the char read
405  *
406  * The current char value, if using UTF-8 this may actually span multiple
407  * bytes in the input buffer. Implement the end of line normalization:
408  * 2.11 End-of-Line Handling
409  * If the encoding is unspecified, in the case we find an ISO-Latin-1
410  * char, then the encoding converter is plugged in automatically.
411  *
412  * Returns the current char value and its length
413  */
414 
415 static int
htmlCurrentChar(xmlParserCtxtPtr ctxt,int * len)416 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
417     const unsigned char *cur;
418     unsigned char c;
419     unsigned int val;
420 
421     if (ctxt->instate == XML_PARSER_EOF)
422 	return(0);
423 
424     if (ctxt->token != 0) {
425 	*len = 0;
426 	return(ctxt->token);
427     }
428     if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
429         xmlChar * guess;
430         xmlCharEncodingHandlerPtr handler;
431 
432         /*
433          * Assume it's a fixed length encoding (1) with
434          * a compatible encoding for the ASCII set, since
435          * HTML constructs only use < 128 chars
436          */
437         if ((int) *ctxt->input->cur < 0x80) {
438             *len = 1;
439             if ((*ctxt->input->cur == 0) &&
440                 (ctxt->input->cur < ctxt->input->end)) {
441                 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
442                                 "Char 0x%X out of allowed range\n", 0);
443                 return(' ');
444             }
445             return((int) *ctxt->input->cur);
446         }
447 
448         /*
449          * Humm this is bad, do an automatic flow conversion
450          */
451         guess = htmlFindEncoding(ctxt);
452         if (guess == NULL) {
453             xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
454         } else {
455             if (ctxt->input->encoding != NULL)
456                 xmlFree((xmlChar *) ctxt->input->encoding);
457             ctxt->input->encoding = guess;
458             handler = xmlFindCharEncodingHandler((const char *) guess);
459             if (handler != NULL) {
460                 /*
461                  * Don't use UTF-8 encoder which isn't required and
462                  * can produce invalid UTF-8.
463                  */
464                 if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
465                     xmlSwitchToEncoding(ctxt, handler);
466             } else {
467                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
468                              "Unsupported encoding %s", guess, NULL);
469             }
470         }
471         ctxt->charset = XML_CHAR_ENCODING_UTF8;
472     }
473 
474     /*
475      * We are supposed to handle UTF8, check it's valid
476      * From rfc2044: encoding of the Unicode values on UTF-8:
477      *
478      * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
479      * 0000 0000-0000 007F   0xxxxxxx
480      * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
481      * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
482      *
483      * Check for the 0x110000 limit too
484      */
485     cur = ctxt->input->cur;
486     c = *cur;
487     if (c & 0x80) {
488         if ((c & 0x40) == 0)
489             goto encoding_error;
490         if (cur[1] == 0) {
491             xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
492             cur = ctxt->input->cur;
493         }
494         if ((cur[1] & 0xc0) != 0x80)
495             goto encoding_error;
496         if ((c & 0xe0) == 0xe0) {
497 
498             if (cur[2] == 0) {
499                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
500                 cur = ctxt->input->cur;
501             }
502             if ((cur[2] & 0xc0) != 0x80)
503                 goto encoding_error;
504             if ((c & 0xf0) == 0xf0) {
505                 if (cur[3] == 0) {
506                     xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
507                     cur = ctxt->input->cur;
508                 }
509                 if (((c & 0xf8) != 0xf0) ||
510                     ((cur[3] & 0xc0) != 0x80))
511                     goto encoding_error;
512                 /* 4-byte code */
513                 *len = 4;
514                 val = (cur[0] & 0x7) << 18;
515                 val |= (cur[1] & 0x3f) << 12;
516                 val |= (cur[2] & 0x3f) << 6;
517                 val |= cur[3] & 0x3f;
518                 if (val < 0x10000)
519                     goto encoding_error;
520             } else {
521               /* 3-byte code */
522                 *len = 3;
523                 val = (cur[0] & 0xf) << 12;
524                 val |= (cur[1] & 0x3f) << 6;
525                 val |= cur[2] & 0x3f;
526                 if (val < 0x800)
527                     goto encoding_error;
528             }
529         } else {
530           /* 2-byte code */
531             *len = 2;
532             val = (cur[0] & 0x1f) << 6;
533             val |= cur[1] & 0x3f;
534             if (val < 0x80)
535                 goto encoding_error;
536         }
537         if (!IS_CHAR(val)) {
538             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
539                             "Char 0x%X out of allowed range\n", val);
540         }
541         return(val);
542     } else {
543         if ((*ctxt->input->cur == 0) &&
544             (ctxt->input->cur < ctxt->input->end)) {
545             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
546                             "Char 0x%X out of allowed range\n", 0);
547             *len = 1;
548             return(' ');
549         }
550         /* 1-byte code */
551         *len = 1;
552         return((int) *ctxt->input->cur);
553     }
554 
555 encoding_error:
556     /*
557      * If we detect an UTF8 error that probably mean that the
558      * input encoding didn't get properly advertised in the
559      * declaration header. Report the error and switch the encoding
560      * to ISO-Latin-1 (if you don't like this policy, just declare the
561      * encoding !)
562      */
563     {
564         char buffer[150];
565 
566 	if (ctxt->input->end - ctxt->input->cur >= 4) {
567 	    snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
568 			    ctxt->input->cur[0], ctxt->input->cur[1],
569 			    ctxt->input->cur[2], ctxt->input->cur[3]);
570 	} else {
571 	    snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
572 	}
573 	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
574 		     "Input is not proper UTF-8, indicate encoding !\n",
575 		     BAD_CAST buffer, NULL);
576     }
577 
578     /*
579      * Don't switch encodings twice. Note that if there's an encoder, we
580      * shouldn't receive invalid UTF-8 anyway.
581      *
582      * Note that if ctxt->input->buf == NULL, switching encodings is
583      * impossible, see Gitlab issue #34.
584      */
585     if ((ctxt->input->buf != NULL) &&
586         (ctxt->input->buf->encoder == NULL))
587         xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
588     *len = 1;
589     return((int) *ctxt->input->cur);
590 }
591 
592 /**
593  * htmlSkipBlankChars:
594  * @ctxt:  the HTML parser context
595  *
596  * skip all blanks character found at that point in the input streams.
597  *
598  * Returns the number of space chars skipped
599  */
600 
601 static int
htmlSkipBlankChars(xmlParserCtxtPtr ctxt)602 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
603     int res = 0;
604 
605     while (IS_BLANK_CH(*(ctxt->input->cur))) {
606 	if ((*ctxt->input->cur == 0) &&
607 	    (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
608 		xmlPopInput(ctxt);
609 	} else {
610 	    if (*(ctxt->input->cur) == '\n') {
611 		ctxt->input->line++; ctxt->input->col = 1;
612 	    } else ctxt->input->col++;
613 	    ctxt->input->cur++;
614 	    if (*ctxt->input->cur == 0)
615 		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
616 	}
617 	res++;
618     }
619     return(res);
620 }
621 
622 
623 
624 /************************************************************************
625  *									*
626  *	The list of HTML elements and their properties		*
627  *									*
628  ************************************************************************/
629 
630 /*
631  *  Start Tag: 1 means the start tag can be omitted
632  *  End Tag:   1 means the end tag can be omitted
633  *             2 means it's forbidden (empty elements)
634  *             3 means the tag is stylistic and should be closed easily
635  *  Depr:      this element is deprecated
636  *  DTD:       1 means that this element is valid only in the Loose DTD
637  *             2 means that this element is valid only in the Frameset DTD
638  *
639  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
640 	, subElements , impliedsubelt , Attributes, userdata
641  */
642 
643 /* Definitions and a couple of vars for HTML Elements */
644 
/*
 * Element groups. Each group macro expands to a comma separated list of
 * element names, the matching NB_ macro is the number of names in it.
 * Composite counts are parenthesized so they can be used safely inside
 * larger expressions. PCDATA and MODIFIER expand to nothing.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL terminated lists of the flow and inline element names */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
676 
677 
678 /* ... and for HTML Attributes */
679 
/*
 * Attribute groups. Each group macro expands to a comma separated list of
 * attribute names, the matching NB_ macro is the number of names in it.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL terminated attribute name lists built from the groups above */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
697 
698 
699 /* Other declarations that should go inline ... */
700 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
701 	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
702 	"tabindex", "onfocus", "onblur", NULL } ;
703 static const char* const target_attr[] = { "target", NULL } ;
704 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
705 static const char* const alt_attr[] = { "alt", NULL } ;
706 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
707 static const char* const href_attrs[] = { "href", NULL } ;
708 static const char* const clear_attrs[] = { "clear", NULL } ;
709 static const char* const inline_p[] = { INLINE, "p", NULL } ;
710 
711 static const char* const flow_param[] = { FLOW, "param", NULL } ;
712 static const char* const applet_attrs[] = { COREATTRS , "codebase",
713 		"archive", "alt", "name", "height", "width", "align",
714 		"hspace", "vspace", NULL } ;
715 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
716 	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
717 static const char* const basefont_attrs[] =
718 	{ "id", "size", "color", "face", NULL } ;
719 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
720 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
721 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
722 static const char* const body_depr[] = { "background", "bgcolor", "text",
723 	"link", "vlink", "alink", NULL } ;
724 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
725 	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
726 
727 
728 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
729 static const char* const col_elt[] = { "col", NULL } ;
730 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
731 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
732 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
733 static const char* const compact_attr[] = { "compact", NULL } ;
734 static const char* const label_attr[] = { "label", NULL } ;
735 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
736 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
737 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
738 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
739 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
740 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
741 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
742 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
743 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
744 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
745 static const char* const version_attr[] = { "version", NULL } ;
746 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
747 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
748 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
749 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
750 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
751 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
752 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
753 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
754 static const char* const align_attr[] = { "align", NULL } ;
755 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
756 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
757 static const char* const name_attr[] = { "name", NULL } ;
758 static const char* const action_attr[] = { "action", NULL } ;
759 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
760 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
761 static const char* const content_attr[] = { "content", NULL } ;
762 static const char* const type_attr[] = { "type", NULL } ;
763 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
764 static const char* const object_contents[] = { FLOW, "param", NULL } ;
765 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
766 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
767 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
768 static const char* const option_elt[] = { "option", NULL } ;
769 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
770 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
771 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
772 static const char* const width_attr[] = { "width", NULL } ;
773 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
774 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
775 static const char* const language_attr[] = { "language", NULL } ;
776 static const char* const select_content[] = { "optgroup", "option", NULL } ;
777 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
778 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
779 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
780 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
781 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
782 static const char* const tr_elt[] = { "tr", NULL } ;
783 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
784 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
785 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
786 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
787 static const char* const tr_contents[] = { "th", "td", NULL } ;
788 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
789 static const char* const li_elt[] = { "li", NULL } ;
790 static const char* const ul_depr[] = { "type", "compact", NULL} ;
791 static const char* const dir_attr[] = { "dir", NULL} ;
792 
793 #define DECL (const char**)
794 
795 static const htmlElemDesc
796 html40ElementTable[] = {
797 { "a",		0, 0, 0, 0, 0, 0, 1, "anchor ",
798 	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
799 },
800 { "abbr",	0, 0, 0, 0, 0, 0, 1, "abbreviated form",
801 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
802 },
803 { "acronym",	0, 0, 0, 0, 0, 0, 1, "",
804 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
805 },
806 { "address",	0, 0, 0, 0, 0, 0, 0, "information on author ",
807 	DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
808 },
809 { "applet",	0, 0, 0, 0, 1, 1, 2, "java applet ",
810 	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
811 },
812 { "area",	0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
813 	EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
814 },
815 { "b",		0, 3, 0, 0, 0, 0, 1, "bold text style",
816 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
817 },
818 { "base",	0, 2, 2, 1, 0, 0, 0, "document base uri ",
819 	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
820 },
821 { "basefont",	0, 2, 2, 1, 1, 1, 1, "base font size " ,
822 	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
823 },
824 { "bdo",	0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
825 	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
826 },
827 { "big",	0, 3, 0, 0, 0, 0, 1, "large text style",
828 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
829 },
830 { "blockquote",	0, 0, 0, 0, 0, 0, 0, "long quotation ",
831 	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
832 },
833 { "body",	1, 1, 0, 0, 0, 0, 0, "document body ",
834 	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
835 },
836 { "br",		0, 2, 2, 1, 0, 0, 1, "forced line break ",
837 	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
838 },
839 { "button",	0, 0, 0, 0, 0, 0, 2, "push button ",
840 	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
841 },
842 { "caption",	0, 0, 0, 0, 0, 0, 0, "table caption ",
843 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
844 },
845 { "center",	0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
846 	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
847 },
848 { "cite",	0, 0, 0, 0, 0, 0, 1, "citation",
849 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
850 },
851 { "code",	0, 0, 0, 0, 0, 0, 1, "computer code fragment",
852 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
853 },
854 { "col",	0, 2, 2, 1, 0, 0, 0, "table column ",
855 	EMPTY , NULL , DECL col_attrs , NULL, NULL
856 },
857 { "colgroup",	0, 1, 0, 0, 0, 0, 0, "table column group ",
858 	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
859 },
860 { "dd",		0, 1, 0, 0, 0, 0, 0, "definition description ",
861 	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
862 },
863 { "del",	0, 0, 0, 0, 0, 0, 2, "deleted text ",
864 	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
865 },
866 { "dfn",	0, 0, 0, 0, 0, 0, 1, "instance definition",
867 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
868 },
869 { "dir",	0, 0, 0, 0, 1, 1, 0, "directory list",
870 	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
871 },
872 { "div",	0, 0, 0, 0, 0, 0, 0, "generic language/style container",
873 	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
874 },
875 { "dl",		0, 0, 0, 0, 0, 0, 0, "definition list ",
876 	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
877 },
878 { "dt",		0, 1, 0, 0, 0, 0, 0, "definition term ",
879 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
880 },
881 { "em",		0, 3, 0, 0, 0, 0, 1, "emphasis",
882 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
883 },
884 { "embed",	0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
885 	EMPTY, NULL, DECL embed_attrs, NULL, NULL
886 },
887 { "fieldset",	0, 0, 0, 0, 0, 0, 0, "form control group ",
888 	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
889 },
890 { "font",	0, 3, 0, 0, 1, 1, 1, "local change to font ",
891 	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
892 },
893 { "form",	0, 0, 0, 0, 0, 0, 0, "interactive form ",
894 	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
895 },
896 { "frame",	0, 2, 2, 1, 0, 2, 0, "subwindow " ,
897 	EMPTY, NULL, NULL, DECL frame_attrs, NULL
898 },
899 { "frameset",	0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
900 	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
901 },
902 { "h1",		0, 0, 0, 0, 0, 0, 0, "heading ",
903 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
904 },
905 { "h2",		0, 0, 0, 0, 0, 0, 0, "heading ",
906 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
907 },
908 { "h3",		0, 0, 0, 0, 0, 0, 0, "heading ",
909 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
910 },
911 { "h4",		0, 0, 0, 0, 0, 0, 0, "heading ",
912 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
913 },
914 { "h5",		0, 0, 0, 0, 0, 0, 0, "heading ",
915 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
916 },
917 { "h6",		0, 0, 0, 0, 0, 0, 0, "heading ",
918 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
919 },
920 { "head",	1, 1, 0, 0, 0, 0, 0, "document head ",
921 	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
922 },
923 { "hr",		0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
924 	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
925 },
926 { "html",	1, 1, 0, 0, 0, 0, 0, "document root element ",
927 	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
928 },
929 { "i",		0, 3, 0, 0, 0, 0, 1, "italic text style",
930 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
931 },
932 { "iframe",	0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
933 	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
934 },
935 { "img",	0, 2, 2, 1, 0, 0, 1, "embedded image ",
936 	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
937 },
938 { "input",	0, 2, 2, 1, 0, 0, 1, "form control ",
939 	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
940 },
941 { "ins",	0, 0, 0, 0, 0, 0, 2, "inserted text",
942 	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
943 },
944 { "isindex",	0, 2, 2, 1, 1, 1, 0, "single line prompt ",
945 	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
946 },
947 { "kbd",	0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
948 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
949 },
950 { "label",	0, 0, 0, 0, 0, 0, 1, "form field label text ",
951 	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
952 },
953 { "legend",	0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
954 	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
955 },
956 { "li",		0, 1, 1, 0, 0, 0, 0, "list item ",
957 	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
958 },
959 { "link",	0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
960 	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
961 },
962 { "map",	0, 0, 0, 0, 0, 0, 2, "client-side image map ",
963 	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
964 },
965 { "menu",	0, 0, 0, 0, 1, 1, 0, "menu list ",
966 	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
967 },
968 { "meta",	0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
969 	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
970 },
971 { "noframes",	0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
972 	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
973 },
974 { "noscript",	0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
975 	DECL html_flow, "div", DECL html_attrs, NULL, NULL
976 },
977 { "object",	0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
978 	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
979 },
980 { "ol",		0, 0, 0, 0, 0, 0, 0, "ordered list ",
981 	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
982 },
983 { "optgroup",	0, 0, 0, 0, 0, 0, 0, "option group ",
984 	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
985 },
986 { "option",	0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
987 	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
988 },
989 { "p",		0, 1, 0, 0, 0, 0, 0, "paragraph ",
990 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
991 },
992 { "param",	0, 2, 2, 1, 0, 0, 0, "named property value ",
993 	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
994 },
995 { "pre",	0, 0, 0, 0, 0, 0, 0, "preformatted text ",
996 	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
997 },
998 { "q",		0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
999 	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
1000 },
1001 { "s",		0, 3, 0, 0, 1, 1, 1, "strike-through text style",
1002 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1003 },
1004 { "samp",	0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
1005 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1006 },
1007 { "script",	0, 0, 0, 0, 0, 0, 2, "script statements ",
1008 	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
1009 },
1010 { "select",	0, 0, 0, 0, 0, 0, 1, "option selector ",
1011 	DECL select_content, NULL, DECL select_attrs, NULL, NULL
1012 },
1013 { "small",	0, 3, 0, 0, 0, 0, 1, "small text style",
1014 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1015 },
1016 { "span",	0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
1017 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1018 },
1019 { "strike",	0, 3, 0, 0, 1, 1, 1, "strike-through text",
1020 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1021 },
1022 { "strong",	0, 3, 0, 0, 0, 0, 1, "strong emphasis",
1023 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1024 },
1025 { "style",	0, 0, 0, 0, 0, 0, 0, "style info ",
1026 	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1027 },
1028 { "sub",	0, 3, 0, 0, 0, 0, 1, "subscript",
1029 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1030 },
1031 { "sup",	0, 3, 0, 0, 0, 0, 1, "superscript ",
1032 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1033 },
1034 { "table",	0, 0, 0, 0, 0, 0, 0, "",
1035 	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1036 },
1037 { "tbody",	1, 0, 0, 0, 0, 0, 0, "table body ",
1038 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1039 },
1040 { "td",		0, 0, 0, 0, 0, 0, 0, "table data cell",
1041 	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1042 },
1043 { "textarea",	0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1044 	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1045 },
1046 { "tfoot",	0, 1, 0, 0, 0, 0, 0, "table footer ",
1047 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1048 },
1049 { "th",		0, 1, 0, 0, 0, 0, 0, "table header cell",
1050 	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1051 },
1052 { "thead",	0, 1, 0, 0, 0, 0, 0, "table header ",
1053 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1054 },
1055 { "title",	0, 0, 0, 0, 0, 0, 0, "document title ",
1056 	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1057 },
1058 { "tr",		0, 0, 0, 0, 0, 0, 0, "table row ",
1059 	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1060 },
1061 { "tt",		0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1062 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1063 },
1064 { "u",		0, 3, 0, 0, 1, 1, 1, "underlined text style",
1065 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1066 },
1067 { "ul",		0, 0, 0, 0, 0, 0, 0, "unordered list ",
1068 	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1069 },
1070 { "var",	0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1071 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1072 }
1073 };
1074 
/*
 * One auto-close rule: a pair of tag names (the element currently open and
 * the start tag being opened).
 */
typedef struct {
    const char *oldTag;     /* name of the element that is currently open */
    const char *newTag;     /* name of the start tag being opened */
} htmlStartCloseEntry;
1079 
1080 /*
1081  * start tags that imply the end of current element
1082  */
1083 static const htmlStartCloseEntry htmlStartClose[] = {
1084     { "a", "a" },
1085     { "a", "fieldset" },
1086     { "a", "table" },
1087     { "a", "td" },
1088     { "a", "th" },
1089     { "address", "dd" },
1090     { "address", "dl" },
1091     { "address", "dt" },
1092     { "address", "form" },
1093     { "address", "li" },
1094     { "address", "ul" },
1095     { "b", "center" },
1096     { "b", "p" },
1097     { "b", "td" },
1098     { "b", "th" },
1099     { "big", "p" },
1100     { "caption", "col" },
1101     { "caption", "colgroup" },
1102     { "caption", "tbody" },
1103     { "caption", "tfoot" },
1104     { "caption", "thead" },
1105     { "caption", "tr" },
1106     { "col", "col" },
1107     { "col", "colgroup" },
1108     { "col", "tbody" },
1109     { "col", "tfoot" },
1110     { "col", "thead" },
1111     { "col", "tr" },
1112     { "colgroup", "colgroup" },
1113     { "colgroup", "tbody" },
1114     { "colgroup", "tfoot" },
1115     { "colgroup", "thead" },
1116     { "colgroup", "tr" },
1117     { "dd", "dt" },
1118     { "dir", "dd" },
1119     { "dir", "dl" },
1120     { "dir", "dt" },
1121     { "dir", "form" },
1122     { "dir", "ul" },
1123     { "dl", "form" },
1124     { "dl", "li" },
1125     { "dt", "dd" },
1126     { "dt", "dl" },
1127     { "font", "center" },
1128     { "font", "td" },
1129     { "font", "th" },
1130     { "form", "form" },
1131     { "h1", "fieldset" },
1132     { "h1", "form" },
1133     { "h1", "li" },
1134     { "h1", "p" },
1135     { "h1", "table" },
1136     { "h2", "fieldset" },
1137     { "h2", "form" },
1138     { "h2", "li" },
1139     { "h2", "p" },
1140     { "h2", "table" },
1141     { "h3", "fieldset" },
1142     { "h3", "form" },
1143     { "h3", "li" },
1144     { "h3", "p" },
1145     { "h3", "table" },
1146     { "h4", "fieldset" },
1147     { "h4", "form" },
1148     { "h4", "li" },
1149     { "h4", "p" },
1150     { "h4", "table" },
1151     { "h5", "fieldset" },
1152     { "h5", "form" },
1153     { "h5", "li" },
1154     { "h5", "p" },
1155     { "h5", "table" },
1156     { "h6", "fieldset" },
1157     { "h6", "form" },
1158     { "h6", "li" },
1159     { "h6", "p" },
1160     { "h6", "table" },
1161     { "head", "a" },
1162     { "head", "abbr" },
1163     { "head", "acronym" },
1164     { "head", "address" },
1165     { "head", "b" },
1166     { "head", "bdo" },
1167     { "head", "big" },
1168     { "head", "blockquote" },
1169     { "head", "body" },
1170     { "head", "br" },
1171     { "head", "center" },
1172     { "head", "cite" },
1173     { "head", "code" },
1174     { "head", "dd" },
1175     { "head", "dfn" },
1176     { "head", "dir" },
1177     { "head", "div" },
1178     { "head", "dl" },
1179     { "head", "dt" },
1180     { "head", "em" },
1181     { "head", "fieldset" },
1182     { "head", "font" },
1183     { "head", "form" },
1184     { "head", "frameset" },
1185     { "head", "h1" },
1186     { "head", "h2" },
1187     { "head", "h3" },
1188     { "head", "h4" },
1189     { "head", "h5" },
1190     { "head", "h6" },
1191     { "head", "hr" },
1192     { "head", "i" },
1193     { "head", "iframe" },
1194     { "head", "img" },
1195     { "head", "kbd" },
1196     { "head", "li" },
1197     { "head", "listing" },
1198     { "head", "map" },
1199     { "head", "menu" },
1200     { "head", "ol" },
1201     { "head", "p" },
1202     { "head", "pre" },
1203     { "head", "q" },
1204     { "head", "s" },
1205     { "head", "samp" },
1206     { "head", "small" },
1207     { "head", "span" },
1208     { "head", "strike" },
1209     { "head", "strong" },
1210     { "head", "sub" },
1211     { "head", "sup" },
1212     { "head", "table" },
1213     { "head", "tt" },
1214     { "head", "u" },
1215     { "head", "ul" },
1216     { "head", "var" },
1217     { "head", "xmp" },
1218     { "hr", "form" },
1219     { "i", "center" },
1220     { "i", "p" },
1221     { "i", "td" },
1222     { "i", "th" },
1223     { "legend", "fieldset" },
1224     { "li", "li" },
1225     { "link", "body" },
1226     { "link", "frameset" },
1227     { "listing", "dd" },
1228     { "listing", "dl" },
1229     { "listing", "dt" },
1230     { "listing", "fieldset" },
1231     { "listing", "form" },
1232     { "listing", "li" },
1233     { "listing", "table" },
1234     { "listing", "ul" },
1235     { "menu", "dd" },
1236     { "menu", "dl" },
1237     { "menu", "dt" },
1238     { "menu", "form" },
1239     { "menu", "ul" },
1240     { "ol", "form" },
1241     { "ol", "ul" },
1242     { "option", "optgroup" },
1243     { "option", "option" },
1244     { "p", "address" },
1245     { "p", "blockquote" },
1246     { "p", "body" },
1247     { "p", "caption" },
1248     { "p", "center" },
1249     { "p", "col" },
1250     { "p", "colgroup" },
1251     { "p", "dd" },
1252     { "p", "dir" },
1253     { "p", "div" },
1254     { "p", "dl" },
1255     { "p", "dt" },
1256     { "p", "fieldset" },
1257     { "p", "form" },
1258     { "p", "frameset" },
1259     { "p", "h1" },
1260     { "p", "h2" },
1261     { "p", "h3" },
1262     { "p", "h4" },
1263     { "p", "h5" },
1264     { "p", "h6" },
1265     { "p", "head" },
1266     { "p", "hr" },
1267     { "p", "li" },
1268     { "p", "listing" },
1269     { "p", "menu" },
1270     { "p", "ol" },
1271     { "p", "p" },
1272     { "p", "pre" },
1273     { "p", "table" },
1274     { "p", "tbody" },
1275     { "p", "td" },
1276     { "p", "tfoot" },
1277     { "p", "th" },
1278     { "p", "title" },
1279     { "p", "tr" },
1280     { "p", "ul" },
1281     { "p", "xmp" },
1282     { "pre", "dd" },
1283     { "pre", "dl" },
1284     { "pre", "dt" },
1285     { "pre", "fieldset" },
1286     { "pre", "form" },
1287     { "pre", "li" },
1288     { "pre", "table" },
1289     { "pre", "ul" },
1290     { "s", "p" },
1291     { "script", "noscript" },
1292     { "small", "p" },
1293     { "span", "td" },
1294     { "span", "th" },
1295     { "strike", "p" },
1296     { "style", "body" },
1297     { "style", "frameset" },
1298     { "tbody", "tbody" },
1299     { "tbody", "tfoot" },
1300     { "td", "tbody" },
1301     { "td", "td" },
1302     { "td", "tfoot" },
1303     { "td", "th" },
1304     { "td", "tr" },
1305     { "tfoot", "tbody" },
1306     { "th", "tbody" },
1307     { "th", "td" },
1308     { "th", "tfoot" },
1309     { "th", "th" },
1310     { "th", "tr" },
1311     { "thead", "tbody" },
1312     { "thead", "tfoot" },
1313     { "title", "body" },
1314     { "title", "frameset" },
1315     { "tr", "tbody" },
1316     { "tr", "tfoot" },
1317     { "tr", "tr" },
1318     { "tt", "p" },
1319     { "u", "p" },
1320     { "u", "td" },
1321     { "u", "th" },
1322     { "ul", "address" },
1323     { "ul", "form" },
1324     { "ul", "menu" },
1325     { "ul", "ol" },
1326     { "ul", "pre" },
1327     { "xmp", "dd" },
1328     { "xmp", "dl" },
1329     { "xmp", "dt" },
1330     { "xmp", "fieldset" },
1331     { "xmp", "form" },
1332     { "xmp", "li" },
1333     { "xmp", "table" },
1334     { "xmp", "ul" }
1335 };
1336 
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * have CDATA content and where a p element will be implied.
 *
 * TODO: extend this list by reading the HTML SGML DTD on
 *       implied paragraphs
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1349 
/*
 * The HTML attributes whose content is of type %Script;.
 * The array has no NULL terminator.
 * NOTE: when adding entries, check htmlIsScriptAttribute() since
 *       it assumes every name starts with 'on'.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick", "ondblclick",
    "onmousedown", "onmouseup", "onmouseover", "onmousemove", "onmouseout",
    "onkeypress", "onkeydown", "onkeyup",
    "onload", "onunload",
    "onfocus", "onblur",
    "onsubmit", "onreset",
    "onchange", "onselect"
};
1375 
/*
 * End-tag priorities, used by the HTML parser to cope with broken pages.
 * Giving elements different priorities lets the parser decide what to do
 * with a stray end tag: an end tag may only close elements whose priority
 * is lower than or equal to its own.
 */

typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* end-tag priority of that element */
} elementPriority;

/*
 * The final entry has a NULL name; it terminates the scan done by
 * htmlGetEndPriority() and carries the priority returned for any element
 * that is not listed.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
1403 
1404 /************************************************************************
1405  *									*
1406  *	functions to handle HTML specific data			*
1407  *									*
1408  ************************************************************************/
1409 
/**
 * htmlInitAutoClose:
 *
 * This function has an empty body and performs no work.
 */
void
htmlInitAutoClose(void)
{
}
1418 
1419 static int
htmlCompareTags(const void * key,const void * member)1420 htmlCompareTags(const void *key, const void *member) {
1421     const xmlChar *tag = (const xmlChar *) key;
1422     const htmlElemDesc *desc = (const htmlElemDesc *) member;
1423 
1424     return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1425 }
1426 
1427 /**
1428  * htmlTagLookup:
1429  * @tag:  The tag name in lowercase
1430  *
1431  * Lookup the HTML tag in the ElementTable
1432  *
1433  * Returns the related htmlElemDescPtr or NULL if not found.
1434  */
1435 const htmlElemDesc *
htmlTagLookup(const xmlChar * tag)1436 htmlTagLookup(const xmlChar *tag) {
1437     if (tag == NULL)
1438         return(NULL);
1439 
1440     return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1441                 sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1442                 sizeof(htmlElemDesc), htmlCompareTags));
1443 }
1444 
1445 /**
1446  * htmlGetEndPriority:
1447  * @name: The name of the element to look up the priority for.
1448  *
1449  * Return value: The "endtag" priority.
1450  **/
1451 static int
htmlGetEndPriority(const xmlChar * name)1452 htmlGetEndPriority (const xmlChar *name) {
1453     int i = 0;
1454 
1455     while ((htmlEndPriority[i].name != NULL) &&
1456 	   (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1457 	i++;
1458 
1459     return(htmlEndPriority[i].priority);
1460 }
1461 
1462 
1463 static int
htmlCompareStartClose(const void * vkey,const void * member)1464 htmlCompareStartClose(const void *vkey, const void *member) {
1465     const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1466     const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1467     int ret;
1468 
1469     ret = strcmp(key->oldTag, entry->oldTag);
1470     if (ret == 0)
1471         ret = strcmp(key->newTag, entry->newTag);
1472 
1473     return(ret);
1474 }
1475 
1476 /**
1477  * htmlCheckAutoClose:
1478  * @newtag:  The new tag name
1479  * @oldtag:  The old tag name
1480  *
1481  * Checks whether the new tag is one of the registered valid tags for
1482  * closing old.
1483  *
1484  * Returns 0 if no, 1 if yes.
1485  */
1486 static int
htmlCheckAutoClose(const xmlChar * newtag,const xmlChar * oldtag)1487 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1488 {
1489     htmlStartCloseEntry key;
1490     void *res;
1491 
1492     key.oldTag = (const char *) oldtag;
1493     key.newTag = (const char *) newtag;
1494     res = bsearch(&key, htmlStartClose,
1495             sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1496             sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1497     return(res != NULL);
1498 }
1499 
1500 /**
1501  * htmlAutoCloseOnClose:
1502  * @ctxt:  an HTML parser context
1503  * @newtag:  The new tag name
1504  * @force:  force the tag closure
1505  *
1506  * The HTML DTD allows an ending tag to implicitly close other tags.
1507  */
1508 static void
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1509 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1510 {
1511     const htmlElemDesc *info;
1512     int i, priority;
1513 
1514     priority = htmlGetEndPriority(newtag);
1515 
1516     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1517 
1518         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1519             break;
1520         /*
1521          * A misplaced endtag can only close elements with lower
1522          * or equal priority, so if we find an element with higher
1523          * priority before we find an element with
1524          * matching name, we just ignore this endtag
1525          */
1526         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1527             return;
1528     }
1529     if (i < 0)
1530         return;
1531 
1532     while (!xmlStrEqual(newtag, ctxt->name)) {
1533         info = htmlTagLookup(ctxt->name);
1534         if ((info != NULL) && (info->endTag == 3)) {
1535             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1536 	                 "Opening and ending tag mismatch: %s and %s\n",
1537 			 newtag, ctxt->name);
1538         }
1539         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1540             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1541 	htmlnamePop(ctxt);
1542     }
1543 }
1544 
1545 /**
1546  * htmlAutoCloseOnEnd:
1547  * @ctxt:  an HTML parser context
1548  *
1549  * Close all remaining tags at the end of the stream
1550  */
1551 static void
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)1552 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1553 {
1554     int i;
1555 
1556     if (ctxt->nameNr == 0)
1557         return;
1558     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1559         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1560             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1561 	htmlnamePop(ctxt);
1562     }
1563 }
1564 
1565 /**
1566  * htmlAutoClose:
1567  * @ctxt:  an HTML parser context
1568  * @newtag:  The new tag name or NULL
1569  *
1570  * The HTML DTD allows a tag to implicitly close other tags.
1571  * The list is kept in htmlStartClose array. This function is
1572  * called when a new tag has been detected and generates the
1573  * appropriates closes if possible/needed.
1574  * If newtag is NULL this mean we are at the end of the resource
1575  * and we should check
1576  */
1577 static void
htmlAutoClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1578 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1579 {
1580     while ((newtag != NULL) && (ctxt->name != NULL) &&
1581            (htmlCheckAutoClose(newtag, ctxt->name))) {
1582         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1583             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1584 	htmlnamePop(ctxt);
1585     }
1586     if (newtag == NULL) {
1587         htmlAutoCloseOnEnd(ctxt);
1588         return;
1589     }
1590     while ((newtag == NULL) && (ctxt->name != NULL) &&
1591            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1592             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1593             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1594         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1595             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1596 	htmlnamePop(ctxt);
1597     }
1598 }
1599 
1600 /**
1601  * htmlAutoCloseTag:
1602  * @doc:  the HTML document
1603  * @name:  The tag name
1604  * @elem:  the HTML element
1605  *
1606  * The HTML DTD allows a tag to implicitly close other tags.
1607  * The list is kept in htmlStartClose array. This function checks
1608  * if the element or one of it's children would autoclose the
1609  * given tag.
1610  *
1611  * Returns 1 if autoclose, 0 otherwise
1612  */
1613 int
htmlAutoCloseTag(htmlDocPtr doc,const xmlChar * name,htmlNodePtr elem)1614 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1615     htmlNodePtr child;
1616 
1617     if (elem == NULL) return(1);
1618     if (xmlStrEqual(name, elem->name)) return(0);
1619     if (htmlCheckAutoClose(elem->name, name)) return(1);
1620     child = elem->children;
1621     while (child != NULL) {
1622         if (htmlAutoCloseTag(doc, name, child)) return(1);
1623 	child = child->next;
1624     }
1625     return(0);
1626 }
1627 
1628 /**
1629  * htmlIsAutoClosed:
1630  * @doc:  the HTML document
1631  * @elem:  the HTML element
1632  *
1633  * The HTML DTD allows a tag to implicitly close other tags.
1634  * The list is kept in htmlStartClose array. This function checks
1635  * if a tag is autoclosed by one of it's child
1636  *
1637  * Returns 1 if autoclosed, 0 otherwise
1638  */
1639 int
htmlIsAutoClosed(htmlDocPtr doc,htmlNodePtr elem)1640 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1641     htmlNodePtr child;
1642 
1643     if (elem == NULL) return(1);
1644     child = elem->children;
1645     while (child != NULL) {
1646 	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1647 	child = child->next;
1648     }
1649     return(0);
1650 }
1651 
1652 /**
1653  * htmlCheckImplied:
1654  * @ctxt:  an HTML parser context
1655  * @newtag:  The new tag name
1656  *
1657  * The HTML DTD allows a tag to exists only implicitly
1658  * called when a new tag has been detected and generates the
1659  * appropriates implicit tags if missing
1660  */
1661 static void
htmlCheckImplied(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1662 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1663     int i;
1664 
1665     if (ctxt->options & HTML_PARSE_NOIMPLIED)
1666         return;
1667     if (!htmlOmittedDefaultValue)
1668 	return;
1669     if (xmlStrEqual(newtag, BAD_CAST"html"))
1670 	return;
1671     if (ctxt->nameNr <= 0) {
1672 	htmlnamePush(ctxt, BAD_CAST"html");
1673 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1674 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1675     }
1676     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1677         return;
1678     if ((ctxt->nameNr <= 1) &&
1679         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1680 	 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1681 	 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1682 	 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1683 	 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1684 	 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1685         if (ctxt->html >= 3) {
1686             /* we already saw or generated an <head> before */
1687             return;
1688         }
1689         /*
1690          * dropped OBJECT ... i you put it first BODY will be
1691          * assumed !
1692          */
1693         htmlnamePush(ctxt, BAD_CAST"head");
1694         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1695             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1696     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1697 	       (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1698 	       (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1699         if (ctxt->html >= 10) {
1700             /* we already saw or generated a <body> before */
1701             return;
1702         }
1703 	for (i = 0;i < ctxt->nameNr;i++) {
1704 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1705 		return;
1706 	    }
1707 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1708 		return;
1709 	    }
1710 	}
1711 
1712 	htmlnamePush(ctxt, BAD_CAST"body");
1713 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1714 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1715     }
1716 }
1717 
1718 /**
1719  * htmlCheckParagraph
1720  * @ctxt:  an HTML parser context
1721  *
1722  * Check whether a p element need to be implied before inserting
1723  * characters in the current element.
1724  *
1725  * Returns 1 if a paragraph has been inserted, 0 if not and -1
1726  *         in case of error.
1727  */
1728 
1729 static int
htmlCheckParagraph(htmlParserCtxtPtr ctxt)1730 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1731     const xmlChar *tag;
1732     int i;
1733 
1734     if (ctxt == NULL)
1735 	return(-1);
1736     tag = ctxt->name;
1737     if (tag == NULL) {
1738 	htmlAutoClose(ctxt, BAD_CAST"p");
1739 	htmlCheckImplied(ctxt, BAD_CAST"p");
1740 	htmlnamePush(ctxt, BAD_CAST"p");
1741 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1742 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1743 	return(1);
1744     }
1745     if (!htmlOmittedDefaultValue)
1746 	return(0);
1747     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1748 	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1749 	    htmlAutoClose(ctxt, BAD_CAST"p");
1750 	    htmlCheckImplied(ctxt, BAD_CAST"p");
1751 	    htmlnamePush(ctxt, BAD_CAST"p");
1752 	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1753 		ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1754 	    return(1);
1755 	}
1756     }
1757     return(0);
1758 }
1759 
1760 /**
1761  * htmlIsScriptAttribute:
1762  * @name:  an attribute name
1763  *
1764  * Check if an attribute is of content type Script
1765  *
1766  * Returns 1 is the attribute is a script 0 otherwise
1767  */
1768 int
htmlIsScriptAttribute(const xmlChar * name)1769 htmlIsScriptAttribute(const xmlChar *name) {
1770     unsigned int i;
1771 
1772     if (name == NULL)
1773       return(0);
1774     /*
1775      * all script attributes start with 'on'
1776      */
1777     if ((name[0] != 'o') || (name[1] != 'n'))
1778       return(0);
1779     for (i = 0;
1780 	 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1781 	 i++) {
1782 	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1783 	    return(1);
1784     }
1785     return(0);
1786 }
1787 
1788 /************************************************************************
1789  *									*
1790  *	The list of HTML predefined entities			*
1791  *									*
1792  ************************************************************************/
1793 
1794 
1795 static const htmlEntityDesc  html40EntitiesTable[] = {
1796 /*
1797  * the 4 absolute ones, plus apostrophe.
1798  */
1799 { 34,	"quot",	"quotation mark = APL quote, U+0022 ISOnum" },
1800 { 38,	"amp",	"ampersand, U+0026 ISOnum" },
1801 { 39,	"apos",	"single quote" },
1802 { 60,	"lt",	"less-than sign, U+003C ISOnum" },
1803 { 62,	"gt",	"greater-than sign, U+003E ISOnum" },
1804 
1805 /*
1806  * A bunch still in the 128-255 range
1807  * Replacing them really depends on the charset used.
1808  */
1809 { 160,	"nbsp",	"no-break space = non-breaking space, U+00A0 ISOnum" },
1810 { 161,	"iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1811 { 162,	"cent",	"cent sign, U+00A2 ISOnum" },
1812 { 163,	"pound","pound sign, U+00A3 ISOnum" },
1813 { 164,	"curren","currency sign, U+00A4 ISOnum" },
1814 { 165,	"yen",	"yen sign = yuan sign, U+00A5 ISOnum" },
1815 { 166,	"brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1816 { 167,	"sect",	"section sign, U+00A7 ISOnum" },
1817 { 168,	"uml",	"diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1818 { 169,	"copy",	"copyright sign, U+00A9 ISOnum" },
1819 { 170,	"ordf",	"feminine ordinal indicator, U+00AA ISOnum" },
1820 { 171,	"laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1821 { 172,	"not",	"not sign, U+00AC ISOnum" },
1822 { 173,	"shy",	"soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1823 { 174,	"reg",	"registered sign = registered trade mark sign, U+00AE ISOnum" },
1824 { 175,	"macr",	"macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1825 { 176,	"deg",	"degree sign, U+00B0 ISOnum" },
1826 { 177,	"plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1827 { 178,	"sup2",	"superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1828 { 179,	"sup3",	"superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1829 { 180,	"acute","acute accent = spacing acute, U+00B4 ISOdia" },
1830 { 181,	"micro","micro sign, U+00B5 ISOnum" },
1831 { 182,	"para",	"pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1832 { 183,	"middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1833 { 184,	"cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1834 { 185,	"sup1",	"superscript one = superscript digit one, U+00B9 ISOnum" },
1835 { 186,	"ordm",	"masculine ordinal indicator, U+00BA ISOnum" },
1836 { 187,	"raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1837 { 188,	"frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1838 { 189,	"frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1839 { 190,	"frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1840 { 191,	"iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1841 { 192,	"Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1842 { 193,	"Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1843 { 194,	"Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1844 { 195,	"Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1845 { 196,	"Auml",	"latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1846 { 197,	"Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1847 { 198,	"AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1848 { 199,	"Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1849 { 200,	"Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1850 { 201,	"Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1851 { 202,	"Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1852 { 203,	"Euml",	"latin capital letter E with diaeresis, U+00CB ISOlat1" },
1853 { 204,	"Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1854 { 205,	"Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1855 { 206,	"Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1856 { 207,	"Iuml",	"latin capital letter I with diaeresis, U+00CF ISOlat1" },
1857 { 208,	"ETH",	"latin capital letter ETH, U+00D0 ISOlat1" },
1858 { 209,	"Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1859 { 210,	"Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1860 { 211,	"Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1861 { 212,	"Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1862 { 213,	"Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1863 { 214,	"Ouml",	"latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1864 { 215,	"times","multiplication sign, U+00D7 ISOnum" },
1865 { 216,	"Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1866 { 217,	"Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1867 { 218,	"Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1868 { 219,	"Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1869 { 220,	"Uuml",	"latin capital letter U with diaeresis, U+00DC ISOlat1" },
1870 { 221,	"Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1871 { 222,	"THORN","latin capital letter THORN, U+00DE ISOlat1" },
1872 { 223,	"szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1873 { 224,	"agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1874 { 225,	"aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1875 { 226,	"acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1876 { 227,	"atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1877 { 228,	"auml",	"latin small letter a with diaeresis, U+00E4 ISOlat1" },
1878 { 229,	"aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1879 { 230,	"aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1880 { 231,	"ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1881 { 232,	"egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1882 { 233,	"eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1883 { 234,	"ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1884 { 235,	"euml",	"latin small letter e with diaeresis, U+00EB ISOlat1" },
1885 { 236,	"igrave","latin small letter i with grave, U+00EC ISOlat1" },
1886 { 237,	"iacute","latin small letter i with acute, U+00ED ISOlat1" },
1887 { 238,	"icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1888 { 239,	"iuml",	"latin small letter i with diaeresis, U+00EF ISOlat1" },
1889 { 240,	"eth",	"latin small letter eth, U+00F0 ISOlat1" },
1890 { 241,	"ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1891 { 242,	"ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1892 { 243,	"oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1893 { 244,	"ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1894 { 245,	"otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1895 { 246,	"ouml",	"latin small letter o with diaeresis, U+00F6 ISOlat1" },
1896 { 247,	"divide","division sign, U+00F7 ISOnum" },
1897 { 248,	"oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1898 { 249,	"ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1899 { 250,	"uacute","latin small letter u with acute, U+00FA ISOlat1" },
1900 { 251,	"ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1901 { 252,	"uuml",	"latin small letter u with diaeresis, U+00FC ISOlat1" },
1902 { 253,	"yacute","latin small letter y with acute, U+00FD ISOlat1" },
1903 { 254,	"thorn","latin small letter thorn with, U+00FE ISOlat1" },
1904 { 255,	"yuml",	"latin small letter y with diaeresis, U+00FF ISOlat1" },
1905 
1906 { 338,	"OElig","latin capital ligature OE, U+0152 ISOlat2" },
1907 { 339,	"oelig","latin small ligature oe, U+0153 ISOlat2" },
1908 { 352,	"Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1909 { 353,	"scaron","latin small letter s with caron, U+0161 ISOlat2" },
1910 { 376,	"Yuml",	"latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1911 
1912 /*
1913  * Anything below should really be kept as entity references
1914  */
1915 { 402,	"fnof",	"latin small f with hook = function = florin, U+0192 ISOtech" },
1916 
1917 { 710,	"circ",	"modifier letter circumflex accent, U+02C6 ISOpub" },
1918 { 732,	"tilde","small tilde, U+02DC ISOdia" },
1919 
1920 { 913,	"Alpha","greek capital letter alpha, U+0391" },
1921 { 914,	"Beta",	"greek capital letter beta, U+0392" },
1922 { 915,	"Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1923 { 916,	"Delta","greek capital letter delta, U+0394 ISOgrk3" },
1924 { 917,	"Epsilon","greek capital letter epsilon, U+0395" },
1925 { 918,	"Zeta",	"greek capital letter zeta, U+0396" },
1926 { 919,	"Eta",	"greek capital letter eta, U+0397" },
1927 { 920,	"Theta","greek capital letter theta, U+0398 ISOgrk3" },
1928 { 921,	"Iota",	"greek capital letter iota, U+0399" },
1929 { 922,	"Kappa","greek capital letter kappa, U+039A" },
1930 { 923,	"Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1931 { 924,	"Mu",	"greek capital letter mu, U+039C" },
1932 { 925,	"Nu",	"greek capital letter nu, U+039D" },
1933 { 926,	"Xi",	"greek capital letter xi, U+039E ISOgrk3" },
1934 { 927,	"Omicron","greek capital letter omicron, U+039F" },
1935 { 928,	"Pi",	"greek capital letter pi, U+03A0 ISOgrk3" },
1936 { 929,	"Rho",	"greek capital letter rho, U+03A1" },
1937 { 931,	"Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1938 { 932,	"Tau",	"greek capital letter tau, U+03A4" },
1939 { 933,	"Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1940 { 934,	"Phi",	"greek capital letter phi, U+03A6 ISOgrk3" },
1941 { 935,	"Chi",	"greek capital letter chi, U+03A7" },
1942 { 936,	"Psi",	"greek capital letter psi, U+03A8 ISOgrk3" },
1943 { 937,	"Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1944 
1945 { 945,	"alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1946 { 946,	"beta",	"greek small letter beta, U+03B2 ISOgrk3" },
1947 { 947,	"gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1948 { 948,	"delta","greek small letter delta, U+03B4 ISOgrk3" },
1949 { 949,	"epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1950 { 950,	"zeta",	"greek small letter zeta, U+03B6 ISOgrk3" },
1951 { 951,	"eta",	"greek small letter eta, U+03B7 ISOgrk3" },
1952 { 952,	"theta","greek small letter theta, U+03B8 ISOgrk3" },
1953 { 953,	"iota",	"greek small letter iota, U+03B9 ISOgrk3" },
1954 { 954,	"kappa","greek small letter kappa, U+03BA ISOgrk3" },
1955 { 955,	"lambda","greek small letter lambda, U+03BB ISOgrk3" },
1956 { 956,	"mu",	"greek small letter mu, U+03BC ISOgrk3" },
1957 { 957,	"nu",	"greek small letter nu, U+03BD ISOgrk3" },
1958 { 958,	"xi",	"greek small letter xi, U+03BE ISOgrk3" },
1959 { 959,	"omicron","greek small letter omicron, U+03BF NEW" },
1960 { 960,	"pi",	"greek small letter pi, U+03C0 ISOgrk3" },
1961 { 961,	"rho",	"greek small letter rho, U+03C1 ISOgrk3" },
1962 { 962,	"sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1963 { 963,	"sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1964 { 964,	"tau",	"greek small letter tau, U+03C4 ISOgrk3" },
1965 { 965,	"upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1966 { 966,	"phi",	"greek small letter phi, U+03C6 ISOgrk3" },
1967 { 967,	"chi",	"greek small letter chi, U+03C7 ISOgrk3" },
1968 { 968,	"psi",	"greek small letter psi, U+03C8 ISOgrk3" },
1969 { 969,	"omega","greek small letter omega, U+03C9 ISOgrk3" },
1970 { 977,	"thetasym","greek small letter theta symbol, U+03D1 NEW" },
1971 { 978,	"upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1972 { 982,	"piv",	"greek pi symbol, U+03D6 ISOgrk3" },
1973 
1974 { 8194,	"ensp",	"en space, U+2002 ISOpub" },
1975 { 8195,	"emsp",	"em space, U+2003 ISOpub" },
1976 { 8201,	"thinsp","thin space, U+2009 ISOpub" },
1977 { 8204,	"zwnj",	"zero width non-joiner, U+200C NEW RFC 2070" },
1978 { 8205,	"zwj",	"zero width joiner, U+200D NEW RFC 2070" },
1979 { 8206,	"lrm",	"left-to-right mark, U+200E NEW RFC 2070" },
1980 { 8207,	"rlm",	"right-to-left mark, U+200F NEW RFC 2070" },
1981 { 8211,	"ndash","en dash, U+2013 ISOpub" },
1982 { 8212,	"mdash","em dash, U+2014 ISOpub" },
1983 { 8216,	"lsquo","left single quotation mark, U+2018 ISOnum" },
1984 { 8217,	"rsquo","right single quotation mark, U+2019 ISOnum" },
1985 { 8218,	"sbquo","single low-9 quotation mark, U+201A NEW" },
1986 { 8220,	"ldquo","left double quotation mark, U+201C ISOnum" },
1987 { 8221,	"rdquo","right double quotation mark, U+201D ISOnum" },
1988 { 8222,	"bdquo","double low-9 quotation mark, U+201E NEW" },
1989 { 8224,	"dagger","dagger, U+2020 ISOpub" },
1990 { 8225,	"Dagger","double dagger, U+2021 ISOpub" },
1991 
1992 { 8226,	"bull",	"bullet = black small circle, U+2022 ISOpub" },
1993 { 8230,	"hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1994 
1995 { 8240,	"permil","per mille sign, U+2030 ISOtech" },
1996 
1997 { 8242,	"prime","prime = minutes = feet, U+2032 ISOtech" },
1998 { 8243,	"Prime","double prime = seconds = inches, U+2033 ISOtech" },
1999 
2000 { 8249,	"lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
2001 { 8250,	"rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
2002 
2003 { 8254,	"oline","overline = spacing overscore, U+203E NEW" },
2004 { 8260,	"frasl","fraction slash, U+2044 NEW" },
2005 
2006 { 8364,	"euro",	"euro sign, U+20AC NEW" },
2007 
2008 { 8465,	"image","blackletter capital I = imaginary part, U+2111 ISOamso" },
2009 { 8472,	"weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
2010 { 8476,	"real",	"blackletter capital R = real part symbol, U+211C ISOamso" },
2011 { 8482,	"trade","trade mark sign, U+2122 ISOnum" },
2012 { 8501,	"alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
2013 { 8592,	"larr",	"leftwards arrow, U+2190 ISOnum" },
2014 { 8593,	"uarr",	"upwards arrow, U+2191 ISOnum" },
2015 { 8594,	"rarr",	"rightwards arrow, U+2192 ISOnum" },
2016 { 8595,	"darr",	"downwards arrow, U+2193 ISOnum" },
2017 { 8596,	"harr",	"left right arrow, U+2194 ISOamsa" },
2018 { 8629,	"crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
2019 { 8656,	"lArr",	"leftwards double arrow, U+21D0 ISOtech" },
2020 { 8657,	"uArr",	"upwards double arrow, U+21D1 ISOamsa" },
2021 { 8658,	"rArr",	"rightwards double arrow, U+21D2 ISOtech" },
2022 { 8659,	"dArr",	"downwards double arrow, U+21D3 ISOamsa" },
2023 { 8660,	"hArr",	"left right double arrow, U+21D4 ISOamsa" },
2024 
2025 { 8704,	"forall","for all, U+2200 ISOtech" },
2026 { 8706,	"part",	"partial differential, U+2202 ISOtech" },
2027 { 8707,	"exist","there exists, U+2203 ISOtech" },
2028 { 8709,	"empty","empty set = null set = diameter, U+2205 ISOamso" },
2029 { 8711,	"nabla","nabla = backward difference, U+2207 ISOtech" },
2030 { 8712,	"isin",	"element of, U+2208 ISOtech" },
2031 { 8713,	"notin","not an element of, U+2209 ISOtech" },
2032 { 8715,	"ni",	"contains as member, U+220B ISOtech" },
2033 { 8719,	"prod",	"n-ary product = product sign, U+220F ISOamsb" },
2034 { 8721,	"sum",	"n-ary summation, U+2211 ISOamsb" },
2035 { 8722,	"minus","minus sign, U+2212 ISOtech" },
2036 { 8727,	"lowast","asterisk operator, U+2217 ISOtech" },
2037 { 8730,	"radic","square root = radical sign, U+221A ISOtech" },
2038 { 8733,	"prop",	"proportional to, U+221D ISOtech" },
2039 { 8734,	"infin","infinity, U+221E ISOtech" },
2040 { 8736,	"ang",	"angle, U+2220 ISOamso" },
2041 { 8743,	"and",	"logical and = wedge, U+2227 ISOtech" },
2042 { 8744,	"or",	"logical or = vee, U+2228 ISOtech" },
2043 { 8745,	"cap",	"intersection = cap, U+2229 ISOtech" },
2044 { 8746,	"cup",	"union = cup, U+222A ISOtech" },
2045 { 8747,	"int",	"integral, U+222B ISOtech" },
2046 { 8756,	"there4","therefore, U+2234 ISOtech" },
2047 { 8764,	"sim",	"tilde operator = varies with = similar to, U+223C ISOtech" },
2048 { 8773,	"cong",	"approximately equal to, U+2245 ISOtech" },
2049 { 8776,	"asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
2050 { 8800,	"ne",	"not equal to, U+2260 ISOtech" },
2051 { 8801,	"equiv","identical to, U+2261 ISOtech" },
2052 { 8804,	"le",	"less-than or equal to, U+2264 ISOtech" },
2053 { 8805,	"ge",	"greater-than or equal to, U+2265 ISOtech" },
2054 { 8834,	"sub",	"subset of, U+2282 ISOtech" },
2055 { 8835,	"sup",	"superset of, U+2283 ISOtech" },
2056 { 8836,	"nsub",	"not a subset of, U+2284 ISOamsn" },
2057 { 8838,	"sube",	"subset of or equal to, U+2286 ISOtech" },
2058 { 8839,	"supe",	"superset of or equal to, U+2287 ISOtech" },
2059 { 8853,	"oplus","circled plus = direct sum, U+2295 ISOamsb" },
2060 { 8855,	"otimes","circled times = vector product, U+2297 ISOamsb" },
2061 { 8869,	"perp",	"up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
2062 { 8901,	"sdot",	"dot operator, U+22C5 ISOamsb" },
2063 { 8968,	"lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
2064 { 8969,	"rceil","right ceiling, U+2309 ISOamsc" },
2065 { 8970,	"lfloor","left floor = apl downstile, U+230A ISOamsc" },
2066 { 8971,	"rfloor","right floor, U+230B ISOamsc" },
2067 { 9001,	"lang",	"left-pointing angle bracket = bra, U+2329 ISOtech" },
2068 { 9002,	"rang",	"right-pointing angle bracket = ket, U+232A ISOtech" },
2069 { 9674,	"loz",	"lozenge, U+25CA ISOpub" },
2070 
2071 { 9824,	"spades","black spade suit, U+2660 ISOpub" },
2072 { 9827,	"clubs","black club suit = shamrock, U+2663 ISOpub" },
2073 { 9829,	"hearts","black heart suit = valentine, U+2665 ISOpub" },
2074 { 9830,	"diams","black diamond suit, U+2666 ISOpub" },
2075 
2076 };
2077 
2078 /************************************************************************
2079  *									*
2080  *		Commodity functions to handle entities			*
2081  *									*
2082  ************************************************************************/
2083 
/*
 * Macro used to grow the current buffer: doubles buffer##_size and
 * reallocates @buffer to that size. If the reallocation fails, the
 * error is reported through htmlErrMemory(), the old buffer is freed
 * and the *enclosing function* returns NULL. A variable named `ctxt`
 * must therefore be in scope at the point of use.
 */
#define growBuffer(buffer) {						\
    xmlChar *newbuf;							\
    buffer##_size *= 2;							\
    newbuf = (xmlChar *) xmlRealloc(buffer,				\
                                    buffer##_size * sizeof(xmlChar));	\
    if (newbuf != NULL) {						\
	buffer = newbuf;						\
    } else {								\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
}
2098 
2099 /**
2100  * htmlEntityLookup:
2101  * @name: the entity name
2102  *
2103  * Lookup the given entity in EntitiesTable
2104  *
2105  * TODO: the linear scan is really ugly, an hash table is really needed.
2106  *
2107  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2108  */
2109 const htmlEntityDesc *
htmlEntityLookup(const xmlChar * name)2110 htmlEntityLookup(const xmlChar *name) {
2111     unsigned int i;
2112 
2113     for (i = 0;i < (sizeof(html40EntitiesTable)/
2114                     sizeof(html40EntitiesTable[0]));i++) {
2115         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2116             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2117 	}
2118     }
2119     return(NULL);
2120 }
2121 
2122 /**
2123  * htmlEntityValueLookup:
2124  * @value: the entity's unicode value
2125  *
2126  * Lookup the given entity in EntitiesTable
2127  *
2128  * TODO: the linear scan is really ugly, an hash table is really needed.
2129  *
2130  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2131  */
2132 const htmlEntityDesc *
htmlEntityValueLookup(unsigned int value)2133 htmlEntityValueLookup(unsigned int value) {
2134     unsigned int i;
2135 
2136     for (i = 0;i < (sizeof(html40EntitiesTable)/
2137                     sizeof(html40EntitiesTable[0]));i++) {
2138         if (html40EntitiesTable[i].value >= value) {
2139 	    if (html40EntitiesTable[i].value > value)
2140 		break;
2141             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2142 	}
2143     }
2144     return(NULL);
2145 }
2146 
2147 /**
2148  * UTF8ToHtml:
2149  * @out:  a pointer to an array of bytes to store the result
2150  * @outlen:  the length of @out
2151  * @in:  a pointer to an array of UTF-8 chars
2152  * @inlen:  the length of @in
2153  *
2154  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2155  * plus HTML entities block of chars out.
2156  *
2157  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2158  * The value of @inlen after return is the number of octets consumed
2159  *     as the return value is positive, else unpredictable.
2160  * The value of @outlen after return is the number of octets consumed.
2161  */
2162 int
UTF8ToHtml(unsigned char * out,int * outlen,const unsigned char * in,int * inlen)2163 UTF8ToHtml(unsigned char* out, int *outlen,
2164               const unsigned char* in, int *inlen) {
2165     const unsigned char* processed = in;
2166     const unsigned char* outend;
2167     const unsigned char* outstart = out;
2168     const unsigned char* instart = in;
2169     const unsigned char* inend;
2170     unsigned int c, d;
2171     int trailing;
2172 
2173     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2174     if (in == NULL) {
2175         /*
2176 	 * initialization nothing to do
2177 	 */
2178 	*outlen = 0;
2179 	*inlen = 0;
2180 	return(0);
2181     }
2182     inend = in + (*inlen);
2183     outend = out + (*outlen);
2184     while (in < inend) {
2185 	d = *in++;
2186 	if      (d < 0x80)  { c= d; trailing= 0; }
2187 	else if (d < 0xC0) {
2188 	    /* trailing byte in leading position */
2189 	    *outlen = out - outstart;
2190 	    *inlen = processed - instart;
2191 	    return(-2);
2192         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2193         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2194         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2195 	else {
2196 	    /* no chance for this in Ascii */
2197 	    *outlen = out - outstart;
2198 	    *inlen = processed - instart;
2199 	    return(-2);
2200 	}
2201 
2202 	if (inend - in < trailing) {
2203 	    break;
2204 	}
2205 
2206 	for ( ; trailing; trailing--) {
2207 	    if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2208 		break;
2209 	    c <<= 6;
2210 	    c |= d & 0x3F;
2211 	}
2212 
2213 	/* assertion: c is a single UTF-4 value */
2214 	if (c < 0x80) {
2215 	    if (out + 1 >= outend)
2216 		break;
2217 	    *out++ = c;
2218 	} else {
2219 	    int len;
2220 	    const htmlEntityDesc * ent;
2221 	    const char *cp;
2222 	    char nbuf[16];
2223 
2224 	    /*
2225 	     * Try to lookup a predefined HTML entity for it
2226 	     */
2227 
2228 	    ent = htmlEntityValueLookup(c);
2229 	    if (ent == NULL) {
2230 	      snprintf(nbuf, sizeof(nbuf), "#%u", c);
2231 	      cp = nbuf;
2232 	    }
2233 	    else
2234 	      cp = ent->name;
2235 	    len = strlen(cp);
2236 	    if (out + 2 + len >= outend)
2237 		break;
2238 	    *out++ = '&';
2239 	    memcpy(out, cp, len);
2240 	    out += len;
2241 	    *out++ = ';';
2242 	}
2243 	processed = in;
2244     }
2245     *outlen = out - outstart;
2246     *inlen = processed - instart;
2247     return(0);
2248 }
2249 
2250 /**
2251  * htmlEncodeEntities:
2252  * @out:  a pointer to an array of bytes to store the result
2253  * @outlen:  the length of @out
2254  * @in:  a pointer to an array of UTF-8 chars
2255  * @inlen:  the length of @in
2256  * @quoteChar: the quote character to escape (' or ") or zero.
2257  *
2258  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2259  * plus HTML entities block of chars out.
2260  *
2261  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2262  * The value of @inlen after return is the number of octets consumed
2263  *     as the return value is positive, else unpredictable.
2264  * The value of @outlen after return is the number of octets consumed.
2265  */
2266 int
htmlEncodeEntities(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,int quoteChar)2267 htmlEncodeEntities(unsigned char* out, int *outlen,
2268 		   const unsigned char* in, int *inlen, int quoteChar) {
2269     const unsigned char* processed = in;
2270     const unsigned char* outend;
2271     const unsigned char* outstart = out;
2272     const unsigned char* instart = in;
2273     const unsigned char* inend;
2274     unsigned int c, d;
2275     int trailing;
2276 
2277     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2278         return(-1);
2279     outend = out + (*outlen);
2280     inend = in + (*inlen);
2281     while (in < inend) {
2282 	d = *in++;
2283 	if      (d < 0x80)  { c= d; trailing= 0; }
2284 	else if (d < 0xC0) {
2285 	    /* trailing byte in leading position */
2286 	    *outlen = out - outstart;
2287 	    *inlen = processed - instart;
2288 	    return(-2);
2289         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2290         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2291         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2292 	else {
2293 	    /* no chance for this in Ascii */
2294 	    *outlen = out - outstart;
2295 	    *inlen = processed - instart;
2296 	    return(-2);
2297 	}
2298 
2299 	if (inend - in < trailing)
2300 	    break;
2301 
2302 	while (trailing--) {
2303 	    if (((d= *in++) & 0xC0) != 0x80) {
2304 		*outlen = out - outstart;
2305 		*inlen = processed - instart;
2306 		return(-2);
2307 	    }
2308 	    c <<= 6;
2309 	    c |= d & 0x3F;
2310 	}
2311 
2312 	/* assertion: c is a single UTF-4 value */
2313 	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2314 	    (c != '&') && (c != '<') && (c != '>')) {
2315 	    if (out >= outend)
2316 		break;
2317 	    *out++ = c;
2318 	} else {
2319 	    const htmlEntityDesc * ent;
2320 	    const char *cp;
2321 	    char nbuf[16];
2322 	    int len;
2323 
2324 	    /*
2325 	     * Try to lookup a predefined HTML entity for it
2326 	     */
2327 	    ent = htmlEntityValueLookup(c);
2328 	    if (ent == NULL) {
2329 		snprintf(nbuf, sizeof(nbuf), "#%u", c);
2330 		cp = nbuf;
2331 	    }
2332 	    else
2333 		cp = ent->name;
2334 	    len = strlen(cp);
2335 	    if (out + 2 + len > outend)
2336 		break;
2337 	    *out++ = '&';
2338 	    memcpy(out, cp, len);
2339 	    out += len;
2340 	    *out++ = ';';
2341 	}
2342 	processed = in;
2343     }
2344     *outlen = out - outstart;
2345     *inlen = processed - instart;
2346     return(0);
2347 }
2348 
2349 /************************************************************************
2350  *									*
2351  *		Commodity functions to handle streams			*
2352  *									*
2353  ************************************************************************/
2354 
2355 #ifdef LIBXML_PUSH_ENABLED
2356 /**
2357  * htmlNewInputStream:
2358  * @ctxt:  an HTML parser context
2359  *
2360  * Create a new input stream structure
2361  * Returns the new input stream or NULL
2362  */
2363 static htmlParserInputPtr
htmlNewInputStream(htmlParserCtxtPtr ctxt)2364 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2365     htmlParserInputPtr input;
2366 
2367     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2368     if (input == NULL) {
2369         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2370 	return(NULL);
2371     }
2372     memset(input, 0, sizeof(htmlParserInput));
2373     input->filename = NULL;
2374     input->directory = NULL;
2375     input->base = NULL;
2376     input->cur = NULL;
2377     input->buf = NULL;
2378     input->line = 1;
2379     input->col = 1;
2380     input->buf = NULL;
2381     input->free = NULL;
2382     input->version = NULL;
2383     input->consumed = 0;
2384     input->length = 0;
2385     return(input);
2386 }
2387 #endif
2388 
2389 
2390 /************************************************************************
2391  *									*
2392  *		Commodity functions, cleanup needed ?			*
2393  *									*
2394  ************************************************************************/
/*
 * All tags allowing PCDATA, taken from the HTML 4.01 loose DTD.
 * The list is scanned linearly by areBlanks().
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility.
 *
 * Both the strings and the array of pointers are constant: the table
 * is never modified at run time.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2409 
2410 /**
2411  * areBlanks:
2412  * @ctxt:  an HTML parser context
2413  * @str:  a xmlChar *
2414  * @len:  the size of @str
2415  *
2416  * Is this a sequence of blank chars that one can ignore ?
2417  *
2418  * Returns 1 if ignorable 0 otherwise.
2419  */
2420 
areBlanks(htmlParserCtxtPtr ctxt,const xmlChar * str,int len)2421 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2422     unsigned int i;
2423     int j;
2424     xmlNodePtr lastChild;
2425     xmlDtdPtr dtd;
2426 
2427     for (j = 0;j < len;j++)
2428         if (!(IS_BLANK_CH(str[j]))) return(0);
2429 
2430     if (CUR == 0) return(1);
2431     if (CUR != '<') return(0);
2432     if (ctxt->name == NULL)
2433 	return(1);
2434     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2435 	return(1);
2436     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2437 	return(1);
2438 
2439     /* Only strip CDATA children of the body tag for strict HTML DTDs */
2440     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2441         dtd = xmlGetIntSubset(ctxt->myDoc);
2442         if (dtd != NULL && dtd->ExternalID != NULL) {
2443             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2444                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2445                 return(1);
2446         }
2447     }
2448 
2449     if (ctxt->node == NULL) return(0);
2450     lastChild = xmlGetLastChild(ctxt->node);
2451     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2452 	lastChild = lastChild->prev;
2453     if (lastChild == NULL) {
2454         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2455             (ctxt->node->content != NULL)) return(0);
2456 	/* keep ws in constructs like ...<b> </b>...
2457 	   for all tags "b" allowing PCDATA */
2458 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2459 	    if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2460 		return(0);
2461 	    }
2462 	}
2463     } else if (xmlNodeIsText(lastChild)) {
2464         return(0);
2465     } else {
2466 	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2467 	   for all tags "p" allowing PCDATA */
2468 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2469 	    if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2470 		return(0);
2471 	    }
2472 	}
2473     }
2474     return(1);
2475 }
2476 
2477 /**
2478  * htmlNewDocNoDtD:
2479  * @URI:  URI for the dtd, or NULL
2480  * @ExternalID:  the external ID of the DTD, or NULL
2481  *
2482  * Creates a new HTML document without a DTD node if @URI and @ExternalID
2483  * are NULL
2484  *
2485  * Returns a new document, do not initialize the DTD if not provided
2486  */
2487 htmlDocPtr
htmlNewDocNoDtD(const xmlChar * URI,const xmlChar * ExternalID)2488 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2489     xmlDocPtr cur;
2490 
2491     /*
2492      * Allocate a new document and fill the fields.
2493      */
2494     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2495     if (cur == NULL) {
2496 	htmlErrMemory(NULL, "HTML document creation failed\n");
2497 	return(NULL);
2498     }
2499     memset(cur, 0, sizeof(xmlDoc));
2500 
2501     cur->type = XML_HTML_DOCUMENT_NODE;
2502     cur->version = NULL;
2503     cur->intSubset = NULL;
2504     cur->doc = cur;
2505     cur->name = NULL;
2506     cur->children = NULL;
2507     cur->extSubset = NULL;
2508     cur->oldNs = NULL;
2509     cur->encoding = NULL;
2510     cur->standalone = 1;
2511     cur->compression = 0;
2512     cur->ids = NULL;
2513     cur->refs = NULL;
2514     cur->_private = NULL;
2515     cur->charset = XML_CHAR_ENCODING_UTF8;
2516     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2517     if ((ExternalID != NULL) ||
2518 	(URI != NULL))
2519 	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2520     return(cur);
2521 }
2522 
2523 /**
2524  * htmlNewDoc:
2525  * @URI:  URI for the dtd, or NULL
2526  * @ExternalID:  the external ID of the DTD, or NULL
2527  *
2528  * Creates a new HTML document
2529  *
2530  * Returns a new document
2531  */
2532 htmlDocPtr
htmlNewDoc(const xmlChar * URI,const xmlChar * ExternalID)2533 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2534     if ((URI == NULL) && (ExternalID == NULL))
2535 	return(htmlNewDocNoDtD(
2536 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2537 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2538 
2539     return(htmlNewDocNoDtD(URI, ExternalID));
2540 }
2541 
2542 
2543 /************************************************************************
2544  *									*
2545  *			The parser itself				*
2546  *	Relates to http://www.w3.org/TR/html40				*
2547  *									*
2548  ************************************************************************/
2549 
2550 /************************************************************************
2551  *									*
2552  *			The parser itself				*
2553  *									*
2554  ************************************************************************/
2555 
2556 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2557 
2558 /**
2559  * htmlParseHTMLName:
2560  * @ctxt:  an HTML parser context
2561  *
2562  * parse an HTML tag or attribute name, note that we convert it to lowercase
2563  * since HTML names are not case-sensitive.
2564  *
2565  * Returns the Tag Name parsed or NULL
2566  */
2567 
2568 static const xmlChar *
htmlParseHTMLName(htmlParserCtxtPtr ctxt)2569 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2570     int i = 0;
2571     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2572 
2573     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2574         (CUR != ':') && (CUR != '.')) return(NULL);
2575 
2576     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2577            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2578 	   (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2579            (CUR == '.'))) {
2580 	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2581         else loc[i] = CUR;
2582 	i++;
2583 
2584 	NEXT;
2585     }
2586 
2587     return(xmlDictLookup(ctxt->dict, loc, i));
2588 }
2589 
2590 
2591 /**
2592  * htmlParseHTMLName_nonInvasive:
2593  * @ctxt:  an HTML parser context
2594  *
2595  * parse an HTML tag or attribute name, note that we convert it to lowercase
2596  * since HTML names are not case-sensitive, this doesn't consume the data
2597  * from the stream, it's a look-ahead
2598  *
2599  * Returns the Tag Name parsed or NULL
2600  */
2601 
2602 static const xmlChar *
htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt)2603 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2604     int i = 0;
2605     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2606 
2607     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2608         (NXT(1) != ':')) return(NULL);
2609 
2610     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2611            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2612 	   (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2613 	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2614         else loc[i] = NXT(1+i);
2615 	i++;
2616     }
2617 
2618     return(xmlDictLookup(ctxt->dict, loc, i));
2619 }
2620 
2621 
2622 /**
2623  * htmlParseName:
2624  * @ctxt:  an HTML parser context
2625  *
2626  * parse an HTML name, this routine is case sensitive.
2627  *
2628  * Returns the Name parsed or NULL
2629  */
2630 
2631 static const xmlChar *
htmlParseName(htmlParserCtxtPtr ctxt)2632 htmlParseName(htmlParserCtxtPtr ctxt) {
2633     const xmlChar *in;
2634     const xmlChar *ret;
2635     int count = 0;
2636 
2637     GROW;
2638 
2639     /*
2640      * Accelerator for simple ASCII names
2641      */
2642     in = ctxt->input->cur;
2643     if (((*in >= 0x61) && (*in <= 0x7A)) ||
2644 	((*in >= 0x41) && (*in <= 0x5A)) ||
2645 	(*in == '_') || (*in == ':')) {
2646 	in++;
2647 	while (((*in >= 0x61) && (*in <= 0x7A)) ||
2648 	       ((*in >= 0x41) && (*in <= 0x5A)) ||
2649 	       ((*in >= 0x30) && (*in <= 0x39)) ||
2650 	       (*in == '_') || (*in == '-') ||
2651 	       (*in == ':') || (*in == '.'))
2652 	    in++;
2653 
2654 	if (in == ctxt->input->end)
2655 	    return(NULL);
2656 
2657 	if ((*in > 0) && (*in < 0x80)) {
2658 	    count = in - ctxt->input->cur;
2659 	    ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2660 	    ctxt->input->cur = in;
2661 	    ctxt->input->col += count;
2662 	    return(ret);
2663 	}
2664     }
2665     return(htmlParseNameComplex(ctxt));
2666 }
2667 
2668 static const xmlChar *
htmlParseNameComplex(xmlParserCtxtPtr ctxt)2669 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2670     int len = 0, l;
2671     int c;
2672     int count = 0;
2673     const xmlChar *base = ctxt->input->base;
2674 
2675     /*
2676      * Handler for more complex cases
2677      */
2678     GROW;
2679     c = CUR_CHAR(l);
2680     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2681 	(!IS_LETTER(c) && (c != '_') &&
2682          (c != ':'))) {
2683 	return(NULL);
2684     }
2685 
2686     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2687 	   ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2688             (c == '.') || (c == '-') ||
2689 	    (c == '_') || (c == ':') ||
2690 	    (IS_COMBINING(c)) ||
2691 	    (IS_EXTENDER(c)))) {
2692 	if (count++ > 100) {
2693 	    count = 0;
2694 	    GROW;
2695 	}
2696 	len += l;
2697 	NEXTL(l);
2698 	c = CUR_CHAR(l);
2699 	if (ctxt->input->base != base) {
2700 	    /*
2701 	     * We changed encoding from an unknown encoding
2702 	     * Input buffer changed location, so we better start again
2703 	     */
2704 	    return(htmlParseNameComplex(ctxt));
2705 	}
2706     }
2707 
2708     if (ctxt->input->cur - ctxt->input->base < len) {
2709         /* Sanity check */
2710 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2711                      "unexpected change of input buffer", NULL, NULL);
2712         return (NULL);
2713     }
2714 
2715     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2716 }
2717 
2718 
2719 /**
2720  * htmlParseHTMLAttribute:
2721  * @ctxt:  an HTML parser context
2722  * @stop:  a char stop value
2723  *
2724  * parse an HTML attribute value till the stop (quote), if
2725  * stop is 0 then it stops at the first space
2726  *
2727  * Returns the attribute parsed or NULL
2728  */
2729 
2730 static xmlChar *
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt,const xmlChar stop)2731 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2732     xmlChar *buffer = NULL;
2733     int buffer_size = 0;
2734     xmlChar *out = NULL;
2735     const xmlChar *name = NULL;
2736     const xmlChar *cur = NULL;
2737     const htmlEntityDesc * ent;
2738 
2739     /*
2740      * allocate a translation buffer.
2741      */
2742     buffer_size = HTML_PARSER_BUFFER_SIZE;
2743     buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2744     if (buffer == NULL) {
2745 	htmlErrMemory(ctxt, "buffer allocation failed\n");
2746 	return(NULL);
2747     }
2748     out = buffer;
2749 
2750     /*
2751      * Ok loop until we reach one of the ending chars
2752      */
2753     while ((CUR != 0) && (CUR != stop)) {
2754 	if ((stop == 0) && (CUR == '>')) break;
2755 	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2756         if (CUR == '&') {
2757 	    if (NXT(1) == '#') {
2758 		unsigned int c;
2759 		int bits;
2760 
2761 		c = htmlParseCharRef(ctxt);
2762 		if      (c <    0x80)
2763 		        { *out++  = c;                bits= -6; }
2764 		else if (c <   0x800)
2765 		        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2766 		else if (c < 0x10000)
2767 		        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2768 		else
2769 		        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2770 
2771 		for ( ; bits >= 0; bits-= 6) {
2772 		    *out++  = ((c >> bits) & 0x3F) | 0x80;
2773 		}
2774 
2775 		if (out - buffer > buffer_size - 100) {
2776 			int indx = out - buffer;
2777 
2778 			growBuffer(buffer);
2779 			out = &buffer[indx];
2780 		}
2781 	    } else {
2782 		ent = htmlParseEntityRef(ctxt, &name);
2783 		if (name == NULL) {
2784 		    *out++ = '&';
2785 		    if (out - buffer > buffer_size - 100) {
2786 			int indx = out - buffer;
2787 
2788 			growBuffer(buffer);
2789 			out = &buffer[indx];
2790 		    }
2791 		} else if (ent == NULL) {
2792 		    *out++ = '&';
2793 		    cur = name;
2794 		    while (*cur != 0) {
2795 			if (out - buffer > buffer_size - 100) {
2796 			    int indx = out - buffer;
2797 
2798 			    growBuffer(buffer);
2799 			    out = &buffer[indx];
2800 			}
2801 			*out++ = *cur++;
2802 		    }
2803 		} else {
2804 		    unsigned int c;
2805 		    int bits;
2806 
2807 		    if (out - buffer > buffer_size - 100) {
2808 			int indx = out - buffer;
2809 
2810 			growBuffer(buffer);
2811 			out = &buffer[indx];
2812 		    }
2813 		    c = ent->value;
2814 		    if      (c <    0x80)
2815 			{ *out++  = c;                bits= -6; }
2816 		    else if (c <   0x800)
2817 			{ *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2818 		    else if (c < 0x10000)
2819 			{ *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2820 		    else
2821 			{ *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2822 
2823 		    for ( ; bits >= 0; bits-= 6) {
2824 			*out++  = ((c >> bits) & 0x3F) | 0x80;
2825 		    }
2826 		}
2827 	    }
2828 	} else {
2829 	    unsigned int c;
2830 	    int bits, l;
2831 
2832 	    if (out - buffer > buffer_size - 100) {
2833 		int indx = out - buffer;
2834 
2835 		growBuffer(buffer);
2836 		out = &buffer[indx];
2837 	    }
2838 	    c = CUR_CHAR(l);
2839 	    if      (c <    0x80)
2840 		    { *out++  = c;                bits= -6; }
2841 	    else if (c <   0x800)
2842 		    { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2843 	    else if (c < 0x10000)
2844 		    { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2845 	    else
2846 		    { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2847 
2848 	    for ( ; bits >= 0; bits-= 6) {
2849 		*out++  = ((c >> bits) & 0x3F) | 0x80;
2850 	    }
2851 	    NEXT;
2852 	}
2853     }
2854     *out = 0;
2855     return(buffer);
2856 }
2857 
2858 /**
2859  * htmlParseEntityRef:
2860  * @ctxt:  an HTML parser context
2861  * @str:  location to store the entity name
2862  *
2863  * parse an HTML ENTITY references
2864  *
2865  * [68] EntityRef ::= '&' Name ';'
2866  *
2867  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2868  *         if non-NULL *str will have to be freed by the caller.
2869  */
2870 const htmlEntityDesc *
htmlParseEntityRef(htmlParserCtxtPtr ctxt,const xmlChar ** str)2871 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2872     const xmlChar *name;
2873     const htmlEntityDesc * ent = NULL;
2874 
2875     if (str != NULL) *str = NULL;
2876     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2877 
2878     if (CUR == '&') {
2879         NEXT;
2880         name = htmlParseName(ctxt);
2881 	if (name == NULL) {
2882 	    htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2883 	                 "htmlParseEntityRef: no name\n", NULL, NULL);
2884 	} else {
2885 	    GROW;
2886 	    if (CUR == ';') {
2887 	        if (str != NULL)
2888 		    *str = name;
2889 
2890 		/*
2891 		 * Lookup the entity in the table.
2892 		 */
2893 		ent = htmlEntityLookup(name);
2894 		if (ent != NULL) /* OK that's ugly !!! */
2895 		    NEXT;
2896 	    } else {
2897 		htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2898 		             "htmlParseEntityRef: expecting ';'\n",
2899 			     NULL, NULL);
2900 	        if (str != NULL)
2901 		    *str = name;
2902 	    }
2903 	}
2904     }
2905     return(ent);
2906 }
2907 
2908 /**
2909  * htmlParseAttValue:
2910  * @ctxt:  an HTML parser context
2911  *
2912  * parse a value for an attribute
2913  * Note: the parser won't do substitution of entities here, this
2914  * will be handled later in xmlStringGetNodeList, unless it was
2915  * asked for ctxt->replaceEntities != 0
2916  *
2917  * Returns the AttValue parsed or NULL.
2918  */
2919 
2920 static xmlChar *
htmlParseAttValue(htmlParserCtxtPtr ctxt)2921 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2922     xmlChar *ret = NULL;
2923 
2924     if (CUR == '"') {
2925         NEXT;
2926 	ret = htmlParseHTMLAttribute(ctxt, '"');
2927         if (CUR != '"') {
2928 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2929 	                 "AttValue: \" expected\n", NULL, NULL);
2930 	} else
2931 	    NEXT;
2932     } else if (CUR == '\'') {
2933         NEXT;
2934 	ret = htmlParseHTMLAttribute(ctxt, '\'');
2935         if (CUR != '\'') {
2936 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2937 	                 "AttValue: ' expected\n", NULL, NULL);
2938 	} else
2939 	    NEXT;
2940     } else {
2941         /*
2942 	 * That's an HTMLism, the attribute value may not be quoted
2943 	 */
2944 	ret = htmlParseHTMLAttribute(ctxt, 0);
2945 	if (ret == NULL) {
2946 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2947 	                 "AttValue: no value found\n", NULL, NULL);
2948 	}
2949     }
2950     return(ret);
2951 }
2952 
2953 /**
2954  * htmlParseSystemLiteral:
2955  * @ctxt:  an HTML parser context
2956  *
2957  * parse an HTML Literal
2958  *
2959  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2960  *
2961  * Returns the SystemLiteral parsed or NULL
2962  */
2963 
2964 static xmlChar *
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt)2965 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2966     size_t len = 0, startPosition = 0;
2967     int err = 0;
2968     int quote;
2969     xmlChar *ret = NULL;
2970 
2971     if ((CUR != '"') && (CUR != '\'')) {
2972 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2973 	             "SystemLiteral \" or ' expected\n", NULL, NULL);
2974         return(NULL);
2975     }
2976     quote = CUR;
2977     NEXT;
2978 
2979     if (CUR_PTR < BASE_PTR)
2980         return(ret);
2981     startPosition = CUR_PTR - BASE_PTR;
2982 
2983     while ((CUR != 0) && (CUR != quote)) {
2984         /* TODO: Handle UTF-8 */
2985         if (!IS_CHAR_CH(CUR)) {
2986             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2987                             "Invalid char in SystemLiteral 0x%X\n", CUR);
2988             err = 1;
2989         }
2990         NEXT;
2991         len++;
2992     }
2993     if (CUR != quote) {
2994         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2995                      "Unfinished SystemLiteral\n", NULL, NULL);
2996     } else {
2997         NEXT;
2998         if (err == 0)
2999             ret = xmlStrndup((BASE_PTR+startPosition), len);
3000     }
3001 
3002     return(ret);
3003 }
3004 
3005 /**
3006  * htmlParsePubidLiteral:
3007  * @ctxt:  an HTML parser context
3008  *
3009  * parse an HTML public literal
3010  *
3011  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
3012  *
3013  * Returns the PubidLiteral parsed or NULL.
3014  */
3015 
3016 static xmlChar *
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt)3017 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
3018     size_t len = 0, startPosition = 0;
3019     int err = 0;
3020     int quote;
3021     xmlChar *ret = NULL;
3022 
3023     if ((CUR != '"') && (CUR != '\'')) {
3024 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
3025 	             "PubidLiteral \" or ' expected\n", NULL, NULL);
3026         return(NULL);
3027     }
3028     quote = CUR;
3029     NEXT;
3030 
3031     /*
3032      * Name ::= (Letter | '_') (NameChar)*
3033      */
3034     if (CUR_PTR < BASE_PTR)
3035         return(ret);
3036     startPosition = CUR_PTR - BASE_PTR;
3037 
3038     while ((CUR != 0) && (CUR != quote)) {
3039         if (!IS_PUBIDCHAR_CH(CUR)) {
3040             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3041                             "Invalid char in PubidLiteral 0x%X\n", CUR);
3042             err = 1;
3043         }
3044         len++;
3045         NEXT;
3046     }
3047 
3048     if (CUR != '"') {
3049         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3050                      "Unfinished PubidLiteral\n", NULL, NULL);
3051     } else {
3052         NEXT;
3053         if (err == 0)
3054             ret = xmlStrndup((BASE_PTR + startPosition), len);
3055     }
3056 
3057     return(ret);
3058 }
3059 
3060 /**
3061  * htmlParseScript:
3062  * @ctxt:  an HTML parser context
3063  *
3064  * parse the content of an HTML SCRIPT or STYLE element
3065  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
3066  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
3067  * http://www.w3.org/TR/html4/types.html#type-script
3068  * http://www.w3.org/TR/html4/types.html#h-6.15
3069  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
3070  *
3071  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
3072  * element and the value of intrinsic event attributes. User agents must
3073  * not evaluate script data as HTML markup but instead must pass it on as
3074  * data to a script engine.
3075  * NOTES:
3076  * - The content is passed like CDATA
3077  * - the attributes for style and scripting "onXXX" are also described
3078  *   as CDATA but SGML allows entities references in attributes so their
3079  *   processing is identical as other attributes
3080  */
3081 static void
htmlParseScript(htmlParserCtxtPtr ctxt)3082 htmlParseScript(htmlParserCtxtPtr ctxt) {
3083     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3084     int nbchar = 0;
3085     int cur,l;
3086 
3087     SHRINK;
3088     cur = CUR_CHAR(l);
3089     while (cur != 0) {
3090 	if ((cur == '<') && (NXT(1) == '/')) {
3091             /*
3092              * One should break here, the specification is clear:
3093              * Authors should therefore escape "</" within the content.
3094              * Escape mechanisms are specific to each scripting or
3095              * style sheet language.
3096              *
3097              * In recovery mode, only break if end tag match the
3098              * current tag, effectively ignoring all tags inside the
3099              * script/style block and treating the entire block as
3100              * CDATA.
3101              */
3102             if (ctxt->recovery) {
3103                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3104 				   xmlStrlen(ctxt->name)) == 0)
3105                 {
3106                     break; /* while */
3107                 } else {
3108 		    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3109 				 "Element %s embeds close tag\n",
3110 		                 ctxt->name, NULL);
3111 		}
3112             } else {
3113                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3114                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3115                 {
3116                     break; /* while */
3117                 }
3118             }
3119 	}
3120         if (IS_CHAR(cur)) {
3121 	    COPY_BUF(l,buf,nbchar,cur);
3122         } else {
3123             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3124                             "Invalid char in CDATA 0x%X\n", cur);
3125         }
3126 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3127             buf[nbchar] = 0;
3128 	    if (ctxt->sax->cdataBlock!= NULL) {
3129 		/*
3130 		 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3131 		 */
3132 		ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3133 	    } else if (ctxt->sax->characters != NULL) {
3134 		ctxt->sax->characters(ctxt->userData, buf, nbchar);
3135 	    }
3136 	    nbchar = 0;
3137 	}
3138 	GROW;
3139 	NEXTL(l);
3140 	cur = CUR_CHAR(l);
3141     }
3142 
3143     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3144         buf[nbchar] = 0;
3145 	if (ctxt->sax->cdataBlock!= NULL) {
3146 	    /*
3147 	     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3148 	     */
3149 	    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3150 	} else if (ctxt->sax->characters != NULL) {
3151 	    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3152 	}
3153     }
3154 }
3155 
3156 
3157 /**
3158  * htmlParseCharDataInternal:
3159  * @ctxt:  an HTML parser context
3160  * @readahead: optional read ahead character in ascii range
3161  *
3162  * parse a CharData section.
3163  * if we are within a CDATA section ']]>' marks an end of section.
3164  *
3165  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3166  */
3167 
3168 static void
htmlParseCharDataInternal(htmlParserCtxtPtr ctxt,int readahead)3169 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3170     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3171     int nbchar = 0;
3172     int cur, l;
3173     int chunk = 0;
3174 
3175     if (readahead)
3176         buf[nbchar++] = readahead;
3177 
3178     SHRINK;
3179     cur = CUR_CHAR(l);
3180     while (((cur != '<') || (ctxt->token == '<')) &&
3181            ((cur != '&') || (ctxt->token == '&')) &&
3182 	   (cur != 0)) {
3183 	if (!(IS_CHAR(cur))) {
3184 	    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3185 	                "Invalid char in CDATA 0x%X\n", cur);
3186 	} else {
3187 	    COPY_BUF(l,buf,nbchar,cur);
3188 	}
3189 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3190             buf[nbchar] = 0;
3191 
3192 	    /*
3193 	     * Ok the segment is to be consumed as chars.
3194 	     */
3195 	    if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3196 		if (areBlanks(ctxt, buf, nbchar)) {
3197 		    if (ctxt->keepBlanks) {
3198 			if (ctxt->sax->characters != NULL)
3199 			    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3200 		    } else {
3201 			if (ctxt->sax->ignorableWhitespace != NULL)
3202 			    ctxt->sax->ignorableWhitespace(ctxt->userData,
3203 			                                   buf, nbchar);
3204 		    }
3205 		} else {
3206 		    htmlCheckParagraph(ctxt);
3207 		    if (ctxt->sax->characters != NULL)
3208 			ctxt->sax->characters(ctxt->userData, buf, nbchar);
3209 		}
3210 	    }
3211 	    nbchar = 0;
3212 	}
3213 	NEXTL(l);
3214         chunk++;
3215         if (chunk > HTML_PARSER_BUFFER_SIZE) {
3216             chunk = 0;
3217             SHRINK;
3218             GROW;
3219         }
3220 	cur = CUR_CHAR(l);
3221 	if (cur == 0) {
3222 	    SHRINK;
3223 	    GROW;
3224 	    cur = CUR_CHAR(l);
3225 	}
3226     }
3227     if (nbchar != 0) {
3228         buf[nbchar] = 0;
3229 
3230 	/*
3231 	 * Ok the segment is to be consumed as chars.
3232 	 */
3233 	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3234 	    if (areBlanks(ctxt, buf, nbchar)) {
3235 		if (ctxt->keepBlanks) {
3236 		    if (ctxt->sax->characters != NULL)
3237 			ctxt->sax->characters(ctxt->userData, buf, nbchar);
3238 		} else {
3239 		    if (ctxt->sax->ignorableWhitespace != NULL)
3240 			ctxt->sax->ignorableWhitespace(ctxt->userData,
3241 			                               buf, nbchar);
3242 		}
3243 	    } else {
3244 		htmlCheckParagraph(ctxt);
3245 		if (ctxt->sax->characters != NULL)
3246 		    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3247 	    }
3248 	}
3249     } else {
3250 	/*
3251 	 * Loop detection
3252 	 */
3253 	if (cur == 0)
3254 	    ctxt->instate = XML_PARSER_EOF;
3255     }
3256 }
3257 
3258 /**
3259  * htmlParseCharData:
3260  * @ctxt:  an HTML parser context
3261  *
3262  * parse a CharData section.
3263  * if we are within a CDATA section ']]>' marks an end of section.
3264  *
3265  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3266  */
3267 
3268 static void
htmlParseCharData(htmlParserCtxtPtr ctxt)3269 htmlParseCharData(htmlParserCtxtPtr ctxt) {
3270     htmlParseCharDataInternal(ctxt, 0);
3271 }
3272 
3273 /**
3274  * htmlParseExternalID:
3275  * @ctxt:  an HTML parser context
3276  * @publicID:  a xmlChar** receiving PubidLiteral
3277  *
3278  * Parse an External ID or a Public ID
3279  *
3280  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3281  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3282  *
3283  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3284  *
3285  * Returns the function returns SystemLiteral and in the second
3286  *                case publicID receives PubidLiteral, is strict is off
3287  *                it is possible to return NULL and have publicID set.
3288  */
3289 
3290 static xmlChar *
htmlParseExternalID(htmlParserCtxtPtr ctxt,xmlChar ** publicID)3291 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3292     xmlChar *URI = NULL;
3293 
3294     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3295          (UPP(2) == 'S') && (UPP(3) == 'T') &&
3296 	 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3297         SKIP(6);
3298 	if (!IS_BLANK_CH(CUR)) {
3299 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3300 	                 "Space required after 'SYSTEM'\n", NULL, NULL);
3301 	}
3302         SKIP_BLANKS;
3303 	URI = htmlParseSystemLiteral(ctxt);
3304 	if (URI == NULL) {
3305 	    htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3306 	                 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3307         }
3308     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3309 	       (UPP(2) == 'B') && (UPP(3) == 'L') &&
3310 	       (UPP(4) == 'I') && (UPP(5) == 'C')) {
3311         SKIP(6);
3312 	if (!IS_BLANK_CH(CUR)) {
3313 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3314 	                 "Space required after 'PUBLIC'\n", NULL, NULL);
3315 	}
3316         SKIP_BLANKS;
3317 	*publicID = htmlParsePubidLiteral(ctxt);
3318 	if (*publicID == NULL) {
3319 	    htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3320 	                 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3321 			 NULL, NULL);
3322 	}
3323         SKIP_BLANKS;
3324         if ((CUR == '"') || (CUR == '\'')) {
3325 	    URI = htmlParseSystemLiteral(ctxt);
3326 	}
3327     }
3328     return(URI);
3329 }
3330 
3331 /**
3332  * htmlParsePI:
3333  * @ctxt:  an HTML parser context
3334  *
3335  * parse an XML Processing Instruction.
3336  *
3337  * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3338  */
3339 static void
htmlParsePI(htmlParserCtxtPtr ctxt)3340 htmlParsePI(htmlParserCtxtPtr ctxt) {
3341     xmlChar *buf = NULL;
3342     int len = 0;
3343     int size = HTML_PARSER_BUFFER_SIZE;
3344     int cur, l;
3345     const xmlChar *target;
3346     xmlParserInputState state;
3347     int count = 0;
3348 
3349     if ((RAW == '<') && (NXT(1) == '?')) {
3350 	state = ctxt->instate;
3351         ctxt->instate = XML_PARSER_PI;
3352 	/*
3353 	 * this is a Processing Instruction.
3354 	 */
3355 	SKIP(2);
3356 	SHRINK;
3357 
3358 	/*
3359 	 * Parse the target name and check for special support like
3360 	 * namespace.
3361 	 */
3362         target = htmlParseName(ctxt);
3363 	if (target != NULL) {
3364 	    if (RAW == '>') {
3365 		SKIP(1);
3366 
3367 		/*
3368 		 * SAX: PI detected.
3369 		 */
3370 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
3371 		    (ctxt->sax->processingInstruction != NULL))
3372 		    ctxt->sax->processingInstruction(ctxt->userData,
3373 		                                     target, NULL);
3374 		ctxt->instate = state;
3375 		return;
3376 	    }
3377 	    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3378 	    if (buf == NULL) {
3379 		htmlErrMemory(ctxt, NULL);
3380 		ctxt->instate = state;
3381 		return;
3382 	    }
3383 	    cur = CUR;
3384 	    if (!IS_BLANK(cur)) {
3385 		htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3386 			  "ParsePI: PI %s space expected\n", target, NULL);
3387 	    }
3388             SKIP_BLANKS;
3389 	    cur = CUR_CHAR(l);
3390 	    while ((cur != 0) && (cur != '>')) {
3391 		if (len + 5 >= size) {
3392 		    xmlChar *tmp;
3393 
3394 		    size *= 2;
3395 		    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3396 		    if (tmp == NULL) {
3397 			htmlErrMemory(ctxt, NULL);
3398 			xmlFree(buf);
3399 			ctxt->instate = state;
3400 			return;
3401 		    }
3402 		    buf = tmp;
3403 		}
3404 		count++;
3405 		if (count > 50) {
3406 		    GROW;
3407 		    count = 0;
3408 		}
3409                 if (IS_CHAR(cur)) {
3410 		    COPY_BUF(l,buf,len,cur);
3411                 } else {
3412                     htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3413                                     "Invalid char in processing instruction "
3414                                     "0x%X\n", cur);
3415                 }
3416 		NEXTL(l);
3417 		cur = CUR_CHAR(l);
3418 		if (cur == 0) {
3419 		    SHRINK;
3420 		    GROW;
3421 		    cur = CUR_CHAR(l);
3422 		}
3423 	    }
3424 	    buf[len] = 0;
3425 	    if (cur != '>') {
3426 		htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3427 		      "ParsePI: PI %s never end ...\n", target, NULL);
3428 	    } else {
3429 		SKIP(1);
3430 
3431 		/*
3432 		 * SAX: PI detected.
3433 		 */
3434 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
3435 		    (ctxt->sax->processingInstruction != NULL))
3436 		    ctxt->sax->processingInstruction(ctxt->userData,
3437 		                                     target, buf);
3438 	    }
3439 	    xmlFree(buf);
3440 	} else {
3441 	    htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3442                          "PI is not started correctly", NULL, NULL);
3443 	}
3444 	ctxt->instate = state;
3445     }
3446 }
3447 
3448 /**
3449  * htmlParseComment:
3450  * @ctxt:  an HTML parser context
3451  *
3452  * Parse an XML (SGML) comment <!-- .... -->
3453  *
3454  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3455  */
3456 static void
htmlParseComment(htmlParserCtxtPtr ctxt)3457 htmlParseComment(htmlParserCtxtPtr ctxt) {
3458     xmlChar *buf = NULL;
3459     int len;
3460     int size = HTML_PARSER_BUFFER_SIZE;
3461     int q, ql;
3462     int r, rl;
3463     int cur, l;
3464     int next, nl;
3465     xmlParserInputState state;
3466 
3467     /*
3468      * Check that there is a comment right here.
3469      */
3470     if ((RAW != '<') || (NXT(1) != '!') ||
3471         (NXT(2) != '-') || (NXT(3) != '-')) return;
3472 
3473     state = ctxt->instate;
3474     ctxt->instate = XML_PARSER_COMMENT;
3475     SHRINK;
3476     SKIP(4);
3477     buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3478     if (buf == NULL) {
3479         htmlErrMemory(ctxt, "buffer allocation failed\n");
3480 	ctxt->instate = state;
3481 	return;
3482     }
3483     len = 0;
3484     buf[len] = 0;
3485     q = CUR_CHAR(ql);
3486     if (q == 0)
3487         goto unfinished;
3488     NEXTL(ql);
3489     r = CUR_CHAR(rl);
3490     if (r == 0)
3491         goto unfinished;
3492     NEXTL(rl);
3493     cur = CUR_CHAR(l);
3494     while ((cur != 0) &&
3495            ((cur != '>') ||
3496 	    (r != '-') || (q != '-'))) {
3497 	NEXTL(l);
3498 	next = CUR_CHAR(nl);
3499 	if (next == 0) {
3500 	    SHRINK;
3501 	    GROW;
3502 	    next = CUR_CHAR(nl);
3503 	}
3504 
3505 	if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3506 	  htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3507 		       "Comment incorrectly closed by '--!>'", NULL, NULL);
3508 	  cur = '>';
3509 	  break;
3510 	}
3511 
3512 	if (len + 5 >= size) {
3513 	    xmlChar *tmp;
3514 
3515 	    size *= 2;
3516 	    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3517 	    if (tmp == NULL) {
3518 	        xmlFree(buf);
3519 	        htmlErrMemory(ctxt, "growing buffer failed\n");
3520 		ctxt->instate = state;
3521 		return;
3522 	    }
3523 	    buf = tmp;
3524 	}
3525         if (IS_CHAR(q)) {
3526 	    COPY_BUF(ql,buf,len,q);
3527         } else {
3528             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3529                             "Invalid char in comment 0x%X\n", q);
3530         }
3531 
3532 	q = r;
3533 	ql = rl;
3534 	r = cur;
3535 	rl = l;
3536 	cur = next;
3537 	l = nl;
3538     }
3539     buf[len] = 0;
3540     if (cur == '>') {
3541         NEXT;
3542 	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3543 	    (!ctxt->disableSAX))
3544 	    ctxt->sax->comment(ctxt->userData, buf);
3545 	xmlFree(buf);
3546 	ctxt->instate = state;
3547 	return;
3548     }
3549 
3550 unfinished:
3551     htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3552 		 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3553     xmlFree(buf);
3554 }
3555 
3556 /**
3557  * htmlParseCharRef:
3558  * @ctxt:  an HTML parser context
3559  *
3560  * parse Reference declarations
3561  *
3562  * [66] CharRef ::= '&#' [0-9]+ ';' |
3563  *                  '&#x' [0-9a-fA-F]+ ';'
3564  *
3565  * Returns the value parsed (as an int)
3566  */
3567 int
htmlParseCharRef(htmlParserCtxtPtr ctxt)3568 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3569     int val = 0;
3570 
3571     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3572 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3573 		     "htmlParseCharRef: context error\n",
3574 		     NULL, NULL);
3575         return(0);
3576     }
3577     if ((CUR == '&') && (NXT(1) == '#') &&
3578         ((NXT(2) == 'x') || NXT(2) == 'X')) {
3579 	SKIP(3);
3580 	while (CUR != ';') {
3581 	    if ((CUR >= '0') && (CUR <= '9')) {
3582                 if (val < 0x110000)
3583 	            val = val * 16 + (CUR - '0');
3584             } else if ((CUR >= 'a') && (CUR <= 'f')) {
3585                 if (val < 0x110000)
3586 	            val = val * 16 + (CUR - 'a') + 10;
3587             } else if ((CUR >= 'A') && (CUR <= 'F')) {
3588                 if (val < 0x110000)
3589 	            val = val * 16 + (CUR - 'A') + 10;
3590             } else {
3591 	        htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3592 		             "htmlParseCharRef: missing semicolon\n",
3593 			     NULL, NULL);
3594 		break;
3595 	    }
3596 	    NEXT;
3597 	}
3598 	if (CUR == ';')
3599 	    NEXT;
3600     } else if  ((CUR == '&') && (NXT(1) == '#')) {
3601 	SKIP(2);
3602 	while (CUR != ';') {
3603 	    if ((CUR >= '0') && (CUR <= '9')) {
3604                 if (val < 0x110000)
3605 	            val = val * 10 + (CUR - '0');
3606             } else {
3607 	        htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3608 		             "htmlParseCharRef: missing semicolon\n",
3609 			     NULL, NULL);
3610 		break;
3611 	    }
3612 	    NEXT;
3613 	}
3614 	if (CUR == ';')
3615 	    NEXT;
3616     } else {
3617 	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3618 	             "htmlParseCharRef: invalid value\n", NULL, NULL);
3619     }
3620     /*
3621      * Check the value IS_CHAR ...
3622      */
3623     if (IS_CHAR(val)) {
3624         return(val);
3625     } else if (val >= 0x110000) {
3626 	htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3627 		     "htmlParseCharRef: value too large\n", NULL, NULL);
3628     } else {
3629 	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3630 			"htmlParseCharRef: invalid xmlChar value %d\n",
3631 			val);
3632     }
3633     return(0);
3634 }
3635 
3636 
3637 /**
3638  * htmlParseDocTypeDecl:
3639  * @ctxt:  an HTML parser context
3640  *
3641  * parse a DOCTYPE declaration
3642  *
3643  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3644  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3645  */
3646 
3647 static void
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt)3648 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3649     const xmlChar *name;
3650     xmlChar *ExternalID = NULL;
3651     xmlChar *URI = NULL;
3652 
3653     /*
3654      * We know that '<!DOCTYPE' has been detected.
3655      */
3656     SKIP(9);
3657 
3658     SKIP_BLANKS;
3659 
3660     /*
3661      * Parse the DOCTYPE name.
3662      */
3663     name = htmlParseName(ctxt);
3664     if (name == NULL) {
3665 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3666 	             "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3667 		     NULL, NULL);
3668     }
3669     /*
3670      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3671      */
3672 
3673     SKIP_BLANKS;
3674 
3675     /*
3676      * Check for SystemID and ExternalID
3677      */
3678     URI = htmlParseExternalID(ctxt, &ExternalID);
3679     SKIP_BLANKS;
3680 
3681     /*
3682      * We should be at the end of the DOCTYPE declaration.
3683      */
3684     if (CUR != '>') {
3685 	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3686 	             "DOCTYPE improperly terminated\n", NULL, NULL);
3687         /* Ignore bogus content */
3688         while ((CUR != 0) && (CUR != '>'))
3689             NEXT;
3690     }
3691     if (CUR == '>')
3692         NEXT;
3693 
3694     /*
3695      * Create or update the document accordingly to the DOCTYPE
3696      */
3697     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3698 	(!ctxt->disableSAX))
3699 	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3700 
3701     /*
3702      * Cleanup, since we don't use all those identifiers
3703      */
3704     if (URI != NULL) xmlFree(URI);
3705     if (ExternalID != NULL) xmlFree(ExternalID);
3706 }
3707 
3708 /**
3709  * htmlParseAttribute:
3710  * @ctxt:  an HTML parser context
3711  * @value:  a xmlChar ** used to store the value of the attribute
3712  *
3713  * parse an attribute
3714  *
3715  * [41] Attribute ::= Name Eq AttValue
3716  *
3717  * [25] Eq ::= S? '=' S?
3718  *
3719  * With namespace:
3720  *
3721  * [NS 11] Attribute ::= QName Eq AttValue
3722  *
3723  * Also the case QName == xmlns:??? is handled independently as a namespace
3724  * definition.
3725  *
3726  * Returns the attribute name, and the value in *value.
3727  */
3728 
3729 static const xmlChar *
htmlParseAttribute(htmlParserCtxtPtr ctxt,xmlChar ** value)3730 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3731     const xmlChar *name;
3732     xmlChar *val = NULL;
3733 
3734     *value = NULL;
3735     name = htmlParseHTMLName(ctxt);
3736     if (name == NULL) {
3737 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3738 	             "error parsing attribute name\n", NULL, NULL);
3739         return(NULL);
3740     }
3741 
3742     /*
3743      * read the value
3744      */
3745     SKIP_BLANKS;
3746     if (CUR == '=') {
3747         NEXT;
3748 	SKIP_BLANKS;
3749 	val = htmlParseAttValue(ctxt);
3750     }
3751 
3752     *value = val;
3753     return(name);
3754 }
3755 
3756 /**
3757  * htmlCheckEncodingDirect:
3758  * @ctxt:  an HTML parser context
3759  * @attvalue: the attribute value
3760  *
3761  * Checks an attribute value to detect
3762  * the encoding
3763  * If a new encoding is detected the parser is switched to decode
3764  * it and pass UTF8
3765  */
3766 static void
htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt,const xmlChar * encoding)3767 htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3768 
3769     if ((ctxt == NULL) || (encoding == NULL) ||
3770         (ctxt->options & HTML_PARSE_IGNORE_ENC))
3771 	return;
3772 
3773     /* do not change encoding */
3774     if (ctxt->input->encoding != NULL)
3775         return;
3776 
3777     if (encoding != NULL) {
3778 	xmlCharEncoding enc;
3779 	xmlCharEncodingHandlerPtr handler;
3780 
3781 	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3782 
3783 	if (ctxt->input->encoding != NULL)
3784 	    xmlFree((xmlChar *) ctxt->input->encoding);
3785 	ctxt->input->encoding = xmlStrdup(encoding);
3786 
3787 	enc = xmlParseCharEncoding((const char *) encoding);
3788 	/*
3789 	 * registered set of known encodings
3790 	 */
3791 	if (enc != XML_CHAR_ENCODING_ERROR) {
3792 	    if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3793 	         (enc == XML_CHAR_ENCODING_UTF16BE) ||
3794 		 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3795 		 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3796 		(ctxt->input->buf != NULL) &&
3797 		(ctxt->input->buf->encoder == NULL)) {
3798 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3799 		             "htmlCheckEncoding: wrong encoding meta\n",
3800 			     NULL, NULL);
3801 	    } else {
3802 		xmlSwitchEncoding(ctxt, enc);
3803 	    }
3804 	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
3805 	} else {
3806 	    /*
3807 	     * fallback for unknown encodings
3808 	     */
3809 	    handler = xmlFindCharEncodingHandler((const char *) encoding);
3810 	    if (handler != NULL) {
3811 		xmlSwitchToEncoding(ctxt, handler);
3812 		ctxt->charset = XML_CHAR_ENCODING_UTF8;
3813 	    } else {
3814 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3815 		             "htmlCheckEncoding: unknown encoding %s\n",
3816 			     encoding, NULL);
3817 	    }
3818 	}
3819 
3820 	if ((ctxt->input->buf != NULL) &&
3821 	    (ctxt->input->buf->encoder != NULL) &&
3822 	    (ctxt->input->buf->raw != NULL) &&
3823 	    (ctxt->input->buf->buffer != NULL)) {
3824 	    int nbchars;
3825 	    int processed;
3826 
3827 	    /*
3828 	     * convert as much as possible to the parser reading buffer.
3829 	     */
3830 	    processed = ctxt->input->cur - ctxt->input->base;
3831 	    xmlBufShrink(ctxt->input->buf->buffer, processed);
3832 	    nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3833             xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3834 	    if (nbchars < 0) {
3835 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3836 		             "htmlCheckEncoding: encoder error\n",
3837 			     NULL, NULL);
3838 	    }
3839 	}
3840     }
3841 }
3842 
3843 /**
3844  * htmlCheckEncoding:
3845  * @ctxt:  an HTML parser context
3846  * @attvalue: the attribute value
3847  *
3848  * Checks an http-equiv attribute from a Meta tag to detect
3849  * the encoding
3850  * If a new encoding is detected the parser is switched to decode
3851  * it and pass UTF8
3852  */
3853 static void
htmlCheckEncoding(htmlParserCtxtPtr ctxt,const xmlChar * attvalue)3854 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3855     const xmlChar *encoding;
3856 
3857     if (!attvalue)
3858 	return;
3859 
3860     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3861     if (encoding != NULL) {
3862 	encoding += 7;
3863     }
3864     /*
3865      * skip blank
3866      */
3867     if (encoding && IS_BLANK_CH(*encoding))
3868 	encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3869     if (encoding && *encoding == '=') {
3870 	encoding ++;
3871 	htmlCheckEncodingDirect(ctxt, encoding);
3872     }
3873 }
3874 
3875 /**
3876  * htmlCheckMeta:
3877  * @ctxt:  an HTML parser context
3878  * @atts:  the attributes values
3879  *
3880  * Checks an attributes from a Meta tag
3881  */
3882 static void
htmlCheckMeta(htmlParserCtxtPtr ctxt,const xmlChar ** atts)3883 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3884     int i;
3885     const xmlChar *att, *value;
3886     int http = 0;
3887     const xmlChar *content = NULL;
3888 
3889     if ((ctxt == NULL) || (atts == NULL))
3890 	return;
3891 
3892     i = 0;
3893     att = atts[i++];
3894     while (att != NULL) {
3895 	value = atts[i++];
3896 	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3897 	 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3898 	    http = 1;
3899 	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3900 	    htmlCheckEncodingDirect(ctxt, value);
3901 	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3902 	    content = value;
3903 	att = atts[i++];
3904     }
3905     if ((http) && (content != NULL))
3906 	htmlCheckEncoding(ctxt, content);
3907 
3908 }
3909 
3910 /**
3911  * htmlParseStartTag:
3912  * @ctxt:  an HTML parser context
3913  *
3914  * parse a start of tag either for rule element or
3915  * EmptyElement. In both case we don't parse the tag closing chars.
3916  *
3917  * [40] STag ::= '<' Name (S Attribute)* S? '>'
3918  *
3919  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3920  *
3921  * With namespace:
3922  *
3923  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3924  *
3925  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3926  *
3927  * Returns 0 in case of success, -1 in case of error and 1 if discarded
3928  */
3929 
3930 static int
htmlParseStartTag(htmlParserCtxtPtr ctxt)3931 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3932     const xmlChar *name;
3933     const xmlChar *attname;
3934     xmlChar *attvalue;
3935     const xmlChar **atts;
3936     int nbatts = 0;
3937     int maxatts;
3938     int meta = 0;
3939     int i;
3940     int discardtag = 0;
3941 
3942     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3943 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3944 		     "htmlParseStartTag: context error\n", NULL, NULL);
3945 	return -1;
3946     }
3947     if (ctxt->instate == XML_PARSER_EOF)
3948         return(-1);
3949     if (CUR != '<') return -1;
3950     NEXT;
3951 
3952     atts = ctxt->atts;
3953     maxatts = ctxt->maxatts;
3954 
3955     GROW;
3956     name = htmlParseHTMLName(ctxt);
3957     if (name == NULL) {
3958 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3959 	             "htmlParseStartTag: invalid element name\n",
3960 		     NULL, NULL);
3961 	/* if recover preserve text on classic misconstructs */
3962 	if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
3963 	    (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
3964 	    htmlParseCharDataInternal(ctxt, '<');
3965 	    return(-1);
3966 	}
3967 
3968 
3969 	/* Dump the bogus tag like browsers do */
3970 	while ((CUR != 0) && (CUR != '>') &&
3971                (ctxt->instate != XML_PARSER_EOF))
3972 	    NEXT;
3973         return -1;
3974     }
3975     if (xmlStrEqual(name, BAD_CAST"meta"))
3976 	meta = 1;
3977 
3978     /*
3979      * Check for auto-closure of HTML elements.
3980      */
3981     htmlAutoClose(ctxt, name);
3982 
3983     /*
3984      * Check for implied HTML elements.
3985      */
3986     htmlCheckImplied(ctxt, name);
3987 
3988     /*
3989      * Avoid html at any level > 0, head at any level != 1
3990      * or any attempt to recurse body
3991      */
3992     if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3993 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3994 	             "htmlParseStartTag: misplaced <html> tag\n",
3995 		     name, NULL);
3996 	discardtag = 1;
3997 	ctxt->depth++;
3998     }
3999     if ((ctxt->nameNr != 1) &&
4000 	(xmlStrEqual(name, BAD_CAST"head"))) {
4001 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4002 	             "htmlParseStartTag: misplaced <head> tag\n",
4003 		     name, NULL);
4004 	discardtag = 1;
4005 	ctxt->depth++;
4006     }
4007     if (xmlStrEqual(name, BAD_CAST"body")) {
4008 	int indx;
4009 	for (indx = 0;indx < ctxt->nameNr;indx++) {
4010 	    if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
4011 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4012 		             "htmlParseStartTag: misplaced <body> tag\n",
4013 			     name, NULL);
4014 		discardtag = 1;
4015 		ctxt->depth++;
4016 	    }
4017 	}
4018     }
4019 
4020     /*
4021      * Now parse the attributes, it ends up with the ending
4022      *
4023      * (S Attribute)* S?
4024      */
4025     SKIP_BLANKS;
4026     while ((CUR != 0) &&
4027            (CUR != '>') &&
4028 	   ((CUR != '/') || (NXT(1) != '>'))) {
4029 	GROW;
4030 	attname = htmlParseAttribute(ctxt, &attvalue);
4031         if (attname != NULL) {
4032 
4033 	    /*
4034 	     * Well formedness requires at most one declaration of an attribute
4035 	     */
4036 	    for (i = 0; i < nbatts;i += 2) {
4037 	        if (xmlStrEqual(atts[i], attname)) {
4038 		    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
4039 		                 "Attribute %s redefined\n", attname, NULL);
4040 		    if (attvalue != NULL)
4041 			xmlFree(attvalue);
4042 		    goto failed;
4043 		}
4044 	    }
4045 
4046 	    /*
4047 	     * Add the pair to atts
4048 	     */
4049 	    if (atts == NULL) {
4050 	        maxatts = 22; /* allow for 10 attrs by default */
4051 	        atts = (const xmlChar **)
4052 		       xmlMalloc(maxatts * sizeof(xmlChar *));
4053 		if (atts == NULL) {
4054 		    htmlErrMemory(ctxt, NULL);
4055 		    if (attvalue != NULL)
4056 			xmlFree(attvalue);
4057 		    goto failed;
4058 		}
4059 		ctxt->atts = atts;
4060 		ctxt->maxatts = maxatts;
4061 	    } else if (nbatts + 4 > maxatts) {
4062 	        const xmlChar **n;
4063 
4064 	        maxatts *= 2;
4065 	        n = (const xmlChar **) xmlRealloc((void *) atts,
4066 					     maxatts * sizeof(const xmlChar *));
4067 		if (n == NULL) {
4068 		    htmlErrMemory(ctxt, NULL);
4069 		    if (attvalue != NULL)
4070 			xmlFree(attvalue);
4071 		    goto failed;
4072 		}
4073 		atts = n;
4074 		ctxt->atts = atts;
4075 		ctxt->maxatts = maxatts;
4076 	    }
4077 	    atts[nbatts++] = attname;
4078 	    atts[nbatts++] = attvalue;
4079 	    atts[nbatts] = NULL;
4080 	    atts[nbatts + 1] = NULL;
4081 	}
4082 	else {
4083 	    if (attvalue != NULL)
4084 	        xmlFree(attvalue);
4085 	    /* Dump the bogus attribute string up to the next blank or
4086 	     * the end of the tag. */
4087 	    while ((CUR != 0) &&
4088 	           !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
4089 		   ((CUR != '/') || (NXT(1) != '>')))
4090 		NEXT;
4091 	}
4092 
4093 failed:
4094 	SKIP_BLANKS;
4095     }
4096 
4097     /*
4098      * Handle specific association to the META tag
4099      */
4100     if (meta && (nbatts != 0))
4101 	htmlCheckMeta(ctxt, atts);
4102 
4103     /*
4104      * SAX: Start of Element !
4105      */
4106     if (!discardtag) {
4107 	htmlnamePush(ctxt, name);
4108 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4109 	    if (nbatts != 0)
4110 		ctxt->sax->startElement(ctxt->userData, name, atts);
4111 	    else
4112 		ctxt->sax->startElement(ctxt->userData, name, NULL);
4113 	}
4114     }
4115 
4116     if (atts != NULL) {
4117         for (i = 1;i < nbatts;i += 2) {
4118 	    if (atts[i] != NULL)
4119 		xmlFree((xmlChar *) atts[i]);
4120 	}
4121     }
4122 
4123     return(discardtag);
4124 }
4125 
4126 /**
4127  * htmlParseEndTag:
4128  * @ctxt:  an HTML parser context
4129  *
4130  * parse an end of tag
4131  *
4132  * [42] ETag ::= '</' Name S? '>'
4133  *
4134  * With namespace
4135  *
4136  * [NS 9] ETag ::= '</' QName S? '>'
4137  *
4138  * Returns 1 if the current level should be closed.
4139  */
4140 
4141 static int
htmlParseEndTag(htmlParserCtxtPtr ctxt)4142 htmlParseEndTag(htmlParserCtxtPtr ctxt)
4143 {
4144     const xmlChar *name;
4145     const xmlChar *oldname;
4146     int i, ret;
4147 
4148     if ((CUR != '<') || (NXT(1) != '/')) {
4149         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
4150 	             "htmlParseEndTag: '</' not found\n", NULL, NULL);
4151         return (0);
4152     }
4153     SKIP(2);
4154 
4155     name = htmlParseHTMLName(ctxt);
4156     if (name == NULL)
4157         return (0);
4158     /*
4159      * We should definitely be at the ending "S? '>'" part
4160      */
4161     SKIP_BLANKS;
4162     if (CUR != '>') {
4163         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4164 	             "End tag : expected '>'\n", NULL, NULL);
4165         /* Skip to next '>' */
4166         while ((CUR != 0) && (CUR != '>'))
4167             NEXT;
4168     }
4169     if (CUR == '>')
4170         NEXT;
4171 
4172     /*
4173      * if we ignored misplaced tags in htmlParseStartTag don't pop them
4174      * out now.
4175      */
4176     if ((ctxt->depth > 0) &&
4177         (xmlStrEqual(name, BAD_CAST "html") ||
4178          xmlStrEqual(name, BAD_CAST "body") ||
4179 	 xmlStrEqual(name, BAD_CAST "head"))) {
4180 	ctxt->depth--;
4181 	return (0);
4182     }
4183 
4184     /*
4185      * If the name read is not one of the element in the parsing stack
4186      * then return, it's just an error.
4187      */
4188     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4189         if (xmlStrEqual(name, ctxt->nameTab[i]))
4190             break;
4191     }
4192     if (i < 0) {
4193         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4194 	             "Unexpected end tag : %s\n", name, NULL);
4195         return (0);
4196     }
4197 
4198 
4199     /*
4200      * Check for auto-closure of HTML elements.
4201      */
4202 
4203     htmlAutoCloseOnClose(ctxt, name);
4204 
4205     /*
4206      * Well formedness constraints, opening and closing must match.
4207      * With the exception that the autoclose may have popped stuff out
4208      * of the stack.
4209      */
4210     if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4211         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4212                      "Opening and ending tag mismatch: %s and %s\n",
4213                      name, ctxt->name);
4214     }
4215 
4216     /*
4217      * SAX: End of Tag
4218      */
4219     oldname = ctxt->name;
4220     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4221         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4222             ctxt->sax->endElement(ctxt->userData, name);
4223 	htmlNodeInfoPop(ctxt);
4224         htmlnamePop(ctxt);
4225         ret = 1;
4226     } else {
4227         ret = 0;
4228     }
4229 
4230     return (ret);
4231 }
4232 
4233 
4234 /**
4235  * htmlParseReference:
4236  * @ctxt:  an HTML parser context
4237  *
4238  * parse and handle entity references in content,
4239  * this will end-up in a call to character() since this is either a
4240  * CharRef, or a predefined entity.
4241  */
4242 static void
htmlParseReference(htmlParserCtxtPtr ctxt)4243 htmlParseReference(htmlParserCtxtPtr ctxt) {
4244     const htmlEntityDesc * ent;
4245     xmlChar out[6];
4246     const xmlChar *name;
4247     if (CUR != '&') return;
4248 
4249     if (NXT(1) == '#') {
4250 	unsigned int c;
4251 	int bits, i = 0;
4252 
4253 	c = htmlParseCharRef(ctxt);
4254 	if (c == 0)
4255 	    return;
4256 
4257         if      (c <    0x80) { out[i++]= c;                bits= -6; }
4258         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4259         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4260         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4261 
4262         for ( ; bits >= 0; bits-= 6) {
4263             out[i++]= ((c >> bits) & 0x3F) | 0x80;
4264         }
4265 	out[i] = 0;
4266 
4267 	htmlCheckParagraph(ctxt);
4268 	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4269 	    ctxt->sax->characters(ctxt->userData, out, i);
4270     } else {
4271 	ent = htmlParseEntityRef(ctxt, &name);
4272 	if (name == NULL) {
4273 	    htmlCheckParagraph(ctxt);
4274 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4275 	        ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4276 	    return;
4277 	}
4278 	if ((ent == NULL) || !(ent->value > 0)) {
4279 	    htmlCheckParagraph(ctxt);
4280 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4281 		ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4282 		ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4283 		/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4284 	    }
4285 	} else {
4286 	    unsigned int c;
4287 	    int bits, i = 0;
4288 
4289 	    c = ent->value;
4290 	    if      (c <    0x80)
4291 	            { out[i++]= c;                bits= -6; }
4292 	    else if (c <   0x800)
4293 	            { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4294 	    else if (c < 0x10000)
4295 	            { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4296 	    else
4297 	            { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4298 
4299 	    for ( ; bits >= 0; bits-= 6) {
4300 		out[i++]= ((c >> bits) & 0x3F) | 0x80;
4301 	    }
4302 	    out[i] = 0;
4303 
4304 	    htmlCheckParagraph(ctxt);
4305 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4306 		ctxt->sax->characters(ctxt->userData, out, i);
4307 	}
4308     }
4309 }
4310 
4311 /**
4312  * htmlParseContent:
4313  * @ctxt:  an HTML parser context
4314  *
4315  * Parse a content: comment, sub-element, reference or text.
4316  * Kept for compatibility with old code
4317  */
4318 
4319 static void
htmlParseContent(htmlParserCtxtPtr ctxt)4320 htmlParseContent(htmlParserCtxtPtr ctxt) {
4321     xmlChar *currentNode;
4322     int depth;
4323     const xmlChar *name;
4324 
4325     currentNode = xmlStrdup(ctxt->name);
4326     depth = ctxt->nameNr;
4327     while (1) {
4328         GROW;
4329 
4330         if (ctxt->instate == XML_PARSER_EOF)
4331             break;
4332 
4333 	/*
4334 	 * Our tag or one of it's parent or children is ending.
4335 	 */
4336         if ((CUR == '<') && (NXT(1) == '/')) {
4337 	    if (htmlParseEndTag(ctxt) &&
4338 		((currentNode != NULL) || (ctxt->nameNr == 0))) {
4339 		if (currentNode != NULL)
4340 		    xmlFree(currentNode);
4341 		return;
4342 	    }
4343 	    continue; /* while */
4344         }
4345 
4346 	else if ((CUR == '<') &&
4347 	         ((IS_ASCII_LETTER(NXT(1))) ||
4348 		  (NXT(1) == '_') || (NXT(1) == ':'))) {
4349 	    name = htmlParseHTMLName_nonInvasive(ctxt);
4350 	    if (name == NULL) {
4351 	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4352 			 "htmlParseStartTag: invalid element name\n",
4353 			 NULL, NULL);
4354 	        /* Dump the bogus tag like browsers do */
4355                 while ((CUR != 0) && (CUR != '>'))
4356 	            NEXT;
4357 
4358 	        if (currentNode != NULL)
4359 	            xmlFree(currentNode);
4360 	        return;
4361 	    }
4362 
4363 	    if (ctxt->name != NULL) {
4364 	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4365 	            htmlAutoClose(ctxt, name);
4366 	            continue;
4367 	        }
4368 	    }
4369 	}
4370 
4371 	/*
4372 	 * Has this node been popped out during parsing of
4373 	 * the next element
4374 	 */
4375         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4376 	    (!xmlStrEqual(currentNode, ctxt->name)))
4377 	     {
4378 	    if (currentNode != NULL) xmlFree(currentNode);
4379 	    return;
4380 	}
4381 
4382 	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4383 	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4384 	    /*
4385 	     * Handle SCRIPT/STYLE separately
4386 	     */
4387 	    htmlParseScript(ctxt);
4388 	} else {
4389 	    /*
4390 	     * Sometimes DOCTYPE arrives in the middle of the document
4391 	     */
4392 	    if ((CUR == '<') && (NXT(1) == '!') &&
4393 		(UPP(2) == 'D') && (UPP(3) == 'O') &&
4394 		(UPP(4) == 'C') && (UPP(5) == 'T') &&
4395 		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4396 		(UPP(8) == 'E')) {
4397 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4398 		             "Misplaced DOCTYPE declaration\n",
4399 			     BAD_CAST "DOCTYPE" , NULL);
4400 		htmlParseDocTypeDecl(ctxt);
4401 	    }
4402 
4403 	    /*
4404 	     * First case :  a comment
4405 	     */
4406 	    if ((CUR == '<') && (NXT(1) == '!') &&
4407 		(NXT(2) == '-') && (NXT(3) == '-')) {
4408 		htmlParseComment(ctxt);
4409 	    }
4410 
4411 	    /*
4412 	     * Second case : a Processing Instruction.
4413 	     */
4414 	    else if ((CUR == '<') && (NXT(1) == '?')) {
4415 		htmlParsePI(ctxt);
4416 	    }
4417 
4418 	    /*
4419 	     * Third case :  a sub-element.
4420 	     */
4421 	    else if (CUR == '<') {
4422 		htmlParseElement(ctxt);
4423 	    }
4424 
4425 	    /*
4426 	     * Fourth case : a reference. If if has not been resolved,
4427 	     *    parsing returns it's Name, create the node
4428 	     */
4429 	    else if (CUR == '&') {
4430 		htmlParseReference(ctxt);
4431 	    }
4432 
4433 	    /*
4434 	     * Fifth case : end of the resource
4435 	     */
4436 	    else if (CUR == 0) {
4437 		htmlAutoCloseOnEnd(ctxt);
4438 		break;
4439 	    }
4440 
4441 	    /*
4442 	     * Last case, text. Note that References are handled directly.
4443 	     */
4444 	    else {
4445 		htmlParseCharData(ctxt);
4446 	    }
4447 	}
4448         GROW;
4449     }
4450     if (currentNode != NULL) xmlFree(currentNode);
4451 }
4452 
4453 /**
4454  * htmlParseElement:
4455  * @ctxt:  an HTML parser context
4456  *
4457  * parse an HTML element, this is highly recursive
4458  * this is kept for compatibility with previous code versions
4459  *
4460  * [39] element ::= EmptyElemTag | STag content ETag
4461  *
4462  * [41] Attribute ::= Name Eq AttValue
4463  */
4464 
4465 void
htmlParseElement(htmlParserCtxtPtr ctxt)4466 htmlParseElement(htmlParserCtxtPtr ctxt) {
4467     const xmlChar *name;
4468     xmlChar *currentNode = NULL;
4469     const htmlElemDesc * info;
4470     htmlParserNodeInfo node_info;
4471     int failed;
4472     int depth;
4473     const xmlChar *oldptr;
4474 
4475     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4476 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4477 		     "htmlParseElement: context error\n", NULL, NULL);
4478 	return;
4479     }
4480 
4481     if (ctxt->instate == XML_PARSER_EOF)
4482         return;
4483 
4484     /* Capture start position */
4485     if (ctxt->record_info) {
4486         node_info.begin_pos = ctxt->input->consumed +
4487                           (CUR_PTR - ctxt->input->base);
4488 	node_info.begin_line = ctxt->input->line;
4489     }
4490 
4491     failed = htmlParseStartTag(ctxt);
4492     name = ctxt->name;
4493     if ((failed == -1) || (name == NULL)) {
4494 	if (CUR == '>')
4495 	    NEXT;
4496         return;
4497     }
4498 
4499     /*
4500      * Lookup the info for that element.
4501      */
4502     info = htmlTagLookup(name);
4503     if (info == NULL) {
4504 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4505 	             "Tag %s invalid\n", name, NULL);
4506     }
4507 
4508     /*
4509      * Check for an Empty Element labeled the XML/SGML way
4510      */
4511     if ((CUR == '/') && (NXT(1) == '>')) {
4512         SKIP(2);
4513 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4514 	    ctxt->sax->endElement(ctxt->userData, name);
4515 	htmlnamePop(ctxt);
4516 	return;
4517     }
4518 
4519     if (CUR == '>') {
4520         NEXT;
4521     } else {
4522 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4523 	             "Couldn't find end of Start Tag %s\n", name, NULL);
4524 
4525 	/*
4526 	 * end of parsing of this node.
4527 	 */
4528 	if (xmlStrEqual(name, ctxt->name)) {
4529 	    nodePop(ctxt);
4530 	    htmlnamePop(ctxt);
4531 	}
4532 
4533 	/*
4534 	 * Capture end position and add node
4535 	 */
4536 	if (ctxt->record_info) {
4537 	   node_info.end_pos = ctxt->input->consumed +
4538 			      (CUR_PTR - ctxt->input->base);
4539 	   node_info.end_line = ctxt->input->line;
4540 	   node_info.node = ctxt->node;
4541 	   xmlParserAddNodeInfo(ctxt, &node_info);
4542 	}
4543 	return;
4544     }
4545 
4546     /*
4547      * Check for an Empty Element from DTD definition
4548      */
4549     if ((info != NULL) && (info->empty)) {
4550 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4551 	    ctxt->sax->endElement(ctxt->userData, name);
4552 	htmlnamePop(ctxt);
4553 	return;
4554     }
4555 
4556     /*
4557      * Parse the content of the element:
4558      */
4559     currentNode = xmlStrdup(ctxt->name);
4560     depth = ctxt->nameNr;
4561     while (CUR != 0) {
4562 	oldptr = ctxt->input->cur;
4563 	htmlParseContent(ctxt);
4564 	if (oldptr==ctxt->input->cur) break;
4565 	if (ctxt->nameNr < depth) break;
4566     }
4567 
4568     /*
4569      * Capture end position and add node
4570      */
4571     if ( currentNode != NULL && ctxt->record_info ) {
4572        node_info.end_pos = ctxt->input->consumed +
4573                           (CUR_PTR - ctxt->input->base);
4574        node_info.end_line = ctxt->input->line;
4575        node_info.node = ctxt->node;
4576        xmlParserAddNodeInfo(ctxt, &node_info);
4577     }
4578     if (CUR == 0) {
4579 	htmlAutoCloseOnEnd(ctxt);
4580     }
4581 
4582     if (currentNode != NULL)
4583 	xmlFree(currentNode);
4584 }
4585 
4586 static void
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt)4587 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4588     /*
4589      * Capture end position and add node
4590      */
4591     if ( ctxt->node != NULL && ctxt->record_info ) {
4592        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4593                                 (CUR_PTR - ctxt->input->base);
4594        ctxt->nodeInfo->end_line = ctxt->input->line;
4595        ctxt->nodeInfo->node = ctxt->node;
4596        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4597        htmlNodeInfoPop(ctxt);
4598     }
4599     if (CUR == 0) {
4600        htmlAutoCloseOnEnd(ctxt);
4601     }
4602 }
4603 
4604 /**
4605  * htmlParseElementInternal:
4606  * @ctxt:  an HTML parser context
4607  *
4608  * parse an HTML element, new version, non recursive
4609  *
4610  * [39] element ::= EmptyElemTag | STag content ETag
4611  *
4612  * [41] Attribute ::= Name Eq AttValue
4613  */
4614 
4615 static void
htmlParseElementInternal(htmlParserCtxtPtr ctxt)4616 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4617     const xmlChar *name;
4618     const htmlElemDesc * info;
4619     htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4620     int failed;
4621 
4622     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4623 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4624 		     "htmlParseElementInternal: context error\n", NULL, NULL);
4625 	return;
4626     }
4627 
4628     if (ctxt->instate == XML_PARSER_EOF)
4629         return;
4630 
4631     /* Capture start position */
4632     if (ctxt->record_info) {
4633         node_info.begin_pos = ctxt->input->consumed +
4634                           (CUR_PTR - ctxt->input->base);
4635 	node_info.begin_line = ctxt->input->line;
4636     }
4637 
4638     failed = htmlParseStartTag(ctxt);
4639     name = ctxt->name;
4640     if ((failed == -1) || (name == NULL)) {
4641 	if (CUR == '>')
4642 	    NEXT;
4643         return;
4644     }
4645 
4646     /*
4647      * Lookup the info for that element.
4648      */
4649     info = htmlTagLookup(name);
4650     if (info == NULL) {
4651 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4652 	             "Tag %s invalid\n", name, NULL);
4653     }
4654 
4655     /*
4656      * Check for an Empty Element labeled the XML/SGML way
4657      */
4658     if ((CUR == '/') && (NXT(1) == '>')) {
4659         SKIP(2);
4660 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4661 	    ctxt->sax->endElement(ctxt->userData, name);
4662 	htmlnamePop(ctxt);
4663 	return;
4664     }
4665 
4666     if (CUR == '>') {
4667         NEXT;
4668     } else {
4669 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4670 	             "Couldn't find end of Start Tag %s\n", name, NULL);
4671 
4672 	/*
4673 	 * end of parsing of this node.
4674 	 */
4675 	if (xmlStrEqual(name, ctxt->name)) {
4676 	    nodePop(ctxt);
4677 	    htmlnamePop(ctxt);
4678 	}
4679 
4680         if (ctxt->record_info)
4681             htmlNodeInfoPush(ctxt, &node_info);
4682         htmlParserFinishElementParsing(ctxt);
4683 	return;
4684     }
4685 
4686     /*
4687      * Check for an Empty Element from DTD definition
4688      */
4689     if ((info != NULL) && (info->empty)) {
4690 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4691 	    ctxt->sax->endElement(ctxt->userData, name);
4692 	htmlnamePop(ctxt);
4693 	return;
4694     }
4695 
4696     if (ctxt->record_info)
4697         htmlNodeInfoPush(ctxt, &node_info);
4698 }
4699 
4700 /**
4701  * htmlParseContentInternal:
4702  * @ctxt:  an HTML parser context
4703  *
4704  * Parse a content: comment, sub-element, reference or text.
4705  * New version for non recursive htmlParseElementInternal
4706  */
4707 
4708 static void
htmlParseContentInternal(htmlParserCtxtPtr ctxt)4709 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4710     xmlChar *currentNode;
4711     int depth;
4712     const xmlChar *name;
4713 
4714     currentNode = xmlStrdup(ctxt->name);
4715     depth = ctxt->nameNr;
4716     while (1) {
4717         GROW;
4718 
4719         if (ctxt->instate == XML_PARSER_EOF)
4720             break;
4721 
4722 	/*
4723 	 * Our tag or one of it's parent or children is ending.
4724 	 */
4725         if ((CUR == '<') && (NXT(1) == '/')) {
4726 	    if (htmlParseEndTag(ctxt) &&
4727 		((currentNode != NULL) || (ctxt->nameNr == 0))) {
4728 		if (currentNode != NULL)
4729 		    xmlFree(currentNode);
4730 
4731 	        currentNode = xmlStrdup(ctxt->name);
4732 	        depth = ctxt->nameNr;
4733 	    }
4734 	    continue; /* while */
4735         }
4736 
4737 	else if ((CUR == '<') &&
4738 	         ((IS_ASCII_LETTER(NXT(1))) ||
4739 		  (NXT(1) == '_') || (NXT(1) == ':'))) {
4740 	    name = htmlParseHTMLName_nonInvasive(ctxt);
4741 	    if (name == NULL) {
4742 	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4743 			 "htmlParseStartTag: invalid element name\n",
4744 			 NULL, NULL);
4745 	        /* Dump the bogus tag like browsers do */
4746 	        while ((CUR == 0) && (CUR != '>'))
4747 	            NEXT;
4748 
4749 	        htmlParserFinishElementParsing(ctxt);
4750 	        if (currentNode != NULL)
4751 	            xmlFree(currentNode);
4752 
4753 	        currentNode = xmlStrdup(ctxt->name);
4754 	        depth = ctxt->nameNr;
4755 	        continue;
4756 	    }
4757 
4758 	    if (ctxt->name != NULL) {
4759 	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4760 	            htmlAutoClose(ctxt, name);
4761 	            continue;
4762 	        }
4763 	    }
4764 	}
4765 
4766 	/*
4767 	 * Has this node been popped out during parsing of
4768 	 * the next element
4769 	 */
4770         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4771 	    (!xmlStrEqual(currentNode, ctxt->name)))
4772 	     {
4773 	    htmlParserFinishElementParsing(ctxt);
4774 	    if (currentNode != NULL) xmlFree(currentNode);
4775 
4776 	    currentNode = xmlStrdup(ctxt->name);
4777 	    depth = ctxt->nameNr;
4778 	    continue;
4779 	}
4780 
4781 	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4782 	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4783 	    /*
4784 	     * Handle SCRIPT/STYLE separately
4785 	     */
4786 	    htmlParseScript(ctxt);
4787 	} else {
4788 	    /*
4789 	     * Sometimes DOCTYPE arrives in the middle of the document
4790 	     */
4791 	    if ((CUR == '<') && (NXT(1) == '!') &&
4792 		(UPP(2) == 'D') && (UPP(3) == 'O') &&
4793 		(UPP(4) == 'C') && (UPP(5) == 'T') &&
4794 		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4795 		(UPP(8) == 'E')) {
4796 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4797 		             "Misplaced DOCTYPE declaration\n",
4798 			     BAD_CAST "DOCTYPE" , NULL);
4799 		htmlParseDocTypeDecl(ctxt);
4800 	    }
4801 
4802 	    /*
4803 	     * First case :  a comment
4804 	     */
4805 	    if ((CUR == '<') && (NXT(1) == '!') &&
4806 		(NXT(2) == '-') && (NXT(3) == '-')) {
4807 		htmlParseComment(ctxt);
4808 	    }
4809 
4810 	    /*
4811 	     * Second case : a Processing Instruction.
4812 	     */
4813 	    else if ((CUR == '<') && (NXT(1) == '?')) {
4814 		htmlParsePI(ctxt);
4815 	    }
4816 
4817 	    /*
4818 	     * Third case :  a sub-element.
4819 	     */
4820 	    else if (CUR == '<') {
4821 		htmlParseElementInternal(ctxt);
4822 		if (currentNode != NULL) xmlFree(currentNode);
4823 
4824 		currentNode = xmlStrdup(ctxt->name);
4825 		depth = ctxt->nameNr;
4826 	    }
4827 
4828 	    /*
4829 	     * Fourth case : a reference. If if has not been resolved,
4830 	     *    parsing returns it's Name, create the node
4831 	     */
4832 	    else if (CUR == '&') {
4833 		htmlParseReference(ctxt);
4834 	    }
4835 
4836 	    /*
4837 	     * Fifth case : end of the resource
4838 	     */
4839 	    else if (CUR == 0) {
4840 		htmlAutoCloseOnEnd(ctxt);
4841 		break;
4842 	    }
4843 
4844 	    /*
4845 	     * Last case, text. Note that References are handled directly.
4846 	     */
4847 	    else {
4848 		htmlParseCharData(ctxt);
4849 	    }
4850 	}
4851         GROW;
4852     }
4853     if (currentNode != NULL) xmlFree(currentNode);
4854 }
4855 
4856 /**
4857  * htmlParseContent:
4858  * @ctxt:  an HTML parser context
4859  *
4860  * Parse a content: comment, sub-element, reference or text.
4861  * This is the entry point when called from parser.c
4862  */
4863 
4864 void
__htmlParseContent(void * ctxt)4865 __htmlParseContent(void *ctxt) {
4866     if (ctxt != NULL)
4867 	htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4868 }
4869 
4870 /**
4871  * htmlParseDocument:
4872  * @ctxt:  an HTML parser context
4873  *
4874  * parse an HTML document (and build a tree if using the standard SAX
4875  * interface).
4876  *
4877  * Returns 0, -1 in case of error. the parser context is augmented
4878  *                as a result of the parsing.
4879  */
4880 
4881 int
htmlParseDocument(htmlParserCtxtPtr ctxt)4882 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4883     xmlChar start[4];
4884     xmlCharEncoding enc;
4885     xmlDtdPtr dtd;
4886 
4887     xmlInitParser();
4888 
4889     htmlDefaultSAXHandlerInit();
4890 
4891     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4892 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4893 		     "htmlParseDocument: context error\n", NULL, NULL);
4894 	return(XML_ERR_INTERNAL_ERROR);
4895     }
4896     ctxt->html = 1;
4897     ctxt->linenumbers = 1;
4898     GROW;
4899     /*
4900      * SAX: beginning of the document processing.
4901      */
4902     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4903         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4904 
4905     if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4906         ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4907 	/*
4908 	 * Get the 4 first bytes and decode the charset
4909 	 * if enc != XML_CHAR_ENCODING_NONE
4910 	 * plug some encoding conversion routines.
4911 	 */
4912 	start[0] = RAW;
4913 	start[1] = NXT(1);
4914 	start[2] = NXT(2);
4915 	start[3] = NXT(3);
4916 	enc = xmlDetectCharEncoding(&start[0], 4);
4917 	if (enc != XML_CHAR_ENCODING_NONE) {
4918 	    xmlSwitchEncoding(ctxt, enc);
4919 	}
4920     }
4921 
4922     /*
4923      * Wipe out everything which is before the first '<'
4924      */
4925     SKIP_BLANKS;
4926     if (CUR == 0) {
4927 	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4928 	             "Document is empty\n", NULL, NULL);
4929     }
4930 
4931     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4932 	ctxt->sax->startDocument(ctxt->userData);
4933 
4934 
4935     /*
4936      * Parse possible comments and PIs before any content
4937      */
4938     while (((CUR == '<') && (NXT(1) == '!') &&
4939             (NXT(2) == '-') && (NXT(3) == '-')) ||
4940 	   ((CUR == '<') && (NXT(1) == '?'))) {
4941         htmlParseComment(ctxt);
4942         htmlParsePI(ctxt);
4943 	SKIP_BLANKS;
4944     }
4945 
4946 
4947     /*
4948      * Then possibly doc type declaration(s) and more Misc
4949      * (doctypedecl Misc*)?
4950      */
4951     if ((CUR == '<') && (NXT(1) == '!') &&
4952 	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4953 	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4954 	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4955 	(UPP(8) == 'E')) {
4956 	htmlParseDocTypeDecl(ctxt);
4957     }
4958     SKIP_BLANKS;
4959 
4960     /*
4961      * Parse possible comments and PIs before any content
4962      */
4963     while (((CUR == '<') && (NXT(1) == '!') &&
4964             (NXT(2) == '-') && (NXT(3) == '-')) ||
4965 	   ((CUR == '<') && (NXT(1) == '?'))) {
4966         htmlParseComment(ctxt);
4967         htmlParsePI(ctxt);
4968 	SKIP_BLANKS;
4969     }
4970 
4971     /*
4972      * Time to start parsing the tree itself
4973      */
4974     htmlParseContentInternal(ctxt);
4975 
4976     /*
4977      * autoclose
4978      */
4979     if (CUR == 0)
4980 	htmlAutoCloseOnEnd(ctxt);
4981 
4982 
4983     /*
4984      * SAX: end of the document processing.
4985      */
4986     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4987         ctxt->sax->endDocument(ctxt->userData);
4988 
4989     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4990 	dtd = xmlGetIntSubset(ctxt->myDoc);
4991 	if (dtd == NULL)
4992 	    ctxt->myDoc->intSubset =
4993 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4994 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4995 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4996     }
4997     if (! ctxt->wellFormed) return(-1);
4998     return(0);
4999 }
5000 
5001 
5002 /************************************************************************
5003  *									*
5004  *			Parser contexts handling			*
5005  *									*
5006  ************************************************************************/
5007 
5008 /**
5009  * htmlInitParserCtxt:
5010  * @ctxt:  an HTML parser context
5011  *
5012  * Initialize a parser context
5013  *
5014  * Returns 0 in case of success and -1 in case of error
5015  */
5016 
5017 static int
htmlInitParserCtxt(htmlParserCtxtPtr ctxt)5018 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
5019 {
5020     htmlSAXHandler *sax;
5021 
5022     if (ctxt == NULL) return(-1);
5023     memset(ctxt, 0, sizeof(htmlParserCtxt));
5024 
5025     ctxt->dict = xmlDictCreate();
5026     if (ctxt->dict == NULL) {
5027         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5028 	return(-1);
5029     }
5030     sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
5031     if (sax == NULL) {
5032         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5033 	return(-1);
5034     }
5035     else
5036         memset(sax, 0, sizeof(htmlSAXHandler));
5037 
5038     /* Allocate the Input stack */
5039     ctxt->inputTab = (htmlParserInputPtr *)
5040                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
5041     if (ctxt->inputTab == NULL) {
5042         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5043 	ctxt->inputNr = 0;
5044 	ctxt->inputMax = 0;
5045 	ctxt->input = NULL;
5046 	return(-1);
5047     }
5048     ctxt->inputNr = 0;
5049     ctxt->inputMax = 5;
5050     ctxt->input = NULL;
5051     ctxt->version = NULL;
5052     ctxt->encoding = NULL;
5053     ctxt->standalone = -1;
5054     ctxt->instate = XML_PARSER_START;
5055 
5056     /* Allocate the Node stack */
5057     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
5058     if (ctxt->nodeTab == NULL) {
5059         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5060 	ctxt->nodeNr = 0;
5061 	ctxt->nodeMax = 0;
5062 	ctxt->node = NULL;
5063 	ctxt->inputNr = 0;
5064 	ctxt->inputMax = 0;
5065 	ctxt->input = NULL;
5066 	return(-1);
5067     }
5068     ctxt->nodeNr = 0;
5069     ctxt->nodeMax = 10;
5070     ctxt->node = NULL;
5071 
5072     /* Allocate the Name stack */
5073     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
5074     if (ctxt->nameTab == NULL) {
5075         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5076 	ctxt->nameNr = 0;
5077 	ctxt->nameMax = 0;
5078 	ctxt->name = NULL;
5079 	ctxt->nodeNr = 0;
5080 	ctxt->nodeMax = 0;
5081 	ctxt->node = NULL;
5082 	ctxt->inputNr = 0;
5083 	ctxt->inputMax = 0;
5084 	ctxt->input = NULL;
5085 	return(-1);
5086     }
5087     ctxt->nameNr = 0;
5088     ctxt->nameMax = 10;
5089     ctxt->name = NULL;
5090 
5091     ctxt->nodeInfoTab = NULL;
5092     ctxt->nodeInfoNr  = 0;
5093     ctxt->nodeInfoMax = 0;
5094 
5095     if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
5096     else {
5097         ctxt->sax = sax;
5098 	memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
5099     }
5100     ctxt->userData = ctxt;
5101     ctxt->myDoc = NULL;
5102     ctxt->wellFormed = 1;
5103     ctxt->replaceEntities = 0;
5104     ctxt->linenumbers = xmlLineNumbersDefaultValue;
5105     ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
5106     ctxt->html = 1;
5107     ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
5108     ctxt->vctxt.userData = ctxt;
5109     ctxt->vctxt.error = xmlParserValidityError;
5110     ctxt->vctxt.warning = xmlParserValidityWarning;
5111     ctxt->record_info = 0;
5112     ctxt->validate = 0;
5113     ctxt->checkIndex = 0;
5114     ctxt->catalogs = NULL;
5115     xmlInitNodeInfoSeq(&ctxt->node_seq);
5116     return(0);
5117 }
5118 
5119 /**
5120  * htmlFreeParserCtxt:
5121  * @ctxt:  an HTML parser context
5122  *
5123  * Free all the memory used by a parser context. However the parsed
5124  * document in ctxt->myDoc is not freed.
5125  */
5126 
5127 void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)5128 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
5129 {
5130     xmlFreeParserCtxt(ctxt);
5131 }
5132 
5133 /**
5134  * htmlNewParserCtxt:
5135  *
5136  * Allocate and initialize a new parser context.
5137  *
5138  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5139  */
5140 
5141 htmlParserCtxtPtr
htmlNewParserCtxt(void)5142 htmlNewParserCtxt(void)
5143 {
5144     xmlParserCtxtPtr ctxt;
5145 
5146     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5147     if (ctxt == NULL) {
5148         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5149 	return(NULL);
5150     }
5151     memset(ctxt, 0, sizeof(xmlParserCtxt));
5152     if (htmlInitParserCtxt(ctxt) < 0) {
5153         htmlFreeParserCtxt(ctxt);
5154 	return(NULL);
5155     }
5156     return(ctxt);
5157 }
5158 
5159 /**
5160  * htmlCreateMemoryParserCtxt:
5161  * @buffer:  a pointer to a char array
5162  * @size:  the size of the array
5163  *
5164  * Create a parser context for an HTML in-memory document.
5165  *
5166  * Returns the new parser context or NULL
5167  */
5168 htmlParserCtxtPtr
htmlCreateMemoryParserCtxt(const char * buffer,int size)5169 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5170     xmlParserCtxtPtr ctxt;
5171     xmlParserInputPtr input;
5172     xmlParserInputBufferPtr buf;
5173 
5174     if (buffer == NULL)
5175 	return(NULL);
5176     if (size <= 0)
5177 	return(NULL);
5178 
5179     ctxt = htmlNewParserCtxt();
5180     if (ctxt == NULL)
5181 	return(NULL);
5182 
5183     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5184     if (buf == NULL) return(NULL);
5185 
5186     input = xmlNewInputStream(ctxt);
5187     if (input == NULL) {
5188 	xmlFreeParserCtxt(ctxt);
5189 	return(NULL);
5190     }
5191 
5192     input->filename = NULL;
5193     input->buf = buf;
5194     xmlBufResetInput(buf->buffer, input);
5195 
5196     inputPush(ctxt, input);
5197     return(ctxt);
5198 }
5199 
5200 /**
5201  * htmlCreateDocParserCtxt:
5202  * @cur:  a pointer to an array of xmlChar
5203  * @encoding:  a free form C string describing the HTML document encoding, or NULL
5204  *
5205  * Create a parser context for an HTML document.
5206  *
5207  * TODO: check the need to add encoding handling there
5208  *
5209  * Returns the new parser context or NULL
5210  */
5211 static htmlParserCtxtPtr
htmlCreateDocParserCtxt(const xmlChar * cur,const char * encoding)5212 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5213     int len;
5214     htmlParserCtxtPtr ctxt;
5215 
5216     if (cur == NULL)
5217 	return(NULL);
5218     len = xmlStrlen(cur);
5219     ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5220     if (ctxt == NULL)
5221 	return(NULL);
5222 
5223     if (encoding != NULL) {
5224 	xmlCharEncoding enc;
5225 	xmlCharEncodingHandlerPtr handler;
5226 
5227 	if (ctxt->input->encoding != NULL)
5228 	    xmlFree((xmlChar *) ctxt->input->encoding);
5229 	ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5230 
5231 	enc = xmlParseCharEncoding(encoding);
5232 	/*
5233 	 * registered set of known encodings
5234 	 */
5235 	if (enc != XML_CHAR_ENCODING_ERROR) {
5236 	    xmlSwitchEncoding(ctxt, enc);
5237 	    if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5238 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5239 		             "Unsupported encoding %s\n",
5240 			     (const xmlChar *) encoding, NULL);
5241 	    }
5242 	} else {
5243 	    /*
5244 	     * fallback for unknown encodings
5245 	     */
5246 	    handler = xmlFindCharEncodingHandler((const char *) encoding);
5247 	    if (handler != NULL) {
5248 		xmlSwitchToEncoding(ctxt, handler);
5249 	    } else {
5250 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5251 		             "Unsupported encoding %s\n",
5252 			     (const xmlChar *) encoding, NULL);
5253 	    }
5254 	}
5255     }
5256     return(ctxt);
5257 }
5258 
5259 #ifdef LIBXML_PUSH_ENABLED
5260 /************************************************************************
5261  *									*
5262  *	Progressive parsing interfaces				*
5263  *									*
5264  ************************************************************************/
5265 
5266 /**
5267  * htmlParseLookupSequence:
5268  * @ctxt:  an HTML parser context
5269  * @first:  the first char to lookup
5270  * @next:  the next char to lookup or zero
5271  * @third:  the next char to lookup or zero
5272  * @ignoreattrval: skip over attribute values
5273  *
5274  * Try to find if a sequence (first, next, third) or  just (first next) or
5275  * (first) is available in the input stream.
5276  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5277  * to avoid rescanning sequences of bytes, it DOES change the state of the
5278  * parser, do not use liberally.
5279  * This is basically similar to xmlParseLookupSequence()
5280  *
5281  * Returns the index to the current parsing point if the full sequence
5282  *      is available, -1 otherwise.
5283  */
5284 static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt,xmlChar first,xmlChar next,xmlChar third,int ignoreattrval)5285 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5286                         xmlChar next, xmlChar third, int ignoreattrval)
5287 {
5288     int base, len;
5289     htmlParserInputPtr in;
5290     const xmlChar *buf;
5291     int invalue = 0;
5292     char valdellim = 0x0;
5293 
5294     in = ctxt->input;
5295     if (in == NULL)
5296         return (-1);
5297 
5298     base = in->cur - in->base;
5299     if (base < 0)
5300         return (-1);
5301 
5302     if (ctxt->checkIndex > base) {
5303         base = ctxt->checkIndex;
5304         /* Abuse hasPErefs member to restore current state. */
5305         invalue = ctxt->hasPErefs & 1 ? 1 : 0;
5306     }
5307 
5308     if (in->buf == NULL) {
5309         buf = in->base;
5310         len = in->length;
5311     } else {
5312         buf = xmlBufContent(in->buf->buffer);
5313         len = xmlBufUse(in->buf->buffer);
5314     }
5315 
5316     /* take into account the sequence length */
5317     if (third)
5318         len -= 2;
5319     else if (next)
5320         len--;
5321     for (; base < len; base++) {
5322         if (ignoreattrval) {
5323             if (buf[base] == '"' || buf[base] == '\'') {
5324                 if (invalue) {
5325                     if (buf[base] == valdellim) {
5326                         invalue = 0;
5327                         continue;
5328                     }
5329                 } else {
5330                     valdellim = buf[base];
5331                     invalue = 1;
5332                     continue;
5333                 }
5334             } else if (invalue) {
5335                 continue;
5336             }
5337         }
5338         if (buf[base] == first) {
5339             if (third != 0) {
5340                 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5341                     continue;
5342             } else if (next != 0) {
5343                 if (buf[base + 1] != next)
5344                     continue;
5345             }
5346             ctxt->checkIndex = 0;
5347 #ifdef DEBUG_PUSH
5348             if (next == 0)
5349                 xmlGenericError(xmlGenericErrorContext,
5350                                 "HPP: lookup '%c' found at %d\n",
5351                                 first, base);
5352             else if (third == 0)
5353                 xmlGenericError(xmlGenericErrorContext,
5354                                 "HPP: lookup '%c%c' found at %d\n",
5355                                 first, next, base);
5356             else
5357                 xmlGenericError(xmlGenericErrorContext,
5358                                 "HPP: lookup '%c%c%c' found at %d\n",
5359                                 first, next, third, base);
5360 #endif
5361             return (base - (in->cur - in->base));
5362         }
5363     }
5364     ctxt->checkIndex = base;
5365     /* Abuse hasPErefs member to track current state. */
5366     if (invalue)
5367         ctxt->hasPErefs |= 1;
5368     else
5369         ctxt->hasPErefs &= ~1;
5370 #ifdef DEBUG_PUSH
5371     if (next == 0)
5372         xmlGenericError(xmlGenericErrorContext,
5373                         "HPP: lookup '%c' failed\n", first);
5374     else if (third == 0)
5375         xmlGenericError(xmlGenericErrorContext,
5376                         "HPP: lookup '%c%c' failed\n", first, next);
5377     else
5378         xmlGenericError(xmlGenericErrorContext,
5379                         "HPP: lookup '%c%c%c' failed\n", first, next,
5380                         third);
5381 #endif
5382     return (-1);
5383 }
5384 
5385 /**
5386  * htmlParseLookupCommentEnd:
5387  * @ctxt: an HTML parser context
5388  *
5389  * Try to find a comment end tag in the input stream
5390  * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5391  * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5392  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5393  * to avoid rescanning sequences of bytes, it DOES change the state of the
5394  * parser, do not use liberally.
5395  * This wraps to htmlParseLookupSequence()
5396  *
5397  * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5398  */
5399 static int
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)5400 htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5401 {
5402     int mark = 0;
5403     int cur = CUR_PTR - BASE_PTR;
5404 
5405     while (mark >= 0) {
5406 	mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5407 	if ((mark < 0) ||
5408 	    (NXT(mark+2) == '>') ||
5409 	    ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5410 	    return mark;
5411 	}
5412 	ctxt->checkIndex = cur + mark + 1;
5413     }
5414     return mark;
5415 }
5416 
5417 
5418 /**
5419  * htmlParseTryOrFinish:
5420  * @ctxt:  an HTML parser context
5421  * @terminate:  last chunk indicator
5422  *
5423  * Try to progress on parsing
5424  *
5425  * Returns zero if no parsing was possible
5426  */
5427 static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt,int terminate)5428 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5429     int ret = 0;
5430     htmlParserInputPtr in;
5431     ptrdiff_t avail = 0;
5432     xmlChar cur, next;
5433 
5434     htmlParserNodeInfo node_info;
5435 
5436 #ifdef DEBUG_PUSH
5437     switch (ctxt->instate) {
5438 	case XML_PARSER_EOF:
5439 	    xmlGenericError(xmlGenericErrorContext,
5440 		    "HPP: try EOF\n"); break;
5441 	case XML_PARSER_START:
5442 	    xmlGenericError(xmlGenericErrorContext,
5443 		    "HPP: try START\n"); break;
5444 	case XML_PARSER_MISC:
5445 	    xmlGenericError(xmlGenericErrorContext,
5446 		    "HPP: try MISC\n");break;
5447 	case XML_PARSER_COMMENT:
5448 	    xmlGenericError(xmlGenericErrorContext,
5449 		    "HPP: try COMMENT\n");break;
5450 	case XML_PARSER_PROLOG:
5451 	    xmlGenericError(xmlGenericErrorContext,
5452 		    "HPP: try PROLOG\n");break;
5453 	case XML_PARSER_START_TAG:
5454 	    xmlGenericError(xmlGenericErrorContext,
5455 		    "HPP: try START_TAG\n");break;
5456 	case XML_PARSER_CONTENT:
5457 	    xmlGenericError(xmlGenericErrorContext,
5458 		    "HPP: try CONTENT\n");break;
5459 	case XML_PARSER_CDATA_SECTION:
5460 	    xmlGenericError(xmlGenericErrorContext,
5461 		    "HPP: try CDATA_SECTION\n");break;
5462 	case XML_PARSER_END_TAG:
5463 	    xmlGenericError(xmlGenericErrorContext,
5464 		    "HPP: try END_TAG\n");break;
5465 	case XML_PARSER_ENTITY_DECL:
5466 	    xmlGenericError(xmlGenericErrorContext,
5467 		    "HPP: try ENTITY_DECL\n");break;
5468 	case XML_PARSER_ENTITY_VALUE:
5469 	    xmlGenericError(xmlGenericErrorContext,
5470 		    "HPP: try ENTITY_VALUE\n");break;
5471 	case XML_PARSER_ATTRIBUTE_VALUE:
5472 	    xmlGenericError(xmlGenericErrorContext,
5473 		    "HPP: try ATTRIBUTE_VALUE\n");break;
5474 	case XML_PARSER_DTD:
5475 	    xmlGenericError(xmlGenericErrorContext,
5476 		    "HPP: try DTD\n");break;
5477 	case XML_PARSER_EPILOG:
5478 	    xmlGenericError(xmlGenericErrorContext,
5479 		    "HPP: try EPILOG\n");break;
5480 	case XML_PARSER_PI:
5481 	    xmlGenericError(xmlGenericErrorContext,
5482 		    "HPP: try PI\n");break;
5483 	case XML_PARSER_SYSTEM_LITERAL:
5484 	    xmlGenericError(xmlGenericErrorContext,
5485 		    "HPP: try SYSTEM_LITERAL\n");break;
5486     }
5487 #endif
5488 
5489     while (1) {
5490 
5491 	in = ctxt->input;
5492 	if (in == NULL) break;
5493 	if (in->buf == NULL)
5494 	    avail = in->length - (in->cur - in->base);
5495 	else
5496 	    avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5497                     (in->cur - in->base);
5498 	if ((avail == 0) && (terminate)) {
5499 	    htmlAutoCloseOnEnd(ctxt);
5500 	    if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5501 		/*
5502 		 * SAX: end of the document processing.
5503 		 */
5504 		ctxt->instate = XML_PARSER_EOF;
5505 		if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5506 		    ctxt->sax->endDocument(ctxt->userData);
5507 	    }
5508 	}
5509         if (avail < 1)
5510 	    goto done;
5511         /*
5512          * This is done to make progress and avoid an infinite loop
5513          * if a parsing attempt was aborted by hitting a NUL byte. After
5514          * changing htmlCurrentChar, this probably isn't necessary anymore.
5515          * We should consider removing this check.
5516          */
5517 	cur = in->cur[0];
5518 	if (cur == 0) {
5519 	    SKIP(1);
5520 	    continue;
5521 	}
5522 
5523         switch (ctxt->instate) {
5524             case XML_PARSER_EOF:
5525 	        /*
5526 		 * Document parsing is done !
5527 		 */
5528 	        goto done;
5529             case XML_PARSER_START:
5530 	        /*
5531 		 * Very first chars read from the document flow.
5532 		 */
5533 		cur = in->cur[0];
5534 		if (IS_BLANK_CH(cur)) {
5535 		    SKIP_BLANKS;
5536 		    if (in->buf == NULL)
5537 			avail = in->length - (in->cur - in->base);
5538 		    else
5539 			avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5540                                 (in->cur - in->base);
5541 		}
5542 		if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5543 		    ctxt->sax->setDocumentLocator(ctxt->userData,
5544 						  &xmlDefaultSAXLocator);
5545 		if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5546 	            (!ctxt->disableSAX))
5547 		    ctxt->sax->startDocument(ctxt->userData);
5548 
5549 		cur = in->cur[0];
5550 		next = in->cur[1];
5551 		if ((cur == '<') && (next == '!') &&
5552 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
5553 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
5554 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5555 		    (UPP(8) == 'E')) {
5556 		    if ((!terminate) &&
5557 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5558 			goto done;
5559 #ifdef DEBUG_PUSH
5560 		    xmlGenericError(xmlGenericErrorContext,
5561 			    "HPP: Parsing internal subset\n");
5562 #endif
5563 		    htmlParseDocTypeDecl(ctxt);
5564 		    ctxt->instate = XML_PARSER_PROLOG;
5565 #ifdef DEBUG_PUSH
5566 		    xmlGenericError(xmlGenericErrorContext,
5567 			    "HPP: entering PROLOG\n");
5568 #endif
5569                 } else {
5570 		    ctxt->instate = XML_PARSER_MISC;
5571 #ifdef DEBUG_PUSH
5572 		    xmlGenericError(xmlGenericErrorContext,
5573 			    "HPP: entering MISC\n");
5574 #endif
5575 		}
5576 		break;
5577             case XML_PARSER_MISC:
5578 		SKIP_BLANKS;
5579 		if (in->buf == NULL)
5580 		    avail = in->length - (in->cur - in->base);
5581 		else
5582 		    avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5583                             (in->cur - in->base);
5584 		/*
5585 		 * no chars in buffer
5586 		 */
5587 		if (avail < 1)
5588 		    goto done;
5589 		/*
5590 		 * not enough chars in buffer
5591 		 */
5592 		if (avail < 2) {
5593 		    if (!terminate)
5594 			goto done;
5595 		    else
5596 			next = ' ';
5597 		} else {
5598 		    next = in->cur[1];
5599 		}
5600 		cur = in->cur[0];
5601 	        if ((cur == '<') && (next == '!') &&
5602 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5603 		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5604 			goto done;
5605 #ifdef DEBUG_PUSH
5606 		    xmlGenericError(xmlGenericErrorContext,
5607 			    "HPP: Parsing Comment\n");
5608 #endif
5609 		    htmlParseComment(ctxt);
5610 		    ctxt->instate = XML_PARSER_MISC;
5611 	        } else if ((cur == '<') && (next == '?')) {
5612 		    if ((!terminate) &&
5613 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5614 			goto done;
5615 #ifdef DEBUG_PUSH
5616 		    xmlGenericError(xmlGenericErrorContext,
5617 			    "HPP: Parsing PI\n");
5618 #endif
5619 		    htmlParsePI(ctxt);
5620 		    ctxt->instate = XML_PARSER_MISC;
5621 		} else if ((cur == '<') && (next == '!') &&
5622 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
5623 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
5624 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5625 		    (UPP(8) == 'E')) {
5626 		    if ((!terminate) &&
5627 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5628 			goto done;
5629 #ifdef DEBUG_PUSH
5630 		    xmlGenericError(xmlGenericErrorContext,
5631 			    "HPP: Parsing internal subset\n");
5632 #endif
5633 		    htmlParseDocTypeDecl(ctxt);
5634 		    ctxt->instate = XML_PARSER_PROLOG;
5635 #ifdef DEBUG_PUSH
5636 		    xmlGenericError(xmlGenericErrorContext,
5637 			    "HPP: entering PROLOG\n");
5638 #endif
5639 		} else if ((cur == '<') && (next == '!') &&
5640 		           (avail < 9)) {
5641 		    goto done;
5642 		} else {
5643 		    ctxt->instate = XML_PARSER_CONTENT;
5644 #ifdef DEBUG_PUSH
5645 		    xmlGenericError(xmlGenericErrorContext,
5646 			    "HPP: entering START_TAG\n");
5647 #endif
5648 		}
5649 		break;
5650             case XML_PARSER_PROLOG:
5651 		SKIP_BLANKS;
5652 		if (in->buf == NULL)
5653 		    avail = in->length - (in->cur - in->base);
5654 		else
5655 		    avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5656                             (in->cur - in->base);
5657 		if (avail < 2)
5658 		    goto done;
5659 		cur = in->cur[0];
5660 		next = in->cur[1];
5661 		if ((cur == '<') && (next == '!') &&
5662 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5663 		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5664 			goto done;
5665 #ifdef DEBUG_PUSH
5666 		    xmlGenericError(xmlGenericErrorContext,
5667 			    "HPP: Parsing Comment\n");
5668 #endif
5669 		    htmlParseComment(ctxt);
5670 		    ctxt->instate = XML_PARSER_PROLOG;
5671 	        } else if ((cur == '<') && (next == '?')) {
5672 		    if ((!terminate) &&
5673 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5674 			goto done;
5675 #ifdef DEBUG_PUSH
5676 		    xmlGenericError(xmlGenericErrorContext,
5677 			    "HPP: Parsing PI\n");
5678 #endif
5679 		    htmlParsePI(ctxt);
5680 		    ctxt->instate = XML_PARSER_PROLOG;
5681 		} else if ((cur == '<') && (next == '!') &&
5682 		           (avail < 4)) {
5683 		    goto done;
5684 		} else {
5685 		    ctxt->instate = XML_PARSER_CONTENT;
5686 #ifdef DEBUG_PUSH
5687 		    xmlGenericError(xmlGenericErrorContext,
5688 			    "HPP: entering START_TAG\n");
5689 #endif
5690 		}
5691 		break;
5692             case XML_PARSER_EPILOG:
5693 		if (in->buf == NULL)
5694 		    avail = in->length - (in->cur - in->base);
5695 		else
5696 		    avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5697                             (in->cur - in->base);
5698 		if (avail < 1)
5699 		    goto done;
5700 		cur = in->cur[0];
5701 		if (IS_BLANK_CH(cur)) {
5702 		    htmlParseCharData(ctxt);
5703 		    goto done;
5704 		}
5705 		if (avail < 2)
5706 		    goto done;
5707 		next = in->cur[1];
5708 	        if ((cur == '<') && (next == '!') &&
5709 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5710 		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5711 			goto done;
5712 #ifdef DEBUG_PUSH
5713 		    xmlGenericError(xmlGenericErrorContext,
5714 			    "HPP: Parsing Comment\n");
5715 #endif
5716 		    htmlParseComment(ctxt);
5717 		    ctxt->instate = XML_PARSER_EPILOG;
5718 	        } else if ((cur == '<') && (next == '?')) {
5719 		    if ((!terminate) &&
5720 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5721 			goto done;
5722 #ifdef DEBUG_PUSH
5723 		    xmlGenericError(xmlGenericErrorContext,
5724 			    "HPP: Parsing PI\n");
5725 #endif
5726 		    htmlParsePI(ctxt);
5727 		    ctxt->instate = XML_PARSER_EPILOG;
5728 		} else if ((cur == '<') && (next == '!') &&
5729 		           (avail < 4)) {
5730 		    goto done;
5731 		} else {
5732 		    ctxt->errNo = XML_ERR_DOCUMENT_END;
5733 		    ctxt->wellFormed = 0;
5734 		    ctxt->instate = XML_PARSER_EOF;
5735 #ifdef DEBUG_PUSH
5736 		    xmlGenericError(xmlGenericErrorContext,
5737 			    "HPP: entering EOF\n");
5738 #endif
5739 		    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5740 			ctxt->sax->endDocument(ctxt->userData);
5741 		    goto done;
5742 		}
5743 		break;
5744             case XML_PARSER_START_TAG: {
5745 	        const xmlChar *name;
5746 		int failed;
5747 		const htmlElemDesc * info;
5748 
5749 		/*
5750 		 * no chars in buffer
5751 		 */
5752 		if (avail < 1)
5753 		    goto done;
5754 		/*
5755 		 * not enough chars in buffer
5756 		 */
5757 		if (avail < 2) {
5758 		    if (!terminate)
5759 			goto done;
5760 		    else
5761 			next = ' ';
5762 		} else {
5763 		    next = in->cur[1];
5764 		}
5765 		cur = in->cur[0];
5766 	        if (cur != '<') {
5767 		    ctxt->instate = XML_PARSER_CONTENT;
5768 #ifdef DEBUG_PUSH
5769 		    xmlGenericError(xmlGenericErrorContext,
5770 			    "HPP: entering CONTENT\n");
5771 #endif
5772 		    break;
5773 		}
5774 		if (next == '/') {
5775 		    ctxt->instate = XML_PARSER_END_TAG;
5776 		    ctxt->checkIndex = 0;
5777 #ifdef DEBUG_PUSH
5778 		    xmlGenericError(xmlGenericErrorContext,
5779 			    "HPP: entering END_TAG\n");
5780 #endif
5781 		    break;
5782 		}
5783 		if ((!terminate) &&
5784 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5785 		    goto done;
5786 
5787                 /* Capture start position */
5788 	        if (ctxt->record_info) {
5789 	             node_info.begin_pos = ctxt->input->consumed +
5790 	                                (CUR_PTR - ctxt->input->base);
5791 	             node_info.begin_line = ctxt->input->line;
5792 	        }
5793 
5794 
5795 		failed = htmlParseStartTag(ctxt);
5796 		name = ctxt->name;
5797 		if ((failed == -1) ||
5798 		    (name == NULL)) {
5799 		    if (CUR == '>')
5800 			NEXT;
5801 		    break;
5802 		}
5803 
5804 		/*
5805 		 * Lookup the info for that element.
5806 		 */
5807 		info = htmlTagLookup(name);
5808 		if (info == NULL) {
5809 		    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5810 		                 "Tag %s invalid\n", name, NULL);
5811 		}
5812 
5813 		/*
5814 		 * Check for an Empty Element labeled the XML/SGML way
5815 		 */
5816 		if ((CUR == '/') && (NXT(1) == '>')) {
5817 		    SKIP(2);
5818 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5819 			ctxt->sax->endElement(ctxt->userData, name);
5820 		    htmlnamePop(ctxt);
5821 		    ctxt->instate = XML_PARSER_CONTENT;
5822 #ifdef DEBUG_PUSH
5823 		    xmlGenericError(xmlGenericErrorContext,
5824 			    "HPP: entering CONTENT\n");
5825 #endif
5826 		    break;
5827 		}
5828 
5829 		if (CUR == '>') {
5830 		    NEXT;
5831 		} else {
5832 		    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5833 		                 "Couldn't find end of Start Tag %s\n",
5834 				 name, NULL);
5835 
5836 		    /*
5837 		     * end of parsing of this node.
5838 		     */
5839 		    if (xmlStrEqual(name, ctxt->name)) {
5840 			nodePop(ctxt);
5841 			htmlnamePop(ctxt);
5842 		    }
5843 
5844 		    if (ctxt->record_info)
5845 		        htmlNodeInfoPush(ctxt, &node_info);
5846 
5847 		    ctxt->instate = XML_PARSER_CONTENT;
5848 #ifdef DEBUG_PUSH
5849 		    xmlGenericError(xmlGenericErrorContext,
5850 			    "HPP: entering CONTENT\n");
5851 #endif
5852 		    break;
5853 		}
5854 
5855 		/*
5856 		 * Check for an Empty Element from DTD definition
5857 		 */
5858 		if ((info != NULL) && (info->empty)) {
5859 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5860 			ctxt->sax->endElement(ctxt->userData, name);
5861 		    htmlnamePop(ctxt);
5862 		}
5863 
5864                 if (ctxt->record_info)
5865 	            htmlNodeInfoPush(ctxt, &node_info);
5866 
5867 		ctxt->instate = XML_PARSER_CONTENT;
5868 #ifdef DEBUG_PUSH
5869 		xmlGenericError(xmlGenericErrorContext,
5870 			"HPP: entering CONTENT\n");
5871 #endif
5872                 break;
5873 	    }
5874             case XML_PARSER_CONTENT: {
5875 		xmlChar chr[2] = { 0, 0 };
5876 
5877                 /*
5878 		 * Handle preparsed entities and charRef
5879 		 */
5880 		if (ctxt->token != 0) {
5881 		    chr[0] = (xmlChar) ctxt->token;
5882 		    htmlCheckParagraph(ctxt);
5883 		    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5884 			ctxt->sax->characters(ctxt->userData, chr, 1);
5885 		    ctxt->token = 0;
5886 		    ctxt->checkIndex = 0;
5887 		}
5888 		if ((avail == 1) && (terminate)) {
5889 		    cur = in->cur[0];
5890 		    if ((cur != '<') && (cur != '&')) {
5891 			if (ctxt->sax != NULL) {
5892                             chr[0] = cur;
5893 			    if (IS_BLANK_CH(cur)) {
5894 				if (ctxt->keepBlanks) {
5895 				    if (ctxt->sax->characters != NULL)
5896 					ctxt->sax->characters(
5897 						ctxt->userData, chr, 1);
5898 				} else {
5899 				    if (ctxt->sax->ignorableWhitespace != NULL)
5900 					ctxt->sax->ignorableWhitespace(
5901 						ctxt->userData, chr, 1);
5902 				}
5903 			    } else {
5904 				htmlCheckParagraph(ctxt);
5905 				if (ctxt->sax->characters != NULL)
5906 				    ctxt->sax->characters(
5907 					    ctxt->userData, chr, 1);
5908 			    }
5909 			}
5910 			ctxt->token = 0;
5911 			ctxt->checkIndex = 0;
5912 			in->cur++;
5913 			break;
5914 		    }
5915 		}
5916 		if (avail < 2)
5917 		    goto done;
5918 		cur = in->cur[0];
5919 		next = in->cur[1];
5920 		if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5921 		    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5922 		    /*
5923 		     * Handle SCRIPT/STYLE separately
5924 		     */
5925 		    if (!terminate) {
5926 		        int idx;
5927 			xmlChar val;
5928 
5929 			idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5930 			if (idx < 0)
5931 			    goto done;
5932 		        val = in->cur[idx + 2];
5933 			if (val == 0) /* bad cut of input */
5934 			    goto done;
5935 		    }
5936 		    htmlParseScript(ctxt);
5937 		    if ((cur == '<') && (next == '/')) {
5938 			ctxt->instate = XML_PARSER_END_TAG;
5939 			ctxt->checkIndex = 0;
5940 #ifdef DEBUG_PUSH
5941 			xmlGenericError(xmlGenericErrorContext,
5942 				"HPP: entering END_TAG\n");
5943 #endif
5944 			break;
5945 		    }
5946 		} else {
5947 		    /*
5948 		     * Sometimes DOCTYPE arrives in the middle of the document
5949 		     */
5950 		    if ((cur == '<') && (next == '!') &&
5951 			(UPP(2) == 'D') && (UPP(3) == 'O') &&
5952 			(UPP(4) == 'C') && (UPP(5) == 'T') &&
5953 			(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5954 			(UPP(8) == 'E')) {
5955 			if ((!terminate) &&
5956 			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5957 			    goto done;
5958 			htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5959 			             "Misplaced DOCTYPE declaration\n",
5960 				     BAD_CAST "DOCTYPE" , NULL);
5961 			htmlParseDocTypeDecl(ctxt);
5962 		    } else if ((cur == '<') && (next == '!') &&
5963 			(in->cur[2] == '-') && (in->cur[3] == '-')) {
5964 			if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5965 			    goto done;
5966 #ifdef DEBUG_PUSH
5967 			xmlGenericError(xmlGenericErrorContext,
5968 				"HPP: Parsing Comment\n");
5969 #endif
5970 			htmlParseComment(ctxt);
5971 			ctxt->instate = XML_PARSER_CONTENT;
5972 		    } else if ((cur == '<') && (next == '?')) {
5973 			if ((!terminate) &&
5974 			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5975 			    goto done;
5976 #ifdef DEBUG_PUSH
5977 			xmlGenericError(xmlGenericErrorContext,
5978 				"HPP: Parsing PI\n");
5979 #endif
5980 			htmlParsePI(ctxt);
5981 			ctxt->instate = XML_PARSER_CONTENT;
5982 		    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5983 			goto done;
5984 		    } else if ((cur == '<') && (next == '/')) {
5985 			ctxt->instate = XML_PARSER_END_TAG;
5986 			ctxt->checkIndex = 0;
5987 #ifdef DEBUG_PUSH
5988 			xmlGenericError(xmlGenericErrorContext,
5989 				"HPP: entering END_TAG\n");
5990 #endif
5991 			break;
5992 		    } else if (cur == '<') {
5993                         if ((!terminate) && (next == 0))
5994                             goto done;
5995                         /*
5996                          * Only switch to START_TAG if the next character
5997                          * starts a valid name. Otherwise, htmlParseStartTag
5998                          * might return without consuming all characters
5999                          * up to the final '>'.
6000                          */
6001                         if ((IS_ASCII_LETTER(next)) ||
6002                             (next == '_') || (next == ':') || (next == '.')) {
6003                             ctxt->instate = XML_PARSER_START_TAG;
6004                             ctxt->checkIndex = 0;
6005 #ifdef DEBUG_PUSH
6006                             xmlGenericError(xmlGenericErrorContext,
6007                                     "HPP: entering START_TAG\n");
6008 #endif
6009                         } else {
6010                             htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
6011                                          "htmlParseTryOrFinish: "
6012                                          "invalid element name\n",
6013                                          NULL, NULL);
6014                             htmlCheckParagraph(ctxt);
6015                             if ((ctxt->sax != NULL) &&
6016                                 (ctxt->sax->characters != NULL))
6017                                 ctxt->sax->characters(ctxt->userData,
6018                                                       in->cur, 1);
6019                             NEXT;
6020                         }
6021 			break;
6022 		    } else {
6023 		        /*
6024 			 * check that the text sequence is complete
6025 			 * before handing out the data to the parser
6026 			 * to avoid problems with erroneous end of
6027 			 * data detection.
6028 			 */
6029 			if ((!terminate) &&
6030                             (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
6031 			    goto done;
6032 			ctxt->checkIndex = 0;
6033 #ifdef DEBUG_PUSH
6034 			xmlGenericError(xmlGenericErrorContext,
6035 				"HPP: Parsing char data\n");
6036 #endif
6037                         while ((ctxt->instate != XML_PARSER_EOF) &&
6038                                (cur != '<') && (in->cur < in->end)) {
6039                             if (cur == '&') {
6040 			        htmlParseReference(ctxt);
6041                             } else {
6042 			        htmlParseCharData(ctxt);
6043                             }
6044                             cur = in->cur[0];
6045                         }
6046 		    }
6047 		}
6048 
6049 		break;
6050 	    }
6051             case XML_PARSER_END_TAG:
6052 		if (avail < 2)
6053 		    goto done;
6054 		if ((!terminate) &&
6055 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6056 		    goto done;
6057 		htmlParseEndTag(ctxt);
6058 		if (ctxt->nameNr == 0) {
6059 		    ctxt->instate = XML_PARSER_EPILOG;
6060 		} else {
6061 		    ctxt->instate = XML_PARSER_CONTENT;
6062 		}
6063 		ctxt->checkIndex = 0;
6064 #ifdef DEBUG_PUSH
6065 		xmlGenericError(xmlGenericErrorContext,
6066 			"HPP: entering CONTENT\n");
6067 #endif
6068 	        break;
6069             case XML_PARSER_CDATA_SECTION:
6070 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6071 			"HPP: internal error, state == CDATA\n",
6072 			     NULL, NULL);
6073 		ctxt->instate = XML_PARSER_CONTENT;
6074 		ctxt->checkIndex = 0;
6075 #ifdef DEBUG_PUSH
6076 		xmlGenericError(xmlGenericErrorContext,
6077 			"HPP: entering CONTENT\n");
6078 #endif
6079 		break;
6080             case XML_PARSER_DTD:
6081 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6082 			"HPP: internal error, state == DTD\n",
6083 			     NULL, NULL);
6084 		ctxt->instate = XML_PARSER_CONTENT;
6085 		ctxt->checkIndex = 0;
6086 #ifdef DEBUG_PUSH
6087 		xmlGenericError(xmlGenericErrorContext,
6088 			"HPP: entering CONTENT\n");
6089 #endif
6090 		break;
6091             case XML_PARSER_COMMENT:
6092 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6093 			"HPP: internal error, state == COMMENT\n",
6094 			     NULL, NULL);
6095 		ctxt->instate = XML_PARSER_CONTENT;
6096 		ctxt->checkIndex = 0;
6097 #ifdef DEBUG_PUSH
6098 		xmlGenericError(xmlGenericErrorContext,
6099 			"HPP: entering CONTENT\n");
6100 #endif
6101 		break;
6102             case XML_PARSER_PI:
6103 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6104 			"HPP: internal error, state == PI\n",
6105 			     NULL, NULL);
6106 		ctxt->instate = XML_PARSER_CONTENT;
6107 		ctxt->checkIndex = 0;
6108 #ifdef DEBUG_PUSH
6109 		xmlGenericError(xmlGenericErrorContext,
6110 			"HPP: entering CONTENT\n");
6111 #endif
6112 		break;
6113             case XML_PARSER_ENTITY_DECL:
6114 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6115 			"HPP: internal error, state == ENTITY_DECL\n",
6116 			     NULL, NULL);
6117 		ctxt->instate = XML_PARSER_CONTENT;
6118 		ctxt->checkIndex = 0;
6119 #ifdef DEBUG_PUSH
6120 		xmlGenericError(xmlGenericErrorContext,
6121 			"HPP: entering CONTENT\n");
6122 #endif
6123 		break;
6124             case XML_PARSER_ENTITY_VALUE:
6125 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6126 			"HPP: internal error, state == ENTITY_VALUE\n",
6127 			     NULL, NULL);
6128 		ctxt->instate = XML_PARSER_CONTENT;
6129 		ctxt->checkIndex = 0;
6130 #ifdef DEBUG_PUSH
6131 		xmlGenericError(xmlGenericErrorContext,
6132 			"HPP: entering DTD\n");
6133 #endif
6134 		break;
6135             case XML_PARSER_ATTRIBUTE_VALUE:
6136 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6137 			"HPP: internal error, state == ATTRIBUTE_VALUE\n",
6138 			     NULL, NULL);
6139 		ctxt->instate = XML_PARSER_START_TAG;
6140 		ctxt->checkIndex = 0;
6141 #ifdef DEBUG_PUSH
6142 		xmlGenericError(xmlGenericErrorContext,
6143 			"HPP: entering START_TAG\n");
6144 #endif
6145 		break;
6146 	    case XML_PARSER_SYSTEM_LITERAL:
6147 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6148 		    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6149 			     NULL, NULL);
6150 		ctxt->instate = XML_PARSER_CONTENT;
6151 		ctxt->checkIndex = 0;
6152 #ifdef DEBUG_PUSH
6153 		xmlGenericError(xmlGenericErrorContext,
6154 			"HPP: entering CONTENT\n");
6155 #endif
6156 		break;
6157 	    case XML_PARSER_IGNORE:
6158 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6159 			"HPP: internal error, state == XML_PARSER_IGNORE\n",
6160 			     NULL, NULL);
6161 		ctxt->instate = XML_PARSER_CONTENT;
6162 		ctxt->checkIndex = 0;
6163 #ifdef DEBUG_PUSH
6164 		xmlGenericError(xmlGenericErrorContext,
6165 			"HPP: entering CONTENT\n");
6166 #endif
6167 		break;
6168 	    case XML_PARSER_PUBLIC_LITERAL:
6169 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6170 			"HPP: internal error, state == XML_PARSER_LITERAL\n",
6171 			     NULL, NULL);
6172 		ctxt->instate = XML_PARSER_CONTENT;
6173 		ctxt->checkIndex = 0;
6174 #ifdef DEBUG_PUSH
6175 		xmlGenericError(xmlGenericErrorContext,
6176 			"HPP: entering CONTENT\n");
6177 #endif
6178 		break;
6179 
6180 	}
6181     }
6182 done:
6183     if ((avail == 0) && (terminate)) {
6184 	htmlAutoCloseOnEnd(ctxt);
6185 	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6186 	    /*
6187 	     * SAX: end of the document processing.
6188 	     */
6189 	    ctxt->instate = XML_PARSER_EOF;
6190 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6191 		ctxt->sax->endDocument(ctxt->userData);
6192 	}
6193     }
6194     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6195 	((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6196 	 (ctxt->instate == XML_PARSER_EPILOG))) {
6197 	xmlDtdPtr dtd;
6198 	dtd = xmlGetIntSubset(ctxt->myDoc);
6199 	if (dtd == NULL)
6200 	    ctxt->myDoc->intSubset =
6201 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6202 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6203 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6204     }
6205 #ifdef DEBUG_PUSH
6206     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6207 #endif
6208     return(ret);
6209 }
6210 
6211 /**
6212  * htmlParseChunk:
6213  * @ctxt:  an HTML parser context
6214  * @chunk:  an char array
6215  * @size:  the size in byte of the chunk
6216  * @terminate:  last chunk indicator
6217  *
6218  * Parse a Chunk of memory
6219  *
6220  * Returns zero if no error, the xmlParserErrors otherwise.
6221  */
6222 int
htmlParseChunk(htmlParserCtxtPtr ctxt,const char * chunk,int size,int terminate)6223 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6224               int terminate) {
6225     if ((ctxt == NULL) || (ctxt->input == NULL)) {
6226 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6227 		     "htmlParseChunk: context error\n", NULL, NULL);
6228 	return(XML_ERR_INTERNAL_ERROR);
6229     }
6230     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6231         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
6232 	size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6233 	size_t cur = ctxt->input->cur - ctxt->input->base;
6234 	int res;
6235 
6236 	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6237         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6238 	if (res < 0) {
6239 	    ctxt->errNo = XML_PARSER_EOF;
6240 	    ctxt->disableSAX = 1;
6241 	    return (XML_PARSER_EOF);
6242 	}
6243 #ifdef DEBUG_PUSH
6244 	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6245 #endif
6246 
6247 #if 0
6248 	if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6249 	    htmlParseTryOrFinish(ctxt, terminate);
6250 #endif
6251     } else if (ctxt->instate != XML_PARSER_EOF) {
6252 	if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6253 	    xmlParserInputBufferPtr in = ctxt->input->buf;
6254 	    if ((in->encoder != NULL) && (in->buffer != NULL) &&
6255 		    (in->raw != NULL)) {
6256 		int nbchars;
6257 		size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6258 		size_t current = ctxt->input->cur - ctxt->input->base;
6259 
6260 		nbchars = xmlCharEncInput(in, terminate);
6261 		xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6262 		if (nbchars < 0) {
6263 		    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6264 			         "encoder error\n", NULL, NULL);
6265 		    return(XML_ERR_INVALID_ENCODING);
6266 		}
6267 	    }
6268 	}
6269     }
6270     htmlParseTryOrFinish(ctxt, terminate);
6271     if (terminate) {
6272 	if ((ctxt->instate != XML_PARSER_EOF) &&
6273 	    (ctxt->instate != XML_PARSER_EPILOG) &&
6274 	    (ctxt->instate != XML_PARSER_MISC)) {
6275 	    ctxt->errNo = XML_ERR_DOCUMENT_END;
6276 	    ctxt->wellFormed = 0;
6277 	}
6278 	if (ctxt->instate != XML_PARSER_EOF) {
6279 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6280 		ctxt->sax->endDocument(ctxt->userData);
6281 	}
6282 	ctxt->instate = XML_PARSER_EOF;
6283     }
6284     return((xmlParserErrors) ctxt->errNo);
6285 }
6286 
6287 /************************************************************************
6288  *									*
6289  *			User entry points				*
6290  *									*
6291  ************************************************************************/
6292 
6293 /**
6294  * htmlCreatePushParserCtxt:
6295  * @sax:  a SAX handler
6296  * @user_data:  The user data returned on SAX callbacks
6297  * @chunk:  a pointer to an array of chars
6298  * @size:  number of chars in the array
6299  * @filename:  an optional file name or URI
6300  * @enc:  an optional encoding
6301  *
6302  * Create a parser context for using the HTML parser in push mode
6303  * The value of @filename is used for fetching external entities
6304  * and error/warning reports.
6305  *
6306  * Returns the new parser context or NULL
6307  */
6308 htmlParserCtxtPtr
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,void * user_data,const char * chunk,int size,const char * filename,xmlCharEncoding enc)6309 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6310                          const char *chunk, int size, const char *filename,
6311 			 xmlCharEncoding enc) {
6312     htmlParserCtxtPtr ctxt;
6313     htmlParserInputPtr inputStream;
6314     xmlParserInputBufferPtr buf;
6315 
6316     xmlInitParser();
6317 
6318     buf = xmlAllocParserInputBuffer(enc);
6319     if (buf == NULL) return(NULL);
6320 
6321     ctxt = htmlNewParserCtxt();
6322     if (ctxt == NULL) {
6323 	xmlFreeParserInputBuffer(buf);
6324 	return(NULL);
6325     }
6326     if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6327 	ctxt->charset=XML_CHAR_ENCODING_UTF8;
6328     if (sax != NULL) {
6329 	if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6330 	    xmlFree(ctxt->sax);
6331 	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6332 	if (ctxt->sax == NULL) {
6333 	    xmlFree(buf);
6334 	    xmlFree(ctxt);
6335 	    return(NULL);
6336 	}
6337 	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6338 	if (user_data != NULL)
6339 	    ctxt->userData = user_data;
6340     }
6341     if (filename == NULL) {
6342 	ctxt->directory = NULL;
6343     } else {
6344         ctxt->directory = xmlParserGetDirectory(filename);
6345     }
6346 
6347     inputStream = htmlNewInputStream(ctxt);
6348     if (inputStream == NULL) {
6349 	xmlFreeParserCtxt(ctxt);
6350 	xmlFree(buf);
6351 	return(NULL);
6352     }
6353 
6354     if (filename == NULL)
6355 	inputStream->filename = NULL;
6356     else
6357 	inputStream->filename = (char *)
6358 	    xmlCanonicPath((const xmlChar *) filename);
6359     inputStream->buf = buf;
6360     xmlBufResetInput(buf->buffer, inputStream);
6361 
6362     inputPush(ctxt, inputStream);
6363 
6364     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6365         (ctxt->input->buf != NULL))  {
6366 	size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6367 	size_t cur = ctxt->input->cur - ctxt->input->base;
6368 
6369 	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6370 
6371         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6372 #ifdef DEBUG_PUSH
6373 	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6374 #endif
6375     }
6376     ctxt->progressive = 1;
6377 
6378     return(ctxt);
6379 }
6380 #endif /* LIBXML_PUSH_ENABLED */
6381 
6382 /**
6383  * htmlSAXParseDoc:
6384  * @cur:  a pointer to an array of xmlChar
6385  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6386  * @sax:  the SAX handler block
6387  * @userData: if using SAX, this pointer will be provided on callbacks.
6388  *
6389  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6390  * to handle parse events. If sax is NULL, fallback to the default DOM
6391  * behavior and return a tree.
6392  *
6393  * Returns the resulting document tree unless SAX is NULL or the document is
6394  *     not well formed.
6395  */
6396 
6397 htmlDocPtr
htmlSAXParseDoc(const xmlChar * cur,const char * encoding,htmlSAXHandlerPtr sax,void * userData)6398 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6399                 htmlSAXHandlerPtr sax, void *userData) {
6400     htmlDocPtr ret;
6401     htmlParserCtxtPtr ctxt;
6402 
6403     xmlInitParser();
6404 
6405     if (cur == NULL) return(NULL);
6406 
6407 
6408     ctxt = htmlCreateDocParserCtxt(cur, encoding);
6409     if (ctxt == NULL) return(NULL);
6410     if (sax != NULL) {
6411         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6412         ctxt->sax = sax;
6413         ctxt->userData = userData;
6414     }
6415 
6416     htmlParseDocument(ctxt);
6417     ret = ctxt->myDoc;
6418     if (sax != NULL) {
6419 	ctxt->sax = NULL;
6420 	ctxt->userData = NULL;
6421     }
6422     htmlFreeParserCtxt(ctxt);
6423 
6424     return(ret);
6425 }
6426 
6427 /**
6428  * htmlParseDoc:
6429  * @cur:  a pointer to an array of xmlChar
6430  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6431  *
6432  * parse an HTML in-memory document and build a tree.
6433  *
6434  * Returns the resulting document tree
6435  */
6436 
6437 htmlDocPtr
htmlParseDoc(const xmlChar * cur,const char * encoding)6438 htmlParseDoc(const xmlChar *cur, const char *encoding) {
6439     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6440 }
6441 
6442 
6443 /**
6444  * htmlCreateFileParserCtxt:
6445  * @filename:  the filename
6446  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6447  *
6448  * Create a parser context for a file content.
6449  * Automatic support for ZLIB/Compress compressed document is provided
6450  * by default if found at compile-time.
6451  *
6452  * Returns the new parser context or NULL
6453  */
6454 htmlParserCtxtPtr
htmlCreateFileParserCtxt(const char * filename,const char * encoding)6455 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6456 {
6457     htmlParserCtxtPtr ctxt;
6458     htmlParserInputPtr inputStream;
6459     char *canonicFilename;
6460     /* htmlCharEncoding enc; */
6461     xmlChar *content, *content_line = (xmlChar *) "charset=";
6462 
6463     if (filename == NULL)
6464         return(NULL);
6465 
6466     ctxt = htmlNewParserCtxt();
6467     if (ctxt == NULL) {
6468 	return(NULL);
6469     }
6470     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6471     if (canonicFilename == NULL) {
6472 #ifdef LIBXML_SAX1_ENABLED
6473 	if (xmlDefaultSAXHandler.error != NULL) {
6474 	    xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6475 	}
6476 #endif
6477 	xmlFreeParserCtxt(ctxt);
6478 	return(NULL);
6479     }
6480 
6481     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6482     xmlFree(canonicFilename);
6483     if (inputStream == NULL) {
6484 	xmlFreeParserCtxt(ctxt);
6485 	return(NULL);
6486     }
6487 
6488     inputPush(ctxt, inputStream);
6489 
6490     /* set encoding */
6491     if (encoding) {
6492         size_t l = strlen(encoding);
6493 
6494 	if (l < 1000) {
6495 	    content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6496 	    if (content) {
6497 		strcpy ((char *)content, (char *)content_line);
6498 		strcat ((char *)content, (char *)encoding);
6499 		htmlCheckEncoding (ctxt, content);
6500 		xmlFree (content);
6501 	    }
6502 	}
6503     }
6504 
6505     return(ctxt);
6506 }
6507 
6508 /**
6509  * htmlSAXParseFile:
6510  * @filename:  the filename
6511  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6512  * @sax:  the SAX handler block
6513  * @userData: if using SAX, this pointer will be provided on callbacks.
6514  *
6515  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6516  * compressed document is provided by default if found at compile-time.
6517  * It use the given SAX function block to handle the parsing callback.
6518  * If sax is NULL, fallback to the default DOM tree building routines.
6519  *
6520  * Returns the resulting document tree unless SAX is NULL or the document is
6521  *     not well formed.
6522  */
6523 
6524 htmlDocPtr
htmlSAXParseFile(const char * filename,const char * encoding,htmlSAXHandlerPtr sax,void * userData)6525 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6526                  void *userData) {
6527     htmlDocPtr ret;
6528     htmlParserCtxtPtr ctxt;
6529     htmlSAXHandlerPtr oldsax = NULL;
6530 
6531     xmlInitParser();
6532 
6533     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6534     if (ctxt == NULL) return(NULL);
6535     if (sax != NULL) {
6536 	oldsax = ctxt->sax;
6537         ctxt->sax = sax;
6538         ctxt->userData = userData;
6539     }
6540 
6541     htmlParseDocument(ctxt);
6542 
6543     ret = ctxt->myDoc;
6544     if (sax != NULL) {
6545         ctxt->sax = oldsax;
6546         ctxt->userData = NULL;
6547     }
6548     htmlFreeParserCtxt(ctxt);
6549 
6550     return(ret);
6551 }
6552 
6553 /**
6554  * htmlParseFile:
6555  * @filename:  the filename
6556  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6557  *
6558  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6559  * compressed document is provided by default if found at compile-time.
6560  *
6561  * Returns the resulting document tree
6562  */
6563 
6564 htmlDocPtr
htmlParseFile(const char * filename,const char * encoding)6565 htmlParseFile(const char *filename, const char *encoding) {
6566     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6567 }
6568 
6569 /**
6570  * htmlHandleOmittedElem:
6571  * @val:  int 0 or 1
6572  *
6573  * Set and return the previous value for handling HTML omitted tags.
6574  *
6575  * Returns the last value for 0 for no handling, 1 for auto insertion.
6576  */
6577 
6578 int
htmlHandleOmittedElem(int val)6579 htmlHandleOmittedElem(int val) {
6580     int old = htmlOmittedDefaultValue;
6581 
6582     htmlOmittedDefaultValue = val;
6583     return(old);
6584 }
6585 
6586 /**
6587  * htmlElementAllowedHere:
6588  * @parent: HTML parent element
6589  * @elt: HTML element
6590  *
6591  * Checks whether an HTML element may be a direct child of a parent element.
6592  * Note - doesn't check for deprecated elements
6593  *
6594  * Returns 1 if allowed; 0 otherwise.
6595  */
6596 int
htmlElementAllowedHere(const htmlElemDesc * parent,const xmlChar * elt)6597 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6598   const char** p ;
6599 
6600   if ( ! elt || ! parent || ! parent->subelts )
6601 	return 0 ;
6602 
6603   for ( p = parent->subelts; *p; ++p )
6604     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6605       return 1 ;
6606 
6607   return 0 ;
6608 }
6609 /**
6610  * htmlElementStatusHere:
6611  * @parent: HTML parent element
6612  * @elt: HTML element
6613  *
6614  * Checks whether an HTML element may be a direct child of a parent element.
6615  * and if so whether it is valid or deprecated.
6616  *
6617  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6618  */
6619 htmlStatus
htmlElementStatusHere(const htmlElemDesc * parent,const htmlElemDesc * elt)6620 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6621   if ( ! parent || ! elt )
6622     return HTML_INVALID ;
6623   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6624     return HTML_INVALID ;
6625 
6626   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6627 }
6628 /**
6629  * htmlAttrAllowed:
6630  * @elt: HTML element
6631  * @attr: HTML attribute
6632  * @legacy: whether to allow deprecated attributes
6633  *
6634  * Checks whether an attribute is valid for an element
6635  * Has full knowledge of Required and Deprecated attributes
6636  *
6637  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6638  */
6639 htmlStatus
htmlAttrAllowed(const htmlElemDesc * elt,const xmlChar * attr,int legacy)6640 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6641   const char** p ;
6642 
6643   if ( !elt || ! attr )
6644 	return HTML_INVALID ;
6645 
6646   if ( elt->attrs_req )
6647     for ( p = elt->attrs_req; *p; ++p)
6648       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6649         return HTML_REQUIRED ;
6650 
6651   if ( elt->attrs_opt )
6652     for ( p = elt->attrs_opt; *p; ++p)
6653       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6654         return HTML_VALID ;
6655 
6656   if ( legacy && elt->attrs_depr )
6657     for ( p = elt->attrs_depr; *p; ++p)
6658       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6659         return HTML_DEPRECATED ;
6660 
6661   return HTML_INVALID ;
6662 }
6663 /**
6664  * htmlNodeStatus:
6665  * @node: an htmlNodePtr in a tree
6666  * @legacy: whether to allow deprecated elements (YES is faster here
6667  *	for Element nodes)
6668  *
6669  * Checks whether the tree node is valid.  Experimental (the author
6670  *     only uses the HTML enhancements in a SAX parser)
6671  *
6672  * Return: for Element nodes, a return from htmlElementAllowedHere (if
6673  *	legacy allowed) or htmlElementStatusHere (otherwise).
6674  *	for Attribute nodes, a return from htmlAttrAllowed
6675  *	for other nodes, HTML_NA (no checks performed)
6676  */
6677 htmlStatus
htmlNodeStatus(const htmlNodePtr node,int legacy)6678 htmlNodeStatus(const htmlNodePtr node, int legacy) {
6679   if ( ! node )
6680     return HTML_INVALID ;
6681 
6682   switch ( node->type ) {
6683     case XML_ELEMENT_NODE:
6684       return legacy
6685 	? ( htmlElementAllowedHere (
6686 		htmlTagLookup(node->parent->name) , node->name
6687 		) ? HTML_VALID : HTML_INVALID )
6688 	: htmlElementStatusHere(
6689 		htmlTagLookup(node->parent->name) ,
6690 		htmlTagLookup(node->name) )
6691 	;
6692     case XML_ATTRIBUTE_NODE:
6693       return htmlAttrAllowed(
6694 	htmlTagLookup(node->parent->name) , node->name, legacy) ;
6695     default: return HTML_NA ;
6696   }
6697 }
6698 /************************************************************************
6699  *									*
6700  *	New set (2.6.0) of simpler and more flexible APIs		*
6701  *									*
6702  ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is used. Nothing happens when @str is NULL.
 *
 * The body is wrapped in do { } while (0) so the macro expands to a single
 * statement and can be used safely inside if/else; terminate each use
 * with a semicolon.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
6714 
6715 /**
6716  * htmlCtxtReset:
6717  * @ctxt: an HTML parser context
6718  *
6719  * Reset a parser context
6720  */
6721 void
htmlCtxtReset(htmlParserCtxtPtr ctxt)6722 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6723 {
6724     xmlParserInputPtr input;
6725     xmlDictPtr dict;
6726 
6727     if (ctxt == NULL)
6728         return;
6729 
6730     xmlInitParser();
6731     dict = ctxt->dict;
6732 
6733     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6734         xmlFreeInputStream(input);
6735     }
6736     ctxt->inputNr = 0;
6737     ctxt->input = NULL;
6738 
6739     ctxt->spaceNr = 0;
6740     if (ctxt->spaceTab != NULL) {
6741 	ctxt->spaceTab[0] = -1;
6742 	ctxt->space = &ctxt->spaceTab[0];
6743     } else {
6744 	ctxt->space = NULL;
6745     }
6746 
6747 
6748     ctxt->nodeNr = 0;
6749     ctxt->node = NULL;
6750 
6751     ctxt->nameNr = 0;
6752     ctxt->name = NULL;
6753 
6754     DICT_FREE(ctxt->version);
6755     ctxt->version = NULL;
6756     DICT_FREE(ctxt->encoding);
6757     ctxt->encoding = NULL;
6758     DICT_FREE(ctxt->directory);
6759     ctxt->directory = NULL;
6760     DICT_FREE(ctxt->extSubURI);
6761     ctxt->extSubURI = NULL;
6762     DICT_FREE(ctxt->extSubSystem);
6763     ctxt->extSubSystem = NULL;
6764     if (ctxt->myDoc != NULL)
6765         xmlFreeDoc(ctxt->myDoc);
6766     ctxt->myDoc = NULL;
6767 
6768     ctxt->standalone = -1;
6769     ctxt->hasExternalSubset = 0;
6770     ctxt->hasPErefs = 0;
6771     ctxt->html = 1;
6772     ctxt->external = 0;
6773     ctxt->instate = XML_PARSER_START;
6774     ctxt->token = 0;
6775 
6776     ctxt->wellFormed = 1;
6777     ctxt->nsWellFormed = 1;
6778     ctxt->disableSAX = 0;
6779     ctxt->valid = 1;
6780     ctxt->vctxt.userData = ctxt;
6781     ctxt->vctxt.error = xmlParserValidityError;
6782     ctxt->vctxt.warning = xmlParserValidityWarning;
6783     ctxt->record_info = 0;
6784     ctxt->checkIndex = 0;
6785     ctxt->inSubset = 0;
6786     ctxt->errNo = XML_ERR_OK;
6787     ctxt->depth = 0;
6788     ctxt->charset = XML_CHAR_ENCODING_NONE;
6789     ctxt->catalogs = NULL;
6790     xmlInitNodeInfoSeq(&ctxt->node_seq);
6791 
6792     if (ctxt->attsDefault != NULL) {
6793         xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6794         ctxt->attsDefault = NULL;
6795     }
6796     if (ctxt->attsSpecial != NULL) {
6797         xmlHashFree(ctxt->attsSpecial, NULL);
6798         ctxt->attsSpecial = NULL;
6799     }
6800 }
6801 
6802 /**
6803  * htmlCtxtUseOptions:
6804  * @ctxt: an HTML parser context
6805  * @options:  a combination of htmlParserOption(s)
6806  *
6807  * Applies the options to the parser context
6808  *
6809  * Returns 0 in case of success, the set of unknown or unimplemented options
6810  *         in case of error.
6811  */
6812 int
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt,int options)6813 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6814 {
6815     if (ctxt == NULL)
6816         return(-1);
6817 
6818     if (options & HTML_PARSE_NOWARNING) {
6819         ctxt->sax->warning = NULL;
6820         ctxt->vctxt.warning = NULL;
6821         options -= XML_PARSE_NOWARNING;
6822 	ctxt->options |= XML_PARSE_NOWARNING;
6823     }
6824     if (options & HTML_PARSE_NOERROR) {
6825         ctxt->sax->error = NULL;
6826         ctxt->vctxt.error = NULL;
6827         ctxt->sax->fatalError = NULL;
6828         options -= XML_PARSE_NOERROR;
6829 	ctxt->options |= XML_PARSE_NOERROR;
6830     }
6831     if (options & HTML_PARSE_PEDANTIC) {
6832         ctxt->pedantic = 1;
6833         options -= XML_PARSE_PEDANTIC;
6834 	ctxt->options |= XML_PARSE_PEDANTIC;
6835     } else
6836         ctxt->pedantic = 0;
6837     if (options & XML_PARSE_NOBLANKS) {
6838         ctxt->keepBlanks = 0;
6839         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6840         options -= XML_PARSE_NOBLANKS;
6841 	ctxt->options |= XML_PARSE_NOBLANKS;
6842     } else
6843         ctxt->keepBlanks = 1;
6844     if (options & HTML_PARSE_RECOVER) {
6845         ctxt->recovery = 1;
6846 	options -= HTML_PARSE_RECOVER;
6847     } else
6848         ctxt->recovery = 0;
6849     if (options & HTML_PARSE_COMPACT) {
6850 	ctxt->options |= HTML_PARSE_COMPACT;
6851         options -= HTML_PARSE_COMPACT;
6852     }
6853     if (options & XML_PARSE_HUGE) {
6854 	ctxt->options |= XML_PARSE_HUGE;
6855         options -= XML_PARSE_HUGE;
6856     }
6857     if (options & HTML_PARSE_NODEFDTD) {
6858 	ctxt->options |= HTML_PARSE_NODEFDTD;
6859         options -= HTML_PARSE_NODEFDTD;
6860     }
6861     if (options & HTML_PARSE_IGNORE_ENC) {
6862 	ctxt->options |= HTML_PARSE_IGNORE_ENC;
6863         options -= HTML_PARSE_IGNORE_ENC;
6864     }
6865     if (options & HTML_PARSE_NOIMPLIED) {
6866         ctxt->options |= HTML_PARSE_NOIMPLIED;
6867         options -= HTML_PARSE_NOIMPLIED;
6868     }
6869     ctxt->dictNames = 0;
6870     return (options);
6871 }
6872 
6873 /**
6874  * htmlDoRead:
6875  * @ctxt:  an HTML parser context
6876  * @URL:  the base URL to use for the document
6877  * @encoding:  the document encoding, or NULL
6878  * @options:  a combination of htmlParserOption(s)
6879  * @reuse:  keep the context for reuse
6880  *
6881  * Common front-end for the htmlRead functions
6882  *
6883  * Returns the resulting document tree or NULL
6884  */
6885 static htmlDocPtr
htmlDoRead(htmlParserCtxtPtr ctxt,const char * URL,const char * encoding,int options,int reuse)6886 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6887           int options, int reuse)
6888 {
6889     htmlDocPtr ret;
6890 
6891     htmlCtxtUseOptions(ctxt, options);
6892     ctxt->html = 1;
6893     if (encoding != NULL) {
6894         xmlCharEncodingHandlerPtr hdlr;
6895 
6896 	hdlr = xmlFindCharEncodingHandler(encoding);
6897 	if (hdlr != NULL) {
6898 	    xmlSwitchToEncoding(ctxt, hdlr);
6899 	    if (ctxt->input->encoding != NULL)
6900 	      xmlFree((xmlChar *) ctxt->input->encoding);
6901             ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6902         }
6903     }
6904     if ((URL != NULL) && (ctxt->input != NULL) &&
6905         (ctxt->input->filename == NULL))
6906         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6907     htmlParseDocument(ctxt);
6908     ret = ctxt->myDoc;
6909     ctxt->myDoc = NULL;
6910     if (!reuse) {
6911         if ((ctxt->dictNames) &&
6912 	    (ret != NULL) &&
6913 	    (ret->dict == ctxt->dict))
6914 	    ctxt->dict = NULL;
6915 	xmlFreeParserCtxt(ctxt);
6916     }
6917     return (ret);
6918 }
6919 
6920 /**
6921  * htmlReadDoc:
6922  * @cur:  a pointer to a zero terminated string
6923  * @URL:  the base URL to use for the document
6924  * @encoding:  the document encoding, or NULL
6925  * @options:  a combination of htmlParserOption(s)
6926  *
6927  * parse an XML in-memory document and build a tree.
6928  *
6929  * Returns the resulting document tree
6930  */
6931 htmlDocPtr
htmlReadDoc(const xmlChar * cur,const char * URL,const char * encoding,int options)6932 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6933 {
6934     htmlParserCtxtPtr ctxt;
6935 
6936     if (cur == NULL)
6937         return (NULL);
6938 
6939     xmlInitParser();
6940     ctxt = htmlCreateDocParserCtxt(cur, NULL);
6941     if (ctxt == NULL)
6942         return (NULL);
6943     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6944 }
6945 
6946 /**
6947  * htmlReadFile:
6948  * @filename:  a file or URL
6949  * @encoding:  the document encoding, or NULL
6950  * @options:  a combination of htmlParserOption(s)
6951  *
6952  * parse an XML file from the filesystem or the network.
6953  *
6954  * Returns the resulting document tree
6955  */
6956 htmlDocPtr
htmlReadFile(const char * filename,const char * encoding,int options)6957 htmlReadFile(const char *filename, const char *encoding, int options)
6958 {
6959     htmlParserCtxtPtr ctxt;
6960 
6961     xmlInitParser();
6962     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6963     if (ctxt == NULL)
6964         return (NULL);
6965     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6966 }
6967 
6968 /**
6969  * htmlReadMemory:
6970  * @buffer:  a pointer to a char array
6971  * @size:  the size of the array
6972  * @URL:  the base URL to use for the document
6973  * @encoding:  the document encoding, or NULL
6974  * @options:  a combination of htmlParserOption(s)
6975  *
6976  * parse an XML in-memory document and build a tree.
6977  *
6978  * Returns the resulting document tree
6979  */
6980 htmlDocPtr
htmlReadMemory(const char * buffer,int size,const char * URL,const char * encoding,int options)6981 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6982 {
6983     htmlParserCtxtPtr ctxt;
6984 
6985     xmlInitParser();
6986     ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6987     if (ctxt == NULL)
6988         return (NULL);
6989     htmlDefaultSAXHandlerInit();
6990     if (ctxt->sax != NULL)
6991         memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6992     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6993 }
6994 
6995 /**
6996  * htmlReadFd:
6997  * @fd:  an open file descriptor
6998  * @URL:  the base URL to use for the document
6999  * @encoding:  the document encoding, or NULL
7000  * @options:  a combination of htmlParserOption(s)
7001  *
7002  * parse an XML from a file descriptor and build a tree.
7003  *
7004  * Returns the resulting document tree
7005  */
7006 htmlDocPtr
htmlReadFd(int fd,const char * URL,const char * encoding,int options)7007 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
7008 {
7009     htmlParserCtxtPtr ctxt;
7010     xmlParserInputBufferPtr input;
7011     xmlParserInputPtr stream;
7012 
7013     if (fd < 0)
7014         return (NULL);
7015     xmlInitParser();
7016 
7017     xmlInitParser();
7018     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7019     if (input == NULL)
7020         return (NULL);
7021     ctxt = xmlNewParserCtxt();
7022     if (ctxt == NULL) {
7023         xmlFreeParserInputBuffer(input);
7024         return (NULL);
7025     }
7026     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7027     if (stream == NULL) {
7028         xmlFreeParserInputBuffer(input);
7029 	xmlFreeParserCtxt(ctxt);
7030         return (NULL);
7031     }
7032     inputPush(ctxt, stream);
7033     return (htmlDoRead(ctxt, URL, encoding, options, 0));
7034 }
7035 
7036 /**
7037  * htmlReadIO:
7038  * @ioread:  an I/O read function
7039  * @ioclose:  an I/O close function
7040  * @ioctx:  an I/O handler
7041  * @URL:  the base URL to use for the document
7042  * @encoding:  the document encoding, or NULL
7043  * @options:  a combination of htmlParserOption(s)
7044  *
7045  * parse an HTML document from I/O functions and source and build a tree.
7046  *
7047  * Returns the resulting document tree
7048  */
7049 htmlDocPtr
htmlReadIO(xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)7050 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
7051           void *ioctx, const char *URL, const char *encoding, int options)
7052 {
7053     htmlParserCtxtPtr ctxt;
7054     xmlParserInputBufferPtr input;
7055     xmlParserInputPtr stream;
7056 
7057     if (ioread == NULL)
7058         return (NULL);
7059     xmlInitParser();
7060 
7061     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7062                                          XML_CHAR_ENCODING_NONE);
7063     if (input == NULL) {
7064         if (ioclose != NULL)
7065             ioclose(ioctx);
7066         return (NULL);
7067     }
7068     ctxt = htmlNewParserCtxt();
7069     if (ctxt == NULL) {
7070         xmlFreeParserInputBuffer(input);
7071         return (NULL);
7072     }
7073     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7074     if (stream == NULL) {
7075         xmlFreeParserInputBuffer(input);
7076 	xmlFreeParserCtxt(ctxt);
7077         return (NULL);
7078     }
7079     inputPush(ctxt, stream);
7080     return (htmlDoRead(ctxt, URL, encoding, options, 0));
7081 }
7082 
7083 /**
7084  * htmlCtxtReadDoc:
7085  * @ctxt:  an HTML parser context
7086  * @cur:  a pointer to a zero terminated string
7087  * @URL:  the base URL to use for the document
7088  * @encoding:  the document encoding, or NULL
7089  * @options:  a combination of htmlParserOption(s)
7090  *
7091  * parse an XML in-memory document and build a tree.
7092  * This reuses the existing @ctxt parser context
7093  *
7094  * Returns the resulting document tree
7095  */
7096 htmlDocPtr
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt,const xmlChar * cur,const char * URL,const char * encoding,int options)7097 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
7098                const char *URL, const char *encoding, int options)
7099 {
7100     xmlParserInputPtr stream;
7101 
7102     if (cur == NULL)
7103         return (NULL);
7104     if (ctxt == NULL)
7105         return (NULL);
7106     xmlInitParser();
7107 
7108     htmlCtxtReset(ctxt);
7109 
7110     stream = xmlNewStringInputStream(ctxt, cur);
7111     if (stream == NULL) {
7112         return (NULL);
7113     }
7114     inputPush(ctxt, stream);
7115     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7116 }
7117 
7118 /**
7119  * htmlCtxtReadFile:
7120  * @ctxt:  an HTML parser context
7121  * @filename:  a file or URL
7122  * @encoding:  the document encoding, or NULL
7123  * @options:  a combination of htmlParserOption(s)
7124  *
7125  * parse an XML file from the filesystem or the network.
7126  * This reuses the existing @ctxt parser context
7127  *
7128  * Returns the resulting document tree
7129  */
7130 htmlDocPtr
htmlCtxtReadFile(htmlParserCtxtPtr ctxt,const char * filename,const char * encoding,int options)7131 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7132                 const char *encoding, int options)
7133 {
7134     xmlParserInputPtr stream;
7135 
7136     if (filename == NULL)
7137         return (NULL);
7138     if (ctxt == NULL)
7139         return (NULL);
7140     xmlInitParser();
7141 
7142     htmlCtxtReset(ctxt);
7143 
7144     stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7145     if (stream == NULL) {
7146         return (NULL);
7147     }
7148     inputPush(ctxt, stream);
7149     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7150 }
7151 
7152 /**
7153  * htmlCtxtReadMemory:
7154  * @ctxt:  an HTML parser context
7155  * @buffer:  a pointer to a char array
7156  * @size:  the size of the array
7157  * @URL:  the base URL to use for the document
7158  * @encoding:  the document encoding, or NULL
7159  * @options:  a combination of htmlParserOption(s)
7160  *
7161  * parse an XML in-memory document and build a tree.
7162  * This reuses the existing @ctxt parser context
7163  *
7164  * Returns the resulting document tree
7165  */
7166 htmlDocPtr
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt,const char * buffer,int size,const char * URL,const char * encoding,int options)7167 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7168                   const char *URL, const char *encoding, int options)
7169 {
7170     xmlParserInputBufferPtr input;
7171     xmlParserInputPtr stream;
7172 
7173     if (ctxt == NULL)
7174         return (NULL);
7175     if (buffer == NULL)
7176         return (NULL);
7177     xmlInitParser();
7178 
7179     htmlCtxtReset(ctxt);
7180 
7181     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7182     if (input == NULL) {
7183 	return(NULL);
7184     }
7185 
7186     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7187     if (stream == NULL) {
7188 	xmlFreeParserInputBuffer(input);
7189 	return(NULL);
7190     }
7191 
7192     inputPush(ctxt, stream);
7193     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7194 }
7195 
7196 /**
7197  * htmlCtxtReadFd:
7198  * @ctxt:  an HTML parser context
7199  * @fd:  an open file descriptor
7200  * @URL:  the base URL to use for the document
7201  * @encoding:  the document encoding, or NULL
7202  * @options:  a combination of htmlParserOption(s)
7203  *
7204  * parse an XML from a file descriptor and build a tree.
7205  * This reuses the existing @ctxt parser context
7206  *
7207  * Returns the resulting document tree
7208  */
7209 htmlDocPtr
htmlCtxtReadFd(htmlParserCtxtPtr ctxt,int fd,const char * URL,const char * encoding,int options)7210 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7211               const char *URL, const char *encoding, int options)
7212 {
7213     xmlParserInputBufferPtr input;
7214     xmlParserInputPtr stream;
7215 
7216     if (fd < 0)
7217         return (NULL);
7218     if (ctxt == NULL)
7219         return (NULL);
7220     xmlInitParser();
7221 
7222     htmlCtxtReset(ctxt);
7223 
7224 
7225     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7226     if (input == NULL)
7227         return (NULL);
7228     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7229     if (stream == NULL) {
7230         xmlFreeParserInputBuffer(input);
7231         return (NULL);
7232     }
7233     inputPush(ctxt, stream);
7234     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7235 }
7236 
7237 /**
7238  * htmlCtxtReadIO:
7239  * @ctxt:  an HTML parser context
7240  * @ioread:  an I/O read function
7241  * @ioclose:  an I/O close function
7242  * @ioctx:  an I/O handler
7243  * @URL:  the base URL to use for the document
7244  * @encoding:  the document encoding, or NULL
7245  * @options:  a combination of htmlParserOption(s)
7246  *
7247  * parse an HTML document from I/O functions and source and build a tree.
7248  * This reuses the existing @ctxt parser context
7249  *
7250  * Returns the resulting document tree
7251  */
7252 htmlDocPtr
htmlCtxtReadIO(htmlParserCtxtPtr ctxt,xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)7253 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7254               xmlInputCloseCallback ioclose, void *ioctx,
7255 	      const char *URL,
7256               const char *encoding, int options)
7257 {
7258     xmlParserInputBufferPtr input;
7259     xmlParserInputPtr stream;
7260 
7261     if (ioread == NULL)
7262         return (NULL);
7263     if (ctxt == NULL)
7264         return (NULL);
7265     xmlInitParser();
7266 
7267     htmlCtxtReset(ctxt);
7268 
7269     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7270                                          XML_CHAR_ENCODING_NONE);
7271     if (input == NULL) {
7272         if (ioclose != NULL)
7273             ioclose(ioctx);
7274         return (NULL);
7275     }
7276     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7277     if (stream == NULL) {
7278         xmlFreeParserInputBuffer(input);
7279         return (NULL);
7280     }
7281     inputPush(ctxt, stream);
7282     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7283 }
7284 
7285 #define bottom_HTMLparser
7286 #include "elfgcchack.h"
7287 #endif /* LIBXML_HTML_ENABLED */
7288