• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * HTMLparser.c : an HTML 4.0 non-verifying parser
3  *
4  * See Copyright for the status of this software.
5  *
6  * daniel@veillard.com
7  */
8 
9 #define IN_LIBXML
10 #include "libxml.h"
11 #ifdef LIBXML_HTML_ENABLED
12 
13 #include <string.h>
14 #ifdef HAVE_CTYPE_H
15 #include <ctype.h>
16 #endif
17 #ifdef HAVE_STDLIB_H
18 #include <stdlib.h>
19 #endif
20 #ifdef HAVE_SYS_STAT_H
21 #include <sys/stat.h>
22 #endif
23 #ifdef HAVE_FCNTL_H
24 #include <fcntl.h>
25 #endif
26 #ifdef HAVE_UNISTD_H
27 #include <unistd.h>
28 #endif
29 #ifdef HAVE_ZLIB_H
30 #include <zlib.h>
31 #endif
32 
33 #include <libxml/xmlmemory.h>
34 #include <libxml/tree.h>
35 #include <libxml/parser.h>
36 #include <libxml/parserInternals.h>
37 #include <libxml/xmlerror.h>
38 #include <libxml/HTMLparser.h>
39 #include <libxml/HTMLtree.h>
40 #include <libxml/entities.h>
41 #include <libxml/encoding.h>
42 #include <libxml/valid.h>
43 #include <libxml/xmlIO.h>
44 #include <libxml/globals.h>
45 #include <libxml/uri.h>
46 
47 #define HTML_MAX_NAMELEN 1000
48 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
49 #define HTML_PARSER_BUFFER_SIZE 100
50 
51 /* #define DEBUG */
52 /* #define DEBUG_PUSH */
53 
54 static int htmlOmittedDefaultValue = 1;
55 
56 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 			     xmlChar end, xmlChar  end2, xmlChar end3);
58 static void htmlParseComment(htmlParserCtxtPtr ctxt);
59 
60 /************************************************************************
61  *									*
62  *		Some factorized error routines				*
63  *									*
64  ************************************************************************/
65 
66 /**
67  * htmlErrMemory:
68  * @ctxt:  an HTML parser context
69  * @extra:  extra informations
70  *
71  * Handle a redefinition of attribute error
72  */
73 static void
htmlErrMemory(xmlParserCtxtPtr ctxt,const char * extra)74 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75 {
76     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77         (ctxt->instate == XML_PARSER_EOF))
78 	return;
79     if (ctxt != NULL) {
80         ctxt->errNo = XML_ERR_NO_MEMORY;
81         ctxt->instate = XML_PARSER_EOF;
82         ctxt->disableSAX = 1;
83     }
84     if (extra)
85         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
86                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87                         NULL, NULL, 0, 0,
88                         "Memory allocation failed : %s\n", extra);
89     else
90         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
91                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92                         NULL, NULL, 0, 0, "Memory allocation failed\n");
93 }
94 
95 /**
96  * htmlParseErr:
97  * @ctxt:  an HTML parser context
98  * @error:  the error number
99  * @msg:  the error message
100  * @str1:  string infor
101  * @str2:  string infor
102  *
103  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104  */
105 static void
htmlParseErr(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,const xmlChar * str1,const xmlChar * str2)106 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107              const char *msg, const xmlChar *str1, const xmlChar *str2)
108 {
109     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110         (ctxt->instate == XML_PARSER_EOF))
111 	return;
112     if (ctxt != NULL)
113 	ctxt->errNo = error;
114     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
115                     XML_ERR_ERROR, NULL, 0,
116 		    (const char *) str1, (const char *) str2,
117 		    NULL, 0, 0,
118 		    msg, str1, str2);
119     if (ctxt != NULL)
120 	ctxt->wellFormed = 0;
121 }
122 
123 /**
124  * htmlParseErrInt:
125  * @ctxt:  an HTML parser context
126  * @error:  the error number
127  * @msg:  the error message
128  * @val:  integer info
129  *
130  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131  */
132 static void
htmlParseErrInt(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,int val)133 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134              const char *msg, int val)
135 {
136     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137         (ctxt->instate == XML_PARSER_EOF))
138 	return;
139     if (ctxt != NULL)
140 	ctxt->errNo = error;
141     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
142                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 		    NULL, val, 0, msg, val);
144     if (ctxt != NULL)
145 	ctxt->wellFormed = 0;
146 }
147 
148 /************************************************************************
149  *									*
150  *	Parser stacks related functions and macros		*
151  *									*
152  ************************************************************************/
153 
154 /**
155  * htmlnamePush:
156  * @ctxt:  an HTML parser context
157  * @value:  the element name
158  *
159  * Pushes a new element name on top of the name stack
160  *
161  * Returns 0 in case of error, the index in the stack otherwise
162  */
163 static int
htmlnamePush(htmlParserCtxtPtr ctxt,const xmlChar * value)164 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
165 {
166     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
167         ctxt->html = 3;
168     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
169         ctxt->html = 10;
170     if (ctxt->nameNr >= ctxt->nameMax) {
171         ctxt->nameMax *= 2;
172         ctxt->nameTab = (const xmlChar * *)
173                          xmlRealloc((xmlChar * *)ctxt->nameTab,
174                                     ctxt->nameMax *
175                                     sizeof(ctxt->nameTab[0]));
176         if (ctxt->nameTab == NULL) {
177             htmlErrMemory(ctxt, NULL);
178             return (0);
179         }
180     }
181     ctxt->nameTab[ctxt->nameNr] = value;
182     ctxt->name = value;
183     return (ctxt->nameNr++);
184 }
185 /**
186  * htmlnamePop:
187  * @ctxt: an HTML parser context
188  *
189  * Pops the top element name from the name stack
190  *
191  * Returns the name just removed
192  */
193 static const xmlChar *
htmlnamePop(htmlParserCtxtPtr ctxt)194 htmlnamePop(htmlParserCtxtPtr ctxt)
195 {
196     const xmlChar *ret;
197 
198     if (ctxt->nameNr <= 0)
199         return (NULL);
200     ctxt->nameNr--;
201     if (ctxt->nameNr < 0)
202         return (NULL);
203     if (ctxt->nameNr > 0)
204         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
205     else
206         ctxt->name = NULL;
207     ret = ctxt->nameTab[ctxt->nameNr];
208     ctxt->nameTab[ctxt->nameNr] = NULL;
209     return (ret);
210 }
211 
212 /**
213  * htmlNodeInfoPush:
214  * @ctxt:  an HTML parser context
215  * @value:  the node info
216  *
217  * Pushes a new element name on top of the node info stack
218  *
219  * Returns 0 in case of error, the index in the stack otherwise
220  */
221 static int
htmlNodeInfoPush(htmlParserCtxtPtr ctxt,htmlParserNodeInfo * value)222 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
223 {
224     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
225         if (ctxt->nodeInfoMax == 0)
226                 ctxt->nodeInfoMax = 5;
227         ctxt->nodeInfoMax *= 2;
228         ctxt->nodeInfoTab = (htmlParserNodeInfo *)
229                          xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
230                                     ctxt->nodeInfoMax *
231                                     sizeof(ctxt->nodeInfoTab[0]));
232         if (ctxt->nodeInfoTab == NULL) {
233             htmlErrMemory(ctxt, NULL);
234             return (0);
235         }
236     }
237     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
238     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
239     return (ctxt->nodeInfoNr++);
240 }
241 
242 /**
243  * htmlNodeInfoPop:
244  * @ctxt:  an HTML parser context
245  *
246  * Pops the top element name from the node info stack
247  *
248  * Returns 0 in case of error, the pointer to NodeInfo otherwise
249  */
250 static htmlParserNodeInfo *
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)251 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
252 {
253     if (ctxt->nodeInfoNr <= 0)
254         return (NULL);
255     ctxt->nodeInfoNr--;
256     if (ctxt->nodeInfoNr < 0)
257         return (NULL);
258     if (ctxt->nodeInfoNr > 0)
259         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
260     else
261         ctxt->nodeInfo = NULL;
262     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
263 }
264 
265 /*
266  * Macros for accessing the content. Those should be used only by the parser,
267  * and not exported.
268  *
269  * Dirty macros, i.e. one need to make assumption on the context to use them
270  *
271  *   CUR_PTR return the current pointer to the xmlChar to be parsed.
272  *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
273  *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
274  *           in UNICODE mode. This should be used internally by the parser
275  *           only to compare to ASCII values otherwise it would break when
276  *           running with UTF-8 encoding.
277  *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
278  *           to compare on ASCII based substring.
279  *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
280  *           it should be used only to compare on ASCII based substring.
281  *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
282  *           strings without newlines within the parser.
283  *
284  * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
285  *
286  *   CURRENT Returns the current char value, with the full decoding of
287  *           UTF-8 if we are using this mode. It returns an int.
288  *   NEXT    Skip to the next character, this does the proper decoding
289  *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
290  *   NEXTL(l) Skip the current unicode character of l xmlChars long.
291  *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
292  */
293 
/*
 * All the macros below expect a parser context variable named "ctxt"
 * to be in scope at the point of use.
 */

/* Current input byte converted to upper case. */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance the input by val bytes, updating the char and column counts. */
#define SKIP(val)                                                       \
    ctxt->nbChars += (val),                                             \
    ctxt->input->cur += (val),                                          \
    ctxt->input->col+=(val)

/* Input byte located val positions after the current one. */
#define NXT(val) ctxt->input->cur[(val)]

/* Same as NXT(val), converted to upper case. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* The current input pointer itself. */
#define CUR_PTR ctxt->input->cur

/*
 * Shrink the input buffer once more than 2 * INPUT_CHUNK bytes have been
 * consumed and fewer than 2 * INPUT_CHUNK bytes remain.
 * This expands to a bare "if" statement.
 */
#define SHRINK                                                          \
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&     \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))        \
        xmlParserInputShrink(ctxt->input)

/*
 * Outside of progressive mode, ask for more input when fewer than
 * INPUT_CHUNK bytes remain. This expands to a bare "if" statement.
 */
#define GROW                                                            \
    if ((ctxt->progressive == 0) &&                                     \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))            \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Current input byte as an int. */
#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

/* -1 while a token is pending, the current input byte otherwise. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/*
 * Step over the current character, which is l bytes long: line and
 * column are updated from the first byte, the pending token is cleared
 * and the character count grows by one.
 */
#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++;                                            \
        ctxt->input->col = 1;                                           \
    } else {                                                            \
        ctxt->input->col++;                                             \
    }                                                                   \
    ctxt->token = 0;                                                    \
    ctxt->input->cur += l;                                              \
    ctxt->nbChars++;                                                    \
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

/* Decode the current character, its byte length is stored in l. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

/*
 * Append character v, l bytes long once encoded, to buffer b at index i
 * and advance i. This expands to an unbraced if/else statement.
 */
#define COPY_BUF(l,b,i,v)                                               \
    if (l == 1) b[i++] = (xmlChar) v;                                   \
    else i += xmlCopyChar(l,&b[i],v)
344 
345 /**
346  * htmlFindEncoding:
347  * @the HTML parser context
348  *
349  * Ty to find and encoding in the current data available in the input
350  * buffer this is needed to try to switch to the proper encoding when
351  * one face a character error.
352  * That's an heuristic, since it's operating outside of parsing it could
353  * try to use a meta which had been commented out, that's the reason it
354  * should only be used in case of error, not as a default.
355  *
356  * Returns an encoding string or NULL if not found, the string need to
357  *   be freed
358  */
359 static xmlChar *
htmlFindEncoding(xmlParserCtxtPtr ctxt)360 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
361     const xmlChar *start, *cur, *end;
362 
363     if ((ctxt == NULL) || (ctxt->input == NULL) ||
364         (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
365         (ctxt->input->buf->encoder != NULL))
366         return(NULL);
367     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
368         return(NULL);
369 
370     start = ctxt->input->cur;
371     end = ctxt->input->end;
372     /* we also expect the input buffer to be zero terminated */
373     if (*end != 0)
374         return(NULL);
375 
376     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
377     if (cur == NULL)
378         return(NULL);
379     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
380     if (cur == NULL)
381         return(NULL);
382     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
383     if (cur == NULL)
384         return(NULL);
385     cur += 8;
386     start = cur;
387     while (((*cur >= 'A') && (*cur <= 'Z')) ||
388            ((*cur >= 'a') && (*cur <= 'z')) ||
389            ((*cur >= '0') && (*cur <= '9')) ||
390            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
391            cur++;
392     if (cur == start)
393         return(NULL);
394     return(xmlStrndup(start, cur - start));
395 }
396 
397 /**
398  * htmlCurrentChar:
399  * @ctxt:  the HTML parser context
400  * @len:  pointer to the length of the char read
401  *
402  * The current char value, if using UTF-8 this may actually span multiple
403  * bytes in the input buffer. Implement the end of line normalization:
404  * 2.11 End-of-Line Handling
405  * If the encoding is unspecified, in the case we find an ISO-Latin-1
406  * char, then the encoding converter is plugged in automatically.
407  *
408  * Returns the current char value and its length
409  */
410 
411 static int
htmlCurrentChar(xmlParserCtxtPtr ctxt,int * len)412 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
413     if (ctxt->instate == XML_PARSER_EOF)
414 	return(0);
415 
416     if (ctxt->token != 0) {
417 	*len = 0;
418 	return(ctxt->token);
419     }
420     if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
421 	/*
422 	 * We are supposed to handle UTF8, check it's valid
423 	 * From rfc2044: encoding of the Unicode values on UTF-8:
424 	 *
425 	 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
426 	 * 0000 0000-0000 007F   0xxxxxxx
427 	 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
428 	 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
429 	 *
430 	 * Check for the 0x110000 limit too
431 	 */
432 	const unsigned char *cur = ctxt->input->cur;
433 	unsigned char c;
434 	unsigned int val;
435 
436 	c = *cur;
437 	if (c & 0x80) {
438 	    if (cur[1] == 0) {
439 		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
440                 cur = ctxt->input->cur;
441             }
442 	    if ((cur[1] & 0xc0) != 0x80)
443 		goto encoding_error;
444 	    if ((c & 0xe0) == 0xe0) {
445 
446 		if (cur[2] == 0) {
447 		    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
448                     cur = ctxt->input->cur;
449                 }
450 		if ((cur[2] & 0xc0) != 0x80)
451 		    goto encoding_error;
452 		if ((c & 0xf0) == 0xf0) {
453 		    if (cur[3] == 0) {
454 			xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
455                         cur = ctxt->input->cur;
456                     }
457 		    if (((c & 0xf8) != 0xf0) ||
458 			((cur[3] & 0xc0) != 0x80))
459 			goto encoding_error;
460 		    /* 4-byte code */
461 		    *len = 4;
462 		    val = (cur[0] & 0x7) << 18;
463 		    val |= (cur[1] & 0x3f) << 12;
464 		    val |= (cur[2] & 0x3f) << 6;
465 		    val |= cur[3] & 0x3f;
466 		} else {
467 		  /* 3-byte code */
468 		    *len = 3;
469 		    val = (cur[0] & 0xf) << 12;
470 		    val |= (cur[1] & 0x3f) << 6;
471 		    val |= cur[2] & 0x3f;
472 		}
473 	    } else {
474 	      /* 2-byte code */
475 		*len = 2;
476 		val = (cur[0] & 0x1f) << 6;
477 		val |= cur[1] & 0x3f;
478 	    }
479 	    if (!IS_CHAR(val)) {
480 	        htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
481 				"Char 0x%X out of allowed range\n", val);
482 	    }
483 	    return(val);
484 	} else {
485             if ((*ctxt->input->cur == 0) &&
486                 (ctxt->input->cur < ctxt->input->end)) {
487                     htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
488 				"Char 0x%X out of allowed range\n", 0);
489                 *len = 1;
490                 return(' ');
491             }
492 	    /* 1-byte code */
493 	    *len = 1;
494 	    return((int) *ctxt->input->cur);
495 	}
496     }
497     /*
498      * Assume it's a fixed length encoding (1) with
499      * a compatible encoding for the ASCII set, since
500      * XML constructs only use < 128 chars
501      */
502     *len = 1;
503     if ((int) *ctxt->input->cur < 0x80)
504 	return((int) *ctxt->input->cur);
505 
506     /*
507      * Humm this is bad, do an automatic flow conversion
508      */
509     {
510         xmlChar * guess;
511         xmlCharEncodingHandlerPtr handler;
512 
513         guess = htmlFindEncoding(ctxt);
514         if (guess == NULL) {
515             xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
516         } else {
517             if (ctxt->input->encoding != NULL)
518                 xmlFree((xmlChar *) ctxt->input->encoding);
519             ctxt->input->encoding = guess;
520             handler = xmlFindCharEncodingHandler((const char *) guess);
521             if (handler != NULL) {
522                 xmlSwitchToEncoding(ctxt, handler);
523             } else {
524                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
525                              "Unsupported encoding %s", guess, NULL);
526             }
527         }
528         ctxt->charset = XML_CHAR_ENCODING_UTF8;
529     }
530 
531     return(xmlCurrentChar(ctxt, len));
532 
533 encoding_error:
534     /*
535      * If we detect an UTF8 error that probably mean that the
536      * input encoding didn't get properly advertized in the
537      * declaration header. Report the error and switch the encoding
538      * to ISO-Latin-1 (if you don't like this policy, just declare the
539      * encoding !)
540      */
541     {
542         char buffer[150];
543 
544 	if (ctxt->input->end - ctxt->input->cur >= 4) {
545 	    snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
546 			    ctxt->input->cur[0], ctxt->input->cur[1],
547 			    ctxt->input->cur[2], ctxt->input->cur[3]);
548 	} else {
549 	    snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
550 	}
551 	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
552 		     "Input is not proper UTF-8, indicate encoding !\n",
553 		     BAD_CAST buffer, NULL);
554     }
555 
556     ctxt->charset = XML_CHAR_ENCODING_8859_1;
557     *len = 1;
558     return((int) *ctxt->input->cur);
559 }
560 
561 /**
562  * htmlSkipBlankChars:
563  * @ctxt:  the HTML parser context
564  *
565  * skip all blanks character found at that point in the input streams.
566  *
567  * Returns the number of space chars skipped
568  */
569 
570 static int
htmlSkipBlankChars(xmlParserCtxtPtr ctxt)571 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
572     int res = 0;
573 
574     while (IS_BLANK_CH(*(ctxt->input->cur))) {
575 	if ((*ctxt->input->cur == 0) &&
576 	    (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
577 		xmlPopInput(ctxt);
578 	} else {
579 	    if (*(ctxt->input->cur) == '\n') {
580 		ctxt->input->line++; ctxt->input->col = 1;
581 	    } else ctxt->input->col++;
582 	    ctxt->input->cur++;
583 	    ctxt->nbChars++;
584 	    if (*ctxt->input->cur == 0)
585 		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
586 	}
587 	res++;
588     }
589     return(res);
590 }
591 
592 
593 
594 /************************************************************************
595  *									*
596  *	The list of HTML elements and their properties		*
597  *									*
598  ************************************************************************/
599 
600 /*
601  *  Start Tag: 1 means the start tag can be omitted
602  *  End Tag:   1 means the end tag can be omitted
603  *             2 means it's forbidden (empty elements)
604  *             3 means the tag is stylistic and should be closed easily
605  *  Depr:      this element is deprecated
606  *  DTD:       1 means that this element is valid only in the Loose DTD
607  *             2 means that this element is valid only in the Frameset DTD
608  *
609  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
610 	, subElements , impliedsubelt , Attributes, userdata
611  */
612 
613 /* Definitions and a couple of vars for HTML Elements */
614 
/*
 * Groups of element names. Each group macro expands to a comma separated
 * list of string literals meant to be dropped into an array initializer;
 * the matching NB_ macro is the number of names in that list.
 *
 * The NB_ sums are parenthesized so that they can be used safely inside
 * larger expressions (2 * NB_FLOW, NB_FLOW + 1, ...).
 *
 * PCDATA and MODIFIER expand to nothing.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL terminated lists of the flow and of the inline element names */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
646 
647 
648 /* ... and for HTML Attributes */
649 
/*
 * Groups of attribute names. Each group macro expands to a comma
 * separated list of string literals meant to be dropped into an array
 * initializer; the matching NB_ macro is the number of names in it.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
/* parenthesized so the sum can be used inside larger expressions */
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL terminated lists built from the groups above */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
667 
668 
669 /* Other declarations that should go inline ... */
670 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
671 	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
672 	"tabindex", "onfocus", "onblur", NULL } ;
673 static const char* const target_attr[] = { "target", NULL } ;
674 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
675 static const char* const alt_attr[] = { "alt", NULL } ;
676 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
677 static const char* const href_attrs[] = { "href", NULL } ;
678 static const char* const clear_attrs[] = { "clear", NULL } ;
679 static const char* const inline_p[] = { INLINE, "p", NULL } ;
680 
681 static const char* const flow_param[] = { FLOW, "param", NULL } ;
682 static const char* const applet_attrs[] = { COREATTRS , "codebase",
683 		"archive", "alt", "name", "height", "width", "align",
684 		"hspace", "vspace", NULL } ;
685 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
686 	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
687 static const char* const basefont_attrs[] =
688 	{ "id", "size", "color", "face", NULL } ;
689 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
690 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
691 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
692 static const char* const body_depr[] = { "background", "bgcolor", "text",
693 	"link", "vlink", "alink", NULL } ;
694 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
695 	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
696 
697 
698 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
699 static const char* const col_elt[] = { "col", NULL } ;
700 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
701 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
702 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
703 static const char* const compact_attr[] = { "compact", NULL } ;
704 static const char* const label_attr[] = { "label", NULL } ;
705 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
706 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
707 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
708 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
709 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
710 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
711 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
712 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
713 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
714 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
715 static const char* const version_attr[] = { "version", NULL } ;
716 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
717 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
718 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
719 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
720 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
721 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
722 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
723 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
724 static const char* const align_attr[] = { "align", NULL } ;
725 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
726 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
727 static const char* const name_attr[] = { "name", NULL } ;
728 static const char* const action_attr[] = { "action", NULL } ;
729 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
730 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
731 static const char* const content_attr[] = { "content", NULL } ;
732 static const char* const type_attr[] = { "type", NULL } ;
733 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
734 static const char* const object_contents[] = { FLOW, "param", NULL } ;
735 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
736 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
737 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
738 static const char* const option_elt[] = { "option", NULL } ;
739 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
740 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
741 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
742 static const char* const width_attr[] = { "width", NULL } ;
743 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
744 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
745 static const char* const language_attr[] = { "language", NULL } ;
746 static const char* const select_content[] = { "optgroup", "option", NULL } ;
747 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
748 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
749 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
750 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
751 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
752 static const char* const tr_elt[] = { "tr", NULL } ;
753 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
754 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
755 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
756 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
757 static const char* const tr_contents[] = { "th", "td", NULL } ;
758 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
759 static const char* const li_elt[] = { "li", NULL } ;
760 static const char* const ul_depr[] = { "type", "compact", NULL} ;
761 static const char* const dir_attr[] = { "dir", NULL} ;
762 
763 #define DECL (const char**)
764 
765 static const htmlElemDesc
766 html40ElementTable[] = {
767 { "a",		0, 0, 0, 0, 0, 0, 1, "anchor ",
768 	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
769 },
770 { "abbr",	0, 0, 0, 0, 0, 0, 1, "abbreviated form",
771 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
772 },
773 { "acronym",	0, 0, 0, 0, 0, 0, 1, "",
774 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
775 },
776 { "address",	0, 0, 0, 0, 0, 0, 0, "information on author ",
777 	DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
778 },
779 { "applet",	0, 0, 0, 0, 1, 1, 2, "java applet ",
780 	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
781 },
782 { "area",	0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
783 	EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
784 },
785 { "b",		0, 3, 0, 0, 0, 0, 1, "bold text style",
786 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
787 },
788 { "base",	0, 2, 2, 1, 0, 0, 0, "document base uri ",
789 	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
790 },
791 { "basefont",	0, 2, 2, 1, 1, 1, 1, "base font size " ,
792 	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
793 },
794 { "bdo",	0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
795 	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
796 },
797 { "big",	0, 3, 0, 0, 0, 0, 1, "large text style",
798 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
799 },
800 { "blockquote",	0, 0, 0, 0, 0, 0, 0, "long quotation ",
801 	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
802 },
803 { "body",	1, 1, 0, 0, 0, 0, 0, "document body ",
804 	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
805 },
806 { "br",		0, 2, 2, 1, 0, 0, 1, "forced line break ",
807 	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
808 },
809 { "button",	0, 0, 0, 0, 0, 0, 2, "push button ",
810 	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
811 },
812 { "caption",	0, 0, 0, 0, 0, 0, 0, "table caption ",
813 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
814 },
815 { "center",	0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
816 	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
817 },
818 { "cite",	0, 0, 0, 0, 0, 0, 1, "citation",
819 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
820 },
821 { "code",	0, 0, 0, 0, 0, 0, 1, "computer code fragment",
822 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
823 },
824 { "col",	0, 2, 2, 1, 0, 0, 0, "table column ",
825 	EMPTY , NULL , DECL col_attrs , NULL, NULL
826 },
827 { "colgroup",	0, 1, 0, 0, 0, 0, 0, "table column group ",
828 	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
829 },
830 { "dd",		0, 1, 0, 0, 0, 0, 0, "definition description ",
831 	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
832 },
833 { "del",	0, 0, 0, 0, 0, 0, 2, "deleted text ",
834 	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
835 },
836 { "dfn",	0, 0, 0, 0, 0, 0, 1, "instance definition",
837 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
838 },
839 { "dir",	0, 0, 0, 0, 1, 1, 0, "directory list",
840 	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
841 },
842 { "div",	0, 0, 0, 0, 0, 0, 0, "generic language/style container",
843 	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
844 },
845 { "dl",		0, 0, 0, 0, 0, 0, 0, "definition list ",
846 	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
847 },
848 { "dt",		0, 1, 0, 0, 0, 0, 0, "definition term ",
849 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
850 },
851 { "em",		0, 3, 0, 0, 0, 0, 1, "emphasis",
852 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
853 },
854 { "embed",	0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
855 	EMPTY, NULL, DECL embed_attrs, NULL, NULL
856 },
857 { "fieldset",	0, 0, 0, 0, 0, 0, 0, "form control group ",
858 	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
859 },
860 { "font",	0, 3, 0, 0, 1, 1, 1, "local change to font ",
861 	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
862 },
863 { "form",	0, 0, 0, 0, 0, 0, 0, "interactive form ",
864 	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
865 },
866 { "frame",	0, 2, 2, 1, 0, 2, 0, "subwindow " ,
867 	EMPTY, NULL, NULL, DECL frame_attrs, NULL
868 },
869 { "frameset",	0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
870 	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
871 },
872 { "h1",		0, 0, 0, 0, 0, 0, 0, "heading ",
873 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
874 },
875 { "h2",		0, 0, 0, 0, 0, 0, 0, "heading ",
876 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
877 },
878 { "h3",		0, 0, 0, 0, 0, 0, 0, "heading ",
879 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
880 },
881 { "h4",		0, 0, 0, 0, 0, 0, 0, "heading ",
882 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
883 },
884 { "h5",		0, 0, 0, 0, 0, 0, 0, "heading ",
885 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
886 },
887 { "h6",		0, 0, 0, 0, 0, 0, 0, "heading ",
888 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
889 },
890 { "head",	1, 1, 0, 0, 0, 0, 0, "document head ",
891 	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
892 },
893 { "hr",		0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
894 	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
895 },
896 { "html",	1, 1, 0, 0, 0, 0, 0, "document root element ",
897 	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
898 },
899 { "i",		0, 3, 0, 0, 0, 0, 1, "italic text style",
900 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
901 },
902 { "iframe",	0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
903 	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
904 },
905 { "img",	0, 2, 2, 1, 0, 0, 1, "embedded image ",
906 	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
907 },
908 { "input",	0, 2, 2, 1, 0, 0, 1, "form control ",
909 	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
910 },
911 { "ins",	0, 0, 0, 0, 0, 0, 2, "inserted text",
912 	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
913 },
914 { "isindex",	0, 2, 2, 1, 1, 1, 0, "single line prompt ",
915 	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
916 },
917 { "kbd",	0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
918 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
919 },
920 { "label",	0, 0, 0, 0, 0, 0, 1, "form field label text ",
921 	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
922 },
923 { "legend",	0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
924 	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
925 },
926 { "li",		0, 1, 1, 0, 0, 0, 0, "list item ",
927 	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
928 },
929 { "link",	0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
930 	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
931 },
932 { "map",	0, 0, 0, 0, 0, 0, 2, "client-side image map ",
933 	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
934 },
935 { "menu",	0, 0, 0, 0, 1, 1, 0, "menu list ",
936 	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
937 },
938 { "meta",	0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
939 	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
940 },
941 { "noframes",	0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
942 	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
943 },
944 { "noscript",	0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
945 	DECL html_flow, "div", DECL html_attrs, NULL, NULL
946 },
947 { "object",	0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
948 	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
949 },
950 { "ol",		0, 0, 0, 0, 0, 0, 0, "ordered list ",
951 	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
952 },
953 { "optgroup",	0, 0, 0, 0, 0, 0, 0, "option group ",
954 	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
955 },
956 { "option",	0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
957 	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
958 },
959 { "p",		0, 1, 0, 0, 0, 0, 0, "paragraph ",
960 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
961 },
962 { "param",	0, 2, 2, 1, 0, 0, 0, "named property value ",
963 	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
964 },
965 { "pre",	0, 0, 0, 0, 0, 0, 0, "preformatted text ",
966 	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
967 },
968 { "q",		0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
969 	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
970 },
971 { "s",		0, 3, 0, 0, 1, 1, 1, "strike-through text style",
972 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
973 },
974 { "samp",	0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
975 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
976 },
977 { "script",	0, 0, 0, 0, 0, 0, 2, "script statements ",
978 	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
979 },
980 { "select",	0, 0, 0, 0, 0, 0, 1, "option selector ",
981 	DECL select_content, NULL, DECL select_attrs, NULL, NULL
982 },
983 { "small",	0, 3, 0, 0, 0, 0, 1, "small text style",
984 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
985 },
986 { "span",	0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
987 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
988 },
989 { "strike",	0, 3, 0, 0, 1, 1, 1, "strike-through text",
990 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
991 },
992 { "strong",	0, 3, 0, 0, 0, 0, 1, "strong emphasis",
993 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
994 },
995 { "style",	0, 0, 0, 0, 0, 0, 0, "style info ",
996 	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
997 },
998 { "sub",	0, 3, 0, 0, 0, 0, 1, "subscript",
999 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1000 },
1001 { "sup",	0, 3, 0, 0, 0, 0, 1, "superscript ",
1002 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1003 },
1004 { "table",	0, 0, 0, 0, 0, 0, 0, "",
1005 	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1006 },
1007 { "tbody",	1, 0, 0, 0, 0, 0, 0, "table body ",
1008 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1009 },
1010 { "td",		0, 0, 0, 0, 0, 0, 0, "table data cell",
1011 	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1012 },
1013 { "textarea",	0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1014 	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1015 },
1016 { "tfoot",	0, 1, 0, 0, 0, 0, 0, "table footer ",
1017 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1018 },
1019 { "th",		0, 1, 0, 0, 0, 0, 0, "table header cell",
1020 	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1021 },
1022 { "thead",	0, 1, 0, 0, 0, 0, 0, "table header ",
1023 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1024 },
1025 { "title",	0, 0, 0, 0, 0, 0, 0, "document title ",
1026 	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1027 },
1028 { "tr",		0, 0, 0, 0, 0, 0, 0, "table row ",
1029 	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1030 },
1031 { "tt",		0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1032 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1033 },
1034 { "u",		0, 3, 0, 0, 1, 1, 1, "underlined text style",
1035 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1036 },
1037 { "ul",		0, 0, 0, 0, 0, 0, 0, "unordered list ",
1038 	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1039 },
1040 { "var",	0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1041 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1042 }
1043 };
1044 
1045 /*
1046  * start tags that imply the end of current element
1047  */
1048 static const char * const htmlStartClose[] = {
1049 "form",		"form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
1050 		"dl", "ul", "ol", "menu", "dir", "address", "pre",
1051 		"listing", "xmp", "head", NULL,
1052 "head",		"p", NULL,
1053 "title",	"p", NULL,
1054 "body",		"head", "style", "link", "title", "p", NULL,
1055 "frameset",	"head", "style", "link", "title", "p", NULL,
1056 "li",		"p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
1057 		"pre", "listing", "xmp", "head", "li", NULL,
1058 "hr",		"p", "head", NULL,
1059 "h1",		"p", "head", NULL,
1060 "h2",		"p", "head", NULL,
1061 "h3",		"p", "head", NULL,
1062 "h4",		"p", "head", NULL,
1063 "h5",		"p", "head", NULL,
1064 "h6",		"p", "head", NULL,
1065 "dir",		"p", "head", NULL,
1066 "address",	"p", "head", "ul", NULL,
1067 "pre",		"p", "head", "ul", NULL,
1068 "listing",	"p", "head", NULL,
1069 "xmp",		"p", "head", NULL,
1070 "blockquote",	"p", "head", NULL,
1071 "dl",		"p", "dt", "menu", "dir", "address", "pre", "listing",
1072 		"xmp", "head", NULL,
1073 "dt",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
1074                 "head", "dd", NULL,
1075 "dd",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
1076                 "head", "dt", NULL,
1077 "ul",		"p", "head", "ol", "menu", "dir", "address", "pre",
1078 		"listing", "xmp", NULL,
1079 "ol",		"p", "head", "ul", NULL,
1080 "menu",		"p", "head", "ul", NULL,
1081 "p",		"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
1082 "div",		"p", "head", NULL,
1083 "noscript",	"p", "head", NULL,
1084 "center",	"font", "b", "i", "p", "head", NULL,
1085 "a",		"a", NULL,
1086 "caption",	"p", NULL,
1087 "colgroup",	"caption", "colgroup", "col", "p", NULL,
1088 "col",		"caption", "col", "p", NULL,
1089 "table",	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
1090 		"listing", "xmp", "a", NULL,
1091 "th",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
1092 "td",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
1093 "tr",		"th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
1094 "thead",	"caption", "col", "colgroup", NULL,
1095 "tfoot",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
1096 		"tbody", "p", NULL,
1097 "tbody",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
1098 		"tfoot", "tbody", "p", NULL,
1099 "optgroup",	"option", NULL,
1100 "option",	"option", NULL,
1101 "fieldset",	"legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
1102 		"pre", "listing", "xmp", "a", NULL,
1103 NULL
1104 };
1105 
/*
 * HTML elements which are not supposed to hold CDATA content directly;
 * when character data shows up inside one of them a p element is
 * implied (see htmlCheckParagraph()).  The list is NULL-terminated.
 *
 * TODO: extend this list by reading the HTML SGML DTD on
 *       implied paragraphs
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1118 
/*
 * The list of HTML attributes whose content is of type %Script;
 * (the intrinsic event handlers).
 * NOTE: when adding entries, check htmlIsScriptAttribute(): it rejects
 *       any name that does not start with 'on' before looking here,
 *       and it sizes this array with sizeof, so there is no NULL
 *       terminator.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",	/* was misspelled "onrest", so "onreset" never matched */
    "onchange",
    "onselect"
};
1144 
/*
 * Priorities used by the HTML parser when it meets broken pages with
 * stray end tags.  An end tag may only close open elements whose
 * priority is lower than or equal to its own (see
 * htmlAutoCloseOnClose()).
 */

typedef struct {
    const char *name;	/* element name, NULL in the terminating entry */
    int priority;	/* end tag priority of that element */
} elementPriority;

/*
 * Looked up by htmlGetEndPriority(); the final entry, whose name is
 * NULL, supplies the priority of every element not listed.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },

    { "td",    160 },
    { "th",    160 },

    { "tr",    170 },

    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },

    { "table", 190 },

    { "head",  200 },
    { "body",  200 },

    { "html",  220 },

    { NULL,    100 }	/* Default priority */
};
1172 
/*
 * Index into htmlStartClose: each used slot points at the first string
 * of one record, unused slots are NULL.  Filled once by
 * htmlInitAutoClose(), which then sets the flag below.
 */
static const char **htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
1175 
1176 /************************************************************************
1177  *									*
1178  *	functions to handle HTML specific data			*
1179  *									*
1180  ************************************************************************/
1181 
1182 /**
1183  * htmlInitAutoClose:
1184  *
1185  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1186  * This is not reentrant. Call xmlInitParser() once before processing in
1187  * case of use in multithreaded programs.
1188  */
1189 void
htmlInitAutoClose(void)1190 htmlInitAutoClose(void) {
1191     int indx, i = 0;
1192 
1193     if (htmlStartCloseIndexinitialized) return;
1194 
1195     for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1196     indx = 0;
1197     while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1198         htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1199 	while (htmlStartClose[i] != NULL) i++;
1200 	i++;
1201     }
1202     htmlStartCloseIndexinitialized = 1;
1203 }
1204 
1205 /**
1206  * htmlTagLookup:
1207  * @tag:  The tag name in lowercase
1208  *
1209  * Lookup the HTML tag in the ElementTable
1210  *
1211  * Returns the related htmlElemDescPtr or NULL if not found.
1212  */
1213 const htmlElemDesc *
htmlTagLookup(const xmlChar * tag)1214 htmlTagLookup(const xmlChar *tag) {
1215     unsigned int i;
1216 
1217     for (i = 0; i < (sizeof(html40ElementTable) /
1218                      sizeof(html40ElementTable[0]));i++) {
1219         if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1220 	    return((htmlElemDescPtr) &html40ElementTable[i]);
1221     }
1222     return(NULL);
1223 }
1224 
1225 /**
1226  * htmlGetEndPriority:
1227  * @name: The name of the element to look up the priority for.
1228  *
1229  * Return value: The "endtag" priority.
1230  **/
1231 static int
htmlGetEndPriority(const xmlChar * name)1232 htmlGetEndPriority (const xmlChar *name) {
1233     int i = 0;
1234 
1235     while ((htmlEndPriority[i].name != NULL) &&
1236 	   (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1237 	i++;
1238 
1239     return(htmlEndPriority[i].priority);
1240 }
1241 
1242 
1243 /**
1244  * htmlCheckAutoClose:
1245  * @newtag:  The new tag name
1246  * @oldtag:  The old tag name
1247  *
1248  * Checks whether the new tag is one of the registered valid tags for
1249  * closing old.
1250  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1251  *
1252  * Returns 0 if no, 1 if yes.
1253  */
1254 static int
htmlCheckAutoClose(const xmlChar * newtag,const xmlChar * oldtag)1255 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1256 {
1257     int i, indx;
1258     const char **closed = NULL;
1259 
1260     if (htmlStartCloseIndexinitialized == 0)
1261         htmlInitAutoClose();
1262 
1263     /* inefficient, but not a big deal */
1264     for (indx = 0; indx < 100; indx++) {
1265         closed = htmlStartCloseIndex[indx];
1266         if (closed == NULL)
1267             return (0);
1268         if (xmlStrEqual(BAD_CAST * closed, newtag))
1269             break;
1270     }
1271 
1272     i = closed - htmlStartClose;
1273     i++;
1274     while (htmlStartClose[i] != NULL) {
1275         if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1276             return (1);
1277         }
1278         i++;
1279     }
1280     return (0);
1281 }
1282 
1283 /**
1284  * htmlAutoCloseOnClose:
1285  * @ctxt:  an HTML parser context
1286  * @newtag:  The new tag name
1287  * @force:  force the tag closure
1288  *
1289  * The HTML DTD allows an ending tag to implicitly close other tags.
1290  */
1291 static void
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1292 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1293 {
1294     const htmlElemDesc *info;
1295     int i, priority;
1296 
1297     priority = htmlGetEndPriority(newtag);
1298 
1299     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1300 
1301         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1302             break;
1303         /*
1304          * A missplaced endtag can only close elements with lower
1305          * or equal priority, so if we find an element with higher
1306          * priority before we find an element with
1307          * matching name, we just ignore this endtag
1308          */
1309         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1310             return;
1311     }
1312     if (i < 0)
1313         return;
1314 
1315     while (!xmlStrEqual(newtag, ctxt->name)) {
1316         info = htmlTagLookup(ctxt->name);
1317         if ((info != NULL) && (info->endTag == 3)) {
1318             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1319 	                 "Opening and ending tag mismatch: %s and %s\n",
1320 			 newtag, ctxt->name);
1321         }
1322         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1323             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1324 	htmlnamePop(ctxt);
1325     }
1326 }
1327 
1328 /**
1329  * htmlAutoCloseOnEnd:
1330  * @ctxt:  an HTML parser context
1331  *
1332  * Close all remaining tags at the end of the stream
1333  */
1334 static void
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)1335 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1336 {
1337     int i;
1338 
1339     if (ctxt->nameNr == 0)
1340         return;
1341     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1342         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1343             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1344 	htmlnamePop(ctxt);
1345     }
1346 }
1347 
1348 /**
1349  * htmlAutoClose:
1350  * @ctxt:  an HTML parser context
1351  * @newtag:  The new tag name or NULL
1352  *
1353  * The HTML DTD allows a tag to implicitly close other tags.
1354  * The list is kept in htmlStartClose array. This function is
1355  * called when a new tag has been detected and generates the
1356  * appropriates closes if possible/needed.
1357  * If newtag is NULL this mean we are at the end of the resource
1358  * and we should check
1359  */
1360 static void
htmlAutoClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1361 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1362 {
1363     while ((newtag != NULL) && (ctxt->name != NULL) &&
1364            (htmlCheckAutoClose(newtag, ctxt->name))) {
1365         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1366             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1367 	htmlnamePop(ctxt);
1368     }
1369     if (newtag == NULL) {
1370         htmlAutoCloseOnEnd(ctxt);
1371         return;
1372     }
1373     while ((newtag == NULL) && (ctxt->name != NULL) &&
1374            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1375             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1376             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1377         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1378             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1379 	htmlnamePop(ctxt);
1380     }
1381 }
1382 
1383 /**
1384  * htmlAutoCloseTag:
1385  * @doc:  the HTML document
1386  * @name:  The tag name
1387  * @elem:  the HTML element
1388  *
1389  * The HTML DTD allows a tag to implicitly close other tags.
1390  * The list is kept in htmlStartClose array. This function checks
1391  * if the element or one of it's children would autoclose the
1392  * given tag.
1393  *
1394  * Returns 1 if autoclose, 0 otherwise
1395  */
1396 int
htmlAutoCloseTag(htmlDocPtr doc,const xmlChar * name,htmlNodePtr elem)1397 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1398     htmlNodePtr child;
1399 
1400     if (elem == NULL) return(1);
1401     if (xmlStrEqual(name, elem->name)) return(0);
1402     if (htmlCheckAutoClose(elem->name, name)) return(1);
1403     child = elem->children;
1404     while (child != NULL) {
1405         if (htmlAutoCloseTag(doc, name, child)) return(1);
1406 	child = child->next;
1407     }
1408     return(0);
1409 }
1410 
1411 /**
1412  * htmlIsAutoClosed:
1413  * @doc:  the HTML document
1414  * @elem:  the HTML element
1415  *
1416  * The HTML DTD allows a tag to implicitly close other tags.
1417  * The list is kept in htmlStartClose array. This function checks
1418  * if a tag is autoclosed by one of it's child
1419  *
1420  * Returns 1 if autoclosed, 0 otherwise
1421  */
1422 int
htmlIsAutoClosed(htmlDocPtr doc,htmlNodePtr elem)1423 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1424     htmlNodePtr child;
1425 
1426     if (elem == NULL) return(1);
1427     child = elem->children;
1428     while (child != NULL) {
1429 	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1430 	child = child->next;
1431     }
1432     return(0);
1433 }
1434 
1435 /**
1436  * htmlCheckImplied:
1437  * @ctxt:  an HTML parser context
1438  * @newtag:  The new tag name
1439  *
1440  * The HTML DTD allows a tag to exists only implicitly
1441  * called when a new tag has been detected and generates the
1442  * appropriates implicit tags if missing
1443  */
1444 static void
htmlCheckImplied(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1445 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1446     int i;
1447 
1448     if (ctxt->options & HTML_PARSE_NOIMPLIED)
1449         return;
1450     if (!htmlOmittedDefaultValue)
1451 	return;
1452     if (xmlStrEqual(newtag, BAD_CAST"html"))
1453 	return;
1454     if (ctxt->nameNr <= 0) {
1455 	htmlnamePush(ctxt, BAD_CAST"html");
1456 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1457 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1458     }
1459     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1460         return;
1461     if ((ctxt->nameNr <= 1) &&
1462         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1463 	 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1464 	 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1465 	 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1466 	 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1467 	 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1468         if (ctxt->html >= 3) {
1469             /* we already saw or generated an <head> before */
1470             return;
1471         }
1472         /*
1473          * dropped OBJECT ... i you put it first BODY will be
1474          * assumed !
1475          */
1476         htmlnamePush(ctxt, BAD_CAST"head");
1477         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1478             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1479     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1480 	       (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1481 	       (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1482         if (ctxt->html >= 10) {
1483             /* we already saw or generated a <body> before */
1484             return;
1485         }
1486 	for (i = 0;i < ctxt->nameNr;i++) {
1487 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1488 		return;
1489 	    }
1490 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1491 		return;
1492 	    }
1493 	}
1494 
1495 	htmlnamePush(ctxt, BAD_CAST"body");
1496 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1497 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1498     }
1499 }
1500 
1501 /**
1502  * htmlCheckParagraph
1503  * @ctxt:  an HTML parser context
1504  *
1505  * Check whether a p element need to be implied before inserting
1506  * characters in the current element.
1507  *
1508  * Returns 1 if a paragraph has been inserted, 0 if not and -1
1509  *         in case of error.
1510  */
1511 
1512 static int
htmlCheckParagraph(htmlParserCtxtPtr ctxt)1513 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1514     const xmlChar *tag;
1515     int i;
1516 
1517     if (ctxt == NULL)
1518 	return(-1);
1519     tag = ctxt->name;
1520     if (tag == NULL) {
1521 	htmlAutoClose(ctxt, BAD_CAST"p");
1522 	htmlCheckImplied(ctxt, BAD_CAST"p");
1523 	htmlnamePush(ctxt, BAD_CAST"p");
1524 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1525 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1526 	return(1);
1527     }
1528     if (!htmlOmittedDefaultValue)
1529 	return(0);
1530     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1531 	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1532 	    htmlAutoClose(ctxt, BAD_CAST"p");
1533 	    htmlCheckImplied(ctxt, BAD_CAST"p");
1534 	    htmlnamePush(ctxt, BAD_CAST"p");
1535 	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1536 		ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1537 	    return(1);
1538 	}
1539     }
1540     return(0);
1541 }
1542 
1543 /**
1544  * htmlIsScriptAttribute:
1545  * @name:  an attribute name
1546  *
1547  * Check if an attribute is of content type Script
1548  *
1549  * Returns 1 is the attribute is a script 0 otherwise
1550  */
1551 int
htmlIsScriptAttribute(const xmlChar * name)1552 htmlIsScriptAttribute(const xmlChar *name) {
1553     unsigned int i;
1554 
1555     if (name == NULL)
1556       return(0);
1557     /*
1558      * all script attributes start with 'on'
1559      */
1560     if ((name[0] != 'o') || (name[1] != 'n'))
1561       return(0);
1562     for (i = 0;
1563 	 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1564 	 i++) {
1565 	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1566 	    return(1);
1567     }
1568     return(0);
1569 }
1570 
1571 /************************************************************************
1572  *									*
1573  *	The list of HTML predefined entities			*
1574  *									*
1575  ************************************************************************/
1576 
1577 
1578 static const htmlEntityDesc  html40EntitiesTable[] = {
1579 /*
1580  * the 4 absolute ones, plus apostrophe.
1581  */
1582 { 34,	"quot",	"quotation mark = APL quote, U+0022 ISOnum" },
1583 { 38,	"amp",	"ampersand, U+0026 ISOnum" },
1584 { 39,	"apos",	"single quote" },
1585 { 60,	"lt",	"less-than sign, U+003C ISOnum" },
1586 { 62,	"gt",	"greater-than sign, U+003E ISOnum" },
1587 
1588 /*
1589  * A bunch still in the 128-255 range
1590  * Replacing them depend really on the charset used.
1591  */
1592 { 160,	"nbsp",	"no-break space = non-breaking space, U+00A0 ISOnum" },
1593 { 161,	"iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1594 { 162,	"cent",	"cent sign, U+00A2 ISOnum" },
1595 { 163,	"pound","pound sign, U+00A3 ISOnum" },
1596 { 164,	"curren","currency sign, U+00A4 ISOnum" },
1597 { 165,	"yen",	"yen sign = yuan sign, U+00A5 ISOnum" },
1598 { 166,	"brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1599 { 167,	"sect",	"section sign, U+00A7 ISOnum" },
1600 { 168,	"uml",	"diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1601 { 169,	"copy",	"copyright sign, U+00A9 ISOnum" },
1602 { 170,	"ordf",	"feminine ordinal indicator, U+00AA ISOnum" },
1603 { 171,	"laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1604 { 172,	"not",	"not sign, U+00AC ISOnum" },
1605 { 173,	"shy",	"soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1606 { 174,	"reg",	"registered sign = registered trade mark sign, U+00AE ISOnum" },
1607 { 175,	"macr",	"macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1608 { 176,	"deg",	"degree sign, U+00B0 ISOnum" },
1609 { 177,	"plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1610 { 178,	"sup2",	"superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1611 { 179,	"sup3",	"superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1612 { 180,	"acute","acute accent = spacing acute, U+00B4 ISOdia" },
1613 { 181,	"micro","micro sign, U+00B5 ISOnum" },
1614 { 182,	"para",	"pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1615 { 183,	"middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1616 { 184,	"cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1617 { 185,	"sup1",	"superscript one = superscript digit one, U+00B9 ISOnum" },
1618 { 186,	"ordm",	"masculine ordinal indicator, U+00BA ISOnum" },
1619 { 187,	"raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1620 { 188,	"frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1621 { 189,	"frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1622 { 190,	"frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1623 { 191,	"iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1624 { 192,	"Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1625 { 193,	"Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1626 { 194,	"Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1627 { 195,	"Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1628 { 196,	"Auml",	"latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1629 { 197,	"Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1630 { 198,	"AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1631 { 199,	"Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1632 { 200,	"Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1633 { 201,	"Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1634 { 202,	"Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1635 { 203,	"Euml",	"latin capital letter E with diaeresis, U+00CB ISOlat1" },
1636 { 204,	"Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1637 { 205,	"Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1638 { 206,	"Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1639 { 207,	"Iuml",	"latin capital letter I with diaeresis, U+00CF ISOlat1" },
1640 { 208,	"ETH",	"latin capital letter ETH, U+00D0 ISOlat1" },
1641 { 209,	"Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1642 { 210,	"Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1643 { 211,	"Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1644 { 212,	"Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1645 { 213,	"Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1646 { 214,	"Ouml",	"latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1647 { 215,	"times","multiplication sign, U+00D7 ISOnum" },
1648 { 216,	"Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1649 { 217,	"Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1650 { 218,	"Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1651 { 219,	"Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1652 { 220,	"Uuml",	"latin capital letter U with diaeresis, U+00DC ISOlat1" },
1653 { 221,	"Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1654 { 222,	"THORN","latin capital letter THORN, U+00DE ISOlat1" },
1655 { 223,	"szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1656 { 224,	"agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1657 { 225,	"aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1658 { 226,	"acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1659 { 227,	"atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1660 { 228,	"auml",	"latin small letter a with diaeresis, U+00E4 ISOlat1" },
1661 { 229,	"aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1662 { 230,	"aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1663 { 231,	"ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1664 { 232,	"egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1665 { 233,	"eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1666 { 234,	"ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1667 { 235,	"euml",	"latin small letter e with diaeresis, U+00EB ISOlat1" },
1668 { 236,	"igrave","latin small letter i with grave, U+00EC ISOlat1" },
1669 { 237,	"iacute","latin small letter i with acute, U+00ED ISOlat1" },
1670 { 238,	"icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1671 { 239,	"iuml",	"latin small letter i with diaeresis, U+00EF ISOlat1" },
1672 { 240,	"eth",	"latin small letter eth, U+00F0 ISOlat1" },
1673 { 241,	"ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1674 { 242,	"ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1675 { 243,	"oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1676 { 244,	"ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1677 { 245,	"otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1678 { 246,	"ouml",	"latin small letter o with diaeresis, U+00F6 ISOlat1" },
1679 { 247,	"divide","division sign, U+00F7 ISOnum" },
1680 { 248,	"oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1681 { 249,	"ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1682 { 250,	"uacute","latin small letter u with acute, U+00FA ISOlat1" },
1683 { 251,	"ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1684 { 252,	"uuml",	"latin small letter u with diaeresis, U+00FC ISOlat1" },
1685 { 253,	"yacute","latin small letter y with acute, U+00FD ISOlat1" },
1686 { 254,	"thorn","latin small letter thorn with, U+00FE ISOlat1" },
1687 { 255,	"yuml",	"latin small letter y with diaeresis, U+00FF ISOlat1" },
1688 
1689 { 338,	"OElig","latin capital ligature OE, U+0152 ISOlat2" },
1690 { 339,	"oelig","latin small ligature oe, U+0153 ISOlat2" },
1691 { 352,	"Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1692 { 353,	"scaron","latin small letter s with caron, U+0161 ISOlat2" },
1693 { 376,	"Yuml",	"latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1694 
1695 /*
1696  * Anything below should really be kept as entities references
1697  */
1698 { 402,	"fnof",	"latin small f with hook = function = florin, U+0192 ISOtech" },
1699 
1700 { 710,	"circ",	"modifier letter circumflex accent, U+02C6 ISOpub" },
1701 { 732,	"tilde","small tilde, U+02DC ISOdia" },
1702 
1703 { 913,	"Alpha","greek capital letter alpha, U+0391" },
1704 { 914,	"Beta",	"greek capital letter beta, U+0392" },
1705 { 915,	"Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1706 { 916,	"Delta","greek capital letter delta, U+0394 ISOgrk3" },
1707 { 917,	"Epsilon","greek capital letter epsilon, U+0395" },
1708 { 918,	"Zeta",	"greek capital letter zeta, U+0396" },
1709 { 919,	"Eta",	"greek capital letter eta, U+0397" },
1710 { 920,	"Theta","greek capital letter theta, U+0398 ISOgrk3" },
1711 { 921,	"Iota",	"greek capital letter iota, U+0399" },
1712 { 922,	"Kappa","greek capital letter kappa, U+039A" },
1713 { 923,	"Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1714 { 924,	"Mu",	"greek capital letter mu, U+039C" },
1715 { 925,	"Nu",	"greek capital letter nu, U+039D" },
1716 { 926,	"Xi",	"greek capital letter xi, U+039E ISOgrk3" },
1717 { 927,	"Omicron","greek capital letter omicron, U+039F" },
1718 { 928,	"Pi",	"greek capital letter pi, U+03A0 ISOgrk3" },
1719 { 929,	"Rho",	"greek capital letter rho, U+03A1" },
1720 { 931,	"Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1721 { 932,	"Tau",	"greek capital letter tau, U+03A4" },
1722 { 933,	"Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1723 { 934,	"Phi",	"greek capital letter phi, U+03A6 ISOgrk3" },
1724 { 935,	"Chi",	"greek capital letter chi, U+03A7" },
1725 { 936,	"Psi",	"greek capital letter psi, U+03A8 ISOgrk3" },
1726 { 937,	"Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1727 
1728 { 945,	"alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1729 { 946,	"beta",	"greek small letter beta, U+03B2 ISOgrk3" },
1730 { 947,	"gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1731 { 948,	"delta","greek small letter delta, U+03B4 ISOgrk3" },
1732 { 949,	"epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1733 { 950,	"zeta",	"greek small letter zeta, U+03B6 ISOgrk3" },
1734 { 951,	"eta",	"greek small letter eta, U+03B7 ISOgrk3" },
1735 { 952,	"theta","greek small letter theta, U+03B8 ISOgrk3" },
1736 { 953,	"iota",	"greek small letter iota, U+03B9 ISOgrk3" },
1737 { 954,	"kappa","greek small letter kappa, U+03BA ISOgrk3" },
1738 { 955,	"lambda","greek small letter lambda, U+03BB ISOgrk3" },
1739 { 956,	"mu",	"greek small letter mu, U+03BC ISOgrk3" },
1740 { 957,	"nu",	"greek small letter nu, U+03BD ISOgrk3" },
1741 { 958,	"xi",	"greek small letter xi, U+03BE ISOgrk3" },
1742 { 959,	"omicron","greek small letter omicron, U+03BF NEW" },
1743 { 960,	"pi",	"greek small letter pi, U+03C0 ISOgrk3" },
1744 { 961,	"rho",	"greek small letter rho, U+03C1 ISOgrk3" },
1745 { 962,	"sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1746 { 963,	"sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1747 { 964,	"tau",	"greek small letter tau, U+03C4 ISOgrk3" },
1748 { 965,	"upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1749 { 966,	"phi",	"greek small letter phi, U+03C6 ISOgrk3" },
1750 { 967,	"chi",	"greek small letter chi, U+03C7 ISOgrk3" },
1751 { 968,	"psi",	"greek small letter psi, U+03C8 ISOgrk3" },
1752 { 969,	"omega","greek small letter omega, U+03C9 ISOgrk3" },
1753 { 977,	"thetasym","greek small letter theta symbol, U+03D1 NEW" },
1754 { 978,	"upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1755 { 982,	"piv",	"greek pi symbol, U+03D6 ISOgrk3" },
1756 
1757 { 8194,	"ensp",	"en space, U+2002 ISOpub" },
1758 { 8195,	"emsp",	"em space, U+2003 ISOpub" },
1759 { 8201,	"thinsp","thin space, U+2009 ISOpub" },
1760 { 8204,	"zwnj",	"zero width non-joiner, U+200C NEW RFC 2070" },
1761 { 8205,	"zwj",	"zero width joiner, U+200D NEW RFC 2070" },
1762 { 8206,	"lrm",	"left-to-right mark, U+200E NEW RFC 2070" },
1763 { 8207,	"rlm",	"right-to-left mark, U+200F NEW RFC 2070" },
1764 { 8211,	"ndash","en dash, U+2013 ISOpub" },
1765 { 8212,	"mdash","em dash, U+2014 ISOpub" },
1766 { 8216,	"lsquo","left single quotation mark, U+2018 ISOnum" },
1767 { 8217,	"rsquo","right single quotation mark, U+2019 ISOnum" },
1768 { 8218,	"sbquo","single low-9 quotation mark, U+201A NEW" },
1769 { 8220,	"ldquo","left double quotation mark, U+201C ISOnum" },
1770 { 8221,	"rdquo","right double quotation mark, U+201D ISOnum" },
1771 { 8222,	"bdquo","double low-9 quotation mark, U+201E NEW" },
1772 { 8224,	"dagger","dagger, U+2020 ISOpub" },
1773 { 8225,	"Dagger","double dagger, U+2021 ISOpub" },
1774 
1775 { 8226,	"bull",	"bullet = black small circle, U+2022 ISOpub" },
1776 { 8230,	"hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1777 
1778 { 8240,	"permil","per mille sign, U+2030 ISOtech" },
1779 
1780 { 8242,	"prime","prime = minutes = feet, U+2032 ISOtech" },
1781 { 8243,	"Prime","double prime = seconds = inches, U+2033 ISOtech" },
1782 
1783 { 8249,	"lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1784 { 8250,	"rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1785 
1786 { 8254,	"oline","overline = spacing overscore, U+203E NEW" },
1787 { 8260,	"frasl","fraction slash, U+2044 NEW" },
1788 
1789 { 8364,	"euro",	"euro sign, U+20AC NEW" },
1790 
1791 { 8465,	"image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1792 { 8472,	"weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1793 { 8476,	"real",	"blackletter capital R = real part symbol, U+211C ISOamso" },
1794 { 8482,	"trade","trade mark sign, U+2122 ISOnum" },
1795 { 8501,	"alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1796 { 8592,	"larr",	"leftwards arrow, U+2190 ISOnum" },
1797 { 8593,	"uarr",	"upwards arrow, U+2191 ISOnum" },
1798 { 8594,	"rarr",	"rightwards arrow, U+2192 ISOnum" },
1799 { 8595,	"darr",	"downwards arrow, U+2193 ISOnum" },
1800 { 8596,	"harr",	"left right arrow, U+2194 ISOamsa" },
1801 { 8629,	"crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1802 { 8656,	"lArr",	"leftwards double arrow, U+21D0 ISOtech" },
1803 { 8657,	"uArr",	"upwards double arrow, U+21D1 ISOamsa" },
1804 { 8658,	"rArr",	"rightwards double arrow, U+21D2 ISOtech" },
1805 { 8659,	"dArr",	"downwards double arrow, U+21D3 ISOamsa" },
1806 { 8660,	"hArr",	"left right double arrow, U+21D4 ISOamsa" },
1807 
1808 { 8704,	"forall","for all, U+2200 ISOtech" },
1809 { 8706,	"part",	"partial differential, U+2202 ISOtech" },
1810 { 8707,	"exist","there exists, U+2203 ISOtech" },
1811 { 8709,	"empty","empty set = null set = diameter, U+2205 ISOamso" },
1812 { 8711,	"nabla","nabla = backward difference, U+2207 ISOtech" },
1813 { 8712,	"isin",	"element of, U+2208 ISOtech" },
1814 { 8713,	"notin","not an element of, U+2209 ISOtech" },
1815 { 8715,	"ni",	"contains as member, U+220B ISOtech" },
1816 { 8719,	"prod",	"n-ary product = product sign, U+220F ISOamsb" },
1817 { 8721,	"sum",	"n-ary summation, U+2211 ISOamsb" },
1818 { 8722,	"minus","minus sign, U+2212 ISOtech" },
1819 { 8727,	"lowast","asterisk operator, U+2217 ISOtech" },
1820 { 8730,	"radic","square root = radical sign, U+221A ISOtech" },
1821 { 8733,	"prop",	"proportional to, U+221D ISOtech" },
1822 { 8734,	"infin","infinity, U+221E ISOtech" },
1823 { 8736,	"ang",	"angle, U+2220 ISOamso" },
1824 { 8743,	"and",	"logical and = wedge, U+2227 ISOtech" },
1825 { 8744,	"or",	"logical or = vee, U+2228 ISOtech" },
1826 { 8745,	"cap",	"intersection = cap, U+2229 ISOtech" },
1827 { 8746,	"cup",	"union = cup, U+222A ISOtech" },
1828 { 8747,	"int",	"integral, U+222B ISOtech" },
1829 { 8756,	"there4","therefore, U+2234 ISOtech" },
1830 { 8764,	"sim",	"tilde operator = varies with = similar to, U+223C ISOtech" },
1831 { 8773,	"cong",	"approximately equal to, U+2245 ISOtech" },
1832 { 8776,	"asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1833 { 8800,	"ne",	"not equal to, U+2260 ISOtech" },
1834 { 8801,	"equiv","identical to, U+2261 ISOtech" },
1835 { 8804,	"le",	"less-than or equal to, U+2264 ISOtech" },
1836 { 8805,	"ge",	"greater-than or equal to, U+2265 ISOtech" },
1837 { 8834,	"sub",	"subset of, U+2282 ISOtech" },
1838 { 8835,	"sup",	"superset of, U+2283 ISOtech" },
1839 { 8836,	"nsub",	"not a subset of, U+2284 ISOamsn" },
1840 { 8838,	"sube",	"subset of or equal to, U+2286 ISOtech" },
1841 { 8839,	"supe",	"superset of or equal to, U+2287 ISOtech" },
1842 { 8853,	"oplus","circled plus = direct sum, U+2295 ISOamsb" },
1843 { 8855,	"otimes","circled times = vector product, U+2297 ISOamsb" },
1844 { 8869,	"perp",	"up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1845 { 8901,	"sdot",	"dot operator, U+22C5 ISOamsb" },
1846 { 8968,	"lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1847 { 8969,	"rceil","right ceiling, U+2309 ISOamsc" },
1848 { 8970,	"lfloor","left floor = apl downstile, U+230A ISOamsc" },
1849 { 8971,	"rfloor","right floor, U+230B ISOamsc" },
1850 { 9001,	"lang",	"left-pointing angle bracket = bra, U+2329 ISOtech" },
1851 { 9002,	"rang",	"right-pointing angle bracket = ket, U+232A ISOtech" },
1852 { 9674,	"loz",	"lozenge, U+25CA ISOpub" },
1853 
1854 { 9824,	"spades","black spade suit, U+2660 ISOpub" },
1855 { 9827,	"clubs","black club suit = shamrock, U+2663 ISOpub" },
1856 { 9829,	"hearts","black heart suit = valentine, U+2665 ISOpub" },
1857 { 9830,	"diams","black diamond suit, U+2666 ISOpub" },
1858 
1859 };
1860 
1861 /************************************************************************
1862  *									*
1863  *		Commodity functions to handle entities			*
1864  *									*
1865  ************************************************************************/
1866 
/*
 * growBuffer:
 * @buffer:  name of the xmlChar * variable to enlarge; a companion
 *           variable called <buffer>_size must hold its capacity
 *
 * Double <buffer>_size and reallocate @buffer to that many xmlChar.
 * If the reallocation fails, the error is reported with htmlErrMemory(),
 * the previous buffer is released and the ENCLOSING function returns NULL.
 * The macro therefore needs a variable named ctxt in scope and can only
 * be expanded inside a function for which returning NULL is valid.
 */
#define growBuffer(buffer) {						\
    xmlChar *grown;							\
    buffer##_size *= 2;							\
    grown = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (grown != NULL) {						\
	buffer = grown;							\
    } else {								\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
}
1881 
1882 /**
1883  * htmlEntityLookup:
1884  * @name: the entity name
1885  *
1886  * Lookup the given entity in EntitiesTable
1887  *
1888  * TODO: the linear scan is really ugly, an hash table is really needed.
1889  *
1890  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1891  */
1892 const htmlEntityDesc *
htmlEntityLookup(const xmlChar * name)1893 htmlEntityLookup(const xmlChar *name) {
1894     unsigned int i;
1895 
1896     for (i = 0;i < (sizeof(html40EntitiesTable)/
1897                     sizeof(html40EntitiesTable[0]));i++) {
1898         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1899             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1900 	}
1901     }
1902     return(NULL);
1903 }
1904 
1905 /**
1906  * htmlEntityValueLookup:
1907  * @value: the entity's unicode value
1908  *
1909  * Lookup the given entity in EntitiesTable
1910  *
1911  * TODO: the linear scan is really ugly, an hash table is really needed.
1912  *
1913  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1914  */
1915 const htmlEntityDesc *
htmlEntityValueLookup(unsigned int value)1916 htmlEntityValueLookup(unsigned int value) {
1917     unsigned int i;
1918 
1919     for (i = 0;i < (sizeof(html40EntitiesTable)/
1920                     sizeof(html40EntitiesTable[0]));i++) {
1921         if (html40EntitiesTable[i].value >= value) {
1922 	    if (html40EntitiesTable[i].value > value)
1923 		break;
1924             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1925 	}
1926     }
1927     return(NULL);
1928 }
1929 
1930 /**
1931  * UTF8ToHtml:
1932  * @out:  a pointer to an array of bytes to store the result
1933  * @outlen:  the length of @out
1934  * @in:  a pointer to an array of UTF-8 chars
1935  * @inlen:  the length of @in
1936  *
1937  * Take a block of UTF-8 chars in and try to convert it to an ASCII
1938  * plus HTML entities block of chars out.
1939  *
1940  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1941  * The value of @inlen after return is the number of octets consumed
1942  *     as the return value is positive, else unpredictable.
1943  * The value of @outlen after return is the number of octets consumed.
1944  */
1945 int
UTF8ToHtml(unsigned char * out,int * outlen,const unsigned char * in,int * inlen)1946 UTF8ToHtml(unsigned char* out, int *outlen,
1947               const unsigned char* in, int *inlen) {
1948     const unsigned char* processed = in;
1949     const unsigned char* outend;
1950     const unsigned char* outstart = out;
1951     const unsigned char* instart = in;
1952     const unsigned char* inend;
1953     unsigned int c, d;
1954     int trailing;
1955 
1956     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
1957     if (in == NULL) {
1958         /*
1959 	 * initialization nothing to do
1960 	 */
1961 	*outlen = 0;
1962 	*inlen = 0;
1963 	return(0);
1964     }
1965     inend = in + (*inlen);
1966     outend = out + (*outlen);
1967     while (in < inend) {
1968 	d = *in++;
1969 	if      (d < 0x80)  { c= d; trailing= 0; }
1970 	else if (d < 0xC0) {
1971 	    /* trailing byte in leading position */
1972 	    *outlen = out - outstart;
1973 	    *inlen = processed - instart;
1974 	    return(-2);
1975         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1976         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
1977         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1978 	else {
1979 	    /* no chance for this in Ascii */
1980 	    *outlen = out - outstart;
1981 	    *inlen = processed - instart;
1982 	    return(-2);
1983 	}
1984 
1985 	if (inend - in < trailing) {
1986 	    break;
1987 	}
1988 
1989 	for ( ; trailing; trailing--) {
1990 	    if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1991 		break;
1992 	    c <<= 6;
1993 	    c |= d & 0x3F;
1994 	}
1995 
1996 	/* assertion: c is a single UTF-4 value */
1997 	if (c < 0x80) {
1998 	    if (out + 1 >= outend)
1999 		break;
2000 	    *out++ = c;
2001 	} else {
2002 	    int len;
2003 	    const htmlEntityDesc * ent;
2004 	    const char *cp;
2005 	    char nbuf[16];
2006 
2007 	    /*
2008 	     * Try to lookup a predefined HTML entity for it
2009 	     */
2010 
2011 	    ent = htmlEntityValueLookup(c);
2012 	    if (ent == NULL) {
2013 	      snprintf(nbuf, sizeof(nbuf), "#%u", c);
2014 	      cp = nbuf;
2015 	    }
2016 	    else
2017 	      cp = ent->name;
2018 	    len = strlen(cp);
2019 	    if (out + 2 + len >= outend)
2020 		break;
2021 	    *out++ = '&';
2022 	    memcpy(out, cp, len);
2023 	    out += len;
2024 	    *out++ = ';';
2025 	}
2026 	processed = in;
2027     }
2028     *outlen = out - outstart;
2029     *inlen = processed - instart;
2030     return(0);
2031 }
2032 
2033 /**
2034  * htmlEncodeEntities:
2035  * @out:  a pointer to an array of bytes to store the result
2036  * @outlen:  the length of @out
2037  * @in:  a pointer to an array of UTF-8 chars
2038  * @inlen:  the length of @in
2039  * @quoteChar: the quote character to escape (' or ") or zero.
2040  *
2041  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2042  * plus HTML entities block of chars out.
2043  *
2044  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2045  * The value of @inlen after return is the number of octets consumed
2046  *     as the return value is positive, else unpredictable.
2047  * The value of @outlen after return is the number of octets consumed.
2048  */
2049 int
htmlEncodeEntities(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,int quoteChar)2050 htmlEncodeEntities(unsigned char* out, int *outlen,
2051 		   const unsigned char* in, int *inlen, int quoteChar) {
2052     const unsigned char* processed = in;
2053     const unsigned char* outend;
2054     const unsigned char* outstart = out;
2055     const unsigned char* instart = in;
2056     const unsigned char* inend;
2057     unsigned int c, d;
2058     int trailing;
2059 
2060     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2061         return(-1);
2062     outend = out + (*outlen);
2063     inend = in + (*inlen);
2064     while (in < inend) {
2065 	d = *in++;
2066 	if      (d < 0x80)  { c= d; trailing= 0; }
2067 	else if (d < 0xC0) {
2068 	    /* trailing byte in leading position */
2069 	    *outlen = out - outstart;
2070 	    *inlen = processed - instart;
2071 	    return(-2);
2072         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2073         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2074         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2075 	else {
2076 	    /* no chance for this in Ascii */
2077 	    *outlen = out - outstart;
2078 	    *inlen = processed - instart;
2079 	    return(-2);
2080 	}
2081 
2082 	if (inend - in < trailing)
2083 	    break;
2084 
2085 	while (trailing--) {
2086 	    if (((d= *in++) & 0xC0) != 0x80) {
2087 		*outlen = out - outstart;
2088 		*inlen = processed - instart;
2089 		return(-2);
2090 	    }
2091 	    c <<= 6;
2092 	    c |= d & 0x3F;
2093 	}
2094 
2095 	/* assertion: c is a single UTF-4 value */
2096 	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2097 	    (c != '&') && (c != '<') && (c != '>')) {
2098 	    if (out >= outend)
2099 		break;
2100 	    *out++ = c;
2101 	} else {
2102 	    const htmlEntityDesc * ent;
2103 	    const char *cp;
2104 	    char nbuf[16];
2105 	    int len;
2106 
2107 	    /*
2108 	     * Try to lookup a predefined HTML entity for it
2109 	     */
2110 	    ent = htmlEntityValueLookup(c);
2111 	    if (ent == NULL) {
2112 		snprintf(nbuf, sizeof(nbuf), "#%u", c);
2113 		cp = nbuf;
2114 	    }
2115 	    else
2116 		cp = ent->name;
2117 	    len = strlen(cp);
2118 	    if (out + 2 + len > outend)
2119 		break;
2120 	    *out++ = '&';
2121 	    memcpy(out, cp, len);
2122 	    out += len;
2123 	    *out++ = ';';
2124 	}
2125 	processed = in;
2126     }
2127     *outlen = out - outstart;
2128     *inlen = processed - instart;
2129     return(0);
2130 }
2131 
2132 /************************************************************************
2133  *									*
2134  *		Commodity functions to handle streams			*
2135  *									*
2136  ************************************************************************/
2137 
2138 /**
2139  * htmlNewInputStream:
2140  * @ctxt:  an HTML parser context
2141  *
2142  * Create a new input stream structure
2143  * Returns the new input stream or NULL
2144  */
2145 static htmlParserInputPtr
htmlNewInputStream(htmlParserCtxtPtr ctxt)2146 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2147     htmlParserInputPtr input;
2148 
2149     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2150     if (input == NULL) {
2151         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2152 	return(NULL);
2153     }
2154     memset(input, 0, sizeof(htmlParserInput));
2155     input->filename = NULL;
2156     input->directory = NULL;
2157     input->base = NULL;
2158     input->cur = NULL;
2159     input->buf = NULL;
2160     input->line = 1;
2161     input->col = 1;
2162     input->buf = NULL;
2163     input->free = NULL;
2164     input->version = NULL;
2165     input->consumed = 0;
2166     input->length = 0;
2167     return(input);
2168 }
2169 
2170 
2171 /************************************************************************
2172  *									*
2173  *		Commodity functions, cleanup needed ?			*
2174  *									*
2175  ************************************************************************/
/*
 * All tags allowing PCDATA, from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array, but I don't want to risk any
 * binary incompatibility.
 *
 * The table is read-only: both the strings and the array slots are const.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
2190 
2191 /**
2192  * areBlanks:
2193  * @ctxt:  an HTML parser context
2194  * @str:  a xmlChar *
2195  * @len:  the size of @str
2196  *
2197  * Is this a sequence of blank chars that one can ignore ?
2198  *
2199  * Returns 1 if ignorable 0 otherwise.
2200  */
2201 
areBlanks(htmlParserCtxtPtr ctxt,const xmlChar * str,int len)2202 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2203     unsigned int i;
2204     int j;
2205     xmlNodePtr lastChild;
2206     xmlDtdPtr dtd;
2207 
2208     for (j = 0;j < len;j++)
2209         if (!(IS_BLANK_CH(str[j]))) return(0);
2210 
2211     if (CUR == 0) return(1);
2212     if (CUR != '<') return(0);
2213     if (ctxt->name == NULL)
2214 	return(1);
2215     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2216 	return(1);
2217     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2218 	return(1);
2219 
2220     /* Only strip CDATA children of the body tag for strict HTML DTDs */
2221     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2222         dtd = xmlGetIntSubset(ctxt->myDoc);
2223         if (dtd != NULL && dtd->ExternalID != NULL) {
2224             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2225                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2226                 return(1);
2227         }
2228     }
2229 
2230     if (ctxt->node == NULL) return(0);
2231     lastChild = xmlGetLastChild(ctxt->node);
2232     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2233 	lastChild = lastChild->prev;
2234     if (lastChild == NULL) {
2235         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2236             (ctxt->node->content != NULL)) return(0);
2237 	/* keep ws in constructs like ...<b> </b>...
2238 	   for all tags "b" allowing PCDATA */
2239 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2240 	    if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2241 		return(0);
2242 	    }
2243 	}
2244     } else if (xmlNodeIsText(lastChild)) {
2245         return(0);
2246     } else {
2247 	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2248 	   for all tags "p" allowing PCDATA */
2249 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2250 	    if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2251 		return(0);
2252 	    }
2253 	}
2254     }
2255     return(1);
2256 }
2257 
2258 /**
2259  * htmlNewDocNoDtD:
2260  * @URI:  URI for the dtd, or NULL
2261  * @ExternalID:  the external ID of the DTD, or NULL
2262  *
2263  * Creates a new HTML document without a DTD node if @URI and @ExternalID
2264  * are NULL
2265  *
2266  * Returns a new document, do not initialize the DTD if not provided
2267  */
2268 htmlDocPtr
htmlNewDocNoDtD(const xmlChar * URI,const xmlChar * ExternalID)2269 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2270     xmlDocPtr cur;
2271 
2272     /*
2273      * Allocate a new document and fill the fields.
2274      */
2275     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2276     if (cur == NULL) {
2277 	htmlErrMemory(NULL, "HTML document creation failed\n");
2278 	return(NULL);
2279     }
2280     memset(cur, 0, sizeof(xmlDoc));
2281 
2282     cur->type = XML_HTML_DOCUMENT_NODE;
2283     cur->version = NULL;
2284     cur->intSubset = NULL;
2285     cur->doc = cur;
2286     cur->name = NULL;
2287     cur->children = NULL;
2288     cur->extSubset = NULL;
2289     cur->oldNs = NULL;
2290     cur->encoding = NULL;
2291     cur->standalone = 1;
2292     cur->compression = 0;
2293     cur->ids = NULL;
2294     cur->refs = NULL;
2295     cur->_private = NULL;
2296     cur->charset = XML_CHAR_ENCODING_UTF8;
2297     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2298     if ((ExternalID != NULL) ||
2299 	(URI != NULL))
2300 	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2301     return(cur);
2302 }
2303 
2304 /**
2305  * htmlNewDoc:
2306  * @URI:  URI for the dtd, or NULL
2307  * @ExternalID:  the external ID of the DTD, or NULL
2308  *
2309  * Creates a new HTML document
2310  *
2311  * Returns a new document
2312  */
2313 htmlDocPtr
htmlNewDoc(const xmlChar * URI,const xmlChar * ExternalID)2314 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2315     if ((URI == NULL) && (ExternalID == NULL))
2316 	return(htmlNewDocNoDtD(
2317 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2318 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2319 
2320     return(htmlNewDocNoDtD(URI, ExternalID));
2321 }
2322 
2323 
2324 /************************************************************************
2325  *									*
2326  *			The parser itself				*
2327  *	Relates to http://www.w3.org/TR/html40				*
2328  *									*
2329  ************************************************************************/
2330 
2331 /************************************************************************
2332  *									*
2333  *			The parser itself				*
2334  *									*
2335  ************************************************************************/
2336 
2337 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2338 
2339 /**
2340  * htmlParseHTMLName:
2341  * @ctxt:  an HTML parser context
2342  *
2343  * parse an HTML tag or attribute name, note that we convert it to lowercase
2344  * since HTML names are not case-sensitive.
2345  *
2346  * Returns the Tag Name parsed or NULL
2347  */
2348 
2349 static const xmlChar *
htmlParseHTMLName(htmlParserCtxtPtr ctxt)2350 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2351     int i = 0;
2352     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2353 
2354     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2355         (CUR != ':') && (CUR != '.')) return(NULL);
2356 
2357     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2358            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2359 	   (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2360            (CUR == '.'))) {
2361 	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2362         else loc[i] = CUR;
2363 	i++;
2364 
2365 	NEXT;
2366     }
2367 
2368     return(xmlDictLookup(ctxt->dict, loc, i));
2369 }
2370 
2371 
2372 /**
2373  * htmlParseHTMLName_nonInvasive:
2374  * @ctxt:  an HTML parser context
2375  *
2376  * parse an HTML tag or attribute name, note that we convert it to lowercase
2377  * since HTML names are not case-sensitive, this doesn't consume the data
2378  * from the stream, it's a look-ahead
2379  *
2380  * Returns the Tag Name parsed or NULL
2381  */
2382 
2383 static const xmlChar *
htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt)2384 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2385     int i = 0;
2386     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2387 
2388     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2389         (NXT(1) != ':')) return(NULL);
2390 
2391     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2392            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2393 	   (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2394 	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2395         else loc[i] = NXT(1+i);
2396 	i++;
2397     }
2398 
2399     return(xmlDictLookup(ctxt->dict, loc, i));
2400 }
2401 
2402 
2403 /**
2404  * htmlParseName:
2405  * @ctxt:  an HTML parser context
2406  *
2407  * parse an HTML name, this routine is case sensitive.
2408  *
2409  * Returns the Name parsed or NULL
2410  */
2411 
2412 static const xmlChar *
htmlParseName(htmlParserCtxtPtr ctxt)2413 htmlParseName(htmlParserCtxtPtr ctxt) {
2414     const xmlChar *in;
2415     const xmlChar *ret;
2416     int count = 0;
2417 
2418     GROW;
2419 
2420     /*
2421      * Accelerator for simple ASCII names
2422      */
2423     in = ctxt->input->cur;
2424     if (((*in >= 0x61) && (*in <= 0x7A)) ||
2425 	((*in >= 0x41) && (*in <= 0x5A)) ||
2426 	(*in == '_') || (*in == ':')) {
2427 	in++;
2428 	while (((*in >= 0x61) && (*in <= 0x7A)) ||
2429 	       ((*in >= 0x41) && (*in <= 0x5A)) ||
2430 	       ((*in >= 0x30) && (*in <= 0x39)) ||
2431 	       (*in == '_') || (*in == '-') ||
2432 	       (*in == ':') || (*in == '.'))
2433 	    in++;
2434 	if ((*in > 0) && (*in < 0x80)) {
2435 	    count = in - ctxt->input->cur;
2436 	    ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2437 	    ctxt->input->cur = in;
2438 	    ctxt->nbChars += count;
2439 	    ctxt->input->col += count;
2440 	    return(ret);
2441 	}
2442     }
2443     return(htmlParseNameComplex(ctxt));
2444 }
2445 
2446 static const xmlChar *
htmlParseNameComplex(xmlParserCtxtPtr ctxt)2447 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2448     int len = 0, l;
2449     int c;
2450     int count = 0;
2451 
2452     /*
2453      * Handler for more complex cases
2454      */
2455     GROW;
2456     c = CUR_CHAR(l);
2457     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2458 	(!IS_LETTER(c) && (c != '_') &&
2459          (c != ':'))) {
2460 	return(NULL);
2461     }
2462 
2463     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2464 	   ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2465             (c == '.') || (c == '-') ||
2466 	    (c == '_') || (c == ':') ||
2467 	    (IS_COMBINING(c)) ||
2468 	    (IS_EXTENDER(c)))) {
2469 	if (count++ > 100) {
2470 	    count = 0;
2471 	    GROW;
2472 	}
2473 	len += l;
2474 	NEXTL(l);
2475 	c = CUR_CHAR(l);
2476     }
2477     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2478 }
2479 
2480 
2481 /**
2482  * htmlParseHTMLAttribute:
2483  * @ctxt:  an HTML parser context
2484  * @stop:  a char stop value
2485  *
2486  * parse an HTML attribute value till the stop (quote), if
2487  * stop is 0 then it stops at the first space
2488  *
2489  * Returns the attribute parsed or NULL
2490  */
2491 
2492 static xmlChar *
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt,const xmlChar stop)2493 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2494     xmlChar *buffer = NULL;
2495     int buffer_size = 0;
2496     xmlChar *out = NULL;
2497     const xmlChar *name = NULL;
2498     const xmlChar *cur = NULL;
2499     const htmlEntityDesc * ent;
2500 
2501     /*
2502      * allocate a translation buffer.
2503      */
2504     buffer_size = HTML_PARSER_BUFFER_SIZE;
2505     buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2506     if (buffer == NULL) {
2507 	htmlErrMemory(ctxt, "buffer allocation failed\n");
2508 	return(NULL);
2509     }
2510     out = buffer;
2511 
2512     /*
2513      * Ok loop until we reach one of the ending chars
2514      */
2515     while ((CUR != 0) && (CUR != stop)) {
2516 	if ((stop == 0) && (CUR == '>')) break;
2517 	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2518         if (CUR == '&') {
2519 	    if (NXT(1) == '#') {
2520 		unsigned int c;
2521 		int bits;
2522 
2523 		c = htmlParseCharRef(ctxt);
2524 		if      (c <    0x80)
2525 		        { *out++  = c;                bits= -6; }
2526 		else if (c <   0x800)
2527 		        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2528 		else if (c < 0x10000)
2529 		        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2530 		else
2531 		        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2532 
2533 		for ( ; bits >= 0; bits-= 6) {
2534 		    *out++  = ((c >> bits) & 0x3F) | 0x80;
2535 		}
2536 
2537 		if (out - buffer > buffer_size - 100) {
2538 			int indx = out - buffer;
2539 
2540 			growBuffer(buffer);
2541 			out = &buffer[indx];
2542 		}
2543 	    } else {
2544 		ent = htmlParseEntityRef(ctxt, &name);
2545 		if (name == NULL) {
2546 		    *out++ = '&';
2547 		    if (out - buffer > buffer_size - 100) {
2548 			int indx = out - buffer;
2549 
2550 			growBuffer(buffer);
2551 			out = &buffer[indx];
2552 		    }
2553 		} else if (ent == NULL) {
2554 		    *out++ = '&';
2555 		    cur = name;
2556 		    while (*cur != 0) {
2557 			if (out - buffer > buffer_size - 100) {
2558 			    int indx = out - buffer;
2559 
2560 			    growBuffer(buffer);
2561 			    out = &buffer[indx];
2562 			}
2563 			*out++ = *cur++;
2564 		    }
2565 		} else {
2566 		    unsigned int c;
2567 		    int bits;
2568 
2569 		    if (out - buffer > buffer_size - 100) {
2570 			int indx = out - buffer;
2571 
2572 			growBuffer(buffer);
2573 			out = &buffer[indx];
2574 		    }
2575 		    c = ent->value;
2576 		    if      (c <    0x80)
2577 			{ *out++  = c;                bits= -6; }
2578 		    else if (c <   0x800)
2579 			{ *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2580 		    else if (c < 0x10000)
2581 			{ *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2582 		    else
2583 			{ *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2584 
2585 		    for ( ; bits >= 0; bits-= 6) {
2586 			*out++  = ((c >> bits) & 0x3F) | 0x80;
2587 		    }
2588 		}
2589 	    }
2590 	} else {
2591 	    unsigned int c;
2592 	    int bits, l;
2593 
2594 	    if (out - buffer > buffer_size - 100) {
2595 		int indx = out - buffer;
2596 
2597 		growBuffer(buffer);
2598 		out = &buffer[indx];
2599 	    }
2600 	    c = CUR_CHAR(l);
2601 	    if      (c <    0x80)
2602 		    { *out++  = c;                bits= -6; }
2603 	    else if (c <   0x800)
2604 		    { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2605 	    else if (c < 0x10000)
2606 		    { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2607 	    else
2608 		    { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2609 
2610 	    for ( ; bits >= 0; bits-= 6) {
2611 		*out++  = ((c >> bits) & 0x3F) | 0x80;
2612 	    }
2613 	    NEXT;
2614 	}
2615     }
2616     *out = 0;
2617     return(buffer);
2618 }
2619 
2620 /**
2621  * htmlParseEntityRef:
2622  * @ctxt:  an HTML parser context
2623  * @str:  location to store the entity name
2624  *
2625  * parse an HTML ENTITY references
2626  *
2627  * [68] EntityRef ::= '&' Name ';'
2628  *
2629  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2630  *         if non-NULL *str will have to be freed by the caller.
2631  */
2632 const htmlEntityDesc *
htmlParseEntityRef(htmlParserCtxtPtr ctxt,const xmlChar ** str)2633 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2634     const xmlChar *name;
2635     const htmlEntityDesc * ent = NULL;
2636 
2637     if (str != NULL) *str = NULL;
2638     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2639 
2640     if (CUR == '&') {
2641         NEXT;
2642         name = htmlParseName(ctxt);
2643 	if (name == NULL) {
2644 	    htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2645 	                 "htmlParseEntityRef: no name\n", NULL, NULL);
2646 	} else {
2647 	    GROW;
2648 	    if (CUR == ';') {
2649 	        if (str != NULL)
2650 		    *str = name;
2651 
2652 		/*
2653 		 * Lookup the entity in the table.
2654 		 */
2655 		ent = htmlEntityLookup(name);
2656 		if (ent != NULL) /* OK that's ugly !!! */
2657 		    NEXT;
2658 	    } else {
2659 		htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2660 		             "htmlParseEntityRef: expecting ';'\n",
2661 			     NULL, NULL);
2662 	        if (str != NULL)
2663 		    *str = name;
2664 	    }
2665 	}
2666     }
2667     return(ent);
2668 }
2669 
2670 /**
2671  * htmlParseAttValue:
2672  * @ctxt:  an HTML parser context
2673  *
2674  * parse a value for an attribute
2675  * Note: the parser won't do substitution of entities here, this
2676  * will be handled later in xmlStringGetNodeList, unless it was
2677  * asked for ctxt->replaceEntities != 0
2678  *
2679  * Returns the AttValue parsed or NULL.
2680  */
2681 
2682 static xmlChar *
htmlParseAttValue(htmlParserCtxtPtr ctxt)2683 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2684     xmlChar *ret = NULL;
2685 
2686     if (CUR == '"') {
2687         NEXT;
2688 	ret = htmlParseHTMLAttribute(ctxt, '"');
2689         if (CUR != '"') {
2690 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2691 	                 "AttValue: \" expected\n", NULL, NULL);
2692 	} else
2693 	    NEXT;
2694     } else if (CUR == '\'') {
2695         NEXT;
2696 	ret = htmlParseHTMLAttribute(ctxt, '\'');
2697         if (CUR != '\'') {
2698 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2699 	                 "AttValue: ' expected\n", NULL, NULL);
2700 	} else
2701 	    NEXT;
2702     } else {
2703         /*
2704 	 * That's an HTMLism, the attribute value may not be quoted
2705 	 */
2706 	ret = htmlParseHTMLAttribute(ctxt, 0);
2707 	if (ret == NULL) {
2708 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2709 	                 "AttValue: no value found\n", NULL, NULL);
2710 	}
2711     }
2712     return(ret);
2713 }
2714 
2715 /**
2716  * htmlParseSystemLiteral:
2717  * @ctxt:  an HTML parser context
2718  *
2719  * parse an HTML Literal
2720  *
2721  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2722  *
2723  * Returns the SystemLiteral parsed or NULL
2724  */
2725 
2726 static xmlChar *
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt)2727 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2728     const xmlChar *q;
2729     xmlChar *ret = NULL;
2730 
2731     if (CUR == '"') {
2732         NEXT;
2733 	q = CUR_PTR;
2734 	while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
2735 	    NEXT;
2736 	if (!IS_CHAR_CH(CUR)) {
2737 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2738 			 "Unfinished SystemLiteral\n", NULL, NULL);
2739 	} else {
2740 	    ret = xmlStrndup(q, CUR_PTR - q);
2741 	    NEXT;
2742         }
2743     } else if (CUR == '\'') {
2744         NEXT;
2745 	q = CUR_PTR;
2746 	while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
2747 	    NEXT;
2748 	if (!IS_CHAR_CH(CUR)) {
2749 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2750 			 "Unfinished SystemLiteral\n", NULL, NULL);
2751 	} else {
2752 	    ret = xmlStrndup(q, CUR_PTR - q);
2753 	    NEXT;
2754         }
2755     } else {
2756 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2757 	             " or ' expected\n", NULL, NULL);
2758     }
2759 
2760     return(ret);
2761 }
2762 
2763 /**
2764  * htmlParsePubidLiteral:
2765  * @ctxt:  an HTML parser context
2766  *
2767  * parse an HTML public literal
2768  *
2769  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2770  *
2771  * Returns the PubidLiteral parsed or NULL.
2772  */
2773 
2774 static xmlChar *
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt)2775 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2776     const xmlChar *q;
2777     xmlChar *ret = NULL;
2778     /*
2779      * Name ::= (Letter | '_') (NameChar)*
2780      */
2781     if (CUR == '"') {
2782         NEXT;
2783 	q = CUR_PTR;
2784 	while (IS_PUBIDCHAR_CH(CUR)) NEXT;
2785 	if (CUR != '"') {
2786 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2787 	                 "Unfinished PubidLiteral\n", NULL, NULL);
2788 	} else {
2789 	    ret = xmlStrndup(q, CUR_PTR - q);
2790 	    NEXT;
2791 	}
2792     } else if (CUR == '\'') {
2793         NEXT;
2794 	q = CUR_PTR;
2795 	while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
2796 	    NEXT;
2797 	if (CUR != '\'') {
2798 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2799 	                 "Unfinished PubidLiteral\n", NULL, NULL);
2800 	} else {
2801 	    ret = xmlStrndup(q, CUR_PTR - q);
2802 	    NEXT;
2803 	}
2804     } else {
2805 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2806 	             "PubidLiteral \" or ' expected\n", NULL, NULL);
2807     }
2808 
2809     return(ret);
2810 }
2811 
2812 /**
2813  * htmlParseScript:
2814  * @ctxt:  an HTML parser context
2815  *
2816  * parse the content of an HTML SCRIPT or STYLE element
2817  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2818  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2819  * http://www.w3.org/TR/html4/types.html#type-script
2820  * http://www.w3.org/TR/html4/types.html#h-6.15
2821  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2822  *
2823  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2824  * element and the value of intrinsic event attributes. User agents must
2825  * not evaluate script data as HTML markup but instead must pass it on as
2826  * data to a script engine.
2827  * NOTES:
2828  * - The content is passed like CDATA
2829  * - the attributes for style and scripting "onXXX" are also described
2830  *   as CDATA but SGML allows entities references in attributes so their
2831  *   processing is identical as other attributes
2832  */
2833 static void
htmlParseScript(htmlParserCtxtPtr ctxt)2834 htmlParseScript(htmlParserCtxtPtr ctxt) {
2835     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2836     int nbchar = 0;
2837     int cur,l;
2838 
2839     SHRINK;
2840     cur = CUR_CHAR(l);
2841     while (IS_CHAR_CH(cur)) {
2842 	if ((cur == '<') && (NXT(1) == '/')) {
2843             /*
2844              * One should break here, the specification is clear:
2845              * Authors should therefore escape "</" within the content.
2846              * Escape mechanisms are specific to each scripting or
2847              * style sheet language.
2848              *
2849              * In recovery mode, only break if end tag match the
2850              * current tag, effectively ignoring all tags inside the
2851              * script/style block and treating the entire block as
2852              * CDATA.
2853              */
2854             if (ctxt->recovery) {
2855                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2856 				   xmlStrlen(ctxt->name)) == 0)
2857                 {
2858                     break; /* while */
2859                 } else {
2860 		    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2861 				 "Element %s embeds close tag\n",
2862 		                 ctxt->name, NULL);
2863 		}
2864             } else {
2865                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2866                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2867                 {
2868                     break; /* while */
2869                 }
2870             }
2871 	}
2872 	COPY_BUF(l,buf,nbchar,cur);
2873 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2874 	    if (ctxt->sax->cdataBlock!= NULL) {
2875 		/*
2876 		 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2877 		 */
2878 		ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2879 	    } else if (ctxt->sax->characters != NULL) {
2880 		ctxt->sax->characters(ctxt->userData, buf, nbchar);
2881 	    }
2882 	    nbchar = 0;
2883 	}
2884 	GROW;
2885 	NEXTL(l);
2886 	cur = CUR_CHAR(l);
2887     }
2888 
2889     if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
2890         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2891                     "Invalid char in CDATA 0x%X\n", cur);
2892         if (ctxt->input->cur < ctxt->input->end) {
2893             NEXT;
2894         }
2895     }
2896 
2897     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2898 	if (ctxt->sax->cdataBlock!= NULL) {
2899 	    /*
2900 	     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2901 	     */
2902 	    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2903 	} else if (ctxt->sax->characters != NULL) {
2904 	    ctxt->sax->characters(ctxt->userData, buf, nbchar);
2905 	}
2906     }
2907 }
2908 
2909 
2910 /**
2911  * htmlParseCharData:
2912  * @ctxt:  an HTML parser context
2913  *
2914  * parse a CharData section.
2915  * if we are within a CDATA section ']]>' marks an end of section.
2916  *
2917  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2918  */
2919 
2920 static void
htmlParseCharData(htmlParserCtxtPtr ctxt)2921 htmlParseCharData(htmlParserCtxtPtr ctxt) {
2922     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2923     int nbchar = 0;
2924     int cur, l;
2925     int chunk = 0;
2926 
2927     SHRINK;
2928     cur = CUR_CHAR(l);
2929     while (((cur != '<') || (ctxt->token == '<')) &&
2930            ((cur != '&') || (ctxt->token == '&')) &&
2931 	   (cur != 0)) {
2932 	if (!(IS_CHAR(cur))) {
2933 	    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2934 	                "Invalid char in CDATA 0x%X\n", cur);
2935 	} else {
2936 	    COPY_BUF(l,buf,nbchar,cur);
2937 	}
2938 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2939 	    /*
2940 	     * Ok the segment is to be consumed as chars.
2941 	     */
2942 	    if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2943 		if (areBlanks(ctxt, buf, nbchar)) {
2944 		    if (ctxt->sax->ignorableWhitespace != NULL)
2945 			ctxt->sax->ignorableWhitespace(ctxt->userData,
2946 			                               buf, nbchar);
2947 		} else {
2948 		    htmlCheckParagraph(ctxt);
2949 		    if (ctxt->sax->characters != NULL)
2950 			ctxt->sax->characters(ctxt->userData, buf, nbchar);
2951 		}
2952 	    }
2953 	    nbchar = 0;
2954 	}
2955 	NEXTL(l);
2956         chunk++;
2957         if (chunk > HTML_PARSER_BUFFER_SIZE) {
2958             chunk = 0;
2959             SHRINK;
2960             GROW;
2961         }
2962 	cur = CUR_CHAR(l);
2963 	if (cur == 0) {
2964 	    SHRINK;
2965 	    GROW;
2966 	    cur = CUR_CHAR(l);
2967 	}
2968     }
2969     if (nbchar != 0) {
2970         buf[nbchar] = 0;
2971 
2972 	/*
2973 	 * Ok the segment is to be consumed as chars.
2974 	 */
2975 	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2976 	    if (areBlanks(ctxt, buf, nbchar)) {
2977 		if (ctxt->sax->ignorableWhitespace != NULL)
2978 		    ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2979 	    } else {
2980 		htmlCheckParagraph(ctxt);
2981 		if (ctxt->sax->characters != NULL)
2982 		    ctxt->sax->characters(ctxt->userData, buf, nbchar);
2983 	    }
2984 	}
2985     } else {
2986 	/*
2987 	 * Loop detection
2988 	 */
2989 	if (cur == 0)
2990 	    ctxt->instate = XML_PARSER_EOF;
2991     }
2992 }
2993 
2994 /**
2995  * htmlParseExternalID:
2996  * @ctxt:  an HTML parser context
2997  * @publicID:  a xmlChar** receiving PubidLiteral
2998  *
2999  * Parse an External ID or a Public ID
3000  *
3001  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3002  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3003  *
3004  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3005  *
3006  * Returns the function returns SystemLiteral and in the second
3007  *                case publicID receives PubidLiteral, is strict is off
3008  *                it is possible to return NULL and have publicID set.
3009  */
3010 
3011 static xmlChar *
htmlParseExternalID(htmlParserCtxtPtr ctxt,xmlChar ** publicID)3012 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3013     xmlChar *URI = NULL;
3014 
3015     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3016          (UPP(2) == 'S') && (UPP(3) == 'T') &&
3017 	 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3018         SKIP(6);
3019 	if (!IS_BLANK_CH(CUR)) {
3020 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3021 	                 "Space required after 'SYSTEM'\n", NULL, NULL);
3022 	}
3023         SKIP_BLANKS;
3024 	URI = htmlParseSystemLiteral(ctxt);
3025 	if (URI == NULL) {
3026 	    htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3027 	                 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3028         }
3029     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3030 	       (UPP(2) == 'B') && (UPP(3) == 'L') &&
3031 	       (UPP(4) == 'I') && (UPP(5) == 'C')) {
3032         SKIP(6);
3033 	if (!IS_BLANK_CH(CUR)) {
3034 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3035 	                 "Space required after 'PUBLIC'\n", NULL, NULL);
3036 	}
3037         SKIP_BLANKS;
3038 	*publicID = htmlParsePubidLiteral(ctxt);
3039 	if (*publicID == NULL) {
3040 	    htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3041 	                 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3042 			 NULL, NULL);
3043 	}
3044         SKIP_BLANKS;
3045         if ((CUR == '"') || (CUR == '\'')) {
3046 	    URI = htmlParseSystemLiteral(ctxt);
3047 	}
3048     }
3049     return(URI);
3050 }
3051 
3052 /**
3053  * xmlParsePI:
3054  * @ctxt:  an XML parser context
3055  *
3056  * parse an XML Processing Instruction.
3057  *
3058  * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3059  */
3060 static void
htmlParsePI(htmlParserCtxtPtr ctxt)3061 htmlParsePI(htmlParserCtxtPtr ctxt) {
3062     xmlChar *buf = NULL;
3063     int len = 0;
3064     int size = HTML_PARSER_BUFFER_SIZE;
3065     int cur, l;
3066     const xmlChar *target;
3067     xmlParserInputState state;
3068     int count = 0;
3069 
3070     if ((RAW == '<') && (NXT(1) == '?')) {
3071 	state = ctxt->instate;
3072         ctxt->instate = XML_PARSER_PI;
3073 	/*
3074 	 * this is a Processing Instruction.
3075 	 */
3076 	SKIP(2);
3077 	SHRINK;
3078 
3079 	/*
3080 	 * Parse the target name and check for special support like
3081 	 * namespace.
3082 	 */
3083         target = htmlParseName(ctxt);
3084 	if (target != NULL) {
3085 	    if (RAW == '>') {
3086 		SKIP(1);
3087 
3088 		/*
3089 		 * SAX: PI detected.
3090 		 */
3091 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
3092 		    (ctxt->sax->processingInstruction != NULL))
3093 		    ctxt->sax->processingInstruction(ctxt->userData,
3094 		                                     target, NULL);
3095 		ctxt->instate = state;
3096 		return;
3097 	    }
3098 	    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3099 	    if (buf == NULL) {
3100 		htmlErrMemory(ctxt, NULL);
3101 		ctxt->instate = state;
3102 		return;
3103 	    }
3104 	    cur = CUR;
3105 	    if (!IS_BLANK(cur)) {
3106 		htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3107 			  "ParsePI: PI %s space expected\n", target, NULL);
3108 	    }
3109             SKIP_BLANKS;
3110 	    cur = CUR_CHAR(l);
3111 	    while (IS_CHAR(cur) && (cur != '>')) {
3112 		if (len + 5 >= size) {
3113 		    xmlChar *tmp;
3114 
3115 		    size *= 2;
3116 		    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3117 		    if (tmp == NULL) {
3118 			htmlErrMemory(ctxt, NULL);
3119 			xmlFree(buf);
3120 			ctxt->instate = state;
3121 			return;
3122 		    }
3123 		    buf = tmp;
3124 		}
3125 		count++;
3126 		if (count > 50) {
3127 		    GROW;
3128 		    count = 0;
3129 		}
3130 		COPY_BUF(l,buf,len,cur);
3131 		NEXTL(l);
3132 		cur = CUR_CHAR(l);
3133 		if (cur == 0) {
3134 		    SHRINK;
3135 		    GROW;
3136 		    cur = CUR_CHAR(l);
3137 		}
3138 	    }
3139 	    buf[len] = 0;
3140 	    if (cur != '>') {
3141 		htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3142 		      "ParsePI: PI %s never end ...\n", target, NULL);
3143 	    } else {
3144 		SKIP(1);
3145 
3146 		/*
3147 		 * SAX: PI detected.
3148 		 */
3149 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
3150 		    (ctxt->sax->processingInstruction != NULL))
3151 		    ctxt->sax->processingInstruction(ctxt->userData,
3152 		                                     target, buf);
3153 	    }
3154 	    xmlFree(buf);
3155 	} else {
3156 	    htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3157                          "PI is not started correctly", NULL, NULL);
3158 	}
3159 	ctxt->instate = state;
3160     }
3161 }
3162 
3163 /**
3164  * htmlParseComment:
3165  * @ctxt:  an HTML parser context
3166  *
3167  * Parse an XML (SGML) comment <!-- .... -->
3168  *
3169  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3170  */
3171 static void
htmlParseComment(htmlParserCtxtPtr ctxt)3172 htmlParseComment(htmlParserCtxtPtr ctxt) {
3173     xmlChar *buf = NULL;
3174     int len;
3175     int size = HTML_PARSER_BUFFER_SIZE;
3176     int q, ql;
3177     int r, rl;
3178     int cur, l;
3179     xmlParserInputState state;
3180 
3181     /*
3182      * Check that there is a comment right here.
3183      */
3184     if ((RAW != '<') || (NXT(1) != '!') ||
3185         (NXT(2) != '-') || (NXT(3) != '-')) return;
3186 
3187     state = ctxt->instate;
3188     ctxt->instate = XML_PARSER_COMMENT;
3189     SHRINK;
3190     SKIP(4);
3191     buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3192     if (buf == NULL) {
3193         htmlErrMemory(ctxt, "buffer allocation failed\n");
3194 	ctxt->instate = state;
3195 	return;
3196     }
3197     q = CUR_CHAR(ql);
3198     NEXTL(ql);
3199     r = CUR_CHAR(rl);
3200     NEXTL(rl);
3201     cur = CUR_CHAR(l);
3202     len = 0;
3203     while (IS_CHAR(cur) &&
3204            ((cur != '>') ||
3205 	    (r != '-') || (q != '-'))) {
3206 	if (len + 5 >= size) {
3207 	    xmlChar *tmp;
3208 
3209 	    size *= 2;
3210 	    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3211 	    if (tmp == NULL) {
3212 	        xmlFree(buf);
3213 	        htmlErrMemory(ctxt, "growing buffer failed\n");
3214 		ctxt->instate = state;
3215 		return;
3216 	    }
3217 	    buf = tmp;
3218 	}
3219 	COPY_BUF(ql,buf,len,q);
3220 	q = r;
3221 	ql = rl;
3222 	r = cur;
3223 	rl = l;
3224 	NEXTL(l);
3225 	cur = CUR_CHAR(l);
3226 	if (cur == 0) {
3227 	    SHRINK;
3228 	    GROW;
3229 	    cur = CUR_CHAR(l);
3230 	}
3231     }
3232     buf[len] = 0;
3233     if (!IS_CHAR(cur)) {
3234 	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3235 	             "Comment not terminated \n<!--%.50s\n", buf, NULL);
3236 	xmlFree(buf);
3237     } else {
3238         NEXT;
3239 	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3240 	    (!ctxt->disableSAX))
3241 	    ctxt->sax->comment(ctxt->userData, buf);
3242 	xmlFree(buf);
3243     }
3244     ctxt->instate = state;
3245 }
3246 
3247 /**
3248  * htmlParseCharRef:
3249  * @ctxt:  an HTML parser context
3250  *
3251  * parse Reference declarations
3252  *
3253  * [66] CharRef ::= '&#' [0-9]+ ';' |
3254  *                  '&#x' [0-9a-fA-F]+ ';'
3255  *
3256  * Returns the value parsed (as an int)
3257  */
3258 int
htmlParseCharRef(htmlParserCtxtPtr ctxt)3259 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3260     int val = 0;
3261 
3262     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3263 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3264 		     "htmlParseCharRef: context error\n",
3265 		     NULL, NULL);
3266         return(0);
3267     }
3268     if ((CUR == '&') && (NXT(1) == '#') &&
3269         ((NXT(2) == 'x') || NXT(2) == 'X')) {
3270 	SKIP(3);
3271 	while (CUR != ';') {
3272 	    if ((CUR >= '0') && (CUR <= '9'))
3273 	        val = val * 16 + (CUR - '0');
3274 	    else if ((CUR >= 'a') && (CUR <= 'f'))
3275 	        val = val * 16 + (CUR - 'a') + 10;
3276 	    else if ((CUR >= 'A') && (CUR <= 'F'))
3277 	        val = val * 16 + (CUR - 'A') + 10;
3278 	    else {
3279 	        htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3280 		             "htmlParseCharRef: missing semicolon\n",
3281 			     NULL, NULL);
3282 		break;
3283 	    }
3284 	    NEXT;
3285 	}
3286 	if (CUR == ';')
3287 	    NEXT;
3288     } else if  ((CUR == '&') && (NXT(1) == '#')) {
3289 	SKIP(2);
3290 	while (CUR != ';') {
3291 	    if ((CUR >= '0') && (CUR <= '9'))
3292 	        val = val * 10 + (CUR - '0');
3293 	    else {
3294 	        htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3295 		             "htmlParseCharRef: missing semicolon\n",
3296 			     NULL, NULL);
3297 		break;
3298 	    }
3299 	    NEXT;
3300 	}
3301 	if (CUR == ';')
3302 	    NEXT;
3303     } else {
3304 	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3305 	             "htmlParseCharRef: invalid value\n", NULL, NULL);
3306     }
3307     /*
3308      * Check the value IS_CHAR ...
3309      */
3310     if (IS_CHAR(val)) {
3311         return(val);
3312     } else {
3313 	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3314 			"htmlParseCharRef: invalid xmlChar value %d\n",
3315 			val);
3316     }
3317     return(0);
3318 }
3319 
3320 
3321 /**
3322  * htmlParseDocTypeDecl:
3323  * @ctxt:  an HTML parser context
3324  *
3325  * parse a DOCTYPE declaration
3326  *
3327  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3328  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3329  */
3330 
3331 static void
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt)3332 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3333     const xmlChar *name;
3334     xmlChar *ExternalID = NULL;
3335     xmlChar *URI = NULL;
3336 
3337     /*
3338      * We know that '<!DOCTYPE' has been detected.
3339      */
3340     SKIP(9);
3341 
3342     SKIP_BLANKS;
3343 
3344     /*
3345      * Parse the DOCTYPE name.
3346      */
3347     name = htmlParseName(ctxt);
3348     if (name == NULL) {
3349 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3350 	             "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3351 		     NULL, NULL);
3352     }
3353     /*
3354      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3355      */
3356 
3357     SKIP_BLANKS;
3358 
3359     /*
3360      * Check for SystemID and ExternalID
3361      */
3362     URI = htmlParseExternalID(ctxt, &ExternalID);
3363     SKIP_BLANKS;
3364 
3365     /*
3366      * We should be at the end of the DOCTYPE declaration.
3367      */
3368     if (CUR != '>') {
3369 	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3370 	             "DOCTYPE improperly terminated\n", NULL, NULL);
3371         /* We shouldn't try to resynchronize ... */
3372     }
3373     NEXT;
3374 
3375     /*
3376      * Create or update the document accordingly to the DOCTYPE
3377      */
3378     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3379 	(!ctxt->disableSAX))
3380 	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3381 
3382     /*
3383      * Cleanup, since we don't use all those identifiers
3384      */
3385     if (URI != NULL) xmlFree(URI);
3386     if (ExternalID != NULL) xmlFree(ExternalID);
3387 }
3388 
3389 /**
3390  * htmlParseAttribute:
3391  * @ctxt:  an HTML parser context
3392  * @value:  a xmlChar ** used to store the value of the attribute
3393  *
3394  * parse an attribute
3395  *
3396  * [41] Attribute ::= Name Eq AttValue
3397  *
3398  * [25] Eq ::= S? '=' S?
3399  *
3400  * With namespace:
3401  *
3402  * [NS 11] Attribute ::= QName Eq AttValue
3403  *
3404  * Also the case QName == xmlns:??? is handled independently as a namespace
3405  * definition.
3406  *
3407  * Returns the attribute name, and the value in *value.
3408  */
3409 
3410 static const xmlChar *
htmlParseAttribute(htmlParserCtxtPtr ctxt,xmlChar ** value)3411 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3412     const xmlChar *name;
3413     xmlChar *val = NULL;
3414 
3415     *value = NULL;
3416     name = htmlParseHTMLName(ctxt);
3417     if (name == NULL) {
3418 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3419 	             "error parsing attribute name\n", NULL, NULL);
3420         return(NULL);
3421     }
3422 
3423     /*
3424      * read the value
3425      */
3426     SKIP_BLANKS;
3427     if (CUR == '=') {
3428         NEXT;
3429 	SKIP_BLANKS;
3430 	val = htmlParseAttValue(ctxt);
3431     }
3432 
3433     *value = val;
3434     return(name);
3435 }
3436 
3437 /**
3438  * htmlCheckEncoding:
3439  * @ctxt:  an HTML parser context
3440  * @attvalue: the attribute value
3441  *
3442  * Checks an http-equiv attribute from a Meta tag to detect
3443  * the encoding
3444  * If a new encoding is detected the parser is switched to decode
3445  * it and pass UTF8
3446  */
3447 static void
htmlCheckEncoding(htmlParserCtxtPtr ctxt,const xmlChar * attvalue)3448 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3449     const xmlChar *encoding;
3450 
3451     if ((ctxt == NULL) || (attvalue == NULL) ||
3452         (ctxt->options & HTML_PARSE_IGNORE_ENC))
3453 	return;
3454 
3455     /* do not change encoding */
3456     if (ctxt->input->encoding != NULL)
3457         return;
3458 
3459     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3460     if (encoding != NULL) {
3461 	encoding += 8;
3462     } else {
3463 	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3464 	if (encoding != NULL)
3465 	    encoding += 9;
3466     }
3467     if (encoding != NULL) {
3468 	xmlCharEncoding enc;
3469 	xmlCharEncodingHandlerPtr handler;
3470 
3471 	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3472 
3473 	if (ctxt->input->encoding != NULL)
3474 	    xmlFree((xmlChar *) ctxt->input->encoding);
3475 	ctxt->input->encoding = xmlStrdup(encoding);
3476 
3477 	enc = xmlParseCharEncoding((const char *) encoding);
3478 	/*
3479 	 * registered set of known encodings
3480 	 */
3481 	if (enc != XML_CHAR_ENCODING_ERROR) {
3482 	    if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3483 	         (enc == XML_CHAR_ENCODING_UTF16BE) ||
3484 		 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3485 		 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3486 		(ctxt->input->buf != NULL) &&
3487 		(ctxt->input->buf->encoder == NULL)) {
3488 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3489 		             "htmlCheckEncoding: wrong encoding meta\n",
3490 			     NULL, NULL);
3491 	    } else {
3492 		xmlSwitchEncoding(ctxt, enc);
3493 	    }
3494 	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
3495 	} else {
3496 	    /*
3497 	     * fallback for unknown encodings
3498 	     */
3499 	    handler = xmlFindCharEncodingHandler((const char *) encoding);
3500 	    if (handler != NULL) {
3501 		xmlSwitchToEncoding(ctxt, handler);
3502 		ctxt->charset = XML_CHAR_ENCODING_UTF8;
3503 	    } else {
3504 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3505 		             "htmlCheckEncoding: unknown encoding %s\n",
3506 			     encoding, NULL);
3507 	    }
3508 	}
3509 
3510 	if ((ctxt->input->buf != NULL) &&
3511 	    (ctxt->input->buf->encoder != NULL) &&
3512 	    (ctxt->input->buf->raw != NULL) &&
3513 	    (ctxt->input->buf->buffer != NULL)) {
3514 	    int nbchars;
3515 	    int processed;
3516 
3517 	    /*
3518 	     * convert as much as possible to the parser reading buffer.
3519 	     */
3520 	    processed = ctxt->input->cur - ctxt->input->base;
3521 	    xmlBufferShrink(ctxt->input->buf->buffer, processed);
3522 	    nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3523 		                       ctxt->input->buf->buffer,
3524 				       ctxt->input->buf->raw);
3525 	    if (nbchars < 0) {
3526 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3527 		             "htmlCheckEncoding: encoder error\n",
3528 			     NULL, NULL);
3529 	    }
3530 	    ctxt->input->base =
3531 	    ctxt->input->cur = ctxt->input->buf->buffer->content;
3532             ctxt->input->end =
3533                           &ctxt->input->base[ctxt->input->buf->buffer->use];
3534 	}
3535     }
3536 }
3537 
3538 /**
3539  * htmlCheckMeta:
3540  * @ctxt:  an HTML parser context
3541  * @atts:  the attributes values
3542  *
3543  * Checks an attributes from a Meta tag
3544  */
3545 static void
htmlCheckMeta(htmlParserCtxtPtr ctxt,const xmlChar ** atts)3546 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3547     int i;
3548     const xmlChar *att, *value;
3549     int http = 0;
3550     const xmlChar *content = NULL;
3551 
3552     if ((ctxt == NULL) || (atts == NULL))
3553 	return;
3554 
3555     i = 0;
3556     att = atts[i++];
3557     while (att != NULL) {
3558 	value = atts[i++];
3559 	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3560 	 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3561 	    http = 1;
3562 	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3563 	    content = value;
3564 	att = atts[i++];
3565     }
3566     if ((http) && (content != NULL))
3567 	htmlCheckEncoding(ctxt, content);
3568 
3569 }
3570 
3571 /**
3572  * htmlParseStartTag:
3573  * @ctxt:  an HTML parser context
3574  *
3575  * parse a start of tag either for rule element or
3576  * EmptyElement. In both case we don't parse the tag closing chars.
3577  *
3578  * [40] STag ::= '<' Name (S Attribute)* S? '>'
3579  *
3580  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3581  *
3582  * With namespace:
3583  *
3584  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3585  *
3586  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3587  *
3588  * Returns 0 in case of success, -1 in case of error and 1 if discarded
3589  */
3590 
3591 static int
htmlParseStartTag(htmlParserCtxtPtr ctxt)3592 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3593     const xmlChar *name;
3594     const xmlChar *attname;
3595     xmlChar *attvalue;
3596     const xmlChar **atts;
3597     int nbatts = 0;
3598     int maxatts;
3599     int meta = 0;
3600     int i;
3601     int discardtag = 0;
3602 
3603     if (ctxt->instate == XML_PARSER_EOF)
3604         return(-1);
3605     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3606 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3607 		     "htmlParseStartTag: context error\n", NULL, NULL);
3608 	return -1;
3609     }
3610     if (CUR != '<') return -1;
3611     NEXT;
3612 
3613     atts = ctxt->atts;
3614     maxatts = ctxt->maxatts;
3615 
3616     GROW;
3617     name = htmlParseHTMLName(ctxt);
3618     if (name == NULL) {
3619 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3620 	             "htmlParseStartTag: invalid element name\n",
3621 		     NULL, NULL);
3622 	/* Dump the bogus tag like browsers do */
3623 	while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3624                (ctxt->instate != XML_PARSER_EOF))
3625 	    NEXT;
3626         return -1;
3627     }
3628     if (xmlStrEqual(name, BAD_CAST"meta"))
3629 	meta = 1;
3630 
3631     /*
3632      * Check for auto-closure of HTML elements.
3633      */
3634     htmlAutoClose(ctxt, name);
3635 
3636     /*
3637      * Check for implied HTML elements.
3638      */
3639     htmlCheckImplied(ctxt, name);
3640 
3641     /*
3642      * Avoid html at any level > 0, head at any level != 1
3643      * or any attempt to recurse body
3644      */
3645     if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3646 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3647 	             "htmlParseStartTag: misplaced <html> tag\n",
3648 		     name, NULL);
3649 	discardtag = 1;
3650 	ctxt->depth++;
3651     }
3652     if ((ctxt->nameNr != 1) &&
3653 	(xmlStrEqual(name, BAD_CAST"head"))) {
3654 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3655 	             "htmlParseStartTag: misplaced <head> tag\n",
3656 		     name, NULL);
3657 	discardtag = 1;
3658 	ctxt->depth++;
3659     }
3660     if (xmlStrEqual(name, BAD_CAST"body")) {
3661 	int indx;
3662 	for (indx = 0;indx < ctxt->nameNr;indx++) {
3663 	    if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3664 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3665 		             "htmlParseStartTag: misplaced <body> tag\n",
3666 			     name, NULL);
3667 		discardtag = 1;
3668 		ctxt->depth++;
3669 	    }
3670 	}
3671     }
3672 
3673     /*
3674      * Now parse the attributes, it ends up with the ending
3675      *
3676      * (S Attribute)* S?
3677      */
3678     SKIP_BLANKS;
3679     while ((IS_CHAR_CH(CUR)) &&
3680            (CUR != '>') &&
3681 	   ((CUR != '/') || (NXT(1) != '>'))) {
3682 	long cons = ctxt->nbChars;
3683 
3684 	GROW;
3685 	attname = htmlParseAttribute(ctxt, &attvalue);
3686         if (attname != NULL) {
3687 
3688 	    /*
3689 	     * Well formedness requires at most one declaration of an attribute
3690 	     */
3691 	    for (i = 0; i < nbatts;i += 2) {
3692 	        if (xmlStrEqual(atts[i], attname)) {
3693 		    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3694 		                 "Attribute %s redefined\n", attname, NULL);
3695 		    if (attvalue != NULL)
3696 			xmlFree(attvalue);
3697 		    goto failed;
3698 		}
3699 	    }
3700 
3701 	    /*
3702 	     * Add the pair to atts
3703 	     */
3704 	    if (atts == NULL) {
3705 	        maxatts = 22; /* allow for 10 attrs by default */
3706 	        atts = (const xmlChar **)
3707 		       xmlMalloc(maxatts * sizeof(xmlChar *));
3708 		if (atts == NULL) {
3709 		    htmlErrMemory(ctxt, NULL);
3710 		    if (attvalue != NULL)
3711 			xmlFree(attvalue);
3712 		    goto failed;
3713 		}
3714 		ctxt->atts = atts;
3715 		ctxt->maxatts = maxatts;
3716 	    } else if (nbatts + 4 > maxatts) {
3717 	        const xmlChar **n;
3718 
3719 	        maxatts *= 2;
3720 	        n = (const xmlChar **) xmlRealloc((void *) atts,
3721 					     maxatts * sizeof(const xmlChar *));
3722 		if (n == NULL) {
3723 		    htmlErrMemory(ctxt, NULL);
3724 		    if (attvalue != NULL)
3725 			xmlFree(attvalue);
3726 		    goto failed;
3727 		}
3728 		atts = n;
3729 		ctxt->atts = atts;
3730 		ctxt->maxatts = maxatts;
3731 	    }
3732 	    atts[nbatts++] = attname;
3733 	    atts[nbatts++] = attvalue;
3734 	    atts[nbatts] = NULL;
3735 	    atts[nbatts + 1] = NULL;
3736 	}
3737 	else {
3738 	    if (attvalue != NULL)
3739 	        xmlFree(attvalue);
3740 	    /* Dump the bogus attribute string up to the next blank or
3741 	     * the end of the tag. */
3742 	    while ((IS_CHAR_CH(CUR)) &&
3743 	           !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3744 		   ((CUR != '/') || (NXT(1) != '>')))
3745 		NEXT;
3746 	}
3747 
3748 failed:
3749 	SKIP_BLANKS;
3750         if (cons == ctxt->nbChars) {
3751 	    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3752 	                 "htmlParseStartTag: problem parsing attributes\n",
3753 			 NULL, NULL);
3754 	    break;
3755 	}
3756     }
3757 
3758     /*
3759      * Handle specific association to the META tag
3760      */
3761     if (meta && (nbatts != 0))
3762 	htmlCheckMeta(ctxt, atts);
3763 
3764     /*
3765      * SAX: Start of Element !
3766      */
3767     if (!discardtag) {
3768 	htmlnamePush(ctxt, name);
3769 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3770 	    if (nbatts != 0)
3771 		ctxt->sax->startElement(ctxt->userData, name, atts);
3772 	    else
3773 		ctxt->sax->startElement(ctxt->userData, name, NULL);
3774 	}
3775     }
3776 
3777     if (atts != NULL) {
3778         for (i = 1;i < nbatts;i += 2) {
3779 	    if (atts[i] != NULL)
3780 		xmlFree((xmlChar *) atts[i]);
3781 	}
3782     }
3783 
3784     return(discardtag);
3785 }
3786 
3787 /**
3788  * htmlParseEndTag:
3789  * @ctxt:  an HTML parser context
3790  *
3791  * parse an end of tag
3792  *
3793  * [42] ETag ::= '</' Name S? '>'
3794  *
3795  * With namespace
3796  *
3797  * [NS 9] ETag ::= '</' QName S? '>'
3798  *
3799  * Returns 1 if the current level should be closed.
3800  */
3801 
3802 static int
htmlParseEndTag(htmlParserCtxtPtr ctxt)3803 htmlParseEndTag(htmlParserCtxtPtr ctxt)
3804 {
3805     const xmlChar *name;
3806     const xmlChar *oldname;
3807     int i, ret;
3808 
3809     if ((CUR != '<') || (NXT(1) != '/')) {
3810         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3811 	             "htmlParseEndTag: '</' not found\n", NULL, NULL);
3812         return (0);
3813     }
3814     SKIP(2);
3815 
3816     name = htmlParseHTMLName(ctxt);
3817     if (name == NULL)
3818         return (0);
3819     /*
3820      * We should definitely be at the ending "S? '>'" part
3821      */
3822     SKIP_BLANKS;
3823     if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
3824         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3825 	             "End tag : expected '>'\n", NULL, NULL);
3826 	if (ctxt->recovery) {
3827 	    /*
3828 	     * We're not at the ending > !!
3829 	     * Error, unless in recover mode where we search forwards
3830 	     * until we find a >
3831 	     */
3832 	    while (CUR != '\0' && CUR != '>') NEXT;
3833 	    NEXT;
3834 	}
3835     } else
3836         NEXT;
3837 
3838     /*
3839      * if we ignored misplaced tags in htmlParseStartTag don't pop them
3840      * out now.
3841      */
3842     if ((ctxt->depth > 0) &&
3843         (xmlStrEqual(name, BAD_CAST "html") ||
3844          xmlStrEqual(name, BAD_CAST "body") ||
3845 	 xmlStrEqual(name, BAD_CAST "head"))) {
3846 	ctxt->depth--;
3847 	return (0);
3848     }
3849 
3850     /*
3851      * If the name read is not one of the element in the parsing stack
3852      * then return, it's just an error.
3853      */
3854     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3855         if (xmlStrEqual(name, ctxt->nameTab[i]))
3856             break;
3857     }
3858     if (i < 0) {
3859         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3860 	             "Unexpected end tag : %s\n", name, NULL);
3861         return (0);
3862     }
3863 
3864 
3865     /*
3866      * Check for auto-closure of HTML elements.
3867      */
3868 
3869     htmlAutoCloseOnClose(ctxt, name);
3870 
3871     /*
3872      * Well formedness constraints, opening and closing must match.
3873      * With the exception that the autoclose may have popped stuff out
3874      * of the stack.
3875      */
3876     if (!xmlStrEqual(name, ctxt->name)) {
3877         if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
3878             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3879 	                 "Opening and ending tag mismatch: %s and %s\n",
3880 			 name, ctxt->name);
3881         }
3882     }
3883 
3884     /*
3885      * SAX: End of Tag
3886      */
3887     oldname = ctxt->name;
3888     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3889         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3890             ctxt->sax->endElement(ctxt->userData, name);
3891         htmlnamePop(ctxt);
3892         ret = 1;
3893     } else {
3894         ret = 0;
3895     }
3896 
3897     return (ret);
3898 }
3899 
3900 
3901 /**
3902  * htmlParseReference:
3903  * @ctxt:  an HTML parser context
3904  *
3905  * parse and handle entity references in content,
3906  * this will end-up in a call to character() since this is either a
3907  * CharRef, or a predefined entity.
3908  */
3909 static void
htmlParseReference(htmlParserCtxtPtr ctxt)3910 htmlParseReference(htmlParserCtxtPtr ctxt) {
3911     const htmlEntityDesc * ent;
3912     xmlChar out[6];
3913     const xmlChar *name;
3914     if (CUR != '&') return;
3915 
3916     if (NXT(1) == '#') {
3917 	unsigned int c;
3918 	int bits, i = 0;
3919 
3920 	c = htmlParseCharRef(ctxt);
3921 	if (c == 0)
3922 	    return;
3923 
3924         if      (c <    0x80) { out[i++]= c;                bits= -6; }
3925         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
3926         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
3927         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
3928 
3929         for ( ; bits >= 0; bits-= 6) {
3930             out[i++]= ((c >> bits) & 0x3F) | 0x80;
3931         }
3932 	out[i] = 0;
3933 
3934 	htmlCheckParagraph(ctxt);
3935 	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3936 	    ctxt->sax->characters(ctxt->userData, out, i);
3937     } else {
3938 	ent = htmlParseEntityRef(ctxt, &name);
3939 	if (name == NULL) {
3940 	    htmlCheckParagraph(ctxt);
3941 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3942 	        ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3943 	    return;
3944 	}
3945 	if ((ent == NULL) || !(ent->value > 0)) {
3946 	    htmlCheckParagraph(ctxt);
3947 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3948 		ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3949 		ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3950 		/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3951 	    }
3952 	} else {
3953 	    unsigned int c;
3954 	    int bits, i = 0;
3955 
3956 	    c = ent->value;
3957 	    if      (c <    0x80)
3958 	            { out[i++]= c;                bits= -6; }
3959 	    else if (c <   0x800)
3960 	            { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
3961 	    else if (c < 0x10000)
3962 	            { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
3963 	    else
3964 	            { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
3965 
3966 	    for ( ; bits >= 0; bits-= 6) {
3967 		out[i++]= ((c >> bits) & 0x3F) | 0x80;
3968 	    }
3969 	    out[i] = 0;
3970 
3971 	    htmlCheckParagraph(ctxt);
3972 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3973 		ctxt->sax->characters(ctxt->userData, out, i);
3974 	}
3975     }
3976 }
3977 
3978 /**
3979  * htmlParseContent:
3980  * @ctxt:  an HTML parser context
3981  *
3982  * Parse a content: comment, sub-element, reference or text.
3983  * Kept for compatibility with old code
3984  */
3985 
3986 static void
htmlParseContent(htmlParserCtxtPtr ctxt)3987 htmlParseContent(htmlParserCtxtPtr ctxt) {
3988     xmlChar *currentNode;
3989     int depth;
3990     const xmlChar *name;
3991 
3992     currentNode = xmlStrdup(ctxt->name);
3993     depth = ctxt->nameNr;
3994     while (1) {
3995 	long cons = ctxt->nbChars;
3996 
3997         GROW;
3998 
3999         if (ctxt->instate == XML_PARSER_EOF)
4000             break;
4001 
4002 	/*
4003 	 * Our tag or one of it's parent or children is ending.
4004 	 */
4005         if ((CUR == '<') && (NXT(1) == '/')) {
4006 	    if (htmlParseEndTag(ctxt) &&
4007 		((currentNode != NULL) || (ctxt->nameNr == 0))) {
4008 		if (currentNode != NULL)
4009 		    xmlFree(currentNode);
4010 		return;
4011 	    }
4012 	    continue; /* while */
4013         }
4014 
4015 	else if ((CUR == '<') &&
4016 	         ((IS_ASCII_LETTER(NXT(1))) ||
4017 		  (NXT(1) == '_') || (NXT(1) == ':'))) {
4018 	    name = htmlParseHTMLName_nonInvasive(ctxt);
4019 	    if (name == NULL) {
4020 	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4021 			 "htmlParseStartTag: invalid element name\n",
4022 			 NULL, NULL);
4023 	        /* Dump the bogus tag like browsers do */
4024         while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4025 	            NEXT;
4026 
4027 	        if (currentNode != NULL)
4028 	            xmlFree(currentNode);
4029 	        return;
4030 	    }
4031 
4032 	    if (ctxt->name != NULL) {
4033 	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4034 	            htmlAutoClose(ctxt, name);
4035 	            continue;
4036 	        }
4037 	    }
4038 	}
4039 
4040 	/*
4041 	 * Has this node been popped out during parsing of
4042 	 * the next element
4043 	 */
4044         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4045 	    (!xmlStrEqual(currentNode, ctxt->name)))
4046 	     {
4047 	    if (currentNode != NULL) xmlFree(currentNode);
4048 	    return;
4049 	}
4050 
4051 	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4052 	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4053 	    /*
4054 	     * Handle SCRIPT/STYLE separately
4055 	     */
4056 	    htmlParseScript(ctxt);
4057 	} else {
4058 	    /*
4059 	     * Sometimes DOCTYPE arrives in the middle of the document
4060 	     */
4061 	    if ((CUR == '<') && (NXT(1) == '!') &&
4062 		(UPP(2) == 'D') && (UPP(3) == 'O') &&
4063 		(UPP(4) == 'C') && (UPP(5) == 'T') &&
4064 		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4065 		(UPP(8) == 'E')) {
4066 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4067 		             "Misplaced DOCTYPE declaration\n",
4068 			     BAD_CAST "DOCTYPE" , NULL);
4069 		htmlParseDocTypeDecl(ctxt);
4070 	    }
4071 
4072 	    /*
4073 	     * First case :  a comment
4074 	     */
4075 	    if ((CUR == '<') && (NXT(1) == '!') &&
4076 		(NXT(2) == '-') && (NXT(3) == '-')) {
4077 		htmlParseComment(ctxt);
4078 	    }
4079 
4080 	    /*
4081 	     * Second case : a Processing Instruction.
4082 	     */
4083 	    else if ((CUR == '<') && (NXT(1) == '?')) {
4084 		htmlParsePI(ctxt);
4085 	    }
4086 
4087 	    /*
4088 	     * Third case :  a sub-element.
4089 	     */
4090 	    else if (CUR == '<') {
4091 		htmlParseElement(ctxt);
4092 	    }
4093 
4094 	    /*
4095 	     * Fourth case : a reference. If if has not been resolved,
4096 	     *    parsing returns it's Name, create the node
4097 	     */
4098 	    else if (CUR == '&') {
4099 		htmlParseReference(ctxt);
4100 	    }
4101 
4102 	    /*
4103 	     * Fifth case : end of the resource
4104 	     */
4105 	    else if (CUR == 0) {
4106 		htmlAutoCloseOnEnd(ctxt);
4107 		break;
4108 	    }
4109 
4110 	    /*
4111 	     * Last case, text. Note that References are handled directly.
4112 	     */
4113 	    else {
4114 		htmlParseCharData(ctxt);
4115 	    }
4116 
4117 	    if (cons == ctxt->nbChars) {
4118 		if (ctxt->node != NULL) {
4119 		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4120 		                 "detected an error in element content\n",
4121 				 NULL, NULL);
4122 		}
4123 		break;
4124 	    }
4125 	}
4126         GROW;
4127     }
4128     if (currentNode != NULL) xmlFree(currentNode);
4129 }
4130 
4131 /**
4132  * htmlParseElement:
4133  * @ctxt:  an HTML parser context
4134  *
4135  * parse an HTML element, this is highly recursive
4136  * this is kept for compatibility with previous code versions
4137  *
4138  * [39] element ::= EmptyElemTag | STag content ETag
4139  *
4140  * [41] Attribute ::= Name Eq AttValue
4141  */
4142 
4143 void
htmlParseElement(htmlParserCtxtPtr ctxt)4144 htmlParseElement(htmlParserCtxtPtr ctxt) {
4145     const xmlChar *name;
4146     xmlChar *currentNode = NULL;
4147     const htmlElemDesc * info;
4148     htmlParserNodeInfo node_info;
4149     int failed;
4150     int depth;
4151     const xmlChar *oldptr;
4152 
4153     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4154 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4155 		     "htmlParseElement: context error\n", NULL, NULL);
4156 	return;
4157     }
4158 
4159     if (ctxt->instate == XML_PARSER_EOF)
4160         return;
4161 
4162     /* Capture start position */
4163     if (ctxt->record_info) {
4164         node_info.begin_pos = ctxt->input->consumed +
4165                           (CUR_PTR - ctxt->input->base);
4166 	node_info.begin_line = ctxt->input->line;
4167     }
4168 
4169     failed = htmlParseStartTag(ctxt);
4170     name = ctxt->name;
4171     if ((failed == -1) || (name == NULL)) {
4172 	if (CUR == '>')
4173 	    NEXT;
4174         return;
4175     }
4176 
4177     /*
4178      * Lookup the info for that element.
4179      */
4180     info = htmlTagLookup(name);
4181     if (info == NULL) {
4182 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4183 	             "Tag %s invalid\n", name, NULL);
4184     }
4185 
4186     /*
4187      * Check for an Empty Element labeled the XML/SGML way
4188      */
4189     if ((CUR == '/') && (NXT(1) == '>')) {
4190         SKIP(2);
4191 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4192 	    ctxt->sax->endElement(ctxt->userData, name);
4193 	htmlnamePop(ctxt);
4194 	return;
4195     }
4196 
4197     if (CUR == '>') {
4198         NEXT;
4199     } else {
4200 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4201 	             "Couldn't find end of Start Tag %s\n", name, NULL);
4202 
4203 	/*
4204 	 * end of parsing of this node.
4205 	 */
4206 	if (xmlStrEqual(name, ctxt->name)) {
4207 	    nodePop(ctxt);
4208 	    htmlnamePop(ctxt);
4209 	}
4210 
4211 	/*
4212 	 * Capture end position and add node
4213 	 */
4214 	if (ctxt->record_info) {
4215 	   node_info.end_pos = ctxt->input->consumed +
4216 			      (CUR_PTR - ctxt->input->base);
4217 	   node_info.end_line = ctxt->input->line;
4218 	   node_info.node = ctxt->node;
4219 	   xmlParserAddNodeInfo(ctxt, &node_info);
4220 	}
4221 	return;
4222     }
4223 
4224     /*
4225      * Check for an Empty Element from DTD definition
4226      */
4227     if ((info != NULL) && (info->empty)) {
4228 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4229 	    ctxt->sax->endElement(ctxt->userData, name);
4230 	htmlnamePop(ctxt);
4231 	return;
4232     }
4233 
4234     /*
4235      * Parse the content of the element:
4236      */
4237     currentNode = xmlStrdup(ctxt->name);
4238     depth = ctxt->nameNr;
4239     while (IS_CHAR_CH(CUR)) {
4240 	oldptr = ctxt->input->cur;
4241 	htmlParseContent(ctxt);
4242 	if (oldptr==ctxt->input->cur) break;
4243 	if (ctxt->nameNr < depth) break;
4244     }
4245 
4246     /*
4247      * Capture end position and add node
4248      */
4249     if ( currentNode != NULL && ctxt->record_info ) {
4250        node_info.end_pos = ctxt->input->consumed +
4251                           (CUR_PTR - ctxt->input->base);
4252        node_info.end_line = ctxt->input->line;
4253        node_info.node = ctxt->node;
4254        xmlParserAddNodeInfo(ctxt, &node_info);
4255     }
4256     if (!IS_CHAR_CH(CUR)) {
4257 	htmlAutoCloseOnEnd(ctxt);
4258     }
4259 
4260     if (currentNode != NULL)
4261 	xmlFree(currentNode);
4262 }
4263 
4264 static void
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt)4265 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4266     /*
4267      * Capture end position and add node
4268      */
4269     if ( ctxt->node != NULL && ctxt->record_info ) {
4270        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4271                                 (CUR_PTR - ctxt->input->base);
4272        ctxt->nodeInfo->end_line = ctxt->input->line;
4273        ctxt->nodeInfo->node = ctxt->node;
4274        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4275        htmlNodeInfoPop(ctxt);
4276     }
4277     if (!IS_CHAR_CH(CUR)) {
4278        htmlAutoCloseOnEnd(ctxt);
4279     }
4280 }
4281 
4282 /**
4283  * htmlParseElementInternal:
4284  * @ctxt:  an HTML parser context
4285  *
4286  * parse an HTML element, new version, non recursive
4287  *
4288  * [39] element ::= EmptyElemTag | STag content ETag
4289  *
4290  * [41] Attribute ::= Name Eq AttValue
4291  */
4292 
4293 static void
htmlParseElementInternal(htmlParserCtxtPtr ctxt)4294 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4295     const xmlChar *name;
4296     const htmlElemDesc * info;
4297     htmlParserNodeInfo node_info;
4298     int failed;
4299 
4300     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4301 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4302 		     "htmlParseElementInternal: context error\n", NULL, NULL);
4303 	return;
4304     }
4305 
4306     if (ctxt->instate == XML_PARSER_EOF)
4307         return;
4308 
4309     /* Capture start position */
4310     if (ctxt->record_info) {
4311         node_info.begin_pos = ctxt->input->consumed +
4312                           (CUR_PTR - ctxt->input->base);
4313 	node_info.begin_line = ctxt->input->line;
4314     }
4315 
4316     failed = htmlParseStartTag(ctxt);
4317     name = ctxt->name;
4318     if ((failed == -1) || (name == NULL)) {
4319 	if (CUR == '>')
4320 	    NEXT;
4321         return;
4322     }
4323 
4324     /*
4325      * Lookup the info for that element.
4326      */
4327     info = htmlTagLookup(name);
4328     if (info == NULL) {
4329 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4330 	             "Tag %s invalid\n", name, NULL);
4331     }
4332 
4333     /*
4334      * Check for an Empty Element labeled the XML/SGML way
4335      */
4336     if ((CUR == '/') && (NXT(1) == '>')) {
4337         SKIP(2);
4338 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4339 	    ctxt->sax->endElement(ctxt->userData, name);
4340 	htmlnamePop(ctxt);
4341 	return;
4342     }
4343 
4344     if (CUR == '>') {
4345         NEXT;
4346     } else {
4347 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4348 	             "Couldn't find end of Start Tag %s\n", name, NULL);
4349 
4350 	/*
4351 	 * end of parsing of this node.
4352 	 */
4353 	if (xmlStrEqual(name, ctxt->name)) {
4354 	    nodePop(ctxt);
4355 	    htmlnamePop(ctxt);
4356 	}
4357 
4358         if (ctxt->record_info)
4359             htmlNodeInfoPush(ctxt, &node_info);
4360         htmlParserFinishElementParsing(ctxt);
4361 	return;
4362     }
4363 
4364     /*
4365      * Check for an Empty Element from DTD definition
4366      */
4367     if ((info != NULL) && (info->empty)) {
4368 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4369 	    ctxt->sax->endElement(ctxt->userData, name);
4370 	htmlnamePop(ctxt);
4371 	return;
4372     }
4373 
4374     if (ctxt->record_info)
4375         htmlNodeInfoPush(ctxt, &node_info);
4376 }
4377 
4378 /**
4379  * htmlParseContentInternal:
4380  * @ctxt:  an HTML parser context
4381  *
4382  * Parse a content: comment, sub-element, reference or text.
4383  * New version for non recursive htmlParseElementInternal
4384  */
4385 
4386 static void
htmlParseContentInternal(htmlParserCtxtPtr ctxt)4387 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4388     xmlChar *currentNode;
4389     int depth;
4390     const xmlChar *name;
4391 
4392     currentNode = xmlStrdup(ctxt->name);
4393     depth = ctxt->nameNr;
4394     while (1) {
4395 	long cons = ctxt->nbChars;
4396 
4397         GROW;
4398 
4399         if (ctxt->instate == XML_PARSER_EOF)
4400             break;
4401 
4402 	/*
4403 	 * Our tag or one of it's parent or children is ending.
4404 	 */
4405         if ((CUR == '<') && (NXT(1) == '/')) {
4406 	    if (htmlParseEndTag(ctxt) &&
4407 		((currentNode != NULL) || (ctxt->nameNr == 0))) {
4408 		if (currentNode != NULL)
4409 		    xmlFree(currentNode);
4410 
4411 	        currentNode = xmlStrdup(ctxt->name);
4412 	        depth = ctxt->nameNr;
4413 	    }
4414 	    continue; /* while */
4415         }
4416 
4417 	else if ((CUR == '<') &&
4418 	         ((IS_ASCII_LETTER(NXT(1))) ||
4419 		  (NXT(1) == '_') || (NXT(1) == ':'))) {
4420 	    name = htmlParseHTMLName_nonInvasive(ctxt);
4421 	    if (name == NULL) {
4422 	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4423 			 "htmlParseStartTag: invalid element name\n",
4424 			 NULL, NULL);
4425 	        /* Dump the bogus tag like browsers do */
4426 	        while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4427 	            NEXT;
4428 
4429 	        htmlParserFinishElementParsing(ctxt);
4430 	        if (currentNode != NULL)
4431 	            xmlFree(currentNode);
4432 
4433 	        currentNode = xmlStrdup(ctxt->name);
4434 	        depth = ctxt->nameNr;
4435 	        continue;
4436 	    }
4437 
4438 	    if (ctxt->name != NULL) {
4439 	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4440 	            htmlAutoClose(ctxt, name);
4441 	            continue;
4442 	        }
4443 	    }
4444 	}
4445 
4446 	/*
4447 	 * Has this node been popped out during parsing of
4448 	 * the next element
4449 	 */
4450         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4451 	    (!xmlStrEqual(currentNode, ctxt->name)))
4452 	     {
4453 	    htmlParserFinishElementParsing(ctxt);
4454 	    if (currentNode != NULL) xmlFree(currentNode);
4455 
4456 	    currentNode = xmlStrdup(ctxt->name);
4457 	    depth = ctxt->nameNr;
4458 	    continue;
4459 	}
4460 
4461 	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4462 	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4463 	    /*
4464 	     * Handle SCRIPT/STYLE separately
4465 	     */
4466 	    htmlParseScript(ctxt);
4467 	} else {
4468 	    /*
4469 	     * Sometimes DOCTYPE arrives in the middle of the document
4470 	     */
4471 	    if ((CUR == '<') && (NXT(1) == '!') &&
4472 		(UPP(2) == 'D') && (UPP(3) == 'O') &&
4473 		(UPP(4) == 'C') && (UPP(5) == 'T') &&
4474 		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4475 		(UPP(8) == 'E')) {
4476 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4477 		             "Misplaced DOCTYPE declaration\n",
4478 			     BAD_CAST "DOCTYPE" , NULL);
4479 		htmlParseDocTypeDecl(ctxt);
4480 	    }
4481 
4482 	    /*
4483 	     * First case :  a comment
4484 	     */
4485 	    if ((CUR == '<') && (NXT(1) == '!') &&
4486 		(NXT(2) == '-') && (NXT(3) == '-')) {
4487 		htmlParseComment(ctxt);
4488 	    }
4489 
4490 	    /*
4491 	     * Second case : a Processing Instruction.
4492 	     */
4493 	    else if ((CUR == '<') && (NXT(1) == '?')) {
4494 		htmlParsePI(ctxt);
4495 	    }
4496 
4497 	    /*
4498 	     * Third case :  a sub-element.
4499 	     */
4500 	    else if (CUR == '<') {
4501 		htmlParseElementInternal(ctxt);
4502 		if (currentNode != NULL) xmlFree(currentNode);
4503 
4504 		currentNode = xmlStrdup(ctxt->name);
4505 		depth = ctxt->nameNr;
4506 	    }
4507 
4508 	    /*
4509 	     * Fourth case : a reference. If if has not been resolved,
4510 	     *    parsing returns it's Name, create the node
4511 	     */
4512 	    else if (CUR == '&') {
4513 		htmlParseReference(ctxt);
4514 	    }
4515 
4516 	    /*
4517 	     * Fifth case : end of the resource
4518 	     */
4519 	    else if (CUR == 0) {
4520 		htmlAutoCloseOnEnd(ctxt);
4521 		break;
4522 	    }
4523 
4524 	    /*
4525 	     * Last case, text. Note that References are handled directly.
4526 	     */
4527 	    else {
4528 		htmlParseCharData(ctxt);
4529 	    }
4530 
4531 	    if (cons == ctxt->nbChars) {
4532 		if (ctxt->node != NULL) {
4533 		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4534 		                 "detected an error in element content\n",
4535 				 NULL, NULL);
4536 		}
4537 		break;
4538 	    }
4539 	}
4540         GROW;
4541     }
4542     if (currentNode != NULL) xmlFree(currentNode);
4543 }
4544 
4545 /**
4546  * htmlParseContent:
4547  * @ctxt:  an HTML parser context
4548  *
4549  * Parse a content: comment, sub-element, reference or text.
4550  * This is the entry point when called from parser.c
4551  */
4552 
4553 void
__htmlParseContent(void * ctxt)4554 __htmlParseContent(void *ctxt) {
4555     if (ctxt != NULL)
4556 	htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4557 }
4558 
4559 /**
4560  * htmlParseDocument:
4561  * @ctxt:  an HTML parser context
4562  *
4563  * parse an HTML document (and build a tree if using the standard SAX
4564  * interface).
4565  *
4566  * Returns 0, -1 in case of error. the parser context is augmented
4567  *                as a result of the parsing.
4568  */
4569 
4570 int
htmlParseDocument(htmlParserCtxtPtr ctxt)4571 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4572     xmlChar start[4];
4573     xmlCharEncoding enc;
4574     xmlDtdPtr dtd;
4575 
4576     xmlInitParser();
4577 
4578     htmlDefaultSAXHandlerInit();
4579 
4580     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4581 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4582 		     "htmlParseDocument: context error\n", NULL, NULL);
4583 	return(XML_ERR_INTERNAL_ERROR);
4584     }
4585     ctxt->html = 1;
4586     ctxt->linenumbers = 1;
4587     GROW;
4588     /*
4589      * SAX: beginning of the document processing.
4590      */
4591     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4592         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4593 
4594     if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4595         ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4596 	/*
4597 	 * Get the 4 first bytes and decode the charset
4598 	 * if enc != XML_CHAR_ENCODING_NONE
4599 	 * plug some encoding conversion routines.
4600 	 */
4601 	start[0] = RAW;
4602 	start[1] = NXT(1);
4603 	start[2] = NXT(2);
4604 	start[3] = NXT(3);
4605 	enc = xmlDetectCharEncoding(&start[0], 4);
4606 	if (enc != XML_CHAR_ENCODING_NONE) {
4607 	    xmlSwitchEncoding(ctxt, enc);
4608 	}
4609     }
4610 
4611     /*
4612      * Wipe out everything which is before the first '<'
4613      */
4614     SKIP_BLANKS;
4615     if (CUR == 0) {
4616 	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4617 	             "Document is empty\n", NULL, NULL);
4618     }
4619 
4620     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4621 	ctxt->sax->startDocument(ctxt->userData);
4622 
4623 
4624     /*
4625      * Parse possible comments and PIs before any content
4626      */
4627     while (((CUR == '<') && (NXT(1) == '!') &&
4628             (NXT(2) == '-') && (NXT(3) == '-')) ||
4629 	   ((CUR == '<') && (NXT(1) == '?'))) {
4630         htmlParseComment(ctxt);
4631         htmlParsePI(ctxt);
4632 	SKIP_BLANKS;
4633     }
4634 
4635 
4636     /*
4637      * Then possibly doc type declaration(s) and more Misc
4638      * (doctypedecl Misc*)?
4639      */
4640     if ((CUR == '<') && (NXT(1) == '!') &&
4641 	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4642 	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4643 	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4644 	(UPP(8) == 'E')) {
4645 	htmlParseDocTypeDecl(ctxt);
4646     }
4647     SKIP_BLANKS;
4648 
4649     /*
4650      * Parse possible comments and PIs before any content
4651      */
4652     while (((CUR == '<') && (NXT(1) == '!') &&
4653             (NXT(2) == '-') && (NXT(3) == '-')) ||
4654 	   ((CUR == '<') && (NXT(1) == '?'))) {
4655         htmlParseComment(ctxt);
4656         htmlParsePI(ctxt);
4657 	SKIP_BLANKS;
4658     }
4659 
4660     /*
4661      * Time to start parsing the tree itself
4662      */
4663     htmlParseContentInternal(ctxt);
4664 
4665     /*
4666      * autoclose
4667      */
4668     if (CUR == 0)
4669 	htmlAutoCloseOnEnd(ctxt);
4670 
4671 
4672     /*
4673      * SAX: end of the document processing.
4674      */
4675     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4676         ctxt->sax->endDocument(ctxt->userData);
4677 
4678     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4679 	dtd = xmlGetIntSubset(ctxt->myDoc);
4680 	if (dtd == NULL)
4681 	    ctxt->myDoc->intSubset =
4682 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4683 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4684 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4685     }
4686     if (! ctxt->wellFormed) return(-1);
4687     return(0);
4688 }
4689 
4690 
4691 /************************************************************************
4692  *									*
4693  *			Parser contexts handling			*
4694  *									*
4695  ************************************************************************/
4696 
4697 /**
4698  * htmlInitParserCtxt:
4699  * @ctxt:  an HTML parser context
4700  *
4701  * Initialize a parser context
4702  *
4703  * Returns 0 in case of success and -1 in case of error
4704  */
4705 
4706 static int
htmlInitParserCtxt(htmlParserCtxtPtr ctxt)4707 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4708 {
4709     htmlSAXHandler *sax;
4710 
4711     if (ctxt == NULL) return(-1);
4712     memset(ctxt, 0, sizeof(htmlParserCtxt));
4713 
4714     ctxt->dict = xmlDictCreate();
4715     if (ctxt->dict == NULL) {
4716         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4717 	return(-1);
4718     }
4719     sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4720     if (sax == NULL) {
4721         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4722 	return(-1);
4723     }
4724     else
4725         memset(sax, 0, sizeof(htmlSAXHandler));
4726 
4727     /* Allocate the Input stack */
4728     ctxt->inputTab = (htmlParserInputPtr *)
4729                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
4730     if (ctxt->inputTab == NULL) {
4731         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4732 	ctxt->inputNr = 0;
4733 	ctxt->inputMax = 0;
4734 	ctxt->input = NULL;
4735 	return(-1);
4736     }
4737     ctxt->inputNr = 0;
4738     ctxt->inputMax = 5;
4739     ctxt->input = NULL;
4740     ctxt->version = NULL;
4741     ctxt->encoding = NULL;
4742     ctxt->standalone = -1;
4743     ctxt->instate = XML_PARSER_START;
4744 
4745     /* Allocate the Node stack */
4746     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4747     if (ctxt->nodeTab == NULL) {
4748         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4749 	ctxt->nodeNr = 0;
4750 	ctxt->nodeMax = 0;
4751 	ctxt->node = NULL;
4752 	ctxt->inputNr = 0;
4753 	ctxt->inputMax = 0;
4754 	ctxt->input = NULL;
4755 	return(-1);
4756     }
4757     ctxt->nodeNr = 0;
4758     ctxt->nodeMax = 10;
4759     ctxt->node = NULL;
4760 
4761     /* Allocate the Name stack */
4762     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4763     if (ctxt->nameTab == NULL) {
4764         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4765 	ctxt->nameNr = 0;
4766 	ctxt->nameMax = 0;
4767 	ctxt->name = NULL;
4768 	ctxt->nodeNr = 0;
4769 	ctxt->nodeMax = 0;
4770 	ctxt->node = NULL;
4771 	ctxt->inputNr = 0;
4772 	ctxt->inputMax = 0;
4773 	ctxt->input = NULL;
4774 	return(-1);
4775     }
4776     ctxt->nameNr = 0;
4777     ctxt->nameMax = 10;
4778     ctxt->name = NULL;
4779 
4780     ctxt->nodeInfoTab = NULL;
4781     ctxt->nodeInfoNr  = 0;
4782     ctxt->nodeInfoMax = 0;
4783 
4784     if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4785     else {
4786         ctxt->sax = sax;
4787 	memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4788     }
4789     ctxt->userData = ctxt;
4790     ctxt->myDoc = NULL;
4791     ctxt->wellFormed = 1;
4792     ctxt->replaceEntities = 0;
4793     ctxt->linenumbers = xmlLineNumbersDefaultValue;
4794     ctxt->html = 1;
4795     ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4796     ctxt->vctxt.userData = ctxt;
4797     ctxt->vctxt.error = xmlParserValidityError;
4798     ctxt->vctxt.warning = xmlParserValidityWarning;
4799     ctxt->record_info = 0;
4800     ctxt->validate = 0;
4801     ctxt->nbChars = 0;
4802     ctxt->checkIndex = 0;
4803     ctxt->catalogs = NULL;
4804     xmlInitNodeInfoSeq(&ctxt->node_seq);
4805     return(0);
4806 }
4807 
4808 /**
4809  * htmlFreeParserCtxt:
4810  * @ctxt:  an HTML parser context
4811  *
4812  * Free all the memory used by a parser context. However the parsed
4813  * document in ctxt->myDoc is not freed.
4814  */
4815 
4816 void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)4817 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4818 {
4819     xmlFreeParserCtxt(ctxt);
4820 }
4821 
4822 /**
4823  * htmlNewParserCtxt:
4824  *
4825  * Allocate and initialize a new parser context.
4826  *
4827  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4828  */
4829 
4830 htmlParserCtxtPtr
htmlNewParserCtxt(void)4831 htmlNewParserCtxt(void)
4832 {
4833     xmlParserCtxtPtr ctxt;
4834 
4835     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4836     if (ctxt == NULL) {
4837         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4838 	return(NULL);
4839     }
4840     memset(ctxt, 0, sizeof(xmlParserCtxt));
4841     if (htmlInitParserCtxt(ctxt) < 0) {
4842         htmlFreeParserCtxt(ctxt);
4843 	return(NULL);
4844     }
4845     return(ctxt);
4846 }
4847 
4848 /**
4849  * htmlCreateMemoryParserCtxt:
4850  * @buffer:  a pointer to a char array
4851  * @size:  the size of the array
4852  *
4853  * Create a parser context for an HTML in-memory document.
4854  *
4855  * Returns the new parser context or NULL
4856  */
4857 htmlParserCtxtPtr
htmlCreateMemoryParserCtxt(const char * buffer,int size)4858 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4859     xmlParserCtxtPtr ctxt;
4860     xmlParserInputPtr input;
4861     xmlParserInputBufferPtr buf;
4862 
4863     if (buffer == NULL)
4864 	return(NULL);
4865     if (size <= 0)
4866 	return(NULL);
4867 
4868     ctxt = htmlNewParserCtxt();
4869     if (ctxt == NULL)
4870 	return(NULL);
4871 
4872     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4873     if (buf == NULL) return(NULL);
4874 
4875     input = xmlNewInputStream(ctxt);
4876     if (input == NULL) {
4877 	xmlFreeParserCtxt(ctxt);
4878 	return(NULL);
4879     }
4880 
4881     input->filename = NULL;
4882     input->buf = buf;
4883     input->base = input->buf->buffer->content;
4884     input->cur = input->buf->buffer->content;
4885     input->end = &input->buf->buffer->content[input->buf->buffer->use];
4886 
4887     inputPush(ctxt, input);
4888     return(ctxt);
4889 }
4890 
4891 /**
4892  * htmlCreateDocParserCtxt:
4893  * @cur:  a pointer to an array of xmlChar
4894  * @encoding:  a free form C string describing the HTML document encoding, or NULL
4895  *
4896  * Create a parser context for an HTML document.
4897  *
4898  * TODO: check the need to add encoding handling there
4899  *
4900  * Returns the new parser context or NULL
4901  */
4902 static htmlParserCtxtPtr
htmlCreateDocParserCtxt(const xmlChar * cur,const char * encoding)4903 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
4904     int len;
4905     htmlParserCtxtPtr ctxt;
4906 
4907     if (cur == NULL)
4908 	return(NULL);
4909     len = xmlStrlen(cur);
4910     ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4911     if (ctxt == NULL)
4912 	return(NULL);
4913 
4914     if (encoding != NULL) {
4915 	xmlCharEncoding enc;
4916 	xmlCharEncodingHandlerPtr handler;
4917 
4918 	if (ctxt->input->encoding != NULL)
4919 	    xmlFree((xmlChar *) ctxt->input->encoding);
4920 	ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
4921 
4922 	enc = xmlParseCharEncoding(encoding);
4923 	/*
4924 	 * registered set of known encodings
4925 	 */
4926 	if (enc != XML_CHAR_ENCODING_ERROR) {
4927 	    xmlSwitchEncoding(ctxt, enc);
4928 	    if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4929 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4930 		             "Unsupported encoding %s\n",
4931 			     (const xmlChar *) encoding, NULL);
4932 	    }
4933 	} else {
4934 	    /*
4935 	     * fallback for unknown encodings
4936 	     */
4937 	    handler = xmlFindCharEncodingHandler((const char *) encoding);
4938 	    if (handler != NULL) {
4939 		xmlSwitchToEncoding(ctxt, handler);
4940 	    } else {
4941 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4942 		             "Unsupported encoding %s\n",
4943 			     (const xmlChar *) encoding, NULL);
4944 	    }
4945 	}
4946     }
4947     return(ctxt);
4948 }
4949 
4950 #ifdef LIBXML_PUSH_ENABLED
4951 /************************************************************************
4952  *									*
4953  *	Progressive parsing interfaces				*
4954  *									*
4955  ************************************************************************/
4956 
4957 /**
4958  * htmlParseLookupSequence:
4959  * @ctxt:  an HTML parser context
4960  * @first:  the first char to lookup
4961  * @next:  the next char to lookup or zero
4962  * @third:  the next char to lookup or zero
4963  * @comment: flag to force checking inside comments
4964  *
4965  * Try to find if a sequence (first, next, third) or  just (first next) or
4966  * (first) is available in the input stream.
4967  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4968  * to avoid rescanning sequences of bytes, it DOES change the state of the
4969  * parser, do not use liberally.
4970  * This is basically similar to xmlParseLookupSequence()
4971  *
4972  * Returns the index to the current parsing point if the full sequence
4973  *      is available, -1 otherwise.
4974  */
4975 static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt,xmlChar first,xmlChar next,xmlChar third,int iscomment,int ignoreattrval)4976 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
4977                         xmlChar next, xmlChar third, int iscomment,
4978                         int ignoreattrval)
4979 {
4980     int base, len;
4981     htmlParserInputPtr in;
4982     const xmlChar *buf;
4983     int incomment = 0;
4984     int invalue = 0;
4985     char valdellim = 0x0;
4986 
4987     in = ctxt->input;
4988     if (in == NULL)
4989         return (-1);
4990 
4991     base = in->cur - in->base;
4992     if (base < 0)
4993         return (-1);
4994 
4995     if (ctxt->checkIndex > base)
4996         base = ctxt->checkIndex;
4997 
4998     if (in->buf == NULL) {
4999         buf = in->base;
5000         len = in->length;
5001     } else {
5002         buf = in->buf->buffer->content;
5003         len = in->buf->buffer->use;
5004     }
5005 
5006     /* take into account the sequence length */
5007     if (third)
5008         len -= 2;
5009     else if (next)
5010         len--;
5011     for (; base < len; base++) {
5012         if ((!incomment) && (base + 4 < len) && (!iscomment)) {
5013             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5014                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5015                 incomment = 1;
5016                 /* do not increment past <! - some people use <!--> */
5017                 base += 2;
5018             }
5019         }
5020         if (ignoreattrval) {
5021             if (buf[base] == '"' || buf[base] == '\'') {
5022                 if (invalue) {
5023                     if (buf[base] == valdellim) {
5024                         invalue = 0;
5025                         continue;
5026                     }
5027                 } else {
5028                     valdellim = buf[base];
5029                     invalue = 1;
5030                     continue;
5031                 }
5032             } else if (invalue) {
5033                 continue;
5034             }
5035         }
5036         if (incomment) {
5037             if (base + 3 > len)
5038                 return (-1);
5039             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5040                 (buf[base + 2] == '>')) {
5041                 incomment = 0;
5042                 base += 2;
5043             }
5044             continue;
5045         }
5046         if (buf[base] == first) {
5047             if (third != 0) {
5048                 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5049                     continue;
5050             } else if (next != 0) {
5051                 if (buf[base + 1] != next)
5052                     continue;
5053             }
5054             ctxt->checkIndex = 0;
5055 #ifdef DEBUG_PUSH
5056             if (next == 0)
5057                 xmlGenericError(xmlGenericErrorContext,
5058                                 "HPP: lookup '%c' found at %d\n",
5059                                 first, base);
5060             else if (third == 0)
5061                 xmlGenericError(xmlGenericErrorContext,
5062                                 "HPP: lookup '%c%c' found at %d\n",
5063                                 first, next, base);
5064             else
5065                 xmlGenericError(xmlGenericErrorContext,
5066                                 "HPP: lookup '%c%c%c' found at %d\n",
5067                                 first, next, third, base);
5068 #endif
5069             return (base - (in->cur - in->base));
5070         }
5071     }
5072     if ((!incomment) && (!invalue))
5073         ctxt->checkIndex = base;
5074 #ifdef DEBUG_PUSH
5075     if (next == 0)
5076         xmlGenericError(xmlGenericErrorContext,
5077                         "HPP: lookup '%c' failed\n", first);
5078     else if (third == 0)
5079         xmlGenericError(xmlGenericErrorContext,
5080                         "HPP: lookup '%c%c' failed\n", first, next);
5081     else
5082         xmlGenericError(xmlGenericErrorContext,
5083                         "HPP: lookup '%c%c%c' failed\n", first, next,
5084                         third);
5085 #endif
5086     return (-1);
5087 }
5088 
5089 /**
5090  * htmlParseLookupChars:
5091  * @ctxt: an HTML parser context
5092  * @stop: Array of chars, which stop the lookup.
5093  * @stopLen: Length of stop-Array
5094  *
5095  * Try to find if any char of the stop-Array is available in the input
5096  * stream.
5097  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5098  * to avoid rescanning sequences of bytes, it DOES change the state of the
5099  * parser, do not use liberally.
5100  *
5101  * Returns the index to the current parsing point if a stopChar
5102  *      is available, -1 otherwise.
5103  */
5104 static int
htmlParseLookupChars(htmlParserCtxtPtr ctxt,const xmlChar * stop,int stopLen)5105 htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
5106                      int stopLen)
5107 {
5108     int base, len;
5109     htmlParserInputPtr in;
5110     const xmlChar *buf;
5111     int incomment = 0;
5112     int i;
5113 
5114     in = ctxt->input;
5115     if (in == NULL)
5116         return (-1);
5117 
5118     base = in->cur - in->base;
5119     if (base < 0)
5120         return (-1);
5121 
5122     if (ctxt->checkIndex > base)
5123         base = ctxt->checkIndex;
5124 
5125     if (in->buf == NULL) {
5126         buf = in->base;
5127         len = in->length;
5128     } else {
5129         buf = in->buf->buffer->content;
5130         len = in->buf->buffer->use;
5131     }
5132 
5133     for (; base < len; base++) {
5134         if (!incomment && (base + 4 < len)) {
5135             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5136                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5137                 incomment = 1;
5138                 /* do not increment past <! - some people use <!--> */
5139                 base += 2;
5140             }
5141         }
5142         if (incomment) {
5143             if (base + 3 > len)
5144                 return (-1);
5145             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5146                 (buf[base + 2] == '>')) {
5147                 incomment = 0;
5148                 base += 2;
5149             }
5150             continue;
5151         }
5152         for (i = 0; i < stopLen; ++i) {
5153             if (buf[base] == stop[i]) {
5154                 ctxt->checkIndex = 0;
5155                 return (base - (in->cur - in->base));
5156             }
5157         }
5158     }
5159     ctxt->checkIndex = base;
5160     return (-1);
5161 }
5162 
5163 /**
5164  * htmlParseTryOrFinish:
5165  * @ctxt:  an HTML parser context
5166  * @terminate:  last chunk indicator
5167  *
5168  * Try to progress on parsing
5169  *
5170  * Returns zero if no parsing was possible
5171  */
5172 static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt,int terminate)5173 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5174     int ret = 0;
5175     htmlParserInputPtr in;
5176     int avail = 0;
5177     xmlChar cur, next;
5178 
5179 #ifdef DEBUG_PUSH
5180     switch (ctxt->instate) {
5181 	case XML_PARSER_EOF:
5182 	    xmlGenericError(xmlGenericErrorContext,
5183 		    "HPP: try EOF\n"); break;
5184 	case XML_PARSER_START:
5185 	    xmlGenericError(xmlGenericErrorContext,
5186 		    "HPP: try START\n"); break;
5187 	case XML_PARSER_MISC:
5188 	    xmlGenericError(xmlGenericErrorContext,
5189 		    "HPP: try MISC\n");break;
5190 	case XML_PARSER_COMMENT:
5191 	    xmlGenericError(xmlGenericErrorContext,
5192 		    "HPP: try COMMENT\n");break;
5193 	case XML_PARSER_PROLOG:
5194 	    xmlGenericError(xmlGenericErrorContext,
5195 		    "HPP: try PROLOG\n");break;
5196 	case XML_PARSER_START_TAG:
5197 	    xmlGenericError(xmlGenericErrorContext,
5198 		    "HPP: try START_TAG\n");break;
5199 	case XML_PARSER_CONTENT:
5200 	    xmlGenericError(xmlGenericErrorContext,
5201 		    "HPP: try CONTENT\n");break;
5202 	case XML_PARSER_CDATA_SECTION:
5203 	    xmlGenericError(xmlGenericErrorContext,
5204 		    "HPP: try CDATA_SECTION\n");break;
5205 	case XML_PARSER_END_TAG:
5206 	    xmlGenericError(xmlGenericErrorContext,
5207 		    "HPP: try END_TAG\n");break;
5208 	case XML_PARSER_ENTITY_DECL:
5209 	    xmlGenericError(xmlGenericErrorContext,
5210 		    "HPP: try ENTITY_DECL\n");break;
5211 	case XML_PARSER_ENTITY_VALUE:
5212 	    xmlGenericError(xmlGenericErrorContext,
5213 		    "HPP: try ENTITY_VALUE\n");break;
5214 	case XML_PARSER_ATTRIBUTE_VALUE:
5215 	    xmlGenericError(xmlGenericErrorContext,
5216 		    "HPP: try ATTRIBUTE_VALUE\n");break;
5217 	case XML_PARSER_DTD:
5218 	    xmlGenericError(xmlGenericErrorContext,
5219 		    "HPP: try DTD\n");break;
5220 	case XML_PARSER_EPILOG:
5221 	    xmlGenericError(xmlGenericErrorContext,
5222 		    "HPP: try EPILOG\n");break;
5223 	case XML_PARSER_PI:
5224 	    xmlGenericError(xmlGenericErrorContext,
5225 		    "HPP: try PI\n");break;
5226 	case XML_PARSER_SYSTEM_LITERAL:
5227 	    xmlGenericError(xmlGenericErrorContext,
5228 		    "HPP: try SYSTEM_LITERAL\n");break;
5229     }
5230 #endif
5231 
5232     while (1) {
5233 
5234 	in = ctxt->input;
5235 	if (in == NULL) break;
5236 	if (in->buf == NULL)
5237 	    avail = in->length - (in->cur - in->base);
5238 	else
5239 	    avail = in->buf->buffer->use - (in->cur - in->base);
5240 	if ((avail == 0) && (terminate)) {
5241 	    htmlAutoCloseOnEnd(ctxt);
5242 	    if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5243 		/*
5244 		 * SAX: end of the document processing.
5245 		 */
5246 		ctxt->instate = XML_PARSER_EOF;
5247 		if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5248 		    ctxt->sax->endDocument(ctxt->userData);
5249 	    }
5250 	}
5251         if (avail < 1)
5252 	    goto done;
5253 	cur = in->cur[0];
5254 	if (cur == 0) {
5255 	    SKIP(1);
5256 	    continue;
5257 	}
5258 
5259         switch (ctxt->instate) {
5260             case XML_PARSER_EOF:
5261 	        /*
5262 		 * Document parsing is done !
5263 		 */
5264 	        goto done;
5265             case XML_PARSER_START:
5266 	        /*
5267 		 * Very first chars read from the document flow.
5268 		 */
5269 		cur = in->cur[0];
5270 		if (IS_BLANK_CH(cur)) {
5271 		    SKIP_BLANKS;
5272 		    if (in->buf == NULL)
5273 			avail = in->length - (in->cur - in->base);
5274 		    else
5275 			avail = in->buf->buffer->use - (in->cur - in->base);
5276 		}
5277 		if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5278 		    ctxt->sax->setDocumentLocator(ctxt->userData,
5279 						  &xmlDefaultSAXLocator);
5280 		if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5281 	            (!ctxt->disableSAX))
5282 		    ctxt->sax->startDocument(ctxt->userData);
5283 
5284 		cur = in->cur[0];
5285 		next = in->cur[1];
5286 		if ((cur == '<') && (next == '!') &&
5287 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
5288 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
5289 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5290 		    (UPP(8) == 'E')) {
5291 		    if ((!terminate) &&
5292 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5293 			goto done;
5294 #ifdef DEBUG_PUSH
5295 		    xmlGenericError(xmlGenericErrorContext,
5296 			    "HPP: Parsing internal subset\n");
5297 #endif
5298 		    htmlParseDocTypeDecl(ctxt);
5299 		    ctxt->instate = XML_PARSER_PROLOG;
5300 #ifdef DEBUG_PUSH
5301 		    xmlGenericError(xmlGenericErrorContext,
5302 			    "HPP: entering PROLOG\n");
5303 #endif
5304                 } else {
5305 		    ctxt->instate = XML_PARSER_MISC;
5306 #ifdef DEBUG_PUSH
5307 		    xmlGenericError(xmlGenericErrorContext,
5308 			    "HPP: entering MISC\n");
5309 #endif
5310 		}
5311 		break;
5312             case XML_PARSER_MISC:
5313 		SKIP_BLANKS;
5314 		if (in->buf == NULL)
5315 		    avail = in->length - (in->cur - in->base);
5316 		else
5317 		    avail = in->buf->buffer->use - (in->cur - in->base);
5318 		if (avail < 2)
5319 		    goto done;
5320 		cur = in->cur[0];
5321 		next = in->cur[1];
5322 	        if ((cur == '<') && (next == '!') &&
5323 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5324 		    if ((!terminate) &&
5325 		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5326 			goto done;
5327 #ifdef DEBUG_PUSH
5328 		    xmlGenericError(xmlGenericErrorContext,
5329 			    "HPP: Parsing Comment\n");
5330 #endif
5331 		    htmlParseComment(ctxt);
5332 		    ctxt->instate = XML_PARSER_MISC;
5333 	        } else if ((cur == '<') && (next == '?')) {
5334 		    if ((!terminate) &&
5335 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5336 			goto done;
5337 #ifdef DEBUG_PUSH
5338 		    xmlGenericError(xmlGenericErrorContext,
5339 			    "HPP: Parsing PI\n");
5340 #endif
5341 		    htmlParsePI(ctxt);
5342 		    ctxt->instate = XML_PARSER_MISC;
5343 		} else if ((cur == '<') && (next == '!') &&
5344 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
5345 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
5346 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5347 		    (UPP(8) == 'E')) {
5348 		    if ((!terminate) &&
5349 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5350 			goto done;
5351 #ifdef DEBUG_PUSH
5352 		    xmlGenericError(xmlGenericErrorContext,
5353 			    "HPP: Parsing internal subset\n");
5354 #endif
5355 		    htmlParseDocTypeDecl(ctxt);
5356 		    ctxt->instate = XML_PARSER_PROLOG;
5357 #ifdef DEBUG_PUSH
5358 		    xmlGenericError(xmlGenericErrorContext,
5359 			    "HPP: entering PROLOG\n");
5360 #endif
5361 		} else if ((cur == '<') && (next == '!') &&
5362 		           (avail < 9)) {
5363 		    goto done;
5364 		} else {
5365 		    ctxt->instate = XML_PARSER_START_TAG;
5366 #ifdef DEBUG_PUSH
5367 		    xmlGenericError(xmlGenericErrorContext,
5368 			    "HPP: entering START_TAG\n");
5369 #endif
5370 		}
5371 		break;
5372             case XML_PARSER_PROLOG:
5373 		SKIP_BLANKS;
5374 		if (in->buf == NULL)
5375 		    avail = in->length - (in->cur - in->base);
5376 		else
5377 		    avail = in->buf->buffer->use - (in->cur - in->base);
5378 		if (avail < 2)
5379 		    goto done;
5380 		cur = in->cur[0];
5381 		next = in->cur[1];
5382 		if ((cur == '<') && (next == '!') &&
5383 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5384 		    if ((!terminate) &&
5385 		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5386 			goto done;
5387 #ifdef DEBUG_PUSH
5388 		    xmlGenericError(xmlGenericErrorContext,
5389 			    "HPP: Parsing Comment\n");
5390 #endif
5391 		    htmlParseComment(ctxt);
5392 		    ctxt->instate = XML_PARSER_PROLOG;
5393 	        } else if ((cur == '<') && (next == '?')) {
5394 		    if ((!terminate) &&
5395 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5396 			goto done;
5397 #ifdef DEBUG_PUSH
5398 		    xmlGenericError(xmlGenericErrorContext,
5399 			    "HPP: Parsing PI\n");
5400 #endif
5401 		    htmlParsePI(ctxt);
5402 		    ctxt->instate = XML_PARSER_PROLOG;
5403 		} else if ((cur == '<') && (next == '!') &&
5404 		           (avail < 4)) {
5405 		    goto done;
5406 		} else {
5407 		    ctxt->instate = XML_PARSER_START_TAG;
5408 #ifdef DEBUG_PUSH
5409 		    xmlGenericError(xmlGenericErrorContext,
5410 			    "HPP: entering START_TAG\n");
5411 #endif
5412 		}
5413 		break;
5414             case XML_PARSER_EPILOG:
5415 		if (in->buf == NULL)
5416 		    avail = in->length - (in->cur - in->base);
5417 		else
5418 		    avail = in->buf->buffer->use - (in->cur - in->base);
5419 		if (avail < 1)
5420 		    goto done;
5421 		cur = in->cur[0];
5422 		if (IS_BLANK_CH(cur)) {
5423 		    htmlParseCharData(ctxt);
5424 		    goto done;
5425 		}
5426 		if (avail < 2)
5427 		    goto done;
5428 		next = in->cur[1];
5429 	        if ((cur == '<') && (next == '!') &&
5430 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5431 		    if ((!terminate) &&
5432 		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5433 			goto done;
5434 #ifdef DEBUG_PUSH
5435 		    xmlGenericError(xmlGenericErrorContext,
5436 			    "HPP: Parsing Comment\n");
5437 #endif
5438 		    htmlParseComment(ctxt);
5439 		    ctxt->instate = XML_PARSER_EPILOG;
5440 	        } else if ((cur == '<') && (next == '?')) {
5441 		    if ((!terminate) &&
5442 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5443 			goto done;
5444 #ifdef DEBUG_PUSH
5445 		    xmlGenericError(xmlGenericErrorContext,
5446 			    "HPP: Parsing PI\n");
5447 #endif
5448 		    htmlParsePI(ctxt);
5449 		    ctxt->instate = XML_PARSER_EPILOG;
5450 		} else if ((cur == '<') && (next == '!') &&
5451 		           (avail < 4)) {
5452 		    goto done;
5453 		} else {
5454 		    ctxt->errNo = XML_ERR_DOCUMENT_END;
5455 		    ctxt->wellFormed = 0;
5456 		    ctxt->instate = XML_PARSER_EOF;
5457 #ifdef DEBUG_PUSH
5458 		    xmlGenericError(xmlGenericErrorContext,
5459 			    "HPP: entering EOF\n");
5460 #endif
5461 		    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5462 			ctxt->sax->endDocument(ctxt->userData);
5463 		    goto done;
5464 		}
5465 		break;
5466             case XML_PARSER_START_TAG: {
5467 	        const xmlChar *name;
5468 		int failed;
5469 		const htmlElemDesc * info;
5470 
5471 		if (avail < 2)
5472 		    goto done;
5473 		cur = in->cur[0];
5474 	        if (cur != '<') {
5475 		    ctxt->instate = XML_PARSER_CONTENT;
5476 #ifdef DEBUG_PUSH
5477 		    xmlGenericError(xmlGenericErrorContext,
5478 			    "HPP: entering CONTENT\n");
5479 #endif
5480 		    break;
5481 		}
5482 		if (in->cur[1] == '/') {
5483 		    ctxt->instate = XML_PARSER_END_TAG;
5484 		    ctxt->checkIndex = 0;
5485 #ifdef DEBUG_PUSH
5486 		    xmlGenericError(xmlGenericErrorContext,
5487 			    "HPP: entering END_TAG\n");
5488 #endif
5489 		    break;
5490 		}
5491 		if ((!terminate) &&
5492 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5493 		    goto done;
5494 
5495 		failed = htmlParseStartTag(ctxt);
5496 		name = ctxt->name;
5497 		if ((failed == -1) ||
5498 		    (name == NULL)) {
5499 		    if (CUR == '>')
5500 			NEXT;
5501 		    break;
5502 		}
5503 
5504 		/*
5505 		 * Lookup the info for that element.
5506 		 */
5507 		info = htmlTagLookup(name);
5508 		if (info == NULL) {
5509 		    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5510 		                 "Tag %s invalid\n", name, NULL);
5511 		}
5512 
5513 		/*
5514 		 * Check for an Empty Element labeled the XML/SGML way
5515 		 */
5516 		if ((CUR == '/') && (NXT(1) == '>')) {
5517 		    SKIP(2);
5518 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5519 			ctxt->sax->endElement(ctxt->userData, name);
5520 		    htmlnamePop(ctxt);
5521 		    ctxt->instate = XML_PARSER_CONTENT;
5522 #ifdef DEBUG_PUSH
5523 		    xmlGenericError(xmlGenericErrorContext,
5524 			    "HPP: entering CONTENT\n");
5525 #endif
5526 		    break;
5527 		}
5528 
5529 		if (CUR == '>') {
5530 		    NEXT;
5531 		} else {
5532 		    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5533 		                 "Couldn't find end of Start Tag %s\n",
5534 				 name, NULL);
5535 
5536 		    /*
5537 		     * end of parsing of this node.
5538 		     */
5539 		    if (xmlStrEqual(name, ctxt->name)) {
5540 			nodePop(ctxt);
5541 			htmlnamePop(ctxt);
5542 		    }
5543 
5544 		    ctxt->instate = XML_PARSER_CONTENT;
5545 #ifdef DEBUG_PUSH
5546 		    xmlGenericError(xmlGenericErrorContext,
5547 			    "HPP: entering CONTENT\n");
5548 #endif
5549 		    break;
5550 		}
5551 
5552 		/*
5553 		 * Check for an Empty Element from DTD definition
5554 		 */
5555 		if ((info != NULL) && (info->empty)) {
5556 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5557 			ctxt->sax->endElement(ctxt->userData, name);
5558 		    htmlnamePop(ctxt);
5559 		}
5560 		ctxt->instate = XML_PARSER_CONTENT;
5561 #ifdef DEBUG_PUSH
5562 		xmlGenericError(xmlGenericErrorContext,
5563 			"HPP: entering CONTENT\n");
5564 #endif
5565                 break;
5566 	    }
5567             case XML_PARSER_CONTENT: {
5568 		long cons;
5569                 /*
5570 		 * Handle preparsed entities and charRef
5571 		 */
5572 		if (ctxt->token != 0) {
5573 		    xmlChar chr[2] = { 0 , 0 } ;
5574 
5575 		    chr[0] = (xmlChar) ctxt->token;
5576 		    htmlCheckParagraph(ctxt);
5577 		    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5578 			ctxt->sax->characters(ctxt->userData, chr, 1);
5579 		    ctxt->token = 0;
5580 		    ctxt->checkIndex = 0;
5581 		}
5582 		if ((avail == 1) && (terminate)) {
5583 		    cur = in->cur[0];
5584 		    if ((cur != '<') && (cur != '&')) {
5585 			if (ctxt->sax != NULL) {
5586 			    if (IS_BLANK_CH(cur)) {
5587 				if (ctxt->sax->ignorableWhitespace != NULL)
5588 				    ctxt->sax->ignorableWhitespace(
5589 					    ctxt->userData, &cur, 1);
5590 			    } else {
5591 				htmlCheckParagraph(ctxt);
5592 				if (ctxt->sax->characters != NULL)
5593 				    ctxt->sax->characters(
5594 					    ctxt->userData, &cur, 1);
5595 			    }
5596 			}
5597 			ctxt->token = 0;
5598 			ctxt->checkIndex = 0;
5599 			in->cur++;
5600 			break;
5601 		    }
5602 		}
5603 		if (avail < 2)
5604 		    goto done;
5605 		cur = in->cur[0];
5606 		next = in->cur[1];
5607 		cons = ctxt->nbChars;
5608 		if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5609 		    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5610 		    /*
5611 		     * Handle SCRIPT/STYLE separately
5612 		     */
5613 		    if (!terminate) {
5614 		        int idx;
5615 			xmlChar val;
5616 
5617 			idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
5618 			if (idx < 0)
5619 			    goto done;
5620 		        val = in->cur[idx + 2];
5621 			if (val == 0) /* bad cut of input */
5622 			    goto done;
5623 		    }
5624 		    htmlParseScript(ctxt);
5625 		    if ((cur == '<') && (next == '/')) {
5626 			ctxt->instate = XML_PARSER_END_TAG;
5627 			ctxt->checkIndex = 0;
5628 #ifdef DEBUG_PUSH
5629 			xmlGenericError(xmlGenericErrorContext,
5630 				"HPP: entering END_TAG\n");
5631 #endif
5632 			break;
5633 		    }
5634 		} else {
5635 		    /*
5636 		     * Sometimes DOCTYPE arrives in the middle of the document
5637 		     */
5638 		    if ((cur == '<') && (next == '!') &&
5639 			(UPP(2) == 'D') && (UPP(3) == 'O') &&
5640 			(UPP(4) == 'C') && (UPP(5) == 'T') &&
5641 			(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5642 			(UPP(8) == 'E')) {
5643 			if ((!terminate) &&
5644 			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5645 			    goto done;
5646 			htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5647 			             "Misplaced DOCTYPE declaration\n",
5648 				     BAD_CAST "DOCTYPE" , NULL);
5649 			htmlParseDocTypeDecl(ctxt);
5650 		    } else if ((cur == '<') && (next == '!') &&
5651 			(in->cur[2] == '-') && (in->cur[3] == '-')) {
5652 			if ((!terminate) &&
5653 			    (htmlParseLookupSequence(
5654 				ctxt, '-', '-', '>', 1, 1) < 0))
5655 			    goto done;
5656 #ifdef DEBUG_PUSH
5657 			xmlGenericError(xmlGenericErrorContext,
5658 				"HPP: Parsing Comment\n");
5659 #endif
5660 			htmlParseComment(ctxt);
5661 			ctxt->instate = XML_PARSER_CONTENT;
5662 		    } else if ((cur == '<') && (next == '?')) {
5663 			if ((!terminate) &&
5664 			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5665 			    goto done;
5666 #ifdef DEBUG_PUSH
5667 			xmlGenericError(xmlGenericErrorContext,
5668 				"HPP: Parsing PI\n");
5669 #endif
5670 			htmlParsePI(ctxt);
5671 			ctxt->instate = XML_PARSER_CONTENT;
5672 		    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5673 			goto done;
5674 		    } else if ((cur == '<') && (next == '/')) {
5675 			ctxt->instate = XML_PARSER_END_TAG;
5676 			ctxt->checkIndex = 0;
5677 #ifdef DEBUG_PUSH
5678 			xmlGenericError(xmlGenericErrorContext,
5679 				"HPP: entering END_TAG\n");
5680 #endif
5681 			break;
5682 		    } else if (cur == '<') {
5683 			ctxt->instate = XML_PARSER_START_TAG;
5684 			ctxt->checkIndex = 0;
5685 #ifdef DEBUG_PUSH
5686 			xmlGenericError(xmlGenericErrorContext,
5687 				"HPP: entering START_TAG\n");
5688 #endif
5689 			break;
5690 		    } else if (cur == '&') {
5691 			if ((!terminate) &&
5692 			    (htmlParseLookupChars(ctxt,
5693                                                   BAD_CAST "; >/", 4) < 0))
5694 			    goto done;
5695 #ifdef DEBUG_PUSH
5696 			xmlGenericError(xmlGenericErrorContext,
5697 				"HPP: Parsing Reference\n");
5698 #endif
5699 			/* TODO: check generation of subtrees if noent !!! */
5700 			htmlParseReference(ctxt);
5701 		    } else {
5702 		        /*
5703 			 * check that the text sequence is complete
5704 			 * before handing out the data to the parser
5705 			 * to avoid problems with erroneous end of
5706 			 * data detection.
5707 			 */
5708 			if ((!terminate) &&
5709                             (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
5710 			    goto done;
5711 			ctxt->checkIndex = 0;
5712 #ifdef DEBUG_PUSH
5713 			xmlGenericError(xmlGenericErrorContext,
5714 				"HPP: Parsing char data\n");
5715 #endif
5716 			htmlParseCharData(ctxt);
5717 		    }
5718 		}
5719 		if (cons == ctxt->nbChars) {
5720 		    if (ctxt->node != NULL) {
5721 			htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5722 			             "detected an error in element content\n",
5723 				     NULL, NULL);
5724 		    }
5725 		    NEXT;
5726 		    break;
5727 		}
5728 
5729 		break;
5730 	    }
5731             case XML_PARSER_END_TAG:
5732 		if (avail < 2)
5733 		    goto done;
5734 		if ((!terminate) &&
5735 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5736 		    goto done;
5737 		htmlParseEndTag(ctxt);
5738 		if (ctxt->nameNr == 0) {
5739 		    ctxt->instate = XML_PARSER_EPILOG;
5740 		} else {
5741 		    ctxt->instate = XML_PARSER_CONTENT;
5742 		}
5743 		ctxt->checkIndex = 0;
5744 #ifdef DEBUG_PUSH
5745 		xmlGenericError(xmlGenericErrorContext,
5746 			"HPP: entering CONTENT\n");
5747 #endif
5748 	        break;
5749             case XML_PARSER_CDATA_SECTION:
5750 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5751 			"HPP: internal error, state == CDATA\n",
5752 			     NULL, NULL);
5753 		ctxt->instate = XML_PARSER_CONTENT;
5754 		ctxt->checkIndex = 0;
5755 #ifdef DEBUG_PUSH
5756 		xmlGenericError(xmlGenericErrorContext,
5757 			"HPP: entering CONTENT\n");
5758 #endif
5759 		break;
5760             case XML_PARSER_DTD:
5761 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5762 			"HPP: internal error, state == DTD\n",
5763 			     NULL, NULL);
5764 		ctxt->instate = XML_PARSER_CONTENT;
5765 		ctxt->checkIndex = 0;
5766 #ifdef DEBUG_PUSH
5767 		xmlGenericError(xmlGenericErrorContext,
5768 			"HPP: entering CONTENT\n");
5769 #endif
5770 		break;
5771             case XML_PARSER_COMMENT:
5772 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5773 			"HPP: internal error, state == COMMENT\n",
5774 			     NULL, NULL);
5775 		ctxt->instate = XML_PARSER_CONTENT;
5776 		ctxt->checkIndex = 0;
5777 #ifdef DEBUG_PUSH
5778 		xmlGenericError(xmlGenericErrorContext,
5779 			"HPP: entering CONTENT\n");
5780 #endif
5781 		break;
5782             case XML_PARSER_PI:
5783 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5784 			"HPP: internal error, state == PI\n",
5785 			     NULL, NULL);
5786 		ctxt->instate = XML_PARSER_CONTENT;
5787 		ctxt->checkIndex = 0;
5788 #ifdef DEBUG_PUSH
5789 		xmlGenericError(xmlGenericErrorContext,
5790 			"HPP: entering CONTENT\n");
5791 #endif
5792 		break;
5793             case XML_PARSER_ENTITY_DECL:
5794 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5795 			"HPP: internal error, state == ENTITY_DECL\n",
5796 			     NULL, NULL);
5797 		ctxt->instate = XML_PARSER_CONTENT;
5798 		ctxt->checkIndex = 0;
5799 #ifdef DEBUG_PUSH
5800 		xmlGenericError(xmlGenericErrorContext,
5801 			"HPP: entering CONTENT\n");
5802 #endif
5803 		break;
5804             case XML_PARSER_ENTITY_VALUE:
5805 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5806 			"HPP: internal error, state == ENTITY_VALUE\n",
5807 			     NULL, NULL);
5808 		ctxt->instate = XML_PARSER_CONTENT;
5809 		ctxt->checkIndex = 0;
5810 #ifdef DEBUG_PUSH
5811 		xmlGenericError(xmlGenericErrorContext,
5812 			"HPP: entering DTD\n");
5813 #endif
5814 		break;
5815             case XML_PARSER_ATTRIBUTE_VALUE:
5816 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5817 			"HPP: internal error, state == ATTRIBUTE_VALUE\n",
5818 			     NULL, NULL);
5819 		ctxt->instate = XML_PARSER_START_TAG;
5820 		ctxt->checkIndex = 0;
5821 #ifdef DEBUG_PUSH
5822 		xmlGenericError(xmlGenericErrorContext,
5823 			"HPP: entering START_TAG\n");
5824 #endif
5825 		break;
5826 	    case XML_PARSER_SYSTEM_LITERAL:
5827 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5828 		    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5829 			     NULL, NULL);
5830 		ctxt->instate = XML_PARSER_CONTENT;
5831 		ctxt->checkIndex = 0;
5832 #ifdef DEBUG_PUSH
5833 		xmlGenericError(xmlGenericErrorContext,
5834 			"HPP: entering CONTENT\n");
5835 #endif
5836 		break;
5837 	    case XML_PARSER_IGNORE:
5838 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5839 			"HPP: internal error, state == XML_PARSER_IGNORE\n",
5840 			     NULL, NULL);
5841 		ctxt->instate = XML_PARSER_CONTENT;
5842 		ctxt->checkIndex = 0;
5843 #ifdef DEBUG_PUSH
5844 		xmlGenericError(xmlGenericErrorContext,
5845 			"HPP: entering CONTENT\n");
5846 #endif
5847 		break;
5848 	    case XML_PARSER_PUBLIC_LITERAL:
5849 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5850 			"HPP: internal error, state == XML_PARSER_LITERAL\n",
5851 			     NULL, NULL);
5852 		ctxt->instate = XML_PARSER_CONTENT;
5853 		ctxt->checkIndex = 0;
5854 #ifdef DEBUG_PUSH
5855 		xmlGenericError(xmlGenericErrorContext,
5856 			"HPP: entering CONTENT\n");
5857 #endif
5858 		break;
5859 
5860 	}
5861     }
5862 done:
5863     if ((avail == 0) && (terminate)) {
5864 	htmlAutoCloseOnEnd(ctxt);
5865 	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5866 	    /*
5867 	     * SAX: end of the document processing.
5868 	     */
5869 	    ctxt->instate = XML_PARSER_EOF;
5870 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5871 		ctxt->sax->endDocument(ctxt->userData);
5872 	}
5873     }
5874     if ((ctxt->myDoc != NULL) &&
5875 	((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5876 	 (ctxt->instate == XML_PARSER_EPILOG))) {
5877 	xmlDtdPtr dtd;
5878 	dtd = xmlGetIntSubset(ctxt->myDoc);
5879 	if (dtd == NULL)
5880 	    ctxt->myDoc->intSubset =
5881 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5882 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5883 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5884     }
5885 #ifdef DEBUG_PUSH
5886     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5887 #endif
5888     return(ret);
5889 }
5890 
5891 /**
5892  * htmlParseChunk:
5893  * @ctxt:  an HTML parser context
5894  * @chunk:  an char array
5895  * @size:  the size in byte of the chunk
5896  * @terminate:  last chunk indicator
5897  *
5898  * Parse a Chunk of memory
5899  *
5900  * Returns zero if no error, the xmlParserErrors otherwise.
5901  */
5902 int
htmlParseChunk(htmlParserCtxtPtr ctxt,const char * chunk,int size,int terminate)5903 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5904               int terminate) {
5905     if ((ctxt == NULL) || (ctxt->input == NULL)) {
5906 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5907 		     "htmlParseChunk: context error\n", NULL, NULL);
5908 	return(XML_ERR_INTERNAL_ERROR);
5909     }
5910     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5911         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
5912 	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5913 	int cur = ctxt->input->cur - ctxt->input->base;
5914 	int res;
5915 
5916 	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5917 	if (res < 0) {
5918 	    ctxt->errNo = XML_PARSER_EOF;
5919 	    ctxt->disableSAX = 1;
5920 	    return (XML_PARSER_EOF);
5921 	}
5922 	ctxt->input->base = ctxt->input->buf->buffer->content + base;
5923 	ctxt->input->cur = ctxt->input->base + cur;
5924 	ctxt->input->end =
5925 	  &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5926 #ifdef DEBUG_PUSH
5927 	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5928 #endif
5929 
5930 #if 0
5931 	if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5932 	    htmlParseTryOrFinish(ctxt, terminate);
5933 #endif
5934     } else if (ctxt->instate != XML_PARSER_EOF) {
5935 	if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5936 	    xmlParserInputBufferPtr in = ctxt->input->buf;
5937 	    if ((in->encoder != NULL) && (in->buffer != NULL) &&
5938 		    (in->raw != NULL)) {
5939 		int nbchars;
5940 
5941 		nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5942 		if (nbchars < 0) {
5943 		    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5944 			         "encoder error\n", NULL, NULL);
5945 		    return(XML_ERR_INVALID_ENCODING);
5946 		}
5947 	    }
5948 	}
5949     }
5950     htmlParseTryOrFinish(ctxt, terminate);
5951     if (terminate) {
5952 	if ((ctxt->instate != XML_PARSER_EOF) &&
5953 	    (ctxt->instate != XML_PARSER_EPILOG) &&
5954 	    (ctxt->instate != XML_PARSER_MISC)) {
5955 	    ctxt->errNo = XML_ERR_DOCUMENT_END;
5956 	    ctxt->wellFormed = 0;
5957 	}
5958 	if (ctxt->instate != XML_PARSER_EOF) {
5959 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5960 		ctxt->sax->endDocument(ctxt->userData);
5961 	}
5962 	ctxt->instate = XML_PARSER_EOF;
5963     }
5964     return((xmlParserErrors) ctxt->errNo);
5965 }
5966 
5967 /************************************************************************
5968  *									*
5969  *			User entry points				*
5970  *									*
5971  ************************************************************************/
5972 
5973 /**
5974  * htmlCreatePushParserCtxt:
5975  * @sax:  a SAX handler
5976  * @user_data:  The user data returned on SAX callbacks
5977  * @chunk:  a pointer to an array of chars
5978  * @size:  number of chars in the array
5979  * @filename:  an optional file name or URI
5980  * @enc:  an optional encoding
5981  *
5982  * Create a parser context for using the HTML parser in push mode
5983  * The value of @filename is used for fetching external entities
5984  * and error/warning reports.
5985  *
5986  * Returns the new parser context or NULL
5987  */
5988 htmlParserCtxtPtr
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,void * user_data,const char * chunk,int size,const char * filename,xmlCharEncoding enc)5989 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5990                          const char *chunk, int size, const char *filename,
5991 			 xmlCharEncoding enc) {
5992     htmlParserCtxtPtr ctxt;
5993     htmlParserInputPtr inputStream;
5994     xmlParserInputBufferPtr buf;
5995 
5996     xmlInitParser();
5997 
5998     buf = xmlAllocParserInputBuffer(enc);
5999     if (buf == NULL) return(NULL);
6000 
6001     ctxt = htmlNewParserCtxt();
6002     if (ctxt == NULL) {
6003 	xmlFreeParserInputBuffer(buf);
6004 	return(NULL);
6005     }
6006     if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6007 	ctxt->charset=XML_CHAR_ENCODING_UTF8;
6008     if (sax != NULL) {
6009 	if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6010 	    xmlFree(ctxt->sax);
6011 	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6012 	if (ctxt->sax == NULL) {
6013 	    xmlFree(buf);
6014 	    xmlFree(ctxt);
6015 	    return(NULL);
6016 	}
6017 	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6018 	if (user_data != NULL)
6019 	    ctxt->userData = user_data;
6020     }
6021     if (filename == NULL) {
6022 	ctxt->directory = NULL;
6023     } else {
6024         ctxt->directory = xmlParserGetDirectory(filename);
6025     }
6026 
6027     inputStream = htmlNewInputStream(ctxt);
6028     if (inputStream == NULL) {
6029 	xmlFreeParserCtxt(ctxt);
6030 	xmlFree(buf);
6031 	return(NULL);
6032     }
6033 
6034     if (filename == NULL)
6035 	inputStream->filename = NULL;
6036     else
6037 	inputStream->filename = (char *)
6038 	    xmlCanonicPath((const xmlChar *) filename);
6039     inputStream->buf = buf;
6040     inputStream->base = inputStream->buf->buffer->content;
6041     inputStream->cur = inputStream->buf->buffer->content;
6042     inputStream->end =
6043 	&inputStream->buf->buffer->content[inputStream->buf->buffer->use];
6044 
6045     inputPush(ctxt, inputStream);
6046 
6047     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6048         (ctxt->input->buf != NULL))  {
6049 	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
6050 	int cur = ctxt->input->cur - ctxt->input->base;
6051 
6052 	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6053 
6054 	ctxt->input->base = ctxt->input->buf->buffer->content + base;
6055 	ctxt->input->cur = ctxt->input->base + cur;
6056 	ctxt->input->end =
6057 	    &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
6058 #ifdef DEBUG_PUSH
6059 	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6060 #endif
6061     }
6062     ctxt->progressive = 1;
6063 
6064     return(ctxt);
6065 }
6066 #endif /* LIBXML_PUSH_ENABLED */
6067 
6068 /**
6069  * htmlSAXParseDoc:
6070  * @cur:  a pointer to an array of xmlChar
6071  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6072  * @sax:  the SAX handler block
6073  * @userData: if using SAX, this pointer will be provided on callbacks.
6074  *
6075  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6076  * to handle parse events. If sax is NULL, fallback to the default DOM
6077  * behavior and return a tree.
6078  *
6079  * Returns the resulting document tree unless SAX is NULL or the document is
6080  *     not well formed.
6081  */
6082 
6083 htmlDocPtr
htmlSAXParseDoc(xmlChar * cur,const char * encoding,htmlSAXHandlerPtr sax,void * userData)6084 htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
6085     htmlDocPtr ret;
6086     htmlParserCtxtPtr ctxt;
6087 
6088     xmlInitParser();
6089 
6090     if (cur == NULL) return(NULL);
6091 
6092 
6093     ctxt = htmlCreateDocParserCtxt(cur, encoding);
6094     if (ctxt == NULL) return(NULL);
6095     if (sax != NULL) {
6096         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6097         ctxt->sax = sax;
6098         ctxt->userData = userData;
6099     }
6100 
6101     htmlParseDocument(ctxt);
6102     ret = ctxt->myDoc;
6103     if (sax != NULL) {
6104 	ctxt->sax = NULL;
6105 	ctxt->userData = NULL;
6106     }
6107     htmlFreeParserCtxt(ctxt);
6108 
6109     return(ret);
6110 }
6111 
6112 /**
6113  * htmlParseDoc:
6114  * @cur:  a pointer to an array of xmlChar
6115  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6116  *
6117  * parse an HTML in-memory document and build a tree.
6118  *
6119  * Returns the resulting document tree
6120  */
6121 
6122 htmlDocPtr
htmlParseDoc(xmlChar * cur,const char * encoding)6123 htmlParseDoc(xmlChar *cur, const char *encoding) {
6124     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6125 }
6126 
6127 
6128 /**
6129  * htmlCreateFileParserCtxt:
6130  * @filename:  the filename
6131  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6132  *
6133  * Create a parser context for a file content.
6134  * Automatic support for ZLIB/Compress compressed document is provided
6135  * by default if found at compile-time.
6136  *
6137  * Returns the new parser context or NULL
6138  */
6139 htmlParserCtxtPtr
htmlCreateFileParserCtxt(const char * filename,const char * encoding)6140 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6141 {
6142     htmlParserCtxtPtr ctxt;
6143     htmlParserInputPtr inputStream;
6144     char *canonicFilename;
6145     /* htmlCharEncoding enc; */
6146     xmlChar *content, *content_line = (xmlChar *) "charset=";
6147 
6148     if (filename == NULL)
6149         return(NULL);
6150 
6151     ctxt = htmlNewParserCtxt();
6152     if (ctxt == NULL) {
6153 	return(NULL);
6154     }
6155     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6156     if (canonicFilename == NULL) {
6157 #ifdef LIBXML_SAX1_ENABLED
6158 	if (xmlDefaultSAXHandler.error != NULL) {
6159 	    xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6160 	}
6161 #endif
6162 	xmlFreeParserCtxt(ctxt);
6163 	return(NULL);
6164     }
6165 
6166     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6167     xmlFree(canonicFilename);
6168     if (inputStream == NULL) {
6169 	xmlFreeParserCtxt(ctxt);
6170 	return(NULL);
6171     }
6172 
6173     inputPush(ctxt, inputStream);
6174 
6175     /* set encoding */
6176     if (encoding) {
6177         content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
6178 	if (content) {
6179 	    strcpy ((char *)content, (char *)content_line);
6180             strcat ((char *)content, (char *)encoding);
6181             htmlCheckEncoding (ctxt, content);
6182 	    xmlFree (content);
6183 	}
6184     }
6185 
6186     return(ctxt);
6187 }
6188 
6189 /**
6190  * htmlSAXParseFile:
6191  * @filename:  the filename
6192  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6193  * @sax:  the SAX handler block
6194  * @userData: if using SAX, this pointer will be provided on callbacks.
6195  *
6196  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6197  * compressed document is provided by default if found at compile-time.
6198  * It use the given SAX function block to handle the parsing callback.
6199  * If sax is NULL, fallback to the default DOM tree building routines.
6200  *
6201  * Returns the resulting document tree unless SAX is NULL or the document is
6202  *     not well formed.
6203  */
6204 
6205 htmlDocPtr
htmlSAXParseFile(const char * filename,const char * encoding,htmlSAXHandlerPtr sax,void * userData)6206 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6207                  void *userData) {
6208     htmlDocPtr ret;
6209     htmlParserCtxtPtr ctxt;
6210     htmlSAXHandlerPtr oldsax = NULL;
6211 
6212     xmlInitParser();
6213 
6214     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6215     if (ctxt == NULL) return(NULL);
6216     if (sax != NULL) {
6217 	oldsax = ctxt->sax;
6218         ctxt->sax = sax;
6219         ctxt->userData = userData;
6220     }
6221 
6222     htmlParseDocument(ctxt);
6223 
6224     ret = ctxt->myDoc;
6225     if (sax != NULL) {
6226         ctxt->sax = oldsax;
6227         ctxt->userData = NULL;
6228     }
6229     htmlFreeParserCtxt(ctxt);
6230 
6231     return(ret);
6232 }
6233 
6234 /**
6235  * htmlParseFile:
6236  * @filename:  the filename
6237  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6238  *
6239  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6240  * compressed document is provided by default if found at compile-time.
6241  *
6242  * Returns the resulting document tree
6243  */
6244 
6245 htmlDocPtr
htmlParseFile(const char * filename,const char * encoding)6246 htmlParseFile(const char *filename, const char *encoding) {
6247     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6248 }
6249 
6250 /**
6251  * htmlHandleOmittedElem:
6252  * @val:  int 0 or 1
6253  *
6254  * Set and return the previous value for handling HTML omitted tags.
6255  *
6256  * Returns the last value for 0 for no handling, 1 for auto insertion.
6257  */
6258 
6259 int
htmlHandleOmittedElem(int val)6260 htmlHandleOmittedElem(int val) {
6261     int old = htmlOmittedDefaultValue;
6262 
6263     htmlOmittedDefaultValue = val;
6264     return(old);
6265 }
6266 
6267 /**
6268  * htmlElementAllowedHere:
6269  * @parent: HTML parent element
6270  * @elt: HTML element
6271  *
6272  * Checks whether an HTML element may be a direct child of a parent element.
6273  * Note - doesn't check for deprecated elements
6274  *
6275  * Returns 1 if allowed; 0 otherwise.
6276  */
6277 int
htmlElementAllowedHere(const htmlElemDesc * parent,const xmlChar * elt)6278 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6279   const char** p ;
6280 
6281   if ( ! elt || ! parent || ! parent->subelts )
6282 	return 0 ;
6283 
6284   for ( p = parent->subelts; *p; ++p )
6285     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6286       return 1 ;
6287 
6288   return 0 ;
6289 }
6290 /**
6291  * htmlElementStatusHere:
6292  * @parent: HTML parent element
6293  * @elt: HTML element
6294  *
6295  * Checks whether an HTML element may be a direct child of a parent element.
6296  * and if so whether it is valid or deprecated.
6297  *
6298  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6299  */
6300 htmlStatus
htmlElementStatusHere(const htmlElemDesc * parent,const htmlElemDesc * elt)6301 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6302   if ( ! parent || ! elt )
6303     return HTML_INVALID ;
6304   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6305     return HTML_INVALID ;
6306 
6307   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6308 }
6309 /**
6310  * htmlAttrAllowed:
6311  * @elt: HTML element
6312  * @attr: HTML attribute
6313  * @legacy: whether to allow deprecated attributes
6314  *
6315  * Checks whether an attribute is valid for an element
6316  * Has full knowledge of Required and Deprecated attributes
6317  *
6318  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6319  */
6320 htmlStatus
htmlAttrAllowed(const htmlElemDesc * elt,const xmlChar * attr,int legacy)6321 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6322   const char** p ;
6323 
6324   if ( !elt || ! attr )
6325 	return HTML_INVALID ;
6326 
6327   if ( elt->attrs_req )
6328     for ( p = elt->attrs_req; *p; ++p)
6329       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6330         return HTML_REQUIRED ;
6331 
6332   if ( elt->attrs_opt )
6333     for ( p = elt->attrs_opt; *p; ++p)
6334       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6335         return HTML_VALID ;
6336 
6337   if ( legacy && elt->attrs_depr )
6338     for ( p = elt->attrs_depr; *p; ++p)
6339       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6340         return HTML_DEPRECATED ;
6341 
6342   return HTML_INVALID ;
6343 }
6344 /**
6345  * htmlNodeStatus:
6346  * @node: an htmlNodePtr in a tree
6347  * @legacy: whether to allow deprecated elements (YES is faster here
6348  *	for Element nodes)
6349  *
6350  * Checks whether the tree node is valid.  Experimental (the author
6351  *     only uses the HTML enhancements in a SAX parser)
6352  *
6353  * Return: for Element nodes, a return from htmlElementAllowedHere (if
6354  *	legacy allowed) or htmlElementStatusHere (otherwise).
6355  *	for Attribute nodes, a return from htmlAttrAllowed
6356  *	for other nodes, HTML_NA (no checks performed)
6357  */
6358 htmlStatus
htmlNodeStatus(const htmlNodePtr node,int legacy)6359 htmlNodeStatus(const htmlNodePtr node, int legacy) {
6360   if ( ! node )
6361     return HTML_INVALID ;
6362 
6363   switch ( node->type ) {
6364     case XML_ELEMENT_NODE:
6365       return legacy
6366 	? ( htmlElementAllowedHere (
6367 		htmlTagLookup(node->parent->name) , node->name
6368 		) ? HTML_VALID : HTML_INVALID )
6369 	: htmlElementStatusHere(
6370 		htmlTagLookup(node->parent->name) ,
6371 		htmlTagLookup(node->name) )
6372 	;
6373     case XML_ATTRIBUTE_NODE:
6374       return htmlAttrAllowed(
6375 	htmlTagLookup(node->parent->name) , node->name, legacy) ;
6376     default: return HTML_NA ;
6377   }
6378 }
6379 /************************************************************************
6380  *									*
6381  *	New set (2.6.0) of simpler and more flexible APIs		*
6382  *									*
6383  ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope.  A NULL @str is ignored; when "dict" is NULL the string
 * is always freed.
 *
 * The expansion is a single statement (do/while(0)), so the macro can be
 * used safely as the body of an unbraced if/else.  Callers must supply
 * the terminating semicolon.  @str is evaluated more than once.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
6395 
6396 /**
6397  * htmlCtxtReset:
6398  * @ctxt: an HTML parser context
6399  *
6400  * Reset a parser context
6401  */
6402 void
htmlCtxtReset(htmlParserCtxtPtr ctxt)6403 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6404 {
6405     xmlParserInputPtr input;
6406     xmlDictPtr dict;
6407 
6408     if (ctxt == NULL)
6409         return;
6410 
6411     xmlInitParser();
6412     dict = ctxt->dict;
6413 
6414     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6415         xmlFreeInputStream(input);
6416     }
6417     ctxt->inputNr = 0;
6418     ctxt->input = NULL;
6419 
6420     ctxt->spaceNr = 0;
6421     if (ctxt->spaceTab != NULL) {
6422 	ctxt->spaceTab[0] = -1;
6423 	ctxt->space = &ctxt->spaceTab[0];
6424     } else {
6425 	ctxt->space = NULL;
6426     }
6427 
6428 
6429     ctxt->nodeNr = 0;
6430     ctxt->node = NULL;
6431 
6432     ctxt->nameNr = 0;
6433     ctxt->name = NULL;
6434 
6435     DICT_FREE(ctxt->version);
6436     ctxt->version = NULL;
6437     DICT_FREE(ctxt->encoding);
6438     ctxt->encoding = NULL;
6439     DICT_FREE(ctxt->directory);
6440     ctxt->directory = NULL;
6441     DICT_FREE(ctxt->extSubURI);
6442     ctxt->extSubURI = NULL;
6443     DICT_FREE(ctxt->extSubSystem);
6444     ctxt->extSubSystem = NULL;
6445     if (ctxt->myDoc != NULL)
6446         xmlFreeDoc(ctxt->myDoc);
6447     ctxt->myDoc = NULL;
6448 
6449     ctxt->standalone = -1;
6450     ctxt->hasExternalSubset = 0;
6451     ctxt->hasPErefs = 0;
6452     ctxt->html = 1;
6453     ctxt->external = 0;
6454     ctxt->instate = XML_PARSER_START;
6455     ctxt->token = 0;
6456 
6457     ctxt->wellFormed = 1;
6458     ctxt->nsWellFormed = 1;
6459     ctxt->disableSAX = 0;
6460     ctxt->valid = 1;
6461     ctxt->vctxt.userData = ctxt;
6462     ctxt->vctxt.error = xmlParserValidityError;
6463     ctxt->vctxt.warning = xmlParserValidityWarning;
6464     ctxt->record_info = 0;
6465     ctxt->nbChars = 0;
6466     ctxt->checkIndex = 0;
6467     ctxt->inSubset = 0;
6468     ctxt->errNo = XML_ERR_OK;
6469     ctxt->depth = 0;
6470     ctxt->charset = XML_CHAR_ENCODING_NONE;
6471     ctxt->catalogs = NULL;
6472     xmlInitNodeInfoSeq(&ctxt->node_seq);
6473 
6474     if (ctxt->attsDefault != NULL) {
6475         xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6476         ctxt->attsDefault = NULL;
6477     }
6478     if (ctxt->attsSpecial != NULL) {
6479         xmlHashFree(ctxt->attsSpecial, NULL);
6480         ctxt->attsSpecial = NULL;
6481     }
6482 }
6483 
6484 /**
6485  * htmlCtxtUseOptions:
6486  * @ctxt: an HTML parser context
6487  * @options:  a combination of htmlParserOption(s)
6488  *
6489  * Applies the options to the parser context
6490  *
6491  * Returns 0 in case of success, the set of unknown or unimplemented options
6492  *         in case of error.
6493  */
6494 int
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt,int options)6495 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6496 {
6497     if (ctxt == NULL)
6498         return(-1);
6499 
6500     if (options & HTML_PARSE_NOWARNING) {
6501         ctxt->sax->warning = NULL;
6502         ctxt->vctxt.warning = NULL;
6503         options -= XML_PARSE_NOWARNING;
6504 	ctxt->options |= XML_PARSE_NOWARNING;
6505     }
6506     if (options & HTML_PARSE_NOERROR) {
6507         ctxt->sax->error = NULL;
6508         ctxt->vctxt.error = NULL;
6509         ctxt->sax->fatalError = NULL;
6510         options -= XML_PARSE_NOERROR;
6511 	ctxt->options |= XML_PARSE_NOERROR;
6512     }
6513     if (options & HTML_PARSE_PEDANTIC) {
6514         ctxt->pedantic = 1;
6515         options -= XML_PARSE_PEDANTIC;
6516 	ctxt->options |= XML_PARSE_PEDANTIC;
6517     } else
6518         ctxt->pedantic = 0;
6519     if (options & XML_PARSE_NOBLANKS) {
6520         ctxt->keepBlanks = 0;
6521         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6522         options -= XML_PARSE_NOBLANKS;
6523 	ctxt->options |= XML_PARSE_NOBLANKS;
6524     } else
6525         ctxt->keepBlanks = 1;
6526     if (options & HTML_PARSE_RECOVER) {
6527         ctxt->recovery = 1;
6528 	options -= HTML_PARSE_RECOVER;
6529     } else
6530         ctxt->recovery = 0;
6531     if (options & HTML_PARSE_COMPACT) {
6532 	ctxt->options |= HTML_PARSE_COMPACT;
6533         options -= HTML_PARSE_COMPACT;
6534     }
6535     if (options & XML_PARSE_HUGE) {
6536 	ctxt->options |= XML_PARSE_HUGE;
6537         options -= XML_PARSE_HUGE;
6538     }
6539     if (options & HTML_PARSE_NODEFDTD) {
6540 	ctxt->options |= HTML_PARSE_NODEFDTD;
6541         options -= HTML_PARSE_NODEFDTD;
6542     }
6543     if (options & HTML_PARSE_IGNORE_ENC) {
6544 	ctxt->options |= HTML_PARSE_IGNORE_ENC;
6545         options -= HTML_PARSE_IGNORE_ENC;
6546     }
6547     ctxt->dictNames = 0;
6548     return (options);
6549 }
6550 
6551 /**
6552  * htmlDoRead:
6553  * @ctxt:  an HTML parser context
6554  * @URL:  the base URL to use for the document
6555  * @encoding:  the document encoding, or NULL
6556  * @options:  a combination of htmlParserOption(s)
6557  * @reuse:  keep the context for reuse
6558  *
6559  * Common front-end for the htmlRead functions
6560  *
6561  * Returns the resulting document tree or NULL
6562  */
6563 static htmlDocPtr
htmlDoRead(htmlParserCtxtPtr ctxt,const char * URL,const char * encoding,int options,int reuse)6564 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6565           int options, int reuse)
6566 {
6567     htmlDocPtr ret;
6568 
6569     htmlCtxtUseOptions(ctxt, options);
6570     ctxt->html = 1;
6571     if (encoding != NULL) {
6572         xmlCharEncodingHandlerPtr hdlr;
6573 
6574 	hdlr = xmlFindCharEncodingHandler(encoding);
6575 	if (hdlr != NULL) {
6576 	    xmlSwitchToEncoding(ctxt, hdlr);
6577 	    if (ctxt->input->encoding != NULL)
6578 	      xmlFree((xmlChar *) ctxt->input->encoding);
6579             ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6580         }
6581     }
6582     if ((URL != NULL) && (ctxt->input != NULL) &&
6583         (ctxt->input->filename == NULL))
6584         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6585     htmlParseDocument(ctxt);
6586     ret = ctxt->myDoc;
6587     ctxt->myDoc = NULL;
6588     if (!reuse) {
6589         if ((ctxt->dictNames) &&
6590 	    (ret != NULL) &&
6591 	    (ret->dict == ctxt->dict))
6592 	    ctxt->dict = NULL;
6593 	xmlFreeParserCtxt(ctxt);
6594     }
6595     return (ret);
6596 }
6597 
6598 /**
6599  * htmlReadDoc:
6600  * @cur:  a pointer to a zero terminated string
6601  * @URL:  the base URL to use for the document
6602  * @encoding:  the document encoding, or NULL
6603  * @options:  a combination of htmlParserOption(s)
6604  *
6605  * parse an XML in-memory document and build a tree.
6606  *
6607  * Returns the resulting document tree
6608  */
6609 htmlDocPtr
htmlReadDoc(const xmlChar * cur,const char * URL,const char * encoding,int options)6610 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6611 {
6612     htmlParserCtxtPtr ctxt;
6613 
6614     if (cur == NULL)
6615         return (NULL);
6616 
6617     xmlInitParser();
6618     ctxt = htmlCreateDocParserCtxt(cur, NULL);
6619     if (ctxt == NULL)
6620         return (NULL);
6621     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6622 }
6623 
6624 /**
6625  * htmlReadFile:
6626  * @filename:  a file or URL
6627  * @encoding:  the document encoding, or NULL
6628  * @options:  a combination of htmlParserOption(s)
6629  *
6630  * parse an XML file from the filesystem or the network.
6631  *
6632  * Returns the resulting document tree
6633  */
6634 htmlDocPtr
htmlReadFile(const char * filename,const char * encoding,int options)6635 htmlReadFile(const char *filename, const char *encoding, int options)
6636 {
6637     htmlParserCtxtPtr ctxt;
6638 
6639     xmlInitParser();
6640     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6641     if (ctxt == NULL)
6642         return (NULL);
6643     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6644 }
6645 
6646 /**
6647  * htmlReadMemory:
6648  * @buffer:  a pointer to a char array
6649  * @size:  the size of the array
6650  * @URL:  the base URL to use for the document
6651  * @encoding:  the document encoding, or NULL
6652  * @options:  a combination of htmlParserOption(s)
6653  *
6654  * parse an XML in-memory document and build a tree.
6655  *
6656  * Returns the resulting document tree
6657  */
6658 htmlDocPtr
htmlReadMemory(const char * buffer,int size,const char * URL,const char * encoding,int options)6659 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6660 {
6661     htmlParserCtxtPtr ctxt;
6662 
6663     xmlInitParser();
6664     ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6665     if (ctxt == NULL)
6666         return (NULL);
6667     htmlDefaultSAXHandlerInit();
6668     if (ctxt->sax != NULL)
6669         memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6670     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6671 }
6672 
6673 /**
6674  * htmlReadFd:
6675  * @fd:  an open file descriptor
6676  * @URL:  the base URL to use for the document
6677  * @encoding:  the document encoding, or NULL
6678  * @options:  a combination of htmlParserOption(s)
6679  *
6680  * parse an XML from a file descriptor and build a tree.
6681  *
6682  * Returns the resulting document tree
6683  */
6684 htmlDocPtr
htmlReadFd(int fd,const char * URL,const char * encoding,int options)6685 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6686 {
6687     htmlParserCtxtPtr ctxt;
6688     xmlParserInputBufferPtr input;
6689     xmlParserInputPtr stream;
6690 
6691     if (fd < 0)
6692         return (NULL);
6693 
6694     xmlInitParser();
6695     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6696     if (input == NULL)
6697         return (NULL);
6698     ctxt = xmlNewParserCtxt();
6699     if (ctxt == NULL) {
6700         xmlFreeParserInputBuffer(input);
6701         return (NULL);
6702     }
6703     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6704     if (stream == NULL) {
6705         xmlFreeParserInputBuffer(input);
6706 	xmlFreeParserCtxt(ctxt);
6707         return (NULL);
6708     }
6709     inputPush(ctxt, stream);
6710     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6711 }
6712 
6713 /**
6714  * htmlReadIO:
6715  * @ioread:  an I/O read function
6716  * @ioclose:  an I/O close function
6717  * @ioctx:  an I/O handler
6718  * @URL:  the base URL to use for the document
6719  * @encoding:  the document encoding, or NULL
6720  * @options:  a combination of htmlParserOption(s)
6721  *
6722  * parse an HTML document from I/O functions and source and build a tree.
6723  *
6724  * Returns the resulting document tree
6725  */
6726 htmlDocPtr
htmlReadIO(xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)6727 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6728           void *ioctx, const char *URL, const char *encoding, int options)
6729 {
6730     htmlParserCtxtPtr ctxt;
6731     xmlParserInputBufferPtr input;
6732     xmlParserInputPtr stream;
6733 
6734     if (ioread == NULL)
6735         return (NULL);
6736     xmlInitParser();
6737 
6738     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6739                                          XML_CHAR_ENCODING_NONE);
6740     if (input == NULL)
6741         return (NULL);
6742     ctxt = htmlNewParserCtxt();
6743     if (ctxt == NULL) {
6744         xmlFreeParserInputBuffer(input);
6745         return (NULL);
6746     }
6747     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6748     if (stream == NULL) {
6749         xmlFreeParserInputBuffer(input);
6750 	xmlFreeParserCtxt(ctxt);
6751         return (NULL);
6752     }
6753     inputPush(ctxt, stream);
6754     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6755 }
6756 
6757 /**
6758  * htmlCtxtReadDoc:
6759  * @ctxt:  an HTML parser context
6760  * @cur:  a pointer to a zero terminated string
6761  * @URL:  the base URL to use for the document
6762  * @encoding:  the document encoding, or NULL
6763  * @options:  a combination of htmlParserOption(s)
6764  *
6765  * parse an XML in-memory document and build a tree.
6766  * This reuses the existing @ctxt parser context
6767  *
6768  * Returns the resulting document tree
6769  */
6770 htmlDocPtr
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt,const xmlChar * cur,const char * URL,const char * encoding,int options)6771 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6772                const char *URL, const char *encoding, int options)
6773 {
6774     xmlParserInputPtr stream;
6775 
6776     if (cur == NULL)
6777         return (NULL);
6778     if (ctxt == NULL)
6779         return (NULL);
6780 
6781     htmlCtxtReset(ctxt);
6782 
6783     stream = xmlNewStringInputStream(ctxt, cur);
6784     if (stream == NULL) {
6785         return (NULL);
6786     }
6787     inputPush(ctxt, stream);
6788     return (htmlDoRead(ctxt, URL, encoding, options, 1));
6789 }
6790 
6791 /**
6792  * htmlCtxtReadFile:
6793  * @ctxt:  an HTML parser context
6794  * @filename:  a file or URL
6795  * @encoding:  the document encoding, or NULL
6796  * @options:  a combination of htmlParserOption(s)
6797  *
6798  * parse an XML file from the filesystem or the network.
6799  * This reuses the existing @ctxt parser context
6800  *
6801  * Returns the resulting document tree
6802  */
6803 htmlDocPtr
htmlCtxtReadFile(htmlParserCtxtPtr ctxt,const char * filename,const char * encoding,int options)6804 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6805                 const char *encoding, int options)
6806 {
6807     xmlParserInputPtr stream;
6808 
6809     if (filename == NULL)
6810         return (NULL);
6811     if (ctxt == NULL)
6812         return (NULL);
6813 
6814     htmlCtxtReset(ctxt);
6815 
6816     stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6817     if (stream == NULL) {
6818         return (NULL);
6819     }
6820     inputPush(ctxt, stream);
6821     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6822 }
6823 
6824 /**
6825  * htmlCtxtReadMemory:
6826  * @ctxt:  an HTML parser context
6827  * @buffer:  a pointer to a char array
6828  * @size:  the size of the array
6829  * @URL:  the base URL to use for the document
6830  * @encoding:  the document encoding, or NULL
6831  * @options:  a combination of htmlParserOption(s)
6832  *
6833  * parse an XML in-memory document and build a tree.
6834  * This reuses the existing @ctxt parser context
6835  *
6836  * Returns the resulting document tree
6837  */
6838 htmlDocPtr
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt,const char * buffer,int size,const char * URL,const char * encoding,int options)6839 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6840                   const char *URL, const char *encoding, int options)
6841 {
6842     xmlParserInputBufferPtr input;
6843     xmlParserInputPtr stream;
6844 
6845     if (ctxt == NULL)
6846         return (NULL);
6847     if (buffer == NULL)
6848         return (NULL);
6849 
6850     htmlCtxtReset(ctxt);
6851 
6852     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6853     if (input == NULL) {
6854 	return(NULL);
6855     }
6856 
6857     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6858     if (stream == NULL) {
6859 	xmlFreeParserInputBuffer(input);
6860 	return(NULL);
6861     }
6862 
6863     inputPush(ctxt, stream);
6864     return (htmlDoRead(ctxt, URL, encoding, options, 1));
6865 }
6866 
6867 /**
6868  * htmlCtxtReadFd:
6869  * @ctxt:  an HTML parser context
6870  * @fd:  an open file descriptor
6871  * @URL:  the base URL to use for the document
6872  * @encoding:  the document encoding, or NULL
6873  * @options:  a combination of htmlParserOption(s)
6874  *
6875  * parse an XML from a file descriptor and build a tree.
6876  * This reuses the existing @ctxt parser context
6877  *
6878  * Returns the resulting document tree
6879  */
6880 htmlDocPtr
htmlCtxtReadFd(htmlParserCtxtPtr ctxt,int fd,const char * URL,const char * encoding,int options)6881 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6882               const char *URL, const char *encoding, int options)
6883 {
6884     xmlParserInputBufferPtr input;
6885     xmlParserInputPtr stream;
6886 
6887     if (fd < 0)
6888         return (NULL);
6889     if (ctxt == NULL)
6890         return (NULL);
6891 
6892     htmlCtxtReset(ctxt);
6893 
6894 
6895     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6896     if (input == NULL)
6897         return (NULL);
6898     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6899     if (stream == NULL) {
6900         xmlFreeParserInputBuffer(input);
6901         return (NULL);
6902     }
6903     inputPush(ctxt, stream);
6904     return (htmlDoRead(ctxt, URL, encoding, options, 1));
6905 }
6906 
6907 /**
6908  * htmlCtxtReadIO:
6909  * @ctxt:  an HTML parser context
6910  * @ioread:  an I/O read function
6911  * @ioclose:  an I/O close function
6912  * @ioctx:  an I/O handler
6913  * @URL:  the base URL to use for the document
6914  * @encoding:  the document encoding, or NULL
6915  * @options:  a combination of htmlParserOption(s)
6916  *
6917  * parse an HTML document from I/O functions and source and build a tree.
6918  * This reuses the existing @ctxt parser context
6919  *
6920  * Returns the resulting document tree
6921  */
6922 htmlDocPtr
htmlCtxtReadIO(htmlParserCtxtPtr ctxt,xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)6923 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6924               xmlInputCloseCallback ioclose, void *ioctx,
6925 	      const char *URL,
6926               const char *encoding, int options)
6927 {
6928     xmlParserInputBufferPtr input;
6929     xmlParserInputPtr stream;
6930 
6931     if (ioread == NULL)
6932         return (NULL);
6933     if (ctxt == NULL)
6934         return (NULL);
6935 
6936     htmlCtxtReset(ctxt);
6937 
6938     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6939                                          XML_CHAR_ENCODING_NONE);
6940     if (input == NULL)
6941         return (NULL);
6942     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6943     if (stream == NULL) {
6944         xmlFreeParserInputBuffer(input);
6945         return (NULL);
6946     }
6947     inputPush(ctxt, stream);
6948     return (htmlDoRead(ctxt, URL, encoding, options, 1));
6949 }
6950 
6951 #define bottom_HTMLparser
6952 #include "elfgcchack.h"
6953 #endif /* LIBXML_HTML_ENABLED */
6954