1 /*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9 #define IN_LIBXML
10 #include "libxml.h"
11 #ifdef LIBXML_HTML_ENABLED
12
13 #include <string.h>
14 #include <ctype.h>
15 #include <stdlib.h>
16
17 #include <libxml/HTMLparser.h>
18 #include <libxml/xmlmemory.h>
19 #include <libxml/tree.h>
20 #include <libxml/parser.h>
21 #include <libxml/parserInternals.h>
22 #include <libxml/xmlerror.h>
23 #include <libxml/HTMLtree.h>
24 #include <libxml/entities.h>
25 #include <libxml/encoding.h>
26 #include <libxml/xmlIO.h>
27 #include <libxml/uri.h>
28
29 #include "private/buf.h"
30 #include "private/enc.h"
31 #include "private/error.h"
32 #include "private/html.h"
33 #include "private/io.h"
34 #include "private/parser.h"
35 #include "private/tree.h"
36
/*
 * Size constants for the HTML parser. Their users are outside this part
 * of the file.
 * NOTE(review): presumably a limit on name length and the sizes of the
 * large/small working buffers -- confirm against the call sites.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/*
 * File-local flag, initialised to 1.
 * NOTE(review): presumably controls whether omitted (implied) elements
 * are generated -- confirm against the code that reads it.
 */
static int htmlOmittedDefaultValue = 1;

/* Prototypes needed before the corresponding definitions. */
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
                             xmlChar end, xmlChar end2, xmlChar end3);
static void htmlParseComment(htmlParserCtxtPtr ctxt);
46
47 /************************************************************************
48 * *
49 * Some factorized error routines *
50 * *
51 ************************************************************************/
52
/**
 * htmlErrMemory:
 * @ctxt:  an HTML parser context
 *
 * Report a memory allocation failure for @ctxt. This is a thin wrapper
 * that forwards to xmlCtxtErrMemory().
 */
static void
htmlErrMemory(xmlParserCtxtPtr ctxt)
{
    xmlCtxtErrMemory(ctxt);
}
65
/**
 * htmlParseErr:
 * @ctxt:  an HTML parser context
 * @error:  the error number
 * @msg:  the error message, used as a printf-style format
 * @str1:  first string argument for @msg (may be NULL)
 * @str2:  second string argument for @msg (may be NULL)
 *
 * Report an HTML parser error through xmlCtxtErr() in the XML_FROM_HTML
 * domain with level XML_ERR_ERROR. @str1 and @str2 are handed over twice:
 * once in the string slots of xmlCtxtErr() and once as the arguments
 * consumed by the @msg format.
 */
static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
             const char *msg, const xmlChar *str1, const xmlChar *str2)
{
    xmlCtxtErr(ctxt, NULL, XML_FROM_HTML, error, XML_ERR_ERROR,
               str1, str2, NULL, 0, msg, str1, str2);
}
83
/**
 * htmlParseErrInt:
 * @ctxt:  an HTML parser context
 * @error:  the error number
 * @msg:  the error message, used as a printf-style format
 * @val:  integer argument for @msg
 *
 * Report an HTML parser error through xmlCtxtErr() in the XML_FROM_HTML
 * domain with level XML_ERR_ERROR. @val is handed over twice: once in the
 * integer slot of xmlCtxtErr() and once as the argument consumed by the
 * @msg format.
 */
static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
                const char *msg, int val)
{
    xmlCtxtErr(ctxt, NULL, XML_FROM_HTML, error, XML_ERR_ERROR,
               NULL, NULL, NULL, val, msg, val);
}
100
101 /************************************************************************
102 * *
103 * Parser stacks related functions and macros *
104 * *
105 ************************************************************************/
106
107 /**
108 * htmlnamePush:
109 * @ctxt: an HTML parser context
110 * @value: the element name
111 *
112 * Pushes a new element name on top of the name stack
113 *
114 * Returns -1 in case of error, the index in the stack otherwise
115 */
116 static int
htmlnamePush(htmlParserCtxtPtr ctxt,const xmlChar * value)117 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
118 {
119 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
120 ctxt->html = 3;
121 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
122 ctxt->html = 10;
123 if (ctxt->nameNr >= ctxt->nameMax) {
124 size_t newSize = ctxt->nameMax * 2;
125 const xmlChar **tmp;
126
127 tmp = xmlRealloc((xmlChar **) ctxt->nameTab,
128 newSize * sizeof(ctxt->nameTab[0]));
129 if (tmp == NULL) {
130 htmlErrMemory(ctxt);
131 return (-1);
132 }
133 ctxt->nameTab = tmp;
134 ctxt->nameMax = newSize;
135 }
136 ctxt->nameTab[ctxt->nameNr] = value;
137 ctxt->name = value;
138 return (ctxt->nameNr++);
139 }
140 /**
141 * htmlnamePop:
142 * @ctxt: an HTML parser context
143 *
144 * Pops the top element name from the name stack
145 *
146 * Returns the name just removed
147 */
148 static const xmlChar *
htmlnamePop(htmlParserCtxtPtr ctxt)149 htmlnamePop(htmlParserCtxtPtr ctxt)
150 {
151 const xmlChar *ret;
152
153 if (ctxt->nameNr <= 0)
154 return (NULL);
155 ctxt->nameNr--;
156 if (ctxt->nameNr < 0)
157 return (NULL);
158 if (ctxt->nameNr > 0)
159 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
160 else
161 ctxt->name = NULL;
162 ret = ctxt->nameTab[ctxt->nameNr];
163 ctxt->nameTab[ctxt->nameNr] = NULL;
164 return (ret);
165 }
166
167 /**
168 * htmlNodeInfoPush:
169 * @ctxt: an HTML parser context
170 * @value: the node info
171 *
172 * Pushes a new element name on top of the node info stack
173 *
174 * Returns 0 in case of error, the index in the stack otherwise
175 */
176 static int
htmlNodeInfoPush(htmlParserCtxtPtr ctxt,htmlParserNodeInfo * value)177 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
178 {
179 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
180 if (ctxt->nodeInfoMax == 0)
181 ctxt->nodeInfoMax = 5;
182 ctxt->nodeInfoMax *= 2;
183 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
184 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
185 ctxt->nodeInfoMax *
186 sizeof(ctxt->nodeInfoTab[0]));
187 if (ctxt->nodeInfoTab == NULL) {
188 htmlErrMemory(ctxt);
189 return (0);
190 }
191 }
192 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
193 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
194 return (ctxt->nodeInfoNr++);
195 }
196
197 /**
198 * htmlNodeInfoPop:
199 * @ctxt: an HTML parser context
200 *
201 * Pops the top element name from the node info stack
202 *
203 * Returns 0 in case of error, the pointer to NodeInfo otherwise
204 */
205 static htmlParserNodeInfo *
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)206 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
207 {
208 if (ctxt->nodeInfoNr <= 0)
209 return (NULL);
210 ctxt->nodeInfoNr--;
211 if (ctxt->nodeInfoNr < 0)
212 return (NULL);
213 if (ctxt->nodeInfoNr > 0)
214 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
215 else
216 ctxt->nodeInfo = NULL;
217 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
218 }
219
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named ctxt (the parser
 * context) to be in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use
 * them:
 *
 *   CUR_PTR   the current pointer to the xmlChar to be parsed.
 *   BASE_PTR  the base pointer of the current input.
 *   CUR, RAW  the xmlChar at the current position. They should only be
 *             compared to ASCII values, otherwise this would break when
 *             running with UTF-8 encoding.
 *   NXT(n)    the n'th next xmlChar. Same as CUR, it should be used only
 *             to compare on ASCII based substrings.
 *   UPPER     the current xmlChar converted to uppercase with toupper().
 *   UPP(n)    the n'th next xmlChar converted to uppercase. Same as CUR,
 *             it should be used only to compare on ASCII based substrings.
 *   SKIP(n)   skip n xmlChar, advancing the column by n as well. The line
 *             counter is not touched, so it must only be used to skip
 *             ASCII defined strings without newlines.
 *
 * Clean macros, not dependent on an ASCII context:
 *
 *   NEXT          skip to the next character by calling xmlNextChar().
 *   NEXTL(l)      skip the current character of l xmlChars length and
 *                 update the line/column counters.
 *   CUR_CHAR(l)   the current character as returned by htmlCurrentChar(),
 *                 its length is stored in l.
 *   COPY_BUF(l,b,i,v)  append character v of length l to buffer b at
 *                 index i and advance i.
 */

/* the current xmlChar, uppercased */
#define UPPER (toupper(*ctxt->input->cur))

/* advance position and column by val; the line counter is not updated */
#define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val)

/* the xmlChar val positions after the current one */
#define NXT(val) ctxt->input->cur[(val)]

/* same as NXT(val), uppercased */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

/*
 * Call xmlParserShrink() when not in progressive mode, more than
 * 2 * INPUT_CHUNK xmlChars lie behind the current position and fewer than
 * 2 * INPUT_CHUNK are left ahead of it.
 * Note that the expansion is a bare if statement which already ends with
 * a semicolon.
 */
#define SHRINK \
    if ((!PARSER_PROGRESSIVE(ctxt)) && \
        (ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserShrink(ctxt);

/*
 * Call xmlParserGrow() when not in progressive mode and fewer than
 * INPUT_CHUNK xmlChars are left ahead of the current position.
 * Note that the expansion is a bare if statement which already ends with
 * a semicolon.
 */
#define GROW \
    if ((!PARSER_PROGRESSIVE(ctxt)) && \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserGrow(ctxt);

/* skip blanks at the current position, see htmlSkipBlankChars() */
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR (*ctxt->input->cur)
#define NEXT xmlNextChar(ctxt)

#define RAW (*ctxt->input->cur)


/*
 * Step over the current character, which is l xmlChars long: a newline
 * increments the line counter and resets the column to 1, any other
 * character increments the column by one.
 */
#define NEXTL(l) do { \
    if (*(ctxt->input->cur) == '\n') { \
        ctxt->input->line++; ctxt->input->col = 1; \
    } else ctxt->input->col++; \
    ctxt->input->cur += l; \
  } while (0)

/************
 Disabled leftover, kept for reference only:
 \
 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

/* l must be an lvalue, its address is passed to htmlCurrentChar() */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)

/*
 * A character of length 1 is stored directly, anything longer goes
 * through xmlCopyChar(). The expansion is a bare if/else without a
 * trailing semicolon, so take care when using it as the body of an
 * unbraced if.
 */
#define COPY_BUF(l,b,i,v) \
    if (l == 1) b[i++] = v; \
    else i += xmlCopyChar(l,&b[i],v)
297
298 /**
299 * htmlFindEncoding:
300 * @the HTML parser context
301 *
302 * Ty to find and encoding in the current data available in the input
303 * buffer this is needed to try to switch to the proper encoding when
304 * one face a character error.
305 * That's an heuristic, since it's operating outside of parsing it could
306 * try to use a meta which had been commented out, that's the reason it
307 * should only be used in case of error, not as a default.
308 *
309 * Returns an encoding string or NULL if not found, the string need to
310 * be freed
311 */
312 static xmlChar *
htmlFindEncoding(xmlParserCtxtPtr ctxt)313 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
314 const xmlChar *start, *cur, *end;
315 xmlChar *ret;
316
317 if ((ctxt == NULL) || (ctxt->input == NULL) ||
318 (ctxt->input->flags & XML_INPUT_HAS_ENCODING))
319 return(NULL);
320 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
321 return(NULL);
322
323 start = ctxt->input->cur;
324 end = ctxt->input->end;
325 /* we also expect the input buffer to be zero terminated */
326 if (*end != 0)
327 return(NULL);
328
329 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
330 if (cur == NULL)
331 return(NULL);
332 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
333 if (cur == NULL)
334 return(NULL);
335 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
336 if (cur == NULL)
337 return(NULL);
338 cur += 8;
339 start = cur;
340 while (((*cur >= 'A') && (*cur <= 'Z')) ||
341 ((*cur >= 'a') && (*cur <= 'z')) ||
342 ((*cur >= '0') && (*cur <= '9')) ||
343 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
344 cur++;
345 if (cur == start)
346 return(NULL);
347 ret = xmlStrndup(start, cur - start);
348 if (ret == NULL)
349 htmlErrMemory(ctxt);
350 return(ret);
351 }
352
/**
 * htmlCurrentChar:
 * @ctxt:  the HTML parser context
 * @len:  pointer to the length of the char read
 *
 * The current char value, if using UTF-8 this may actually span multiple
 * bytes in the input buffer.
 *
 * If the input has no encoding yet, bytes below 0x80 are returned as is.
 * On the first byte >= 0x80 an encoding is guessed with htmlFindEncoding()
 * (falling back to ISO-8859-1) and the converter is plugged in
 * automatically. From then on the input is decoded as UTF-8.
 *
 * Special cases:
 *  - a zero byte before the end of the input is reported as
 *    XML_ERR_INVALID_CHAR and replaced by a space (*len = 1);
 *  - a zero byte at the end of the input yields 0 with *len = 0;
 *  - a malformed UTF-8 sequence is reported as XML_ERR_INVALID_ENCODING
 *    and the current byte is returned with *len = 1;
 *  - a well-formed sequence whose value fails IS_CHAR() is reported as
 *    XML_ERR_INVALID_CHAR but still returned.
 *
 * NOTE(review): the previous header mentioned end-of-line normalization
 * (2.11 End-of-Line Handling); nothing in this function performs it.
 *
 * Returns the current char value and its length
 */

static int
htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
    const unsigned char *cur;
    unsigned char c;
    unsigned int val;

    /* make sure enough data is buffered before looking at it */
    if (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)
        xmlParserGrow(ctxt);

    if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) {
        xmlChar * guess;

        /*
         * Assume it's a fixed length encoding (1) with
         * a compatible encoding for the ASCII set, since
         * HTML constructs only use < 128 chars
         */
        if (*ctxt->input->cur < 0x80) {
            if (*ctxt->input->cur == 0) {
                if (ctxt->input->cur < ctxt->input->end) {
                    /* embedded zero byte: report it, hand out a space */
                    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
                                    "Char 0x%X out of allowed range\n", 0);
                    *len = 1;
                    return(' ');
                } else {
                    /* terminating zero: end of input */
                    *len = 0;
                    return(0);
                }
            }
            *len = 1;
            return(*ctxt->input->cur);
        }

        /*
         * Non-ASCII byte without a known encoding: do an automatic
         * conversion, using the guessed encoding if there is one and
         * ISO-8859-1 otherwise.
         */
        guess = htmlFindEncoding(ctxt);
        if (guess == NULL) {
            xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
        } else {
            xmlSwitchEncodingName(ctxt, (const char *) guess);
            xmlFree(guess);
        }
        ctxt->input->flags |= XML_INPUT_HAS_ENCODING;
    }

    /*
     * We are supposed to handle UTF8, check it's valid
     * From rfc2044: encoding of the Unicode values on UTF-8:
     *
     * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
     * 0000 0000-0000 007F   0xxxxxxx
     * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
     * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
     *
     * 4-byte sequences (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) are decoded
     * as well. Values that are not legal characters are caught by the
     * IS_CHAR() test below.
     */
    /* read the position only now, the encoding switch above may move it */
    cur = ctxt->input->cur;
    c = *cur;
    if (c & 0x80) {
        size_t avail;

        /* 10xxxxxx is a continuation byte, it cannot start a sequence */
        if ((c & 0x40) == 0)
            goto encoding_error;

        avail = ctxt->input->end - ctxt->input->cur;

        if ((avail < 2) || ((cur[1] & 0xc0) != 0x80))
            goto encoding_error;
        if ((c & 0xe0) == 0xe0) {
            if ((avail < 3) || ((cur[2] & 0xc0) != 0x80))
                goto encoding_error;
            if ((c & 0xf0) == 0xf0) {
                /* lead bytes 0xf8 and above are never valid */
                if (((c & 0xf8) != 0xf0) ||
                    (avail < 4) || ((cur[3] & 0xc0) != 0x80))
                    goto encoding_error;
                /* 4-byte code */
                *len = 4;
                val = (cur[0] & 0x7) << 18;
                val |= (cur[1] & 0x3f) << 12;
                val |= (cur[2] & 0x3f) << 6;
                val |= cur[3] & 0x3f;
                /* reject overlong encodings */
                if (val < 0x10000)
                    goto encoding_error;
            } else {
                /* 3-byte code */
                *len = 3;
                val = (cur[0] & 0xf) << 12;
                val |= (cur[1] & 0x3f) << 6;
                val |= cur[2] & 0x3f;
                /* reject overlong encodings */
                if (val < 0x800)
                    goto encoding_error;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            val = (cur[0] & 0x1f) << 6;
            val |= cur[1] & 0x3f;
            /* reject overlong encodings */
            if (val < 0x80)
                goto encoding_error;
        }
        /* the error is only reported, the value is returned regardless */
        if (!IS_CHAR(val)) {
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
                            "Char 0x%X out of allowed range\n", val);
        }
        return(val);
    } else {
        if (*ctxt->input->cur == 0) {
            if (ctxt->input->cur < ctxt->input->end) {
                /* embedded zero byte: report it, hand out a space */
                htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
                                "Char 0x%X out of allowed range\n", 0);
                *len = 1;
                return(' ');
            } else {
                /* terminating zero: end of input */
                *len = 0;
                return(0);
            }
        }
        /* 1-byte code */
        *len = 1;
        return(*ctxt->input->cur);
    }

encoding_error:
    xmlCtxtErrIO(ctxt, XML_ERR_INVALID_ENCODING, NULL);

    /*
     * NOTE(review): every path reaching this label has gone through the
     * block above that sets XML_INPUT_HAS_ENCODING, or had the flag set
     * on entry, so this fallback looks unreachable unless one of the
     * switch functions clears the flag -- confirm.
     */
    if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0)
        xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
    /* hand out the offending byte so that the caller makes progress */
    *len = 1;
    return(*ctxt->input->cur);
}
498
499 /**
500 * htmlSkipBlankChars:
501 * @ctxt: the HTML parser context
502 *
503 * skip all blanks character found at that point in the input streams.
504 *
505 * Returns the number of space chars skipped
506 */
507
508 static int
htmlSkipBlankChars(xmlParserCtxtPtr ctxt)509 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
510 int res = 0;
511
512 while (IS_BLANK_CH(*(ctxt->input->cur))) {
513 if (*(ctxt->input->cur) == '\n') {
514 ctxt->input->line++; ctxt->input->col = 1;
515 } else ctxt->input->col++;
516 ctxt->input->cur++;
517 if (*ctxt->input->cur == 0)
518 xmlParserGrow(ctxt);
519 if (res < INT_MAX)
520 res++;
521 }
522 return(res);
523 }
524
525
526
527 /************************************************************************
528 * *
529 * The list of HTML elements and their properties *
530 * *
531 ************************************************************************/
532
533 /*
534 * Start Tag: 1 means the start tag can be omitted
535 * End Tag: 1 means the end tag can be omitted
536 * 2 means it's forbidden (empty elements)
537 * 3 means the tag is stylistic and should be closed easily
538 * Depr: this element is deprecated
539 * DTD: 1 means that this element is valid only in the Loose DTD
540 * 2 means that this element is valid only in the Frameset DTD
541 *
542 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
543 , subElements , impliedsubelt , Attributes, userdata
544 */
545
/*
 * Definitions and a couple of vars for HTML Elements
 *
 * Each group macro expands to a comma separated list of element names and
 * is paired with an NB_ macro giving the number of names in that list.
 * The NB_ macros that are sums are parenthesized so that they can be used
 * safely inside larger expressions such as 2 * NB_FLOW.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
/* PCDATA and MODIFIER contribute no element names, they expand to nothing */
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL-terminated lists of the flow and inline element names */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
580
/*
 * ... and for HTML Attributes
 *
 * Each group macro expands to a comma separated list of attribute names
 * and is paired with an NB_ macro giving the number of names in the list.
 */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
/* used to reference the undefined NB_NB_COREATTRS */
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL-terminated attribute name lists shared by many elements */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
600
601
/*
 * Other declarations that should go inline ...
 *
 * NULL-terminated lists of attribute names and of element names, one per
 * use, referenced from the entries of html40ElementTable.
 */
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
        "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
        "tabindex", "onfocus", "onblur", NULL } ;
static const char* const target_attr[] = { "target", NULL } ;
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* const alt_attr[] = { "alt", NULL } ;
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* const href_attrs[] = { "href", NULL } ;
static const char* const clear_attrs[] = { "clear", NULL } ;
static const char* const inline_p[] = { INLINE, "p", NULL } ;

/* same contents as object_contents */
static const char* const flow_param[] = { FLOW, "param", NULL } ;
static const char* const applet_attrs[] = { COREATTRS , "codebase",
        "archive", "alt", "name", "height", "width", "align",
        "hspace", "vspace", NULL } ;
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
        "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const basefont_attrs[] =
        { "id", "size", "color", "face", NULL } ;
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* const body_depr[] = { "background", "bgcolor", "text",
        "link", "vlink", "alink", NULL } ;
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
        "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const col_elt[] = { "col", NULL } ;
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
static const char* const compact_attr[] = { "compact", NULL } ;
static const char* const label_attr[] = { "label", NULL } ;
638 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
/*
 * More NULL-terminated lists of attribute names and of element names,
 * referenced from the entries of html40ElementTable.
 */
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* const version_attr[] = { "version", NULL } ;
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* const align_attr[] = { "align", NULL } ;
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
/* MODIFIER expands to nothing, so this is "body" followed by FLOW */
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
/* same contents as flow_param */
static const char* const object_contents[] = { FLOW, "param", NULL } ;
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* const option_elt[] = { "option", NULL } ;
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* const width_attr[] = { "width", NULL } ;
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* const language_attr[] = { "language", NULL } ;
static const char* const select_content[] = { "optgroup", "option", NULL } ;
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* const tr_elt[] = { "tr", NULL } ;
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* const tr_contents[] = { "th", "td", NULL } ;
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* const li_elt[] = { "li", NULL } ;
static const char* const ul_depr[] = { "type", "compact", NULL} ;
static const char* const dir_attr[] = { "dir", NULL} ;

/*
 * Cast applied to the lists above where html40ElementTable references
 * them; it drops the const qualifier of the array elements.
 * NOTE(review): presumably needed because the htmlElemDesc members are
 * declared as const char ** -- confirm against the structure definition.
 */
#define DECL (const char**)
697
698 static const htmlElemDesc
699 html40ElementTable[] = {
700 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
701 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
702 },
703 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
704 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
705 },
706 { "acronym", 0, 0, 0, 0, 0, 0, 1, "",
707 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
708 },
709 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
710 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
711 },
712 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
713 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
714 },
715 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
716 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
717 },
718 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
719 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
720 },
721 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
722 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
723 },
724 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
725 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
726 },
727 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
728 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
729 },
730 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
731 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
732 },
733 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
734 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
735 },
736 { "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
737 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
738 },
739 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
740 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
741 },
742 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
743 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
744 },
745 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
746 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
747 },
748 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
749 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
750 },
751 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
752 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
753 },
754 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
755 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
756 },
757 { "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
758 EMPTY , NULL , DECL col_attrs , NULL, NULL
759 },
760 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
761 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
762 },
763 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
764 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
765 },
766 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
767 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
768 },
769 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
770 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
771 },
772 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
773 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
774 },
775 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
776 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
777 },
778 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
779 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
780 },
781 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
782 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
783 },
784 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
785 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
786 },
787 { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
788 EMPTY, NULL, DECL embed_attrs, NULL, NULL
789 },
790 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
791 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
792 },
793 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
794 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
795 },
796 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
797 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
798 },
799 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
800 EMPTY, NULL, NULL, DECL frame_attrs, NULL
801 },
802 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
803 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
804 },
805 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
806 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
807 },
808 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
809 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
810 },
811 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
812 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
813 },
814 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
815 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
816 },
817 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
818 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
819 },
820 { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
821 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
822 },
823 { "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
824 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
825 },
826 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
827 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
828 },
829 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
830 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
831 },
832 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
833 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
834 },
835 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
836 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
837 },
838 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
839 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
840 },
841 { "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
842 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
843 },
844 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
845 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
846 },
847 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
848 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
849 },
850 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
851 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
852 },
853 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
854 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
855 },
856 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
857 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
858 },
859 { "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
860 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
861 },
862 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
863 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
864 },
865 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
866 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
867 },
868 { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
869 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
870 },
871 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
872 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
873 },
874 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
875 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
876 },
877 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
878 DECL html_flow, "div", DECL html_attrs, NULL, NULL
879 },
880 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
881 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
882 },
883 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
884 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
885 },
886 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
887 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
888 },
889 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
890 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
891 },
892 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
893 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
894 },
895 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
896 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
897 },
898 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
899 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
900 },
901 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
902 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
903 },
904 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
905 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
906 },
907 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
908 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
909 },
910 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
911 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
912 },
913 { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
914 DECL select_content, NULL, DECL select_attrs, NULL, NULL
915 },
916 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
917 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
918 },
919 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
920 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
921 },
922 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
923 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
924 },
925 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
926 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
927 },
928 { "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
929 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
930 },
931 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
932 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
933 },
934 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
935 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
936 },
937 { "table", 0, 0, 0, 0, 0, 0, 0, "",
938 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
939 },
940 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
941 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
942 },
943 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
944 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
945 },
946 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
947 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
948 },
949 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
950 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
951 },
952 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
953 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
954 },
955 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
956 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
957 },
958 { "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
959 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
960 },
961 { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
962 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
963 },
964 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
965 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
966 },
967 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
968 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
969 },
970 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
971 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
972 },
973 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
974 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
975 }
976 };
977
/*
 * One auto-close rule: while an element named oldTag is the current
 * open element, the start tag newTag implies that it must be closed.
 */
typedef struct {
    const char *oldTag;
    const char *newTag;
} htmlStartCloseEntry;

/*
 * Start tags that imply the end of the current element.
 *
 * The table is searched with bsearch() using htmlCompareStartClose(),
 * so entries MUST stay sorted by strcmp() order of oldTag first and
 * newTag second. Entries are grouped per oldTag below.
 */
static const htmlStartCloseEntry htmlStartClose[] = {
    /* a */
    { "a", "a" }, { "a", "fieldset" }, { "a", "table" }, { "a", "td" },
    { "a", "th" },
    /* address */
    { "address", "dd" }, { "address", "dl" }, { "address", "dt" },
    { "address", "form" }, { "address", "li" }, { "address", "ul" },
    /* b, big */
    { "b", "center" }, { "b", "p" }, { "b", "td" }, { "b", "th" },
    { "big", "p" },
    /* caption */
    { "caption", "col" }, { "caption", "colgroup" }, { "caption", "tbody" },
    { "caption", "tfoot" }, { "caption", "thead" }, { "caption", "tr" },
    /* col */
    { "col", "col" }, { "col", "colgroup" }, { "col", "tbody" },
    { "col", "tfoot" }, { "col", "thead" }, { "col", "tr" },
    /* colgroup */
    { "colgroup", "colgroup" }, { "colgroup", "tbody" },
    { "colgroup", "tfoot" }, { "colgroup", "thead" }, { "colgroup", "tr" },
    /* dd, dir, dl, dt */
    { "dd", "dt" },
    { "dir", "dd" }, { "dir", "dl" }, { "dir", "dt" }, { "dir", "form" },
    { "dir", "ul" },
    { "dl", "form" }, { "dl", "li" },
    { "dt", "dd" }, { "dt", "dl" },
    /* font, form */
    { "font", "center" }, { "font", "td" }, { "font", "th" },
    { "form", "form" },
    /* h1 .. h6 */
    { "h1", "fieldset" }, { "h1", "form" }, { "h1", "li" }, { "h1", "p" },
    { "h1", "table" },
    { "h2", "fieldset" }, { "h2", "form" }, { "h2", "li" }, { "h2", "p" },
    { "h2", "table" },
    { "h3", "fieldset" }, { "h3", "form" }, { "h3", "li" }, { "h3", "p" },
    { "h3", "table" },
    { "h4", "fieldset" }, { "h4", "form" }, { "h4", "li" }, { "h4", "p" },
    { "h4", "table" },
    { "h5", "fieldset" }, { "h5", "form" }, { "h5", "li" }, { "h5", "p" },
    { "h5", "table" },
    { "h6", "fieldset" }, { "h6", "form" }, { "h6", "li" }, { "h6", "p" },
    { "h6", "table" },
    /* head */
    { "head", "a" }, { "head", "abbr" }, { "head", "acronym" },
    { "head", "address" },
    { "head", "b" }, { "head", "bdo" }, { "head", "big" },
    { "head", "blockquote" },
    { "head", "body" }, { "head", "br" }, { "head", "center" },
    { "head", "cite" },
    { "head", "code" }, { "head", "dd" }, { "head", "dfn" },
    { "head", "dir" },
    { "head", "div" }, { "head", "dl" }, { "head", "dt" },
    { "head", "em" },
    { "head", "fieldset" }, { "head", "font" }, { "head", "form" },
    { "head", "frameset" },
    { "head", "h1" }, { "head", "h2" }, { "head", "h3" },
    { "head", "h4" },
    { "head", "h5" }, { "head", "h6" }, { "head", "hr" },
    { "head", "i" },
    { "head", "iframe" }, { "head", "img" }, { "head", "kbd" },
    { "head", "li" },
    { "head", "listing" }, { "head", "map" }, { "head", "menu" },
    { "head", "ol" },
    { "head", "p" }, { "head", "pre" }, { "head", "q" },
    { "head", "s" },
    { "head", "samp" }, { "head", "small" }, { "head", "span" },
    { "head", "strike" },
    { "head", "strong" }, { "head", "sub" }, { "head", "sup" },
    { "head", "table" },
    { "head", "tt" }, { "head", "u" }, { "head", "ul" },
    { "head", "var" },
    { "head", "xmp" },
    /* hr, i */
    { "hr", "form" },
    { "i", "center" }, { "i", "p" }, { "i", "td" }, { "i", "th" },
    /* legend, li, link */
    { "legend", "fieldset" },
    { "li", "li" },
    { "link", "body" }, { "link", "frameset" },
    /* listing */
    { "listing", "dd" }, { "listing", "dl" }, { "listing", "dt" },
    { "listing", "fieldset" }, { "listing", "form" }, { "listing", "li" },
    { "listing", "table" }, { "listing", "ul" },
    /* menu */
    { "menu", "dd" }, { "menu", "dl" }, { "menu", "dt" }, { "menu", "form" },
    { "menu", "ul" },
    /* ol, option */
    { "ol", "form" },
    { "option", "optgroup" }, { "option", "option" },
    /* p */
    { "p", "address" }, { "p", "blockquote" }, { "p", "body" },
    { "p", "caption" },
    { "p", "center" }, { "p", "col" }, { "p", "colgroup" },
    { "p", "dd" },
    { "p", "dir" }, { "p", "div" }, { "p", "dl" },
    { "p", "dt" },
    { "p", "fieldset" }, { "p", "form" }, { "p", "frameset" },
    { "p", "h1" },
    { "p", "h2" }, { "p", "h3" }, { "p", "h4" },
    { "p", "h5" },
    { "p", "h6" }, { "p", "head" }, { "p", "hr" },
    { "p", "li" },
    { "p", "listing" }, { "p", "menu" }, { "p", "ol" },
    { "p", "p" },
    { "p", "pre" }, { "p", "table" }, { "p", "tbody" },
    { "p", "td" },
    { "p", "tfoot" }, { "p", "th" }, { "p", "title" },
    { "p", "tr" },
    { "p", "ul" }, { "p", "xmp" },
    /* pre */
    { "pre", "dd" }, { "pre", "dl" }, { "pre", "dt" },
    { "pre", "fieldset" }, { "pre", "form" }, { "pre", "li" },
    { "pre", "table" }, { "pre", "ul" },
    /* s, script, small, span, strike, style */
    { "s", "p" },
    { "script", "noscript" },
    { "small", "p" },
    { "span", "td" }, { "span", "th" },
    { "strike", "p" },
    { "style", "body" }, { "style", "frameset" },
    /* tbody, td, tfoot, th, thead, title, tr, tt */
    { "tbody", "tbody" }, { "tbody", "tfoot" },
    { "td", "tbody" }, { "td", "td" }, { "td", "tfoot" }, { "td", "th" },
    { "td", "tr" },
    { "tfoot", "tbody" },
    { "th", "tbody" }, { "th", "td" }, { "th", "tfoot" }, { "th", "th" },
    { "th", "tr" },
    { "thead", "tbody" }, { "thead", "tfoot" },
    { "title", "body" }, { "title", "frameset" },
    { "tr", "tbody" }, { "tr", "tfoot" }, { "tr", "tr" },
    { "tt", "p" },
    /* u, ul */
    { "u", "p" }, { "u", "td" }, { "u", "th" },
    { "ul", "address" }, { "ul", "form" }, { "ul", "menu" }, { "ul", "pre" },
    /* xmp */
    { "xmp", "dd" }, { "xmp", "dl" }, { "xmp", "dt" },
    { "xmp", "fieldset" }, { "xmp", "form" }, { "xmp", "li" },
    { "xmp", "table" }, { "xmp", "ul" }
};
1237
/*
 * NULL-terminated list of HTML elements which are not supposed to
 * hold CDATA content directly; a p element is implied when character
 * data shows up inside one of them.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html", "head",
    NULL    /* terminator */
};
1250
/*
 * HTML attributes whose content is of type %Script;
 *
 * NOTE: htmlIsScriptAttribute() rejects any name that does not start
 * with "on" before consulting this list, so revisit that function
 * when adding an entry with a different prefix.
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard */
    "onkeypress", "onkeydown", "onkeyup",
    /* document / form */
    "onload", "onunload", "onfocus", "onblur",
    "onsubmit", "onreset", "onchange", "onselect"
};
1276
/*
 * End-tag priorities, used by the parser to cope with broken HTML
 * pages. Each element gets a priority; an end tag may only close
 * elements whose priority is lower than or equal to its own, which
 * lets the parser decide what to do with stray end tags.
 */

typedef struct {
    const char *name;   /* element name, NULL for the default entry */
    int priority;       /* end-tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 }, { "th",    160 },
    { "tr",    170 },
    { "thead", 180 }, { "tbody", 180 }, { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 }, { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
1304
1305 /************************************************************************
1306 * *
1307 * functions to handle HTML specific data *
1308 * *
1309 ************************************************************************/
1310
/**
 * htmlInitAutoClose:
 *
 * DEPRECATED: This is a no-op. The function body is empty and calling
 * it has no effect.
 */
void
htmlInitAutoClose(void)
{
    /* intentionally empty */
}
1319
1320 static int
htmlCompareTags(const void * key,const void * member)1321 htmlCompareTags(const void *key, const void *member) {
1322 const xmlChar *tag = (const xmlChar *) key;
1323 const htmlElemDesc *desc = (const htmlElemDesc *) member;
1324
1325 return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1326 }
1327
1328 /**
1329 * htmlTagLookup:
1330 * @tag: The tag name in lowercase
1331 *
1332 * Lookup the HTML tag in the ElementTable
1333 *
1334 * Returns the related htmlElemDescPtr or NULL if not found.
1335 */
1336 const htmlElemDesc *
htmlTagLookup(const xmlChar * tag)1337 htmlTagLookup(const xmlChar *tag) {
1338 if (tag == NULL)
1339 return(NULL);
1340
1341 return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1342 sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1343 sizeof(htmlElemDesc), htmlCompareTags));
1344 }
1345
1346 /**
1347 * htmlGetEndPriority:
1348 * @name: The name of the element to look up the priority for.
1349 *
1350 * Return value: The "endtag" priority.
1351 **/
1352 static int
htmlGetEndPriority(const xmlChar * name)1353 htmlGetEndPriority (const xmlChar *name) {
1354 int i = 0;
1355
1356 while ((htmlEndPriority[i].name != NULL) &&
1357 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1358 i++;
1359
1360 return(htmlEndPriority[i].priority);
1361 }
1362
1363
1364 static int
htmlCompareStartClose(const void * vkey,const void * member)1365 htmlCompareStartClose(const void *vkey, const void *member) {
1366 const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1367 const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1368 int ret;
1369
1370 ret = strcmp(key->oldTag, entry->oldTag);
1371 if (ret == 0)
1372 ret = strcmp(key->newTag, entry->newTag);
1373
1374 return(ret);
1375 }
1376
1377 /**
1378 * htmlCheckAutoClose:
1379 * @newtag: The new tag name
1380 * @oldtag: The old tag name
1381 *
1382 * Checks whether the new tag is one of the registered valid tags for
1383 * closing old.
1384 *
1385 * Returns 0 if no, 1 if yes.
1386 */
1387 static int
htmlCheckAutoClose(const xmlChar * newtag,const xmlChar * oldtag)1388 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1389 {
1390 htmlStartCloseEntry key;
1391 void *res;
1392
1393 key.oldTag = (const char *) oldtag;
1394 key.newTag = (const char *) newtag;
1395 res = bsearch(&key, htmlStartClose,
1396 sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1397 sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1398 return(res != NULL);
1399 }
1400
1401 /**
1402 * htmlAutoCloseOnClose:
1403 * @ctxt: an HTML parser context
1404 * @newtag: The new tag name
1405 * @force: force the tag closure
1406 *
1407 * The HTML DTD allows an ending tag to implicitly close other tags.
1408 */
1409 static void
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1410 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1411 {
1412 const htmlElemDesc *info;
1413 int i, priority;
1414
1415 priority = htmlGetEndPriority(newtag);
1416
1417 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1418
1419 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1420 break;
1421 /*
1422 * A misplaced endtag can only close elements with lower
1423 * or equal priority, so if we find an element with higher
1424 * priority before we find an element with
1425 * matching name, we just ignore this endtag
1426 */
1427 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1428 return;
1429 }
1430 if (i < 0)
1431 return;
1432
1433 while (!xmlStrEqual(newtag, ctxt->name)) {
1434 info = htmlTagLookup(ctxt->name);
1435 if ((info != NULL) && (info->endTag == 3)) {
1436 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1437 "Opening and ending tag mismatch: %s and %s\n",
1438 newtag, ctxt->name);
1439 }
1440 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1441 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1442 htmlnamePop(ctxt);
1443 }
1444 }
1445
1446 /**
1447 * htmlAutoCloseOnEnd:
1448 * @ctxt: an HTML parser context
1449 *
1450 * Close all remaining tags at the end of the stream
1451 */
1452 static void
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)1453 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1454 {
1455 int i;
1456
1457 if (ctxt->nameNr == 0)
1458 return;
1459 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1460 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1461 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1462 htmlnamePop(ctxt);
1463 }
1464 }
1465
1466 /**
1467 * htmlAutoClose:
1468 * @ctxt: an HTML parser context
1469 * @newtag: The new tag name or NULL
1470 *
1471 * The HTML DTD allows a tag to implicitly close other tags.
1472 * The list is kept in htmlStartClose array. This function is
1473 * called when a new tag has been detected and generates the
1474 * appropriates closes if possible/needed.
1475 * If newtag is NULL this mean we are at the end of the resource
1476 * and we should check
1477 */
1478 static void
htmlAutoClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1479 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1480 {
1481 if (newtag == NULL)
1482 return;
1483
1484 while ((ctxt->name != NULL) &&
1485 (htmlCheckAutoClose(newtag, ctxt->name))) {
1486 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1487 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1488 htmlnamePop(ctxt);
1489 }
1490 }
1491
1492 /**
1493 * htmlAutoCloseTag:
1494 * @doc: the HTML document
1495 * @name: The tag name
1496 * @elem: the HTML element
1497 *
1498 * The HTML DTD allows a tag to implicitly close other tags.
1499 * The list is kept in htmlStartClose array. This function checks
1500 * if the element or one of it's children would autoclose the
1501 * given tag.
1502 *
1503 * Returns 1 if autoclose, 0 otherwise
1504 */
1505 int
htmlAutoCloseTag(htmlDocPtr doc,const xmlChar * name,htmlNodePtr elem)1506 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1507 htmlNodePtr child;
1508
1509 if (elem == NULL) return(1);
1510 if (xmlStrEqual(name, elem->name)) return(0);
1511 if (htmlCheckAutoClose(elem->name, name)) return(1);
1512 child = elem->children;
1513 while (child != NULL) {
1514 if (htmlAutoCloseTag(doc, name, child)) return(1);
1515 child = child->next;
1516 }
1517 return(0);
1518 }
1519
1520 /**
1521 * htmlIsAutoClosed:
1522 * @doc: the HTML document
1523 * @elem: the HTML element
1524 *
1525 * The HTML DTD allows a tag to implicitly close other tags.
1526 * The list is kept in htmlStartClose array. This function checks
1527 * if a tag is autoclosed by one of it's child
1528 *
1529 * Returns 1 if autoclosed, 0 otherwise
1530 */
1531 int
htmlIsAutoClosed(htmlDocPtr doc,htmlNodePtr elem)1532 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1533 htmlNodePtr child;
1534
1535 if (elem == NULL) return(1);
1536 child = elem->children;
1537 while (child != NULL) {
1538 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1539 child = child->next;
1540 }
1541 return(0);
1542 }
1543
1544 /**
1545 * htmlCheckImplied:
1546 * @ctxt: an HTML parser context
1547 * @newtag: The new tag name
1548 *
1549 * The HTML DTD allows a tag to exists only implicitly
1550 * called when a new tag has been detected and generates the
1551 * appropriates implicit tags if missing
1552 */
1553 static void
htmlCheckImplied(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1554 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1555 int i;
1556
1557 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1558 return;
1559 if (!htmlOmittedDefaultValue)
1560 return;
1561 if (xmlStrEqual(newtag, BAD_CAST"html"))
1562 return;
1563 if (ctxt->nameNr <= 0) {
1564 htmlnamePush(ctxt, BAD_CAST"html");
1565 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1566 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1567 }
1568 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1569 return;
1570 if ((ctxt->nameNr <= 1) &&
1571 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1572 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1573 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1574 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1575 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1576 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1577 if (ctxt->html >= 3) {
1578 /* we already saw or generated an <head> before */
1579 return;
1580 }
1581 /*
1582 * dropped OBJECT ... i you put it first BODY will be
1583 * assumed !
1584 */
1585 htmlnamePush(ctxt, BAD_CAST"head");
1586 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1587 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1588 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1589 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1590 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1591 if (ctxt->html >= 10) {
1592 /* we already saw or generated a <body> before */
1593 return;
1594 }
1595 for (i = 0;i < ctxt->nameNr;i++) {
1596 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1597 return;
1598 }
1599 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1600 return;
1601 }
1602 }
1603
1604 htmlnamePush(ctxt, BAD_CAST"body");
1605 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1606 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1607 }
1608 }
1609
1610 /**
1611 * htmlCheckParagraph
1612 * @ctxt: an HTML parser context
1613 *
1614 * Check whether a p element need to be implied before inserting
1615 * characters in the current element.
1616 *
1617 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1618 * in case of error.
1619 */
1620
1621 static int
htmlCheckParagraph(htmlParserCtxtPtr ctxt)1622 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1623 const xmlChar *tag;
1624 int i;
1625
1626 if (ctxt == NULL)
1627 return(-1);
1628 tag = ctxt->name;
1629 if (tag == NULL) {
1630 htmlAutoClose(ctxt, BAD_CAST"p");
1631 htmlCheckImplied(ctxt, BAD_CAST"p");
1632 htmlnamePush(ctxt, BAD_CAST"p");
1633 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1634 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1635 return(1);
1636 }
1637 if (!htmlOmittedDefaultValue)
1638 return(0);
1639 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1640 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1641 htmlAutoClose(ctxt, BAD_CAST"p");
1642 htmlCheckImplied(ctxt, BAD_CAST"p");
1643 htmlnamePush(ctxt, BAD_CAST"p");
1644 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1645 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1646 return(1);
1647 }
1648 }
1649 return(0);
1650 }
1651
1652 /**
1653 * htmlIsScriptAttribute:
1654 * @name: an attribute name
1655 *
1656 * Check if an attribute is of content type Script
1657 *
1658 * Returns 1 is the attribute is a script 0 otherwise
1659 */
1660 int
htmlIsScriptAttribute(const xmlChar * name)1661 htmlIsScriptAttribute(const xmlChar *name) {
1662 unsigned int i;
1663
1664 if (name == NULL)
1665 return(0);
1666 /*
1667 * all script attributes start with 'on'
1668 */
1669 if ((name[0] != 'o') || (name[1] != 'n'))
1670 return(0);
1671 for (i = 0;
1672 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1673 i++) {
1674 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1675 return(1);
1676 }
1677 return(0);
1678 }
1679
1680 /************************************************************************
1681 * *
1682 * The list of HTML predefined entities *
1683 * *
1684 ************************************************************************/
1685
1686
1687 static const htmlEntityDesc html40EntitiesTable[] = {
1688 /*
1689 * the 4 absolute ones, plus apostrophe.
1690 */
1691 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1692 { 38, "amp", "ampersand, U+0026 ISOnum" },
1693 { 39, "apos", "single quote" },
1694 { 60, "lt", "less-than sign, U+003C ISOnum" },
1695 { 62, "gt", "greater-than sign, U+003E ISOnum" },
1696
1697 /*
1698 * A bunch still in the 128-255 range
1699 * Replacing them depend really on the charset used.
1700 */
1701 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1702 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1703 { 162, "cent", "cent sign, U+00A2 ISOnum" },
1704 { 163, "pound","pound sign, U+00A3 ISOnum" },
1705 { 164, "curren","currency sign, U+00A4 ISOnum" },
1706 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1707 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1708 { 167, "sect", "section sign, U+00A7 ISOnum" },
1709 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1710 { 169, "copy", "copyright sign, U+00A9 ISOnum" },
1711 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1712 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1713 { 172, "not", "not sign, U+00AC ISOnum" },
1714 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1715 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1716 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1717 { 176, "deg", "degree sign, U+00B0 ISOnum" },
1718 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1719 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1720 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1721 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1722 { 181, "micro","micro sign, U+00B5 ISOnum" },
1723 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1724 { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1725 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1726 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1727 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1728 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1729 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1730 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1731 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1732 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1733 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1734 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1735 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1736 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1737 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1738 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1739 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1740 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1741 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1742 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1743 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1744 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1745 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1746 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1747 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1748 { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1749 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1750 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1751 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1752 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1753 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1754 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1755 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1756 { 215, "times","multiplication sign, U+00D7 ISOnum" },
1757 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1758 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1759 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1760 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1761 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1762 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1763 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1764 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1765 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1766 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1767 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1768 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1769 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1770 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1771 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1772 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1773 { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1774 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1775 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1776 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1777 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1778 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1779 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1780 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1781 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1782 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1783 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1784 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1785 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1786 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1787 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1788 { 247, "divide","division sign, U+00F7 ISOnum" },
1789 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1790 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1791 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1792 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1793 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1794 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1795 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1796 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1797
1798 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1799 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1800 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1801 { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1802 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1803
1804 /*
1805 * Anything below should really be kept as entities references
1806 */
1807 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1808
1809 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1810 { 732, "tilde","small tilde, U+02DC ISOdia" },
1811
1812 { 913, "Alpha","greek capital letter alpha, U+0391" },
1813 { 914, "Beta", "greek capital letter beta, U+0392" },
1814 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1815 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1816 { 917, "Epsilon","greek capital letter epsilon, U+0395" },
1817 { 918, "Zeta", "greek capital letter zeta, U+0396" },
1818 { 919, "Eta", "greek capital letter eta, U+0397" },
1819 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1820 { 921, "Iota", "greek capital letter iota, U+0399" },
1821 { 922, "Kappa","greek capital letter kappa, U+039A" },
1822 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1823 { 924, "Mu", "greek capital letter mu, U+039C" },
1824 { 925, "Nu", "greek capital letter nu, U+039D" },
1825 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1826 { 927, "Omicron","greek capital letter omicron, U+039F" },
1827 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1828 { 929, "Rho", "greek capital letter rho, U+03A1" },
1829 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1830 { 932, "Tau", "greek capital letter tau, U+03A4" },
1831 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1832 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1833 { 935, "Chi", "greek capital letter chi, U+03A7" },
1834 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1835 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1836
1837 { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1838 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1839 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1840 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1841 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1842 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1843 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1844 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1845 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1846 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1847 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1848 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1849 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1850 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1851 { 959, "omicron","greek small letter omicron, U+03BF NEW" },
1852 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1853 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1854 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1855 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1856 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1857 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1858 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1859 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1860 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1861 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1862 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1863 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1864 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1865
1866 { 8194, "ensp", "en space, U+2002 ISOpub" },
1867 { 8195, "emsp", "em space, U+2003 ISOpub" },
1868 { 8201, "thinsp","thin space, U+2009 ISOpub" },
1869 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1870 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1871 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1872 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1873 { 8211, "ndash","en dash, U+2013 ISOpub" },
1874 { 8212, "mdash","em dash, U+2014 ISOpub" },
1875 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1876 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1877 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1878 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1879 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1880 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1881 { 8224, "dagger","dagger, U+2020 ISOpub" },
1882 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1883
1884 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1885 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1886
1887 { 8240, "permil","per mille sign, U+2030 ISOtech" },
1888
1889 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1890 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1891
1892 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1893 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1894
1895 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
1896 { 8260, "frasl","fraction slash, U+2044 NEW" },
1897
1898 { 8364, "euro", "euro sign, U+20AC NEW" },
1899
1900 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1901 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1902 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1903 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
1904 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1905 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1906 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1907 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1908 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1909 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1910 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1911 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1912 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1913 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1914 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1915 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1916
1917 { 8704, "forall","for all, U+2200 ISOtech" },
1918 { 8706, "part", "partial differential, U+2202 ISOtech" },
1919 { 8707, "exist","there exists, U+2203 ISOtech" },
1920 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1921 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1922 { 8712, "isin", "element of, U+2208 ISOtech" },
1923 { 8713, "notin","not an element of, U+2209 ISOtech" },
1924 { 8715, "ni", "contains as member, U+220B ISOtech" },
1925 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1926 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1927 { 8722, "minus","minus sign, U+2212 ISOtech" },
1928 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1929 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
1930 { 8733, "prop", "proportional to, U+221D ISOtech" },
1931 { 8734, "infin","infinity, U+221E ISOtech" },
1932 { 8736, "ang", "angle, U+2220 ISOamso" },
1933 { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1934 { 8744, "or", "logical or = vee, U+2228 ISOtech" },
1935 { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1936 { 8746, "cup", "union = cup, U+222A ISOtech" },
1937 { 8747, "int", "integral, U+222B ISOtech" },
1938 { 8756, "there4","therefore, U+2234 ISOtech" },
1939 { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1940 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1941 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1942 { 8800, "ne", "not equal to, U+2260 ISOtech" },
1943 { 8801, "equiv","identical to, U+2261 ISOtech" },
1944 { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1945 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1946 { 8834, "sub", "subset of, U+2282 ISOtech" },
1947 { 8835, "sup", "superset of, U+2283 ISOtech" },
1948 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1949 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1950 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1951 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1952 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1953 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1954 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1955 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1956 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1957 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1958 { 8971, "rfloor","right floor, U+230B ISOamsc" },
1959 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1960 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1961 { 9674, "loz", "lozenge, U+25CA ISOpub" },
1962
1963 { 9824, "spades","black spade suit, U+2660 ISOpub" },
1964 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1965 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1966 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
1967
1968 };
1969
1970 /************************************************************************
1971 * *
1972 * Commodity functions to handle entities *
1973 * *
1974 ************************************************************************/
1975
/*
 * growBuffer:
 * @buffer: name of the xmlChar * variable holding the buffer to enlarge
 *
 * Macro used to grow the current buffer.
 *
 * Doubles the companion variable <buffer>_size (built by token pasting)
 * and reallocates @buffer to that size with xmlRealloc().
 * On success @buffer is reassigned to the returned block, so any pointer
 * the caller keeps into the old block has to be recomputed afterwards.
 * On failure the error is reported through htmlErrMemory(ctxt), @buffer
 * is freed and the *enclosing function* returns NULL.
 *
 * The expansion therefore requires a variable named `ctxt` and a
 * variable named <buffer>_size to be in scope, and can only be used in
 * functions returning a pointer.
 *
 * NOTE(review): the expansion is a bare brace block rather than a
 * do { } while (0) statement, and the doubling of <buffer>_size is not
 * checked for overflow.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size);		\
    if (tmp == NULL) {							\
	htmlErrMemory(ctxt);						\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = tmp;							\
}
1990
1991 /**
1992 * htmlEntityLookup:
1993 * @name: the entity name
1994 *
1995 * Lookup the given entity in EntitiesTable
1996 *
1997 * TODO: the linear scan is really ugly, an hash table is really needed.
1998 *
1999 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2000 */
2001 const htmlEntityDesc *
htmlEntityLookup(const xmlChar * name)2002 htmlEntityLookup(const xmlChar *name) {
2003 unsigned int i;
2004
2005 for (i = 0;i < (sizeof(html40EntitiesTable)/
2006 sizeof(html40EntitiesTable[0]));i++) {
2007 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2008 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2009 }
2010 }
2011 return(NULL);
2012 }
2013
2014 static int
htmlCompareEntityDesc(const void * vkey,const void * vdesc)2015 htmlCompareEntityDesc(const void *vkey, const void *vdesc) {
2016 const unsigned *key = vkey;
2017 const htmlEntityDesc *desc = vdesc;
2018
2019 return((int) *key - (int) desc->value);
2020 }
2021
2022 /**
2023 * htmlEntityValueLookup:
2024 * @value: the entity's unicode value
2025 *
2026 * Lookup the given entity in EntitiesTable
2027 *
2028 * TODO: the linear scan is really ugly, an hash table is really needed.
2029 *
2030 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2031 */
2032 const htmlEntityDesc *
htmlEntityValueLookup(unsigned int value)2033 htmlEntityValueLookup(unsigned int value) {
2034 const htmlEntityDesc *desc;
2035 size_t nmemb;
2036
2037 nmemb = sizeof(html40EntitiesTable) / sizeof(html40EntitiesTable[0]);
2038 desc = bsearch(&value, html40EntitiesTable, nmemb, sizeof(htmlEntityDesc),
2039 htmlCompareEntityDesc);
2040
2041 return(desc);
2042 }
2043
2044 /**
2045 * UTF8ToHtml:
2046 * @out: a pointer to an array of bytes to store the result
2047 * @outlen: the length of @out
2048 * @in: a pointer to an array of UTF-8 chars
2049 * @inlen: the length of @in
2050 *
2051 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2052 * plus HTML entities block of chars out.
2053 *
2054 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2055 * The value of @inlen after return is the number of octets consumed
2056 * as the return value is positive, else unpredictable.
2057 * The value of @outlen after return is the number of octets consumed.
2058 */
2059 int
UTF8ToHtml(unsigned char * out,int * outlen,const unsigned char * in,int * inlen)2060 UTF8ToHtml(unsigned char* out, int *outlen,
2061 const unsigned char* in, int *inlen) {
2062 const unsigned char* processed = in;
2063 const unsigned char* outend;
2064 const unsigned char* outstart = out;
2065 const unsigned char* instart = in;
2066 const unsigned char* inend;
2067 unsigned int c, d;
2068 int trailing;
2069
2070 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2071 if (in == NULL) {
2072 /*
2073 * initialization nothing to do
2074 */
2075 *outlen = 0;
2076 *inlen = 0;
2077 return(0);
2078 }
2079 inend = in + (*inlen);
2080 outend = out + (*outlen);
2081 while (in < inend) {
2082 d = *in++;
2083 if (d < 0x80) { c= d; trailing= 0; }
2084 else if (d < 0xC0) {
2085 /* trailing byte in leading position */
2086 *outlen = out - outstart;
2087 *inlen = processed - instart;
2088 return(-2);
2089 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2090 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2091 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2092 else {
2093 /* no chance for this in Ascii */
2094 *outlen = out - outstart;
2095 *inlen = processed - instart;
2096 return(-2);
2097 }
2098
2099 if (inend - in < trailing) {
2100 break;
2101 }
2102
2103 for ( ; trailing; trailing--) {
2104 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2105 break;
2106 c <<= 6;
2107 c |= d & 0x3F;
2108 }
2109
2110 /* assertion: c is a single UTF-4 value */
2111 if (c < 0x80) {
2112 if (out + 1 >= outend)
2113 break;
2114 *out++ = c;
2115 } else {
2116 int len;
2117 const htmlEntityDesc * ent;
2118 const char *cp;
2119 char nbuf[16];
2120
2121 /*
2122 * Try to lookup a predefined HTML entity for it
2123 */
2124
2125 ent = htmlEntityValueLookup(c);
2126 if (ent == NULL) {
2127 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2128 cp = nbuf;
2129 }
2130 else
2131 cp = ent->name;
2132 len = strlen(cp);
2133 if (out + 2 + len >= outend)
2134 break;
2135 *out++ = '&';
2136 memcpy(out, cp, len);
2137 out += len;
2138 *out++ = ';';
2139 }
2140 processed = in;
2141 }
2142 *outlen = out - outstart;
2143 *inlen = processed - instart;
2144 return(0);
2145 }
2146
2147 /**
2148 * htmlEncodeEntities:
2149 * @out: a pointer to an array of bytes to store the result
2150 * @outlen: the length of @out
2151 * @in: a pointer to an array of UTF-8 chars
2152 * @inlen: the length of @in
2153 * @quoteChar: the quote character to escape (' or ") or zero.
2154 *
2155 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2156 * plus HTML entities block of chars out.
2157 *
2158 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2159 * The value of @inlen after return is the number of octets consumed
2160 * as the return value is positive, else unpredictable.
2161 * The value of @outlen after return is the number of octets consumed.
2162 */
2163 int
htmlEncodeEntities(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,int quoteChar)2164 htmlEncodeEntities(unsigned char* out, int *outlen,
2165 const unsigned char* in, int *inlen, int quoteChar) {
2166 const unsigned char* processed = in;
2167 const unsigned char* outend;
2168 const unsigned char* outstart = out;
2169 const unsigned char* instart = in;
2170 const unsigned char* inend;
2171 unsigned int c, d;
2172 int trailing;
2173
2174 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2175 return(-1);
2176 outend = out + (*outlen);
2177 inend = in + (*inlen);
2178 while (in < inend) {
2179 d = *in++;
2180 if (d < 0x80) { c= d; trailing= 0; }
2181 else if (d < 0xC0) {
2182 /* trailing byte in leading position */
2183 *outlen = out - outstart;
2184 *inlen = processed - instart;
2185 return(-2);
2186 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2187 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2188 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2189 else {
2190 /* no chance for this in Ascii */
2191 *outlen = out - outstart;
2192 *inlen = processed - instart;
2193 return(-2);
2194 }
2195
2196 if (inend - in < trailing)
2197 break;
2198
2199 while (trailing--) {
2200 if (((d= *in++) & 0xC0) != 0x80) {
2201 *outlen = out - outstart;
2202 *inlen = processed - instart;
2203 return(-2);
2204 }
2205 c <<= 6;
2206 c |= d & 0x3F;
2207 }
2208
2209 /* assertion: c is a single UTF-4 value */
2210 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2211 (c != '&') && (c != '<') && (c != '>')) {
2212 if (out >= outend)
2213 break;
2214 *out++ = c;
2215 } else {
2216 const htmlEntityDesc * ent;
2217 const char *cp;
2218 char nbuf[16];
2219 int len;
2220
2221 /*
2222 * Try to lookup a predefined HTML entity for it
2223 */
2224 ent = htmlEntityValueLookup(c);
2225 if (ent == NULL) {
2226 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2227 cp = nbuf;
2228 }
2229 else
2230 cp = ent->name;
2231 len = strlen(cp);
2232 if (outend - out < len + 2)
2233 break;
2234 *out++ = '&';
2235 memcpy(out, cp, len);
2236 out += len;
2237 *out++ = ';';
2238 }
2239 processed = in;
2240 }
2241 *outlen = out - outstart;
2242 *inlen = processed - instart;
2243 return(0);
2244 }
2245
2246 /************************************************************************
2247 * *
2248 * Commodity functions, cleanup needed ? *
2249 * *
2250 ************************************************************************/
/*
 * Names of all the elements allowing PCDATA content in the HTML 4.01
 * loose DTD, in alphabetical order.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2265
2266 /**
2267 * areBlanks:
2268 * @ctxt: an HTML parser context
2269 * @str: a xmlChar *
2270 * @len: the size of @str
2271 *
2272 * Is this a sequence of blank chars that one can ignore ?
2273 *
2274 * Returns 1 if ignorable 0 otherwise.
2275 */
2276
areBlanks(htmlParserCtxtPtr ctxt,const xmlChar * str,int len)2277 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2278 unsigned int i;
2279 int j;
2280 xmlNodePtr lastChild;
2281 xmlDtdPtr dtd;
2282
2283 for (j = 0;j < len;j++)
2284 if (!(IS_BLANK_CH(str[j]))) return(0);
2285
2286 if (CUR == 0) return(1);
2287 if (CUR != '<') return(0);
2288 if (ctxt->name == NULL)
2289 return(1);
2290 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2291 return(1);
2292 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2293 return(1);
2294
2295 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2296 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2297 dtd = xmlGetIntSubset(ctxt->myDoc);
2298 if (dtd != NULL && dtd->ExternalID != NULL) {
2299 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2300 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2301 return(1);
2302 }
2303 }
2304
2305 if (ctxt->node == NULL) return(0);
2306 lastChild = xmlGetLastChild(ctxt->node);
2307 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2308 lastChild = lastChild->prev;
2309 if (lastChild == NULL) {
2310 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2311 (ctxt->node->content != NULL)) return(0);
2312 /* keep ws in constructs like ...<b> </b>...
2313 for all tags "b" allowing PCDATA */
2314 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2315 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2316 return(0);
2317 }
2318 }
2319 } else if (xmlNodeIsText(lastChild)) {
2320 return(0);
2321 } else {
2322 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2323 for all tags "p" allowing PCDATA */
2324 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2325 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2326 return(0);
2327 }
2328 }
2329 }
2330 return(1);
2331 }
2332
2333 /**
2334 * htmlNewDocNoDtD:
2335 * @URI: URI for the dtd, or NULL
2336 * @ExternalID: the external ID of the DTD, or NULL
2337 *
2338 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2339 * are NULL
2340 *
2341 * Returns a new document, do not initialize the DTD if not provided
2342 */
2343 htmlDocPtr
htmlNewDocNoDtD(const xmlChar * URI,const xmlChar * ExternalID)2344 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2345 xmlDocPtr cur;
2346
2347 /*
2348 * Allocate a new document and fill the fields.
2349 */
2350 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2351 if (cur == NULL)
2352 return(NULL);
2353 memset(cur, 0, sizeof(xmlDoc));
2354
2355 cur->type = XML_HTML_DOCUMENT_NODE;
2356 cur->version = NULL;
2357 cur->intSubset = NULL;
2358 cur->doc = cur;
2359 cur->name = NULL;
2360 cur->children = NULL;
2361 cur->extSubset = NULL;
2362 cur->oldNs = NULL;
2363 cur->encoding = NULL;
2364 cur->standalone = 1;
2365 cur->compression = 0;
2366 cur->ids = NULL;
2367 cur->refs = NULL;
2368 cur->_private = NULL;
2369 cur->charset = XML_CHAR_ENCODING_UTF8;
2370 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2371 if ((ExternalID != NULL) ||
2372 (URI != NULL)) {
2373 xmlDtdPtr intSubset;
2374
2375 intSubset = xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2376 if (intSubset == NULL) {
2377 xmlFree(cur);
2378 return(NULL);
2379 }
2380 }
2381 if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2382 xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2383 return(cur);
2384 }
2385
2386 /**
2387 * htmlNewDoc:
2388 * @URI: URI for the dtd, or NULL
2389 * @ExternalID: the external ID of the DTD, or NULL
2390 *
2391 * Creates a new HTML document
2392 *
2393 * Returns a new document
2394 */
2395 htmlDocPtr
htmlNewDoc(const xmlChar * URI,const xmlChar * ExternalID)2396 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2397 if ((URI == NULL) && (ExternalID == NULL))
2398 return(htmlNewDocNoDtD(
2399 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2400 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2401
2402 return(htmlNewDocNoDtD(URI, ExternalID));
2403 }
2404
2405
2406 /************************************************************************
2407 * *
2408 * The parser itself *
2409 * Relates to http://www.w3.org/TR/html40 *
2410 * *
2411 ************************************************************************/
2412
2413 /************************************************************************
2414 * *
2415 * The parser itself *
2416 * *
2417 ************************************************************************/
2418
2419 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2420
2421 static void
htmlSkipBogusComment(htmlParserCtxtPtr ctxt)2422 htmlSkipBogusComment(htmlParserCtxtPtr ctxt) {
2423 int c;
2424
2425 htmlParseErr(ctxt, XML_HTML_INCORRECTLY_OPENED_COMMENT,
2426 "Incorrectly opened comment\n", NULL, NULL);
2427
2428 while (PARSER_STOPPED(ctxt) == 0) {
2429 c = CUR;
2430 if (c == 0)
2431 break;
2432 NEXT;
2433 if (c == '>')
2434 break;
2435 }
2436 }
2437
2438 /**
2439 * htmlParseHTMLName:
2440 * @ctxt: an HTML parser context
2441 *
2442 * parse an HTML tag or attribute name, note that we convert it to lowercase
2443 * since HTML names are not case-sensitive.
2444 *
2445 * Returns the Tag Name parsed or NULL
2446 */
2447
2448 static const xmlChar *
htmlParseHTMLName(htmlParserCtxtPtr ctxt)2449 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2450 const xmlChar *ret;
2451 int i = 0;
2452 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2453
2454 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2455 (CUR != ':') && (CUR != '.')) return(NULL);
2456
2457 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2458 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2459 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2460 (CUR == '.'))) {
2461 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2462 else loc[i] = CUR;
2463 i++;
2464
2465 NEXT;
2466 }
2467
2468 ret = xmlDictLookup(ctxt->dict, loc, i);
2469 if (ret == NULL)
2470 htmlErrMemory(ctxt);
2471
2472 return(ret);
2473 }
2474
2475
2476 /**
2477 * htmlParseHTMLName_nonInvasive:
2478 * @ctxt: an HTML parser context
2479 *
2480 * parse an HTML tag or attribute name, note that we convert it to lowercase
2481 * since HTML names are not case-sensitive, this doesn't consume the data
2482 * from the stream, it's a look-ahead
2483 *
2484 * Returns the Tag Name parsed or NULL
2485 */
2486
2487 static const xmlChar *
htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt)2488 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2489 int i = 0;
2490 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2491 const xmlChar *ret;
2492
2493 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2494 (NXT(1) != ':')) return(NULL);
2495
2496 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2497 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2498 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2499 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2500 else loc[i] = NXT(1+i);
2501 i++;
2502 }
2503
2504 ret = xmlDictLookup(ctxt->dict, loc, i);
2505 if (ret == NULL)
2506 htmlErrMemory(ctxt);
2507
2508 return(ret);
2509 }
2510
2511
2512 /**
2513 * htmlParseName:
2514 * @ctxt: an HTML parser context
2515 *
2516 * parse an HTML name, this routine is case sensitive.
2517 *
2518 * Returns the Name parsed or NULL
2519 */
2520
2521 static const xmlChar *
htmlParseName(htmlParserCtxtPtr ctxt)2522 htmlParseName(htmlParserCtxtPtr ctxt) {
2523 const xmlChar *in;
2524 const xmlChar *ret;
2525 int count = 0;
2526
2527 GROW;
2528
2529 /*
2530 * Accelerator for simple ASCII names
2531 */
2532 in = ctxt->input->cur;
2533 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2534 ((*in >= 0x41) && (*in <= 0x5A)) ||
2535 (*in == '_') || (*in == ':')) {
2536 in++;
2537 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2538 ((*in >= 0x41) && (*in <= 0x5A)) ||
2539 ((*in >= 0x30) && (*in <= 0x39)) ||
2540 (*in == '_') || (*in == '-') ||
2541 (*in == ':') || (*in == '.'))
2542 in++;
2543
2544 if (in == ctxt->input->end)
2545 return(NULL);
2546
2547 if ((*in > 0) && (*in < 0x80)) {
2548 count = in - ctxt->input->cur;
2549 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2550 if (ret == NULL)
2551 htmlErrMemory(ctxt);
2552 ctxt->input->cur = in;
2553 ctxt->input->col += count;
2554 return(ret);
2555 }
2556 }
2557 return(htmlParseNameComplex(ctxt));
2558 }
2559
/**
 * htmlParseNameComplex:
 * @ctxt:  an HTML parser context
 *
 * Fallback of htmlParseName() for names the ASCII accelerator cannot
 * handle. Characters are read one at a time with CUR_CHAR() and the
 * name goes on while they are letters, digits, '.', '-', '_', ':',
 * combining characters or extenders.
 *
 * The accumulated length is limited to XML_MAX_NAME_LENGTH, or
 * XML_MAX_TEXT_LENGTH when the XML_PARSE_HUGE option is set.
 *
 * Returns the Name parsed (looked up in the parser dictionary), or NULL
 * if the input does not start a name, if the name is too long, if the
 * final sanity check on the input buffer fails or if the dictionary
 * lookup fails.
 */
static const xmlChar *
htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
    int len = 0, l;
    int c;
    int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
                    XML_MAX_TEXT_LENGTH :
                    XML_MAX_NAME_LENGTH;
    /* remembered to detect a relocation of the input buffer below */
    const xmlChar *base = ctxt->input->base;
    const xmlChar *ret;

    /*
     * Handler for more complex cases
     */
    /* presumably CUR_CHAR() stores the byte length of the char in l --
       TODO confirm against the macro definition */
    c = CUR_CHAR(l);
    if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
        (!IS_LETTER(c) && (c != '_') &&
         (c != ':'))) {
        return(NULL);
    }

    while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
           ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
            (c == '.') || (c == '-') ||
            (c == '_') || (c == ':') ||
            (IS_COMBINING(c)) ||
            (IS_EXTENDER(c)))) {
        /* len is the amount subtracted from input->cur at the end to
           find the start of the name */
        len += l;
        if (len > maxLength) {
            htmlParseErr(ctxt, XML_ERR_NAME_TOO_LONG, "name too long", NULL, NULL);
            return(NULL);
        }
        NEXTL(l);
        c = CUR_CHAR(l);
        if (ctxt->input->base != base) {
            /*
             * We changed encoding from an unknown encoding
             * Input buffer changed location, so we better start again
             */
            return(htmlParseNameComplex(ctxt));
        }
    }

    /* make sure cur - len below cannot point before the buffer start */
    if (ctxt->input->cur - ctxt->input->base < len) {
        /* Sanity check */
        htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                     "unexpected change of input buffer", NULL, NULL);
        return (NULL);
    }

    ret = xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len);
    if (ret == NULL)
        htmlErrMemory(ctxt);

    return(ret);
}
2615
2616
2617 /**
2618 * htmlParseHTMLAttribute:
2619 * @ctxt: an HTML parser context
2620 * @stop: a char stop value
2621 *
2622 * parse an HTML attribute value till the stop (quote), if
2623 * stop is 0 then it stops at the first space
2624 *
2625 * Returns the attribute parsed or NULL
2626 */
2627
2628 static xmlChar *
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt,const xmlChar stop)2629 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2630 xmlChar *buffer = NULL;
2631 int buffer_size = 0;
2632 int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2633 XML_MAX_HUGE_LENGTH :
2634 XML_MAX_TEXT_LENGTH;
2635 xmlChar *out = NULL;
2636 const xmlChar *name = NULL;
2637 const xmlChar *cur = NULL;
2638 const htmlEntityDesc * ent;
2639
2640 /*
2641 * allocate a translation buffer.
2642 */
2643 buffer_size = HTML_PARSER_BUFFER_SIZE;
2644 buffer = (xmlChar *) xmlMallocAtomic(buffer_size);
2645 if (buffer == NULL) {
2646 htmlErrMemory(ctxt);
2647 return(NULL);
2648 }
2649 out = buffer;
2650
2651 /*
2652 * Ok loop until we reach one of the ending chars
2653 */
2654 while ((PARSER_STOPPED(ctxt) == 0) &&
2655 (CUR != 0) && (CUR != stop)) {
2656 if ((stop == 0) && (CUR == '>')) break;
2657 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2658 if (CUR == '&') {
2659 if (NXT(1) == '#') {
2660 unsigned int c;
2661 int bits;
2662
2663 c = htmlParseCharRef(ctxt);
2664 if (c < 0x80)
2665 { *out++ = c; bits= -6; }
2666 else if (c < 0x800)
2667 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2668 else if (c < 0x10000)
2669 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2670 else
2671 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2672
2673 for ( ; bits >= 0; bits-= 6) {
2674 *out++ = ((c >> bits) & 0x3F) | 0x80;
2675 }
2676
2677 if (out - buffer > buffer_size - 100) {
2678 int indx = out - buffer;
2679
2680 growBuffer(buffer);
2681 out = &buffer[indx];
2682 }
2683 } else {
2684 ent = htmlParseEntityRef(ctxt, &name);
2685 if (name == NULL) {
2686 *out++ = '&';
2687 if (out - buffer > buffer_size - 100) {
2688 int indx = out - buffer;
2689
2690 growBuffer(buffer);
2691 out = &buffer[indx];
2692 }
2693 } else if (ent == NULL) {
2694 *out++ = '&';
2695 cur = name;
2696 while (*cur != 0) {
2697 if (out - buffer > buffer_size - 100) {
2698 int indx = out - buffer;
2699
2700 growBuffer(buffer);
2701 out = &buffer[indx];
2702 }
2703 *out++ = *cur++;
2704 }
2705 } else {
2706 unsigned int c;
2707 int bits;
2708
2709 if (out - buffer > buffer_size - 100) {
2710 int indx = out - buffer;
2711
2712 growBuffer(buffer);
2713 out = &buffer[indx];
2714 }
2715 c = ent->value;
2716 if (c < 0x80)
2717 { *out++ = c; bits= -6; }
2718 else if (c < 0x800)
2719 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2720 else if (c < 0x10000)
2721 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2722 else
2723 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2724
2725 for ( ; bits >= 0; bits-= 6) {
2726 *out++ = ((c >> bits) & 0x3F) | 0x80;
2727 }
2728 }
2729 }
2730 } else {
2731 unsigned int c;
2732 int bits, l;
2733
2734 if (out - buffer > buffer_size - 100) {
2735 int indx = out - buffer;
2736
2737 growBuffer(buffer);
2738 out = &buffer[indx];
2739 }
2740 c = CUR_CHAR(l);
2741 if (c < 0x80)
2742 { *out++ = c; bits= -6; }
2743 else if (c < 0x800)
2744 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2745 else if (c < 0x10000)
2746 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2747 else
2748 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2749
2750 for ( ; bits >= 0; bits-= 6) {
2751 *out++ = ((c >> bits) & 0x3F) | 0x80;
2752 }
2753 NEXTL(l);
2754 }
2755 if (out - buffer > maxLength) {
2756 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2757 "attribute value too long\n", NULL, NULL);
2758 xmlFree(buffer);
2759 return(NULL);
2760 }
2761 }
2762 *out = 0;
2763 return(buffer);
2764 }
2765
2766 /**
2767 * htmlParseEntityRef:
2768 * @ctxt: an HTML parser context
2769 * @str: location to store the entity name
2770 *
2771 * DEPRECATED: Internal function, don't use.
2772 *
2773 * parse an HTML ENTITY references
2774 *
2775 * [68] EntityRef ::= '&' Name ';'
2776 *
2777 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2778 * if non-NULL *str will have to be freed by the caller.
2779 */
2780 const htmlEntityDesc *
htmlParseEntityRef(htmlParserCtxtPtr ctxt,const xmlChar ** str)2781 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2782 const xmlChar *name;
2783 const htmlEntityDesc * ent = NULL;
2784
2785 if (str != NULL) *str = NULL;
2786 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2787
2788 if (CUR == '&') {
2789 NEXT;
2790 name = htmlParseName(ctxt);
2791 if (name == NULL) {
2792 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2793 "htmlParseEntityRef: no name\n", NULL, NULL);
2794 } else {
2795 GROW;
2796 if (CUR == ';') {
2797 if (str != NULL)
2798 *str = name;
2799
2800 /*
2801 * Lookup the entity in the table.
2802 */
2803 ent = htmlEntityLookup(name);
2804 if (ent != NULL) /* OK that's ugly !!! */
2805 NEXT;
2806 } else {
2807 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2808 "htmlParseEntityRef: expecting ';'\n",
2809 NULL, NULL);
2810 if (str != NULL)
2811 *str = name;
2812 }
2813 }
2814 }
2815 return(ent);
2816 }
2817
2818 /**
2819 * htmlParseAttValue:
2820 * @ctxt: an HTML parser context
2821 *
2822 * parse a value for an attribute
2823 * Note: the parser won't do substitution of entities here, this
2824 * will be handled later in xmlStringGetNodeList, unless it was
2825 * asked for ctxt->replaceEntities != 0
2826 *
2827 * Returns the AttValue parsed or NULL.
2828 */
2829
2830 static xmlChar *
htmlParseAttValue(htmlParserCtxtPtr ctxt)2831 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2832 xmlChar *ret = NULL;
2833
2834 if (CUR == '"') {
2835 NEXT;
2836 ret = htmlParseHTMLAttribute(ctxt, '"');
2837 if (CUR != '"') {
2838 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2839 "AttValue: \" expected\n", NULL, NULL);
2840 } else
2841 NEXT;
2842 } else if (CUR == '\'') {
2843 NEXT;
2844 ret = htmlParseHTMLAttribute(ctxt, '\'');
2845 if (CUR != '\'') {
2846 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2847 "AttValue: ' expected\n", NULL, NULL);
2848 } else
2849 NEXT;
2850 } else {
2851 /*
2852 * That's an HTMLism, the attribute value may not be quoted
2853 */
2854 ret = htmlParseHTMLAttribute(ctxt, 0);
2855 if (ret == NULL) {
2856 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2857 "AttValue: no value found\n", NULL, NULL);
2858 }
2859 }
2860 return(ret);
2861 }
2862
2863 /**
2864 * htmlParseSystemLiteral:
2865 * @ctxt: an HTML parser context
2866 *
2867 * parse an HTML Literal
2868 *
2869 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2870 *
2871 * Returns the SystemLiteral parsed or NULL
2872 */
2873
2874 static xmlChar *
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt)2875 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2876 size_t len = 0, startPosition = 0;
2877 int err = 0;
2878 int quote;
2879 xmlChar *ret = NULL;
2880
2881 if ((CUR != '"') && (CUR != '\'')) {
2882 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2883 "SystemLiteral \" or ' expected\n", NULL, NULL);
2884 return(NULL);
2885 }
2886 quote = CUR;
2887 NEXT;
2888
2889 if (CUR_PTR < BASE_PTR)
2890 return(ret);
2891 startPosition = CUR_PTR - BASE_PTR;
2892
2893 while ((PARSER_STOPPED(ctxt) == 0) &&
2894 (CUR != 0) && (CUR != quote)) {
2895 /* TODO: Handle UTF-8 */
2896 if (!IS_CHAR_CH(CUR)) {
2897 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2898 "Invalid char in SystemLiteral 0x%X\n", CUR);
2899 err = 1;
2900 }
2901 NEXT;
2902 len++;
2903 }
2904 if (CUR != quote) {
2905 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2906 "Unfinished SystemLiteral\n", NULL, NULL);
2907 } else {
2908 if (err == 0) {
2909 ret = xmlStrndup((BASE_PTR+startPosition), len);
2910 if (ret == NULL) {
2911 htmlErrMemory(ctxt);
2912 return(NULL);
2913 }
2914 }
2915 NEXT;
2916 }
2917
2918 return(ret);
2919 }
2920
2921 /**
2922 * htmlParsePubidLiteral:
2923 * @ctxt: an HTML parser context
2924 *
2925 * parse an HTML public literal
2926 *
2927 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2928 *
2929 * Returns the PubidLiteral parsed or NULL.
2930 */
2931
2932 static xmlChar *
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt)2933 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2934 size_t len = 0, startPosition = 0;
2935 int err = 0;
2936 int quote;
2937 xmlChar *ret = NULL;
2938
2939 if ((CUR != '"') && (CUR != '\'')) {
2940 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2941 "PubidLiteral \" or ' expected\n", NULL, NULL);
2942 return(NULL);
2943 }
2944 quote = CUR;
2945 NEXT;
2946
2947 /*
2948 * Name ::= (Letter | '_') (NameChar)*
2949 */
2950 if (CUR_PTR < BASE_PTR)
2951 return(ret);
2952 startPosition = CUR_PTR - BASE_PTR;
2953
2954 while ((PARSER_STOPPED(ctxt) == 0) &&
2955 (CUR != 0) && (CUR != quote)) {
2956 if (!IS_PUBIDCHAR_CH(CUR)) {
2957 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2958 "Invalid char in PubidLiteral 0x%X\n", CUR);
2959 err = 1;
2960 }
2961 len++;
2962 NEXT;
2963 }
2964
2965 if (CUR != quote) {
2966 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2967 "Unfinished PubidLiteral\n", NULL, NULL);
2968 } else {
2969 if (err == 0) {
2970 ret = xmlStrndup((BASE_PTR + startPosition), len);
2971 if (ret == NULL) {
2972 htmlErrMemory(ctxt);
2973 return(NULL);
2974 }
2975 }
2976 NEXT;
2977 }
2978
2979 return(ret);
2980 }
2981
2982 /**
2983 * htmlParseScript:
2984 * @ctxt: an HTML parser context
2985 *
2986 * parse the content of an HTML SCRIPT or STYLE element
2987 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2988 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2989 * http://www.w3.org/TR/html4/types.html#type-script
2990 * http://www.w3.org/TR/html4/types.html#h-6.15
2991 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2992 *
2993 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2994 * element and the value of intrinsic event attributes. User agents must
2995 * not evaluate script data as HTML markup but instead must pass it on as
2996 * data to a script engine.
2997 * NOTES:
2998 * - The content is passed like CDATA
2999 * - the attributes for style and scripting "onXXX" are also described
3000 * as CDATA but SGML allows entities references in attributes so their
3001 * processing is identical as other attributes
3002 */
3003 static void
htmlParseScript(htmlParserCtxtPtr ctxt)3004 htmlParseScript(htmlParserCtxtPtr ctxt) {
3005 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3006 int nbchar = 0;
3007 int cur,l;
3008
3009 cur = CUR_CHAR(l);
3010 while (cur != 0) {
3011 if ((cur == '<') && (NXT(1) == '/')) {
3012 /*
3013 * One should break here, the specification is clear:
3014 * Authors should therefore escape "</" within the content.
3015 * Escape mechanisms are specific to each scripting or
3016 * style sheet language.
3017 *
3018 * In recovery mode, only break if end tag match the
3019 * current tag, effectively ignoring all tags inside the
3020 * script/style block and treating the entire block as
3021 * CDATA.
3022 */
3023 if (ctxt->recovery) {
3024 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3025 xmlStrlen(ctxt->name)) == 0)
3026 {
3027 break; /* while */
3028 } else {
3029 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3030 "Element %s embeds close tag\n",
3031 ctxt->name, NULL);
3032 }
3033 } else {
3034 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3035 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3036 {
3037 break; /* while */
3038 }
3039 }
3040 }
3041 if (IS_CHAR(cur)) {
3042 COPY_BUF(l,buf,nbchar,cur);
3043 } else {
3044 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3045 "Invalid char in CDATA 0x%X\n", cur);
3046 }
3047 NEXTL(l);
3048 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3049 buf[nbchar] = 0;
3050 if (ctxt->sax->cdataBlock!= NULL) {
3051 /*
3052 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3053 */
3054 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3055 } else if (ctxt->sax->characters != NULL) {
3056 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3057 }
3058 nbchar = 0;
3059 SHRINK;
3060 }
3061 cur = CUR_CHAR(l);
3062 }
3063
3064 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3065 buf[nbchar] = 0;
3066 if (ctxt->sax->cdataBlock!= NULL) {
3067 /*
3068 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3069 */
3070 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3071 } else if (ctxt->sax->characters != NULL) {
3072 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3073 }
3074 }
3075 }
3076
3077
3078 /**
3079 * htmlParseCharDataInternal:
3080 * @ctxt: an HTML parser context
3081 * @readahead: optional read ahead character in ascii range
3082 *
3083 * parse a CharData section.
3084 * if we are within a CDATA section ']]>' marks an end of section.
3085 *
3086 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3087 */
3088
3089 static void
htmlParseCharDataInternal(htmlParserCtxtPtr ctxt,int readahead)3090 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3091 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3092 int nbchar = 0;
3093 int cur, l;
3094
3095 if (readahead)
3096 buf[nbchar++] = readahead;
3097
3098 cur = CUR_CHAR(l);
3099 while ((cur != '<') &&
3100 (cur != '&') &&
3101 (cur != 0) &&
3102 (!PARSER_STOPPED(ctxt))) {
3103 if (!(IS_CHAR(cur))) {
3104 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3105 "Invalid char in CDATA 0x%X\n", cur);
3106 } else {
3107 COPY_BUF(l,buf,nbchar,cur);
3108 }
3109 NEXTL(l);
3110 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3111 buf[nbchar] = 0;
3112
3113 /*
3114 * Ok the segment is to be consumed as chars.
3115 */
3116 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3117 if (areBlanks(ctxt, buf, nbchar)) {
3118 if (ctxt->keepBlanks) {
3119 if (ctxt->sax->characters != NULL)
3120 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3121 } else {
3122 if (ctxt->sax->ignorableWhitespace != NULL)
3123 ctxt->sax->ignorableWhitespace(ctxt->userData,
3124 buf, nbchar);
3125 }
3126 } else {
3127 htmlCheckParagraph(ctxt);
3128 if (ctxt->sax->characters != NULL)
3129 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3130 }
3131 }
3132 nbchar = 0;
3133 SHRINK;
3134 }
3135 cur = CUR_CHAR(l);
3136 }
3137 if (nbchar != 0) {
3138 buf[nbchar] = 0;
3139
3140 /*
3141 * Ok the segment is to be consumed as chars.
3142 */
3143 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3144 if (areBlanks(ctxt, buf, nbchar)) {
3145 if (ctxt->keepBlanks) {
3146 if (ctxt->sax->characters != NULL)
3147 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3148 } else {
3149 if (ctxt->sax->ignorableWhitespace != NULL)
3150 ctxt->sax->ignorableWhitespace(ctxt->userData,
3151 buf, nbchar);
3152 }
3153 } else {
3154 htmlCheckParagraph(ctxt);
3155 if (ctxt->sax->characters != NULL)
3156 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3157 }
3158 }
3159 }
3160 }
3161
3162 /**
3163 * htmlParseCharData:
3164 * @ctxt: an HTML parser context
3165 *
3166 * parse a CharData section.
3167 * if we are within a CDATA section ']]>' marks an end of section.
3168 *
3169 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3170 */
3171
3172 static void
htmlParseCharData(htmlParserCtxtPtr ctxt)3173 htmlParseCharData(htmlParserCtxtPtr ctxt) {
3174 htmlParseCharDataInternal(ctxt, 0);
3175 }
3176
3177 /**
3178 * htmlParseExternalID:
3179 * @ctxt: an HTML parser context
3180 * @publicID: a xmlChar** receiving PubidLiteral
3181 *
3182 * Parse an External ID or a Public ID
3183 *
3184 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3185 * | 'PUBLIC' S PubidLiteral S SystemLiteral
3186 *
3187 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3188 *
3189 * Returns the function returns SystemLiteral and in the second
3190 * case publicID receives PubidLiteral, is strict is off
3191 * it is possible to return NULL and have publicID set.
3192 */
3193
3194 static xmlChar *
htmlParseExternalID(htmlParserCtxtPtr ctxt,xmlChar ** publicID)3195 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3196 xmlChar *URI = NULL;
3197
3198 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3199 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3200 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3201 SKIP(6);
3202 if (!IS_BLANK_CH(CUR)) {
3203 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3204 "Space required after 'SYSTEM'\n", NULL, NULL);
3205 }
3206 SKIP_BLANKS;
3207 URI = htmlParseSystemLiteral(ctxt);
3208 if (URI == NULL) {
3209 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3210 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3211 }
3212 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3213 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3214 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3215 SKIP(6);
3216 if (!IS_BLANK_CH(CUR)) {
3217 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3218 "Space required after 'PUBLIC'\n", NULL, NULL);
3219 }
3220 SKIP_BLANKS;
3221 *publicID = htmlParsePubidLiteral(ctxt);
3222 if (*publicID == NULL) {
3223 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3224 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3225 NULL, NULL);
3226 }
3227 SKIP_BLANKS;
3228 if ((CUR == '"') || (CUR == '\'')) {
3229 URI = htmlParseSystemLiteral(ctxt);
3230 }
3231 }
3232 return(URI);
3233 }
3234
3235 /**
3236 * htmlParsePI:
3237 * @ctxt: an HTML parser context
3238 *
3239 * Parse an XML Processing Instruction. HTML5 doesn't allow processing
3240 * instructions, so this will be removed at some point.
3241 */
3242 static void
htmlParsePI(htmlParserCtxtPtr ctxt)3243 htmlParsePI(htmlParserCtxtPtr ctxt) {
3244 xmlChar *buf = NULL;
3245 int len = 0;
3246 int size = HTML_PARSER_BUFFER_SIZE;
3247 int cur, l;
3248 int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
3249 XML_MAX_HUGE_LENGTH :
3250 XML_MAX_TEXT_LENGTH;
3251 const xmlChar *target;
3252 xmlParserInputState state;
3253
3254 if ((RAW == '<') && (NXT(1) == '?')) {
3255 state = ctxt->instate;
3256 ctxt->instate = XML_PARSER_PI;
3257 /*
3258 * this is a Processing Instruction.
3259 */
3260 SKIP(2);
3261
3262 /*
3263 * Parse the target name and check for special support like
3264 * namespace.
3265 */
3266 target = htmlParseName(ctxt);
3267 if (target != NULL) {
3268 if (RAW == '>') {
3269 SKIP(1);
3270
3271 /*
3272 * SAX: PI detected.
3273 */
3274 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3275 (ctxt->sax->processingInstruction != NULL))
3276 ctxt->sax->processingInstruction(ctxt->userData,
3277 target, NULL);
3278 goto done;
3279 }
3280 buf = (xmlChar *) xmlMallocAtomic(size);
3281 if (buf == NULL) {
3282 htmlErrMemory(ctxt);
3283 return;
3284 }
3285 cur = CUR;
3286 if (!IS_BLANK(cur)) {
3287 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3288 "ParsePI: PI %s space expected\n", target, NULL);
3289 }
3290 SKIP_BLANKS;
3291 cur = CUR_CHAR(l);
3292 while ((cur != 0) && (cur != '>')) {
3293 if (len + 5 >= size) {
3294 xmlChar *tmp;
3295
3296 size *= 2;
3297 tmp = (xmlChar *) xmlRealloc(buf, size);
3298 if (tmp == NULL) {
3299 htmlErrMemory(ctxt);
3300 xmlFree(buf);
3301 return;
3302 }
3303 buf = tmp;
3304 }
3305 if (IS_CHAR(cur)) {
3306 COPY_BUF(l,buf,len,cur);
3307 } else {
3308 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3309 "Invalid char in processing instruction "
3310 "0x%X\n", cur);
3311 }
3312 if (len > maxLength) {
3313 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3314 "PI %s too long", target, NULL);
3315 xmlFree(buf);
3316 goto done;
3317 }
3318 NEXTL(l);
3319 cur = CUR_CHAR(l);
3320 }
3321 buf[len] = 0;
3322 if (cur != '>') {
3323 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3324 "ParsePI: PI %s never end ...\n", target, NULL);
3325 } else {
3326 SKIP(1);
3327
3328 /*
3329 * SAX: PI detected.
3330 */
3331 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3332 (ctxt->sax->processingInstruction != NULL))
3333 ctxt->sax->processingInstruction(ctxt->userData,
3334 target, buf);
3335 }
3336 xmlFree(buf);
3337 } else {
3338 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3339 "PI is not started correctly", NULL, NULL);
3340 }
3341
3342 done:
3343 ctxt->instate = state;
3344 }
3345 }
3346
3347 /**
3348 * htmlParseComment:
3349 * @ctxt: an HTML parser context
3350 *
3351 * Parse an HTML comment
3352 */
3353 static void
htmlParseComment(htmlParserCtxtPtr ctxt)3354 htmlParseComment(htmlParserCtxtPtr ctxt) {
3355 xmlChar *buf = NULL;
3356 int len;
3357 int size = HTML_PARSER_BUFFER_SIZE;
3358 int q, ql;
3359 int r, rl;
3360 int cur, l;
3361 int next, nl;
3362 int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
3363 XML_MAX_HUGE_LENGTH :
3364 XML_MAX_TEXT_LENGTH;
3365 xmlParserInputState state;
3366
3367 /*
3368 * Check that there is a comment right here.
3369 */
3370 if ((RAW != '<') || (NXT(1) != '!') ||
3371 (NXT(2) != '-') || (NXT(3) != '-')) return;
3372
3373 state = ctxt->instate;
3374 ctxt->instate = XML_PARSER_COMMENT;
3375 SKIP(4);
3376 buf = (xmlChar *) xmlMallocAtomic(size);
3377 if (buf == NULL) {
3378 htmlErrMemory(ctxt);
3379 return;
3380 }
3381 len = 0;
3382 buf[len] = 0;
3383 q = CUR_CHAR(ql);
3384 if (q == 0)
3385 goto unfinished;
3386 if (q == '>') {
3387 htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3388 cur = '>';
3389 goto finished;
3390 }
3391 NEXTL(ql);
3392 r = CUR_CHAR(rl);
3393 if (r == 0)
3394 goto unfinished;
3395 if (q == '-' && r == '>') {
3396 htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3397 cur = '>';
3398 goto finished;
3399 }
3400 NEXTL(rl);
3401 cur = CUR_CHAR(l);
3402 while ((cur != 0) &&
3403 ((cur != '>') ||
3404 (r != '-') || (q != '-'))) {
3405 NEXTL(l);
3406 next = CUR_CHAR(nl);
3407
3408 if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3409 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3410 "Comment incorrectly closed by '--!>'", NULL, NULL);
3411 cur = '>';
3412 break;
3413 }
3414
3415 if (len + 5 >= size) {
3416 xmlChar *tmp;
3417
3418 size *= 2;
3419 tmp = (xmlChar *) xmlRealloc(buf, size);
3420 if (tmp == NULL) {
3421 xmlFree(buf);
3422 htmlErrMemory(ctxt);
3423 return;
3424 }
3425 buf = tmp;
3426 }
3427 if (IS_CHAR(q)) {
3428 COPY_BUF(ql,buf,len,q);
3429 } else {
3430 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3431 "Invalid char in comment 0x%X\n", q);
3432 }
3433 if (len > maxLength) {
3434 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3435 "comment too long", NULL, NULL);
3436 xmlFree(buf);
3437 ctxt->instate = state;
3438 return;
3439 }
3440
3441 q = r;
3442 ql = rl;
3443 r = cur;
3444 rl = l;
3445 cur = next;
3446 l = nl;
3447 }
3448 finished:
3449 buf[len] = 0;
3450 if (cur == '>') {
3451 NEXT;
3452 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3453 (!ctxt->disableSAX))
3454 ctxt->sax->comment(ctxt->userData, buf);
3455 xmlFree(buf);
3456 ctxt->instate = state;
3457 return;
3458 }
3459
3460 unfinished:
3461 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3462 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3463 xmlFree(buf);
3464 }
3465
3466 /**
3467 * htmlParseCharRef:
3468 * @ctxt: an HTML parser context
3469 *
3470 * DEPRECATED: Internal function, don't use.
3471 *
3472 * parse Reference declarations
3473 *
3474 * [66] CharRef ::= '&#' [0-9]+ ';' |
3475 * '&#x' [0-9a-fA-F]+ ';'
3476 *
3477 * Returns the value parsed (as an int)
3478 */
3479 int
htmlParseCharRef(htmlParserCtxtPtr ctxt)3480 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3481 int val = 0;
3482
3483 if ((ctxt == NULL) || (ctxt->input == NULL))
3484 return(0);
3485 if ((CUR == '&') && (NXT(1) == '#') &&
3486 ((NXT(2) == 'x') || NXT(2) == 'X')) {
3487 SKIP(3);
3488 while (CUR != ';') {
3489 if ((CUR >= '0') && (CUR <= '9')) {
3490 if (val < 0x110000)
3491 val = val * 16 + (CUR - '0');
3492 } else if ((CUR >= 'a') && (CUR <= 'f')) {
3493 if (val < 0x110000)
3494 val = val * 16 + (CUR - 'a') + 10;
3495 } else if ((CUR >= 'A') && (CUR <= 'F')) {
3496 if (val < 0x110000)
3497 val = val * 16 + (CUR - 'A') + 10;
3498 } else {
3499 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3500 "htmlParseCharRef: missing semicolon\n",
3501 NULL, NULL);
3502 break;
3503 }
3504 NEXT;
3505 }
3506 if (CUR == ';')
3507 NEXT;
3508 } else if ((CUR == '&') && (NXT(1) == '#')) {
3509 SKIP(2);
3510 while (CUR != ';') {
3511 if ((CUR >= '0') && (CUR <= '9')) {
3512 if (val < 0x110000)
3513 val = val * 10 + (CUR - '0');
3514 } else {
3515 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3516 "htmlParseCharRef: missing semicolon\n",
3517 NULL, NULL);
3518 break;
3519 }
3520 NEXT;
3521 }
3522 if (CUR == ';')
3523 NEXT;
3524 } else {
3525 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3526 "htmlParseCharRef: invalid value\n", NULL, NULL);
3527 }
3528 /*
3529 * Check the value IS_CHAR ...
3530 */
3531 if (IS_CHAR(val)) {
3532 return(val);
3533 } else if (val >= 0x110000) {
3534 htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3535 "htmlParseCharRef: value too large\n", NULL, NULL);
3536 } else {
3537 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3538 "htmlParseCharRef: invalid xmlChar value %d\n",
3539 val);
3540 }
3541 return(0);
3542 }
3543
3544
3545 /**
3546 * htmlParseDocTypeDecl:
3547 * @ctxt: an HTML parser context
3548 *
3549 * parse a DOCTYPE declaration
3550 *
3551 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3552 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3553 */
3554
3555 static void
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt)3556 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3557 const xmlChar *name;
3558 xmlChar *ExternalID = NULL;
3559 xmlChar *URI = NULL;
3560
3561 /*
3562 * We know that '<!DOCTYPE' has been detected.
3563 */
3564 SKIP(9);
3565
3566 SKIP_BLANKS;
3567
3568 /*
3569 * Parse the DOCTYPE name.
3570 */
3571 name = htmlParseName(ctxt);
3572 if (name == NULL) {
3573 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3574 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3575 NULL, NULL);
3576 }
3577 /*
3578 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3579 */
3580
3581 SKIP_BLANKS;
3582
3583 /*
3584 * Check for SystemID and ExternalID
3585 */
3586 URI = htmlParseExternalID(ctxt, &ExternalID);
3587 SKIP_BLANKS;
3588
3589 /*
3590 * We should be at the end of the DOCTYPE declaration.
3591 */
3592 if (CUR != '>') {
3593 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3594 "DOCTYPE improperly terminated\n", NULL, NULL);
3595 /* Ignore bogus content */
3596 while ((CUR != 0) && (CUR != '>') &&
3597 (PARSER_STOPPED(ctxt) == 0))
3598 NEXT;
3599 }
3600 if (CUR == '>')
3601 NEXT;
3602
3603 /*
3604 * Create or update the document accordingly to the DOCTYPE
3605 */
3606 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3607 (!ctxt->disableSAX))
3608 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3609
3610 /*
3611 * Cleanup, since we don't use all those identifiers
3612 */
3613 if (URI != NULL) xmlFree(URI);
3614 if (ExternalID != NULL) xmlFree(ExternalID);
3615 }
3616
3617 /**
3618 * htmlParseAttribute:
3619 * @ctxt: an HTML parser context
3620 * @value: a xmlChar ** used to store the value of the attribute
3621 *
3622 * parse an attribute
3623 *
3624 * [41] Attribute ::= Name Eq AttValue
3625 *
3626 * [25] Eq ::= S? '=' S?
3627 *
3628 * With namespace:
3629 *
3630 * [NS 11] Attribute ::= QName Eq AttValue
3631 *
3632 * Also the case QName == xmlns:??? is handled independently as a namespace
3633 * definition.
3634 *
3635 * Returns the attribute name, and the value in *value.
3636 */
3637
3638 static const xmlChar *
htmlParseAttribute(htmlParserCtxtPtr ctxt,xmlChar ** value)3639 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3640 const xmlChar *name;
3641 xmlChar *val = NULL;
3642
3643 *value = NULL;
3644 name = htmlParseHTMLName(ctxt);
3645 if (name == NULL) {
3646 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3647 "error parsing attribute name\n", NULL, NULL);
3648 return(NULL);
3649 }
3650
3651 /*
3652 * read the value
3653 */
3654 SKIP_BLANKS;
3655 if (CUR == '=') {
3656 NEXT;
3657 SKIP_BLANKS;
3658 val = htmlParseAttValue(ctxt);
3659 }
3660
3661 *value = val;
3662 return(name);
3663 }
3664
3665 /**
3666 * htmlCheckEncoding:
3667 * @ctxt: an HTML parser context
3668 * @attvalue: the attribute value
3669 *
3670 * Checks an http-equiv attribute from a Meta tag to detect
3671 * the encoding
3672 * If a new encoding is detected the parser is switched to decode
3673 * it and pass UTF8
3674 */
3675 static void
htmlCheckEncoding(htmlParserCtxtPtr ctxt,const xmlChar * attvalue)3676 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3677 const xmlChar *encoding;
3678 xmlChar *copy;
3679
3680 if (!attvalue)
3681 return;
3682
3683 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3684 if (encoding != NULL) {
3685 encoding += 7;
3686 }
3687 /*
3688 * skip blank
3689 */
3690 if (encoding && IS_BLANK_CH(*encoding))
3691 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3692 if (encoding && *encoding == '=') {
3693 encoding ++;
3694 copy = xmlStrdup(encoding);
3695 if (copy == NULL)
3696 htmlErrMemory(ctxt);
3697 xmlSetDeclaredEncoding(ctxt, copy);
3698 }
3699 }
3700
3701 /**
3702 * htmlCheckMeta:
3703 * @ctxt: an HTML parser context
3704 * @atts: the attributes values
3705 *
3706 * Checks an attributes from a Meta tag
3707 */
3708 static void
htmlCheckMeta(htmlParserCtxtPtr ctxt,const xmlChar ** atts)3709 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3710 int i;
3711 const xmlChar *att, *value;
3712 int http = 0;
3713 const xmlChar *content = NULL;
3714
3715 if ((ctxt == NULL) || (atts == NULL))
3716 return;
3717
3718 i = 0;
3719 att = atts[i++];
3720 while (att != NULL) {
3721 value = atts[i++];
3722 if (value != NULL) {
3723 if ((!xmlStrcasecmp(att, BAD_CAST "http-equiv")) &&
3724 (!xmlStrcasecmp(value, BAD_CAST "Content-Type"))) {
3725 http = 1;
3726 } else if (!xmlStrcasecmp(att, BAD_CAST "charset")) {
3727 xmlChar *copy;
3728
3729 copy = xmlStrdup(value);
3730 if (copy == NULL)
3731 htmlErrMemory(ctxt);
3732 xmlSetDeclaredEncoding(ctxt, copy);
3733 } else if (!xmlStrcasecmp(att, BAD_CAST "content")) {
3734 content = value;
3735 }
3736 }
3737 att = atts[i++];
3738 }
3739 if ((http) && (content != NULL))
3740 htmlCheckEncoding(ctxt, content);
3741
3742 }
3743
3744 /**
3745 * htmlParseStartTag:
3746 * @ctxt: an HTML parser context
3747 *
3748 * parse a start of tag either for rule element or
3749 * EmptyElement. In both case we don't parse the tag closing chars.
3750 *
3751 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3752 *
3753 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3754 *
3755 * With namespace:
3756 *
3757 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3758 *
3759 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3760 *
3761 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3762 */
3763
3764 static int
htmlParseStartTag(htmlParserCtxtPtr ctxt)3765 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3766 const xmlChar *name;
3767 const xmlChar *attname;
3768 xmlChar *attvalue;
3769 const xmlChar **atts;
3770 int nbatts = 0;
3771 int maxatts;
3772 int meta = 0;
3773 int i;
3774 int discardtag = 0;
3775
3776 if ((ctxt == NULL) || (ctxt->input == NULL))
3777 return -1;
3778 if (CUR != '<') return -1;
3779 NEXT;
3780
3781 atts = ctxt->atts;
3782 maxatts = ctxt->maxatts;
3783
3784 GROW;
3785 name = htmlParseHTMLName(ctxt);
3786 if (name == NULL) {
3787 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3788 "htmlParseStartTag: invalid element name\n",
3789 NULL, NULL);
3790 /* Dump the bogus tag like browsers do */
3791 while ((CUR != 0) && (CUR != '>') &&
3792 (PARSER_STOPPED(ctxt) == 0))
3793 NEXT;
3794 return -1;
3795 }
3796 if (xmlStrEqual(name, BAD_CAST"meta"))
3797 meta = 1;
3798
3799 /*
3800 * Check for auto-closure of HTML elements.
3801 */
3802 htmlAutoClose(ctxt, name);
3803
3804 /*
3805 * Check for implied HTML elements.
3806 */
3807 htmlCheckImplied(ctxt, name);
3808
3809 /*
3810 * Avoid html at any level > 0, head at any level != 1
3811 * or any attempt to recurse body
3812 */
3813 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3814 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3815 "htmlParseStartTag: misplaced <html> tag\n",
3816 name, NULL);
3817 discardtag = 1;
3818 ctxt->depth++;
3819 }
3820 if ((ctxt->nameNr != 1) &&
3821 (xmlStrEqual(name, BAD_CAST"head"))) {
3822 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3823 "htmlParseStartTag: misplaced <head> tag\n",
3824 name, NULL);
3825 discardtag = 1;
3826 ctxt->depth++;
3827 }
3828 if (xmlStrEqual(name, BAD_CAST"body")) {
3829 int indx;
3830 for (indx = 0;indx < ctxt->nameNr;indx++) {
3831 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3832 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3833 "htmlParseStartTag: misplaced <body> tag\n",
3834 name, NULL);
3835 discardtag = 1;
3836 ctxt->depth++;
3837 }
3838 }
3839 }
3840
3841 /*
3842 * Now parse the attributes, it ends up with the ending
3843 *
3844 * (S Attribute)* S?
3845 */
3846 SKIP_BLANKS;
3847 while ((CUR != 0) &&
3848 (CUR != '>') &&
3849 ((CUR != '/') || (NXT(1) != '>')) &&
3850 (PARSER_STOPPED(ctxt) == 0)) {
3851 GROW;
3852 attname = htmlParseAttribute(ctxt, &attvalue);
3853 if (attname != NULL) {
3854
3855 /*
3856 * Well formedness requires at most one declaration of an attribute
3857 */
3858 for (i = 0; i < nbatts;i += 2) {
3859 if (xmlStrEqual(atts[i], attname)) {
3860 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3861 "Attribute %s redefined\n", attname, NULL);
3862 if (attvalue != NULL)
3863 xmlFree(attvalue);
3864 goto failed;
3865 }
3866 }
3867
3868 /*
3869 * Add the pair to atts
3870 */
3871 if (atts == NULL) {
3872 maxatts = 22; /* allow for 10 attrs by default */
3873 atts = (const xmlChar **)
3874 xmlMalloc(maxatts * sizeof(xmlChar *));
3875 if (atts == NULL) {
3876 htmlErrMemory(ctxt);
3877 if (attvalue != NULL)
3878 xmlFree(attvalue);
3879 goto failed;
3880 }
3881 ctxt->atts = atts;
3882 ctxt->maxatts = maxatts;
3883 } else if (nbatts + 4 > maxatts) {
3884 const xmlChar **n;
3885
3886 maxatts *= 2;
3887 n = (const xmlChar **) xmlRealloc((void *) atts,
3888 maxatts * sizeof(const xmlChar *));
3889 if (n == NULL) {
3890 htmlErrMemory(ctxt);
3891 if (attvalue != NULL)
3892 xmlFree(attvalue);
3893 goto failed;
3894 }
3895 atts = n;
3896 ctxt->atts = atts;
3897 ctxt->maxatts = maxatts;
3898 }
3899 atts[nbatts++] = attname;
3900 atts[nbatts++] = attvalue;
3901 atts[nbatts] = NULL;
3902 atts[nbatts + 1] = NULL;
3903 }
3904 else {
3905 if (attvalue != NULL)
3906 xmlFree(attvalue);
3907 /* Dump the bogus attribute string up to the next blank or
3908 * the end of the tag. */
3909 while ((CUR != 0) &&
3910 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3911 ((CUR != '/') || (NXT(1) != '>')) &&
3912 (PARSER_STOPPED(ctxt) == 0))
3913 NEXT;
3914 }
3915
3916 failed:
3917 SKIP_BLANKS;
3918 }
3919
3920 /*
3921 * Handle specific association to the META tag
3922 */
3923 if (meta && (nbatts != 0))
3924 htmlCheckMeta(ctxt, atts);
3925
3926 /*
3927 * SAX: Start of Element !
3928 */
3929 if (!discardtag) {
3930 htmlnamePush(ctxt, name);
3931 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3932 if (nbatts != 0)
3933 ctxt->sax->startElement(ctxt->userData, name, atts);
3934 else
3935 ctxt->sax->startElement(ctxt->userData, name, NULL);
3936 }
3937 }
3938
3939 if (atts != NULL) {
3940 for (i = 1;i < nbatts;i += 2) {
3941 if (atts[i] != NULL)
3942 xmlFree((xmlChar *) atts[i]);
3943 }
3944 }
3945
3946 return(discardtag);
3947 }
3948
3949 /**
3950 * htmlParseEndTag:
3951 * @ctxt: an HTML parser context
3952 *
3953 * parse an end of tag
3954 *
3955 * [42] ETag ::= '</' Name S? '>'
3956 *
3957 * With namespace
3958 *
3959 * [NS 9] ETag ::= '</' QName S? '>'
3960 *
3961 * Returns 1 if the current level should be closed.
3962 */
3963
3964 static int
htmlParseEndTag(htmlParserCtxtPtr ctxt)3965 htmlParseEndTag(htmlParserCtxtPtr ctxt)
3966 {
3967 const xmlChar *name;
3968 const xmlChar *oldname;
3969 int i, ret;
3970
3971 if ((CUR != '<') || (NXT(1) != '/')) {
3972 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3973 "htmlParseEndTag: '</' not found\n", NULL, NULL);
3974 return (0);
3975 }
3976 SKIP(2);
3977
3978 name = htmlParseHTMLName(ctxt);
3979 if (name == NULL)
3980 return (0);
3981 /*
3982 * We should definitely be at the ending "S? '>'" part
3983 */
3984 SKIP_BLANKS;
3985 if (CUR != '>') {
3986 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3987 "End tag : expected '>'\n", NULL, NULL);
3988 /* Skip to next '>' */
3989 while ((PARSER_STOPPED(ctxt) == 0) &&
3990 (CUR != 0) && (CUR != '>'))
3991 NEXT;
3992 }
3993 if (CUR == '>')
3994 NEXT;
3995
3996 /*
3997 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3998 * out now.
3999 */
4000 if ((ctxt->depth > 0) &&
4001 (xmlStrEqual(name, BAD_CAST "html") ||
4002 xmlStrEqual(name, BAD_CAST "body") ||
4003 xmlStrEqual(name, BAD_CAST "head"))) {
4004 ctxt->depth--;
4005 return (0);
4006 }
4007
4008 /*
4009 * If the name read is not one of the element in the parsing stack
4010 * then return, it's just an error.
4011 */
4012 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4013 if (xmlStrEqual(name, ctxt->nameTab[i]))
4014 break;
4015 }
4016 if (i < 0) {
4017 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4018 "Unexpected end tag : %s\n", name, NULL);
4019 return (0);
4020 }
4021
4022
4023 /*
4024 * Check for auto-closure of HTML elements.
4025 */
4026
4027 htmlAutoCloseOnClose(ctxt, name);
4028
4029 /*
4030 * Well formedness constraints, opening and closing must match.
4031 * With the exception that the autoclose may have popped stuff out
4032 * of the stack.
4033 */
4034 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4035 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4036 "Opening and ending tag mismatch: %s and %s\n",
4037 name, ctxt->name);
4038 }
4039
4040 /*
4041 * SAX: End of Tag
4042 */
4043 oldname = ctxt->name;
4044 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4045 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4046 ctxt->sax->endElement(ctxt->userData, name);
4047 htmlNodeInfoPop(ctxt);
4048 htmlnamePop(ctxt);
4049 ret = 1;
4050 } else {
4051 ret = 0;
4052 }
4053
4054 return (ret);
4055 }
4056
4057
4058 /**
4059 * htmlParseReference:
4060 * @ctxt: an HTML parser context
4061 *
4062 * parse and handle entity references in content,
4063 * this will end-up in a call to character() since this is either a
4064 * CharRef, or a predefined entity.
4065 */
4066 static void
htmlParseReference(htmlParserCtxtPtr ctxt)4067 htmlParseReference(htmlParserCtxtPtr ctxt) {
4068 const htmlEntityDesc * ent;
4069 xmlChar out[6];
4070 const xmlChar *name;
4071 if (CUR != '&') return;
4072
4073 if (NXT(1) == '#') {
4074 unsigned int c;
4075 int bits, i = 0;
4076
4077 c = htmlParseCharRef(ctxt);
4078 if (c == 0)
4079 return;
4080
4081 if (c < 0x80) { out[i++]= c; bits= -6; }
4082 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4083 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4084 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4085
4086 for ( ; bits >= 0; bits-= 6) {
4087 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4088 }
4089 out[i] = 0;
4090
4091 htmlCheckParagraph(ctxt);
4092 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4093 ctxt->sax->characters(ctxt->userData, out, i);
4094 } else {
4095 ent = htmlParseEntityRef(ctxt, &name);
4096 if (name == NULL) {
4097 htmlCheckParagraph(ctxt);
4098 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4099 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4100 return;
4101 }
4102 if ((ent == NULL) || !(ent->value > 0)) {
4103 htmlCheckParagraph(ctxt);
4104 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4105 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4106 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4107 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4108 }
4109 } else {
4110 unsigned int c;
4111 int bits, i = 0;
4112
4113 c = ent->value;
4114 if (c < 0x80)
4115 { out[i++]= c; bits= -6; }
4116 else if (c < 0x800)
4117 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4118 else if (c < 0x10000)
4119 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4120 else
4121 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4122
4123 for ( ; bits >= 0; bits-= 6) {
4124 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4125 }
4126 out[i] = 0;
4127
4128 htmlCheckParagraph(ctxt);
4129 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4130 ctxt->sax->characters(ctxt->userData, out, i);
4131 }
4132 }
4133 }
4134
4135 /**
4136 * htmlParseContent:
4137 * @ctxt: an HTML parser context
4138 *
4139 * Parse a content: comment, sub-element, reference or text.
4140 * Kept for compatibility with old code
4141 */
4142
4143 static void
htmlParseContent(htmlParserCtxtPtr ctxt)4144 htmlParseContent(htmlParserCtxtPtr ctxt) {
4145 xmlChar *currentNode;
4146 int depth;
4147 const xmlChar *name;
4148
4149 currentNode = xmlStrdup(ctxt->name);
4150 depth = ctxt->nameNr;
4151 while (!PARSER_STOPPED(ctxt)) {
4152 GROW;
4153
4154 /*
4155 * Our tag or one of it's parent or children is ending.
4156 */
4157 if ((CUR == '<') && (NXT(1) == '/')) {
4158 if (htmlParseEndTag(ctxt) &&
4159 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4160 if (currentNode != NULL)
4161 xmlFree(currentNode);
4162 return;
4163 }
4164 continue; /* while */
4165 }
4166
4167 else if ((CUR == '<') &&
4168 ((IS_ASCII_LETTER(NXT(1))) ||
4169 (NXT(1) == '_') || (NXT(1) == ':'))) {
4170 name = htmlParseHTMLName_nonInvasive(ctxt);
4171 if (name == NULL) {
4172 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4173 "htmlParseStartTag: invalid element name\n",
4174 NULL, NULL);
4175 /* Dump the bogus tag like browsers do */
4176 while ((CUR != 0) && (CUR != '>'))
4177 NEXT;
4178
4179 if (currentNode != NULL)
4180 xmlFree(currentNode);
4181 return;
4182 }
4183
4184 if (ctxt->name != NULL) {
4185 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4186 htmlAutoClose(ctxt, name);
4187 continue;
4188 }
4189 }
4190 }
4191
4192 /*
4193 * Has this node been popped out during parsing of
4194 * the next element
4195 */
4196 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4197 (!xmlStrEqual(currentNode, ctxt->name)))
4198 {
4199 if (currentNode != NULL) xmlFree(currentNode);
4200 return;
4201 }
4202
4203 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4204 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4205 /*
4206 * Handle SCRIPT/STYLE separately
4207 */
4208 htmlParseScript(ctxt);
4209 }
4210
4211 else if ((CUR == '<') && (NXT(1) == '!')) {
4212 /*
4213 * Sometimes DOCTYPE arrives in the middle of the document
4214 */
4215 if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4216 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4217 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4218 (UPP(8) == 'E')) {
4219 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4220 "Misplaced DOCTYPE declaration\n",
4221 BAD_CAST "DOCTYPE" , NULL);
4222 htmlParseDocTypeDecl(ctxt);
4223 }
4224 /*
4225 * First case : a comment
4226 */
4227 else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4228 htmlParseComment(ctxt);
4229 }
4230 else {
4231 htmlSkipBogusComment(ctxt);
4232 }
4233 }
4234
4235 /*
4236 * Second case : a Processing Instruction.
4237 */
4238 else if ((CUR == '<') && (NXT(1) == '?')) {
4239 htmlParsePI(ctxt);
4240 }
4241
4242 /*
4243 * Third case : a sub-element.
4244 */
4245 else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4246 htmlParseElement(ctxt);
4247 }
4248 else if (CUR == '<') {
4249 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4250 (ctxt->sax->characters != NULL))
4251 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4252 NEXT;
4253 }
4254
4255 /*
4256 * Fourth case : a reference. If if has not been resolved,
4257 * parsing returns it's Name, create the node
4258 */
4259 else if (CUR == '&') {
4260 htmlParseReference(ctxt);
4261 }
4262
4263 /*
4264 * Fifth case : end of the resource
4265 */
4266 else if (CUR == 0) {
4267 htmlAutoCloseOnEnd(ctxt);
4268 break;
4269 }
4270
4271 /*
4272 * Last case, text. Note that References are handled directly.
4273 */
4274 else {
4275 htmlParseCharData(ctxt);
4276 }
4277
4278 SHRINK;
4279 GROW;
4280 }
4281 if (currentNode != NULL) xmlFree(currentNode);
4282 }
4283
4284 /**
4285 * htmlParseElement:
4286 * @ctxt: an HTML parser context
4287 *
4288 * DEPRECATED: Internal function, don't use.
4289 *
4290 * parse an HTML element, this is highly recursive
4291 * this is kept for compatibility with previous code versions
4292 *
4293 * [39] element ::= EmptyElemTag | STag content ETag
4294 *
4295 * [41] Attribute ::= Name Eq AttValue
4296 */
4297
4298 void
htmlParseElement(htmlParserCtxtPtr ctxt)4299 htmlParseElement(htmlParserCtxtPtr ctxt) {
4300 const xmlChar *name;
4301 xmlChar *currentNode = NULL;
4302 const htmlElemDesc * info;
4303 htmlParserNodeInfo node_info;
4304 int failed;
4305 int depth;
4306 const xmlChar *oldptr;
4307
4308 if ((ctxt == NULL) || (ctxt->input == NULL))
4309 return;
4310
4311 /* Capture start position */
4312 if (ctxt->record_info) {
4313 node_info.begin_pos = ctxt->input->consumed +
4314 (CUR_PTR - ctxt->input->base);
4315 node_info.begin_line = ctxt->input->line;
4316 }
4317
4318 failed = htmlParseStartTag(ctxt);
4319 name = ctxt->name;
4320 if ((failed == -1) || (name == NULL)) {
4321 if (CUR == '>')
4322 NEXT;
4323 return;
4324 }
4325
4326 /*
4327 * Lookup the info for that element.
4328 */
4329 info = htmlTagLookup(name);
4330 if (info == NULL) {
4331 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4332 "Tag %s invalid\n", name, NULL);
4333 }
4334
4335 /*
4336 * Check for an Empty Element labeled the XML/SGML way
4337 */
4338 if ((CUR == '/') && (NXT(1) == '>')) {
4339 SKIP(2);
4340 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4341 ctxt->sax->endElement(ctxt->userData, name);
4342 htmlnamePop(ctxt);
4343 return;
4344 }
4345
4346 if (CUR == '>') {
4347 NEXT;
4348 } else {
4349 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4350 "Couldn't find end of Start Tag %s\n", name, NULL);
4351
4352 /*
4353 * end of parsing of this node.
4354 */
4355 if (xmlStrEqual(name, ctxt->name)) {
4356 nodePop(ctxt);
4357 htmlnamePop(ctxt);
4358 }
4359
4360 /*
4361 * Capture end position and add node
4362 */
4363 if (ctxt->record_info) {
4364 node_info.end_pos = ctxt->input->consumed +
4365 (CUR_PTR - ctxt->input->base);
4366 node_info.end_line = ctxt->input->line;
4367 node_info.node = ctxt->node;
4368 xmlParserAddNodeInfo(ctxt, &node_info);
4369 }
4370 return;
4371 }
4372
4373 /*
4374 * Check for an Empty Element from DTD definition
4375 */
4376 if ((info != NULL) && (info->empty)) {
4377 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4378 ctxt->sax->endElement(ctxt->userData, name);
4379 htmlnamePop(ctxt);
4380 return;
4381 }
4382
4383 /*
4384 * Parse the content of the element:
4385 */
4386 currentNode = xmlStrdup(ctxt->name);
4387 depth = ctxt->nameNr;
4388 while (CUR != 0) {
4389 oldptr = ctxt->input->cur;
4390 htmlParseContent(ctxt);
4391 if (oldptr==ctxt->input->cur) break;
4392 if (ctxt->nameNr < depth) break;
4393 }
4394
4395 /*
4396 * Capture end position and add node
4397 */
4398 if ( currentNode != NULL && ctxt->record_info ) {
4399 node_info.end_pos = ctxt->input->consumed +
4400 (CUR_PTR - ctxt->input->base);
4401 node_info.end_line = ctxt->input->line;
4402 node_info.node = ctxt->node;
4403 xmlParserAddNodeInfo(ctxt, &node_info);
4404 }
4405 if (CUR == 0) {
4406 htmlAutoCloseOnEnd(ctxt);
4407 }
4408
4409 if (currentNode != NULL)
4410 xmlFree(currentNode);
4411 }
4412
4413 static void
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt)4414 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4415 /*
4416 * Capture end position and add node
4417 */
4418 if ( ctxt->node != NULL && ctxt->record_info ) {
4419 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4420 (CUR_PTR - ctxt->input->base);
4421 ctxt->nodeInfo->end_line = ctxt->input->line;
4422 ctxt->nodeInfo->node = ctxt->node;
4423 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4424 htmlNodeInfoPop(ctxt);
4425 }
4426 if (CUR == 0) {
4427 htmlAutoCloseOnEnd(ctxt);
4428 }
4429 }
4430
4431 /**
4432 * htmlParseElementInternal:
4433 * @ctxt: an HTML parser context
4434 *
4435 * parse an HTML element, new version, non recursive
4436 *
4437 * [39] element ::= EmptyElemTag | STag content ETag
4438 *
4439 * [41] Attribute ::= Name Eq AttValue
4440 */
4441
4442 static void
htmlParseElementInternal(htmlParserCtxtPtr ctxt)4443 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4444 const xmlChar *name;
4445 const htmlElemDesc * info;
4446 htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4447 int failed;
4448
4449 if ((ctxt == NULL) || (ctxt->input == NULL))
4450 return;
4451
4452 /* Capture start position */
4453 if (ctxt->record_info) {
4454 node_info.begin_pos = ctxt->input->consumed +
4455 (CUR_PTR - ctxt->input->base);
4456 node_info.begin_line = ctxt->input->line;
4457 }
4458
4459 failed = htmlParseStartTag(ctxt);
4460 name = ctxt->name;
4461 if ((failed == -1) || (name == NULL)) {
4462 if (CUR == '>')
4463 NEXT;
4464 return;
4465 }
4466
4467 /*
4468 * Lookup the info for that element.
4469 */
4470 info = htmlTagLookup(name);
4471 if (info == NULL) {
4472 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4473 "Tag %s invalid\n", name, NULL);
4474 }
4475
4476 /*
4477 * Check for an Empty Element labeled the XML/SGML way
4478 */
4479 if ((CUR == '/') && (NXT(1) == '>')) {
4480 SKIP(2);
4481 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4482 ctxt->sax->endElement(ctxt->userData, name);
4483 htmlnamePop(ctxt);
4484 return;
4485 }
4486
4487 if (CUR == '>') {
4488 NEXT;
4489 } else {
4490 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4491 "Couldn't find end of Start Tag %s\n", name, NULL);
4492
4493 /*
4494 * end of parsing of this node.
4495 */
4496 if (xmlStrEqual(name, ctxt->name)) {
4497 nodePop(ctxt);
4498 htmlnamePop(ctxt);
4499 }
4500
4501 if (ctxt->record_info)
4502 htmlNodeInfoPush(ctxt, &node_info);
4503 htmlParserFinishElementParsing(ctxt);
4504 return;
4505 }
4506
4507 /*
4508 * Check for an Empty Element from DTD definition
4509 */
4510 if ((info != NULL) && (info->empty)) {
4511 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4512 ctxt->sax->endElement(ctxt->userData, name);
4513 htmlnamePop(ctxt);
4514 return;
4515 }
4516
4517 if (ctxt->record_info)
4518 htmlNodeInfoPush(ctxt, &node_info);
4519 }
4520
4521 /**
4522 * htmlParseContentInternal:
4523 * @ctxt: an HTML parser context
4524 *
4525 * Parse a content: comment, sub-element, reference or text.
4526 * New version for non recursive htmlParseElementInternal
4527 */
4528
4529 static void
htmlParseContentInternal(htmlParserCtxtPtr ctxt)4530 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4531 xmlChar *currentNode;
4532 int depth;
4533 const xmlChar *name;
4534
4535 depth = ctxt->nameNr;
4536 if (depth <= 0) {
4537 currentNode = NULL;
4538 } else {
4539 currentNode = xmlStrdup(ctxt->name);
4540 if (currentNode == NULL) {
4541 htmlErrMemory(ctxt);
4542 return;
4543 }
4544 }
4545 while (PARSER_STOPPED(ctxt) == 0) {
4546 GROW;
4547
4548 /*
4549 * Our tag or one of it's parent or children is ending.
4550 */
4551 if ((CUR == '<') && (NXT(1) == '/')) {
4552 if (htmlParseEndTag(ctxt) &&
4553 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4554 if (currentNode != NULL)
4555 xmlFree(currentNode);
4556
4557 depth = ctxt->nameNr;
4558 if (depth <= 0) {
4559 currentNode = NULL;
4560 } else {
4561 currentNode = xmlStrdup(ctxt->name);
4562 if (currentNode == NULL) {
4563 htmlErrMemory(ctxt);
4564 break;
4565 }
4566 }
4567 }
4568 continue; /* while */
4569 }
4570
4571 else if ((CUR == '<') &&
4572 ((IS_ASCII_LETTER(NXT(1))) ||
4573 (NXT(1) == '_') || (NXT(1) == ':'))) {
4574 name = htmlParseHTMLName_nonInvasive(ctxt);
4575 if (name == NULL) {
4576 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4577 "htmlParseStartTag: invalid element name\n",
4578 NULL, NULL);
4579 /* Dump the bogus tag like browsers do */
4580 while ((CUR == 0) && (CUR != '>'))
4581 NEXT;
4582
4583 htmlParserFinishElementParsing(ctxt);
4584 if (currentNode != NULL)
4585 xmlFree(currentNode);
4586
4587 if (ctxt->name == NULL) {
4588 currentNode = NULL;
4589 } else {
4590 currentNode = xmlStrdup(ctxt->name);
4591 if (currentNode == NULL) {
4592 htmlErrMemory(ctxt);
4593 break;
4594 }
4595 }
4596 depth = ctxt->nameNr;
4597 continue;
4598 }
4599
4600 if (ctxt->name != NULL) {
4601 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4602 htmlAutoClose(ctxt, name);
4603 continue;
4604 }
4605 }
4606 }
4607
4608 /*
4609 * Has this node been popped out during parsing of
4610 * the next element
4611 */
4612 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4613 (!xmlStrEqual(currentNode, ctxt->name)))
4614 {
4615 htmlParserFinishElementParsing(ctxt);
4616 if (currentNode != NULL) xmlFree(currentNode);
4617
4618 if (ctxt->name == NULL) {
4619 currentNode = NULL;
4620 } else {
4621 currentNode = xmlStrdup(ctxt->name);
4622 if (currentNode == NULL) {
4623 htmlErrMemory(ctxt);
4624 break;
4625 }
4626 }
4627 depth = ctxt->nameNr;
4628 continue;
4629 }
4630
4631 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4632 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4633 /*
4634 * Handle SCRIPT/STYLE separately
4635 */
4636 htmlParseScript(ctxt);
4637 }
4638
4639 else if ((CUR == '<') && (NXT(1) == '!')) {
4640 /*
4641 * Sometimes DOCTYPE arrives in the middle of the document
4642 */
4643 if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4644 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4645 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4646 (UPP(8) == 'E')) {
4647 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4648 "Misplaced DOCTYPE declaration\n",
4649 BAD_CAST "DOCTYPE" , NULL);
4650 htmlParseDocTypeDecl(ctxt);
4651 }
4652 /*
4653 * First case : a comment
4654 */
4655 else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4656 htmlParseComment(ctxt);
4657 }
4658 else {
4659 htmlSkipBogusComment(ctxt);
4660 }
4661 }
4662
4663 /*
4664 * Second case : a Processing Instruction.
4665 */
4666 else if ((CUR == '<') && (NXT(1) == '?')) {
4667 htmlParsePI(ctxt);
4668 }
4669
4670 /*
4671 * Third case : a sub-element.
4672 */
4673 else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4674 htmlParseElementInternal(ctxt);
4675 if (currentNode != NULL) xmlFree(currentNode);
4676
4677 if (ctxt->name == NULL) {
4678 currentNode = NULL;
4679 } else {
4680 currentNode = xmlStrdup(ctxt->name);
4681 if (currentNode == NULL) {
4682 htmlErrMemory(ctxt);
4683 break;
4684 }
4685 }
4686 depth = ctxt->nameNr;
4687 }
4688 else if (CUR == '<') {
4689 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4690 (ctxt->sax->characters != NULL))
4691 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4692 NEXT;
4693 }
4694
4695 /*
4696 * Fourth case : a reference. If if has not been resolved,
4697 * parsing returns it's Name, create the node
4698 */
4699 else if (CUR == '&') {
4700 htmlParseReference(ctxt);
4701 }
4702
4703 /*
4704 * Fifth case : end of the resource
4705 */
4706 else if (CUR == 0) {
4707 htmlAutoCloseOnEnd(ctxt);
4708 break;
4709 }
4710
4711 /*
4712 * Last case, text. Note that References are handled directly.
4713 */
4714 else {
4715 htmlParseCharData(ctxt);
4716 }
4717
4718 SHRINK;
4719 GROW;
4720 }
4721 if (currentNode != NULL) xmlFree(currentNode);
4722 }
4723
4724 /**
4725 * htmlParseContent:
4726 * @ctxt: an HTML parser context
4727 *
4728 * Parse a content: comment, sub-element, reference or text.
4729 * This is the entry point when called from parser.c
4730 */
4731
4732 void
__htmlParseContent(void * ctxt)4733 __htmlParseContent(void *ctxt) {
4734 if (ctxt != NULL)
4735 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4736 }
4737
4738 /**
4739 * htmlParseDocument:
4740 * @ctxt: an HTML parser context
4741 *
4742 * Parse an HTML document and invoke the SAX handlers. This is useful
4743 * if you're only interested in custom SAX callbacks. If you want a
4744 * document tree, use htmlCtxtParseDocument.
4745 *
4746 * Returns 0, -1 in case of error.
4747 */
4748
4749 int
htmlParseDocument(htmlParserCtxtPtr ctxt)4750 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4751 xmlDtdPtr dtd;
4752
4753 if ((ctxt == NULL) || (ctxt->input == NULL))
4754 return(-1);
4755
4756 /*
4757 * Document locator is unused. Only for backward compatibility.
4758 */
4759 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
4760 xmlSAXLocator copy = xmlDefaultSAXLocator;
4761 ctxt->sax->setDocumentLocator(ctxt->userData, ©);
4762 }
4763
4764 xmlDetectEncoding(ctxt);
4765
4766 /*
4767 * This is wrong but matches long-standing behavior. In most cases,
4768 * a document starting with an XML declaration will specify UTF-8.
4769 */
4770 if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
4771 (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0))
4772 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
4773
4774 /*
4775 * Wipe out everything which is before the first '<'
4776 */
4777 SKIP_BLANKS;
4778 if (CUR == 0) {
4779 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4780 "Document is empty\n", NULL, NULL);
4781 }
4782
4783 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4784 ctxt->sax->startDocument(ctxt->userData);
4785
4786 /*
4787 * Parse possible comments and PIs before any content
4788 */
4789 while (((CUR == '<') && (NXT(1) == '!') &&
4790 (NXT(2) == '-') && (NXT(3) == '-')) ||
4791 ((CUR == '<') && (NXT(1) == '?'))) {
4792 htmlParseComment(ctxt);
4793 htmlParsePI(ctxt);
4794 SKIP_BLANKS;
4795 }
4796
4797
4798 /*
4799 * Then possibly doc type declaration(s) and more Misc
4800 * (doctypedecl Misc*)?
4801 */
4802 if ((CUR == '<') && (NXT(1) == '!') &&
4803 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4804 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4805 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4806 (UPP(8) == 'E')) {
4807 htmlParseDocTypeDecl(ctxt);
4808 }
4809 SKIP_BLANKS;
4810
4811 /*
4812 * Parse possible comments and PIs before any content
4813 */
4814 while ((PARSER_STOPPED(ctxt) == 0) &&
4815 (((CUR == '<') && (NXT(1) == '!') &&
4816 (NXT(2) == '-') && (NXT(3) == '-')) ||
4817 ((CUR == '<') && (NXT(1) == '?')))) {
4818 htmlParseComment(ctxt);
4819 htmlParsePI(ctxt);
4820 SKIP_BLANKS;
4821 }
4822
4823 /*
4824 * Time to start parsing the tree itself
4825 */
4826 htmlParseContentInternal(ctxt);
4827
4828 /*
4829 * autoclose
4830 */
4831 if (CUR == 0)
4832 htmlAutoCloseOnEnd(ctxt);
4833
4834
4835 /*
4836 * SAX: end of the document processing.
4837 */
4838 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4839 ctxt->sax->endDocument(ctxt->userData);
4840
4841 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4842 dtd = xmlGetIntSubset(ctxt->myDoc);
4843 if (dtd == NULL) {
4844 ctxt->myDoc->intSubset =
4845 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4846 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4847 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4848 if (ctxt->myDoc->intSubset == NULL)
4849 htmlErrMemory(ctxt);
4850 }
4851 }
4852 if (! ctxt->wellFormed) return(-1);
4853 return(0);
4854 }
4855
4856
4857 /************************************************************************
4858 * *
4859 * Parser contexts handling *
4860 * *
4861 ************************************************************************/
4862
4863 /**
4864 * htmlInitParserCtxt:
4865 * @ctxt: an HTML parser context
4866 * @sax: SAX handler
4867 * @userData: user data
4868 *
4869 * Initialize a parser context
4870 *
4871 * Returns 0 in case of success and -1 in case of error
4872 */
4873
4874 static int
htmlInitParserCtxt(htmlParserCtxtPtr ctxt,const htmlSAXHandler * sax,void * userData)4875 htmlInitParserCtxt(htmlParserCtxtPtr ctxt, const htmlSAXHandler *sax,
4876 void *userData)
4877 {
4878 if (ctxt == NULL) return(-1);
4879 memset(ctxt, 0, sizeof(htmlParserCtxt));
4880
4881 ctxt->dict = xmlDictCreate();
4882 if (ctxt->dict == NULL)
4883 return(-1);
4884
4885 if (ctxt->sax == NULL)
4886 ctxt->sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4887 if (ctxt->sax == NULL)
4888 return(-1);
4889 if (sax == NULL) {
4890 memset(ctxt->sax, 0, sizeof(htmlSAXHandler));
4891 xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);
4892 ctxt->userData = ctxt;
4893 } else {
4894 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4895 ctxt->userData = userData ? userData : ctxt;
4896 }
4897
4898 /* Allocate the Input stack */
4899 ctxt->inputTab = (htmlParserInputPtr *)
4900 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4901 if (ctxt->inputTab == NULL)
4902 return(-1);
4903 ctxt->inputNr = 0;
4904 ctxt->inputMax = 5;
4905 ctxt->input = NULL;
4906 ctxt->version = NULL;
4907 ctxt->encoding = NULL;
4908 ctxt->standalone = -1;
4909 ctxt->instate = XML_PARSER_START;
4910
4911 /* Allocate the Node stack */
4912 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4913 if (ctxt->nodeTab == NULL)
4914 return(-1);
4915 ctxt->nodeNr = 0;
4916 ctxt->nodeMax = 10;
4917 ctxt->node = NULL;
4918
4919 /* Allocate the Name stack */
4920 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4921 if (ctxt->nameTab == NULL)
4922 return(-1);
4923 ctxt->nameNr = 0;
4924 ctxt->nameMax = 10;
4925 ctxt->name = NULL;
4926
4927 ctxt->nodeInfoTab = NULL;
4928 ctxt->nodeInfoNr = 0;
4929 ctxt->nodeInfoMax = 0;
4930
4931 ctxt->myDoc = NULL;
4932 ctxt->wellFormed = 1;
4933 ctxt->replaceEntities = 0;
4934 ctxt->linenumbers = xmlLineNumbersDefaultValue;
4935 ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
4936 ctxt->html = 1;
4937 ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
4938 ctxt->vctxt.userData = ctxt;
4939 ctxt->vctxt.error = xmlParserValidityError;
4940 ctxt->vctxt.warning = xmlParserValidityWarning;
4941 ctxt->record_info = 0;
4942 ctxt->validate = 0;
4943 ctxt->checkIndex = 0;
4944 ctxt->catalogs = NULL;
4945 xmlInitNodeInfoSeq(&ctxt->node_seq);
4946 return(0);
4947 }
4948
4949 /**
4950 * htmlFreeParserCtxt:
4951 * @ctxt: an HTML parser context
4952 *
4953 * Free all the memory used by a parser context. However the parsed
4954 * document in ctxt->myDoc is not freed.
4955 */
4956
4957 void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)4958 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4959 {
4960 xmlFreeParserCtxt(ctxt);
4961 }
4962
4963 /**
4964 * htmlNewParserCtxt:
4965 *
4966 * Allocate and initialize a new HTML parser context.
4967 *
4968 * This can be used to parse HTML documents into DOM trees with
4969 * functions like xmlCtxtReadFile or xmlCtxtReadMemory.
4970 *
4971 * See htmlCtxtUseOptions for parser options.
4972 *
4973 * See xmlCtxtSetErrorHandler for advanced error handling.
4974 *
4975 * See xmlNewInputURL, xmlNewInputMemory, xmlNewInputIO and similar
4976 * functions for advanced input control.
4977 *
4978 * See htmlNewSAXParserCtxt for custom SAX parsers.
4979 *
4980 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4981 */
4982
4983 htmlParserCtxtPtr
htmlNewParserCtxt(void)4984 htmlNewParserCtxt(void)
4985 {
4986 return(htmlNewSAXParserCtxt(NULL, NULL));
4987 }
4988
4989 /**
4990 * htmlNewSAXParserCtxt:
4991 * @sax: SAX handler
4992 * @userData: user data
4993 *
4994 * Allocate and initialize a new HTML SAX parser context. If userData
4995 * is NULL, the parser context will be passed as user data.
4996 *
4997 * Available since 2.11.0. If you want support older versions,
4998 * it's best to invoke htmlNewParserCtxt and set ctxt->sax with
4999 * struct assignment.
5000 *
5001 * Also see htmlNewParserCtxt.
5002 *
5003 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5004 */
5005
5006 htmlParserCtxtPtr
htmlNewSAXParserCtxt(const htmlSAXHandler * sax,void * userData)5007 htmlNewSAXParserCtxt(const htmlSAXHandler *sax, void *userData)
5008 {
5009 xmlParserCtxtPtr ctxt;
5010
5011 xmlInitParser();
5012
5013 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5014 if (ctxt == NULL)
5015 return(NULL);
5016 memset(ctxt, 0, sizeof(xmlParserCtxt));
5017 if (htmlInitParserCtxt(ctxt, sax, userData) < 0) {
5018 htmlFreeParserCtxt(ctxt);
5019 return(NULL);
5020 }
5021 return(ctxt);
5022 }
5023
5024 static htmlParserCtxtPtr
htmlCreateMemoryParserCtxtInternal(const char * url,const char * buffer,size_t size,const char * encoding)5025 htmlCreateMemoryParserCtxtInternal(const char *url,
5026 const char *buffer, size_t size,
5027 const char *encoding) {
5028 xmlParserCtxtPtr ctxt;
5029 xmlParserInputPtr input;
5030
5031 if (buffer == NULL)
5032 return(NULL);
5033
5034 ctxt = htmlNewParserCtxt();
5035 if (ctxt == NULL)
5036 return(NULL);
5037
5038 input = xmlNewInputMemory(ctxt, url, buffer, size, encoding, 0);
5039 if (input == NULL) {
5040 xmlFreeParserCtxt(ctxt);
5041 return(NULL);
5042 }
5043
5044 inputPush(ctxt, input);
5045
5046 return(ctxt);
5047 }
5048
5049 /**
5050 * htmlCreateMemoryParserCtxt:
5051 * @buffer: a pointer to a char array
5052 * @size: the size of the array
5053 *
5054 * DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadMemory.
5055 *
5056 * Create a parser context for an HTML in-memory document. The input
5057 * buffer must not contain any terminating null bytes.
5058 *
5059 * Returns the new parser context or NULL
5060 */
5061 htmlParserCtxtPtr
htmlCreateMemoryParserCtxt(const char * buffer,int size)5062 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5063 if (size <= 0)
5064 return(NULL);
5065
5066 return(htmlCreateMemoryParserCtxtInternal(NULL, buffer, size, NULL));
5067 }
5068
5069 /**
5070 * htmlCreateDocParserCtxt:
5071 * @str: a pointer to an array of xmlChar
5072 * @encoding: encoding (optional)
5073 *
5074 * Create a parser context for a null-terminated string.
5075 *
5076 * Returns the new parser context or NULL if a memory allocation failed.
5077 */
5078 static htmlParserCtxtPtr
htmlCreateDocParserCtxt(const xmlChar * str,const char * url,const char * encoding)5079 htmlCreateDocParserCtxt(const xmlChar *str, const char *url,
5080 const char *encoding) {
5081 xmlParserCtxtPtr ctxt;
5082 xmlParserInputPtr input;
5083
5084 if (str == NULL)
5085 return(NULL);
5086
5087 ctxt = htmlNewParserCtxt();
5088 if (ctxt == NULL)
5089 return(NULL);
5090
5091 input = xmlNewInputString(ctxt, url, (const char *) str, encoding, 0);
5092 if (input == NULL) {
5093 xmlFreeParserCtxt(ctxt);
5094 return(NULL);
5095 }
5096
5097 inputPush(ctxt, input);
5098
5099 return(ctxt);
5100 }
5101
5102 #ifdef LIBXML_PUSH_ENABLED
5103 /************************************************************************
5104 * *
5105 * Progressive parsing interfaces *
5106 * *
5107 ************************************************************************/
5108
5109 /**
5110 * htmlParseLookupSequence:
5111 * @ctxt: an HTML parser context
5112 * @first: the first char to lookup
5113 * @next: the next char to lookup or zero
5114 * @third: the next char to lookup or zero
5115 * @ignoreattrval: skip over attribute values
5116 *
5117 * Try to find if a sequence (first, next, third) or just (first next) or
5118 * (first) is available in the input stream.
5119 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5120 * to avoid rescanning sequences of bytes, it DOES change the state of the
5121 * parser, do not use liberally.
5122 * This is basically similar to xmlParseLookupSequence()
5123 *
5124 * Returns the index to the current parsing point if the full sequence
5125 * is available, -1 otherwise.
5126 */
5127 static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt,xmlChar first,xmlChar next,xmlChar third,int ignoreattrval)5128 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5129 xmlChar next, xmlChar third, int ignoreattrval)
5130 {
5131 size_t base, len;
5132 htmlParserInputPtr in;
5133 const xmlChar *buf;
5134 int quote;
5135
5136 in = ctxt->input;
5137 if (in == NULL)
5138 return (-1);
5139
5140 base = ctxt->checkIndex;
5141 quote = ctxt->endCheckState;
5142
5143 buf = in->cur;
5144 len = in->end - in->cur;
5145
5146 /* take into account the sequence length */
5147 if (third)
5148 len -= 2;
5149 else if (next)
5150 len--;
5151 for (; base < len; base++) {
5152 if (base >= INT_MAX / 2) {
5153 ctxt->checkIndex = 0;
5154 ctxt->endCheckState = 0;
5155 return (base - 2);
5156 }
5157 if (ignoreattrval) {
5158 if (quote) {
5159 if (buf[base] == quote)
5160 quote = 0;
5161 continue;
5162 }
5163 if (buf[base] == '"' || buf[base] == '\'') {
5164 quote = buf[base];
5165 continue;
5166 }
5167 }
5168 if (buf[base] == first) {
5169 if (third != 0) {
5170 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5171 continue;
5172 } else if (next != 0) {
5173 if (buf[base + 1] != next)
5174 continue;
5175 }
5176 ctxt->checkIndex = 0;
5177 ctxt->endCheckState = 0;
5178 return (base);
5179 }
5180 }
5181 ctxt->checkIndex = base;
5182 ctxt->endCheckState = quote;
5183 return (-1);
5184 }
5185
5186 /**
5187 * htmlParseLookupCommentEnd:
5188 * @ctxt: an HTML parser context
5189 *
5190 * Try to find a comment end tag in the input stream
5191 * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5192 * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5193 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5194 * to avoid rescanning sequences of bytes, it DOES change the state of the
5195 * parser, do not use liberally.
5196 * This wraps to htmlParseLookupSequence()
5197 *
5198 * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5199 */
5200 static int
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)5201 htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5202 {
5203 int mark = 0;
5204 int offset;
5205
5206 while (1) {
5207 mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5208 if (mark < 0)
5209 break;
5210 if ((NXT(mark+2) == '>') ||
5211 ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5212 ctxt->checkIndex = 0;
5213 break;
5214 }
5215 offset = (NXT(mark+2) == '!') ? 3 : 2;
5216 if (mark + offset >= ctxt->input->end - ctxt->input->cur) {
5217 ctxt->checkIndex = mark;
5218 return(-1);
5219 }
5220 ctxt->checkIndex = mark + 1;
5221 }
5222 return mark;
5223 }
5224
5225
5226 /**
5227 * htmlParseTryOrFinish:
5228 * @ctxt: an HTML parser context
5229 * @terminate: last chunk indicator
5230 *
5231 * Try to progress on parsing
5232 *
5233 * Returns zero if no parsing was possible
5234 */
5235 static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt,int terminate)5236 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5237 int ret = 0;
5238 htmlParserInputPtr in;
5239 ptrdiff_t avail = 0;
5240 xmlChar cur, next;
5241
5242 htmlParserNodeInfo node_info;
5243
5244 while (PARSER_STOPPED(ctxt) == 0) {
5245
5246 in = ctxt->input;
5247 if (in == NULL) break;
5248 avail = in->end - in->cur;
5249 if ((avail == 0) && (terminate)) {
5250 htmlAutoCloseOnEnd(ctxt);
5251 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5252 /*
5253 * SAX: end of the document processing.
5254 */
5255 ctxt->instate = XML_PARSER_EOF;
5256 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5257 ctxt->sax->endDocument(ctxt->userData);
5258 }
5259 }
5260 if (avail < 1)
5261 goto done;
5262 /*
5263 * This is done to make progress and avoid an infinite loop
5264 * if a parsing attempt was aborted by hitting a NUL byte. After
5265 * changing htmlCurrentChar, this probably isn't necessary anymore.
5266 * We should consider removing this check.
5267 */
5268 cur = in->cur[0];
5269 if (cur == 0) {
5270 SKIP(1);
5271 continue;
5272 }
5273
5274 switch (ctxt->instate) {
5275 case XML_PARSER_EOF:
5276 /*
5277 * Document parsing is done !
5278 */
5279 goto done;
5280 case XML_PARSER_START:
5281 /*
5282 * This is wrong but matches long-standing behavior. In most
5283 * cases, a document starting with an XML declaration will
5284 * specify UTF-8.
5285 */
5286 if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
5287 (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0)) {
5288 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
5289 }
5290
5291 /*
5292 * Very first chars read from the document flow.
5293 */
5294 cur = in->cur[0];
5295 if (IS_BLANK_CH(cur)) {
5296 SKIP_BLANKS;
5297 avail = in->end - in->cur;
5298 }
5299 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
5300 xmlSAXLocator copy = xmlDefaultSAXLocator;
5301 ctxt->sax->setDocumentLocator(ctxt->userData, ©);
5302 }
5303 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5304 (!ctxt->disableSAX))
5305 ctxt->sax->startDocument(ctxt->userData);
5306
5307 cur = in->cur[0];
5308 next = in->cur[1];
5309 if ((cur == '<') && (next == '!') &&
5310 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5311 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5312 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5313 (UPP(8) == 'E')) {
5314 if ((!terminate) &&
5315 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5316 goto done;
5317 htmlParseDocTypeDecl(ctxt);
5318 ctxt->instate = XML_PARSER_PROLOG;
5319 } else {
5320 ctxt->instate = XML_PARSER_MISC;
5321 }
5322 break;
5323 case XML_PARSER_MISC:
5324 SKIP_BLANKS;
5325 avail = in->end - in->cur;
5326 /*
5327 * no chars in buffer
5328 */
5329 if (avail < 1)
5330 goto done;
5331 /*
5332 * not enough chars in buffer
5333 */
5334 if (avail < 2) {
5335 if (!terminate)
5336 goto done;
5337 else
5338 next = ' ';
5339 } else {
5340 next = in->cur[1];
5341 }
5342 cur = in->cur[0];
5343 if ((cur == '<') && (next == '!') &&
5344 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5345 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5346 goto done;
5347 htmlParseComment(ctxt);
5348 ctxt->instate = XML_PARSER_MISC;
5349 } else if ((cur == '<') && (next == '?')) {
5350 if ((!terminate) &&
5351 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5352 goto done;
5353 htmlParsePI(ctxt);
5354 ctxt->instate = XML_PARSER_MISC;
5355 } else if ((cur == '<') && (next == '!') &&
5356 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5357 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5358 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5359 (UPP(8) == 'E')) {
5360 if ((!terminate) &&
5361 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5362 goto done;
5363 htmlParseDocTypeDecl(ctxt);
5364 ctxt->instate = XML_PARSER_PROLOG;
5365 } else if ((cur == '<') && (next == '!') &&
5366 (avail < 9)) {
5367 goto done;
5368 } else {
5369 ctxt->instate = XML_PARSER_CONTENT;
5370 }
5371 break;
5372 case XML_PARSER_PROLOG:
5373 SKIP_BLANKS;
5374 avail = in->end - in->cur;
5375 if (avail < 2)
5376 goto done;
5377 cur = in->cur[0];
5378 next = in->cur[1];
5379 if ((cur == '<') && (next == '!') &&
5380 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5381 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5382 goto done;
5383 htmlParseComment(ctxt);
5384 ctxt->instate = XML_PARSER_PROLOG;
5385 } else if ((cur == '<') && (next == '?')) {
5386 if ((!terminate) &&
5387 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5388 goto done;
5389 htmlParsePI(ctxt);
5390 ctxt->instate = XML_PARSER_PROLOG;
5391 } else if ((cur == '<') && (next == '!') &&
5392 (avail < 4)) {
5393 goto done;
5394 } else {
5395 ctxt->instate = XML_PARSER_CONTENT;
5396 }
5397 break;
5398 case XML_PARSER_EPILOG:
5399 avail = in->end - in->cur;
5400 if (avail < 1)
5401 goto done;
5402 cur = in->cur[0];
5403 if (IS_BLANK_CH(cur)) {
5404 htmlParseCharData(ctxt);
5405 goto done;
5406 }
5407 if (avail < 2)
5408 goto done;
5409 next = in->cur[1];
5410 if ((cur == '<') && (next == '!') &&
5411 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5412 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5413 goto done;
5414 htmlParseComment(ctxt);
5415 ctxt->instate = XML_PARSER_EPILOG;
5416 } else if ((cur == '<') && (next == '?')) {
5417 if ((!terminate) &&
5418 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5419 goto done;
5420 htmlParsePI(ctxt);
5421 ctxt->instate = XML_PARSER_EPILOG;
5422 } else if ((cur == '<') && (next == '!') &&
5423 (avail < 4)) {
5424 goto done;
5425 } else {
5426 ctxt->errNo = XML_ERR_DOCUMENT_END;
5427 ctxt->wellFormed = 0;
5428 ctxt->instate = XML_PARSER_EOF;
5429 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5430 ctxt->sax->endDocument(ctxt->userData);
5431 goto done;
5432 }
5433 break;
5434 case XML_PARSER_START_TAG: {
5435 const xmlChar *name;
5436 int failed;
5437 const htmlElemDesc * info;
5438
5439 /*
5440 * no chars in buffer
5441 */
5442 if (avail < 1)
5443 goto done;
5444 /*
5445 * not enough chars in buffer
5446 */
5447 if (avail < 2) {
5448 if (!terminate)
5449 goto done;
5450 else
5451 next = ' ';
5452 } else {
5453 next = in->cur[1];
5454 }
5455 cur = in->cur[0];
5456 if (cur != '<') {
5457 ctxt->instate = XML_PARSER_CONTENT;
5458 break;
5459 }
5460 if (next == '/') {
5461 ctxt->instate = XML_PARSER_END_TAG;
5462 ctxt->checkIndex = 0;
5463 break;
5464 }
5465 if ((!terminate) &&
5466 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5467 goto done;
5468
5469 /* Capture start position */
5470 if (ctxt->record_info) {
5471 node_info.begin_pos = ctxt->input->consumed +
5472 (CUR_PTR - ctxt->input->base);
5473 node_info.begin_line = ctxt->input->line;
5474 }
5475
5476
5477 failed = htmlParseStartTag(ctxt);
5478 name = ctxt->name;
5479 if ((failed == -1) ||
5480 (name == NULL)) {
5481 if (CUR == '>')
5482 NEXT;
5483 break;
5484 }
5485
5486 /*
5487 * Lookup the info for that element.
5488 */
5489 info = htmlTagLookup(name);
5490 if (info == NULL) {
5491 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5492 "Tag %s invalid\n", name, NULL);
5493 }
5494
5495 /*
5496 * Check for an Empty Element labeled the XML/SGML way
5497 */
5498 if ((CUR == '/') && (NXT(1) == '>')) {
5499 SKIP(2);
5500 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5501 ctxt->sax->endElement(ctxt->userData, name);
5502 htmlnamePop(ctxt);
5503 ctxt->instate = XML_PARSER_CONTENT;
5504 break;
5505 }
5506
5507 if (CUR == '>') {
5508 NEXT;
5509 } else {
5510 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5511 "Couldn't find end of Start Tag %s\n",
5512 name, NULL);
5513
5514 /*
5515 * end of parsing of this node.
5516 */
5517 if (xmlStrEqual(name, ctxt->name)) {
5518 nodePop(ctxt);
5519 htmlnamePop(ctxt);
5520 }
5521
5522 if (ctxt->record_info)
5523 htmlNodeInfoPush(ctxt, &node_info);
5524
5525 ctxt->instate = XML_PARSER_CONTENT;
5526 break;
5527 }
5528
5529 /*
5530 * Check for an Empty Element from DTD definition
5531 */
5532 if ((info != NULL) && (info->empty)) {
5533 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5534 ctxt->sax->endElement(ctxt->userData, name);
5535 htmlnamePop(ctxt);
5536 }
5537
5538 if (ctxt->record_info)
5539 htmlNodeInfoPush(ctxt, &node_info);
5540
5541 ctxt->instate = XML_PARSER_CONTENT;
5542 break;
5543 }
5544 case XML_PARSER_CONTENT: {
5545 xmlChar chr[2] = { 0, 0 };
5546
5547 /*
5548 * Handle preparsed entities and charRef
5549 */
5550 if ((avail == 1) && (terminate)) {
5551 cur = in->cur[0];
5552 if ((cur != '<') && (cur != '&')) {
5553 if (ctxt->sax != NULL) {
5554 chr[0] = cur;
5555 if (IS_BLANK_CH(cur)) {
5556 if (ctxt->keepBlanks) {
5557 if (ctxt->sax->characters != NULL)
5558 ctxt->sax->characters(
5559 ctxt->userData, chr, 1);
5560 } else {
5561 if (ctxt->sax->ignorableWhitespace != NULL)
5562 ctxt->sax->ignorableWhitespace(
5563 ctxt->userData, chr, 1);
5564 }
5565 } else {
5566 htmlCheckParagraph(ctxt);
5567 if (ctxt->sax->characters != NULL)
5568 ctxt->sax->characters(
5569 ctxt->userData, chr, 1);
5570 }
5571 }
5572 ctxt->checkIndex = 0;
5573 in->cur++;
5574 break;
5575 }
5576 }
5577 if (avail < 2)
5578 goto done;
5579 cur = in->cur[0];
5580 next = in->cur[1];
5581 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5582 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5583 /*
5584 * Handle SCRIPT/STYLE separately
5585 */
5586 if (!terminate) {
5587 int idx;
5588 xmlChar val;
5589
5590 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5591 if (idx < 0)
5592 goto done;
5593 val = in->cur[idx + 2];
5594 if (val == 0) { /* bad cut of input */
5595 /*
5596 * FIXME: htmlParseScript checks for additional
5597 * characters after '</'.
5598 */
5599 ctxt->checkIndex = idx;
5600 goto done;
5601 }
5602 }
5603 htmlParseScript(ctxt);
5604 if ((cur == '<') && (next == '/')) {
5605 ctxt->instate = XML_PARSER_END_TAG;
5606 ctxt->checkIndex = 0;
5607 break;
5608 }
5609 } else if ((cur == '<') && (next == '!')) {
5610 if (avail < 4)
5611 goto done;
5612 /*
5613 * Sometimes DOCTYPE arrives in the middle of the document
5614 */
5615 if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
5616 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5617 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5618 (UPP(8) == 'E')) {
5619 if ((!terminate) &&
5620 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5621 goto done;
5622 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5623 "Misplaced DOCTYPE declaration\n",
5624 BAD_CAST "DOCTYPE" , NULL);
5625 htmlParseDocTypeDecl(ctxt);
5626 } else if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
5627 if ((!terminate) &&
5628 (htmlParseLookupCommentEnd(ctxt) < 0))
5629 goto done;
5630 htmlParseComment(ctxt);
5631 ctxt->instate = XML_PARSER_CONTENT;
5632 } else {
5633 if ((!terminate) &&
5634 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5635 goto done;
5636 htmlSkipBogusComment(ctxt);
5637 }
5638 } else if ((cur == '<') && (next == '?')) {
5639 if ((!terminate) &&
5640 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5641 goto done;
5642 htmlParsePI(ctxt);
5643 ctxt->instate = XML_PARSER_CONTENT;
5644 } else if ((cur == '<') && (next == '/')) {
5645 ctxt->instate = XML_PARSER_END_TAG;
5646 ctxt->checkIndex = 0;
5647 break;
5648 } else if ((cur == '<') && IS_ASCII_LETTER(next)) {
5649 if ((!terminate) && (next == 0))
5650 goto done;
5651 ctxt->instate = XML_PARSER_START_TAG;
5652 ctxt->checkIndex = 0;
5653 break;
5654 } else if (cur == '<') {
5655 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
5656 (ctxt->sax->characters != NULL))
5657 ctxt->sax->characters(ctxt->userData,
5658 BAD_CAST "<", 1);
5659 NEXT;
5660 } else {
5661 /*
5662 * check that the text sequence is complete
5663 * before handing out the data to the parser
5664 * to avoid problems with erroneous end of
5665 * data detection.
5666 */
5667 if ((!terminate) &&
5668 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5669 goto done;
5670 ctxt->checkIndex = 0;
5671 while ((PARSER_STOPPED(ctxt) == 0) &&
5672 (cur != '<') && (in->cur < in->end)) {
5673 if (cur == '&') {
5674 htmlParseReference(ctxt);
5675 } else {
5676 htmlParseCharData(ctxt);
5677 }
5678 cur = in->cur[0];
5679 }
5680 }
5681
5682 break;
5683 }
5684 case XML_PARSER_END_TAG:
5685 if (avail < 2)
5686 goto done;
5687 if ((!terminate) &&
5688 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5689 goto done;
5690 htmlParseEndTag(ctxt);
5691 if (ctxt->nameNr == 0) {
5692 ctxt->instate = XML_PARSER_EPILOG;
5693 } else {
5694 ctxt->instate = XML_PARSER_CONTENT;
5695 }
5696 ctxt->checkIndex = 0;
5697 break;
5698 default:
5699 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5700 "HPP: internal error\n", NULL, NULL);
5701 ctxt->instate = XML_PARSER_EOF;
5702 break;
5703 }
5704 }
5705 done:
5706 if ((avail == 0) && (terminate)) {
5707 htmlAutoCloseOnEnd(ctxt);
5708 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5709 /*
5710 * SAX: end of the document processing.
5711 */
5712 ctxt->instate = XML_PARSER_EOF;
5713 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5714 ctxt->sax->endDocument(ctxt->userData);
5715 }
5716 }
5717 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
5718 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5719 (ctxt->instate == XML_PARSER_EPILOG))) {
5720 xmlDtdPtr dtd;
5721 dtd = xmlGetIntSubset(ctxt->myDoc);
5722 if (dtd == NULL) {
5723 ctxt->myDoc->intSubset =
5724 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5725 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5726 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5727 if (ctxt->myDoc->intSubset == NULL)
5728 htmlErrMemory(ctxt);
5729 }
5730 }
5731 return(ret);
5732 }
5733
5734 /**
5735 * htmlParseChunk:
5736 * @ctxt: an HTML parser context
5737 * @chunk: chunk of memory
5738 * @size: size of chunk in bytes
5739 * @terminate: last chunk indicator
5740 *
5741 * Parse a chunk of memory in push parser mode.
5742 *
5743 * Assumes that the parser context was initialized with
5744 * htmlCreatePushParserCtxt.
5745 *
5746 * The last chunk, which will often be empty, must be marked with
5747 * the @terminate flag. With the default SAX callbacks, the resulting
5748 * document will be available in ctxt->myDoc. This pointer will not
5749 * be freed by the library.
5750 *
5751 * If the document isn't well-formed, ctxt->myDoc is set to NULL.
5752 *
5753 * Returns an xmlParserErrors code (0 on success).
5754 */
5755 int
htmlParseChunk(htmlParserCtxtPtr ctxt,const char * chunk,int size,int terminate)5756 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5757 int terminate) {
5758 if ((ctxt == NULL) || (ctxt->input == NULL))
5759 return(XML_ERR_ARGUMENT);
5760 if (PARSER_STOPPED(ctxt) != 0)
5761 return(ctxt->errNo);
5762 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5763 (ctxt->input->buf != NULL)) {
5764 size_t pos = ctxt->input->cur - ctxt->input->base;
5765 int res;
5766
5767 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5768 xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
5769 if (res < 0) {
5770 htmlParseErr(ctxt, ctxt->input->buf->error,
5771 "xmlParserInputBufferPush failed", NULL, NULL);
5772 xmlHaltParser(ctxt);
5773 return (ctxt->errNo);
5774 }
5775 }
5776 htmlParseTryOrFinish(ctxt, terminate);
5777 if (terminate) {
5778 if (ctxt->instate != XML_PARSER_EOF) {
5779 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5780 ctxt->sax->endDocument(ctxt->userData);
5781 }
5782 ctxt->instate = XML_PARSER_EOF;
5783 }
5784 return((xmlParserErrors) ctxt->errNo);
5785 }
5786
5787 /************************************************************************
5788 * *
5789 * User entry points *
5790 * *
5791 ************************************************************************/
5792
5793 /**
5794 * htmlCreatePushParserCtxt:
5795 * @sax: a SAX handler (optional)
5796 * @user_data: The user data returned on SAX callbacks (optional)
5797 * @chunk: a pointer to an array of chars (optional)
5798 * @size: number of chars in the array
5799 * @filename: only used for error reporting (optional)
5800 * @enc: encoding (deprecated, pass XML_CHAR_ENCODING_NONE)
5801 *
5802 * Create a parser context for using the HTML parser in push mode.
5803 *
5804 * Returns the new parser context or NULL if a memory allocation
5805 * failed.
5806 */
5807 htmlParserCtxtPtr
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,void * user_data,const char * chunk,int size,const char * filename,xmlCharEncoding enc)5808 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5809 const char *chunk, int size, const char *filename,
5810 xmlCharEncoding enc) {
5811 htmlParserCtxtPtr ctxt;
5812 htmlParserInputPtr input;
5813 const char *encoding;
5814
5815 ctxt = htmlNewSAXParserCtxt(sax, user_data);
5816 if (ctxt == NULL)
5817 return(NULL);
5818
5819 encoding = xmlGetCharEncodingName(enc);
5820 input = xmlInputCreatePush(filename, chunk, size);
5821 if (input == NULL) {
5822 htmlFreeParserCtxt(ctxt);
5823 return(NULL);
5824 }
5825
5826 inputPush(ctxt, input);
5827
5828 if (encoding != NULL)
5829 xmlSwitchEncodingName(ctxt, encoding);
5830
5831 return(ctxt);
5832 }
5833 #endif /* LIBXML_PUSH_ENABLED */
5834
5835 /**
5836 * htmlSAXParseDoc:
5837 * @cur: a pointer to an array of xmlChar
5838 * @encoding: a free form C string describing the HTML document encoding, or NULL
5839 * @sax: the SAX handler block
5840 * @userData: if using SAX, this pointer will be provided on callbacks.
5841 *
5842 * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadDoc.
5843 *
5844 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5845 * to handle parse events. If sax is NULL, fallback to the default DOM
5846 * behavior and return a tree.
5847 *
5848 * Returns the resulting document tree unless SAX is NULL or the document is
5849 * not well formed.
5850 */
5851
5852 htmlDocPtr
htmlSAXParseDoc(const xmlChar * cur,const char * encoding,htmlSAXHandlerPtr sax,void * userData)5853 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
5854 htmlSAXHandlerPtr sax, void *userData) {
5855 htmlDocPtr ret;
5856 htmlParserCtxtPtr ctxt;
5857
5858 if (cur == NULL)
5859 return(NULL);
5860
5861 ctxt = htmlCreateDocParserCtxt(cur, NULL, encoding);
5862 if (ctxt == NULL)
5863 return(NULL);
5864
5865 if (sax != NULL) {
5866 *ctxt->sax = *sax;
5867 ctxt->userData = userData;
5868 }
5869
5870 htmlParseDocument(ctxt);
5871 ret = ctxt->myDoc;
5872 htmlFreeParserCtxt(ctxt);
5873
5874 return(ret);
5875 }
5876
5877 /**
5878 * htmlParseDoc:
5879 * @cur: a pointer to an array of xmlChar
5880 * @encoding: the encoding (optional)
5881 *
5882 * DEPRECATED: Use htmlReadDoc.
5883 *
5884 * Parse an HTML in-memory document and build a tree.
5885 *
5886 * This function uses deprecated global parser options.
5887 *
5888 * Returns the resulting document tree
5889 */
5890
5891 htmlDocPtr
htmlParseDoc(const xmlChar * cur,const char * encoding)5892 htmlParseDoc(const xmlChar *cur, const char *encoding) {
5893 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5894 }
5895
5896
5897 /**
5898 * htmlCreateFileParserCtxt:
5899 * @filename: the filename
5900 * @encoding: optional encoding
5901 *
5902 * DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadFile.
5903 *
5904 * Create a parser context to read from a file.
5905 *
5906 * A non-NULL encoding overrides encoding declarations in the document.
5907 *
5908 * Automatic support for ZLIB/Compress compressed document is provided
5909 * by default if found at compile-time.
5910 *
5911 * Returns the new parser context or NULL if a memory allocation failed.
5912 */
5913 htmlParserCtxtPtr
htmlCreateFileParserCtxt(const char * filename,const char * encoding)5914 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5915 {
5916 htmlParserCtxtPtr ctxt;
5917 htmlParserInputPtr input;
5918
5919 if (filename == NULL)
5920 return(NULL);
5921
5922 ctxt = htmlNewParserCtxt();
5923 if (ctxt == NULL) {
5924 return(NULL);
5925 }
5926
5927 input = xmlNewInputURL(ctxt, filename, NULL, encoding, 0);
5928 if (input == NULL) {
5929 xmlFreeParserCtxt(ctxt);
5930 return(NULL);
5931 }
5932 inputPush(ctxt, input);
5933
5934 return(ctxt);
5935 }
5936
5937 /**
5938 * htmlSAXParseFile:
5939 * @filename: the filename
5940 * @encoding: encoding (optional)
5941 * @sax: the SAX handler block
5942 * @userData: if using SAX, this pointer will be provided on callbacks.
5943 *
5944 * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadFile.
5945 *
5946 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5947 * compressed document is provided by default if found at compile-time.
5948 * It use the given SAX function block to handle the parsing callback.
5949 * If sax is NULL, fallback to the default DOM tree building routines.
5950 *
5951 * Returns the resulting document tree unless SAX is NULL or the document is
5952 * not well formed.
5953 */
5954
5955 htmlDocPtr
htmlSAXParseFile(const char * filename,const char * encoding,htmlSAXHandlerPtr sax,void * userData)5956 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5957 void *userData) {
5958 htmlDocPtr ret;
5959 htmlParserCtxtPtr ctxt;
5960 htmlSAXHandlerPtr oldsax = NULL;
5961
5962 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5963 if (ctxt == NULL) return(NULL);
5964 if (sax != NULL) {
5965 oldsax = ctxt->sax;
5966 ctxt->sax = sax;
5967 ctxt->userData = userData;
5968 }
5969
5970 htmlParseDocument(ctxt);
5971
5972 ret = ctxt->myDoc;
5973 if (sax != NULL) {
5974 ctxt->sax = oldsax;
5975 ctxt->userData = NULL;
5976 }
5977 htmlFreeParserCtxt(ctxt);
5978
5979 return(ret);
5980 }
5981
5982 /**
5983 * htmlParseFile:
5984 * @filename: the filename
5985 * @encoding: encoding (optional)
5986 *
5987 * Parse an HTML file and build a tree.
5988 *
5989 * See xmlNewInputURL for details.
5990 *
5991 * Returns the resulting document tree
5992 */
5993
5994 htmlDocPtr
htmlParseFile(const char * filename,const char * encoding)5995 htmlParseFile(const char *filename, const char *encoding) {
5996 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5997 }
5998
5999 /**
6000 * htmlHandleOmittedElem:
6001 * @val: int 0 or 1
6002 *
6003 * DEPRECATED: Use HTML_PARSE_NOIMPLIED
6004 *
6005 * Set and return the previous value for handling HTML omitted tags.
6006 *
6007 * Returns the last value for 0 for no handling, 1 for auto insertion.
6008 */
6009
6010 int
htmlHandleOmittedElem(int val)6011 htmlHandleOmittedElem(int val) {
6012 int old = htmlOmittedDefaultValue;
6013
6014 htmlOmittedDefaultValue = val;
6015 return(old);
6016 }
6017
6018 /**
6019 * htmlElementAllowedHere:
6020 * @parent: HTML parent element
6021 * @elt: HTML element
6022 *
6023 * Checks whether an HTML element may be a direct child of a parent element.
6024 * Note - doesn't check for deprecated elements
6025 *
6026 * Returns 1 if allowed; 0 otherwise.
6027 */
6028 int
htmlElementAllowedHere(const htmlElemDesc * parent,const xmlChar * elt)6029 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6030 const char** p ;
6031
6032 if ( ! elt || ! parent || ! parent->subelts )
6033 return 0 ;
6034
6035 for ( p = parent->subelts; *p; ++p )
6036 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6037 return 1 ;
6038
6039 return 0 ;
6040 }
6041 /**
6042 * htmlElementStatusHere:
6043 * @parent: HTML parent element
6044 * @elt: HTML element
6045 *
6046 * Checks whether an HTML element may be a direct child of a parent element.
6047 * and if so whether it is valid or deprecated.
6048 *
6049 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6050 */
6051 htmlStatus
htmlElementStatusHere(const htmlElemDesc * parent,const htmlElemDesc * elt)6052 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6053 if ( ! parent || ! elt )
6054 return HTML_INVALID ;
6055 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6056 return HTML_INVALID ;
6057
6058 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6059 }
6060 /**
6061 * htmlAttrAllowed:
6062 * @elt: HTML element
6063 * @attr: HTML attribute
6064 * @legacy: whether to allow deprecated attributes
6065 *
6066 * Checks whether an attribute is valid for an element
6067 * Has full knowledge of Required and Deprecated attributes
6068 *
6069 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6070 */
6071 htmlStatus
htmlAttrAllowed(const htmlElemDesc * elt,const xmlChar * attr,int legacy)6072 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6073 const char** p ;
6074
6075 if ( !elt || ! attr )
6076 return HTML_INVALID ;
6077
6078 if ( elt->attrs_req )
6079 for ( p = elt->attrs_req; *p; ++p)
6080 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6081 return HTML_REQUIRED ;
6082
6083 if ( elt->attrs_opt )
6084 for ( p = elt->attrs_opt; *p; ++p)
6085 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6086 return HTML_VALID ;
6087
6088 if ( legacy && elt->attrs_depr )
6089 for ( p = elt->attrs_depr; *p; ++p)
6090 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6091 return HTML_DEPRECATED ;
6092
6093 return HTML_INVALID ;
6094 }
6095 /**
6096 * htmlNodeStatus:
6097 * @node: an htmlNodePtr in a tree
6098 * @legacy: whether to allow deprecated elements (YES is faster here
6099 * for Element nodes)
6100 *
6101 * Checks whether the tree node is valid. Experimental (the author
6102 * only uses the HTML enhancements in a SAX parser)
6103 *
6104 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6105 * legacy allowed) or htmlElementStatusHere (otherwise).
6106 * for Attribute nodes, a return from htmlAttrAllowed
6107 * for other nodes, HTML_NA (no checks performed)
6108 */
6109 htmlStatus
htmlNodeStatus(htmlNodePtr node,int legacy)6110 htmlNodeStatus(htmlNodePtr node, int legacy) {
6111 if ( ! node )
6112 return HTML_INVALID ;
6113
6114 switch ( node->type ) {
6115 case XML_ELEMENT_NODE:
6116 return legacy
6117 ? ( htmlElementAllowedHere (
6118 htmlTagLookup(node->parent->name) , node->name
6119 ) ? HTML_VALID : HTML_INVALID )
6120 : htmlElementStatusHere(
6121 htmlTagLookup(node->parent->name) ,
6122 htmlTagLookup(node->name) )
6123 ;
6124 case XML_ATTRIBUTE_NODE:
6125 return htmlAttrAllowed(
6126 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6127 default: return HTML_NA ;
6128 }
6129 }
6130 /************************************************************************
6131 * *
6132 * New set (2.6.0) of simpler and more flexible APIs *
6133 * *
6134 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is used; a NULL dict means the string is always freed. A NULL @str
 * is ignored.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a
 * single statement: use it with a trailing semicolon.
 */
#define DICT_FREE(str)						\
    do {							\
        if ((str) && ((!dict) ||				\
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
            xmlFree((char *)(str));				\
    } while (0)
6146
6147 /**
6148 * htmlCtxtReset:
6149 * @ctxt: an HTML parser context
6150 *
6151 * Reset a parser context
6152 */
6153 void
htmlCtxtReset(htmlParserCtxtPtr ctxt)6154 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6155 {
6156 xmlParserInputPtr input;
6157 xmlDictPtr dict;
6158
6159 if (ctxt == NULL)
6160 return;
6161
6162 dict = ctxt->dict;
6163
6164 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6165 xmlFreeInputStream(input);
6166 }
6167 ctxt->inputNr = 0;
6168 ctxt->input = NULL;
6169
6170 ctxt->spaceNr = 0;
6171 if (ctxt->spaceTab != NULL) {
6172 ctxt->spaceTab[0] = -1;
6173 ctxt->space = &ctxt->spaceTab[0];
6174 } else {
6175 ctxt->space = NULL;
6176 }
6177
6178
6179 ctxt->nodeNr = 0;
6180 ctxt->node = NULL;
6181
6182 ctxt->nameNr = 0;
6183 ctxt->name = NULL;
6184
6185 ctxt->nsNr = 0;
6186
6187 DICT_FREE(ctxt->version);
6188 ctxt->version = NULL;
6189 DICT_FREE(ctxt->encoding);
6190 ctxt->encoding = NULL;
6191 DICT_FREE(ctxt->extSubURI);
6192 ctxt->extSubURI = NULL;
6193 DICT_FREE(ctxt->extSubSystem);
6194 ctxt->extSubSystem = NULL;
6195 if (ctxt->myDoc != NULL)
6196 xmlFreeDoc(ctxt->myDoc);
6197 ctxt->myDoc = NULL;
6198
6199 ctxt->standalone = -1;
6200 ctxt->hasExternalSubset = 0;
6201 ctxt->hasPErefs = 0;
6202 ctxt->html = 1;
6203 ctxt->instate = XML_PARSER_START;
6204
6205 ctxt->wellFormed = 1;
6206 ctxt->nsWellFormed = 1;
6207 ctxt->disableSAX = 0;
6208 ctxt->valid = 1;
6209 ctxt->vctxt.userData = ctxt;
6210 ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
6211 ctxt->vctxt.error = xmlParserValidityError;
6212 ctxt->vctxt.warning = xmlParserValidityWarning;
6213 ctxt->record_info = 0;
6214 ctxt->checkIndex = 0;
6215 ctxt->endCheckState = 0;
6216 ctxt->inSubset = 0;
6217 ctxt->errNo = XML_ERR_OK;
6218 ctxt->depth = 0;
6219 ctxt->catalogs = NULL;
6220 xmlInitNodeInfoSeq(&ctxt->node_seq);
6221
6222 if (ctxt->attsDefault != NULL) {
6223 xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6224 ctxt->attsDefault = NULL;
6225 }
6226 if (ctxt->attsSpecial != NULL) {
6227 xmlHashFree(ctxt->attsSpecial, NULL);
6228 ctxt->attsSpecial = NULL;
6229 }
6230
6231 ctxt->nbErrors = 0;
6232 ctxt->nbWarnings = 0;
6233 if (ctxt->lastError.code != XML_ERR_OK)
6234 xmlResetError(&ctxt->lastError);
6235 }
6236
6237 /**
6238 * htmlCtxtUseOptions:
6239 * @ctxt: an HTML parser context
6240 * @options: a combination of htmlParserOption(s)
6241 *
6242 * Applies the options to the parser context
6243 *
6244 * Returns 0 in case of success, the set of unknown or unimplemented options
6245 * in case of error.
6246 */
6247 int
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt,int options)6248 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6249 {
6250 if (ctxt == NULL)
6251 return(-1);
6252
6253 if (options & HTML_PARSE_NOWARNING) {
6254 ctxt->sax->warning = NULL;
6255 ctxt->vctxt.warning = NULL;
6256 options -= XML_PARSE_NOWARNING;
6257 ctxt->options |= XML_PARSE_NOWARNING;
6258 }
6259 if (options & HTML_PARSE_NOERROR) {
6260 ctxt->sax->error = NULL;
6261 ctxt->vctxt.error = NULL;
6262 ctxt->sax->fatalError = NULL;
6263 options -= XML_PARSE_NOERROR;
6264 ctxt->options |= XML_PARSE_NOERROR;
6265 }
6266 if (options & HTML_PARSE_PEDANTIC) {
6267 ctxt->pedantic = 1;
6268 options -= XML_PARSE_PEDANTIC;
6269 ctxt->options |= XML_PARSE_PEDANTIC;
6270 } else
6271 ctxt->pedantic = 0;
6272 if (options & XML_PARSE_NOBLANKS) {
6273 ctxt->keepBlanks = 0;
6274 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6275 options -= XML_PARSE_NOBLANKS;
6276 ctxt->options |= XML_PARSE_NOBLANKS;
6277 } else
6278 ctxt->keepBlanks = 1;
6279 if (options & HTML_PARSE_RECOVER) {
6280 ctxt->recovery = 1;
6281 options -= HTML_PARSE_RECOVER;
6282 } else
6283 ctxt->recovery = 0;
6284 if (options & HTML_PARSE_COMPACT) {
6285 ctxt->options |= HTML_PARSE_COMPACT;
6286 options -= HTML_PARSE_COMPACT;
6287 }
6288 if (options & XML_PARSE_HUGE) {
6289 ctxt->options |= XML_PARSE_HUGE;
6290 options -= XML_PARSE_HUGE;
6291 }
6292 if (options & HTML_PARSE_NODEFDTD) {
6293 ctxt->options |= HTML_PARSE_NODEFDTD;
6294 options -= HTML_PARSE_NODEFDTD;
6295 }
6296 if (options & HTML_PARSE_IGNORE_ENC) {
6297 ctxt->options |= HTML_PARSE_IGNORE_ENC;
6298 options -= HTML_PARSE_IGNORE_ENC;
6299 }
6300 if (options & HTML_PARSE_NOIMPLIED) {
6301 ctxt->options |= HTML_PARSE_NOIMPLIED;
6302 options -= HTML_PARSE_NOIMPLIED;
6303 }
6304 ctxt->dictNames = 0;
6305 ctxt->linenumbers = 1;
6306 return (options);
6307 }
6308
6309 /**
6310 * htmlCtxtParseDocument:
6311 * @ctxt: an HTML parser context
6312 * @input: parser input
6313 *
6314 * Parse an HTML document and return the resulting document tree.
6315 *
6316 * Available since 2.13.0.
6317 *
6318 * Returns the resulting document tree or NULL
6319 */
6320 htmlDocPtr
htmlCtxtParseDocument(htmlParserCtxtPtr ctxt,xmlParserInputPtr input)6321 htmlCtxtParseDocument(htmlParserCtxtPtr ctxt, xmlParserInputPtr input)
6322 {
6323 htmlDocPtr ret;
6324
6325 if ((ctxt == NULL) || (input == NULL))
6326 return(NULL);
6327
6328 /* assert(ctxt->inputNr == 0); */
6329 while (ctxt->inputNr > 0)
6330 xmlFreeInputStream(inputPop(ctxt));
6331
6332 if (inputPush(ctxt, input) < 0) {
6333 xmlFreeInputStream(input);
6334 return(NULL);
6335 }
6336
6337 ctxt->html = 1;
6338 htmlParseDocument(ctxt);
6339
6340 if (ctxt->errNo != XML_ERR_NO_MEMORY) {
6341 ret = ctxt->myDoc;
6342 } else {
6343 ret = NULL;
6344 xmlFreeDoc(ctxt->myDoc);
6345 }
6346 ctxt->myDoc = NULL;
6347
6348 /* assert(ctxt->inputNr == 1); */
6349 while (ctxt->inputNr > 0)
6350 xmlFreeInputStream(inputPop(ctxt));
6351
6352 return(ret);
6353 }
6354
6355 /**
6356 * htmlReadDoc:
6357 * @str: a pointer to a zero terminated string
6358 * @url: only used for error reporting (optoinal)
6359 * @encoding: the document encoding (optional)
6360 * @options: a combination of htmlParserOptions
6361 *
6362 * Convenience function to parse an HTML document from a zero-terminated
6363 * string.
6364 *
6365 * See htmlCtxtReadDoc for details.
6366 *
6367 * Returns the resulting document tree.
6368 */
6369 htmlDocPtr
htmlReadDoc(const xmlChar * str,const char * url,const char * encoding,int options)6370 htmlReadDoc(const xmlChar *str, const char *url, const char *encoding,
6371 int options)
6372 {
6373 htmlParserCtxtPtr ctxt;
6374 xmlParserInputPtr input;
6375 htmlDocPtr doc;
6376
6377 ctxt = htmlNewParserCtxt();
6378 if (ctxt == NULL)
6379 return(NULL);
6380
6381 htmlCtxtUseOptions(ctxt, options);
6382
6383 input = xmlNewInputString(ctxt, url, (const char *) str, encoding,
6384 XML_INPUT_BUF_STATIC);
6385
6386 doc = htmlCtxtParseDocument(ctxt, input);
6387
6388 htmlFreeParserCtxt(ctxt);
6389 return(doc);
6390 }
6391
6392 /**
6393 * htmlReadFile:
6394 * @filename: a file or URL
6395 * @encoding: the document encoding (optional)
6396 * @options: a combination of htmlParserOptions
6397 *
6398 * Convenience function to parse an HTML file from the filesystem,
6399 * the network or a global user-defined resource loader.
6400 *
6401 * See htmlCtxtReadFile for details.
6402 *
6403 * Returns the resulting document tree.
6404 */
6405 htmlDocPtr
htmlReadFile(const char * filename,const char * encoding,int options)6406 htmlReadFile(const char *filename, const char *encoding, int options)
6407 {
6408 htmlParserCtxtPtr ctxt;
6409 xmlParserInputPtr input;
6410 htmlDocPtr doc;
6411
6412 ctxt = htmlNewParserCtxt();
6413 if (ctxt == NULL)
6414 return(NULL);
6415
6416 htmlCtxtUseOptions(ctxt, options);
6417
6418 input = xmlNewInputURL(ctxt, filename, NULL, encoding, 0);
6419
6420 doc = htmlCtxtParseDocument(ctxt, input);
6421
6422 htmlFreeParserCtxt(ctxt);
6423 return(doc);
6424 }
6425
6426 /**
6427 * htmlReadMemory:
6428 * @buffer: a pointer to a char array
6429 * @size: the size of the array
6430 * @url: only used for error reporting (optional)
6431 * @encoding: the document encoding, or NULL
6432 * @options: a combination of htmlParserOption(s)
6433 *
6434 * Convenience function to parse an HTML document from memory.
6435 * The input buffer must not contain any terminating null bytes.
6436 *
6437 * See htmlCtxtReadMemory for details.
6438 *
6439 * Returns the resulting document tree
6440 */
6441 htmlDocPtr
htmlReadMemory(const char * buffer,int size,const char * url,const char * encoding,int options)6442 htmlReadMemory(const char *buffer, int size, const char *url,
6443 const char *encoding, int options)
6444 {
6445 htmlParserCtxtPtr ctxt;
6446 xmlParserInputPtr input;
6447 htmlDocPtr doc;
6448
6449 if (size < 0)
6450 return(NULL);
6451
6452 ctxt = htmlNewParserCtxt();
6453 if (ctxt == NULL)
6454 return(NULL);
6455
6456 htmlCtxtUseOptions(ctxt, options);
6457
6458 input = xmlNewInputMemory(ctxt, url, buffer, size, encoding,
6459 XML_INPUT_BUF_STATIC);
6460
6461 doc = htmlCtxtParseDocument(ctxt, input);
6462
6463 htmlFreeParserCtxt(ctxt);
6464 return(doc);
6465 }
6466
6467 /**
6468 * htmlReadFd:
6469 * @fd: an open file descriptor
6470 * @url: only used for error reporting (optional)
6471 * @encoding: the document encoding, or NULL
6472 * @options: a combination of htmlParserOptions
6473 *
6474 * Convenience function to parse an HTML document from a
6475 * file descriptor.
6476 *
6477 * NOTE that the file descriptor will not be closed when the
6478 * context is freed or reset.
6479 *
6480 * See htmlCtxtReadFd for details.
6481 *
6482 * Returns the resulting document tree
6483 */
6484 htmlDocPtr
htmlReadFd(int fd,const char * url,const char * encoding,int options)6485 htmlReadFd(int fd, const char *url, const char *encoding, int options)
6486 {
6487 htmlParserCtxtPtr ctxt;
6488 xmlParserInputPtr input;
6489 htmlDocPtr doc;
6490
6491 ctxt = htmlNewParserCtxt();
6492 if (ctxt == NULL)
6493 return(NULL);
6494
6495 htmlCtxtUseOptions(ctxt, options);
6496
6497 input = xmlNewInputFd(ctxt, url, fd, encoding, 0);
6498
6499 doc = htmlCtxtParseDocument(ctxt, input);
6500
6501 htmlFreeParserCtxt(ctxt);
6502 return(doc);
6503 }
6504
6505 /**
6506 * htmlReadIO:
6507 * @ioread: an I/O read function
6508 * @ioclose: an I/O close function (optional)
6509 * @ioctx: an I/O handler
6510 * @url: only used for error reporting (optional)
6511 * @encoding: the document encoding (optional)
6512 * @options: a combination of htmlParserOption(s)
6513 *
6514 * Convenience function to parse an HTML document from I/O functions
6515 * and context.
6516 *
6517 * See htmlCtxtReadIO for details.
6518 *
6519 * Returns the resulting document tree
6520 */
6521 htmlDocPtr
htmlReadIO(xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * url,const char * encoding,int options)6522 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6523 void *ioctx, const char *url, const char *encoding, int options)
6524 {
6525 htmlParserCtxtPtr ctxt;
6526 xmlParserInputPtr input;
6527 htmlDocPtr doc;
6528
6529 ctxt = htmlNewParserCtxt();
6530 if (ctxt == NULL)
6531 return (NULL);
6532
6533 htmlCtxtUseOptions(ctxt, options);
6534
6535 input = xmlNewInputIO(ctxt, url, ioread, ioclose, ioctx, encoding, 0);
6536
6537 doc = htmlCtxtParseDocument(ctxt, input);
6538
6539 htmlFreeParserCtxt(ctxt);
6540 return(doc);
6541 }
6542
6543 /**
6544 * htmlCtxtReadDoc:
6545 * @ctxt: an HTML parser context
6546 * @str: a pointer to a zero terminated string
6547 * @URL: only used for error reporting (optional)
6548 * @encoding: the document encoding (optional)
6549 * @options: a combination of htmlParserOptions
6550 *
6551 * Parse an HTML in-memory document and build a tree.
6552 *
6553 * See htmlCtxtUseOptions for details.
6554 *
6555 * Returns the resulting document tree
6556 */
6557 htmlDocPtr
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt,const xmlChar * str,const char * URL,const char * encoding,int options)6558 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar *str,
6559 const char *URL, const char *encoding, int options)
6560 {
6561 xmlParserInputPtr input;
6562
6563 if (ctxt == NULL)
6564 return (NULL);
6565
6566 htmlCtxtReset(ctxt);
6567 htmlCtxtUseOptions(ctxt, options);
6568
6569 input = xmlNewInputString(ctxt, URL, (const char *) str, encoding, 0);
6570
6571 return(htmlCtxtParseDocument(ctxt, input));
6572 }
6573
6574 /**
6575 * htmlCtxtReadFile:
6576 * @ctxt: an HTML parser context
6577 * @filename: a file or URL
6578 * @encoding: the document encoding (optional)
6579 * @options: a combination of htmlParserOptions
6580 *
6581 * Parse an HTML file from the filesystem, the network or a
6582 * user-defined resource loader.
6583 *
6584 * See xmlNewInputURL and htmlCtxtUseOptions for details.
6585 *
6586 * Returns the resulting document tree
6587 */
6588 htmlDocPtr
htmlCtxtReadFile(htmlParserCtxtPtr ctxt,const char * filename,const char * encoding,int options)6589 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6590 const char *encoding, int options)
6591 {
6592 xmlParserInputPtr input;
6593
6594 if (ctxt == NULL)
6595 return (NULL);
6596
6597 htmlCtxtReset(ctxt);
6598 htmlCtxtUseOptions(ctxt, options);
6599
6600 input = xmlNewInputURL(ctxt, filename, NULL, encoding, 0);
6601
6602 return(htmlCtxtParseDocument(ctxt, input));
6603 }
6604
6605 /**
6606 * htmlCtxtReadMemory:
6607 * @ctxt: an HTML parser context
6608 * @buffer: a pointer to a char array
6609 * @size: the size of the array
6610 * @URL: only used for error reporting (optional)
6611 * @encoding: the document encoding (optinal)
6612 * @options: a combination of htmlParserOptions
6613 *
6614 * Parse an HTML in-memory document and build a tree. The input buffer must
6615 * not contain any terminating null bytes.
6616 *
6617 * See htmlCtxtUseOptions for details.
6618 *
6619 * Returns the resulting document tree
6620 */
6621 htmlDocPtr
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt,const char * buffer,int size,const char * URL,const char * encoding,int options)6622 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6623 const char *URL, const char *encoding, int options)
6624 {
6625 xmlParserInputPtr input;
6626
6627 if ((ctxt == NULL) || (size < 0))
6628 return (NULL);
6629
6630 htmlCtxtReset(ctxt);
6631 htmlCtxtUseOptions(ctxt, options);
6632
6633 input = xmlNewInputMemory(ctxt, URL, buffer, size, encoding,
6634 XML_INPUT_BUF_STATIC);
6635
6636 return(htmlCtxtParseDocument(ctxt, input));
6637 }
6638
6639 /**
6640 * htmlCtxtReadFd:
6641 * @ctxt: an HTML parser context
6642 * @fd: an open file descriptor
6643 * @URL: only used for error reporting (optional)
6644 * @encoding: the document encoding (optinal)
6645 * @options: a combination of htmlParserOptions
6646 *
6647 * Parse an HTML from a file descriptor and build a tree.
6648 *
6649 * See htmlCtxtUseOptions for details.
6650 *
6651 * NOTE that the file descriptor will not be closed when the
6652 * context is freed or reset.
6653 *
6654 * Returns the resulting document tree
6655 */
6656 htmlDocPtr
htmlCtxtReadFd(htmlParserCtxtPtr ctxt,int fd,const char * URL,const char * encoding,int options)6657 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6658 const char *URL, const char *encoding, int options)
6659 {
6660 xmlParserInputPtr input;
6661
6662 if (ctxt == NULL)
6663 return(NULL);
6664
6665 htmlCtxtReset(ctxt);
6666 htmlCtxtUseOptions(ctxt, options);
6667
6668 input = xmlNewInputFd(ctxt, URL, fd, encoding, 0);
6669
6670 return(htmlCtxtParseDocument(ctxt, input));
6671 }
6672
6673 /**
6674 * htmlCtxtReadIO:
6675 * @ctxt: an HTML parser context
6676 * @ioread: an I/O read function
6677 * @ioclose: an I/O close function
6678 * @ioctx: an I/O handler
6679 * @URL: the base URL to use for the document
6680 * @encoding: the document encoding, or NULL
6681 * @options: a combination of htmlParserOption(s)
6682 *
6683 * Parse an HTML document from I/O functions and source and build a tree.
6684 *
6685 * See xmlNewInputIO and htmlCtxtUseOptions for details.
6686 *
6687 * Returns the resulting document tree
6688 */
6689 htmlDocPtr
htmlCtxtReadIO(htmlParserCtxtPtr ctxt,xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)6690 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6691 xmlInputCloseCallback ioclose, void *ioctx,
6692 const char *URL,
6693 const char *encoding, int options)
6694 {
6695 xmlParserInputPtr input;
6696
6697 if (ctxt == NULL)
6698 return (NULL);
6699
6700 htmlCtxtReset(ctxt);
6701 htmlCtxtUseOptions(ctxt, options);
6702
6703 input = xmlNewInputIO(ctxt, URL, ioread, ioclose, ioctx, encoding, 0);
6704
6705 return(htmlCtxtParseDocument(ctxt, input));
6706 }
6707
6708 #endif /* LIBXML_HTML_ENABLED */
6709