1 /*
2 * HTMLparser.c : an HTML parser
3 *
4 * References:
5 * HTML Living Standard
6 * https://html.spec.whatwg.org/multipage/parsing.html
7 *
8 * Tokenization now conforms to HTML5. Tree construction still follows
9 * a custom, non-standard implementation. See:
10 *
11 * https://gitlab.gnome.org/GNOME/libxml2/-/issues/211
12 *
13 * See Copyright for the status of this software.
14 *
15 * daniel@veillard.com
16 */
17
18 #define IN_LIBXML
19 #include "libxml.h"
20 #ifdef LIBXML_HTML_ENABLED
21
22 #include <string.h>
23 #include <ctype.h>
24 #include <stdlib.h>
25
26 #include <libxml/HTMLparser.h>
27 #include <libxml/xmlmemory.h>
28 #include <libxml/tree.h>
29 #include <libxml/parser.h>
30 #include <libxml/parserInternals.h>
31 #include <libxml/xmlerror.h>
32 #include <libxml/HTMLtree.h>
33 #include <libxml/entities.h>
34 #include <libxml/encoding.h>
35 #include <libxml/xmlIO.h>
36 #include <libxml/uri.h>
37
38 #include "private/buf.h"
39 #include "private/dict.h"
40 #include "private/enc.h"
41 #include "private/error.h"
42 #include "private/html.h"
43 #include "private/io.h"
44 #include "private/memory.h"
45 #include "private/parser.h"
46 #include "private/tree.h"
47
/* Size limits and scratch buffer sizes used by the HTML parser. */
#define HTML_MAX_NAMELEN 1000
#define HTML_MAX_ATTRS 100000000 /* 100 million */
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/*
 * ASCII character classification helpers.
 *
 * Every macro below evaluates its argument more than once, so the
 * argument must not have side effects.
 */

/* Space, TAB (0x09), LF (0x0A), FF (0x0C) or CR (0x0D); VT (0x0B) is excluded. */
#define IS_WS_HTML(c) \
    (((c) == 0x20) || ((c) == 0x09) || ((c) == 0x0A) || \
     ((c) == 0x0C) || ((c) == 0x0D))

/* 0-9, a-f or A-F. */
#define IS_HEX_DIGIT(c) \
    ((IS_ASCII_DIGIT(c)) || \
     (((c) >= 'a') && ((c) <= 'f')) || \
     (((c) >= 'A') && ((c) <= 'F')))

/* A-Z. */
#define IS_UPPER(c) \
    (('A' <= (c)) && ((c) <= 'Z'))

/* ASCII letter or ASCII digit. */
#define IS_ALNUM(c) \
    (IS_ASCII_LETTER(c) || IS_ASCII_DIGIT(c))
66
/*
 * Membership set over the code points 0..63, stored as two words of
 * 32 flags each: word 0 holds the flags for 0..31 and word 1 the flags
 * for 32..63. Within a word, code point c uses bit (c & 31).
 */
typedef const unsigned htmlAsciiMask[2];

/* Flag bit for code point c inside its word. */
#define HTML_MASK_BIT(c) (1u << ((c) & 31))

/* Double quote. */
static htmlAsciiMask MASK_DQ = {
    0,
    HTML_MASK_BIT('"'),
};
/* Single quote. */
static htmlAsciiMask MASK_SQ = {
    0,
    HTML_MASK_BIT('\''),
};
/* Greater-than sign. */
static htmlAsciiMask MASK_GT = {
    0,
    HTML_MASK_BIT('>'),
};
/* Hyphen-minus. */
static htmlAsciiMask MASK_DASH = {
    0,
    HTML_MASK_BIT('-'),
};
/* TAB, LF, FF, CR, space or greater-than sign. */
static htmlAsciiMask MASK_WS_GT = {
    HTML_MASK_BIT(0x09) | HTML_MASK_BIT(0x0A) |
    HTML_MASK_BIT(0x0C) | HTML_MASK_BIT(0x0D),
    HTML_MASK_BIT(' ') | HTML_MASK_BIT('>'),
};
/* Double quote or greater-than sign. */
static htmlAsciiMask MASK_DQ_GT = {
    0,
    HTML_MASK_BIT('"') | HTML_MASK_BIT('>'),
};
/* Single quote or greater-than sign. */
static htmlAsciiMask MASK_SQ_GT = {
    0,
    HTML_MASK_BIT('\'') | HTML_MASK_BIT('>'),
};
97
/*
 * File-local flag, initialized to 1.
 * NOTE(review): its readers are not in this part of the file; confirm
 * its meaning against them before changing the default.
 */
static int htmlOmittedDefaultValue = 1;

/*
 * Forward declaration so that code can call this function before its
 * definition appears.
 */
static int
htmlParseElementInternal(htmlParserCtxtPtr ctxt);
102
103 /************************************************************************
104 * *
105 * Some factorized error routines *
106 * *
107 ************************************************************************/
108
/**
 * htmlErrMemory:
 * @ctxt:  an HTML parser context
 *
 * Report a memory allocation failure. This simply forwards the context
 * to xmlCtxtErrMemory().
 */
static void
htmlErrMemory(xmlParserCtxtPtr ctxt)
{
    xmlCtxtErrMemory(ctxt);
}
121
/**
 * htmlParseErr:
 * @ctxt:  an HTML parser context
 * @error:  the error number
 * @msg:  the error message, used as a format string
 * @str1:  first string argument, may be NULL
 * @str2:  second string argument, may be NULL
 *
 * Report an HTML parser error. The call is forwarded to xmlCtxtErr()
 * with the domain XML_FROM_HTML and the level XML_ERR_ERROR.
 * @str1 and @str2 are passed twice: once in the fixed argument list
 * and once as the trailing arguments that follow @msg.
 */
static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
             const char *msg, const xmlChar *str1, const xmlChar *str2)
{
    xmlCtxtErr(ctxt, NULL, XML_FROM_HTML, error, XML_ERR_ERROR,
               str1, str2, NULL, 0, msg, str1, str2);
}
139
140 /************************************************************************
141 * *
142 * Parser stacks related functions and macros *
143 * *
144 ************************************************************************/
145
146 /**
147 * htmlnamePush:
148 * @ctxt: an HTML parser context
149 * @value: the element name
150 *
151 * Pushes a new element name on top of the name stack
152 *
153 * Returns -1 in case of error, the index in the stack otherwise
154 */
155 static int
htmlnamePush(htmlParserCtxtPtr ctxt,const xmlChar * value)156 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
157 {
158 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
159 ctxt->html = 3;
160 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
161 ctxt->html = 10;
162 if (ctxt->nameNr >= ctxt->nameMax) {
163 const xmlChar **tmp;
164 int newSize;
165
166 newSize = xmlGrowCapacity(ctxt->nameMax, sizeof(tmp[0]),
167 10, XML_MAX_ITEMS);
168 if (newSize < 0) {
169 htmlErrMemory(ctxt);
170 return (-1);
171 }
172 tmp = xmlRealloc(ctxt->nameTab, newSize * sizeof(tmp[0]));
173 if (tmp == NULL) {
174 htmlErrMemory(ctxt);
175 return(-1);
176 }
177 ctxt->nameTab = tmp;
178 ctxt->nameMax = newSize;
179 }
180 ctxt->nameTab[ctxt->nameNr] = value;
181 ctxt->name = value;
182 return (ctxt->nameNr++);
183 }
184 /**
185 * htmlnamePop:
186 * @ctxt: an HTML parser context
187 *
188 * Pops the top element name from the name stack
189 *
190 * Returns the name just removed
191 */
192 static const xmlChar *
htmlnamePop(htmlParserCtxtPtr ctxt)193 htmlnamePop(htmlParserCtxtPtr ctxt)
194 {
195 const xmlChar *ret;
196
197 if (ctxt->nameNr <= 0)
198 return (NULL);
199 ctxt->nameNr--;
200 if (ctxt->nameNr < 0)
201 return (NULL);
202 if (ctxt->nameNr > 0)
203 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
204 else
205 ctxt->name = NULL;
206 ret = ctxt->nameTab[ctxt->nameNr];
207 ctxt->nameTab[ctxt->nameNr] = NULL;
208 return (ret);
209 }
210
211 /**
212 * htmlNodeInfoPush:
213 * @ctxt: an HTML parser context
214 * @value: the node info
215 *
216 * Pushes a new element name on top of the node info stack
217 *
218 * Returns 0 in case of error, the index in the stack otherwise
219 */
220 static int
htmlNodeInfoPush(htmlParserCtxtPtr ctxt,htmlParserNodeInfo * value)221 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
222 {
223 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
224 xmlParserNodeInfo *tmp;
225 int newSize;
226
227 newSize = xmlGrowCapacity(ctxt->nodeInfoMax, sizeof(tmp[0]),
228 5, XML_MAX_ITEMS);
229 if (newSize < 0) {
230 htmlErrMemory(ctxt);
231 return (0);
232 }
233 tmp = xmlRealloc(ctxt->nodeInfoTab, newSize * sizeof(tmp[0]));
234 if (tmp == NULL) {
235 htmlErrMemory(ctxt);
236 return (0);
237 }
238 ctxt->nodeInfoTab = tmp;
239 ctxt->nodeInfoMax = newSize;
240 }
241 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
242 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
243 return (ctxt->nodeInfoNr++);
244 }
245
246 /**
247 * htmlNodeInfoPop:
248 * @ctxt: an HTML parser context
249 *
250 * Pops the top element name from the node info stack
251 *
252 * Returns 0 in case of error, the pointer to NodeInfo otherwise
253 */
254 static htmlParserNodeInfo *
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)255 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
256 {
257 if (ctxt->nodeInfoNr <= 0)
258 return (NULL);
259 ctxt->nodeInfoNr--;
260 if (ctxt->nodeInfoNr < 0)
261 return (NULL);
262 if (ctxt->nodeInfoNr > 0)
263 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
264 else
265 ctxt->nodeInfo = NULL;
266 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
267 }
268
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named ctxt in scope.
 *
 * Dirty macros, i.e. one needs to make assumptions about the context to
 * use them
 *
 *   CUR_PTR  returns the current pointer to the xmlChar to be parsed.
 *   BASE_PTR returns the base pointer of the current input.
 *   CUR      returns the current xmlChar value, i.e. a 8 bit value if compiled
 *            in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *            in UNICODE mode. This should be used internally by the parser
 *            only to compare to ASCII values otherwise it would break when
 *            running with UTF-8 encoding.
 *   UPPER    returns the current xmlChar converted to uppercase.
 *   NXT(n)   returns the n'th next xmlChar. Same as CUR it should be used only
 *            to compare on ASCII based substring.
 *   UPP(n)   returns the n'th next xmlChar converted to uppercase. Same as CUR
 *            it should be used only to compare on ASCII based substring.
 *   SKIP(n)  Skip n xmlChar, and must also be used only to skip ASCII defined
 *            strings without newlines within the parser.
 *
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 *
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 *            NOTE(review): no COPY macro is defined in this part of the
 *            file; this entry may be stale.
 */

#define UPPER (toupper(*ctxt->input->cur))

/* Advances cur and col only; the line counter is left untouched. */
#define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val)

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

/*
 * Calls xmlParserShrink() when the parser is not progressive, more than
 * 2 * INPUT_CHUNK bytes lie between base and cur, and fewer than
 * 2 * INPUT_CHUNK bytes remain between cur and end.
 *
 * Expands to a bare if statement that already ends with a semicolon:
 * do not use it as the unbraced body of an if that has an else branch.
 */
#define SHRINK \
    if ((!PARSER_PROGRESSIVE(ctxt)) && \
        (ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserShrink(ctxt);

/*
 * Calls xmlParserGrow() when the parser is not progressive and fewer
 * than INPUT_CHUNK bytes remain between cur and end.
 *
 * Same caveat as SHRINK: this is a bare if statement.
 */
#define GROW \
    if ((!PARSER_PROGRESSIVE(ctxt)) && \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserGrow(ctxt);

/* Skips HTML whitespace at the current position, see htmlSkipBlankChars(). */
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR (*ctxt->input->cur)
320
321 /**
322 * htmlFindEncoding:
323 * @the HTML parser context
324 *
325 * Ty to find and encoding in the current data available in the input
326 * buffer this is needed to try to switch to the proper encoding when
327 * one face a character error.
328 * That's an heuristic, since it's operating outside of parsing it could
329 * try to use a meta which had been commented out, that's the reason it
330 * should only be used in case of error, not as a default.
331 *
332 * Returns an encoding string or NULL if not found, the string need to
333 * be freed
334 */
335 static xmlChar *
htmlFindEncoding(xmlParserCtxtPtr ctxt)336 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
337 const xmlChar *start, *cur, *end;
338 xmlChar *ret;
339
340 if ((ctxt == NULL) || (ctxt->input == NULL) ||
341 (ctxt->input->flags & XML_INPUT_HAS_ENCODING))
342 return(NULL);
343 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
344 return(NULL);
345
346 start = ctxt->input->cur;
347 end = ctxt->input->end;
348 /* we also expect the input buffer to be zero terminated */
349 if (*end != 0)
350 return(NULL);
351
352 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
353 if (cur == NULL)
354 return(NULL);
355 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
356 if (cur == NULL)
357 return(NULL);
358 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
359 if (cur == NULL)
360 return(NULL);
361 cur += 8;
362 start = cur;
363 while ((IS_ALNUM(*cur)) ||
364 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
365 cur++;
366 if (cur == start)
367 return(NULL);
368 ret = xmlStrndup(start, cur - start);
369 if (ret == NULL)
370 htmlErrMemory(ctxt);
371 return(ret);
372 }
373
374 static int
htmlMaskMatch(htmlAsciiMask mask,unsigned c)375 htmlMaskMatch(htmlAsciiMask mask, unsigned c) {
376 if (c >= 64)
377 return(0);
378 return((mask[c/32] >> (c & 31)) & 1);
379 }
380
/*
 * Validates the multi-byte UTF-8 sequence starting at str[0].
 *
 * @ctxt:  the parser context, used for error reporting
 * @str:  pointer to the lead byte; str[0] is read unconditionally
 * @len:  number of bytes available at @str
 * @partial:  if non-zero, a sequence cut short by @len is not an error
 *
 * Lead bytes below 0xC2 are rejected, which includes ASCII bytes.
 * NOTE(review): presumably callers handle ASCII themselves and only
 * pass bytes >= 0x80 - confirm against the call sites.
 *
 * Returns the length of the sequence (2, 3 or 4) if it is valid, 0 if
 * the sequence is incomplete and @partial is set, and -1 otherwise.
 * An error is raised only for the first failure on an input, tracked
 * with the XML_INPUT_ENCODING_ERROR flag.
 */
static int
htmlValidateUtf8(xmlParserCtxtPtr ctxt, const xmlChar *str, size_t len,
                 int partial) {
    unsigned c = str[0];
    int size;

    if (c < 0xC2) {
        /*
         * Not a usable lead byte: 0x80-0xBF are continuation bytes and
         * 0xC0/0xC1 could only start overlong two-byte sequences.
         */
        goto invalid;
    } else if (c < 0xE0) {
        /* Two-byte sequence: one continuation byte (10xxxxxx). */
        if (len < 2)
            goto incomplete;
        if ((str[1] & 0xC0) != 0x80)
            goto invalid;
        size = 2;
    } else if (c < 0xF0) {
        unsigned v;

        if (len < 3)
            goto incomplete;

        /* Pack the three bytes as 0x00LLB1B2, lead byte in bits 16-23. */
        v = str[1] << 8 | str[2]; /* hint to generate 16-bit load */
        v |= c << 16;

        /*
         * 1st test: both trailing bytes must be continuation bytes.
         * 2nd test: lead 0xE0 with a second byte in 0x80-0x9F is an
         *           overlong encoding.
         * 3rd test: lead 0xED with a second byte in 0xA0-0xBF encodes
         *           a surrogate (U+D800-U+DFFF).
         */
        if (((v & 0x00C0C0) != 0x008080) ||
            ((v & 0x0F2000) == 0x000000) ||
            ((v & 0x0F2000) == 0x0D2000))
            goto invalid;

        size = 3;
    } else {
        unsigned v;

        if (len < 4)
            goto incomplete;

        /* Pack the four bytes big-endian, lead byte in bits 24-31. */
        v = c << 24 | str[1] << 16 | str[2] << 8 | str[3];

        /*
         * 1st test: all three trailing bytes must be continuation bytes.
         * 2nd test: anything below F0 90 .. .. is an overlong encoding.
         * 3rd test: F4 90 .. .. and above is beyond U+10FFFF; this also
         *           rejects the lead bytes 0xF5-0xFF.
         */
        if (((v & 0x00C0C0C0) != 0x00808080) ||
            (v < 0xF0900000) || (v >= 0xF4900000))
            goto invalid;

        size = 4;
    }

    return(size);

incomplete:
    if (partial)
        return(0);
    /* Without @partial, a truncated sequence falls through as invalid. */

invalid:
    /* Only report the first error */
    if ((ctxt->input->flags & XML_INPUT_ENCODING_ERROR) == 0) {
        htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
                     "Invalid bytes in character encoding", NULL, NULL);
        ctxt->input->flags |= XML_INPUT_ENCODING_ERROR;
    }

    return(-1);
}
441
442 /**
443 * htmlSkipBlankChars:
444 * @ctxt: the HTML parser context
445 *
446 * skip all blanks character found at that point in the input streams.
447 *
448 * Returns the number of space chars skipped
449 */
450
451 static int
htmlSkipBlankChars(xmlParserCtxtPtr ctxt)452 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
453 const xmlChar *cur = ctxt->input->cur;
454 size_t avail = ctxt->input->end - cur;
455 int res = 0;
456 int line = ctxt->input->line;
457 int col = ctxt->input->col;
458
459 while (!PARSER_STOPPED(ctxt)) {
460 if (avail == 0) {
461 ctxt->input->cur = cur;
462 GROW;
463 cur = ctxt->input->cur;
464 avail = ctxt->input->end - cur;
465
466 if (avail == 0)
467 break;
468 }
469
470 if (*cur == '\n') {
471 line++;
472 col = 1;
473 } else if (IS_WS_HTML(*cur)) {
474 col++;
475 } else {
476 break;
477 }
478
479 cur += 1;
480 avail -= 1;
481
482 if (res < INT_MAX)
483 res++;
484 }
485
486 ctxt->input->cur = cur;
487 ctxt->input->line = line;
488 ctxt->input->col = col;
489
490 if (res > 8)
491 GROW;
492
493 return(res);
494 }
495
496
497
498 /************************************************************************
499 * *
500 * The list of HTML elements and their properties *
501 * *
502 ************************************************************************/
503
/*
 * Legend for the leading columns of html40ElementTable:
 *
 * Start Tag: 1 means the start tag can be omitted
 * End Tag:   1 means the end tag can be omitted
 *            2 means it's forbidden (empty elements)
 *            3 means the tag is stylistic and should be closed easily
 * Depr:      1 means this element is deprecated
 * DTD:       1 means that this element is valid only in the Loose DTD
 *            2 means that this element is valid only in the Frameset DTD
 *
 * Column order:
 * Name, Start Tag, End Tag, Save End, Empty, Deprecated, DTD, inline,
 * Description
 */
515
/*
 * Content data modes. One of these values, or 0, is stored in the last
 * field of each html40ElementTable entry.
 * NOTE(review): the names appear to mirror the HTML tokenizer states
 * (RCDATA, RAWTEXT, PLAINTEXT, script data and its escaped variants);
 * confirm against the code that consumes them.
 */
#define DATA_RCDATA         1
#define DATA_RAWTEXT        2
#define DATA_PLAINTEXT      3
#define DATA_SCRIPT         4
#define DATA_SCRIPT_ESC1    5
#define DATA_SCRIPT_ESC2    6
522
/*
 * Table of the known HTML elements.
 *
 * htmlTagLookup() searches this table with bsearch() using
 * htmlCompareTags(), so the entries must stay sorted by name in the
 * order defined by xmlStrcasecmp().
 *
 * Each entry holds the name, seven small integer flags, a description
 * string, five pointer fields which are all NULL here, and finally the
 * data mode (0 or one of the DATA_* constants).
 */
static const htmlElemDesc
html40ElementTable[] = {
{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
    NULL, NULL, NULL, NULL, NULL,
    DATA_RAWTEXT
},
{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "noembed", 0, 0, 0, 0, 0, 0, 0, "",
    NULL, NULL, NULL, NULL, NULL,
    DATA_RAWTEXT
},
{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
    NULL, NULL, NULL, NULL, NULL,
    DATA_RAWTEXT
},
{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "plaintext", 0, 0, 0, 0, 0, 0, 0, "",
    NULL, NULL, NULL, NULL, NULL,
    DATA_PLAINTEXT
},
{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
    NULL, NULL, NULL, NULL, NULL,
    DATA_SCRIPT
},
{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
    NULL, NULL, NULL, NULL, NULL,
    DATA_RAWTEXT
},
{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "table", 0, 0, 0, 0, 0, 0, 0, "",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
    NULL, NULL, NULL, NULL, NULL,
    DATA_RCDATA
},
{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
    NULL, NULL, NULL, NULL, NULL,
    DATA_RCDATA
},
{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
    NULL, NULL, NULL, NULL, NULL,
    0
},
{ "xmp", 0, 0, 0, 0, 0, 0, 1, "",
    NULL, NULL, NULL, NULL, NULL,
    DATA_RAWTEXT
}
};
906
/*
 * One row of the htmlStartClose table: a pair of element names.
 * NOTE(review): going by the table's own comment, oldTag looks like the
 * element currently open and newTag the start tag that implies its
 * end - confirm against the lookup code.
 */
typedef struct {
    const char *oldTag;
    const char *newTag;
} htmlStartCloseEntry;
911
/*
 * start tags that imply the end of current element
 *
 * The rows are listed in ascending order of oldTag, then newTag.
 * NOTE(review): the lookup code is not in this part of the file; it
 * presumably relies on this ordering, so keep the table sorted when
 * adding rows.
 */
static const htmlStartCloseEntry htmlStartClose[] = {
    { "a", "a" },
    { "a", "fieldset" },
    { "a", "table" },
    { "a", "td" },
    { "a", "th" },
    { "address", "dd" },
    { "address", "dl" },
    { "address", "dt" },
    { "address", "form" },
    { "address", "li" },
    { "address", "ul" },
    { "b", "center" },
    { "b", "p" },
    { "b", "td" },
    { "b", "th" },
    { "big", "p" },
    { "caption", "col" },
    { "caption", "colgroup" },
    { "caption", "tbody" },
    { "caption", "tfoot" },
    { "caption", "thead" },
    { "caption", "tr" },
    { "col", "col" },
    { "col", "colgroup" },
    { "col", "tbody" },
    { "col", "tfoot" },
    { "col", "thead" },
    { "col", "tr" },
    { "colgroup", "colgroup" },
    { "colgroup", "tbody" },
    { "colgroup", "tfoot" },
    { "colgroup", "thead" },
    { "colgroup", "tr" },
    { "dd", "dt" },
    { "dir", "dd" },
    { "dir", "dl" },
    { "dir", "dt" },
    { "dir", "form" },
    { "dir", "ul" },
    { "dl", "form" },
    { "dl", "li" },
    { "dt", "dd" },
    { "dt", "dl" },
    { "font", "center" },
    { "font", "td" },
    { "font", "th" },
    { "form", "form" },
    { "h1", "fieldset" },
    { "h1", "form" },
    { "h1", "li" },
    { "h1", "p" },
    { "h1", "table" },
    { "h2", "fieldset" },
    { "h2", "form" },
    { "h2", "li" },
    { "h2", "p" },
    { "h2", "table" },
    { "h3", "fieldset" },
    { "h3", "form" },
    { "h3", "li" },
    { "h3", "p" },
    { "h3", "table" },
    { "h4", "fieldset" },
    { "h4", "form" },
    { "h4", "li" },
    { "h4", "p" },
    { "h4", "table" },
    { "h5", "fieldset" },
    { "h5", "form" },
    { "h5", "li" },
    { "h5", "p" },
    { "h5", "table" },
    { "h6", "fieldset" },
    { "h6", "form" },
    { "h6", "li" },
    { "h6", "p" },
    { "h6", "table" },
    { "head", "a" },
    { "head", "abbr" },
    { "head", "acronym" },
    { "head", "address" },
    { "head", "b" },
    { "head", "bdo" },
    { "head", "big" },
    { "head", "blockquote" },
    { "head", "body" },
    { "head", "br" },
    { "head", "center" },
    { "head", "cite" },
    { "head", "code" },
    { "head", "dd" },
    { "head", "dfn" },
    { "head", "dir" },
    { "head", "div" },
    { "head", "dl" },
    { "head", "dt" },
    { "head", "em" },
    { "head", "fieldset" },
    { "head", "font" },
    { "head", "form" },
    { "head", "frameset" },
    { "head", "h1" },
    { "head", "h2" },
    { "head", "h3" },
    { "head", "h4" },
    { "head", "h5" },
    { "head", "h6" },
    { "head", "hr" },
    { "head", "i" },
    { "head", "iframe" },
    { "head", "img" },
    { "head", "kbd" },
    { "head", "li" },
    { "head", "listing" },
    { "head", "map" },
    { "head", "menu" },
    { "head", "ol" },
    { "head", "p" },
    { "head", "pre" },
    { "head", "q" },
    { "head", "s" },
    { "head", "samp" },
    { "head", "small" },
    { "head", "span" },
    { "head", "strike" },
    { "head", "strong" },
    { "head", "sub" },
    { "head", "sup" },
    { "head", "table" },
    { "head", "tt" },
    { "head", "u" },
    { "head", "ul" },
    { "head", "var" },
    { "head", "xmp" },
    { "hr", "form" },
    { "i", "center" },
    { "i", "p" },
    { "i", "td" },
    { "i", "th" },
    { "legend", "fieldset" },
    { "li", "li" },
    { "link", "body" },
    { "link", "frameset" },
    { "listing", "dd" },
    { "listing", "dl" },
    { "listing", "dt" },
    { "listing", "fieldset" },
    { "listing", "form" },
    { "listing", "li" },
    { "listing", "table" },
    { "listing", "ul" },
    { "menu", "dd" },
    { "menu", "dl" },
    { "menu", "dt" },
    { "menu", "form" },
    { "menu", "ul" },
    { "ol", "form" },
    { "option", "optgroup" },
    { "option", "option" },
    { "p", "address" },
    { "p", "blockquote" },
    { "p", "body" },
    { "p", "caption" },
    { "p", "center" },
    { "p", "col" },
    { "p", "colgroup" },
    { "p", "dd" },
    { "p", "dir" },
    { "p", "div" },
    { "p", "dl" },
    { "p", "dt" },
    { "p", "fieldset" },
    { "p", "form" },
    { "p", "frameset" },
    { "p", "h1" },
    { "p", "h2" },
    { "p", "h3" },
    { "p", "h4" },
    { "p", "h5" },
    { "p", "h6" },
    { "p", "head" },
    { "p", "hr" },
    { "p", "li" },
    { "p", "listing" },
    { "p", "menu" },
    { "p", "ol" },
    { "p", "p" },
    { "p", "pre" },
    { "p", "table" },
    { "p", "tbody" },
    { "p", "td" },
    { "p", "tfoot" },
    { "p", "th" },
    { "p", "title" },
    { "p", "tr" },
    { "p", "ul" },
    { "p", "xmp" },
    { "pre", "dd" },
    { "pre", "dl" },
    { "pre", "dt" },
    { "pre", "fieldset" },
    { "pre", "form" },
    { "pre", "li" },
    { "pre", "table" },
    { "pre", "ul" },
    { "s", "p" },
    { "script", "noscript" },
    { "small", "p" },
    { "span", "td" },
    { "span", "th" },
    { "strike", "p" },
    { "style", "body" },
    { "style", "frameset" },
    { "tbody", "tbody" },
    { "tbody", "tfoot" },
    { "td", "tbody" },
    { "td", "td" },
    { "td", "tfoot" },
    { "td", "th" },
    { "td", "tr" },
    { "tfoot", "tbody" },
    { "th", "tbody" },
    { "th", "td" },
    { "th", "tfoot" },
    { "th", "th" },
    { "th", "tr" },
    { "thead", "tbody" },
    { "thead", "tfoot" },
    { "title", "body" },
    { "title", "frameset" },
    { "tr", "tbody" },
    { "tr", "tfoot" },
    { "tr", "tr" },
    { "tt", "p" },
    { "u", "p" },
    { "u", "td" },
    { "u", "th" },
    { "ul", "address" },
    { "ul", "form" },
    { "ul", "menu" },
    { "ul", "pre" },
    { "xmp", "dd" },
    { "xmp", "dl" },
    { "xmp", "dt" },
    { "xmp", "fieldset" },
    { "xmp", "form" },
    { "xmp", "li" },
    { "xmp", "table" },
    { "xmp", "ul" }
};
1166
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied
 *
 * The list is terminated by a NULL entry.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html",
    "head",
    NULL
};
1179
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * Unlike htmlNoContentElements, this list has no NULL terminator, so
 * it has to be walked using the array size.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",
    "onchange",
    "onselect"
};
1205
/*
 * This table is used by the HTML parser to know what to do with
 * broken HTML pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra end tags.
 * End tags are only allowed to close elements with lower or equal
 * priority.
 */

/* An element name together with its end tag priority. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * The last row has a NULL name and carries the default priority.
 * NOTE(review): presumably the lookup stops at the NULL name and uses
 * that row for every element not listed above - confirm against
 * htmlGetEndPriority().
 */
static const elementPriority htmlEndPriority[] = {
    {"div",   150},
    {"td",    160},
    {"th",    160},
    {"tr",    170},
    {"thead", 180},
    {"tbody", 180},
    {"tfoot", 180},
    {"table", 190},
    {"head",  200},
    {"body",  200},
    {"html",  220},
    {NULL,    100} /* Default priority */
};
1233
1234 /************************************************************************
1235 * *
1236 * functions to handle HTML specific data *
1237 * *
1238 ************************************************************************/
1239
1240 static void
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt)1241 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
1242 /*
1243 * Capture end position and add node
1244 */
1245 if ( ctxt->node != NULL && ctxt->record_info ) {
1246 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
1247 (CUR_PTR - ctxt->input->base);
1248 ctxt->nodeInfo->end_line = ctxt->input->line;
1249 ctxt->nodeInfo->node = ctxt->node;
1250 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
1251 htmlNodeInfoPop(ctxt);
1252 }
1253 }
1254
/**
 * htmlInitAutoClose:
 *
 * DEPRECATED: This is a no-op.
 */
void
htmlInitAutoClose(void)
{
    /* Intentionally empty. */
}
1263
1264 static int
htmlCompareTags(const void * key,const void * member)1265 htmlCompareTags(const void *key, const void *member) {
1266 const xmlChar *tag = (const xmlChar *) key;
1267 const htmlElemDesc *desc = (const htmlElemDesc *) member;
1268
1269 return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1270 }
1271
1272 /**
1273 * htmlTagLookup:
1274 * @tag: The tag name in lowercase
1275 *
1276 * Lookup the HTML tag in the ElementTable
1277 *
1278 * Returns the related htmlElemDescPtr or NULL if not found.
1279 */
1280 const htmlElemDesc *
htmlTagLookup(const xmlChar * tag)1281 htmlTagLookup(const xmlChar *tag) {
1282 if (tag == NULL)
1283 return(NULL);
1284
1285 return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1286 sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1287 sizeof(htmlElemDesc), htmlCompareTags));
1288 }
1289
1290 /**
1291 * htmlGetEndPriority:
1292 * @name: The name of the element to look up the priority for.
1293 *
1294 * Return value: The "endtag" priority.
1295 **/
1296 static int
htmlGetEndPriority(const xmlChar * name)1297 htmlGetEndPriority (const xmlChar *name) {
1298 int i = 0;
1299
1300 while ((htmlEndPriority[i].name != NULL) &&
1301 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1302 i++;
1303
1304 return(htmlEndPriority[i].priority);
1305 }
1306
1307
1308 static int
htmlCompareStartClose(const void * vkey,const void * member)1309 htmlCompareStartClose(const void *vkey, const void *member) {
1310 const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1311 const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1312 int ret;
1313
1314 ret = strcmp(key->oldTag, entry->oldTag);
1315 if (ret == 0)
1316 ret = strcmp(key->newTag, entry->newTag);
1317
1318 return(ret);
1319 }
1320
1321 /**
1322 * htmlCheckAutoClose:
1323 * @newtag: The new tag name
1324 * @oldtag: The old tag name
1325 *
1326 * Checks whether the new tag is one of the registered valid tags for
1327 * closing old.
1328 *
1329 * Returns 0 if no, 1 if yes.
1330 */
1331 static int
htmlCheckAutoClose(const xmlChar * newtag,const xmlChar * oldtag)1332 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1333 {
1334 htmlStartCloseEntry key;
1335 void *res;
1336
1337 key.oldTag = (const char *) oldtag;
1338 key.newTag = (const char *) newtag;
1339 res = bsearch(&key, htmlStartClose,
1340 sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1341 sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1342 return(res != NULL);
1343 }
1344
1345 /**
1346 * htmlAutoCloseOnClose:
1347 * @ctxt: an HTML parser context
1348 * @newtag: The new tag name
1349 * @force: force the tag closure
1350 *
1351 * The HTML DTD allows an ending tag to implicitly close other tags.
1352 */
1353 static void
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1354 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1355 {
1356 const htmlElemDesc *info;
1357 int i, priority;
1358
1359 if (ctxt->options & HTML_PARSE_HTML5)
1360 return;
1361
1362 priority = htmlGetEndPriority(newtag);
1363
1364 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1365
1366 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1367 break;
1368 /*
1369 * A misplaced endtag can only close elements with lower
1370 * or equal priority, so if we find an element with higher
1371 * priority before we find an element with
1372 * matching name, we just ignore this endtag
1373 */
1374 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1375 return;
1376 }
1377 if (i < 0)
1378 return;
1379
1380 while (!xmlStrEqual(newtag, ctxt->name)) {
1381 info = htmlTagLookup(ctxt->name);
1382 if ((info != NULL) && (info->endTag == 3)) {
1383 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1384 "Opening and ending tag mismatch: %s and %s\n",
1385 newtag, ctxt->name);
1386 }
1387 htmlParserFinishElementParsing(ctxt);
1388 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1389 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1390 htmlnamePop(ctxt);
1391 }
1392 }
1393
1394 /**
1395 * htmlAutoCloseOnEnd:
1396 * @ctxt: an HTML parser context
1397 *
1398 * Close all remaining tags at the end of the stream
1399 */
1400 static void
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)1401 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1402 {
1403 int i;
1404
1405 if (ctxt->options & HTML_PARSE_HTML5)
1406 return;
1407
1408 if (ctxt->nameNr == 0)
1409 return;
1410 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1411 htmlParserFinishElementParsing(ctxt);
1412 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1413 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1414 htmlnamePop(ctxt);
1415 }
1416 }
1417
1418 /**
1419 * htmlAutoClose:
1420 * @ctxt: an HTML parser context
1421 * @newtag: The new tag name or NULL
1422 *
1423 * The HTML DTD allows a tag to implicitly close other tags.
1424 * The list is kept in htmlStartClose array. This function is
1425 * called when a new tag has been detected and generates the
1426 * appropriates closes if possible/needed.
1427 * If newtag is NULL this mean we are at the end of the resource
1428 * and we should check
1429 */
1430 static void
htmlAutoClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1431 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1432 {
1433 if (ctxt->options & HTML_PARSE_HTML5)
1434 return;
1435
1436 if (newtag == NULL)
1437 return;
1438
1439 while ((ctxt->name != NULL) &&
1440 (htmlCheckAutoClose(newtag, ctxt->name))) {
1441 htmlParserFinishElementParsing(ctxt);
1442 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1443 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1444 htmlnamePop(ctxt);
1445 }
1446 }
1447
1448 /**
1449 * htmlAutoCloseTag:
1450 * @doc: the HTML document
1451 * @name: The tag name
1452 * @elem: the HTML element
1453 *
1454 * DEPRECATED: Internal function, don't use.
1455 *
1456 * The HTML DTD allows a tag to implicitly close other tags.
1457 * The list is kept in htmlStartClose array. This function checks
1458 * if the element or one of it's children would autoclose the
1459 * given tag.
1460 *
1461 * Returns 1 if autoclose, 0 otherwise
1462 */
1463 int
htmlAutoCloseTag(htmlDocPtr doc,const xmlChar * name,htmlNodePtr elem)1464 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1465 htmlNodePtr child;
1466
1467 if (elem == NULL) return(1);
1468 if (xmlStrEqual(name, elem->name)) return(0);
1469 if (htmlCheckAutoClose(elem->name, name)) return(1);
1470 child = elem->children;
1471 while (child != NULL) {
1472 if (htmlAutoCloseTag(doc, name, child)) return(1);
1473 child = child->next;
1474 }
1475 return(0);
1476 }
1477
1478 /**
1479 * htmlIsAutoClosed:
1480 * @doc: the HTML document
1481 * @elem: the HTML element
1482 *
1483 * DEPRECATED: Internal function, don't use.
1484 *
1485 * The HTML DTD allows a tag to implicitly close other tags.
1486 * The list is kept in htmlStartClose array. This function checks
1487 * if a tag is autoclosed by one of it's child
1488 *
1489 * Returns 1 if autoclosed, 0 otherwise
1490 */
1491 int
htmlIsAutoClosed(htmlDocPtr doc,htmlNodePtr elem)1492 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1493 htmlNodePtr child;
1494
1495 if (elem == NULL) return(1);
1496 child = elem->children;
1497 while (child != NULL) {
1498 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1499 child = child->next;
1500 }
1501 return(0);
1502 }
1503
1504 /**
1505 * htmlCheckImplied:
1506 * @ctxt: an HTML parser context
1507 * @newtag: The new tag name
1508 *
1509 * The HTML DTD allows a tag to exists only implicitly
1510 * called when a new tag has been detected and generates the
1511 * appropriates implicit tags if missing
1512 */
1513 static void
htmlCheckImplied(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1514 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1515 int i;
1516
1517 if (ctxt->options & (HTML_PARSE_NOIMPLIED | HTML_PARSE_HTML5))
1518 return;
1519 if (!htmlOmittedDefaultValue)
1520 return;
1521 if (xmlStrEqual(newtag, BAD_CAST"html"))
1522 return;
1523 if (ctxt->nameNr <= 0) {
1524 htmlnamePush(ctxt, BAD_CAST"html");
1525 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1526 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1527 }
1528 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1529 return;
1530 if ((ctxt->nameNr <= 1) &&
1531 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1532 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1533 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1534 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1535 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1536 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1537 if (ctxt->html >= 3) {
1538 /* we already saw or generated an <head> before */
1539 return;
1540 }
1541 /*
1542 * dropped OBJECT ... i you put it first BODY will be
1543 * assumed !
1544 */
1545 htmlnamePush(ctxt, BAD_CAST"head");
1546 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1547 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1548 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1549 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1550 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1551 if (ctxt->html >= 10) {
1552 /* we already saw or generated a <body> before */
1553 return;
1554 }
1555 for (i = 0;i < ctxt->nameNr;i++) {
1556 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1557 return;
1558 }
1559 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1560 return;
1561 }
1562 }
1563
1564 htmlnamePush(ctxt, BAD_CAST"body");
1565 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1566 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1567 }
1568 }
1569
1570 /**
1571 * htmlCheckParagraph
1572 * @ctxt: an HTML parser context
1573 *
1574 * Check whether a p element need to be implied before inserting
1575 * characters in the current element.
1576 *
1577 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1578 * in case of error.
1579 */
1580
1581 static int
htmlCheckParagraph(htmlParserCtxtPtr ctxt)1582 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1583 const xmlChar *tag;
1584 int i;
1585
1586 if (ctxt == NULL)
1587 return(-1);
1588 if (ctxt->options & HTML_PARSE_HTML5)
1589 return(0);
1590
1591 tag = ctxt->name;
1592 if (tag == NULL) {
1593 htmlAutoClose(ctxt, BAD_CAST"p");
1594 htmlCheckImplied(ctxt, BAD_CAST"p");
1595 htmlnamePush(ctxt, BAD_CAST"p");
1596 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1597 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1598 return(1);
1599 }
1600 if (!htmlOmittedDefaultValue)
1601 return(0);
1602 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1603 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1604 htmlAutoClose(ctxt, BAD_CAST"p");
1605 htmlCheckImplied(ctxt, BAD_CAST"p");
1606 htmlnamePush(ctxt, BAD_CAST"p");
1607 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1608 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1609 return(1);
1610 }
1611 }
1612 return(0);
1613 }
1614
1615 /**
1616 * htmlIsScriptAttribute:
1617 * @name: an attribute name
1618 *
1619 * Check if an attribute is of content type Script
1620 *
1621 * Returns 1 is the attribute is a script 0 otherwise
1622 */
1623 int
htmlIsScriptAttribute(const xmlChar * name)1624 htmlIsScriptAttribute(const xmlChar *name) {
1625 unsigned int i;
1626
1627 if (name == NULL)
1628 return(0);
1629 /*
1630 * all script attributes start with 'on'
1631 */
1632 if ((name[0] != 'o') || (name[1] != 'n'))
1633 return(0);
1634 for (i = 0;
1635 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1636 i++) {
1637 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1638 return(1);
1639 }
1640 return(0);
1641 }
1642
1643 /************************************************************************
1644 * *
1645 * The list of HTML predefined entities *
1646 * *
1647 ************************************************************************/
1648
1649
/*
 * Table of the predefined HTML entities. Each entry holds the numeric
 * value, the entity name and a descriptive string.
 *
 * Entries must stay sorted by ascending value: htmlEntityValueLookup()
 * runs bsearch() over this array with a comparator keyed on the value
 * field. htmlEntityLookup() scans the array linearly by name.
 */
1650 static const htmlEntityDesc html40EntitiesTable[] = {
1651 /*
1652 * the 4 absolute ones, plus apostrophe.
1653 */
1654 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1655 { 38, "amp", "ampersand, U+0026 ISOnum" },
1656 { 39, "apos", "single quote" },
1657 { 60, "lt", "less-than sign, U+003C ISOnum" },
1658 { 62, "gt", "greater-than sign, U+003E ISOnum" },
1659
1660 /*
1661 * A bunch still in the 128-255 range
1662 * Replacing them depend really on the charset used.
1663 */
1664 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1665 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1666 { 162, "cent", "cent sign, U+00A2 ISOnum" },
1667 { 163, "pound","pound sign, U+00A3 ISOnum" },
1668 { 164, "curren","currency sign, U+00A4 ISOnum" },
1669 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1670 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1671 { 167, "sect", "section sign, U+00A7 ISOnum" },
1672 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1673 { 169, "copy", "copyright sign, U+00A9 ISOnum" },
1674 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1675 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1676 { 172, "not", "not sign, U+00AC ISOnum" },
1677 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1678 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1679 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1680 { 176, "deg", "degree sign, U+00B0 ISOnum" },
1681 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1682 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1683 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1684 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1685 { 181, "micro","micro sign, U+00B5 ISOnum" },
1686 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1687 { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1688 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1689 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1690 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1691 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1692 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1693 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1694 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1695 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1696 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1697 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1698 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1699 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1700 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1701 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1702 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1703 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1704 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1705 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1706 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1707 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1708 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1709 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1710 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1711 { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1712 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1713 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1714 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1715 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1716 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1717 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1718 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1719 { 215, "times","multiplication sign, U+00D7 ISOnum" },
1720 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1721 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1722 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1723 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1724 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1725 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1726 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1727 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1728 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1729 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1730 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1731 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1732 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1733 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1734 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1735 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1736 { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1737 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1738 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1739 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1740 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1741 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1742 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1743 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1744 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1745 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1746 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1747 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1748 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1749 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1750 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1751 { 247, "divide","division sign, U+00F7 ISOnum" },
1752 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1753 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1754 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1755 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1756 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1757 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1758 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1759 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1760
1761 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1762 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1763 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1764 { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1765 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1766
1767 /*
1768 * Anything below should really be kept as entity references
1769 */
1770 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1771
1772 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1773 { 732, "tilde","small tilde, U+02DC ISOdia" },
1774
1775 { 913, "Alpha","greek capital letter alpha, U+0391" },
1776 { 914, "Beta", "greek capital letter beta, U+0392" },
1777 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1778 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1779 { 917, "Epsilon","greek capital letter epsilon, U+0395" },
1780 { 918, "Zeta", "greek capital letter zeta, U+0396" },
1781 { 919, "Eta", "greek capital letter eta, U+0397" },
1782 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1783 { 921, "Iota", "greek capital letter iota, U+0399" },
1784 { 922, "Kappa","greek capital letter kappa, U+039A" },
1785 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1786 { 924, "Mu", "greek capital letter mu, U+039C" },
1787 { 925, "Nu", "greek capital letter nu, U+039D" },
1788 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1789 { 927, "Omicron","greek capital letter omicron, U+039F" },
1790 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1791 { 929, "Rho", "greek capital letter rho, U+03A1" },
1792 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1793 { 932, "Tau", "greek capital letter tau, U+03A4" },
1794 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1795 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1796 { 935, "Chi", "greek capital letter chi, U+03A7" },
1797 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1798 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1799
1800 { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1801 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1802 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1803 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1804 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1805 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1806 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1807 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1808 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1809 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1810 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1811 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1812 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1813 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1814 { 959, "omicron","greek small letter omicron, U+03BF NEW" },
1815 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1816 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1817 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1818 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1819 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1820 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1821 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1822 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1823 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1824 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1825 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1826 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1827 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1828
1829 { 8194, "ensp", "en space, U+2002 ISOpub" },
1830 { 8195, "emsp", "em space, U+2003 ISOpub" },
1831 { 8201, "thinsp","thin space, U+2009 ISOpub" },
1832 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1833 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1834 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1835 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1836 { 8211, "ndash","en dash, U+2013 ISOpub" },
1837 { 8212, "mdash","em dash, U+2014 ISOpub" },
1838 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1839 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1840 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1841 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1842 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1843 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1844 { 8224, "dagger","dagger, U+2020 ISOpub" },
1845 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1846
1847 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1848 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1849
1850 { 8240, "permil","per mille sign, U+2030 ISOtech" },
1851
1852 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1853 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1854
1855 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1856 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1857
1858 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
1859 { 8260, "frasl","fraction slash, U+2044 NEW" },
1860
1861 { 8364, "euro", "euro sign, U+20AC NEW" },
1862
1863 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1864 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1865 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1866 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
1867 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1868 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1869 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1870 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1871 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1872 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1873 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1874 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1875 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1876 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1877 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1878 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1879
1880 { 8704, "forall","for all, U+2200 ISOtech" },
1881 { 8706, "part", "partial differential, U+2202 ISOtech" },
1882 { 8707, "exist","there exists, U+2203 ISOtech" },
1883 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1884 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1885 { 8712, "isin", "element of, U+2208 ISOtech" },
1886 { 8713, "notin","not an element of, U+2209 ISOtech" },
1887 { 8715, "ni", "contains as member, U+220B ISOtech" },
1888 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1889 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1890 { 8722, "minus","minus sign, U+2212 ISOtech" },
1891 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1892 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
1893 { 8733, "prop", "proportional to, U+221D ISOtech" },
1894 { 8734, "infin","infinity, U+221E ISOtech" },
1895 { 8736, "ang", "angle, U+2220 ISOamso" },
1896 { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1897 { 8744, "or", "logical or = vee, U+2228 ISOtech" },
1898 { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1899 { 8746, "cup", "union = cup, U+222A ISOtech" },
1900 { 8747, "int", "integral, U+222B ISOtech" },
1901 { 8756, "there4","therefore, U+2234 ISOtech" },
1902 { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1903 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1904 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1905 { 8800, "ne", "not equal to, U+2260 ISOtech" },
1906 { 8801, "equiv","identical to, U+2261 ISOtech" },
1907 { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1908 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1909 { 8834, "sub", "subset of, U+2282 ISOtech" },
1910 { 8835, "sup", "superset of, U+2283 ISOtech" },
1911 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1912 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1913 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1914 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1915 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1916 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1917 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1918 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1919 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1920 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1921 { 8971, "rfloor","right floor, U+230B ISOamsc" },
1922 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1923 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1924 { 9674, "loz", "lozenge, U+25CA ISOpub" },
1925
1926 { 9824, "spades","black spade suit, U+2660 ISOpub" },
1927 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1928 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1929 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
1930
1931 };
1932
1933 /************************************************************************
1934 * *
1935 * Commodity functions to handle entities *
1936 * *
1937 ************************************************************************/
1938
1939 /**
1940 * htmlEntityLookup:
1941 * @name: the entity name
1942 *
1943 * Lookup the given entity in EntitiesTable
1944 *
1945 * TODO: the linear scan is really ugly, an hash table is really needed.
1946 *
1947 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1948 */
1949 const htmlEntityDesc *
htmlEntityLookup(const xmlChar * name)1950 htmlEntityLookup(const xmlChar *name) {
1951 unsigned int i;
1952
1953 for (i = 0;i < (sizeof(html40EntitiesTable)/
1954 sizeof(html40EntitiesTable[0]));i++) {
1955 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1956 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1957 }
1958 }
1959 return(NULL);
1960 }
1961
1962 static int
htmlCompareEntityDesc(const void * vkey,const void * vdesc)1963 htmlCompareEntityDesc(const void *vkey, const void *vdesc) {
1964 const unsigned *key = vkey;
1965 const htmlEntityDesc *desc = vdesc;
1966
1967 return((int) *key - (int) desc->value);
1968 }
1969
1970 /**
1971 * htmlEntityValueLookup:
1972 * @value: the entity's unicode value
1973 *
1974 * Lookup the given entity in EntitiesTable
1975 *
1976 * TODO: the linear scan is really ugly, an hash table is really needed.
1977 *
1978 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1979 */
1980 const htmlEntityDesc *
htmlEntityValueLookup(unsigned int value)1981 htmlEntityValueLookup(unsigned int value) {
1982 const htmlEntityDesc *desc;
1983 size_t nmemb;
1984
1985 nmemb = sizeof(html40EntitiesTable) / sizeof(html40EntitiesTable[0]);
1986 desc = bsearch(&value, html40EntitiesTable, nmemb, sizeof(htmlEntityDesc),
1987 htmlCompareEntityDesc);
1988
1989 return(desc);
1990 }
1991
1992 /**
1993 * UTF8ToHtml:
1994 * @out: a pointer to an array of bytes to store the result
1995 * @outlen: the length of @out
1996 * @in: a pointer to an array of UTF-8 chars
1997 * @inlen: the length of @in
1998 *
1999 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2000 * plus HTML entities block of chars out.
2001 *
2002 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2003 * The value of @inlen after return is the number of octets consumed
2004 * as the return value is positive, else unpredictable.
2005 * The value of @outlen after return is the number of octets consumed.
2006 */
2007 int
UTF8ToHtml(unsigned char * out,int * outlen,const unsigned char * in,int * inlen)2008 UTF8ToHtml(unsigned char* out, int *outlen,
2009 const unsigned char* in, int *inlen) {
2010 const unsigned char* instart = in;
2011 const unsigned char* inend;
2012 unsigned char* outstart = out;
2013 unsigned char* outend;
2014 int ret = XML_ENC_ERR_SPACE;
2015
2016 if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
2017 return(XML_ENC_ERR_INTERNAL);
2018
2019 if (in == NULL) {
2020 /*
2021 * initialization nothing to do
2022 */
2023 *outlen = 0;
2024 *inlen = 0;
2025 return(XML_ENC_ERR_SUCCESS);
2026 }
2027
2028 inend = in + *inlen;
2029 outend = out + *outlen;
2030 while (in < inend) {
2031 const htmlEntityDesc *ent;
2032 const char *cp;
2033 char nbuf[16];
2034 unsigned c, d;
2035 int seqlen, len, i;
2036
2037 d = *in;
2038
2039 if (d < 0x80) {
2040 if (out >= outend)
2041 goto done;
2042 *out++ = d;
2043 in += 1;
2044 continue;
2045 }
2046
2047 if (d < 0xE0) { c = d & 0x1F; seqlen = 2; }
2048 else if (d < 0xF0) { c = d & 0x0F; seqlen = 3; }
2049 else { c = d & 0x07; seqlen = 4; }
2050
2051 if (inend - in < seqlen)
2052 break;
2053
2054 for (i = 1; i < seqlen; i++) {
2055 d = in[i];
2056 c <<= 6;
2057 c |= d & 0x3F;
2058 }
2059
2060 /*
2061 * Try to lookup a predefined HTML entity for it
2062 */
2063 ent = htmlEntityValueLookup(c);
2064
2065 if (ent == NULL) {
2066 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2067 cp = nbuf;
2068 } else {
2069 cp = ent->name;
2070 }
2071
2072 len = strlen(cp);
2073 if (outend - out < len + 2)
2074 goto done;
2075
2076 *out++ = '&';
2077 memcpy(out, cp, len);
2078 out += len;
2079 *out++ = ';';
2080
2081 in += seqlen;
2082 }
2083
2084 ret = out - outstart;
2085
2086 done:
2087 *outlen = out - outstart;
2088 *inlen = in - instart;
2089 return(ret);
2090 }
2091
2092 /**
2093 * htmlEncodeEntities:
2094 * @out: a pointer to an array of bytes to store the result
2095 * @outlen: the length of @out
2096 * @in: a pointer to an array of UTF-8 chars
2097 * @inlen: the length of @in
2098 * @quoteChar: the quote character to escape (' or ") or zero.
2099 *
2100 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2101 * plus HTML entities block of chars out.
2102 *
2103 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2104 * The value of @inlen after return is the number of octets consumed
2105 * as the return value is positive, else unpredictable.
2106 * The value of @outlen after return is the number of octets consumed.
2107 */
2108 int
htmlEncodeEntities(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,int quoteChar)2109 htmlEncodeEntities(unsigned char* out, int *outlen,
2110 const unsigned char* in, int *inlen, int quoteChar) {
2111 const unsigned char* processed = in;
2112 const unsigned char* outend;
2113 const unsigned char* outstart = out;
2114 const unsigned char* instart = in;
2115 const unsigned char* inend;
2116 unsigned int c, d;
2117 int trailing;
2118
2119 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2120 return(-1);
2121 outend = out + (*outlen);
2122 inend = in + (*inlen);
2123 while (in < inend) {
2124 d = *in++;
2125 if (d < 0x80) { c= d; trailing= 0; }
2126 else if (d < 0xC0) {
2127 /* trailing byte in leading position */
2128 *outlen = out - outstart;
2129 *inlen = processed - instart;
2130 return(-2);
2131 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2132 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2133 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2134 else {
2135 /* no chance for this in Ascii */
2136 *outlen = out - outstart;
2137 *inlen = processed - instart;
2138 return(-2);
2139 }
2140
2141 if (inend - in < trailing)
2142 break;
2143
2144 while (trailing--) {
2145 if (((d= *in++) & 0xC0) != 0x80) {
2146 *outlen = out - outstart;
2147 *inlen = processed - instart;
2148 return(-2);
2149 }
2150 c <<= 6;
2151 c |= d & 0x3F;
2152 }
2153
2154 /* assertion: c is a single UTF-4 value */
2155 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2156 (c != '&') && (c != '<') && (c != '>')) {
2157 if (out >= outend)
2158 break;
2159 *out++ = c;
2160 } else {
2161 const htmlEntityDesc * ent;
2162 const char *cp;
2163 char nbuf[16];
2164 int len;
2165
2166 /*
2167 * Try to lookup a predefined HTML entity for it
2168 */
2169 ent = htmlEntityValueLookup(c);
2170 if (ent == NULL) {
2171 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2172 cp = nbuf;
2173 }
2174 else
2175 cp = ent->name;
2176 len = strlen(cp);
2177 if (outend - out < len + 2)
2178 break;
2179 *out++ = '&';
2180 memcpy(out, cp, len);
2181 out += len;
2182 *out++ = ';';
2183 }
2184 processed = in;
2185 }
2186 *outlen = out - outstart;
2187 *inlen = processed - instart;
2188 return(0);
2189 }
2190
2191 /************************************************************************
2192 * *
2193 * Commodity functions, cleanup needed ? *
2194 * *
2195 ************************************************************************/
/*
 * Names of all elements allowing PCDATA in the HTML 4.01 loose DTD.
 * areBlanks() scans this list to decide whether whitespace next to such
 * an element has to be kept.
 *
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 *
 * Both the strings and the pointer array are const: the table is never
 * modified at run time.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
2210
2211 /**
2212 * areBlanks:
2213 * @ctxt: an HTML parser context
2214 * @str: a xmlChar *
2215 * @len: the size of @str
2216 *
2217 * Is this a sequence of blank chars that one can ignore ?
2218 *
2219 * Returns 1 if ignorable 0 if whitespace, -1 otherwise.
2220 */
2221
areBlanks(htmlParserCtxtPtr ctxt,const xmlChar * str,int len)2222 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2223 unsigned int i;
2224 int j;
2225 xmlNodePtr lastChild;
2226 xmlDtdPtr dtd;
2227
2228 for (j = 0;j < len;j++)
2229 if (!(IS_WS_HTML(str[j]))) return(-1);
2230
2231 if (CUR == 0) return(1);
2232 if (CUR != '<') return(0);
2233 if (ctxt->name == NULL)
2234 return(1);
2235 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2236 return(1);
2237 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2238 return(1);
2239
2240 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2241 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2242 dtd = xmlGetIntSubset(ctxt->myDoc);
2243 if (dtd != NULL && dtd->ExternalID != NULL) {
2244 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2245 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2246 return(1);
2247 }
2248 }
2249
2250 if (ctxt->node == NULL) return(0);
2251 lastChild = xmlGetLastChild(ctxt->node);
2252 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2253 lastChild = lastChild->prev;
2254 if (lastChild == NULL) {
2255 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2256 (ctxt->node->content != NULL)) return(0);
2257 /* keep ws in constructs like ...<b> </b>...
2258 for all tags "b" allowing PCDATA */
2259 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2260 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2261 return(0);
2262 }
2263 }
2264 } else if (xmlNodeIsText(lastChild)) {
2265 return(0);
2266 } else {
2267 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2268 for all tags "p" allowing PCDATA */
2269 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2270 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2271 return(0);
2272 }
2273 }
2274 }
2275 return(1);
2276 }
2277
2278 /**
2279 * htmlNewDocNoDtD:
2280 * @URI: URI for the dtd, or NULL
2281 * @ExternalID: the external ID of the DTD, or NULL
2282 *
2283 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2284 * are NULL
2285 *
2286 * Returns a new document, do not initialize the DTD if not provided
2287 */
2288 htmlDocPtr
htmlNewDocNoDtD(const xmlChar * URI,const xmlChar * ExternalID)2289 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2290 xmlDocPtr cur;
2291
2292 /*
2293 * Allocate a new document and fill the fields.
2294 */
2295 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2296 if (cur == NULL)
2297 return(NULL);
2298 memset(cur, 0, sizeof(xmlDoc));
2299
2300 cur->type = XML_HTML_DOCUMENT_NODE;
2301 cur->version = NULL;
2302 cur->intSubset = NULL;
2303 cur->doc = cur;
2304 cur->name = NULL;
2305 cur->children = NULL;
2306 cur->extSubset = NULL;
2307 cur->oldNs = NULL;
2308 cur->encoding = NULL;
2309 cur->standalone = 1;
2310 cur->compression = 0;
2311 cur->ids = NULL;
2312 cur->refs = NULL;
2313 cur->_private = NULL;
2314 cur->charset = XML_CHAR_ENCODING_UTF8;
2315 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2316 if ((ExternalID != NULL) ||
2317 (URI != NULL)) {
2318 xmlDtdPtr intSubset;
2319
2320 intSubset = xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2321 if (intSubset == NULL) {
2322 xmlFree(cur);
2323 return(NULL);
2324 }
2325 }
2326 if ((xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2327 xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2328 return(cur);
2329 }
2330
2331 /**
2332 * htmlNewDoc:
2333 * @URI: URI for the dtd, or NULL
2334 * @ExternalID: the external ID of the DTD, or NULL
2335 *
2336 * Creates a new HTML document
2337 *
2338 * Returns a new document
2339 */
2340 htmlDocPtr
htmlNewDoc(const xmlChar * URI,const xmlChar * ExternalID)2341 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2342 if ((URI == NULL) && (ExternalID == NULL))
2343 return(htmlNewDocNoDtD(
2344 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2345 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2346
2347 return(htmlNewDocNoDtD(URI, ExternalID));
2348 }
2349
2350
2351 /************************************************************************
2352 * *
2353 * The parser itself *
2354 * Relates to http://www.w3.org/TR/html40 *
2355 * *
2356 ************************************************************************/
2357
2358 /************************************************************************
2359 * *
2360 * The parser itself *
2361 * *
2362 ************************************************************************/
2363
/**
 * htmlParseHTMLName:
 * @ctxt:  an HTML parser context
 * @attr:  non-zero when parsing an attribute name, in which case '='
 *         also ends the name; zero when parsing a tag name
 *
 * parse an HTML tag or attribute name, note that we convert it to lowercase
 * since HTML names are not case-sensitive.
 *
 * The name ends at '/', '>', the stop character, HTML whitespace or the
 * end of input; the first character is always accepted. NUL bytes and
 * bytes rejected by htmlValidateUtf8 are stored as U+FFFD. At most
 * HTML_PARSER_BUFFER_SIZE bytes are kept: anything beyond is consumed
 * from the input but dropped from the result.
 *
 * Returns the name interned in the parser dictionary. The name field of
 * the result is NULL if the dictionary lookup failed, in which case a
 * memory error has been raised.
 */

static xmlHashedString
htmlParseHTMLName(htmlParserCtxtPtr ctxt, int attr) {
    xmlHashedString ret;
    xmlChar buf[HTML_PARSER_BUFFER_SIZE];
    const xmlChar *in;
    size_t avail;
    /* NOTE(review): presumably set in push mode so no refill is tried */
    int eof = PARSER_PROGRESSIVE(ctxt);
    int nbchar = 0;
    /* For tag names the stop character is a space, already whitespace */
    int stop = attr ? '=' : ' ';

    in = ctxt->input->cur;
    avail = ctxt->input->end - in;

    while (1) {
        int c, size;

        /* Refill when running low; no growth means end of input */
        if ((!eof) && (avail < 32)) {
            size_t oldAvail = avail;

            ctxt->input->cur = in;

            SHRINK;
            xmlParserGrow(ctxt);

            /* The buffer may have moved: reload the cursor */
            in = ctxt->input->cur;
            avail = ctxt->input->end - in;

            if (oldAvail == avail)
                eof = 1;
        }

        if (avail == 0)
            break;

        c = *in;
        size = 1;

        /* Terminators only apply once one character was collected */
        if ((nbchar != 0) &&
            ((c == '/') || (c == '>') || (c == stop) ||
             (IS_WS_HTML(c))))
            break;

        if (c == 0) {
            /* NUL becomes U+FFFD (EF BF BD in UTF-8) */
            if (nbchar + 3 <= HTML_PARSER_BUFFER_SIZE) {
                buf[nbchar++] = 0xEF;
                buf[nbchar++] = 0xBF;
                buf[nbchar++] = 0xBD;
            }
        } else if (c < 0x80) {
            if (nbchar < HTML_PARSER_BUFFER_SIZE) {
                /* ASCII lowercasing */
                if (IS_UPPER(c))
                    c += 0x20;
                buf[nbchar++] = c;
            }
        } else {
            size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0);

            if (size > 0) {
                /* Copy the whole sequence or nothing at all */
                if (nbchar + size <= HTML_PARSER_BUFFER_SIZE) {
                    memcpy(buf + nbchar, in, size);
                    nbchar += size;
                }
            } else {
                /* Rejected byte: skip it and store U+FFFD instead */
                size = 1;

                if (nbchar + 3 <= HTML_PARSER_BUFFER_SIZE) {
                    buf[nbchar++] = 0xEF;
                    buf[nbchar++] = 0xBF;
                    buf[nbchar++] = 0xBD;
                }
            }
        }

        in += size;
        avail -= size;
    }

    /* Commit the consumed bytes */
    ctxt->input->cur = in;

    SHRINK;

    ret = xmlDictLookupHashed(ctxt->dict, buf, nbchar);
    if (ret.name == NULL)
        htmlErrMemory(ctxt);

    return(ret);
}
2461
/*
 * Replacement code points for the values 0x80 to 0x9F, indexed by
 * (value - 0x80). Used by htmlCodePointToUtf8.
 */
static const short htmlC1Remap[32] = {
    /* 0x80 */ 0x20AC, 0x0081, 0x201A, 0x0192,
    /* 0x84 */ 0x201E, 0x2026, 0x2020, 0x2021,
    /* 0x88 */ 0x02C6, 0x2030, 0x0160, 0x2039,
    /* 0x8C */ 0x0152, 0x008D, 0x017D, 0x008F,
    /* 0x90 */ 0x0090, 0x2018, 0x2019, 0x201C,
    /* 0x94 */ 0x201D, 0x2022, 0x2013, 0x2014,
    /* 0x98 */ 0x02DC, 0x2122, 0x0161, 0x203A,
    /* 0x9C */ 0x0153, 0x009D, 0x017E, 0x0178
};
2468
2469 static const xmlChar *
htmlCodePointToUtf8(int c,xmlChar * out,int * osize)2470 htmlCodePointToUtf8(int c, xmlChar *out, int *osize) {
2471 int i = 0;
2472 int bits, hi;
2473
2474 if ((c >= 0x80) && (c < 0xA0)) {
2475 c = htmlC1Remap[c - 0x80];
2476 } else if ((c <= 0) ||
2477 ((c >= 0xD800) && (c < 0xE000)) ||
2478 (c > 0x10FFFF)) {
2479 c = 0xFFFD;
2480 }
2481
2482 if (c < 0x80) { bits = 0; hi = 0x00; }
2483 else if (c < 0x800) { bits = 6; hi = 0xC0; }
2484 else if (c < 0x10000) { bits = 12; hi = 0xE0; }
2485 else { bits = 18; hi = 0xF0; }
2486
2487 out[i++] = (c >> bits) | hi;
2488
2489 while (bits > 0) {
2490 bits -= 6;
2491 out[i++] = ((c >> bits) & 0x3F) | 0x80;
2492 }
2493
2494 *osize = i;
2495 return(out);
2496 }
2497
2498 #include "html5ent.inc"
2499
2500 #define ENT_F_SEMICOLON 0x80u
2501 #define ENT_F_SUBTABLE 0x40u
2502 #define ENT_F_ALL 0xC0u
2503
/**
 * htmlFindEntityPrefix:
 * @string:  the input following the '&'
 * @slen:  number of bytes available at @string
 * @isAttr:  non-zero when the reference appears in an attribute value
 * @nlen:  set to the number of bytes of @string covered by the match,
 *         including a terminating ';' if present
 * @rlen:  set to the length in bytes of the replacement
 *
 * Find the named character reference matching the start of @string,
 * using the tables from html5ent.inc.
 *
 * htmlEntAlpha maps the first letter to a range of htmlEntValues (three
 * bytes per letter: a 16-bit little-endian start index and a count).
 * Each htmlEntValues item is an offset into htmlEntStrings where a
 * record starts: one byte holding flags (ENT_F_ALL bits) and a length,
 * the name bytes, two bytes locating a subtable (relative start, count)
 * if ENT_F_SUBTABLE is set, then a replacement length byte followed by
 * the replacement itself.
 *
 * A name is accepted when followed by ';'. For records flagged
 * ENT_F_SEMICOLON it is also accepted without ';', except that in
 * attribute values it must then not be followed by an alphanumeric
 * character or '='.
 *
 * Returns a pointer to the replacement bytes, or NULL if nothing
 * matched, in which case @nlen and @rlen are left untouched.
 */
static const xmlChar *
htmlFindEntityPrefix(const xmlChar *string, size_t slen, int isAttr,
                     int *nlen, int *rlen) {
    const xmlChar *match = NULL;
    unsigned left, right;
    /* NOTE(review): read before the length check below; presumably safe
       because the input buffer is zero-terminated - confirm */
    int first = string[0];
    size_t matchLen = 0;
    /* Offset in @string of the next byte to compare */
    size_t soff = 1;

    if (slen < 2)
        return(NULL);
    if (!IS_ASCII_LETTER(first))
        return(NULL);

    /*
     * Look up range by first character
     */
    first &= 63;
    left = htmlEntAlpha[first*3] | htmlEntAlpha[first*3+1] << 8;
    right = left + htmlEntAlpha[first*3+2];

    /*
     * Binary search
     */
    while (left < right) {
        const xmlChar *bytes;
        unsigned mid;
        size_t len;
        int cmp;

        mid = left + (right - left) / 2;
        bytes = htmlEntStrings + htmlEntValues[mid];
        /* Low bits of the first byte hold the name length */
        len = bytes[0] & ~ENT_F_ALL;

        /* Compare the first byte, the remainder only if it is equal */
        cmp = string[soff] - bytes[1];

        if (cmp == 0) {
            if (slen < len) {
                cmp = strncmp((const char *) string + soff + 1,
                              (const char *) bytes + 2,
                              slen - 1);
                /* Prefix can never match */
                if (cmp == 0)
                    break;
            } else {
                cmp = strncmp((const char *) string + soff + 1,
                              (const char *) bytes + 2,
                              len - 1);
            }
        }

        if (cmp < 0) {
            right = mid;
        } else if (cmp > 0) {
            left = mid + 1;
        } else {
            /* Byte following the matched name, 0 at the end of input */
            int term = soff + len < slen ? string[soff + len] : 0;
            int isAlnum, isTerm;

            isAlnum = IS_ALNUM(term);
            isTerm = ((term == ';') ||
                      ((bytes[0] & ENT_F_SEMICOLON) &&
                       ((!isAttr) ||
                        ((!isAlnum) && (term != '=')))));

            if (isTerm) {
                /* Remember this match; a longer one may replace it */
                match = bytes + len + 1;
                matchLen = soff + len;
                if (term == ';')
                    matchLen += 1;
            }

            if (bytes[0] & ENT_F_SUBTABLE) {
                /* Skip the two subtable bytes before the replacement */
                if (isTerm)
                    match += 2;

                /* Longer names sharing this prefix are in the subtable */
                if ((isAlnum) && (soff + len < slen)) {
                    left = mid + bytes[len + 1];
                    right = left + bytes[len + 2];
                    soff += len;
                    continue;
                }
            }

            break;
        }
    }

    if (match == NULL)
        return(NULL);

    *nlen = matchLen;
    /* match points at the replacement length byte */
    *rlen = match[0];
    return(match + 1);
}
2599
2600 /**
2601 * htmlParseData:
2602 * @ctxt: an HTML parser context
2603 * @mask: mask of terminating characters
2604 * @comment: true if parsing a comment
2605 * @refs: true if references are allowed
2606 * @maxLength: maximum output length
2607 *
2608 * Parse data until terminator is reached.
2609 *
2610 * Returns the parsed string or NULL in case of errors.
2611 */
2612
2613 static xmlChar *
htmlParseData(htmlParserCtxtPtr ctxt,htmlAsciiMask mask,int comment,int refs,int maxLength)2614 htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
2615 int comment, int refs, int maxLength) {
2616 xmlParserInputPtr input = ctxt->input;
2617 xmlChar *ret = NULL;
2618 xmlChar *buffer;
2619 xmlChar utf8Char[4];
2620 size_t buffer_size;
2621 size_t used;
2622 int eof = PARSER_PROGRESSIVE(ctxt);
2623 int line, col;
2624 int termSkip = -1;
2625
2626 used = 0;
2627 buffer_size = ctxt->spaceMax;
2628 buffer = (xmlChar *) ctxt->spaceTab;
2629 if (buffer == NULL) {
2630 buffer_size = 500;
2631 buffer = xmlMalloc(buffer_size + 1);
2632 if (buffer == NULL) {
2633 htmlErrMemory(ctxt);
2634 return(NULL);
2635 }
2636 }
2637
2638 line = input->line;
2639 col = input->col;
2640
2641 while (!PARSER_STOPPED(ctxt)) {
2642 const xmlChar *chunk, *in, *repl;
2643 size_t avail, chunkSize, extraSize;
2644 int replSize;
2645 int skip = 0;
2646 int ncr = 0;
2647 int ncrSize = 0;
2648 int cp = 0;
2649
2650 chunk = input->cur;
2651 avail = input->end - chunk;
2652 in = chunk;
2653
2654 repl = BAD_CAST "";
2655 replSize = 0;
2656
2657 while (!PARSER_STOPPED(ctxt)) {
2658 size_t j;
2659 int cur, size;
2660
2661 if ((!eof) && (avail <= 64)) {
2662 size_t oldAvail = avail;
2663 size_t off = in - chunk;
2664
2665 input->cur = in;
2666
2667 xmlParserGrow(ctxt);
2668
2669 in = input->cur;
2670 chunk = in - off;
2671 input->cur = chunk;
2672 avail = input->end - in;
2673
2674 if (oldAvail == avail)
2675 eof = 1;
2676 }
2677
2678 if (avail == 0) {
2679 termSkip = 0;
2680 break;
2681 }
2682
2683 cur = *in;
2684 size = 1;
2685 col += 1;
2686
2687 if (htmlMaskMatch(mask, cur)) {
2688 if (comment) {
2689 if (avail < 2) {
2690 termSkip = 1;
2691 } else if (in[1] == '-') {
2692 if (avail < 3) {
2693 termSkip = 2;
2694 } else if (in[2] == '>') {
2695 termSkip = 3;
2696 } else if (in[2] == '!') {
2697 if (avail < 4)
2698 termSkip = 3;
2699 else if (in[3] == '>')
2700 termSkip = 4;
2701 }
2702 }
2703
2704 if (termSkip >= 0)
2705 break;
2706 } else {
2707 termSkip = 0;
2708 break;
2709 }
2710 }
2711
2712 if (ncr) {
2713 int lc = cur | 0x20;
2714 int digit;
2715
2716 if ((cur >= '0') && (cur <= '9')) {
2717 digit = cur - '0';
2718 } else if ((ncr == 16) && (lc >= 'a') && (lc <= 'f')) {
2719 digit = (lc - 'a') + 10;
2720 } else {
2721 if (cur == ';') {
2722 in += 1;
2723 size += 1;
2724 ncrSize += 1;
2725 }
2726 goto next_chunk;
2727 }
2728
2729 cp = cp * ncr + digit;
2730 if (cp >= 0x110000)
2731 cp = 0x110000;
2732
2733 ncrSize += 1;
2734
2735 goto next_char;
2736 }
2737
2738 switch (cur) {
2739 case '&':
2740 if (!refs)
2741 break;
2742
2743 j = 1;
2744
2745 if ((j < avail) && (in[j] == '#')) {
2746 j += 1;
2747 if (j < avail) {
2748 if ((in[j] | 0x20) == 'x') {
2749 j += 1;
2750 if ((j < avail) && (IS_HEX_DIGIT(in[j]))) {
2751 ncr = 16;
2752 size = 3;
2753 ncrSize = 3;
2754 cp = 0;
2755 }
2756 } else if (IS_ASCII_DIGIT(in[j])) {
2757 ncr = 10;
2758 size = 2;
2759 ncrSize = 2;
2760 cp = 0;
2761 }
2762 }
2763 } else {
2764 repl = htmlFindEntityPrefix(in + j,
2765 avail - j,
2766 /* isAttr */ 1,
2767 &skip, &replSize);
2768 if (repl != NULL) {
2769 skip += 1;
2770 goto next_chunk;
2771 }
2772
2773 skip = 0;
2774 }
2775
2776 break;
2777
2778 case '\0':
2779 skip = 1;
2780 repl = BAD_CAST "\xEF\xBF\xBD";
2781 replSize = 3;
2782 goto next_chunk;
2783
2784 case '\n':
2785 line += 1;
2786 col = 1;
2787 break;
2788
2789 case '\r':
2790 skip = 1;
2791 if (in[1] != 0x0A) {
2792 repl = BAD_CAST "\x0A";
2793 replSize = 1;
2794 }
2795 goto next_chunk;
2796
2797 default:
2798 if (cur < 0x80)
2799 break;
2800
2801 if ((input->flags & XML_INPUT_HAS_ENCODING) == 0) {
2802 xmlChar * guess;
2803
2804 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
2805 guess = NULL;
2806 #else
2807 guess = htmlFindEncoding(ctxt);
2808 #endif
2809 if (guess == NULL) {
2810 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
2811 } else {
2812 xmlSwitchEncodingName(ctxt, (const char *) guess);
2813 xmlFree(guess);
2814 }
2815 input->flags |= XML_INPUT_HAS_ENCODING;
2816
2817 goto restart;
2818 }
2819
2820 size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0);
2821
2822 if (size <= 0) {
2823 skip = 1;
2824 repl = BAD_CAST "\xEF\xBF\xBD";
2825 replSize = 3;
2826 goto next_chunk;
2827 }
2828
2829 break;
2830 }
2831
2832 next_char:
2833 in += size;
2834 avail -= size;
2835 }
2836
2837 next_chunk:
2838 if (ncrSize > 0) {
2839 skip = ncrSize;
2840 in -= ncrSize;
2841
2842 repl = htmlCodePointToUtf8(cp, utf8Char, &replSize);
2843 }
2844
2845 chunkSize = in - chunk;
2846 extraSize = chunkSize + replSize;
2847
2848 if (extraSize > maxLength - used) {
2849 htmlParseErr(ctxt, XML_ERR_RESOURCE_LIMIT,
2850 "value too long\n", NULL, NULL);
2851 goto error;
2852 }
2853
2854 if (extraSize > buffer_size - used) {
2855 size_t newSize = (used + extraSize) * 2;
2856 xmlChar *tmp = xmlRealloc(buffer, newSize + 1);
2857
2858 if (tmp == NULL) {
2859 htmlErrMemory(ctxt);
2860 goto error;
2861 }
2862 buffer = tmp;
2863 buffer_size = newSize;
2864 }
2865
2866 if (chunkSize > 0) {
2867 input->cur += chunkSize;
2868 memcpy(buffer + used, chunk, chunkSize);
2869 used += chunkSize;
2870 }
2871
2872 input->cur += skip;
2873 if (replSize > 0) {
2874 memcpy(buffer + used, repl, replSize);
2875 used += replSize;
2876 }
2877
2878 SHRINK;
2879
2880 if (termSkip >= 0)
2881 break;
2882
2883 restart:
2884 ;
2885 }
2886
2887 if (termSkip > 0) {
2888 input->cur += termSkip;
2889 col += termSkip;
2890 }
2891
2892 input->line = line;
2893 input->col = col;
2894
2895 ret = xmlMalloc(used + 1);
2896 if (ret == NULL) {
2897 htmlErrMemory(ctxt);
2898 } else {
2899 memcpy(ret, buffer, used);
2900 ret[used] = 0;
2901 }
2902
2903 error:
2904 ctxt->spaceTab = (void *) buffer;
2905 ctxt->spaceMax = buffer_size;
2906
2907 return(ret);
2908 }
2909
2910 /**
2911 * htmlParseEntityRef:
2912 * @ctxt: an HTML parser context
2913 * @str: location to store the entity name
2914 *
2915 * DEPRECATED: Internal function, don't use.
2916 *
2917 * Returns NULL.
2918 */
2919 const htmlEntityDesc *
htmlParseEntityRef(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED,const xmlChar ** str ATTRIBUTE_UNUSED)2920 htmlParseEntityRef(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED,
2921 const xmlChar **str ATTRIBUTE_UNUSED) {
2922 return(NULL);
2923 }
2924
2925 /**
2926 * htmlParseAttValue:
2927 * @ctxt: an HTML parser context
2928 *
2929 * parse a value for an attribute
2930 * Note: the parser won't do substitution of entities here, this
2931 * will be handled later in xmlStringGetNodeList, unless it was
2932 * asked for ctxt->replaceEntities != 0
2933 *
2934 * Returns the AttValue parsed or NULL.
2935 */
2936
2937 static xmlChar *
htmlParseAttValue(htmlParserCtxtPtr ctxt)2938 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2939 xmlChar *ret = NULL;
2940 int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
2941 XML_MAX_HUGE_LENGTH :
2942 XML_MAX_TEXT_LENGTH;
2943
2944 if (CUR == '"') {
2945 SKIP(1);
2946 ret = htmlParseData(ctxt, MASK_DQ, 0, 1, maxLength);
2947 if (CUR == '"')
2948 SKIP(1);
2949 } else if (CUR == '\'') {
2950 SKIP(1);
2951 ret = htmlParseData(ctxt, MASK_SQ, 0, 1, maxLength);
2952 if (CUR == '\'')
2953 SKIP(1);
2954 } else {
2955 ret = htmlParseData(ctxt, MASK_WS_GT, 0, 1, maxLength);
2956 }
2957 return(ret);
2958 }
2959
2960 static void
htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt,const xmlChar * buf,int size,int mode)2961 htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt, const xmlChar *buf,
2962 int size, int mode) {
2963 if ((ctxt->sax == NULL) || (ctxt->disableSAX))
2964 return;
2965
2966 if ((mode == 0) || (mode == DATA_RCDATA) ||
2967 (ctxt->sax->cdataBlock == NULL)) {
2968 int blank = areBlanks(ctxt, buf, size);
2969
2970 if ((mode == 0) && (blank > 0) && (!ctxt->keepBlanks)) {
2971 if (ctxt->sax->ignorableWhitespace != NULL)
2972 ctxt->sax->ignorableWhitespace(ctxt->userData,
2973 buf, size);
2974 } else {
2975 if ((mode == 0) && (blank < 0))
2976 htmlCheckParagraph(ctxt);
2977
2978 if (ctxt->sax->characters != NULL)
2979 ctxt->sax->characters(ctxt->userData, buf, size);
2980 }
2981 } else {
2982 /*
2983 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2984 */
2985 ctxt->sax->cdataBlock(ctxt->userData, buf, size);
2986 }
2987 }
2988
/**
 * htmlParseCharData:
 * @ctxt:  an HTML parser context
 * @partial: true if the input buffer is incomplete
 *
 * Parse character data and references.
 *
 * The data mode is read from ctxt->endCheckState: 0 for normal
 * character data, otherwise one of DATA_RCDATA, DATA_PLAINTEXT,
 * DATA_SCRIPT, DATA_SCRIPT_ESC1 or DATA_SCRIPT_ESC2. In mode 0 the data
 * ends at the first '<'. In DATA_PLAINTEXT '<' never ends the data. In
 * the other modes it ends at an end tag whose name matches ctxt->name.
 * Character references are decoded only in mode 0 and DATA_RCDATA.
 *
 * The input is delivered to the SAX handler in chunks through
 * htmlCharDataSAXCallback. A chunk ends at anything that must be
 * skipped or replaced: character references, NUL bytes and byte
 * sequences rejected by htmlValidateUtf8 (both replaced by U+FFFD), and
 * CR (a lone CR becomes LF, the CR of a CR LF pair is dropped).
 *
 * When @partial is set, parsing stops before any construct that cannot
 * be decided with the bytes at hand, so that it can be parsed again once
 * more data is available.
 *
 * The mode is stored back into ctxt->endCheckState, or reset to 0 if
 * the end of the data was found.
 *
 * Returns 1 if the end of the data was found ('<' in mode 0, a matching
 * end tag otherwise), 0 if parsing stopped for any other reason.
 */

static int
htmlParseCharData(htmlParserCtxtPtr ctxt, int partial) {
    xmlParserInputPtr input = ctxt->input;
    xmlChar utf8Char[4];
    /* Set when the terminating '<' or end tag was seen */
    int complete = 0;
    /* Set when the outer loop must stop after the current chunk */
    int done = 0;
    int mode;
    int eof = PARSER_PROGRESSIVE(ctxt);
    int line, col;

    mode = ctxt->endCheckState;

    line = input->line;
    col = input->col;

    /* One iteration per chunk */
    while (!PARSER_STOPPED(ctxt)) {
        const xmlChar *chunk, *in, *repl;
        size_t avail;
        int replSize;
        /* Input bytes to drop after the chunk */
        int skip = 0;
        /* Radix (10 or 16) while inside a numeric reference, else 0 */
        int ncr = 0;
        /* Bytes of the numeric reference scanned so far */
        int ncrSize = 0;
        int cp = 0;

        chunk = input->cur;
        avail = input->end - chunk;
        in = chunk;

        repl = BAD_CAST "";
        replSize = 0;

        /* One iteration per character needing attention */
        while (!PARSER_STOPPED(ctxt)) {
            size_t j;
            int cur, size;

            if (avail <= 64) {
                /* Refill when running low; no growth means end of input */
                if (!eof) {
                    size_t oldAvail = avail;
                    size_t off = in - chunk;

                    input->cur = in;

                    xmlParserGrow(ctxt);

                    /* The buffer may have moved: rebase both pointers */
                    in = input->cur;
                    chunk = in - off;
                    input->cur = chunk;
                    avail = input->end - in;

                    if (oldAvail == avail)
                        eof = 1;
                }

                if (avail == 0) {
                    /*
                     * An unfinished numeric reference is left in the
                     * input when more data may follow.
                     */
                    if ((partial) && (ncr)) {
                        in -= ncrSize;
                        ncrSize = 0;
                    }

                    done = 1;
                    break;
                }
            }

            /* Accelerator */
            if (!ncr) {
                while (avail > 0) {
                    /*
                     * Bitmap of the bytes needing special handling:
                     * NUL, LF, CR, '&', '-', '<' and everything from
                     * 0x80 up. All other bytes are skipped right here.
                     */
                    static const unsigned mask[8] = {
                        0x00002401, 0x10002040,
                        0x00000000, 0x00000000,
                        0xFFFFFFFF, 0xFFFFFFFF,
                        0xFFFFFFFF, 0xFFFFFFFF
                    };
                    cur = *in;
                    if ((1u << (cur & 0x1F)) & mask[cur >> 5])
                        break;
                    col += 1;
                    in += 1;
                    avail -= 1;
                }

                /* Back to the refill and end of input checks */
                if ((!eof) && (avail <= 64))
                    continue;
                if (avail == 0)
                    continue;
            }

            cur = *in;
            size = 1;
            col += 1;

            if (ncr) {
                int lc = cur | 0x20;
                int digit;

                if ((cur >= '0') && (cur <= '9')) {
                    digit = cur - '0';
                } else if ((ncr == 16) && (lc >= 'a') && (lc <= 'f')) {
                    digit = (lc - 'a') + 10;
                } else {
                    /* End of the reference, a ';' belongs to it */
                    if (cur == ';') {
                        in += 1;
                        size += 1;
                        ncrSize += 1;
                    }
                    goto next_chunk;
                }

                /* Saturate so that long digit runs cannot overflow */
                cp = cp * ncr + digit;
                if (cp >= 0x110000)
                    cp = 0x110000;

                ncrSize += 1;

                goto next_char;
            }

            switch (cur) {
            case '<':
                /* In normal data every '<' ends the text */
                if (mode == 0) {
                    done = 1;
                    complete = 1;
                    goto next_chunk;
                }
                if (mode == DATA_PLAINTEXT)
                    break;

                j = 1;
                if (j < avail) {
                    if ((mode == DATA_SCRIPT) && (in[j] == '!')) {
                        /* Check for comment start */

                        j += 1;
                        if ((j < avail) && (in[j] == '-')) {
                            j += 1;
                            if ((j < avail) && (in[j] == '-'))
                                mode = DATA_SCRIPT_ESC1;
                        }
                    } else {
                        int i = 0;
                        int solidus = 0;

                        /* Check for tag */

                        if (in[j] == '/') {
                            j += 1;
                            solidus = 1;
                        }

                        /*
                         * End tags are checked in every mode, start
                         * tags only in DATA_SCRIPT_ESC1.
                         */
                        if ((solidus) || (mode == DATA_SCRIPT_ESC1)) {
                            /* Case-insensitive match against ctxt->name */
                            while ((j < avail) &&
                                   (ctxt->name[i] != 0) &&
                                   (ctxt->name[i] == (in[j] | 0x20))) {
                                i += 1;
                                j += 1;
                            }

                            if ((ctxt->name[i] == 0) && (j < avail)) {
                                int c = in[j];

                                /* The name must be properly delimited */
                                if ((c == '>') || (c == '/') ||
                                    (IS_WS_HTML(c))) {
                                    if ((mode == DATA_SCRIPT_ESC1) &&
                                        (!solidus)) {
                                        mode = DATA_SCRIPT_ESC2;
                                    } else if (mode == DATA_SCRIPT_ESC2) {
                                        mode = DATA_SCRIPT_ESC1;
                                    } else {
                                        /* Matching end tag: data ends */
                                        complete = 1;
                                        done = 1;
                                        goto next_chunk;
                                    }
                                }
                            }
                        }
                    }
                }

                /* Ran out of input while deciding: wait for more */
                if ((partial) && (j >= avail)) {
                    done = 1;
                    goto next_chunk;
                }

                break;

            case '-':
                if ((mode != DATA_SCRIPT_ESC1) && (mode != DATA_SCRIPT_ESC2))
                    break;

                /* Check for comment end */

                j = 1;
                if ((j < avail) && (in[j] == '-')) {
                    j += 1;
                    if ((j < avail) && (in[j] == '>'))
                        mode = DATA_SCRIPT;
                }

                /* Ran out of input while deciding: wait for more */
                if ((partial) && (j >= avail)) {
                    done = 1;
                    goto next_chunk;
                }

                break;

            case '&':
                /* References are only decoded in these two modes */
                if ((mode != 0) && (mode != DATA_RCDATA))
                    break;

                j = 1;

                if ((j < avail) && (in[j] == '#')) {
                    /* "&#x" + hex digit or "&#" + digit starts an NCR */
                    j += 1;
                    if (j < avail) {
                        if ((in[j] | 0x20) == 'x') {
                            j += 1;
                            if ((j < avail) && (IS_HEX_DIGIT(in[j]))) {
                                ncr = 16;
                                size = 3;
                                ncrSize = 3;
                                cp = 0;
                            }
                        } else if (IS_ASCII_DIGIT(in[j])) {
                            ncr = 10;
                            size = 2;
                            ncrSize = 2;
                            cp = 0;
                        }
                    }
                } else {
                    if (partial) {
                        int terminated = 0;
                        size_t i;

                        /*
                         * &CounterClockwiseContourIntegral; has 33 bytes.
                         *
                         * The name counts as terminated once a
                         * non-letter is found or 32 bytes were
                         * scanned; otherwise wait for more input.
                         */
                        for (i = 1; i < avail; i++) {
                            if ((i >= 32) || !IS_ASCII_LETTER(in[i])) {
                                terminated = 1;
                                break;
                            }
                        }

                        if (!terminated) {
                            done = 1;
                            goto next_chunk;
                        }
                    }

                    repl = htmlFindEntityPrefix(in + j,
                                                avail - j,
                                                /* isAttr */ 0,
                                                &skip, &replSize);
                    if (repl != NULL) {
                        /* Account for the '&' as well */
                        skip += 1;
                        goto next_chunk;
                    }

                    skip = 0;
                }

                /* Ran out of input while deciding: wait for more */
                if ((partial) && (j >= avail)) {
                    done = 1;
                    goto next_chunk;
                }

                break;

            case '\0':
                skip = 1;
                repl = BAD_CAST "\xEF\xBF\xBD";
                replSize = 3;
                goto next_chunk;

            case '\n':
                line += 1;
                col = 1;
                break;

            case '\r':
                /* The byte after the CR is needed to handle CR LF */
                if (partial && avail < 2) {
                    done = 1;
                    goto next_chunk;
                }

                skip = 1;
                if (in[1] != 0x0A) {
                    repl = BAD_CAST "\x0A";
                    replSize = 1;
                }
                goto next_chunk;

            default:
                if (cur < 0x80)
                    break;

                if ((input->flags & XML_INPUT_HAS_ENCODING) == 0) {
                    xmlChar * guess;

                    /*
                     * Flush what was scanned so far; the encoding is
                     * switched when this byte starts the next chunk.
                     */
                    if (in > chunk)
                        goto next_chunk;

#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
                    guess = NULL;
#else
                    guess = htmlFindEncoding(ctxt);
#endif
                    if (guess == NULL) {
                        xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
                    } else {
                        xmlSwitchEncodingName(ctxt, (const char *) guess);
                        xmlFree(guess);
                    }
                    input->flags |= XML_INPUT_HAS_ENCODING;

                    goto restart;
                }

                size = htmlValidateUtf8(ctxt, in, avail, partial);

                /* NOTE(review): 0 presumably means an incomplete sequence */
                if ((partial) && (size == 0)) {
                    done = 1;
                    goto next_chunk;
                }

                if (size <= 0) {
                    skip = 1;
                    repl = BAD_CAST "\xEF\xBF\xBD";
                    replSize = 3;
                    goto next_chunk;
                }

                break;
            }

next_char:
            in += size;
            avail -= size;
        }

next_chunk:
        if (ncrSize > 0) {
            /* Exclude the reference from the chunk and replace it */
            skip = ncrSize;
            in -= ncrSize;

            repl = htmlCodePointToUtf8(cp, utf8Char, &replSize);
        }

        /* Deliver the verbatim part, then the replacement if any */
        if (in > chunk) {
            input->cur += in - chunk;
            htmlCharDataSAXCallback(ctxt, chunk, in - chunk, mode);
        }

        input->cur += skip;
        if (replSize > 0)
            htmlCharDataSAXCallback(ctxt, repl, replSize, mode);

        SHRINK;

        if (done)
            break;

restart:
        ;
    }

    input->line = line;
    input->col = col;

    /* Keep the mode for the next call unless the data is finished */
    if (complete)
        ctxt->endCheckState = 0;
    else
        ctxt->endCheckState = mode;

    return(complete);
}
3376
3377 /**
3378 * htmlParseComment:
3379 * @ctxt: an HTML parser context
3380 * @bogus: true if this is a bogus comment
3381 *
3382 * Parse an HTML comment
3383 */
3384 static void
htmlParseComment(htmlParserCtxtPtr ctxt,int bogus)3385 htmlParseComment(htmlParserCtxtPtr ctxt, int bogus) {
3386 const xmlChar *comment = BAD_CAST "";
3387 xmlChar *buf = NULL;
3388 int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
3389 XML_MAX_HUGE_LENGTH :
3390 XML_MAX_TEXT_LENGTH;
3391
3392 if (bogus) {
3393 buf = htmlParseData(ctxt, MASK_GT, 0, 0, maxLength);
3394 if (CUR == '>')
3395 SKIP(1);
3396 comment = buf;
3397 } else {
3398 if (CUR == '>') {
3399 SKIP(1);
3400 } else if ((CUR == '-') && (NXT(1) == '>')) {
3401 SKIP(2);
3402 } else {
3403 buf = htmlParseData(ctxt, MASK_DASH, 1, 0, maxLength);
3404 comment = buf;
3405 }
3406 }
3407
3408 if (comment == NULL)
3409 return;
3410
3411 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3412 (!ctxt->disableSAX))
3413 ctxt->sax->comment(ctxt->userData, comment);
3414
3415 xmlFree(buf);
3416 }
3417
3418 /**
3419 * htmlParseCharRef:
3420 * @ctxt: an HTML parser context
3421 *
3422 * DEPRECATED: Internal function, don't use.
3423 *
3424 * Returns 0
3425 */
3426 int
htmlParseCharRef(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED)3427 htmlParseCharRef(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED) {
3428 return(0);
3429 }
3430
3431
3432 /**
3433 * htmlParseDoctypeLiteral:
3434 * @ctxt: an HTML parser context
3435 *
3436 * Parse a DOCTYPE SYTSTEM or PUBLIC literal.
3437 *
3438 * Returns the literal or NULL in case of error.
3439 */
3440
3441 static xmlChar *
htmlParseDoctypeLiteral(htmlParserCtxtPtr ctxt)3442 htmlParseDoctypeLiteral(htmlParserCtxtPtr ctxt) {
3443 xmlChar *ret;
3444 int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
3445 XML_MAX_TEXT_LENGTH :
3446 XML_MAX_NAME_LENGTH;
3447
3448 if (CUR == '"') {
3449 SKIP(1);
3450 ret = htmlParseData(ctxt, MASK_DQ_GT, 0, 0, maxLength);
3451 if (CUR == '"')
3452 SKIP(1);
3453 } else if (CUR == '\'') {
3454 SKIP(1);
3455 ret = htmlParseData(ctxt, MASK_SQ_GT, 0, 0, maxLength);
3456 if (CUR == '\'')
3457 SKIP(1);
3458 } else {
3459 return(NULL);
3460 }
3461
3462 return(ret);
3463 }
3464
3465 static void
htmlSkipBogusDoctype(htmlParserCtxtPtr ctxt)3466 htmlSkipBogusDoctype(htmlParserCtxtPtr ctxt) {
3467 const xmlChar *in;
3468 size_t avail;
3469 int eof = PARSER_PROGRESSIVE(ctxt);
3470 int line, col;
3471
3472 line = ctxt->input->line;
3473 col = ctxt->input->col;
3474
3475 in = ctxt->input->cur;
3476 avail = ctxt->input->end - in;
3477
3478 while (!PARSER_STOPPED(ctxt)) {
3479 int cur;
3480
3481 if ((!eof) && (avail <= 64)) {
3482 size_t oldAvail = avail;
3483
3484 ctxt->input->cur = in;
3485
3486 xmlParserGrow(ctxt);
3487
3488 in = ctxt->input->cur;
3489 avail = ctxt->input->end - in;
3490
3491 if (oldAvail == avail)
3492 eof = 1;
3493 }
3494
3495 if (avail == 0)
3496 break;
3497
3498 col += 1;
3499
3500 cur = *in;
3501 if (cur == '>') {
3502 in += 1;
3503 break;
3504 } else if (cur == 0x0A) {
3505 line += 1;
3506 col = 1;
3507 }
3508
3509 in += 1;
3510 avail -= 1;
3511
3512 SHRINK;
3513 }
3514
3515 ctxt->input->cur = in;
3516 ctxt->input->line = line;
3517 ctxt->input->col = col;
3518 }
3519
3520 /**
3521 * htmlParseDocTypeDecl:
3522 * @ctxt: an HTML parser context
3523 *
3524 * Parse a DOCTYPE declaration.
3525 */
3526
3527 static void
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt)3528 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3529 xmlChar *name = NULL;
3530 xmlChar *publicId = NULL;
3531 xmlChar *URI = NULL;
3532 int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
3533 XML_MAX_TEXT_LENGTH :
3534 XML_MAX_NAME_LENGTH;
3535
3536 /*
3537 * We know that '<!DOCTYPE' has been detected.
3538 */
3539 SKIP(9);
3540
3541 SKIP_BLANKS;
3542
3543 if ((ctxt->input->cur < ctxt->input->end) && (CUR != '>')) {
3544 name = htmlParseData(ctxt, MASK_WS_GT, 0, 0, maxLength);
3545
3546 if ((ctxt->options & HTML_PARSE_HTML5) && (name != NULL)) {
3547 xmlChar *cur;
3548
3549 for (cur = name; *cur; cur++) {
3550 if (IS_UPPER(*cur))
3551 *cur += 0x20;
3552 }
3553 }
3554
3555 SKIP_BLANKS;
3556 }
3557
3558 /*
3559 * Check for SystemID and publicId
3560 */
3561 if ((UPPER == 'P') && (UPP(1) == 'U') &&
3562 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3563 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3564 SKIP(6);
3565 SKIP_BLANKS;
3566 publicId = htmlParseDoctypeLiteral(ctxt);
3567 if (publicId == NULL)
3568 goto bogus;
3569 SKIP_BLANKS;
3570 URI = htmlParseDoctypeLiteral(ctxt);
3571 } else if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3572 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3573 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3574 SKIP(6);
3575 SKIP_BLANKS;
3576 URI = htmlParseDoctypeLiteral(ctxt);
3577 }
3578
3579 bogus:
3580 htmlSkipBogusDoctype(ctxt);
3581
3582 /*
3583 * Create or update the document accordingly to the DOCTYPE
3584 */
3585 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3586 (!ctxt->disableSAX))
3587 ctxt->sax->internalSubset(ctxt->userData, name, publicId, URI);
3588
3589 xmlFree(name);
3590 xmlFree(URI);
3591 xmlFree(publicId);
3592 }
3593
3594 /**
3595 * htmlParseAttribute:
3596 * @ctxt: an HTML parser context
3597 * @value: a xmlChar ** used to store the value of the attribute
3598 *
3599 * parse an attribute
3600 *
3601 * [41] Attribute ::= Name Eq AttValue
3602 *
3603 * [25] Eq ::= S? '=' S?
3604 *
3605 * With namespace:
3606 *
3607 * [NS 11] Attribute ::= QName Eq AttValue
3608 *
3609 * Also the case QName == xmlns:??? is handled independently as a namespace
3610 * definition.
3611 *
3612 * Returns the attribute name, and the value in *value.
3613 */
3614
3615 static xmlHashedString
htmlParseAttribute(htmlParserCtxtPtr ctxt,xmlChar ** value)3616 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3617 xmlHashedString hname;
3618 xmlChar *val = NULL;
3619
3620 *value = NULL;
3621 hname = htmlParseHTMLName(ctxt, 1);
3622 if (hname.name == NULL)
3623 return(hname);
3624
3625 /*
3626 * read the value
3627 */
3628 SKIP_BLANKS;
3629 if (CUR == '=') {
3630 SKIP(1);
3631 SKIP_BLANKS;
3632 val = htmlParseAttValue(ctxt);
3633 }
3634
3635 *value = val;
3636 return(hname);
3637 }
3638
3639 /**
3640 * htmlCheckEncoding:
3641 * @ctxt: an HTML parser context
3642 * @attvalue: the attribute value
3643 *
3644 * Checks an http-equiv attribute from a Meta tag to detect
3645 * the encoding
3646 * If a new encoding is detected the parser is switched to decode
3647 * it and pass UTF8
3648 */
3649 static void
htmlCheckEncoding(htmlParserCtxtPtr ctxt,const xmlChar * attvalue)3650 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3651 const xmlChar *encoding;
3652 xmlChar *copy;
3653
3654 if (!attvalue)
3655 return;
3656
3657 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3658 if (encoding != NULL) {
3659 encoding += 7;
3660 }
3661 /*
3662 * skip blank
3663 */
3664 if (encoding && IS_WS_HTML(*encoding))
3665 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3666 if (encoding && *encoding == '=') {
3667 encoding ++;
3668 copy = xmlStrdup(encoding);
3669 if (copy == NULL)
3670 htmlErrMemory(ctxt);
3671 xmlSetDeclaredEncoding(ctxt, copy);
3672 }
3673 }
3674
3675 /**
3676 * htmlCheckMeta:
3677 * @ctxt: an HTML parser context
3678 * @atts: the attributes values
3679 *
3680 * Checks an attributes from a Meta tag
3681 */
3682 static void
htmlCheckMeta(htmlParserCtxtPtr ctxt,const xmlChar ** atts)3683 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3684 int i;
3685 const xmlChar *att, *value;
3686 int http = 0;
3687 const xmlChar *content = NULL;
3688
3689 if ((ctxt == NULL) || (atts == NULL))
3690 return;
3691
3692 i = 0;
3693 att = atts[i++];
3694 while (att != NULL) {
3695 value = atts[i++];
3696 if (value != NULL) {
3697 if ((!xmlStrcasecmp(att, BAD_CAST "http-equiv")) &&
3698 (!xmlStrcasecmp(value, BAD_CAST "Content-Type"))) {
3699 http = 1;
3700 } else if (!xmlStrcasecmp(att, BAD_CAST "charset")) {
3701 xmlChar *copy;
3702
3703 copy = xmlStrdup(value);
3704 if (copy == NULL)
3705 htmlErrMemory(ctxt);
3706 xmlSetDeclaredEncoding(ctxt, copy);
3707 } else if (!xmlStrcasecmp(att, BAD_CAST "content")) {
3708 content = value;
3709 }
3710 }
3711 att = atts[i++];
3712 }
3713 if ((http) && (content != NULL))
3714 htmlCheckEncoding(ctxt, content);
3715
3716 }
3717
3718 /**
3719 * htmlAttrHashInsert:
3720 * @ctxt: parser context
3721 * @size: size of the hash table
3722 * @name: attribute name
3723 * @hashValue: hash value of name
3724 * @aindex: attribute index (this is a multiple of 5)
3725 *
3726 * Inserts a new attribute into the hash table.
3727 *
3728 * Returns INT_MAX if no existing attribute was found, the attribute
3729 * index if an attribute was found, -1 if a memory allocation failed.
3730 */
3731 static int
htmlAttrHashInsert(xmlParserCtxtPtr ctxt,unsigned size,const xmlChar * name,unsigned hashValue,int aindex)3732 htmlAttrHashInsert(xmlParserCtxtPtr ctxt, unsigned size, const xmlChar *name,
3733 unsigned hashValue, int aindex) {
3734 xmlAttrHashBucket *table = ctxt->attrHash;
3735 xmlAttrHashBucket *bucket;
3736 unsigned hindex;
3737
3738 hindex = hashValue & (size - 1);
3739 bucket = &table[hindex];
3740
3741 while (bucket->index >= 0) {
3742 const xmlChar **atts = &ctxt->atts[bucket->index];
3743
3744 if (name == atts[0])
3745 return(bucket->index);
3746
3747 hindex++;
3748 bucket++;
3749 if (hindex >= size) {
3750 hindex = 0;
3751 bucket = table;
3752 }
3753 }
3754
3755 bucket->index = aindex;
3756
3757 return(INT_MAX);
3758 }
3759
3760 /**
3761 * htmlParseStartTag:
3762 * @ctxt: an HTML parser context
3763 *
3764 * parse a start of tag either for rule element or
3765 * EmptyElement. In both case we don't parse the tag closing chars.
3766 *
3767 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3768 *
3769 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3770 *
3771 * With namespace:
3772 *
3773 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3774 *
3775 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3776 *
3777 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3778 */
3779
3780 static void
htmlParseStartTag(htmlParserCtxtPtr ctxt)3781 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3782 const xmlChar *name;
3783 const xmlChar *attname;
3784 xmlChar *attvalue;
3785 const xmlChar **atts;
3786 int nbatts = 0;
3787 int maxatts;
3788 int meta = 0;
3789 int i;
3790 int discardtag = 0;
3791
3792 ctxt->endCheckState = 0;
3793
3794 SKIP(1);
3795
3796 atts = ctxt->atts;
3797 maxatts = ctxt->maxatts;
3798
3799 GROW;
3800 name = htmlParseHTMLName(ctxt, 0).name;
3801 if (name == NULL)
3802 return;
3803 if (xmlStrEqual(name, BAD_CAST"meta"))
3804 meta = 1;
3805
3806 if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
3807 /*
3808 * Check for auto-closure of HTML elements.
3809 */
3810 htmlAutoClose(ctxt, name);
3811
3812 /*
3813 * Check for implied HTML elements.
3814 */
3815 htmlCheckImplied(ctxt, name);
3816
3817 /*
3818 * Avoid html at any level > 0, head at any level != 1
3819 * or any attempt to recurse body
3820 */
3821 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3822 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3823 "htmlParseStartTag: misplaced <html> tag\n",
3824 name, NULL);
3825 discardtag = 1;
3826 ctxt->depth++;
3827 }
3828 if ((ctxt->nameNr != 1) &&
3829 (xmlStrEqual(name, BAD_CAST"head"))) {
3830 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3831 "htmlParseStartTag: misplaced <head> tag\n",
3832 name, NULL);
3833 discardtag = 1;
3834 ctxt->depth++;
3835 }
3836 if (xmlStrEqual(name, BAD_CAST"body")) {
3837 int indx;
3838 for (indx = 0;indx < ctxt->nameNr;indx++) {
3839 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3840 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3841 "htmlParseStartTag: misplaced <body> tag\n",
3842 name, NULL);
3843 discardtag = 1;
3844 ctxt->depth++;
3845 }
3846 }
3847 }
3848 }
3849
3850 /*
3851 * Now parse the attributes, it ends up with the ending
3852 *
3853 * (S Attribute)* S?
3854 */
3855 SKIP_BLANKS;
3856 while ((ctxt->input->cur < ctxt->input->end) &&
3857 (CUR != '>') &&
3858 ((CUR != '/') || (NXT(1) != '>')) &&
3859 (PARSER_STOPPED(ctxt) == 0)) {
3860 xmlHashedString hattname;
3861
3862 /* unexpected-solidus-in-tag */
3863 if (CUR == '/') {
3864 SKIP(1);
3865 SKIP_BLANKS;
3866 continue;
3867 }
3868 GROW;
3869 hattname = htmlParseAttribute(ctxt, &attvalue);
3870 attname = hattname.name;
3871
3872 if (attname != NULL) {
3873 /*
3874 * Add the pair to atts
3875 */
3876 if (nbatts + 4 > maxatts) {
3877 const xmlChar **tmp;
3878 unsigned *utmp;
3879 int newSize;
3880
3881 newSize = xmlGrowCapacity(maxatts,
3882 sizeof(tmp[0]) * 2 + sizeof(utmp[0]),
3883 11, HTML_MAX_ATTRS);
3884 if (newSize < 0) {
3885 htmlErrMemory(ctxt);
3886 goto failed;
3887 }
3888 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3889 if (newSize < 2)
3890 newSize = 2;
3891 #endif
3892 tmp = xmlRealloc(atts, newSize * sizeof(tmp[0]) * 2);
3893 if (tmp == NULL) {
3894 htmlErrMemory(ctxt);
3895 goto failed;
3896 }
3897 atts = tmp;
3898 ctxt->atts = tmp;
3899
3900 utmp = xmlRealloc(ctxt->attallocs, newSize * sizeof(utmp[0]));
3901 if (utmp == NULL) {
3902 htmlErrMemory(ctxt);
3903 goto failed;
3904 }
3905 ctxt->attallocs = utmp;
3906
3907 maxatts = newSize * 2;
3908 ctxt->maxatts = maxatts;
3909 }
3910
3911 ctxt->attallocs[nbatts/2] = hattname.hashValue;
3912 atts[nbatts++] = attname;
3913 atts[nbatts++] = attvalue;
3914
3915 attvalue = NULL;
3916 }
3917
3918 failed:
3919 if (attvalue != NULL)
3920 xmlFree(attvalue);
3921
3922 SKIP_BLANKS;
3923 }
3924
3925 if (ctxt->input->cur >= ctxt->input->end) {
3926 discardtag = 1;
3927 goto done;
3928 }
3929
3930 /*
3931 * Verify that attribute names are unique.
3932 */
3933 if (nbatts > 2) {
3934 unsigned attrHashSize;
3935 int j, k;
3936
3937 attrHashSize = 4;
3938 while (attrHashSize / 2 < (unsigned) nbatts / 2)
3939 attrHashSize *= 2;
3940
3941 if (attrHashSize > ctxt->attrHashMax) {
3942 xmlAttrHashBucket *tmp;
3943
3944 tmp = xmlRealloc(ctxt->attrHash, attrHashSize * sizeof(tmp[0]));
3945 if (tmp == NULL) {
3946 htmlErrMemory(ctxt);
3947 goto done;
3948 }
3949
3950 ctxt->attrHash = tmp;
3951 ctxt->attrHashMax = attrHashSize;
3952 }
3953
3954 memset(ctxt->attrHash, -1, attrHashSize * sizeof(ctxt->attrHash[0]));
3955
3956 for (i = 0, j = 0, k = 0; i < nbatts; i += 2, k++) {
3957 unsigned hashValue;
3958 int res;
3959
3960 attname = atts[i];
3961 hashValue = ctxt->attallocs[k] | 0x80000000;
3962
3963 res = htmlAttrHashInsert(ctxt, attrHashSize, attname,
3964 hashValue, j);
3965 if (res < 0)
3966 continue;
3967
3968 if (res == INT_MAX) {
3969 atts[j] = atts[i];
3970 atts[j+1] = atts[i+1];
3971 j += 2;
3972 } else {
3973 xmlFree((xmlChar *) atts[i+1]);
3974 }
3975 }
3976
3977 nbatts = j;
3978 }
3979
3980 if (nbatts > 0) {
3981 atts[nbatts] = NULL;
3982 atts[nbatts + 1] = NULL;
3983
3984 /*
3985 * Apple's new libiconv is so broken that you routinely run into
3986 * issues when fuzz testing (by accident with an uninstrumented
3987 * libiconv). Here's a harmless (?) example:
3988 *
3989 * printf '>' | iconv -f shift_jis -t utf-8 | hexdump -C
3990 * printf '\xfc\x00\x00' | iconv -f shift_jis -t utf-8 | hexdump -C
3991 * printf '>\xfc\x00\x00' | iconv -f shift_jis -t utf-8 | hexdump -C
3992 *
3993 * The last command fails to detect the illegal sequence.
3994 */
3995 #if !defined(__APPLE__) || \
3996 !defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
3997 /*
3998 * Handle specific association to the META tag
3999 */
4000 if (meta)
4001 htmlCheckMeta(ctxt, atts);
4002 #endif
4003 }
4004
4005 /*
4006 * SAX: Start of Element !
4007 */
4008 if (!discardtag) {
4009 if (ctxt->options & HTML_PARSE_HTML5) {
4010 if (ctxt->nameNr > 0)
4011 htmlnamePop(ctxt);
4012 }
4013
4014 htmlnamePush(ctxt, name);
4015 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4016 if (nbatts != 0)
4017 ctxt->sax->startElement(ctxt->userData, name, atts);
4018 else
4019 ctxt->sax->startElement(ctxt->userData, name, NULL);
4020 }
4021 }
4022
4023 done:
4024 if (atts != NULL) {
4025 for (i = 1;i < nbatts;i += 2) {
4026 if (atts[i] != NULL)
4027 xmlFree((xmlChar *) atts[i]);
4028 }
4029 }
4030 }
4031
4032 /**
4033 * htmlParseEndTag:
4034 * @ctxt: an HTML parser context
4035 *
4036 * parse an end of tag
4037 *
4038 * [42] ETag ::= '</' Name S? '>'
4039 *
4040 * With namespace
4041 *
4042 * [NS 9] ETag ::= '</' QName S? '>'
4043 *
4044 * Returns 1 if the current level should be closed.
4045 */
4046
4047 static void
htmlParseEndTag(htmlParserCtxtPtr ctxt)4048 htmlParseEndTag(htmlParserCtxtPtr ctxt)
4049 {
4050 const xmlChar *name;
4051 const xmlChar *oldname;
4052 int i;
4053
4054 ctxt->endCheckState = 0;
4055
4056 SKIP(2);
4057
4058 if (ctxt->input->cur >= ctxt->input->end) {
4059 htmlCheckParagraph(ctxt);
4060 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4061 (ctxt->sax->characters != NULL))
4062 ctxt->sax->characters(ctxt->userData,
4063 BAD_CAST "</", 2);
4064 return;
4065 }
4066
4067 if (CUR == '>') {
4068 SKIP(1);
4069 return;
4070 }
4071
4072 if (!IS_ASCII_LETTER(CUR)) {
4073 htmlParseComment(ctxt, /* bogus */ 1);
4074 return;
4075 }
4076
4077 name = htmlParseHTMLName(ctxt, 0).name;
4078 if (name == NULL)
4079 return;
4080
4081 /*
4082 * Parse and ignore attributes.
4083 */
4084 SKIP_BLANKS;
4085 while ((ctxt->input->cur < ctxt->input->end) &&
4086 (CUR != '>') &&
4087 ((CUR != '/') || (NXT(1) != '>')) &&
4088 (ctxt->instate != XML_PARSER_EOF)) {
4089 xmlChar *attvalue = NULL;
4090
4091 /* unexpected-solidus-in-tag */
4092 if (CUR == '/') {
4093 SKIP(1);
4094 SKIP_BLANKS;
4095 continue;
4096 }
4097 GROW;
4098 htmlParseAttribute(ctxt, &attvalue);
4099 if (attvalue != NULL)
4100 xmlFree(attvalue);
4101
4102 SKIP_BLANKS;
4103 }
4104
4105 if (CUR == '>') {
4106 SKIP(1);
4107 } else if ((CUR == '/') && (NXT(1) == '>')) {
4108 SKIP(2);
4109 } else {
4110 return;
4111 }
4112
4113 if (ctxt->options & HTML_PARSE_HTML5) {
4114 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4115 ctxt->sax->endElement(ctxt->userData, name);
4116 return;
4117 }
4118
4119 /*
4120 * if we ignored misplaced tags in htmlParseStartTag don't pop them
4121 * out now.
4122 */
4123 if ((ctxt->depth > 0) &&
4124 (xmlStrEqual(name, BAD_CAST "html") ||
4125 xmlStrEqual(name, BAD_CAST "body") ||
4126 xmlStrEqual(name, BAD_CAST "head"))) {
4127 ctxt->depth--;
4128 return;
4129 }
4130
4131 /*
4132 * If the name read is not one of the element in the parsing stack
4133 * then return, it's just an error.
4134 */
4135 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4136 if (xmlStrEqual(name, ctxt->nameTab[i]))
4137 break;
4138 }
4139 if (i < 0) {
4140 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4141 "Unexpected end tag : %s\n", name, NULL);
4142 return;
4143 }
4144
4145
4146 /*
4147 * Check for auto-closure of HTML elements.
4148 */
4149
4150 htmlAutoCloseOnClose(ctxt, name);
4151
4152 /*
4153 * Well formedness constraints, opening and closing must match.
4154 * With the exception that the autoclose may have popped stuff out
4155 * of the stack.
4156 */
4157 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4158 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4159 "Opening and ending tag mismatch: %s and %s\n",
4160 name, ctxt->name);
4161 }
4162
4163 /*
4164 * SAX: End of Tag
4165 */
4166 oldname = ctxt->name;
4167 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4168 htmlParserFinishElementParsing(ctxt);
4169 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4170 ctxt->sax->endElement(ctxt->userData, name);
4171 htmlnamePop(ctxt);
4172 }
4173 }
4174
4175 /**
4176 * htmlParseContent:
4177 * @ctxt: an HTML parser context
4178 *
4179 * Parse a content: comment, sub-element, reference or text.
4180 * New version for non recursive htmlParseElementInternal
4181 */
4182
4183 static void
htmlParseContent(htmlParserCtxtPtr ctxt)4184 htmlParseContent(htmlParserCtxtPtr ctxt) {
4185 GROW;
4186
4187 while ((PARSER_STOPPED(ctxt) == 0) &&
4188 (ctxt->input->cur < ctxt->input->end)) {
4189 int mode;
4190
4191 mode = ctxt->endCheckState;
4192
4193 if ((mode == 0) && (CUR == '<')) {
4194 if (NXT(1) == '/') {
4195 htmlParseEndTag(ctxt);
4196 } else if (NXT(1) == '!') {
4197 /*
4198 * Sometimes DOCTYPE arrives in the middle of the document
4199 */
4200 if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4201 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4202 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4203 (UPP(8) == 'E')) {
4204 htmlParseDocTypeDecl(ctxt);
4205 } else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4206 SKIP(4);
4207 htmlParseComment(ctxt, /* bogus */ 0);
4208 } else {
4209 SKIP(2);
4210 htmlParseComment(ctxt, /* bogus */ 1);
4211 }
4212 } else if (NXT(1) == '?') {
4213 SKIP(1);
4214 htmlParseComment(ctxt, /* bogus */ 1);
4215 } else if (IS_ASCII_LETTER(NXT(1))) {
4216 htmlParseElementInternal(ctxt);
4217 } else {
4218 htmlCheckParagraph(ctxt);
4219 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4220 (ctxt->sax->characters != NULL))
4221 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4222 SKIP(1);
4223 }
4224 } else {
4225 htmlParseCharData(ctxt, /* partial */ 0);
4226 }
4227
4228 SHRINK;
4229 GROW;
4230 }
4231
4232 if (ctxt->input->cur >= ctxt->input->end)
4233 htmlAutoCloseOnEnd(ctxt);
4234 }
4235
4236 /**
4237 * htmlParseElementInternal:
4238 * @ctxt: an HTML parser context
4239 *
4240 * parse an HTML element, new version, non recursive
4241 *
4242 * [39] element ::= EmptyElemTag | STag content ETag
4243 *
4244 * [41] Attribute ::= Name Eq AttValue
4245 */
4246
4247 static int
htmlParseElementInternal(htmlParserCtxtPtr ctxt)4248 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4249 const xmlChar *name;
4250 const htmlElemDesc * info;
4251 htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4252
4253 if ((ctxt == NULL) || (ctxt->input == NULL))
4254 return(0);
4255
4256 /* Capture start position */
4257 if (ctxt->record_info) {
4258 node_info.begin_pos = ctxt->input->consumed +
4259 (CUR_PTR - ctxt->input->base);
4260 node_info.begin_line = ctxt->input->line;
4261 }
4262
4263 htmlParseStartTag(ctxt);
4264 name = ctxt->name;
4265 if (name == NULL)
4266 return(0);
4267
4268 if (ctxt->record_info)
4269 htmlNodeInfoPush(ctxt, &node_info);
4270
4271 /*
4272 * Check for an Empty Element labeled the XML/SGML way
4273 */
4274 if ((CUR == '/') && (NXT(1) == '>')) {
4275 SKIP(2);
4276 htmlParserFinishElementParsing(ctxt);
4277 if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
4278 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4279 ctxt->sax->endElement(ctxt->userData, name);
4280 }
4281 htmlnamePop(ctxt);
4282 return(0);
4283 }
4284
4285 if (CUR != '>')
4286 return(0);
4287 SKIP(1);
4288
4289 /*
4290 * Lookup the info for that element.
4291 */
4292 info = htmlTagLookup(name);
4293
4294 /*
4295 * Check for an Empty Element from DTD definition
4296 */
4297 if ((info != NULL) && (info->empty)) {
4298 htmlParserFinishElementParsing(ctxt);
4299 if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
4300 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4301 ctxt->sax->endElement(ctxt->userData, name);
4302 }
4303 htmlnamePop(ctxt);
4304 return(0);
4305 }
4306
4307 if (info != NULL)
4308 ctxt->endCheckState = info->dataMode;
4309
4310 return(1);
4311 }
4312
4313 /**
4314 * htmlParseElement:
4315 * @ctxt: an HTML parser context
4316 *
4317 * DEPRECATED: Internal function, don't use.
4318 *
4319 * parse an HTML element, this is highly recursive
4320 * this is kept for compatibility with previous code versions
4321 *
4322 * [39] element ::= EmptyElemTag | STag content ETag
4323 *
4324 * [41] Attribute ::= Name Eq AttValue
4325 */
4326
4327 void
htmlParseElement(htmlParserCtxtPtr ctxt)4328 htmlParseElement(htmlParserCtxtPtr ctxt) {
4329 const xmlChar *oldptr;
4330 int depth;
4331
4332 if ((ctxt == NULL) || (ctxt->input == NULL))
4333 return;
4334
4335 if (htmlParseElementInternal(ctxt) == 0)
4336 return;
4337
4338 /*
4339 * Parse the content of the element:
4340 */
4341 depth = ctxt->nameNr;
4342 while (CUR != 0) {
4343 oldptr = ctxt->input->cur;
4344 htmlParseContent(ctxt);
4345 if (oldptr==ctxt->input->cur) break;
4346 if (ctxt->nameNr < depth) break;
4347 }
4348
4349 if (CUR == 0) {
4350 htmlAutoCloseOnEnd(ctxt);
4351 }
4352 }
4353
4354 /**
4355 * htmlCtxtParseContentInternal:
4356 * @ctxt: parser context
4357 * @input: parser input
4358 *
4359 * Returns a node list.
4360 */
4361 xmlNodePtr
htmlCtxtParseContentInternal(htmlParserCtxtPtr ctxt,xmlParserInputPtr input)4362 htmlCtxtParseContentInternal(htmlParserCtxtPtr ctxt, xmlParserInputPtr input) {
4363 xmlNodePtr root;
4364 xmlNodePtr list = NULL;
4365 xmlChar *rootName = BAD_CAST "#root";
4366
4367 root = xmlNewDocNode(ctxt->myDoc, NULL, rootName, NULL);
4368 if (root == NULL) {
4369 htmlErrMemory(ctxt);
4370 return(NULL);
4371 }
4372
4373 if (xmlCtxtPushInput(ctxt, input) < 0) {
4374 xmlFreeNode(root);
4375 return(NULL);
4376 }
4377
4378 htmlnamePush(ctxt, rootName);
4379 nodePush(ctxt, root);
4380
4381 htmlParseContent(ctxt);
4382
4383 /* TODO: Use xmlCtxtIsCatastrophicError */
4384 if (ctxt->errNo != XML_ERR_NO_MEMORY) {
4385 xmlNodePtr cur;
4386
4387 /*
4388 * Unlink newly created node list.
4389 */
4390 list = root->children;
4391 root->children = NULL;
4392 root->last = NULL;
4393 for (cur = list; cur != NULL; cur = cur->next)
4394 cur->parent = NULL;
4395 }
4396
4397 nodePop(ctxt);
4398 htmlnamePop(ctxt);
4399
4400 xmlCtxtPopInput(ctxt);
4401
4402 xmlFreeNode(root);
4403 return(list);
4404 }
4405
4406 /**
4407 * htmlParseDocument:
4408 * @ctxt: an HTML parser context
4409 *
4410 * Parse an HTML document and invoke the SAX handlers. This is useful
4411 * if you're only interested in custom SAX callbacks. If you want a
4412 * document tree, use htmlCtxtParseDocument.
4413 *
4414 * Returns 0, -1 in case of error.
4415 */
4416
4417 int
htmlParseDocument(htmlParserCtxtPtr ctxt)4418 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4419 xmlDtdPtr dtd;
4420
4421 if ((ctxt == NULL) || (ctxt->input == NULL))
4422 return(-1);
4423
4424 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
4425 ctxt->sax->setDocumentLocator(ctxt->userData,
4426 (xmlSAXLocator *) &xmlDefaultSAXLocator);
4427 }
4428
4429 xmlDetectEncoding(ctxt);
4430
4431 /*
4432 * TODO: Implement HTML5 prescan algorithm
4433 */
4434
4435 /*
4436 * This is wrong but matches long-standing behavior. In most
4437 * cases, a document starting with an XML declaration will
4438 * specify UTF-8. The HTML5 prescan algorithm handles
4439 * XML declarations in a better way.
4440 */
4441 if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
4442 (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0))
4443 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
4444
4445 /*
4446 * Wipe out everything which is before the first '<'
4447 */
4448 SKIP_BLANKS;
4449
4450 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4451 ctxt->sax->startDocument(ctxt->userData);
4452
4453 /*
4454 * Parse possible comments and PIs before any content
4455 */
4456 while (CUR == '<') {
4457 if ((NXT(1) == '!') && (NXT(2) == '-') && (NXT(3) == '-')) {
4458 SKIP(4);
4459 htmlParseComment(ctxt, /* bogus */ 0);
4460 } else if (NXT(1) == '?') {
4461 SKIP(1);
4462 htmlParseComment(ctxt, /* bogus */ 1);
4463 } else {
4464 break;
4465 }
4466 SKIP_BLANKS;
4467 }
4468
4469 /*
4470 * Then possibly doc type declaration(s) and more Misc
4471 * (doctypedecl Misc*)?
4472 */
4473 if ((CUR == '<') && (NXT(1) == '!') &&
4474 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4475 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4476 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4477 (UPP(8) == 'E')) {
4478 ctxt->instate = XML_PARSER_MISC;
4479 htmlParseDocTypeDecl(ctxt);
4480 }
4481 SKIP_BLANKS;
4482
4483 /*
4484 * Parse possible comments and PIs before any content
4485 */
4486 ctxt->instate = XML_PARSER_PROLOG;
4487 while (CUR == '<') {
4488 if ((NXT(1) == '!') && (NXT(2) == '-') && (NXT(3) == '-')) {
4489 SKIP(4);
4490 htmlParseComment(ctxt, /* bogus */ 0);
4491 } else if (NXT(1) == '?') {
4492 SKIP(1);
4493 htmlParseComment(ctxt, /* bogus */ 1);
4494 } else {
4495 break;
4496 }
4497 SKIP_BLANKS;
4498 }
4499
4500 /*
4501 * Time to start parsing the tree itself
4502 */
4503 ctxt->instate = XML_PARSER_CONTENT;
4504 htmlParseContent(ctxt);
4505
4506 /*
4507 * autoclose
4508 */
4509 if (CUR == 0)
4510 htmlAutoCloseOnEnd(ctxt);
4511
4512
4513 /*
4514 * SAX: end of the document processing.
4515 */
4516 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4517 ctxt->sax->endDocument(ctxt->userData);
4518
4519 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4520 dtd = xmlGetIntSubset(ctxt->myDoc);
4521 if (dtd == NULL) {
4522 ctxt->myDoc->intSubset =
4523 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4524 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4525 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4526 if (ctxt->myDoc->intSubset == NULL)
4527 htmlErrMemory(ctxt);
4528 }
4529 }
4530 if (! ctxt->wellFormed) return(-1);
4531 return(0);
4532 }
4533
4534
4535 /************************************************************************
4536 * *
4537 * Parser contexts handling *
4538 * *
4539 ************************************************************************/
4540
4541 /**
4542 * htmlInitParserCtxt:
4543 * @ctxt: an HTML parser context
4544 * @sax: SAX handler
4545 * @userData: user data
4546 *
4547 * Initialize a parser context
4548 *
4549 * Returns 0 in case of success and -1 in case of error
4550 */
4551
4552 static int
htmlInitParserCtxt(htmlParserCtxtPtr ctxt,const htmlSAXHandler * sax,void * userData)4553 htmlInitParserCtxt(htmlParserCtxtPtr ctxt, const htmlSAXHandler *sax,
4554 void *userData)
4555 {
4556 if (ctxt == NULL) return(-1);
4557 memset(ctxt, 0, sizeof(htmlParserCtxt));
4558
4559 ctxt->dict = xmlDictCreate();
4560 if (ctxt->dict == NULL)
4561 return(-1);
4562
4563 if (ctxt->sax == NULL)
4564 ctxt->sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4565 if (ctxt->sax == NULL)
4566 return(-1);
4567 if (sax == NULL) {
4568 memset(ctxt->sax, 0, sizeof(htmlSAXHandler));
4569 xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);
4570 ctxt->userData = ctxt;
4571 } else {
4572 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4573 ctxt->userData = userData ? userData : ctxt;
4574 }
4575
4576 /* Allocate the Input stack */
4577 ctxt->inputTab = (htmlParserInputPtr *)
4578 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4579 if (ctxt->inputTab == NULL)
4580 return(-1);
4581 ctxt->inputNr = 0;
4582 ctxt->inputMax = 5;
4583 ctxt->input = NULL;
4584 ctxt->version = NULL;
4585 ctxt->encoding = NULL;
4586 ctxt->standalone = -1;
4587 ctxt->instate = XML_PARSER_START;
4588
4589 /* Allocate the Node stack */
4590 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4591 if (ctxt->nodeTab == NULL)
4592 return(-1);
4593 ctxt->nodeNr = 0;
4594 ctxt->nodeMax = 10;
4595 ctxt->node = NULL;
4596
4597 /* Allocate the Name stack */
4598 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4599 if (ctxt->nameTab == NULL)
4600 return(-1);
4601 ctxt->nameNr = 0;
4602 ctxt->nameMax = 10;
4603 ctxt->name = NULL;
4604
4605 ctxt->nodeInfoTab = NULL;
4606 ctxt->nodeInfoNr = 0;
4607 ctxt->nodeInfoMax = 0;
4608
4609 ctxt->myDoc = NULL;
4610 ctxt->wellFormed = 1;
4611 ctxt->replaceEntities = 0;
4612 ctxt->linenumbers = xmlLineNumbersDefaultValue;
4613 ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
4614 ctxt->html = 1;
4615 ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
4616 ctxt->vctxt.userData = ctxt;
4617 ctxt->vctxt.error = xmlParserValidityError;
4618 ctxt->vctxt.warning = xmlParserValidityWarning;
4619 ctxt->record_info = 0;
4620 ctxt->validate = 0;
4621 ctxt->checkIndex = 0;
4622 ctxt->catalogs = NULL;
4623 xmlInitNodeInfoSeq(&ctxt->node_seq);
4624 return(0);
4625 }
4626
4627 /**
4628 * htmlFreeParserCtxt:
4629 * @ctxt: an HTML parser context
4630 *
4631 * Free all the memory used by a parser context. However the parsed
4632 * document in ctxt->myDoc is not freed.
4633 */
4634
4635 void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)4636 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4637 {
4638 xmlFreeParserCtxt(ctxt);
4639 }
4640
4641 /**
4642 * htmlNewParserCtxt:
4643 *
4644 * Allocate and initialize a new HTML parser context.
4645 *
4646 * This can be used to parse HTML documents into DOM trees with
4647 * functions like xmlCtxtReadFile or xmlCtxtReadMemory.
4648 *
4649 * See htmlCtxtUseOptions for parser options.
4650 *
4651 * See xmlCtxtSetErrorHandler for advanced error handling.
4652 *
4653 * See htmlNewSAXParserCtxt for custom SAX parsers.
4654 *
4655 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4656 */
4657
4658 htmlParserCtxtPtr
htmlNewParserCtxt(void)4659 htmlNewParserCtxt(void)
4660 {
4661 return(htmlNewSAXParserCtxt(NULL, NULL));
4662 }
4663
4664 /**
4665 * htmlNewSAXParserCtxt:
4666 * @sax: SAX handler
4667 * @userData: user data
4668 *
4669 * Allocate and initialize a new HTML SAX parser context. If userData
4670 * is NULL, the parser context will be passed as user data.
4671 *
4672 * Available since 2.11.0. If you want support older versions,
4673 * it's best to invoke htmlNewParserCtxt and set ctxt->sax with
4674 * struct assignment.
4675 *
4676 * Also see htmlNewParserCtxt.
4677 *
4678 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4679 */
4680
4681 htmlParserCtxtPtr
htmlNewSAXParserCtxt(const htmlSAXHandler * sax,void * userData)4682 htmlNewSAXParserCtxt(const htmlSAXHandler *sax, void *userData)
4683 {
4684 xmlParserCtxtPtr ctxt;
4685
4686 xmlInitParser();
4687
4688 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4689 if (ctxt == NULL)
4690 return(NULL);
4691 memset(ctxt, 0, sizeof(xmlParserCtxt));
4692 if (htmlInitParserCtxt(ctxt, sax, userData) < 0) {
4693 htmlFreeParserCtxt(ctxt);
4694 return(NULL);
4695 }
4696 return(ctxt);
4697 }
4698
4699 static htmlParserCtxtPtr
htmlCreateMemoryParserCtxtInternal(const char * url,const char * buffer,size_t size,const char * encoding)4700 htmlCreateMemoryParserCtxtInternal(const char *url,
4701 const char *buffer, size_t size,
4702 const char *encoding) {
4703 xmlParserCtxtPtr ctxt;
4704 xmlParserInputPtr input;
4705
4706 if (buffer == NULL)
4707 return(NULL);
4708
4709 ctxt = htmlNewParserCtxt();
4710 if (ctxt == NULL)
4711 return(NULL);
4712
4713 input = xmlCtxtNewInputFromMemory(ctxt, url, buffer, size, encoding, 0);
4714 if (input == NULL) {
4715 xmlFreeParserCtxt(ctxt);
4716 return(NULL);
4717 }
4718
4719 if (xmlCtxtPushInput(ctxt, input) < 0) {
4720 xmlFreeInputStream(input);
4721 xmlFreeParserCtxt(ctxt);
4722 return(NULL);
4723 }
4724
4725 return(ctxt);
4726 }
4727
4728 /**
4729 * htmlCreateMemoryParserCtxt:
4730 * @buffer: a pointer to a char array
4731 * @size: the size of the array
4732 *
4733 * DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadMemory.
4734 *
4735 * Create a parser context for an HTML in-memory document. The input
4736 * buffer must not contain any terminating null bytes.
4737 *
4738 * Returns the new parser context or NULL
4739 */
4740 htmlParserCtxtPtr
htmlCreateMemoryParserCtxt(const char * buffer,int size)4741 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4742 if (size <= 0)
4743 return(NULL);
4744
4745 return(htmlCreateMemoryParserCtxtInternal(NULL, buffer, size, NULL));
4746 }
4747
4748 /**
4749 * htmlCreateDocParserCtxt:
4750 * @str: a pointer to an array of xmlChar
4751 * @encoding: encoding (optional)
4752 *
4753 * Create a parser context for a null-terminated string.
4754 *
4755 * Returns the new parser context or NULL if a memory allocation failed.
4756 */
4757 static htmlParserCtxtPtr
htmlCreateDocParserCtxt(const xmlChar * str,const char * url,const char * encoding)4758 htmlCreateDocParserCtxt(const xmlChar *str, const char *url,
4759 const char *encoding) {
4760 xmlParserCtxtPtr ctxt;
4761 xmlParserInputPtr input;
4762
4763 if (str == NULL)
4764 return(NULL);
4765
4766 ctxt = htmlNewParserCtxt();
4767 if (ctxt == NULL)
4768 return(NULL);
4769
4770 input = xmlCtxtNewInputFromString(ctxt, url, (const char *) str,
4771 encoding, 0);
4772 if (input == NULL) {
4773 xmlFreeParserCtxt(ctxt);
4774 return(NULL);
4775 }
4776
4777 if (xmlCtxtPushInput(ctxt, input) < 0) {
4778 xmlFreeInputStream(input);
4779 xmlFreeParserCtxt(ctxt);
4780 return(NULL);
4781 }
4782
4783 return(ctxt);
4784 }
4785
4786 #ifdef LIBXML_PUSH_ENABLED
4787 /************************************************************************
4788 * *
4789 * Progressive parsing interfaces *
4790 * *
4791 ************************************************************************/
4792
/*
 * States of the tag scanner in htmlParseLookupGt. The current state is
 * saved in ctxt->endCheckState between calls; that member is reset to 0,
 * which is LSTATE_TAG_NAME.
 */
typedef enum {
    /* inside the tag name; whitespace ends it */
    LSTATE_TAG_NAME = 0,
    /* between attributes; any non-whitespace starts an attribute name */
    LSTATE_BEFORE_ATTR_NAME = 1,
    /* inside an attribute name; '=' or whitespace ends it */
    LSTATE_ATTR_NAME = 2,
    /* whitespace after an attribute name; '=' introduces a value */
    LSTATE_AFTER_ATTR_NAME = 3,
    /* after '='; a quote or any non-whitespace starts the value */
    LSTATE_BEFORE_ATTR_VALUE = 4,
    /* inside a double-quoted value; only '"' ends it */
    LSTATE_ATTR_VALUE_DQUOTED = 5,
    /* inside a single-quoted value; only '\'' ends it */
    LSTATE_ATTR_VALUE_SQUOTED = 6,
    /* inside an unquoted value; whitespace ends it */
    LSTATE_ATTR_VALUE_UNQUOTED = 7
} xmlLookupStates;
4803
4804 /**
4805 * htmlParseLookupGt:
4806 * @ctxt: an HTML parser context
4807 *
4808 * Check whether there's enough data in the input buffer to finish parsing
4809 * a tag. This has to take quotes into account.
4810 */
4811 static int
htmlParseLookupGt(xmlParserCtxtPtr ctxt)4812 htmlParseLookupGt(xmlParserCtxtPtr ctxt) {
4813 const xmlChar *cur;
4814 const xmlChar *end = ctxt->input->end;
4815 int state = ctxt->endCheckState;
4816 size_t index;
4817
4818 if (ctxt->checkIndex == 0)
4819 cur = ctxt->input->cur + 2; /* Skip '<a' or '</' */
4820 else
4821 cur = ctxt->input->cur + ctxt->checkIndex;
4822
4823 while (cur < end) {
4824 int c = *cur++;
4825
4826 if (state != LSTATE_ATTR_VALUE_SQUOTED &&
4827 state != LSTATE_ATTR_VALUE_DQUOTED) {
4828 if (c == '/' &&
4829 state != LSTATE_BEFORE_ATTR_VALUE &&
4830 state != LSTATE_ATTR_VALUE_UNQUOTED) {
4831 state = LSTATE_BEFORE_ATTR_NAME;
4832 continue;
4833 } else if (c == '>') {
4834 ctxt->checkIndex = 0;
4835 ctxt->endCheckState = 0;
4836 return(0);
4837 }
4838 }
4839
4840 switch (state) {
4841 case LSTATE_TAG_NAME:
4842 if (IS_WS_HTML(c))
4843 state = LSTATE_BEFORE_ATTR_NAME;
4844 break;
4845
4846 case LSTATE_BEFORE_ATTR_NAME:
4847 if (!IS_WS_HTML(c))
4848 state = LSTATE_ATTR_NAME;
4849 break;
4850
4851 case LSTATE_ATTR_NAME:
4852 if (c == '=')
4853 state = LSTATE_BEFORE_ATTR_VALUE;
4854 else if (IS_WS_HTML(c))
4855 state = LSTATE_AFTER_ATTR_NAME;
4856 break;
4857
4858 case LSTATE_AFTER_ATTR_NAME:
4859 if (c == '=')
4860 state = LSTATE_BEFORE_ATTR_VALUE;
4861 else if (!IS_WS_HTML(c))
4862 state = LSTATE_ATTR_NAME;
4863 break;
4864
4865 case LSTATE_BEFORE_ATTR_VALUE:
4866 if (c == '"')
4867 state = LSTATE_ATTR_VALUE_DQUOTED;
4868 else if (c == '\'')
4869 state = LSTATE_ATTR_VALUE_SQUOTED;
4870 else if (!IS_WS_HTML(c))
4871 state = LSTATE_ATTR_VALUE_UNQUOTED;
4872 break;
4873
4874 case LSTATE_ATTR_VALUE_DQUOTED:
4875 if (c == '"')
4876 state = LSTATE_BEFORE_ATTR_NAME;
4877 break;
4878
4879 case LSTATE_ATTR_VALUE_SQUOTED:
4880 if (c == '\'')
4881 state = LSTATE_BEFORE_ATTR_NAME;
4882 break;
4883
4884 case LSTATE_ATTR_VALUE_UNQUOTED:
4885 if (IS_WS_HTML(c))
4886 state = LSTATE_BEFORE_ATTR_NAME;
4887 break;
4888 }
4889 }
4890
4891 index = cur - ctxt->input->cur;
4892 if (index > LONG_MAX) {
4893 ctxt->checkIndex = 0;
4894 ctxt->endCheckState = 0;
4895 return(0);
4896 }
4897 ctxt->checkIndex = index;
4898 ctxt->endCheckState = state;
4899 return(-1);
4900 }
4901
4902 /**
4903 * htmlParseLookupString:
4904 * @ctxt: an XML parser context
4905 * @startDelta: delta to apply at the start
4906 * @str: string
4907 * @strLen: length of string
4908 *
4909 * Check whether the input buffer contains a string.
4910 */
4911 static int
htmlParseLookupString(xmlParserCtxtPtr ctxt,size_t startDelta,const char * str,size_t strLen,size_t extraLen)4912 htmlParseLookupString(xmlParserCtxtPtr ctxt, size_t startDelta,
4913 const char *str, size_t strLen, size_t extraLen) {
4914 const xmlChar *end = ctxt->input->end;
4915 const xmlChar *cur, *term;
4916 size_t index, rescan;
4917 int ret;
4918
4919 if (ctxt->checkIndex == 0) {
4920 cur = ctxt->input->cur + startDelta;
4921 } else {
4922 cur = ctxt->input->cur + ctxt->checkIndex;
4923 }
4924
4925 term = BAD_CAST strstr((const char *) cur, str);
4926 if ((term != NULL) &&
4927 ((size_t) (ctxt->input->end - term) >= extraLen + 1)) {
4928 ctxt->checkIndex = 0;
4929
4930 if (term - ctxt->input->cur > INT_MAX / 2)
4931 ret = INT_MAX / 2;
4932 else
4933 ret = term - ctxt->input->cur;
4934
4935 return(ret);
4936 }
4937
4938 /* Rescan (strLen + extraLen - 1) characters. */
4939 rescan = strLen + extraLen - 1;
4940 if ((size_t) (end - cur) <= rescan)
4941 end = cur;
4942 else
4943 end -= rescan;
4944 index = end - ctxt->input->cur;
4945 if (index > INT_MAX / 2) {
4946 ctxt->checkIndex = 0;
4947 ret = INT_MAX / 2;
4948 } else {
4949 ctxt->checkIndex = index;
4950 ret = -1;
4951 }
4952
4953 return(ret);
4954 }
4955
4956 /**
4957 * htmlParseLookupCommentEnd:
4958 * @ctxt: an HTML parser context
4959 *
4960 * Try to find a comment end tag in the input stream
4961 * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
4962 * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
4963 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4964 * to avoid rescanning sequences of bytes, it DOES change the state of the
4965 * parser, do not use liberally.
4966 *
4967 * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
4968 */
4969 static int
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)4970 htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
4971 {
4972 int mark = 0;
4973 int offset;
4974
4975 while (1) {
4976 mark = htmlParseLookupString(ctxt, 2, "--", 2, 0);
4977 if (mark < 0)
4978 break;
4979 /*
4980 * <!--> is a complete comment, but
4981 * <!--!> is not
4982 * <!---!> is not
4983 * <!----!> is
4984 */
4985 if ((NXT(mark+2) == '>') ||
4986 ((mark >= 4) && (NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
4987 ctxt->checkIndex = 0;
4988 break;
4989 }
4990 offset = (NXT(mark+2) == '!') ? 3 : 2;
4991 if (mark + offset >= ctxt->input->end - ctxt->input->cur) {
4992 ctxt->checkIndex = mark;
4993 return(-1);
4994 }
4995 ctxt->checkIndex = mark + 1;
4996 }
4997 return mark;
4998 }
4999
5000
/**
 * htmlParseTryOrFinish:
 * @ctxt:  an HTML parser context
 * @terminate:  last chunk indicator
 *
 * Try to progress on parsing the data currently in the input buffer.
 *
 * The function loops over a state machine keyed on ctxt->instate. It
 * returns when the parser is stopped, when there is no current input,
 * when the EOF state is reached, or when a construct cannot be parsed
 * yet because not enough data is buffered. When @terminate is non-zero
 * the "enough data" checks are skipped and whatever is buffered gets
 * parsed.
 *
 * The function has no return value; progress shows in ctxt->instate
 * and in the input position.
 */
static void
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
    while (PARSER_STOPPED(ctxt) == 0) {
        htmlParserInputPtr in;
        size_t avail;

        in = ctxt->input;
        if (in == NULL) break;
        /* Number of bytes not yet consumed in the input buffer. */
        avail = in->end - in->cur;

        switch (ctxt->instate) {
            case XML_PARSER_EOF:
                /*
                 * Document parsing is done !
                 */
                return;

            case XML_PARSER_START:
                /*
                 * Very first chars read from the document flow.
                 * Wait for at least 4 bytes unless this is the last
                 * chunk.
                 */
                if ((!terminate) && (avail < 4))
                    return;

                xmlDetectEncoding(ctxt);

                /*
                 * TODO: Implement HTML5 prescan algorithm
                 */

                /*
                 * This is wrong but matches long-standing behavior. In most
                 * cases, a document starting with an XML declaration will
                 * specify UTF-8. The HTML5 prescan algorithm handles
                 * XML declarations in a better way.
                 */
                if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
                    (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0)) {
                    xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
                }

                /* fall through */

            case XML_PARSER_XML_DECL:
                /* Emit the SAX events that open the document. */
                if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
                    ctxt->sax->setDocumentLocator(ctxt->userData,
                            (xmlSAXLocator *) &xmlDefaultSAXLocator);
                }
                if ((ctxt->sax) && (ctxt->sax->startDocument) &&
                    (!ctxt->disableSAX))
                    ctxt->sax->startDocument(ctxt->userData);

                /* Allow callback to modify state for tests */
                if ((ctxt->instate == XML_PARSER_START) ||
                    (ctxt->instate == XML_PARSER_XML_DECL))
                    ctxt->instate = XML_PARSER_MISC;
                break;

            case XML_PARSER_START_TAG:
                /* Wait until the whole tag up to '>' is buffered. */
                if ((!terminate) &&
                    (htmlParseLookupGt(ctxt) < 0))
                    return;

                htmlParseElementInternal(ctxt);

                ctxt->instate = XML_PARSER_CONTENT;
                break;

            case XML_PARSER_MISC:
            case XML_PARSER_PROLOG:
            case XML_PARSER_CONTENT: {
                int mode;

                /* Blanks are skipped only before the content starts. */
                if ((ctxt->instate == XML_PARSER_MISC) ||
                    (ctxt->instate == XML_PARSER_PROLOG)) {
                    SKIP_BLANKS;
                    avail = in->end - in->cur;
                }

                if (avail < 1)
                    return;
                /*
                 * Note that endCheckState is also used by
                 * htmlParseLookupGt.
                 */
                mode = ctxt->endCheckState;

                if (mode != 0) {
                    /*
                     * A non-zero mode sends all input to
                     * htmlParseCharData without looking for markup.
                     */
                    if (htmlParseCharData(ctxt, !terminate) == 0)
                        return;
                } else if (in->cur[0] == '<') {
                    int next;

                    /*
                     * Need the character after '<' to decide what
                     * follows. On the last chunk a lone '<' is treated
                     * as if followed by a space.
                     */
                    if (avail < 2) {
                        if (!terminate)
                            return;
                        next = ' ';
                    } else {
                        next = in->cur[1];
                    }

                    if (next == '!') {
                        if ((!terminate) && (avail < 4))
                            return;
                        /* "<!--" starts a regular comment. */
                        if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
                            if ((!terminate) &&
                                (htmlParseLookupCommentEnd(ctxt) < 0))
                                return;
                            SKIP(4);
                            htmlParseComment(ctxt, /* bogus */ 0);
                            /* don't change state */
                            break;
                        }

                        if ((!terminate) && (avail < 9))
                            return;
                        /* Case-insensitive match for "<!DOCTYPE". */
                        if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
                            (UPP(4) == 'C') && (UPP(5) == 'T') &&
                            (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                            (UPP(8) == 'E')) {
                            if ((!terminate) &&
                                (htmlParseLookupString(ctxt, 9, ">", 1,
                                                       0) < 0))
                                return;
                            htmlParseDocTypeDecl(ctxt);
                            if (ctxt->instate == XML_PARSER_MISC)
                                ctxt->instate = XML_PARSER_PROLOG;
                            else
                                ctxt->instate = XML_PARSER_CONTENT;
                        } else {
                            /*
                             * Any other "<!" construct is handed to the
                             * comment parser as a bogus comment.
                             */
                            ctxt->instate = XML_PARSER_CONTENT;
                            if ((!terminate) &&
                                (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
                                return;
                            SKIP(2);
                            htmlParseComment(ctxt, /* bogus */ 1);
                        }
                    } else if (next == '?') {
                        /* "<?" is handled as a bogus comment as well. */
                        if ((!terminate) &&
                            (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
                            return;
                        SKIP(1);
                        htmlParseComment(ctxt, /* bogus */ 1);
                        /* don't change state */
                    } else if (next == '/') {
                        /* The end tag is parsed on the next iteration. */
                        ctxt->instate = XML_PARSER_END_TAG;
                        ctxt->checkIndex = 0;
                    } else if (IS_ASCII_LETTER(next)) {
                        /* The start tag is parsed on the next iteration. */
                        ctxt->instate = XML_PARSER_START_TAG;
                        ctxt->checkIndex = 0;
                    } else {
                        /*
                         * '<' not followed by a letter, '/', '!' or '?'
                         * is reported as a literal character.
                         */
                        ctxt->instate = XML_PARSER_CONTENT;
                        htmlCheckParagraph(ctxt);
                        if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
                            (ctxt->sax->characters != NULL))
                            ctxt->sax->characters(ctxt->userData,
                                                  BAD_CAST "<", 1);
                        SKIP(1);
                    }
                } else {
                    ctxt->instate = XML_PARSER_CONTENT;
                    /*
                     * We follow the logic of the XML push parser:
                     * with less than HTML_PARSER_BIG_BUFFER_SIZE bytes
                     * buffered, wait until the next '<' is available.
                     */
                    if (avail < HTML_PARSER_BIG_BUFFER_SIZE) {
                        if ((!terminate) &&
                            (htmlParseLookupString(ctxt, 0, "<", 1, 0) < 0))
                            return;
                    }
                    ctxt->checkIndex = 0;
                    if (htmlParseCharData(ctxt, !terminate) == 0)
                        return;
                }

                break;
            }

            case XML_PARSER_END_TAG:
                /* Wait until the whole tag up to '>' is buffered. */
                if ((!terminate) &&
                    (htmlParseLookupGt(ctxt) < 0))
                    return;
                htmlParseEndTag(ctxt);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
                break;

            default:
                /* Unexpected state: report it and end parsing. */
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                             "HPP: internal error\n", NULL, NULL);
                ctxt->instate = XML_PARSER_EOF;
                break;
        }
    }
}
5204
5205 /**
5206 * htmlParseChunk:
5207 * @ctxt: an HTML parser context
5208 * @chunk: chunk of memory
5209 * @size: size of chunk in bytes
5210 * @terminate: last chunk indicator
5211 *
5212 * Parse a chunk of memory in push parser mode.
5213 *
5214 * Assumes that the parser context was initialized with
5215 * htmlCreatePushParserCtxt.
5216 *
5217 * The last chunk, which will often be empty, must be marked with
5218 * the @terminate flag. With the default SAX callbacks, the resulting
5219 * document will be available in ctxt->myDoc. This pointer will not
5220 * be freed by the library.
5221 *
5222 * If the document isn't well-formed, ctxt->myDoc is set to NULL.
5223 *
5224 * Returns an xmlParserErrors code (0 on success).
5225 */
5226 int
htmlParseChunk(htmlParserCtxtPtr ctxt,const char * chunk,int size,int terminate)5227 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5228 int terminate) {
5229 if ((ctxt == NULL) || (ctxt->input == NULL))
5230 return(XML_ERR_ARGUMENT);
5231 if (PARSER_STOPPED(ctxt) != 0)
5232 return(ctxt->errNo);
5233 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5234 (ctxt->input->buf != NULL)) {
5235 size_t pos = ctxt->input->cur - ctxt->input->base;
5236 int res;
5237
5238 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5239 xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
5240 if (res < 0) {
5241 htmlParseErr(ctxt, ctxt->input->buf->error,
5242 "xmlParserInputBufferPush failed", NULL, NULL);
5243 xmlHaltParser(ctxt);
5244 return (ctxt->errNo);
5245 }
5246 }
5247
5248 htmlParseTryOrFinish(ctxt, terminate);
5249
5250 if ((terminate) && (ctxt->instate != XML_PARSER_EOF)) {
5251 htmlAutoCloseOnEnd(ctxt);
5252
5253 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5254 ctxt->sax->endDocument(ctxt->userData);
5255
5256 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) &&
5257 (ctxt->myDoc != NULL)) {
5258 xmlDtdPtr dtd;
5259 dtd = xmlGetIntSubset(ctxt->myDoc);
5260 if (dtd == NULL) {
5261 ctxt->myDoc->intSubset =
5262 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5263 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5264 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5265 if (ctxt->myDoc->intSubset == NULL)
5266 htmlErrMemory(ctxt);
5267 }
5268 }
5269
5270 ctxt->instate = XML_PARSER_EOF;
5271 }
5272
5273 return((xmlParserErrors) ctxt->errNo);
5274 }
5275
5276 /************************************************************************
5277 * *
5278 * User entry points *
5279 * *
5280 ************************************************************************/
5281
5282 /**
5283 * htmlCreatePushParserCtxt:
5284 * @sax: a SAX handler (optional)
5285 * @user_data: The user data returned on SAX callbacks (optional)
5286 * @chunk: a pointer to an array of chars (optional)
5287 * @size: number of chars in the array
5288 * @filename: only used for error reporting (optional)
5289 * @enc: encoding (deprecated, pass XML_CHAR_ENCODING_NONE)
5290 *
5291 * Create a parser context for using the HTML parser in push mode.
5292 *
5293 * Returns the new parser context or NULL if a memory allocation
5294 * failed.
5295 */
5296 htmlParserCtxtPtr
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,void * user_data,const char * chunk,int size,const char * filename,xmlCharEncoding enc)5297 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5298 const char *chunk, int size, const char *filename,
5299 xmlCharEncoding enc) {
5300 htmlParserCtxtPtr ctxt;
5301 htmlParserInputPtr input;
5302 const char *encoding;
5303
5304 ctxt = htmlNewSAXParserCtxt(sax, user_data);
5305 if (ctxt == NULL)
5306 return(NULL);
5307
5308 encoding = xmlGetCharEncodingName(enc);
5309 input = xmlNewPushInput(filename, chunk, size);
5310 if (input == NULL) {
5311 htmlFreeParserCtxt(ctxt);
5312 return(NULL);
5313 }
5314
5315 if (xmlCtxtPushInput(ctxt, input) < 0) {
5316 xmlFreeInputStream(input);
5317 xmlFreeParserCtxt(ctxt);
5318 return(NULL);
5319 }
5320
5321 if (encoding != NULL)
5322 xmlSwitchEncodingName(ctxt, encoding);
5323
5324 return(ctxt);
5325 }
5326 #endif /* LIBXML_PUSH_ENABLED */
5327
5328 /**
5329 * htmlSAXParseDoc:
5330 * @cur: a pointer to an array of xmlChar
5331 * @encoding: a free form C string describing the HTML document encoding, or NULL
5332 * @sax: the SAX handler block
5333 * @userData: if using SAX, this pointer will be provided on callbacks.
5334 *
5335 * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadDoc.
5336 *
5337 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5338 * to handle parse events. If sax is NULL, fallback to the default DOM
5339 * behavior and return a tree.
5340 *
5341 * Returns the resulting document tree unless SAX is NULL or the document is
5342 * not well formed.
5343 */
5344
5345 htmlDocPtr
htmlSAXParseDoc(const xmlChar * cur,const char * encoding,htmlSAXHandlerPtr sax,void * userData)5346 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
5347 htmlSAXHandlerPtr sax, void *userData) {
5348 htmlDocPtr ret;
5349 htmlParserCtxtPtr ctxt;
5350
5351 if (cur == NULL)
5352 return(NULL);
5353
5354 ctxt = htmlCreateDocParserCtxt(cur, NULL, encoding);
5355 if (ctxt == NULL)
5356 return(NULL);
5357
5358 if (sax != NULL) {
5359 *ctxt->sax = *sax;
5360 ctxt->userData = userData;
5361 }
5362
5363 htmlParseDocument(ctxt);
5364 ret = ctxt->myDoc;
5365 htmlFreeParserCtxt(ctxt);
5366
5367 return(ret);
5368 }
5369
5370 /**
5371 * htmlParseDoc:
5372 * @cur: a pointer to an array of xmlChar
5373 * @encoding: the encoding (optional)
5374 *
5375 * DEPRECATED: Use htmlReadDoc.
5376 *
5377 * Parse an HTML in-memory document and build a tree.
5378 *
5379 * This function uses deprecated global parser options.
5380 *
5381 * Returns the resulting document tree
5382 */
5383
5384 htmlDocPtr
htmlParseDoc(const xmlChar * cur,const char * encoding)5385 htmlParseDoc(const xmlChar *cur, const char *encoding) {
5386 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5387 }
5388
5389
5390 /**
5391 * htmlCreateFileParserCtxt:
5392 * @filename: the filename
5393 * @encoding: optional encoding
5394 *
5395 * DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadFile.
5396 *
5397 * Create a parser context to read from a file.
5398 *
5399 * A non-NULL encoding overrides encoding declarations in the document.
5400 *
5401 * Automatic support for ZLIB/Compress compressed document is provided
5402 * by default if found at compile-time.
5403 *
5404 * Returns the new parser context or NULL if a memory allocation failed.
5405 */
5406 htmlParserCtxtPtr
htmlCreateFileParserCtxt(const char * filename,const char * encoding)5407 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5408 {
5409 htmlParserCtxtPtr ctxt;
5410 htmlParserInputPtr input;
5411
5412 if (filename == NULL)
5413 return(NULL);
5414
5415 ctxt = htmlNewParserCtxt();
5416 if (ctxt == NULL) {
5417 return(NULL);
5418 }
5419
5420 input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0);
5421 if (input == NULL) {
5422 xmlFreeParserCtxt(ctxt);
5423 return(NULL);
5424 }
5425 if (xmlCtxtPushInput(ctxt, input) < 0) {
5426 xmlFreeInputStream(input);
5427 xmlFreeParserCtxt(ctxt);
5428 return(NULL);
5429 }
5430
5431 return(ctxt);
5432 }
5433
5434 /**
5435 * htmlSAXParseFile:
5436 * @filename: the filename
5437 * @encoding: encoding (optional)
5438 * @sax: the SAX handler block
5439 * @userData: if using SAX, this pointer will be provided on callbacks.
5440 *
5441 * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadFile.
5442 *
5443 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5444 * compressed document is provided by default if found at compile-time.
5445 * It use the given SAX function block to handle the parsing callback.
5446 * If sax is NULL, fallback to the default DOM tree building routines.
5447 *
5448 * Returns the resulting document tree unless SAX is NULL or the document is
5449 * not well formed.
5450 */
5451
5452 htmlDocPtr
htmlSAXParseFile(const char * filename,const char * encoding,htmlSAXHandlerPtr sax,void * userData)5453 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5454 void *userData) {
5455 htmlDocPtr ret;
5456 htmlParserCtxtPtr ctxt;
5457 htmlSAXHandlerPtr oldsax = NULL;
5458
5459 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5460 if (ctxt == NULL) return(NULL);
5461 if (sax != NULL) {
5462 oldsax = ctxt->sax;
5463 ctxt->sax = sax;
5464 ctxt->userData = userData;
5465 }
5466
5467 htmlParseDocument(ctxt);
5468
5469 ret = ctxt->myDoc;
5470 if (sax != NULL) {
5471 ctxt->sax = oldsax;
5472 ctxt->userData = NULL;
5473 }
5474 htmlFreeParserCtxt(ctxt);
5475
5476 return(ret);
5477 }
5478
5479 /**
5480 * htmlParseFile:
5481 * @filename: the filename
5482 * @encoding: encoding (optional)
5483 *
5484 * Parse an HTML file and build a tree.
5485 *
5486 * Returns the resulting document tree
5487 */
5488
5489 htmlDocPtr
htmlParseFile(const char * filename,const char * encoding)5490 htmlParseFile(const char *filename, const char *encoding) {
5491 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5492 }
5493
5494 /**
5495 * htmlHandleOmittedElem:
5496 * @val: int 0 or 1
5497 *
5498 * DEPRECATED: Use HTML_PARSE_NOIMPLIED
5499 *
5500 * Set and return the previous value for handling HTML omitted tags.
5501 *
5502 * Returns the last value for 0 for no handling, 1 for auto insertion.
5503 */
5504
5505 int
htmlHandleOmittedElem(int val)5506 htmlHandleOmittedElem(int val) {
5507 int old = htmlOmittedDefaultValue;
5508
5509 htmlOmittedDefaultValue = val;
5510 return(old);
5511 }
5512
5513 /**
5514 * htmlElementAllowedHere:
5515 * @parent: HTML parent element
5516 * @elt: HTML element
5517 *
5518 * DEPRECATED: Don't use.
5519 *
5520 * Returns 1
5521 */
5522 int
htmlElementAllowedHere(const htmlElemDesc * parent ATTRIBUTE_UNUSED,const xmlChar * elt ATTRIBUTE_UNUSED)5523 htmlElementAllowedHere(const htmlElemDesc* parent ATTRIBUTE_UNUSED,
5524 const xmlChar* elt ATTRIBUTE_UNUSED) {
5525 return(1);
5526 }
5527
5528 /**
5529 * htmlElementStatusHere:
5530 * @parent: HTML parent element
5531 * @elt: HTML element
5532 *
5533 * DEPRECATED: Don't use.
5534 *
5535 * Returns HTML_VALID
5536 */
5537 htmlStatus
htmlElementStatusHere(const htmlElemDesc * parent ATTRIBUTE_UNUSED,const htmlElemDesc * elt ATTRIBUTE_UNUSED)5538 htmlElementStatusHere(const htmlElemDesc* parent ATTRIBUTE_UNUSED,
5539 const htmlElemDesc* elt ATTRIBUTE_UNUSED) {
5540 return(HTML_VALID);
5541 }
5542
5543 /**
5544 * htmlAttrAllowed:
5545 * @elt: HTML element
5546 * @attr: HTML attribute
5547 * @legacy: whether to allow deprecated attributes
5548 *
5549 * DEPRECATED: Don't use.
5550 *
5551 * Returns HTML_VALID
5552 */
5553 htmlStatus
htmlAttrAllowed(const htmlElemDesc * elt ATTRIBUTE_UNUSED,const xmlChar * attr ATTRIBUTE_UNUSED,int legacy ATTRIBUTE_UNUSED)5554 htmlAttrAllowed(const htmlElemDesc* elt ATTRIBUTE_UNUSED,
5555 const xmlChar* attr ATTRIBUTE_UNUSED,
5556 int legacy ATTRIBUTE_UNUSED) {
5557 return(HTML_VALID);
5558 }
5559
5560 /**
5561 * htmlNodeStatus:
5562 * @node: an htmlNodePtr in a tree
5563 * @legacy: whether to allow deprecated elements (YES is faster here
5564 * for Element nodes)
5565 *
5566 * DEPRECATED: Don't use.
5567 *
5568 * Returns HTML_VALID
5569 */
5570 htmlStatus
htmlNodeStatus(htmlNodePtr node ATTRIBUTE_UNUSED,int legacy ATTRIBUTE_UNUSED)5571 htmlNodeStatus(htmlNodePtr node ATTRIBUTE_UNUSED,
5572 int legacy ATTRIBUTE_UNUSED) {
5573 return(HTML_VALID);
5574 }
5575
5576 /************************************************************************
5577 * *
5578 * New set (2.6.0) of simpler and more flexible APIs *
5579 * *
5580 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A NULL @str is ignored; if "dict" is NULL the string
 * is always freed. A variable named "dict" must be visible where the
 * macro is used.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves
 * like a single statement and can be used safely in an if/else without
 * braces. Invocations must be followed by a semicolon.
 */
#define DICT_FREE(str)                                              \
    do {                                                            \
        if ((str) && ((!dict) ||                                    \
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))      \
            xmlFree((char *)(str));                                 \
    } while (0)
5592
5593 /**
5594 * htmlCtxtReset:
5595 * @ctxt: an HTML parser context
5596 *
5597 * Reset a parser context
5598 */
5599 void
htmlCtxtReset(htmlParserCtxtPtr ctxt)5600 htmlCtxtReset(htmlParserCtxtPtr ctxt)
5601 {
5602 xmlParserInputPtr input;
5603 xmlDictPtr dict;
5604
5605 if (ctxt == NULL)
5606 return;
5607
5608 dict = ctxt->dict;
5609
5610 while ((input = xmlCtxtPopInput(ctxt)) != NULL) { /* Non consuming */
5611 xmlFreeInputStream(input);
5612 }
5613 ctxt->inputNr = 0;
5614 ctxt->input = NULL;
5615
5616 ctxt->spaceNr = 0;
5617 if (ctxt->spaceTab != NULL) {
5618 ctxt->spaceTab[0] = -1;
5619 ctxt->space = &ctxt->spaceTab[0];
5620 } else {
5621 ctxt->space = NULL;
5622 }
5623
5624
5625 ctxt->nodeNr = 0;
5626 ctxt->node = NULL;
5627
5628 ctxt->nameNr = 0;
5629 ctxt->name = NULL;
5630
5631 ctxt->nsNr = 0;
5632
5633 DICT_FREE(ctxt->version);
5634 ctxt->version = NULL;
5635 DICT_FREE(ctxt->encoding);
5636 ctxt->encoding = NULL;
5637 DICT_FREE(ctxt->extSubURI);
5638 ctxt->extSubURI = NULL;
5639 DICT_FREE(ctxt->extSubSystem);
5640 ctxt->extSubSystem = NULL;
5641
5642 if (ctxt->directory != NULL) {
5643 xmlFree(ctxt->directory);
5644 ctxt->directory = NULL;
5645 }
5646
5647 if (ctxt->myDoc != NULL)
5648 xmlFreeDoc(ctxt->myDoc);
5649 ctxt->myDoc = NULL;
5650
5651 ctxt->standalone = -1;
5652 ctxt->hasExternalSubset = 0;
5653 ctxt->hasPErefs = 0;
5654 ctxt->html = 1;
5655 ctxt->instate = XML_PARSER_START;
5656
5657 ctxt->wellFormed = 1;
5658 ctxt->nsWellFormed = 1;
5659 ctxt->disableSAX = 0;
5660 ctxt->valid = 1;
5661 ctxt->vctxt.userData = ctxt;
5662 ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
5663 ctxt->vctxt.error = xmlParserValidityError;
5664 ctxt->vctxt.warning = xmlParserValidityWarning;
5665 ctxt->record_info = 0;
5666 ctxt->checkIndex = 0;
5667 ctxt->endCheckState = 0;
5668 ctxt->inSubset = 0;
5669 ctxt->errNo = XML_ERR_OK;
5670 ctxt->depth = 0;
5671 ctxt->catalogs = NULL;
5672 xmlInitNodeInfoSeq(&ctxt->node_seq);
5673
5674 if (ctxt->attsDefault != NULL) {
5675 xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
5676 ctxt->attsDefault = NULL;
5677 }
5678 if (ctxt->attsSpecial != NULL) {
5679 xmlHashFree(ctxt->attsSpecial, NULL);
5680 ctxt->attsSpecial = NULL;
5681 }
5682
5683 ctxt->nbErrors = 0;
5684 ctxt->nbWarnings = 0;
5685 if (ctxt->lastError.code != XML_ERR_OK)
5686 xmlResetError(&ctxt->lastError);
5687 }
5688
5689 static int
htmlCtxtSetOptionsInternal(xmlParserCtxtPtr ctxt,int options,int keepMask)5690 htmlCtxtSetOptionsInternal(xmlParserCtxtPtr ctxt, int options, int keepMask)
5691 {
5692 int allMask;
5693
5694 if (ctxt == NULL)
5695 return(-1);
5696
5697 allMask = HTML_PARSE_RECOVER |
5698 HTML_PARSE_HTML5 |
5699 HTML_PARSE_NODEFDTD |
5700 HTML_PARSE_NOERROR |
5701 HTML_PARSE_NOWARNING |
5702 HTML_PARSE_PEDANTIC |
5703 HTML_PARSE_NOBLANKS |
5704 HTML_PARSE_NONET |
5705 HTML_PARSE_NOIMPLIED |
5706 HTML_PARSE_COMPACT |
5707 HTML_PARSE_HUGE |
5708 HTML_PARSE_IGNORE_ENC |
5709 HTML_PARSE_BIG_LINES;
5710
5711 ctxt->options = (ctxt->options & keepMask) | (options & allMask);
5712
5713 /*
5714 * For some options, struct members are historically the source
5715 * of truth. See xmlCtxtSetOptionsInternal.
5716 */
5717 ctxt->keepBlanks = (options & HTML_PARSE_NOBLANKS) ? 0 : 1;
5718
5719 /*
5720 * Changing SAX callbacks is a bad idea. This should be fixed.
5721 */
5722 if (options & HTML_PARSE_NOBLANKS) {
5723 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5724 }
5725 if (options & HTML_PARSE_HUGE) {
5726 if (ctxt->dict != NULL)
5727 xmlDictSetLimit(ctxt->dict, 0);
5728 }
5729
5730 /*
5731 * It would be useful to allow this feature.
5732 */
5733 ctxt->dictNames = 0;
5734
5735 ctxt->linenumbers = 1;
5736
5737 return(options & ~allMask);
5738 }
5739
5740 /**
5741 * htmlCtxtSetOptions:
5742 * @ctxt: an HTML parser context
5743 * @options: a bitmask of xmlParserOption values
5744 *
5745 * Applies the options to the parser context. Unset options are
5746 * cleared.
5747 *
5748 * Available since 2.14.0. With older versions, you can use
5749 * htmlCtxtUseOptions.
5750 *
5751 * HTML_PARSE_RECOVER
5752 *
5753 * No effect as of 2.14.0.
5754 *
5755 * HTML_PARSE_HTML5
5756 *
5757 * Make the tokenizer emit a SAX callback for each token. This results
5758 * in unbalanced invocations of startElement and endElement.
5759 *
5760 * For now, this is only usable with custom SAX callbacks.
5761 *
5762 * HTML_PARSE_NODEFDTD
5763 *
5764 * Do not default to a doctype if none was found.
5765 *
5766 * HTML_PARSE_NOERROR
5767 *
5768 * Disable error and warning reports to the error handlers.
5769 * Errors are still accessible with xmlCtxtGetLastError.
5770 *
5771 * HTML_PARSE_NOWARNING
5772 *
5773 * Disable warning reports.
5774 *
5775 * HTML_PARSE_PEDANTIC
5776 *
5777 * No effect.
5778 *
5779 * HTML_PARSE_NOBLANKS
5780 *
5781 * Remove some text nodes containing only whitespace from the
5782 * result document. Which nodes are removed depends on a conservative
5783 * heuristic. The reindenting feature of the serialization code relies
5784 * on this option to be set when parsing. Use of this option is
5785 * DISCOURAGED.
5786 *
5787 * HTML_PARSE_NONET
5788 *
5789 * No effect.
5790 *
5791 * HTML_PARSE_NOIMPLIED
5792 *
5793 * Do not add implied html, head or body elements.
5794 *
5795 * HTML_PARSE_COMPACT
5796 *
5797 * Store small strings directly in the node struct to save
5798 * memory.
5799 *
5800 * HTML_PARSE_HUGE
5801 *
5802 * Relax some internal limits.
5803 *
5804 * Available since 2.14.0. Use XML_PARSE_HUGE works with older
5805 * versions.
5806 *
5807 * Maximum size of text nodes, tags, comments, CDATA sections
5808 *
5809 * normal: 10M
5810 * huge: 1B
5811 *
5812 * Maximum size of names, system literals, pubid literals
5813 *
5814 * normal: 50K
5815 * huge: 10M
5816 *
5817 * Maximum nesting depth of elements
5818 *
5819 * normal: 256
5820 * huge: 2048
5821 *
5822 * HTML_PARSE_IGNORE_ENC
5823 *
5824 * Ignore the encoding in the HTML declaration. This option is
5825 * mostly unneeded these days. The only effect is to enforce
5826 * UTF-8 decoding of ASCII-like data.
5827 *
5828 * HTML_PARSE_BIG_LINES
5829 *
5830 * Enable reporting of line numbers larger than 65535.
5831 *
5832 * Available since 2.14.0.
5833 *
5834 * Returns 0 in case of success, the set of unknown or unimplemented options
5835 * in case of error.
5836 */
5837 int
htmlCtxtSetOptions(xmlParserCtxtPtr ctxt,int options)5838 htmlCtxtSetOptions(xmlParserCtxtPtr ctxt, int options)
5839 {
5840 return(htmlCtxtSetOptionsInternal(ctxt, options, 0));
5841 }
5842
5843 /**
5844 * htmlCtxtUseOptions:
5845 * @ctxt: an HTML parser context
5846 * @options: a combination of htmlParserOption(s)
5847 *
5848 * DEPRECATED: Use htmlCtxtSetOptions.
5849 *
5850 * Applies the options to the parser context. The following options
5851 * are never cleared and can only be enabled:
5852 *
5853 * HTML_PARSE_NODEFDTD
5854 * HTML_PARSE_NOERROR
5855 * HTML_PARSE_NOWARNING
5856 * HTML_PARSE_NOIMPLIED
5857 * HTML_PARSE_COMPACT
5858 * HTML_PARSE_HUGE
5859 * HTML_PARSE_IGNORE_ENC
5860 * HTML_PARSE_BIG_LINES
5861 *
5862 * Returns 0 in case of success, the set of unknown or unimplemented options
5863 * in case of error.
5864 */
5865 int
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt,int options)5866 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5867 {
5868 int keepMask;
5869
5870 /*
5871 * For historic reasons, some options can only be enabled.
5872 */
5873 keepMask = HTML_PARSE_NODEFDTD |
5874 HTML_PARSE_NOERROR |
5875 HTML_PARSE_NOWARNING |
5876 HTML_PARSE_NOIMPLIED |
5877 HTML_PARSE_COMPACT |
5878 HTML_PARSE_HUGE |
5879 HTML_PARSE_IGNORE_ENC |
5880 HTML_PARSE_BIG_LINES;
5881
5882 return(htmlCtxtSetOptionsInternal(ctxt, options, keepMask));
5883 }
5884
5885 /**
5886 * htmlCtxtParseDocument:
5887 * @ctxt: an HTML parser context
5888 * @input: parser input
5889 *
5890 * Parse an HTML document and return the resulting document tree.
5891 *
5892 * Available since 2.13.0.
5893 *
5894 * Returns the resulting document tree or NULL
5895 */
5896 htmlDocPtr
htmlCtxtParseDocument(htmlParserCtxtPtr ctxt,xmlParserInputPtr input)5897 htmlCtxtParseDocument(htmlParserCtxtPtr ctxt, xmlParserInputPtr input)
5898 {
5899 htmlDocPtr ret;
5900
5901 if ((ctxt == NULL) || (input == NULL)) {
5902 xmlFatalErr(ctxt, XML_ERR_ARGUMENT, NULL);
5903 xmlFreeInputStream(input);
5904 return(NULL);
5905 }
5906
5907 /* assert(ctxt->inputNr == 0); */
5908 while (ctxt->inputNr > 0)
5909 xmlFreeInputStream(xmlCtxtPopInput(ctxt));
5910
5911 if (xmlCtxtPushInput(ctxt, input) < 0) {
5912 xmlFreeInputStream(input);
5913 return(NULL);
5914 }
5915
5916 ctxt->html = 1;
5917 htmlParseDocument(ctxt);
5918
5919 if (ctxt->errNo != XML_ERR_NO_MEMORY) {
5920 ret = ctxt->myDoc;
5921 } else {
5922 ret = NULL;
5923 xmlFreeDoc(ctxt->myDoc);
5924 }
5925 ctxt->myDoc = NULL;
5926
5927 /* assert(ctxt->inputNr == 1); */
5928 while (ctxt->inputNr > 0)
5929 xmlFreeInputStream(xmlCtxtPopInput(ctxt));
5930
5931 return(ret);
5932 }
5933
5934 /**
5935 * htmlReadDoc:
5936 * @str: a pointer to a zero terminated string
5937 * @url: only used for error reporting (optoinal)
5938 * @encoding: the document encoding (optional)
5939 * @options: a combination of htmlParserOptions
5940 *
5941 * Convenience function to parse an HTML document from a zero-terminated
5942 * string.
5943 *
5944 * See htmlCtxtReadDoc for details.
5945 *
5946 * Returns the resulting document tree.
5947 */
5948 htmlDocPtr
htmlReadDoc(const xmlChar * str,const char * url,const char * encoding,int options)5949 htmlReadDoc(const xmlChar *str, const char *url, const char *encoding,
5950 int options)
5951 {
5952 htmlParserCtxtPtr ctxt;
5953 xmlParserInputPtr input;
5954 htmlDocPtr doc = NULL;
5955
5956 ctxt = htmlNewParserCtxt();
5957 if (ctxt == NULL)
5958 return(NULL);
5959
5960 htmlCtxtUseOptions(ctxt, options);
5961
5962 input = xmlCtxtNewInputFromString(ctxt, url, (const char *) str, encoding,
5963 XML_INPUT_BUF_STATIC);
5964
5965 if (input != NULL)
5966 doc = htmlCtxtParseDocument(ctxt, input);
5967
5968 htmlFreeParserCtxt(ctxt);
5969 return(doc);
5970 }
5971
5972 /**
5973 * htmlReadFile:
5974 * @filename: a file or URL
5975 * @encoding: the document encoding (optional)
5976 * @options: a combination of htmlParserOptions
5977 *
5978 * Convenience function to parse an HTML file from the filesystem,
5979 * the network or a global user-defined resource loader.
5980 *
5981 * See htmlCtxtReadFile for details.
5982 *
5983 * Returns the resulting document tree.
5984 */
5985 htmlDocPtr
htmlReadFile(const char * filename,const char * encoding,int options)5986 htmlReadFile(const char *filename, const char *encoding, int options)
5987 {
5988 htmlParserCtxtPtr ctxt;
5989 xmlParserInputPtr input;
5990 htmlDocPtr doc = NULL;
5991
5992 ctxt = htmlNewParserCtxt();
5993 if (ctxt == NULL)
5994 return(NULL);
5995
5996 htmlCtxtUseOptions(ctxt, options);
5997
5998 input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0);
5999
6000 if (input != NULL)
6001 doc = htmlCtxtParseDocument(ctxt, input);
6002
6003 htmlFreeParserCtxt(ctxt);
6004 return(doc);
6005 }
6006
6007 /**
6008 * htmlReadMemory:
6009 * @buffer: a pointer to a char array
6010 * @size: the size of the array
6011 * @url: only used for error reporting (optional)
6012 * @encoding: the document encoding, or NULL
6013 * @options: a combination of htmlParserOption(s)
6014 *
6015 * Convenience function to parse an HTML document from memory.
6016 * The input buffer must not contain any terminating null bytes.
6017 *
6018 * See htmlCtxtReadMemory for details.
6019 *
6020 * Returns the resulting document tree
6021 */
6022 htmlDocPtr
htmlReadMemory(const char * buffer,int size,const char * url,const char * encoding,int options)6023 htmlReadMemory(const char *buffer, int size, const char *url,
6024 const char *encoding, int options)
6025 {
6026 htmlParserCtxtPtr ctxt;
6027 xmlParserInputPtr input;
6028 htmlDocPtr doc = NULL;
6029
6030 if (size < 0)
6031 return(NULL);
6032
6033 ctxt = htmlNewParserCtxt();
6034 if (ctxt == NULL)
6035 return(NULL);
6036
6037 htmlCtxtUseOptions(ctxt, options);
6038
6039 input = xmlCtxtNewInputFromMemory(ctxt, url, buffer, size, encoding,
6040 XML_INPUT_BUF_STATIC);
6041
6042 if (input != NULL)
6043 doc = htmlCtxtParseDocument(ctxt, input);
6044
6045 htmlFreeParserCtxt(ctxt);
6046 return(doc);
6047 }
6048
6049 /**
6050 * htmlReadFd:
6051 * @fd: an open file descriptor
6052 * @url: only used for error reporting (optional)
6053 * @encoding: the document encoding, or NULL
6054 * @options: a combination of htmlParserOptions
6055 *
6056 * Convenience function to parse an HTML document from a
6057 * file descriptor.
6058 *
6059 * NOTE that the file descriptor will not be closed when the
6060 * context is freed or reset.
6061 *
6062 * See htmlCtxtReadFd for details.
6063 *
6064 * Returns the resulting document tree
6065 */
6066 htmlDocPtr
htmlReadFd(int fd,const char * url,const char * encoding,int options)6067 htmlReadFd(int fd, const char *url, const char *encoding, int options)
6068 {
6069 htmlParserCtxtPtr ctxt;
6070 xmlParserInputPtr input;
6071 htmlDocPtr doc = NULL;
6072
6073 ctxt = htmlNewParserCtxt();
6074 if (ctxt == NULL)
6075 return(NULL);
6076
6077 htmlCtxtUseOptions(ctxt, options);
6078
6079 input = xmlCtxtNewInputFromFd(ctxt, url, fd, encoding, 0);
6080
6081 if (input != NULL)
6082 doc = htmlCtxtParseDocument(ctxt, input);
6083
6084 htmlFreeParserCtxt(ctxt);
6085 return(doc);
6086 }
6087
6088 /**
6089 * htmlReadIO:
6090 * @ioread: an I/O read function
6091 * @ioclose: an I/O close function (optional)
6092 * @ioctx: an I/O handler
6093 * @url: only used for error reporting (optional)
6094 * @encoding: the document encoding (optional)
6095 * @options: a combination of htmlParserOption(s)
6096 *
6097 * Convenience function to parse an HTML document from I/O functions
6098 * and context.
6099 *
6100 * See htmlCtxtReadIO for details.
6101 *
6102 * Returns the resulting document tree
6103 */
6104 htmlDocPtr
htmlReadIO(xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * url,const char * encoding,int options)6105 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6106 void *ioctx, const char *url, const char *encoding, int options)
6107 {
6108 htmlParserCtxtPtr ctxt;
6109 xmlParserInputPtr input;
6110 htmlDocPtr doc = NULL;
6111
6112 ctxt = htmlNewParserCtxt();
6113 if (ctxt == NULL)
6114 return (NULL);
6115
6116 htmlCtxtUseOptions(ctxt, options);
6117
6118 input = xmlCtxtNewInputFromIO(ctxt, url, ioread, ioclose, ioctx,
6119 encoding, 0);
6120
6121 if (input != NULL)
6122 doc = htmlCtxtParseDocument(ctxt, input);
6123
6124 htmlFreeParserCtxt(ctxt);
6125 return(doc);
6126 }
6127
6128 /**
6129 * htmlCtxtReadDoc:
6130 * @ctxt: an HTML parser context
6131 * @str: a pointer to a zero terminated string
6132 * @URL: only used for error reporting (optional)
6133 * @encoding: the document encoding (optional)
6134 * @options: a combination of htmlParserOptions
6135 *
6136 * Parse an HTML in-memory document and build a tree.
6137 *
6138 * See htmlCtxtUseOptions for details.
6139 *
6140 * Returns the resulting document tree
6141 */
6142 htmlDocPtr
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt,const xmlChar * str,const char * URL,const char * encoding,int options)6143 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar *str,
6144 const char *URL, const char *encoding, int options)
6145 {
6146 xmlParserInputPtr input;
6147
6148 if (ctxt == NULL)
6149 return (NULL);
6150
6151 htmlCtxtReset(ctxt);
6152 htmlCtxtUseOptions(ctxt, options);
6153
6154 input = xmlCtxtNewInputFromString(ctxt, URL, (const char *) str,
6155 encoding, 0);
6156 if (input == NULL)
6157 return(NULL);
6158
6159 return(htmlCtxtParseDocument(ctxt, input));
6160 }
6161
6162 /**
6163 * htmlCtxtReadFile:
6164 * @ctxt: an HTML parser context
6165 * @filename: a file or URL
6166 * @encoding: the document encoding (optional)
6167 * @options: a combination of htmlParserOptions
6168 *
6169 * Parse an HTML file from the filesystem, the network or a
6170 * user-defined resource loader.
6171 *
6172 * See htmlCtxtUseOptions for details.
6173 *
6174 * Returns the resulting document tree
6175 */
6176 htmlDocPtr
htmlCtxtReadFile(htmlParserCtxtPtr ctxt,const char * filename,const char * encoding,int options)6177 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6178 const char *encoding, int options)
6179 {
6180 xmlParserInputPtr input;
6181
6182 if (ctxt == NULL)
6183 return (NULL);
6184
6185 htmlCtxtReset(ctxt);
6186 htmlCtxtUseOptions(ctxt, options);
6187
6188 input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0);
6189 if (input == NULL)
6190 return(NULL);
6191
6192 return(htmlCtxtParseDocument(ctxt, input));
6193 }
6194
6195 /**
6196 * htmlCtxtReadMemory:
6197 * @ctxt: an HTML parser context
6198 * @buffer: a pointer to a char array
6199 * @size: the size of the array
6200 * @URL: only used for error reporting (optional)
6201 * @encoding: the document encoding (optinal)
6202 * @options: a combination of htmlParserOptions
6203 *
6204 * Parse an HTML in-memory document and build a tree. The input buffer must
6205 * not contain any terminating null bytes.
6206 *
6207 * See htmlCtxtUseOptions for details.
6208 *
6209 * Returns the resulting document tree
6210 */
6211 htmlDocPtr
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt,const char * buffer,int size,const char * URL,const char * encoding,int options)6212 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6213 const char *URL, const char *encoding, int options)
6214 {
6215 xmlParserInputPtr input;
6216
6217 if ((ctxt == NULL) || (size < 0))
6218 return (NULL);
6219
6220 htmlCtxtReset(ctxt);
6221 htmlCtxtUseOptions(ctxt, options);
6222
6223 input = xmlCtxtNewInputFromMemory(ctxt, URL, buffer, size, encoding,
6224 XML_INPUT_BUF_STATIC);
6225 if (input == NULL)
6226 return(NULL);
6227
6228 return(htmlCtxtParseDocument(ctxt, input));
6229 }
6230
6231 /**
6232 * htmlCtxtReadFd:
6233 * @ctxt: an HTML parser context
6234 * @fd: an open file descriptor
6235 * @URL: only used for error reporting (optional)
6236 * @encoding: the document encoding (optinal)
6237 * @options: a combination of htmlParserOptions
6238 *
6239 * Parse an HTML from a file descriptor and build a tree.
6240 *
6241 * See htmlCtxtUseOptions for details.
6242 *
6243 * NOTE that the file descriptor will not be closed when the
6244 * context is freed or reset.
6245 *
6246 * Returns the resulting document tree
6247 */
6248 htmlDocPtr
htmlCtxtReadFd(htmlParserCtxtPtr ctxt,int fd,const char * URL,const char * encoding,int options)6249 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6250 const char *URL, const char *encoding, int options)
6251 {
6252 xmlParserInputPtr input;
6253
6254 if (ctxt == NULL)
6255 return(NULL);
6256
6257 htmlCtxtReset(ctxt);
6258 htmlCtxtUseOptions(ctxt, options);
6259
6260 input = xmlCtxtNewInputFromFd(ctxt, URL, fd, encoding, 0);
6261 if (input == NULL)
6262 return(NULL);
6263
6264 return(htmlCtxtParseDocument(ctxt, input));
6265 }
6266
6267 /**
6268 * htmlCtxtReadIO:
6269 * @ctxt: an HTML parser context
6270 * @ioread: an I/O read function
6271 * @ioclose: an I/O close function
6272 * @ioctx: an I/O handler
6273 * @URL: the base URL to use for the document
6274 * @encoding: the document encoding, or NULL
6275 * @options: a combination of htmlParserOption(s)
6276 *
6277 * Parse an HTML document from I/O functions and source and build a tree.
6278 *
6279 * See htmlCtxtUseOptions for details.
6280 *
6281 * Returns the resulting document tree
6282 */
6283 htmlDocPtr
htmlCtxtReadIO(htmlParserCtxtPtr ctxt,xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)6284 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6285 xmlInputCloseCallback ioclose, void *ioctx,
6286 const char *URL,
6287 const char *encoding, int options)
6288 {
6289 xmlParserInputPtr input;
6290
6291 if (ctxt == NULL)
6292 return (NULL);
6293
6294 htmlCtxtReset(ctxt);
6295 htmlCtxtUseOptions(ctxt, options);
6296
6297 input = xmlCtxtNewInputFromIO(ctxt, URL, ioread, ioclose, ioctx,
6298 encoding, 0);
6299 if (input == NULL)
6300 return(NULL);
6301
6302 return(htmlCtxtParseDocument(ctxt, input));
6303 }
6304
6305 #endif /* LIBXML_HTML_ENABLED */
6306