1 /*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9 #define IN_LIBXML
10 #include "libxml.h"
11 #ifdef LIBXML_HTML_ENABLED
12
13 #include <string.h>
14 #include <ctype.h>
15 #include <stdlib.h>
16
17 #include <libxml/HTMLparser.h>
18 #include <libxml/xmlmemory.h>
19 #include <libxml/tree.h>
20 #include <libxml/parser.h>
21 #include <libxml/parserInternals.h>
22 #include <libxml/xmlerror.h>
23 #include <libxml/HTMLtree.h>
24 #include <libxml/entities.h>
25 #include <libxml/encoding.h>
26 #include <libxml/xmlIO.h>
27 #include <libxml/uri.h>
28
29 #include "private/buf.h"
30 #include "private/enc.h"
31 #include "private/error.h"
32 #include "private/html.h"
33 #include "private/io.h"
34 #include "private/parser.h"
35 #include "private/tree.h"
36
37 #define HTML_MAX_NAMELEN 1000
38 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
39 #define HTML_PARSER_BUFFER_SIZE 100
40
41 static int htmlOmittedDefaultValue = 1;
42
43 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
44 xmlChar end, xmlChar end2, xmlChar end3);
45 static void htmlParseComment(htmlParserCtxtPtr ctxt);
46
47 /************************************************************************
48 * *
49 * Some factorized error routines *
50 * *
51 ************************************************************************/
52
53 /**
54 * htmlErrMemory:
55 * @ctxt: an HTML parser context
56 * @extra: extra information
57 *
58 * Handle a redefinition of attribute error
59 */
60 static void
htmlErrMemory(xmlParserCtxtPtr ctxt,const char * extra)61 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
62 {
63 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
64 (ctxt->instate == XML_PARSER_EOF))
65 return;
66 if (ctxt != NULL) {
67 ctxt->errNo = XML_ERR_NO_MEMORY;
68 ctxt->instate = XML_PARSER_EOF;
69 ctxt->disableSAX = 1;
70 }
71 if (extra)
72 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
73 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
74 NULL, NULL, 0, 0,
75 "Memory allocation failed : %s\n", extra);
76 else
77 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
78 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
79 NULL, NULL, 0, 0, "Memory allocation failed\n");
80 }
81
82 /**
83 * htmlParseErr:
84 * @ctxt: an HTML parser context
85 * @error: the error number
86 * @msg: the error message
87 * @str1: string infor
88 * @str2: string infor
89 *
90 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
91 */
92 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErr(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,const xmlChar * str1,const xmlChar * str2)93 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
94 const char *msg, const xmlChar *str1, const xmlChar *str2)
95 {
96 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
97 (ctxt->instate == XML_PARSER_EOF))
98 return;
99 if (ctxt != NULL)
100 ctxt->errNo = error;
101 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
102 XML_ERR_ERROR, NULL, 0,
103 (const char *) str1, (const char *) str2,
104 NULL, 0, 0,
105 msg, str1, str2);
106 if (ctxt != NULL)
107 ctxt->wellFormed = 0;
108 }
109
110 /**
111 * htmlParseErrInt:
112 * @ctxt: an HTML parser context
113 * @error: the error number
114 * @msg: the error message
115 * @val: integer info
116 *
117 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
118 */
119 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErrInt(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,int val)120 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
121 const char *msg, int val)
122 {
123 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
124 (ctxt->instate == XML_PARSER_EOF))
125 return;
126 if (ctxt != NULL)
127 ctxt->errNo = error;
128 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
129 XML_ERR_ERROR, NULL, 0, NULL, NULL,
130 NULL, val, 0, msg, val);
131 if (ctxt != NULL)
132 ctxt->wellFormed = 0;
133 }
134
135 /************************************************************************
136 * *
137 * Parser stacks related functions and macros *
138 * *
139 ************************************************************************/
140
141 /**
142 * htmlnamePush:
143 * @ctxt: an HTML parser context
144 * @value: the element name
145 *
146 * Pushes a new element name on top of the name stack
147 *
148 * Returns -1 in case of error, the index in the stack otherwise
149 */
150 static int
htmlnamePush(htmlParserCtxtPtr ctxt,const xmlChar * value)151 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
152 {
153 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
154 ctxt->html = 3;
155 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
156 ctxt->html = 10;
157 if (ctxt->nameNr >= ctxt->nameMax) {
158 size_t newSize = ctxt->nameMax * 2;
159 const xmlChar **tmp;
160
161 tmp = xmlRealloc((xmlChar **) ctxt->nameTab,
162 newSize * sizeof(ctxt->nameTab[0]));
163 if (tmp == NULL) {
164 htmlErrMemory(ctxt, NULL);
165 return (-1);
166 }
167 ctxt->nameTab = tmp;
168 ctxt->nameMax = newSize;
169 }
170 ctxt->nameTab[ctxt->nameNr] = value;
171 ctxt->name = value;
172 return (ctxt->nameNr++);
173 }
174 /**
175 * htmlnamePop:
176 * @ctxt: an HTML parser context
177 *
178 * Pops the top element name from the name stack
179 *
180 * Returns the name just removed
181 */
182 static const xmlChar *
htmlnamePop(htmlParserCtxtPtr ctxt)183 htmlnamePop(htmlParserCtxtPtr ctxt)
184 {
185 const xmlChar *ret;
186
187 if (ctxt->nameNr <= 0)
188 return (NULL);
189 ctxt->nameNr--;
190 if (ctxt->nameNr < 0)
191 return (NULL);
192 if (ctxt->nameNr > 0)
193 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
194 else
195 ctxt->name = NULL;
196 ret = ctxt->nameTab[ctxt->nameNr];
197 ctxt->nameTab[ctxt->nameNr] = NULL;
198 return (ret);
199 }
200
201 /**
202 * htmlNodeInfoPush:
203 * @ctxt: an HTML parser context
204 * @value: the node info
205 *
206 * Pushes a new element name on top of the node info stack
207 *
208 * Returns 0 in case of error, the index in the stack otherwise
209 */
210 static int
htmlNodeInfoPush(htmlParserCtxtPtr ctxt,htmlParserNodeInfo * value)211 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
212 {
213 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
214 if (ctxt->nodeInfoMax == 0)
215 ctxt->nodeInfoMax = 5;
216 ctxt->nodeInfoMax *= 2;
217 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
218 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
219 ctxt->nodeInfoMax *
220 sizeof(ctxt->nodeInfoTab[0]));
221 if (ctxt->nodeInfoTab == NULL) {
222 htmlErrMemory(ctxt, NULL);
223 return (0);
224 }
225 }
226 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
227 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
228 return (ctxt->nodeInfoNr++);
229 }
230
231 /**
232 * htmlNodeInfoPop:
233 * @ctxt: an HTML parser context
234 *
235 * Pops the top element name from the node info stack
236 *
237 * Returns 0 in case of error, the pointer to NodeInfo otherwise
238 */
239 static htmlParserNodeInfo *
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)240 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
241 {
242 if (ctxt->nodeInfoNr <= 0)
243 return (NULL);
244 ctxt->nodeInfoNr--;
245 if (ctxt->nodeInfoNr < 0)
246 return (NULL);
247 if (ctxt->nodeInfoNr > 0)
248 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
249 else
250 ctxt->nodeInfo = NULL;
251 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
252 }
253
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a parser context variable named
 * `ctxt` to be in scope where they are used.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use
 * them. They look at single xmlChar values and should only be used to
 * compare against ASCII, otherwise they would break on multi-byte input:
 *
 *   CUR_PTR     the current pointer to the xmlChar to be parsed.
 *   BASE_PTR    the base pointer of the current input.
 *   CUR         the current xmlChar value.
 *   RAW         -1 if ctxt->token is set, the current xmlChar otherwise.
 *   NXT(n)      the n'th next xmlChar.
 *   UPPER       the current xmlChar converted to uppercase.
 *   UPP(n)      the n'th next xmlChar converted to uppercase.
 *   SKIP(n)     skip n xmlChars and advance the column by n; lines are not
 *               counted, so it must only skip strings without newlines.
 *
 * Clean macros:
 *
 *   NEXT        skip to the next character using xmlNextChar().
 *   NEXTL(l)    skip the current character of l xmlChars long, updating
 *               line/column and clearing ctxt->token.
 *   CUR_CHAR(l) the current character from htmlCurrentChar(), its length
 *               is stored in l.
 *   COPY_BUF(l,b,i,v)  append character v of length l to buffer b at
 *               index i and advance i.
 *   SHRINK/GROW call xmlParserShrink()/xmlParserGrow() when the input
 *               window is large enough behind / too small ahead.
 *   SKIP_BLANKS skip blanks with htmlSkipBlankChars().
 *
 * Statement-like macros are wrapped in do { } while (0) so that they
 * behave as a single statement, including under an if/else.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) \
    (ctxt->input->cur += (val), ctxt->input->col += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

#define SHRINK \
    do { \
        if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
            (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
            xmlParserShrink(ctxt); \
    } while (0)

#define GROW \
    do { \
        if ((ctxt->progressive == 0) && \
            (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
            xmlParserGrow(ctxt); \
    } while (0)

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR (*ctxt->input->cur)
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do { \
    if (*(ctxt->input->cur) == '\n') { \
        ctxt->input->line++; ctxt->input->col = 1; \
    } else ctxt->input->col++; \
    ctxt->token = 0; ctxt->input->cur += (l); \
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))

#define COPY_BUF(l,b,i,v) \
    do { \
        if ((l) == 1) (b)[(i)++] = (v); \
        else (i) += xmlCopyChar((l), &(b)[(i)], (v)); \
    } while (0)
328
329 /**
330 * htmlFindEncoding:
331 * @the HTML parser context
332 *
333 * Ty to find and encoding in the current data available in the input
334 * buffer this is needed to try to switch to the proper encoding when
335 * one face a character error.
336 * That's an heuristic, since it's operating outside of parsing it could
337 * try to use a meta which had been commented out, that's the reason it
338 * should only be used in case of error, not as a default.
339 *
340 * Returns an encoding string or NULL if not found, the string need to
341 * be freed
342 */
343 static xmlChar *
htmlFindEncoding(xmlParserCtxtPtr ctxt)344 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
345 const xmlChar *start, *cur, *end;
346
347 if ((ctxt == NULL) || (ctxt->input == NULL) ||
348 (ctxt->input->flags & XML_INPUT_HAS_ENCODING))
349 return(NULL);
350 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
351 return(NULL);
352
353 start = ctxt->input->cur;
354 end = ctxt->input->end;
355 /* we also expect the input buffer to be zero terminated */
356 if (*end != 0)
357 return(NULL);
358
359 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
360 if (cur == NULL)
361 return(NULL);
362 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
363 if (cur == NULL)
364 return(NULL);
365 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
366 if (cur == NULL)
367 return(NULL);
368 cur += 8;
369 start = cur;
370 while (((*cur >= 'A') && (*cur <= 'Z')) ||
371 ((*cur >= 'a') && (*cur <= 'z')) ||
372 ((*cur >= '0') && (*cur <= '9')) ||
373 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
374 cur++;
375 if (cur == start)
376 return(NULL);
377 return(xmlStrndup(start, cur - start));
378 }
379
380 /**
381 * htmlCurrentChar:
382 * @ctxt: the HTML parser context
383 * @len: pointer to the length of the char read
384 *
385 * The current char value, if using UTF-8 this may actually span multiple
386 * bytes in the input buffer. Implement the end of line normalization:
387 * 2.11 End-of-Line Handling
388 * If the encoding is unspecified, in the case we find an ISO-Latin-1
389 * char, then the encoding converter is plugged in automatically.
390 *
391 * Returns the current char value and its length
392 */
393
394 static int
htmlCurrentChar(xmlParserCtxtPtr ctxt,int * len)395 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
396 const unsigned char *cur;
397 unsigned char c;
398 unsigned int val;
399
400 if (ctxt->instate == XML_PARSER_EOF)
401 return(0);
402
403 if (ctxt->token != 0) {
404 *len = 0;
405 return(ctxt->token);
406 }
407
408 if (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK) {
409 xmlParserGrow(ctxt);
410 if (ctxt->instate == XML_PARSER_EOF)
411 return(0);
412 }
413
414 if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) {
415 xmlChar * guess;
416 xmlCharEncodingHandlerPtr handler;
417
418 /*
419 * Assume it's a fixed length encoding (1) with
420 * a compatible encoding for the ASCII set, since
421 * HTML constructs only use < 128 chars
422 */
423 if (*ctxt->input->cur < 0x80) {
424 *len = 1;
425 if ((*ctxt->input->cur == 0) &&
426 (ctxt->input->cur < ctxt->input->end)) {
427 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
428 "Char 0x%X out of allowed range\n", 0);
429 return(' ');
430 }
431 return(*ctxt->input->cur);
432 }
433
434 /*
435 * Humm this is bad, do an automatic flow conversion
436 */
437 guess = htmlFindEncoding(ctxt);
438 if (guess == NULL) {
439 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
440 } else {
441 handler = xmlFindCharEncodingHandler((const char *) guess);
442 if (handler != NULL) {
443 /*
444 * Don't use UTF-8 encoder which isn't required and
445 * can produce invalid UTF-8.
446 */
447 if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
448 xmlSwitchToEncoding(ctxt, handler);
449 } else {
450 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
451 "Unsupported encoding %s", guess, NULL);
452 }
453 xmlFree(guess);
454 }
455 ctxt->input->flags |= XML_INPUT_HAS_ENCODING;
456 }
457
458 /*
459 * We are supposed to handle UTF8, check it's valid
460 * From rfc2044: encoding of the Unicode values on UTF-8:
461 *
462 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
463 * 0000 0000-0000 007F 0xxxxxxx
464 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
465 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
466 *
467 * Check for the 0x110000 limit too
468 */
469 cur = ctxt->input->cur;
470 c = *cur;
471 if (c & 0x80) {
472 size_t avail;
473
474 if ((c & 0x40) == 0)
475 goto encoding_error;
476
477 avail = ctxt->input->end - ctxt->input->cur;
478
479 if ((avail < 2) || ((cur[1] & 0xc0) != 0x80))
480 goto encoding_error;
481 if ((c & 0xe0) == 0xe0) {
482 if ((avail < 3) || ((cur[2] & 0xc0) != 0x80))
483 goto encoding_error;
484 if ((c & 0xf0) == 0xf0) {
485 if (((c & 0xf8) != 0xf0) ||
486 (avail < 4) || ((cur[3] & 0xc0) != 0x80))
487 goto encoding_error;
488 /* 4-byte code */
489 *len = 4;
490 val = (cur[0] & 0x7) << 18;
491 val |= (cur[1] & 0x3f) << 12;
492 val |= (cur[2] & 0x3f) << 6;
493 val |= cur[3] & 0x3f;
494 if (val < 0x10000)
495 goto encoding_error;
496 } else {
497 /* 3-byte code */
498 *len = 3;
499 val = (cur[0] & 0xf) << 12;
500 val |= (cur[1] & 0x3f) << 6;
501 val |= cur[2] & 0x3f;
502 if (val < 0x800)
503 goto encoding_error;
504 }
505 } else {
506 /* 2-byte code */
507 *len = 2;
508 val = (cur[0] & 0x1f) << 6;
509 val |= cur[1] & 0x3f;
510 if (val < 0x80)
511 goto encoding_error;
512 }
513 if (!IS_CHAR(val)) {
514 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
515 "Char 0x%X out of allowed range\n", val);
516 }
517 return(val);
518 } else {
519 if ((*ctxt->input->cur == 0) &&
520 (ctxt->input->cur < ctxt->input->end)) {
521 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
522 "Char 0x%X out of allowed range\n", 0);
523 *len = 1;
524 return(' ');
525 }
526 /* 1-byte code */
527 *len = 1;
528 return(*ctxt->input->cur);
529 }
530
531 encoding_error:
532 {
533 char buffer[150];
534
535 if (ctxt->input->end - ctxt->input->cur >= 4) {
536 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
537 ctxt->input->cur[0], ctxt->input->cur[1],
538 ctxt->input->cur[2], ctxt->input->cur[3]);
539 } else {
540 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
541 }
542 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
543 "Input is not proper UTF-8, indicate encoding !\n",
544 BAD_CAST buffer, NULL);
545 }
546
547 if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0)
548 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
549 *len = 1;
550 return(*ctxt->input->cur);
551 }
552
553 /**
554 * htmlSkipBlankChars:
555 * @ctxt: the HTML parser context
556 *
557 * skip all blanks character found at that point in the input streams.
558 *
559 * Returns the number of space chars skipped
560 */
561
562 static int
htmlSkipBlankChars(xmlParserCtxtPtr ctxt)563 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
564 int res = 0;
565
566 while (IS_BLANK_CH(*(ctxt->input->cur))) {
567 if (*(ctxt->input->cur) == '\n') {
568 ctxt->input->line++; ctxt->input->col = 1;
569 } else ctxt->input->col++;
570 ctxt->input->cur++;
571 if (*ctxt->input->cur == 0)
572 xmlParserGrow(ctxt);
573 if (res < INT_MAX)
574 res++;
575 }
576 return(res);
577 }
578
579
580
581 /************************************************************************
582 * *
583 * The list of HTML elements and their properties *
584 * *
585 ************************************************************************/
586
587 /*
588 * Start Tag: 1 means the start tag can be omitted
589 * End Tag: 1 means the end tag can be omitted
590 * 2 means it's forbidden (empty elements)
591 * 3 means the tag is stylistic and should be closed easily
592 * Depr: this element is deprecated
593 * DTD: 1 means that this element is valid only in the Loose DTD
594 * 2 means that this element is valid only in the Frameset DTD
595 *
 * Name, Start Tag, End Tag, Save End, Empty, Deprecated, DTD, inline,
 * Description, subElements, impliedsubelt, Attributes, userdata
598 */
599
/*
 * Definitions and a couple of vars for HTML Elements.
 *
 * Each list macro expands to a comma-separated run of element names and
 * each NB_xxx macro is the number of names in the matching list. The
 * NB_xxx sums are parenthesized so they can be used safely inside larger
 * expressions.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL-terminated lists of element names */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
633
634
/*
 * ... and for HTML Attributes.
 *
 * Each list macro expands to a comma-separated run of attribute names and
 * each NB_xxx macro is the number of names in the matching list.
 */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL-terminated lists of attribute names */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
654
655
656 /* Other declarations that should go inline ... */
657 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
658 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
659 "tabindex", "onfocus", "onblur", NULL } ;
660 static const char* const target_attr[] = { "target", NULL } ;
661 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
662 static const char* const alt_attr[] = { "alt", NULL } ;
663 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
664 static const char* const href_attrs[] = { "href", NULL } ;
665 static const char* const clear_attrs[] = { "clear", NULL } ;
666 static const char* const inline_p[] = { INLINE, "p", NULL } ;
667
668 static const char* const flow_param[] = { FLOW, "param", NULL } ;
669 static const char* const applet_attrs[] = { COREATTRS , "codebase",
670 "archive", "alt", "name", "height", "width", "align",
671 "hspace", "vspace", NULL } ;
672 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
673 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
674 static const char* const basefont_attrs[] =
675 { "id", "size", "color", "face", NULL } ;
676 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
677 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
678 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
679 static const char* const body_depr[] = { "background", "bgcolor", "text",
680 "link", "vlink", "alink", NULL } ;
681 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
682 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
683
684
685 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
686 static const char* const col_elt[] = { "col", NULL } ;
687 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
688 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
689 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
690 static const char* const compact_attr[] = { "compact", NULL } ;
691 static const char* const label_attr[] = { "label", NULL } ;
692 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
693 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
694 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
695 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
696 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
697 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
698 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
699 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
700 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
701 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
702 static const char* const version_attr[] = { "version", NULL } ;
703 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
704 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
705 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
706 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
707 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
708 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
709 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
710 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
711 static const char* const align_attr[] = { "align", NULL } ;
712 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
713 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
714 static const char* const name_attr[] = { "name", NULL } ;
715 static const char* const action_attr[] = { "action", NULL } ;
716 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
717 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
718 static const char* const content_attr[] = { "content", NULL } ;
719 static const char* const type_attr[] = { "type", NULL } ;
720 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
721 static const char* const object_contents[] = { FLOW, "param", NULL } ;
722 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
723 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
724 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
725 static const char* const option_elt[] = { "option", NULL } ;
726 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
727 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
728 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
729 static const char* const width_attr[] = { "width", NULL } ;
730 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
731 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
732 static const char* const language_attr[] = { "language", NULL } ;
733 static const char* const select_content[] = { "optgroup", "option", NULL } ;
734 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
735 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
736 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
737 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
738 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
739 static const char* const tr_elt[] = { "tr", NULL } ;
740 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
741 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
742 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
743 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
744 static const char* const tr_contents[] = { "th", "td", NULL } ;
745 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
746 static const char* const li_elt[] = { "li", NULL } ;
747 static const char* const ul_depr[] = { "type", "compact", NULL} ;
748 static const char* const dir_attr[] = { "dir", NULL} ;
749
750 #define DECL (const char**)
751
752 static const htmlElemDesc
753 html40ElementTable[] = {
754 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
755 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
756 },
757 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
758 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
759 },
760 { "acronym", 0, 0, 0, 0, 0, 0, 1, "",
761 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
762 },
763 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
764 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
765 },
766 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
767 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
768 },
769 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
770 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
771 },
772 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
773 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
774 },
775 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
776 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
777 },
778 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
779 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
780 },
781 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
782 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
783 },
784 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
785 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
786 },
787 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
788 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
789 },
790 { "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
791 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
792 },
793 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
794 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
795 },
796 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
797 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
798 },
799 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
800 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
801 },
802 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
803 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
804 },
805 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
806 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
807 },
808 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
809 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
810 },
811 { "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
812 EMPTY , NULL , DECL col_attrs , NULL, NULL
813 },
814 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
815 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
816 },
817 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
818 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
819 },
820 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
821 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
822 },
823 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
824 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
825 },
826 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
827 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
828 },
829 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
830 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
831 },
832 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
833 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
834 },
835 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
836 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
837 },
838 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
839 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
840 },
841 { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
842 EMPTY, NULL, DECL embed_attrs, NULL, NULL
843 },
844 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
845 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
846 },
847 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
848 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
849 },
850 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
851 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
852 },
853 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
854 EMPTY, NULL, NULL, DECL frame_attrs, NULL
855 },
856 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
857 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
858 },
859 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
860 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
861 },
862 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
863 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
864 },
865 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
866 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
867 },
868 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
869 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
870 },
871 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
872 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
873 },
874 { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
875 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
876 },
877 { "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
878 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
879 },
880 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
881 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
882 },
883 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
884 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
885 },
886 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
887 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
888 },
889 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
890 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
891 },
892 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
893 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
894 },
895 { "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
896 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
897 },
898 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
899 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
900 },
901 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
902 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
903 },
904 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
905 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
906 },
907 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
908 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
909 },
910 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
911 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
912 },
913 { "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
914 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
915 },
916 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
917 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
918 },
919 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
920 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
921 },
922 { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
923 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
924 },
925 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
926 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
927 },
928 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
929 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
930 },
931 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
932 DECL html_flow, "div", DECL html_attrs, NULL, NULL
933 },
934 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
935 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
936 },
937 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
938 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
939 },
940 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
941 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
942 },
943 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
944 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
945 },
946 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
947 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
948 },
949 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
950 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
951 },
952 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
953 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
954 },
955 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
956 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
957 },
958 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
959 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
960 },
961 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
962 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
963 },
964 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
965 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
966 },
967 { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
968 DECL select_content, NULL, DECL select_attrs, NULL, NULL
969 },
970 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
971 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
972 },
973 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
974 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
975 },
976 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
977 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
978 },
979 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
980 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
981 },
982 { "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
983 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
984 },
985 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
986 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
987 },
988 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
989 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
990 },
991 { "table", 0, 0, 0, 0, 0, 0, 0, "",
992 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
993 },
994 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
995 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
996 },
997 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
998 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
999 },
1000 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1001 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1002 },
1003 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
1004 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1005 },
1006 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
1007 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1008 },
1009 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
1010 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1011 },
1012 { "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
1013 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1014 },
1015 { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
1016 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1017 },
1018 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1019 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1020 },
1021 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
1022 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1023 },
1024 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
1025 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1026 },
1027 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1028 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1029 }
1030 };
1031
/*
 * One auto-close rule: oldTag names an open element, newTag names a
 * start tag that implicitly ends it.
 */
typedef struct {
    const char *oldTag;     /* name of the element that gets closed */
    const char *newTag;     /* name of the start tag that closes it */
} htmlStartCloseEntry;

/*
 * Start tags that imply the end of the current element.
 *
 * The table is searched with bsearch() through htmlCompareStartClose(),
 * so it has to stay sorted in strcmp() order, by oldTag first and by
 * newTag second.
 */
static const htmlStartCloseEntry htmlStartClose[] = {
    /* a */
    { "a", "a" }, { "a", "fieldset" }, { "a", "table" }, { "a", "td" },
    { "a", "th" },
    /* address */
    { "address", "dd" }, { "address", "dl" }, { "address", "dt" },
    { "address", "form" }, { "address", "li" }, { "address", "ul" },
    /* b, big */
    { "b", "center" }, { "b", "p" }, { "b", "td" }, { "b", "th" },
    { "big", "p" },
    /* caption */
    { "caption", "col" }, { "caption", "colgroup" }, { "caption", "tbody" },
    { "caption", "tfoot" }, { "caption", "thead" }, { "caption", "tr" },
    /* col */
    { "col", "col" }, { "col", "colgroup" }, { "col", "tbody" },
    { "col", "tfoot" }, { "col", "thead" }, { "col", "tr" },
    /* colgroup */
    { "colgroup", "colgroup" }, { "colgroup", "tbody" },
    { "colgroup", "tfoot" }, { "colgroup", "thead" }, { "colgroup", "tr" },
    /* dd, dir, dl, dt */
    { "dd", "dt" },
    { "dir", "dd" }, { "dir", "dl" }, { "dir", "dt" }, { "dir", "form" },
    { "dir", "ul" },
    { "dl", "form" }, { "dl", "li" },
    { "dt", "dd" }, { "dt", "dl" },
    /* font, form */
    { "font", "center" }, { "font", "td" }, { "font", "th" },
    { "form", "form" },
    /* h1 .. h6 */
    { "h1", "fieldset" }, { "h1", "form" }, { "h1", "li" }, { "h1", "p" },
    { "h1", "table" },
    { "h2", "fieldset" }, { "h2", "form" }, { "h2", "li" }, { "h2", "p" },
    { "h2", "table" },
    { "h3", "fieldset" }, { "h3", "form" }, { "h3", "li" }, { "h3", "p" },
    { "h3", "table" },
    { "h4", "fieldset" }, { "h4", "form" }, { "h4", "li" }, { "h4", "p" },
    { "h4", "table" },
    { "h5", "fieldset" }, { "h5", "form" }, { "h5", "li" }, { "h5", "p" },
    { "h5", "table" },
    { "h6", "fieldset" }, { "h6", "form" }, { "h6", "li" }, { "h6", "p" },
    { "h6", "table" },
    /* head */
    { "head", "a" }, { "head", "abbr" }, { "head", "acronym" },
    { "head", "address" }, { "head", "b" }, { "head", "bdo" },
    { "head", "big" }, { "head", "blockquote" }, { "head", "body" },
    { "head", "br" }, { "head", "center" }, { "head", "cite" },
    { "head", "code" }, { "head", "dd" }, { "head", "dfn" },
    { "head", "dir" }, { "head", "div" }, { "head", "dl" },
    { "head", "dt" }, { "head", "em" }, { "head", "fieldset" },
    { "head", "font" }, { "head", "form" }, { "head", "frameset" },
    { "head", "h1" }, { "head", "h2" }, { "head", "h3" },
    { "head", "h4" }, { "head", "h5" }, { "head", "h6" },
    { "head", "hr" }, { "head", "i" }, { "head", "iframe" },
    { "head", "img" }, { "head", "kbd" }, { "head", "li" },
    { "head", "listing" }, { "head", "map" }, { "head", "menu" },
    { "head", "ol" }, { "head", "p" }, { "head", "pre" },
    { "head", "q" }, { "head", "s" }, { "head", "samp" },
    { "head", "small" }, { "head", "span" }, { "head", "strike" },
    { "head", "strong" }, { "head", "sub" }, { "head", "sup" },
    { "head", "table" }, { "head", "tt" }, { "head", "u" },
    { "head", "ul" }, { "head", "var" }, { "head", "xmp" },
    /* hr, i */
    { "hr", "form" },
    { "i", "center" }, { "i", "p" }, { "i", "td" }, { "i", "th" },
    /* legend, li, link, listing */
    { "legend", "fieldset" },
    { "li", "li" },
    { "link", "body" }, { "link", "frameset" },
    { "listing", "dd" }, { "listing", "dl" }, { "listing", "dt" },
    { "listing", "fieldset" }, { "listing", "form" }, { "listing", "li" },
    { "listing", "table" }, { "listing", "ul" },
    /* menu */
    { "menu", "dd" }, { "menu", "dl" }, { "menu", "dt" }, { "menu", "form" },
    { "menu", "ul" },
    /* ol, option */
    { "ol", "form" },
    { "option", "optgroup" }, { "option", "option" },
    /* p */
    { "p", "address" }, { "p", "blockquote" }, { "p", "body" },
    { "p", "caption" }, { "p", "center" }, { "p", "col" },
    { "p", "colgroup" }, { "p", "dd" }, { "p", "dir" },
    { "p", "div" }, { "p", "dl" }, { "p", "dt" },
    { "p", "fieldset" }, { "p", "form" }, { "p", "frameset" },
    { "p", "h1" }, { "p", "h2" }, { "p", "h3" },
    { "p", "h4" }, { "p", "h5" }, { "p", "h6" },
    { "p", "head" }, { "p", "hr" }, { "p", "li" },
    { "p", "listing" }, { "p", "menu" }, { "p", "ol" },
    { "p", "p" }, { "p", "pre" }, { "p", "table" },
    { "p", "tbody" }, { "p", "td" }, { "p", "tfoot" },
    { "p", "th" }, { "p", "title" }, { "p", "tr" },
    { "p", "ul" }, { "p", "xmp" },
    /* pre */
    { "pre", "dd" }, { "pre", "dl" }, { "pre", "dt" }, { "pre", "fieldset" },
    { "pre", "form" }, { "pre", "li" }, { "pre", "table" }, { "pre", "ul" },
    /* s, script, small, span, strike, style */
    { "s", "p" },
    { "script", "noscript" },
    { "small", "p" },
    { "span", "td" }, { "span", "th" },
    { "strike", "p" },
    { "style", "body" }, { "style", "frameset" },
    /* tbody, td, tfoot, th, thead */
    { "tbody", "tbody" }, { "tbody", "tfoot" },
    { "td", "tbody" }, { "td", "td" }, { "td", "tfoot" }, { "td", "th" },
    { "td", "tr" },
    { "tfoot", "tbody" },
    { "th", "tbody" }, { "th", "td" }, { "th", "tfoot" }, { "th", "th" },
    { "th", "tr" },
    { "thead", "tbody" }, { "thead", "tfoot" },
    /* title, tr, tt */
    { "title", "body" }, { "title", "frameset" },
    { "tr", "tbody" }, { "tr", "tfoot" }, { "tr", "tr" },
    { "tt", "p" },
    /* u, ul */
    { "u", "p" }, { "u", "td" }, { "u", "th" },
    { "ul", "address" }, { "ul", "form" }, { "ul", "menu" }, { "ul", "pre" },
    /* xmp */
    { "xmp", "dd" }, { "xmp", "dl" }, { "xmp", "dt" }, { "xmp", "fieldset" },
    { "xmp", "form" }, { "xmp", "li" }, { "xmp", "table" }, { "xmp", "ul" }
};
1291
/*
 * NULL-terminated list of HTML elements which are supposed not to
 * have CDATA content; htmlCheckParagraph() implies a p element when
 * one of them is the current element.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1304
/*
 * The list of HTML attributes which are of content %Script;
 *
 * NOTE: htmlIsScriptAttribute() rejects any name that does not start
 *       with 'on' before consulting this list, so every entry added
 *       here must keep that prefix.
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard */
    "onkeypress", "onkeydown", "onkeyup",
    /* document and form */
    "onload", "onunload", "onfocus", "onblur",
    "onsubmit", "onreset", "onchange", "onselect"
};
1330
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 */

typedef struct {
    const char *name;       /* element name, NULL in the terminating entry */
    int priority;           /* end tag priority of that element */
} elementPriority;

/*
 * htmlGetEndPriority() scans this list front to back and stops at the
 * NULL-named entry, whose priority is what unlisted elements get.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
1358
1359 /************************************************************************
1360 * *
1361 * functions to handle HTML specific data *
1362 * *
1363 ************************************************************************/
1364
/**
 * htmlInitAutoClose:
 *
 * DEPRECATED: This is a no-op.
 *
 * The function takes no arguments, returns nothing and does no work.
 */
void
htmlInitAutoClose(void)
{
    /* intentionally empty */
    return;
}
1373
1374 static int
htmlCompareTags(const void * key,const void * member)1375 htmlCompareTags(const void *key, const void *member) {
1376 const xmlChar *tag = (const xmlChar *) key;
1377 const htmlElemDesc *desc = (const htmlElemDesc *) member;
1378
1379 return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1380 }
1381
1382 /**
1383 * htmlTagLookup:
1384 * @tag: The tag name in lowercase
1385 *
1386 * Lookup the HTML tag in the ElementTable
1387 *
1388 * Returns the related htmlElemDescPtr or NULL if not found.
1389 */
1390 const htmlElemDesc *
htmlTagLookup(const xmlChar * tag)1391 htmlTagLookup(const xmlChar *tag) {
1392 if (tag == NULL)
1393 return(NULL);
1394
1395 return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1396 sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1397 sizeof(htmlElemDesc), htmlCompareTags));
1398 }
1399
1400 /**
1401 * htmlGetEndPriority:
1402 * @name: The name of the element to look up the priority for.
1403 *
1404 * Return value: The "endtag" priority.
1405 **/
1406 static int
htmlGetEndPriority(const xmlChar * name)1407 htmlGetEndPriority (const xmlChar *name) {
1408 int i = 0;
1409
1410 while ((htmlEndPriority[i].name != NULL) &&
1411 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1412 i++;
1413
1414 return(htmlEndPriority[i].priority);
1415 }
1416
1417
1418 static int
htmlCompareStartClose(const void * vkey,const void * member)1419 htmlCompareStartClose(const void *vkey, const void *member) {
1420 const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1421 const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1422 int ret;
1423
1424 ret = strcmp(key->oldTag, entry->oldTag);
1425 if (ret == 0)
1426 ret = strcmp(key->newTag, entry->newTag);
1427
1428 return(ret);
1429 }
1430
1431 /**
1432 * htmlCheckAutoClose:
1433 * @newtag: The new tag name
1434 * @oldtag: The old tag name
1435 *
1436 * Checks whether the new tag is one of the registered valid tags for
1437 * closing old.
1438 *
1439 * Returns 0 if no, 1 if yes.
1440 */
1441 static int
htmlCheckAutoClose(const xmlChar * newtag,const xmlChar * oldtag)1442 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1443 {
1444 htmlStartCloseEntry key;
1445 void *res;
1446
1447 key.oldTag = (const char *) oldtag;
1448 key.newTag = (const char *) newtag;
1449 res = bsearch(&key, htmlStartClose,
1450 sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1451 sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1452 return(res != NULL);
1453 }
1454
1455 /**
1456 * htmlAutoCloseOnClose:
1457 * @ctxt: an HTML parser context
1458 * @newtag: The new tag name
1459 * @force: force the tag closure
1460 *
1461 * The HTML DTD allows an ending tag to implicitly close other tags.
1462 */
1463 static void
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1464 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1465 {
1466 const htmlElemDesc *info;
1467 int i, priority;
1468
1469 priority = htmlGetEndPriority(newtag);
1470
1471 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1472
1473 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1474 break;
1475 /*
1476 * A misplaced endtag can only close elements with lower
1477 * or equal priority, so if we find an element with higher
1478 * priority before we find an element with
1479 * matching name, we just ignore this endtag
1480 */
1481 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1482 return;
1483 }
1484 if (i < 0)
1485 return;
1486
1487 while (!xmlStrEqual(newtag, ctxt->name)) {
1488 info = htmlTagLookup(ctxt->name);
1489 if ((info != NULL) && (info->endTag == 3)) {
1490 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1491 "Opening and ending tag mismatch: %s and %s\n",
1492 newtag, ctxt->name);
1493 }
1494 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1495 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1496 htmlnamePop(ctxt);
1497 }
1498 }
1499
1500 /**
1501 * htmlAutoCloseOnEnd:
1502 * @ctxt: an HTML parser context
1503 *
1504 * Close all remaining tags at the end of the stream
1505 */
1506 static void
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)1507 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1508 {
1509 int i;
1510
1511 if (ctxt->nameNr == 0)
1512 return;
1513 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1514 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1515 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1516 htmlnamePop(ctxt);
1517 }
1518 }
1519
1520 /**
1521 * htmlAutoClose:
1522 * @ctxt: an HTML parser context
1523 * @newtag: The new tag name or NULL
1524 *
1525 * The HTML DTD allows a tag to implicitly close other tags.
1526 * The list is kept in htmlStartClose array. This function is
1527 * called when a new tag has been detected and generates the
1528 * appropriates closes if possible/needed.
1529 * If newtag is NULL this mean we are at the end of the resource
1530 * and we should check
1531 */
1532 static void
htmlAutoClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1533 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1534 {
1535 if (newtag == NULL)
1536 return;
1537
1538 while ((ctxt->name != NULL) &&
1539 (htmlCheckAutoClose(newtag, ctxt->name))) {
1540 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1541 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1542 htmlnamePop(ctxt);
1543 }
1544 }
1545
1546 /**
1547 * htmlAutoCloseTag:
1548 * @doc: the HTML document
1549 * @name: The tag name
1550 * @elem: the HTML element
1551 *
1552 * The HTML DTD allows a tag to implicitly close other tags.
1553 * The list is kept in htmlStartClose array. This function checks
1554 * if the element or one of it's children would autoclose the
1555 * given tag.
1556 *
1557 * Returns 1 if autoclose, 0 otherwise
1558 */
1559 int
htmlAutoCloseTag(htmlDocPtr doc,const xmlChar * name,htmlNodePtr elem)1560 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1561 htmlNodePtr child;
1562
1563 if (elem == NULL) return(1);
1564 if (xmlStrEqual(name, elem->name)) return(0);
1565 if (htmlCheckAutoClose(elem->name, name)) return(1);
1566 child = elem->children;
1567 while (child != NULL) {
1568 if (htmlAutoCloseTag(doc, name, child)) return(1);
1569 child = child->next;
1570 }
1571 return(0);
1572 }
1573
1574 /**
1575 * htmlIsAutoClosed:
1576 * @doc: the HTML document
1577 * @elem: the HTML element
1578 *
1579 * The HTML DTD allows a tag to implicitly close other tags.
1580 * The list is kept in htmlStartClose array. This function checks
1581 * if a tag is autoclosed by one of it's child
1582 *
1583 * Returns 1 if autoclosed, 0 otherwise
1584 */
1585 int
htmlIsAutoClosed(htmlDocPtr doc,htmlNodePtr elem)1586 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1587 htmlNodePtr child;
1588
1589 if (elem == NULL) return(1);
1590 child = elem->children;
1591 while (child != NULL) {
1592 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1593 child = child->next;
1594 }
1595 return(0);
1596 }
1597
1598 /**
1599 * htmlCheckImplied:
1600 * @ctxt: an HTML parser context
1601 * @newtag: The new tag name
1602 *
1603 * The HTML DTD allows a tag to exists only implicitly
1604 * called when a new tag has been detected and generates the
1605 * appropriates implicit tags if missing
1606 */
1607 static void
htmlCheckImplied(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1608 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1609 int i;
1610
1611 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1612 return;
1613 if (!htmlOmittedDefaultValue)
1614 return;
1615 if (xmlStrEqual(newtag, BAD_CAST"html"))
1616 return;
1617 if (ctxt->nameNr <= 0) {
1618 htmlnamePush(ctxt, BAD_CAST"html");
1619 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1620 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1621 }
1622 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1623 return;
1624 if ((ctxt->nameNr <= 1) &&
1625 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1626 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1627 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1628 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1629 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1630 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1631 if (ctxt->html >= 3) {
1632 /* we already saw or generated an <head> before */
1633 return;
1634 }
1635 /*
1636 * dropped OBJECT ... i you put it first BODY will be
1637 * assumed !
1638 */
1639 htmlnamePush(ctxt, BAD_CAST"head");
1640 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1641 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1642 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1643 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1644 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1645 if (ctxt->html >= 10) {
1646 /* we already saw or generated a <body> before */
1647 return;
1648 }
1649 for (i = 0;i < ctxt->nameNr;i++) {
1650 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1651 return;
1652 }
1653 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1654 return;
1655 }
1656 }
1657
1658 htmlnamePush(ctxt, BAD_CAST"body");
1659 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1660 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1661 }
1662 }
1663
1664 /**
1665 * htmlCheckParagraph
1666 * @ctxt: an HTML parser context
1667 *
1668 * Check whether a p element need to be implied before inserting
1669 * characters in the current element.
1670 *
1671 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1672 * in case of error.
1673 */
1674
1675 static int
htmlCheckParagraph(htmlParserCtxtPtr ctxt)1676 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1677 const xmlChar *tag;
1678 int i;
1679
1680 if (ctxt == NULL)
1681 return(-1);
1682 tag = ctxt->name;
1683 if (tag == NULL) {
1684 htmlAutoClose(ctxt, BAD_CAST"p");
1685 htmlCheckImplied(ctxt, BAD_CAST"p");
1686 htmlnamePush(ctxt, BAD_CAST"p");
1687 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1688 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1689 return(1);
1690 }
1691 if (!htmlOmittedDefaultValue)
1692 return(0);
1693 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1694 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1695 htmlAutoClose(ctxt, BAD_CAST"p");
1696 htmlCheckImplied(ctxt, BAD_CAST"p");
1697 htmlnamePush(ctxt, BAD_CAST"p");
1698 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1699 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1700 return(1);
1701 }
1702 }
1703 return(0);
1704 }
1705
1706 /**
1707 * htmlIsScriptAttribute:
1708 * @name: an attribute name
1709 *
1710 * Check if an attribute is of content type Script
1711 *
1712 * Returns 1 is the attribute is a script 0 otherwise
1713 */
1714 int
htmlIsScriptAttribute(const xmlChar * name)1715 htmlIsScriptAttribute(const xmlChar *name) {
1716 unsigned int i;
1717
1718 if (name == NULL)
1719 return(0);
1720 /*
1721 * all script attributes start with 'on'
1722 */
1723 if ((name[0] != 'o') || (name[1] != 'n'))
1724 return(0);
1725 for (i = 0;
1726 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1727 i++) {
1728 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1729 return(1);
1730 }
1731 return(0);
1732 }
1733
1734 /************************************************************************
1735 * *
1736 * The list of HTML predefined entities *
1737 * *
1738 ************************************************************************/
1739
1740
1741 static const htmlEntityDesc html40EntitiesTable[] = {
1742 /*
1743 * the 4 absolute ones, plus apostrophe.
1744 */
1745 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1746 { 38, "amp", "ampersand, U+0026 ISOnum" },
1747 { 39, "apos", "single quote" },
1748 { 60, "lt", "less-than sign, U+003C ISOnum" },
1749 { 62, "gt", "greater-than sign, U+003E ISOnum" },
1750
1751 /*
1752 * A bunch still in the 128-255 range
1753 * Replacing them depend really on the charset used.
1754 */
1755 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1756 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1757 { 162, "cent", "cent sign, U+00A2 ISOnum" },
1758 { 163, "pound","pound sign, U+00A3 ISOnum" },
1759 { 164, "curren","currency sign, U+00A4 ISOnum" },
1760 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1761 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1762 { 167, "sect", "section sign, U+00A7 ISOnum" },
1763 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1764 { 169, "copy", "copyright sign, U+00A9 ISOnum" },
1765 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1766 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1767 { 172, "not", "not sign, U+00AC ISOnum" },
1768 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1769 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1770 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1771 { 176, "deg", "degree sign, U+00B0 ISOnum" },
1772 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1773 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1774 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1775 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1776 { 181, "micro","micro sign, U+00B5 ISOnum" },
1777 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1778 { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1779 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1780 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1781 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1782 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1783 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1784 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1785 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1786 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1787 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1788 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1789 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1790 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1791 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1792 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1793 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1794 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1795 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1796 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1797 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1798 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1799 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1800 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1801 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1802 { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1803 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1804 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1805 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1806 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1807 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1808 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1809 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1810 { 215, "times","multiplication sign, U+00D7 ISOnum" },
1811 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1812 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1813 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1814 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1815 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1816 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1817 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1818 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1819 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1820 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1821 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1822 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1823 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1824 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1825 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1826 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1827 { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1828 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1829 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1830 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1831 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1832 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1833 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1834 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1835 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1836 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1837 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1838 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1839 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1840 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1841 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1842 { 247, "divide","division sign, U+00F7 ISOnum" },
1843 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1844 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1845 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1846 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1847 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1848 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1849 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1850 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1851
1852 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1853 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1854 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1855 { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1856 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1857
1858 /*
1859 * Anything below should really be kept as entities references
1860 */
1861 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1862
1863 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1864 { 732, "tilde","small tilde, U+02DC ISOdia" },
1865
1866 { 913, "Alpha","greek capital letter alpha, U+0391" },
1867 { 914, "Beta", "greek capital letter beta, U+0392" },
1868 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1869 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1870 { 917, "Epsilon","greek capital letter epsilon, U+0395" },
1871 { 918, "Zeta", "greek capital letter zeta, U+0396" },
1872 { 919, "Eta", "greek capital letter eta, U+0397" },
1873 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1874 { 921, "Iota", "greek capital letter iota, U+0399" },
1875 { 922, "Kappa","greek capital letter kappa, U+039A" },
1876 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1877 { 924, "Mu", "greek capital letter mu, U+039C" },
1878 { 925, "Nu", "greek capital letter nu, U+039D" },
1879 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1880 { 927, "Omicron","greek capital letter omicron, U+039F" },
1881 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1882 { 929, "Rho", "greek capital letter rho, U+03A1" },
1883 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1884 { 932, "Tau", "greek capital letter tau, U+03A4" },
1885 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1886 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1887 { 935, "Chi", "greek capital letter chi, U+03A7" },
1888 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1889 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1890
1891 { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1892 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1893 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1894 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1895 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1896 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1897 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1898 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1899 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1900 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1901 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1902 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1903 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1904 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1905 { 959, "omicron","greek small letter omicron, U+03BF NEW" },
1906 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1907 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1908 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1909 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1910 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1911 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1912 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1913 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1914 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1915 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1916 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1917 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1918 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1919
1920 { 8194, "ensp", "en space, U+2002 ISOpub" },
1921 { 8195, "emsp", "em space, U+2003 ISOpub" },
1922 { 8201, "thinsp","thin space, U+2009 ISOpub" },
1923 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1924 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1925 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1926 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1927 { 8211, "ndash","en dash, U+2013 ISOpub" },
1928 { 8212, "mdash","em dash, U+2014 ISOpub" },
1929 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1930 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1931 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1932 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1933 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1934 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1935 { 8224, "dagger","dagger, U+2020 ISOpub" },
1936 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1937
1938 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1939 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1940
1941 { 8240, "permil","per mille sign, U+2030 ISOtech" },
1942
1943 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1944 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1945
1946 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1947 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1948
1949 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
1950 { 8260, "frasl","fraction slash, U+2044 NEW" },
1951
1952 { 8364, "euro", "euro sign, U+20AC NEW" },
1953
1954 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1955 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1956 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1957 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
1958 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1959 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1960 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1961 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1962 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1963 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1964 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1965 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1966 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1967 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1968 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1969 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1970
1971 { 8704, "forall","for all, U+2200 ISOtech" },
1972 { 8706, "part", "partial differential, U+2202 ISOtech" },
1973 { 8707, "exist","there exists, U+2203 ISOtech" },
1974 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1975 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1976 { 8712, "isin", "element of, U+2208 ISOtech" },
1977 { 8713, "notin","not an element of, U+2209 ISOtech" },
1978 { 8715, "ni", "contains as member, U+220B ISOtech" },
1979 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1980 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1981 { 8722, "minus","minus sign, U+2212 ISOtech" },
1982 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1983 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
1984 { 8733, "prop", "proportional to, U+221D ISOtech" },
1985 { 8734, "infin","infinity, U+221E ISOtech" },
1986 { 8736, "ang", "angle, U+2220 ISOamso" },
1987 { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1988 { 8744, "or", "logical or = vee, U+2228 ISOtech" },
1989 { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1990 { 8746, "cup", "union = cup, U+222A ISOtech" },
1991 { 8747, "int", "integral, U+222B ISOtech" },
1992 { 8756, "there4","therefore, U+2234 ISOtech" },
1993 { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1994 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1995 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1996 { 8800, "ne", "not equal to, U+2260 ISOtech" },
1997 { 8801, "equiv","identical to, U+2261 ISOtech" },
1998 { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1999 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
2000 { 8834, "sub", "subset of, U+2282 ISOtech" },
2001 { 8835, "sup", "superset of, U+2283 ISOtech" },
2002 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
2003 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
2004 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
2005 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
2006 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
2007 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
2008 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
2009 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
2010 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
2011 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
2012 { 8971, "rfloor","right floor, U+230B ISOamsc" },
2013 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
2014 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
2015 { 9674, "loz", "lozenge, U+25CA ISOpub" },
2016
2017 { 9824, "spades","black spade suit, U+2660 ISOpub" },
2018 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
2019 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
2020 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
2021
2022 };
2023
2024 /************************************************************************
2025 * *
2026 * Commodity functions to handle entities *
2027 * *
2028 ************************************************************************/
2029
/*
 * growBuffer:
 * @buffer: name of the xmlChar * variable holding the buffer
 *
 * Double the capacity of @buffer. The expanding scope must provide
 * an integer variable named <buffer>_size holding the current capacity
 * and a parser context named ctxt.
 *
 * On reallocation failure the error is reported through htmlErrMemory(),
 * the old buffer is freed and the ENCLOSING FUNCTION returns NULL, so
 * this macro may only be used in functions returning a pointer.
 */
#define growBuffer(buffer) { \
    xmlChar *tmp; \
    buffer##_size *= 2; \
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size); \
    if (tmp != NULL) { \
        buffer = tmp; \
    } else { \
        htmlErrMemory(ctxt, "growing buffer\n"); \
        xmlFree(buffer); \
        return(NULL); \
    } \
}
2044
2045 /**
2046 * htmlEntityLookup:
2047 * @name: the entity name
2048 *
2049 * Lookup the given entity in EntitiesTable
2050 *
2051 * TODO: the linear scan is really ugly, an hash table is really needed.
2052 *
2053 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2054 */
2055 const htmlEntityDesc *
htmlEntityLookup(const xmlChar * name)2056 htmlEntityLookup(const xmlChar *name) {
2057 unsigned int i;
2058
2059 for (i = 0;i < (sizeof(html40EntitiesTable)/
2060 sizeof(html40EntitiesTable[0]));i++) {
2061 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2062 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2063 }
2064 }
2065 return(NULL);
2066 }
2067
2068 /**
2069 * htmlEntityValueLookup:
2070 * @value: the entity's unicode value
2071 *
2072 * Lookup the given entity in EntitiesTable
2073 *
2074 * TODO: the linear scan is really ugly, an hash table is really needed.
2075 *
2076 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2077 */
2078 const htmlEntityDesc *
htmlEntityValueLookup(unsigned int value)2079 htmlEntityValueLookup(unsigned int value) {
2080 unsigned int i;
2081
2082 for (i = 0;i < (sizeof(html40EntitiesTable)/
2083 sizeof(html40EntitiesTable[0]));i++) {
2084 if (html40EntitiesTable[i].value >= value) {
2085 if (html40EntitiesTable[i].value > value)
2086 break;
2087 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2088 }
2089 }
2090 return(NULL);
2091 }
2092
2093 /**
2094 * UTF8ToHtml:
2095 * @out: a pointer to an array of bytes to store the result
2096 * @outlen: the length of @out
2097 * @in: a pointer to an array of UTF-8 chars
2098 * @inlen: the length of @in
2099 *
2100 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2101 * plus HTML entities block of chars out.
2102 *
2103 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2104 * The value of @inlen after return is the number of octets consumed
2105 * as the return value is positive, else unpredictable.
2106 * The value of @outlen after return is the number of octets consumed.
2107 */
2108 int
UTF8ToHtml(unsigned char * out,int * outlen,const unsigned char * in,int * inlen)2109 UTF8ToHtml(unsigned char* out, int *outlen,
2110 const unsigned char* in, int *inlen) {
2111 const unsigned char* processed = in;
2112 const unsigned char* outend;
2113 const unsigned char* outstart = out;
2114 const unsigned char* instart = in;
2115 const unsigned char* inend;
2116 unsigned int c, d;
2117 int trailing;
2118
2119 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2120 if (in == NULL) {
2121 /*
2122 * initialization nothing to do
2123 */
2124 *outlen = 0;
2125 *inlen = 0;
2126 return(0);
2127 }
2128 inend = in + (*inlen);
2129 outend = out + (*outlen);
2130 while (in < inend) {
2131 d = *in++;
2132 if (d < 0x80) { c= d; trailing= 0; }
2133 else if (d < 0xC0) {
2134 /* trailing byte in leading position */
2135 *outlen = out - outstart;
2136 *inlen = processed - instart;
2137 return(-2);
2138 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2139 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2140 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2141 else {
2142 /* no chance for this in Ascii */
2143 *outlen = out - outstart;
2144 *inlen = processed - instart;
2145 return(-2);
2146 }
2147
2148 if (inend - in < trailing) {
2149 break;
2150 }
2151
2152 for ( ; trailing; trailing--) {
2153 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2154 break;
2155 c <<= 6;
2156 c |= d & 0x3F;
2157 }
2158
2159 /* assertion: c is a single UTF-4 value */
2160 if (c < 0x80) {
2161 if (out + 1 >= outend)
2162 break;
2163 *out++ = c;
2164 } else {
2165 int len;
2166 const htmlEntityDesc * ent;
2167 const char *cp;
2168 char nbuf[16];
2169
2170 /*
2171 * Try to lookup a predefined HTML entity for it
2172 */
2173
2174 ent = htmlEntityValueLookup(c);
2175 if (ent == NULL) {
2176 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2177 cp = nbuf;
2178 }
2179 else
2180 cp = ent->name;
2181 len = strlen(cp);
2182 if (out + 2 + len >= outend)
2183 break;
2184 *out++ = '&';
2185 memcpy(out, cp, len);
2186 out += len;
2187 *out++ = ';';
2188 }
2189 processed = in;
2190 }
2191 *outlen = out - outstart;
2192 *inlen = processed - instart;
2193 return(0);
2194 }
2195
2196 /**
2197 * htmlEncodeEntities:
2198 * @out: a pointer to an array of bytes to store the result
2199 * @outlen: the length of @out
2200 * @in: a pointer to an array of UTF-8 chars
2201 * @inlen: the length of @in
2202 * @quoteChar: the quote character to escape (' or ") or zero.
2203 *
2204 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2205 * plus HTML entities block of chars out.
2206 *
2207 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2208 * The value of @inlen after return is the number of octets consumed
2209 * as the return value is positive, else unpredictable.
2210 * The value of @outlen after return is the number of octets consumed.
2211 */
2212 int
htmlEncodeEntities(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,int quoteChar)2213 htmlEncodeEntities(unsigned char* out, int *outlen,
2214 const unsigned char* in, int *inlen, int quoteChar) {
2215 const unsigned char* processed = in;
2216 const unsigned char* outend;
2217 const unsigned char* outstart = out;
2218 const unsigned char* instart = in;
2219 const unsigned char* inend;
2220 unsigned int c, d;
2221 int trailing;
2222
2223 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2224 return(-1);
2225 outend = out + (*outlen);
2226 inend = in + (*inlen);
2227 while (in < inend) {
2228 d = *in++;
2229 if (d < 0x80) { c= d; trailing= 0; }
2230 else if (d < 0xC0) {
2231 /* trailing byte in leading position */
2232 *outlen = out - outstart;
2233 *inlen = processed - instart;
2234 return(-2);
2235 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2236 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2237 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2238 else {
2239 /* no chance for this in Ascii */
2240 *outlen = out - outstart;
2241 *inlen = processed - instart;
2242 return(-2);
2243 }
2244
2245 if (inend - in < trailing)
2246 break;
2247
2248 while (trailing--) {
2249 if (((d= *in++) & 0xC0) != 0x80) {
2250 *outlen = out - outstart;
2251 *inlen = processed - instart;
2252 return(-2);
2253 }
2254 c <<= 6;
2255 c |= d & 0x3F;
2256 }
2257
2258 /* assertion: c is a single UTF-4 value */
2259 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2260 (c != '&') && (c != '<') && (c != '>')) {
2261 if (out >= outend)
2262 break;
2263 *out++ = c;
2264 } else {
2265 const htmlEntityDesc * ent;
2266 const char *cp;
2267 char nbuf[16];
2268 int len;
2269
2270 /*
2271 * Try to lookup a predefined HTML entity for it
2272 */
2273 ent = htmlEntityValueLookup(c);
2274 if (ent == NULL) {
2275 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2276 cp = nbuf;
2277 }
2278 else
2279 cp = ent->name;
2280 len = strlen(cp);
2281 if (outend - out < len + 2)
2282 break;
2283 *out++ = '&';
2284 memcpy(out, cp, len);
2285 out += len;
2286 *out++ = ';';
2287 }
2288 processed = in;
2289 }
2290 *outlen = out - outstart;
2291 *inlen = processed - instart;
2292 return(0);
2293 }
2294
2295 /************************************************************************
2296 * *
2297 * Commodity functions to handle streams *
2298 * *
2299 ************************************************************************/
2300
2301 #ifdef LIBXML_PUSH_ENABLED
2302 /**
2303 * htmlNewInputStream:
2304 * @ctxt: an HTML parser context
2305 *
2306 * Create a new input stream structure
2307 * Returns the new input stream or NULL
2308 */
2309 static htmlParserInputPtr
htmlNewInputStream(htmlParserCtxtPtr ctxt)2310 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2311 htmlParserInputPtr input;
2312
2313 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2314 if (input == NULL) {
2315 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2316 return(NULL);
2317 }
2318 memset(input, 0, sizeof(htmlParserInput));
2319 input->filename = NULL;
2320 input->directory = NULL;
2321 input->base = NULL;
2322 input->cur = NULL;
2323 input->buf = NULL;
2324 input->line = 1;
2325 input->col = 1;
2326 input->buf = NULL;
2327 input->free = NULL;
2328 input->version = NULL;
2329 input->consumed = 0;
2330 input->length = 0;
2331 return(input);
2332 }
2333 #endif
2334
2335
2336 /************************************************************************
2337 * *
2338 * Commodity functions, cleanup needed ? *
2339 * *
2340 ************************************************************************/
/*
 * Names of all tags allowing PCDATA, from the HTML 4.01 loose DTD.
 * areBlanks() searches this list to decide whether whitespace next to
 * such a tag must be kept.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2355
2356 /**
2357 * areBlanks:
2358 * @ctxt: an HTML parser context
2359 * @str: a xmlChar *
2360 * @len: the size of @str
2361 *
2362 * Is this a sequence of blank chars that one can ignore ?
2363 *
2364 * Returns 1 if ignorable 0 otherwise.
2365 */
2366
areBlanks(htmlParserCtxtPtr ctxt,const xmlChar * str,int len)2367 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2368 unsigned int i;
2369 int j;
2370 xmlNodePtr lastChild;
2371 xmlDtdPtr dtd;
2372
2373 for (j = 0;j < len;j++)
2374 if (!(IS_BLANK_CH(str[j]))) return(0);
2375
2376 if (CUR == 0) return(1);
2377 if (CUR != '<') return(0);
2378 if (ctxt->name == NULL)
2379 return(1);
2380 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2381 return(1);
2382 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2383 return(1);
2384
2385 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2386 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2387 dtd = xmlGetIntSubset(ctxt->myDoc);
2388 if (dtd != NULL && dtd->ExternalID != NULL) {
2389 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2390 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2391 return(1);
2392 }
2393 }
2394
2395 if (ctxt->node == NULL) return(0);
2396 lastChild = xmlGetLastChild(ctxt->node);
2397 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2398 lastChild = lastChild->prev;
2399 if (lastChild == NULL) {
2400 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2401 (ctxt->node->content != NULL)) return(0);
2402 /* keep ws in constructs like ...<b> </b>...
2403 for all tags "b" allowing PCDATA */
2404 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2405 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2406 return(0);
2407 }
2408 }
2409 } else if (xmlNodeIsText(lastChild)) {
2410 return(0);
2411 } else {
2412 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2413 for all tags "p" allowing PCDATA */
2414 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2415 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2416 return(0);
2417 }
2418 }
2419 }
2420 return(1);
2421 }
2422
2423 /**
2424 * htmlNewDocNoDtD:
2425 * @URI: URI for the dtd, or NULL
2426 * @ExternalID: the external ID of the DTD, or NULL
2427 *
2428 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2429 * are NULL
2430 *
2431 * Returns a new document, do not initialize the DTD if not provided
2432 */
2433 htmlDocPtr
htmlNewDocNoDtD(const xmlChar * URI,const xmlChar * ExternalID)2434 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2435 xmlDocPtr cur;
2436
2437 /*
2438 * Allocate a new document and fill the fields.
2439 */
2440 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2441 if (cur == NULL) {
2442 htmlErrMemory(NULL, "HTML document creation failed\n");
2443 return(NULL);
2444 }
2445 memset(cur, 0, sizeof(xmlDoc));
2446
2447 cur->type = XML_HTML_DOCUMENT_NODE;
2448 cur->version = NULL;
2449 cur->intSubset = NULL;
2450 cur->doc = cur;
2451 cur->name = NULL;
2452 cur->children = NULL;
2453 cur->extSubset = NULL;
2454 cur->oldNs = NULL;
2455 cur->encoding = NULL;
2456 cur->standalone = 1;
2457 cur->compression = 0;
2458 cur->ids = NULL;
2459 cur->refs = NULL;
2460 cur->_private = NULL;
2461 cur->charset = XML_CHAR_ENCODING_UTF8;
2462 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2463 if ((ExternalID != NULL) ||
2464 (URI != NULL))
2465 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2466 if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2467 xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2468 return(cur);
2469 }
2470
2471 /**
2472 * htmlNewDoc:
2473 * @URI: URI for the dtd, or NULL
2474 * @ExternalID: the external ID of the DTD, or NULL
2475 *
2476 * Creates a new HTML document
2477 *
2478 * Returns a new document
2479 */
2480 htmlDocPtr
htmlNewDoc(const xmlChar * URI,const xmlChar * ExternalID)2481 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2482 if ((URI == NULL) && (ExternalID == NULL))
2483 return(htmlNewDocNoDtD(
2484 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2485 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2486
2487 return(htmlNewDocNoDtD(URI, ExternalID));
2488 }
2489
2490
2491 /************************************************************************
2492 * *
2493 * The parser itself *
2494 * Relates to http://www.w3.org/TR/html40 *
2495 * *
2496 ************************************************************************/
2497
2498 /************************************************************************
2499 * *
2500 * The parser itself *
2501 * *
2502 ************************************************************************/
2503
2504 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2505
2506 static void
htmlSkipBogusComment(htmlParserCtxtPtr ctxt)2507 htmlSkipBogusComment(htmlParserCtxtPtr ctxt) {
2508 int c;
2509
2510 htmlParseErr(ctxt, XML_HTML_INCORRECTLY_OPENED_COMMENT,
2511 "Incorrectly opened comment\n", NULL, NULL);
2512
2513 do {
2514 c = CUR;
2515 if (c == 0)
2516 break;
2517 NEXT;
2518 } while (c != '>');
2519 }
2520
2521 /**
2522 * htmlParseHTMLName:
2523 * @ctxt: an HTML parser context
2524 *
2525 * parse an HTML tag or attribute name, note that we convert it to lowercase
2526 * since HTML names are not case-sensitive.
2527 *
2528 * Returns the Tag Name parsed or NULL
2529 */
2530
2531 static const xmlChar *
htmlParseHTMLName(htmlParserCtxtPtr ctxt)2532 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2533 const xmlChar *ret;
2534 int i = 0;
2535 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2536
2537 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2538 (CUR != ':') && (CUR != '.')) return(NULL);
2539
2540 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2541 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2542 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2543 (CUR == '.'))) {
2544 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2545 else loc[i] = CUR;
2546 i++;
2547
2548 NEXT;
2549 }
2550
2551 ret = xmlDictLookup(ctxt->dict, loc, i);
2552 if (ret == NULL)
2553 htmlErrMemory(ctxt, NULL);
2554
2555 return(ret);
2556 }
2557
2558
2559 /**
2560 * htmlParseHTMLName_nonInvasive:
2561 * @ctxt: an HTML parser context
2562 *
2563 * parse an HTML tag or attribute name, note that we convert it to lowercase
2564 * since HTML names are not case-sensitive, this doesn't consume the data
2565 * from the stream, it's a look-ahead
2566 *
2567 * Returns the Tag Name parsed or NULL
2568 */
2569
2570 static const xmlChar *
htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt)2571 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2572 int i = 0;
2573 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2574
2575 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2576 (NXT(1) != ':')) return(NULL);
2577
2578 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2579 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2580 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2581 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2582 else loc[i] = NXT(1+i);
2583 i++;
2584 }
2585
2586 return(xmlDictLookup(ctxt->dict, loc, i));
2587 }
2588
2589
2590 /**
2591 * htmlParseName:
2592 * @ctxt: an HTML parser context
2593 *
2594 * parse an HTML name, this routine is case sensitive.
2595 *
2596 * Returns the Name parsed or NULL
2597 */
2598
2599 static const xmlChar *
htmlParseName(htmlParserCtxtPtr ctxt)2600 htmlParseName(htmlParserCtxtPtr ctxt) {
2601 const xmlChar *in;
2602 const xmlChar *ret;
2603 int count = 0;
2604
2605 GROW;
2606
2607 /*
2608 * Accelerator for simple ASCII names
2609 */
2610 in = ctxt->input->cur;
2611 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2612 ((*in >= 0x41) && (*in <= 0x5A)) ||
2613 (*in == '_') || (*in == ':')) {
2614 in++;
2615 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2616 ((*in >= 0x41) && (*in <= 0x5A)) ||
2617 ((*in >= 0x30) && (*in <= 0x39)) ||
2618 (*in == '_') || (*in == '-') ||
2619 (*in == ':') || (*in == '.'))
2620 in++;
2621
2622 if (in == ctxt->input->end)
2623 return(NULL);
2624
2625 if ((*in > 0) && (*in < 0x80)) {
2626 count = in - ctxt->input->cur;
2627 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2628 ctxt->input->cur = in;
2629 ctxt->input->col += count;
2630 return(ret);
2631 }
2632 }
2633 return(htmlParseNameComplex(ctxt));
2634 }
2635
2636 static const xmlChar *
htmlParseNameComplex(xmlParserCtxtPtr ctxt)2637 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2638 int len = 0, l;
2639 int c;
2640 int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2641 XML_MAX_TEXT_LENGTH :
2642 XML_MAX_NAME_LENGTH;
2643 const xmlChar *base = ctxt->input->base;
2644
2645 /*
2646 * Handler for more complex cases
2647 */
2648 c = CUR_CHAR(l);
2649 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2650 (!IS_LETTER(c) && (c != '_') &&
2651 (c != ':'))) {
2652 return(NULL);
2653 }
2654
2655 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2656 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2657 (c == '.') || (c == '-') ||
2658 (c == '_') || (c == ':') ||
2659 (IS_COMBINING(c)) ||
2660 (IS_EXTENDER(c)))) {
2661 len += l;
2662 if (len > maxLength) {
2663 htmlParseErr(ctxt, XML_ERR_NAME_TOO_LONG, "name too long", NULL, NULL);
2664 return(NULL);
2665 }
2666 NEXTL(l);
2667 c = CUR_CHAR(l);
2668 if (ctxt->input->base != base) {
2669 /*
2670 * We changed encoding from an unknown encoding
2671 * Input buffer changed location, so we better start again
2672 */
2673 return(htmlParseNameComplex(ctxt));
2674 }
2675 }
2676 if (ctxt->instate == XML_PARSER_EOF)
2677 return(NULL);
2678
2679 if (ctxt->input->cur - ctxt->input->base < len) {
2680 /* Sanity check */
2681 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2682 "unexpected change of input buffer", NULL, NULL);
2683 return (NULL);
2684 }
2685
2686 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2687 }
2688
2689
2690 /**
2691 * htmlParseHTMLAttribute:
2692 * @ctxt: an HTML parser context
2693 * @stop: a char stop value
2694 *
2695 * parse an HTML attribute value till the stop (quote), if
2696 * stop is 0 then it stops at the first space
2697 *
2698 * Returns the attribute parsed or NULL
2699 */
2700
2701 static xmlChar *
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt,const xmlChar stop)2702 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2703 xmlChar *buffer = NULL;
2704 int buffer_size = 0;
2705 int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2706 XML_MAX_HUGE_LENGTH :
2707 XML_MAX_TEXT_LENGTH;
2708 xmlChar *out = NULL;
2709 const xmlChar *name = NULL;
2710 const xmlChar *cur = NULL;
2711 const htmlEntityDesc * ent;
2712
2713 /*
2714 * allocate a translation buffer.
2715 */
2716 buffer_size = HTML_PARSER_BUFFER_SIZE;
2717 buffer = (xmlChar *) xmlMallocAtomic(buffer_size);
2718 if (buffer == NULL) {
2719 htmlErrMemory(ctxt, "buffer allocation failed\n");
2720 return(NULL);
2721 }
2722 out = buffer;
2723
2724 /*
2725 * Ok loop until we reach one of the ending chars
2726 */
2727 while ((CUR != 0) && (CUR != stop)) {
2728 if ((stop == 0) && (CUR == '>')) break;
2729 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2730 if (CUR == '&') {
2731 if (NXT(1) == '#') {
2732 unsigned int c;
2733 int bits;
2734
2735 c = htmlParseCharRef(ctxt);
2736 if (c < 0x80)
2737 { *out++ = c; bits= -6; }
2738 else if (c < 0x800)
2739 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2740 else if (c < 0x10000)
2741 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2742 else
2743 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2744
2745 for ( ; bits >= 0; bits-= 6) {
2746 *out++ = ((c >> bits) & 0x3F) | 0x80;
2747 }
2748
2749 if (out - buffer > buffer_size - 100) {
2750 int indx = out - buffer;
2751
2752 growBuffer(buffer);
2753 out = &buffer[indx];
2754 }
2755 } else {
2756 ent = htmlParseEntityRef(ctxt, &name);
2757 if (name == NULL) {
2758 *out++ = '&';
2759 if (out - buffer > buffer_size - 100) {
2760 int indx = out - buffer;
2761
2762 growBuffer(buffer);
2763 out = &buffer[indx];
2764 }
2765 } else if (ent == NULL) {
2766 *out++ = '&';
2767 cur = name;
2768 while (*cur != 0) {
2769 if (out - buffer > buffer_size - 100) {
2770 int indx = out - buffer;
2771
2772 growBuffer(buffer);
2773 out = &buffer[indx];
2774 }
2775 *out++ = *cur++;
2776 }
2777 } else {
2778 unsigned int c;
2779 int bits;
2780
2781 if (out - buffer > buffer_size - 100) {
2782 int indx = out - buffer;
2783
2784 growBuffer(buffer);
2785 out = &buffer[indx];
2786 }
2787 c = ent->value;
2788 if (c < 0x80)
2789 { *out++ = c; bits= -6; }
2790 else if (c < 0x800)
2791 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2792 else if (c < 0x10000)
2793 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2794 else
2795 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2796
2797 for ( ; bits >= 0; bits-= 6) {
2798 *out++ = ((c >> bits) & 0x3F) | 0x80;
2799 }
2800 }
2801 }
2802 } else {
2803 unsigned int c;
2804 int bits, l;
2805
2806 if (out - buffer > buffer_size - 100) {
2807 int indx = out - buffer;
2808
2809 growBuffer(buffer);
2810 out = &buffer[indx];
2811 }
2812 c = CUR_CHAR(l);
2813 if (ctxt->instate == XML_PARSER_EOF) {
2814 xmlFree(buffer);
2815 return(NULL);
2816 }
2817 if (c < 0x80)
2818 { *out++ = c; bits= -6; }
2819 else if (c < 0x800)
2820 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2821 else if (c < 0x10000)
2822 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2823 else
2824 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2825
2826 for ( ; bits >= 0; bits-= 6) {
2827 *out++ = ((c >> bits) & 0x3F) | 0x80;
2828 }
2829 NEXTL(l);
2830 }
2831 if (out - buffer > maxLength) {
2832 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2833 "attribute value too long\n", NULL, NULL);
2834 xmlFree(buffer);
2835 return(NULL);
2836 }
2837 }
2838 *out = 0;
2839 return(buffer);
2840 }
2841
2842 /**
2843 * htmlParseEntityRef:
2844 * @ctxt: an HTML parser context
2845 * @str: location to store the entity name
2846 *
2847 * DEPRECATED: Internal function, don't use.
2848 *
2849 * parse an HTML ENTITY references
2850 *
2851 * [68] EntityRef ::= '&' Name ';'
2852 *
2853 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2854 * if non-NULL *str will have to be freed by the caller.
2855 */
2856 const htmlEntityDesc *
htmlParseEntityRef(htmlParserCtxtPtr ctxt,const xmlChar ** str)2857 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2858 const xmlChar *name;
2859 const htmlEntityDesc * ent = NULL;
2860
2861 if (str != NULL) *str = NULL;
2862 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2863
2864 if (CUR == '&') {
2865 NEXT;
2866 name = htmlParseName(ctxt);
2867 if (name == NULL) {
2868 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2869 "htmlParseEntityRef: no name\n", NULL, NULL);
2870 } else {
2871 GROW;
2872 if (CUR == ';') {
2873 if (str != NULL)
2874 *str = name;
2875
2876 /*
2877 * Lookup the entity in the table.
2878 */
2879 ent = htmlEntityLookup(name);
2880 if (ent != NULL) /* OK that's ugly !!! */
2881 NEXT;
2882 } else {
2883 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2884 "htmlParseEntityRef: expecting ';'\n",
2885 NULL, NULL);
2886 if (str != NULL)
2887 *str = name;
2888 }
2889 }
2890 }
2891 return(ent);
2892 }
2893
2894 /**
2895 * htmlParseAttValue:
2896 * @ctxt: an HTML parser context
2897 *
2898 * parse a value for an attribute
2899 * Note: the parser won't do substitution of entities here, this
2900 * will be handled later in xmlStringGetNodeList, unless it was
2901 * asked for ctxt->replaceEntities != 0
2902 *
2903 * Returns the AttValue parsed or NULL.
2904 */
2905
2906 static xmlChar *
htmlParseAttValue(htmlParserCtxtPtr ctxt)2907 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2908 xmlChar *ret = NULL;
2909
2910 if (CUR == '"') {
2911 NEXT;
2912 ret = htmlParseHTMLAttribute(ctxt, '"');
2913 if (CUR != '"') {
2914 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2915 "AttValue: \" expected\n", NULL, NULL);
2916 } else
2917 NEXT;
2918 } else if (CUR == '\'') {
2919 NEXT;
2920 ret = htmlParseHTMLAttribute(ctxt, '\'');
2921 if (CUR != '\'') {
2922 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2923 "AttValue: ' expected\n", NULL, NULL);
2924 } else
2925 NEXT;
2926 } else {
2927 /*
2928 * That's an HTMLism, the attribute value may not be quoted
2929 */
2930 ret = htmlParseHTMLAttribute(ctxt, 0);
2931 if (ret == NULL) {
2932 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2933 "AttValue: no value found\n", NULL, NULL);
2934 }
2935 }
2936 return(ret);
2937 }
2938
2939 /**
2940 * htmlParseSystemLiteral:
2941 * @ctxt: an HTML parser context
2942 *
2943 * parse an HTML Literal
2944 *
2945 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2946 *
2947 * Returns the SystemLiteral parsed or NULL
2948 */
2949
2950 static xmlChar *
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt)2951 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2952 size_t len = 0, startPosition = 0;
2953 int err = 0;
2954 int quote;
2955 xmlChar *ret = NULL;
2956
2957 if ((CUR != '"') && (CUR != '\'')) {
2958 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2959 "SystemLiteral \" or ' expected\n", NULL, NULL);
2960 return(NULL);
2961 }
2962 quote = CUR;
2963 NEXT;
2964
2965 if (CUR_PTR < BASE_PTR)
2966 return(ret);
2967 startPosition = CUR_PTR - BASE_PTR;
2968
2969 while ((CUR != 0) && (CUR != quote)) {
2970 /* TODO: Handle UTF-8 */
2971 if (!IS_CHAR_CH(CUR)) {
2972 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2973 "Invalid char in SystemLiteral 0x%X\n", CUR);
2974 err = 1;
2975 }
2976 NEXT;
2977 len++;
2978 }
2979 if (CUR != quote) {
2980 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2981 "Unfinished SystemLiteral\n", NULL, NULL);
2982 } else {
2983 if (err == 0)
2984 ret = xmlStrndup((BASE_PTR+startPosition), len);
2985 NEXT;
2986 }
2987
2988 return(ret);
2989 }
2990
2991 /**
2992 * htmlParsePubidLiteral:
2993 * @ctxt: an HTML parser context
2994 *
2995 * parse an HTML public literal
2996 *
2997 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2998 *
2999 * Returns the PubidLiteral parsed or NULL.
3000 */
3001
3002 static xmlChar *
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt)3003 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
3004 size_t len = 0, startPosition = 0;
3005 int err = 0;
3006 int quote;
3007 xmlChar *ret = NULL;
3008
3009 if ((CUR != '"') && (CUR != '\'')) {
3010 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
3011 "PubidLiteral \" or ' expected\n", NULL, NULL);
3012 return(NULL);
3013 }
3014 quote = CUR;
3015 NEXT;
3016
3017 /*
3018 * Name ::= (Letter | '_') (NameChar)*
3019 */
3020 if (CUR_PTR < BASE_PTR)
3021 return(ret);
3022 startPosition = CUR_PTR - BASE_PTR;
3023
3024 while ((CUR != 0) && (CUR != quote)) {
3025 if (!IS_PUBIDCHAR_CH(CUR)) {
3026 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3027 "Invalid char in PubidLiteral 0x%X\n", CUR);
3028 err = 1;
3029 }
3030 len++;
3031 NEXT;
3032 }
3033
3034 if (CUR != quote) {
3035 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3036 "Unfinished PubidLiteral\n", NULL, NULL);
3037 } else {
3038 if (err == 0)
3039 ret = xmlStrndup((BASE_PTR + startPosition), len);
3040 NEXT;
3041 }
3042
3043 return(ret);
3044 }
3045
3046 /**
3047 * htmlParseScript:
3048 * @ctxt: an HTML parser context
3049 *
3050 * parse the content of an HTML SCRIPT or STYLE element
3051 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
3052 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
3053 * http://www.w3.org/TR/html4/types.html#type-script
3054 * http://www.w3.org/TR/html4/types.html#h-6.15
3055 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
3056 *
3057 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
3058 * element and the value of intrinsic event attributes. User agents must
3059 * not evaluate script data as HTML markup but instead must pass it on as
3060 * data to a script engine.
3061 * NOTES:
3062 * - The content is passed like CDATA
3063 * - the attributes for style and scripting "onXXX" are also described
3064 * as CDATA but SGML allows entities references in attributes so their
3065 * processing is identical as other attributes
3066 */
3067 static void
htmlParseScript(htmlParserCtxtPtr ctxt)3068 htmlParseScript(htmlParserCtxtPtr ctxt) {
3069 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3070 int nbchar = 0;
3071 int cur,l;
3072
3073 cur = CUR_CHAR(l);
3074 while (cur != 0) {
3075 if ((cur == '<') && (NXT(1) == '/')) {
3076 /*
3077 * One should break here, the specification is clear:
3078 * Authors should therefore escape "</" within the content.
3079 * Escape mechanisms are specific to each scripting or
3080 * style sheet language.
3081 *
3082 * In recovery mode, only break if end tag match the
3083 * current tag, effectively ignoring all tags inside the
3084 * script/style block and treating the entire block as
3085 * CDATA.
3086 */
3087 if (ctxt->recovery) {
3088 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3089 xmlStrlen(ctxt->name)) == 0)
3090 {
3091 break; /* while */
3092 } else {
3093 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3094 "Element %s embeds close tag\n",
3095 ctxt->name, NULL);
3096 }
3097 } else {
3098 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3099 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3100 {
3101 break; /* while */
3102 }
3103 }
3104 }
3105 if (IS_CHAR(cur)) {
3106 COPY_BUF(l,buf,nbchar,cur);
3107 } else {
3108 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3109 "Invalid char in CDATA 0x%X\n", cur);
3110 }
3111 NEXTL(l);
3112 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3113 buf[nbchar] = 0;
3114 if (ctxt->sax->cdataBlock!= NULL) {
3115 /*
3116 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3117 */
3118 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3119 } else if (ctxt->sax->characters != NULL) {
3120 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3121 }
3122 nbchar = 0;
3123 SHRINK;
3124 }
3125 cur = CUR_CHAR(l);
3126 }
3127
3128 if (ctxt->instate == XML_PARSER_EOF)
3129 return;
3130
3131 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3132 buf[nbchar] = 0;
3133 if (ctxt->sax->cdataBlock!= NULL) {
3134 /*
3135 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3136 */
3137 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3138 } else if (ctxt->sax->characters != NULL) {
3139 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3140 }
3141 }
3142 }
3143
3144
3145 /**
3146 * htmlParseCharDataInternal:
3147 * @ctxt: an HTML parser context
3148 * @readahead: optional read ahead character in ascii range
3149 *
3150 * parse a CharData section.
3151 * if we are within a CDATA section ']]>' marks an end of section.
3152 *
3153 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3154 */
3155
3156 static void
htmlParseCharDataInternal(htmlParserCtxtPtr ctxt,int readahead)3157 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3158 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3159 int nbchar = 0;
3160 int cur, l;
3161
3162 if (readahead)
3163 buf[nbchar++] = readahead;
3164
3165 cur = CUR_CHAR(l);
3166 while (((cur != '<') || (ctxt->token == '<')) &&
3167 ((cur != '&') || (ctxt->token == '&')) &&
3168 (cur != 0)) {
3169 if (!(IS_CHAR(cur))) {
3170 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3171 "Invalid char in CDATA 0x%X\n", cur);
3172 } else {
3173 COPY_BUF(l,buf,nbchar,cur);
3174 }
3175 NEXTL(l);
3176 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3177 buf[nbchar] = 0;
3178
3179 /*
3180 * Ok the segment is to be consumed as chars.
3181 */
3182 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3183 if (areBlanks(ctxt, buf, nbchar)) {
3184 if (ctxt->keepBlanks) {
3185 if (ctxt->sax->characters != NULL)
3186 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3187 } else {
3188 if (ctxt->sax->ignorableWhitespace != NULL)
3189 ctxt->sax->ignorableWhitespace(ctxt->userData,
3190 buf, nbchar);
3191 }
3192 } else {
3193 htmlCheckParagraph(ctxt);
3194 if (ctxt->sax->characters != NULL)
3195 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3196 }
3197 }
3198 nbchar = 0;
3199 SHRINK;
3200 }
3201 cur = CUR_CHAR(l);
3202 }
3203 if (ctxt->instate == XML_PARSER_EOF)
3204 return;
3205 if (nbchar != 0) {
3206 buf[nbchar] = 0;
3207
3208 /*
3209 * Ok the segment is to be consumed as chars.
3210 */
3211 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3212 if (areBlanks(ctxt, buf, nbchar)) {
3213 if (ctxt->keepBlanks) {
3214 if (ctxt->sax->characters != NULL)
3215 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3216 } else {
3217 if (ctxt->sax->ignorableWhitespace != NULL)
3218 ctxt->sax->ignorableWhitespace(ctxt->userData,
3219 buf, nbchar);
3220 }
3221 } else {
3222 htmlCheckParagraph(ctxt);
3223 if (ctxt->sax->characters != NULL)
3224 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3225 }
3226 }
3227 }
3228 }
3229
3230 /**
3231 * htmlParseCharData:
3232 * @ctxt: an HTML parser context
3233 *
3234 * parse a CharData section.
3235 * if we are within a CDATA section ']]>' marks an end of section.
3236 *
3237 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3238 */
3239
3240 static void
htmlParseCharData(htmlParserCtxtPtr ctxt)3241 htmlParseCharData(htmlParserCtxtPtr ctxt) {
3242 htmlParseCharDataInternal(ctxt, 0);
3243 }
3244
3245 /**
3246 * htmlParseExternalID:
3247 * @ctxt: an HTML parser context
3248 * @publicID: a xmlChar** receiving PubidLiteral
3249 *
3250 * Parse an External ID or a Public ID
3251 *
3252 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3253 * | 'PUBLIC' S PubidLiteral S SystemLiteral
3254 *
3255 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3256 *
3257 * Returns the function returns SystemLiteral and in the second
3258 * case publicID receives PubidLiteral, is strict is off
3259 * it is possible to return NULL and have publicID set.
3260 */
3261
3262 static xmlChar *
htmlParseExternalID(htmlParserCtxtPtr ctxt,xmlChar ** publicID)3263 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3264 xmlChar *URI = NULL;
3265
3266 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3267 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3268 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3269 SKIP(6);
3270 if (!IS_BLANK_CH(CUR)) {
3271 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3272 "Space required after 'SYSTEM'\n", NULL, NULL);
3273 }
3274 SKIP_BLANKS;
3275 URI = htmlParseSystemLiteral(ctxt);
3276 if (URI == NULL) {
3277 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3278 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3279 }
3280 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3281 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3282 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3283 SKIP(6);
3284 if (!IS_BLANK_CH(CUR)) {
3285 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3286 "Space required after 'PUBLIC'\n", NULL, NULL);
3287 }
3288 SKIP_BLANKS;
3289 *publicID = htmlParsePubidLiteral(ctxt);
3290 if (*publicID == NULL) {
3291 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3292 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3293 NULL, NULL);
3294 }
3295 SKIP_BLANKS;
3296 if ((CUR == '"') || (CUR == '\'')) {
3297 URI = htmlParseSystemLiteral(ctxt);
3298 }
3299 }
3300 return(URI);
3301 }
3302
3303 /**
3304 * xmlParsePI:
3305 * @ctxt: an XML parser context
3306 *
3307 * parse an XML Processing Instruction.
3308 *
3309 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3310 */
3311 static void
htmlParsePI(htmlParserCtxtPtr ctxt)3312 htmlParsePI(htmlParserCtxtPtr ctxt) {
3313 xmlChar *buf = NULL;
3314 int len = 0;
3315 int size = HTML_PARSER_BUFFER_SIZE;
3316 int cur, l;
3317 int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
3318 XML_MAX_HUGE_LENGTH :
3319 XML_MAX_TEXT_LENGTH;
3320 const xmlChar *target;
3321 xmlParserInputState state;
3322
3323 if ((RAW == '<') && (NXT(1) == '?')) {
3324 state = ctxt->instate;
3325 ctxt->instate = XML_PARSER_PI;
3326 /*
3327 * this is a Processing Instruction.
3328 */
3329 SKIP(2);
3330
3331 /*
3332 * Parse the target name and check for special support like
3333 * namespace.
3334 */
3335 target = htmlParseName(ctxt);
3336 if (target != NULL) {
3337 if (RAW == '>') {
3338 SKIP(1);
3339
3340 /*
3341 * SAX: PI detected.
3342 */
3343 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3344 (ctxt->sax->processingInstruction != NULL))
3345 ctxt->sax->processingInstruction(ctxt->userData,
3346 target, NULL);
3347 ctxt->instate = state;
3348 return;
3349 }
3350 buf = (xmlChar *) xmlMallocAtomic(size);
3351 if (buf == NULL) {
3352 htmlErrMemory(ctxt, NULL);
3353 ctxt->instate = state;
3354 return;
3355 }
3356 cur = CUR;
3357 if (!IS_BLANK(cur)) {
3358 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3359 "ParsePI: PI %s space expected\n", target, NULL);
3360 }
3361 SKIP_BLANKS;
3362 cur = CUR_CHAR(l);
3363 while ((cur != 0) && (cur != '>')) {
3364 if (len + 5 >= size) {
3365 xmlChar *tmp;
3366
3367 size *= 2;
3368 tmp = (xmlChar *) xmlRealloc(buf, size);
3369 if (tmp == NULL) {
3370 htmlErrMemory(ctxt, NULL);
3371 xmlFree(buf);
3372 ctxt->instate = state;
3373 return;
3374 }
3375 buf = tmp;
3376 }
3377 if (IS_CHAR(cur)) {
3378 COPY_BUF(l,buf,len,cur);
3379 } else {
3380 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3381 "Invalid char in processing instruction "
3382 "0x%X\n", cur);
3383 }
3384 if (len > maxLength) {
3385 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3386 "PI %s too long", target, NULL);
3387 xmlFree(buf);
3388 ctxt->instate = state;
3389 return;
3390 }
3391 NEXTL(l);
3392 cur = CUR_CHAR(l);
3393 }
3394 buf[len] = 0;
3395 if (ctxt->instate == XML_PARSER_EOF) {
3396 xmlFree(buf);
3397 return;
3398 }
3399 if (cur != '>') {
3400 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3401 "ParsePI: PI %s never end ...\n", target, NULL);
3402 } else {
3403 SKIP(1);
3404
3405 /*
3406 * SAX: PI detected.
3407 */
3408 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3409 (ctxt->sax->processingInstruction != NULL))
3410 ctxt->sax->processingInstruction(ctxt->userData,
3411 target, buf);
3412 }
3413 xmlFree(buf);
3414 } else {
3415 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3416 "PI is not started correctly", NULL, NULL);
3417 }
3418 ctxt->instate = state;
3419 }
3420 }
3421
/**
 * htmlParseComment:
 * @ctxt:  an HTML parser context
 *
 * Parse an XML (SGML) comment <!-- .... -->
 *
 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
 *
 * Besides the regular '-->' terminator, the malformed forms '<!-->',
 * '<!--->' and '--!>' are reported as errors and still end the
 * comment. The comment text is passed to the SAX comment callback.
 * The parser state is set to XML_PARSER_COMMENT while parsing.
 *
 * NOTE(review): the "unfinished" exit at the bottom does not restore
 * ctxt->instate to its previous value, unlike the other non-EOF exits;
 * confirm whether that is intended.
 */
static void
htmlParseComment(htmlParserCtxtPtr ctxt) {
    xmlChar *buf = NULL;
    int len;
    int size = HTML_PARSER_BUFFER_SIZE;
    /*
     * q, r and cur are three consecutive characters of the input (with
     * their byte lengths ql, rl and l); next is the one following cur.
     */
    int q, ql;
    int r, rl;
    int cur, l;
    int next, nl;
    int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
                    XML_MAX_HUGE_LENGTH :
                    XML_MAX_TEXT_LENGTH;
    xmlParserInputState state;

    /*
     * Check that there is a comment right here.
     */
    if ((RAW != '<') || (NXT(1) != '!') ||
        (NXT(2) != '-') || (NXT(3) != '-')) return;

    state = ctxt->instate;
    ctxt->instate = XML_PARSER_COMMENT;
    SKIP(4);
    buf = (xmlChar *) xmlMallocAtomic(size);
    if (buf == NULL) {
        htmlErrMemory(ctxt, "buffer allocation failed\n");
        ctxt->instate = state;
        return;
    }
    len = 0;
    buf[len] = 0;
    q = CUR_CHAR(ql);
    if (q == 0)
        goto unfinished;
    /* '<!-->' : the comment ends right after its opening. */
    if (q == '>') {
        htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
        cur = '>';
        goto finished;
    }
    NEXTL(ql);
    r = CUR_CHAR(rl);
    if (r == 0)
        goto unfinished;
    /* '<!--->' : same as above with one extra dash. */
    if (q == '-' && r == '>') {
        htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
        cur = '>';
        goto finished;
    }
    NEXTL(rl);
    cur = CUR_CHAR(l);
    /* Loop until the window q, r, cur reads '-', '-', '>'. */
    while ((cur != 0) &&
           ((cur != '>') ||
            (r != '-') || (q != '-'))) {
        NEXTL(l);
        next = CUR_CHAR(nl);

        if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
          htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
                       "Comment incorrectly closed by '--!>'", NULL, NULL);
          cur = '>';
          break;
        }

        /* Keep room for one more character and the terminator. */
        if (len + 5 >= size) {
            xmlChar *tmp;

            size *= 2;
            tmp = (xmlChar *) xmlRealloc(buf, size);
            if (tmp == NULL) {
                xmlFree(buf);
                htmlErrMemory(ctxt, "growing buffer failed\n");
                ctxt->instate = state;
                return;
            }
            buf = tmp;
        }
        /*
         * Only the oldest character of the window is copied, so the
         * two dashes of the terminator never reach the buffer.
         */
        if (IS_CHAR(q)) {
            COPY_BUF(ql,buf,len,q);
        } else {
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
                            "Invalid char in comment 0x%X\n", q);
        }
        if (len > maxLength) {
            htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
                         "comment too long", NULL, NULL);
            xmlFree(buf);
            ctxt->instate = state;
            return;
        }

        /* Slide the window one character forward. */
        q = r;
        ql = rl;
        r = cur;
        rl = l;
        cur = next;
        l = nl;
    }
finished:
    buf[len] = 0;
    if (ctxt->instate == XML_PARSER_EOF) {
        xmlFree(buf);
        return;
    }
    if (cur == '>') {
        NEXT;
        if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
            (!ctxt->disableSAX))
            ctxt->sax->comment(ctxt->userData, buf);
        xmlFree(buf);
        ctxt->instate = state;
        return;
    }

    /* The input ended before a terminator was seen: fall through. */
unfinished:
    htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
                 "Comment not terminated \n<!--%.50s\n", buf, NULL);
    xmlFree(buf);
}
3548
3549 /**
3550 * htmlParseCharRef:
3551 * @ctxt: an HTML parser context
3552 *
3553 * DEPRECATED: Internal function, don't use.
3554 *
3555 * parse Reference declarations
3556 *
3557 * [66] CharRef ::= '&#' [0-9]+ ';' |
3558 * '&#x' [0-9a-fA-F]+ ';'
3559 *
3560 * Returns the value parsed (as an int)
3561 */
3562 int
htmlParseCharRef(htmlParserCtxtPtr ctxt)3563 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3564 int val = 0;
3565
3566 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3567 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3568 "htmlParseCharRef: context error\n",
3569 NULL, NULL);
3570 return(0);
3571 }
3572 if ((CUR == '&') && (NXT(1) == '#') &&
3573 ((NXT(2) == 'x') || NXT(2) == 'X')) {
3574 SKIP(3);
3575 while (CUR != ';') {
3576 if ((CUR >= '0') && (CUR <= '9')) {
3577 if (val < 0x110000)
3578 val = val * 16 + (CUR - '0');
3579 } else if ((CUR >= 'a') && (CUR <= 'f')) {
3580 if (val < 0x110000)
3581 val = val * 16 + (CUR - 'a') + 10;
3582 } else if ((CUR >= 'A') && (CUR <= 'F')) {
3583 if (val < 0x110000)
3584 val = val * 16 + (CUR - 'A') + 10;
3585 } else {
3586 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3587 "htmlParseCharRef: missing semicolon\n",
3588 NULL, NULL);
3589 break;
3590 }
3591 NEXT;
3592 }
3593 if (CUR == ';')
3594 NEXT;
3595 } else if ((CUR == '&') && (NXT(1) == '#')) {
3596 SKIP(2);
3597 while (CUR != ';') {
3598 if ((CUR >= '0') && (CUR <= '9')) {
3599 if (val < 0x110000)
3600 val = val * 10 + (CUR - '0');
3601 } else {
3602 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3603 "htmlParseCharRef: missing semicolon\n",
3604 NULL, NULL);
3605 break;
3606 }
3607 NEXT;
3608 }
3609 if (CUR == ';')
3610 NEXT;
3611 } else {
3612 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3613 "htmlParseCharRef: invalid value\n", NULL, NULL);
3614 }
3615 /*
3616 * Check the value IS_CHAR ...
3617 */
3618 if (IS_CHAR(val)) {
3619 return(val);
3620 } else if (val >= 0x110000) {
3621 htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3622 "htmlParseCharRef: value too large\n", NULL, NULL);
3623 } else {
3624 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3625 "htmlParseCharRef: invalid xmlChar value %d\n",
3626 val);
3627 }
3628 return(0);
3629 }
3630
3631
3632 /**
3633 * htmlParseDocTypeDecl:
3634 * @ctxt: an HTML parser context
3635 *
3636 * parse a DOCTYPE declaration
3637 *
3638 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3639 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3640 */
3641
3642 static void
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt)3643 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3644 const xmlChar *name;
3645 xmlChar *ExternalID = NULL;
3646 xmlChar *URI = NULL;
3647
3648 /*
3649 * We know that '<!DOCTYPE' has been detected.
3650 */
3651 SKIP(9);
3652
3653 SKIP_BLANKS;
3654
3655 /*
3656 * Parse the DOCTYPE name.
3657 */
3658 name = htmlParseName(ctxt);
3659 if (name == NULL) {
3660 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3661 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3662 NULL, NULL);
3663 }
3664 /*
3665 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3666 */
3667
3668 SKIP_BLANKS;
3669
3670 /*
3671 * Check for SystemID and ExternalID
3672 */
3673 URI = htmlParseExternalID(ctxt, &ExternalID);
3674 SKIP_BLANKS;
3675
3676 /*
3677 * We should be at the end of the DOCTYPE declaration.
3678 */
3679 if (CUR != '>') {
3680 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3681 "DOCTYPE improperly terminated\n", NULL, NULL);
3682 /* Ignore bogus content */
3683 while ((CUR != 0) && (CUR != '>') &&
3684 (ctxt->instate != XML_PARSER_EOF))
3685 NEXT;
3686 }
3687 if (CUR == '>')
3688 NEXT;
3689
3690 /*
3691 * Create or update the document accordingly to the DOCTYPE
3692 */
3693 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3694 (!ctxt->disableSAX))
3695 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3696
3697 /*
3698 * Cleanup, since we don't use all those identifiers
3699 */
3700 if (URI != NULL) xmlFree(URI);
3701 if (ExternalID != NULL) xmlFree(ExternalID);
3702 }
3703
3704 /**
3705 * htmlParseAttribute:
3706 * @ctxt: an HTML parser context
3707 * @value: a xmlChar ** used to store the value of the attribute
3708 *
3709 * parse an attribute
3710 *
3711 * [41] Attribute ::= Name Eq AttValue
3712 *
3713 * [25] Eq ::= S? '=' S?
3714 *
3715 * With namespace:
3716 *
3717 * [NS 11] Attribute ::= QName Eq AttValue
3718 *
3719 * Also the case QName == xmlns:??? is handled independently as a namespace
3720 * definition.
3721 *
3722 * Returns the attribute name, and the value in *value.
3723 */
3724
3725 static const xmlChar *
htmlParseAttribute(htmlParserCtxtPtr ctxt,xmlChar ** value)3726 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3727 const xmlChar *name;
3728 xmlChar *val = NULL;
3729
3730 *value = NULL;
3731 name = htmlParseHTMLName(ctxt);
3732 if (name == NULL) {
3733 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3734 "error parsing attribute name\n", NULL, NULL);
3735 return(NULL);
3736 }
3737
3738 /*
3739 * read the value
3740 */
3741 SKIP_BLANKS;
3742 if (CUR == '=') {
3743 NEXT;
3744 SKIP_BLANKS;
3745 val = htmlParseAttValue(ctxt);
3746 }
3747
3748 *value = val;
3749 return(name);
3750 }
3751
3752 /**
3753 * htmlCheckEncoding:
3754 * @ctxt: an HTML parser context
3755 * @attvalue: the attribute value
3756 *
3757 * Checks an http-equiv attribute from a Meta tag to detect
3758 * the encoding
3759 * If a new encoding is detected the parser is switched to decode
3760 * it and pass UTF8
3761 */
3762 static void
htmlCheckEncoding(htmlParserCtxtPtr ctxt,const xmlChar * attvalue)3763 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3764 const xmlChar *encoding;
3765
3766 if (!attvalue)
3767 return;
3768
3769 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3770 if (encoding != NULL) {
3771 encoding += 7;
3772 }
3773 /*
3774 * skip blank
3775 */
3776 if (encoding && IS_BLANK_CH(*encoding))
3777 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3778 if (encoding && *encoding == '=') {
3779 encoding ++;
3780 xmlSetDeclaredEncoding(ctxt, xmlStrdup(encoding));
3781 }
3782 }
3783
3784 /**
3785 * htmlCheckMeta:
3786 * @ctxt: an HTML parser context
3787 * @atts: the attributes values
3788 *
3789 * Checks an attributes from a Meta tag
3790 */
3791 static void
htmlCheckMeta(htmlParserCtxtPtr ctxt,const xmlChar ** atts)3792 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3793 int i;
3794 const xmlChar *att, *value;
3795 int http = 0;
3796 const xmlChar *content = NULL;
3797
3798 if ((ctxt == NULL) || (atts == NULL))
3799 return;
3800
3801 i = 0;
3802 att = atts[i++];
3803 while (att != NULL) {
3804 value = atts[i++];
3805 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3806 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3807 http = 1;
3808 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3809 xmlSetDeclaredEncoding(ctxt, xmlStrdup(value));
3810 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3811 content = value;
3812 att = atts[i++];
3813 }
3814 if ((http) && (content != NULL))
3815 htmlCheckEncoding(ctxt, content);
3816
3817 }
3818
3819 /**
3820 * htmlParseStartTag:
3821 * @ctxt: an HTML parser context
3822 *
3823 * parse a start of tag either for rule element or
3824 * EmptyElement. In both case we don't parse the tag closing chars.
3825 *
3826 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3827 *
3828 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3829 *
3830 * With namespace:
3831 *
3832 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3833 *
3834 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3835 *
3836 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3837 */
3838
3839 static int
htmlParseStartTag(htmlParserCtxtPtr ctxt)3840 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3841 const xmlChar *name;
3842 const xmlChar *attname;
3843 xmlChar *attvalue;
3844 const xmlChar **atts;
3845 int nbatts = 0;
3846 int maxatts;
3847 int meta = 0;
3848 int i;
3849 int discardtag = 0;
3850
3851 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3852 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3853 "htmlParseStartTag: context error\n", NULL, NULL);
3854 return -1;
3855 }
3856 if (ctxt->instate == XML_PARSER_EOF)
3857 return(-1);
3858 if (CUR != '<') return -1;
3859 NEXT;
3860
3861 atts = ctxt->atts;
3862 maxatts = ctxt->maxatts;
3863
3864 GROW;
3865 name = htmlParseHTMLName(ctxt);
3866 if (name == NULL) {
3867 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3868 "htmlParseStartTag: invalid element name\n",
3869 NULL, NULL);
3870 /* Dump the bogus tag like browsers do */
3871 while ((CUR != 0) && (CUR != '>') &&
3872 (ctxt->instate != XML_PARSER_EOF))
3873 NEXT;
3874 return -1;
3875 }
3876 if (xmlStrEqual(name, BAD_CAST"meta"))
3877 meta = 1;
3878
3879 /*
3880 * Check for auto-closure of HTML elements.
3881 */
3882 htmlAutoClose(ctxt, name);
3883
3884 /*
3885 * Check for implied HTML elements.
3886 */
3887 htmlCheckImplied(ctxt, name);
3888
3889 /*
3890 * Avoid html at any level > 0, head at any level != 1
3891 * or any attempt to recurse body
3892 */
3893 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3894 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3895 "htmlParseStartTag: misplaced <html> tag\n",
3896 name, NULL);
3897 discardtag = 1;
3898 ctxt->depth++;
3899 }
3900 if ((ctxt->nameNr != 1) &&
3901 (xmlStrEqual(name, BAD_CAST"head"))) {
3902 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3903 "htmlParseStartTag: misplaced <head> tag\n",
3904 name, NULL);
3905 discardtag = 1;
3906 ctxt->depth++;
3907 }
3908 if (xmlStrEqual(name, BAD_CAST"body")) {
3909 int indx;
3910 for (indx = 0;indx < ctxt->nameNr;indx++) {
3911 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3912 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3913 "htmlParseStartTag: misplaced <body> tag\n",
3914 name, NULL);
3915 discardtag = 1;
3916 ctxt->depth++;
3917 }
3918 }
3919 }
3920
3921 /*
3922 * Now parse the attributes, it ends up with the ending
3923 *
3924 * (S Attribute)* S?
3925 */
3926 SKIP_BLANKS;
3927 while ((CUR != 0) &&
3928 (CUR != '>') &&
3929 ((CUR != '/') || (NXT(1) != '>')) &&
3930 (ctxt->instate != XML_PARSER_EOF)) {
3931 GROW;
3932 attname = htmlParseAttribute(ctxt, &attvalue);
3933 if (attname != NULL) {
3934
3935 /*
3936 * Well formedness requires at most one declaration of an attribute
3937 */
3938 for (i = 0; i < nbatts;i += 2) {
3939 if (xmlStrEqual(atts[i], attname)) {
3940 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3941 "Attribute %s redefined\n", attname, NULL);
3942 if (attvalue != NULL)
3943 xmlFree(attvalue);
3944 goto failed;
3945 }
3946 }
3947
3948 /*
3949 * Add the pair to atts
3950 */
3951 if (atts == NULL) {
3952 maxatts = 22; /* allow for 10 attrs by default */
3953 atts = (const xmlChar **)
3954 xmlMalloc(maxatts * sizeof(xmlChar *));
3955 if (atts == NULL) {
3956 htmlErrMemory(ctxt, NULL);
3957 if (attvalue != NULL)
3958 xmlFree(attvalue);
3959 goto failed;
3960 }
3961 ctxt->atts = atts;
3962 ctxt->maxatts = maxatts;
3963 } else if (nbatts + 4 > maxatts) {
3964 const xmlChar **n;
3965
3966 maxatts *= 2;
3967 n = (const xmlChar **) xmlRealloc((void *) atts,
3968 maxatts * sizeof(const xmlChar *));
3969 if (n == NULL) {
3970 htmlErrMemory(ctxt, NULL);
3971 if (attvalue != NULL)
3972 xmlFree(attvalue);
3973 goto failed;
3974 }
3975 atts = n;
3976 ctxt->atts = atts;
3977 ctxt->maxatts = maxatts;
3978 }
3979 atts[nbatts++] = attname;
3980 atts[nbatts++] = attvalue;
3981 atts[nbatts] = NULL;
3982 atts[nbatts + 1] = NULL;
3983 }
3984 else {
3985 if (attvalue != NULL)
3986 xmlFree(attvalue);
3987 /* Dump the bogus attribute string up to the next blank or
3988 * the end of the tag. */
3989 while ((CUR != 0) &&
3990 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3991 ((CUR != '/') || (NXT(1) != '>')) &&
3992 (ctxt->instate != XML_PARSER_EOF))
3993 NEXT;
3994 }
3995
3996 failed:
3997 SKIP_BLANKS;
3998 }
3999
4000 /*
4001 * Handle specific association to the META tag
4002 */
4003 if (meta && (nbatts != 0))
4004 htmlCheckMeta(ctxt, atts);
4005
4006 /*
4007 * SAX: Start of Element !
4008 */
4009 if (!discardtag) {
4010 htmlnamePush(ctxt, name);
4011 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4012 if (nbatts != 0)
4013 ctxt->sax->startElement(ctxt->userData, name, atts);
4014 else
4015 ctxt->sax->startElement(ctxt->userData, name, NULL);
4016 }
4017 }
4018
4019 if (atts != NULL) {
4020 for (i = 1;i < nbatts;i += 2) {
4021 if (atts[i] != NULL)
4022 xmlFree((xmlChar *) atts[i]);
4023 }
4024 }
4025
4026 return(discardtag);
4027 }
4028
4029 /**
4030 * htmlParseEndTag:
4031 * @ctxt: an HTML parser context
4032 *
4033 * parse an end of tag
4034 *
4035 * [42] ETag ::= '</' Name S? '>'
4036 *
4037 * With namespace
4038 *
4039 * [NS 9] ETag ::= '</' QName S? '>'
4040 *
4041 * Returns 1 if the current level should be closed.
4042 */
4043
4044 static int
htmlParseEndTag(htmlParserCtxtPtr ctxt)4045 htmlParseEndTag(htmlParserCtxtPtr ctxt)
4046 {
4047 const xmlChar *name;
4048 const xmlChar *oldname;
4049 int i, ret;
4050
4051 if ((CUR != '<') || (NXT(1) != '/')) {
4052 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
4053 "htmlParseEndTag: '</' not found\n", NULL, NULL);
4054 return (0);
4055 }
4056 SKIP(2);
4057
4058 name = htmlParseHTMLName(ctxt);
4059 if (name == NULL)
4060 return (0);
4061 /*
4062 * We should definitely be at the ending "S? '>'" part
4063 */
4064 SKIP_BLANKS;
4065 if (CUR != '>') {
4066 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4067 "End tag : expected '>'\n", NULL, NULL);
4068 /* Skip to next '>' */
4069 while ((CUR != 0) && (CUR != '>'))
4070 NEXT;
4071 }
4072 if (CUR == '>')
4073 NEXT;
4074
4075 /*
4076 * if we ignored misplaced tags in htmlParseStartTag don't pop them
4077 * out now.
4078 */
4079 if ((ctxt->depth > 0) &&
4080 (xmlStrEqual(name, BAD_CAST "html") ||
4081 xmlStrEqual(name, BAD_CAST "body") ||
4082 xmlStrEqual(name, BAD_CAST "head"))) {
4083 ctxt->depth--;
4084 return (0);
4085 }
4086
4087 /*
4088 * If the name read is not one of the element in the parsing stack
4089 * then return, it's just an error.
4090 */
4091 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4092 if (xmlStrEqual(name, ctxt->nameTab[i]))
4093 break;
4094 }
4095 if (i < 0) {
4096 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4097 "Unexpected end tag : %s\n", name, NULL);
4098 return (0);
4099 }
4100
4101
4102 /*
4103 * Check for auto-closure of HTML elements.
4104 */
4105
4106 htmlAutoCloseOnClose(ctxt, name);
4107
4108 /*
4109 * Well formedness constraints, opening and closing must match.
4110 * With the exception that the autoclose may have popped stuff out
4111 * of the stack.
4112 */
4113 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4114 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4115 "Opening and ending tag mismatch: %s and %s\n",
4116 name, ctxt->name);
4117 }
4118
4119 /*
4120 * SAX: End of Tag
4121 */
4122 oldname = ctxt->name;
4123 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4124 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4125 ctxt->sax->endElement(ctxt->userData, name);
4126 htmlNodeInfoPop(ctxt);
4127 htmlnamePop(ctxt);
4128 ret = 1;
4129 } else {
4130 ret = 0;
4131 }
4132
4133 return (ret);
4134 }
4135
4136
4137 /**
4138 * htmlParseReference:
4139 * @ctxt: an HTML parser context
4140 *
4141 * parse and handle entity references in content,
4142 * this will end-up in a call to character() since this is either a
4143 * CharRef, or a predefined entity.
4144 */
4145 static void
htmlParseReference(htmlParserCtxtPtr ctxt)4146 htmlParseReference(htmlParserCtxtPtr ctxt) {
4147 const htmlEntityDesc * ent;
4148 xmlChar out[6];
4149 const xmlChar *name;
4150 if (CUR != '&') return;
4151
4152 if (NXT(1) == '#') {
4153 unsigned int c;
4154 int bits, i = 0;
4155
4156 c = htmlParseCharRef(ctxt);
4157 if (c == 0)
4158 return;
4159
4160 if (c < 0x80) { out[i++]= c; bits= -6; }
4161 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4162 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4163 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4164
4165 for ( ; bits >= 0; bits-= 6) {
4166 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4167 }
4168 out[i] = 0;
4169
4170 htmlCheckParagraph(ctxt);
4171 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4172 ctxt->sax->characters(ctxt->userData, out, i);
4173 } else {
4174 ent = htmlParseEntityRef(ctxt, &name);
4175 if (name == NULL) {
4176 htmlCheckParagraph(ctxt);
4177 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4178 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4179 return;
4180 }
4181 if ((ent == NULL) || !(ent->value > 0)) {
4182 htmlCheckParagraph(ctxt);
4183 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4184 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4185 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4186 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4187 }
4188 } else {
4189 unsigned int c;
4190 int bits, i = 0;
4191
4192 c = ent->value;
4193 if (c < 0x80)
4194 { out[i++]= c; bits= -6; }
4195 else if (c < 0x800)
4196 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4197 else if (c < 0x10000)
4198 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4199 else
4200 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4201
4202 for ( ; bits >= 0; bits-= 6) {
4203 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4204 }
4205 out[i] = 0;
4206
4207 htmlCheckParagraph(ctxt);
4208 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4209 ctxt->sax->characters(ctxt->userData, out, i);
4210 }
4211 }
4212 }
4213
4214 /**
4215 * htmlParseContent:
4216 * @ctxt: an HTML parser context
4217 *
4218 * Parse a content: comment, sub-element, reference or text.
4219 * Kept for compatibility with old code
4220 */
4221
4222 static void
htmlParseContent(htmlParserCtxtPtr ctxt)4223 htmlParseContent(htmlParserCtxtPtr ctxt) {
4224 xmlChar *currentNode;
4225 int depth;
4226 const xmlChar *name;
4227
4228 currentNode = xmlStrdup(ctxt->name);
4229 depth = ctxt->nameNr;
4230 while (1) {
4231 GROW;
4232
4233 if (ctxt->instate == XML_PARSER_EOF)
4234 break;
4235
4236 /*
4237 * Our tag or one of it's parent or children is ending.
4238 */
4239 if ((CUR == '<') && (NXT(1) == '/')) {
4240 if (htmlParseEndTag(ctxt) &&
4241 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4242 if (currentNode != NULL)
4243 xmlFree(currentNode);
4244 return;
4245 }
4246 continue; /* while */
4247 }
4248
4249 else if ((CUR == '<') &&
4250 ((IS_ASCII_LETTER(NXT(1))) ||
4251 (NXT(1) == '_') || (NXT(1) == ':'))) {
4252 name = htmlParseHTMLName_nonInvasive(ctxt);
4253 if (name == NULL) {
4254 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4255 "htmlParseStartTag: invalid element name\n",
4256 NULL, NULL);
4257 /* Dump the bogus tag like browsers do */
4258 while ((CUR != 0) && (CUR != '>'))
4259 NEXT;
4260
4261 if (currentNode != NULL)
4262 xmlFree(currentNode);
4263 return;
4264 }
4265
4266 if (ctxt->name != NULL) {
4267 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4268 htmlAutoClose(ctxt, name);
4269 continue;
4270 }
4271 }
4272 }
4273
4274 /*
4275 * Has this node been popped out during parsing of
4276 * the next element
4277 */
4278 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4279 (!xmlStrEqual(currentNode, ctxt->name)))
4280 {
4281 if (currentNode != NULL) xmlFree(currentNode);
4282 return;
4283 }
4284
4285 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4286 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4287 /*
4288 * Handle SCRIPT/STYLE separately
4289 */
4290 htmlParseScript(ctxt);
4291 }
4292
4293 else if ((CUR == '<') && (NXT(1) == '!')) {
4294 /*
4295 * Sometimes DOCTYPE arrives in the middle of the document
4296 */
4297 if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4298 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4299 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4300 (UPP(8) == 'E')) {
4301 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4302 "Misplaced DOCTYPE declaration\n",
4303 BAD_CAST "DOCTYPE" , NULL);
4304 htmlParseDocTypeDecl(ctxt);
4305 }
4306 /*
4307 * First case : a comment
4308 */
4309 else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4310 htmlParseComment(ctxt);
4311 }
4312 else {
4313 htmlSkipBogusComment(ctxt);
4314 }
4315 }
4316
4317 /*
4318 * Second case : a Processing Instruction.
4319 */
4320 else if ((CUR == '<') && (NXT(1) == '?')) {
4321 htmlParsePI(ctxt);
4322 }
4323
4324 /*
4325 * Third case : a sub-element.
4326 */
4327 else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4328 htmlParseElement(ctxt);
4329 }
4330 else if (CUR == '<') {
4331 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4332 (ctxt->sax->characters != NULL))
4333 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4334 NEXT;
4335 }
4336
4337 /*
4338 * Fourth case : a reference. If if has not been resolved,
4339 * parsing returns it's Name, create the node
4340 */
4341 else if (CUR == '&') {
4342 htmlParseReference(ctxt);
4343 }
4344
4345 /*
4346 * Fifth case : end of the resource
4347 */
4348 else if (CUR == 0) {
4349 htmlAutoCloseOnEnd(ctxt);
4350 break;
4351 }
4352
4353 /*
4354 * Last case, text. Note that References are handled directly.
4355 */
4356 else {
4357 htmlParseCharData(ctxt);
4358 }
4359
4360 SHRINK;
4361 GROW;
4362 }
4363 if (currentNode != NULL) xmlFree(currentNode);
4364 }
4365
4366 /**
4367 * htmlParseElement:
4368 * @ctxt: an HTML parser context
4369 *
4370 * DEPRECATED: Internal function, don't use.
4371 *
4372 * parse an HTML element, this is highly recursive
4373 * this is kept for compatibility with previous code versions
4374 *
4375 * [39] element ::= EmptyElemTag | STag content ETag
4376 *
4377 * [41] Attribute ::= Name Eq AttValue
4378 */
4379
4380 void
htmlParseElement(htmlParserCtxtPtr ctxt)4381 htmlParseElement(htmlParserCtxtPtr ctxt) {
4382 const xmlChar *name;
4383 xmlChar *currentNode = NULL;
4384 const htmlElemDesc * info;
4385 htmlParserNodeInfo node_info;
4386 int failed;
4387 int depth;
4388 const xmlChar *oldptr;
4389
4390 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4391 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4392 "htmlParseElement: context error\n", NULL, NULL);
4393 return;
4394 }
4395
4396 if (ctxt->instate == XML_PARSER_EOF)
4397 return;
4398
4399 /* Capture start position */
4400 if (ctxt->record_info) {
4401 node_info.begin_pos = ctxt->input->consumed +
4402 (CUR_PTR - ctxt->input->base);
4403 node_info.begin_line = ctxt->input->line;
4404 }
4405
4406 failed = htmlParseStartTag(ctxt);
4407 name = ctxt->name;
4408 if ((failed == -1) || (name == NULL)) {
4409 if (CUR == '>')
4410 NEXT;
4411 return;
4412 }
4413
4414 /*
4415 * Lookup the info for that element.
4416 */
4417 info = htmlTagLookup(name);
4418 if (info == NULL) {
4419 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4420 "Tag %s invalid\n", name, NULL);
4421 }
4422
4423 /*
4424 * Check for an Empty Element labeled the XML/SGML way
4425 */
4426 if ((CUR == '/') && (NXT(1) == '>')) {
4427 SKIP(2);
4428 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4429 ctxt->sax->endElement(ctxt->userData, name);
4430 htmlnamePop(ctxt);
4431 return;
4432 }
4433
4434 if (CUR == '>') {
4435 NEXT;
4436 } else {
4437 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4438 "Couldn't find end of Start Tag %s\n", name, NULL);
4439
4440 /*
4441 * end of parsing of this node.
4442 */
4443 if (xmlStrEqual(name, ctxt->name)) {
4444 nodePop(ctxt);
4445 htmlnamePop(ctxt);
4446 }
4447
4448 /*
4449 * Capture end position and add node
4450 */
4451 if (ctxt->record_info) {
4452 node_info.end_pos = ctxt->input->consumed +
4453 (CUR_PTR - ctxt->input->base);
4454 node_info.end_line = ctxt->input->line;
4455 node_info.node = ctxt->node;
4456 xmlParserAddNodeInfo(ctxt, &node_info);
4457 }
4458 return;
4459 }
4460
4461 /*
4462 * Check for an Empty Element from DTD definition
4463 */
4464 if ((info != NULL) && (info->empty)) {
4465 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4466 ctxt->sax->endElement(ctxt->userData, name);
4467 htmlnamePop(ctxt);
4468 return;
4469 }
4470
4471 /*
4472 * Parse the content of the element:
4473 */
4474 currentNode = xmlStrdup(ctxt->name);
4475 depth = ctxt->nameNr;
4476 while (CUR != 0) {
4477 oldptr = ctxt->input->cur;
4478 htmlParseContent(ctxt);
4479 if (oldptr==ctxt->input->cur) break;
4480 if (ctxt->nameNr < depth) break;
4481 }
4482
4483 /*
4484 * Capture end position and add node
4485 */
4486 if ( currentNode != NULL && ctxt->record_info ) {
4487 node_info.end_pos = ctxt->input->consumed +
4488 (CUR_PTR - ctxt->input->base);
4489 node_info.end_line = ctxt->input->line;
4490 node_info.node = ctxt->node;
4491 xmlParserAddNodeInfo(ctxt, &node_info);
4492 }
4493 if (CUR == 0) {
4494 htmlAutoCloseOnEnd(ctxt);
4495 }
4496
4497 if (currentNode != NULL)
4498 xmlFree(currentNode);
4499 }
4500
4501 static void
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt)4502 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4503 /*
4504 * Capture end position and add node
4505 */
4506 if ( ctxt->node != NULL && ctxt->record_info ) {
4507 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4508 (CUR_PTR - ctxt->input->base);
4509 ctxt->nodeInfo->end_line = ctxt->input->line;
4510 ctxt->nodeInfo->node = ctxt->node;
4511 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4512 htmlNodeInfoPop(ctxt);
4513 }
4514 if (CUR == 0) {
4515 htmlAutoCloseOnEnd(ctxt);
4516 }
4517 }
4518
4519 /**
4520 * htmlParseElementInternal:
4521 * @ctxt: an HTML parser context
4522 *
4523 * parse an HTML element, new version, non recursive
4524 *
4525 * [39] element ::= EmptyElemTag | STag content ETag
4526 *
4527 * [41] Attribute ::= Name Eq AttValue
4528 */
4529
4530 static void
htmlParseElementInternal(htmlParserCtxtPtr ctxt)4531 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4532 const xmlChar *name;
4533 const htmlElemDesc * info;
4534 htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4535 int failed;
4536
4537 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4538 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4539 "htmlParseElementInternal: context error\n", NULL, NULL);
4540 return;
4541 }
4542
4543 if (ctxt->instate == XML_PARSER_EOF)
4544 return;
4545
4546 /* Capture start position */
4547 if (ctxt->record_info) {
4548 node_info.begin_pos = ctxt->input->consumed +
4549 (CUR_PTR - ctxt->input->base);
4550 node_info.begin_line = ctxt->input->line;
4551 }
4552
4553 failed = htmlParseStartTag(ctxt);
4554 name = ctxt->name;
4555 if ((failed == -1) || (name == NULL)) {
4556 if (CUR == '>')
4557 NEXT;
4558 return;
4559 }
4560
4561 /*
4562 * Lookup the info for that element.
4563 */
4564 info = htmlTagLookup(name);
4565 if (info == NULL) {
4566 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4567 "Tag %s invalid\n", name, NULL);
4568 }
4569
4570 /*
4571 * Check for an Empty Element labeled the XML/SGML way
4572 */
4573 if ((CUR == '/') && (NXT(1) == '>')) {
4574 SKIP(2);
4575 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4576 ctxt->sax->endElement(ctxt->userData, name);
4577 htmlnamePop(ctxt);
4578 return;
4579 }
4580
4581 if (CUR == '>') {
4582 NEXT;
4583 } else {
4584 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4585 "Couldn't find end of Start Tag %s\n", name, NULL);
4586
4587 /*
4588 * end of parsing of this node.
4589 */
4590 if (xmlStrEqual(name, ctxt->name)) {
4591 nodePop(ctxt);
4592 htmlnamePop(ctxt);
4593 }
4594
4595 if (ctxt->record_info)
4596 htmlNodeInfoPush(ctxt, &node_info);
4597 htmlParserFinishElementParsing(ctxt);
4598 return;
4599 }
4600
4601 /*
4602 * Check for an Empty Element from DTD definition
4603 */
4604 if ((info != NULL) && (info->empty)) {
4605 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4606 ctxt->sax->endElement(ctxt->userData, name);
4607 htmlnamePop(ctxt);
4608 return;
4609 }
4610
4611 if (ctxt->record_info)
4612 htmlNodeInfoPush(ctxt, &node_info);
4613 }
4614
4615 /**
4616 * htmlParseContentInternal:
4617 * @ctxt: an HTML parser context
4618 *
4619 * Parse a content: comment, sub-element, reference or text.
4620 * New version for non recursive htmlParseElementInternal
4621 */
4622
4623 static void
htmlParseContentInternal(htmlParserCtxtPtr ctxt)4624 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4625 xmlChar *currentNode;
4626 int depth;
4627 const xmlChar *name;
4628
4629 depth = ctxt->nameNr;
4630 if (depth <= 0) {
4631 currentNode = NULL;
4632 } else {
4633 currentNode = xmlStrdup(ctxt->name);
4634 if (currentNode == NULL) {
4635 htmlErrMemory(ctxt, NULL);
4636 return;
4637 }
4638 }
4639 while (1) {
4640 GROW;
4641
4642 if (ctxt->instate == XML_PARSER_EOF)
4643 break;
4644
4645 /*
4646 * Our tag or one of it's parent or children is ending.
4647 */
4648 if ((CUR == '<') && (NXT(1) == '/')) {
4649 if (htmlParseEndTag(ctxt) &&
4650 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4651 if (currentNode != NULL)
4652 xmlFree(currentNode);
4653
4654 depth = ctxt->nameNr;
4655 if (depth <= 0) {
4656 currentNode = NULL;
4657 } else {
4658 currentNode = xmlStrdup(ctxt->name);
4659 if (currentNode == NULL) {
4660 htmlErrMemory(ctxt, NULL);
4661 break;
4662 }
4663 }
4664 }
4665 continue; /* while */
4666 }
4667
4668 else if ((CUR == '<') &&
4669 ((IS_ASCII_LETTER(NXT(1))) ||
4670 (NXT(1) == '_') || (NXT(1) == ':'))) {
4671 name = htmlParseHTMLName_nonInvasive(ctxt);
4672 if (name == NULL) {
4673 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4674 "htmlParseStartTag: invalid element name\n",
4675 NULL, NULL);
4676 /* Dump the bogus tag like browsers do */
4677 while ((CUR == 0) && (CUR != '>'))
4678 NEXT;
4679
4680 htmlParserFinishElementParsing(ctxt);
4681 if (currentNode != NULL)
4682 xmlFree(currentNode);
4683
4684 currentNode = xmlStrdup(ctxt->name);
4685 if (currentNode == NULL) {
4686 htmlErrMemory(ctxt, NULL);
4687 break;
4688 }
4689 depth = ctxt->nameNr;
4690 continue;
4691 }
4692
4693 if (ctxt->name != NULL) {
4694 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4695 htmlAutoClose(ctxt, name);
4696 continue;
4697 }
4698 }
4699 }
4700
4701 /*
4702 * Has this node been popped out during parsing of
4703 * the next element
4704 */
4705 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4706 (!xmlStrEqual(currentNode, ctxt->name)))
4707 {
4708 htmlParserFinishElementParsing(ctxt);
4709 if (currentNode != NULL) xmlFree(currentNode);
4710
4711 currentNode = xmlStrdup(ctxt->name);
4712 if (currentNode == NULL) {
4713 htmlErrMemory(ctxt, NULL);
4714 break;
4715 }
4716 depth = ctxt->nameNr;
4717 continue;
4718 }
4719
4720 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4721 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4722 /*
4723 * Handle SCRIPT/STYLE separately
4724 */
4725 htmlParseScript(ctxt);
4726 }
4727
4728 else if ((CUR == '<') && (NXT(1) == '!')) {
4729 /*
4730 * Sometimes DOCTYPE arrives in the middle of the document
4731 */
4732 if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4733 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4734 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4735 (UPP(8) == 'E')) {
4736 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4737 "Misplaced DOCTYPE declaration\n",
4738 BAD_CAST "DOCTYPE" , NULL);
4739 htmlParseDocTypeDecl(ctxt);
4740 }
4741 /*
4742 * First case : a comment
4743 */
4744 else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4745 htmlParseComment(ctxt);
4746 }
4747 else {
4748 htmlSkipBogusComment(ctxt);
4749 }
4750 }
4751
4752 /*
4753 * Second case : a Processing Instruction.
4754 */
4755 else if ((CUR == '<') && (NXT(1) == '?')) {
4756 htmlParsePI(ctxt);
4757 }
4758
4759 /*
4760 * Third case : a sub-element.
4761 */
4762 else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4763 htmlParseElementInternal(ctxt);
4764 if (currentNode != NULL) xmlFree(currentNode);
4765
4766 currentNode = xmlStrdup(ctxt->name);
4767 if (currentNode == NULL) {
4768 htmlErrMemory(ctxt, NULL);
4769 break;
4770 }
4771 depth = ctxt->nameNr;
4772 }
4773 else if (CUR == '<') {
4774 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4775 (ctxt->sax->characters != NULL))
4776 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4777 NEXT;
4778 }
4779
4780 /*
4781 * Fourth case : a reference. If if has not been resolved,
4782 * parsing returns it's Name, create the node
4783 */
4784 else if (CUR == '&') {
4785 htmlParseReference(ctxt);
4786 }
4787
4788 /*
4789 * Fifth case : end of the resource
4790 */
4791 else if (CUR == 0) {
4792 htmlAutoCloseOnEnd(ctxt);
4793 break;
4794 }
4795
4796 /*
4797 * Last case, text. Note that References are handled directly.
4798 */
4799 else {
4800 htmlParseCharData(ctxt);
4801 }
4802
4803 SHRINK;
4804 GROW;
4805 }
4806 if (currentNode != NULL) xmlFree(currentNode);
4807 }
4808
4809 /**
4810 * htmlParseContent:
4811 * @ctxt: an HTML parser context
4812 *
4813 * Parse a content: comment, sub-element, reference or text.
4814 * This is the entry point when called from parser.c
4815 */
4816
4817 void
__htmlParseContent(void * ctxt)4818 __htmlParseContent(void *ctxt) {
4819 if (ctxt != NULL)
4820 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4821 }
4822
4823 /**
4824 * htmlParseDocument:
4825 * @ctxt: an HTML parser context
4826 *
4827 * parse an HTML document (and build a tree if using the standard SAX
4828 * interface).
4829 *
4830 * Returns 0, -1 in case of error. the parser context is augmented
4831 * as a result of the parsing.
4832 */
4833
4834 int
htmlParseDocument(htmlParserCtxtPtr ctxt)4835 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4836 xmlDtdPtr dtd;
4837
4838 xmlInitParser();
4839
4840 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4841 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4842 "htmlParseDocument: context error\n", NULL, NULL);
4843 return(XML_ERR_INTERNAL_ERROR);
4844 }
4845
4846 /*
4847 * SAX: beginning of the document processing.
4848 */
4849 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4850 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4851
4852 xmlDetectEncoding(ctxt);
4853
4854 /*
4855 * Wipe out everything which is before the first '<'
4856 */
4857 SKIP_BLANKS;
4858 if (CUR == 0) {
4859 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4860 "Document is empty\n", NULL, NULL);
4861 }
4862
4863 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4864 ctxt->sax->startDocument(ctxt->userData);
4865
4866
4867 /*
4868 * Parse possible comments and PIs before any content
4869 */
4870 while (((CUR == '<') && (NXT(1) == '!') &&
4871 (NXT(2) == '-') && (NXT(3) == '-')) ||
4872 ((CUR == '<') && (NXT(1) == '?'))) {
4873 htmlParseComment(ctxt);
4874 htmlParsePI(ctxt);
4875 SKIP_BLANKS;
4876 }
4877
4878
4879 /*
4880 * Then possibly doc type declaration(s) and more Misc
4881 * (doctypedecl Misc*)?
4882 */
4883 if ((CUR == '<') && (NXT(1) == '!') &&
4884 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4885 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4886 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4887 (UPP(8) == 'E')) {
4888 htmlParseDocTypeDecl(ctxt);
4889 }
4890 SKIP_BLANKS;
4891
4892 /*
4893 * Parse possible comments and PIs before any content
4894 */
4895 while (((CUR == '<') && (NXT(1) == '!') &&
4896 (NXT(2) == '-') && (NXT(3) == '-')) ||
4897 ((CUR == '<') && (NXT(1) == '?'))) {
4898 htmlParseComment(ctxt);
4899 htmlParsePI(ctxt);
4900 SKIP_BLANKS;
4901 }
4902
4903 /*
4904 * Time to start parsing the tree itself
4905 */
4906 htmlParseContentInternal(ctxt);
4907
4908 /*
4909 * autoclose
4910 */
4911 if (CUR == 0)
4912 htmlAutoCloseOnEnd(ctxt);
4913
4914
4915 /*
4916 * SAX: end of the document processing.
4917 */
4918 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4919 ctxt->sax->endDocument(ctxt->userData);
4920
4921 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4922 dtd = xmlGetIntSubset(ctxt->myDoc);
4923 if (dtd == NULL)
4924 ctxt->myDoc->intSubset =
4925 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4926 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4927 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4928 }
4929 if (! ctxt->wellFormed) return(-1);
4930 return(0);
4931 }
4932
4933
4934 /************************************************************************
4935 * *
4936 * Parser contexts handling *
4937 * *
4938 ************************************************************************/
4939
4940 /**
4941 * htmlInitParserCtxt:
4942 * @ctxt: an HTML parser context
4943 * @sax: SAX handler
4944 * @userData: user data
4945 *
4946 * Initialize a parser context
4947 *
4948 * Returns 0 in case of success and -1 in case of error
4949 */
4950
4951 static int
htmlInitParserCtxt(htmlParserCtxtPtr ctxt,const htmlSAXHandler * sax,void * userData)4952 htmlInitParserCtxt(htmlParserCtxtPtr ctxt, const htmlSAXHandler *sax,
4953 void *userData)
4954 {
4955 if (ctxt == NULL) return(-1);
4956 memset(ctxt, 0, sizeof(htmlParserCtxt));
4957
4958 ctxt->dict = xmlDictCreate();
4959 if (ctxt->dict == NULL) {
4960 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4961 return(-1);
4962 }
4963
4964 if (ctxt->sax == NULL)
4965 ctxt->sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4966 if (ctxt->sax == NULL) {
4967 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4968 return(-1);
4969 }
4970 if (sax == NULL) {
4971 memset(ctxt->sax, 0, sizeof(htmlSAXHandler));
4972 xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);
4973 ctxt->userData = ctxt;
4974 } else {
4975 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4976 ctxt->userData = userData ? userData : ctxt;
4977 }
4978
4979 /* Allocate the Input stack */
4980 ctxt->inputTab = (htmlParserInputPtr *)
4981 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4982 if (ctxt->inputTab == NULL) {
4983 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4984 ctxt->inputNr = 0;
4985 ctxt->inputMax = 0;
4986 ctxt->input = NULL;
4987 return(-1);
4988 }
4989 ctxt->inputNr = 0;
4990 ctxt->inputMax = 5;
4991 ctxt->input = NULL;
4992 ctxt->version = NULL;
4993 ctxt->encoding = NULL;
4994 ctxt->standalone = -1;
4995 ctxt->instate = XML_PARSER_START;
4996
4997 /* Allocate the Node stack */
4998 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4999 if (ctxt->nodeTab == NULL) {
5000 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5001 ctxt->nodeNr = 0;
5002 ctxt->nodeMax = 0;
5003 ctxt->node = NULL;
5004 ctxt->inputNr = 0;
5005 ctxt->inputMax = 0;
5006 ctxt->input = NULL;
5007 return(-1);
5008 }
5009 ctxt->nodeNr = 0;
5010 ctxt->nodeMax = 10;
5011 ctxt->node = NULL;
5012
5013 /* Allocate the Name stack */
5014 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
5015 if (ctxt->nameTab == NULL) {
5016 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5017 ctxt->nameNr = 0;
5018 ctxt->nameMax = 0;
5019 ctxt->name = NULL;
5020 ctxt->nodeNr = 0;
5021 ctxt->nodeMax = 0;
5022 ctxt->node = NULL;
5023 ctxt->inputNr = 0;
5024 ctxt->inputMax = 0;
5025 ctxt->input = NULL;
5026 return(-1);
5027 }
5028 ctxt->nameNr = 0;
5029 ctxt->nameMax = 10;
5030 ctxt->name = NULL;
5031
5032 ctxt->nodeInfoTab = NULL;
5033 ctxt->nodeInfoNr = 0;
5034 ctxt->nodeInfoMax = 0;
5035
5036 ctxt->myDoc = NULL;
5037 ctxt->wellFormed = 1;
5038 ctxt->replaceEntities = 0;
5039 ctxt->linenumbers = xmlLineNumbersDefaultValue;
5040 ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
5041 ctxt->html = 1;
5042 ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
5043 ctxt->vctxt.userData = ctxt;
5044 ctxt->vctxt.error = xmlParserValidityError;
5045 ctxt->vctxt.warning = xmlParserValidityWarning;
5046 ctxt->record_info = 0;
5047 ctxt->validate = 0;
5048 ctxt->checkIndex = 0;
5049 ctxt->catalogs = NULL;
5050 xmlInitNodeInfoSeq(&ctxt->node_seq);
5051 return(0);
5052 }
5053
5054 /**
5055 * htmlFreeParserCtxt:
5056 * @ctxt: an HTML parser context
5057 *
5058 * Free all the memory used by a parser context. However the parsed
5059 * document in ctxt->myDoc is not freed.
5060 */
5061
5062 void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)5063 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
5064 {
5065 xmlFreeParserCtxt(ctxt);
5066 }
5067
5068 /**
5069 * htmlNewParserCtxt:
5070 *
5071 * Allocate and initialize a new parser context.
5072 *
5073 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5074 */
5075
5076 htmlParserCtxtPtr
htmlNewParserCtxt(void)5077 htmlNewParserCtxt(void)
5078 {
5079 return(htmlNewSAXParserCtxt(NULL, NULL));
5080 }
5081
5082 /**
5083 * htmlNewSAXParserCtxt:
5084 * @sax: SAX handler
5085 * @userData: user data
5086 *
5087 * Allocate and initialize a new SAX parser context. If userData is NULL,
5088 * the parser context will be passed as user data.
5089 *
5090 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5091 */
5092
5093 htmlParserCtxtPtr
htmlNewSAXParserCtxt(const htmlSAXHandler * sax,void * userData)5094 htmlNewSAXParserCtxt(const htmlSAXHandler *sax, void *userData)
5095 {
5096 xmlParserCtxtPtr ctxt;
5097
5098 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5099 if (ctxt == NULL) {
5100 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5101 return(NULL);
5102 }
5103 memset(ctxt, 0, sizeof(xmlParserCtxt));
5104 if (htmlInitParserCtxt(ctxt, sax, userData) < 0) {
5105 htmlFreeParserCtxt(ctxt);
5106 return(NULL);
5107 }
5108 return(ctxt);
5109 }
5110
5111 /**
5112 * htmlCreateMemoryParserCtxt:
5113 * @buffer: a pointer to a char array
5114 * @size: the size of the array
5115 *
5116 * Create a parser context for an HTML in-memory document.
5117 *
5118 * Returns the new parser context or NULL
5119 */
5120 htmlParserCtxtPtr
htmlCreateMemoryParserCtxt(const char * buffer,int size)5121 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5122 xmlParserCtxtPtr ctxt;
5123 xmlParserInputPtr input;
5124 xmlParserInputBufferPtr buf;
5125
5126 if (buffer == NULL)
5127 return(NULL);
5128 if (size <= 0)
5129 return(NULL);
5130
5131 ctxt = htmlNewParserCtxt();
5132 if (ctxt == NULL)
5133 return(NULL);
5134
5135 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5136 if (buf == NULL) {
5137 xmlFreeParserCtxt(ctxt);
5138 return(NULL);
5139 }
5140
5141 input = xmlNewInputStream(ctxt);
5142 if (input == NULL) {
5143 xmlFreeParserInputBuffer(buf);
5144 xmlFreeParserCtxt(ctxt);
5145 return(NULL);
5146 }
5147
5148 input->filename = NULL;
5149 input->buf = buf;
5150 xmlBufResetInput(buf->buffer, input);
5151
5152 inputPush(ctxt, input);
5153 return(ctxt);
5154 }
5155
5156 /**
5157 * htmlCreateDocParserCtxt:
5158 * @str: a pointer to an array of xmlChar
5159 * @encoding: a free form C string describing the HTML document encoding, or NULL
5160 *
5161 * Create a parser context for an HTML document.
5162 *
5163 * TODO: check the need to add encoding handling there
5164 *
5165 * Returns the new parser context or NULL
5166 */
5167 static htmlParserCtxtPtr
htmlCreateDocParserCtxt(const xmlChar * str,const char * encoding)5168 htmlCreateDocParserCtxt(const xmlChar *str, const char *encoding) {
5169 xmlParserCtxtPtr ctxt;
5170 xmlParserInputPtr input;
5171 xmlParserInputBufferPtr buf;
5172
5173 if (str == NULL)
5174 return(NULL);
5175
5176 ctxt = htmlNewParserCtxt();
5177 if (ctxt == NULL)
5178 return(NULL);
5179
5180 buf = xmlParserInputBufferCreateString(str);
5181 if (buf == NULL) {
5182 xmlFreeParserCtxt(ctxt);
5183 return(NULL);
5184 }
5185
5186 input = xmlNewInputStream(ctxt);
5187 if (input == NULL) {
5188 xmlFreeParserInputBuffer(buf);
5189 xmlFreeParserCtxt(ctxt);
5190 return(NULL);
5191 }
5192
5193 input->filename = NULL;
5194 input->buf = buf;
5195 xmlBufResetInput(buf->buffer, input);
5196
5197 inputPush(ctxt, input);
5198
5199 if (encoding != NULL) {
5200 xmlCharEncoding enc;
5201 xmlCharEncodingHandlerPtr handler;
5202
5203 enc = xmlParseCharEncoding(encoding);
5204 /*
5205 * registered set of known encodings
5206 */
5207 if (enc != XML_CHAR_ENCODING_ERROR) {
5208 xmlSwitchEncoding(ctxt, enc);
5209 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5210 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5211 "Unsupported encoding %s\n",
5212 (const xmlChar *) encoding, NULL);
5213 }
5214 } else {
5215 /*
5216 * fallback for unknown encodings
5217 */
5218 handler = xmlFindCharEncodingHandler((const char *) encoding);
5219 if (handler != NULL) {
5220 xmlSwitchToEncoding(ctxt, handler);
5221 } else {
5222 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5223 "Unsupported encoding %s\n",
5224 (const xmlChar *) encoding, NULL);
5225 }
5226 }
5227 }
5228
5229 return(ctxt);
5230 }
5231
5232 #ifdef LIBXML_PUSH_ENABLED
5233 /************************************************************************
5234 * *
5235 * Progressive parsing interfaces *
5236 * *
5237 ************************************************************************/
5238
5239 /**
5240 * htmlParseLookupSequence:
5241 * @ctxt: an HTML parser context
5242 * @first: the first char to lookup
5243 * @next: the next char to lookup or zero
5244 * @third: the next char to lookup or zero
5245 * @ignoreattrval: skip over attribute values
5246 *
5247 * Try to find if a sequence (first, next, third) or just (first next) or
5248 * (first) is available in the input stream.
5249 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5250 * to avoid rescanning sequences of bytes, it DOES change the state of the
5251 * parser, do not use liberally.
5252 * This is basically similar to xmlParseLookupSequence()
5253 *
5254 * Returns the index to the current parsing point if the full sequence
5255 * is available, -1 otherwise.
5256 */
5257 static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt,xmlChar first,xmlChar next,xmlChar third,int ignoreattrval)5258 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5259 xmlChar next, xmlChar third, int ignoreattrval)
5260 {
5261 size_t base, len;
5262 htmlParserInputPtr in;
5263 const xmlChar *buf;
5264 int quote;
5265
5266 in = ctxt->input;
5267 if (in == NULL)
5268 return (-1);
5269
5270 base = ctxt->checkIndex;
5271 quote = ctxt->endCheckState;
5272
5273 buf = in->cur;
5274 len = in->end - in->cur;
5275
5276 /* take into account the sequence length */
5277 if (third)
5278 len -= 2;
5279 else if (next)
5280 len--;
5281 for (; base < len; base++) {
5282 if (base >= INT_MAX / 2) {
5283 ctxt->checkIndex = 0;
5284 ctxt->endCheckState = 0;
5285 return (base - 2);
5286 }
5287 if (ignoreattrval) {
5288 if (quote) {
5289 if (buf[base] == quote)
5290 quote = 0;
5291 continue;
5292 }
5293 if (buf[base] == '"' || buf[base] == '\'') {
5294 quote = buf[base];
5295 continue;
5296 }
5297 }
5298 if (buf[base] == first) {
5299 if (third != 0) {
5300 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5301 continue;
5302 } else if (next != 0) {
5303 if (buf[base + 1] != next)
5304 continue;
5305 }
5306 ctxt->checkIndex = 0;
5307 ctxt->endCheckState = 0;
5308 return (base);
5309 }
5310 }
5311 ctxt->checkIndex = base;
5312 ctxt->endCheckState = quote;
5313 return (-1);
5314 }
5315
5316 /**
5317 * htmlParseLookupCommentEnd:
5318 * @ctxt: an HTML parser context
5319 *
5320 * Try to find a comment end tag in the input stream
5321 * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5322 * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5323 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5324 * to avoid rescanning sequences of bytes, it DOES change the state of the
5325 * parser, do not use liberally.
5326 * This wraps to htmlParseLookupSequence()
5327 *
5328 * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5329 */
5330 static int
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)5331 htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5332 {
5333 int mark = 0;
5334 int offset;
5335
5336 while (1) {
5337 mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5338 if (mark < 0)
5339 break;
5340 if ((NXT(mark+2) == '>') ||
5341 ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5342 ctxt->checkIndex = 0;
5343 break;
5344 }
5345 offset = (NXT(mark+2) == '!') ? 3 : 2;
5346 if (mark + offset >= ctxt->input->end - ctxt->input->cur) {
5347 ctxt->checkIndex = mark;
5348 return(-1);
5349 }
5350 ctxt->checkIndex = mark + 1;
5351 }
5352 return mark;
5353 }
5354
5355
5356 /**
5357 * htmlParseTryOrFinish:
5358 * @ctxt: an HTML parser context
5359 * @terminate: last chunk indicator
5360 *
5361 * Try to progress on parsing
5362 *
5363 * Returns zero if no parsing was possible
5364 */
5365 static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt,int terminate)5366 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5367 int ret = 0;
5368 htmlParserInputPtr in;
5369 ptrdiff_t avail = 0;
5370 xmlChar cur, next;
5371
5372 htmlParserNodeInfo node_info;
5373
5374 while (1) {
5375
5376 in = ctxt->input;
5377 if (in == NULL) break;
5378 avail = in->end - in->cur;
5379 if ((avail == 0) && (terminate)) {
5380 htmlAutoCloseOnEnd(ctxt);
5381 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5382 /*
5383 * SAX: end of the document processing.
5384 */
5385 ctxt->instate = XML_PARSER_EOF;
5386 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5387 ctxt->sax->endDocument(ctxt->userData);
5388 }
5389 }
5390 if (avail < 1)
5391 goto done;
5392 /*
5393 * This is done to make progress and avoid an infinite loop
5394 * if a parsing attempt was aborted by hitting a NUL byte. After
5395 * changing htmlCurrentChar, this probably isn't necessary anymore.
5396 * We should consider removing this check.
5397 */
5398 cur = in->cur[0];
5399 if (cur == 0) {
5400 SKIP(1);
5401 continue;
5402 }
5403
5404 switch (ctxt->instate) {
5405 case XML_PARSER_EOF:
5406 /*
5407 * Document parsing is done !
5408 */
5409 goto done;
5410 case XML_PARSER_START:
5411 /*
5412 * Very first chars read from the document flow.
5413 */
5414 cur = in->cur[0];
5415 if (IS_BLANK_CH(cur)) {
5416 SKIP_BLANKS;
5417 avail = in->end - in->cur;
5418 }
5419 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5420 ctxt->sax->setDocumentLocator(ctxt->userData,
5421 &xmlDefaultSAXLocator);
5422 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5423 (!ctxt->disableSAX))
5424 ctxt->sax->startDocument(ctxt->userData);
5425 if (ctxt->instate == XML_PARSER_EOF)
5426 goto done;
5427
5428 cur = in->cur[0];
5429 next = in->cur[1];
5430 if ((cur == '<') && (next == '!') &&
5431 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5432 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5433 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5434 (UPP(8) == 'E')) {
5435 if ((!terminate) &&
5436 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5437 goto done;
5438 htmlParseDocTypeDecl(ctxt);
5439 if (ctxt->instate == XML_PARSER_EOF)
5440 goto done;
5441 ctxt->instate = XML_PARSER_PROLOG;
5442 } else {
5443 ctxt->instate = XML_PARSER_MISC;
5444 }
5445 break;
5446 case XML_PARSER_MISC:
5447 SKIP_BLANKS;
5448 avail = in->end - in->cur;
5449 /*
5450 * no chars in buffer
5451 */
5452 if (avail < 1)
5453 goto done;
5454 /*
5455 * not enough chars in buffer
5456 */
5457 if (avail < 2) {
5458 if (!terminate)
5459 goto done;
5460 else
5461 next = ' ';
5462 } else {
5463 next = in->cur[1];
5464 }
5465 cur = in->cur[0];
5466 if ((cur == '<') && (next == '!') &&
5467 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5468 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5469 goto done;
5470 htmlParseComment(ctxt);
5471 if (ctxt->instate == XML_PARSER_EOF)
5472 goto done;
5473 ctxt->instate = XML_PARSER_MISC;
5474 } else if ((cur == '<') && (next == '?')) {
5475 if ((!terminate) &&
5476 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5477 goto done;
5478 htmlParsePI(ctxt);
5479 if (ctxt->instate == XML_PARSER_EOF)
5480 goto done;
5481 ctxt->instate = XML_PARSER_MISC;
5482 } else if ((cur == '<') && (next == '!') &&
5483 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5484 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5485 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5486 (UPP(8) == 'E')) {
5487 if ((!terminate) &&
5488 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5489 goto done;
5490 htmlParseDocTypeDecl(ctxt);
5491 if (ctxt->instate == XML_PARSER_EOF)
5492 goto done;
5493 ctxt->instate = XML_PARSER_PROLOG;
5494 } else if ((cur == '<') && (next == '!') &&
5495 (avail < 9)) {
5496 goto done;
5497 } else {
5498 ctxt->instate = XML_PARSER_CONTENT;
5499 }
5500 break;
5501 case XML_PARSER_PROLOG:
5502 SKIP_BLANKS;
5503 avail = in->end - in->cur;
5504 if (avail < 2)
5505 goto done;
5506 cur = in->cur[0];
5507 next = in->cur[1];
5508 if ((cur == '<') && (next == '!') &&
5509 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5510 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5511 goto done;
5512 htmlParseComment(ctxt);
5513 if (ctxt->instate == XML_PARSER_EOF)
5514 goto done;
5515 ctxt->instate = XML_PARSER_PROLOG;
5516 } else if ((cur == '<') && (next == '?')) {
5517 if ((!terminate) &&
5518 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5519 goto done;
5520 htmlParsePI(ctxt);
5521 if (ctxt->instate == XML_PARSER_EOF)
5522 goto done;
5523 ctxt->instate = XML_PARSER_PROLOG;
5524 } else if ((cur == '<') && (next == '!') &&
5525 (avail < 4)) {
5526 goto done;
5527 } else {
5528 ctxt->instate = XML_PARSER_CONTENT;
5529 }
5530 break;
5531 case XML_PARSER_EPILOG:
5532 avail = in->end - in->cur;
5533 if (avail < 1)
5534 goto done;
5535 cur = in->cur[0];
5536 if (IS_BLANK_CH(cur)) {
5537 htmlParseCharData(ctxt);
5538 goto done;
5539 }
5540 if (avail < 2)
5541 goto done;
5542 next = in->cur[1];
5543 if ((cur == '<') && (next == '!') &&
5544 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5545 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5546 goto done;
5547 htmlParseComment(ctxt);
5548 if (ctxt->instate == XML_PARSER_EOF)
5549 goto done;
5550 ctxt->instate = XML_PARSER_EPILOG;
5551 } else if ((cur == '<') && (next == '?')) {
5552 if ((!terminate) &&
5553 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5554 goto done;
5555 htmlParsePI(ctxt);
5556 if (ctxt->instate == XML_PARSER_EOF)
5557 goto done;
5558 ctxt->instate = XML_PARSER_EPILOG;
5559 } else if ((cur == '<') && (next == '!') &&
5560 (avail < 4)) {
5561 goto done;
5562 } else {
5563 ctxt->errNo = XML_ERR_DOCUMENT_END;
5564 ctxt->wellFormed = 0;
5565 ctxt->instate = XML_PARSER_EOF;
5566 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5567 ctxt->sax->endDocument(ctxt->userData);
5568 goto done;
5569 }
5570 break;
5571 case XML_PARSER_START_TAG: {
5572 const xmlChar *name;
5573 int failed;
5574 const htmlElemDesc * info;
5575
5576 /*
5577 * no chars in buffer
5578 */
5579 if (avail < 1)
5580 goto done;
5581 /*
5582 * not enough chars in buffer
5583 */
5584 if (avail < 2) {
5585 if (!terminate)
5586 goto done;
5587 else
5588 next = ' ';
5589 } else {
5590 next = in->cur[1];
5591 }
5592 cur = in->cur[0];
5593 if (cur != '<') {
5594 ctxt->instate = XML_PARSER_CONTENT;
5595 break;
5596 }
5597 if (next == '/') {
5598 ctxt->instate = XML_PARSER_END_TAG;
5599 ctxt->checkIndex = 0;
5600 break;
5601 }
5602 if ((!terminate) &&
5603 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5604 goto done;
5605
5606 /* Capture start position */
5607 if (ctxt->record_info) {
5608 node_info.begin_pos = ctxt->input->consumed +
5609 (CUR_PTR - ctxt->input->base);
5610 node_info.begin_line = ctxt->input->line;
5611 }
5612
5613
5614 failed = htmlParseStartTag(ctxt);
5615 name = ctxt->name;
5616 if ((failed == -1) ||
5617 (name == NULL)) {
5618 if (CUR == '>')
5619 NEXT;
5620 break;
5621 }
5622
5623 /*
5624 * Lookup the info for that element.
5625 */
5626 info = htmlTagLookup(name);
5627 if (info == NULL) {
5628 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5629 "Tag %s invalid\n", name, NULL);
5630 }
5631
5632 /*
5633 * Check for an Empty Element labeled the XML/SGML way
5634 */
5635 if ((CUR == '/') && (NXT(1) == '>')) {
5636 SKIP(2);
5637 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5638 ctxt->sax->endElement(ctxt->userData, name);
5639 htmlnamePop(ctxt);
5640 if (ctxt->instate == XML_PARSER_EOF)
5641 goto done;
5642 ctxt->instate = XML_PARSER_CONTENT;
5643 break;
5644 }
5645
5646 if (CUR == '>') {
5647 NEXT;
5648 } else {
5649 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5650 "Couldn't find end of Start Tag %s\n",
5651 name, NULL);
5652
5653 /*
5654 * end of parsing of this node.
5655 */
5656 if (xmlStrEqual(name, ctxt->name)) {
5657 nodePop(ctxt);
5658 htmlnamePop(ctxt);
5659 }
5660
5661 if (ctxt->record_info)
5662 htmlNodeInfoPush(ctxt, &node_info);
5663
5664 if (ctxt->instate == XML_PARSER_EOF)
5665 goto done;
5666 ctxt->instate = XML_PARSER_CONTENT;
5667 break;
5668 }
5669
5670 /*
5671 * Check for an Empty Element from DTD definition
5672 */
5673 if ((info != NULL) && (info->empty)) {
5674 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5675 ctxt->sax->endElement(ctxt->userData, name);
5676 htmlnamePop(ctxt);
5677 }
5678
5679 if (ctxt->record_info)
5680 htmlNodeInfoPush(ctxt, &node_info);
5681
5682 if (ctxt->instate == XML_PARSER_EOF)
5683 goto done;
5684 ctxt->instate = XML_PARSER_CONTENT;
5685 break;
5686 }
5687 case XML_PARSER_CONTENT: {
5688 xmlChar chr[2] = { 0, 0 };
5689
5690 /*
5691 * Handle preparsed entities and charRef
5692 */
5693 if (ctxt->token != 0) {
5694 chr[0] = ctxt->token;
5695 htmlCheckParagraph(ctxt);
5696 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5697 ctxt->sax->characters(ctxt->userData, chr, 1);
5698 ctxt->token = 0;
5699 ctxt->checkIndex = 0;
5700 }
5701 if ((avail == 1) && (terminate)) {
5702 cur = in->cur[0];
5703 if ((cur != '<') && (cur != '&')) {
5704 if (ctxt->sax != NULL) {
5705 chr[0] = cur;
5706 if (IS_BLANK_CH(cur)) {
5707 if (ctxt->keepBlanks) {
5708 if (ctxt->sax->characters != NULL)
5709 ctxt->sax->characters(
5710 ctxt->userData, chr, 1);
5711 } else {
5712 if (ctxt->sax->ignorableWhitespace != NULL)
5713 ctxt->sax->ignorableWhitespace(
5714 ctxt->userData, chr, 1);
5715 }
5716 } else {
5717 htmlCheckParagraph(ctxt);
5718 if (ctxt->sax->characters != NULL)
5719 ctxt->sax->characters(
5720 ctxt->userData, chr, 1);
5721 }
5722 }
5723 ctxt->token = 0;
5724 ctxt->checkIndex = 0;
5725 in->cur++;
5726 break;
5727 }
5728 }
5729 if (avail < 2)
5730 goto done;
5731 cur = in->cur[0];
5732 next = in->cur[1];
5733 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5734 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5735 /*
5736 * Handle SCRIPT/STYLE separately
5737 */
5738 if (!terminate) {
5739 int idx;
5740 xmlChar val;
5741
5742 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5743 if (idx < 0)
5744 goto done;
5745 val = in->cur[idx + 2];
5746 if (val == 0) { /* bad cut of input */
5747 /*
5748 * FIXME: htmlParseScript checks for additional
5749 * characters after '</'.
5750 */
5751 ctxt->checkIndex = idx;
5752 goto done;
5753 }
5754 }
5755 htmlParseScript(ctxt);
5756 if (ctxt->instate == XML_PARSER_EOF)
5757 goto done;
5758 if ((cur == '<') && (next == '/')) {
5759 ctxt->instate = XML_PARSER_END_TAG;
5760 ctxt->checkIndex = 0;
5761 break;
5762 }
5763 } else if ((cur == '<') && (next == '!')) {
5764 if (avail < 4)
5765 goto done;
5766 /*
5767 * Sometimes DOCTYPE arrives in the middle of the document
5768 */
5769 if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
5770 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5771 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5772 (UPP(8) == 'E')) {
5773 if ((!terminate) &&
5774 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5775 goto done;
5776 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5777 "Misplaced DOCTYPE declaration\n",
5778 BAD_CAST "DOCTYPE" , NULL);
5779 htmlParseDocTypeDecl(ctxt);
5780 } else if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
5781 if ((!terminate) &&
5782 (htmlParseLookupCommentEnd(ctxt) < 0))
5783 goto done;
5784 htmlParseComment(ctxt);
5785 if (ctxt->instate == XML_PARSER_EOF)
5786 goto done;
5787 ctxt->instate = XML_PARSER_CONTENT;
5788 } else {
5789 if ((!terminate) &&
5790 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5791 goto done;
5792 htmlSkipBogusComment(ctxt);
5793 }
5794 } else if ((cur == '<') && (next == '?')) {
5795 if ((!terminate) &&
5796 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5797 goto done;
5798 htmlParsePI(ctxt);
5799 if (ctxt->instate == XML_PARSER_EOF)
5800 goto done;
5801 ctxt->instate = XML_PARSER_CONTENT;
5802 } else if ((cur == '<') && (next == '/')) {
5803 ctxt->instate = XML_PARSER_END_TAG;
5804 ctxt->checkIndex = 0;
5805 break;
5806 } else if ((cur == '<') && IS_ASCII_LETTER(next)) {
5807 if ((!terminate) && (next == 0))
5808 goto done;
5809 ctxt->instate = XML_PARSER_START_TAG;
5810 ctxt->checkIndex = 0;
5811 break;
5812 } else if (cur == '<') {
5813 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
5814 (ctxt->sax->characters != NULL))
5815 ctxt->sax->characters(ctxt->userData,
5816 BAD_CAST "<", 1);
5817 NEXT;
5818 } else {
5819 /*
5820 * check that the text sequence is complete
5821 * before handing out the data to the parser
5822 * to avoid problems with erroneous end of
5823 * data detection.
5824 */
5825 if ((!terminate) &&
5826 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5827 goto done;
5828 ctxt->checkIndex = 0;
5829 while ((ctxt->instate != XML_PARSER_EOF) &&
5830 (cur != '<') && (in->cur < in->end)) {
5831 if (cur == '&') {
5832 htmlParseReference(ctxt);
5833 } else {
5834 htmlParseCharData(ctxt);
5835 }
5836 cur = in->cur[0];
5837 }
5838 }
5839
5840 break;
5841 }
5842 case XML_PARSER_END_TAG:
5843 if (avail < 2)
5844 goto done;
5845 if ((!terminate) &&
5846 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5847 goto done;
5848 htmlParseEndTag(ctxt);
5849 if (ctxt->instate == XML_PARSER_EOF)
5850 goto done;
5851 if (ctxt->nameNr == 0) {
5852 ctxt->instate = XML_PARSER_EPILOG;
5853 } else {
5854 ctxt->instate = XML_PARSER_CONTENT;
5855 }
5856 ctxt->checkIndex = 0;
5857 break;
5858 default:
5859 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5860 "HPP: internal error\n", NULL, NULL);
5861 ctxt->instate = XML_PARSER_EOF;
5862 break;
5863 }
5864 }
5865 done:
5866 if ((avail == 0) && (terminate)) {
5867 htmlAutoCloseOnEnd(ctxt);
5868 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5869 /*
5870 * SAX: end of the document processing.
5871 */
5872 ctxt->instate = XML_PARSER_EOF;
5873 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5874 ctxt->sax->endDocument(ctxt->userData);
5875 }
5876 }
5877 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
5878 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5879 (ctxt->instate == XML_PARSER_EPILOG))) {
5880 xmlDtdPtr dtd;
5881 dtd = xmlGetIntSubset(ctxt->myDoc);
5882 if (dtd == NULL)
5883 ctxt->myDoc->intSubset =
5884 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5885 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5886 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5887 }
5888 return(ret);
5889 }
5890
5891 /**
5892 * htmlParseChunk:
5893 * @ctxt: an HTML parser context
5894 * @chunk: an char array
5895 * @size: the size in byte of the chunk
5896 * @terminate: last chunk indicator
5897 *
5898 * Parse a Chunk of memory
5899 *
5900 * Returns zero if no error, the xmlParserErrors otherwise.
5901 */
5902 int
htmlParseChunk(htmlParserCtxtPtr ctxt,const char * chunk,int size,int terminate)5903 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5904 int terminate) {
5905 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5906 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5907 "htmlParseChunk: context error\n", NULL, NULL);
5908 return(XML_ERR_INTERNAL_ERROR);
5909 }
5910 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5911 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5912 size_t pos = ctxt->input->cur - ctxt->input->base;
5913 int res;
5914
5915 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5916 xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
5917 if (res < 0) {
5918 htmlParseErr(ctxt, ctxt->input->buf->error,
5919 "xmlParserInputBufferPush failed", NULL, NULL);
5920 xmlHaltParser(ctxt);
5921 return (ctxt->errNo);
5922 }
5923 }
5924 htmlParseTryOrFinish(ctxt, terminate);
5925 if (terminate) {
5926 if ((ctxt->instate != XML_PARSER_EOF) &&
5927 (ctxt->instate != XML_PARSER_EPILOG) &&
5928 (ctxt->instate != XML_PARSER_MISC)) {
5929 ctxt->errNo = XML_ERR_DOCUMENT_END;
5930 ctxt->wellFormed = 0;
5931 }
5932 if (ctxt->instate != XML_PARSER_EOF) {
5933 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5934 ctxt->sax->endDocument(ctxt->userData);
5935 }
5936 ctxt->instate = XML_PARSER_EOF;
5937 }
5938 return((xmlParserErrors) ctxt->errNo);
5939 }
5940
5941 /************************************************************************
5942 * *
5943 * User entry points *
5944 * *
5945 ************************************************************************/
5946
5947 /**
5948 * htmlCreatePushParserCtxt:
5949 * @sax: a SAX handler
5950 * @user_data: The user data returned on SAX callbacks
5951 * @chunk: a pointer to an array of chars
5952 * @size: number of chars in the array
5953 * @filename: an optional file name or URI
5954 * @enc: an optional encoding
5955 *
5956 * Create a parser context for using the HTML parser in push mode
5957 * The value of @filename is used for fetching external entities
5958 * and error/warning reports.
5959 *
5960 * Returns the new parser context or NULL
5961 */
5962 htmlParserCtxtPtr
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,void * user_data,const char * chunk,int size,const char * filename,xmlCharEncoding enc)5963 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5964 const char *chunk, int size, const char *filename,
5965 xmlCharEncoding enc) {
5966 htmlParserCtxtPtr ctxt;
5967 htmlParserInputPtr inputStream;
5968 xmlParserInputBufferPtr buf;
5969
5970 xmlInitParser();
5971
5972 buf = xmlAllocParserInputBuffer(enc);
5973 if (buf == NULL) return(NULL);
5974
5975 ctxt = htmlNewSAXParserCtxt(sax, user_data);
5976 if (ctxt == NULL) {
5977 xmlFreeParserInputBuffer(buf);
5978 return(NULL);
5979 }
5980 if (filename == NULL) {
5981 ctxt->directory = NULL;
5982 } else {
5983 ctxt->directory = xmlParserGetDirectory(filename);
5984 }
5985
5986 inputStream = htmlNewInputStream(ctxt);
5987 if (inputStream == NULL) {
5988 xmlFreeParserCtxt(ctxt);
5989 xmlFreeParserInputBuffer(buf);
5990 return(NULL);
5991 }
5992
5993 if (filename == NULL)
5994 inputStream->filename = NULL;
5995 else
5996 inputStream->filename = (char *)
5997 xmlCanonicPath((const xmlChar *) filename);
5998 inputStream->buf = buf;
5999 xmlBufResetInput(buf->buffer, inputStream);
6000
6001 inputPush(ctxt, inputStream);
6002
6003 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6004 (ctxt->input->buf != NULL)) {
6005 size_t pos = ctxt->input->cur - ctxt->input->base;
6006 int res;
6007
6008 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6009 xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
6010 if (res < 0) {
6011 htmlParseErr(ctxt, ctxt->input->buf->error,
6012 "xmlParserInputBufferPush failed\n", NULL, NULL);
6013 xmlHaltParser(ctxt);
6014 }
6015 }
6016 ctxt->progressive = 1;
6017
6018 return(ctxt);
6019 }
6020 #endif /* LIBXML_PUSH_ENABLED */
6021
6022 /**
6023 * htmlSAXParseDoc:
6024 * @cur: a pointer to an array of xmlChar
6025 * @encoding: a free form C string describing the HTML document encoding, or NULL
6026 * @sax: the SAX handler block
6027 * @userData: if using SAX, this pointer will be provided on callbacks.
6028 *
6029 * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadDoc.
6030 *
6031 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6032 * to handle parse events. If sax is NULL, fallback to the default DOM
6033 * behavior and return a tree.
6034 *
6035 * Returns the resulting document tree unless SAX is NULL or the document is
6036 * not well formed.
6037 */
6038
6039 htmlDocPtr
htmlSAXParseDoc(const xmlChar * cur,const char * encoding,htmlSAXHandlerPtr sax,void * userData)6040 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6041 htmlSAXHandlerPtr sax, void *userData) {
6042 htmlDocPtr ret;
6043 htmlParserCtxtPtr ctxt;
6044
6045 xmlInitParser();
6046
6047 if (cur == NULL) return(NULL);
6048
6049
6050 ctxt = htmlCreateDocParserCtxt(cur, encoding);
6051 if (ctxt == NULL) return(NULL);
6052 if (sax != NULL) {
6053 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6054 ctxt->sax = sax;
6055 ctxt->userData = userData;
6056 }
6057
6058 htmlParseDocument(ctxt);
6059 ret = ctxt->myDoc;
6060 if (sax != NULL) {
6061 ctxt->sax = NULL;
6062 ctxt->userData = NULL;
6063 }
6064 htmlFreeParserCtxt(ctxt);
6065
6066 return(ret);
6067 }
6068
6069 /**
6070 * htmlParseDoc:
6071 * @cur: a pointer to an array of xmlChar
6072 * @encoding: a free form C string describing the HTML document encoding, or NULL
6073 *
6074 * parse an HTML in-memory document and build a tree.
6075 *
6076 * Returns the resulting document tree
6077 */
6078
6079 htmlDocPtr
htmlParseDoc(const xmlChar * cur,const char * encoding)6080 htmlParseDoc(const xmlChar *cur, const char *encoding) {
6081 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6082 }
6083
6084
6085 /**
6086 * htmlCreateFileParserCtxt:
6087 * @filename: the filename
6088 * @encoding: a free form C string describing the HTML document encoding, or NULL
6089 *
6090 * Create a parser context for a file content.
6091 * Automatic support for ZLIB/Compress compressed document is provided
6092 * by default if found at compile-time.
6093 *
6094 * Returns the new parser context or NULL
6095 */
6096 htmlParserCtxtPtr
htmlCreateFileParserCtxt(const char * filename,const char * encoding)6097 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6098 {
6099 htmlParserCtxtPtr ctxt;
6100 htmlParserInputPtr inputStream;
6101 char *canonicFilename;
6102
6103 if (filename == NULL)
6104 return(NULL);
6105
6106 ctxt = htmlNewParserCtxt();
6107 if (ctxt == NULL) {
6108 return(NULL);
6109 }
6110 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6111 if (canonicFilename == NULL) {
6112 xmlFreeParserCtxt(ctxt);
6113 return(NULL);
6114 }
6115
6116 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6117 xmlFree(canonicFilename);
6118 if (inputStream == NULL) {
6119 xmlFreeParserCtxt(ctxt);
6120 return(NULL);
6121 }
6122
6123 inputPush(ctxt, inputStream);
6124
6125 /* set encoding */
6126 if (encoding) {
6127 xmlCharEncodingHandlerPtr hdlr;
6128
6129 hdlr = xmlFindCharEncodingHandler(encoding);
6130 if (hdlr != NULL) {
6131 xmlSwitchToEncoding(ctxt, hdlr);
6132 }
6133 }
6134
6135 return(ctxt);
6136 }
6137
6138 /**
6139 * htmlSAXParseFile:
6140 * @filename: the filename
6141 * @encoding: a free form C string describing the HTML document encoding, or NULL
6142 * @sax: the SAX handler block
6143 * @userData: if using SAX, this pointer will be provided on callbacks.
6144 *
6145 * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadFile.
6146 *
6147 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6148 * compressed document is provided by default if found at compile-time.
6149 * It use the given SAX function block to handle the parsing callback.
6150 * If sax is NULL, fallback to the default DOM tree building routines.
6151 *
6152 * Returns the resulting document tree unless SAX is NULL or the document is
6153 * not well formed.
6154 */
6155
6156 htmlDocPtr
htmlSAXParseFile(const char * filename,const char * encoding,htmlSAXHandlerPtr sax,void * userData)6157 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6158 void *userData) {
6159 htmlDocPtr ret;
6160 htmlParserCtxtPtr ctxt;
6161 htmlSAXHandlerPtr oldsax = NULL;
6162
6163 xmlInitParser();
6164
6165 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6166 if (ctxt == NULL) return(NULL);
6167 if (sax != NULL) {
6168 oldsax = ctxt->sax;
6169 ctxt->sax = sax;
6170 ctxt->userData = userData;
6171 }
6172
6173 htmlParseDocument(ctxt);
6174
6175 ret = ctxt->myDoc;
6176 if (sax != NULL) {
6177 ctxt->sax = oldsax;
6178 ctxt->userData = NULL;
6179 }
6180 htmlFreeParserCtxt(ctxt);
6181
6182 return(ret);
6183 }
6184
6185 /**
6186 * htmlParseFile:
6187 * @filename: the filename
6188 * @encoding: a free form C string describing the HTML document encoding, or NULL
6189 *
6190 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6191 * compressed document is provided by default if found at compile-time.
6192 *
6193 * Returns the resulting document tree
6194 */
6195
6196 htmlDocPtr
htmlParseFile(const char * filename,const char * encoding)6197 htmlParseFile(const char *filename, const char *encoding) {
6198 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6199 }
6200
6201 /**
6202 * htmlHandleOmittedElem:
6203 * @val: int 0 or 1
6204 *
6205 * Set and return the previous value for handling HTML omitted tags.
6206 *
6207 * Returns the last value for 0 for no handling, 1 for auto insertion.
6208 */
6209
6210 int
htmlHandleOmittedElem(int val)6211 htmlHandleOmittedElem(int val) {
6212 int old = htmlOmittedDefaultValue;
6213
6214 htmlOmittedDefaultValue = val;
6215 return(old);
6216 }
6217
6218 /**
6219 * htmlElementAllowedHere:
6220 * @parent: HTML parent element
6221 * @elt: HTML element
6222 *
6223 * Checks whether an HTML element may be a direct child of a parent element.
6224 * Note - doesn't check for deprecated elements
6225 *
6226 * Returns 1 if allowed; 0 otherwise.
6227 */
6228 int
htmlElementAllowedHere(const htmlElemDesc * parent,const xmlChar * elt)6229 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6230 const char** p ;
6231
6232 if ( ! elt || ! parent || ! parent->subelts )
6233 return 0 ;
6234
6235 for ( p = parent->subelts; *p; ++p )
6236 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6237 return 1 ;
6238
6239 return 0 ;
6240 }
6241 /**
6242 * htmlElementStatusHere:
6243 * @parent: HTML parent element
6244 * @elt: HTML element
6245 *
6246 * Checks whether an HTML element may be a direct child of a parent element.
6247 * and if so whether it is valid or deprecated.
6248 *
6249 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6250 */
6251 htmlStatus
htmlElementStatusHere(const htmlElemDesc * parent,const htmlElemDesc * elt)6252 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6253 if ( ! parent || ! elt )
6254 return HTML_INVALID ;
6255 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6256 return HTML_INVALID ;
6257
6258 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6259 }
6260 /**
6261 * htmlAttrAllowed:
6262 * @elt: HTML element
6263 * @attr: HTML attribute
6264 * @legacy: whether to allow deprecated attributes
6265 *
6266 * Checks whether an attribute is valid for an element
6267 * Has full knowledge of Required and Deprecated attributes
6268 *
6269 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6270 */
6271 htmlStatus
htmlAttrAllowed(const htmlElemDesc * elt,const xmlChar * attr,int legacy)6272 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6273 const char** p ;
6274
6275 if ( !elt || ! attr )
6276 return HTML_INVALID ;
6277
6278 if ( elt->attrs_req )
6279 for ( p = elt->attrs_req; *p; ++p)
6280 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6281 return HTML_REQUIRED ;
6282
6283 if ( elt->attrs_opt )
6284 for ( p = elt->attrs_opt; *p; ++p)
6285 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6286 return HTML_VALID ;
6287
6288 if ( legacy && elt->attrs_depr )
6289 for ( p = elt->attrs_depr; *p; ++p)
6290 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6291 return HTML_DEPRECATED ;
6292
6293 return HTML_INVALID ;
6294 }
6295 /**
6296 * htmlNodeStatus:
6297 * @node: an htmlNodePtr in a tree
6298 * @legacy: whether to allow deprecated elements (YES is faster here
6299 * for Element nodes)
6300 *
6301 * Checks whether the tree node is valid. Experimental (the author
6302 * only uses the HTML enhancements in a SAX parser)
6303 *
6304 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6305 * legacy allowed) or htmlElementStatusHere (otherwise).
6306 * for Attribute nodes, a return from htmlAttrAllowed
6307 * for other nodes, HTML_NA (no checks performed)
6308 */
6309 htmlStatus
htmlNodeStatus(const htmlNodePtr node,int legacy)6310 htmlNodeStatus(const htmlNodePtr node, int legacy) {
6311 if ( ! node )
6312 return HTML_INVALID ;
6313
6314 switch ( node->type ) {
6315 case XML_ELEMENT_NODE:
6316 return legacy
6317 ? ( htmlElementAllowedHere (
6318 htmlTagLookup(node->parent->name) , node->name
6319 ) ? HTML_VALID : HTML_INVALID )
6320 : htmlElementStatusHere(
6321 htmlTagLookup(node->parent->name) ,
6322 htmlTagLookup(node->name) )
6323 ;
6324 case XML_ATTRIBUTE_NODE:
6325 return htmlAttrAllowed(
6326 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6327 default: return HTML_NA ;
6328 }
6329 }
6330 /************************************************************************
6331 * *
6332 * New set (2.6.0) of simpler and more flexible APIs *
6333 * *
6334 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is expanded. @str is evaluated more than once, so it must not have
 * side effects.
 *
 * The expansion is a single statement without a trailing semicolon:
 * write DICT_FREE(x); at the call site. This keeps the macro safe to use
 * as the body of an unbraced if/else.
 */
#define DICT_FREE(str)                                                  \
    do {                                                                \
        if ((str) && ((!dict) ||                                        \
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))          \
            xmlFree((char *)(str));                                     \
    } while (0)
6346
6347 /**
6348 * htmlCtxtReset:
6349 * @ctxt: an HTML parser context
6350 *
6351 * Reset a parser context
6352 */
6353 void
htmlCtxtReset(htmlParserCtxtPtr ctxt)6354 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6355 {
6356 xmlParserInputPtr input;
6357 xmlDictPtr dict;
6358
6359 if (ctxt == NULL)
6360 return;
6361
6362 xmlInitParser();
6363 dict = ctxt->dict;
6364
6365 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6366 xmlFreeInputStream(input);
6367 }
6368 ctxt->inputNr = 0;
6369 ctxt->input = NULL;
6370
6371 ctxt->spaceNr = 0;
6372 if (ctxt->spaceTab != NULL) {
6373 ctxt->spaceTab[0] = -1;
6374 ctxt->space = &ctxt->spaceTab[0];
6375 } else {
6376 ctxt->space = NULL;
6377 }
6378
6379
6380 ctxt->nodeNr = 0;
6381 ctxt->node = NULL;
6382
6383 ctxt->nameNr = 0;
6384 ctxt->name = NULL;
6385
6386 ctxt->nsNr = 0;
6387
6388 DICT_FREE(ctxt->version);
6389 ctxt->version = NULL;
6390 DICT_FREE(ctxt->encoding);
6391 ctxt->encoding = NULL;
6392 DICT_FREE(ctxt->directory);
6393 ctxt->directory = NULL;
6394 DICT_FREE(ctxt->extSubURI);
6395 ctxt->extSubURI = NULL;
6396 DICT_FREE(ctxt->extSubSystem);
6397 ctxt->extSubSystem = NULL;
6398 if (ctxt->myDoc != NULL)
6399 xmlFreeDoc(ctxt->myDoc);
6400 ctxt->myDoc = NULL;
6401
6402 ctxt->standalone = -1;
6403 ctxt->hasExternalSubset = 0;
6404 ctxt->hasPErefs = 0;
6405 ctxt->html = 1;
6406 ctxt->external = 0;
6407 ctxt->instate = XML_PARSER_START;
6408 ctxt->token = 0;
6409
6410 ctxt->wellFormed = 1;
6411 ctxt->nsWellFormed = 1;
6412 ctxt->disableSAX = 0;
6413 ctxt->valid = 1;
6414 ctxt->vctxt.userData = ctxt;
6415 ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
6416 ctxt->vctxt.error = xmlParserValidityError;
6417 ctxt->vctxt.warning = xmlParserValidityWarning;
6418 ctxt->record_info = 0;
6419 ctxt->checkIndex = 0;
6420 ctxt->endCheckState = 0;
6421 ctxt->inSubset = 0;
6422 ctxt->errNo = XML_ERR_OK;
6423 ctxt->depth = 0;
6424 ctxt->catalogs = NULL;
6425 xmlInitNodeInfoSeq(&ctxt->node_seq);
6426
6427 if (ctxt->attsDefault != NULL) {
6428 xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6429 ctxt->attsDefault = NULL;
6430 }
6431 if (ctxt->attsSpecial != NULL) {
6432 xmlHashFree(ctxt->attsSpecial, NULL);
6433 ctxt->attsSpecial = NULL;
6434 }
6435
6436 ctxt->nbErrors = 0;
6437 ctxt->nbWarnings = 0;
6438 if (ctxt->lastError.code != XML_ERR_OK)
6439 xmlResetError(&ctxt->lastError);
6440 }
6441
6442 /**
6443 * htmlCtxtUseOptions:
6444 * @ctxt: an HTML parser context
6445 * @options: a combination of htmlParserOption(s)
6446 *
6447 * Applies the options to the parser context
6448 *
6449 * Returns 0 in case of success, the set of unknown or unimplemented options
6450 * in case of error.
6451 */
6452 int
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt,int options)6453 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6454 {
6455 if (ctxt == NULL)
6456 return(-1);
6457
6458 if (options & HTML_PARSE_NOWARNING) {
6459 ctxt->sax->warning = NULL;
6460 ctxt->vctxt.warning = NULL;
6461 options -= XML_PARSE_NOWARNING;
6462 ctxt->options |= XML_PARSE_NOWARNING;
6463 }
6464 if (options & HTML_PARSE_NOERROR) {
6465 ctxt->sax->error = NULL;
6466 ctxt->vctxt.error = NULL;
6467 ctxt->sax->fatalError = NULL;
6468 options -= XML_PARSE_NOERROR;
6469 ctxt->options |= XML_PARSE_NOERROR;
6470 }
6471 if (options & HTML_PARSE_PEDANTIC) {
6472 ctxt->pedantic = 1;
6473 options -= XML_PARSE_PEDANTIC;
6474 ctxt->options |= XML_PARSE_PEDANTIC;
6475 } else
6476 ctxt->pedantic = 0;
6477 if (options & XML_PARSE_NOBLANKS) {
6478 ctxt->keepBlanks = 0;
6479 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6480 options -= XML_PARSE_NOBLANKS;
6481 ctxt->options |= XML_PARSE_NOBLANKS;
6482 } else
6483 ctxt->keepBlanks = 1;
6484 if (options & HTML_PARSE_RECOVER) {
6485 ctxt->recovery = 1;
6486 options -= HTML_PARSE_RECOVER;
6487 } else
6488 ctxt->recovery = 0;
6489 if (options & HTML_PARSE_COMPACT) {
6490 ctxt->options |= HTML_PARSE_COMPACT;
6491 options -= HTML_PARSE_COMPACT;
6492 }
6493 if (options & XML_PARSE_HUGE) {
6494 ctxt->options |= XML_PARSE_HUGE;
6495 options -= XML_PARSE_HUGE;
6496 }
6497 if (options & HTML_PARSE_NODEFDTD) {
6498 ctxt->options |= HTML_PARSE_NODEFDTD;
6499 options -= HTML_PARSE_NODEFDTD;
6500 }
6501 if (options & HTML_PARSE_IGNORE_ENC) {
6502 ctxt->options |= HTML_PARSE_IGNORE_ENC;
6503 options -= HTML_PARSE_IGNORE_ENC;
6504 }
6505 if (options & HTML_PARSE_NOIMPLIED) {
6506 ctxt->options |= HTML_PARSE_NOIMPLIED;
6507 options -= HTML_PARSE_NOIMPLIED;
6508 }
6509 ctxt->dictNames = 0;
6510 ctxt->linenumbers = 1;
6511 return (options);
6512 }
6513
6514 /**
6515 * htmlDoRead:
6516 * @ctxt: an HTML parser context
6517 * @URL: the base URL to use for the document
6518 * @encoding: the document encoding, or NULL
6519 * @options: a combination of htmlParserOption(s)
6520 * @reuse: keep the context for reuse
6521 *
6522 * Common front-end for the htmlRead functions
6523 *
6524 * Returns the resulting document tree or NULL
6525 */
6526 static htmlDocPtr
htmlDoRead(htmlParserCtxtPtr ctxt,const char * URL,const char * encoding,int options,int reuse)6527 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6528 int options, int reuse)
6529 {
6530 htmlDocPtr ret;
6531
6532 htmlCtxtUseOptions(ctxt, options);
6533 ctxt->html = 1;
6534 if (encoding != NULL) {
6535 xmlCharEncodingHandlerPtr hdlr;
6536
6537 hdlr = xmlFindCharEncodingHandler(encoding);
6538 if (hdlr != NULL) {
6539 xmlSwitchToEncoding(ctxt, hdlr);
6540 }
6541 }
6542 if ((URL != NULL) && (ctxt->input != NULL) &&
6543 (ctxt->input->filename == NULL))
6544 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6545 htmlParseDocument(ctxt);
6546 ret = ctxt->myDoc;
6547 ctxt->myDoc = NULL;
6548 if (!reuse) {
6549 if ((ctxt->dictNames) &&
6550 (ret != NULL) &&
6551 (ret->dict == ctxt->dict))
6552 ctxt->dict = NULL;
6553 xmlFreeParserCtxt(ctxt);
6554 }
6555 return (ret);
6556 }
6557
6558 /**
6559 * htmlReadDoc:
6560 * @cur: a pointer to a zero terminated string
6561 * @URL: the base URL to use for the document
6562 * @encoding: the document encoding, or NULL
6563 * @options: a combination of htmlParserOption(s)
6564 *
6565 * parse an XML in-memory document and build a tree.
6566 *
6567 * Returns the resulting document tree
6568 */
6569 htmlDocPtr
htmlReadDoc(const xmlChar * cur,const char * URL,const char * encoding,int options)6570 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6571 {
6572 htmlParserCtxtPtr ctxt;
6573
6574 if (cur == NULL)
6575 return (NULL);
6576
6577 xmlInitParser();
6578 ctxt = htmlCreateDocParserCtxt(cur, NULL);
6579 if (ctxt == NULL)
6580 return (NULL);
6581 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6582 }
6583
6584 /**
6585 * htmlReadFile:
6586 * @filename: a file or URL
6587 * @encoding: the document encoding, or NULL
6588 * @options: a combination of htmlParserOption(s)
6589 *
6590 * parse an XML file from the filesystem or the network.
6591 *
6592 * Returns the resulting document tree
6593 */
6594 htmlDocPtr
htmlReadFile(const char * filename,const char * encoding,int options)6595 htmlReadFile(const char *filename, const char *encoding, int options)
6596 {
6597 htmlParserCtxtPtr ctxt;
6598
6599 xmlInitParser();
6600 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6601 if (ctxt == NULL)
6602 return (NULL);
6603 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6604 }
6605
6606 /**
6607 * htmlReadMemory:
6608 * @buffer: a pointer to a char array
6609 * @size: the size of the array
6610 * @URL: the base URL to use for the document
6611 * @encoding: the document encoding, or NULL
6612 * @options: a combination of htmlParserOption(s)
6613 *
6614 * parse an XML in-memory document and build a tree.
6615 *
6616 * Returns the resulting document tree
6617 */
6618 htmlDocPtr
htmlReadMemory(const char * buffer,int size,const char * URL,const char * encoding,int options)6619 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6620 {
6621 htmlParserCtxtPtr ctxt;
6622
6623 xmlInitParser();
6624 ctxt = htmlCreateMemoryParserCtxt(buffer, size);
6625 if (ctxt == NULL)
6626 return (NULL);
6627 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6628 }
6629
6630 /**
6631 * htmlReadFd:
6632 * @fd: an open file descriptor
6633 * @URL: the base URL to use for the document
6634 * @encoding: the document encoding, or NULL
6635 * @options: a combination of htmlParserOption(s)
6636 *
6637 * parse an HTML from a file descriptor and build a tree.
6638 * NOTE that the file descriptor will not be closed when the
6639 * reader is closed or reset.
6640 *
6641 * Returns the resulting document tree
6642 */
6643 htmlDocPtr
htmlReadFd(int fd,const char * URL,const char * encoding,int options)6644 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6645 {
6646 htmlParserCtxtPtr ctxt;
6647 xmlParserInputBufferPtr input;
6648 htmlParserInputPtr stream;
6649
6650 if (fd < 0)
6651 return (NULL);
6652
6653 xmlInitParser();
6654 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6655 if (input == NULL)
6656 return (NULL);
6657 input->closecallback = NULL;
6658 ctxt = htmlNewParserCtxt();
6659 if (ctxt == NULL) {
6660 xmlFreeParserInputBuffer(input);
6661 return (NULL);
6662 }
6663 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6664 if (stream == NULL) {
6665 xmlFreeParserInputBuffer(input);
6666 htmlFreeParserCtxt(ctxt);
6667 return (NULL);
6668 }
6669 inputPush(ctxt, stream);
6670 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6671 }
6672
6673 /**
6674 * htmlReadIO:
6675 * @ioread: an I/O read function
6676 * @ioclose: an I/O close function
6677 * @ioctx: an I/O handler
6678 * @URL: the base URL to use for the document
6679 * @encoding: the document encoding, or NULL
6680 * @options: a combination of htmlParserOption(s)
6681 *
6682 * parse an HTML document from I/O functions and source and build a tree.
6683 *
6684 * Returns the resulting document tree
6685 */
6686 htmlDocPtr
htmlReadIO(xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)6687 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6688 void *ioctx, const char *URL, const char *encoding, int options)
6689 {
6690 htmlParserCtxtPtr ctxt;
6691 xmlParserInputBufferPtr input;
6692 xmlParserInputPtr stream;
6693
6694 if (ioread == NULL)
6695 return (NULL);
6696 xmlInitParser();
6697
6698 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6699 XML_CHAR_ENCODING_NONE);
6700 if (input == NULL) {
6701 if (ioclose != NULL)
6702 ioclose(ioctx);
6703 return (NULL);
6704 }
6705 ctxt = htmlNewParserCtxt();
6706 if (ctxt == NULL) {
6707 xmlFreeParserInputBuffer(input);
6708 return (NULL);
6709 }
6710 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6711 if (stream == NULL) {
6712 xmlFreeParserInputBuffer(input);
6713 xmlFreeParserCtxt(ctxt);
6714 return (NULL);
6715 }
6716 inputPush(ctxt, stream);
6717 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6718 }
6719
6720 /**
6721 * htmlCtxtReadDoc:
6722 * @ctxt: an HTML parser context
6723 * @str: a pointer to a zero terminated string
6724 * @URL: the base URL to use for the document
6725 * @encoding: the document encoding, or NULL
6726 * @options: a combination of htmlParserOption(s)
6727 *
6728 * parse an XML in-memory document and build a tree.
6729 * This reuses the existing @ctxt parser context
6730 *
6731 * Returns the resulting document tree
6732 */
6733 htmlDocPtr
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt,const xmlChar * str,const char * URL,const char * encoding,int options)6734 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar *str,
6735 const char *URL, const char *encoding, int options)
6736 {
6737 xmlParserInputBufferPtr input;
6738 xmlParserInputPtr stream;
6739
6740 if (ctxt == NULL)
6741 return (NULL);
6742 if (str == NULL)
6743 return (NULL);
6744 xmlInitParser();
6745
6746 htmlCtxtReset(ctxt);
6747
6748 input = xmlParserInputBufferCreateString(str);
6749 if (input == NULL) {
6750 return(NULL);
6751 }
6752
6753 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6754 if (stream == NULL) {
6755 xmlFreeParserInputBuffer(input);
6756 return(NULL);
6757 }
6758
6759 inputPush(ctxt, stream);
6760 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6761 }
6762
6763 /**
6764 * htmlCtxtReadFile:
6765 * @ctxt: an HTML parser context
6766 * @filename: a file or URL
6767 * @encoding: the document encoding, or NULL
6768 * @options: a combination of htmlParserOption(s)
6769 *
6770 * parse an XML file from the filesystem or the network.
6771 * This reuses the existing @ctxt parser context
6772 *
6773 * Returns the resulting document tree
6774 */
6775 htmlDocPtr
htmlCtxtReadFile(htmlParserCtxtPtr ctxt,const char * filename,const char * encoding,int options)6776 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6777 const char *encoding, int options)
6778 {
6779 xmlParserInputPtr stream;
6780
6781 if (filename == NULL)
6782 return (NULL);
6783 if (ctxt == NULL)
6784 return (NULL);
6785 xmlInitParser();
6786
6787 htmlCtxtReset(ctxt);
6788
6789 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6790 if (stream == NULL) {
6791 return (NULL);
6792 }
6793 inputPush(ctxt, stream);
6794 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6795 }
6796
6797 /**
6798 * htmlCtxtReadMemory:
6799 * @ctxt: an HTML parser context
6800 * @buffer: a pointer to a char array
6801 * @size: the size of the array
6802 * @URL: the base URL to use for the document
6803 * @encoding: the document encoding, or NULL
6804 * @options: a combination of htmlParserOption(s)
6805 *
6806 * parse an XML in-memory document and build a tree.
6807 * This reuses the existing @ctxt parser context
6808 *
6809 * Returns the resulting document tree
6810 */
6811 htmlDocPtr
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt,const char * buffer,int size,const char * URL,const char * encoding,int options)6812 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6813 const char *URL, const char *encoding, int options)
6814 {
6815 xmlParserInputBufferPtr input;
6816 xmlParserInputPtr stream;
6817
6818 if (ctxt == NULL)
6819 return (NULL);
6820 if (buffer == NULL)
6821 return (NULL);
6822 xmlInitParser();
6823
6824 htmlCtxtReset(ctxt);
6825
6826 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6827 if (input == NULL) {
6828 return(NULL);
6829 }
6830
6831 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6832 if (stream == NULL) {
6833 xmlFreeParserInputBuffer(input);
6834 return(NULL);
6835 }
6836
6837 inputPush(ctxt, stream);
6838 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6839 }
6840
6841 /**
6842 * htmlCtxtReadFd:
6843 * @ctxt: an HTML parser context
6844 * @fd: an open file descriptor
6845 * @URL: the base URL to use for the document
6846 * @encoding: the document encoding, or NULL
6847 * @options: a combination of htmlParserOption(s)
6848 *
6849 * parse an XML from a file descriptor and build a tree.
6850 * This reuses the existing @ctxt parser context
6851 *
6852 * Returns the resulting document tree
6853 */
6854 htmlDocPtr
htmlCtxtReadFd(htmlParserCtxtPtr ctxt,int fd,const char * URL,const char * encoding,int options)6855 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6856 const char *URL, const char *encoding, int options)
6857 {
6858 xmlParserInputBufferPtr input;
6859 xmlParserInputPtr stream;
6860
6861 if (fd < 0)
6862 return (NULL);
6863 if (ctxt == NULL)
6864 return (NULL);
6865 xmlInitParser();
6866
6867 htmlCtxtReset(ctxt);
6868
6869
6870 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6871 if (input == NULL)
6872 return (NULL);
6873 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6874 if (stream == NULL) {
6875 xmlFreeParserInputBuffer(input);
6876 return (NULL);
6877 }
6878 inputPush(ctxt, stream);
6879 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6880 }
6881
6882 /**
6883 * htmlCtxtReadIO:
6884 * @ctxt: an HTML parser context
6885 * @ioread: an I/O read function
6886 * @ioclose: an I/O close function
6887 * @ioctx: an I/O handler
6888 * @URL: the base URL to use for the document
6889 * @encoding: the document encoding, or NULL
6890 * @options: a combination of htmlParserOption(s)
6891 *
6892 * parse an HTML document from I/O functions and source and build a tree.
6893 * This reuses the existing @ctxt parser context
6894 *
6895 * Returns the resulting document tree
6896 */
6897 htmlDocPtr
htmlCtxtReadIO(htmlParserCtxtPtr ctxt,xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)6898 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6899 xmlInputCloseCallback ioclose, void *ioctx,
6900 const char *URL,
6901 const char *encoding, int options)
6902 {
6903 xmlParserInputBufferPtr input;
6904 xmlParserInputPtr stream;
6905
6906 if (ioread == NULL)
6907 return (NULL);
6908 if (ctxt == NULL)
6909 return (NULL);
6910 xmlInitParser();
6911
6912 htmlCtxtReset(ctxt);
6913
6914 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6915 XML_CHAR_ENCODING_NONE);
6916 if (input == NULL) {
6917 if (ioclose != NULL)
6918 ioclose(ioctx);
6919 return (NULL);
6920 }
6921 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6922 if (stream == NULL) {
6923 xmlFreeParserInputBuffer(input);
6924 return (NULL);
6925 }
6926 inputPush(ctxt, stream);
6927 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6928 }
6929
6930 #endif /* LIBXML_HTML_ENABLED */
6931