1 /*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9 #define IN_LIBXML
10 #include "libxml.h"
11 #ifdef LIBXML_HTML_ENABLED
12
#include <limits.h>
#include <string.h>
14 #ifdef HAVE_CTYPE_H
15 #include <ctype.h>
16 #endif
17 #ifdef HAVE_STDLIB_H
18 #include <stdlib.h>
19 #endif
20 #ifdef HAVE_SYS_STAT_H
21 #include <sys/stat.h>
22 #endif
23 #ifdef HAVE_FCNTL_H
24 #include <fcntl.h>
25 #endif
26 #ifdef HAVE_UNISTD_H
27 #include <unistd.h>
28 #endif
29 #ifdef LIBXML_ZLIB_ENABLED
30 #include <zlib.h>
31 #endif
32
33 #include <libxml/xmlmemory.h>
34 #include <libxml/tree.h>
35 #include <libxml/parser.h>
36 #include <libxml/parserInternals.h>
37 #include <libxml/xmlerror.h>
38 #include <libxml/HTMLparser.h>
39 #include <libxml/HTMLtree.h>
40 #include <libxml/entities.h>
41 #include <libxml/encoding.h>
42 #include <libxml/valid.h>
43 #include <libxml/xmlIO.h>
44 #include <libxml/globals.h>
45 #include <libxml/uri.h>
46
47 #include "buf.h"
48 #include "enc.h"
49
50 #define HTML_MAX_NAMELEN 1000
51 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
52 #define HTML_PARSER_BUFFER_SIZE 100
53
54 /* #define DEBUG */
55 /* #define DEBUG_PUSH */
56
57 static int htmlOmittedDefaultValue = 1;
58
59 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
60 xmlChar end, xmlChar end2, xmlChar end3);
61 static void htmlParseComment(htmlParserCtxtPtr ctxt);
62
63 /************************************************************************
64 * *
65 * Some factorized error routines *
66 * *
67 ************************************************************************/
68
69 /**
70 * htmlErrMemory:
71 * @ctxt: an HTML parser context
72 * @extra: extra information
73 *
74 * Handle a redefinition of attribute error
75 */
76 static void
htmlErrMemory(xmlParserCtxtPtr ctxt,const char * extra)77 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
78 {
79 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
80 (ctxt->instate == XML_PARSER_EOF))
81 return;
82 if (ctxt != NULL) {
83 ctxt->errNo = XML_ERR_NO_MEMORY;
84 ctxt->instate = XML_PARSER_EOF;
85 ctxt->disableSAX = 1;
86 }
87 if (extra)
88 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
89 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
90 NULL, NULL, 0, 0,
91 "Memory allocation failed : %s\n", extra);
92 else
93 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
94 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
95 NULL, NULL, 0, 0, "Memory allocation failed\n");
96 }
97
98 /**
99 * htmlParseErr:
100 * @ctxt: an HTML parser context
101 * @error: the error number
102 * @msg: the error message
103 * @str1: string infor
104 * @str2: string infor
105 *
106 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
107 */
108 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErr(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,const xmlChar * str1,const xmlChar * str2)109 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
110 const char *msg, const xmlChar *str1, const xmlChar *str2)
111 {
112 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
113 (ctxt->instate == XML_PARSER_EOF))
114 return;
115 if (ctxt != NULL)
116 ctxt->errNo = error;
117 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
118 XML_ERR_ERROR, NULL, 0,
119 (const char *) str1, (const char *) str2,
120 NULL, 0, 0,
121 msg, str1, str2);
122 if (ctxt != NULL)
123 ctxt->wellFormed = 0;
124 }
125
126 /**
127 * htmlParseErrInt:
128 * @ctxt: an HTML parser context
129 * @error: the error number
130 * @msg: the error message
131 * @val: integer info
132 *
133 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
134 */
135 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErrInt(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,int val)136 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
137 const char *msg, int val)
138 {
139 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
140 (ctxt->instate == XML_PARSER_EOF))
141 return;
142 if (ctxt != NULL)
143 ctxt->errNo = error;
144 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
145 XML_ERR_ERROR, NULL, 0, NULL, NULL,
146 NULL, val, 0, msg, val);
147 if (ctxt != NULL)
148 ctxt->wellFormed = 0;
149 }
150
151 /************************************************************************
152 * *
153 * Parser stacks related functions and macros *
154 * *
155 ************************************************************************/
156
157 /**
158 * htmlnamePush:
159 * @ctxt: an HTML parser context
160 * @value: the element name
161 *
162 * Pushes a new element name on top of the name stack
163 *
164 * Returns 0 in case of error, the index in the stack otherwise
165 */
166 static int
htmlnamePush(htmlParserCtxtPtr ctxt,const xmlChar * value)167 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
168 {
169 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
170 ctxt->html = 3;
171 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
172 ctxt->html = 10;
173 if (ctxt->nameNr >= ctxt->nameMax) {
174 ctxt->nameMax *= 2;
175 ctxt->nameTab = (const xmlChar * *)
176 xmlRealloc((xmlChar * *)ctxt->nameTab,
177 ctxt->nameMax *
178 sizeof(ctxt->nameTab[0]));
179 if (ctxt->nameTab == NULL) {
180 htmlErrMemory(ctxt, NULL);
181 return (0);
182 }
183 }
184 ctxt->nameTab[ctxt->nameNr] = value;
185 ctxt->name = value;
186 return (ctxt->nameNr++);
187 }
188 /**
189 * htmlnamePop:
190 * @ctxt: an HTML parser context
191 *
192 * Pops the top element name from the name stack
193 *
194 * Returns the name just removed
195 */
196 static const xmlChar *
htmlnamePop(htmlParserCtxtPtr ctxt)197 htmlnamePop(htmlParserCtxtPtr ctxt)
198 {
199 const xmlChar *ret;
200
201 if (ctxt->nameNr <= 0)
202 return (NULL);
203 ctxt->nameNr--;
204 if (ctxt->nameNr < 0)
205 return (NULL);
206 if (ctxt->nameNr > 0)
207 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
208 else
209 ctxt->name = NULL;
210 ret = ctxt->nameTab[ctxt->nameNr];
211 ctxt->nameTab[ctxt->nameNr] = NULL;
212 return (ret);
213 }
214
215 /**
216 * htmlNodeInfoPush:
217 * @ctxt: an HTML parser context
218 * @value: the node info
219 *
220 * Pushes a new element name on top of the node info stack
221 *
222 * Returns 0 in case of error, the index in the stack otherwise
223 */
224 static int
htmlNodeInfoPush(htmlParserCtxtPtr ctxt,htmlParserNodeInfo * value)225 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
226 {
227 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
228 if (ctxt->nodeInfoMax == 0)
229 ctxt->nodeInfoMax = 5;
230 ctxt->nodeInfoMax *= 2;
231 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
232 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
233 ctxt->nodeInfoMax *
234 sizeof(ctxt->nodeInfoTab[0]));
235 if (ctxt->nodeInfoTab == NULL) {
236 htmlErrMemory(ctxt, NULL);
237 return (0);
238 }
239 }
240 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
241 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
242 return (ctxt->nodeInfoNr++);
243 }
244
245 /**
246 * htmlNodeInfoPop:
247 * @ctxt: an HTML parser context
248 *
249 * Pops the top element name from the node info stack
250 *
251 * Returns 0 in case of error, the pointer to NodeInfo otherwise
252 */
253 static htmlParserNodeInfo *
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)254 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
255 {
256 if (ctxt->nodeInfoNr <= 0)
257 return (NULL);
258 ctxt->nodeInfoNr--;
259 if (ctxt->nodeInfoNr < 0)
260 return (NULL);
261 if (ctxt->nodeInfoNr > 0)
262 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
263 else
264 ctxt->nodeInfo = NULL;
265 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
266 }
267
268 /*
269 * Macros for accessing the content. Those should be used only by the parser,
270 * and not exported.
271 *
272 * Dirty macros, i.e. one need to make assumption on the context to use them
273 *
274 * CUR_PTR return the current pointer to the xmlChar to be parsed.
275 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
276 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
277 * in UNICODE mode. This should be used internally by the parser
278 * only to compare to ASCII values otherwise it would break when
279 * running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
 *           to compare on ASCII based substring.
282 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
283 * it should be used only to compare on ASCII based substring.
284 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
285 * strings without newlines within the parser.
286 *
287 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
288 *
289 * CURRENT Returns the current char value, with the full decoding of
290 * UTF-8 if we are using this mode. It returns an int.
291 * NEXT Skip to the next character, this does the proper decoding
292 * in UTF-8 mode. It also pop-up unfinished entities on the fly.
293 * NEXTL(l) Skip the current unicode character of l xmlChars long.
294 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
295 */
296
/*
 * All of the macros below expect a parser context variable named "ctxt"
 * to be in scope at the point of use.
 */

/* Current input byte converted to upper case. */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance both the input cursor and the column counter by val. */
#define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val)

/* Input byte at offset val from the cursor. */
#define NXT(val) ctxt->input->cur[(val)]

/* Input byte at offset val from the cursor, converted to upper case. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* Raw cursor and buffer base pointers of the current input. */
#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

/*
 * Shrink the input once more than 2 * INPUT_CHUNK bytes lie before the
 * cursor and fewer than 2 * INPUT_CHUNK remain after it.
 * NOTE: expands to a bare "if"; use it only as a complete statement.
 */
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
                   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserInputShrink(ctxt->input)

/*
 * Grow the input when not in progressive mode and fewer than INPUT_CHUNK
 * bytes remain after the cursor.
 * NOTE: expands to a bare "if"; use it only as a complete statement.
 */
#define GROW if ((ctxt->progressive == 0) && \
                 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Current input byte as an int. */
#define CURRENT ((int) (*ctxt->input->cur))

/* Skip blanks at the cursor; evaluates to the htmlSkipBlankChars result. */
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

/* -1 while ctxt->token is set, otherwise the current input byte. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


/*
 * Step over the current character, which is l bytes long: update the
 * line/column counters, clear ctxt->token and advance the cursor.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
        ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += l;				\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

/* Decode the character at the cursor; its byte length is stored in l. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
/* Same for a character read from the string s. */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

/*
 * Append character v, encoded on l bytes, to buffer b at index i and
 * advance i accordingly.
 * NOTE: expands to a bare "if/else"; use it only as a complete statement.
 */
#define COPY_BUF(l,b,i,v)						\
    if (l == 1) b[i++] = (xmlChar) v;					\
    else i += xmlCopyChar(l,&b[i],v)
348
349 /**
350 * htmlFindEncoding:
351 * @the HTML parser context
352 *
353 * Ty to find and encoding in the current data available in the input
354 * buffer this is needed to try to switch to the proper encoding when
355 * one face a character error.
356 * That's an heuristic, since it's operating outside of parsing it could
357 * try to use a meta which had been commented out, that's the reason it
358 * should only be used in case of error, not as a default.
359 *
360 * Returns an encoding string or NULL if not found, the string need to
361 * be freed
362 */
363 static xmlChar *
htmlFindEncoding(xmlParserCtxtPtr ctxt)364 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
365 const xmlChar *start, *cur, *end;
366
367 if ((ctxt == NULL) || (ctxt->input == NULL) ||
368 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
369 (ctxt->input->buf->encoder != NULL))
370 return(NULL);
371 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
372 return(NULL);
373
374 start = ctxt->input->cur;
375 end = ctxt->input->end;
376 /* we also expect the input buffer to be zero terminated */
377 if (*end != 0)
378 return(NULL);
379
380 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
381 if (cur == NULL)
382 return(NULL);
383 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
384 if (cur == NULL)
385 return(NULL);
386 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
387 if (cur == NULL)
388 return(NULL);
389 cur += 8;
390 start = cur;
391 while (((*cur >= 'A') && (*cur <= 'Z')) ||
392 ((*cur >= 'a') && (*cur <= 'z')) ||
393 ((*cur >= '0') && (*cur <= '9')) ||
394 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
395 cur++;
396 if (cur == start)
397 return(NULL);
398 return(xmlStrndup(start, cur - start));
399 }
400
401 /**
402 * htmlCurrentChar:
403 * @ctxt: the HTML parser context
404 * @len: pointer to the length of the char read
405 *
406 * The current char value, if using UTF-8 this may actually span multiple
407 * bytes in the input buffer. Implement the end of line normalization:
408 * 2.11 End-of-Line Handling
409 * If the encoding is unspecified, in the case we find an ISO-Latin-1
410 * char, then the encoding converter is plugged in automatically.
411 *
412 * Returns the current char value and its length
413 */
414
415 static int
htmlCurrentChar(xmlParserCtxtPtr ctxt,int * len)416 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
417 const unsigned char *cur;
418 unsigned char c;
419 unsigned int val;
420
421 if (ctxt->instate == XML_PARSER_EOF)
422 return(0);
423
424 if (ctxt->token != 0) {
425 *len = 0;
426 return(ctxt->token);
427 }
428 if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
429 xmlChar * guess;
430 xmlCharEncodingHandlerPtr handler;
431
432 /*
433 * Assume it's a fixed length encoding (1) with
434 * a compatible encoding for the ASCII set, since
435 * HTML constructs only use < 128 chars
436 */
437 if ((int) *ctxt->input->cur < 0x80) {
438 *len = 1;
439 if ((*ctxt->input->cur == 0) &&
440 (ctxt->input->cur < ctxt->input->end)) {
441 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
442 "Char 0x%X out of allowed range\n", 0);
443 return(' ');
444 }
445 return((int) *ctxt->input->cur);
446 }
447
448 /*
449 * Humm this is bad, do an automatic flow conversion
450 */
451 guess = htmlFindEncoding(ctxt);
452 if (guess == NULL) {
453 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
454 } else {
455 if (ctxt->input->encoding != NULL)
456 xmlFree((xmlChar *) ctxt->input->encoding);
457 ctxt->input->encoding = guess;
458 handler = xmlFindCharEncodingHandler((const char *) guess);
459 if (handler != NULL) {
460 /*
461 * Don't use UTF-8 encoder which isn't required and
462 * can produce invalid UTF-8.
463 */
464 if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
465 xmlSwitchToEncoding(ctxt, handler);
466 } else {
467 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
468 "Unsupported encoding %s", guess, NULL);
469 }
470 }
471 ctxt->charset = XML_CHAR_ENCODING_UTF8;
472 }
473
474 /*
475 * We are supposed to handle UTF8, check it's valid
476 * From rfc2044: encoding of the Unicode values on UTF-8:
477 *
478 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
479 * 0000 0000-0000 007F 0xxxxxxx
480 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
481 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
482 *
483 * Check for the 0x110000 limit too
484 */
485 cur = ctxt->input->cur;
486 c = *cur;
487 if (c & 0x80) {
488 if ((c & 0x40) == 0)
489 goto encoding_error;
490 if (cur[1] == 0) {
491 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
492 cur = ctxt->input->cur;
493 }
494 if ((cur[1] & 0xc0) != 0x80)
495 goto encoding_error;
496 if ((c & 0xe0) == 0xe0) {
497
498 if (cur[2] == 0) {
499 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
500 cur = ctxt->input->cur;
501 }
502 if ((cur[2] & 0xc0) != 0x80)
503 goto encoding_error;
504 if ((c & 0xf0) == 0xf0) {
505 if (cur[3] == 0) {
506 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
507 cur = ctxt->input->cur;
508 }
509 if (((c & 0xf8) != 0xf0) ||
510 ((cur[3] & 0xc0) != 0x80))
511 goto encoding_error;
512 /* 4-byte code */
513 *len = 4;
514 val = (cur[0] & 0x7) << 18;
515 val |= (cur[1] & 0x3f) << 12;
516 val |= (cur[2] & 0x3f) << 6;
517 val |= cur[3] & 0x3f;
518 if (val < 0x10000)
519 goto encoding_error;
520 } else {
521 /* 3-byte code */
522 *len = 3;
523 val = (cur[0] & 0xf) << 12;
524 val |= (cur[1] & 0x3f) << 6;
525 val |= cur[2] & 0x3f;
526 if (val < 0x800)
527 goto encoding_error;
528 }
529 } else {
530 /* 2-byte code */
531 *len = 2;
532 val = (cur[0] & 0x1f) << 6;
533 val |= cur[1] & 0x3f;
534 if (val < 0x80)
535 goto encoding_error;
536 }
537 if (!IS_CHAR(val)) {
538 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
539 "Char 0x%X out of allowed range\n", val);
540 }
541 return(val);
542 } else {
543 if ((*ctxt->input->cur == 0) &&
544 (ctxt->input->cur < ctxt->input->end)) {
545 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
546 "Char 0x%X out of allowed range\n", 0);
547 *len = 1;
548 return(' ');
549 }
550 /* 1-byte code */
551 *len = 1;
552 return((int) *ctxt->input->cur);
553 }
554
555 encoding_error:
556 /*
557 * If we detect an UTF8 error that probably mean that the
558 * input encoding didn't get properly advertised in the
559 * declaration header. Report the error and switch the encoding
560 * to ISO-Latin-1 (if you don't like this policy, just declare the
561 * encoding !)
562 */
563 {
564 char buffer[150];
565
566 if (ctxt->input->end - ctxt->input->cur >= 4) {
567 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
568 ctxt->input->cur[0], ctxt->input->cur[1],
569 ctxt->input->cur[2], ctxt->input->cur[3]);
570 } else {
571 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
572 }
573 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
574 "Input is not proper UTF-8, indicate encoding !\n",
575 BAD_CAST buffer, NULL);
576 }
577
578 /*
579 * Don't switch encodings twice. Note that if there's an encoder, we
580 * shouldn't receive invalid UTF-8 anyway.
581 *
582 * Note that if ctxt->input->buf == NULL, switching encodings is
583 * impossible, see Gitlab issue #34.
584 */
585 if ((ctxt->input->buf != NULL) &&
586 (ctxt->input->buf->encoder == NULL))
587 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
588 *len = 1;
589 return((int) *ctxt->input->cur);
590 }
591
592 /**
593 * htmlSkipBlankChars:
594 * @ctxt: the HTML parser context
595 *
596 * skip all blanks character found at that point in the input streams.
597 *
598 * Returns the number of space chars skipped
599 */
600
601 static int
htmlSkipBlankChars(xmlParserCtxtPtr ctxt)602 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
603 int res = 0;
604
605 while (IS_BLANK_CH(*(ctxt->input->cur))) {
606 if ((*ctxt->input->cur == 0) &&
607 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
608 xmlPopInput(ctxt);
609 } else {
610 if (*(ctxt->input->cur) == '\n') {
611 ctxt->input->line++; ctxt->input->col = 1;
612 } else ctxt->input->col++;
613 ctxt->input->cur++;
614 if (*ctxt->input->cur == 0)
615 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
616 }
617 res++;
618 }
619 return(res);
620 }
621
622
623
624 /************************************************************************
625 * *
626 * The list of HTML elements and their properties *
627 * *
628 ************************************************************************/
629
630 /*
631 * Start Tag: 1 means the start tag can be omitted
632 * End Tag: 1 means the end tag can be omitted
633 * 2 means it's forbidden (empty elements)
634 * 3 means the tag is stylistic and should be closed easily
635 * Depr: this element is deprecated
636 * DTD: 1 means that this element is valid only in the Loose DTD
637 * 2 means that this element is valid only in the Frameset DTD
638 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 * , subElements , impliedsubelt , Attributes (optional) , Attributes (deprecated) , Attributes (required)
641 */
642
/* Definitions and a couple of vars for HTML Elements */

/*
 * Each element group is a comma separated list of names meant to be
 * spliced into an array initializer; the matching NB_xxx macro is the
 * number of names in the group. The NB_xxx sums are parenthesized so
 * they can be used safely inside larger expressions.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL terminated lists of the flow and inline element names. */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
676
677
/* ... and for HTML Attributes */

/*
 * Attribute groups follow the same scheme as the element groups: a comma
 * separated list of names plus an NB_xxx count of its entries.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL terminated attribute name lists built from the groups above. */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
697
698
699 /* Other declarations that should go inline ... */
700 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
701 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
702 "tabindex", "onfocus", "onblur", NULL } ;
703 static const char* const target_attr[] = { "target", NULL } ;
704 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
705 static const char* const alt_attr[] = { "alt", NULL } ;
706 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
707 static const char* const href_attrs[] = { "href", NULL } ;
708 static const char* const clear_attrs[] = { "clear", NULL } ;
709 static const char* const inline_p[] = { INLINE, "p", NULL } ;
710
711 static const char* const flow_param[] = { FLOW, "param", NULL } ;
712 static const char* const applet_attrs[] = { COREATTRS , "codebase",
713 "archive", "alt", "name", "height", "width", "align",
714 "hspace", "vspace", NULL } ;
715 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
716 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
717 static const char* const basefont_attrs[] =
718 { "id", "size", "color", "face", NULL } ;
719 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
720 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
721 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
722 static const char* const body_depr[] = { "background", "bgcolor", "text",
723 "link", "vlink", "alink", NULL } ;
724 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
725 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
726
727
728 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
729 static const char* const col_elt[] = { "col", NULL } ;
730 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
731 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
732 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
733 static const char* const compact_attr[] = { "compact", NULL } ;
734 static const char* const label_attr[] = { "label", NULL } ;
735 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
736 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
737 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
738 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
739 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
740 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
741 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
742 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
743 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
744 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
745 static const char* const version_attr[] = { "version", NULL } ;
746 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
747 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
748 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
749 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
750 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
751 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
752 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
753 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
754 static const char* const align_attr[] = { "align", NULL } ;
755 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
756 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
757 static const char* const name_attr[] = { "name", NULL } ;
758 static const char* const action_attr[] = { "action", NULL } ;
759 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
760 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
761 static const char* const content_attr[] = { "content", NULL } ;
762 static const char* const type_attr[] = { "type", NULL } ;
763 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
764 static const char* const object_contents[] = { FLOW, "param", NULL } ;
765 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
766 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
767 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
768 static const char* const option_elt[] = { "option", NULL } ;
769 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
770 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
771 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
772 static const char* const width_attr[] = { "width", NULL } ;
773 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
774 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
775 static const char* const language_attr[] = { "language", NULL } ;
776 static const char* const select_content[] = { "optgroup", "option", NULL } ;
777 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
778 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
779 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
780 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
781 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
782 static const char* const tr_elt[] = { "tr", NULL } ;
783 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
784 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
785 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
786 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
787 static const char* const tr_contents[] = { "th", "td", NULL } ;
788 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
789 static const char* const li_elt[] = { "li", NULL } ;
790 static const char* const ul_depr[] = { "type", "compact", NULL} ;
791 static const char* const dir_attr[] = { "dir", NULL} ;
792
793 #define DECL (const char**)
794
795 static const htmlElemDesc
796 html40ElementTable[] = {
797 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
798 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
799 },
800 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
801 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
802 },
803 { "acronym", 0, 0, 0, 0, 0, 0, 1, "",
804 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
805 },
806 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
807 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
808 },
809 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
810 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
811 },
812 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
813 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
814 },
815 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
816 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
817 },
818 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
819 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
820 },
821 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
822 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
823 },
824 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
825 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
826 },
827 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
828 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
829 },
830 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
831 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
832 },
833 { "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
834 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
835 },
836 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
837 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
838 },
839 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
840 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
841 },
842 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
843 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
844 },
845 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
846 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
847 },
848 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
849 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
850 },
851 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
852 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
853 },
854 { "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
855 EMPTY , NULL , DECL col_attrs , NULL, NULL
856 },
857 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
858 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
859 },
860 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
861 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
862 },
863 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
864 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
865 },
866 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
867 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
868 },
869 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
870 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
871 },
872 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
873 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
874 },
875 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
876 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
877 },
878 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
879 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
880 },
881 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
882 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
883 },
884 { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
885 EMPTY, NULL, DECL embed_attrs, NULL, NULL
886 },
887 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
888 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
889 },
890 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
891 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
892 },
893 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
894 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
895 },
896 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
897 EMPTY, NULL, NULL, DECL frame_attrs, NULL
898 },
899 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
900 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
901 },
902 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
903 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
904 },
905 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
906 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
907 },
908 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
909 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
910 },
911 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
912 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
913 },
914 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
915 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
916 },
917 { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
918 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
919 },
920 { "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
921 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
922 },
923 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
924 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
925 },
926 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
927 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
928 },
929 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
930 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
931 },
932 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
933 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
934 },
935 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
936 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
937 },
938 { "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
939 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
940 },
941 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
942 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
943 },
944 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
945 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
946 },
947 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
948 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
949 },
950 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
951 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
952 },
953 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
954 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
955 },
956 { "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
957 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
958 },
959 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
960 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
961 },
962 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
963 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
964 },
965 { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
966 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
967 },
968 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
969 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
970 },
971 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
972 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
973 },
974 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
975 DECL html_flow, "div", DECL html_attrs, NULL, NULL
976 },
977 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
978 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
979 },
980 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
981 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
982 },
983 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
984 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
985 },
986 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
987 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
988 },
989 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
990 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
991 },
992 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
993 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
994 },
995 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
996 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
997 },
998 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
999 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
1000 },
1001 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
1002 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1003 },
1004 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
1005 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1006 },
1007 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
1008 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
1009 },
1010 { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
1011 DECL select_content, NULL, DECL select_attrs, NULL, NULL
1012 },
1013 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
1014 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1015 },
1016 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
1017 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1018 },
1019 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
1020 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1021 },
1022 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
1023 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1024 },
1025 { "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
1026 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1027 },
1028 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
1029 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1030 },
1031 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
1032 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1033 },
1034 { "table", 0, 0, 0, 0, 0, 0, 0, "",
1035 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1036 },
1037 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
1038 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1039 },
1040 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
1041 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1042 },
1043 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1044 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1045 },
1046 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
1047 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1048 },
1049 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
1050 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1051 },
1052 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
1053 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1054 },
1055 { "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
1056 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1057 },
1058 { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
1059 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1060 },
1061 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1062 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1063 },
1064 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
1065 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1066 },
1067 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
1068 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1069 },
1070 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1071 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1072 }
1073 };
1074
/* One auto-close rule: opening newTag implies the end of oldTag. */
typedef struct {
    const char *oldTag;
    const char *newTag;
} htmlStartCloseEntry;

/*
 * Start tags that imply the end of the current element.
 *
 * htmlCheckAutoClose() looks rules up with bsearch(), comparing oldTag
 * first and newTag second with strcmp(), so the entries MUST stay
 * sorted in that order.
 */
static const htmlStartCloseEntry htmlStartClose[] = {
    /* a */
    { "a", "a" }, { "a", "fieldset" }, { "a", "table" }, { "a", "td" },
    { "a", "th" },
    /* address */
    { "address", "dd" }, { "address", "dl" }, { "address", "dt" },
    { "address", "form" }, { "address", "li" }, { "address", "ul" },
    /* b, big */
    { "b", "center" }, { "b", "p" }, { "b", "td" }, { "b", "th" },
    { "big", "p" },
    /* caption */
    { "caption", "col" }, { "caption", "colgroup" }, { "caption", "tbody" },
    { "caption", "tfoot" }, { "caption", "thead" }, { "caption", "tr" },
    /* col */
    { "col", "col" }, { "col", "colgroup" }, { "col", "tbody" },
    { "col", "tfoot" }, { "col", "thead" }, { "col", "tr" },
    /* colgroup */
    { "colgroup", "colgroup" }, { "colgroup", "tbody" },
    { "colgroup", "tfoot" }, { "colgroup", "thead" }, { "colgroup", "tr" },
    /* dd, dir, dl, dt */
    { "dd", "dt" },
    { "dir", "dd" }, { "dir", "dl" }, { "dir", "dt" }, { "dir", "form" },
    { "dir", "ul" },
    { "dl", "form" }, { "dl", "li" },
    { "dt", "dd" }, { "dt", "dl" },
    /* font, form */
    { "font", "center" }, { "font", "td" }, { "font", "th" },
    { "form", "form" },
    /* h1 .. h6 */
    { "h1", "fieldset" }, { "h1", "form" }, { "h1", "li" }, { "h1", "p" },
    { "h1", "table" },
    { "h2", "fieldset" }, { "h2", "form" }, { "h2", "li" }, { "h2", "p" },
    { "h2", "table" },
    { "h3", "fieldset" }, { "h3", "form" }, { "h3", "li" }, { "h3", "p" },
    { "h3", "table" },
    { "h4", "fieldset" }, { "h4", "form" }, { "h4", "li" }, { "h4", "p" },
    { "h4", "table" },
    { "h5", "fieldset" }, { "h5", "form" }, { "h5", "li" }, { "h5", "p" },
    { "h5", "table" },
    { "h6", "fieldset" }, { "h6", "form" }, { "h6", "li" }, { "h6", "p" },
    { "h6", "table" },
    /* head */
    { "head", "a" }, { "head", "abbr" }, { "head", "acronym" },
    { "head", "address" }, { "head", "b" }, { "head", "bdo" },
    { "head", "big" }, { "head", "blockquote" }, { "head", "body" },
    { "head", "br" }, { "head", "center" }, { "head", "cite" },
    { "head", "code" }, { "head", "dd" }, { "head", "dfn" },
    { "head", "dir" }, { "head", "div" }, { "head", "dl" },
    { "head", "dt" }, { "head", "em" }, { "head", "fieldset" },
    { "head", "font" }, { "head", "form" }, { "head", "frameset" },
    { "head", "h1" }, { "head", "h2" }, { "head", "h3" },
    { "head", "h4" }, { "head", "h5" }, { "head", "h6" },
    { "head", "hr" }, { "head", "i" }, { "head", "iframe" },
    { "head", "img" }, { "head", "kbd" }, { "head", "li" },
    { "head", "listing" }, { "head", "map" }, { "head", "menu" },
    { "head", "ol" }, { "head", "p" }, { "head", "pre" },
    { "head", "q" }, { "head", "s" }, { "head", "samp" },
    { "head", "small" }, { "head", "span" }, { "head", "strike" },
    { "head", "strong" }, { "head", "sub" }, { "head", "sup" },
    { "head", "table" }, { "head", "tt" }, { "head", "u" },
    { "head", "ul" }, { "head", "var" }, { "head", "xmp" },
    /* hr, i, legend, li, link */
    { "hr", "form" },
    { "i", "center" }, { "i", "p" }, { "i", "td" }, { "i", "th" },
    { "legend", "fieldset" },
    { "li", "li" },
    { "link", "body" }, { "link", "frameset" },
    /* listing */
    { "listing", "dd" }, { "listing", "dl" }, { "listing", "dt" },
    { "listing", "fieldset" }, { "listing", "form" }, { "listing", "li" },
    { "listing", "table" }, { "listing", "ul" },
    /* menu, ol, option */
    { "menu", "dd" }, { "menu", "dl" }, { "menu", "dt" }, { "menu", "form" },
    { "menu", "ul" },
    { "ol", "form" }, { "ol", "ul" },
    { "option", "optgroup" }, { "option", "option" },
    /* p */
    { "p", "address" }, { "p", "blockquote" }, { "p", "body" },
    { "p", "caption" }, { "p", "center" }, { "p", "col" },
    { "p", "colgroup" }, { "p", "dd" }, { "p", "dir" },
    { "p", "div" }, { "p", "dl" }, { "p", "dt" },
    { "p", "fieldset" }, { "p", "form" }, { "p", "frameset" },
    { "p", "h1" }, { "p", "h2" }, { "p", "h3" },
    { "p", "h4" }, { "p", "h5" }, { "p", "h6" },
    { "p", "head" }, { "p", "hr" }, { "p", "li" },
    { "p", "listing" }, { "p", "menu" }, { "p", "ol" },
    { "p", "p" }, { "p", "pre" }, { "p", "table" },
    { "p", "tbody" }, { "p", "td" }, { "p", "tfoot" },
    { "p", "th" }, { "p", "title" }, { "p", "tr" },
    { "p", "ul" }, { "p", "xmp" },
    /* pre */
    { "pre", "dd" }, { "pre", "dl" }, { "pre", "dt" }, { "pre", "fieldset" },
    { "pre", "form" }, { "pre", "li" }, { "pre", "table" }, { "pre", "ul" },
    /* s, script, small, span, strike, style */
    { "s", "p" },
    { "script", "noscript" },
    { "small", "p" },
    { "span", "td" }, { "span", "th" },
    { "strike", "p" },
    { "style", "body" }, { "style", "frameset" },
    /* table sections, rows and cells */
    { "tbody", "tbody" }, { "tbody", "tfoot" },
    { "td", "tbody" }, { "td", "td" }, { "td", "tfoot" }, { "td", "th" },
    { "td", "tr" },
    { "tfoot", "tbody" },
    { "th", "tbody" }, { "th", "td" }, { "th", "tfoot" }, { "th", "th" },
    { "th", "tr" },
    { "thead", "tbody" }, { "thead", "tfoot" },
    { "title", "body" }, { "title", "frameset" },
    { "tr", "tbody" }, { "tr", "tfoot" }, { "tr", "tr" },
    /* tt, u, ul */
    { "tt", "p" },
    { "u", "p" }, { "u", "td" }, { "u", "th" },
    { "ul", "address" }, { "ul", "form" }, { "ul", "menu" }, { "ul", "ol" },
    { "ul", "pre" },
    /* xmp */
    { "xmp", "dd" }, { "xmp", "dl" }, { "xmp", "dt" }, { "xmp", "fieldset" },
    { "xmp", "form" }, { "xmp", "li" }, { "xmp", "table" }, { "xmp", "ul" }
};
1336
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * hold character data directly; a p element gets implied when text
 * shows up inside one of them.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char * const htmlNoContentElements[] = {
    "html", "head",
    NULL
};
1349
/*
 * The HTML attributes whose content is of type %Script;
 *
 * NOTE: htmlIsScriptAttribute() rejects any name that does not start
 * with "on" before it consults this list, so every name added here
 * must keep that prefix. The list has no NULL terminator; its length
 * is derived with sizeof.
 */
static const char * const htmlScriptAttributes[] = {
    /* mouse */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard */
    "onkeypress", "onkeydown", "onkeyup",
    /* document and form */
    "onload", "onunload", "onfocus", "onblur",
    "onsubmit", "onreset", "onchange", "onselect"
};
1375
/*
 * Priorities used by the HTML parser when it meets broken pages.
 * Each element gets a priority, which lets the parser decide what to
 * do with stray end tags: an end tag may only close elements whose
 * priority is lower than or equal to its own.
 */

/* Priority attached to one element name. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * The list is scanned from the top and stops at the NULL name, whose
 * priority is the default for every element not listed.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
1403
1404 /************************************************************************
1405 * *
1406 * functions to handle HTML specific data *
1407 * *
1408 ************************************************************************/
1409
/**
 * htmlInitAutoClose:
 *
 * Does nothing; the function body is empty.
 */
void
htmlInitAutoClose(void)
{
    /* intentionally empty */
}
1418
1419 static int
htmlCompareTags(const void * key,const void * member)1420 htmlCompareTags(const void *key, const void *member) {
1421 const xmlChar *tag = (const xmlChar *) key;
1422 const htmlElemDesc *desc = (const htmlElemDesc *) member;
1423
1424 return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1425 }
1426
1427 /**
1428 * htmlTagLookup:
1429 * @tag: The tag name in lowercase
1430 *
1431 * Lookup the HTML tag in the ElementTable
1432 *
1433 * Returns the related htmlElemDescPtr or NULL if not found.
1434 */
1435 const htmlElemDesc *
htmlTagLookup(const xmlChar * tag)1436 htmlTagLookup(const xmlChar *tag) {
1437 if (tag == NULL)
1438 return(NULL);
1439
1440 return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1441 sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1442 sizeof(htmlElemDesc), htmlCompareTags));
1443 }
1444
1445 /**
1446 * htmlGetEndPriority:
1447 * @name: The name of the element to look up the priority for.
1448 *
1449 * Return value: The "endtag" priority.
1450 **/
1451 static int
htmlGetEndPriority(const xmlChar * name)1452 htmlGetEndPriority (const xmlChar *name) {
1453 int i = 0;
1454
1455 while ((htmlEndPriority[i].name != NULL) &&
1456 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1457 i++;
1458
1459 return(htmlEndPriority[i].priority);
1460 }
1461
1462
1463 static int
htmlCompareStartClose(const void * vkey,const void * member)1464 htmlCompareStartClose(const void *vkey, const void *member) {
1465 const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1466 const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1467 int ret;
1468
1469 ret = strcmp(key->oldTag, entry->oldTag);
1470 if (ret == 0)
1471 ret = strcmp(key->newTag, entry->newTag);
1472
1473 return(ret);
1474 }
1475
1476 /**
1477 * htmlCheckAutoClose:
1478 * @newtag: The new tag name
1479 * @oldtag: The old tag name
1480 *
1481 * Checks whether the new tag is one of the registered valid tags for
1482 * closing old.
1483 *
1484 * Returns 0 if no, 1 if yes.
1485 */
1486 static int
htmlCheckAutoClose(const xmlChar * newtag,const xmlChar * oldtag)1487 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1488 {
1489 htmlStartCloseEntry key;
1490 void *res;
1491
1492 key.oldTag = (const char *) oldtag;
1493 key.newTag = (const char *) newtag;
1494 res = bsearch(&key, htmlStartClose,
1495 sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1496 sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1497 return(res != NULL);
1498 }
1499
1500 /**
1501 * htmlAutoCloseOnClose:
1502 * @ctxt: an HTML parser context
1503 * @newtag: The new tag name
1504 * @force: force the tag closure
1505 *
1506 * The HTML DTD allows an ending tag to implicitly close other tags.
1507 */
1508 static void
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1509 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1510 {
1511 const htmlElemDesc *info;
1512 int i, priority;
1513
1514 priority = htmlGetEndPriority(newtag);
1515
1516 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1517
1518 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1519 break;
1520 /*
1521 * A misplaced endtag can only close elements with lower
1522 * or equal priority, so if we find an element with higher
1523 * priority before we find an element with
1524 * matching name, we just ignore this endtag
1525 */
1526 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1527 return;
1528 }
1529 if (i < 0)
1530 return;
1531
1532 while (!xmlStrEqual(newtag, ctxt->name)) {
1533 info = htmlTagLookup(ctxt->name);
1534 if ((info != NULL) && (info->endTag == 3)) {
1535 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1536 "Opening and ending tag mismatch: %s and %s\n",
1537 newtag, ctxt->name);
1538 }
1539 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1540 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1541 htmlnamePop(ctxt);
1542 }
1543 }
1544
1545 /**
1546 * htmlAutoCloseOnEnd:
1547 * @ctxt: an HTML parser context
1548 *
1549 * Close all remaining tags at the end of the stream
1550 */
1551 static void
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)1552 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1553 {
1554 int i;
1555
1556 if (ctxt->nameNr == 0)
1557 return;
1558 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1559 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1560 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1561 htmlnamePop(ctxt);
1562 }
1563 }
1564
1565 /**
1566 * htmlAutoClose:
1567 * @ctxt: an HTML parser context
1568 * @newtag: The new tag name or NULL
1569 *
1570 * The HTML DTD allows a tag to implicitly close other tags.
1571 * The list is kept in htmlStartClose array. This function is
1572 * called when a new tag has been detected and generates the
1573 * appropriates closes if possible/needed.
1574 * If newtag is NULL this mean we are at the end of the resource
1575 * and we should check
1576 */
1577 static void
htmlAutoClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1578 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1579 {
1580 while ((newtag != NULL) && (ctxt->name != NULL) &&
1581 (htmlCheckAutoClose(newtag, ctxt->name))) {
1582 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1583 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1584 htmlnamePop(ctxt);
1585 }
1586 if (newtag == NULL) {
1587 htmlAutoCloseOnEnd(ctxt);
1588 return;
1589 }
1590 while ((newtag == NULL) && (ctxt->name != NULL) &&
1591 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1592 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1593 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1594 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1595 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1596 htmlnamePop(ctxt);
1597 }
1598 }
1599
1600 /**
1601 * htmlAutoCloseTag:
1602 * @doc: the HTML document
1603 * @name: The tag name
1604 * @elem: the HTML element
1605 *
1606 * The HTML DTD allows a tag to implicitly close other tags.
1607 * The list is kept in htmlStartClose array. This function checks
1608 * if the element or one of it's children would autoclose the
1609 * given tag.
1610 *
1611 * Returns 1 if autoclose, 0 otherwise
1612 */
1613 int
htmlAutoCloseTag(htmlDocPtr doc,const xmlChar * name,htmlNodePtr elem)1614 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1615 htmlNodePtr child;
1616
1617 if (elem == NULL) return(1);
1618 if (xmlStrEqual(name, elem->name)) return(0);
1619 if (htmlCheckAutoClose(elem->name, name)) return(1);
1620 child = elem->children;
1621 while (child != NULL) {
1622 if (htmlAutoCloseTag(doc, name, child)) return(1);
1623 child = child->next;
1624 }
1625 return(0);
1626 }
1627
1628 /**
1629 * htmlIsAutoClosed:
1630 * @doc: the HTML document
1631 * @elem: the HTML element
1632 *
1633 * The HTML DTD allows a tag to implicitly close other tags.
1634 * The list is kept in htmlStartClose array. This function checks
1635 * if a tag is autoclosed by one of it's child
1636 *
1637 * Returns 1 if autoclosed, 0 otherwise
1638 */
1639 int
htmlIsAutoClosed(htmlDocPtr doc,htmlNodePtr elem)1640 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1641 htmlNodePtr child;
1642
1643 if (elem == NULL) return(1);
1644 child = elem->children;
1645 while (child != NULL) {
1646 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1647 child = child->next;
1648 }
1649 return(0);
1650 }
1651
1652 /**
1653 * htmlCheckImplied:
1654 * @ctxt: an HTML parser context
1655 * @newtag: The new tag name
1656 *
1657 * The HTML DTD allows a tag to exists only implicitly
1658 * called when a new tag has been detected and generates the
1659 * appropriates implicit tags if missing
1660 */
1661 static void
htmlCheckImplied(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1662 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1663 int i;
1664
1665 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1666 return;
1667 if (!htmlOmittedDefaultValue)
1668 return;
1669 if (xmlStrEqual(newtag, BAD_CAST"html"))
1670 return;
1671 if (ctxt->nameNr <= 0) {
1672 htmlnamePush(ctxt, BAD_CAST"html");
1673 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1674 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1675 }
1676 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1677 return;
1678 if ((ctxt->nameNr <= 1) &&
1679 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1680 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1681 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1682 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1683 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1684 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1685 if (ctxt->html >= 3) {
1686 /* we already saw or generated an <head> before */
1687 return;
1688 }
1689 /*
1690 * dropped OBJECT ... i you put it first BODY will be
1691 * assumed !
1692 */
1693 htmlnamePush(ctxt, BAD_CAST"head");
1694 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1695 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1696 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1697 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1698 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1699 if (ctxt->html >= 10) {
1700 /* we already saw or generated a <body> before */
1701 return;
1702 }
1703 for (i = 0;i < ctxt->nameNr;i++) {
1704 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1705 return;
1706 }
1707 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1708 return;
1709 }
1710 }
1711
1712 htmlnamePush(ctxt, BAD_CAST"body");
1713 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1714 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1715 }
1716 }
1717
1718 /**
1719 * htmlCheckParagraph
1720 * @ctxt: an HTML parser context
1721 *
1722 * Check whether a p element need to be implied before inserting
1723 * characters in the current element.
1724 *
1725 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1726 * in case of error.
1727 */
1728
1729 static int
htmlCheckParagraph(htmlParserCtxtPtr ctxt)1730 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1731 const xmlChar *tag;
1732 int i;
1733
1734 if (ctxt == NULL)
1735 return(-1);
1736 tag = ctxt->name;
1737 if (tag == NULL) {
1738 htmlAutoClose(ctxt, BAD_CAST"p");
1739 htmlCheckImplied(ctxt, BAD_CAST"p");
1740 htmlnamePush(ctxt, BAD_CAST"p");
1741 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1742 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1743 return(1);
1744 }
1745 if (!htmlOmittedDefaultValue)
1746 return(0);
1747 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1748 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1749 htmlAutoClose(ctxt, BAD_CAST"p");
1750 htmlCheckImplied(ctxt, BAD_CAST"p");
1751 htmlnamePush(ctxt, BAD_CAST"p");
1752 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1753 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1754 return(1);
1755 }
1756 }
1757 return(0);
1758 }
1759
1760 /**
1761 * htmlIsScriptAttribute:
1762 * @name: an attribute name
1763 *
1764 * Check if an attribute is of content type Script
1765 *
1766 * Returns 1 is the attribute is a script 0 otherwise
1767 */
1768 int
htmlIsScriptAttribute(const xmlChar * name)1769 htmlIsScriptAttribute(const xmlChar *name) {
1770 unsigned int i;
1771
1772 if (name == NULL)
1773 return(0);
1774 /*
1775 * all script attributes start with 'on'
1776 */
1777 if ((name[0] != 'o') || (name[1] != 'n'))
1778 return(0);
1779 for (i = 0;
1780 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1781 i++) {
1782 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1783 return(1);
1784 }
1785 return(0);
1786 }
1787
1788 /************************************************************************
1789 * *
1790 * The list of HTML predefined entities *
1791 * *
1792 ************************************************************************/
1793
1794
1795 static const htmlEntityDesc html40EntitiesTable[] = {
1796 /*
1797 * the 4 absolute ones, plus apostrophe.
1798 */
1799 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1800 { 38, "amp", "ampersand, U+0026 ISOnum" },
1801 { 39, "apos", "single quote" },
1802 { 60, "lt", "less-than sign, U+003C ISOnum" },
1803 { 62, "gt", "greater-than sign, U+003E ISOnum" },
1804
1805 /*
1806 * A bunch still in the 128-255 range
1807 * Replacing them depend really on the charset used.
1808 */
1809 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1810 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1811 { 162, "cent", "cent sign, U+00A2 ISOnum" },
1812 { 163, "pound","pound sign, U+00A3 ISOnum" },
1813 { 164, "curren","currency sign, U+00A4 ISOnum" },
1814 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1815 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1816 { 167, "sect", "section sign, U+00A7 ISOnum" },
1817 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1818 { 169, "copy", "copyright sign, U+00A9 ISOnum" },
1819 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1820 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1821 { 172, "not", "not sign, U+00AC ISOnum" },
1822 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1823 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1824 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1825 { 176, "deg", "degree sign, U+00B0 ISOnum" },
1826 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1827 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1828 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1829 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1830 { 181, "micro","micro sign, U+00B5 ISOnum" },
1831 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1832 { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1833 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1834 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1835 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1836 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1837 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1838 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1839 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1840 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1841 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1842 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1843 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1844 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1845 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1846 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1847 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1848 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1849 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1850 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1851 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1852 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1853 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1854 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1855 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1856 { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1857 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1858 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1859 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1860 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1861 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1862 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1863 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1864 { 215, "times","multiplication sign, U+00D7 ISOnum" },
1865 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1866 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1867 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1868 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1869 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1870 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1871 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1872 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1873 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1874 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1875 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1876 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1877 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1878 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1879 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1880 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1881 { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1882 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1883 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1884 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1885 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1886 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1887 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1888 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1889 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1890 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1891 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1892 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1893 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1894 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1895 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1896 { 247, "divide","division sign, U+00F7 ISOnum" },
1897 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1898 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1899 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1900 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1901 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1902 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1903 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1904 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1905
1906 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1907 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1908 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1909 { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1910 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1911
1912 /*
1913 * Anything below should really be kept as entities references
1914 */
1915 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1916
1917 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1918 { 732, "tilde","small tilde, U+02DC ISOdia" },
1919
1920 { 913, "Alpha","greek capital letter alpha, U+0391" },
1921 { 914, "Beta", "greek capital letter beta, U+0392" },
1922 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1923 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1924 { 917, "Epsilon","greek capital letter epsilon, U+0395" },
1925 { 918, "Zeta", "greek capital letter zeta, U+0396" },
1926 { 919, "Eta", "greek capital letter eta, U+0397" },
1927 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1928 { 921, "Iota", "greek capital letter iota, U+0399" },
1929 { 922, "Kappa","greek capital letter kappa, U+039A" },
1930 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1931 { 924, "Mu", "greek capital letter mu, U+039C" },
1932 { 925, "Nu", "greek capital letter nu, U+039D" },
1933 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1934 { 927, "Omicron","greek capital letter omicron, U+039F" },
1935 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1936 { 929, "Rho", "greek capital letter rho, U+03A1" },
1937 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1938 { 932, "Tau", "greek capital letter tau, U+03A4" },
1939 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1940 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1941 { 935, "Chi", "greek capital letter chi, U+03A7" },
1942 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1943 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1944
1945 { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1946 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1947 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1948 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1949 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1950 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1951 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1952 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1953 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1954 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1955 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1956 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1957 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1958 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1959 { 959, "omicron","greek small letter omicron, U+03BF NEW" },
1960 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1961 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1962 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1963 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1964 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1965 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1966 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1967 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1968 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1969 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1970 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1971 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1972 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1973
1974 { 8194, "ensp", "en space, U+2002 ISOpub" },
1975 { 8195, "emsp", "em space, U+2003 ISOpub" },
1976 { 8201, "thinsp","thin space, U+2009 ISOpub" },
1977 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1978 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1979 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1980 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1981 { 8211, "ndash","en dash, U+2013 ISOpub" },
1982 { 8212, "mdash","em dash, U+2014 ISOpub" },
1983 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1984 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1985 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1986 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1987 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1988 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1989 { 8224, "dagger","dagger, U+2020 ISOpub" },
1990 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1991
1992 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1993 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1994
1995 { 8240, "permil","per mille sign, U+2030 ISOtech" },
1996
1997 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1998 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1999
2000 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
2001 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
2002
2003 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
2004 { 8260, "frasl","fraction slash, U+2044 NEW" },
2005
2006 { 8364, "euro", "euro sign, U+20AC NEW" },
2007
2008 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
2009 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
2010 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
2011 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
2012 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
2013 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
2014 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
2015 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
2016 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
2017 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
2018 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
2019 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
2020 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
2021 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
2022 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
2023 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
2024
2025 { 8704, "forall","for all, U+2200 ISOtech" },
2026 { 8706, "part", "partial differential, U+2202 ISOtech" },
2027 { 8707, "exist","there exists, U+2203 ISOtech" },
2028 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
2029 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
2030 { 8712, "isin", "element of, U+2208 ISOtech" },
2031 { 8713, "notin","not an element of, U+2209 ISOtech" },
2032 { 8715, "ni", "contains as member, U+220B ISOtech" },
2033 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
2034 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
2035 { 8722, "minus","minus sign, U+2212 ISOtech" },
2036 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
2037 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
2038 { 8733, "prop", "proportional to, U+221D ISOtech" },
2039 { 8734, "infin","infinity, U+221E ISOtech" },
2040 { 8736, "ang", "angle, U+2220 ISOamso" },
2041 { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
2042 { 8744, "or", "logical or = vee, U+2228 ISOtech" },
2043 { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
2044 { 8746, "cup", "union = cup, U+222A ISOtech" },
2045 { 8747, "int", "integral, U+222B ISOtech" },
2046 { 8756, "there4","therefore, U+2234 ISOtech" },
2047 { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
2048 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
2049 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
2050 { 8800, "ne", "not equal to, U+2260 ISOtech" },
2051 { 8801, "equiv","identical to, U+2261 ISOtech" },
2052 { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
2053 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
2054 { 8834, "sub", "subset of, U+2282 ISOtech" },
2055 { 8835, "sup", "superset of, U+2283 ISOtech" },
2056 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
2057 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
2058 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
2059 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
2060 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
2061 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
2062 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
2063 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
2064 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
2065 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
2066 { 8971, "rfloor","right floor, U+230B ISOamsc" },
2067 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
2068 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
2069 { 9674, "loz", "lozenge, U+25CA ISOpub" },
2070
2071 { 9824, "spades","black spade suit, U+2660 ISOpub" },
2072 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
2073 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
2074 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
2075
2076 };
2077
2078 /************************************************************************
2079 * *
2080 * Commodity functions to handle entities *
2081 * *
2082 ************************************************************************/
2083
/*
 * growBuffer:
 * Double the capacity of the xmlChar buffer named @buffer.
 *
 * The expansion site must provide a companion size variable called
 * <buffer>_size and a variable called ctxt, and must be inside a function
 * returning a pointer: when the reallocation fails the macro reports the
 * error, frees the old buffer and makes that function return NULL.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer,				\
                                 buffer##_size * sizeof(xmlChar));	\
    if (tmp != NULL) {							\
        buffer = tmp;							\
    } else {								\
        htmlErrMemory(ctxt, "growing buffer\n");			\
        xmlFree(buffer);						\
        return(NULL);							\
    }									\
}
2098
2099 /**
2100 * htmlEntityLookup:
2101 * @name: the entity name
2102 *
2103 * Lookup the given entity in EntitiesTable
2104 *
2105 * TODO: the linear scan is really ugly, an hash table is really needed.
2106 *
2107 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2108 */
2109 const htmlEntityDesc *
htmlEntityLookup(const xmlChar * name)2110 htmlEntityLookup(const xmlChar *name) {
2111 unsigned int i;
2112
2113 for (i = 0;i < (sizeof(html40EntitiesTable)/
2114 sizeof(html40EntitiesTable[0]));i++) {
2115 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2116 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2117 }
2118 }
2119 return(NULL);
2120 }
2121
2122 /**
2123 * htmlEntityValueLookup:
2124 * @value: the entity's unicode value
2125 *
2126 * Lookup the given entity in EntitiesTable
2127 *
2128 * TODO: the linear scan is really ugly, an hash table is really needed.
2129 *
2130 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2131 */
2132 const htmlEntityDesc *
htmlEntityValueLookup(unsigned int value)2133 htmlEntityValueLookup(unsigned int value) {
2134 unsigned int i;
2135
2136 for (i = 0;i < (sizeof(html40EntitiesTable)/
2137 sizeof(html40EntitiesTable[0]));i++) {
2138 if (html40EntitiesTable[i].value >= value) {
2139 if (html40EntitiesTable[i].value > value)
2140 break;
2141 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2142 }
2143 }
2144 return(NULL);
2145 }
2146
2147 /**
2148 * UTF8ToHtml:
2149 * @out: a pointer to an array of bytes to store the result
2150 * @outlen: the length of @out
2151 * @in: a pointer to an array of UTF-8 chars
2152 * @inlen: the length of @in
2153 *
2154 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2155 * plus HTML entities block of chars out.
2156 *
2157 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2158 * The value of @inlen after return is the number of octets consumed
2159 * as the return value is positive, else unpredictable.
2160 * The value of @outlen after return is the number of octets consumed.
2161 */
2162 int
UTF8ToHtml(unsigned char * out,int * outlen,const unsigned char * in,int * inlen)2163 UTF8ToHtml(unsigned char* out, int *outlen,
2164 const unsigned char* in, int *inlen) {
2165 const unsigned char* processed = in;
2166 const unsigned char* outend;
2167 const unsigned char* outstart = out;
2168 const unsigned char* instart = in;
2169 const unsigned char* inend;
2170 unsigned int c, d;
2171 int trailing;
2172
2173 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2174 if (in == NULL) {
2175 /*
2176 * initialization nothing to do
2177 */
2178 *outlen = 0;
2179 *inlen = 0;
2180 return(0);
2181 }
2182 inend = in + (*inlen);
2183 outend = out + (*outlen);
2184 while (in < inend) {
2185 d = *in++;
2186 if (d < 0x80) { c= d; trailing= 0; }
2187 else if (d < 0xC0) {
2188 /* trailing byte in leading position */
2189 *outlen = out - outstart;
2190 *inlen = processed - instart;
2191 return(-2);
2192 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2193 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2194 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2195 else {
2196 /* no chance for this in Ascii */
2197 *outlen = out - outstart;
2198 *inlen = processed - instart;
2199 return(-2);
2200 }
2201
2202 if (inend - in < trailing) {
2203 break;
2204 }
2205
2206 for ( ; trailing; trailing--) {
2207 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2208 break;
2209 c <<= 6;
2210 c |= d & 0x3F;
2211 }
2212
2213 /* assertion: c is a single UTF-4 value */
2214 if (c < 0x80) {
2215 if (out + 1 >= outend)
2216 break;
2217 *out++ = c;
2218 } else {
2219 int len;
2220 const htmlEntityDesc * ent;
2221 const char *cp;
2222 char nbuf[16];
2223
2224 /*
2225 * Try to lookup a predefined HTML entity for it
2226 */
2227
2228 ent = htmlEntityValueLookup(c);
2229 if (ent == NULL) {
2230 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2231 cp = nbuf;
2232 }
2233 else
2234 cp = ent->name;
2235 len = strlen(cp);
2236 if (out + 2 + len >= outend)
2237 break;
2238 *out++ = '&';
2239 memcpy(out, cp, len);
2240 out += len;
2241 *out++ = ';';
2242 }
2243 processed = in;
2244 }
2245 *outlen = out - outstart;
2246 *inlen = processed - instart;
2247 return(0);
2248 }
2249
2250 /**
2251 * htmlEncodeEntities:
2252 * @out: a pointer to an array of bytes to store the result
2253 * @outlen: the length of @out
2254 * @in: a pointer to an array of UTF-8 chars
2255 * @inlen: the length of @in
2256 * @quoteChar: the quote character to escape (' or ") or zero.
2257 *
2258 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2259 * plus HTML entities block of chars out.
2260 *
2261 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2262 * The value of @inlen after return is the number of octets consumed
2263 * as the return value is positive, else unpredictable.
2264 * The value of @outlen after return is the number of octets consumed.
2265 */
2266 int
htmlEncodeEntities(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,int quoteChar)2267 htmlEncodeEntities(unsigned char* out, int *outlen,
2268 const unsigned char* in, int *inlen, int quoteChar) {
2269 const unsigned char* processed = in;
2270 const unsigned char* outend;
2271 const unsigned char* outstart = out;
2272 const unsigned char* instart = in;
2273 const unsigned char* inend;
2274 unsigned int c, d;
2275 int trailing;
2276
2277 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2278 return(-1);
2279 outend = out + (*outlen);
2280 inend = in + (*inlen);
2281 while (in < inend) {
2282 d = *in++;
2283 if (d < 0x80) { c= d; trailing= 0; }
2284 else if (d < 0xC0) {
2285 /* trailing byte in leading position */
2286 *outlen = out - outstart;
2287 *inlen = processed - instart;
2288 return(-2);
2289 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2290 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2291 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2292 else {
2293 /* no chance for this in Ascii */
2294 *outlen = out - outstart;
2295 *inlen = processed - instart;
2296 return(-2);
2297 }
2298
2299 if (inend - in < trailing)
2300 break;
2301
2302 while (trailing--) {
2303 if (((d= *in++) & 0xC0) != 0x80) {
2304 *outlen = out - outstart;
2305 *inlen = processed - instart;
2306 return(-2);
2307 }
2308 c <<= 6;
2309 c |= d & 0x3F;
2310 }
2311
2312 /* assertion: c is a single UTF-4 value */
2313 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2314 (c != '&') && (c != '<') && (c != '>')) {
2315 if (out >= outend)
2316 break;
2317 *out++ = c;
2318 } else {
2319 const htmlEntityDesc * ent;
2320 const char *cp;
2321 char nbuf[16];
2322 int len;
2323
2324 /*
2325 * Try to lookup a predefined HTML entity for it
2326 */
2327 ent = htmlEntityValueLookup(c);
2328 if (ent == NULL) {
2329 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2330 cp = nbuf;
2331 }
2332 else
2333 cp = ent->name;
2334 len = strlen(cp);
2335 if (out + 2 + len > outend)
2336 break;
2337 *out++ = '&';
2338 memcpy(out, cp, len);
2339 out += len;
2340 *out++ = ';';
2341 }
2342 processed = in;
2343 }
2344 *outlen = out - outstart;
2345 *inlen = processed - instart;
2346 return(0);
2347 }
2348
2349 /************************************************************************
2350 * *
2351 * Commodity functions to handle streams *
2352 * *
2353 ************************************************************************/
2354
2355 #ifdef LIBXML_PUSH_ENABLED
2356 /**
2357 * htmlNewInputStream:
2358 * @ctxt: an HTML parser context
2359 *
2360 * Create a new input stream structure
2361 * Returns the new input stream or NULL
2362 */
2363 static htmlParserInputPtr
htmlNewInputStream(htmlParserCtxtPtr ctxt)2364 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2365 htmlParserInputPtr input;
2366
2367 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2368 if (input == NULL) {
2369 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2370 return(NULL);
2371 }
2372 memset(input, 0, sizeof(htmlParserInput));
2373 input->filename = NULL;
2374 input->directory = NULL;
2375 input->base = NULL;
2376 input->cur = NULL;
2377 input->buf = NULL;
2378 input->line = 1;
2379 input->col = 1;
2380 input->buf = NULL;
2381 input->free = NULL;
2382 input->version = NULL;
2383 input->consumed = 0;
2384 input->length = 0;
2385 return(input);
2386 }
2387 #endif
2388
2389
2390 /************************************************************************
2391 * *
2392 * Commodity functions, cleanup needed ? *
2393 * *
2394 ************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 *
 * Both the strings and the array of pointers are constant, so the whole
 * table is read-only.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
2409
2410 /**
2411 * areBlanks:
2412 * @ctxt: an HTML parser context
2413 * @str: a xmlChar *
2414 * @len: the size of @str
2415 *
2416 * Is this a sequence of blank chars that one can ignore ?
2417 *
2418 * Returns 1 if ignorable 0 otherwise.
2419 */
2420
areBlanks(htmlParserCtxtPtr ctxt,const xmlChar * str,int len)2421 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2422 unsigned int i;
2423 int j;
2424 xmlNodePtr lastChild;
2425 xmlDtdPtr dtd;
2426
2427 for (j = 0;j < len;j++)
2428 if (!(IS_BLANK_CH(str[j]))) return(0);
2429
2430 if (CUR == 0) return(1);
2431 if (CUR != '<') return(0);
2432 if (ctxt->name == NULL)
2433 return(1);
2434 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2435 return(1);
2436 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2437 return(1);
2438
2439 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2440 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2441 dtd = xmlGetIntSubset(ctxt->myDoc);
2442 if (dtd != NULL && dtd->ExternalID != NULL) {
2443 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2444 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2445 return(1);
2446 }
2447 }
2448
2449 if (ctxt->node == NULL) return(0);
2450 lastChild = xmlGetLastChild(ctxt->node);
2451 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2452 lastChild = lastChild->prev;
2453 if (lastChild == NULL) {
2454 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2455 (ctxt->node->content != NULL)) return(0);
2456 /* keep ws in constructs like ...<b> </b>...
2457 for all tags "b" allowing PCDATA */
2458 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2459 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2460 return(0);
2461 }
2462 }
2463 } else if (xmlNodeIsText(lastChild)) {
2464 return(0);
2465 } else {
2466 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2467 for all tags "p" allowing PCDATA */
2468 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2469 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2470 return(0);
2471 }
2472 }
2473 }
2474 return(1);
2475 }
2476
2477 /**
2478 * htmlNewDocNoDtD:
2479 * @URI: URI for the dtd, or NULL
2480 * @ExternalID: the external ID of the DTD, or NULL
2481 *
2482 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2483 * are NULL
2484 *
2485 * Returns a new document, do not initialize the DTD if not provided
2486 */
2487 htmlDocPtr
htmlNewDocNoDtD(const xmlChar * URI,const xmlChar * ExternalID)2488 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2489 xmlDocPtr cur;
2490
2491 /*
2492 * Allocate a new document and fill the fields.
2493 */
2494 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2495 if (cur == NULL) {
2496 htmlErrMemory(NULL, "HTML document creation failed\n");
2497 return(NULL);
2498 }
2499 memset(cur, 0, sizeof(xmlDoc));
2500
2501 cur->type = XML_HTML_DOCUMENT_NODE;
2502 cur->version = NULL;
2503 cur->intSubset = NULL;
2504 cur->doc = cur;
2505 cur->name = NULL;
2506 cur->children = NULL;
2507 cur->extSubset = NULL;
2508 cur->oldNs = NULL;
2509 cur->encoding = NULL;
2510 cur->standalone = 1;
2511 cur->compression = 0;
2512 cur->ids = NULL;
2513 cur->refs = NULL;
2514 cur->_private = NULL;
2515 cur->charset = XML_CHAR_ENCODING_UTF8;
2516 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2517 if ((ExternalID != NULL) ||
2518 (URI != NULL))
2519 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2520 return(cur);
2521 }
2522
2523 /**
2524 * htmlNewDoc:
2525 * @URI: URI for the dtd, or NULL
2526 * @ExternalID: the external ID of the DTD, or NULL
2527 *
2528 * Creates a new HTML document
2529 *
2530 * Returns a new document
2531 */
2532 htmlDocPtr
htmlNewDoc(const xmlChar * URI,const xmlChar * ExternalID)2533 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2534 if ((URI == NULL) && (ExternalID == NULL))
2535 return(htmlNewDocNoDtD(
2536 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2537 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2538
2539 return(htmlNewDocNoDtD(URI, ExternalID));
2540 }
2541
2542
2543 /************************************************************************
2544 * *
2545 * The parser itself *
2546 * Relates to http://www.w3.org/TR/html40 *
2547 * *
2548 ************************************************************************/
2549
2550 /************************************************************************
2551 * *
2552 * The parser itself *
2553 * *
2554 ************************************************************************/
2555
2556 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2557
2558 /**
2559 * htmlParseHTMLName:
2560 * @ctxt: an HTML parser context
2561 *
2562 * parse an HTML tag or attribute name, note that we convert it to lowercase
2563 * since HTML names are not case-sensitive.
2564 *
2565 * Returns the Tag Name parsed or NULL
2566 */
2567
2568 static const xmlChar *
htmlParseHTMLName(htmlParserCtxtPtr ctxt)2569 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2570 int i = 0;
2571 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2572
2573 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2574 (CUR != ':') && (CUR != '.')) return(NULL);
2575
2576 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2577 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2578 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2579 (CUR == '.'))) {
2580 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2581 else loc[i] = CUR;
2582 i++;
2583
2584 NEXT;
2585 }
2586
2587 return(xmlDictLookup(ctxt->dict, loc, i));
2588 }
2589
2590
2591 /**
2592 * htmlParseHTMLName_nonInvasive:
2593 * @ctxt: an HTML parser context
2594 *
2595 * parse an HTML tag or attribute name, note that we convert it to lowercase
2596 * since HTML names are not case-sensitive, this doesn't consume the data
2597 * from the stream, it's a look-ahead
2598 *
2599 * Returns the Tag Name parsed or NULL
2600 */
2601
2602 static const xmlChar *
htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt)2603 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2604 int i = 0;
2605 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2606
2607 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2608 (NXT(1) != ':')) return(NULL);
2609
2610 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2611 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2612 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2613 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2614 else loc[i] = NXT(1+i);
2615 i++;
2616 }
2617
2618 return(xmlDictLookup(ctxt->dict, loc, i));
2619 }
2620
2621
2622 /**
2623 * htmlParseName:
2624 * @ctxt: an HTML parser context
2625 *
2626 * parse an HTML name, this routine is case sensitive.
2627 *
2628 * Returns the Name parsed or NULL
2629 */
2630
2631 static const xmlChar *
htmlParseName(htmlParserCtxtPtr ctxt)2632 htmlParseName(htmlParserCtxtPtr ctxt) {
2633 const xmlChar *in;
2634 const xmlChar *ret;
2635 int count = 0;
2636
2637 GROW;
2638
2639 /*
2640 * Accelerator for simple ASCII names
2641 */
2642 in = ctxt->input->cur;
2643 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2644 ((*in >= 0x41) && (*in <= 0x5A)) ||
2645 (*in == '_') || (*in == ':')) {
2646 in++;
2647 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2648 ((*in >= 0x41) && (*in <= 0x5A)) ||
2649 ((*in >= 0x30) && (*in <= 0x39)) ||
2650 (*in == '_') || (*in == '-') ||
2651 (*in == ':') || (*in == '.'))
2652 in++;
2653
2654 if (in == ctxt->input->end)
2655 return(NULL);
2656
2657 if ((*in > 0) && (*in < 0x80)) {
2658 count = in - ctxt->input->cur;
2659 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2660 ctxt->input->cur = in;
2661 ctxt->input->col += count;
2662 return(ret);
2663 }
2664 }
2665 return(htmlParseNameComplex(ctxt));
2666 }
2667
2668 static const xmlChar *
htmlParseNameComplex(xmlParserCtxtPtr ctxt)2669 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2670 int len = 0, l;
2671 int c;
2672 int count = 0;
2673 const xmlChar *base = ctxt->input->base;
2674
2675 /*
2676 * Handler for more complex cases
2677 */
2678 GROW;
2679 c = CUR_CHAR(l);
2680 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2681 (!IS_LETTER(c) && (c != '_') &&
2682 (c != ':'))) {
2683 return(NULL);
2684 }
2685
2686 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2687 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2688 (c == '.') || (c == '-') ||
2689 (c == '_') || (c == ':') ||
2690 (IS_COMBINING(c)) ||
2691 (IS_EXTENDER(c)))) {
2692 if (count++ > 100) {
2693 count = 0;
2694 GROW;
2695 }
2696 len += l;
2697 NEXTL(l);
2698 c = CUR_CHAR(l);
2699 if (ctxt->input->base != base) {
2700 /*
2701 * We changed encoding from an unknown encoding
2702 * Input buffer changed location, so we better start again
2703 */
2704 return(htmlParseNameComplex(ctxt));
2705 }
2706 }
2707
2708 if (ctxt->input->cur - ctxt->input->base < len) {
2709 /* Sanity check */
2710 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2711 "unexpected change of input buffer", NULL, NULL);
2712 return (NULL);
2713 }
2714
2715 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2716 }
2717
2718
2719 /**
2720 * htmlParseHTMLAttribute:
2721 * @ctxt: an HTML parser context
2722 * @stop: a char stop value
2723 *
2724 * parse an HTML attribute value till the stop (quote), if
2725 * stop is 0 then it stops at the first space
2726 *
2727 * Returns the attribute parsed or NULL
2728 */
2729
2730 static xmlChar *
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt,const xmlChar stop)2731 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2732 xmlChar *buffer = NULL;
2733 int buffer_size = 0;
2734 xmlChar *out = NULL;
2735 const xmlChar *name = NULL;
2736 const xmlChar *cur = NULL;
2737 const htmlEntityDesc * ent;
2738
2739 /*
2740 * allocate a translation buffer.
2741 */
2742 buffer_size = HTML_PARSER_BUFFER_SIZE;
2743 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2744 if (buffer == NULL) {
2745 htmlErrMemory(ctxt, "buffer allocation failed\n");
2746 return(NULL);
2747 }
2748 out = buffer;
2749
2750 /*
2751 * Ok loop until we reach one of the ending chars
2752 */
2753 while ((CUR != 0) && (CUR != stop)) {
2754 if ((stop == 0) && (CUR == '>')) break;
2755 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2756 if (CUR == '&') {
2757 if (NXT(1) == '#') {
2758 unsigned int c;
2759 int bits;
2760
2761 c = htmlParseCharRef(ctxt);
2762 if (c < 0x80)
2763 { *out++ = c; bits= -6; }
2764 else if (c < 0x800)
2765 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2766 else if (c < 0x10000)
2767 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2768 else
2769 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2770
2771 for ( ; bits >= 0; bits-= 6) {
2772 *out++ = ((c >> bits) & 0x3F) | 0x80;
2773 }
2774
2775 if (out - buffer > buffer_size - 100) {
2776 int indx = out - buffer;
2777
2778 growBuffer(buffer);
2779 out = &buffer[indx];
2780 }
2781 } else {
2782 ent = htmlParseEntityRef(ctxt, &name);
2783 if (name == NULL) {
2784 *out++ = '&';
2785 if (out - buffer > buffer_size - 100) {
2786 int indx = out - buffer;
2787
2788 growBuffer(buffer);
2789 out = &buffer[indx];
2790 }
2791 } else if (ent == NULL) {
2792 *out++ = '&';
2793 cur = name;
2794 while (*cur != 0) {
2795 if (out - buffer > buffer_size - 100) {
2796 int indx = out - buffer;
2797
2798 growBuffer(buffer);
2799 out = &buffer[indx];
2800 }
2801 *out++ = *cur++;
2802 }
2803 } else {
2804 unsigned int c;
2805 int bits;
2806
2807 if (out - buffer > buffer_size - 100) {
2808 int indx = out - buffer;
2809
2810 growBuffer(buffer);
2811 out = &buffer[indx];
2812 }
2813 c = ent->value;
2814 if (c < 0x80)
2815 { *out++ = c; bits= -6; }
2816 else if (c < 0x800)
2817 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2818 else if (c < 0x10000)
2819 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2820 else
2821 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2822
2823 for ( ; bits >= 0; bits-= 6) {
2824 *out++ = ((c >> bits) & 0x3F) | 0x80;
2825 }
2826 }
2827 }
2828 } else {
2829 unsigned int c;
2830 int bits, l;
2831
2832 if (out - buffer > buffer_size - 100) {
2833 int indx = out - buffer;
2834
2835 growBuffer(buffer);
2836 out = &buffer[indx];
2837 }
2838 c = CUR_CHAR(l);
2839 if (c < 0x80)
2840 { *out++ = c; bits= -6; }
2841 else if (c < 0x800)
2842 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2843 else if (c < 0x10000)
2844 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2845 else
2846 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2847
2848 for ( ; bits >= 0; bits-= 6) {
2849 *out++ = ((c >> bits) & 0x3F) | 0x80;
2850 }
2851 NEXT;
2852 }
2853 }
2854 *out = 0;
2855 return(buffer);
2856 }
2857
2858 /**
2859 * htmlParseEntityRef:
2860 * @ctxt: an HTML parser context
2861 * @str: location to store the entity name
2862 *
2863 * parse an HTML ENTITY references
2864 *
2865 * [68] EntityRef ::= '&' Name ';'
2866 *
2867 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2868 * if non-NULL *str will have to be freed by the caller.
2869 */
2870 const htmlEntityDesc *
htmlParseEntityRef(htmlParserCtxtPtr ctxt,const xmlChar ** str)2871 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2872 const xmlChar *name;
2873 const htmlEntityDesc * ent = NULL;
2874
2875 if (str != NULL) *str = NULL;
2876 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2877
2878 if (CUR == '&') {
2879 NEXT;
2880 name = htmlParseName(ctxt);
2881 if (name == NULL) {
2882 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2883 "htmlParseEntityRef: no name\n", NULL, NULL);
2884 } else {
2885 GROW;
2886 if (CUR == ';') {
2887 if (str != NULL)
2888 *str = name;
2889
2890 /*
2891 * Lookup the entity in the table.
2892 */
2893 ent = htmlEntityLookup(name);
2894 if (ent != NULL) /* OK that's ugly !!! */
2895 NEXT;
2896 } else {
2897 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2898 "htmlParseEntityRef: expecting ';'\n",
2899 NULL, NULL);
2900 if (str != NULL)
2901 *str = name;
2902 }
2903 }
2904 }
2905 return(ent);
2906 }
2907
2908 /**
2909 * htmlParseAttValue:
2910 * @ctxt: an HTML parser context
2911 *
2912 * parse a value for an attribute
2913 * Note: the parser won't do substitution of entities here, this
2914 * will be handled later in xmlStringGetNodeList, unless it was
2915 * asked for ctxt->replaceEntities != 0
2916 *
2917 * Returns the AttValue parsed or NULL.
2918 */
2919
2920 static xmlChar *
htmlParseAttValue(htmlParserCtxtPtr ctxt)2921 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2922 xmlChar *ret = NULL;
2923
2924 if (CUR == '"') {
2925 NEXT;
2926 ret = htmlParseHTMLAttribute(ctxt, '"');
2927 if (CUR != '"') {
2928 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2929 "AttValue: \" expected\n", NULL, NULL);
2930 } else
2931 NEXT;
2932 } else if (CUR == '\'') {
2933 NEXT;
2934 ret = htmlParseHTMLAttribute(ctxt, '\'');
2935 if (CUR != '\'') {
2936 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2937 "AttValue: ' expected\n", NULL, NULL);
2938 } else
2939 NEXT;
2940 } else {
2941 /*
2942 * That's an HTMLism, the attribute value may not be quoted
2943 */
2944 ret = htmlParseHTMLAttribute(ctxt, 0);
2945 if (ret == NULL) {
2946 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2947 "AttValue: no value found\n", NULL, NULL);
2948 }
2949 }
2950 return(ret);
2951 }
2952
2953 /**
2954 * htmlParseSystemLiteral:
2955 * @ctxt: an HTML parser context
2956 *
2957 * parse an HTML Literal
2958 *
2959 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2960 *
2961 * Returns the SystemLiteral parsed or NULL
2962 */
2963
2964 static xmlChar *
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt)2965 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2966 size_t len = 0, startPosition = 0;
2967 int err = 0;
2968 int quote;
2969 xmlChar *ret = NULL;
2970
2971 if ((CUR != '"') && (CUR != '\'')) {
2972 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2973 "SystemLiteral \" or ' expected\n", NULL, NULL);
2974 return(NULL);
2975 }
2976 quote = CUR;
2977 NEXT;
2978
2979 if (CUR_PTR < BASE_PTR)
2980 return(ret);
2981 startPosition = CUR_PTR - BASE_PTR;
2982
2983 while ((CUR != 0) && (CUR != quote)) {
2984 /* TODO: Handle UTF-8 */
2985 if (!IS_CHAR_CH(CUR)) {
2986 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2987 "Invalid char in SystemLiteral 0x%X\n", CUR);
2988 err = 1;
2989 }
2990 NEXT;
2991 len++;
2992 }
2993 if (CUR != quote) {
2994 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2995 "Unfinished SystemLiteral\n", NULL, NULL);
2996 } else {
2997 NEXT;
2998 if (err == 0)
2999 ret = xmlStrndup((BASE_PTR+startPosition), len);
3000 }
3001
3002 return(ret);
3003 }
3004
3005 /**
3006 * htmlParsePubidLiteral:
3007 * @ctxt: an HTML parser context
3008 *
3009 * parse an HTML public literal
3010 *
3011 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
3012 *
3013 * Returns the PubidLiteral parsed or NULL.
3014 */
3015
3016 static xmlChar *
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt)3017 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
3018 size_t len = 0, startPosition = 0;
3019 int err = 0;
3020 int quote;
3021 xmlChar *ret = NULL;
3022
3023 if ((CUR != '"') && (CUR != '\'')) {
3024 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
3025 "PubidLiteral \" or ' expected\n", NULL, NULL);
3026 return(NULL);
3027 }
3028 quote = CUR;
3029 NEXT;
3030
3031 /*
3032 * Name ::= (Letter | '_') (NameChar)*
3033 */
3034 if (CUR_PTR < BASE_PTR)
3035 return(ret);
3036 startPosition = CUR_PTR - BASE_PTR;
3037
3038 while ((CUR != 0) && (CUR != quote)) {
3039 if (!IS_PUBIDCHAR_CH(CUR)) {
3040 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3041 "Invalid char in PubidLiteral 0x%X\n", CUR);
3042 err = 1;
3043 }
3044 len++;
3045 NEXT;
3046 }
3047
3048 if (CUR != '"') {
3049 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3050 "Unfinished PubidLiteral\n", NULL, NULL);
3051 } else {
3052 NEXT;
3053 if (err == 0)
3054 ret = xmlStrndup((BASE_PTR + startPosition), len);
3055 }
3056
3057 return(ret);
3058 }
3059
3060 /**
3061 * htmlParseScript:
3062 * @ctxt: an HTML parser context
3063 *
3064 * parse the content of an HTML SCRIPT or STYLE element
3065 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
3066 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
3067 * http://www.w3.org/TR/html4/types.html#type-script
3068 * http://www.w3.org/TR/html4/types.html#h-6.15
3069 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
3070 *
3071 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
3072 * element and the value of intrinsic event attributes. User agents must
3073 * not evaluate script data as HTML markup but instead must pass it on as
3074 * data to a script engine.
3075 * NOTES:
3076 * - The content is passed like CDATA
3077 * - the attributes for style and scripting "onXXX" are also described
3078 * as CDATA but SGML allows entities references in attributes so their
3079 * processing is identical as other attributes
3080 */
3081 static void
htmlParseScript(htmlParserCtxtPtr ctxt)3082 htmlParseScript(htmlParserCtxtPtr ctxt) {
3083 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3084 int nbchar = 0;
3085 int cur,l;
3086
3087 SHRINK;
3088 cur = CUR_CHAR(l);
3089 while (cur != 0) {
3090 if ((cur == '<') && (NXT(1) == '/')) {
3091 /*
3092 * One should break here, the specification is clear:
3093 * Authors should therefore escape "</" within the content.
3094 * Escape mechanisms are specific to each scripting or
3095 * style sheet language.
3096 *
3097 * In recovery mode, only break if end tag match the
3098 * current tag, effectively ignoring all tags inside the
3099 * script/style block and treating the entire block as
3100 * CDATA.
3101 */
3102 if (ctxt->recovery) {
3103 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3104 xmlStrlen(ctxt->name)) == 0)
3105 {
3106 break; /* while */
3107 } else {
3108 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3109 "Element %s embeds close tag\n",
3110 ctxt->name, NULL);
3111 }
3112 } else {
3113 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3114 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3115 {
3116 break; /* while */
3117 }
3118 }
3119 }
3120 if (IS_CHAR(cur)) {
3121 COPY_BUF(l,buf,nbchar,cur);
3122 } else {
3123 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3124 "Invalid char in CDATA 0x%X\n", cur);
3125 }
3126 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3127 buf[nbchar] = 0;
3128 if (ctxt->sax->cdataBlock!= NULL) {
3129 /*
3130 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3131 */
3132 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3133 } else if (ctxt->sax->characters != NULL) {
3134 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3135 }
3136 nbchar = 0;
3137 }
3138 GROW;
3139 NEXTL(l);
3140 cur = CUR_CHAR(l);
3141 }
3142
3143 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3144 buf[nbchar] = 0;
3145 if (ctxt->sax->cdataBlock!= NULL) {
3146 /*
3147 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3148 */
3149 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3150 } else if (ctxt->sax->characters != NULL) {
3151 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3152 }
3153 }
3154 }
3155
3156
3157 /**
3158 * htmlParseCharDataInternal:
3159 * @ctxt: an HTML parser context
3160 * @readahead: optional read ahead character in ascii range
3161 *
3162 * parse a CharData section.
3163 * if we are within a CDATA section ']]>' marks an end of section.
3164 *
3165 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3166 */
3167
3168 static void
htmlParseCharDataInternal(htmlParserCtxtPtr ctxt,int readahead)3169 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3170 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3171 int nbchar = 0;
3172 int cur, l;
3173 int chunk = 0;
3174
3175 if (readahead)
3176 buf[nbchar++] = readahead;
3177
3178 SHRINK;
3179 cur = CUR_CHAR(l);
3180 while (((cur != '<') || (ctxt->token == '<')) &&
3181 ((cur != '&') || (ctxt->token == '&')) &&
3182 (cur != 0)) {
3183 if (!(IS_CHAR(cur))) {
3184 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3185 "Invalid char in CDATA 0x%X\n", cur);
3186 } else {
3187 COPY_BUF(l,buf,nbchar,cur);
3188 }
3189 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3190 buf[nbchar] = 0;
3191
3192 /*
3193 * Ok the segment is to be consumed as chars.
3194 */
3195 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3196 if (areBlanks(ctxt, buf, nbchar)) {
3197 if (ctxt->keepBlanks) {
3198 if (ctxt->sax->characters != NULL)
3199 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3200 } else {
3201 if (ctxt->sax->ignorableWhitespace != NULL)
3202 ctxt->sax->ignorableWhitespace(ctxt->userData,
3203 buf, nbchar);
3204 }
3205 } else {
3206 htmlCheckParagraph(ctxt);
3207 if (ctxt->sax->characters != NULL)
3208 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3209 }
3210 }
3211 nbchar = 0;
3212 }
3213 NEXTL(l);
3214 chunk++;
3215 if (chunk > HTML_PARSER_BUFFER_SIZE) {
3216 chunk = 0;
3217 SHRINK;
3218 GROW;
3219 }
3220 cur = CUR_CHAR(l);
3221 if (cur == 0) {
3222 SHRINK;
3223 GROW;
3224 cur = CUR_CHAR(l);
3225 }
3226 }
3227 if (nbchar != 0) {
3228 buf[nbchar] = 0;
3229
3230 /*
3231 * Ok the segment is to be consumed as chars.
3232 */
3233 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3234 if (areBlanks(ctxt, buf, nbchar)) {
3235 if (ctxt->keepBlanks) {
3236 if (ctxt->sax->characters != NULL)
3237 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3238 } else {
3239 if (ctxt->sax->ignorableWhitespace != NULL)
3240 ctxt->sax->ignorableWhitespace(ctxt->userData,
3241 buf, nbchar);
3242 }
3243 } else {
3244 htmlCheckParagraph(ctxt);
3245 if (ctxt->sax->characters != NULL)
3246 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3247 }
3248 }
3249 } else {
3250 /*
3251 * Loop detection
3252 */
3253 if (cur == 0)
3254 ctxt->instate = XML_PARSER_EOF;
3255 }
3256 }
3257
3258 /**
3259 * htmlParseCharData:
3260 * @ctxt: an HTML parser context
3261 *
3262 * parse a CharData section.
3263 * if we are within a CDATA section ']]>' marks an end of section.
3264 *
3265 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3266 */
3267
3268 static void
htmlParseCharData(htmlParserCtxtPtr ctxt)3269 htmlParseCharData(htmlParserCtxtPtr ctxt) {
3270 htmlParseCharDataInternal(ctxt, 0);
3271 }
3272
3273 /**
3274 * htmlParseExternalID:
3275 * @ctxt: an HTML parser context
3276 * @publicID: a xmlChar** receiving PubidLiteral
3277 *
3278 * Parse an External ID or a Public ID
3279 *
3280 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3281 * | 'PUBLIC' S PubidLiteral S SystemLiteral
3282 *
3283 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3284 *
3285 * Returns the function returns SystemLiteral and in the second
3286 * case publicID receives PubidLiteral, is strict is off
3287 * it is possible to return NULL and have publicID set.
3288 */
3289
3290 static xmlChar *
htmlParseExternalID(htmlParserCtxtPtr ctxt,xmlChar ** publicID)3291 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3292 xmlChar *URI = NULL;
3293
3294 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3295 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3296 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3297 SKIP(6);
3298 if (!IS_BLANK_CH(CUR)) {
3299 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3300 "Space required after 'SYSTEM'\n", NULL, NULL);
3301 }
3302 SKIP_BLANKS;
3303 URI = htmlParseSystemLiteral(ctxt);
3304 if (URI == NULL) {
3305 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3306 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3307 }
3308 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3309 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3310 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3311 SKIP(6);
3312 if (!IS_BLANK_CH(CUR)) {
3313 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3314 "Space required after 'PUBLIC'\n", NULL, NULL);
3315 }
3316 SKIP_BLANKS;
3317 *publicID = htmlParsePubidLiteral(ctxt);
3318 if (*publicID == NULL) {
3319 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3320 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3321 NULL, NULL);
3322 }
3323 SKIP_BLANKS;
3324 if ((CUR == '"') || (CUR == '\'')) {
3325 URI = htmlParseSystemLiteral(ctxt);
3326 }
3327 }
3328 return(URI);
3329 }
3330
3331 /**
3332 * xmlParsePI:
3333 * @ctxt: an XML parser context
3334 *
3335 * parse an XML Processing Instruction.
3336 *
3337 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3338 */
3339 static void
htmlParsePI(htmlParserCtxtPtr ctxt)3340 htmlParsePI(htmlParserCtxtPtr ctxt) {
3341 xmlChar *buf = NULL;
3342 int len = 0;
3343 int size = HTML_PARSER_BUFFER_SIZE;
3344 int cur, l;
3345 const xmlChar *target;
3346 xmlParserInputState state;
3347 int count = 0;
3348
3349 if ((RAW == '<') && (NXT(1) == '?')) {
3350 state = ctxt->instate;
3351 ctxt->instate = XML_PARSER_PI;
3352 /*
3353 * this is a Processing Instruction.
3354 */
3355 SKIP(2);
3356 SHRINK;
3357
3358 /*
3359 * Parse the target name and check for special support like
3360 * namespace.
3361 */
3362 target = htmlParseName(ctxt);
3363 if (target != NULL) {
3364 if (RAW == '>') {
3365 SKIP(1);
3366
3367 /*
3368 * SAX: PI detected.
3369 */
3370 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3371 (ctxt->sax->processingInstruction != NULL))
3372 ctxt->sax->processingInstruction(ctxt->userData,
3373 target, NULL);
3374 ctxt->instate = state;
3375 return;
3376 }
3377 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3378 if (buf == NULL) {
3379 htmlErrMemory(ctxt, NULL);
3380 ctxt->instate = state;
3381 return;
3382 }
3383 cur = CUR;
3384 if (!IS_BLANK(cur)) {
3385 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3386 "ParsePI: PI %s space expected\n", target, NULL);
3387 }
3388 SKIP_BLANKS;
3389 cur = CUR_CHAR(l);
3390 while ((cur != 0) && (cur != '>')) {
3391 if (len + 5 >= size) {
3392 xmlChar *tmp;
3393
3394 size *= 2;
3395 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3396 if (tmp == NULL) {
3397 htmlErrMemory(ctxt, NULL);
3398 xmlFree(buf);
3399 ctxt->instate = state;
3400 return;
3401 }
3402 buf = tmp;
3403 }
3404 count++;
3405 if (count > 50) {
3406 GROW;
3407 count = 0;
3408 }
3409 if (IS_CHAR(cur)) {
3410 COPY_BUF(l,buf,len,cur);
3411 } else {
3412 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3413 "Invalid char in processing instruction "
3414 "0x%X\n", cur);
3415 }
3416 NEXTL(l);
3417 cur = CUR_CHAR(l);
3418 if (cur == 0) {
3419 SHRINK;
3420 GROW;
3421 cur = CUR_CHAR(l);
3422 }
3423 }
3424 buf[len] = 0;
3425 if (cur != '>') {
3426 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3427 "ParsePI: PI %s never end ...\n", target, NULL);
3428 } else {
3429 SKIP(1);
3430
3431 /*
3432 * SAX: PI detected.
3433 */
3434 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3435 (ctxt->sax->processingInstruction != NULL))
3436 ctxt->sax->processingInstruction(ctxt->userData,
3437 target, buf);
3438 }
3439 xmlFree(buf);
3440 } else {
3441 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3442 "PI is not started correctly", NULL, NULL);
3443 }
3444 ctxt->instate = state;
3445 }
3446 }
3447
3448 /**
3449 * htmlParseComment:
3450 * @ctxt: an HTML parser context
3451 *
3452 * Parse an XML (SGML) comment <!-- .... -->
3453 *
3454 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3455 */
3456 static void
htmlParseComment(htmlParserCtxtPtr ctxt)3457 htmlParseComment(htmlParserCtxtPtr ctxt) {
3458 xmlChar *buf = NULL;
3459 int len;
3460 int size = HTML_PARSER_BUFFER_SIZE;
3461 int q, ql;
3462 int r, rl;
3463 int cur, l;
3464 int next, nl;
3465 xmlParserInputState state;
3466
3467 /*
3468 * Check that there is a comment right here.
3469 */
3470 if ((RAW != '<') || (NXT(1) != '!') ||
3471 (NXT(2) != '-') || (NXT(3) != '-')) return;
3472
3473 state = ctxt->instate;
3474 ctxt->instate = XML_PARSER_COMMENT;
3475 SHRINK;
3476 SKIP(4);
3477 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3478 if (buf == NULL) {
3479 htmlErrMemory(ctxt, "buffer allocation failed\n");
3480 ctxt->instate = state;
3481 return;
3482 }
3483 len = 0;
3484 buf[len] = 0;
3485 q = CUR_CHAR(ql);
3486 if (q == 0)
3487 goto unfinished;
3488 NEXTL(ql);
3489 r = CUR_CHAR(rl);
3490 if (r == 0)
3491 goto unfinished;
3492 NEXTL(rl);
3493 cur = CUR_CHAR(l);
3494 while ((cur != 0) &&
3495 ((cur != '>') ||
3496 (r != '-') || (q != '-'))) {
3497 NEXTL(l);
3498 next = CUR_CHAR(nl);
3499 if (next == 0) {
3500 SHRINK;
3501 GROW;
3502 next = CUR_CHAR(nl);
3503 }
3504
3505 if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3506 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3507 "Comment incorrectly closed by '--!>'", NULL, NULL);
3508 cur = '>';
3509 break;
3510 }
3511
3512 if (len + 5 >= size) {
3513 xmlChar *tmp;
3514
3515 size *= 2;
3516 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3517 if (tmp == NULL) {
3518 xmlFree(buf);
3519 htmlErrMemory(ctxt, "growing buffer failed\n");
3520 ctxt->instate = state;
3521 return;
3522 }
3523 buf = tmp;
3524 }
3525 if (IS_CHAR(q)) {
3526 COPY_BUF(ql,buf,len,q);
3527 } else {
3528 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3529 "Invalid char in comment 0x%X\n", q);
3530 }
3531
3532 q = r;
3533 ql = rl;
3534 r = cur;
3535 rl = l;
3536 cur = next;
3537 l = nl;
3538 }
3539 buf[len] = 0;
3540 if (cur == '>') {
3541 NEXT;
3542 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3543 (!ctxt->disableSAX))
3544 ctxt->sax->comment(ctxt->userData, buf);
3545 xmlFree(buf);
3546 ctxt->instate = state;
3547 return;
3548 }
3549
3550 unfinished:
3551 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3552 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3553 xmlFree(buf);
3554 }
3555
3556 /**
3557 * htmlParseCharRef:
3558 * @ctxt: an HTML parser context
3559 *
3560 * parse Reference declarations
3561 *
3562 * [66] CharRef ::= '&#' [0-9]+ ';' |
3563 * '&#x' [0-9a-fA-F]+ ';'
3564 *
3565 * Returns the value parsed (as an int)
3566 */
3567 int
htmlParseCharRef(htmlParserCtxtPtr ctxt)3568 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3569 int val = 0;
3570
3571 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3572 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3573 "htmlParseCharRef: context error\n",
3574 NULL, NULL);
3575 return(0);
3576 }
3577 if ((CUR == '&') && (NXT(1) == '#') &&
3578 ((NXT(2) == 'x') || NXT(2) == 'X')) {
3579 SKIP(3);
3580 while (CUR != ';') {
3581 if ((CUR >= '0') && (CUR <= '9')) {
3582 if (val < 0x110000)
3583 val = val * 16 + (CUR - '0');
3584 } else if ((CUR >= 'a') && (CUR <= 'f')) {
3585 if (val < 0x110000)
3586 val = val * 16 + (CUR - 'a') + 10;
3587 } else if ((CUR >= 'A') && (CUR <= 'F')) {
3588 if (val < 0x110000)
3589 val = val * 16 + (CUR - 'A') + 10;
3590 } else {
3591 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3592 "htmlParseCharRef: missing semicolon\n",
3593 NULL, NULL);
3594 break;
3595 }
3596 NEXT;
3597 }
3598 if (CUR == ';')
3599 NEXT;
3600 } else if ((CUR == '&') && (NXT(1) == '#')) {
3601 SKIP(2);
3602 while (CUR != ';') {
3603 if ((CUR >= '0') && (CUR <= '9')) {
3604 if (val < 0x110000)
3605 val = val * 10 + (CUR - '0');
3606 } else {
3607 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3608 "htmlParseCharRef: missing semicolon\n",
3609 NULL, NULL);
3610 break;
3611 }
3612 NEXT;
3613 }
3614 if (CUR == ';')
3615 NEXT;
3616 } else {
3617 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3618 "htmlParseCharRef: invalid value\n", NULL, NULL);
3619 }
3620 /*
3621 * Check the value IS_CHAR ...
3622 */
3623 if (IS_CHAR(val)) {
3624 return(val);
3625 } else if (val >= 0x110000) {
3626 htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3627 "htmlParseCharRef: value too large\n", NULL, NULL);
3628 } else {
3629 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3630 "htmlParseCharRef: invalid xmlChar value %d\n",
3631 val);
3632 }
3633 return(0);
3634 }
3635
3636
3637 /**
3638 * htmlParseDocTypeDecl:
3639 * @ctxt: an HTML parser context
3640 *
3641 * parse a DOCTYPE declaration
3642 *
3643 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3644 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3645 */
3646
3647 static void
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt)3648 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3649 const xmlChar *name;
3650 xmlChar *ExternalID = NULL;
3651 xmlChar *URI = NULL;
3652
3653 /*
3654 * We know that '<!DOCTYPE' has been detected.
3655 */
3656 SKIP(9);
3657
3658 SKIP_BLANKS;
3659
3660 /*
3661 * Parse the DOCTYPE name.
3662 */
3663 name = htmlParseName(ctxt);
3664 if (name == NULL) {
3665 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3666 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3667 NULL, NULL);
3668 }
3669 /*
3670 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3671 */
3672
3673 SKIP_BLANKS;
3674
3675 /*
3676 * Check for SystemID and ExternalID
3677 */
3678 URI = htmlParseExternalID(ctxt, &ExternalID);
3679 SKIP_BLANKS;
3680
3681 /*
3682 * We should be at the end of the DOCTYPE declaration.
3683 */
3684 if (CUR != '>') {
3685 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3686 "DOCTYPE improperly terminated\n", NULL, NULL);
3687 /* Ignore bogus content */
3688 while ((CUR != 0) && (CUR != '>'))
3689 NEXT;
3690 }
3691 if (CUR == '>')
3692 NEXT;
3693
3694 /*
3695 * Create or update the document accordingly to the DOCTYPE
3696 */
3697 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3698 (!ctxt->disableSAX))
3699 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3700
3701 /*
3702 * Cleanup, since we don't use all those identifiers
3703 */
3704 if (URI != NULL) xmlFree(URI);
3705 if (ExternalID != NULL) xmlFree(ExternalID);
3706 }
3707
3708 /**
3709 * htmlParseAttribute:
3710 * @ctxt: an HTML parser context
3711 * @value: a xmlChar ** used to store the value of the attribute
3712 *
3713 * parse an attribute
3714 *
3715 * [41] Attribute ::= Name Eq AttValue
3716 *
3717 * [25] Eq ::= S? '=' S?
3718 *
3719 * With namespace:
3720 *
3721 * [NS 11] Attribute ::= QName Eq AttValue
3722 *
3723 * Also the case QName == xmlns:??? is handled independently as a namespace
3724 * definition.
3725 *
3726 * Returns the attribute name, and the value in *value.
3727 */
3728
3729 static const xmlChar *
htmlParseAttribute(htmlParserCtxtPtr ctxt,xmlChar ** value)3730 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3731 const xmlChar *name;
3732 xmlChar *val = NULL;
3733
3734 *value = NULL;
3735 name = htmlParseHTMLName(ctxt);
3736 if (name == NULL) {
3737 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3738 "error parsing attribute name\n", NULL, NULL);
3739 return(NULL);
3740 }
3741
3742 /*
3743 * read the value
3744 */
3745 SKIP_BLANKS;
3746 if (CUR == '=') {
3747 NEXT;
3748 SKIP_BLANKS;
3749 val = htmlParseAttValue(ctxt);
3750 }
3751
3752 *value = val;
3753 return(name);
3754 }
3755
3756 /**
3757 * htmlCheckEncodingDirect:
3758 * @ctxt: an HTML parser context
3759 * @attvalue: the attribute value
3760 *
3761 * Checks an attribute value to detect
3762 * the encoding
3763 * If a new encoding is detected the parser is switched to decode
3764 * it and pass UTF8
3765 */
3766 static void
htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt,const xmlChar * encoding)3767 htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3768
3769 if ((ctxt == NULL) || (encoding == NULL) ||
3770 (ctxt->options & HTML_PARSE_IGNORE_ENC))
3771 return;
3772
3773 /* do not change encoding */
3774 if (ctxt->input->encoding != NULL)
3775 return;
3776
3777 if (encoding != NULL) {
3778 xmlCharEncoding enc;
3779 xmlCharEncodingHandlerPtr handler;
3780
3781 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3782
3783 if (ctxt->input->encoding != NULL)
3784 xmlFree((xmlChar *) ctxt->input->encoding);
3785 ctxt->input->encoding = xmlStrdup(encoding);
3786
3787 enc = xmlParseCharEncoding((const char *) encoding);
3788 /*
3789 * registered set of known encodings
3790 */
3791 if (enc != XML_CHAR_ENCODING_ERROR) {
3792 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3793 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3794 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3795 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3796 (ctxt->input->buf != NULL) &&
3797 (ctxt->input->buf->encoder == NULL)) {
3798 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3799 "htmlCheckEncoding: wrong encoding meta\n",
3800 NULL, NULL);
3801 } else {
3802 xmlSwitchEncoding(ctxt, enc);
3803 }
3804 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3805 } else {
3806 /*
3807 * fallback for unknown encodings
3808 */
3809 handler = xmlFindCharEncodingHandler((const char *) encoding);
3810 if (handler != NULL) {
3811 xmlSwitchToEncoding(ctxt, handler);
3812 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3813 } else {
3814 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3815 "htmlCheckEncoding: unknown encoding %s\n",
3816 encoding, NULL);
3817 }
3818 }
3819
3820 if ((ctxt->input->buf != NULL) &&
3821 (ctxt->input->buf->encoder != NULL) &&
3822 (ctxt->input->buf->raw != NULL) &&
3823 (ctxt->input->buf->buffer != NULL)) {
3824 int nbchars;
3825 int processed;
3826
3827 /*
3828 * convert as much as possible to the parser reading buffer.
3829 */
3830 processed = ctxt->input->cur - ctxt->input->base;
3831 xmlBufShrink(ctxt->input->buf->buffer, processed);
3832 nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3833 xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3834 if (nbchars < 0) {
3835 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3836 "htmlCheckEncoding: encoder error\n",
3837 NULL, NULL);
3838 }
3839 }
3840 }
3841 }
3842
3843 /**
3844 * htmlCheckEncoding:
3845 * @ctxt: an HTML parser context
3846 * @attvalue: the attribute value
3847 *
3848 * Checks an http-equiv attribute from a Meta tag to detect
3849 * the encoding
3850 * If a new encoding is detected the parser is switched to decode
3851 * it and pass UTF8
3852 */
3853 static void
htmlCheckEncoding(htmlParserCtxtPtr ctxt,const xmlChar * attvalue)3854 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3855 const xmlChar *encoding;
3856
3857 if (!attvalue)
3858 return;
3859
3860 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3861 if (encoding != NULL) {
3862 encoding += 7;
3863 }
3864 /*
3865 * skip blank
3866 */
3867 if (encoding && IS_BLANK_CH(*encoding))
3868 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3869 if (encoding && *encoding == '=') {
3870 encoding ++;
3871 htmlCheckEncodingDirect(ctxt, encoding);
3872 }
3873 }
3874
3875 /**
3876 * htmlCheckMeta:
3877 * @ctxt: an HTML parser context
3878 * @atts: the attributes values
3879 *
3880 * Checks an attributes from a Meta tag
3881 */
3882 static void
htmlCheckMeta(htmlParserCtxtPtr ctxt,const xmlChar ** atts)3883 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3884 int i;
3885 const xmlChar *att, *value;
3886 int http = 0;
3887 const xmlChar *content = NULL;
3888
3889 if ((ctxt == NULL) || (atts == NULL))
3890 return;
3891
3892 i = 0;
3893 att = atts[i++];
3894 while (att != NULL) {
3895 value = atts[i++];
3896 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3897 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3898 http = 1;
3899 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3900 htmlCheckEncodingDirect(ctxt, value);
3901 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3902 content = value;
3903 att = atts[i++];
3904 }
3905 if ((http) && (content != NULL))
3906 htmlCheckEncoding(ctxt, content);
3907
3908 }
3909
3910 /**
3911 * htmlParseStartTag:
3912 * @ctxt: an HTML parser context
3913 *
3914 * parse a start of tag either for rule element or
3915 * EmptyElement. In both case we don't parse the tag closing chars.
3916 *
3917 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3918 *
3919 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3920 *
3921 * With namespace:
3922 *
3923 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3924 *
3925 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3926 *
3927 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3928 */
3929
3930 static int
htmlParseStartTag(htmlParserCtxtPtr ctxt)3931 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3932 const xmlChar *name;
3933 const xmlChar *attname;
3934 xmlChar *attvalue;
3935 const xmlChar **atts;
3936 int nbatts = 0;
3937 int maxatts;
3938 int meta = 0;
3939 int i;
3940 int discardtag = 0;
3941
3942 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3943 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3944 "htmlParseStartTag: context error\n", NULL, NULL);
3945 return -1;
3946 }
3947 if (ctxt->instate == XML_PARSER_EOF)
3948 return(-1);
3949 if (CUR != '<') return -1;
3950 NEXT;
3951
3952 atts = ctxt->atts;
3953 maxatts = ctxt->maxatts;
3954
3955 GROW;
3956 name = htmlParseHTMLName(ctxt);
3957 if (name == NULL) {
3958 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3959 "htmlParseStartTag: invalid element name\n",
3960 NULL, NULL);
3961 /* if recover preserve text on classic misconstructs */
3962 if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
3963 (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
3964 htmlParseCharDataInternal(ctxt, '<');
3965 return(-1);
3966 }
3967
3968
3969 /* Dump the bogus tag like browsers do */
3970 while ((CUR != 0) && (CUR != '>') &&
3971 (ctxt->instate != XML_PARSER_EOF))
3972 NEXT;
3973 return -1;
3974 }
3975 if (xmlStrEqual(name, BAD_CAST"meta"))
3976 meta = 1;
3977
3978 /*
3979 * Check for auto-closure of HTML elements.
3980 */
3981 htmlAutoClose(ctxt, name);
3982
3983 /*
3984 * Check for implied HTML elements.
3985 */
3986 htmlCheckImplied(ctxt, name);
3987
3988 /*
3989 * Avoid html at any level > 0, head at any level != 1
3990 * or any attempt to recurse body
3991 */
3992 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3993 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3994 "htmlParseStartTag: misplaced <html> tag\n",
3995 name, NULL);
3996 discardtag = 1;
3997 ctxt->depth++;
3998 }
3999 if ((ctxt->nameNr != 1) &&
4000 (xmlStrEqual(name, BAD_CAST"head"))) {
4001 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4002 "htmlParseStartTag: misplaced <head> tag\n",
4003 name, NULL);
4004 discardtag = 1;
4005 ctxt->depth++;
4006 }
4007 if (xmlStrEqual(name, BAD_CAST"body")) {
4008 int indx;
4009 for (indx = 0;indx < ctxt->nameNr;indx++) {
4010 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
4011 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4012 "htmlParseStartTag: misplaced <body> tag\n",
4013 name, NULL);
4014 discardtag = 1;
4015 ctxt->depth++;
4016 }
4017 }
4018 }
4019
4020 /*
4021 * Now parse the attributes, it ends up with the ending
4022 *
4023 * (S Attribute)* S?
4024 */
4025 SKIP_BLANKS;
4026 while ((CUR != 0) &&
4027 (CUR != '>') &&
4028 ((CUR != '/') || (NXT(1) != '>'))) {
4029 GROW;
4030 attname = htmlParseAttribute(ctxt, &attvalue);
4031 if (attname != NULL) {
4032
4033 /*
4034 * Well formedness requires at most one declaration of an attribute
4035 */
4036 for (i = 0; i < nbatts;i += 2) {
4037 if (xmlStrEqual(atts[i], attname)) {
4038 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
4039 "Attribute %s redefined\n", attname, NULL);
4040 if (attvalue != NULL)
4041 xmlFree(attvalue);
4042 goto failed;
4043 }
4044 }
4045
4046 /*
4047 * Add the pair to atts
4048 */
4049 if (atts == NULL) {
4050 maxatts = 22; /* allow for 10 attrs by default */
4051 atts = (const xmlChar **)
4052 xmlMalloc(maxatts * sizeof(xmlChar *));
4053 if (atts == NULL) {
4054 htmlErrMemory(ctxt, NULL);
4055 if (attvalue != NULL)
4056 xmlFree(attvalue);
4057 goto failed;
4058 }
4059 ctxt->atts = atts;
4060 ctxt->maxatts = maxatts;
4061 } else if (nbatts + 4 > maxatts) {
4062 const xmlChar **n;
4063
4064 maxatts *= 2;
4065 n = (const xmlChar **) xmlRealloc((void *) atts,
4066 maxatts * sizeof(const xmlChar *));
4067 if (n == NULL) {
4068 htmlErrMemory(ctxt, NULL);
4069 if (attvalue != NULL)
4070 xmlFree(attvalue);
4071 goto failed;
4072 }
4073 atts = n;
4074 ctxt->atts = atts;
4075 ctxt->maxatts = maxatts;
4076 }
4077 atts[nbatts++] = attname;
4078 atts[nbatts++] = attvalue;
4079 atts[nbatts] = NULL;
4080 atts[nbatts + 1] = NULL;
4081 }
4082 else {
4083 if (attvalue != NULL)
4084 xmlFree(attvalue);
4085 /* Dump the bogus attribute string up to the next blank or
4086 * the end of the tag. */
4087 while ((CUR != 0) &&
4088 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
4089 ((CUR != '/') || (NXT(1) != '>')))
4090 NEXT;
4091 }
4092
4093 failed:
4094 SKIP_BLANKS;
4095 }
4096
4097 /*
4098 * Handle specific association to the META tag
4099 */
4100 if (meta && (nbatts != 0))
4101 htmlCheckMeta(ctxt, atts);
4102
4103 /*
4104 * SAX: Start of Element !
4105 */
4106 if (!discardtag) {
4107 htmlnamePush(ctxt, name);
4108 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4109 if (nbatts != 0)
4110 ctxt->sax->startElement(ctxt->userData, name, atts);
4111 else
4112 ctxt->sax->startElement(ctxt->userData, name, NULL);
4113 }
4114 }
4115
4116 if (atts != NULL) {
4117 for (i = 1;i < nbatts;i += 2) {
4118 if (atts[i] != NULL)
4119 xmlFree((xmlChar *) atts[i]);
4120 }
4121 }
4122
4123 return(discardtag);
4124 }
4125
4126 /**
4127 * htmlParseEndTag:
4128 * @ctxt: an HTML parser context
4129 *
4130 * parse an end of tag
4131 *
4132 * [42] ETag ::= '</' Name S? '>'
4133 *
4134 * With namespace
4135 *
4136 * [NS 9] ETag ::= '</' QName S? '>'
4137 *
4138 * Returns 1 if the current level should be closed.
4139 */
4140
4141 static int
htmlParseEndTag(htmlParserCtxtPtr ctxt)4142 htmlParseEndTag(htmlParserCtxtPtr ctxt)
4143 {
4144 const xmlChar *name;
4145 const xmlChar *oldname;
4146 int i, ret;
4147
4148 if ((CUR != '<') || (NXT(1) != '/')) {
4149 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
4150 "htmlParseEndTag: '</' not found\n", NULL, NULL);
4151 return (0);
4152 }
4153 SKIP(2);
4154
4155 name = htmlParseHTMLName(ctxt);
4156 if (name == NULL)
4157 return (0);
4158 /*
4159 * We should definitely be at the ending "S? '>'" part
4160 */
4161 SKIP_BLANKS;
4162 if (CUR != '>') {
4163 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4164 "End tag : expected '>'\n", NULL, NULL);
4165 /* Skip to next '>' */
4166 while ((CUR != 0) && (CUR != '>'))
4167 NEXT;
4168 }
4169 if (CUR == '>')
4170 NEXT;
4171
4172 /*
4173 * if we ignored misplaced tags in htmlParseStartTag don't pop them
4174 * out now.
4175 */
4176 if ((ctxt->depth > 0) &&
4177 (xmlStrEqual(name, BAD_CAST "html") ||
4178 xmlStrEqual(name, BAD_CAST "body") ||
4179 xmlStrEqual(name, BAD_CAST "head"))) {
4180 ctxt->depth--;
4181 return (0);
4182 }
4183
4184 /*
4185 * If the name read is not one of the element in the parsing stack
4186 * then return, it's just an error.
4187 */
4188 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4189 if (xmlStrEqual(name, ctxt->nameTab[i]))
4190 break;
4191 }
4192 if (i < 0) {
4193 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4194 "Unexpected end tag : %s\n", name, NULL);
4195 return (0);
4196 }
4197
4198
4199 /*
4200 * Check for auto-closure of HTML elements.
4201 */
4202
4203 htmlAutoCloseOnClose(ctxt, name);
4204
4205 /*
4206 * Well formedness constraints, opening and closing must match.
4207 * With the exception that the autoclose may have popped stuff out
4208 * of the stack.
4209 */
4210 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4211 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4212 "Opening and ending tag mismatch: %s and %s\n",
4213 name, ctxt->name);
4214 }
4215
4216 /*
4217 * SAX: End of Tag
4218 */
4219 oldname = ctxt->name;
4220 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4221 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4222 ctxt->sax->endElement(ctxt->userData, name);
4223 htmlNodeInfoPop(ctxt);
4224 htmlnamePop(ctxt);
4225 ret = 1;
4226 } else {
4227 ret = 0;
4228 }
4229
4230 return (ret);
4231 }
4232
4233
4234 /**
4235 * htmlParseReference:
4236 * @ctxt: an HTML parser context
4237 *
4238 * parse and handle entity references in content,
4239 * this will end-up in a call to character() since this is either a
4240 * CharRef, or a predefined entity.
4241 */
4242 static void
htmlParseReference(htmlParserCtxtPtr ctxt)4243 htmlParseReference(htmlParserCtxtPtr ctxt) {
4244 const htmlEntityDesc * ent;
4245 xmlChar out[6];
4246 const xmlChar *name;
4247 if (CUR != '&') return;
4248
4249 if (NXT(1) == '#') {
4250 unsigned int c;
4251 int bits, i = 0;
4252
4253 c = htmlParseCharRef(ctxt);
4254 if (c == 0)
4255 return;
4256
4257 if (c < 0x80) { out[i++]= c; bits= -6; }
4258 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4259 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4260 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4261
4262 for ( ; bits >= 0; bits-= 6) {
4263 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4264 }
4265 out[i] = 0;
4266
4267 htmlCheckParagraph(ctxt);
4268 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4269 ctxt->sax->characters(ctxt->userData, out, i);
4270 } else {
4271 ent = htmlParseEntityRef(ctxt, &name);
4272 if (name == NULL) {
4273 htmlCheckParagraph(ctxt);
4274 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4275 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4276 return;
4277 }
4278 if ((ent == NULL) || !(ent->value > 0)) {
4279 htmlCheckParagraph(ctxt);
4280 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4281 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4282 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4283 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4284 }
4285 } else {
4286 unsigned int c;
4287 int bits, i = 0;
4288
4289 c = ent->value;
4290 if (c < 0x80)
4291 { out[i++]= c; bits= -6; }
4292 else if (c < 0x800)
4293 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4294 else if (c < 0x10000)
4295 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4296 else
4297 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4298
4299 for ( ; bits >= 0; bits-= 6) {
4300 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4301 }
4302 out[i] = 0;
4303
4304 htmlCheckParagraph(ctxt);
4305 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4306 ctxt->sax->characters(ctxt->userData, out, i);
4307 }
4308 }
4309 }
4310
4311 /**
4312 * htmlParseContent:
4313 * @ctxt: an HTML parser context
4314 *
4315 * Parse a content: comment, sub-element, reference or text.
4316 * Kept for compatibility with old code
4317 */
4318
4319 static void
htmlParseContent(htmlParserCtxtPtr ctxt)4320 htmlParseContent(htmlParserCtxtPtr ctxt) {
4321 xmlChar *currentNode;
4322 int depth;
4323 const xmlChar *name;
4324
4325 currentNode = xmlStrdup(ctxt->name);
4326 depth = ctxt->nameNr;
4327 while (1) {
4328 GROW;
4329
4330 if (ctxt->instate == XML_PARSER_EOF)
4331 break;
4332
4333 /*
4334 * Our tag or one of it's parent or children is ending.
4335 */
4336 if ((CUR == '<') && (NXT(1) == '/')) {
4337 if (htmlParseEndTag(ctxt) &&
4338 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4339 if (currentNode != NULL)
4340 xmlFree(currentNode);
4341 return;
4342 }
4343 continue; /* while */
4344 }
4345
4346 else if ((CUR == '<') &&
4347 ((IS_ASCII_LETTER(NXT(1))) ||
4348 (NXT(1) == '_') || (NXT(1) == ':'))) {
4349 name = htmlParseHTMLName_nonInvasive(ctxt);
4350 if (name == NULL) {
4351 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4352 "htmlParseStartTag: invalid element name\n",
4353 NULL, NULL);
4354 /* Dump the bogus tag like browsers do */
4355 while ((CUR != 0) && (CUR != '>'))
4356 NEXT;
4357
4358 if (currentNode != NULL)
4359 xmlFree(currentNode);
4360 return;
4361 }
4362
4363 if (ctxt->name != NULL) {
4364 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4365 htmlAutoClose(ctxt, name);
4366 continue;
4367 }
4368 }
4369 }
4370
4371 /*
4372 * Has this node been popped out during parsing of
4373 * the next element
4374 */
4375 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4376 (!xmlStrEqual(currentNode, ctxt->name)))
4377 {
4378 if (currentNode != NULL) xmlFree(currentNode);
4379 return;
4380 }
4381
4382 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4383 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4384 /*
4385 * Handle SCRIPT/STYLE separately
4386 */
4387 htmlParseScript(ctxt);
4388 } else {
4389 /*
4390 * Sometimes DOCTYPE arrives in the middle of the document
4391 */
4392 if ((CUR == '<') && (NXT(1) == '!') &&
4393 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4394 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4395 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4396 (UPP(8) == 'E')) {
4397 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4398 "Misplaced DOCTYPE declaration\n",
4399 BAD_CAST "DOCTYPE" , NULL);
4400 htmlParseDocTypeDecl(ctxt);
4401 }
4402
4403 /*
4404 * First case : a comment
4405 */
4406 if ((CUR == '<') && (NXT(1) == '!') &&
4407 (NXT(2) == '-') && (NXT(3) == '-')) {
4408 htmlParseComment(ctxt);
4409 }
4410
4411 /*
4412 * Second case : a Processing Instruction.
4413 */
4414 else if ((CUR == '<') && (NXT(1) == '?')) {
4415 htmlParsePI(ctxt);
4416 }
4417
4418 /*
4419 * Third case : a sub-element.
4420 */
4421 else if (CUR == '<') {
4422 htmlParseElement(ctxt);
4423 }
4424
4425 /*
4426 * Fourth case : a reference. If if has not been resolved,
4427 * parsing returns it's Name, create the node
4428 */
4429 else if (CUR == '&') {
4430 htmlParseReference(ctxt);
4431 }
4432
4433 /*
4434 * Fifth case : end of the resource
4435 */
4436 else if (CUR == 0) {
4437 htmlAutoCloseOnEnd(ctxt);
4438 break;
4439 }
4440
4441 /*
4442 * Last case, text. Note that References are handled directly.
4443 */
4444 else {
4445 htmlParseCharData(ctxt);
4446 }
4447 }
4448 GROW;
4449 }
4450 if (currentNode != NULL) xmlFree(currentNode);
4451 }
4452
4453 /**
4454 * htmlParseElement:
4455 * @ctxt: an HTML parser context
4456 *
4457 * parse an HTML element, this is highly recursive
4458 * this is kept for compatibility with previous code versions
4459 *
4460 * [39] element ::= EmptyElemTag | STag content ETag
4461 *
4462 * [41] Attribute ::= Name Eq AttValue
4463 */
4464
4465 void
htmlParseElement(htmlParserCtxtPtr ctxt)4466 htmlParseElement(htmlParserCtxtPtr ctxt) {
4467 const xmlChar *name;
4468 xmlChar *currentNode = NULL;
4469 const htmlElemDesc * info;
4470 htmlParserNodeInfo node_info;
4471 int failed;
4472 int depth;
4473 const xmlChar *oldptr;
4474
4475 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4476 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4477 "htmlParseElement: context error\n", NULL, NULL);
4478 return;
4479 }
4480
4481 if (ctxt->instate == XML_PARSER_EOF)
4482 return;
4483
4484 /* Capture start position */
4485 if (ctxt->record_info) {
4486 node_info.begin_pos = ctxt->input->consumed +
4487 (CUR_PTR - ctxt->input->base);
4488 node_info.begin_line = ctxt->input->line;
4489 }
4490
4491 failed = htmlParseStartTag(ctxt);
4492 name = ctxt->name;
4493 if ((failed == -1) || (name == NULL)) {
4494 if (CUR == '>')
4495 NEXT;
4496 return;
4497 }
4498
4499 /*
4500 * Lookup the info for that element.
4501 */
4502 info = htmlTagLookup(name);
4503 if (info == NULL) {
4504 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4505 "Tag %s invalid\n", name, NULL);
4506 }
4507
4508 /*
4509 * Check for an Empty Element labeled the XML/SGML way
4510 */
4511 if ((CUR == '/') && (NXT(1) == '>')) {
4512 SKIP(2);
4513 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4514 ctxt->sax->endElement(ctxt->userData, name);
4515 htmlnamePop(ctxt);
4516 return;
4517 }
4518
4519 if (CUR == '>') {
4520 NEXT;
4521 } else {
4522 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4523 "Couldn't find end of Start Tag %s\n", name, NULL);
4524
4525 /*
4526 * end of parsing of this node.
4527 */
4528 if (xmlStrEqual(name, ctxt->name)) {
4529 nodePop(ctxt);
4530 htmlnamePop(ctxt);
4531 }
4532
4533 /*
4534 * Capture end position and add node
4535 */
4536 if (ctxt->record_info) {
4537 node_info.end_pos = ctxt->input->consumed +
4538 (CUR_PTR - ctxt->input->base);
4539 node_info.end_line = ctxt->input->line;
4540 node_info.node = ctxt->node;
4541 xmlParserAddNodeInfo(ctxt, &node_info);
4542 }
4543 return;
4544 }
4545
4546 /*
4547 * Check for an Empty Element from DTD definition
4548 */
4549 if ((info != NULL) && (info->empty)) {
4550 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4551 ctxt->sax->endElement(ctxt->userData, name);
4552 htmlnamePop(ctxt);
4553 return;
4554 }
4555
4556 /*
4557 * Parse the content of the element:
4558 */
4559 currentNode = xmlStrdup(ctxt->name);
4560 depth = ctxt->nameNr;
4561 while (CUR != 0) {
4562 oldptr = ctxt->input->cur;
4563 htmlParseContent(ctxt);
4564 if (oldptr==ctxt->input->cur) break;
4565 if (ctxt->nameNr < depth) break;
4566 }
4567
4568 /*
4569 * Capture end position and add node
4570 */
4571 if ( currentNode != NULL && ctxt->record_info ) {
4572 node_info.end_pos = ctxt->input->consumed +
4573 (CUR_PTR - ctxt->input->base);
4574 node_info.end_line = ctxt->input->line;
4575 node_info.node = ctxt->node;
4576 xmlParserAddNodeInfo(ctxt, &node_info);
4577 }
4578 if (CUR == 0) {
4579 htmlAutoCloseOnEnd(ctxt);
4580 }
4581
4582 if (currentNode != NULL)
4583 xmlFree(currentNode);
4584 }
4585
4586 static void
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt)4587 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4588 /*
4589 * Capture end position and add node
4590 */
4591 if ( ctxt->node != NULL && ctxt->record_info ) {
4592 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4593 (CUR_PTR - ctxt->input->base);
4594 ctxt->nodeInfo->end_line = ctxt->input->line;
4595 ctxt->nodeInfo->node = ctxt->node;
4596 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4597 htmlNodeInfoPop(ctxt);
4598 }
4599 if (CUR == 0) {
4600 htmlAutoCloseOnEnd(ctxt);
4601 }
4602 }
4603
4604 /**
4605 * htmlParseElementInternal:
4606 * @ctxt: an HTML parser context
4607 *
4608 * parse an HTML element, new version, non recursive
4609 *
4610 * [39] element ::= EmptyElemTag | STag content ETag
4611 *
4612 * [41] Attribute ::= Name Eq AttValue
4613 */
4614
4615 static void
htmlParseElementInternal(htmlParserCtxtPtr ctxt)4616 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4617 const xmlChar *name;
4618 const htmlElemDesc * info;
4619 htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4620 int failed;
4621
4622 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4623 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4624 "htmlParseElementInternal: context error\n", NULL, NULL);
4625 return;
4626 }
4627
4628 if (ctxt->instate == XML_PARSER_EOF)
4629 return;
4630
4631 /* Capture start position */
4632 if (ctxt->record_info) {
4633 node_info.begin_pos = ctxt->input->consumed +
4634 (CUR_PTR - ctxt->input->base);
4635 node_info.begin_line = ctxt->input->line;
4636 }
4637
4638 failed = htmlParseStartTag(ctxt);
4639 name = ctxt->name;
4640 if ((failed == -1) || (name == NULL)) {
4641 if (CUR == '>')
4642 NEXT;
4643 return;
4644 }
4645
4646 /*
4647 * Lookup the info for that element.
4648 */
4649 info = htmlTagLookup(name);
4650 if (info == NULL) {
4651 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4652 "Tag %s invalid\n", name, NULL);
4653 }
4654
4655 /*
4656 * Check for an Empty Element labeled the XML/SGML way
4657 */
4658 if ((CUR == '/') && (NXT(1) == '>')) {
4659 SKIP(2);
4660 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4661 ctxt->sax->endElement(ctxt->userData, name);
4662 htmlnamePop(ctxt);
4663 return;
4664 }
4665
4666 if (CUR == '>') {
4667 NEXT;
4668 } else {
4669 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4670 "Couldn't find end of Start Tag %s\n", name, NULL);
4671
4672 /*
4673 * end of parsing of this node.
4674 */
4675 if (xmlStrEqual(name, ctxt->name)) {
4676 nodePop(ctxt);
4677 htmlnamePop(ctxt);
4678 }
4679
4680 if (ctxt->record_info)
4681 htmlNodeInfoPush(ctxt, &node_info);
4682 htmlParserFinishElementParsing(ctxt);
4683 return;
4684 }
4685
4686 /*
4687 * Check for an Empty Element from DTD definition
4688 */
4689 if ((info != NULL) && (info->empty)) {
4690 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4691 ctxt->sax->endElement(ctxt->userData, name);
4692 htmlnamePop(ctxt);
4693 return;
4694 }
4695
4696 if (ctxt->record_info)
4697 htmlNodeInfoPush(ctxt, &node_info);
4698 }
4699
4700 /**
4701 * htmlParseContentInternal:
4702 * @ctxt: an HTML parser context
4703 *
4704 * Parse a content: comment, sub-element, reference or text.
4705 * New version for non recursive htmlParseElementInternal
4706 */
4707
4708 static void
htmlParseContentInternal(htmlParserCtxtPtr ctxt)4709 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4710 xmlChar *currentNode;
4711 int depth;
4712 const xmlChar *name;
4713
4714 currentNode = xmlStrdup(ctxt->name);
4715 depth = ctxt->nameNr;
4716 while (1) {
4717 GROW;
4718
4719 if (ctxt->instate == XML_PARSER_EOF)
4720 break;
4721
4722 /*
4723 * Our tag or one of it's parent or children is ending.
4724 */
4725 if ((CUR == '<') && (NXT(1) == '/')) {
4726 if (htmlParseEndTag(ctxt) &&
4727 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4728 if (currentNode != NULL)
4729 xmlFree(currentNode);
4730
4731 currentNode = xmlStrdup(ctxt->name);
4732 depth = ctxt->nameNr;
4733 }
4734 continue; /* while */
4735 }
4736
4737 else if ((CUR == '<') &&
4738 ((IS_ASCII_LETTER(NXT(1))) ||
4739 (NXT(1) == '_') || (NXT(1) == ':'))) {
4740 name = htmlParseHTMLName_nonInvasive(ctxt);
4741 if (name == NULL) {
4742 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4743 "htmlParseStartTag: invalid element name\n",
4744 NULL, NULL);
4745 /* Dump the bogus tag like browsers do */
4746 while ((CUR == 0) && (CUR != '>'))
4747 NEXT;
4748
4749 htmlParserFinishElementParsing(ctxt);
4750 if (currentNode != NULL)
4751 xmlFree(currentNode);
4752
4753 currentNode = xmlStrdup(ctxt->name);
4754 depth = ctxt->nameNr;
4755 continue;
4756 }
4757
4758 if (ctxt->name != NULL) {
4759 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4760 htmlAutoClose(ctxt, name);
4761 continue;
4762 }
4763 }
4764 }
4765
4766 /*
4767 * Has this node been popped out during parsing of
4768 * the next element
4769 */
4770 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4771 (!xmlStrEqual(currentNode, ctxt->name)))
4772 {
4773 htmlParserFinishElementParsing(ctxt);
4774 if (currentNode != NULL) xmlFree(currentNode);
4775
4776 currentNode = xmlStrdup(ctxt->name);
4777 depth = ctxt->nameNr;
4778 continue;
4779 }
4780
4781 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4782 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4783 /*
4784 * Handle SCRIPT/STYLE separately
4785 */
4786 htmlParseScript(ctxt);
4787 } else {
4788 /*
4789 * Sometimes DOCTYPE arrives in the middle of the document
4790 */
4791 if ((CUR == '<') && (NXT(1) == '!') &&
4792 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4793 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4794 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4795 (UPP(8) == 'E')) {
4796 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4797 "Misplaced DOCTYPE declaration\n",
4798 BAD_CAST "DOCTYPE" , NULL);
4799 htmlParseDocTypeDecl(ctxt);
4800 }
4801
4802 /*
4803 * First case : a comment
4804 */
4805 if ((CUR == '<') && (NXT(1) == '!') &&
4806 (NXT(2) == '-') && (NXT(3) == '-')) {
4807 htmlParseComment(ctxt);
4808 }
4809
4810 /*
4811 * Second case : a Processing Instruction.
4812 */
4813 else if ((CUR == '<') && (NXT(1) == '?')) {
4814 htmlParsePI(ctxt);
4815 }
4816
4817 /*
4818 * Third case : a sub-element.
4819 */
4820 else if (CUR == '<') {
4821 htmlParseElementInternal(ctxt);
4822 if (currentNode != NULL) xmlFree(currentNode);
4823
4824 currentNode = xmlStrdup(ctxt->name);
4825 depth = ctxt->nameNr;
4826 }
4827
4828 /*
4829 * Fourth case : a reference. If if has not been resolved,
4830 * parsing returns it's Name, create the node
4831 */
4832 else if (CUR == '&') {
4833 htmlParseReference(ctxt);
4834 }
4835
4836 /*
4837 * Fifth case : end of the resource
4838 */
4839 else if (CUR == 0) {
4840 htmlAutoCloseOnEnd(ctxt);
4841 break;
4842 }
4843
4844 /*
4845 * Last case, text. Note that References are handled directly.
4846 */
4847 else {
4848 htmlParseCharData(ctxt);
4849 }
4850 }
4851 GROW;
4852 }
4853 if (currentNode != NULL) xmlFree(currentNode);
4854 }
4855
4856 /**
4857 * htmlParseContent:
4858 * @ctxt: an HTML parser context
4859 *
4860 * Parse a content: comment, sub-element, reference or text.
4861 * This is the entry point when called from parser.c
4862 */
4863
4864 void
__htmlParseContent(void * ctxt)4865 __htmlParseContent(void *ctxt) {
4866 if (ctxt != NULL)
4867 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4868 }
4869
4870 /**
4871 * htmlParseDocument:
4872 * @ctxt: an HTML parser context
4873 *
4874 * parse an HTML document (and build a tree if using the standard SAX
4875 * interface).
4876 *
4877 * Returns 0, -1 in case of error. the parser context is augmented
4878 * as a result of the parsing.
4879 */
4880
4881 int
htmlParseDocument(htmlParserCtxtPtr ctxt)4882 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4883 xmlChar start[4];
4884 xmlCharEncoding enc;
4885 xmlDtdPtr dtd;
4886
4887 xmlInitParser();
4888
4889 htmlDefaultSAXHandlerInit();
4890
4891 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4892 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4893 "htmlParseDocument: context error\n", NULL, NULL);
4894 return(XML_ERR_INTERNAL_ERROR);
4895 }
4896 ctxt->html = 1;
4897 ctxt->linenumbers = 1;
4898 GROW;
4899 /*
4900 * SAX: beginning of the document processing.
4901 */
4902 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4903 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4904
4905 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4906 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4907 /*
4908 * Get the 4 first bytes and decode the charset
4909 * if enc != XML_CHAR_ENCODING_NONE
4910 * plug some encoding conversion routines.
4911 */
4912 start[0] = RAW;
4913 start[1] = NXT(1);
4914 start[2] = NXT(2);
4915 start[3] = NXT(3);
4916 enc = xmlDetectCharEncoding(&start[0], 4);
4917 if (enc != XML_CHAR_ENCODING_NONE) {
4918 xmlSwitchEncoding(ctxt, enc);
4919 }
4920 }
4921
4922 /*
4923 * Wipe out everything which is before the first '<'
4924 */
4925 SKIP_BLANKS;
4926 if (CUR == 0) {
4927 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4928 "Document is empty\n", NULL, NULL);
4929 }
4930
4931 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4932 ctxt->sax->startDocument(ctxt->userData);
4933
4934
4935 /*
4936 * Parse possible comments and PIs before any content
4937 */
4938 while (((CUR == '<') && (NXT(1) == '!') &&
4939 (NXT(2) == '-') && (NXT(3) == '-')) ||
4940 ((CUR == '<') && (NXT(1) == '?'))) {
4941 htmlParseComment(ctxt);
4942 htmlParsePI(ctxt);
4943 SKIP_BLANKS;
4944 }
4945
4946
4947 /*
4948 * Then possibly doc type declaration(s) and more Misc
4949 * (doctypedecl Misc*)?
4950 */
4951 if ((CUR == '<') && (NXT(1) == '!') &&
4952 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4953 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4954 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4955 (UPP(8) == 'E')) {
4956 htmlParseDocTypeDecl(ctxt);
4957 }
4958 SKIP_BLANKS;
4959
4960 /*
4961 * Parse possible comments and PIs before any content
4962 */
4963 while (((CUR == '<') && (NXT(1) == '!') &&
4964 (NXT(2) == '-') && (NXT(3) == '-')) ||
4965 ((CUR == '<') && (NXT(1) == '?'))) {
4966 htmlParseComment(ctxt);
4967 htmlParsePI(ctxt);
4968 SKIP_BLANKS;
4969 }
4970
4971 /*
4972 * Time to start parsing the tree itself
4973 */
4974 htmlParseContentInternal(ctxt);
4975
4976 /*
4977 * autoclose
4978 */
4979 if (CUR == 0)
4980 htmlAutoCloseOnEnd(ctxt);
4981
4982
4983 /*
4984 * SAX: end of the document processing.
4985 */
4986 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4987 ctxt->sax->endDocument(ctxt->userData);
4988
4989 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4990 dtd = xmlGetIntSubset(ctxt->myDoc);
4991 if (dtd == NULL)
4992 ctxt->myDoc->intSubset =
4993 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4994 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4995 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4996 }
4997 if (! ctxt->wellFormed) return(-1);
4998 return(0);
4999 }
5000
5001
5002 /************************************************************************
5003 * *
5004 * Parser contexts handling *
5005 * *
5006 ************************************************************************/
5007
5008 /**
5009 * htmlInitParserCtxt:
5010 * @ctxt: an HTML parser context
5011 *
5012 * Initialize a parser context
5013 *
5014 * Returns 0 in case of success and -1 in case of error
5015 */
5016
5017 static int
htmlInitParserCtxt(htmlParserCtxtPtr ctxt)5018 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
5019 {
5020 htmlSAXHandler *sax;
5021
5022 if (ctxt == NULL) return(-1);
5023 memset(ctxt, 0, sizeof(htmlParserCtxt));
5024
5025 ctxt->dict = xmlDictCreate();
5026 if (ctxt->dict == NULL) {
5027 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5028 return(-1);
5029 }
5030 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
5031 if (sax == NULL) {
5032 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5033 return(-1);
5034 }
5035 else
5036 memset(sax, 0, sizeof(htmlSAXHandler));
5037
5038 /* Allocate the Input stack */
5039 ctxt->inputTab = (htmlParserInputPtr *)
5040 xmlMalloc(5 * sizeof(htmlParserInputPtr));
5041 if (ctxt->inputTab == NULL) {
5042 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5043 ctxt->inputNr = 0;
5044 ctxt->inputMax = 0;
5045 ctxt->input = NULL;
5046 return(-1);
5047 }
5048 ctxt->inputNr = 0;
5049 ctxt->inputMax = 5;
5050 ctxt->input = NULL;
5051 ctxt->version = NULL;
5052 ctxt->encoding = NULL;
5053 ctxt->standalone = -1;
5054 ctxt->instate = XML_PARSER_START;
5055
5056 /* Allocate the Node stack */
5057 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
5058 if (ctxt->nodeTab == NULL) {
5059 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5060 ctxt->nodeNr = 0;
5061 ctxt->nodeMax = 0;
5062 ctxt->node = NULL;
5063 ctxt->inputNr = 0;
5064 ctxt->inputMax = 0;
5065 ctxt->input = NULL;
5066 return(-1);
5067 }
5068 ctxt->nodeNr = 0;
5069 ctxt->nodeMax = 10;
5070 ctxt->node = NULL;
5071
5072 /* Allocate the Name stack */
5073 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
5074 if (ctxt->nameTab == NULL) {
5075 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5076 ctxt->nameNr = 0;
5077 ctxt->nameMax = 0;
5078 ctxt->name = NULL;
5079 ctxt->nodeNr = 0;
5080 ctxt->nodeMax = 0;
5081 ctxt->node = NULL;
5082 ctxt->inputNr = 0;
5083 ctxt->inputMax = 0;
5084 ctxt->input = NULL;
5085 return(-1);
5086 }
5087 ctxt->nameNr = 0;
5088 ctxt->nameMax = 10;
5089 ctxt->name = NULL;
5090
5091 ctxt->nodeInfoTab = NULL;
5092 ctxt->nodeInfoNr = 0;
5093 ctxt->nodeInfoMax = 0;
5094
5095 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
5096 else {
5097 ctxt->sax = sax;
5098 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
5099 }
5100 ctxt->userData = ctxt;
5101 ctxt->myDoc = NULL;
5102 ctxt->wellFormed = 1;
5103 ctxt->replaceEntities = 0;
5104 ctxt->linenumbers = xmlLineNumbersDefaultValue;
5105 ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
5106 ctxt->html = 1;
5107 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
5108 ctxt->vctxt.userData = ctxt;
5109 ctxt->vctxt.error = xmlParserValidityError;
5110 ctxt->vctxt.warning = xmlParserValidityWarning;
5111 ctxt->record_info = 0;
5112 ctxt->validate = 0;
5113 ctxt->checkIndex = 0;
5114 ctxt->catalogs = NULL;
5115 xmlInitNodeInfoSeq(&ctxt->node_seq);
5116 return(0);
5117 }
5118
5119 /**
5120 * htmlFreeParserCtxt:
5121 * @ctxt: an HTML parser context
5122 *
5123 * Free all the memory used by a parser context. However the parsed
5124 * document in ctxt->myDoc is not freed.
5125 */
5126
5127 void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)5128 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
5129 {
5130 xmlFreeParserCtxt(ctxt);
5131 }
5132
5133 /**
5134 * htmlNewParserCtxt:
5135 *
5136 * Allocate and initialize a new parser context.
5137 *
5138 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5139 */
5140
5141 htmlParserCtxtPtr
htmlNewParserCtxt(void)5142 htmlNewParserCtxt(void)
5143 {
5144 xmlParserCtxtPtr ctxt;
5145
5146 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5147 if (ctxt == NULL) {
5148 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5149 return(NULL);
5150 }
5151 memset(ctxt, 0, sizeof(xmlParserCtxt));
5152 if (htmlInitParserCtxt(ctxt) < 0) {
5153 htmlFreeParserCtxt(ctxt);
5154 return(NULL);
5155 }
5156 return(ctxt);
5157 }
5158
5159 /**
5160 * htmlCreateMemoryParserCtxt:
5161 * @buffer: a pointer to a char array
5162 * @size: the size of the array
5163 *
5164 * Create a parser context for an HTML in-memory document.
5165 *
5166 * Returns the new parser context or NULL
5167 */
5168 htmlParserCtxtPtr
htmlCreateMemoryParserCtxt(const char * buffer,int size)5169 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5170 xmlParserCtxtPtr ctxt;
5171 xmlParserInputPtr input;
5172 xmlParserInputBufferPtr buf;
5173
5174 if (buffer == NULL)
5175 return(NULL);
5176 if (size <= 0)
5177 return(NULL);
5178
5179 ctxt = htmlNewParserCtxt();
5180 if (ctxt == NULL)
5181 return(NULL);
5182
5183 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5184 if (buf == NULL) return(NULL);
5185
5186 input = xmlNewInputStream(ctxt);
5187 if (input == NULL) {
5188 xmlFreeParserCtxt(ctxt);
5189 return(NULL);
5190 }
5191
5192 input->filename = NULL;
5193 input->buf = buf;
5194 xmlBufResetInput(buf->buffer, input);
5195
5196 inputPush(ctxt, input);
5197 return(ctxt);
5198 }
5199
5200 /**
5201 * htmlCreateDocParserCtxt:
5202 * @cur: a pointer to an array of xmlChar
5203 * @encoding: a free form C string describing the HTML document encoding, or NULL
5204 *
5205 * Create a parser context for an HTML document.
5206 *
5207 * TODO: check the need to add encoding handling there
5208 *
5209 * Returns the new parser context or NULL
5210 */
5211 static htmlParserCtxtPtr
htmlCreateDocParserCtxt(const xmlChar * cur,const char * encoding)5212 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5213 int len;
5214 htmlParserCtxtPtr ctxt;
5215
5216 if (cur == NULL)
5217 return(NULL);
5218 len = xmlStrlen(cur);
5219 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5220 if (ctxt == NULL)
5221 return(NULL);
5222
5223 if (encoding != NULL) {
5224 xmlCharEncoding enc;
5225 xmlCharEncodingHandlerPtr handler;
5226
5227 if (ctxt->input->encoding != NULL)
5228 xmlFree((xmlChar *) ctxt->input->encoding);
5229 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5230
5231 enc = xmlParseCharEncoding(encoding);
5232 /*
5233 * registered set of known encodings
5234 */
5235 if (enc != XML_CHAR_ENCODING_ERROR) {
5236 xmlSwitchEncoding(ctxt, enc);
5237 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5238 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5239 "Unsupported encoding %s\n",
5240 (const xmlChar *) encoding, NULL);
5241 }
5242 } else {
5243 /*
5244 * fallback for unknown encodings
5245 */
5246 handler = xmlFindCharEncodingHandler((const char *) encoding);
5247 if (handler != NULL) {
5248 xmlSwitchToEncoding(ctxt, handler);
5249 } else {
5250 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5251 "Unsupported encoding %s\n",
5252 (const xmlChar *) encoding, NULL);
5253 }
5254 }
5255 }
5256 return(ctxt);
5257 }
5258
5259 #ifdef LIBXML_PUSH_ENABLED
5260 /************************************************************************
5261 * *
5262 * Progressive parsing interfaces *
5263 * *
5264 ************************************************************************/
5265
5266 /**
5267 * htmlParseLookupSequence:
5268 * @ctxt: an HTML parser context
5269 * @first: the first char to lookup
5270 * @next: the next char to lookup or zero
5271 * @third: the next char to lookup or zero
5272 * @ignoreattrval: skip over attribute values
5273 *
5274 * Try to find if a sequence (first, next, third) or just (first next) or
5275 * (first) is available in the input stream.
5276 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5277 * to avoid rescanning sequences of bytes, it DOES change the state of the
5278 * parser, do not use liberally.
5279 * This is basically similar to xmlParseLookupSequence()
5280 *
5281 * Returns the index to the current parsing point if the full sequence
5282 * is available, -1 otherwise.
5283 */
5284 static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt,xmlChar first,xmlChar next,xmlChar third,int ignoreattrval)5285 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5286 xmlChar next, xmlChar third, int ignoreattrval)
5287 {
5288 int base, len;
5289 htmlParserInputPtr in;
5290 const xmlChar *buf;
5291 int invalue = 0;
5292 char valdellim = 0x0;
5293
5294 in = ctxt->input;
5295 if (in == NULL)
5296 return (-1);
5297
5298 base = in->cur - in->base;
5299 if (base < 0)
5300 return (-1);
5301
5302 if (ctxt->checkIndex > base) {
5303 base = ctxt->checkIndex;
5304 /* Abuse hasPErefs member to restore current state. */
5305 invalue = ctxt->hasPErefs & 1 ? 1 : 0;
5306 }
5307
5308 if (in->buf == NULL) {
5309 buf = in->base;
5310 len = in->length;
5311 } else {
5312 buf = xmlBufContent(in->buf->buffer);
5313 len = xmlBufUse(in->buf->buffer);
5314 }
5315
5316 /* take into account the sequence length */
5317 if (third)
5318 len -= 2;
5319 else if (next)
5320 len--;
5321 for (; base < len; base++) {
5322 if (ignoreattrval) {
5323 if (buf[base] == '"' || buf[base] == '\'') {
5324 if (invalue) {
5325 if (buf[base] == valdellim) {
5326 invalue = 0;
5327 continue;
5328 }
5329 } else {
5330 valdellim = buf[base];
5331 invalue = 1;
5332 continue;
5333 }
5334 } else if (invalue) {
5335 continue;
5336 }
5337 }
5338 if (buf[base] == first) {
5339 if (third != 0) {
5340 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5341 continue;
5342 } else if (next != 0) {
5343 if (buf[base + 1] != next)
5344 continue;
5345 }
5346 ctxt->checkIndex = 0;
5347 #ifdef DEBUG_PUSH
5348 if (next == 0)
5349 xmlGenericError(xmlGenericErrorContext,
5350 "HPP: lookup '%c' found at %d\n",
5351 first, base);
5352 else if (third == 0)
5353 xmlGenericError(xmlGenericErrorContext,
5354 "HPP: lookup '%c%c' found at %d\n",
5355 first, next, base);
5356 else
5357 xmlGenericError(xmlGenericErrorContext,
5358 "HPP: lookup '%c%c%c' found at %d\n",
5359 first, next, third, base);
5360 #endif
5361 return (base - (in->cur - in->base));
5362 }
5363 }
5364 ctxt->checkIndex = base;
5365 /* Abuse hasPErefs member to track current state. */
5366 if (invalue)
5367 ctxt->hasPErefs |= 1;
5368 else
5369 ctxt->hasPErefs &= ~1;
5370 #ifdef DEBUG_PUSH
5371 if (next == 0)
5372 xmlGenericError(xmlGenericErrorContext,
5373 "HPP: lookup '%c' failed\n", first);
5374 else if (third == 0)
5375 xmlGenericError(xmlGenericErrorContext,
5376 "HPP: lookup '%c%c' failed\n", first, next);
5377 else
5378 xmlGenericError(xmlGenericErrorContext,
5379 "HPP: lookup '%c%c%c' failed\n", first, next,
5380 third);
5381 #endif
5382 return (-1);
5383 }
5384
5385 /**
5386 * htmlParseLookupCommentEnd:
5387 * @ctxt: an HTML parser context
5388 *
5389 * Try to find a comment end tag in the input stream
5390 * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5391 * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5392 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5393 * to avoid rescanning sequences of bytes, it DOES change the state of the
5394 * parser, do not use liberally.
5395 * This wraps to htmlParseLookupSequence()
5396 *
5397 * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5398 */
5399 static int
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)5400 htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5401 {
5402 int mark = 0;
5403 int cur = CUR_PTR - BASE_PTR;
5404
5405 while (mark >= 0) {
5406 mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5407 if ((mark < 0) ||
5408 (NXT(mark+2) == '>') ||
5409 ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5410 return mark;
5411 }
5412 ctxt->checkIndex = cur + mark + 1;
5413 }
5414 return mark;
5415 }
5416
5417
5418 /**
5419 * htmlParseTryOrFinish:
5420 * @ctxt: an HTML parser context
5421 * @terminate: last chunk indicator
5422 *
5423 * Try to progress on parsing
5424 *
5425 * Returns zero if no parsing was possible
5426 */
5427 static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt,int terminate)5428 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5429 int ret = 0;
5430 htmlParserInputPtr in;
5431 ptrdiff_t avail = 0;
5432 xmlChar cur, next;
5433
5434 htmlParserNodeInfo node_info;
5435
5436 #ifdef DEBUG_PUSH
5437 switch (ctxt->instate) {
5438 case XML_PARSER_EOF:
5439 xmlGenericError(xmlGenericErrorContext,
5440 "HPP: try EOF\n"); break;
5441 case XML_PARSER_START:
5442 xmlGenericError(xmlGenericErrorContext,
5443 "HPP: try START\n"); break;
5444 case XML_PARSER_MISC:
5445 xmlGenericError(xmlGenericErrorContext,
5446 "HPP: try MISC\n");break;
5447 case XML_PARSER_COMMENT:
5448 xmlGenericError(xmlGenericErrorContext,
5449 "HPP: try COMMENT\n");break;
5450 case XML_PARSER_PROLOG:
5451 xmlGenericError(xmlGenericErrorContext,
5452 "HPP: try PROLOG\n");break;
5453 case XML_PARSER_START_TAG:
5454 xmlGenericError(xmlGenericErrorContext,
5455 "HPP: try START_TAG\n");break;
5456 case XML_PARSER_CONTENT:
5457 xmlGenericError(xmlGenericErrorContext,
5458 "HPP: try CONTENT\n");break;
5459 case XML_PARSER_CDATA_SECTION:
5460 xmlGenericError(xmlGenericErrorContext,
5461 "HPP: try CDATA_SECTION\n");break;
5462 case XML_PARSER_END_TAG:
5463 xmlGenericError(xmlGenericErrorContext,
5464 "HPP: try END_TAG\n");break;
5465 case XML_PARSER_ENTITY_DECL:
5466 xmlGenericError(xmlGenericErrorContext,
5467 "HPP: try ENTITY_DECL\n");break;
5468 case XML_PARSER_ENTITY_VALUE:
5469 xmlGenericError(xmlGenericErrorContext,
5470 "HPP: try ENTITY_VALUE\n");break;
5471 case XML_PARSER_ATTRIBUTE_VALUE:
5472 xmlGenericError(xmlGenericErrorContext,
5473 "HPP: try ATTRIBUTE_VALUE\n");break;
5474 case XML_PARSER_DTD:
5475 xmlGenericError(xmlGenericErrorContext,
5476 "HPP: try DTD\n");break;
5477 case XML_PARSER_EPILOG:
5478 xmlGenericError(xmlGenericErrorContext,
5479 "HPP: try EPILOG\n");break;
5480 case XML_PARSER_PI:
5481 xmlGenericError(xmlGenericErrorContext,
5482 "HPP: try PI\n");break;
5483 case XML_PARSER_SYSTEM_LITERAL:
5484 xmlGenericError(xmlGenericErrorContext,
5485 "HPP: try SYSTEM_LITERAL\n");break;
5486 }
5487 #endif
5488
5489 while (1) {
5490
5491 in = ctxt->input;
5492 if (in == NULL) break;
5493 if (in->buf == NULL)
5494 avail = in->length - (in->cur - in->base);
5495 else
5496 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5497 (in->cur - in->base);
5498 if ((avail == 0) && (terminate)) {
5499 htmlAutoCloseOnEnd(ctxt);
5500 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5501 /*
5502 * SAX: end of the document processing.
5503 */
5504 ctxt->instate = XML_PARSER_EOF;
5505 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5506 ctxt->sax->endDocument(ctxt->userData);
5507 }
5508 }
5509 if (avail < 1)
5510 goto done;
5511 /*
5512 * This is done to make progress and avoid an infinite loop
5513 * if a parsing attempt was aborted by hitting a NUL byte. After
5514 * changing htmlCurrentChar, this probably isn't necessary anymore.
5515 * We should consider removing this check.
5516 */
5517 cur = in->cur[0];
5518 if (cur == 0) {
5519 SKIP(1);
5520 continue;
5521 }
5522
5523 switch (ctxt->instate) {
5524 case XML_PARSER_EOF:
5525 /*
5526 * Document parsing is done !
5527 */
5528 goto done;
5529 case XML_PARSER_START:
5530 /*
5531 * Very first chars read from the document flow.
5532 */
5533 cur = in->cur[0];
5534 if (IS_BLANK_CH(cur)) {
5535 SKIP_BLANKS;
5536 if (in->buf == NULL)
5537 avail = in->length - (in->cur - in->base);
5538 else
5539 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5540 (in->cur - in->base);
5541 }
5542 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5543 ctxt->sax->setDocumentLocator(ctxt->userData,
5544 &xmlDefaultSAXLocator);
5545 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5546 (!ctxt->disableSAX))
5547 ctxt->sax->startDocument(ctxt->userData);
5548
5549 cur = in->cur[0];
5550 next = in->cur[1];
5551 if ((cur == '<') && (next == '!') &&
5552 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5553 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5554 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5555 (UPP(8) == 'E')) {
5556 if ((!terminate) &&
5557 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5558 goto done;
5559 #ifdef DEBUG_PUSH
5560 xmlGenericError(xmlGenericErrorContext,
5561 "HPP: Parsing internal subset\n");
5562 #endif
5563 htmlParseDocTypeDecl(ctxt);
5564 ctxt->instate = XML_PARSER_PROLOG;
5565 #ifdef DEBUG_PUSH
5566 xmlGenericError(xmlGenericErrorContext,
5567 "HPP: entering PROLOG\n");
5568 #endif
5569 } else {
5570 ctxt->instate = XML_PARSER_MISC;
5571 #ifdef DEBUG_PUSH
5572 xmlGenericError(xmlGenericErrorContext,
5573 "HPP: entering MISC\n");
5574 #endif
5575 }
5576 break;
5577 case XML_PARSER_MISC:
5578 SKIP_BLANKS;
5579 if (in->buf == NULL)
5580 avail = in->length - (in->cur - in->base);
5581 else
5582 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5583 (in->cur - in->base);
5584 /*
5585 * no chars in buffer
5586 */
5587 if (avail < 1)
5588 goto done;
5589 /*
5590 * not enough chars in buffer
5591 */
5592 if (avail < 2) {
5593 if (!terminate)
5594 goto done;
5595 else
5596 next = ' ';
5597 } else {
5598 next = in->cur[1];
5599 }
5600 cur = in->cur[0];
5601 if ((cur == '<') && (next == '!') &&
5602 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5603 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5604 goto done;
5605 #ifdef DEBUG_PUSH
5606 xmlGenericError(xmlGenericErrorContext,
5607 "HPP: Parsing Comment\n");
5608 #endif
5609 htmlParseComment(ctxt);
5610 ctxt->instate = XML_PARSER_MISC;
5611 } else if ((cur == '<') && (next == '?')) {
5612 if ((!terminate) &&
5613 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5614 goto done;
5615 #ifdef DEBUG_PUSH
5616 xmlGenericError(xmlGenericErrorContext,
5617 "HPP: Parsing PI\n");
5618 #endif
5619 htmlParsePI(ctxt);
5620 ctxt->instate = XML_PARSER_MISC;
5621 } else if ((cur == '<') && (next == '!') &&
5622 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5623 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5624 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5625 (UPP(8) == 'E')) {
5626 if ((!terminate) &&
5627 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5628 goto done;
5629 #ifdef DEBUG_PUSH
5630 xmlGenericError(xmlGenericErrorContext,
5631 "HPP: Parsing internal subset\n");
5632 #endif
5633 htmlParseDocTypeDecl(ctxt);
5634 ctxt->instate = XML_PARSER_PROLOG;
5635 #ifdef DEBUG_PUSH
5636 xmlGenericError(xmlGenericErrorContext,
5637 "HPP: entering PROLOG\n");
5638 #endif
5639 } else if ((cur == '<') && (next == '!') &&
5640 (avail < 9)) {
5641 goto done;
5642 } else {
5643 ctxt->instate = XML_PARSER_CONTENT;
5644 #ifdef DEBUG_PUSH
5645 xmlGenericError(xmlGenericErrorContext,
5646 "HPP: entering START_TAG\n");
5647 #endif
5648 }
5649 break;
5650 case XML_PARSER_PROLOG:
5651 SKIP_BLANKS;
5652 if (in->buf == NULL)
5653 avail = in->length - (in->cur - in->base);
5654 else
5655 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5656 (in->cur - in->base);
5657 if (avail < 2)
5658 goto done;
5659 cur = in->cur[0];
5660 next = in->cur[1];
5661 if ((cur == '<') && (next == '!') &&
5662 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5663 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5664 goto done;
5665 #ifdef DEBUG_PUSH
5666 xmlGenericError(xmlGenericErrorContext,
5667 "HPP: Parsing Comment\n");
5668 #endif
5669 htmlParseComment(ctxt);
5670 ctxt->instate = XML_PARSER_PROLOG;
5671 } else if ((cur == '<') && (next == '?')) {
5672 if ((!terminate) &&
5673 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5674 goto done;
5675 #ifdef DEBUG_PUSH
5676 xmlGenericError(xmlGenericErrorContext,
5677 "HPP: Parsing PI\n");
5678 #endif
5679 htmlParsePI(ctxt);
5680 ctxt->instate = XML_PARSER_PROLOG;
5681 } else if ((cur == '<') && (next == '!') &&
5682 (avail < 4)) {
5683 goto done;
5684 } else {
5685 ctxt->instate = XML_PARSER_CONTENT;
5686 #ifdef DEBUG_PUSH
5687 xmlGenericError(xmlGenericErrorContext,
5688 "HPP: entering START_TAG\n");
5689 #endif
5690 }
5691 break;
5692 case XML_PARSER_EPILOG:
5693 if (in->buf == NULL)
5694 avail = in->length - (in->cur - in->base);
5695 else
5696 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5697 (in->cur - in->base);
5698 if (avail < 1)
5699 goto done;
5700 cur = in->cur[0];
5701 if (IS_BLANK_CH(cur)) {
5702 htmlParseCharData(ctxt);
5703 goto done;
5704 }
5705 if (avail < 2)
5706 goto done;
5707 next = in->cur[1];
5708 if ((cur == '<') && (next == '!') &&
5709 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5710 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5711 goto done;
5712 #ifdef DEBUG_PUSH
5713 xmlGenericError(xmlGenericErrorContext,
5714 "HPP: Parsing Comment\n");
5715 #endif
5716 htmlParseComment(ctxt);
5717 ctxt->instate = XML_PARSER_EPILOG;
5718 } else if ((cur == '<') && (next == '?')) {
5719 if ((!terminate) &&
5720 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5721 goto done;
5722 #ifdef DEBUG_PUSH
5723 xmlGenericError(xmlGenericErrorContext,
5724 "HPP: Parsing PI\n");
5725 #endif
5726 htmlParsePI(ctxt);
5727 ctxt->instate = XML_PARSER_EPILOG;
5728 } else if ((cur == '<') && (next == '!') &&
5729 (avail < 4)) {
5730 goto done;
5731 } else {
5732 ctxt->errNo = XML_ERR_DOCUMENT_END;
5733 ctxt->wellFormed = 0;
5734 ctxt->instate = XML_PARSER_EOF;
5735 #ifdef DEBUG_PUSH
5736 xmlGenericError(xmlGenericErrorContext,
5737 "HPP: entering EOF\n");
5738 #endif
5739 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5740 ctxt->sax->endDocument(ctxt->userData);
5741 goto done;
5742 }
5743 break;
5744 case XML_PARSER_START_TAG: {
5745 const xmlChar *name;
5746 int failed;
5747 const htmlElemDesc * info;
5748
5749 /*
5750 * no chars in buffer
5751 */
5752 if (avail < 1)
5753 goto done;
5754 /*
5755 * not enough chars in buffer
5756 */
5757 if (avail < 2) {
5758 if (!terminate)
5759 goto done;
5760 else
5761 next = ' ';
5762 } else {
5763 next = in->cur[1];
5764 }
5765 cur = in->cur[0];
5766 if (cur != '<') {
5767 ctxt->instate = XML_PARSER_CONTENT;
5768 #ifdef DEBUG_PUSH
5769 xmlGenericError(xmlGenericErrorContext,
5770 "HPP: entering CONTENT\n");
5771 #endif
5772 break;
5773 }
5774 if (next == '/') {
5775 ctxt->instate = XML_PARSER_END_TAG;
5776 ctxt->checkIndex = 0;
5777 #ifdef DEBUG_PUSH
5778 xmlGenericError(xmlGenericErrorContext,
5779 "HPP: entering END_TAG\n");
5780 #endif
5781 break;
5782 }
5783 if ((!terminate) &&
5784 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5785 goto done;
5786
5787 /* Capture start position */
5788 if (ctxt->record_info) {
5789 node_info.begin_pos = ctxt->input->consumed +
5790 (CUR_PTR - ctxt->input->base);
5791 node_info.begin_line = ctxt->input->line;
5792 }
5793
5794
5795 failed = htmlParseStartTag(ctxt);
5796 name = ctxt->name;
5797 if ((failed == -1) ||
5798 (name == NULL)) {
5799 if (CUR == '>')
5800 NEXT;
5801 break;
5802 }
5803
5804 /*
5805 * Lookup the info for that element.
5806 */
5807 info = htmlTagLookup(name);
5808 if (info == NULL) {
5809 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5810 "Tag %s invalid\n", name, NULL);
5811 }
5812
5813 /*
5814 * Check for an Empty Element labeled the XML/SGML way
5815 */
5816 if ((CUR == '/') && (NXT(1) == '>')) {
5817 SKIP(2);
5818 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5819 ctxt->sax->endElement(ctxt->userData, name);
5820 htmlnamePop(ctxt);
5821 ctxt->instate = XML_PARSER_CONTENT;
5822 #ifdef DEBUG_PUSH
5823 xmlGenericError(xmlGenericErrorContext,
5824 "HPP: entering CONTENT\n");
5825 #endif
5826 break;
5827 }
5828
5829 if (CUR == '>') {
5830 NEXT;
5831 } else {
5832 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5833 "Couldn't find end of Start Tag %s\n",
5834 name, NULL);
5835
5836 /*
5837 * end of parsing of this node.
5838 */
5839 if (xmlStrEqual(name, ctxt->name)) {
5840 nodePop(ctxt);
5841 htmlnamePop(ctxt);
5842 }
5843
5844 if (ctxt->record_info)
5845 htmlNodeInfoPush(ctxt, &node_info);
5846
5847 ctxt->instate = XML_PARSER_CONTENT;
5848 #ifdef DEBUG_PUSH
5849 xmlGenericError(xmlGenericErrorContext,
5850 "HPP: entering CONTENT\n");
5851 #endif
5852 break;
5853 }
5854
5855 /*
5856 * Check for an Empty Element from DTD definition
5857 */
5858 if ((info != NULL) && (info->empty)) {
5859 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5860 ctxt->sax->endElement(ctxt->userData, name);
5861 htmlnamePop(ctxt);
5862 }
5863
5864 if (ctxt->record_info)
5865 htmlNodeInfoPush(ctxt, &node_info);
5866
5867 ctxt->instate = XML_PARSER_CONTENT;
5868 #ifdef DEBUG_PUSH
5869 xmlGenericError(xmlGenericErrorContext,
5870 "HPP: entering CONTENT\n");
5871 #endif
5872 break;
5873 }
5874 case XML_PARSER_CONTENT: {
5875 xmlChar chr[2] = { 0, 0 };
5876
5877 /*
5878 * Handle preparsed entities and charRef
5879 */
5880 if (ctxt->token != 0) {
5881 chr[0] = (xmlChar) ctxt->token;
5882 htmlCheckParagraph(ctxt);
5883 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5884 ctxt->sax->characters(ctxt->userData, chr, 1);
5885 ctxt->token = 0;
5886 ctxt->checkIndex = 0;
5887 }
5888 if ((avail == 1) && (terminate)) {
5889 cur = in->cur[0];
5890 if ((cur != '<') && (cur != '&')) {
5891 if (ctxt->sax != NULL) {
5892 chr[0] = cur;
5893 if (IS_BLANK_CH(cur)) {
5894 if (ctxt->keepBlanks) {
5895 if (ctxt->sax->characters != NULL)
5896 ctxt->sax->characters(
5897 ctxt->userData, chr, 1);
5898 } else {
5899 if (ctxt->sax->ignorableWhitespace != NULL)
5900 ctxt->sax->ignorableWhitespace(
5901 ctxt->userData, chr, 1);
5902 }
5903 } else {
5904 htmlCheckParagraph(ctxt);
5905 if (ctxt->sax->characters != NULL)
5906 ctxt->sax->characters(
5907 ctxt->userData, chr, 1);
5908 }
5909 }
5910 ctxt->token = 0;
5911 ctxt->checkIndex = 0;
5912 in->cur++;
5913 break;
5914 }
5915 }
5916 if (avail < 2)
5917 goto done;
5918 cur = in->cur[0];
5919 next = in->cur[1];
5920 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5921 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5922 /*
5923 * Handle SCRIPT/STYLE separately
5924 */
5925 if (!terminate) {
5926 int idx;
5927 xmlChar val;
5928
5929 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5930 if (idx < 0)
5931 goto done;
5932 val = in->cur[idx + 2];
5933 if (val == 0) /* bad cut of input */
5934 goto done;
5935 }
5936 htmlParseScript(ctxt);
5937 if ((cur == '<') && (next == '/')) {
5938 ctxt->instate = XML_PARSER_END_TAG;
5939 ctxt->checkIndex = 0;
5940 #ifdef DEBUG_PUSH
5941 xmlGenericError(xmlGenericErrorContext,
5942 "HPP: entering END_TAG\n");
5943 #endif
5944 break;
5945 }
5946 } else {
5947 /*
5948 * Sometimes DOCTYPE arrives in the middle of the document
5949 */
5950 if ((cur == '<') && (next == '!') &&
5951 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5952 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5953 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5954 (UPP(8) == 'E')) {
5955 if ((!terminate) &&
5956 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5957 goto done;
5958 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5959 "Misplaced DOCTYPE declaration\n",
5960 BAD_CAST "DOCTYPE" , NULL);
5961 htmlParseDocTypeDecl(ctxt);
5962 } else if ((cur == '<') && (next == '!') &&
5963 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5964 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5965 goto done;
5966 #ifdef DEBUG_PUSH
5967 xmlGenericError(xmlGenericErrorContext,
5968 "HPP: Parsing Comment\n");
5969 #endif
5970 htmlParseComment(ctxt);
5971 ctxt->instate = XML_PARSER_CONTENT;
5972 } else if ((cur == '<') && (next == '?')) {
5973 if ((!terminate) &&
5974 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5975 goto done;
5976 #ifdef DEBUG_PUSH
5977 xmlGenericError(xmlGenericErrorContext,
5978 "HPP: Parsing PI\n");
5979 #endif
5980 htmlParsePI(ctxt);
5981 ctxt->instate = XML_PARSER_CONTENT;
5982 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5983 goto done;
5984 } else if ((cur == '<') && (next == '/')) {
5985 ctxt->instate = XML_PARSER_END_TAG;
5986 ctxt->checkIndex = 0;
5987 #ifdef DEBUG_PUSH
5988 xmlGenericError(xmlGenericErrorContext,
5989 "HPP: entering END_TAG\n");
5990 #endif
5991 break;
5992 } else if (cur == '<') {
5993 if ((!terminate) && (next == 0))
5994 goto done;
5995 /*
5996 * Only switch to START_TAG if the next character
5997 * starts a valid name. Otherwise, htmlParseStartTag
5998 * might return without consuming all characters
5999 * up to the final '>'.
6000 */
6001 if ((IS_ASCII_LETTER(next)) ||
6002 (next == '_') || (next == ':') || (next == '.')) {
6003 ctxt->instate = XML_PARSER_START_TAG;
6004 ctxt->checkIndex = 0;
6005 #ifdef DEBUG_PUSH
6006 xmlGenericError(xmlGenericErrorContext,
6007 "HPP: entering START_TAG\n");
6008 #endif
6009 } else {
6010 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
6011 "htmlParseTryOrFinish: "
6012 "invalid element name\n",
6013 NULL, NULL);
6014 htmlCheckParagraph(ctxt);
6015 if ((ctxt->sax != NULL) &&
6016 (ctxt->sax->characters != NULL))
6017 ctxt->sax->characters(ctxt->userData,
6018 in->cur, 1);
6019 NEXT;
6020 }
6021 break;
6022 } else {
6023 /*
6024 * check that the text sequence is complete
6025 * before handing out the data to the parser
6026 * to avoid problems with erroneous end of
6027 * data detection.
6028 */
6029 if ((!terminate) &&
6030 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
6031 goto done;
6032 ctxt->checkIndex = 0;
6033 #ifdef DEBUG_PUSH
6034 xmlGenericError(xmlGenericErrorContext,
6035 "HPP: Parsing char data\n");
6036 #endif
6037 while ((ctxt->instate != XML_PARSER_EOF) &&
6038 (cur != '<') && (in->cur < in->end)) {
6039 if (cur == '&') {
6040 htmlParseReference(ctxt);
6041 } else {
6042 htmlParseCharData(ctxt);
6043 }
6044 cur = in->cur[0];
6045 }
6046 }
6047 }
6048
6049 break;
6050 }
6051 case XML_PARSER_END_TAG:
6052 if (avail < 2)
6053 goto done;
6054 if ((!terminate) &&
6055 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6056 goto done;
6057 htmlParseEndTag(ctxt);
6058 if (ctxt->nameNr == 0) {
6059 ctxt->instate = XML_PARSER_EPILOG;
6060 } else {
6061 ctxt->instate = XML_PARSER_CONTENT;
6062 }
6063 ctxt->checkIndex = 0;
6064 #ifdef DEBUG_PUSH
6065 xmlGenericError(xmlGenericErrorContext,
6066 "HPP: entering CONTENT\n");
6067 #endif
6068 break;
6069 case XML_PARSER_CDATA_SECTION:
6070 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6071 "HPP: internal error, state == CDATA\n",
6072 NULL, NULL);
6073 ctxt->instate = XML_PARSER_CONTENT;
6074 ctxt->checkIndex = 0;
6075 #ifdef DEBUG_PUSH
6076 xmlGenericError(xmlGenericErrorContext,
6077 "HPP: entering CONTENT\n");
6078 #endif
6079 break;
6080 case XML_PARSER_DTD:
6081 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6082 "HPP: internal error, state == DTD\n",
6083 NULL, NULL);
6084 ctxt->instate = XML_PARSER_CONTENT;
6085 ctxt->checkIndex = 0;
6086 #ifdef DEBUG_PUSH
6087 xmlGenericError(xmlGenericErrorContext,
6088 "HPP: entering CONTENT\n");
6089 #endif
6090 break;
6091 case XML_PARSER_COMMENT:
6092 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6093 "HPP: internal error, state == COMMENT\n",
6094 NULL, NULL);
6095 ctxt->instate = XML_PARSER_CONTENT;
6096 ctxt->checkIndex = 0;
6097 #ifdef DEBUG_PUSH
6098 xmlGenericError(xmlGenericErrorContext,
6099 "HPP: entering CONTENT\n");
6100 #endif
6101 break;
6102 case XML_PARSER_PI:
6103 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6104 "HPP: internal error, state == PI\n",
6105 NULL, NULL);
6106 ctxt->instate = XML_PARSER_CONTENT;
6107 ctxt->checkIndex = 0;
6108 #ifdef DEBUG_PUSH
6109 xmlGenericError(xmlGenericErrorContext,
6110 "HPP: entering CONTENT\n");
6111 #endif
6112 break;
6113 case XML_PARSER_ENTITY_DECL:
6114 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6115 "HPP: internal error, state == ENTITY_DECL\n",
6116 NULL, NULL);
6117 ctxt->instate = XML_PARSER_CONTENT;
6118 ctxt->checkIndex = 0;
6119 #ifdef DEBUG_PUSH
6120 xmlGenericError(xmlGenericErrorContext,
6121 "HPP: entering CONTENT\n");
6122 #endif
6123 break;
6124 case XML_PARSER_ENTITY_VALUE:
6125 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6126 "HPP: internal error, state == ENTITY_VALUE\n",
6127 NULL, NULL);
6128 ctxt->instate = XML_PARSER_CONTENT;
6129 ctxt->checkIndex = 0;
6130 #ifdef DEBUG_PUSH
6131 xmlGenericError(xmlGenericErrorContext,
6132 "HPP: entering DTD\n");
6133 #endif
6134 break;
6135 case XML_PARSER_ATTRIBUTE_VALUE:
6136 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6137 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
6138 NULL, NULL);
6139 ctxt->instate = XML_PARSER_START_TAG;
6140 ctxt->checkIndex = 0;
6141 #ifdef DEBUG_PUSH
6142 xmlGenericError(xmlGenericErrorContext,
6143 "HPP: entering START_TAG\n");
6144 #endif
6145 break;
6146 case XML_PARSER_SYSTEM_LITERAL:
6147 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6148 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6149 NULL, NULL);
6150 ctxt->instate = XML_PARSER_CONTENT;
6151 ctxt->checkIndex = 0;
6152 #ifdef DEBUG_PUSH
6153 xmlGenericError(xmlGenericErrorContext,
6154 "HPP: entering CONTENT\n");
6155 #endif
6156 break;
6157 case XML_PARSER_IGNORE:
6158 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6159 "HPP: internal error, state == XML_PARSER_IGNORE\n",
6160 NULL, NULL);
6161 ctxt->instate = XML_PARSER_CONTENT;
6162 ctxt->checkIndex = 0;
6163 #ifdef DEBUG_PUSH
6164 xmlGenericError(xmlGenericErrorContext,
6165 "HPP: entering CONTENT\n");
6166 #endif
6167 break;
6168 case XML_PARSER_PUBLIC_LITERAL:
6169 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6170 "HPP: internal error, state == XML_PARSER_LITERAL\n",
6171 NULL, NULL);
6172 ctxt->instate = XML_PARSER_CONTENT;
6173 ctxt->checkIndex = 0;
6174 #ifdef DEBUG_PUSH
6175 xmlGenericError(xmlGenericErrorContext,
6176 "HPP: entering CONTENT\n");
6177 #endif
6178 break;
6179
6180 }
6181 }
6182 done:
6183 if ((avail == 0) && (terminate)) {
6184 htmlAutoCloseOnEnd(ctxt);
6185 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6186 /*
6187 * SAX: end of the document processing.
6188 */
6189 ctxt->instate = XML_PARSER_EOF;
6190 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6191 ctxt->sax->endDocument(ctxt->userData);
6192 }
6193 }
6194 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6195 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6196 (ctxt->instate == XML_PARSER_EPILOG))) {
6197 xmlDtdPtr dtd;
6198 dtd = xmlGetIntSubset(ctxt->myDoc);
6199 if (dtd == NULL)
6200 ctxt->myDoc->intSubset =
6201 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6202 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6203 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6204 }
6205 #ifdef DEBUG_PUSH
6206 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6207 #endif
6208 return(ret);
6209 }
6210
6211 /**
6212 * htmlParseChunk:
6213 * @ctxt: an HTML parser context
6214 * @chunk: an char array
6215 * @size: the size in byte of the chunk
6216 * @terminate: last chunk indicator
6217 *
6218 * Parse a Chunk of memory
6219 *
6220 * Returns zero if no error, the xmlParserErrors otherwise.
6221 */
6222 int
htmlParseChunk(htmlParserCtxtPtr ctxt,const char * chunk,int size,int terminate)6223 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6224 int terminate) {
6225 if ((ctxt == NULL) || (ctxt->input == NULL)) {
6226 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6227 "htmlParseChunk: context error\n", NULL, NULL);
6228 return(XML_ERR_INTERNAL_ERROR);
6229 }
6230 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6231 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
6232 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6233 size_t cur = ctxt->input->cur - ctxt->input->base;
6234 int res;
6235
6236 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6237 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6238 if (res < 0) {
6239 ctxt->errNo = XML_PARSER_EOF;
6240 ctxt->disableSAX = 1;
6241 return (XML_PARSER_EOF);
6242 }
6243 #ifdef DEBUG_PUSH
6244 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6245 #endif
6246
6247 #if 0
6248 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6249 htmlParseTryOrFinish(ctxt, terminate);
6250 #endif
6251 } else if (ctxt->instate != XML_PARSER_EOF) {
6252 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6253 xmlParserInputBufferPtr in = ctxt->input->buf;
6254 if ((in->encoder != NULL) && (in->buffer != NULL) &&
6255 (in->raw != NULL)) {
6256 int nbchars;
6257 size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6258 size_t current = ctxt->input->cur - ctxt->input->base;
6259
6260 nbchars = xmlCharEncInput(in, terminate);
6261 xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6262 if (nbchars < 0) {
6263 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6264 "encoder error\n", NULL, NULL);
6265 return(XML_ERR_INVALID_ENCODING);
6266 }
6267 }
6268 }
6269 }
6270 htmlParseTryOrFinish(ctxt, terminate);
6271 if (terminate) {
6272 if ((ctxt->instate != XML_PARSER_EOF) &&
6273 (ctxt->instate != XML_PARSER_EPILOG) &&
6274 (ctxt->instate != XML_PARSER_MISC)) {
6275 ctxt->errNo = XML_ERR_DOCUMENT_END;
6276 ctxt->wellFormed = 0;
6277 }
6278 if (ctxt->instate != XML_PARSER_EOF) {
6279 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6280 ctxt->sax->endDocument(ctxt->userData);
6281 }
6282 ctxt->instate = XML_PARSER_EOF;
6283 }
6284 return((xmlParserErrors) ctxt->errNo);
6285 }
6286
6287 /************************************************************************
6288 * *
6289 * User entry points *
6290 * *
6291 ************************************************************************/
6292
6293 /**
6294 * htmlCreatePushParserCtxt:
6295 * @sax: a SAX handler
6296 * @user_data: The user data returned on SAX callbacks
6297 * @chunk: a pointer to an array of chars
6298 * @size: number of chars in the array
6299 * @filename: an optional file name or URI
6300 * @enc: an optional encoding
6301 *
6302 * Create a parser context for using the HTML parser in push mode
6303 * The value of @filename is used for fetching external entities
6304 * and error/warning reports.
6305 *
6306 * Returns the new parser context or NULL
6307 */
6308 htmlParserCtxtPtr
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,void * user_data,const char * chunk,int size,const char * filename,xmlCharEncoding enc)6309 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6310 const char *chunk, int size, const char *filename,
6311 xmlCharEncoding enc) {
6312 htmlParserCtxtPtr ctxt;
6313 htmlParserInputPtr inputStream;
6314 xmlParserInputBufferPtr buf;
6315
6316 xmlInitParser();
6317
6318 buf = xmlAllocParserInputBuffer(enc);
6319 if (buf == NULL) return(NULL);
6320
6321 ctxt = htmlNewParserCtxt();
6322 if (ctxt == NULL) {
6323 xmlFreeParserInputBuffer(buf);
6324 return(NULL);
6325 }
6326 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6327 ctxt->charset=XML_CHAR_ENCODING_UTF8;
6328 if (sax != NULL) {
6329 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6330 xmlFree(ctxt->sax);
6331 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6332 if (ctxt->sax == NULL) {
6333 xmlFree(buf);
6334 xmlFree(ctxt);
6335 return(NULL);
6336 }
6337 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6338 if (user_data != NULL)
6339 ctxt->userData = user_data;
6340 }
6341 if (filename == NULL) {
6342 ctxt->directory = NULL;
6343 } else {
6344 ctxt->directory = xmlParserGetDirectory(filename);
6345 }
6346
6347 inputStream = htmlNewInputStream(ctxt);
6348 if (inputStream == NULL) {
6349 xmlFreeParserCtxt(ctxt);
6350 xmlFree(buf);
6351 return(NULL);
6352 }
6353
6354 if (filename == NULL)
6355 inputStream->filename = NULL;
6356 else
6357 inputStream->filename = (char *)
6358 xmlCanonicPath((const xmlChar *) filename);
6359 inputStream->buf = buf;
6360 xmlBufResetInput(buf->buffer, inputStream);
6361
6362 inputPush(ctxt, inputStream);
6363
6364 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6365 (ctxt->input->buf != NULL)) {
6366 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6367 size_t cur = ctxt->input->cur - ctxt->input->base;
6368
6369 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6370
6371 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6372 #ifdef DEBUG_PUSH
6373 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6374 #endif
6375 }
6376 ctxt->progressive = 1;
6377
6378 return(ctxt);
6379 }
6380 #endif /* LIBXML_PUSH_ENABLED */
6381
6382 /**
6383 * htmlSAXParseDoc:
6384 * @cur: a pointer to an array of xmlChar
6385 * @encoding: a free form C string describing the HTML document encoding, or NULL
6386 * @sax: the SAX handler block
6387 * @userData: if using SAX, this pointer will be provided on callbacks.
6388 *
6389 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6390 * to handle parse events. If sax is NULL, fallback to the default DOM
6391 * behavior and return a tree.
6392 *
6393 * Returns the resulting document tree unless SAX is NULL or the document is
6394 * not well formed.
6395 */
6396
6397 htmlDocPtr
htmlSAXParseDoc(const xmlChar * cur,const char * encoding,htmlSAXHandlerPtr sax,void * userData)6398 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6399 htmlSAXHandlerPtr sax, void *userData) {
6400 htmlDocPtr ret;
6401 htmlParserCtxtPtr ctxt;
6402
6403 xmlInitParser();
6404
6405 if (cur == NULL) return(NULL);
6406
6407
6408 ctxt = htmlCreateDocParserCtxt(cur, encoding);
6409 if (ctxt == NULL) return(NULL);
6410 if (sax != NULL) {
6411 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6412 ctxt->sax = sax;
6413 ctxt->userData = userData;
6414 }
6415
6416 htmlParseDocument(ctxt);
6417 ret = ctxt->myDoc;
6418 if (sax != NULL) {
6419 ctxt->sax = NULL;
6420 ctxt->userData = NULL;
6421 }
6422 htmlFreeParserCtxt(ctxt);
6423
6424 return(ret);
6425 }
6426
6427 /**
6428 * htmlParseDoc:
6429 * @cur: a pointer to an array of xmlChar
6430 * @encoding: a free form C string describing the HTML document encoding, or NULL
6431 *
6432 * parse an HTML in-memory document and build a tree.
6433 *
6434 * Returns the resulting document tree
6435 */
6436
6437 htmlDocPtr
htmlParseDoc(const xmlChar * cur,const char * encoding)6438 htmlParseDoc(const xmlChar *cur, const char *encoding) {
6439 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6440 }
6441
6442
6443 /**
6444 * htmlCreateFileParserCtxt:
6445 * @filename: the filename
6446 * @encoding: a free form C string describing the HTML document encoding, or NULL
6447 *
6448 * Create a parser context for a file content.
6449 * Automatic support for ZLIB/Compress compressed document is provided
6450 * by default if found at compile-time.
6451 *
6452 * Returns the new parser context or NULL
6453 */
6454 htmlParserCtxtPtr
htmlCreateFileParserCtxt(const char * filename,const char * encoding)6455 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6456 {
6457 htmlParserCtxtPtr ctxt;
6458 htmlParserInputPtr inputStream;
6459 char *canonicFilename;
6460 /* htmlCharEncoding enc; */
6461 xmlChar *content, *content_line = (xmlChar *) "charset=";
6462
6463 if (filename == NULL)
6464 return(NULL);
6465
6466 ctxt = htmlNewParserCtxt();
6467 if (ctxt == NULL) {
6468 return(NULL);
6469 }
6470 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6471 if (canonicFilename == NULL) {
6472 #ifdef LIBXML_SAX1_ENABLED
6473 if (xmlDefaultSAXHandler.error != NULL) {
6474 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6475 }
6476 #endif
6477 xmlFreeParserCtxt(ctxt);
6478 return(NULL);
6479 }
6480
6481 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6482 xmlFree(canonicFilename);
6483 if (inputStream == NULL) {
6484 xmlFreeParserCtxt(ctxt);
6485 return(NULL);
6486 }
6487
6488 inputPush(ctxt, inputStream);
6489
6490 /* set encoding */
6491 if (encoding) {
6492 size_t l = strlen(encoding);
6493
6494 if (l < 1000) {
6495 content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6496 if (content) {
6497 strcpy ((char *)content, (char *)content_line);
6498 strcat ((char *)content, (char *)encoding);
6499 htmlCheckEncoding (ctxt, content);
6500 xmlFree (content);
6501 }
6502 }
6503 }
6504
6505 return(ctxt);
6506 }
6507
6508 /**
6509 * htmlSAXParseFile:
6510 * @filename: the filename
6511 * @encoding: a free form C string describing the HTML document encoding, or NULL
6512 * @sax: the SAX handler block
6513 * @userData: if using SAX, this pointer will be provided on callbacks.
6514 *
6515 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6516 * compressed document is provided by default if found at compile-time.
6517 * It use the given SAX function block to handle the parsing callback.
6518 * If sax is NULL, fallback to the default DOM tree building routines.
6519 *
6520 * Returns the resulting document tree unless SAX is NULL or the document is
6521 * not well formed.
6522 */
6523
6524 htmlDocPtr
htmlSAXParseFile(const char * filename,const char * encoding,htmlSAXHandlerPtr sax,void * userData)6525 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6526 void *userData) {
6527 htmlDocPtr ret;
6528 htmlParserCtxtPtr ctxt;
6529 htmlSAXHandlerPtr oldsax = NULL;
6530
6531 xmlInitParser();
6532
6533 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6534 if (ctxt == NULL) return(NULL);
6535 if (sax != NULL) {
6536 oldsax = ctxt->sax;
6537 ctxt->sax = sax;
6538 ctxt->userData = userData;
6539 }
6540
6541 htmlParseDocument(ctxt);
6542
6543 ret = ctxt->myDoc;
6544 if (sax != NULL) {
6545 ctxt->sax = oldsax;
6546 ctxt->userData = NULL;
6547 }
6548 htmlFreeParserCtxt(ctxt);
6549
6550 return(ret);
6551 }
6552
6553 /**
6554 * htmlParseFile:
6555 * @filename: the filename
6556 * @encoding: a free form C string describing the HTML document encoding, or NULL
6557 *
6558 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6559 * compressed document is provided by default if found at compile-time.
6560 *
6561 * Returns the resulting document tree
6562 */
6563
6564 htmlDocPtr
htmlParseFile(const char * filename,const char * encoding)6565 htmlParseFile(const char *filename, const char *encoding) {
6566 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6567 }
6568
6569 /**
6570 * htmlHandleOmittedElem:
6571 * @val: int 0 or 1
6572 *
6573 * Set and return the previous value for handling HTML omitted tags.
6574 *
6575 * Returns the last value for 0 for no handling, 1 for auto insertion.
6576 */
6577
6578 int
htmlHandleOmittedElem(int val)6579 htmlHandleOmittedElem(int val) {
6580 int old = htmlOmittedDefaultValue;
6581
6582 htmlOmittedDefaultValue = val;
6583 return(old);
6584 }
6585
6586 /**
6587 * htmlElementAllowedHere:
6588 * @parent: HTML parent element
6589 * @elt: HTML element
6590 *
6591 * Checks whether an HTML element may be a direct child of a parent element.
6592 * Note - doesn't check for deprecated elements
6593 *
6594 * Returns 1 if allowed; 0 otherwise.
6595 */
6596 int
htmlElementAllowedHere(const htmlElemDesc * parent,const xmlChar * elt)6597 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6598 const char** p ;
6599
6600 if ( ! elt || ! parent || ! parent->subelts )
6601 return 0 ;
6602
6603 for ( p = parent->subelts; *p; ++p )
6604 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6605 return 1 ;
6606
6607 return 0 ;
6608 }
6609 /**
6610 * htmlElementStatusHere:
6611 * @parent: HTML parent element
6612 * @elt: HTML element
6613 *
6614 * Checks whether an HTML element may be a direct child of a parent element.
6615 * and if so whether it is valid or deprecated.
6616 *
6617 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6618 */
6619 htmlStatus
htmlElementStatusHere(const htmlElemDesc * parent,const htmlElemDesc * elt)6620 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6621 if ( ! parent || ! elt )
6622 return HTML_INVALID ;
6623 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6624 return HTML_INVALID ;
6625
6626 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6627 }
6628 /**
6629 * htmlAttrAllowed:
6630 * @elt: HTML element
6631 * @attr: HTML attribute
6632 * @legacy: whether to allow deprecated attributes
6633 *
6634 * Checks whether an attribute is valid for an element
6635 * Has full knowledge of Required and Deprecated attributes
6636 *
6637 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6638 */
6639 htmlStatus
htmlAttrAllowed(const htmlElemDesc * elt,const xmlChar * attr,int legacy)6640 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6641 const char** p ;
6642
6643 if ( !elt || ! attr )
6644 return HTML_INVALID ;
6645
6646 if ( elt->attrs_req )
6647 for ( p = elt->attrs_req; *p; ++p)
6648 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6649 return HTML_REQUIRED ;
6650
6651 if ( elt->attrs_opt )
6652 for ( p = elt->attrs_opt; *p; ++p)
6653 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6654 return HTML_VALID ;
6655
6656 if ( legacy && elt->attrs_depr )
6657 for ( p = elt->attrs_depr; *p; ++p)
6658 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6659 return HTML_DEPRECATED ;
6660
6661 return HTML_INVALID ;
6662 }
6663 /**
6664 * htmlNodeStatus:
6665 * @node: an htmlNodePtr in a tree
6666 * @legacy: whether to allow deprecated elements (YES is faster here
6667 * for Element nodes)
6668 *
6669 * Checks whether the tree node is valid. Experimental (the author
6670 * only uses the HTML enhancements in a SAX parser)
6671 *
6672 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6673 * legacy allowed) or htmlElementStatusHere (otherwise).
6674 * for Attribute nodes, a return from htmlAttrAllowed
6675 * for other nodes, HTML_NA (no checks performed)
6676 */
6677 htmlStatus
htmlNodeStatus(const htmlNodePtr node,int legacy)6678 htmlNodeStatus(const htmlNodePtr node, int legacy) {
6679 if ( ! node )
6680 return HTML_INVALID ;
6681
6682 switch ( node->type ) {
6683 case XML_ELEMENT_NODE:
6684 return legacy
6685 ? ( htmlElementAllowedHere (
6686 htmlTagLookup(node->parent->name) , node->name
6687 ) ? HTML_VALID : HTML_INVALID )
6688 : htmlElementStatusHere(
6689 htmlTagLookup(node->parent->name) ,
6690 htmlTagLookup(node->name) )
6691 ;
6692 case XML_ATTRIBUTE_NODE:
6693 return htmlAttrAllowed(
6694 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6695 default: return HTML_NA ;
6696 }
6697 }
6698 /************************************************************************
6699 * *
6700 * New set (2.6.0) of simpler and more flexible APIs *
6701 * *
6702 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope.  A variable named dict must be visible where the macro
 * is expanded.  @str is evaluated more than once, so it must not have
 * side effects.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a
 * single statement; invoke it with a trailing semicolon.
 */
#define DICT_FREE(str)						\
    do {							\
        if ((str) && ((!dict) ||				\
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
            xmlFree((char *)(str));				\
    } while (0)
6714
6715 /**
6716 * htmlCtxtReset:
6717 * @ctxt: an HTML parser context
6718 *
6719 * Reset a parser context
6720 */
6721 void
htmlCtxtReset(htmlParserCtxtPtr ctxt)6722 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6723 {
6724 xmlParserInputPtr input;
6725 xmlDictPtr dict;
6726
6727 if (ctxt == NULL)
6728 return;
6729
6730 xmlInitParser();
6731 dict = ctxt->dict;
6732
6733 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6734 xmlFreeInputStream(input);
6735 }
6736 ctxt->inputNr = 0;
6737 ctxt->input = NULL;
6738
6739 ctxt->spaceNr = 0;
6740 if (ctxt->spaceTab != NULL) {
6741 ctxt->spaceTab[0] = -1;
6742 ctxt->space = &ctxt->spaceTab[0];
6743 } else {
6744 ctxt->space = NULL;
6745 }
6746
6747
6748 ctxt->nodeNr = 0;
6749 ctxt->node = NULL;
6750
6751 ctxt->nameNr = 0;
6752 ctxt->name = NULL;
6753
6754 DICT_FREE(ctxt->version);
6755 ctxt->version = NULL;
6756 DICT_FREE(ctxt->encoding);
6757 ctxt->encoding = NULL;
6758 DICT_FREE(ctxt->directory);
6759 ctxt->directory = NULL;
6760 DICT_FREE(ctxt->extSubURI);
6761 ctxt->extSubURI = NULL;
6762 DICT_FREE(ctxt->extSubSystem);
6763 ctxt->extSubSystem = NULL;
6764 if (ctxt->myDoc != NULL)
6765 xmlFreeDoc(ctxt->myDoc);
6766 ctxt->myDoc = NULL;
6767
6768 ctxt->standalone = -1;
6769 ctxt->hasExternalSubset = 0;
6770 ctxt->hasPErefs = 0;
6771 ctxt->html = 1;
6772 ctxt->external = 0;
6773 ctxt->instate = XML_PARSER_START;
6774 ctxt->token = 0;
6775
6776 ctxt->wellFormed = 1;
6777 ctxt->nsWellFormed = 1;
6778 ctxt->disableSAX = 0;
6779 ctxt->valid = 1;
6780 ctxt->vctxt.userData = ctxt;
6781 ctxt->vctxt.error = xmlParserValidityError;
6782 ctxt->vctxt.warning = xmlParserValidityWarning;
6783 ctxt->record_info = 0;
6784 ctxt->checkIndex = 0;
6785 ctxt->inSubset = 0;
6786 ctxt->errNo = XML_ERR_OK;
6787 ctxt->depth = 0;
6788 ctxt->charset = XML_CHAR_ENCODING_NONE;
6789 ctxt->catalogs = NULL;
6790 xmlInitNodeInfoSeq(&ctxt->node_seq);
6791
6792 if (ctxt->attsDefault != NULL) {
6793 xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6794 ctxt->attsDefault = NULL;
6795 }
6796 if (ctxt->attsSpecial != NULL) {
6797 xmlHashFree(ctxt->attsSpecial, NULL);
6798 ctxt->attsSpecial = NULL;
6799 }
6800 }
6801
6802 /**
6803 * htmlCtxtUseOptions:
6804 * @ctxt: an HTML parser context
6805 * @options: a combination of htmlParserOption(s)
6806 *
6807 * Applies the options to the parser context
6808 *
6809 * Returns 0 in case of success, the set of unknown or unimplemented options
6810 * in case of error.
6811 */
6812 int
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt,int options)6813 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6814 {
6815 if (ctxt == NULL)
6816 return(-1);
6817
6818 if (options & HTML_PARSE_NOWARNING) {
6819 ctxt->sax->warning = NULL;
6820 ctxt->vctxt.warning = NULL;
6821 options -= XML_PARSE_NOWARNING;
6822 ctxt->options |= XML_PARSE_NOWARNING;
6823 }
6824 if (options & HTML_PARSE_NOERROR) {
6825 ctxt->sax->error = NULL;
6826 ctxt->vctxt.error = NULL;
6827 ctxt->sax->fatalError = NULL;
6828 options -= XML_PARSE_NOERROR;
6829 ctxt->options |= XML_PARSE_NOERROR;
6830 }
6831 if (options & HTML_PARSE_PEDANTIC) {
6832 ctxt->pedantic = 1;
6833 options -= XML_PARSE_PEDANTIC;
6834 ctxt->options |= XML_PARSE_PEDANTIC;
6835 } else
6836 ctxt->pedantic = 0;
6837 if (options & XML_PARSE_NOBLANKS) {
6838 ctxt->keepBlanks = 0;
6839 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6840 options -= XML_PARSE_NOBLANKS;
6841 ctxt->options |= XML_PARSE_NOBLANKS;
6842 } else
6843 ctxt->keepBlanks = 1;
6844 if (options & HTML_PARSE_RECOVER) {
6845 ctxt->recovery = 1;
6846 options -= HTML_PARSE_RECOVER;
6847 } else
6848 ctxt->recovery = 0;
6849 if (options & HTML_PARSE_COMPACT) {
6850 ctxt->options |= HTML_PARSE_COMPACT;
6851 options -= HTML_PARSE_COMPACT;
6852 }
6853 if (options & XML_PARSE_HUGE) {
6854 ctxt->options |= XML_PARSE_HUGE;
6855 options -= XML_PARSE_HUGE;
6856 }
6857 if (options & HTML_PARSE_NODEFDTD) {
6858 ctxt->options |= HTML_PARSE_NODEFDTD;
6859 options -= HTML_PARSE_NODEFDTD;
6860 }
6861 if (options & HTML_PARSE_IGNORE_ENC) {
6862 ctxt->options |= HTML_PARSE_IGNORE_ENC;
6863 options -= HTML_PARSE_IGNORE_ENC;
6864 }
6865 if (options & HTML_PARSE_NOIMPLIED) {
6866 ctxt->options |= HTML_PARSE_NOIMPLIED;
6867 options -= HTML_PARSE_NOIMPLIED;
6868 }
6869 ctxt->dictNames = 0;
6870 return (options);
6871 }
6872
6873 /**
6874 * htmlDoRead:
6875 * @ctxt: an HTML parser context
6876 * @URL: the base URL to use for the document
6877 * @encoding: the document encoding, or NULL
6878 * @options: a combination of htmlParserOption(s)
6879 * @reuse: keep the context for reuse
6880 *
6881 * Common front-end for the htmlRead functions
6882 *
6883 * Returns the resulting document tree or NULL
6884 */
6885 static htmlDocPtr
htmlDoRead(htmlParserCtxtPtr ctxt,const char * URL,const char * encoding,int options,int reuse)6886 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6887 int options, int reuse)
6888 {
6889 htmlDocPtr ret;
6890
6891 htmlCtxtUseOptions(ctxt, options);
6892 ctxt->html = 1;
6893 if (encoding != NULL) {
6894 xmlCharEncodingHandlerPtr hdlr;
6895
6896 hdlr = xmlFindCharEncodingHandler(encoding);
6897 if (hdlr != NULL) {
6898 xmlSwitchToEncoding(ctxt, hdlr);
6899 if (ctxt->input->encoding != NULL)
6900 xmlFree((xmlChar *) ctxt->input->encoding);
6901 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6902 }
6903 }
6904 if ((URL != NULL) && (ctxt->input != NULL) &&
6905 (ctxt->input->filename == NULL))
6906 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6907 htmlParseDocument(ctxt);
6908 ret = ctxt->myDoc;
6909 ctxt->myDoc = NULL;
6910 if (!reuse) {
6911 if ((ctxt->dictNames) &&
6912 (ret != NULL) &&
6913 (ret->dict == ctxt->dict))
6914 ctxt->dict = NULL;
6915 xmlFreeParserCtxt(ctxt);
6916 }
6917 return (ret);
6918 }
6919
6920 /**
6921 * htmlReadDoc:
6922 * @cur: a pointer to a zero terminated string
6923 * @URL: the base URL to use for the document
6924 * @encoding: the document encoding, or NULL
6925 * @options: a combination of htmlParserOption(s)
6926 *
6927 * parse an XML in-memory document and build a tree.
6928 *
6929 * Returns the resulting document tree
6930 */
6931 htmlDocPtr
htmlReadDoc(const xmlChar * cur,const char * URL,const char * encoding,int options)6932 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6933 {
6934 htmlParserCtxtPtr ctxt;
6935
6936 if (cur == NULL)
6937 return (NULL);
6938
6939 xmlInitParser();
6940 ctxt = htmlCreateDocParserCtxt(cur, NULL);
6941 if (ctxt == NULL)
6942 return (NULL);
6943 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6944 }
6945
6946 /**
6947 * htmlReadFile:
6948 * @filename: a file or URL
6949 * @encoding: the document encoding, or NULL
6950 * @options: a combination of htmlParserOption(s)
6951 *
6952 * parse an XML file from the filesystem or the network.
6953 *
6954 * Returns the resulting document tree
6955 */
6956 htmlDocPtr
htmlReadFile(const char * filename,const char * encoding,int options)6957 htmlReadFile(const char *filename, const char *encoding, int options)
6958 {
6959 htmlParserCtxtPtr ctxt;
6960
6961 xmlInitParser();
6962 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6963 if (ctxt == NULL)
6964 return (NULL);
6965 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6966 }
6967
6968 /**
6969 * htmlReadMemory:
6970 * @buffer: a pointer to a char array
6971 * @size: the size of the array
6972 * @URL: the base URL to use for the document
6973 * @encoding: the document encoding, or NULL
6974 * @options: a combination of htmlParserOption(s)
6975 *
6976 * parse an XML in-memory document and build a tree.
6977 *
6978 * Returns the resulting document tree
6979 */
6980 htmlDocPtr
htmlReadMemory(const char * buffer,int size,const char * URL,const char * encoding,int options)6981 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6982 {
6983 htmlParserCtxtPtr ctxt;
6984
6985 xmlInitParser();
6986 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6987 if (ctxt == NULL)
6988 return (NULL);
6989 htmlDefaultSAXHandlerInit();
6990 if (ctxt->sax != NULL)
6991 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6992 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6993 }
6994
6995 /**
6996 * htmlReadFd:
6997 * @fd: an open file descriptor
6998 * @URL: the base URL to use for the document
6999 * @encoding: the document encoding, or NULL
7000 * @options: a combination of htmlParserOption(s)
7001 *
7002 * parse an XML from a file descriptor and build a tree.
7003 *
7004 * Returns the resulting document tree
7005 */
7006 htmlDocPtr
htmlReadFd(int fd,const char * URL,const char * encoding,int options)7007 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
7008 {
7009 htmlParserCtxtPtr ctxt;
7010 xmlParserInputBufferPtr input;
7011 xmlParserInputPtr stream;
7012
7013 if (fd < 0)
7014 return (NULL);
7015 xmlInitParser();
7016
7017 xmlInitParser();
7018 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7019 if (input == NULL)
7020 return (NULL);
7021 ctxt = xmlNewParserCtxt();
7022 if (ctxt == NULL) {
7023 xmlFreeParserInputBuffer(input);
7024 return (NULL);
7025 }
7026 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7027 if (stream == NULL) {
7028 xmlFreeParserInputBuffer(input);
7029 xmlFreeParserCtxt(ctxt);
7030 return (NULL);
7031 }
7032 inputPush(ctxt, stream);
7033 return (htmlDoRead(ctxt, URL, encoding, options, 0));
7034 }
7035
7036 /**
7037 * htmlReadIO:
7038 * @ioread: an I/O read function
7039 * @ioclose: an I/O close function
7040 * @ioctx: an I/O handler
7041 * @URL: the base URL to use for the document
7042 * @encoding: the document encoding, or NULL
7043 * @options: a combination of htmlParserOption(s)
7044 *
7045 * parse an HTML document from I/O functions and source and build a tree.
7046 *
7047 * Returns the resulting document tree
7048 */
7049 htmlDocPtr
htmlReadIO(xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)7050 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
7051 void *ioctx, const char *URL, const char *encoding, int options)
7052 {
7053 htmlParserCtxtPtr ctxt;
7054 xmlParserInputBufferPtr input;
7055 xmlParserInputPtr stream;
7056
7057 if (ioread == NULL)
7058 return (NULL);
7059 xmlInitParser();
7060
7061 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7062 XML_CHAR_ENCODING_NONE);
7063 if (input == NULL) {
7064 if (ioclose != NULL)
7065 ioclose(ioctx);
7066 return (NULL);
7067 }
7068 ctxt = htmlNewParserCtxt();
7069 if (ctxt == NULL) {
7070 xmlFreeParserInputBuffer(input);
7071 return (NULL);
7072 }
7073 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7074 if (stream == NULL) {
7075 xmlFreeParserInputBuffer(input);
7076 xmlFreeParserCtxt(ctxt);
7077 return (NULL);
7078 }
7079 inputPush(ctxt, stream);
7080 return (htmlDoRead(ctxt, URL, encoding, options, 0));
7081 }
7082
7083 /**
7084 * htmlCtxtReadDoc:
7085 * @ctxt: an HTML parser context
7086 * @cur: a pointer to a zero terminated string
7087 * @URL: the base URL to use for the document
7088 * @encoding: the document encoding, or NULL
7089 * @options: a combination of htmlParserOption(s)
7090 *
7091 * parse an XML in-memory document and build a tree.
7092 * This reuses the existing @ctxt parser context
7093 *
7094 * Returns the resulting document tree
7095 */
7096 htmlDocPtr
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt,const xmlChar * cur,const char * URL,const char * encoding,int options)7097 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
7098 const char *URL, const char *encoding, int options)
7099 {
7100 xmlParserInputPtr stream;
7101
7102 if (cur == NULL)
7103 return (NULL);
7104 if (ctxt == NULL)
7105 return (NULL);
7106 xmlInitParser();
7107
7108 htmlCtxtReset(ctxt);
7109
7110 stream = xmlNewStringInputStream(ctxt, cur);
7111 if (stream == NULL) {
7112 return (NULL);
7113 }
7114 inputPush(ctxt, stream);
7115 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7116 }
7117
7118 /**
7119 * htmlCtxtReadFile:
7120 * @ctxt: an HTML parser context
7121 * @filename: a file or URL
7122 * @encoding: the document encoding, or NULL
7123 * @options: a combination of htmlParserOption(s)
7124 *
7125 * parse an XML file from the filesystem or the network.
7126 * This reuses the existing @ctxt parser context
7127 *
7128 * Returns the resulting document tree
7129 */
7130 htmlDocPtr
htmlCtxtReadFile(htmlParserCtxtPtr ctxt,const char * filename,const char * encoding,int options)7131 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7132 const char *encoding, int options)
7133 {
7134 xmlParserInputPtr stream;
7135
7136 if (filename == NULL)
7137 return (NULL);
7138 if (ctxt == NULL)
7139 return (NULL);
7140 xmlInitParser();
7141
7142 htmlCtxtReset(ctxt);
7143
7144 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7145 if (stream == NULL) {
7146 return (NULL);
7147 }
7148 inputPush(ctxt, stream);
7149 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7150 }
7151
7152 /**
7153 * htmlCtxtReadMemory:
7154 * @ctxt: an HTML parser context
7155 * @buffer: a pointer to a char array
7156 * @size: the size of the array
7157 * @URL: the base URL to use for the document
7158 * @encoding: the document encoding, or NULL
7159 * @options: a combination of htmlParserOption(s)
7160 *
7161 * parse an XML in-memory document and build a tree.
7162 * This reuses the existing @ctxt parser context
7163 *
7164 * Returns the resulting document tree
7165 */
7166 htmlDocPtr
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt,const char * buffer,int size,const char * URL,const char * encoding,int options)7167 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7168 const char *URL, const char *encoding, int options)
7169 {
7170 xmlParserInputBufferPtr input;
7171 xmlParserInputPtr stream;
7172
7173 if (ctxt == NULL)
7174 return (NULL);
7175 if (buffer == NULL)
7176 return (NULL);
7177 xmlInitParser();
7178
7179 htmlCtxtReset(ctxt);
7180
7181 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7182 if (input == NULL) {
7183 return(NULL);
7184 }
7185
7186 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7187 if (stream == NULL) {
7188 xmlFreeParserInputBuffer(input);
7189 return(NULL);
7190 }
7191
7192 inputPush(ctxt, stream);
7193 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7194 }
7195
7196 /**
7197 * htmlCtxtReadFd:
7198 * @ctxt: an HTML parser context
7199 * @fd: an open file descriptor
7200 * @URL: the base URL to use for the document
7201 * @encoding: the document encoding, or NULL
7202 * @options: a combination of htmlParserOption(s)
7203 *
7204 * parse an XML from a file descriptor and build a tree.
7205 * This reuses the existing @ctxt parser context
7206 *
7207 * Returns the resulting document tree
7208 */
7209 htmlDocPtr
htmlCtxtReadFd(htmlParserCtxtPtr ctxt,int fd,const char * URL,const char * encoding,int options)7210 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7211 const char *URL, const char *encoding, int options)
7212 {
7213 xmlParserInputBufferPtr input;
7214 xmlParserInputPtr stream;
7215
7216 if (fd < 0)
7217 return (NULL);
7218 if (ctxt == NULL)
7219 return (NULL);
7220 xmlInitParser();
7221
7222 htmlCtxtReset(ctxt);
7223
7224
7225 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7226 if (input == NULL)
7227 return (NULL);
7228 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7229 if (stream == NULL) {
7230 xmlFreeParserInputBuffer(input);
7231 return (NULL);
7232 }
7233 inputPush(ctxt, stream);
7234 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7235 }
7236
7237 /**
7238 * htmlCtxtReadIO:
7239 * @ctxt: an HTML parser context
7240 * @ioread: an I/O read function
7241 * @ioclose: an I/O close function
7242 * @ioctx: an I/O handler
7243 * @URL: the base URL to use for the document
7244 * @encoding: the document encoding, or NULL
7245 * @options: a combination of htmlParserOption(s)
7246 *
7247 * parse an HTML document from I/O functions and source and build a tree.
7248 * This reuses the existing @ctxt parser context
7249 *
7250 * Returns the resulting document tree
7251 */
7252 htmlDocPtr
htmlCtxtReadIO(htmlParserCtxtPtr ctxt,xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)7253 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7254 xmlInputCloseCallback ioclose, void *ioctx,
7255 const char *URL,
7256 const char *encoding, int options)
7257 {
7258 xmlParserInputBufferPtr input;
7259 xmlParserInputPtr stream;
7260
7261 if (ioread == NULL)
7262 return (NULL);
7263 if (ctxt == NULL)
7264 return (NULL);
7265 xmlInitParser();
7266
7267 htmlCtxtReset(ctxt);
7268
7269 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7270 XML_CHAR_ENCODING_NONE);
7271 if (input == NULL) {
7272 if (ioclose != NULL)
7273 ioclose(ioctx);
7274 return (NULL);
7275 }
7276 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7277 if (stream == NULL) {
7278 xmlFreeParserInputBuffer(input);
7279 return (NULL);
7280 }
7281 inputPush(ctxt, stream);
7282 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7283 }
7284
7285 #define bottom_HTMLparser
7286 #include "elfgcchack.h"
7287 #endif /* LIBXML_HTML_ENABLED */
7288