1 /*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9 #define IN_LIBXML
10 #include "libxml.h"
11 #ifdef LIBXML_HTML_ENABLED
12
#include <limits.h>
#include <string.h>
#ifdef HAVE_CTYPE_H
#include <ctype.h>
#endif
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef LIBXML_ZLIB_ENABLED
#include <zlib.h>
#endif
32
33 #include <libxml/xmlmemory.h>
34 #include <libxml/tree.h>
35 #include <libxml/parser.h>
36 #include <libxml/parserInternals.h>
37 #include <libxml/xmlerror.h>
38 #include <libxml/HTMLparser.h>
39 #include <libxml/HTMLtree.h>
40 #include <libxml/entities.h>
41 #include <libxml/encoding.h>
42 #include <libxml/valid.h>
43 #include <libxml/xmlIO.h>
44 #include <libxml/globals.h>
45 #include <libxml/uri.h>
46
47 #include "buf.h"
48 #include "enc.h"
49
50 #define HTML_MAX_NAMELEN 1000
51 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
52 #define HTML_PARSER_BUFFER_SIZE 100
53
54 /* #define DEBUG */
55 /* #define DEBUG_PUSH */
56
57 static int htmlOmittedDefaultValue = 1;
58
59 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
60 xmlChar end, xmlChar end2, xmlChar end3);
61 static void htmlParseComment(htmlParserCtxtPtr ctxt);
62
63 /************************************************************************
64 * *
65 * Some factorized error routines *
66 * *
67 ************************************************************************/
68
69 /**
70 * htmlErrMemory:
71 * @ctxt: an HTML parser context
72 * @extra: extra information
73 *
74 * Handle a redefinition of attribute error
75 */
76 static void
htmlErrMemory(xmlParserCtxtPtr ctxt,const char * extra)77 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
78 {
79 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
80 (ctxt->instate == XML_PARSER_EOF))
81 return;
82 if (ctxt != NULL) {
83 ctxt->errNo = XML_ERR_NO_MEMORY;
84 ctxt->instate = XML_PARSER_EOF;
85 ctxt->disableSAX = 1;
86 }
87 if (extra)
88 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
89 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
90 NULL, NULL, 0, 0,
91 "Memory allocation failed : %s\n", extra);
92 else
93 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
94 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
95 NULL, NULL, 0, 0, "Memory allocation failed\n");
96 }
97
98 /**
99 * htmlParseErr:
100 * @ctxt: an HTML parser context
101 * @error: the error number
102 * @msg: the error message
103 * @str1: string infor
104 * @str2: string infor
105 *
106 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
107 */
108 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErr(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,const xmlChar * str1,const xmlChar * str2)109 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
110 const char *msg, const xmlChar *str1, const xmlChar *str2)
111 {
112 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
113 (ctxt->instate == XML_PARSER_EOF))
114 return;
115 if (ctxt != NULL)
116 ctxt->errNo = error;
117 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
118 XML_ERR_ERROR, NULL, 0,
119 (const char *) str1, (const char *) str2,
120 NULL, 0, 0,
121 msg, str1, str2);
122 if (ctxt != NULL)
123 ctxt->wellFormed = 0;
124 }
125
126 /**
127 * htmlParseErrInt:
128 * @ctxt: an HTML parser context
129 * @error: the error number
130 * @msg: the error message
131 * @val: integer info
132 *
133 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
134 */
135 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErrInt(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,int val)136 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
137 const char *msg, int val)
138 {
139 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
140 (ctxt->instate == XML_PARSER_EOF))
141 return;
142 if (ctxt != NULL)
143 ctxt->errNo = error;
144 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
145 XML_ERR_ERROR, NULL, 0, NULL, NULL,
146 NULL, val, 0, msg, val);
147 if (ctxt != NULL)
148 ctxt->wellFormed = 0;
149 }
150
151 /************************************************************************
152 * *
153 * Parser stacks related functions and macros *
154 * *
155 ************************************************************************/
156
157 /**
158 * htmlnamePush:
159 * @ctxt: an HTML parser context
160 * @value: the element name
161 *
162 * Pushes a new element name on top of the name stack
163 *
164 * Returns 0 in case of error, the index in the stack otherwise
165 */
166 static int
htmlnamePush(htmlParserCtxtPtr ctxt,const xmlChar * value)167 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
168 {
169 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
170 ctxt->html = 3;
171 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
172 ctxt->html = 10;
173 if (ctxt->nameNr >= ctxt->nameMax) {
174 ctxt->nameMax *= 2;
175 ctxt->nameTab = (const xmlChar * *)
176 xmlRealloc((xmlChar * *)ctxt->nameTab,
177 ctxt->nameMax *
178 sizeof(ctxt->nameTab[0]));
179 if (ctxt->nameTab == NULL) {
180 htmlErrMemory(ctxt, NULL);
181 return (0);
182 }
183 }
184 ctxt->nameTab[ctxt->nameNr] = value;
185 ctxt->name = value;
186 return (ctxt->nameNr++);
187 }
188 /**
189 * htmlnamePop:
190 * @ctxt: an HTML parser context
191 *
192 * Pops the top element name from the name stack
193 *
194 * Returns the name just removed
195 */
196 static const xmlChar *
htmlnamePop(htmlParserCtxtPtr ctxt)197 htmlnamePop(htmlParserCtxtPtr ctxt)
198 {
199 const xmlChar *ret;
200
201 if (ctxt->nameNr <= 0)
202 return (NULL);
203 ctxt->nameNr--;
204 if (ctxt->nameNr < 0)
205 return (NULL);
206 if (ctxt->nameNr > 0)
207 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
208 else
209 ctxt->name = NULL;
210 ret = ctxt->nameTab[ctxt->nameNr];
211 ctxt->nameTab[ctxt->nameNr] = NULL;
212 return (ret);
213 }
214
215 /**
216 * htmlNodeInfoPush:
217 * @ctxt: an HTML parser context
218 * @value: the node info
219 *
220 * Pushes a new element name on top of the node info stack
221 *
222 * Returns 0 in case of error, the index in the stack otherwise
223 */
224 static int
htmlNodeInfoPush(htmlParserCtxtPtr ctxt,htmlParserNodeInfo * value)225 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
226 {
227 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
228 if (ctxt->nodeInfoMax == 0)
229 ctxt->nodeInfoMax = 5;
230 ctxt->nodeInfoMax *= 2;
231 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
232 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
233 ctxt->nodeInfoMax *
234 sizeof(ctxt->nodeInfoTab[0]));
235 if (ctxt->nodeInfoTab == NULL) {
236 htmlErrMemory(ctxt, NULL);
237 return (0);
238 }
239 }
240 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
241 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
242 return (ctxt->nodeInfoNr++);
243 }
244
245 /**
246 * htmlNodeInfoPop:
247 * @ctxt: an HTML parser context
248 *
249 * Pops the top element name from the node info stack
250 *
251 * Returns 0 in case of error, the pointer to NodeInfo otherwise
252 */
253 static htmlParserNodeInfo *
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)254 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
255 {
256 if (ctxt->nodeInfoNr <= 0)
257 return (NULL);
258 ctxt->nodeInfoNr--;
259 if (ctxt->nodeInfoNr < 0)
260 return (NULL);
261 if (ctxt->nodeInfoNr > 0)
262 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
263 else
264 ctxt->nodeInfo = NULL;
265 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
266 }
267
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. They all expect a variable named `ctxt` (the parser
 * context) to be in scope.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use
 * them; they work on raw bytes and should only be compared to ASCII values:
 *
 *   CUR_PTR  the current pointer to the xmlChar to be parsed.
 *   BASE_PTR the base pointer of the current input.
 *   CUR      the current xmlChar value, as an int.
 *   RAW      like CUR, but -1 while ctxt->token is set.
 *   NXT(n)   the n'th next xmlChar.
 *   UPPER    the current xmlChar converted to uppercase.
 *   UPP(n)   the n'th next xmlChar converted to uppercase.
 *   SKIP(n)  skip n xmlChar, advancing the column by n; must only be used
 *            to skip ASCII strings without newlines.
 *
 * Clean macros, not dependent on an ASCII context:
 *
 *   CURRENT  the current xmlChar value, as an int.
 *   NEXT     skip to the next character through xmlNextChar().
 *   NEXTL(l) skip the current character of l xmlChars long, updating
 *            line/column and clearing ctxt->token.
 *   CUR_CHAR(l)     decode the current char, storing its length in l.
 *   CUR_SCHAR(s, l) decode the char at s, storing its length in l.
 *   COPY_BUF(l,b,i,v) append char v of encoded length l to b at index i.
 *
 * SHRINK, GROW and COPY_BUF expand to bare if-statements: always use them
 * as a full statement followed by ';' and never as the unbraced body of
 * an if that has an else.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) \
    (ctxt->input->cur += (val), ctxt->input->col += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
                   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) && \
                 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NEXTL(l) do { \
    if (*(ctxt->input->cur) == '\n') { \
        ctxt->input->line++; ctxt->input->col = 1; \
    } else ctxt->input->col++; \
    ctxt->token = 0; ctxt->input->cur += (l); \
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v) \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v); \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v))
348
349 /**
350 * htmlFindEncoding:
351 * @the HTML parser context
352 *
353 * Ty to find and encoding in the current data available in the input
354 * buffer this is needed to try to switch to the proper encoding when
355 * one face a character error.
356 * That's an heuristic, since it's operating outside of parsing it could
357 * try to use a meta which had been commented out, that's the reason it
358 * should only be used in case of error, not as a default.
359 *
360 * Returns an encoding string or NULL if not found, the string need to
361 * be freed
362 */
363 static xmlChar *
htmlFindEncoding(xmlParserCtxtPtr ctxt)364 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
365 const xmlChar *start, *cur, *end;
366
367 if ((ctxt == NULL) || (ctxt->input == NULL) ||
368 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
369 (ctxt->input->buf->encoder != NULL))
370 return(NULL);
371 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
372 return(NULL);
373
374 start = ctxt->input->cur;
375 end = ctxt->input->end;
376 /* we also expect the input buffer to be zero terminated */
377 if (*end != 0)
378 return(NULL);
379
380 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
381 if (cur == NULL)
382 return(NULL);
383 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
384 if (cur == NULL)
385 return(NULL);
386 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
387 if (cur == NULL)
388 return(NULL);
389 cur += 8;
390 start = cur;
391 while (((*cur >= 'A') && (*cur <= 'Z')) ||
392 ((*cur >= 'a') && (*cur <= 'z')) ||
393 ((*cur >= '0') && (*cur <= '9')) ||
394 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
395 cur++;
396 if (cur == start)
397 return(NULL);
398 return(xmlStrndup(start, cur - start));
399 }
400
401 /**
402 * htmlCurrentChar:
403 * @ctxt: the HTML parser context
404 * @len: pointer to the length of the char read
405 *
406 * The current char value, if using UTF-8 this may actually span multiple
407 * bytes in the input buffer. Implement the end of line normalization:
408 * 2.11 End-of-Line Handling
409 * If the encoding is unspecified, in the case we find an ISO-Latin-1
410 * char, then the encoding converter is plugged in automatically.
411 *
412 * Returns the current char value and its length
413 */
414
415 static int
htmlCurrentChar(xmlParserCtxtPtr ctxt,int * len)416 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
417 const unsigned char *cur;
418 unsigned char c;
419 unsigned int val;
420
421 if (ctxt->instate == XML_PARSER_EOF)
422 return(0);
423
424 if (ctxt->token != 0) {
425 *len = 0;
426 return(ctxt->token);
427 }
428 if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
429 xmlChar * guess;
430 xmlCharEncodingHandlerPtr handler;
431
432 /*
433 * Assume it's a fixed length encoding (1) with
434 * a compatible encoding for the ASCII set, since
435 * HTML constructs only use < 128 chars
436 */
437 if ((int) *ctxt->input->cur < 0x80) {
438 *len = 1;
439 if ((*ctxt->input->cur == 0) &&
440 (ctxt->input->cur < ctxt->input->end)) {
441 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
442 "Char 0x%X out of allowed range\n", 0);
443 return(' ');
444 }
445 return((int) *ctxt->input->cur);
446 }
447
448 /*
449 * Humm this is bad, do an automatic flow conversion
450 */
451 guess = htmlFindEncoding(ctxt);
452 if (guess == NULL) {
453 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
454 } else {
455 if (ctxt->input->encoding != NULL)
456 xmlFree((xmlChar *) ctxt->input->encoding);
457 ctxt->input->encoding = guess;
458 handler = xmlFindCharEncodingHandler((const char *) guess);
459 if (handler != NULL) {
460 /*
461 * Don't use UTF-8 encoder which isn't required and
462 * can produce invalid UTF-8.
463 */
464 if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
465 xmlSwitchToEncoding(ctxt, handler);
466 } else {
467 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
468 "Unsupported encoding %s", guess, NULL);
469 }
470 }
471 ctxt->charset = XML_CHAR_ENCODING_UTF8;
472 }
473
474 /*
475 * We are supposed to handle UTF8, check it's valid
476 * From rfc2044: encoding of the Unicode values on UTF-8:
477 *
478 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
479 * 0000 0000-0000 007F 0xxxxxxx
480 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
481 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
482 *
483 * Check for the 0x110000 limit too
484 */
485 cur = ctxt->input->cur;
486 c = *cur;
487 if (c & 0x80) {
488 if ((c & 0x40) == 0)
489 goto encoding_error;
490 if (cur[1] == 0) {
491 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
492 cur = ctxt->input->cur;
493 }
494 if ((cur[1] & 0xc0) != 0x80)
495 goto encoding_error;
496 if ((c & 0xe0) == 0xe0) {
497
498 if (cur[2] == 0) {
499 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
500 cur = ctxt->input->cur;
501 }
502 if ((cur[2] & 0xc0) != 0x80)
503 goto encoding_error;
504 if ((c & 0xf0) == 0xf0) {
505 if (cur[3] == 0) {
506 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
507 cur = ctxt->input->cur;
508 }
509 if (((c & 0xf8) != 0xf0) ||
510 ((cur[3] & 0xc0) != 0x80))
511 goto encoding_error;
512 /* 4-byte code */
513 *len = 4;
514 val = (cur[0] & 0x7) << 18;
515 val |= (cur[1] & 0x3f) << 12;
516 val |= (cur[2] & 0x3f) << 6;
517 val |= cur[3] & 0x3f;
518 if (val < 0x10000)
519 goto encoding_error;
520 } else {
521 /* 3-byte code */
522 *len = 3;
523 val = (cur[0] & 0xf) << 12;
524 val |= (cur[1] & 0x3f) << 6;
525 val |= cur[2] & 0x3f;
526 if (val < 0x800)
527 goto encoding_error;
528 }
529 } else {
530 /* 2-byte code */
531 *len = 2;
532 val = (cur[0] & 0x1f) << 6;
533 val |= cur[1] & 0x3f;
534 if (val < 0x80)
535 goto encoding_error;
536 }
537 if (!IS_CHAR(val)) {
538 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
539 "Char 0x%X out of allowed range\n", val);
540 }
541 return(val);
542 } else {
543 if ((*ctxt->input->cur == 0) &&
544 (ctxt->input->cur < ctxt->input->end)) {
545 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
546 "Char 0x%X out of allowed range\n", 0);
547 *len = 1;
548 return(' ');
549 }
550 /* 1-byte code */
551 *len = 1;
552 return((int) *ctxt->input->cur);
553 }
554
555 encoding_error:
556 /*
557 * If we detect an UTF8 error that probably mean that the
558 * input encoding didn't get properly advertised in the
559 * declaration header. Report the error and switch the encoding
560 * to ISO-Latin-1 (if you don't like this policy, just declare the
561 * encoding !)
562 */
563 {
564 char buffer[150];
565
566 if (ctxt->input->end - ctxt->input->cur >= 4) {
567 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
568 ctxt->input->cur[0], ctxt->input->cur[1],
569 ctxt->input->cur[2], ctxt->input->cur[3]);
570 } else {
571 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
572 }
573 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
574 "Input is not proper UTF-8, indicate encoding !\n",
575 BAD_CAST buffer, NULL);
576 }
577
578 /*
579 * Don't switch encodings twice. Note that if there's an encoder, we
580 * shouldn't receive invalid UTF-8 anyway.
581 *
582 * Note that if ctxt->input->buf == NULL, switching encodings is
583 * impossible, see Gitlab issue #34.
584 */
585 if ((ctxt->input->buf != NULL) &&
586 (ctxt->input->buf->encoder == NULL))
587 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
588 *len = 1;
589 return((int) *ctxt->input->cur);
590 }
591
592 /**
593 * htmlSkipBlankChars:
594 * @ctxt: the HTML parser context
595 *
596 * skip all blanks character found at that point in the input streams.
597 *
598 * Returns the number of space chars skipped
599 */
600
601 static int
htmlSkipBlankChars(xmlParserCtxtPtr ctxt)602 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
603 int res = 0;
604
605 while (IS_BLANK_CH(*(ctxt->input->cur))) {
606 if ((*ctxt->input->cur == 0) &&
607 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
608 xmlPopInput(ctxt);
609 } else {
610 if (*(ctxt->input->cur) == '\n') {
611 ctxt->input->line++; ctxt->input->col = 1;
612 } else ctxt->input->col++;
613 ctxt->input->cur++;
614 if (*ctxt->input->cur == 0)
615 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
616 }
617 res++;
618 }
619 return(res);
620 }
621
622
623
624 /************************************************************************
625 * *
626 * The list of HTML elements and their properties *
627 * *
628 ************************************************************************/
629
630 /*
631 * Start Tag: 1 means the start tag can be omitted
632 * End Tag: 1 means the end tag can be omitted
633 * 2 means it's forbidden (empty elements)
634 * 3 means the tag is stylistic and should be closed easily
635 * Depr: this element is deprecated
636 * DTD: 1 means that this element is valid only in the Loose DTD
637 * 2 means that this element is valid only in the Frameset DTD
638 *
639 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
640 , subElements , impliedsubelt , Attributes, userdata
641 */
642
/*
 * Definitions and a couple of vars for HTML Elements.
 *
 * Each element class is a comma-separated list of element names, paired
 * with an NB_* macro giving the number of names in it. The NB_* sums are
 * parenthesized so they can be used safely inside larger expressions.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL-terminated lists of the flow and inline element names */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
676
677
/*
 * ... and for HTML Attributes.
 *
 * Each attribute class is a comma-separated list of attribute names,
 * paired with an NB_* macro giving the number of names in it.
 */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL-terminated lists built from the attribute classes above */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
697
698
699 /* Other declarations that should go inline ... */
700 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
701 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
702 "tabindex", "onfocus", "onblur", NULL } ;
703 static const char* const target_attr[] = { "target", NULL } ;
704 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
705 static const char* const alt_attr[] = { "alt", NULL } ;
706 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
707 static const char* const href_attrs[] = { "href", NULL } ;
708 static const char* const clear_attrs[] = { "clear", NULL } ;
709 static const char* const inline_p[] = { INLINE, "p", NULL } ;
710
711 static const char* const flow_param[] = { FLOW, "param", NULL } ;
712 static const char* const applet_attrs[] = { COREATTRS , "codebase",
713 "archive", "alt", "name", "height", "width", "align",
714 "hspace", "vspace", NULL } ;
715 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
716 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
717 static const char* const basefont_attrs[] =
718 { "id", "size", "color", "face", NULL } ;
719 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
720 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
721 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
722 static const char* const body_depr[] = { "background", "bgcolor", "text",
723 "link", "vlink", "alink", NULL } ;
724 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
725 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
726
727
728 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
729 static const char* const col_elt[] = { "col", NULL } ;
730 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
731 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
732 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
733 static const char* const compact_attr[] = { "compact", NULL } ;
734 static const char* const label_attr[] = { "label", NULL } ;
735 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
736 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
737 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
738 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
739 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
740 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
741 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
742 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
743 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
744 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
745 static const char* const version_attr[] = { "version", NULL } ;
746 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
747 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
748 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
749 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
750 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
751 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
752 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
753 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
754 static const char* const align_attr[] = { "align", NULL } ;
755 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
756 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
757 static const char* const name_attr[] = { "name", NULL } ;
758 static const char* const action_attr[] = { "action", NULL } ;
759 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
760 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
761 static const char* const content_attr[] = { "content", NULL } ;
762 static const char* const type_attr[] = { "type", NULL } ;
763 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
764 static const char* const object_contents[] = { FLOW, "param", NULL } ;
765 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
766 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
767 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
768 static const char* const option_elt[] = { "option", NULL } ;
769 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
770 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
771 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
772 static const char* const width_attr[] = { "width", NULL } ;
773 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
774 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
775 static const char* const language_attr[] = { "language", NULL } ;
776 static const char* const select_content[] = { "optgroup", "option", NULL } ;
777 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
778 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
779 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
780 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
781 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
782 static const char* const tr_elt[] = { "tr", NULL } ;
783 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
784 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
785 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
786 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
787 static const char* const tr_contents[] = { "th", "td", NULL } ;
788 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
789 static const char* const li_elt[] = { "li", NULL } ;
790 static const char* const ul_depr[] = { "type", "compact", NULL} ;
791 static const char* const dir_attr[] = { "dir", NULL} ;
792
793 #define DECL (const char**)
794
795 static const htmlElemDesc
796 html40ElementTable[] = {
797 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
798 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
799 },
800 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
801 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
802 },
803 { "acronym", 0, 0, 0, 0, 0, 0, 1, "",
804 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
805 },
806 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
807 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
808 },
809 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
810 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
811 },
812 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
813 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
814 },
815 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
816 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
817 },
818 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
819 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
820 },
821 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
822 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
823 },
824 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
825 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
826 },
827 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
828 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
829 },
830 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
831 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
832 },
833 { "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
834 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
835 },
836 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
837 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
838 },
839 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
840 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
841 },
842 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
843 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
844 },
845 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
846 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
847 },
848 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
849 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
850 },
851 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
852 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
853 },
854 { "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
855 EMPTY , NULL , DECL col_attrs , NULL, NULL
856 },
857 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
858 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
859 },
860 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
861 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
862 },
863 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
864 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
865 },
866 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
867 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
868 },
869 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
870 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
871 },
872 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
873 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
874 },
875 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
876 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
877 },
878 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
879 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
880 },
881 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
882 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
883 },
884 { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
885 EMPTY, NULL, DECL embed_attrs, NULL, NULL
886 },
887 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
888 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
889 },
890 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
891 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
892 },
893 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
894 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
895 },
896 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
897 EMPTY, NULL, NULL, DECL frame_attrs, NULL
898 },
899 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
900 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
901 },
902 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
903 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
904 },
905 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
906 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
907 },
908 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
909 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
910 },
911 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
912 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
913 },
914 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
915 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
916 },
917 { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
918 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
919 },
920 { "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
921 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
922 },
923 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
924 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
925 },
926 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
927 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
928 },
929 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
930 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
931 },
932 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
933 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
934 },
935 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
936 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
937 },
938 { "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
939 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
940 },
941 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
942 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
943 },
944 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
945 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
946 },
947 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
948 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
949 },
950 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
951 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
952 },
953 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
954 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
955 },
956 { "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
957 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
958 },
959 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
960 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
961 },
962 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
963 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
964 },
965 { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
966 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
967 },
968 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
969 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
970 },
971 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
972 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
973 },
974 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
975 DECL html_flow, "div", DECL html_attrs, NULL, NULL
976 },
977 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
978 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
979 },
980 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
981 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
982 },
983 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
984 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
985 },
986 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
987 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
988 },
989 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
990 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
991 },
992 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
993 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
994 },
995 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
996 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
997 },
998 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
999 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
1000 },
1001 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
1002 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1003 },
1004 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
1005 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1006 },
1007 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
1008 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
1009 },
1010 { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
1011 DECL select_content, NULL, DECL select_attrs, NULL, NULL
1012 },
1013 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
1014 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1015 },
1016 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
1017 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1018 },
1019 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
1020 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1021 },
1022 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
1023 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1024 },
1025 { "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
1026 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1027 },
1028 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
1029 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1030 },
1031 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
1032 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1033 },
1034 { "table", 0, 0, 0, 0, 0, 0, 0, "",
1035 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1036 },
1037 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
1038 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1039 },
1040 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
1041 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1042 },
1043 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1044 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1045 },
1046 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
1047 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1048 },
1049 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
1050 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1051 },
1052 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
1053 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1054 },
1055 { "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
1056 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1057 },
1058 { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
1059 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1060 },
1061 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1062 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1063 },
1064 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
1065 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1066 },
1067 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
1068 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1069 },
1070 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1071 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1072 }
1073 };
1074
/* One auto-close rule: opening newTag ends a currently open oldTag. */
typedef struct {
    const char *oldTag;
    const char *newTag;
} htmlStartCloseEntry;

/*
 * Start tags that imply the end of the current element.
 *
 * Rows are grouped by oldTag and MUST remain sorted by (oldTag, newTag)
 * in strcmp() order, because htmlCheckAutoClose() looks rules up with
 * bsearch().
 */
static const htmlStartCloseEntry htmlStartClose[] = {
    /* a */
    { "a", "a" }, { "a", "fieldset" }, { "a", "table" }, { "a", "td" },
    { "a", "th" },
    /* address */
    { "address", "dd" }, { "address", "dl" }, { "address", "dt" },
    { "address", "form" }, { "address", "li" }, { "address", "ul" },
    /* b, big */
    { "b", "center" }, { "b", "p" }, { "b", "td" }, { "b", "th" },
    { "big", "p" },
    /* caption */
    { "caption", "col" }, { "caption", "colgroup" }, { "caption", "tbody" },
    { "caption", "tfoot" }, { "caption", "thead" }, { "caption", "tr" },
    /* col */
    { "col", "col" }, { "col", "colgroup" }, { "col", "tbody" },
    { "col", "tfoot" }, { "col", "thead" }, { "col", "tr" },
    /* colgroup */
    { "colgroup", "colgroup" }, { "colgroup", "tbody" },
    { "colgroup", "tfoot" }, { "colgroup", "thead" }, { "colgroup", "tr" },
    /* dd, dir, dl, dt */
    { "dd", "dt" },
    { "dir", "dd" }, { "dir", "dl" }, { "dir", "dt" }, { "dir", "form" },
    { "dir", "ul" },
    { "dl", "form" }, { "dl", "li" },
    { "dt", "dd" }, { "dt", "dl" },
    /* font, form */
    { "font", "center" }, { "font", "td" }, { "font", "th" },
    { "form", "form" },
    /* h1 .. h6 */
    { "h1", "fieldset" }, { "h1", "form" }, { "h1", "li" }, { "h1", "p" },
    { "h1", "table" },
    { "h2", "fieldset" }, { "h2", "form" }, { "h2", "li" }, { "h2", "p" },
    { "h2", "table" },
    { "h3", "fieldset" }, { "h3", "form" }, { "h3", "li" }, { "h3", "p" },
    { "h3", "table" },
    { "h4", "fieldset" }, { "h4", "form" }, { "h4", "li" }, { "h4", "p" },
    { "h4", "table" },
    { "h5", "fieldset" }, { "h5", "form" }, { "h5", "li" }, { "h5", "p" },
    { "h5", "table" },
    { "h6", "fieldset" }, { "h6", "form" }, { "h6", "li" }, { "h6", "p" },
    { "h6", "table" },
    /* head */
    { "head", "a" }, { "head", "abbr" }, { "head", "acronym" },
    { "head", "address" }, { "head", "b" }, { "head", "bdo" },
    { "head", "big" }, { "head", "blockquote" }, { "head", "body" },
    { "head", "br" }, { "head", "center" }, { "head", "cite" },
    { "head", "code" }, { "head", "dd" }, { "head", "dfn" },
    { "head", "dir" }, { "head", "div" }, { "head", "dl" },
    { "head", "dt" }, { "head", "em" }, { "head", "fieldset" },
    { "head", "font" }, { "head", "form" }, { "head", "frameset" },
    { "head", "h1" }, { "head", "h2" }, { "head", "h3" },
    { "head", "h4" }, { "head", "h5" }, { "head", "h6" },
    { "head", "hr" }, { "head", "i" }, { "head", "iframe" },
    { "head", "img" }, { "head", "kbd" }, { "head", "li" },
    { "head", "listing" }, { "head", "map" }, { "head", "menu" },
    { "head", "ol" }, { "head", "p" }, { "head", "pre" },
    { "head", "q" }, { "head", "s" }, { "head", "samp" },
    { "head", "small" }, { "head", "span" }, { "head", "strike" },
    { "head", "strong" }, { "head", "sub" }, { "head", "sup" },
    { "head", "table" }, { "head", "tt" }, { "head", "u" },
    { "head", "ul" }, { "head", "var" }, { "head", "xmp" },
    /* hr, i, legend, li, link */
    { "hr", "form" },
    { "i", "center" }, { "i", "p" }, { "i", "td" }, { "i", "th" },
    { "legend", "fieldset" },
    { "li", "li" },
    { "link", "body" }, { "link", "frameset" },
    /* listing */
    { "listing", "dd" }, { "listing", "dl" }, { "listing", "dt" },
    { "listing", "fieldset" }, { "listing", "form" }, { "listing", "li" },
    { "listing", "table" }, { "listing", "ul" },
    /* menu, ol, option */
    { "menu", "dd" }, { "menu", "dl" }, { "menu", "dt" }, { "menu", "form" },
    { "menu", "ul" },
    { "ol", "form" }, { "ol", "ul" },
    { "option", "optgroup" }, { "option", "option" },
    /* p */
    { "p", "address" }, { "p", "blockquote" }, { "p", "body" },
    { "p", "caption" }, { "p", "center" }, { "p", "col" },
    { "p", "colgroup" }, { "p", "dd" }, { "p", "dir" },
    { "p", "div" }, { "p", "dl" }, { "p", "dt" },
    { "p", "fieldset" }, { "p", "form" }, { "p", "frameset" },
    { "p", "h1" }, { "p", "h2" }, { "p", "h3" },
    { "p", "h4" }, { "p", "h5" }, { "p", "h6" },
    { "p", "head" }, { "p", "hr" }, { "p", "li" },
    { "p", "listing" }, { "p", "menu" }, { "p", "ol" },
    { "p", "p" }, { "p", "pre" }, { "p", "table" },
    { "p", "tbody" }, { "p", "td" }, { "p", "tfoot" },
    { "p", "th" }, { "p", "title" }, { "p", "tr" },
    { "p", "ul" }, { "p", "xmp" },
    /* pre */
    { "pre", "dd" }, { "pre", "dl" }, { "pre", "dt" }, { "pre", "fieldset" },
    { "pre", "form" }, { "pre", "li" }, { "pre", "table" }, { "pre", "ul" },
    /* s, script, small, span, strike, style */
    { "s", "p" },
    { "script", "noscript" },
    { "small", "p" },
    { "span", "td" }, { "span", "th" },
    { "strike", "p" },
    { "style", "body" }, { "style", "frameset" },
    /* table sections, rows and cells */
    { "tbody", "tbody" }, { "tbody", "tfoot" },
    { "td", "tbody" }, { "td", "td" }, { "td", "tfoot" }, { "td", "th" },
    { "td", "tr" },
    { "tfoot", "tbody" },
    { "th", "tbody" }, { "th", "td" }, { "th", "tfoot" }, { "th", "th" },
    { "th", "tr" },
    { "thead", "tbody" }, { "thead", "tfoot" },
    /* title, tr, tt, u, ul */
    { "title", "body" }, { "title", "frameset" },
    { "tr", "tbody" }, { "tr", "tfoot" }, { "tr", "tr" },
    { "tt", "p" },
    { "u", "p" }, { "u", "td" }, { "u", "th" },
    { "ul", "address" }, { "ul", "form" }, { "ul", "menu" }, { "ul", "ol" },
    { "ul", "pre" },
    /* xmp */
    { "xmp", "dd" }, { "xmp", "dl" }, { "xmp", "dt" }, { "xmp", "fieldset" },
    { "xmp", "form" }, { "xmp", "li" }, { "xmp", "table" }, { "xmp", "ul" }
};
1336
/*
 * NULL-terminated list of HTML elements that are not supposed to hold
 * character data directly; htmlCheckParagraph() implies a p element
 * when text shows up while one of them is the current element.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html", "head",
    NULL
};
1349
/*
 * HTML attributes whose content is of type %Script;.
 *
 * NOTE: htmlIsScriptAttribute() rejects any name that does not start
 * with "on" before consulting this list, so every entry added here must
 * keep that prefix. The array has no NULL terminator; its length is
 * taken with sizeof.
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse events */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard events */
    "onkeypress", "onkeydown", "onkeyup",
    /* document and form events */
    "onload", "onunload", "onfocus", "onblur",
    "onsubmit", "onreset", "onchange", "onselect"
};
1375
/*
 * End-tag priorities used when recovering from broken HTML pages.
 * An end tag may only close open elements whose priority is lower than
 * or equal to its own; see htmlAutoCloseOnClose().
 */

/* Priority attached to one element name. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * Scanned linearly by htmlGetEndPriority(). The final entry, whose name
 * is NULL, ends the scan and supplies the priority of every element
 * that is not listed.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
1403
1404 /************************************************************************
1405 * *
1406 * functions to handle HTML specific data *
1407 * *
1408 ************************************************************************/
1409
/**
 * htmlInitAutoClose:
 *
 * Does nothing: the function body is empty.
 */
void
htmlInitAutoClose(void)
{
    /* intentionally empty */
}
1418
1419 static int
htmlCompareTags(const void * key,const void * member)1420 htmlCompareTags(const void *key, const void *member) {
1421 const xmlChar *tag = (const xmlChar *) key;
1422 const htmlElemDesc *desc = (const htmlElemDesc *) member;
1423
1424 return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1425 }
1426
1427 /**
1428 * htmlTagLookup:
1429 * @tag: The tag name in lowercase
1430 *
1431 * Lookup the HTML tag in the ElementTable
1432 *
1433 * Returns the related htmlElemDescPtr or NULL if not found.
1434 */
1435 const htmlElemDesc *
htmlTagLookup(const xmlChar * tag)1436 htmlTagLookup(const xmlChar *tag) {
1437 if (tag == NULL)
1438 return(NULL);
1439
1440 return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1441 sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1442 sizeof(htmlElemDesc), htmlCompareTags));
1443 }
1444
1445 /**
1446 * htmlGetEndPriority:
1447 * @name: The name of the element to look up the priority for.
1448 *
1449 * Return value: The "endtag" priority.
1450 **/
1451 static int
htmlGetEndPriority(const xmlChar * name)1452 htmlGetEndPriority (const xmlChar *name) {
1453 int i = 0;
1454
1455 while ((htmlEndPriority[i].name != NULL) &&
1456 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1457 i++;
1458
1459 return(htmlEndPriority[i].priority);
1460 }
1461
1462
1463 static int
htmlCompareStartClose(const void * vkey,const void * member)1464 htmlCompareStartClose(const void *vkey, const void *member) {
1465 const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1466 const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1467 int ret;
1468
1469 ret = strcmp(key->oldTag, entry->oldTag);
1470 if (ret == 0)
1471 ret = strcmp(key->newTag, entry->newTag);
1472
1473 return(ret);
1474 }
1475
1476 /**
1477 * htmlCheckAutoClose:
1478 * @newtag: The new tag name
1479 * @oldtag: The old tag name
1480 *
1481 * Checks whether the new tag is one of the registered valid tags for
1482 * closing old.
1483 *
1484 * Returns 0 if no, 1 if yes.
1485 */
1486 static int
htmlCheckAutoClose(const xmlChar * newtag,const xmlChar * oldtag)1487 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1488 {
1489 htmlStartCloseEntry key;
1490 void *res;
1491
1492 key.oldTag = (const char *) oldtag;
1493 key.newTag = (const char *) newtag;
1494 res = bsearch(&key, htmlStartClose,
1495 sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1496 sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1497 return(res != NULL);
1498 }
1499
1500 /**
1501 * htmlAutoCloseOnClose:
1502 * @ctxt: an HTML parser context
1503 * @newtag: The new tag name
1504 * @force: force the tag closure
1505 *
1506 * The HTML DTD allows an ending tag to implicitly close other tags.
1507 */
1508 static void
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1509 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1510 {
1511 const htmlElemDesc *info;
1512 int i, priority;
1513
1514 priority = htmlGetEndPriority(newtag);
1515
1516 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1517
1518 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1519 break;
1520 /*
1521 * A misplaced endtag can only close elements with lower
1522 * or equal priority, so if we find an element with higher
1523 * priority before we find an element with
1524 * matching name, we just ignore this endtag
1525 */
1526 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1527 return;
1528 }
1529 if (i < 0)
1530 return;
1531
1532 while (!xmlStrEqual(newtag, ctxt->name)) {
1533 info = htmlTagLookup(ctxt->name);
1534 if ((info != NULL) && (info->endTag == 3)) {
1535 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1536 "Opening and ending tag mismatch: %s and %s\n",
1537 newtag, ctxt->name);
1538 }
1539 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1540 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1541 htmlnamePop(ctxt);
1542 }
1543 }
1544
1545 /**
1546 * htmlAutoCloseOnEnd:
1547 * @ctxt: an HTML parser context
1548 *
1549 * Close all remaining tags at the end of the stream
1550 */
1551 static void
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)1552 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1553 {
1554 int i;
1555
1556 if (ctxt->nameNr == 0)
1557 return;
1558 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1559 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1560 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1561 htmlnamePop(ctxt);
1562 }
1563 }
1564
1565 /**
1566 * htmlAutoClose:
1567 * @ctxt: an HTML parser context
1568 * @newtag: The new tag name or NULL
1569 *
1570 * The HTML DTD allows a tag to implicitly close other tags.
1571 * The list is kept in htmlStartClose array. This function is
1572 * called when a new tag has been detected and generates the
1573 * appropriates closes if possible/needed.
1574 * If newtag is NULL this mean we are at the end of the resource
1575 * and we should check
1576 */
1577 static void
htmlAutoClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1578 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1579 {
1580 while ((newtag != NULL) && (ctxt->name != NULL) &&
1581 (htmlCheckAutoClose(newtag, ctxt->name))) {
1582 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1583 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1584 htmlnamePop(ctxt);
1585 }
1586 if (newtag == NULL) {
1587 htmlAutoCloseOnEnd(ctxt);
1588 return;
1589 }
1590 while ((newtag == NULL) && (ctxt->name != NULL) &&
1591 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1592 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1593 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1594 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1595 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1596 htmlnamePop(ctxt);
1597 }
1598 }
1599
1600 /**
1601 * htmlAutoCloseTag:
1602 * @doc: the HTML document
1603 * @name: The tag name
1604 * @elem: the HTML element
1605 *
1606 * The HTML DTD allows a tag to implicitly close other tags.
1607 * The list is kept in htmlStartClose array. This function checks
1608 * if the element or one of it's children would autoclose the
1609 * given tag.
1610 *
1611 * Returns 1 if autoclose, 0 otherwise
1612 */
1613 int
htmlAutoCloseTag(htmlDocPtr doc,const xmlChar * name,htmlNodePtr elem)1614 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1615 htmlNodePtr child;
1616
1617 if (elem == NULL) return(1);
1618 if (xmlStrEqual(name, elem->name)) return(0);
1619 if (htmlCheckAutoClose(elem->name, name)) return(1);
1620 child = elem->children;
1621 while (child != NULL) {
1622 if (htmlAutoCloseTag(doc, name, child)) return(1);
1623 child = child->next;
1624 }
1625 return(0);
1626 }
1627
1628 /**
1629 * htmlIsAutoClosed:
1630 * @doc: the HTML document
1631 * @elem: the HTML element
1632 *
1633 * The HTML DTD allows a tag to implicitly close other tags.
1634 * The list is kept in htmlStartClose array. This function checks
1635 * if a tag is autoclosed by one of it's child
1636 *
1637 * Returns 1 if autoclosed, 0 otherwise
1638 */
1639 int
htmlIsAutoClosed(htmlDocPtr doc,htmlNodePtr elem)1640 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1641 htmlNodePtr child;
1642
1643 if (elem == NULL) return(1);
1644 child = elem->children;
1645 while (child != NULL) {
1646 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1647 child = child->next;
1648 }
1649 return(0);
1650 }
1651
1652 /**
1653 * htmlCheckImplied:
1654 * @ctxt: an HTML parser context
1655 * @newtag: The new tag name
1656 *
1657 * The HTML DTD allows a tag to exists only implicitly
1658 * called when a new tag has been detected and generates the
1659 * appropriates implicit tags if missing
1660 */
1661 static void
htmlCheckImplied(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1662 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1663 int i;
1664
1665 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1666 return;
1667 if (!htmlOmittedDefaultValue)
1668 return;
1669 if (xmlStrEqual(newtag, BAD_CAST"html"))
1670 return;
1671 if (ctxt->nameNr <= 0) {
1672 htmlnamePush(ctxt, BAD_CAST"html");
1673 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1674 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1675 }
1676 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1677 return;
1678 if ((ctxt->nameNr <= 1) &&
1679 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1680 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1681 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1682 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1683 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1684 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1685 if (ctxt->html >= 3) {
1686 /* we already saw or generated an <head> before */
1687 return;
1688 }
1689 /*
1690 * dropped OBJECT ... i you put it first BODY will be
1691 * assumed !
1692 */
1693 htmlnamePush(ctxt, BAD_CAST"head");
1694 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1695 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1696 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1697 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1698 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1699 if (ctxt->html >= 10) {
1700 /* we already saw or generated a <body> before */
1701 return;
1702 }
1703 for (i = 0;i < ctxt->nameNr;i++) {
1704 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1705 return;
1706 }
1707 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1708 return;
1709 }
1710 }
1711
1712 htmlnamePush(ctxt, BAD_CAST"body");
1713 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1714 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1715 }
1716 }
1717
1718 /**
1719 * htmlCheckParagraph
1720 * @ctxt: an HTML parser context
1721 *
1722 * Check whether a p element need to be implied before inserting
1723 * characters in the current element.
1724 *
1725 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1726 * in case of error.
1727 */
1728
1729 static int
htmlCheckParagraph(htmlParserCtxtPtr ctxt)1730 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1731 const xmlChar *tag;
1732 int i;
1733
1734 if (ctxt == NULL)
1735 return(-1);
1736 tag = ctxt->name;
1737 if (tag == NULL) {
1738 htmlAutoClose(ctxt, BAD_CAST"p");
1739 htmlCheckImplied(ctxt, BAD_CAST"p");
1740 htmlnamePush(ctxt, BAD_CAST"p");
1741 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1742 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1743 return(1);
1744 }
1745 if (!htmlOmittedDefaultValue)
1746 return(0);
1747 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1748 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1749 htmlAutoClose(ctxt, BAD_CAST"p");
1750 htmlCheckImplied(ctxt, BAD_CAST"p");
1751 htmlnamePush(ctxt, BAD_CAST"p");
1752 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1753 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1754 return(1);
1755 }
1756 }
1757 return(0);
1758 }
1759
1760 /**
1761 * htmlIsScriptAttribute:
1762 * @name: an attribute name
1763 *
1764 * Check if an attribute is of content type Script
1765 *
1766 * Returns 1 is the attribute is a script 0 otherwise
1767 */
1768 int
htmlIsScriptAttribute(const xmlChar * name)1769 htmlIsScriptAttribute(const xmlChar *name) {
1770 unsigned int i;
1771
1772 if (name == NULL)
1773 return(0);
1774 /*
1775 * all script attributes start with 'on'
1776 */
1777 if ((name[0] != 'o') || (name[1] != 'n'))
1778 return(0);
1779 for (i = 0;
1780 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1781 i++) {
1782 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1783 return(1);
1784 }
1785 return(0);
1786 }
1787
1788 /************************************************************************
1789 * *
1790 * The list of HTML predefined entities *
1791 * *
1792 ************************************************************************/
1793
1794
/*
 * Table of the named character entities known to the HTML parser.
 *
 * Each row is { Unicode code point, entity name, descriptive text }.
 *
 * The rows MUST stay sorted by strictly ascending code point:
 * htmlEntityValueLookup() stops scanning as soon as it meets a row whose
 * value is greater than the one it is looking for, so an out-of-order row
 * would never be found. htmlEntityLookup() compares names only and does
 * not depend on the ordering.
 */
static const htmlEntityDesc html40EntitiesTable[] = {
/*
 * the 4 absolute ones, plus apostrophe.
 */
{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
{ 38, "amp", "ampersand, U+0026 ISOnum" },
{ 39, "apos", "single quote" },
{ 60, "lt", "less-than sign, U+003C ISOnum" },
{ 62, "gt", "greater-than sign, U+003E ISOnum" },

/*
 * A bunch still in the 128-255 range
 * Replacing them depend really on the charset used.
 */
{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
{ 162, "cent", "cent sign, U+00A2 ISOnum" },
{ 163, "pound","pound sign, U+00A3 ISOnum" },
{ 164, "curren","currency sign, U+00A4 ISOnum" },
{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
{ 167, "sect", "section sign, U+00A7 ISOnum" },
{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
{ 172, "not", "not sign, U+00AC ISOnum" },
{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
{ 176, "deg", "degree sign, U+00B0 ISOnum" },
{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
{ 181, "micro","micro sign, U+00B5 ISOnum" },
{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
{ 215, "times","multiplication sign, U+00D7 ISOnum" },
{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
{ 247, "divide","division sign, U+00F7 ISOnum" },
{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },

{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },

/*
 * Anything below should really be kept as entities references
 */
{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },

{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
{ 732, "tilde","small tilde, U+02DC ISOdia" },

{ 913, "Alpha","greek capital letter alpha, U+0391" },
{ 914, "Beta", "greek capital letter beta, U+0392" },
{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
{ 918, "Zeta", "greek capital letter zeta, U+0396" },
{ 919, "Eta", "greek capital letter eta, U+0397" },
{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
{ 921, "Iota", "greek capital letter iota, U+0399" },
{ 922, "Kappa","greek capital letter kappa, U+039A" },
{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
{ 924, "Mu", "greek capital letter mu, U+039C" },
{ 925, "Nu", "greek capital letter nu, U+039D" },
{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
{ 927, "Omicron","greek capital letter omicron, U+039F" },
{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
{ 929, "Rho", "greek capital letter rho, U+03A1" },
{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
{ 932, "Tau", "greek capital letter tau, U+03A4" },
{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
{ 935, "Chi", "greek capital letter chi, U+03A7" },
{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },

{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },

{ 8194, "ensp", "en space, U+2002 ISOpub" },
{ 8195, "emsp", "em space, U+2003 ISOpub" },
{ 8201, "thinsp","thin space, U+2009 ISOpub" },
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
{ 8211, "ndash","en dash, U+2013 ISOpub" },
{ 8212, "mdash","em dash, U+2014 ISOpub" },
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
{ 8224, "dagger","dagger, U+2020 ISOpub" },
{ 8225, "Dagger","double dagger, U+2021 ISOpub" },

{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },

{ 8240, "permil","per mille sign, U+2030 ISOtech" },

{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },

{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },

{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
{ 8260, "frasl","fraction slash, U+2044 NEW" },

{ 8364, "euro", "euro sign, U+20AC NEW" },

{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },

{ 8704, "forall","for all, U+2200 ISOtech" },
{ 8706, "part", "partial differential, U+2202 ISOtech" },
{ 8707, "exist","there exists, U+2203 ISOtech" },
{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
{ 8712, "isin", "element of, U+2208 ISOtech" },
{ 8713, "notin","not an element of, U+2209 ISOtech" },
{ 8715, "ni", "contains as member, U+220B ISOtech" },
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
{ 8722, "minus","minus sign, U+2212 ISOtech" },
{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
{ 8733, "prop", "proportional to, U+221D ISOtech" },
{ 8734, "infin","infinity, U+221E ISOtech" },
{ 8736, "ang", "angle, U+2220 ISOamso" },
{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
{ 8746, "cup", "union = cup, U+222A ISOtech" },
{ 8747, "int", "integral, U+222B ISOtech" },
{ 8756, "there4","therefore, U+2234 ISOtech" },
{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
{ 8800, "ne", "not equal to, U+2260 ISOtech" },
{ 8801, "equiv","identical to, U+2261 ISOtech" },
{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
{ 8834, "sub", "subset of, U+2282 ISOtech" },
{ 8835, "sup", "superset of, U+2283 ISOtech" },
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
{ 8971, "rfloor","right floor, U+230B ISOamsc" },
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
{ 9674, "loz", "lozenge, U+25CA ISOpub" },

{ 9824, "spades","black spade suit, U+2660 ISOpub" },
{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
{ 9830, "diams","black diamond suit, U+2666 ISOpub" },

};
2077
2078 /************************************************************************
2079 * *
2080 * Commodity functions to handle entities *
2081 * *
2082 ************************************************************************/
2083
/*
 * growBuffer(buffer): double the capacity of a heap-allocated xmlChar
 * buffer in place.
 *
 * The expansion site must have in scope:
 *   - a variable named <buffer>_size holding the current capacity, and
 *   - a parser context named ctxt.
 *
 * If the reallocation fails, the error is reported via htmlErrMemory(),
 * the old buffer is freed and the ENCLOSING function returns NULL.
 */
#define growBuffer(buffer) {                                              \
    xmlChar *grown;                                                       \
    buffer##_size *= 2;                                                   \
    grown = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (grown != NULL) {                                                  \
        buffer = grown;                                                   \
    } else {                                                              \
        htmlErrMemory(ctxt, "growing buffer\n");                          \
        xmlFree(buffer);                                                  \
        return(NULL);                                                     \
    }                                                                     \
}
2098
2099 /**
2100 * htmlEntityLookup:
2101 * @name: the entity name
2102 *
2103 * Lookup the given entity in EntitiesTable
2104 *
2105 * TODO: the linear scan is really ugly, an hash table is really needed.
2106 *
2107 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2108 */
2109 const htmlEntityDesc *
htmlEntityLookup(const xmlChar * name)2110 htmlEntityLookup(const xmlChar *name) {
2111 unsigned int i;
2112
2113 for (i = 0;i < (sizeof(html40EntitiesTable)/
2114 sizeof(html40EntitiesTable[0]));i++) {
2115 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2116 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2117 }
2118 }
2119 return(NULL);
2120 }
2121
2122 /**
2123 * htmlEntityValueLookup:
2124 * @value: the entity's unicode value
2125 *
2126 * Lookup the given entity in EntitiesTable
2127 *
2128 * TODO: the linear scan is really ugly, an hash table is really needed.
2129 *
2130 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2131 */
2132 const htmlEntityDesc *
htmlEntityValueLookup(unsigned int value)2133 htmlEntityValueLookup(unsigned int value) {
2134 unsigned int i;
2135
2136 for (i = 0;i < (sizeof(html40EntitiesTable)/
2137 sizeof(html40EntitiesTable[0]));i++) {
2138 if (html40EntitiesTable[i].value >= value) {
2139 if (html40EntitiesTable[i].value > value)
2140 break;
2141 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2142 }
2143 }
2144 return(NULL);
2145 }
2146
2147 /**
2148 * UTF8ToHtml:
2149 * @out: a pointer to an array of bytes to store the result
2150 * @outlen: the length of @out
2151 * @in: a pointer to an array of UTF-8 chars
2152 * @inlen: the length of @in
2153 *
2154 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2155 * plus HTML entities block of chars out.
2156 *
2157 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2158 * The value of @inlen after return is the number of octets consumed
2159 * as the return value is positive, else unpredictable.
2160 * The value of @outlen after return is the number of octets consumed.
2161 */
2162 int
UTF8ToHtml(unsigned char * out,int * outlen,const unsigned char * in,int * inlen)2163 UTF8ToHtml(unsigned char* out, int *outlen,
2164 const unsigned char* in, int *inlen) {
2165 const unsigned char* processed = in;
2166 const unsigned char* outend;
2167 const unsigned char* outstart = out;
2168 const unsigned char* instart = in;
2169 const unsigned char* inend;
2170 unsigned int c, d;
2171 int trailing;
2172
2173 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2174 if (in == NULL) {
2175 /*
2176 * initialization nothing to do
2177 */
2178 *outlen = 0;
2179 *inlen = 0;
2180 return(0);
2181 }
2182 inend = in + (*inlen);
2183 outend = out + (*outlen);
2184 while (in < inend) {
2185 d = *in++;
2186 if (d < 0x80) { c= d; trailing= 0; }
2187 else if (d < 0xC0) {
2188 /* trailing byte in leading position */
2189 *outlen = out - outstart;
2190 *inlen = processed - instart;
2191 return(-2);
2192 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2193 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2194 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2195 else {
2196 /* no chance for this in Ascii */
2197 *outlen = out - outstart;
2198 *inlen = processed - instart;
2199 return(-2);
2200 }
2201
2202 if (inend - in < trailing) {
2203 break;
2204 }
2205
2206 for ( ; trailing; trailing--) {
2207 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2208 break;
2209 c <<= 6;
2210 c |= d & 0x3F;
2211 }
2212
2213 /* assertion: c is a single UTF-4 value */
2214 if (c < 0x80) {
2215 if (out + 1 >= outend)
2216 break;
2217 *out++ = c;
2218 } else {
2219 int len;
2220 const htmlEntityDesc * ent;
2221 const char *cp;
2222 char nbuf[16];
2223
2224 /*
2225 * Try to lookup a predefined HTML entity for it
2226 */
2227
2228 ent = htmlEntityValueLookup(c);
2229 if (ent == NULL) {
2230 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2231 cp = nbuf;
2232 }
2233 else
2234 cp = ent->name;
2235 len = strlen(cp);
2236 if (out + 2 + len >= outend)
2237 break;
2238 *out++ = '&';
2239 memcpy(out, cp, len);
2240 out += len;
2241 *out++ = ';';
2242 }
2243 processed = in;
2244 }
2245 *outlen = out - outstart;
2246 *inlen = processed - instart;
2247 return(0);
2248 }
2249
2250 /**
2251 * htmlEncodeEntities:
2252 * @out: a pointer to an array of bytes to store the result
2253 * @outlen: the length of @out
2254 * @in: a pointer to an array of UTF-8 chars
2255 * @inlen: the length of @in
2256 * @quoteChar: the quote character to escape (' or ") or zero.
2257 *
2258 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2259 * plus HTML entities block of chars out.
2260 *
2261 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2262 * The value of @inlen after return is the number of octets consumed
2263 * as the return value is positive, else unpredictable.
2264 * The value of @outlen after return is the number of octets consumed.
2265 */
2266 int
htmlEncodeEntities(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,int quoteChar)2267 htmlEncodeEntities(unsigned char* out, int *outlen,
2268 const unsigned char* in, int *inlen, int quoteChar) {
2269 const unsigned char* processed = in;
2270 const unsigned char* outend;
2271 const unsigned char* outstart = out;
2272 const unsigned char* instart = in;
2273 const unsigned char* inend;
2274 unsigned int c, d;
2275 int trailing;
2276
2277 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2278 return(-1);
2279 outend = out + (*outlen);
2280 inend = in + (*inlen);
2281 while (in < inend) {
2282 d = *in++;
2283 if (d < 0x80) { c= d; trailing= 0; }
2284 else if (d < 0xC0) {
2285 /* trailing byte in leading position */
2286 *outlen = out - outstart;
2287 *inlen = processed - instart;
2288 return(-2);
2289 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2290 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2291 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2292 else {
2293 /* no chance for this in Ascii */
2294 *outlen = out - outstart;
2295 *inlen = processed - instart;
2296 return(-2);
2297 }
2298
2299 if (inend - in < trailing)
2300 break;
2301
2302 while (trailing--) {
2303 if (((d= *in++) & 0xC0) != 0x80) {
2304 *outlen = out - outstart;
2305 *inlen = processed - instart;
2306 return(-2);
2307 }
2308 c <<= 6;
2309 c |= d & 0x3F;
2310 }
2311
2312 /* assertion: c is a single UTF-4 value */
2313 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2314 (c != '&') && (c != '<') && (c != '>')) {
2315 if (out >= outend)
2316 break;
2317 *out++ = c;
2318 } else {
2319 const htmlEntityDesc * ent;
2320 const char *cp;
2321 char nbuf[16];
2322 int len;
2323
2324 /*
2325 * Try to lookup a predefined HTML entity for it
2326 */
2327 ent = htmlEntityValueLookup(c);
2328 if (ent == NULL) {
2329 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2330 cp = nbuf;
2331 }
2332 else
2333 cp = ent->name;
2334 len = strlen(cp);
2335 if (out + 2 + len > outend)
2336 break;
2337 *out++ = '&';
2338 memcpy(out, cp, len);
2339 out += len;
2340 *out++ = ';';
2341 }
2342 processed = in;
2343 }
2344 *outlen = out - outstart;
2345 *inlen = processed - instart;
2346 return(0);
2347 }
2348
2349 /************************************************************************
2350 * *
2351 * Commodity functions to handle streams *
2352 * *
2353 ************************************************************************/
2354
#ifdef LIBXML_PUSH_ENABLED
/**
 * htmlNewInputStream:
 * @ctxt:  an HTML parser context
 *
 * Allocate a parser input structure and reset it: every field is
 * cleared, and the position is set to line 1, column 1.
 * An allocation failure is reported through htmlErrMemory().
 *
 * Returns the new input stream or NULL
 */
static htmlParserInputPtr
htmlNewInputStream(htmlParserCtxtPtr ctxt) {
    htmlParserInputPtr stream;

    stream = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
    if (stream == NULL) {
        htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
        return(NULL);
    }

    /* Clear everything first, then spell out the fields that matter. */
    memset(stream, 0, sizeof(htmlParserInput));

    /* no backing buffer, no source information yet */
    stream->buf = NULL;
    stream->base = NULL;
    stream->cur = NULL;
    stream->filename = NULL;
    stream->directory = NULL;
    stream->version = NULL;
    stream->free = NULL;

    /* counters */
    stream->length = 0;
    stream->consumed = 0;

    /* positions are 1-based */
    stream->line = 1;
    stream->col = 1;

    return(stream);
}
#endif /* LIBXML_PUSH_ENABLED */
2388
2389
2390 /************************************************************************
2391 * *
2392 * Commodity functions, cleanup needed ? *
2393 * *
2394 ************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 *
 * Both the strings and the array of pointers are constant: the table is
 * only ever read.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
2409
2410 /**
2411 * areBlanks:
2412 * @ctxt: an HTML parser context
2413 * @str: a xmlChar *
2414 * @len: the size of @str
2415 *
2416 * Is this a sequence of blank chars that one can ignore ?
2417 *
2418 * Returns 1 if ignorable 0 otherwise.
2419 */
2420
areBlanks(htmlParserCtxtPtr ctxt,const xmlChar * str,int len)2421 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2422 unsigned int i;
2423 int j;
2424 xmlNodePtr lastChild;
2425 xmlDtdPtr dtd;
2426
2427 for (j = 0;j < len;j++)
2428 if (!(IS_BLANK_CH(str[j]))) return(0);
2429
2430 if (CUR == 0) return(1);
2431 if (CUR != '<') return(0);
2432 if (ctxt->name == NULL)
2433 return(1);
2434 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2435 return(1);
2436 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2437 return(1);
2438
2439 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2440 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2441 dtd = xmlGetIntSubset(ctxt->myDoc);
2442 if (dtd != NULL && dtd->ExternalID != NULL) {
2443 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2444 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2445 return(1);
2446 }
2447 }
2448
2449 if (ctxt->node == NULL) return(0);
2450 lastChild = xmlGetLastChild(ctxt->node);
2451 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2452 lastChild = lastChild->prev;
2453 if (lastChild == NULL) {
2454 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2455 (ctxt->node->content != NULL)) return(0);
2456 /* keep ws in constructs like ...<b> </b>...
2457 for all tags "b" allowing PCDATA */
2458 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2459 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2460 return(0);
2461 }
2462 }
2463 } else if (xmlNodeIsText(lastChild)) {
2464 return(0);
2465 } else {
2466 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2467 for all tags "p" allowing PCDATA */
2468 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2469 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2470 return(0);
2471 }
2472 }
2473 }
2474 return(1);
2475 }
2476
2477 /**
2478 * htmlNewDocNoDtD:
2479 * @URI: URI for the dtd, or NULL
2480 * @ExternalID: the external ID of the DTD, or NULL
2481 *
2482 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2483 * are NULL
2484 *
2485 * Returns a new document, do not initialize the DTD if not provided
2486 */
2487 htmlDocPtr
htmlNewDocNoDtD(const xmlChar * URI,const xmlChar * ExternalID)2488 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2489 xmlDocPtr cur;
2490
2491 /*
2492 * Allocate a new document and fill the fields.
2493 */
2494 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2495 if (cur == NULL) {
2496 htmlErrMemory(NULL, "HTML document creation failed\n");
2497 return(NULL);
2498 }
2499 memset(cur, 0, sizeof(xmlDoc));
2500
2501 cur->type = XML_HTML_DOCUMENT_NODE;
2502 cur->version = NULL;
2503 cur->intSubset = NULL;
2504 cur->doc = cur;
2505 cur->name = NULL;
2506 cur->children = NULL;
2507 cur->extSubset = NULL;
2508 cur->oldNs = NULL;
2509 cur->encoding = NULL;
2510 cur->standalone = 1;
2511 cur->compression = 0;
2512 cur->ids = NULL;
2513 cur->refs = NULL;
2514 cur->_private = NULL;
2515 cur->charset = XML_CHAR_ENCODING_UTF8;
2516 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2517 if ((ExternalID != NULL) ||
2518 (URI != NULL))
2519 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2520 if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2521 xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2522 return(cur);
2523 }
2524
2525 /**
2526 * htmlNewDoc:
2527 * @URI: URI for the dtd, or NULL
2528 * @ExternalID: the external ID of the DTD, or NULL
2529 *
2530 * Creates a new HTML document
2531 *
2532 * Returns a new document
2533 */
2534 htmlDocPtr
htmlNewDoc(const xmlChar * URI,const xmlChar * ExternalID)2535 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2536 if ((URI == NULL) && (ExternalID == NULL))
2537 return(htmlNewDocNoDtD(
2538 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2539 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2540
2541 return(htmlNewDocNoDtD(URI, ExternalID));
2542 }
2543
2544
2545 /************************************************************************
2546 * *
2547 * The parser itself *
2548 * Relates to http://www.w3.org/TR/html40 *
2549 * *
2550 ************************************************************************/
2551
2552 /************************************************************************
2553 * *
2554 * The parser itself *
2555 * *
2556 ************************************************************************/
2557
/*
 * Forward declaration: htmlParseName() hands over to this routine whenever
 * its fast path for plain ASCII names does not apply.
 */
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2559
2560 /**
2561 * htmlParseHTMLName:
2562 * @ctxt: an HTML parser context
2563 *
2564 * parse an HTML tag or attribute name, note that we convert it to lowercase
2565 * since HTML names are not case-sensitive.
2566 *
2567 * Returns the Tag Name parsed or NULL
2568 */
2569
2570 static const xmlChar *
htmlParseHTMLName(htmlParserCtxtPtr ctxt)2571 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2572 int i = 0;
2573 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2574
2575 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2576 (CUR != ':') && (CUR != '.')) return(NULL);
2577
2578 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2579 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2580 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2581 (CUR == '.'))) {
2582 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2583 else loc[i] = CUR;
2584 i++;
2585
2586 NEXT;
2587 }
2588
2589 return(xmlDictLookup(ctxt->dict, loc, i));
2590 }
2591
2592
2593 /**
2594 * htmlParseHTMLName_nonInvasive:
2595 * @ctxt: an HTML parser context
2596 *
2597 * parse an HTML tag or attribute name, note that we convert it to lowercase
2598 * since HTML names are not case-sensitive, this doesn't consume the data
2599 * from the stream, it's a look-ahead
2600 *
2601 * Returns the Tag Name parsed or NULL
2602 */
2603
2604 static const xmlChar *
htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt)2605 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2606 int i = 0;
2607 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2608
2609 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2610 (NXT(1) != ':')) return(NULL);
2611
2612 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2613 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2614 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2615 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2616 else loc[i] = NXT(1+i);
2617 i++;
2618 }
2619
2620 return(xmlDictLookup(ctxt->dict, loc, i));
2621 }
2622
2623
2624 /**
2625 * htmlParseName:
2626 * @ctxt: an HTML parser context
2627 *
2628 * parse an HTML name, this routine is case sensitive.
2629 *
2630 * Returns the Name parsed or NULL
2631 */
2632
2633 static const xmlChar *
htmlParseName(htmlParserCtxtPtr ctxt)2634 htmlParseName(htmlParserCtxtPtr ctxt) {
2635 const xmlChar *in;
2636 const xmlChar *ret;
2637 int count = 0;
2638
2639 GROW;
2640
2641 /*
2642 * Accelerator for simple ASCII names
2643 */
2644 in = ctxt->input->cur;
2645 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2646 ((*in >= 0x41) && (*in <= 0x5A)) ||
2647 (*in == '_') || (*in == ':')) {
2648 in++;
2649 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2650 ((*in >= 0x41) && (*in <= 0x5A)) ||
2651 ((*in >= 0x30) && (*in <= 0x39)) ||
2652 (*in == '_') || (*in == '-') ||
2653 (*in == ':') || (*in == '.'))
2654 in++;
2655
2656 if (in == ctxt->input->end)
2657 return(NULL);
2658
2659 if ((*in > 0) && (*in < 0x80)) {
2660 count = in - ctxt->input->cur;
2661 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2662 ctxt->input->cur = in;
2663 ctxt->input->col += count;
2664 return(ret);
2665 }
2666 }
2667 return(htmlParseNameComplex(ctxt));
2668 }
2669
2670 static const xmlChar *
htmlParseNameComplex(xmlParserCtxtPtr ctxt)2671 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2672 int len = 0, l;
2673 int c;
2674 int count = 0;
2675 const xmlChar *base = ctxt->input->base;
2676
2677 /*
2678 * Handler for more complex cases
2679 */
2680 GROW;
2681 c = CUR_CHAR(l);
2682 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2683 (!IS_LETTER(c) && (c != '_') &&
2684 (c != ':'))) {
2685 return(NULL);
2686 }
2687
2688 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2689 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2690 (c == '.') || (c == '-') ||
2691 (c == '_') || (c == ':') ||
2692 (IS_COMBINING(c)) ||
2693 (IS_EXTENDER(c)))) {
2694 if (count++ > 100) {
2695 count = 0;
2696 GROW;
2697 }
2698 len += l;
2699 NEXTL(l);
2700 c = CUR_CHAR(l);
2701 if (ctxt->input->base != base) {
2702 /*
2703 * We changed encoding from an unknown encoding
2704 * Input buffer changed location, so we better start again
2705 */
2706 return(htmlParseNameComplex(ctxt));
2707 }
2708 }
2709
2710 if (ctxt->input->cur - ctxt->input->base < len) {
2711 /* Sanity check */
2712 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2713 "unexpected change of input buffer", NULL, NULL);
2714 return (NULL);
2715 }
2716
2717 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2718 }
2719
2720
2721 /**
2722 * htmlParseHTMLAttribute:
2723 * @ctxt: an HTML parser context
2724 * @stop: a char stop value
2725 *
2726 * parse an HTML attribute value till the stop (quote), if
2727 * stop is 0 then it stops at the first space
2728 *
2729 * Returns the attribute parsed or NULL
2730 */
2731
2732 static xmlChar *
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt,const xmlChar stop)2733 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2734 xmlChar *buffer = NULL;
2735 int buffer_size = 0;
2736 xmlChar *out = NULL;
2737 const xmlChar *name = NULL;
2738 const xmlChar *cur = NULL;
2739 const htmlEntityDesc * ent;
2740
2741 /*
2742 * allocate a translation buffer.
2743 */
2744 buffer_size = HTML_PARSER_BUFFER_SIZE;
2745 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2746 if (buffer == NULL) {
2747 htmlErrMemory(ctxt, "buffer allocation failed\n");
2748 return(NULL);
2749 }
2750 out = buffer;
2751
2752 /*
2753 * Ok loop until we reach one of the ending chars
2754 */
2755 while ((CUR != 0) && (CUR != stop)) {
2756 if ((stop == 0) && (CUR == '>')) break;
2757 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2758 if (CUR == '&') {
2759 if (NXT(1) == '#') {
2760 unsigned int c;
2761 int bits;
2762
2763 c = htmlParseCharRef(ctxt);
2764 if (c < 0x80)
2765 { *out++ = c; bits= -6; }
2766 else if (c < 0x800)
2767 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2768 else if (c < 0x10000)
2769 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2770 else
2771 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2772
2773 for ( ; bits >= 0; bits-= 6) {
2774 *out++ = ((c >> bits) & 0x3F) | 0x80;
2775 }
2776
2777 if (out - buffer > buffer_size - 100) {
2778 int indx = out - buffer;
2779
2780 growBuffer(buffer);
2781 out = &buffer[indx];
2782 }
2783 } else {
2784 ent = htmlParseEntityRef(ctxt, &name);
2785 if (name == NULL) {
2786 *out++ = '&';
2787 if (out - buffer > buffer_size - 100) {
2788 int indx = out - buffer;
2789
2790 growBuffer(buffer);
2791 out = &buffer[indx];
2792 }
2793 } else if (ent == NULL) {
2794 *out++ = '&';
2795 cur = name;
2796 while (*cur != 0) {
2797 if (out - buffer > buffer_size - 100) {
2798 int indx = out - buffer;
2799
2800 growBuffer(buffer);
2801 out = &buffer[indx];
2802 }
2803 *out++ = *cur++;
2804 }
2805 } else {
2806 unsigned int c;
2807 int bits;
2808
2809 if (out - buffer > buffer_size - 100) {
2810 int indx = out - buffer;
2811
2812 growBuffer(buffer);
2813 out = &buffer[indx];
2814 }
2815 c = ent->value;
2816 if (c < 0x80)
2817 { *out++ = c; bits= -6; }
2818 else if (c < 0x800)
2819 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2820 else if (c < 0x10000)
2821 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2822 else
2823 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2824
2825 for ( ; bits >= 0; bits-= 6) {
2826 *out++ = ((c >> bits) & 0x3F) | 0x80;
2827 }
2828 }
2829 }
2830 } else {
2831 unsigned int c;
2832 int bits, l;
2833
2834 if (out - buffer > buffer_size - 100) {
2835 int indx = out - buffer;
2836
2837 growBuffer(buffer);
2838 out = &buffer[indx];
2839 }
2840 c = CUR_CHAR(l);
2841 if (c < 0x80)
2842 { *out++ = c; bits= -6; }
2843 else if (c < 0x800)
2844 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2845 else if (c < 0x10000)
2846 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2847 else
2848 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2849
2850 for ( ; bits >= 0; bits-= 6) {
2851 *out++ = ((c >> bits) & 0x3F) | 0x80;
2852 }
2853 NEXT;
2854 }
2855 }
2856 *out = 0;
2857 return(buffer);
2858 }
2859
2860 /**
2861 * htmlParseEntityRef:
2862 * @ctxt: an HTML parser context
2863 * @str: location to store the entity name
2864 *
2865 * parse an HTML ENTITY references
2866 *
2867 * [68] EntityRef ::= '&' Name ';'
2868 *
2869 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2870 * if non-NULL *str will have to be freed by the caller.
2871 */
2872 const htmlEntityDesc *
htmlParseEntityRef(htmlParserCtxtPtr ctxt,const xmlChar ** str)2873 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2874 const xmlChar *name;
2875 const htmlEntityDesc * ent = NULL;
2876
2877 if (str != NULL) *str = NULL;
2878 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2879
2880 if (CUR == '&') {
2881 NEXT;
2882 name = htmlParseName(ctxt);
2883 if (name == NULL) {
2884 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2885 "htmlParseEntityRef: no name\n", NULL, NULL);
2886 } else {
2887 GROW;
2888 if (CUR == ';') {
2889 if (str != NULL)
2890 *str = name;
2891
2892 /*
2893 * Lookup the entity in the table.
2894 */
2895 ent = htmlEntityLookup(name);
2896 if (ent != NULL) /* OK that's ugly !!! */
2897 NEXT;
2898 } else {
2899 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2900 "htmlParseEntityRef: expecting ';'\n",
2901 NULL, NULL);
2902 if (str != NULL)
2903 *str = name;
2904 }
2905 }
2906 }
2907 return(ent);
2908 }
2909
2910 /**
2911 * htmlParseAttValue:
2912 * @ctxt: an HTML parser context
2913 *
2914 * parse a value for an attribute
2915 * Note: the parser won't do substitution of entities here, this
2916 * will be handled later in xmlStringGetNodeList, unless it was
2917 * asked for ctxt->replaceEntities != 0
2918 *
2919 * Returns the AttValue parsed or NULL.
2920 */
2921
2922 static xmlChar *
htmlParseAttValue(htmlParserCtxtPtr ctxt)2923 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2924 xmlChar *ret = NULL;
2925
2926 if (CUR == '"') {
2927 NEXT;
2928 ret = htmlParseHTMLAttribute(ctxt, '"');
2929 if (CUR != '"') {
2930 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2931 "AttValue: \" expected\n", NULL, NULL);
2932 } else
2933 NEXT;
2934 } else if (CUR == '\'') {
2935 NEXT;
2936 ret = htmlParseHTMLAttribute(ctxt, '\'');
2937 if (CUR != '\'') {
2938 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2939 "AttValue: ' expected\n", NULL, NULL);
2940 } else
2941 NEXT;
2942 } else {
2943 /*
2944 * That's an HTMLism, the attribute value may not be quoted
2945 */
2946 ret = htmlParseHTMLAttribute(ctxt, 0);
2947 if (ret == NULL) {
2948 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2949 "AttValue: no value found\n", NULL, NULL);
2950 }
2951 }
2952 return(ret);
2953 }
2954
2955 /**
2956 * htmlParseSystemLiteral:
2957 * @ctxt: an HTML parser context
2958 *
2959 * parse an HTML Literal
2960 *
2961 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2962 *
2963 * Returns the SystemLiteral parsed or NULL
2964 */
2965
2966 static xmlChar *
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt)2967 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2968 size_t len = 0, startPosition = 0;
2969 int err = 0;
2970 int quote;
2971 xmlChar *ret = NULL;
2972
2973 if ((CUR != '"') && (CUR != '\'')) {
2974 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2975 "SystemLiteral \" or ' expected\n", NULL, NULL);
2976 return(NULL);
2977 }
2978 quote = CUR;
2979 NEXT;
2980
2981 if (CUR_PTR < BASE_PTR)
2982 return(ret);
2983 startPosition = CUR_PTR - BASE_PTR;
2984
2985 while ((CUR != 0) && (CUR != quote)) {
2986 /* TODO: Handle UTF-8 */
2987 if (!IS_CHAR_CH(CUR)) {
2988 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2989 "Invalid char in SystemLiteral 0x%X\n", CUR);
2990 err = 1;
2991 }
2992 NEXT;
2993 len++;
2994 }
2995 if (CUR != quote) {
2996 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2997 "Unfinished SystemLiteral\n", NULL, NULL);
2998 } else {
2999 NEXT;
3000 if (err == 0)
3001 ret = xmlStrndup((BASE_PTR+startPosition), len);
3002 }
3003
3004 return(ret);
3005 }
3006
3007 /**
3008 * htmlParsePubidLiteral:
3009 * @ctxt: an HTML parser context
3010 *
3011 * parse an HTML public literal
3012 *
3013 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
3014 *
3015 * Returns the PubidLiteral parsed or NULL.
3016 */
3017
3018 static xmlChar *
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt)3019 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
3020 size_t len = 0, startPosition = 0;
3021 int err = 0;
3022 int quote;
3023 xmlChar *ret = NULL;
3024
3025 if ((CUR != '"') && (CUR != '\'')) {
3026 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
3027 "PubidLiteral \" or ' expected\n", NULL, NULL);
3028 return(NULL);
3029 }
3030 quote = CUR;
3031 NEXT;
3032
3033 /*
3034 * Name ::= (Letter | '_') (NameChar)*
3035 */
3036 if (CUR_PTR < BASE_PTR)
3037 return(ret);
3038 startPosition = CUR_PTR - BASE_PTR;
3039
3040 while ((CUR != 0) && (CUR != quote)) {
3041 if (!IS_PUBIDCHAR_CH(CUR)) {
3042 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3043 "Invalid char in PubidLiteral 0x%X\n", CUR);
3044 err = 1;
3045 }
3046 len++;
3047 NEXT;
3048 }
3049
3050 if (CUR != quote) {
3051 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3052 "Unfinished PubidLiteral\n", NULL, NULL);
3053 } else {
3054 NEXT;
3055 if (err == 0)
3056 ret = xmlStrndup((BASE_PTR + startPosition), len);
3057 }
3058
3059 return(ret);
3060 }
3061
3062 /**
3063 * htmlParseScript:
3064 * @ctxt: an HTML parser context
3065 *
3066 * parse the content of an HTML SCRIPT or STYLE element
3067 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
3068 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
3069 * http://www.w3.org/TR/html4/types.html#type-script
3070 * http://www.w3.org/TR/html4/types.html#h-6.15
3071 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
3072 *
3073 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
3074 * element and the value of intrinsic event attributes. User agents must
3075 * not evaluate script data as HTML markup but instead must pass it on as
3076 * data to a script engine.
3077 * NOTES:
3078 * - The content is passed like CDATA
3079 * - the attributes for style and scripting "onXXX" are also described
3080 * as CDATA but SGML allows entities references in attributes so their
3081 * processing is identical as other attributes
3082 */
3083 static void
htmlParseScript(htmlParserCtxtPtr ctxt)3084 htmlParseScript(htmlParserCtxtPtr ctxt) {
3085 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3086 int nbchar = 0;
3087 int cur,l;
3088
3089 SHRINK;
3090 cur = CUR_CHAR(l);
3091 while (cur != 0) {
3092 if ((cur == '<') && (NXT(1) == '/')) {
3093 /*
3094 * One should break here, the specification is clear:
3095 * Authors should therefore escape "</" within the content.
3096 * Escape mechanisms are specific to each scripting or
3097 * style sheet language.
3098 *
3099 * In recovery mode, only break if end tag match the
3100 * current tag, effectively ignoring all tags inside the
3101 * script/style block and treating the entire block as
3102 * CDATA.
3103 */
3104 if (ctxt->recovery) {
3105 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3106 xmlStrlen(ctxt->name)) == 0)
3107 {
3108 break; /* while */
3109 } else {
3110 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3111 "Element %s embeds close tag\n",
3112 ctxt->name, NULL);
3113 }
3114 } else {
3115 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3116 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3117 {
3118 break; /* while */
3119 }
3120 }
3121 }
3122 if (IS_CHAR(cur)) {
3123 COPY_BUF(l,buf,nbchar,cur);
3124 } else {
3125 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3126 "Invalid char in CDATA 0x%X\n", cur);
3127 }
3128 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3129 buf[nbchar] = 0;
3130 if (ctxt->sax->cdataBlock!= NULL) {
3131 /*
3132 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3133 */
3134 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3135 } else if (ctxt->sax->characters != NULL) {
3136 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3137 }
3138 nbchar = 0;
3139 }
3140 GROW;
3141 NEXTL(l);
3142 cur = CUR_CHAR(l);
3143 }
3144
3145 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3146 buf[nbchar] = 0;
3147 if (ctxt->sax->cdataBlock!= NULL) {
3148 /*
3149 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3150 */
3151 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3152 } else if (ctxt->sax->characters != NULL) {
3153 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3154 }
3155 }
3156 }
3157
3158
3159 /**
3160 * htmlParseCharDataInternal:
3161 * @ctxt: an HTML parser context
3162 * @readahead: optional read ahead character in ascii range
3163 *
3164 * parse a CharData section.
3165 * if we are within a CDATA section ']]>' marks an end of section.
3166 *
3167 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3168 */
3169
3170 static void
htmlParseCharDataInternal(htmlParserCtxtPtr ctxt,int readahead)3171 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3172 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3173 int nbchar = 0;
3174 int cur, l;
3175 int chunk = 0;
3176
3177 if (readahead)
3178 buf[nbchar++] = readahead;
3179
3180 SHRINK;
3181 cur = CUR_CHAR(l);
3182 while (((cur != '<') || (ctxt->token == '<')) &&
3183 ((cur != '&') || (ctxt->token == '&')) &&
3184 (cur != 0)) {
3185 if (!(IS_CHAR(cur))) {
3186 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3187 "Invalid char in CDATA 0x%X\n", cur);
3188 } else {
3189 COPY_BUF(l,buf,nbchar,cur);
3190 }
3191 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3192 buf[nbchar] = 0;
3193
3194 /*
3195 * Ok the segment is to be consumed as chars.
3196 */
3197 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3198 if (areBlanks(ctxt, buf, nbchar)) {
3199 if (ctxt->keepBlanks) {
3200 if (ctxt->sax->characters != NULL)
3201 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3202 } else {
3203 if (ctxt->sax->ignorableWhitespace != NULL)
3204 ctxt->sax->ignorableWhitespace(ctxt->userData,
3205 buf, nbchar);
3206 }
3207 } else {
3208 htmlCheckParagraph(ctxt);
3209 if (ctxt->sax->characters != NULL)
3210 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3211 }
3212 }
3213 nbchar = 0;
3214 }
3215 NEXTL(l);
3216 chunk++;
3217 if (chunk > HTML_PARSER_BUFFER_SIZE) {
3218 chunk = 0;
3219 SHRINK;
3220 GROW;
3221 }
3222 cur = CUR_CHAR(l);
3223 if (cur == 0) {
3224 SHRINK;
3225 GROW;
3226 cur = CUR_CHAR(l);
3227 }
3228 }
3229 if (nbchar != 0) {
3230 buf[nbchar] = 0;
3231
3232 /*
3233 * Ok the segment is to be consumed as chars.
3234 */
3235 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3236 if (areBlanks(ctxt, buf, nbchar)) {
3237 if (ctxt->keepBlanks) {
3238 if (ctxt->sax->characters != NULL)
3239 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3240 } else {
3241 if (ctxt->sax->ignorableWhitespace != NULL)
3242 ctxt->sax->ignorableWhitespace(ctxt->userData,
3243 buf, nbchar);
3244 }
3245 } else {
3246 htmlCheckParagraph(ctxt);
3247 if (ctxt->sax->characters != NULL)
3248 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3249 }
3250 }
3251 } else {
3252 /*
3253 * Loop detection
3254 */
3255 if (cur == 0)
3256 ctxt->instate = XML_PARSER_EOF;
3257 }
3258 }
3259
3260 /**
3261 * htmlParseCharData:
3262 * @ctxt: an HTML parser context
3263 *
3264 * parse a CharData section.
3265 * if we are within a CDATA section ']]>' marks an end of section.
3266 *
3267 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3268 */
3269
3270 static void
htmlParseCharData(htmlParserCtxtPtr ctxt)3271 htmlParseCharData(htmlParserCtxtPtr ctxt) {
3272 htmlParseCharDataInternal(ctxt, 0);
3273 }
3274
3275 /**
3276 * htmlParseExternalID:
3277 * @ctxt: an HTML parser context
3278 * @publicID: a xmlChar** receiving PubidLiteral
3279 *
3280 * Parse an External ID or a Public ID
3281 *
3282 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3283 * | 'PUBLIC' S PubidLiteral S SystemLiteral
3284 *
3285 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3286 *
3287 * Returns the function returns SystemLiteral and in the second
3288 * case publicID receives PubidLiteral, is strict is off
3289 * it is possible to return NULL and have publicID set.
3290 */
3291
3292 static xmlChar *
htmlParseExternalID(htmlParserCtxtPtr ctxt,xmlChar ** publicID)3293 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3294 xmlChar *URI = NULL;
3295
3296 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3297 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3298 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3299 SKIP(6);
3300 if (!IS_BLANK_CH(CUR)) {
3301 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3302 "Space required after 'SYSTEM'\n", NULL, NULL);
3303 }
3304 SKIP_BLANKS;
3305 URI = htmlParseSystemLiteral(ctxt);
3306 if (URI == NULL) {
3307 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3308 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3309 }
3310 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3311 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3312 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3313 SKIP(6);
3314 if (!IS_BLANK_CH(CUR)) {
3315 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3316 "Space required after 'PUBLIC'\n", NULL, NULL);
3317 }
3318 SKIP_BLANKS;
3319 *publicID = htmlParsePubidLiteral(ctxt);
3320 if (*publicID == NULL) {
3321 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3322 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3323 NULL, NULL);
3324 }
3325 SKIP_BLANKS;
3326 if ((CUR == '"') || (CUR == '\'')) {
3327 URI = htmlParseSystemLiteral(ctxt);
3328 }
3329 }
3330 return(URI);
3331 }
3332
3333 /**
3334 * xmlParsePI:
3335 * @ctxt: an XML parser context
3336 *
3337 * parse an XML Processing Instruction.
3338 *
3339 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3340 */
3341 static void
htmlParsePI(htmlParserCtxtPtr ctxt)3342 htmlParsePI(htmlParserCtxtPtr ctxt) {
3343 xmlChar *buf = NULL;
3344 int len = 0;
3345 int size = HTML_PARSER_BUFFER_SIZE;
3346 int cur, l;
3347 const xmlChar *target;
3348 xmlParserInputState state;
3349 int count = 0;
3350
3351 if ((RAW == '<') && (NXT(1) == '?')) {
3352 state = ctxt->instate;
3353 ctxt->instate = XML_PARSER_PI;
3354 /*
3355 * this is a Processing Instruction.
3356 */
3357 SKIP(2);
3358 SHRINK;
3359
3360 /*
3361 * Parse the target name and check for special support like
3362 * namespace.
3363 */
3364 target = htmlParseName(ctxt);
3365 if (target != NULL) {
3366 if (RAW == '>') {
3367 SKIP(1);
3368
3369 /*
3370 * SAX: PI detected.
3371 */
3372 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3373 (ctxt->sax->processingInstruction != NULL))
3374 ctxt->sax->processingInstruction(ctxt->userData,
3375 target, NULL);
3376 ctxt->instate = state;
3377 return;
3378 }
3379 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3380 if (buf == NULL) {
3381 htmlErrMemory(ctxt, NULL);
3382 ctxt->instate = state;
3383 return;
3384 }
3385 cur = CUR;
3386 if (!IS_BLANK(cur)) {
3387 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3388 "ParsePI: PI %s space expected\n", target, NULL);
3389 }
3390 SKIP_BLANKS;
3391 cur = CUR_CHAR(l);
3392 while ((cur != 0) && (cur != '>')) {
3393 if (len + 5 >= size) {
3394 xmlChar *tmp;
3395
3396 size *= 2;
3397 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3398 if (tmp == NULL) {
3399 htmlErrMemory(ctxt, NULL);
3400 xmlFree(buf);
3401 ctxt->instate = state;
3402 return;
3403 }
3404 buf = tmp;
3405 }
3406 count++;
3407 if (count > 50) {
3408 GROW;
3409 count = 0;
3410 }
3411 if (IS_CHAR(cur)) {
3412 COPY_BUF(l,buf,len,cur);
3413 } else {
3414 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3415 "Invalid char in processing instruction "
3416 "0x%X\n", cur);
3417 }
3418 NEXTL(l);
3419 cur = CUR_CHAR(l);
3420 if (cur == 0) {
3421 SHRINK;
3422 GROW;
3423 cur = CUR_CHAR(l);
3424 }
3425 }
3426 buf[len] = 0;
3427 if (cur != '>') {
3428 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3429 "ParsePI: PI %s never end ...\n", target, NULL);
3430 } else {
3431 SKIP(1);
3432
3433 /*
3434 * SAX: PI detected.
3435 */
3436 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3437 (ctxt->sax->processingInstruction != NULL))
3438 ctxt->sax->processingInstruction(ctxt->userData,
3439 target, buf);
3440 }
3441 xmlFree(buf);
3442 } else {
3443 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3444 "PI is not started correctly", NULL, NULL);
3445 }
3446 ctxt->instate = state;
3447 }
3448 }
3449
3450 /**
3451 * htmlParseComment:
3452 * @ctxt: an HTML parser context
3453 *
3454 * Parse an XML (SGML) comment <!-- .... -->
3455 *
3456 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3457 */
3458 static void
htmlParseComment(htmlParserCtxtPtr ctxt)3459 htmlParseComment(htmlParserCtxtPtr ctxt) {
3460 xmlChar *buf = NULL;
3461 int len;
3462 int size = HTML_PARSER_BUFFER_SIZE;
3463 int q, ql;
3464 int r, rl;
3465 int cur, l;
3466 int next, nl;
3467 xmlParserInputState state;
3468
3469 /*
3470 * Check that there is a comment right here.
3471 */
3472 if ((RAW != '<') || (NXT(1) != '!') ||
3473 (NXT(2) != '-') || (NXT(3) != '-')) return;
3474
3475 state = ctxt->instate;
3476 ctxt->instate = XML_PARSER_COMMENT;
3477 SHRINK;
3478 SKIP(4);
3479 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3480 if (buf == NULL) {
3481 htmlErrMemory(ctxt, "buffer allocation failed\n");
3482 ctxt->instate = state;
3483 return;
3484 }
3485 len = 0;
3486 buf[len] = 0;
3487 q = CUR_CHAR(ql);
3488 if (q == 0)
3489 goto unfinished;
3490 NEXTL(ql);
3491 r = CUR_CHAR(rl);
3492 if (r == 0)
3493 goto unfinished;
3494 NEXTL(rl);
3495 cur = CUR_CHAR(l);
3496 while ((cur != 0) &&
3497 ((cur != '>') ||
3498 (r != '-') || (q != '-'))) {
3499 NEXTL(l);
3500 next = CUR_CHAR(nl);
3501 if (next == 0) {
3502 SHRINK;
3503 GROW;
3504 next = CUR_CHAR(nl);
3505 }
3506
3507 if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3508 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3509 "Comment incorrectly closed by '--!>'", NULL, NULL);
3510 cur = '>';
3511 break;
3512 }
3513
3514 if (len + 5 >= size) {
3515 xmlChar *tmp;
3516
3517 size *= 2;
3518 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3519 if (tmp == NULL) {
3520 xmlFree(buf);
3521 htmlErrMemory(ctxt, "growing buffer failed\n");
3522 ctxt->instate = state;
3523 return;
3524 }
3525 buf = tmp;
3526 }
3527 if (IS_CHAR(q)) {
3528 COPY_BUF(ql,buf,len,q);
3529 } else {
3530 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3531 "Invalid char in comment 0x%X\n", q);
3532 }
3533
3534 q = r;
3535 ql = rl;
3536 r = cur;
3537 rl = l;
3538 cur = next;
3539 l = nl;
3540 }
3541 buf[len] = 0;
3542 if (cur == '>') {
3543 NEXT;
3544 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3545 (!ctxt->disableSAX))
3546 ctxt->sax->comment(ctxt->userData, buf);
3547 xmlFree(buf);
3548 ctxt->instate = state;
3549 return;
3550 }
3551
3552 unfinished:
3553 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3554 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3555 xmlFree(buf);
3556 }
3557
3558 /**
3559 * htmlParseCharRef:
3560 * @ctxt: an HTML parser context
3561 *
3562 * parse Reference declarations
3563 *
3564 * [66] CharRef ::= '&#' [0-9]+ ';' |
3565 * '&#x' [0-9a-fA-F]+ ';'
3566 *
3567 * Returns the value parsed (as an int)
3568 */
3569 int
htmlParseCharRef(htmlParserCtxtPtr ctxt)3570 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3571 int val = 0;
3572
3573 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3574 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3575 "htmlParseCharRef: context error\n",
3576 NULL, NULL);
3577 return(0);
3578 }
3579 if ((CUR == '&') && (NXT(1) == '#') &&
3580 ((NXT(2) == 'x') || NXT(2) == 'X')) {
3581 SKIP(3);
3582 while (CUR != ';') {
3583 if ((CUR >= '0') && (CUR <= '9')) {
3584 if (val < 0x110000)
3585 val = val * 16 + (CUR - '0');
3586 } else if ((CUR >= 'a') && (CUR <= 'f')) {
3587 if (val < 0x110000)
3588 val = val * 16 + (CUR - 'a') + 10;
3589 } else if ((CUR >= 'A') && (CUR <= 'F')) {
3590 if (val < 0x110000)
3591 val = val * 16 + (CUR - 'A') + 10;
3592 } else {
3593 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3594 "htmlParseCharRef: missing semicolon\n",
3595 NULL, NULL);
3596 break;
3597 }
3598 NEXT;
3599 }
3600 if (CUR == ';')
3601 NEXT;
3602 } else if ((CUR == '&') && (NXT(1) == '#')) {
3603 SKIP(2);
3604 while (CUR != ';') {
3605 if ((CUR >= '0') && (CUR <= '9')) {
3606 if (val < 0x110000)
3607 val = val * 10 + (CUR - '0');
3608 } else {
3609 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3610 "htmlParseCharRef: missing semicolon\n",
3611 NULL, NULL);
3612 break;
3613 }
3614 NEXT;
3615 }
3616 if (CUR == ';')
3617 NEXT;
3618 } else {
3619 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3620 "htmlParseCharRef: invalid value\n", NULL, NULL);
3621 }
3622 /*
3623 * Check the value IS_CHAR ...
3624 */
3625 if (IS_CHAR(val)) {
3626 return(val);
3627 } else if (val >= 0x110000) {
3628 htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3629 "htmlParseCharRef: value too large\n", NULL, NULL);
3630 } else {
3631 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3632 "htmlParseCharRef: invalid xmlChar value %d\n",
3633 val);
3634 }
3635 return(0);
3636 }
3637
3638
3639 /**
3640 * htmlParseDocTypeDecl:
3641 * @ctxt: an HTML parser context
3642 *
3643 * parse a DOCTYPE declaration
3644 *
3645 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3646 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3647 */
3648
3649 static void
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt)3650 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3651 const xmlChar *name;
3652 xmlChar *ExternalID = NULL;
3653 xmlChar *URI = NULL;
3654
3655 /*
3656 * We know that '<!DOCTYPE' has been detected.
3657 */
3658 SKIP(9);
3659
3660 SKIP_BLANKS;
3661
3662 /*
3663 * Parse the DOCTYPE name.
3664 */
3665 name = htmlParseName(ctxt);
3666 if (name == NULL) {
3667 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3668 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3669 NULL, NULL);
3670 }
3671 /*
3672 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3673 */
3674
3675 SKIP_BLANKS;
3676
3677 /*
3678 * Check for SystemID and ExternalID
3679 */
3680 URI = htmlParseExternalID(ctxt, &ExternalID);
3681 SKIP_BLANKS;
3682
3683 /*
3684 * We should be at the end of the DOCTYPE declaration.
3685 */
3686 if (CUR != '>') {
3687 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3688 "DOCTYPE improperly terminated\n", NULL, NULL);
3689 /* Ignore bogus content */
3690 while ((CUR != 0) && (CUR != '>'))
3691 NEXT;
3692 }
3693 if (CUR == '>')
3694 NEXT;
3695
3696 /*
3697 * Create or update the document accordingly to the DOCTYPE
3698 */
3699 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3700 (!ctxt->disableSAX))
3701 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3702
3703 /*
3704 * Cleanup, since we don't use all those identifiers
3705 */
3706 if (URI != NULL) xmlFree(URI);
3707 if (ExternalID != NULL) xmlFree(ExternalID);
3708 }
3709
3710 /**
3711 * htmlParseAttribute:
3712 * @ctxt: an HTML parser context
3713 * @value: a xmlChar ** used to store the value of the attribute
3714 *
3715 * parse an attribute
3716 *
3717 * [41] Attribute ::= Name Eq AttValue
3718 *
3719 * [25] Eq ::= S? '=' S?
3720 *
3721 * With namespace:
3722 *
3723 * [NS 11] Attribute ::= QName Eq AttValue
3724 *
3725 * Also the case QName == xmlns:??? is handled independently as a namespace
3726 * definition.
3727 *
3728 * Returns the attribute name, and the value in *value.
3729 */
3730
3731 static const xmlChar *
htmlParseAttribute(htmlParserCtxtPtr ctxt,xmlChar ** value)3732 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3733 const xmlChar *name;
3734 xmlChar *val = NULL;
3735
3736 *value = NULL;
3737 name = htmlParseHTMLName(ctxt);
3738 if (name == NULL) {
3739 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3740 "error parsing attribute name\n", NULL, NULL);
3741 return(NULL);
3742 }
3743
3744 /*
3745 * read the value
3746 */
3747 SKIP_BLANKS;
3748 if (CUR == '=') {
3749 NEXT;
3750 SKIP_BLANKS;
3751 val = htmlParseAttValue(ctxt);
3752 }
3753
3754 *value = val;
3755 return(name);
3756 }
3757
3758 /**
3759 * htmlCheckEncodingDirect:
3760 * @ctxt: an HTML parser context
3761 * @attvalue: the attribute value
3762 *
3763 * Checks an attribute value to detect
3764 * the encoding
3765 * If a new encoding is detected the parser is switched to decode
3766 * it and pass UTF8
3767 */
3768 static void
htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt,const xmlChar * encoding)3769 htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3770
3771 if ((ctxt == NULL) || (encoding == NULL) ||
3772 (ctxt->options & HTML_PARSE_IGNORE_ENC))
3773 return;
3774
3775 /* do not change encoding */
3776 if (ctxt->input->encoding != NULL)
3777 return;
3778
3779 if (encoding != NULL) {
3780 xmlCharEncoding enc;
3781 xmlCharEncodingHandlerPtr handler;
3782
3783 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3784
3785 if (ctxt->input->encoding != NULL)
3786 xmlFree((xmlChar *) ctxt->input->encoding);
3787 ctxt->input->encoding = xmlStrdup(encoding);
3788
3789 enc = xmlParseCharEncoding((const char *) encoding);
3790 /*
3791 * registered set of known encodings
3792 */
3793 if (enc != XML_CHAR_ENCODING_ERROR) {
3794 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3795 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3796 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3797 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3798 (ctxt->input->buf != NULL) &&
3799 (ctxt->input->buf->encoder == NULL)) {
3800 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3801 "htmlCheckEncoding: wrong encoding meta\n",
3802 NULL, NULL);
3803 } else {
3804 xmlSwitchEncoding(ctxt, enc);
3805 }
3806 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3807 } else {
3808 /*
3809 * fallback for unknown encodings
3810 */
3811 handler = xmlFindCharEncodingHandler((const char *) encoding);
3812 if (handler != NULL) {
3813 xmlSwitchToEncoding(ctxt, handler);
3814 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3815 } else {
3816 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3817 "htmlCheckEncoding: unknown encoding %s\n",
3818 encoding, NULL);
3819 }
3820 }
3821
3822 if ((ctxt->input->buf != NULL) &&
3823 (ctxt->input->buf->encoder != NULL) &&
3824 (ctxt->input->buf->raw != NULL) &&
3825 (ctxt->input->buf->buffer != NULL)) {
3826 int nbchars;
3827 int processed;
3828
3829 /*
3830 * convert as much as possible to the parser reading buffer.
3831 */
3832 processed = ctxt->input->cur - ctxt->input->base;
3833 xmlBufShrink(ctxt->input->buf->buffer, processed);
3834 nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3835 xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3836 if (nbchars < 0) {
3837 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3838 "htmlCheckEncoding: encoder error\n",
3839 NULL, NULL);
3840 }
3841 }
3842 }
3843 }
3844
3845 /**
3846 * htmlCheckEncoding:
3847 * @ctxt: an HTML parser context
3848 * @attvalue: the attribute value
3849 *
3850 * Checks an http-equiv attribute from a Meta tag to detect
3851 * the encoding
3852 * If a new encoding is detected the parser is switched to decode
3853 * it and pass UTF8
3854 */
3855 static void
htmlCheckEncoding(htmlParserCtxtPtr ctxt,const xmlChar * attvalue)3856 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3857 const xmlChar *encoding;
3858
3859 if (!attvalue)
3860 return;
3861
3862 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3863 if (encoding != NULL) {
3864 encoding += 7;
3865 }
3866 /*
3867 * skip blank
3868 */
3869 if (encoding && IS_BLANK_CH(*encoding))
3870 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3871 if (encoding && *encoding == '=') {
3872 encoding ++;
3873 htmlCheckEncodingDirect(ctxt, encoding);
3874 }
3875 }
3876
3877 /**
3878 * htmlCheckMeta:
3879 * @ctxt: an HTML parser context
3880 * @atts: the attributes values
3881 *
3882 * Checks an attributes from a Meta tag
3883 */
3884 static void
htmlCheckMeta(htmlParserCtxtPtr ctxt,const xmlChar ** atts)3885 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3886 int i;
3887 const xmlChar *att, *value;
3888 int http = 0;
3889 const xmlChar *content = NULL;
3890
3891 if ((ctxt == NULL) || (atts == NULL))
3892 return;
3893
3894 i = 0;
3895 att = atts[i++];
3896 while (att != NULL) {
3897 value = atts[i++];
3898 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3899 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3900 http = 1;
3901 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3902 htmlCheckEncodingDirect(ctxt, value);
3903 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3904 content = value;
3905 att = atts[i++];
3906 }
3907 if ((http) && (content != NULL))
3908 htmlCheckEncoding(ctxt, content);
3909
3910 }
3911
3912 /**
3913 * htmlParseStartTag:
3914 * @ctxt: an HTML parser context
3915 *
3916 * parse a start of tag either for rule element or
3917 * EmptyElement. In both case we don't parse the tag closing chars.
3918 *
3919 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3920 *
3921 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3922 *
3923 * With namespace:
3924 *
3925 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3926 *
3927 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3928 *
3929 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3930 */
3931
3932 static int
htmlParseStartTag(htmlParserCtxtPtr ctxt)3933 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3934 const xmlChar *name;
3935 const xmlChar *attname;
3936 xmlChar *attvalue;
3937 const xmlChar **atts;
3938 int nbatts = 0;
3939 int maxatts;
3940 int meta = 0;
3941 int i;
3942 int discardtag = 0;
3943
3944 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3945 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3946 "htmlParseStartTag: context error\n", NULL, NULL);
3947 return -1;
3948 }
3949 if (ctxt->instate == XML_PARSER_EOF)
3950 return(-1);
3951 if (CUR != '<') return -1;
3952 NEXT;
3953
3954 atts = ctxt->atts;
3955 maxatts = ctxt->maxatts;
3956
3957 GROW;
3958 name = htmlParseHTMLName(ctxt);
3959 if (name == NULL) {
3960 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3961 "htmlParseStartTag: invalid element name\n",
3962 NULL, NULL);
3963 /*
3964 * The recovery code is disabled for now as it can result in
3965 * quadratic behavior with the push parser. htmlParseStartTag
3966 * must consume all content up to the final '>' in order to avoid
3967 * rescanning for this terminator.
3968 *
3969 * For a proper fix in line with HTML5, htmlParseStartTag and
3970 * htmlParseElement should only be called when there's an ASCII
3971 * alpha character following the initial '<'. Otherwise, the '<'
3972 * should be emitted as text (unless followed by '!', '/' or '?').
3973 */
3974 #if 0
3975 /* if recover preserve text on classic misconstructs */
3976 if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
3977 (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
3978 htmlParseCharDataInternal(ctxt, '<');
3979 return(-1);
3980 }
3981 #endif
3982
3983 /* Dump the bogus tag like browsers do */
3984 while ((CUR != 0) && (CUR != '>') &&
3985 (ctxt->instate != XML_PARSER_EOF))
3986 NEXT;
3987 return -1;
3988 }
3989 if (xmlStrEqual(name, BAD_CAST"meta"))
3990 meta = 1;
3991
3992 /*
3993 * Check for auto-closure of HTML elements.
3994 */
3995 htmlAutoClose(ctxt, name);
3996
3997 /*
3998 * Check for implied HTML elements.
3999 */
4000 htmlCheckImplied(ctxt, name);
4001
4002 /*
4003 * Avoid html at any level > 0, head at any level != 1
4004 * or any attempt to recurse body
4005 */
4006 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
4007 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4008 "htmlParseStartTag: misplaced <html> tag\n",
4009 name, NULL);
4010 discardtag = 1;
4011 ctxt->depth++;
4012 }
4013 if ((ctxt->nameNr != 1) &&
4014 (xmlStrEqual(name, BAD_CAST"head"))) {
4015 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4016 "htmlParseStartTag: misplaced <head> tag\n",
4017 name, NULL);
4018 discardtag = 1;
4019 ctxt->depth++;
4020 }
4021 if (xmlStrEqual(name, BAD_CAST"body")) {
4022 int indx;
4023 for (indx = 0;indx < ctxt->nameNr;indx++) {
4024 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
4025 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4026 "htmlParseStartTag: misplaced <body> tag\n",
4027 name, NULL);
4028 discardtag = 1;
4029 ctxt->depth++;
4030 }
4031 }
4032 }
4033
4034 /*
4035 * Now parse the attributes, it ends up with the ending
4036 *
4037 * (S Attribute)* S?
4038 */
4039 SKIP_BLANKS;
4040 while ((CUR != 0) &&
4041 (CUR != '>') &&
4042 ((CUR != '/') || (NXT(1) != '>'))) {
4043 GROW;
4044 attname = htmlParseAttribute(ctxt, &attvalue);
4045 if (attname != NULL) {
4046
4047 /*
4048 * Well formedness requires at most one declaration of an attribute
4049 */
4050 for (i = 0; i < nbatts;i += 2) {
4051 if (xmlStrEqual(atts[i], attname)) {
4052 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
4053 "Attribute %s redefined\n", attname, NULL);
4054 if (attvalue != NULL)
4055 xmlFree(attvalue);
4056 goto failed;
4057 }
4058 }
4059
4060 /*
4061 * Add the pair to atts
4062 */
4063 if (atts == NULL) {
4064 maxatts = 22; /* allow for 10 attrs by default */
4065 atts = (const xmlChar **)
4066 xmlMalloc(maxatts * sizeof(xmlChar *));
4067 if (atts == NULL) {
4068 htmlErrMemory(ctxt, NULL);
4069 if (attvalue != NULL)
4070 xmlFree(attvalue);
4071 goto failed;
4072 }
4073 ctxt->atts = atts;
4074 ctxt->maxatts = maxatts;
4075 } else if (nbatts + 4 > maxatts) {
4076 const xmlChar **n;
4077
4078 maxatts *= 2;
4079 n = (const xmlChar **) xmlRealloc((void *) atts,
4080 maxatts * sizeof(const xmlChar *));
4081 if (n == NULL) {
4082 htmlErrMemory(ctxt, NULL);
4083 if (attvalue != NULL)
4084 xmlFree(attvalue);
4085 goto failed;
4086 }
4087 atts = n;
4088 ctxt->atts = atts;
4089 ctxt->maxatts = maxatts;
4090 }
4091 atts[nbatts++] = attname;
4092 atts[nbatts++] = attvalue;
4093 atts[nbatts] = NULL;
4094 atts[nbatts + 1] = NULL;
4095 }
4096 else {
4097 if (attvalue != NULL)
4098 xmlFree(attvalue);
4099 /* Dump the bogus attribute string up to the next blank or
4100 * the end of the tag. */
4101 while ((CUR != 0) &&
4102 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
4103 ((CUR != '/') || (NXT(1) != '>')))
4104 NEXT;
4105 }
4106
4107 failed:
4108 SKIP_BLANKS;
4109 }
4110
4111 /*
4112 * Handle specific association to the META tag
4113 */
4114 if (meta && (nbatts != 0))
4115 htmlCheckMeta(ctxt, atts);
4116
4117 /*
4118 * SAX: Start of Element !
4119 */
4120 if (!discardtag) {
4121 htmlnamePush(ctxt, name);
4122 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4123 if (nbatts != 0)
4124 ctxt->sax->startElement(ctxt->userData, name, atts);
4125 else
4126 ctxt->sax->startElement(ctxt->userData, name, NULL);
4127 }
4128 }
4129
4130 if (atts != NULL) {
4131 for (i = 1;i < nbatts;i += 2) {
4132 if (atts[i] != NULL)
4133 xmlFree((xmlChar *) atts[i]);
4134 }
4135 }
4136
4137 return(discardtag);
4138 }
4139
4140 /**
4141 * htmlParseEndTag:
4142 * @ctxt: an HTML parser context
4143 *
4144 * parse an end of tag
4145 *
4146 * [42] ETag ::= '</' Name S? '>'
4147 *
4148 * With namespace
4149 *
4150 * [NS 9] ETag ::= '</' QName S? '>'
4151 *
4152 * Returns 1 if the current level should be closed.
4153 */
4154
4155 static int
htmlParseEndTag(htmlParserCtxtPtr ctxt)4156 htmlParseEndTag(htmlParserCtxtPtr ctxt)
4157 {
4158 const xmlChar *name;
4159 const xmlChar *oldname;
4160 int i, ret;
4161
4162 if ((CUR != '<') || (NXT(1) != '/')) {
4163 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
4164 "htmlParseEndTag: '</' not found\n", NULL, NULL);
4165 return (0);
4166 }
4167 SKIP(2);
4168
4169 name = htmlParseHTMLName(ctxt);
4170 if (name == NULL)
4171 return (0);
4172 /*
4173 * We should definitely be at the ending "S? '>'" part
4174 */
4175 SKIP_BLANKS;
4176 if (CUR != '>') {
4177 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4178 "End tag : expected '>'\n", NULL, NULL);
4179 /* Skip to next '>' */
4180 while ((CUR != 0) && (CUR != '>'))
4181 NEXT;
4182 }
4183 if (CUR == '>')
4184 NEXT;
4185
4186 /*
4187 * if we ignored misplaced tags in htmlParseStartTag don't pop them
4188 * out now.
4189 */
4190 if ((ctxt->depth > 0) &&
4191 (xmlStrEqual(name, BAD_CAST "html") ||
4192 xmlStrEqual(name, BAD_CAST "body") ||
4193 xmlStrEqual(name, BAD_CAST "head"))) {
4194 ctxt->depth--;
4195 return (0);
4196 }
4197
4198 /*
4199 * If the name read is not one of the element in the parsing stack
4200 * then return, it's just an error.
4201 */
4202 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4203 if (xmlStrEqual(name, ctxt->nameTab[i]))
4204 break;
4205 }
4206 if (i < 0) {
4207 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4208 "Unexpected end tag : %s\n", name, NULL);
4209 return (0);
4210 }
4211
4212
4213 /*
4214 * Check for auto-closure of HTML elements.
4215 */
4216
4217 htmlAutoCloseOnClose(ctxt, name);
4218
4219 /*
4220 * Well formedness constraints, opening and closing must match.
4221 * With the exception that the autoclose may have popped stuff out
4222 * of the stack.
4223 */
4224 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4225 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4226 "Opening and ending tag mismatch: %s and %s\n",
4227 name, ctxt->name);
4228 }
4229
4230 /*
4231 * SAX: End of Tag
4232 */
4233 oldname = ctxt->name;
4234 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4235 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4236 ctxt->sax->endElement(ctxt->userData, name);
4237 htmlNodeInfoPop(ctxt);
4238 htmlnamePop(ctxt);
4239 ret = 1;
4240 } else {
4241 ret = 0;
4242 }
4243
4244 return (ret);
4245 }
4246
4247
4248 /**
4249 * htmlParseReference:
4250 * @ctxt: an HTML parser context
4251 *
4252 * parse and handle entity references in content,
4253 * this will end-up in a call to character() since this is either a
4254 * CharRef, or a predefined entity.
4255 */
4256 static void
htmlParseReference(htmlParserCtxtPtr ctxt)4257 htmlParseReference(htmlParserCtxtPtr ctxt) {
4258 const htmlEntityDesc * ent;
4259 xmlChar out[6];
4260 const xmlChar *name;
4261 if (CUR != '&') return;
4262
4263 if (NXT(1) == '#') {
4264 unsigned int c;
4265 int bits, i = 0;
4266
4267 c = htmlParseCharRef(ctxt);
4268 if (c == 0)
4269 return;
4270
4271 if (c < 0x80) { out[i++]= c; bits= -6; }
4272 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4273 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4274 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4275
4276 for ( ; bits >= 0; bits-= 6) {
4277 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4278 }
4279 out[i] = 0;
4280
4281 htmlCheckParagraph(ctxt);
4282 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4283 ctxt->sax->characters(ctxt->userData, out, i);
4284 } else {
4285 ent = htmlParseEntityRef(ctxt, &name);
4286 if (name == NULL) {
4287 htmlCheckParagraph(ctxt);
4288 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4289 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4290 return;
4291 }
4292 if ((ent == NULL) || !(ent->value > 0)) {
4293 htmlCheckParagraph(ctxt);
4294 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4295 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4296 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4297 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4298 }
4299 } else {
4300 unsigned int c;
4301 int bits, i = 0;
4302
4303 c = ent->value;
4304 if (c < 0x80)
4305 { out[i++]= c; bits= -6; }
4306 else if (c < 0x800)
4307 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4308 else if (c < 0x10000)
4309 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4310 else
4311 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4312
4313 for ( ; bits >= 0; bits-= 6) {
4314 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4315 }
4316 out[i] = 0;
4317
4318 htmlCheckParagraph(ctxt);
4319 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4320 ctxt->sax->characters(ctxt->userData, out, i);
4321 }
4322 }
4323 }
4324
/**
 * htmlParseContent:
 * @ctxt:  an HTML parser context
 *
 * Parse a content: comment, sub-element, reference or text.
 * Kept for compatibility with old code
 *
 * The name of the element open on entry is remembered, and the function
 * returns as soon as an end tag closes a level, a start tag name cannot
 * be read, or that element is found to have been popped from the name
 * stack. Sub-elements are parsed through htmlParseElement().
 */

static void
htmlParseContent(htmlParserCtxtPtr ctxt) {
    xmlChar *currentNode;
    int depth;
    const xmlChar *name;

    /* private copy of the name of the element whose content is parsed */
    currentNode = xmlStrdup(ctxt->name);
    depth = ctxt->nameNr;
    while (1) {
        GROW;

        if (ctxt->instate == XML_PARSER_EOF)
            break;

        /*
         * Our tag or one of its parents or children is ending.
         */
        if ((CUR == '<') && (NXT(1) == '/')) {
            if (htmlParseEndTag(ctxt) &&
                ((currentNode != NULL) || (ctxt->nameNr == 0))) {
                if (currentNode != NULL)
                    xmlFree(currentNode);
                return;
            }
            continue; /* while */
        }

        /*
         * A start tag is coming: look at its name first, so that the
         * current element can be auto-closed before going further.
         * NOTE(review): htmlParseHTMLName_nonInvasive presumably reads
         * the name without consuming input - confirm.
         */
        else if ((CUR == '<') &&
                 ((IS_ASCII_LETTER(NXT(1))) ||
                  (NXT(1) == '_') || (NXT(1) == ':'))) {
            name = htmlParseHTMLName_nonInvasive(ctxt);
            if (name == NULL) {
                htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
                         "htmlParseStartTag: invalid element name\n",
                         NULL, NULL);
                /* Dump the bogus tag like browsers do */
                while ((CUR != 0) && (CUR != '>'))
                    NEXT;

                if (currentNode != NULL)
                    xmlFree(currentNode);
                return;
            }

            if (ctxt->name != NULL) {
                if (htmlCheckAutoClose(name, ctxt->name) == 1) {
                    htmlAutoClose(ctxt, name);
                    continue;
                }
            }
        }

        /*
         * Has this node been popped out during parsing of
         * the next element
         */
        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
            (!xmlStrEqual(currentNode, ctxt->name)))
        {
            if (currentNode != NULL) xmlFree(currentNode);
            return;
        }

        if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
            (xmlStrEqual(currentNode, BAD_CAST"style")))) {
            /*
             * Handle SCRIPT/STYLE separately
             */
            htmlParseScript(ctxt);
        } else {
            /*
             * Sometimes DOCTYPE arrives in the middle of the document.
             * It is reported and parsed, then the dispatch below goes
             * on with whatever follows it.
             */
            if ((CUR == '<') && (NXT(1) == '!') &&
                (UPP(2) == 'D') && (UPP(3) == 'O') &&
                (UPP(4) == 'C') && (UPP(5) == 'T') &&
                (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                (UPP(8) == 'E')) {
                htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                             "Misplaced DOCTYPE declaration\n",
                             BAD_CAST "DOCTYPE" , NULL);
                htmlParseDocTypeDecl(ctxt);
            }

            /*
             * First case : a comment
             */
            if ((CUR == '<') && (NXT(1) == '!') &&
                (NXT(2) == '-') && (NXT(3) == '-')) {
                htmlParseComment(ctxt);
            }

            /*
             * Second case : a Processing Instruction.
             */
            else if ((CUR == '<') && (NXT(1) == '?')) {
                htmlParsePI(ctxt);
            }

            /*
             * Third case : a sub-element.
             */
            else if (CUR == '<') {
                htmlParseElement(ctxt);
            }

            /*
             * Fourth case : a reference. If it has not been resolved,
             * parsing returns its Name, create the node
             */
            else if (CUR == '&') {
                htmlParseReference(ctxt);
            }

            /*
             * Fifth case : end of the resource
             */
            else if (CUR == 0) {
                htmlAutoCloseOnEnd(ctxt);
                break;
            }

            /*
             * Last case, text. Note that References are handled directly.
             */
            else {
                htmlParseCharData(ctxt);
            }
        }
        GROW;
    }
    if (currentNode != NULL) xmlFree(currentNode);
}
4466
4467 /**
4468 * htmlParseElement:
4469 * @ctxt: an HTML parser context
4470 *
4471 * parse an HTML element, this is highly recursive
4472 * this is kept for compatibility with previous code versions
4473 *
4474 * [39] element ::= EmptyElemTag | STag content ETag
4475 *
4476 * [41] Attribute ::= Name Eq AttValue
4477 */
4478
4479 void
htmlParseElement(htmlParserCtxtPtr ctxt)4480 htmlParseElement(htmlParserCtxtPtr ctxt) {
4481 const xmlChar *name;
4482 xmlChar *currentNode = NULL;
4483 const htmlElemDesc * info;
4484 htmlParserNodeInfo node_info;
4485 int failed;
4486 int depth;
4487 const xmlChar *oldptr;
4488
4489 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4490 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4491 "htmlParseElement: context error\n", NULL, NULL);
4492 return;
4493 }
4494
4495 if (ctxt->instate == XML_PARSER_EOF)
4496 return;
4497
4498 /* Capture start position */
4499 if (ctxt->record_info) {
4500 node_info.begin_pos = ctxt->input->consumed +
4501 (CUR_PTR - ctxt->input->base);
4502 node_info.begin_line = ctxt->input->line;
4503 }
4504
4505 failed = htmlParseStartTag(ctxt);
4506 name = ctxt->name;
4507 if ((failed == -1) || (name == NULL)) {
4508 if (CUR == '>')
4509 NEXT;
4510 return;
4511 }
4512
4513 /*
4514 * Lookup the info for that element.
4515 */
4516 info = htmlTagLookup(name);
4517 if (info == NULL) {
4518 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4519 "Tag %s invalid\n", name, NULL);
4520 }
4521
4522 /*
4523 * Check for an Empty Element labeled the XML/SGML way
4524 */
4525 if ((CUR == '/') && (NXT(1) == '>')) {
4526 SKIP(2);
4527 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4528 ctxt->sax->endElement(ctxt->userData, name);
4529 htmlnamePop(ctxt);
4530 return;
4531 }
4532
4533 if (CUR == '>') {
4534 NEXT;
4535 } else {
4536 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4537 "Couldn't find end of Start Tag %s\n", name, NULL);
4538
4539 /*
4540 * end of parsing of this node.
4541 */
4542 if (xmlStrEqual(name, ctxt->name)) {
4543 nodePop(ctxt);
4544 htmlnamePop(ctxt);
4545 }
4546
4547 /*
4548 * Capture end position and add node
4549 */
4550 if (ctxt->record_info) {
4551 node_info.end_pos = ctxt->input->consumed +
4552 (CUR_PTR - ctxt->input->base);
4553 node_info.end_line = ctxt->input->line;
4554 node_info.node = ctxt->node;
4555 xmlParserAddNodeInfo(ctxt, &node_info);
4556 }
4557 return;
4558 }
4559
4560 /*
4561 * Check for an Empty Element from DTD definition
4562 */
4563 if ((info != NULL) && (info->empty)) {
4564 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4565 ctxt->sax->endElement(ctxt->userData, name);
4566 htmlnamePop(ctxt);
4567 return;
4568 }
4569
4570 /*
4571 * Parse the content of the element:
4572 */
4573 currentNode = xmlStrdup(ctxt->name);
4574 depth = ctxt->nameNr;
4575 while (CUR != 0) {
4576 oldptr = ctxt->input->cur;
4577 htmlParseContent(ctxt);
4578 if (oldptr==ctxt->input->cur) break;
4579 if (ctxt->nameNr < depth) break;
4580 }
4581
4582 /*
4583 * Capture end position and add node
4584 */
4585 if ( currentNode != NULL && ctxt->record_info ) {
4586 node_info.end_pos = ctxt->input->consumed +
4587 (CUR_PTR - ctxt->input->base);
4588 node_info.end_line = ctxt->input->line;
4589 node_info.node = ctxt->node;
4590 xmlParserAddNodeInfo(ctxt, &node_info);
4591 }
4592 if (CUR == 0) {
4593 htmlAutoCloseOnEnd(ctxt);
4594 }
4595
4596 if (currentNode != NULL)
4597 xmlFree(currentNode);
4598 }
4599
4600 static void
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt)4601 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4602 /*
4603 * Capture end position and add node
4604 */
4605 if ( ctxt->node != NULL && ctxt->record_info ) {
4606 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4607 (CUR_PTR - ctxt->input->base);
4608 ctxt->nodeInfo->end_line = ctxt->input->line;
4609 ctxt->nodeInfo->node = ctxt->node;
4610 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4611 htmlNodeInfoPop(ctxt);
4612 }
4613 if (CUR == 0) {
4614 htmlAutoCloseOnEnd(ctxt);
4615 }
4616 }
4617
4618 /**
4619 * htmlParseElementInternal:
4620 * @ctxt: an HTML parser context
4621 *
4622 * parse an HTML element, new version, non recursive
4623 *
4624 * [39] element ::= EmptyElemTag | STag content ETag
4625 *
4626 * [41] Attribute ::= Name Eq AttValue
4627 */
4628
4629 static void
htmlParseElementInternal(htmlParserCtxtPtr ctxt)4630 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4631 const xmlChar *name;
4632 const htmlElemDesc * info;
4633 htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4634 int failed;
4635
4636 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4637 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4638 "htmlParseElementInternal: context error\n", NULL, NULL);
4639 return;
4640 }
4641
4642 if (ctxt->instate == XML_PARSER_EOF)
4643 return;
4644
4645 /* Capture start position */
4646 if (ctxt->record_info) {
4647 node_info.begin_pos = ctxt->input->consumed +
4648 (CUR_PTR - ctxt->input->base);
4649 node_info.begin_line = ctxt->input->line;
4650 }
4651
4652 failed = htmlParseStartTag(ctxt);
4653 name = ctxt->name;
4654 if ((failed == -1) || (name == NULL)) {
4655 if (CUR == '>')
4656 NEXT;
4657 return;
4658 }
4659
4660 /*
4661 * Lookup the info for that element.
4662 */
4663 info = htmlTagLookup(name);
4664 if (info == NULL) {
4665 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4666 "Tag %s invalid\n", name, NULL);
4667 }
4668
4669 /*
4670 * Check for an Empty Element labeled the XML/SGML way
4671 */
4672 if ((CUR == '/') && (NXT(1) == '>')) {
4673 SKIP(2);
4674 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4675 ctxt->sax->endElement(ctxt->userData, name);
4676 htmlnamePop(ctxt);
4677 return;
4678 }
4679
4680 if (CUR == '>') {
4681 NEXT;
4682 } else {
4683 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4684 "Couldn't find end of Start Tag %s\n", name, NULL);
4685
4686 /*
4687 * end of parsing of this node.
4688 */
4689 if (xmlStrEqual(name, ctxt->name)) {
4690 nodePop(ctxt);
4691 htmlnamePop(ctxt);
4692 }
4693
4694 if (ctxt->record_info)
4695 htmlNodeInfoPush(ctxt, &node_info);
4696 htmlParserFinishElementParsing(ctxt);
4697 return;
4698 }
4699
4700 /*
4701 * Check for an Empty Element from DTD definition
4702 */
4703 if ((info != NULL) && (info->empty)) {
4704 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4705 ctxt->sax->endElement(ctxt->userData, name);
4706 htmlnamePop(ctxt);
4707 return;
4708 }
4709
4710 if (ctxt->record_info)
4711 htmlNodeInfoPush(ctxt, &node_info);
4712 }
4713
4714 /**
4715 * htmlParseContentInternal:
4716 * @ctxt: an HTML parser context
4717 *
4718 * Parse a content: comment, sub-element, reference or text.
4719 * New version for non recursive htmlParseElementInternal
4720 */
4721
4722 static void
htmlParseContentInternal(htmlParserCtxtPtr ctxt)4723 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4724 xmlChar *currentNode;
4725 int depth;
4726 const xmlChar *name;
4727
4728 currentNode = xmlStrdup(ctxt->name);
4729 depth = ctxt->nameNr;
4730 while (1) {
4731 GROW;
4732
4733 if (ctxt->instate == XML_PARSER_EOF)
4734 break;
4735
4736 /*
4737 * Our tag or one of it's parent or children is ending.
4738 */
4739 if ((CUR == '<') && (NXT(1) == '/')) {
4740 if (htmlParseEndTag(ctxt) &&
4741 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4742 if (currentNode != NULL)
4743 xmlFree(currentNode);
4744
4745 currentNode = xmlStrdup(ctxt->name);
4746 depth = ctxt->nameNr;
4747 }
4748 continue; /* while */
4749 }
4750
4751 else if ((CUR == '<') &&
4752 ((IS_ASCII_LETTER(NXT(1))) ||
4753 (NXT(1) == '_') || (NXT(1) == ':'))) {
4754 name = htmlParseHTMLName_nonInvasive(ctxt);
4755 if (name == NULL) {
4756 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4757 "htmlParseStartTag: invalid element name\n",
4758 NULL, NULL);
4759 /* Dump the bogus tag like browsers do */
4760 while ((CUR == 0) && (CUR != '>'))
4761 NEXT;
4762
4763 htmlParserFinishElementParsing(ctxt);
4764 if (currentNode != NULL)
4765 xmlFree(currentNode);
4766
4767 currentNode = xmlStrdup(ctxt->name);
4768 depth = ctxt->nameNr;
4769 continue;
4770 }
4771
4772 if (ctxt->name != NULL) {
4773 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4774 htmlAutoClose(ctxt, name);
4775 continue;
4776 }
4777 }
4778 }
4779
4780 /*
4781 * Has this node been popped out during parsing of
4782 * the next element
4783 */
4784 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4785 (!xmlStrEqual(currentNode, ctxt->name)))
4786 {
4787 htmlParserFinishElementParsing(ctxt);
4788 if (currentNode != NULL) xmlFree(currentNode);
4789
4790 currentNode = xmlStrdup(ctxt->name);
4791 depth = ctxt->nameNr;
4792 continue;
4793 }
4794
4795 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4796 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4797 /*
4798 * Handle SCRIPT/STYLE separately
4799 */
4800 htmlParseScript(ctxt);
4801 } else {
4802 /*
4803 * Sometimes DOCTYPE arrives in the middle of the document
4804 */
4805 if ((CUR == '<') && (NXT(1) == '!') &&
4806 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4807 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4808 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4809 (UPP(8) == 'E')) {
4810 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4811 "Misplaced DOCTYPE declaration\n",
4812 BAD_CAST "DOCTYPE" , NULL);
4813 htmlParseDocTypeDecl(ctxt);
4814 }
4815
4816 /*
4817 * First case : a comment
4818 */
4819 if ((CUR == '<') && (NXT(1) == '!') &&
4820 (NXT(2) == '-') && (NXT(3) == '-')) {
4821 htmlParseComment(ctxt);
4822 }
4823
4824 /*
4825 * Second case : a Processing Instruction.
4826 */
4827 else if ((CUR == '<') && (NXT(1) == '?')) {
4828 htmlParsePI(ctxt);
4829 }
4830
4831 /*
4832 * Third case : a sub-element.
4833 */
4834 else if (CUR == '<') {
4835 htmlParseElementInternal(ctxt);
4836 if (currentNode != NULL) xmlFree(currentNode);
4837
4838 currentNode = xmlStrdup(ctxt->name);
4839 depth = ctxt->nameNr;
4840 }
4841
4842 /*
4843 * Fourth case : a reference. If if has not been resolved,
4844 * parsing returns it's Name, create the node
4845 */
4846 else if (CUR == '&') {
4847 htmlParseReference(ctxt);
4848 }
4849
4850 /*
4851 * Fifth case : end of the resource
4852 */
4853 else if (CUR == 0) {
4854 htmlAutoCloseOnEnd(ctxt);
4855 break;
4856 }
4857
4858 /*
4859 * Last case, text. Note that References are handled directly.
4860 */
4861 else {
4862 htmlParseCharData(ctxt);
4863 }
4864 }
4865 GROW;
4866 }
4867 if (currentNode != NULL) xmlFree(currentNode);
4868 }
4869
4870 /**
4871 * htmlParseContent:
4872 * @ctxt: an HTML parser context
4873 *
4874 * Parse a content: comment, sub-element, reference or text.
4875 * This is the entry point when called from parser.c
4876 */
4877
4878 void
__htmlParseContent(void * ctxt)4879 __htmlParseContent(void *ctxt) {
4880 if (ctxt != NULL)
4881 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4882 }
4883
4884 /**
4885 * htmlParseDocument:
4886 * @ctxt: an HTML parser context
4887 *
4888 * parse an HTML document (and build a tree if using the standard SAX
4889 * interface).
4890 *
4891 * Returns 0, -1 in case of error. the parser context is augmented
4892 * as a result of the parsing.
4893 */
4894
4895 int
htmlParseDocument(htmlParserCtxtPtr ctxt)4896 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4897 xmlChar start[4];
4898 xmlCharEncoding enc;
4899 xmlDtdPtr dtd;
4900
4901 xmlInitParser();
4902
4903 htmlDefaultSAXHandlerInit();
4904
4905 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4906 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4907 "htmlParseDocument: context error\n", NULL, NULL);
4908 return(XML_ERR_INTERNAL_ERROR);
4909 }
4910 ctxt->html = 1;
4911 ctxt->linenumbers = 1;
4912 GROW;
4913 /*
4914 * SAX: beginning of the document processing.
4915 */
4916 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4917 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4918
4919 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4920 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4921 /*
4922 * Get the 4 first bytes and decode the charset
4923 * if enc != XML_CHAR_ENCODING_NONE
4924 * plug some encoding conversion routines.
4925 */
4926 start[0] = RAW;
4927 start[1] = NXT(1);
4928 start[2] = NXT(2);
4929 start[3] = NXT(3);
4930 enc = xmlDetectCharEncoding(&start[0], 4);
4931 if (enc != XML_CHAR_ENCODING_NONE) {
4932 xmlSwitchEncoding(ctxt, enc);
4933 }
4934 }
4935
4936 /*
4937 * Wipe out everything which is before the first '<'
4938 */
4939 SKIP_BLANKS;
4940 if (CUR == 0) {
4941 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4942 "Document is empty\n", NULL, NULL);
4943 }
4944
4945 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4946 ctxt->sax->startDocument(ctxt->userData);
4947
4948
4949 /*
4950 * Parse possible comments and PIs before any content
4951 */
4952 while (((CUR == '<') && (NXT(1) == '!') &&
4953 (NXT(2) == '-') && (NXT(3) == '-')) ||
4954 ((CUR == '<') && (NXT(1) == '?'))) {
4955 htmlParseComment(ctxt);
4956 htmlParsePI(ctxt);
4957 SKIP_BLANKS;
4958 }
4959
4960
4961 /*
4962 * Then possibly doc type declaration(s) and more Misc
4963 * (doctypedecl Misc*)?
4964 */
4965 if ((CUR == '<') && (NXT(1) == '!') &&
4966 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4967 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4968 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4969 (UPP(8) == 'E')) {
4970 htmlParseDocTypeDecl(ctxt);
4971 }
4972 SKIP_BLANKS;
4973
4974 /*
4975 * Parse possible comments and PIs before any content
4976 */
4977 while (((CUR == '<') && (NXT(1) == '!') &&
4978 (NXT(2) == '-') && (NXT(3) == '-')) ||
4979 ((CUR == '<') && (NXT(1) == '?'))) {
4980 htmlParseComment(ctxt);
4981 htmlParsePI(ctxt);
4982 SKIP_BLANKS;
4983 }
4984
4985 /*
4986 * Time to start parsing the tree itself
4987 */
4988 htmlParseContentInternal(ctxt);
4989
4990 /*
4991 * autoclose
4992 */
4993 if (CUR == 0)
4994 htmlAutoCloseOnEnd(ctxt);
4995
4996
4997 /*
4998 * SAX: end of the document processing.
4999 */
5000 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5001 ctxt->sax->endDocument(ctxt->userData);
5002
5003 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
5004 dtd = xmlGetIntSubset(ctxt->myDoc);
5005 if (dtd == NULL)
5006 ctxt->myDoc->intSubset =
5007 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5008 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5009 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5010 }
5011 if (! ctxt->wellFormed) return(-1);
5012 return(0);
5013 }
5014
5015
5016 /************************************************************************
5017 * *
5018 * Parser contexts handling *
5019 * *
5020 ************************************************************************/
5021
5022 /**
5023 * htmlInitParserCtxt:
5024 * @ctxt: an HTML parser context
5025 *
5026 * Initialize a parser context
5027 *
5028 * Returns 0 in case of success and -1 in case of error
5029 */
5030
5031 static int
htmlInitParserCtxt(htmlParserCtxtPtr ctxt)5032 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
5033 {
5034 htmlSAXHandler *sax;
5035
5036 if (ctxt == NULL) return(-1);
5037 memset(ctxt, 0, sizeof(htmlParserCtxt));
5038
5039 ctxt->dict = xmlDictCreate();
5040 if (ctxt->dict == NULL) {
5041 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5042 return(-1);
5043 }
5044 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
5045 if (sax == NULL) {
5046 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5047 return(-1);
5048 }
5049 else
5050 memset(sax, 0, sizeof(htmlSAXHandler));
5051
5052 /* Allocate the Input stack */
5053 ctxt->inputTab = (htmlParserInputPtr *)
5054 xmlMalloc(5 * sizeof(htmlParserInputPtr));
5055 if (ctxt->inputTab == NULL) {
5056 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5057 ctxt->inputNr = 0;
5058 ctxt->inputMax = 0;
5059 ctxt->input = NULL;
5060 return(-1);
5061 }
5062 ctxt->inputNr = 0;
5063 ctxt->inputMax = 5;
5064 ctxt->input = NULL;
5065 ctxt->version = NULL;
5066 ctxt->encoding = NULL;
5067 ctxt->standalone = -1;
5068 ctxt->instate = XML_PARSER_START;
5069
5070 /* Allocate the Node stack */
5071 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
5072 if (ctxt->nodeTab == NULL) {
5073 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5074 ctxt->nodeNr = 0;
5075 ctxt->nodeMax = 0;
5076 ctxt->node = NULL;
5077 ctxt->inputNr = 0;
5078 ctxt->inputMax = 0;
5079 ctxt->input = NULL;
5080 return(-1);
5081 }
5082 ctxt->nodeNr = 0;
5083 ctxt->nodeMax = 10;
5084 ctxt->node = NULL;
5085
5086 /* Allocate the Name stack */
5087 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
5088 if (ctxt->nameTab == NULL) {
5089 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5090 ctxt->nameNr = 0;
5091 ctxt->nameMax = 0;
5092 ctxt->name = NULL;
5093 ctxt->nodeNr = 0;
5094 ctxt->nodeMax = 0;
5095 ctxt->node = NULL;
5096 ctxt->inputNr = 0;
5097 ctxt->inputMax = 0;
5098 ctxt->input = NULL;
5099 return(-1);
5100 }
5101 ctxt->nameNr = 0;
5102 ctxt->nameMax = 10;
5103 ctxt->name = NULL;
5104
5105 ctxt->nodeInfoTab = NULL;
5106 ctxt->nodeInfoNr = 0;
5107 ctxt->nodeInfoMax = 0;
5108
5109 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
5110 else {
5111 ctxt->sax = sax;
5112 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
5113 }
5114 ctxt->userData = ctxt;
5115 ctxt->myDoc = NULL;
5116 ctxt->wellFormed = 1;
5117 ctxt->replaceEntities = 0;
5118 ctxt->linenumbers = xmlLineNumbersDefaultValue;
5119 ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
5120 ctxt->html = 1;
5121 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
5122 ctxt->vctxt.userData = ctxt;
5123 ctxt->vctxt.error = xmlParserValidityError;
5124 ctxt->vctxt.warning = xmlParserValidityWarning;
5125 ctxt->record_info = 0;
5126 ctxt->validate = 0;
5127 ctxt->checkIndex = 0;
5128 ctxt->catalogs = NULL;
5129 xmlInitNodeInfoSeq(&ctxt->node_seq);
5130 return(0);
5131 }
5132
5133 /**
5134 * htmlFreeParserCtxt:
5135 * @ctxt: an HTML parser context
5136 *
5137 * Free all the memory used by a parser context. However the parsed
5138 * document in ctxt->myDoc is not freed.
5139 */
5140
5141 void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)5142 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
5143 {
5144 xmlFreeParserCtxt(ctxt);
5145 }
5146
5147 /**
5148 * htmlNewParserCtxt:
5149 *
5150 * Allocate and initialize a new parser context.
5151 *
5152 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5153 */
5154
5155 htmlParserCtxtPtr
htmlNewParserCtxt(void)5156 htmlNewParserCtxt(void)
5157 {
5158 xmlParserCtxtPtr ctxt;
5159
5160 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5161 if (ctxt == NULL) {
5162 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5163 return(NULL);
5164 }
5165 memset(ctxt, 0, sizeof(xmlParserCtxt));
5166 if (htmlInitParserCtxt(ctxt) < 0) {
5167 htmlFreeParserCtxt(ctxt);
5168 return(NULL);
5169 }
5170 return(ctxt);
5171 }
5172
5173 /**
5174 * htmlCreateMemoryParserCtxt:
5175 * @buffer: a pointer to a char array
5176 * @size: the size of the array
5177 *
5178 * Create a parser context for an HTML in-memory document.
5179 *
5180 * Returns the new parser context or NULL
5181 */
5182 htmlParserCtxtPtr
htmlCreateMemoryParserCtxt(const char * buffer,int size)5183 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5184 xmlParserCtxtPtr ctxt;
5185 xmlParserInputPtr input;
5186 xmlParserInputBufferPtr buf;
5187
5188 if (buffer == NULL)
5189 return(NULL);
5190 if (size <= 0)
5191 return(NULL);
5192
5193 ctxt = htmlNewParserCtxt();
5194 if (ctxt == NULL)
5195 return(NULL);
5196
5197 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5198 if (buf == NULL) return(NULL);
5199
5200 input = xmlNewInputStream(ctxt);
5201 if (input == NULL) {
5202 xmlFreeParserInputBuffer(buf);
5203 xmlFreeParserCtxt(ctxt);
5204 return(NULL);
5205 }
5206
5207 input->filename = NULL;
5208 input->buf = buf;
5209 xmlBufResetInput(buf->buffer, input);
5210
5211 inputPush(ctxt, input);
5212 return(ctxt);
5213 }
5214
5215 /**
5216 * htmlCreateDocParserCtxt:
5217 * @cur: a pointer to an array of xmlChar
5218 * @encoding: a free form C string describing the HTML document encoding, or NULL
5219 *
5220 * Create a parser context for an HTML document.
5221 *
5222 * TODO: check the need to add encoding handling there
5223 *
5224 * Returns the new parser context or NULL
5225 */
5226 static htmlParserCtxtPtr
htmlCreateDocParserCtxt(const xmlChar * cur,const char * encoding)5227 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5228 int len;
5229 htmlParserCtxtPtr ctxt;
5230
5231 if (cur == NULL)
5232 return(NULL);
5233 len = xmlStrlen(cur);
5234 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5235 if (ctxt == NULL)
5236 return(NULL);
5237
5238 if (encoding != NULL) {
5239 xmlCharEncoding enc;
5240 xmlCharEncodingHandlerPtr handler;
5241
5242 if (ctxt->input->encoding != NULL)
5243 xmlFree((xmlChar *) ctxt->input->encoding);
5244 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5245
5246 enc = xmlParseCharEncoding(encoding);
5247 /*
5248 * registered set of known encodings
5249 */
5250 if (enc != XML_CHAR_ENCODING_ERROR) {
5251 xmlSwitchEncoding(ctxt, enc);
5252 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5253 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5254 "Unsupported encoding %s\n",
5255 (const xmlChar *) encoding, NULL);
5256 }
5257 } else {
5258 /*
5259 * fallback for unknown encodings
5260 */
5261 handler = xmlFindCharEncodingHandler((const char *) encoding);
5262 if (handler != NULL) {
5263 xmlSwitchToEncoding(ctxt, handler);
5264 } else {
5265 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5266 "Unsupported encoding %s\n",
5267 (const xmlChar *) encoding, NULL);
5268 }
5269 }
5270 }
5271 return(ctxt);
5272 }
5273
5274 #ifdef LIBXML_PUSH_ENABLED
5275 /************************************************************************
5276 * *
5277 * Progressive parsing interfaces *
5278 * *
5279 ************************************************************************/
5280
5281 /**
5282 * htmlParseLookupSequence:
5283 * @ctxt: an HTML parser context
5284 * @first: the first char to lookup
5285 * @next: the next char to lookup or zero
5286 * @third: the next char to lookup or zero
5287 * @ignoreattrval: skip over attribute values
5288 *
5289 * Try to find if a sequence (first, next, third) or just (first next) or
5290 * (first) is available in the input stream.
5291 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5292 * to avoid rescanning sequences of bytes, it DOES change the state of the
5293 * parser, do not use liberally.
5294 * This is basically similar to xmlParseLookupSequence()
5295 *
5296 * Returns the index to the current parsing point if the full sequence
5297 * is available, -1 otherwise.
5298 */
5299 static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt,xmlChar first,xmlChar next,xmlChar third,int ignoreattrval)5300 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5301 xmlChar next, xmlChar third, int ignoreattrval)
5302 {
5303 int base, len;
5304 htmlParserInputPtr in;
5305 const xmlChar *buf;
5306 int invalue = 0;
5307 char valdellim = 0x0;
5308
5309 in = ctxt->input;
5310 if (in == NULL)
5311 return (-1);
5312
5313 base = in->cur - in->base;
5314 if (base < 0)
5315 return (-1);
5316
5317 if (ctxt->checkIndex > base) {
5318 base = ctxt->checkIndex;
5319 /* Abuse hasPErefs member to restore current state. */
5320 invalue = ctxt->hasPErefs & 1 ? 1 : 0;
5321 }
5322
5323 if (in->buf == NULL) {
5324 buf = in->base;
5325 len = in->length;
5326 } else {
5327 buf = xmlBufContent(in->buf->buffer);
5328 len = xmlBufUse(in->buf->buffer);
5329 }
5330
5331 /* take into account the sequence length */
5332 if (third)
5333 len -= 2;
5334 else if (next)
5335 len--;
5336 for (; base < len; base++) {
5337 if (ignoreattrval) {
5338 if (buf[base] == '"' || buf[base] == '\'') {
5339 if (invalue) {
5340 if (buf[base] == valdellim) {
5341 invalue = 0;
5342 continue;
5343 }
5344 } else {
5345 valdellim = buf[base];
5346 invalue = 1;
5347 continue;
5348 }
5349 } else if (invalue) {
5350 continue;
5351 }
5352 }
5353 if (buf[base] == first) {
5354 if (third != 0) {
5355 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5356 continue;
5357 } else if (next != 0) {
5358 if (buf[base + 1] != next)
5359 continue;
5360 }
5361 ctxt->checkIndex = 0;
5362 #ifdef DEBUG_PUSH
5363 if (next == 0)
5364 xmlGenericError(xmlGenericErrorContext,
5365 "HPP: lookup '%c' found at %d\n",
5366 first, base);
5367 else if (third == 0)
5368 xmlGenericError(xmlGenericErrorContext,
5369 "HPP: lookup '%c%c' found at %d\n",
5370 first, next, base);
5371 else
5372 xmlGenericError(xmlGenericErrorContext,
5373 "HPP: lookup '%c%c%c' found at %d\n",
5374 first, next, third, base);
5375 #endif
5376 return (base - (in->cur - in->base));
5377 }
5378 }
5379 ctxt->checkIndex = base;
5380 /* Abuse hasPErefs member to track current state. */
5381 if (invalue)
5382 ctxt->hasPErefs |= 1;
5383 else
5384 ctxt->hasPErefs &= ~1;
5385 #ifdef DEBUG_PUSH
5386 if (next == 0)
5387 xmlGenericError(xmlGenericErrorContext,
5388 "HPP: lookup '%c' failed\n", first);
5389 else if (third == 0)
5390 xmlGenericError(xmlGenericErrorContext,
5391 "HPP: lookup '%c%c' failed\n", first, next);
5392 else
5393 xmlGenericError(xmlGenericErrorContext,
5394 "HPP: lookup '%c%c%c' failed\n", first, next,
5395 third);
5396 #endif
5397 return (-1);
5398 }
5399
5400 /**
5401 * htmlParseLookupCommentEnd:
5402 * @ctxt: an HTML parser context
5403 *
5404 * Try to find a comment end tag in the input stream
5405 * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5406 * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5407 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5408 * to avoid rescanning sequences of bytes, it DOES change the state of the
5409 * parser, do not use liberally.
5410 * This wraps to htmlParseLookupSequence()
5411 *
5412 * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5413 */
5414 static int
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)5415 htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5416 {
5417 int mark = 0;
5418 int cur = CUR_PTR - BASE_PTR;
5419
5420 while (mark >= 0) {
5421 mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5422 if ((mark < 0) ||
5423 (NXT(mark+2) == '>') ||
5424 ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5425 return mark;
5426 }
5427 ctxt->checkIndex = cur + mark + 1;
5428 }
5429 return mark;
5430 }
5431
5432
5433 /**
5434 * htmlParseTryOrFinish:
5435 * @ctxt: an HTML parser context
5436 * @terminate: last chunk indicator
5437 *
5438 * Try to progress on parsing
5439 *
5440 * Returns zero if no parsing was possible
5441 */
5442 static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt,int terminate)5443 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5444 int ret = 0;
5445 htmlParserInputPtr in;
5446 ptrdiff_t avail = 0;
5447 xmlChar cur, next;
5448
5449 htmlParserNodeInfo node_info;
5450
5451 #ifdef DEBUG_PUSH
5452 switch (ctxt->instate) {
5453 case XML_PARSER_EOF:
5454 xmlGenericError(xmlGenericErrorContext,
5455 "HPP: try EOF\n"); break;
5456 case XML_PARSER_START:
5457 xmlGenericError(xmlGenericErrorContext,
5458 "HPP: try START\n"); break;
5459 case XML_PARSER_MISC:
5460 xmlGenericError(xmlGenericErrorContext,
5461 "HPP: try MISC\n");break;
5462 case XML_PARSER_COMMENT:
5463 xmlGenericError(xmlGenericErrorContext,
5464 "HPP: try COMMENT\n");break;
5465 case XML_PARSER_PROLOG:
5466 xmlGenericError(xmlGenericErrorContext,
5467 "HPP: try PROLOG\n");break;
5468 case XML_PARSER_START_TAG:
5469 xmlGenericError(xmlGenericErrorContext,
5470 "HPP: try START_TAG\n");break;
5471 case XML_PARSER_CONTENT:
5472 xmlGenericError(xmlGenericErrorContext,
5473 "HPP: try CONTENT\n");break;
5474 case XML_PARSER_CDATA_SECTION:
5475 xmlGenericError(xmlGenericErrorContext,
5476 "HPP: try CDATA_SECTION\n");break;
5477 case XML_PARSER_END_TAG:
5478 xmlGenericError(xmlGenericErrorContext,
5479 "HPP: try END_TAG\n");break;
5480 case XML_PARSER_ENTITY_DECL:
5481 xmlGenericError(xmlGenericErrorContext,
5482 "HPP: try ENTITY_DECL\n");break;
5483 case XML_PARSER_ENTITY_VALUE:
5484 xmlGenericError(xmlGenericErrorContext,
5485 "HPP: try ENTITY_VALUE\n");break;
5486 case XML_PARSER_ATTRIBUTE_VALUE:
5487 xmlGenericError(xmlGenericErrorContext,
5488 "HPP: try ATTRIBUTE_VALUE\n");break;
5489 case XML_PARSER_DTD:
5490 xmlGenericError(xmlGenericErrorContext,
5491 "HPP: try DTD\n");break;
5492 case XML_PARSER_EPILOG:
5493 xmlGenericError(xmlGenericErrorContext,
5494 "HPP: try EPILOG\n");break;
5495 case XML_PARSER_PI:
5496 xmlGenericError(xmlGenericErrorContext,
5497 "HPP: try PI\n");break;
5498 case XML_PARSER_SYSTEM_LITERAL:
5499 xmlGenericError(xmlGenericErrorContext,
5500 "HPP: try SYSTEM_LITERAL\n");break;
5501 }
5502 #endif
5503
5504 while (1) {
5505
5506 in = ctxt->input;
5507 if (in == NULL) break;
5508 if (in->buf == NULL)
5509 avail = in->length - (in->cur - in->base);
5510 else
5511 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5512 (in->cur - in->base);
5513 if ((avail == 0) && (terminate)) {
5514 htmlAutoCloseOnEnd(ctxt);
5515 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5516 /*
5517 * SAX: end of the document processing.
5518 */
5519 ctxt->instate = XML_PARSER_EOF;
5520 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5521 ctxt->sax->endDocument(ctxt->userData);
5522 }
5523 }
5524 if (avail < 1)
5525 goto done;
5526 /*
5527 * This is done to make progress and avoid an infinite loop
5528 * if a parsing attempt was aborted by hitting a NUL byte. After
5529 * changing htmlCurrentChar, this probably isn't necessary anymore.
5530 * We should consider removing this check.
5531 */
5532 cur = in->cur[0];
5533 if (cur == 0) {
5534 SKIP(1);
5535 continue;
5536 }
5537
5538 switch (ctxt->instate) {
5539 case XML_PARSER_EOF:
5540 /*
5541 * Document parsing is done !
5542 */
5543 goto done;
5544 case XML_PARSER_START:
5545 /*
5546 * Very first chars read from the document flow.
5547 */
5548 cur = in->cur[0];
5549 if (IS_BLANK_CH(cur)) {
5550 SKIP_BLANKS;
5551 if (in->buf == NULL)
5552 avail = in->length - (in->cur - in->base);
5553 else
5554 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5555 (in->cur - in->base);
5556 }
5557 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5558 ctxt->sax->setDocumentLocator(ctxt->userData,
5559 &xmlDefaultSAXLocator);
5560 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5561 (!ctxt->disableSAX))
5562 ctxt->sax->startDocument(ctxt->userData);
5563
5564 cur = in->cur[0];
5565 next = in->cur[1];
5566 if ((cur == '<') && (next == '!') &&
5567 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5568 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5569 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5570 (UPP(8) == 'E')) {
5571 if ((!terminate) &&
5572 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5573 goto done;
5574 #ifdef DEBUG_PUSH
5575 xmlGenericError(xmlGenericErrorContext,
5576 "HPP: Parsing internal subset\n");
5577 #endif
5578 htmlParseDocTypeDecl(ctxt);
5579 ctxt->instate = XML_PARSER_PROLOG;
5580 #ifdef DEBUG_PUSH
5581 xmlGenericError(xmlGenericErrorContext,
5582 "HPP: entering PROLOG\n");
5583 #endif
5584 } else {
5585 ctxt->instate = XML_PARSER_MISC;
5586 #ifdef DEBUG_PUSH
5587 xmlGenericError(xmlGenericErrorContext,
5588 "HPP: entering MISC\n");
5589 #endif
5590 }
5591 break;
5592 case XML_PARSER_MISC:
5593 SKIP_BLANKS;
5594 if (in->buf == NULL)
5595 avail = in->length - (in->cur - in->base);
5596 else
5597 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5598 (in->cur - in->base);
5599 /*
5600 * no chars in buffer
5601 */
5602 if (avail < 1)
5603 goto done;
5604 /*
5605 * not enough chars in buffer
5606 */
5607 if (avail < 2) {
5608 if (!terminate)
5609 goto done;
5610 else
5611 next = ' ';
5612 } else {
5613 next = in->cur[1];
5614 }
5615 cur = in->cur[0];
5616 if ((cur == '<') && (next == '!') &&
5617 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5618 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5619 goto done;
5620 #ifdef DEBUG_PUSH
5621 xmlGenericError(xmlGenericErrorContext,
5622 "HPP: Parsing Comment\n");
5623 #endif
5624 htmlParseComment(ctxt);
5625 ctxt->instate = XML_PARSER_MISC;
5626 } else if ((cur == '<') && (next == '?')) {
5627 if ((!terminate) &&
5628 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5629 goto done;
5630 #ifdef DEBUG_PUSH
5631 xmlGenericError(xmlGenericErrorContext,
5632 "HPP: Parsing PI\n");
5633 #endif
5634 htmlParsePI(ctxt);
5635 ctxt->instate = XML_PARSER_MISC;
5636 } else if ((cur == '<') && (next == '!') &&
5637 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5638 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5639 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5640 (UPP(8) == 'E')) {
5641 if ((!terminate) &&
5642 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5643 goto done;
5644 #ifdef DEBUG_PUSH
5645 xmlGenericError(xmlGenericErrorContext,
5646 "HPP: Parsing internal subset\n");
5647 #endif
5648 htmlParseDocTypeDecl(ctxt);
5649 ctxt->instate = XML_PARSER_PROLOG;
5650 #ifdef DEBUG_PUSH
5651 xmlGenericError(xmlGenericErrorContext,
5652 "HPP: entering PROLOG\n");
5653 #endif
5654 } else if ((cur == '<') && (next == '!') &&
5655 (avail < 9)) {
5656 goto done;
5657 } else {
5658 ctxt->instate = XML_PARSER_CONTENT;
5659 #ifdef DEBUG_PUSH
5660 xmlGenericError(xmlGenericErrorContext,
5661 "HPP: entering START_TAG\n");
5662 #endif
5663 }
5664 break;
5665 case XML_PARSER_PROLOG:
5666 SKIP_BLANKS;
5667 if (in->buf == NULL)
5668 avail = in->length - (in->cur - in->base);
5669 else
5670 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5671 (in->cur - in->base);
5672 if (avail < 2)
5673 goto done;
5674 cur = in->cur[0];
5675 next = in->cur[1];
5676 if ((cur == '<') && (next == '!') &&
5677 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5678 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5679 goto done;
5680 #ifdef DEBUG_PUSH
5681 xmlGenericError(xmlGenericErrorContext,
5682 "HPP: Parsing Comment\n");
5683 #endif
5684 htmlParseComment(ctxt);
5685 ctxt->instate = XML_PARSER_PROLOG;
5686 } else if ((cur == '<') && (next == '?')) {
5687 if ((!terminate) &&
5688 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5689 goto done;
5690 #ifdef DEBUG_PUSH
5691 xmlGenericError(xmlGenericErrorContext,
5692 "HPP: Parsing PI\n");
5693 #endif
5694 htmlParsePI(ctxt);
5695 ctxt->instate = XML_PARSER_PROLOG;
5696 } else if ((cur == '<') && (next == '!') &&
5697 (avail < 4)) {
5698 goto done;
5699 } else {
5700 ctxt->instate = XML_PARSER_CONTENT;
5701 #ifdef DEBUG_PUSH
5702 xmlGenericError(xmlGenericErrorContext,
5703 "HPP: entering START_TAG\n");
5704 #endif
5705 }
5706 break;
5707 case XML_PARSER_EPILOG:
5708 if (in->buf == NULL)
5709 avail = in->length - (in->cur - in->base);
5710 else
5711 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5712 (in->cur - in->base);
5713 if (avail < 1)
5714 goto done;
5715 cur = in->cur[0];
5716 if (IS_BLANK_CH(cur)) {
5717 htmlParseCharData(ctxt);
5718 goto done;
5719 }
5720 if (avail < 2)
5721 goto done;
5722 next = in->cur[1];
5723 if ((cur == '<') && (next == '!') &&
5724 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5725 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5726 goto done;
5727 #ifdef DEBUG_PUSH
5728 xmlGenericError(xmlGenericErrorContext,
5729 "HPP: Parsing Comment\n");
5730 #endif
5731 htmlParseComment(ctxt);
5732 ctxt->instate = XML_PARSER_EPILOG;
5733 } else if ((cur == '<') && (next == '?')) {
5734 if ((!terminate) &&
5735 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5736 goto done;
5737 #ifdef DEBUG_PUSH
5738 xmlGenericError(xmlGenericErrorContext,
5739 "HPP: Parsing PI\n");
5740 #endif
5741 htmlParsePI(ctxt);
5742 ctxt->instate = XML_PARSER_EPILOG;
5743 } else if ((cur == '<') && (next == '!') &&
5744 (avail < 4)) {
5745 goto done;
5746 } else {
5747 ctxt->errNo = XML_ERR_DOCUMENT_END;
5748 ctxt->wellFormed = 0;
5749 ctxt->instate = XML_PARSER_EOF;
5750 #ifdef DEBUG_PUSH
5751 xmlGenericError(xmlGenericErrorContext,
5752 "HPP: entering EOF\n");
5753 #endif
5754 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5755 ctxt->sax->endDocument(ctxt->userData);
5756 goto done;
5757 }
5758 break;
5759 case XML_PARSER_START_TAG: {
5760 const xmlChar *name;
5761 int failed;
5762 const htmlElemDesc * info;
5763
5764 /*
5765 * no chars in buffer
5766 */
5767 if (avail < 1)
5768 goto done;
5769 /*
5770 * not enough chars in buffer
5771 */
5772 if (avail < 2) {
5773 if (!terminate)
5774 goto done;
5775 else
5776 next = ' ';
5777 } else {
5778 next = in->cur[1];
5779 }
5780 cur = in->cur[0];
5781 if (cur != '<') {
5782 ctxt->instate = XML_PARSER_CONTENT;
5783 #ifdef DEBUG_PUSH
5784 xmlGenericError(xmlGenericErrorContext,
5785 "HPP: entering CONTENT\n");
5786 #endif
5787 break;
5788 }
5789 if (next == '/') {
5790 ctxt->instate = XML_PARSER_END_TAG;
5791 ctxt->checkIndex = 0;
5792 #ifdef DEBUG_PUSH
5793 xmlGenericError(xmlGenericErrorContext,
5794 "HPP: entering END_TAG\n");
5795 #endif
5796 break;
5797 }
5798 if ((!terminate) &&
5799 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5800 goto done;
5801
5802 /* Capture start position */
5803 if (ctxt->record_info) {
5804 node_info.begin_pos = ctxt->input->consumed +
5805 (CUR_PTR - ctxt->input->base);
5806 node_info.begin_line = ctxt->input->line;
5807 }
5808
5809
5810 failed = htmlParseStartTag(ctxt);
5811 name = ctxt->name;
5812 if ((failed == -1) ||
5813 (name == NULL)) {
5814 if (CUR == '>')
5815 NEXT;
5816 break;
5817 }
5818
5819 /*
5820 * Lookup the info for that element.
5821 */
5822 info = htmlTagLookup(name);
5823 if (info == NULL) {
5824 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5825 "Tag %s invalid\n", name, NULL);
5826 }
5827
5828 /*
5829 * Check for an Empty Element labeled the XML/SGML way
5830 */
5831 if ((CUR == '/') && (NXT(1) == '>')) {
5832 SKIP(2);
5833 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5834 ctxt->sax->endElement(ctxt->userData, name);
5835 htmlnamePop(ctxt);
5836 ctxt->instate = XML_PARSER_CONTENT;
5837 #ifdef DEBUG_PUSH
5838 xmlGenericError(xmlGenericErrorContext,
5839 "HPP: entering CONTENT\n");
5840 #endif
5841 break;
5842 }
5843
5844 if (CUR == '>') {
5845 NEXT;
5846 } else {
5847 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5848 "Couldn't find end of Start Tag %s\n",
5849 name, NULL);
5850
5851 /*
5852 * end of parsing of this node.
5853 */
5854 if (xmlStrEqual(name, ctxt->name)) {
5855 nodePop(ctxt);
5856 htmlnamePop(ctxt);
5857 }
5858
5859 if (ctxt->record_info)
5860 htmlNodeInfoPush(ctxt, &node_info);
5861
5862 ctxt->instate = XML_PARSER_CONTENT;
5863 #ifdef DEBUG_PUSH
5864 xmlGenericError(xmlGenericErrorContext,
5865 "HPP: entering CONTENT\n");
5866 #endif
5867 break;
5868 }
5869
5870 /*
5871 * Check for an Empty Element from DTD definition
5872 */
5873 if ((info != NULL) && (info->empty)) {
5874 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5875 ctxt->sax->endElement(ctxt->userData, name);
5876 htmlnamePop(ctxt);
5877 }
5878
5879 if (ctxt->record_info)
5880 htmlNodeInfoPush(ctxt, &node_info);
5881
5882 ctxt->instate = XML_PARSER_CONTENT;
5883 #ifdef DEBUG_PUSH
5884 xmlGenericError(xmlGenericErrorContext,
5885 "HPP: entering CONTENT\n");
5886 #endif
5887 break;
5888 }
5889 case XML_PARSER_CONTENT: {
5890 xmlChar chr[2] = { 0, 0 };
5891
5892 /*
5893 * Handle preparsed entities and charRef
5894 */
5895 if (ctxt->token != 0) {
5896 chr[0] = (xmlChar) ctxt->token;
5897 htmlCheckParagraph(ctxt);
5898 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5899 ctxt->sax->characters(ctxt->userData, chr, 1);
5900 ctxt->token = 0;
5901 ctxt->checkIndex = 0;
5902 }
5903 if ((avail == 1) && (terminate)) {
5904 cur = in->cur[0];
5905 if ((cur != '<') && (cur != '&')) {
5906 if (ctxt->sax != NULL) {
5907 chr[0] = cur;
5908 if (IS_BLANK_CH(cur)) {
5909 if (ctxt->keepBlanks) {
5910 if (ctxt->sax->characters != NULL)
5911 ctxt->sax->characters(
5912 ctxt->userData, chr, 1);
5913 } else {
5914 if (ctxt->sax->ignorableWhitespace != NULL)
5915 ctxt->sax->ignorableWhitespace(
5916 ctxt->userData, chr, 1);
5917 }
5918 } else {
5919 htmlCheckParagraph(ctxt);
5920 if (ctxt->sax->characters != NULL)
5921 ctxt->sax->characters(
5922 ctxt->userData, chr, 1);
5923 }
5924 }
5925 ctxt->token = 0;
5926 ctxt->checkIndex = 0;
5927 in->cur++;
5928 break;
5929 }
5930 }
5931 if (avail < 2)
5932 goto done;
5933 cur = in->cur[0];
5934 next = in->cur[1];
5935 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5936 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5937 /*
5938 * Handle SCRIPT/STYLE separately
5939 */
5940 if (!terminate) {
5941 int idx;
5942 xmlChar val;
5943
5944 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5945 if (idx < 0)
5946 goto done;
5947 val = in->cur[idx + 2];
5948 if (val == 0) /* bad cut of input */
5949 goto done;
5950 }
5951 htmlParseScript(ctxt);
5952 if ((cur == '<') && (next == '/')) {
5953 ctxt->instate = XML_PARSER_END_TAG;
5954 ctxt->checkIndex = 0;
5955 #ifdef DEBUG_PUSH
5956 xmlGenericError(xmlGenericErrorContext,
5957 "HPP: entering END_TAG\n");
5958 #endif
5959 break;
5960 }
5961 } else {
5962 /*
5963 * Sometimes DOCTYPE arrives in the middle of the document
5964 */
5965 if ((cur == '<') && (next == '!') &&
5966 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5967 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5968 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5969 (UPP(8) == 'E')) {
5970 if ((!terminate) &&
5971 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5972 goto done;
5973 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5974 "Misplaced DOCTYPE declaration\n",
5975 BAD_CAST "DOCTYPE" , NULL);
5976 htmlParseDocTypeDecl(ctxt);
5977 } else if ((cur == '<') && (next == '!') &&
5978 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5979 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5980 goto done;
5981 #ifdef DEBUG_PUSH
5982 xmlGenericError(xmlGenericErrorContext,
5983 "HPP: Parsing Comment\n");
5984 #endif
5985 htmlParseComment(ctxt);
5986 ctxt->instate = XML_PARSER_CONTENT;
5987 } else if ((cur == '<') && (next == '?')) {
5988 if ((!terminate) &&
5989 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5990 goto done;
5991 #ifdef DEBUG_PUSH
5992 xmlGenericError(xmlGenericErrorContext,
5993 "HPP: Parsing PI\n");
5994 #endif
5995 htmlParsePI(ctxt);
5996 ctxt->instate = XML_PARSER_CONTENT;
5997 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5998 goto done;
5999 } else if ((cur == '<') && (next == '/')) {
6000 ctxt->instate = XML_PARSER_END_TAG;
6001 ctxt->checkIndex = 0;
6002 #ifdef DEBUG_PUSH
6003 xmlGenericError(xmlGenericErrorContext,
6004 "HPP: entering END_TAG\n");
6005 #endif
6006 break;
6007 } else if (cur == '<') {
6008 if ((!terminate) && (next == 0))
6009 goto done;
6010 ctxt->instate = XML_PARSER_START_TAG;
6011 ctxt->checkIndex = 0;
6012 #ifdef DEBUG_PUSH
6013 xmlGenericError(xmlGenericErrorContext,
6014 "HPP: entering START_TAG\n");
6015 #endif
6016 break;
6017 } else {
6018 /*
6019 * check that the text sequence is complete
6020 * before handing out the data to the parser
6021 * to avoid problems with erroneous end of
6022 * data detection.
6023 */
6024 if ((!terminate) &&
6025 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
6026 goto done;
6027 ctxt->checkIndex = 0;
6028 #ifdef DEBUG_PUSH
6029 xmlGenericError(xmlGenericErrorContext,
6030 "HPP: Parsing char data\n");
6031 #endif
6032 while ((ctxt->instate != XML_PARSER_EOF) &&
6033 (cur != '<') && (in->cur < in->end)) {
6034 if (cur == '&') {
6035 htmlParseReference(ctxt);
6036 } else {
6037 htmlParseCharData(ctxt);
6038 }
6039 cur = in->cur[0];
6040 }
6041 }
6042 }
6043
6044 break;
6045 }
6046 case XML_PARSER_END_TAG:
6047 if (avail < 2)
6048 goto done;
6049 if ((!terminate) &&
6050 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6051 goto done;
6052 htmlParseEndTag(ctxt);
6053 if (ctxt->nameNr == 0) {
6054 ctxt->instate = XML_PARSER_EPILOG;
6055 } else {
6056 ctxt->instate = XML_PARSER_CONTENT;
6057 }
6058 ctxt->checkIndex = 0;
6059 #ifdef DEBUG_PUSH
6060 xmlGenericError(xmlGenericErrorContext,
6061 "HPP: entering CONTENT\n");
6062 #endif
6063 break;
6064 case XML_PARSER_CDATA_SECTION:
6065 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6066 "HPP: internal error, state == CDATA\n",
6067 NULL, NULL);
6068 ctxt->instate = XML_PARSER_CONTENT;
6069 ctxt->checkIndex = 0;
6070 #ifdef DEBUG_PUSH
6071 xmlGenericError(xmlGenericErrorContext,
6072 "HPP: entering CONTENT\n");
6073 #endif
6074 break;
6075 case XML_PARSER_DTD:
6076 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6077 "HPP: internal error, state == DTD\n",
6078 NULL, NULL);
6079 ctxt->instate = XML_PARSER_CONTENT;
6080 ctxt->checkIndex = 0;
6081 #ifdef DEBUG_PUSH
6082 xmlGenericError(xmlGenericErrorContext,
6083 "HPP: entering CONTENT\n");
6084 #endif
6085 break;
6086 case XML_PARSER_COMMENT:
6087 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6088 "HPP: internal error, state == COMMENT\n",
6089 NULL, NULL);
6090 ctxt->instate = XML_PARSER_CONTENT;
6091 ctxt->checkIndex = 0;
6092 #ifdef DEBUG_PUSH
6093 xmlGenericError(xmlGenericErrorContext,
6094 "HPP: entering CONTENT\n");
6095 #endif
6096 break;
6097 case XML_PARSER_PI:
6098 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6099 "HPP: internal error, state == PI\n",
6100 NULL, NULL);
6101 ctxt->instate = XML_PARSER_CONTENT;
6102 ctxt->checkIndex = 0;
6103 #ifdef DEBUG_PUSH
6104 xmlGenericError(xmlGenericErrorContext,
6105 "HPP: entering CONTENT\n");
6106 #endif
6107 break;
6108 case XML_PARSER_ENTITY_DECL:
6109 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6110 "HPP: internal error, state == ENTITY_DECL\n",
6111 NULL, NULL);
6112 ctxt->instate = XML_PARSER_CONTENT;
6113 ctxt->checkIndex = 0;
6114 #ifdef DEBUG_PUSH
6115 xmlGenericError(xmlGenericErrorContext,
6116 "HPP: entering CONTENT\n");
6117 #endif
6118 break;
6119 case XML_PARSER_ENTITY_VALUE:
6120 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6121 "HPP: internal error, state == ENTITY_VALUE\n",
6122 NULL, NULL);
6123 ctxt->instate = XML_PARSER_CONTENT;
6124 ctxt->checkIndex = 0;
6125 #ifdef DEBUG_PUSH
6126 xmlGenericError(xmlGenericErrorContext,
6127 "HPP: entering DTD\n");
6128 #endif
6129 break;
6130 case XML_PARSER_ATTRIBUTE_VALUE:
6131 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6132 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
6133 NULL, NULL);
6134 ctxt->instate = XML_PARSER_START_TAG;
6135 ctxt->checkIndex = 0;
6136 #ifdef DEBUG_PUSH
6137 xmlGenericError(xmlGenericErrorContext,
6138 "HPP: entering START_TAG\n");
6139 #endif
6140 break;
6141 case XML_PARSER_SYSTEM_LITERAL:
6142 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6143 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6144 NULL, NULL);
6145 ctxt->instate = XML_PARSER_CONTENT;
6146 ctxt->checkIndex = 0;
6147 #ifdef DEBUG_PUSH
6148 xmlGenericError(xmlGenericErrorContext,
6149 "HPP: entering CONTENT\n");
6150 #endif
6151 break;
6152 case XML_PARSER_IGNORE:
6153 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6154 "HPP: internal error, state == XML_PARSER_IGNORE\n",
6155 NULL, NULL);
6156 ctxt->instate = XML_PARSER_CONTENT;
6157 ctxt->checkIndex = 0;
6158 #ifdef DEBUG_PUSH
6159 xmlGenericError(xmlGenericErrorContext,
6160 "HPP: entering CONTENT\n");
6161 #endif
6162 break;
6163 case XML_PARSER_PUBLIC_LITERAL:
6164 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6165 "HPP: internal error, state == XML_PARSER_LITERAL\n",
6166 NULL, NULL);
6167 ctxt->instate = XML_PARSER_CONTENT;
6168 ctxt->checkIndex = 0;
6169 #ifdef DEBUG_PUSH
6170 xmlGenericError(xmlGenericErrorContext,
6171 "HPP: entering CONTENT\n");
6172 #endif
6173 break;
6174
6175 }
6176 }
6177 done:
6178 if ((avail == 0) && (terminate)) {
6179 htmlAutoCloseOnEnd(ctxt);
6180 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6181 /*
6182 * SAX: end of the document processing.
6183 */
6184 ctxt->instate = XML_PARSER_EOF;
6185 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6186 ctxt->sax->endDocument(ctxt->userData);
6187 }
6188 }
6189 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6190 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6191 (ctxt->instate == XML_PARSER_EPILOG))) {
6192 xmlDtdPtr dtd;
6193 dtd = xmlGetIntSubset(ctxt->myDoc);
6194 if (dtd == NULL)
6195 ctxt->myDoc->intSubset =
6196 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6197 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6198 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6199 }
6200 #ifdef DEBUG_PUSH
6201 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6202 #endif
6203 return(ret);
6204 }
6205
6206 /**
6207 * htmlParseChunk:
6208 * @ctxt: an HTML parser context
6209 * @chunk: an char array
6210 * @size: the size in byte of the chunk
6211 * @terminate: last chunk indicator
6212 *
6213 * Parse a Chunk of memory
6214 *
6215 * Returns zero if no error, the xmlParserErrors otherwise.
6216 */
6217 int
htmlParseChunk(htmlParserCtxtPtr ctxt,const char * chunk,int size,int terminate)6218 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6219 int terminate) {
6220 if ((ctxt == NULL) || (ctxt->input == NULL)) {
6221 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6222 "htmlParseChunk: context error\n", NULL, NULL);
6223 return(XML_ERR_INTERNAL_ERROR);
6224 }
6225 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6226 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
6227 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6228 size_t cur = ctxt->input->cur - ctxt->input->base;
6229 int res;
6230
6231 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6232 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6233 if (res < 0) {
6234 ctxt->errNo = XML_PARSER_EOF;
6235 ctxt->disableSAX = 1;
6236 return (XML_PARSER_EOF);
6237 }
6238 #ifdef DEBUG_PUSH
6239 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6240 #endif
6241
6242 #if 0
6243 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6244 htmlParseTryOrFinish(ctxt, terminate);
6245 #endif
6246 } else if (ctxt->instate != XML_PARSER_EOF) {
6247 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6248 xmlParserInputBufferPtr in = ctxt->input->buf;
6249 if ((in->encoder != NULL) && (in->buffer != NULL) &&
6250 (in->raw != NULL)) {
6251 int nbchars;
6252 size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6253 size_t current = ctxt->input->cur - ctxt->input->base;
6254
6255 nbchars = xmlCharEncInput(in, terminate);
6256 xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6257 if (nbchars < 0) {
6258 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6259 "encoder error\n", NULL, NULL);
6260 return(XML_ERR_INVALID_ENCODING);
6261 }
6262 }
6263 }
6264 }
6265 htmlParseTryOrFinish(ctxt, terminate);
6266 if (terminate) {
6267 if ((ctxt->instate != XML_PARSER_EOF) &&
6268 (ctxt->instate != XML_PARSER_EPILOG) &&
6269 (ctxt->instate != XML_PARSER_MISC)) {
6270 ctxt->errNo = XML_ERR_DOCUMENT_END;
6271 ctxt->wellFormed = 0;
6272 }
6273 if (ctxt->instate != XML_PARSER_EOF) {
6274 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6275 ctxt->sax->endDocument(ctxt->userData);
6276 }
6277 ctxt->instate = XML_PARSER_EOF;
6278 }
6279 return((xmlParserErrors) ctxt->errNo);
6280 }
6281
6282 /************************************************************************
6283 * *
6284 * User entry points *
6285 * *
6286 ************************************************************************/
6287
6288 /**
6289 * htmlCreatePushParserCtxt:
6290 * @sax: a SAX handler
6291 * @user_data: The user data returned on SAX callbacks
6292 * @chunk: a pointer to an array of chars
6293 * @size: number of chars in the array
6294 * @filename: an optional file name or URI
6295 * @enc: an optional encoding
6296 *
6297 * Create a parser context for using the HTML parser in push mode
6298 * The value of @filename is used for fetching external entities
6299 * and error/warning reports.
6300 *
6301 * Returns the new parser context or NULL
6302 */
6303 htmlParserCtxtPtr
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,void * user_data,const char * chunk,int size,const char * filename,xmlCharEncoding enc)6304 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6305 const char *chunk, int size, const char *filename,
6306 xmlCharEncoding enc) {
6307 htmlParserCtxtPtr ctxt;
6308 htmlParserInputPtr inputStream;
6309 xmlParserInputBufferPtr buf;
6310
6311 xmlInitParser();
6312
6313 buf = xmlAllocParserInputBuffer(enc);
6314 if (buf == NULL) return(NULL);
6315
6316 ctxt = htmlNewParserCtxt();
6317 if (ctxt == NULL) {
6318 xmlFreeParserInputBuffer(buf);
6319 return(NULL);
6320 }
6321 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6322 ctxt->charset=XML_CHAR_ENCODING_UTF8;
6323 if (sax != NULL) {
6324 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6325 xmlFree(ctxt->sax);
6326 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6327 if (ctxt->sax == NULL) {
6328 xmlFree(buf);
6329 xmlFree(ctxt);
6330 return(NULL);
6331 }
6332 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6333 if (user_data != NULL)
6334 ctxt->userData = user_data;
6335 }
6336 if (filename == NULL) {
6337 ctxt->directory = NULL;
6338 } else {
6339 ctxt->directory = xmlParserGetDirectory(filename);
6340 }
6341
6342 inputStream = htmlNewInputStream(ctxt);
6343 if (inputStream == NULL) {
6344 xmlFreeParserCtxt(ctxt);
6345 xmlFree(buf);
6346 return(NULL);
6347 }
6348
6349 if (filename == NULL)
6350 inputStream->filename = NULL;
6351 else
6352 inputStream->filename = (char *)
6353 xmlCanonicPath((const xmlChar *) filename);
6354 inputStream->buf = buf;
6355 xmlBufResetInput(buf->buffer, inputStream);
6356
6357 inputPush(ctxt, inputStream);
6358
6359 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6360 (ctxt->input->buf != NULL)) {
6361 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6362 size_t cur = ctxt->input->cur - ctxt->input->base;
6363
6364 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6365
6366 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6367 #ifdef DEBUG_PUSH
6368 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6369 #endif
6370 }
6371 ctxt->progressive = 1;
6372
6373 return(ctxt);
6374 }
6375 #endif /* LIBXML_PUSH_ENABLED */
6376
6377 /**
6378 * htmlSAXParseDoc:
6379 * @cur: a pointer to an array of xmlChar
6380 * @encoding: a free form C string describing the HTML document encoding, or NULL
6381 * @sax: the SAX handler block
6382 * @userData: if using SAX, this pointer will be provided on callbacks.
6383 *
6384 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6385 * to handle parse events. If sax is NULL, fallback to the default DOM
6386 * behavior and return a tree.
6387 *
6388 * Returns the resulting document tree unless SAX is NULL or the document is
6389 * not well formed.
6390 */
6391
6392 htmlDocPtr
htmlSAXParseDoc(const xmlChar * cur,const char * encoding,htmlSAXHandlerPtr sax,void * userData)6393 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6394 htmlSAXHandlerPtr sax, void *userData) {
6395 htmlDocPtr ret;
6396 htmlParserCtxtPtr ctxt;
6397
6398 xmlInitParser();
6399
6400 if (cur == NULL) return(NULL);
6401
6402
6403 ctxt = htmlCreateDocParserCtxt(cur, encoding);
6404 if (ctxt == NULL) return(NULL);
6405 if (sax != NULL) {
6406 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6407 ctxt->sax = sax;
6408 ctxt->userData = userData;
6409 }
6410
6411 htmlParseDocument(ctxt);
6412 ret = ctxt->myDoc;
6413 if (sax != NULL) {
6414 ctxt->sax = NULL;
6415 ctxt->userData = NULL;
6416 }
6417 htmlFreeParserCtxt(ctxt);
6418
6419 return(ret);
6420 }
6421
6422 /**
6423 * htmlParseDoc:
6424 * @cur: a pointer to an array of xmlChar
6425 * @encoding: a free form C string describing the HTML document encoding, or NULL
6426 *
6427 * parse an HTML in-memory document and build a tree.
6428 *
6429 * Returns the resulting document tree
6430 */
6431
6432 htmlDocPtr
htmlParseDoc(const xmlChar * cur,const char * encoding)6433 htmlParseDoc(const xmlChar *cur, const char *encoding) {
6434 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6435 }
6436
6437
6438 /**
6439 * htmlCreateFileParserCtxt:
6440 * @filename: the filename
6441 * @encoding: a free form C string describing the HTML document encoding, or NULL
6442 *
6443 * Create a parser context for a file content.
6444 * Automatic support for ZLIB/Compress compressed document is provided
6445 * by default if found at compile-time.
6446 *
6447 * Returns the new parser context or NULL
6448 */
6449 htmlParserCtxtPtr
htmlCreateFileParserCtxt(const char * filename,const char * encoding)6450 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6451 {
6452 htmlParserCtxtPtr ctxt;
6453 htmlParserInputPtr inputStream;
6454 char *canonicFilename;
6455 /* htmlCharEncoding enc; */
6456 xmlChar *content, *content_line = (xmlChar *) "charset=";
6457
6458 if (filename == NULL)
6459 return(NULL);
6460
6461 ctxt = htmlNewParserCtxt();
6462 if (ctxt == NULL) {
6463 return(NULL);
6464 }
6465 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6466 if (canonicFilename == NULL) {
6467 #ifdef LIBXML_SAX1_ENABLED
6468 if (xmlDefaultSAXHandler.error != NULL) {
6469 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6470 }
6471 #endif
6472 xmlFreeParserCtxt(ctxt);
6473 return(NULL);
6474 }
6475
6476 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6477 xmlFree(canonicFilename);
6478 if (inputStream == NULL) {
6479 xmlFreeParserCtxt(ctxt);
6480 return(NULL);
6481 }
6482
6483 inputPush(ctxt, inputStream);
6484
6485 /* set encoding */
6486 if (encoding) {
6487 size_t l = strlen(encoding);
6488
6489 if (l < 1000) {
6490 content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6491 if (content) {
6492 strcpy ((char *)content, (char *)content_line);
6493 strcat ((char *)content, (char *)encoding);
6494 htmlCheckEncoding (ctxt, content);
6495 xmlFree (content);
6496 }
6497 }
6498 }
6499
6500 return(ctxt);
6501 }
6502
6503 /**
6504 * htmlSAXParseFile:
6505 * @filename: the filename
6506 * @encoding: a free form C string describing the HTML document encoding, or NULL
6507 * @sax: the SAX handler block
6508 * @userData: if using SAX, this pointer will be provided on callbacks.
6509 *
6510 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6511 * compressed document is provided by default if found at compile-time.
6512 * It use the given SAX function block to handle the parsing callback.
6513 * If sax is NULL, fallback to the default DOM tree building routines.
6514 *
6515 * Returns the resulting document tree unless SAX is NULL or the document is
6516 * not well formed.
6517 */
6518
6519 htmlDocPtr
htmlSAXParseFile(const char * filename,const char * encoding,htmlSAXHandlerPtr sax,void * userData)6520 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6521 void *userData) {
6522 htmlDocPtr ret;
6523 htmlParserCtxtPtr ctxt;
6524 htmlSAXHandlerPtr oldsax = NULL;
6525
6526 xmlInitParser();
6527
6528 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6529 if (ctxt == NULL) return(NULL);
6530 if (sax != NULL) {
6531 oldsax = ctxt->sax;
6532 ctxt->sax = sax;
6533 ctxt->userData = userData;
6534 }
6535
6536 htmlParseDocument(ctxt);
6537
6538 ret = ctxt->myDoc;
6539 if (sax != NULL) {
6540 ctxt->sax = oldsax;
6541 ctxt->userData = NULL;
6542 }
6543 htmlFreeParserCtxt(ctxt);
6544
6545 return(ret);
6546 }
6547
6548 /**
6549 * htmlParseFile:
6550 * @filename: the filename
6551 * @encoding: a free form C string describing the HTML document encoding, or NULL
6552 *
6553 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6554 * compressed document is provided by default if found at compile-time.
6555 *
6556 * Returns the resulting document tree
6557 */
6558
6559 htmlDocPtr
htmlParseFile(const char * filename,const char * encoding)6560 htmlParseFile(const char *filename, const char *encoding) {
6561 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6562 }
6563
6564 /**
6565 * htmlHandleOmittedElem:
6566 * @val: int 0 or 1
6567 *
6568 * Set and return the previous value for handling HTML omitted tags.
6569 *
6570 * Returns the last value for 0 for no handling, 1 for auto insertion.
6571 */
6572
6573 int
htmlHandleOmittedElem(int val)6574 htmlHandleOmittedElem(int val) {
6575 int old = htmlOmittedDefaultValue;
6576
6577 htmlOmittedDefaultValue = val;
6578 return(old);
6579 }
6580
6581 /**
6582 * htmlElementAllowedHere:
6583 * @parent: HTML parent element
6584 * @elt: HTML element
6585 *
6586 * Checks whether an HTML element may be a direct child of a parent element.
6587 * Note - doesn't check for deprecated elements
6588 *
6589 * Returns 1 if allowed; 0 otherwise.
6590 */
6591 int
htmlElementAllowedHere(const htmlElemDesc * parent,const xmlChar * elt)6592 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6593 const char** p ;
6594
6595 if ( ! elt || ! parent || ! parent->subelts )
6596 return 0 ;
6597
6598 for ( p = parent->subelts; *p; ++p )
6599 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6600 return 1 ;
6601
6602 return 0 ;
6603 }
6604 /**
6605 * htmlElementStatusHere:
6606 * @parent: HTML parent element
6607 * @elt: HTML element
6608 *
6609 * Checks whether an HTML element may be a direct child of a parent element.
6610 * and if so whether it is valid or deprecated.
6611 *
6612 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6613 */
6614 htmlStatus
htmlElementStatusHere(const htmlElemDesc * parent,const htmlElemDesc * elt)6615 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6616 if ( ! parent || ! elt )
6617 return HTML_INVALID ;
6618 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6619 return HTML_INVALID ;
6620
6621 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6622 }
6623 /**
6624 * htmlAttrAllowed:
6625 * @elt: HTML element
6626 * @attr: HTML attribute
6627 * @legacy: whether to allow deprecated attributes
6628 *
6629 * Checks whether an attribute is valid for an element
6630 * Has full knowledge of Required and Deprecated attributes
6631 *
6632 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6633 */
6634 htmlStatus
htmlAttrAllowed(const htmlElemDesc * elt,const xmlChar * attr,int legacy)6635 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6636 const char** p ;
6637
6638 if ( !elt || ! attr )
6639 return HTML_INVALID ;
6640
6641 if ( elt->attrs_req )
6642 for ( p = elt->attrs_req; *p; ++p)
6643 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6644 return HTML_REQUIRED ;
6645
6646 if ( elt->attrs_opt )
6647 for ( p = elt->attrs_opt; *p; ++p)
6648 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6649 return HTML_VALID ;
6650
6651 if ( legacy && elt->attrs_depr )
6652 for ( p = elt->attrs_depr; *p; ++p)
6653 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6654 return HTML_DEPRECATED ;
6655
6656 return HTML_INVALID ;
6657 }
6658 /**
6659 * htmlNodeStatus:
6660 * @node: an htmlNodePtr in a tree
6661 * @legacy: whether to allow deprecated elements (YES is faster here
6662 * for Element nodes)
6663 *
6664 * Checks whether the tree node is valid. Experimental (the author
6665 * only uses the HTML enhancements in a SAX parser)
6666 *
6667 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6668 * legacy allowed) or htmlElementStatusHere (otherwise).
6669 * for Attribute nodes, a return from htmlAttrAllowed
6670 * for other nodes, HTML_NA (no checks performed)
6671 */
6672 htmlStatus
htmlNodeStatus(const htmlNodePtr node,int legacy)6673 htmlNodeStatus(const htmlNodePtr node, int legacy) {
6674 if ( ! node )
6675 return HTML_INVALID ;
6676
6677 switch ( node->type ) {
6678 case XML_ELEMENT_NODE:
6679 return legacy
6680 ? ( htmlElementAllowedHere (
6681 htmlTagLookup(node->parent->name) , node->name
6682 ) ? HTML_VALID : HTML_INVALID )
6683 : htmlElementStatusHere(
6684 htmlTagLookup(node->parent->name) ,
6685 htmlTagLookup(node->name) )
6686 ;
6687 case XML_ATTRIBUTE_NODE:
6688 return htmlAttrAllowed(
6689 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6690 default: return HTML_NA ;
6691 }
6692 }
6693 /************************************************************************
6694 * *
6695 * New set (2.6.0) of simpler and more flexible APIs *
6696 * *
6697 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. The enclosing scope must provide a variable named
 * "dict" (it may be NULL).
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * a single statement and cannot capture a following "else"; callers
 * terminate the invocation with a semicolon.
 */
#define DICT_FREE(str)						\
	do {							\
	    if ((str) && ((!dict) ||				\
	        (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	        xmlFree((char *)(str));				\
	} while (0)
6709
6710 /**
6711 * htmlCtxtReset:
6712 * @ctxt: an HTML parser context
6713 *
6714 * Reset a parser context
6715 */
6716 void
htmlCtxtReset(htmlParserCtxtPtr ctxt)6717 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6718 {
6719 xmlParserInputPtr input;
6720 xmlDictPtr dict;
6721
6722 if (ctxt == NULL)
6723 return;
6724
6725 xmlInitParser();
6726 dict = ctxt->dict;
6727
6728 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6729 xmlFreeInputStream(input);
6730 }
6731 ctxt->inputNr = 0;
6732 ctxt->input = NULL;
6733
6734 ctxt->spaceNr = 0;
6735 if (ctxt->spaceTab != NULL) {
6736 ctxt->spaceTab[0] = -1;
6737 ctxt->space = &ctxt->spaceTab[0];
6738 } else {
6739 ctxt->space = NULL;
6740 }
6741
6742
6743 ctxt->nodeNr = 0;
6744 ctxt->node = NULL;
6745
6746 ctxt->nameNr = 0;
6747 ctxt->name = NULL;
6748
6749 DICT_FREE(ctxt->version);
6750 ctxt->version = NULL;
6751 DICT_FREE(ctxt->encoding);
6752 ctxt->encoding = NULL;
6753 DICT_FREE(ctxt->directory);
6754 ctxt->directory = NULL;
6755 DICT_FREE(ctxt->extSubURI);
6756 ctxt->extSubURI = NULL;
6757 DICT_FREE(ctxt->extSubSystem);
6758 ctxt->extSubSystem = NULL;
6759 if (ctxt->myDoc != NULL)
6760 xmlFreeDoc(ctxt->myDoc);
6761 ctxt->myDoc = NULL;
6762
6763 ctxt->standalone = -1;
6764 ctxt->hasExternalSubset = 0;
6765 ctxt->hasPErefs = 0;
6766 ctxt->html = 1;
6767 ctxt->external = 0;
6768 ctxt->instate = XML_PARSER_START;
6769 ctxt->token = 0;
6770
6771 ctxt->wellFormed = 1;
6772 ctxt->nsWellFormed = 1;
6773 ctxt->disableSAX = 0;
6774 ctxt->valid = 1;
6775 ctxt->vctxt.userData = ctxt;
6776 ctxt->vctxt.error = xmlParserValidityError;
6777 ctxt->vctxt.warning = xmlParserValidityWarning;
6778 ctxt->record_info = 0;
6779 ctxt->checkIndex = 0;
6780 ctxt->inSubset = 0;
6781 ctxt->errNo = XML_ERR_OK;
6782 ctxt->depth = 0;
6783 ctxt->charset = XML_CHAR_ENCODING_NONE;
6784 ctxt->catalogs = NULL;
6785 xmlInitNodeInfoSeq(&ctxt->node_seq);
6786
6787 if (ctxt->attsDefault != NULL) {
6788 xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6789 ctxt->attsDefault = NULL;
6790 }
6791 if (ctxt->attsSpecial != NULL) {
6792 xmlHashFree(ctxt->attsSpecial, NULL);
6793 ctxt->attsSpecial = NULL;
6794 }
6795 }
6796
6797 /**
6798 * htmlCtxtUseOptions:
6799 * @ctxt: an HTML parser context
6800 * @options: a combination of htmlParserOption(s)
6801 *
6802 * Applies the options to the parser context
6803 *
6804 * Returns 0 in case of success, the set of unknown or unimplemented options
6805 * in case of error.
6806 */
6807 int
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt,int options)6808 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6809 {
6810 if (ctxt == NULL)
6811 return(-1);
6812
6813 if (options & HTML_PARSE_NOWARNING) {
6814 ctxt->sax->warning = NULL;
6815 ctxt->vctxt.warning = NULL;
6816 options -= XML_PARSE_NOWARNING;
6817 ctxt->options |= XML_PARSE_NOWARNING;
6818 }
6819 if (options & HTML_PARSE_NOERROR) {
6820 ctxt->sax->error = NULL;
6821 ctxt->vctxt.error = NULL;
6822 ctxt->sax->fatalError = NULL;
6823 options -= XML_PARSE_NOERROR;
6824 ctxt->options |= XML_PARSE_NOERROR;
6825 }
6826 if (options & HTML_PARSE_PEDANTIC) {
6827 ctxt->pedantic = 1;
6828 options -= XML_PARSE_PEDANTIC;
6829 ctxt->options |= XML_PARSE_PEDANTIC;
6830 } else
6831 ctxt->pedantic = 0;
6832 if (options & XML_PARSE_NOBLANKS) {
6833 ctxt->keepBlanks = 0;
6834 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6835 options -= XML_PARSE_NOBLANKS;
6836 ctxt->options |= XML_PARSE_NOBLANKS;
6837 } else
6838 ctxt->keepBlanks = 1;
6839 if (options & HTML_PARSE_RECOVER) {
6840 ctxt->recovery = 1;
6841 options -= HTML_PARSE_RECOVER;
6842 } else
6843 ctxt->recovery = 0;
6844 if (options & HTML_PARSE_COMPACT) {
6845 ctxt->options |= HTML_PARSE_COMPACT;
6846 options -= HTML_PARSE_COMPACT;
6847 }
6848 if (options & XML_PARSE_HUGE) {
6849 ctxt->options |= XML_PARSE_HUGE;
6850 options -= XML_PARSE_HUGE;
6851 }
6852 if (options & HTML_PARSE_NODEFDTD) {
6853 ctxt->options |= HTML_PARSE_NODEFDTD;
6854 options -= HTML_PARSE_NODEFDTD;
6855 }
6856 if (options & HTML_PARSE_IGNORE_ENC) {
6857 ctxt->options |= HTML_PARSE_IGNORE_ENC;
6858 options -= HTML_PARSE_IGNORE_ENC;
6859 }
6860 if (options & HTML_PARSE_NOIMPLIED) {
6861 ctxt->options |= HTML_PARSE_NOIMPLIED;
6862 options -= HTML_PARSE_NOIMPLIED;
6863 }
6864 ctxt->dictNames = 0;
6865 return (options);
6866 }
6867
6868 /**
6869 * htmlDoRead:
6870 * @ctxt: an HTML parser context
6871 * @URL: the base URL to use for the document
6872 * @encoding: the document encoding, or NULL
6873 * @options: a combination of htmlParserOption(s)
6874 * @reuse: keep the context for reuse
6875 *
6876 * Common front-end for the htmlRead functions
6877 *
6878 * Returns the resulting document tree or NULL
6879 */
6880 static htmlDocPtr
htmlDoRead(htmlParserCtxtPtr ctxt,const char * URL,const char * encoding,int options,int reuse)6881 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6882 int options, int reuse)
6883 {
6884 htmlDocPtr ret;
6885
6886 htmlCtxtUseOptions(ctxt, options);
6887 ctxt->html = 1;
6888 if (encoding != NULL) {
6889 xmlCharEncodingHandlerPtr hdlr;
6890
6891 hdlr = xmlFindCharEncodingHandler(encoding);
6892 if (hdlr != NULL) {
6893 xmlSwitchToEncoding(ctxt, hdlr);
6894 if (ctxt->input->encoding != NULL)
6895 xmlFree((xmlChar *) ctxt->input->encoding);
6896 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6897 }
6898 }
6899 if ((URL != NULL) && (ctxt->input != NULL) &&
6900 (ctxt->input->filename == NULL))
6901 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6902 htmlParseDocument(ctxt);
6903 ret = ctxt->myDoc;
6904 ctxt->myDoc = NULL;
6905 if (!reuse) {
6906 if ((ctxt->dictNames) &&
6907 (ret != NULL) &&
6908 (ret->dict == ctxt->dict))
6909 ctxt->dict = NULL;
6910 xmlFreeParserCtxt(ctxt);
6911 }
6912 return (ret);
6913 }
6914
6915 /**
6916 * htmlReadDoc:
6917 * @cur: a pointer to a zero terminated string
6918 * @URL: the base URL to use for the document
6919 * @encoding: the document encoding, or NULL
6920 * @options: a combination of htmlParserOption(s)
6921 *
6922 * parse an XML in-memory document and build a tree.
6923 *
6924 * Returns the resulting document tree
6925 */
6926 htmlDocPtr
htmlReadDoc(const xmlChar * cur,const char * URL,const char * encoding,int options)6927 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6928 {
6929 htmlParserCtxtPtr ctxt;
6930
6931 if (cur == NULL)
6932 return (NULL);
6933
6934 xmlInitParser();
6935 ctxt = htmlCreateDocParserCtxt(cur, NULL);
6936 if (ctxt == NULL)
6937 return (NULL);
6938 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6939 }
6940
6941 /**
6942 * htmlReadFile:
6943 * @filename: a file or URL
6944 * @encoding: the document encoding, or NULL
6945 * @options: a combination of htmlParserOption(s)
6946 *
6947 * parse an XML file from the filesystem or the network.
6948 *
6949 * Returns the resulting document tree
6950 */
6951 htmlDocPtr
htmlReadFile(const char * filename,const char * encoding,int options)6952 htmlReadFile(const char *filename, const char *encoding, int options)
6953 {
6954 htmlParserCtxtPtr ctxt;
6955
6956 xmlInitParser();
6957 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6958 if (ctxt == NULL)
6959 return (NULL);
6960 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6961 }
6962
6963 /**
6964 * htmlReadMemory:
6965 * @buffer: a pointer to a char array
6966 * @size: the size of the array
6967 * @URL: the base URL to use for the document
6968 * @encoding: the document encoding, or NULL
6969 * @options: a combination of htmlParserOption(s)
6970 *
6971 * parse an XML in-memory document and build a tree.
6972 *
6973 * Returns the resulting document tree
6974 */
6975 htmlDocPtr
htmlReadMemory(const char * buffer,int size,const char * URL,const char * encoding,int options)6976 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6977 {
6978 htmlParserCtxtPtr ctxt;
6979
6980 xmlInitParser();
6981 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6982 if (ctxt == NULL)
6983 return (NULL);
6984 htmlDefaultSAXHandlerInit();
6985 if (ctxt->sax != NULL)
6986 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6987 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6988 }
6989
6990 /**
6991 * htmlReadFd:
6992 * @fd: an open file descriptor
6993 * @URL: the base URL to use for the document
6994 * @encoding: the document encoding, or NULL
6995 * @options: a combination of htmlParserOption(s)
6996 *
6997 * parse an HTML from a file descriptor and build a tree.
6998 * NOTE that the file descriptor will not be closed when the
6999 * reader is closed or reset.
7000 *
7001 * Returns the resulting document tree
7002 */
7003 htmlDocPtr
htmlReadFd(int fd,const char * URL,const char * encoding,int options)7004 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
7005 {
7006 htmlParserCtxtPtr ctxt;
7007 xmlParserInputBufferPtr input;
7008 htmlParserInputPtr stream;
7009
7010 if (fd < 0)
7011 return (NULL);
7012
7013 xmlInitParser();
7014 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7015 if (input == NULL)
7016 return (NULL);
7017 input->closecallback = NULL;
7018 ctxt = htmlNewParserCtxt();
7019 if (ctxt == NULL) {
7020 xmlFreeParserInputBuffer(input);
7021 return (NULL);
7022 }
7023 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7024 if (stream == NULL) {
7025 xmlFreeParserInputBuffer(input);
7026 htmlFreeParserCtxt(ctxt);
7027 return (NULL);
7028 }
7029 inputPush(ctxt, stream);
7030 return (htmlDoRead(ctxt, URL, encoding, options, 0));
7031 }
7032
7033 /**
7034 * htmlReadIO:
7035 * @ioread: an I/O read function
7036 * @ioclose: an I/O close function
7037 * @ioctx: an I/O handler
7038 * @URL: the base URL to use for the document
7039 * @encoding: the document encoding, or NULL
7040 * @options: a combination of htmlParserOption(s)
7041 *
7042 * parse an HTML document from I/O functions and source and build a tree.
7043 *
7044 * Returns the resulting document tree
7045 */
7046 htmlDocPtr
htmlReadIO(xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)7047 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
7048 void *ioctx, const char *URL, const char *encoding, int options)
7049 {
7050 htmlParserCtxtPtr ctxt;
7051 xmlParserInputBufferPtr input;
7052 xmlParserInputPtr stream;
7053
7054 if (ioread == NULL)
7055 return (NULL);
7056 xmlInitParser();
7057
7058 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7059 XML_CHAR_ENCODING_NONE);
7060 if (input == NULL) {
7061 if (ioclose != NULL)
7062 ioclose(ioctx);
7063 return (NULL);
7064 }
7065 ctxt = htmlNewParserCtxt();
7066 if (ctxt == NULL) {
7067 xmlFreeParserInputBuffer(input);
7068 return (NULL);
7069 }
7070 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7071 if (stream == NULL) {
7072 xmlFreeParserInputBuffer(input);
7073 xmlFreeParserCtxt(ctxt);
7074 return (NULL);
7075 }
7076 inputPush(ctxt, stream);
7077 return (htmlDoRead(ctxt, URL, encoding, options, 0));
7078 }
7079
7080 /**
7081 * htmlCtxtReadDoc:
7082 * @ctxt: an HTML parser context
7083 * @cur: a pointer to a zero terminated string
7084 * @URL: the base URL to use for the document
7085 * @encoding: the document encoding, or NULL
7086 * @options: a combination of htmlParserOption(s)
7087 *
7088 * parse an XML in-memory document and build a tree.
7089 * This reuses the existing @ctxt parser context
7090 *
7091 * Returns the resulting document tree
7092 */
7093 htmlDocPtr
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt,const xmlChar * cur,const char * URL,const char * encoding,int options)7094 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
7095 const char *URL, const char *encoding, int options)
7096 {
7097 xmlParserInputPtr stream;
7098
7099 if (cur == NULL)
7100 return (NULL);
7101 if (ctxt == NULL)
7102 return (NULL);
7103 xmlInitParser();
7104
7105 htmlCtxtReset(ctxt);
7106
7107 stream = xmlNewStringInputStream(ctxt, cur);
7108 if (stream == NULL) {
7109 return (NULL);
7110 }
7111 inputPush(ctxt, stream);
7112 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7113 }
7114
7115 /**
7116 * htmlCtxtReadFile:
7117 * @ctxt: an HTML parser context
7118 * @filename: a file or URL
7119 * @encoding: the document encoding, or NULL
7120 * @options: a combination of htmlParserOption(s)
7121 *
7122 * parse an XML file from the filesystem or the network.
7123 * This reuses the existing @ctxt parser context
7124 *
7125 * Returns the resulting document tree
7126 */
7127 htmlDocPtr
htmlCtxtReadFile(htmlParserCtxtPtr ctxt,const char * filename,const char * encoding,int options)7128 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7129 const char *encoding, int options)
7130 {
7131 xmlParserInputPtr stream;
7132
7133 if (filename == NULL)
7134 return (NULL);
7135 if (ctxt == NULL)
7136 return (NULL);
7137 xmlInitParser();
7138
7139 htmlCtxtReset(ctxt);
7140
7141 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7142 if (stream == NULL) {
7143 return (NULL);
7144 }
7145 inputPush(ctxt, stream);
7146 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7147 }
7148
7149 /**
7150 * htmlCtxtReadMemory:
7151 * @ctxt: an HTML parser context
7152 * @buffer: a pointer to a char array
7153 * @size: the size of the array
7154 * @URL: the base URL to use for the document
7155 * @encoding: the document encoding, or NULL
7156 * @options: a combination of htmlParserOption(s)
7157 *
7158 * parse an XML in-memory document and build a tree.
7159 * This reuses the existing @ctxt parser context
7160 *
7161 * Returns the resulting document tree
7162 */
7163 htmlDocPtr
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt,const char * buffer,int size,const char * URL,const char * encoding,int options)7164 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7165 const char *URL, const char *encoding, int options)
7166 {
7167 xmlParserInputBufferPtr input;
7168 xmlParserInputPtr stream;
7169
7170 if (ctxt == NULL)
7171 return (NULL);
7172 if (buffer == NULL)
7173 return (NULL);
7174 xmlInitParser();
7175
7176 htmlCtxtReset(ctxt);
7177
7178 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7179 if (input == NULL) {
7180 return(NULL);
7181 }
7182
7183 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7184 if (stream == NULL) {
7185 xmlFreeParserInputBuffer(input);
7186 return(NULL);
7187 }
7188
7189 inputPush(ctxt, stream);
7190 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7191 }
7192
7193 /**
7194 * htmlCtxtReadFd:
7195 * @ctxt: an HTML parser context
7196 * @fd: an open file descriptor
7197 * @URL: the base URL to use for the document
7198 * @encoding: the document encoding, or NULL
7199 * @options: a combination of htmlParserOption(s)
7200 *
7201 * parse an XML from a file descriptor and build a tree.
7202 * This reuses the existing @ctxt parser context
7203 *
7204 * Returns the resulting document tree
7205 */
7206 htmlDocPtr
htmlCtxtReadFd(htmlParserCtxtPtr ctxt,int fd,const char * URL,const char * encoding,int options)7207 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7208 const char *URL, const char *encoding, int options)
7209 {
7210 xmlParserInputBufferPtr input;
7211 xmlParserInputPtr stream;
7212
7213 if (fd < 0)
7214 return (NULL);
7215 if (ctxt == NULL)
7216 return (NULL);
7217 xmlInitParser();
7218
7219 htmlCtxtReset(ctxt);
7220
7221
7222 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7223 if (input == NULL)
7224 return (NULL);
7225 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7226 if (stream == NULL) {
7227 xmlFreeParserInputBuffer(input);
7228 return (NULL);
7229 }
7230 inputPush(ctxt, stream);
7231 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7232 }
7233
7234 /**
7235 * htmlCtxtReadIO:
7236 * @ctxt: an HTML parser context
7237 * @ioread: an I/O read function
7238 * @ioclose: an I/O close function
7239 * @ioctx: an I/O handler
7240 * @URL: the base URL to use for the document
7241 * @encoding: the document encoding, or NULL
7242 * @options: a combination of htmlParserOption(s)
7243 *
7244 * parse an HTML document from I/O functions and source and build a tree.
7245 * This reuses the existing @ctxt parser context
7246 *
7247 * Returns the resulting document tree
7248 */
7249 htmlDocPtr
htmlCtxtReadIO(htmlParserCtxtPtr ctxt,xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)7250 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7251 xmlInputCloseCallback ioclose, void *ioctx,
7252 const char *URL,
7253 const char *encoding, int options)
7254 {
7255 xmlParserInputBufferPtr input;
7256 xmlParserInputPtr stream;
7257
7258 if (ioread == NULL)
7259 return (NULL);
7260 if (ctxt == NULL)
7261 return (NULL);
7262 xmlInitParser();
7263
7264 htmlCtxtReset(ctxt);
7265
7266 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7267 XML_CHAR_ENCODING_NONE);
7268 if (input == NULL) {
7269 if (ioclose != NULL)
7270 ioclose(ioctx);
7271 return (NULL);
7272 }
7273 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7274 if (stream == NULL) {
7275 xmlFreeParserInputBuffer(input);
7276 return (NULL);
7277 }
7278 inputPush(ctxt, stream);
7279 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7280 }
7281
7282 #define bottom_HTMLparser
7283 #include "elfgcchack.h"
7284 #endif /* LIBXML_HTML_ENABLED */
7285