1 /* 2 * Summary: interface for an HTML 4.0 non-verifying parser 3 * Description: this module implements an HTML 4.0 non-verifying parser 4 * with API compatible with the XML parser ones. It should 5 * be able to parse "real world" HTML, even if severely 6 * broken from a specification point of view. 7 * 8 * Copy: See Copyright for the status of this software. 9 * 10 * Author: Daniel Veillard 11 */ 12 13 #ifndef __HTML_PARSER_H__ 14 #define __HTML_PARSER_H__ 15 #include <libxml/xmlversion.h> 16 #include <libxml/parser.h> 17 18 #ifdef LIBXML_HTML_ENABLED 19 20 #ifdef __cplusplus 21 extern "C" { 22 #endif 23 24 /* 25 * Most of the back-end structures from XML and HTML are shared. 26 */ 27 typedef xmlParserCtxt htmlParserCtxt; 28 typedef xmlParserCtxtPtr htmlParserCtxtPtr; 29 typedef xmlParserNodeInfo htmlParserNodeInfo; 30 typedef xmlSAXHandler htmlSAXHandler; 31 typedef xmlSAXHandlerPtr htmlSAXHandlerPtr; 32 typedef xmlParserInput htmlParserInput; 33 typedef xmlParserInputPtr htmlParserInputPtr; 34 typedef xmlDocPtr htmlDocPtr; 35 typedef xmlNodePtr htmlNodePtr; 36 37 /* 38 * Internal description of an HTML element, representing HTML 4.01 39 * and XHTML 1.0 (which share the same structure). 40 */ 41 typedef struct _htmlElemDesc htmlElemDesc; 42 typedef htmlElemDesc *htmlElemDescPtr; 43 struct _htmlElemDesc { 44 const char *name; /* The tag name */ 45 char startTag; /* Whether the start tag can be implied */ 46 char endTag; /* Whether the end tag can be implied */ 47 char saveEndTag; /* Whether the end tag should be saved */ 48 char empty; /* Is this an empty element ? */ 49 char depr; /* Is this a deprecated element ? */ 50 char dtd; /* 1: only in Loose DTD, 2: only Frameset one */ 51 char isinline; /* is this a block 0 or inline 1 element */ 52 const char *desc; /* the description */ 53 54 /* NRK Jan.2003 55 * New fields encapsulating HTML structure 56 * 57 * Bugs: 58 * This is a very limited representation. It fails to tell us when 59 * an element *requires* subelements (we only have whether they're 60 * allowed or not), and it doesn't tell us where CDATA and PCDATA 61 * are allowed. Some element relationships are not fully represented: 62 * these are flagged with the word MODIFIER 63 */ 64 const char** subelts; /* allowed sub-elements of this element */ 65 const char* defaultsubelt; /* subelement for suggested auto-repair 66 if necessary or NULL */ 67 const char** attrs_opt; /* Optional Attributes */ 68 const char** attrs_depr; /* Additional deprecated attributes */ 69 const char** attrs_req; /* Required attributes */ 70 }; 71 72 /* 73 * Internal description of an HTML entity. 74 */ 75 typedef struct _htmlEntityDesc htmlEntityDesc; 76 typedef htmlEntityDesc *htmlEntityDescPtr; 77 struct _htmlEntityDesc { 78 unsigned int value; /* the UNICODE value for the character */ 79 const char *name; /* The entity name */ 80 const char *desc; /* the description */ 81 }; 82 83 /* 84 * There is only few public functions. 85 */ 86 XMLPUBFUN const htmlElemDesc * XMLCALL 87 htmlTagLookup (const xmlChar *tag); 88 XMLPUBFUN const htmlEntityDesc * XMLCALL 89 htmlEntityLookup(const xmlChar *name); 90 XMLPUBFUN const htmlEntityDesc * XMLCALL 91 htmlEntityValueLookup(unsigned int value); 92 93 XMLPUBFUN int XMLCALL 94 htmlIsAutoClosed(htmlDocPtr doc, 95 htmlNodePtr elem); 96 XMLPUBFUN int XMLCALL 97 htmlAutoCloseTag(htmlDocPtr doc, 98 const xmlChar *name, 99 htmlNodePtr elem); 100 XMLPUBFUN const htmlEntityDesc * XMLCALL 101 htmlParseEntityRef(htmlParserCtxtPtr ctxt, 102 const xmlChar **str); 103 XMLPUBFUN int XMLCALL 104 htmlParseCharRef(htmlParserCtxtPtr ctxt); 105 XMLPUBFUN void XMLCALL 106 htmlParseElement(htmlParserCtxtPtr ctxt); 107 108 XMLPUBFUN htmlParserCtxtPtr XMLCALL 109 htmlNewParserCtxt(void); 110 111 XMLPUBFUN htmlParserCtxtPtr XMLCALL 112 htmlCreateMemoryParserCtxt(const char *buffer, 113 int size); 114 115 XMLPUBFUN int XMLCALL 116 htmlParseDocument(htmlParserCtxtPtr ctxt); 117 XMLPUBFUN htmlDocPtr XMLCALL 118 htmlSAXParseDoc (const xmlChar *cur, 119 const char *encoding, 120 htmlSAXHandlerPtr sax, 121 void *userData); 122 XMLPUBFUN htmlDocPtr XMLCALL 123 htmlParseDoc (const xmlChar *cur, 124 const char *encoding); 125 XMLPUBFUN htmlDocPtr XMLCALL 126 htmlSAXParseFile(const char *filename, 127 const char *encoding, 128 htmlSAXHandlerPtr sax, 129 void *userData); 130 XMLPUBFUN htmlDocPtr XMLCALL 131 htmlParseFile (const char *filename, 132 const char *encoding); 133 XMLPUBFUN int XMLCALL 134 UTF8ToHtml (unsigned char *out, 135 int *outlen, 136 const unsigned char *in, 137 int *inlen); 138 XMLPUBFUN int XMLCALL 139 htmlEncodeEntities(unsigned char *out, 140 int *outlen, 141 const unsigned char *in, 142 int *inlen, int quoteChar); 143 XMLPUBFUN int XMLCALL 144 htmlIsScriptAttribute(const xmlChar *name); 145 XMLPUBFUN int XMLCALL 146 htmlHandleOmittedElem(int val); 147 148 #ifdef LIBXML_PUSH_ENABLED 149 /** 150 * Interfaces for the Push mode. 151 */ 152 XMLPUBFUN htmlParserCtxtPtr XMLCALL 153 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, 154 void *user_data, 155 const char *chunk, 156 int size, 157 const char *filename, 158 xmlCharEncoding enc); 159 XMLPUBFUN int XMLCALL 160 htmlParseChunk (htmlParserCtxtPtr ctxt, 161 const char *chunk, 162 int size, 163 int terminate); 164 #endif /* LIBXML_PUSH_ENABLED */ 165 166 XMLPUBFUN void XMLCALL 167 htmlFreeParserCtxt (htmlParserCtxtPtr ctxt); 168 169 /* 170 * New set of simpler/more flexible APIs 171 */ 172 /** 173 * xmlParserOption: 174 * 175 * This is the set of XML parser options that can be passed down 176 * to the xmlReadDoc() and similar calls. 177 */ 178 typedef enum { 179 HTML_PARSE_RECOVER = 1<<0, /* Relaxed parsing */ 180 HTML_PARSE_NODEFDTD = 1<<2, /* do not default a doctype if not found */ 181 HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */ 182 HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */ 183 HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */ 184 HTML_PARSE_NOBLANKS = 1<<8, /* remove blank nodes */ 185 HTML_PARSE_NONET = 1<<11,/* Forbid network access */ 186 HTML_PARSE_NOIMPLIED= 1<<13,/* Do not add implied html/body... elements */ 187 HTML_PARSE_COMPACT = 1<<16,/* compact small text nodes */ 188 HTML_PARSE_IGNORE_ENC=1<<21 /* ignore internal document encoding hint */ 189 } htmlParserOption; 190 191 XMLPUBFUN void XMLCALL 192 htmlCtxtReset (htmlParserCtxtPtr ctxt); 193 XMLPUBFUN int XMLCALL 194 htmlCtxtUseOptions (htmlParserCtxtPtr ctxt, 195 int options); 196 XMLPUBFUN htmlDocPtr XMLCALL 197 htmlReadDoc (const xmlChar *cur, 198 const char *URL, 199 const char *encoding, 200 int options); 201 XMLPUBFUN htmlDocPtr XMLCALL 202 htmlReadFile (const char *URL, 203 const char *encoding, 204 int options); 205 XMLPUBFUN htmlDocPtr XMLCALL 206 htmlReadMemory (const char *buffer, 207 int size, 208 const char *URL, 209 const char *encoding, 210 int options); 211 XMLPUBFUN htmlDocPtr XMLCALL 212 htmlReadFd (int fd, 213 const char *URL, 214 const char *encoding, 215 int options); 216 XMLPUBFUN htmlDocPtr XMLCALL 217 htmlReadIO (xmlInputReadCallback ioread, 218 xmlInputCloseCallback ioclose, 219 void *ioctx, 220 const char *URL, 221 const char *encoding, 222 int options); 223 XMLPUBFUN htmlDocPtr XMLCALL 224 htmlCtxtReadDoc (xmlParserCtxtPtr ctxt, 225 const xmlChar *cur, 226 const char *URL, 227 const char *encoding, 228 int options); 229 XMLPUBFUN htmlDocPtr XMLCALL 230 htmlCtxtReadFile (xmlParserCtxtPtr ctxt, 231 const char *filename, 232 const char *encoding, 233 int options); 234 XMLPUBFUN htmlDocPtr XMLCALL 235 htmlCtxtReadMemory (xmlParserCtxtPtr ctxt, 236 const char *buffer, 237 int size, 238 const char *URL, 239 const char *encoding, 240 int options); 241 XMLPUBFUN htmlDocPtr XMLCALL 242 htmlCtxtReadFd (xmlParserCtxtPtr ctxt, 243 int fd, 244 const char *URL, 245 const char *encoding, 246 int options); 247 XMLPUBFUN htmlDocPtr XMLCALL 248 htmlCtxtReadIO (xmlParserCtxtPtr ctxt, 249 xmlInputReadCallback ioread, 250 xmlInputCloseCallback ioclose, 251 void *ioctx, 252 const char *URL, 253 const char *encoding, 254 int options); 255 256 /* NRK/Jan2003: further knowledge of HTML structure 257 */ 258 typedef enum { 259 HTML_NA = 0 , /* something we don't check at all */ 260 HTML_INVALID = 0x1 , 261 HTML_DEPRECATED = 0x2 , 262 HTML_VALID = 0x4 , 263 HTML_REQUIRED = 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE */ 264 } htmlStatus ; 265 266 /* Using htmlElemDesc rather than name here, to emphasise the fact 267 that otherwise there's a lookup overhead 268 */ 269 XMLPUBFUN htmlStatus XMLCALL htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ; 270 XMLPUBFUN int XMLCALL htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ; 271 XMLPUBFUN htmlStatus XMLCALL htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ; 272 XMLPUBFUN htmlStatus XMLCALL htmlNodeStatus(const htmlNodePtr, int) ; 273 /** 274 * htmlDefaultSubelement: 275 * @elt: HTML element 276 * 277 * Returns the default subelement for this element 278 */ 279 #define htmlDefaultSubelement(elt) elt->defaultsubelt 280 /** 281 * htmlElementAllowedHereDesc: 282 * @parent: HTML parent element 283 * @elt: HTML element 284 * 285 * Checks whether an HTML element description may be a 286 * direct child of the specified element. 287 * 288 * Returns 1 if allowed; 0 otherwise. 289 */ 290 #define htmlElementAllowedHereDesc(parent,elt) \ 291 htmlElementAllowedHere((parent), (elt)->name) 292 /** 293 * htmlRequiredAttrs: 294 * @elt: HTML element 295 * 296 * Returns the attributes required for the specified element. 297 */ 298 #define htmlRequiredAttrs(elt) (elt)->attrs_req 299 300 301 #ifdef __cplusplus 302 } 303 #endif 304 305 #endif /* LIBXML_HTML_ENABLED */ 306 #endif /* __HTML_PARSER_H__ */ 307