1 /* 2 * Summary: interface for an HTML 4.0 non-verifying parser 3 * Description: this module implements an HTML 4.0 non-verifying parser 4 * with API compatible with the XML parser ones. It should 5 * be able to parse "real world" HTML, even if severely 6 * broken from a specification point of view. 7 * 8 * Copy: See Copyright for the status of this software. 9 * 10 * Author: Daniel Veillard 11 */ 12 13 #ifndef __HTML_PARSER_H__ 14 #define __HTML_PARSER_H__ 15 #include <libxml/xmlversion.h> 16 #include <libxml/parser.h> 17 18 #ifdef LIBXML_HTML_ENABLED 19 20 #ifdef __cplusplus 21 extern "C" { 22 #endif 23 24 /* 25 * Most of the back-end structures from XML and HTML are shared. 26 */ 27 typedef xmlParserCtxt htmlParserCtxt; 28 typedef xmlParserCtxtPtr htmlParserCtxtPtr; 29 typedef xmlParserNodeInfo htmlParserNodeInfo; 30 typedef xmlSAXHandler htmlSAXHandler; 31 typedef xmlSAXHandlerPtr htmlSAXHandlerPtr; 32 typedef xmlParserInput htmlParserInput; 33 typedef xmlParserInputPtr htmlParserInputPtr; 34 typedef xmlDocPtr htmlDocPtr; 35 typedef xmlNodePtr htmlNodePtr; 36 37 /* 38 * Internal description of an HTML element, representing HTML 4.01 39 * and XHTML 1.0 (which share the same structure). 40 */ 41 typedef struct _htmlElemDesc htmlElemDesc; 42 typedef htmlElemDesc *htmlElemDescPtr; 43 struct _htmlElemDesc { 44 const char *name; /* The tag name */ 45 char startTag; /* Whether the start tag can be implied */ 46 char endTag; /* Whether the end tag can be implied */ 47 char saveEndTag; /* Whether the end tag should be saved */ 48 char empty; /* Is this an empty element ? */ 49 char depr; /* Is this a deprecated element ? */ 50 char dtd; /* 1: only in Loose DTD, 2: only Frameset one */ 51 char isinline; /* is this a block 0 or inline 1 element */ 52 const char *desc; /* the description */ 53 54 /* NRK Jan.2003 55 * New fields encapsulating HTML structure 56 * 57 * Bugs: 58 * This is a very limited representation. It fails to tell us when 59 * an element *requires* subelements (we only have whether they're 60 * allowed or not), and it doesn't tell us where CDATA and PCDATA 61 * are allowed. Some element relationships are not fully represented: 62 * these are flagged with the word MODIFIER 63 */ 64 const char** subelts; /* allowed sub-elements of this element */ 65 const char* defaultsubelt; /* subelement for suggested auto-repair 66 if necessary or NULL */ 67 const char** attrs_opt; /* Optional Attributes */ 68 const char** attrs_depr; /* Additional deprecated attributes */ 69 const char** attrs_req; /* Required attributes */ 70 }; 71 72 /* 73 * Internal description of an HTML entity. 74 */ 75 typedef struct _htmlEntityDesc htmlEntityDesc; 76 typedef htmlEntityDesc *htmlEntityDescPtr; 77 struct _htmlEntityDesc { 78 unsigned int value; /* the UNICODE value for the character */ 79 const char *name; /* The entity name */ 80 const char *desc; /* the description */ 81 }; 82 83 #ifdef LIBXML_SAX1_ENABLED 84 85 XML_DEPRECATED 86 XMLPUBVAR const xmlSAXHandlerV1 htmlDefaultSAXHandler; 87 88 #ifdef LIBXML_THREAD_ENABLED 89 XML_DEPRECATED 90 XMLPUBFUN const xmlSAXHandlerV1 *__htmlDefaultSAXHandler(void); 91 #endif 92 93 #endif /* LIBXML_SAX1_ENABLED */ 94 95 /* 96 * There is only few public functions. 97 */ 98 XML_DEPRECATED 99 XMLPUBFUN void 100 htmlInitAutoClose (void); 101 XMLPUBFUN const htmlElemDesc * 102 htmlTagLookup (const xmlChar *tag); 103 XMLPUBFUN const htmlEntityDesc * 104 htmlEntityLookup(const xmlChar *name); 105 XMLPUBFUN const htmlEntityDesc * 106 htmlEntityValueLookup(unsigned int value); 107 108 XMLPUBFUN int 109 htmlIsAutoClosed(htmlDocPtr doc, 110 htmlNodePtr elem); 111 XMLPUBFUN int 112 htmlAutoCloseTag(htmlDocPtr doc, 113 const xmlChar *name, 114 htmlNodePtr elem); 115 XML_DEPRECATED 116 XMLPUBFUN const htmlEntityDesc * 117 htmlParseEntityRef(htmlParserCtxtPtr ctxt, 118 const xmlChar **str); 119 XML_DEPRECATED 120 XMLPUBFUN int 121 htmlParseCharRef(htmlParserCtxtPtr ctxt); 122 XML_DEPRECATED 123 XMLPUBFUN void 124 htmlParseElement(htmlParserCtxtPtr ctxt); 125 126 XMLPUBFUN htmlParserCtxtPtr 127 htmlNewParserCtxt(void); 128 XMLPUBFUN htmlParserCtxtPtr 129 htmlNewSAXParserCtxt(const htmlSAXHandler *sax, 130 void *userData); 131 132 XMLPUBFUN htmlParserCtxtPtr 133 htmlCreateMemoryParserCtxt(const char *buffer, 134 int size); 135 136 XMLPUBFUN int 137 htmlParseDocument(htmlParserCtxtPtr ctxt); 138 XML_DEPRECATED 139 XMLPUBFUN htmlDocPtr 140 htmlSAXParseDoc (const xmlChar *cur, 141 const char *encoding, 142 htmlSAXHandlerPtr sax, 143 void *userData); 144 XMLPUBFUN htmlDocPtr 145 htmlParseDoc (const xmlChar *cur, 146 const char *encoding); 147 XMLPUBFUN htmlParserCtxtPtr 148 htmlCreateFileParserCtxt(const char *filename, 149 const char *encoding); 150 XML_DEPRECATED 151 XMLPUBFUN htmlDocPtr 152 htmlSAXParseFile(const char *filename, 153 const char *encoding, 154 htmlSAXHandlerPtr sax, 155 void *userData); 156 XMLPUBFUN htmlDocPtr 157 htmlParseFile (const char *filename, 158 const char *encoding); 159 XMLPUBFUN int 160 UTF8ToHtml (unsigned char *out, 161 int *outlen, 162 const unsigned char *in, 163 int *inlen); 164 XMLPUBFUN int 165 htmlEncodeEntities(unsigned char *out, 166 int *outlen, 167 const unsigned char *in, 168 int *inlen, int quoteChar); 169 XMLPUBFUN int 170 htmlIsScriptAttribute(const xmlChar *name); 171 XML_DEPRECATED 172 XMLPUBFUN int 173 htmlHandleOmittedElem(int val); 174 175 #ifdef LIBXML_PUSH_ENABLED 176 /** 177 * Interfaces for the Push mode. 178 */ 179 XMLPUBFUN htmlParserCtxtPtr 180 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, 181 void *user_data, 182 const char *chunk, 183 int size, 184 const char *filename, 185 xmlCharEncoding enc); 186 XMLPUBFUN int 187 htmlParseChunk (htmlParserCtxtPtr ctxt, 188 const char *chunk, 189 int size, 190 int terminate); 191 #endif /* LIBXML_PUSH_ENABLED */ 192 193 XMLPUBFUN void 194 htmlFreeParserCtxt (htmlParserCtxtPtr ctxt); 195 196 /* 197 * New set of simpler/more flexible APIs 198 */ 199 /** 200 * xmlParserOption: 201 * 202 * This is the set of XML parser options that can be passed down 203 * to the xmlReadDoc() and similar calls. 204 */ 205 typedef enum { 206 HTML_PARSE_RECOVER = 1<<0, /* Relaxed parsing */ 207 HTML_PARSE_NODEFDTD = 1<<2, /* do not default a doctype if not found */ 208 HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */ 209 HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */ 210 HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */ 211 HTML_PARSE_NOBLANKS = 1<<8, /* remove blank nodes */ 212 HTML_PARSE_NONET = 1<<11,/* Forbid network access */ 213 HTML_PARSE_NOIMPLIED= 1<<13,/* Do not add implied html/body... elements */ 214 HTML_PARSE_COMPACT = 1<<16,/* compact small text nodes */ 215 HTML_PARSE_IGNORE_ENC=1<<21 /* ignore internal document encoding hint */ 216 } htmlParserOption; 217 218 XMLPUBFUN void 219 htmlCtxtReset (htmlParserCtxtPtr ctxt); 220 XMLPUBFUN int 221 htmlCtxtUseOptions (htmlParserCtxtPtr ctxt, 222 int options); 223 XMLPUBFUN htmlDocPtr 224 htmlReadDoc (const xmlChar *cur, 225 const char *URL, 226 const char *encoding, 227 int options); 228 XMLPUBFUN htmlDocPtr 229 htmlReadFile (const char *URL, 230 const char *encoding, 231 int options); 232 XMLPUBFUN htmlDocPtr 233 htmlReadMemory (const char *buffer, 234 int size, 235 const char *URL, 236 const char *encoding, 237 int options); 238 XMLPUBFUN htmlDocPtr 239 htmlReadFd (int fd, 240 const char *URL, 241 const char *encoding, 242 int options); 243 XMLPUBFUN htmlDocPtr 244 htmlReadIO (xmlInputReadCallback ioread, 245 xmlInputCloseCallback ioclose, 246 void *ioctx, 247 const char *URL, 248 const char *encoding, 249 int options); 250 XMLPUBFUN htmlDocPtr 251 htmlCtxtParseDocument (htmlParserCtxtPtr ctxt, 252 xmlParserInputPtr input); 253 XMLPUBFUN htmlDocPtr 254 htmlCtxtReadDoc (xmlParserCtxtPtr ctxt, 255 const xmlChar *cur, 256 const char *URL, 257 const char *encoding, 258 int options); 259 XMLPUBFUN htmlDocPtr 260 htmlCtxtReadFile (xmlParserCtxtPtr ctxt, 261 const char *filename, 262 const char *encoding, 263 int options); 264 XMLPUBFUN htmlDocPtr 265 htmlCtxtReadMemory (xmlParserCtxtPtr ctxt, 266 const char *buffer, 267 int size, 268 const char *URL, 269 const char *encoding, 270 int options); 271 XMLPUBFUN htmlDocPtr 272 htmlCtxtReadFd (xmlParserCtxtPtr ctxt, 273 int fd, 274 const char *URL, 275 const char *encoding, 276 int options); 277 XMLPUBFUN htmlDocPtr 278 htmlCtxtReadIO (xmlParserCtxtPtr ctxt, 279 xmlInputReadCallback ioread, 280 xmlInputCloseCallback ioclose, 281 void *ioctx, 282 const char *URL, 283 const char *encoding, 284 int options); 285 286 /* NRK/Jan2003: further knowledge of HTML structure 287 */ 288 typedef enum { 289 HTML_NA = 0 , /* something we don't check at all */ 290 HTML_INVALID = 0x1 , 291 HTML_DEPRECATED = 0x2 , 292 HTML_VALID = 0x4 , 293 HTML_REQUIRED = 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE */ 294 } htmlStatus ; 295 296 /* Using htmlElemDesc rather than name here, to emphasise the fact 297 that otherwise there's a lookup overhead 298 */ 299 XMLPUBFUN htmlStatus htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ; 300 XMLPUBFUN int htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ; 301 XMLPUBFUN htmlStatus htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ; 302 XMLPUBFUN htmlStatus htmlNodeStatus(htmlNodePtr, int) ; 303 /** 304 * htmlDefaultSubelement: 305 * @elt: HTML element 306 * 307 * Returns the default subelement for this element 308 */ 309 #define htmlDefaultSubelement(elt) elt->defaultsubelt 310 /** 311 * htmlElementAllowedHereDesc: 312 * @parent: HTML parent element 313 * @elt: HTML element 314 * 315 * Checks whether an HTML element description may be a 316 * direct child of the specified element. 317 * 318 * Returns 1 if allowed; 0 otherwise. 319 */ 320 #define htmlElementAllowedHereDesc(parent,elt) \ 321 htmlElementAllowedHere((parent), (elt)->name) 322 /** 323 * htmlRequiredAttrs: 324 * @elt: HTML element 325 * 326 * Returns the attributes required for the specified element. 327 */ 328 #define htmlRequiredAttrs(elt) (elt)->attrs_req 329 330 331 #ifdef __cplusplus 332 } 333 #endif 334 335 #endif /* LIBXML_HTML_ENABLED */ 336 #endif /* __HTML_PARSER_H__ */ 337