1 // This file is part of TagSoup and is Copyright 2002-2008 by John Cowan. 2 // 3 // TagSoup is licensed under the Apache License, 4 // Version 2.0. You may obtain a copy of this license at 5 // http://www.apache.org/licenses/LICENSE-2.0 . You may also have 6 // additional legal rights not granted by this license. 7 // 8 // TagSoup is distributed in the hope that it will be useful, but 9 // unless required by applicable law or agreed to in writing, TagSoup 10 // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 11 // OF ANY KIND, either express or implied; not even the implied warranty 12 // of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 13 // 14 // 15 // The TagSoup parser 16 17 package org.ccil.cowan.tagsoup; 18 import java.util.HashMap; 19 import java.util.ArrayList; 20 import java.util.Locale; 21 import java.io.*; 22 import java.net.URL; 23 import java.net.URLConnection; 24 import org.xml.sax.*; 25 import org.xml.sax.helpers.DefaultHandler; 26 import org.xml.sax.ext.LexicalHandler; 27 28 29 /** 30 The SAX parser class. 31 **/ 32 public class Parser extends DefaultHandler implements ScanHandler, XMLReader, LexicalHandler { 33 34 // XMLReader implementation 35 36 private ContentHandler theContentHandler = this; 37 private LexicalHandler theLexicalHandler = this; 38 private DTDHandler theDTDHandler = this; 39 private ErrorHandler theErrorHandler = this; 40 private EntityResolver theEntityResolver = this; 41 private Schema theSchema; 42 private Scanner theScanner; 43 private AutoDetector theAutoDetector; 44 45 // Default values for feature flags 46 47 private static boolean DEFAULT_NAMESPACES = true; 48 private static boolean DEFAULT_IGNORE_BOGONS = false; 49 private static boolean DEFAULT_BOGONS_EMPTY = false; 50 private static boolean DEFAULT_ROOT_BOGONS = true; 51 private static boolean DEFAULT_DEFAULT_ATTRIBUTES = true; 52 private static boolean DEFAULT_TRANSLATE_COLONS = false; 53 private static boolean DEFAULT_RESTART_ELEMENTS = true; 54 private static boolean DEFAULT_IGNORABLE_WHITESPACE = false; 55 private static boolean DEFAULT_CDATA_ELEMENTS = true; 56 57 // Feature flags. 58 59 private boolean namespaces = DEFAULT_NAMESPACES; 60 private boolean ignoreBogons = DEFAULT_IGNORE_BOGONS; 61 private boolean bogonsEmpty = DEFAULT_BOGONS_EMPTY; 62 private boolean rootBogons = DEFAULT_ROOT_BOGONS; 63 private boolean defaultAttributes = DEFAULT_DEFAULT_ATTRIBUTES; 64 private boolean translateColons = DEFAULT_TRANSLATE_COLONS; 65 private boolean restartElements = DEFAULT_RESTART_ELEMENTS; 66 private boolean ignorableWhitespace = DEFAULT_IGNORABLE_WHITESPACE; 67 private boolean CDATAElements = DEFAULT_CDATA_ELEMENTS; 68 69 /** 70 A value of "true" indicates namespace URIs and unprefixed local 71 names for element and attribute names will be available. 72 **/ 73 public final static String namespacesFeature = 74 "http://xml.org/sax/features/namespaces"; 75 76 /** 77 A value of "true" indicates that XML qualified names (with prefixes) 78 and attributes (including xmlns* attributes) will be available. 79 We don't support this value. 80 **/ 81 public final static String namespacePrefixesFeature = 82 "http://xml.org/sax/features/namespace-prefixes"; 83 84 /** 85 Reports whether this parser processes external general entities 86 (it doesn't). 87 **/ 88 public final static String externalGeneralEntitiesFeature = 89 "http://xml.org/sax/features/external-general-entities"; 90 91 /** 92 Reports whether this parser processes external parameter entities 93 (it doesn't). 94 **/ 95 public final static String externalParameterEntitiesFeature = 96 "http://xml.org/sax/features/external-parameter-entities"; 97 98 /** 99 May be examined only during a parse, after the startDocument() 100 callback has been completed; read-only. The value is true if 101 the document specified standalone="yes" in its XML declaration, 102 and otherwise is false. (It's always false.) 103 **/ 104 public final static String isStandaloneFeature = 105 "http://xml.org/sax/features/is-standalone"; 106 107 /** 108 A value of "true" indicates that the LexicalHandler will report 109 the beginning and end of parameter entities (it won't). 110 **/ 111 public final static String lexicalHandlerParameterEntitiesFeature = 112 "http://xml.org/sax/features/lexical-handler/parameter-entities"; 113 114 /** 115 A value of "true" indicates that system IDs in declarations will 116 be absolutized (relative to their base URIs) before reporting. 117 (This returns true but doesn't actually do anything.) 118 **/ 119 public final static String resolveDTDURIsFeature = 120 "http://xml.org/sax/features/resolve-dtd-uris"; 121 122 /** 123 Has a value of "true" if all XML names (for elements, 124 prefixes, attributes, entities, notations, and local 125 names), as well as Namespace URIs, will have been interned 126 using java.lang.String.intern. This supports fast testing of 127 equality/inequality against string constants, rather than forcing 128 slower calls to String.equals(). (We always intern.) 129 **/ 130 public final static String stringInterningFeature = 131 "http://xml.org/sax/features/string-interning"; 132 133 /** 134 Returns "true" if the Attributes objects passed by this 135 parser in ContentHandler.startElement() implement the 136 org.xml.sax.ext.Attributes2 interface. (They don't.) 137 **/ 138 139 public final static String useAttributes2Feature = 140 "http://xml.org/sax/features/use-attributes2"; 141 142 /** 143 Returns "true" if the Locator objects passed by this parser 144 in ContentHandler.setDocumentLocator() implement the 145 org.xml.sax.ext.Locator2 interface. (They don't.) 146 **/ 147 public final static String useLocator2Feature = 148 "http://xml.org/sax/features/use-locator2"; 149 150 /** 151 Returns "true" if, when setEntityResolver is given an object 152 implementing the org.xml.sax.ext.EntityResolver2 interface, 153 those new methods will be used. (They won't be.) 154 **/ 155 public final static String useEntityResolver2Feature = 156 "http://xml.org/sax/features/use-entity-resolver2"; 157 158 /** 159 Controls whether the parser is reporting all validity errors 160 (We don't report any validity errors.) 161 **/ 162 public final static String validationFeature = 163 "http://xml.org/sax/features/validation"; 164 165 /** 166 Controls whether the parser reports Unicode normalization 167 errors as described in section 2.13 and Appendix B of the XML 168 1.1 Recommendation. (We don't normalize.) 169 **/ 170 public final static String unicodeNormalizationCheckingFeature = 171 "http://xml.org/sax/features/unicode-normalization-checking"; 172 173 /** 174 Controls whether, when the namespace-prefixes feature is set, 175 the parser treats namespace declaration attributes as being in 176 the http://www.w3.org/2000/xmlns/ namespace. (It doesn't.) 177 **/ 178 public final static String xmlnsURIsFeature = 179 "http://xml.org/sax/features/xmlns-uris"; 180 181 /** 182 Returns "true" if the parser supports both XML 1.1 and XML 1.0. 183 (Always false.) 184 **/ 185 public final static String XML11Feature = 186 "http://xml.org/sax/features/xml-1.1"; 187 188 /** 189 A value of "true" indicates that the parser will ignore 190 unknown elements. 191 **/ 192 public final static String ignoreBogonsFeature = 193 "http://www.ccil.org/~cowan/tagsoup/features/ignore-bogons"; 194 195 /** 196 A value of "true" indicates that the parser will give unknown 197 elements a content model of EMPTY; a value of "false", a 198 content model of ANY. 199 **/ 200 public final static String bogonsEmptyFeature = 201 "http://www.ccil.org/~cowan/tagsoup/features/bogons-empty"; 202 203 /** 204 A value of "true" indicates that the parser will allow unknown 205 elements to be the root element. 206 **/ 207 public final static String rootBogonsFeature = 208 "http://www.ccil.org/~cowan/tagsoup/features/root-bogons"; 209 210 /** 211 A value of "true" indicates that the parser will return default 212 attribute values for missing attributes that have default values. 213 **/ 214 public final static String defaultAttributesFeature = 215 "http://www.ccil.org/~cowan/tagsoup/features/default-attributes"; 216 217 /** 218 A value of "true" indicates that the parser will 219 translate colons into underscores in names. 220 **/ 221 public final static String translateColonsFeature = 222 "http://www.ccil.org/~cowan/tagsoup/features/translate-colons"; 223 224 /** 225 A value of "true" indicates that the parser will 226 attempt to restart the restartable elements. 227 **/ 228 public final static String restartElementsFeature = 229 "http://www.ccil.org/~cowan/tagsoup/features/restart-elements"; 230 231 /** 232 A value of "true" indicates that the parser will 233 transmit whitespace in element-only content via the SAX 234 ignorableWhitespace callback. Normally this is not done, 235 because HTML is an SGML application and SGML suppresses 236 such whitespace. 237 **/ 238 public final static String ignorableWhitespaceFeature = 239 "http://www.ccil.org/~cowan/tagsoup/features/ignorable-whitespace"; 240 241 /** 242 A value of "true" indicates that the parser will treat CDATA 243 elements specially. Normally true, since the input is by 244 default HTML. 245 **/ 246 public final static String CDATAElementsFeature = 247 "http://www.ccil.org/~cowan/tagsoup/features/cdata-elements"; 248 249 /** 250 Used to see some syntax events that are essential in some 251 applications: comments, CDATA delimiters, selected general 252 entity inclusions, and the start and end of the DTD (and 253 declaration of document element name). The Object must implement 254 org.xml.sax.ext.LexicalHandler. 255 **/ 256 public final static String lexicalHandlerProperty = 257 "http://xml.org/sax/properties/lexical-handler"; 258 259 /** 260 Specifies the Scanner object this Parser uses. 261 **/ 262 public final static String scannerProperty = 263 "http://www.ccil.org/~cowan/tagsoup/properties/scanner"; 264 265 /** 266 Specifies the Schema object this Parser uses. 267 **/ 268 public final static String schemaProperty = 269 "http://www.ccil.org/~cowan/tagsoup/properties/schema"; 270 271 /** 272 Specifies the AutoDetector (for encoding detection) this Parser uses. 273 **/ 274 public final static String autoDetectorProperty = 275 "http://www.ccil.org/~cowan/tagsoup/properties/auto-detector"; 276 277 // Due to sucky Java order of initialization issues, these 278 // entries are maintained separately from the initial values of 279 // the corresponding instance variables, but care must be taken 280 // to keep them in sync. 281 282 private HashMap theFeatures = new HashMap(); 283 { theFeatures.put(namespacesFeature, truthValue(DEFAULT_NAMESPACES))284 theFeatures.put(namespacesFeature, truthValue(DEFAULT_NAMESPACES)); theFeatures.put(namespacePrefixesFeature, Boolean.FALSE)285 theFeatures.put(namespacePrefixesFeature, Boolean.FALSE); theFeatures.put(externalGeneralEntitiesFeature, Boolean.FALSE)286 theFeatures.put(externalGeneralEntitiesFeature, Boolean.FALSE); theFeatures.put(externalParameterEntitiesFeature, Boolean.FALSE)287 theFeatures.put(externalParameterEntitiesFeature, Boolean.FALSE); theFeatures.put(isStandaloneFeature, Boolean.FALSE)288 theFeatures.put(isStandaloneFeature, Boolean.FALSE); theFeatures.put(lexicalHandlerParameterEntitiesFeature, Boolean.FALSE)289 theFeatures.put(lexicalHandlerParameterEntitiesFeature, 290 Boolean.FALSE); theFeatures.put(resolveDTDURIsFeature, Boolean.TRUE)291 theFeatures.put(resolveDTDURIsFeature, Boolean.TRUE); theFeatures.put(stringInterningFeature, Boolean.TRUE)292 theFeatures.put(stringInterningFeature, Boolean.TRUE); theFeatures.put(useAttributes2Feature, Boolean.FALSE)293 theFeatures.put(useAttributes2Feature, Boolean.FALSE); theFeatures.put(useLocator2Feature, Boolean.FALSE)294 theFeatures.put(useLocator2Feature, Boolean.FALSE); theFeatures.put(useEntityResolver2Feature, Boolean.FALSE)295 theFeatures.put(useEntityResolver2Feature, Boolean.FALSE); theFeatures.put(validationFeature, Boolean.FALSE)296 theFeatures.put(validationFeature, Boolean.FALSE); theFeatures.put(xmlnsURIsFeature, Boolean.FALSE)297 theFeatures.put(xmlnsURIsFeature, Boolean.FALSE); theFeatures.put(xmlnsURIsFeature, Boolean.FALSE)298 theFeatures.put(xmlnsURIsFeature, Boolean.FALSE); theFeatures.put(XML11Feature, Boolean.FALSE)299 theFeatures.put(XML11Feature, Boolean.FALSE); theFeatures.put(ignoreBogonsFeature, truthValue(DEFAULT_IGNORE_BOGONS))300 theFeatures.put(ignoreBogonsFeature, truthValue(DEFAULT_IGNORE_BOGONS)); theFeatures.put(bogonsEmptyFeature, truthValue(DEFAULT_BOGONS_EMPTY))301 theFeatures.put(bogonsEmptyFeature, truthValue(DEFAULT_BOGONS_EMPTY)); theFeatures.put(rootBogonsFeature, truthValue(DEFAULT_ROOT_BOGONS))302 theFeatures.put(rootBogonsFeature, truthValue(DEFAULT_ROOT_BOGONS)); theFeatures.put(defaultAttributesFeature, truthValue(DEFAULT_DEFAULT_ATTRIBUTES))303 theFeatures.put(defaultAttributesFeature, truthValue(DEFAULT_DEFAULT_ATTRIBUTES)); theFeatures.put(translateColonsFeature, truthValue(DEFAULT_TRANSLATE_COLONS))304 theFeatures.put(translateColonsFeature, truthValue(DEFAULT_TRANSLATE_COLONS)); theFeatures.put(restartElementsFeature, truthValue(DEFAULT_RESTART_ELEMENTS))305 theFeatures.put(restartElementsFeature, truthValue(DEFAULT_RESTART_ELEMENTS)); theFeatures.put(ignorableWhitespaceFeature, truthValue(DEFAULT_IGNORABLE_WHITESPACE))306 theFeatures.put(ignorableWhitespaceFeature, truthValue(DEFAULT_IGNORABLE_WHITESPACE)); theFeatures.put(CDATAElementsFeature, truthValue(DEFAULT_CDATA_ELEMENTS))307 theFeatures.put(CDATAElementsFeature, truthValue(DEFAULT_CDATA_ELEMENTS)); 308 } 309 310 // Private clone of Boolean.valueOf that is guaranteed to return 311 // Boolean.TRUE or Boolean.FALSE truthValue(boolean b)312 private static Boolean truthValue(boolean b) { 313 return b ? Boolean.TRUE : Boolean.FALSE; 314 } 315 316 getFeature(String name)317 public boolean getFeature (String name) 318 throws SAXNotRecognizedException, SAXNotSupportedException { 319 Boolean b = (Boolean)theFeatures.get(name); 320 if (b == null) { 321 throw new SAXNotRecognizedException("Unknown feature " + name); 322 } 323 return b.booleanValue(); 324 } 325 setFeature(String name, boolean value)326 public void setFeature (String name, boolean value) 327 throws SAXNotRecognizedException, SAXNotSupportedException { 328 Boolean b = (Boolean)theFeatures.get(name); 329 if (b == null) { 330 throw new SAXNotRecognizedException("Unknown feature " + name); 331 } 332 if (value) theFeatures.put(name, Boolean.TRUE); 333 else theFeatures.put(name, Boolean.FALSE); 334 335 if (name.equals(namespacesFeature)) namespaces = value; 336 else if (name.equals(ignoreBogonsFeature)) ignoreBogons = value; 337 else if (name.equals(bogonsEmptyFeature)) bogonsEmpty = value; 338 else if (name.equals(rootBogonsFeature)) rootBogons = value; 339 else if (name.equals(defaultAttributesFeature)) defaultAttributes = value; 340 else if (name.equals(translateColonsFeature)) translateColons = value; 341 else if (name.equals(restartElementsFeature)) restartElements = value; 342 else if (name.equals(ignorableWhitespaceFeature)) ignorableWhitespace = value; 343 else if (name.equals(CDATAElementsFeature)) CDATAElements = value; 344 } 345 getProperty(String name)346 public Object getProperty (String name) 347 throws SAXNotRecognizedException, SAXNotSupportedException { 348 if (name.equals(lexicalHandlerProperty)) { 349 return theLexicalHandler == this ? null : theLexicalHandler; 350 } 351 else if (name.equals(scannerProperty)) { 352 return theScanner; 353 } 354 else if (name.equals(schemaProperty)) { 355 return theSchema; 356 } 357 else if (name.equals(autoDetectorProperty)) { 358 return theAutoDetector; 359 } 360 else { 361 throw new SAXNotRecognizedException("Unknown property " + name); 362 } 363 } 364 setProperty(String name, Object value)365 public void setProperty (String name, Object value) 366 throws SAXNotRecognizedException, SAXNotSupportedException { 367 if (name.equals(lexicalHandlerProperty)) { 368 if (value == null) { 369 theLexicalHandler = this; 370 } 371 else if (value instanceof LexicalHandler) { 372 theLexicalHandler = (LexicalHandler)value; 373 } 374 else { 375 throw new SAXNotSupportedException("Your lexical handler is not a LexicalHandler"); 376 } 377 } 378 else if (name.equals(scannerProperty)) { 379 if (value instanceof Scanner) { 380 theScanner = (Scanner)value; 381 } 382 else { 383 throw new SAXNotSupportedException("Your scanner is not a Scanner"); 384 } 385 } 386 else if (name.equals(schemaProperty)) { 387 if (value instanceof Schema) { 388 theSchema = (Schema)value; 389 } 390 else { 391 throw new SAXNotSupportedException("Your schema is not a Schema"); 392 } 393 } 394 else if (name.equals(autoDetectorProperty)) { 395 if (value instanceof AutoDetector) { 396 theAutoDetector = (AutoDetector)value; 397 } 398 else { 399 throw new SAXNotSupportedException("Your auto-detector is not an AutoDetector"); 400 } 401 } 402 else { 403 throw new SAXNotRecognizedException("Unknown property " + name); 404 } 405 } 406 setEntityResolver(EntityResolver resolver)407 public void setEntityResolver (EntityResolver resolver) { 408 theEntityResolver = (resolver == null) ? this : resolver; 409 } 410 getEntityResolver()411 public EntityResolver getEntityResolver () { 412 return (theEntityResolver == this) ? null : theEntityResolver; 413 } 414 setDTDHandler(DTDHandler handler)415 public void setDTDHandler (DTDHandler handler) { 416 theDTDHandler = (handler == null) ? this : handler; 417 } 418 getDTDHandler()419 public DTDHandler getDTDHandler () { 420 return (theDTDHandler == this) ? null : theDTDHandler; 421 } 422 setContentHandler(ContentHandler handler)423 public void setContentHandler (ContentHandler handler) { 424 theContentHandler = (handler == null) ? this : handler; 425 } 426 getContentHandler()427 public ContentHandler getContentHandler () { 428 return (theContentHandler == this) ? null : theContentHandler; 429 } 430 setErrorHandler(ErrorHandler handler)431 public void setErrorHandler (ErrorHandler handler) { 432 theErrorHandler = (handler == null) ? this : handler; 433 } 434 getErrorHandler()435 public ErrorHandler getErrorHandler () { 436 return (theErrorHandler == this) ? null : theErrorHandler; 437 } 438 parse(InputSource input)439 public void parse (InputSource input) throws IOException, SAXException { 440 setup(); 441 Reader r = getReader(input); 442 theContentHandler.startDocument(); 443 theScanner.resetDocumentLocator(input.getPublicId(), input.getSystemId()); 444 if (theScanner instanceof Locator) { 445 theContentHandler.setDocumentLocator((Locator)theScanner); 446 } 447 if (!(theSchema.getURI().equals(""))) 448 theContentHandler.startPrefixMapping(theSchema.getPrefix(), 449 theSchema.getURI()); 450 theScanner.scan(r, this); 451 } 452 parse(String systemid)453 public void parse (String systemid) throws IOException, SAXException { 454 parse(new InputSource(systemid)); 455 } 456 457 // Sets up instance variables that haven't been set by setFeature setup()458 private void setup() { 459 if (theSchema == null) theSchema = new HTMLSchema(); 460 if (theScanner == null) theScanner = new HTMLScanner(); 461 if (theAutoDetector == null) { 462 theAutoDetector = new AutoDetector() { 463 public Reader autoDetectingReader(InputStream i) { 464 return new InputStreamReader(i); 465 } 466 }; 467 } 468 theStack = new Element(theSchema.getElementType("<root>"), defaultAttributes); 469 thePCDATA = new Element(theSchema.getElementType("<pcdata>"), defaultAttributes); 470 theNewElement = null; 471 theAttributeName = null; 472 thePITarget = null; 473 theSaved = null; 474 theEntity = 0; 475 virginStack = true; 476 theDoctypeName = theDoctypePublicId = theDoctypeSystemId = null; 477 } 478 479 // Return a Reader based on the contents of an InputSource 480 // Buffer both the InputStream and the Reader getReader(InputSource s)481 private Reader getReader(InputSource s) throws SAXException, IOException { 482 Reader r = s.getCharacterStream(); 483 InputStream i = s.getByteStream(); 484 String encoding = s.getEncoding(); 485 String publicid = s.getPublicId(); 486 String systemid = s.getSystemId(); 487 if (r == null) { 488 if (i == null) i = getInputStream(publicid, systemid); 489 // i = new BufferedInputStream(i); 490 if (encoding == null) { 491 r = theAutoDetector.autoDetectingReader(i); 492 } 493 else { 494 try { 495 r = new InputStreamReader(i, encoding); 496 } 497 catch (UnsupportedEncodingException e) { 498 r = new InputStreamReader(i); 499 } 500 } 501 } 502 // r = new BufferedReader(r); 503 return r; 504 } 505 506 // Get an InputStream based on a publicid and a systemid getInputStream(String publicid, String systemid)507 private InputStream getInputStream(String publicid, String systemid) throws IOException, SAXException { 508 URL basis = new URL("file", "", System.getProperty("user.dir") + "/."); 509 URL url = new URL(basis, systemid); 510 URLConnection c = url.openConnection(); 511 return c.getInputStream(); 512 } 513 // We don't process publicids (who uses them anyhow?) 514 515 // ScanHandler implementation 516 517 private Element theNewElement = null; 518 private String theAttributeName = null; 519 private boolean theDoctypeIsPresent = false; 520 private String theDoctypePublicId = null; 521 private String theDoctypeSystemId = null; 522 private String theDoctypeName = null; 523 private String thePITarget = null; 524 private Element theStack = null; 525 private Element theSaved = null; 526 private Element thePCDATA = null; 527 private int theEntity = 0; // needs to support chars past U+FFFF 528 adup(char[] buff, int offset, int length)529 public void adup(char[] buff, int offset, int length) throws SAXException { 530 if (theNewElement == null || theAttributeName == null) return; 531 theNewElement.setAttribute(theAttributeName, null, theAttributeName); 532 theAttributeName = null; 533 } 534 aname(char[] buff, int offset, int length)535 public void aname(char[] buff, int offset, int length) throws SAXException { 536 if (theNewElement == null) return; 537 // Currently we don't rely on Schema to canonicalize 538 // attribute names. 539 theAttributeName = makeName(buff, offset, length).toLowerCase(Locale.ROOT); 540 // System.err.println("%% Attribute name " + theAttributeName); 541 } 542 aval(char[] buff, int offset, int length)543 public void aval(char[] buff, int offset, int length) throws SAXException { 544 if (theNewElement == null || theAttributeName == null) return; 545 String value = new String(buff, offset, length); 546 // System.err.println("%% Attribute value [" + value + "]"); 547 value = expandEntities(value); 548 theNewElement.setAttribute(theAttributeName, null, value); 549 theAttributeName = null; 550 // System.err.println("%% Aval done"); 551 } 552 553 // Expand entity references in attribute values selectively. 554 // Currently we expand a reference iff it is properly terminated 555 // with a semicolon. expandEntities(String src)556 private String expandEntities(String src) { 557 int refStart = -1; 558 int len = src.length(); 559 char[] dst = new char[len]; 560 int dstlen = 0; 561 for (int i = 0; i < len; i++) { 562 char ch = src.charAt(i); 563 dst[dstlen++] = ch; 564 // System.err.print("i = " + i + ", d = " + dstlen + ", ch = [" + ch + "] "); 565 if (ch == '&' && refStart == -1) { 566 // start of a ref excluding & 567 refStart = dstlen; 568 // System.err.println("start of ref"); 569 } 570 else if (refStart == -1) { 571 // not in a ref 572 // System.err.println("not in ref"); 573 } 574 else if (Character.isLetter(ch) || 575 Character.isDigit(ch) || 576 ch == '#') { 577 // valid entity char 578 // System.err.println("valid"); 579 } 580 else if (ch == ';') { 581 // properly terminated ref 582 // System.err.print("got [" + new String(dst, refStart, dstlen-refStart-1) + "]"); 583 int ent = lookupEntity(dst, refStart, dstlen - refStart - 1); 584 // System.err.println(" = " + ent); 585 if (ent > 0xFFFF) { 586 ent -= 0x10000; 587 dst[refStart - 1] = (char)((ent>>10) + 0xD800); 588 dst[refStart] = (char)((ent&0x3FF) + 0xDC00); 589 dstlen = refStart + 1; 590 } 591 else if (ent != 0) { 592 dst[refStart - 1] = (char)ent; 593 dstlen = refStart; 594 } 595 refStart = -1; 596 } 597 else { 598 // improperly terminated ref 599 // System.err.println("end of ref"); 600 refStart = -1; 601 } 602 } 603 return new String(dst, 0, dstlen); 604 } 605 entity(char[] buff, int offset, int length)606 public void entity(char[] buff, int offset, int length) throws SAXException { 607 theEntity = lookupEntity(buff, offset, length); 608 } 609 610 // Process numeric character references, 611 // deferring to the schema for named ones. lookupEntity(char[] buff, int offset, int length)612 private int lookupEntity(char[] buff, int offset, int length) { 613 int result = 0; 614 if (length < 1) return result; 615 // System.err.println("%% Entity at " + offset + " " + length); 616 // System.err.println("%% Got entity [" + new String(buff, offset, length) + "]"); 617 if (buff[offset] == '#') { 618 if (length > 1 && (buff[offset+1] == 'x' 619 || buff[offset+1] == 'X')) { 620 try { 621 return Integer.parseInt(new String(buff, offset + 2, length - 2), 16); 622 } 623 catch (NumberFormatException e) { return 0; } 624 } 625 try { 626 return Integer.parseInt(new String(buff, offset + 1, length - 1), 10); 627 } 628 catch (NumberFormatException e) { return 0; } 629 } 630 return theSchema.getEntity(new String(buff, offset, length)); 631 } 632 eof(char[] buff, int offset, int length)633 public void eof(char[] buff, int offset, int length) throws SAXException { 634 if (virginStack) rectify(thePCDATA); 635 while (theStack.next() != null) { 636 pop(); 637 } 638 if (!(theSchema.getURI().equals(""))) 639 theContentHandler.endPrefixMapping(theSchema.getPrefix()); 640 theContentHandler.endDocument(); 641 } 642 etag(char[] buff, int offset, int length)643 public void etag(char[] buff, int offset, int length) throws SAXException { 644 if (etag_cdata(buff, offset, length)) return; 645 etag_basic(buff, offset, length); 646 } 647 648 private static char[] etagchars = {'<', '/', '>'}; etag_cdata(char[] buff, int offset, int length)649 public boolean etag_cdata(char[] buff, int offset, int length) throws SAXException { 650 String currentName = theStack.name(); 651 // If this is a CDATA element and the tag doesn't match, 652 // or isn't properly formed (junk after the name), 653 // restart CDATA mode and process the tag as characters. 654 if (CDATAElements && (theStack.flags() & Schema.F_CDATA) != 0) { 655 boolean realTag = (length == currentName.length()); 656 if (realTag) { 657 for (int i = 0; i < length; i++) { 658 if (Character.toLowerCase(buff[offset + i]) != Character.toLowerCase(currentName.charAt(i))) { 659 realTag = false; 660 break; 661 } 662 } 663 } 664 if (!realTag) { 665 theContentHandler.characters(etagchars, 0, 2); 666 theContentHandler.characters(buff, offset, length); 667 theContentHandler.characters(etagchars, 2, 1); 668 theScanner.startCDATA(); 669 return true; 670 } 671 } 672 return false; 673 } 674 etag_basic(char[] buff, int offset, int length)675 public void etag_basic(char[] buff, int offset, int length) throws SAXException { 676 theNewElement = null; 677 String name; 678 if (length != 0) { 679 // Canonicalize case of name 680 name = makeName(buff, offset, length); 681 // System.err.println("got etag [" + name + "]"); 682 ElementType type = theSchema.getElementType(name); 683 if (type == null) return; // mysterious end-tag 684 name = type.name(); 685 } 686 else { 687 name = theStack.name(); 688 } 689 // System.err.println("%% Got end of " + name); 690 691 Element sp; 692 boolean inNoforce = false; 693 for (sp = theStack; sp != null; sp = sp.next()) { 694 if (sp.name().equals(name)) break; 695 if ((sp.flags() & Schema.F_NOFORCE) != 0) inNoforce = true; 696 } 697 698 if (sp == null) return; // Ignore unknown etags 699 if (sp.next() == null || sp.next().next() == null) return; 700 if (inNoforce) { // inside an F_NOFORCE element? 701 sp.preclose(); // preclose the matching element 702 } 703 else { // restartably pop everything above us 704 while (theStack != sp) { 705 restartablyPop(); 706 } 707 pop(); 708 } 709 // pop any preclosed elements now at the top 710 while (theStack.isPreclosed()) { 711 pop(); 712 } 713 restart(null); 714 } 715 716 // Push restartables on the stack if possible 717 // e is the next element to be started, if we know what it is restart(Element e)718 private void restart(Element e) throws SAXException { 719 while (theSaved != null && theStack.canContain(theSaved) && 720 (e == null || theSaved.canContain(e))) { 721 Element next = theSaved.next(); 722 push(theSaved); 723 theSaved = next; 724 } 725 } 726 727 // Pop the stack irrevocably pop()728 private void pop() throws SAXException { 729 if (theStack == null) return; // empty stack 730 String name = theStack.name(); 731 String localName = theStack.localName(); 732 String namespace = theStack.namespace(); 733 String prefix = prefixOf(name); 734 735 // System.err.println("%% Popping " + name); 736 if (!namespaces) namespace = localName = ""; 737 theContentHandler.endElement(namespace, localName, name); 738 if (foreign(prefix, namespace)) { 739 theContentHandler.endPrefixMapping(prefix); 740 // System.err.println("%% Unmapping [" + prefix + "] for elements to " + namespace); 741 } 742 Attributes atts = theStack.atts(); 743 for (int i = atts.getLength() - 1; i >= 0; i--) { 744 String attNamespace = atts.getURI(i); 745 String attPrefix = prefixOf(atts.getQName(i)); 746 if (foreign(attPrefix, attNamespace)) { 747 theContentHandler.endPrefixMapping(attPrefix); 748 // System.err.println("%% Unmapping [" + attPrefix + "] for attributes to " + attNamespace); 749 } 750 } 751 theStack = theStack.next(); 752 } 753 754 // Pop the stack restartably restartablyPop()755 private void restartablyPop() throws SAXException { 756 Element popped = theStack; 757 pop(); 758 if (restartElements && (popped.flags() & Schema.F_RESTART) != 0) { 759 popped.anonymize(); 760 popped.setNext(theSaved); 761 theSaved = popped; 762 } 763 } 764 765 // Push element onto stack 766 private boolean virginStack = true; push(Element e)767 private void push(Element e) throws SAXException { 768 String name = e.name(); 769 String localName = e.localName(); 770 String namespace = e.namespace(); 771 String prefix = prefixOf(name); 772 773 // System.err.println("%% Pushing " + name); 774 e.clean(); 775 if (!namespaces) namespace = localName = ""; 776 if (virginStack && localName.equalsIgnoreCase(theDoctypeName)) { 777 try { 778 theEntityResolver.resolveEntity(theDoctypePublicId, theDoctypeSystemId); 779 } catch (IOException ew) { } // Can't be thrown for root I believe. 780 } 781 if (foreign(prefix, namespace)) { 782 theContentHandler.startPrefixMapping(prefix, namespace); 783 // System.err.println("%% Mapping [" + prefix + "] for elements to " + namespace); 784 } 785 Attributes atts = e.atts(); 786 int len = atts.getLength(); 787 for (int i = 0; i < len; i++) { 788 String attNamespace = atts.getURI(i); 789 String attPrefix = prefixOf(atts.getQName(i)); 790 if (foreign(attPrefix, attNamespace)) { 791 theContentHandler.startPrefixMapping(attPrefix, attNamespace); 792 // System.err.println("%% Mapping [" + attPrefix + "] for attributes to " + attNamespace); 793 } 794 } 795 theContentHandler.startElement(namespace, localName, name, e.atts()); 796 e.setNext(theStack); 797 theStack = e; 798 virginStack = false; 799 if (CDATAElements && (theStack.flags() & Schema.F_CDATA) != 0) { 800 theScanner.startCDATA(); 801 } 802 } 803 804 // Get the prefix from a QName prefixOf(String name)805 private String prefixOf(String name) { 806 int i = name.indexOf(':'); 807 String prefix = ""; 808 if (i != -1) prefix = name.substring(0, i); 809 // System.err.println("%% " + prefix + " is prefix of " + name); 810 return prefix; 811 } 812 813 // Return true if we have a foreign name foreign(String prefix, String namespace)814 private boolean foreign(String prefix, String namespace) { 815 // System.err.print("%% Testing " + prefix + " and " + namespace + " for foreignness -- "); 816 boolean foreign = !(prefix.equals("") || namespace.equals("") || 817 namespace.equals(theSchema.getURI())); 818 // System.err.println(foreign); 819 return foreign; 820 } 821 822 /** 823 * Parsing the complete XML Document Type Definition is way too complex, 824 * but for many simple cases we can extract something useful from it. 825 * 826 * doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>' 827 * DeclSep ::= PEReference | S 828 * intSubset ::= (markupdecl | DeclSep)* 829 * markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment 830 * ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral 831 */ decl(char[] buff, int offset, int length)832 public void decl(char[] buff, int offset, int length) throws SAXException { 833 String s = new String(buff, offset, length); 834 String name = null; 835 String systemid = null; 836 String publicid = null; 837 String[] v = split(s); 838 if (v.length > 0 && "DOCTYPE".equalsIgnoreCase(v[0])) { 839 if (theDoctypeIsPresent) return; // one doctype only! 840 theDoctypeIsPresent = true; 841 if (v.length > 1) { 842 name = v[1]; 843 if (v.length>3 && "SYSTEM".equals(v[2])) { 844 systemid = v[3]; 845 } 846 else if (v.length > 3 && "PUBLIC".equals(v[2])) { 847 publicid = v[3]; 848 if (v.length > 4) { 849 systemid = v[4]; 850 } 851 else { 852 systemid = ""; 853 } 854 } 855 } 856 } 857 publicid = trimquotes(publicid); 858 systemid = trimquotes(systemid); 859 if (name != null) { 860 publicid = cleanPublicid(publicid); 861 theLexicalHandler.startDTD(name, publicid, systemid); 862 theLexicalHandler.endDTD(); 863 theDoctypeName = name; 864 theDoctypePublicId = publicid; 865 if (theScanner instanceof Locator) { // Must resolve systemid 866 theDoctypeSystemId = ((Locator)theScanner).getSystemId(); 867 try { 868 theDoctypeSystemId = new URL(new URL(theDoctypeSystemId), systemid).toString(); 869 } catch (Exception e) {} 870 } 871 } 872 } 873 874 // If the String is quoted, trim the quotes. trimquotes(String in)875 private static String trimquotes(String in) { 876 if (in == null) return in; 877 int length = in.length(); 878 if (length == 0) return in; 879 char s = in.charAt(0); 880 char e = in.charAt(length - 1); 881 if (s == e && (s == '\'' || s == '"')) { 882 in = in.substring(1, in.length() - 1); 883 } 884 return in; 885 } 886 887 // Split the supplied String into words or phrases seperated by spaces. 888 // Recognises quotes around a phrase and doesn't split it. split(String val)889 private static String[] split(String val) throws IllegalArgumentException { 890 val = val.trim(); 891 if (val.length() == 0) { 892 return new String[0]; 893 } 894 else { 895 ArrayList l = new ArrayList(); 896 int s = 0; 897 int e = 0; 898 boolean sq = false; // single quote 899 boolean dq = false; // double quote 900 char lastc = 0; 901 int len = val.length(); 902 for (e=0; e < len; e++) { 903 char c = val.charAt(e); 904 if (!dq && c == '\'' && lastc != '\\') { 905 sq = !sq; 906 if (s < 0) s = e; 907 } 908 else if (!sq && c == '\"' && lastc != '\\') { 909 dq = !dq; 910 if (s < 0) s = e; 911 } 912 else if (!sq && !dq) { 913 if (Character.isWhitespace(c)) { 914 if (s >= 0) l.add(val.substring(s, e)); 915 s = -1; 916 } 917 else if (s < 0 && c != ' ') { 918 s = e; 919 } 920 } 921 lastc = c; 922 } 923 l.add(val.substring(s, e)); 924 return (String[])l.toArray(new String[0]); 925 } 926 } 927 928 // Replace junk in publicids with spaces 929 private static String legal = 930 "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-'()+,./:=?;!*#@$_%"; 931 cleanPublicid(String src)932 private String cleanPublicid(String src) { 933 if (src == null) return null; 934 int len = src.length(); 935 StringBuffer dst = new StringBuffer(len); 936 boolean suppressSpace = true; 937 for (int i = 0; i < len; i++) { 938 char ch = src.charAt(i); 939 if (legal.indexOf(ch) != -1) { // legal but not whitespace 940 dst.append(ch); 941 suppressSpace = false; 942 } 943 else if (suppressSpace) { // normalizable whitespace or junk 944 ; 945 } 946 else { 947 dst.append(' '); 948 suppressSpace = true; 949 } 950 } 951 // System.err.println("%% Publicid [" + dst.toString().trim() + "]"); 952 return dst.toString().trim(); // trim any final junk whitespace 953 } 954 955 gi(char[] buff, int offset, int length)956 public void gi(char[] buff, int offset, int length) throws SAXException { 957 if (theNewElement != null) return; 958 String name = makeName(buff, offset, length); 959 if (name == null) return; 960 ElementType type = theSchema.getElementType(name); 961 if (type == null) { 962 // Suppress unknown elements if ignore-bogons is on 963 if (ignoreBogons) return; 964 int bogonModel = bogonsEmpty ? Schema.M_EMPTY : Schema.M_ANY; 965 int bogonMemberOf = rootBogons ? Schema.M_ANY : (Schema.M_ANY & ~ Schema.M_ROOT); 966 theSchema.elementType(name, bogonModel, bogonMemberOf, 0); 967 if (!rootBogons) theSchema.parent(name, theSchema.rootElementType().name()); 968 type = theSchema.getElementType(name); 969 } 970 971 theNewElement = new Element(type, defaultAttributes); 972 // System.err.println("%% Got GI " + theNewElement.name()); 973 } 974 cdsect(char[] buff, int offset, int length)975 public void cdsect(char[] buff, int offset, int length) throws SAXException { 976 theLexicalHandler.startCDATA(); 977 pcdata(buff, offset, length); 978 theLexicalHandler.endCDATA(); 979 } pcdata(char[] buff, int offset, int length)980 public void pcdata(char[] buff, int offset, int length) throws SAXException { 981 if (length == 0) return; 982 boolean allWhite = true; 983 for (int i = 0; i < length; i++) { 984 if (!Character.isWhitespace(buff[offset+i])) { 985 allWhite = false; 986 } 987 } 988 if (allWhite && !theStack.canContain(thePCDATA)) { 989 if (ignorableWhitespace) { 990 theContentHandler.ignorableWhitespace(buff, offset, length); 991 } 992 } 993 else { 994 rectify(thePCDATA); 995 theContentHandler.characters(buff, offset, length); 996 } 997 } 998 pitarget(char[] buff, int offset, int length)999 public void pitarget(char[] buff, int offset, int length) throws SAXException { 1000 if (theNewElement != null) return; 1001 thePITarget = makeName(buff, offset, length).replace(':', '_'); 1002 } 1003 pi(char[] buff, int offset, int length)1004 public void pi(char[] buff, int offset, int length) throws SAXException { 1005 if (theNewElement != null || thePITarget == null) return; 1006 if ("xml".equalsIgnoreCase(thePITarget)) return; 1007 // if (length > 0 && buff[length - 1] == '?') System.err.println("%% Removing ? from PI"); 1008 if (length > 0 && buff[length - 1] == '?') length--; // remove trailing ? 1009 theContentHandler.processingInstruction(thePITarget, 1010 new String(buff, offset, length)); 1011 thePITarget = null; 1012 } 1013 stagc(char[] buff, int offset, int length)1014 public void stagc(char[] buff, int offset, int length) throws SAXException { 1015 // System.err.println("%% Start-tag"); 1016 if (theNewElement == null) return; 1017 rectify(theNewElement); 1018 if (theStack.model() == Schema.M_EMPTY) { 1019 // Force an immediate end tag 1020 etag_basic(buff, offset, length); 1021 } 1022 } 1023 stage(char[] buff, int offset, int length)1024 public void stage(char[] buff, int offset, int length) throws SAXException { 1025 // System.err.println("%% Empty-tag"); 1026 if (theNewElement == null) return; 1027 rectify(theNewElement); 1028 // Force an immediate end tag 1029 etag_basic(buff, offset, length); 1030 } 1031 1032 // Comment buffer is twice the size of the output buffer 1033 private char[] theCommentBuffer = new char[2000]; cmnt(char[] buff, int offset, int length)1034 public void cmnt(char[] buff, int offset, int length) throws SAXException { 1035 theLexicalHandler.comment(buff, offset, length); 1036 } 1037 1038 // Rectify the stack, pushing and popping as needed 1039 // so that the argument can be safely pushed rectify(Element e)1040 private void rectify(Element e) throws SAXException { 1041 Element sp; 1042 while (true) { 1043 for (sp = theStack; sp != null; sp = sp.next()) { 1044 if (sp.canContain(e)) break; 1045 } 1046 if (sp != null) break; 1047 ElementType parentType = e.parent(); 1048 if (parentType == null) break; 1049 Element parent = new Element(parentType, defaultAttributes); 1050 // System.err.println("%% Ascending from " + e.name() + " to " + parent.name()); 1051 parent.setNext(e); 1052 e = parent; 1053 } 1054 if (sp == null) return; // don't know what to do 1055 while (theStack != sp) { 1056 if (theStack == null || theStack.next() == null || 1057 theStack.next().next() == null) break; 1058 restartablyPop(); 1059 } 1060 while (e != null) { 1061 Element nexte = e.next(); 1062 if (!e.name().equals("<pcdata>")) push(e); 1063 e = nexte; 1064 restart(e); 1065 } 1066 theNewElement = null; 1067 } 1068 getEntity()1069 public int getEntity() { 1070 return theEntity; 1071 } 1072 1073 // Return the argument as a valid XML name 1074 // This no longer lowercases the result: we depend on Schema to 1075 // canonicalize case. makeName(char[] buff, int offset, int length)1076 private String makeName(char[] buff, int offset, int length) { 1077 StringBuffer dst = new StringBuffer(length + 2); 1078 boolean seenColon = false; 1079 boolean start = true; 1080 // String src = new String(buff, offset, length); // DEBUG 1081 for (; length-- > 0; offset++) { 1082 char ch = buff[offset]; 1083 if (Character.isLetter(ch) || ch == '_') { 1084 start = false; 1085 dst.append(ch); 1086 } 1087 else if (Character.isDigit(ch) || ch == '-' || ch == '.') { 1088 if (start) dst.append('_'); 1089 start = false; 1090 dst.append(ch); 1091 } 1092 else if (ch == ':' && !seenColon) { 1093 seenColon = true; 1094 if (start) dst.append('_'); 1095 start = true; 1096 dst.append(translateColons ? '_' : ch); 1097 } 1098 } 1099 int dstLength = dst.length(); 1100 if (dstLength == 0 || dst.charAt(dstLength - 1) == ':') dst.append('_'); 1101 // System.err.println("Made name \"" + dst + "\" from \"" + src + "\""); 1102 return dst.toString().intern(); 1103 } 1104 1105 // Default LexicalHandler implementation 1106 comment(char[] ch, int start, int length)1107 public void comment(char[] ch, int start, int length) throws SAXException { } endCDATA()1108 public void endCDATA() throws SAXException { } endDTD()1109 public void endDTD() throws SAXException { } endEntity(String name)1110 public void endEntity(String name) throws SAXException { } startCDATA()1111 public void startCDATA() throws SAXException { } startDTD(String name, String publicid, String systemid)1112 public void startDTD(String name, String publicid, String systemid) throws SAXException { } startEntity(String name)1113 public void startEntity(String name) throws SAXException { } 1114 1115 } 1116