1 // This file is part of TagSoup and is Copyright 2002-2008 by John Cowan. 2 // 3 // TagSoup is licensed under the Apache License, 4 // Version 2.0. You may obtain a copy of this license at 5 // http://www.apache.org/licenses/LICENSE-2.0 . You may also have 6 // additional legal rights not granted by this license. 7 // 8 // TagSoup is distributed in the hope that it will be useful, but 9 // unless required by applicable law or agreed to in writing, TagSoup 10 // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 11 // OF ANY KIND, either express or implied; not even the implied warranty 12 // of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 13 // 14 // 15 package org.ccil.cowan.tagsoup; 16 import java.io.*; 17 import org.xml.sax.SAXException; 18 import org.xml.sax.Locator; 19 20 /** 21 This class implements a table-driven scanner for HTML, allowing for lots of 22 defects. It implements the Scanner interface, which accepts a Reader 23 object to fetch characters from and a ScanHandler object to report lexical 24 events to. 25 */ 26 27 public class HTMLScanner implements Scanner, Locator { 28 29 // Start of state table 30 private static final int S_ANAME = 1; 31 private static final int S_APOS = 2; 32 private static final int S_AVAL = 3; 33 private static final int S_BB = 4; 34 private static final int S_BBC = 5; 35 private static final int S_BBCD = 6; 36 private static final int S_BBCDA = 7; 37 private static final int S_BBCDAT = 8; 38 private static final int S_BBCDATA = 9; 39 private static final int S_CDATA = 10; 40 private static final int S_CDATA2 = 11; 41 private static final int S_CDSECT = 12; 42 private static final int S_CDSECT1 = 13; 43 private static final int S_CDSECT2 = 14; 44 private static final int S_COM = 15; 45 private static final int S_COM2 = 16; 46 private static final int S_COM3 = 17; 47 private static final int S_COM4 = 18; 48 private static final int S_DECL = 19; 49 private static final int S_DECL2 = 20; 50 private static final int S_DONE = 21; 51 private static final int S_EMPTYTAG = 22; 52 private static final int S_ENT = 23; 53 private static final int S_EQ = 24; 54 private static final int S_ETAG = 25; 55 private static final int S_GI = 26; 56 private static final int S_NCR = 27; 57 private static final int S_PCDATA = 28; 58 private static final int S_PI = 29; 59 private static final int S_PITARGET = 30; 60 private static final int S_QUOT = 31; 61 private static final int S_STAGC = 32; 62 private static final int S_TAG = 33; 63 private static final int S_TAGWS = 34; 64 private static final int S_XNCR = 35; 65 private static final int A_ADUP = 1; 66 private static final int A_ADUP_SAVE = 2; 67 private static final int A_ADUP_STAGC = 3; 68 private static final int A_ANAME = 4; 69 private static final int A_ANAME_ADUP = 5; 70 private static final int A_ANAME_ADUP_STAGC = 6; 71 private static final int A_AVAL = 7; 72 private static final int A_AVAL_STAGC = 8; 73 private static final int A_CDATA = 9; 74 private static final int A_CMNT = 10; 75 private static final int A_DECL = 11; 76 private static final int A_EMPTYTAG = 12; 77 private static final int A_ENTITY = 13; 78 private static final int A_ENTITY_START = 14; 79 private static final int A_ETAG = 15; 80 private static final int A_GI = 16; 81 private static final int A_GI_STAGC = 17; 82 private static final int A_LT = 18; 83 private static final int A_LT_PCDATA = 19; 84 private static final int A_MINUS = 20; 85 private static final int A_MINUS2 = 21; 86 private static final int A_MINUS3 = 22; 87 private static final int A_PCDATA = 23; 88 private static final int A_PI = 24; 89 private static final int A_PITARGET = 25; 90 private static final int A_PITARGET_PI = 26; 91 private static final int A_SAVE = 27; 92 private static final int A_SKIP = 28; 93 private static final int A_SP = 29; 94 private static final int A_STAGC = 30; 95 private static final int A_UNGET = 31; 96 private static final int A_UNSAVE_PCDATA = 32; 97 private static int[] statetable = { 98 S_ANAME, '/', A_ANAME_ADUP, S_EMPTYTAG, 99 S_ANAME, '=', A_ANAME, S_AVAL, 100 S_ANAME, '>', A_ANAME_ADUP_STAGC, S_PCDATA, 101 S_ANAME, 0, A_SAVE, S_ANAME, 102 S_ANAME, -1, A_ANAME_ADUP_STAGC, S_DONE, 103 S_ANAME, ' ', A_ANAME, S_EQ, 104 S_ANAME, '\n', A_ANAME, S_EQ, 105 S_ANAME, '\t', A_ANAME, S_EQ, 106 S_APOS, '\'', A_AVAL, S_TAGWS, 107 S_APOS, 0, A_SAVE, S_APOS, 108 S_APOS, -1, A_AVAL_STAGC, S_DONE, 109 S_APOS, ' ', A_SP, S_APOS, 110 S_APOS, '\n', A_SP, S_APOS, 111 S_APOS, '\t', A_SP, S_APOS, 112 S_AVAL, '\'', A_SKIP, S_APOS, 113 S_AVAL, '"', A_SKIP, S_QUOT, 114 S_AVAL, '>', A_AVAL_STAGC, S_PCDATA, 115 S_AVAL, 0, A_SAVE, S_STAGC, 116 S_AVAL, -1, A_AVAL_STAGC, S_DONE, 117 S_AVAL, ' ', A_SKIP, S_AVAL, 118 S_AVAL, '\n', A_SKIP, S_AVAL, 119 S_AVAL, '\t', A_SKIP, S_AVAL, 120 S_BB, 'C', A_SKIP, S_BBC, 121 S_BB, 0, A_SKIP, S_DECL, 122 S_BB, -1, A_SKIP, S_DONE, 123 S_BBC, 'D', A_SKIP, S_BBCD, 124 S_BBC, 0, A_SKIP, S_DECL, 125 S_BBC, -1, A_SKIP, S_DONE, 126 S_BBCD, 'A', A_SKIP, S_BBCDA, 127 S_BBCD, 0, A_SKIP, S_DECL, 128 S_BBCD, -1, A_SKIP, S_DONE, 129 S_BBCDA, 'T', A_SKIP, S_BBCDAT, 130 S_BBCDA, 0, A_SKIP, S_DECL, 131 S_BBCDA, -1, A_SKIP, S_DONE, 132 S_BBCDAT, 'A', A_SKIP, S_BBCDATA, 133 S_BBCDAT, 0, A_SKIP, S_DECL, 134 S_BBCDAT, -1, A_SKIP, S_DONE, 135 S_BBCDATA, '[', A_SKIP, S_CDSECT, 136 S_BBCDATA, 0, A_SKIP, S_DECL, 137 S_BBCDATA, -1, A_SKIP, S_DONE, 138 S_CDATA, '<', A_SAVE, S_CDATA2, 139 S_CDATA, 0, A_SAVE, S_CDATA, 140 S_CDATA, -1, A_PCDATA, S_DONE, 141 S_CDATA2, '/', A_UNSAVE_PCDATA, S_ETAG, 142 S_CDATA2, 0, A_SAVE, S_CDATA, 143 S_CDATA2, -1, A_UNSAVE_PCDATA, S_DONE, 144 S_CDSECT, ']', A_SAVE, S_CDSECT1, 145 S_CDSECT, 0, A_SAVE, S_CDSECT, 146 S_CDSECT, -1, A_SKIP, S_DONE, 147 S_CDSECT1, ']', A_SAVE, S_CDSECT2, 148 S_CDSECT1, 0, A_SAVE, S_CDSECT, 149 S_CDSECT1, -1, A_SKIP, S_DONE, 150 S_CDSECT2, '>', A_CDATA, S_PCDATA, 151 S_CDSECT2, 0, A_SAVE, S_CDSECT, 152 S_CDSECT2, -1, A_SKIP, S_DONE, 153 S_COM, '-', A_SKIP, S_COM2, 154 S_COM, 0, A_SAVE, S_COM2, 155 S_COM, -1, A_CMNT, S_DONE, 156 S_COM2, '-', A_SKIP, S_COM3, 157 S_COM2, 0, A_SAVE, S_COM2, 158 S_COM2, -1, A_CMNT, S_DONE, 159 S_COM3, '-', A_SKIP, S_COM4, 160 S_COM3, 0, A_MINUS, S_COM2, 161 S_COM3, -1, A_CMNT, S_DONE, 162 S_COM4, '-', A_MINUS3, S_COM4, 163 S_COM4, '>', A_CMNT, S_PCDATA, 164 S_COM4, 0, A_MINUS2, S_COM2, 165 S_COM4, -1, A_CMNT, S_DONE, 166 S_DECL, '-', A_SKIP, S_COM, 167 S_DECL, '[', A_SKIP, S_BB, 168 S_DECL, '>', A_SKIP, S_PCDATA, 169 S_DECL, 0, A_SAVE, S_DECL2, 170 S_DECL, -1, A_SKIP, S_DONE, 171 S_DECL2, '>', A_DECL, S_PCDATA, 172 S_DECL2, 0, A_SAVE, S_DECL2, 173 S_DECL2, -1, A_SKIP, S_DONE, 174 S_EMPTYTAG, '>', A_EMPTYTAG, S_PCDATA, 175 S_EMPTYTAG, 0, A_SAVE, S_ANAME, 176 S_EMPTYTAG, ' ', A_SKIP, S_TAGWS, 177 S_EMPTYTAG, '\n', A_SKIP, S_TAGWS, 178 S_EMPTYTAG, '\t', A_SKIP, S_TAGWS, 179 S_ENT, 0, A_ENTITY, S_ENT, 180 S_ENT, -1, A_ENTITY, S_DONE, 181 S_EQ, '=', A_SKIP, S_AVAL, 182 S_EQ, '>', A_ADUP_STAGC, S_PCDATA, 183 S_EQ, 0, A_ADUP_SAVE, S_ANAME, 184 S_EQ, -1, A_ADUP_STAGC, S_DONE, 185 S_EQ, ' ', A_SKIP, S_EQ, 186 S_EQ, '\n', A_SKIP, S_EQ, 187 S_EQ, '\t', A_SKIP, S_EQ, 188 S_ETAG, '>', A_ETAG, S_PCDATA, 189 S_ETAG, 0, A_SAVE, S_ETAG, 190 S_ETAG, -1, A_ETAG, S_DONE, 191 S_ETAG, ' ', A_SKIP, S_ETAG, 192 S_ETAG, '\n', A_SKIP, S_ETAG, 193 S_ETAG, '\t', A_SKIP, S_ETAG, 194 S_GI, '/', A_SKIP, S_EMPTYTAG, 195 S_GI, '>', A_GI_STAGC, S_PCDATA, 196 S_GI, 0, A_SAVE, S_GI, 197 S_GI, -1, A_SKIP, S_DONE, 198 S_GI, ' ', A_GI, S_TAGWS, 199 S_GI, '\n', A_GI, S_TAGWS, 200 S_GI, '\t', A_GI, S_TAGWS, 201 S_NCR, 0, A_ENTITY, S_NCR, 202 S_NCR, -1, A_ENTITY, S_DONE, 203 S_PCDATA, '&', A_ENTITY_START, S_ENT, 204 S_PCDATA, '<', A_PCDATA, S_TAG, 205 S_PCDATA, 0, A_SAVE, S_PCDATA, 206 S_PCDATA, -1, A_PCDATA, S_DONE, 207 S_PI, '>', A_PI, S_PCDATA, 208 S_PI, 0, A_SAVE, S_PI, 209 S_PI, -1, A_PI, S_DONE, 210 S_PITARGET, '>', A_PITARGET_PI, S_PCDATA, 211 S_PITARGET, 0, A_SAVE, S_PITARGET, 212 S_PITARGET, -1, A_PITARGET_PI, S_DONE, 213 S_PITARGET, ' ', A_PITARGET, S_PI, 214 S_PITARGET, '\n', A_PITARGET, S_PI, 215 S_PITARGET, '\t', A_PITARGET, S_PI, 216 S_QUOT, '"', A_AVAL, S_TAGWS, 217 S_QUOT, 0, A_SAVE, S_QUOT, 218 S_QUOT, -1, A_AVAL_STAGC, S_DONE, 219 S_QUOT, ' ', A_SP, S_QUOT, 220 S_QUOT, '\n', A_SP, S_QUOT, 221 S_QUOT, '\t', A_SP, S_QUOT, 222 S_STAGC, '>', A_AVAL_STAGC, S_PCDATA, 223 S_STAGC, 0, A_SAVE, S_STAGC, 224 S_STAGC, -1, A_AVAL_STAGC, S_DONE, 225 S_STAGC, ' ', A_AVAL, S_TAGWS, 226 S_STAGC, '\n', A_AVAL, S_TAGWS, 227 S_STAGC, '\t', A_AVAL, S_TAGWS, 228 S_TAG, '!', A_SKIP, S_DECL, 229 S_TAG, '?', A_SKIP, S_PITARGET, 230 S_TAG, '/', A_SKIP, S_ETAG, 231 S_TAG, '<', A_SAVE, S_TAG, 232 S_TAG, 0, A_SAVE, S_GI, 233 S_TAG, -1, A_LT_PCDATA, S_DONE, 234 S_TAG, ' ', A_LT, S_PCDATA, 235 S_TAG, '\n', A_LT, S_PCDATA, 236 S_TAG, '\t', A_LT, S_PCDATA, 237 S_TAGWS, '/', A_SKIP, S_EMPTYTAG, 238 S_TAGWS, '>', A_STAGC, S_PCDATA, 239 S_TAGWS, 0, A_SAVE, S_ANAME, 240 S_TAGWS, -1, A_STAGC, S_DONE, 241 S_TAGWS, ' ', A_SKIP, S_TAGWS, 242 S_TAGWS, '\n', A_SKIP, S_TAGWS, 243 S_TAGWS, '\t', A_SKIP, S_TAGWS, 244 S_XNCR, 0, A_ENTITY, S_XNCR, 245 S_XNCR, -1, A_ENTITY, S_DONE, 246 247 }; 248 private static final String[] debug_actionnames = { "", "A_ADUP", "A_ADUP_SAVE", "A_ADUP_STAGC", "A_ANAME", "A_ANAME_ADUP", "A_ANAME_ADUP_STAGC", "A_AVAL", "A_AVAL_STAGC", "A_CDATA", "A_CMNT", "A_DECL", "A_EMPTYTAG", "A_ENTITY", "A_ENTITY_START", "A_ETAG", "A_GI", "A_GI_STAGC", "A_LT", "A_LT_PCDATA", "A_MINUS", "A_MINUS2", "A_MINUS3", "A_PCDATA", "A_PI", "A_PITARGET", "A_PITARGET_PI", "A_SAVE", "A_SKIP", "A_SP", "A_STAGC", "A_UNGET", "A_UNSAVE_PCDATA"}; 249 private static final String[] debug_statenames = { "", "S_ANAME", "S_APOS", "S_AVAL", "S_BB", "S_BBC", "S_BBCD", "S_BBCDA", "S_BBCDAT", "S_BBCDATA", "S_CDATA", "S_CDATA2", "S_CDSECT", "S_CDSECT1", "S_CDSECT2", "S_COM", "S_COM2", "S_COM3", "S_COM4", "S_DECL", "S_DECL2", "S_DONE", "S_EMPTYTAG", "S_ENT", "S_EQ", "S_ETAG", "S_GI", "S_NCR", "S_PCDATA", "S_PI", "S_PITARGET", "S_QUOT", "S_STAGC", "S_TAG", "S_TAGWS", "S_XNCR"}; 250 251 252 // End of state table 253 254 private String thePublicid; // Locator state 255 private String theSystemid; 256 private int theLastLine; 257 private int theLastColumn; 258 private int theCurrentLine; 259 private int theCurrentColumn; 260 261 int theState; // Current state 262 int theNextState; // Next state 263 char[] theOutputBuffer = new char[200]; // Output buffer 264 int theSize; // Current buffer size 265 int[] theWinMap = { // Windows chars map 266 0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 267 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, 268 0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 269 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178}; 270 271 // Compensate for bug in PushbackReader that allows 272 // pushing back EOF. unread(PushbackReader r, int c)273 private void unread(PushbackReader r, int c) throws IOException { 274 if (c != -1) r.unread(c); 275 } 276 277 // Locator implementation 278 getLineNumber()279 public int getLineNumber() { 280 return theLastLine; 281 } getColumnNumber()282 public int getColumnNumber() { 283 return theLastColumn; 284 } getPublicId()285 public String getPublicId() { 286 return thePublicid; 287 } getSystemId()288 public String getSystemId() { 289 return theSystemid; 290 } 291 292 293 // Scanner implementation 294 295 /** 296 Reset document locator, supplying systemid and publicid. 297 @param systemid System id 298 @param publicid Public id 299 */ 300 resetDocumentLocator(String publicid, String systemid)301 public void resetDocumentLocator(String publicid, String systemid) { 302 thePublicid = publicid; 303 theSystemid = systemid; 304 theLastLine = theLastColumn = theCurrentLine = theCurrentColumn = 0; 305 } 306 307 /** 308 Scan HTML source, reporting lexical events. 309 @param r0 Reader that provides characters 310 @param h ScanHandler that accepts lexical events. 311 */ 312 scan(Reader r0, ScanHandler h)313 public void scan(Reader r0, ScanHandler h) throws IOException, SAXException { 314 theState = S_PCDATA; 315 PushbackReader r; 316 if (r0 instanceof PushbackReader) { 317 r = (PushbackReader)r0; 318 } 319 else if (r0 instanceof BufferedReader) { 320 r = new PushbackReader(r0); 321 } 322 else { 323 r = new PushbackReader(new BufferedReader(r0, 200)); 324 } 325 326 int firstChar = r.read(); // Remove any leading BOM 327 if (firstChar != '\uFEFF') unread(r, firstChar); 328 329 while (theState != S_DONE) { 330 int ch = r.read(); 331 332 // Process control characters 333 if (ch >= 0x80 && ch <= 0x9F) ch = theWinMap[ch-0x80]; 334 335 if (ch == '\r') { 336 ch = r.read(); // expect LF next 337 if (ch != '\n') { 338 unread(r, ch); // nope 339 ch = '\n'; 340 } 341 } 342 343 if (ch == '\n') { 344 theCurrentLine++; 345 theCurrentColumn = 0; 346 } 347 else { 348 theCurrentColumn++; 349 } 350 351 if (!(ch >= 0x20 || ch == '\n' || ch == '\t' || ch == -1)) continue; 352 353 // Search state table 354 int action = 0; 355 for (int i = 0; i < statetable.length; i += 4) { 356 if (theState != statetable[i]) { 357 if (action != 0) break; 358 continue; 359 } 360 if (statetable[i+1] == 0) { 361 action = statetable[i+2]; 362 theNextState = statetable[i+3]; 363 } 364 else if (statetable[i+1] == ch) { 365 action = statetable[i+2]; 366 theNextState = statetable[i+3]; 367 break; 368 } 369 } 370 // System.err.println("In " + debug_statenames[theState] + " got " + nicechar(ch) + " doing " + debug_actionnames[action] + " then " + debug_statenames[theNextState]); 371 switch (action) { 372 case 0: 373 throw new Error( 374 "HTMLScanner can't cope with " + Integer.toString(ch) + " in state " + 375 Integer.toString(theState)); 376 case A_ADUP: 377 h.adup(theOutputBuffer, 0, theSize); 378 theSize = 0; 379 break; 380 case A_ADUP_SAVE: 381 h.adup(theOutputBuffer, 0, theSize); 382 theSize = 0; 383 save(ch, h); 384 break; 385 case A_ADUP_STAGC: 386 h.adup(theOutputBuffer, 0, theSize); 387 theSize = 0; 388 h.stagc(theOutputBuffer, 0, theSize); 389 break; 390 case A_ANAME: 391 h.aname(theOutputBuffer, 0, theSize); 392 theSize = 0; 393 break; 394 case A_ANAME_ADUP: 395 h.aname(theOutputBuffer, 0, theSize); 396 theSize = 0; 397 h.adup(theOutputBuffer, 0, theSize); 398 break; 399 case A_ANAME_ADUP_STAGC: 400 h.aname(theOutputBuffer, 0, theSize); 401 theSize = 0; 402 h.adup(theOutputBuffer, 0, theSize); 403 h.stagc(theOutputBuffer, 0, theSize); 404 break; 405 case A_AVAL: 406 h.aval(theOutputBuffer, 0, theSize); 407 theSize = 0; 408 break; 409 case A_AVAL_STAGC: 410 h.aval(theOutputBuffer, 0, theSize); 411 theSize = 0; 412 h.stagc(theOutputBuffer, 0, theSize); 413 break; 414 case A_CDATA: 415 mark(); 416 // suppress the final "]]" in the buffer 417 if (theSize > 1) theSize -= 2; 418 h.pcdata(theOutputBuffer, 0, theSize); 419 theSize = 0; 420 break; 421 case A_ENTITY_START: 422 h.pcdata(theOutputBuffer, 0, theSize); 423 theSize = 0; 424 save(ch, h); 425 break; 426 case A_ENTITY: 427 mark(); 428 char ch1 = (char)ch; 429 // System.out.println("Got " + ch1 + " in state " + ((theState == S_ENT) ? "S_ENT" : ((theState == S_NCR) ? "S_NCR" : "UNK"))); 430 if (theState == S_ENT && ch1 == '#') { 431 theNextState = S_NCR; 432 save(ch, h); 433 break; 434 } 435 else if (theState == S_NCR && (ch1 == 'x' || ch1 == 'X')) { 436 theNextState = S_XNCR; 437 save(ch, h); 438 break; 439 } 440 else if (theState == S_ENT && Character.isLetterOrDigit(ch1)) { 441 save(ch, h); 442 break; 443 } 444 else if (theState == S_NCR && Character.isDigit(ch1)) { 445 save(ch, h); 446 break; 447 } 448 else if (theState == S_XNCR && (Character.isDigit(ch1) || "abcdefABCDEF".indexOf(ch1) != -1)) { 449 save(ch, h); 450 break; 451 } 452 453 // The whole entity reference has been collected 454 // System.err.println("%%" + new String(theOutputBuffer, 0, theSize)); 455 h.entity(theOutputBuffer, 1, theSize - 1); 456 int ent = h.getEntity(); 457 // System.err.println("%% value = " + ent); 458 if (ent != 0) { 459 theSize = 0; 460 if (ent >= 0x80 && ent <= 0x9F) { 461 ent = theWinMap[ent-0x80]; 462 } 463 if (ent < 0x20) { 464 // Control becomes space 465 ent = 0x20; 466 } 467 else if (ent >= 0xD800 && ent <= 0xDFFF) { 468 // Surrogates get dropped 469 ent = 0; 470 } 471 else if (ent <= 0xFFFF) { 472 // BMP character 473 save(ent, h); 474 } 475 else { 476 // Astral converted to two surrogates 477 ent -= 0x10000; 478 save((ent>>10) + 0xD800, h); 479 save((ent&0x3FF) + 0xDC00, h); 480 } 481 if (ch != ';') { 482 unread(r, ch); 483 theCurrentColumn--; 484 } 485 } 486 else { 487 unread(r, ch); 488 theCurrentColumn--; 489 } 490 theNextState = S_PCDATA; 491 break; 492 case A_ETAG: 493 h.etag(theOutputBuffer, 0, theSize); 494 theSize = 0; 495 break; 496 case A_DECL: 497 h.decl(theOutputBuffer, 0, theSize); 498 theSize = 0; 499 break; 500 case A_GI: 501 h.gi(theOutputBuffer, 0, theSize); 502 theSize = 0; 503 break; 504 case A_GI_STAGC: 505 h.gi(theOutputBuffer, 0, theSize); 506 theSize = 0; 507 h.stagc(theOutputBuffer, 0, theSize); 508 break; 509 case A_LT: 510 mark(); 511 save('<', h); 512 save(ch, h); 513 break; 514 case A_LT_PCDATA: 515 mark(); 516 save('<', h); 517 h.pcdata(theOutputBuffer, 0, theSize); 518 theSize = 0; 519 break; 520 case A_PCDATA: 521 mark(); 522 h.pcdata(theOutputBuffer, 0, theSize); 523 theSize = 0; 524 break; 525 case A_CMNT: 526 mark(); 527 h.cmnt(theOutputBuffer, 0, theSize); 528 theSize = 0; 529 break; 530 case A_MINUS3: 531 save('-', h); 532 save(' ', h); 533 break; 534 case A_MINUS2: 535 save('-', h); 536 save(' ', h); 537 // fall through into A_MINUS 538 case A_MINUS: 539 save('-', h); 540 save(ch, h); 541 break; 542 case A_PI: 543 mark(); 544 h.pi(theOutputBuffer, 0, theSize); 545 theSize = 0; 546 break; 547 case A_PITARGET: 548 h.pitarget(theOutputBuffer, 0, theSize); 549 theSize = 0; 550 break; 551 case A_PITARGET_PI: 552 h.pitarget(theOutputBuffer, 0, theSize); 553 theSize = 0; 554 h.pi(theOutputBuffer, 0, theSize); 555 break; 556 case A_SAVE: 557 save(ch, h); 558 break; 559 case A_SKIP: 560 break; 561 case A_SP: 562 save(' ', h); 563 break; 564 case A_STAGC: 565 h.stagc(theOutputBuffer, 0, theSize); 566 theSize = 0; 567 break; 568 case A_EMPTYTAG: 569 mark(); 570 // System.err.println("%%% Empty tag seen"); 571 if (theSize > 0) h.gi(theOutputBuffer, 0, theSize); 572 theSize = 0; 573 h.stage(theOutputBuffer, 0, theSize); 574 break; 575 case A_UNGET: 576 unread(r, ch); 577 theCurrentColumn--; 578 break; 579 case A_UNSAVE_PCDATA: 580 if (theSize > 0) theSize--; 581 h.pcdata(theOutputBuffer, 0, theSize); 582 theSize = 0; 583 break; 584 default: 585 throw new Error("Can't process state " + action); 586 } 587 theState = theNextState; 588 } 589 h.eof(theOutputBuffer, 0, 0); 590 } 591 592 /** 593 * Mark the current scan position as a "point of interest" - start of a tag, 594 * cdata, processing instruction etc. 595 */ 596 mark()597 private void mark() { 598 theLastColumn = theCurrentColumn; 599 theLastLine = theCurrentLine; 600 } 601 602 /** 603 A callback for the ScanHandler that allows it to force 604 the lexer state to CDATA content (no markup is recognized except 605 the end of element. 606 */ 607 startCDATA()608 public void startCDATA() { theNextState = S_CDATA; } 609 save(int ch, ScanHandler h)610 private void save(int ch, ScanHandler h) throws IOException, SAXException { 611 if (theSize >= theOutputBuffer.length - 20) { 612 if (theState == S_PCDATA || theState == S_CDATA) { 613 // Return a buffer-sized chunk of PCDATA 614 h.pcdata(theOutputBuffer, 0, theSize); 615 theSize = 0; 616 } 617 else { 618 // Grow the buffer size 619 char[] newOutputBuffer = new char[theOutputBuffer.length * 2]; 620 System.arraycopy(theOutputBuffer, 0, newOutputBuffer, 0, theSize+1); 621 theOutputBuffer = newOutputBuffer; 622 } 623 } 624 theOutputBuffer[theSize++] = (char)ch; 625 } 626 627 /** 628 Test procedure. Reads HTML from the standard input and writes 629 PYX to the standard output. 630 */ 631 main(String[] argv)632 public static void main(String[] argv) throws IOException, SAXException { 633 Scanner s = new HTMLScanner(); 634 Reader r = new InputStreamReader(System.in, "UTF-8"); 635 Writer w = new OutputStreamWriter(System.out, "UTF-8"); 636 PYXWriter pw = new PYXWriter(w); 637 s.scan(r, pw); 638 w.close(); 639 } 640 641 nicechar(int in)642 private static String nicechar(int in) { 643 if (in == '\n') return "\\n"; 644 if (in < 32) return "0x"+Integer.toHexString(in); 645 return "'"+((char)in)+"'"; 646 } 647 648 } 649