#! /usr/bin/env python

# Original code by Guido van Rossum; extensive changes by Sam Bayer,
# including code to check URL fragments.

"""Web tree checker.

This utility is handy to check a subweb of the world-wide web for
errors. A subweb is specified by giving one or more ``root URLs''; a
page belongs to the subweb if one of the root URLs is an initial
prefix of it.

File URL extension:

In order to ease the checking of subwebs via the local file system,
the interpretation of ``file:'' URLs is extended to mimic the behavior
of your average HTTP daemon: if a directory pathname is given, the
file index.html in that directory is returned if it exists, otherwise
a directory listing is returned. Now, you can point webchecker to the
document tree in the local file system of your HTTP daemon, and have
most of it checked. In fact the default works this way if your local
web tree is located at /usr/local/etc/httpd/htdocs (the default for
the NCSA HTTP daemon and probably others).

Report printed:

When done, it reports pages with bad links within the subweb. When
interrupted, it reports on the pages it has already checked.

In verbose mode, additional messages are printed during the
information gathering phase. By default, it prints a summary of its
work status every 50 URLs (adjustable with the -r option), and it
reports errors as they are encountered. Use the -q option to disable
this output.

Checkpoint feature:

Whether interrupted or not, it dumps its state (a Python pickle) to a
checkpoint file and the -R option allows it to restart from the
checkpoint (assuming that the pages on the subweb that were already
processed haven't changed). Even when it has run till completion, -R
can still be useful -- it will print the reports again, and -Rq prints
the errors only. In this case, the checkpoint file is not written
again. The checkpoint file can be set with the -d option.

The checkpoint file is written as a Python pickle. Remember that
Python's pickle module is currently quite slow. Give it the time it
needs to load and save the checkpoint file. When interrupted while
writing the checkpoint file, the old checkpoint file is not
overwritten, but all work done in the current run is lost.

Miscellaneous:

- You may find the (Tk-based) GUI version easier to use. See wcgui.py.

- Webchecker honors the "robots.txt" convention. Thanks to Skip
Montanaro for his robotparser.py module (included in this directory)!
The agent name is hardwired to "webchecker". URLs that are disallowed
by the robots.txt file are reported as external URLs.

- Because the SGML parser is a bit slow, very large SGML files are
skipped. The size limit can be set with the -m option.

- When the server or protocol does not tell us a file's type, we guess
it based on the URL's suffix. The mimetypes.py module (also in this
directory) has a built-in table mapping most currently known suffixes,
and in addition attempts to read the mime.types configuration files in
the default locations of Netscape and the NCSA HTTP daemon.

- We follow links indicated by <A>, <FRAME> and <IMG> tags. We also
honor the <BASE> tag.

- We now check internal NAME anchor links, as well as toplevel links.

- Checking external links is now done by default; use -x to *disable*
this feature. External links are now checked during normal
processing. (XXX The status of a checked link could be categorized
better. Later...)

- If external links are not checked, you can use the -t flag to
provide specific overrides to -x.

Usage: webchecker.py [option] ... [rooturl] ...

Options:

-R        -- restart from checkpoint file
-d file   -- checkpoint filename (default %(DUMPFILE)s)
-m bytes  -- skip HTML pages larger than this size (default %(MAXPAGE)d)
-n        -- reports only, no checking (use with -R)
-q        -- quiet operation (also suppresses external links report)
-r number -- number of links processed per round (default %(ROUNDSIZE)d)
-t root   -- specify root dir which should be treated as internal (can repeat)
-v        -- verbose operation; repeating -v will increase verbosity
-x        -- don't check external links (these are often slow to check)
-a        -- don't check name anchors

Arguments:

rooturl   -- URL to start checking
             (default %(DEFROOT)s)

"""


__version__ = "$Revision$"


import sys
import os
from types import *
import StringIO
import getopt
import pickle

import urllib
import urlparse
import sgmllib
import cgi

import mimetypes
import robotparser

# Extract real version number if necessary
if __version__[0] == '$':
    _v = __version__.split()
    if len(_v) == 3:
        __version__ = _v[1]


# Tunable parameters
DEFROOT = "file:/usr/local/etc/httpd/htdocs/"   # Default root URL
CHECKEXT = 1                            # Check external references (1 deep)
VERBOSE = 1                             # Verbosity level (0-3)
MAXPAGE = 150000                        # Ignore files bigger than this
ROUNDSIZE = 50                          # Number of links processed per round
DUMPFILE = "@webchecker.pickle"         # Pickled checkpoint
AGENTNAME = "webchecker"                # Agent name for robots.txt parser
NONAMES = 0                             # Skip name anchor checking if true (-a)


# Global variables


def main():
    checkext = CHECKEXT
    verbose = VERBOSE
    maxpage = MAXPAGE
    roundsize = ROUNDSIZE
    dumpfile = DUMPFILE
    restart = 0
    norun = 0

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vxa')
    except getopt.error, msg:
        sys.stdout = sys.stderr
        print msg
        print __doc__%globals()
        sys.exit(2)

    # The extra_roots list collects the extra internal roots given with -t.
    extra_roots = []
    nonames = NONAMES

    for o, a in opts:
        if o == '-R':
            restart = 1
        if o == '-d':
            dumpfile = a
        if o == '-m':
            maxpage = int(a)
        if o == '-n':
            norun = 1
        if o == '-q':
            verbose = 0
        if o == '-r':
            roundsize = int(a)
        if o == '-t':
            extra_roots.append(a)
        if o == '-a':
            nonames = not nonames
        if o == '-v':
            verbose = verbose + 1
        if o == '-x':
            checkext = not checkext

    if verbose > 0:
        print AGENTNAME, "version", __version__

    if restart:
        c = load_pickle(dumpfile=dumpfile, verbose=verbose)
    else:
        c = Checker()

    c.setflags(checkext=checkext, verbose=verbose,
               maxpage=maxpage, roundsize=roundsize,
               nonames=nonames
               )

    if not restart and not args:
        args.append(DEFROOT)

    for arg in args:
        c.addroot(arg)

    # The -t flag is only needed if external links are not to be
    # checked. So -t values are ignored unless -x was specified.
    if not checkext:
        for root in extra_roots:
            # Make sure it's terminated by a slash,
            # so that addroot doesn't discard the last
            # directory component.
            if root[-1] != "/":
                root = root + "/"
            c.addroot(root, add_to_do = 0)

    try:

        if not norun:
            try:
                c.run()
            except KeyboardInterrupt:
                if verbose > 0:
                    print "[run interrupted]"

        try:
            c.report()
        except KeyboardInterrupt:
            if verbose > 0:
                print "[report interrupted]"

    finally:
        if c.save_pickle(dumpfile):
            if dumpfile == DUMPFILE:
                print "Use ``%s -R'' to restart." % sys.argv[0]
            else:
                print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
                                                           dumpfile)


def load_pickle(dumpfile=DUMPFILE, verbose=VERBOSE):
    if verbose > 0:
        print "Loading checkpoint from %s ..." % dumpfile
    f = open(dumpfile, "rb")
    c = pickle.load(f)
    f.close()
    if verbose > 0:
        print "Done."
        print "Root:", "\n      ".join(c.roots)
    return c


class Checker:

    checkext = CHECKEXT
    verbose = VERBOSE
    maxpage = MAXPAGE
    roundsize = ROUNDSIZE
    nonames = NONAMES

    validflags = tuple(dir())

    def __init__(self):
        self.reset()

    def setflags(self, **kw):
        for key in kw.keys():
            if key not in self.validflags:
                raise NameError, "invalid keyword argument: %s" % str(key)
        for key, value in kw.items():
            setattr(self, key, value)

    def reset(self):
        self.roots = []
        self.todo = {}
        self.done = {}
        self.bad = {}

        # Add a name table, so that the name URLs can be checked. Also
        # serves as an implicit cache for which URLs are done.
        self.name_table = {}

        self.round = 0
        # The following are not pickled:
        self.robots = {}
        self.errors = {}
        self.urlopener = MyURLopener()
        self.changed = 0

    def note(self, level, format, *args):
        if self.verbose > level:
            if args:
                format = format%args
            self.message(format)

    def message(self, format, *args):
        if args:
            format = format%args
        print format

    def __getstate__(self):
        return (self.roots, self.todo, self.done, self.bad, self.round)

    def __setstate__(self, state):
        self.reset()
        (self.roots, self.todo, self.done, self.bad, self.round) = state
        for root in self.roots:
            self.addrobot(root)
        for url in self.bad.keys():
            self.markerror(url)

    def addroot(self, root, add_to_do = 1):
        if root not in self.roots:
            troot = root
            scheme, netloc, path, params, query, fragment = \
                    urlparse.urlparse(root)
            i = path.rfind("/") + 1
            if 0 < i < len(path):
                path = path[:i]
                troot = urlparse.urlunparse((scheme, netloc, path,
                                             params, query, fragment))
            self.roots.append(troot)
            self.addrobot(root)
            if add_to_do:
                self.newlink((root, ""), ("<root>", root))

    def addrobot(self, root):
        root = urlparse.urljoin(root, "/")
        if self.robots.has_key(root): return
        url = urlparse.urljoin(root, "/robots.txt")
        self.robots[root] = rp = robotparser.RobotFileParser()
        self.note(2, "Parsing %s", url)
        rp.debug = self.verbose > 3
        rp.set_url(url)
        try:
            rp.read()
        except (OSError, IOError), msg:
            self.note(1, "I/O error parsing %s: %s", url, msg)

    def run(self):
        while self.todo:
            self.round = self.round + 1
            self.note(0, "\nRound %d (%s)\n", self.round, self.status())
            urls = self.todo.keys()
            urls.sort()
            del urls[self.roundsize:]
            for url in urls:
                self.dopage(url)

    def status(self):
        return "%d total, %d to do, %d done, %d bad" % (
            len(self.todo)+len(self.done),
            len(self.todo), len(self.done),
            len(self.bad))

    def report(self):
        self.message("")
        if not self.todo: s = "Final"
        else: s = "Interim"
        self.message("%s Report (%s)", s, self.status())
        self.report_errors()

    def report_errors(self):
        if not self.bad:
            self.message("\nNo errors")
            return
        self.message("\nError Report:")
        sources = self.errors.keys()
        sources.sort()
        for source in sources:
            triples = self.errors[source]
            self.message("")
            if len(triples) > 1:
                self.message("%d Errors in %s", len(triples), source)
            else:
                self.message("Error in %s", source)
            # Call self.format_url() instead of referring
            # to the URL directly, since the URLs in these
            # triples are now (URL, fragment) pairs. The value
            # of the "source" variable comes from the list of
            # origins, and is a URL, not a pair.
            for url, rawlink, msg in triples:
                if rawlink != self.format_url(url): s = " (%s)" % rawlink
                else: s = ""
                self.message("  HREF %s%s\n    msg %s",
                             self.format_url(url), s, msg)

    def dopage(self, url_pair):

        # All printing of URLs uses format_url(); argument changed to
        # url_pair for clarity.
        if self.verbose > 1:
            if self.verbose > 2:
                self.show("Check ", self.format_url(url_pair),
                          "  from", self.todo[url_pair])
            else:
                self.message("Check %s", self.format_url(url_pair))
        url, local_fragment = url_pair
        if local_fragment and self.nonames:
            self.markdone(url_pair)
            return
        try:
            page = self.getpage(url_pair)
        except sgmllib.SGMLParseError, msg:
            msg = self.sanitize(msg)
            self.note(0, "Error parsing %s: %s",
                      self.format_url(url_pair), msg)
            # Don't actually mark the URL as bad - it exists, we just
            # can't parse it!
            page = None
        if page:
            # Store the page which corresponds to this URL.
            self.name_table[url] = page
            # If there is a fragment in this url_pair, and it's not
            # in the list of names for the page, call setbad(), since
            # it's a missing anchor.
            if local_fragment and local_fragment not in page.getnames():
                self.setbad(url_pair, ("Missing name anchor `%s'" % local_fragment))
            for info in page.getlinkinfos():
                # getlinkinfos() now returns the fragment as well,
                # and we store that fragment here in the "todo" dictionary.
                link, rawlink, fragment = info
                # However, we don't want the fragment as the origin, since
                # the origin is logically a page.
                origin = url, rawlink
                self.newlink((link, fragment), origin)
        else:
            # If no page has been created yet, we want to
            # record that fact.
            self.name_table[url_pair[0]] = None
        self.markdone(url_pair)

    def newlink(self, url, origin):
        if self.done.has_key(url):
            self.newdonelink(url, origin)
        else:
            self.newtodolink(url, origin)

    def newdonelink(self, url, origin):
        if origin not in self.done[url]:
            self.done[url].append(origin)

        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        self.note(3, "  Done link %s", self.format_url(url))

        # Make sure that if it's bad, the origin gets added.
        if self.bad.has_key(url):
            source, rawlink = origin
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)

    def newtodolink(self, url, origin):
        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
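        # (For reference: self.todo maps each such (URL, fragment) key to
        # the list of (origin page, rawlink) pairs that referenced it.)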
        if self.todo.has_key(url):
            if origin not in self.todo[url]:
                self.todo[url].append(origin)
            self.note(3, "  Seen todo link %s", self.format_url(url))
        else:
            self.todo[url] = [origin]
            self.note(3, "  New todo link %s", self.format_url(url))

    def format_url(self, url):
        link, fragment = url
        if fragment: return link + "#" + fragment
        else: return link

    def markdone(self, url):
        self.done[url] = self.todo[url]
        del self.todo[url]
        self.changed = 1

    def inroots(self, url):
        for root in self.roots:
            if url[:len(root)] == root:
                return self.isallowed(root, url)
        return 0

    def isallowed(self, root, url):
        root = urlparse.urljoin(root, "/")
        return self.robots[root].can_fetch(AGENTNAME, url)

    def getpage(self, url_pair):
        # Incoming argument name is a (URL, fragment) pair.
        # The page may have been cached in the name_table variable.
        url, fragment = url_pair
        if self.name_table.has_key(url):
            return self.name_table[url]

        scheme, path = urllib.splittype(url)
        if scheme in ('mailto', 'news', 'javascript', 'telnet'):
            self.note(1, " Not checking %s URL" % scheme)
            return None
        isint = self.inroots(url)

        # Ensure that openpage gets the URL pair to
        # print out its error message and record the error pair
        # correctly.
        if not isint:
            if not self.checkext:
                self.note(1, " Not checking ext link")
                return None
            f = self.openpage(url_pair)
            if f:
                self.safeclose(f)
            return None
        text, nurl = self.readhtml(url_pair)

        if nurl != url:
            self.note(1, " Redirected to %s", nurl)
            url = nurl
        if text:
            return Page(text, url, maxpage=self.maxpage, checker=self)

    # These next three functions take (URL, fragment) pairs as
    # arguments, so that openpage() receives the appropriate tuple to
    # record error messages.
    def readhtml(self, url_pair):
        url, fragment = url_pair
        text = None
        f, url = self.openhtml(url_pair)
        if f:
            text = f.read()
            f.close()
        return text, url

    def openhtml(self, url_pair):
        url, fragment = url_pair
        f = self.openpage(url_pair)
        if f:
            url = f.geturl()
            info = f.info()
            if not self.checkforhtml(info, url):
                self.safeclose(f)
                f = None
        return f, url

    def openpage(self, url_pair):
        url, fragment = url_pair
        try:
            return self.urlopener.open(url)
        except (OSError, IOError), msg:
            msg = self.sanitize(msg)
            self.note(0, "Error %s", msg)
            if self.verbose > 0:
                self.show(" HREF ", url, "  from", self.todo[url_pair])
            self.setbad(url_pair, msg)
            return None

    def checkforhtml(self, info, url):
        if info.has_key('content-type'):
            ctype = cgi.parse_header(info['content-type'])[0].lower()
            if ';' in ctype:
                # handle content-type: text/html; charset=iso8859-1 :
                ctype = ctype.split(';', 1)[0].strip()
        else:
            if url[-1:] == "/":
                return 1
            ctype, encoding = mimetypes.guess_type(url)
        if ctype == 'text/html':
            return 1
        else:
            self.note(1, " Not HTML, mime type %s", ctype)
            return 0

    def setgood(self, url):
        if self.bad.has_key(url):
            del self.bad[url]
            self.changed = 1
            self.note(0, "(Clear previously seen error)")

    def setbad(self, url, msg):
        if self.bad.has_key(url) and self.bad[url] == msg:
            self.note(0, "(Seen this error before)")
            return
        self.bad[url] = msg
        self.changed = 1
        self.markerror(url)

    def markerror(self, url):
        try:
            origins = self.todo[url]
        except KeyError:
            origins = self.done[url]
        for source, rawlink in origins:
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)

    def seterror(self, url, triple):
        try:
            # Because of the way the URLs are now processed, I need to
            # check to make sure the URL hasn't been entered in the
            # error list. The first element of the triple here is a
            # (URL, fragment) pair, but the URL key is not, since it's
            # from the list of origins.
            if triple not in self.errors[url]:
                self.errors[url].append(triple)
        except KeyError:
            self.errors[url] = [triple]

    # The following used to be toplevel functions; they have been
    # changed into methods so they can be overridden in subclasses.

    def show(self, p1, link, p2, origins):
        self.message("%s %s", p1, link)
        i = 0
        for source, rawlink in origins:
            i = i+1
            if i == 2:
                p2 = ' '*len(p2)
            if rawlink != link: s = " (%s)" % rawlink
            else: s = ""
            self.message("%s %s%s", p2, source, s)

    def sanitize(self, msg):
        if isinstance(IOError, ClassType) and isinstance(msg, IOError):
            # Do the other branch recursively
            msg.args = self.sanitize(msg.args)
        elif isinstance(msg, TupleType):
            if len(msg) >= 4 and msg[0] == 'http error' and \
               isinstance(msg[3], InstanceType):
                # Remove the Message instance -- it may contain
                # a file object which prevents pickling.
                msg = msg[:3] + msg[4:]
        return msg

    def safeclose(self, f):
        try:
            url = f.geturl()
        except AttributeError:
            pass
        else:
            if url[:4] == 'ftp:' or url[:7] == 'file://':
                # Apparently ftp connections don't like to be closed
                # prematurely...
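                # (The read() below drains any remaining data before the
                # close() further down; this is a workaround observed to
                # help, not documented urllib behaviour.)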
                text = f.read()
        f.close()

    def save_pickle(self, dumpfile=DUMPFILE):
        if not self.changed:
            self.note(0, "\nNo need to save checkpoint")
        elif not dumpfile:
            self.note(0, "No dumpfile, won't save checkpoint")
        else:
            self.note(0, "\nSaving checkpoint to %s ...", dumpfile)
            newfile = dumpfile + ".new"
            f = open(newfile, "wb")
            pickle.dump(self, f)
            f.close()
            try:
                os.unlink(dumpfile)
            except os.error:
                pass
            os.rename(newfile, dumpfile)
            self.note(0, "Done.")
            return 1


class Page:

    def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE, checker=None):
        self.text = text
        self.url = url
        self.verbose = verbose
        self.maxpage = maxpage
        self.checker = checker

        # The page is parsed in __init__() so that the list of names it
        # contains is available immediately.  The parser is stored in an
        # instance variable, and the URL is passed to MyHTMLParser().
        size = len(self.text)
        if size > self.maxpage:
            self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001))
            self.parser = None
            return
        self.checker.note(2, "  Parsing %s (%d bytes)", self.url, size)
        self.parser = MyHTMLParser(url, verbose=self.verbose,
                                   checker=self.checker)
        self.parser.feed(self.text)
        self.parser.close()

    def note(self, level, msg, *args):
        if self.checker:
            apply(self.checker.note, (level, msg) + args)
        else:
            if self.verbose >= level:
                if args:
                    msg = msg%args
                print msg

    # Method to retrieve names.
    def getnames(self):
        if self.parser:
            return self.parser.names
        else:
            return []

    def getlinkinfos(self):
        # Parsing was done in __init__(); self.parser records whether
        # it succeeded.

        # If no parser was stored, fail.
        if not self.parser: return []

        rawlinks = self.parser.getlinks()
        base = urlparse.urljoin(self.url, self.parser.getbase() or "")
        infos = []
        for rawlink in rawlinks:
            t = urlparse.urlparse(rawlink)
            # DON'T DISCARD THE FRAGMENT! Instead, include
            # it in the tuples which are returned. See Checker.dopage().
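            # (For example, a rawlink of "sub/page.html#sec2" would come
            # back as fragment "sec2" and rawlink "sub/page.html" before
            # the join against the base URL; the names are illustrative.)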
            fragment = t[-1]
            t = t[:-1] + ('',)
            rawlink = urlparse.urlunparse(t)
            link = urlparse.urljoin(base, rawlink)
            infos.append((link, rawlink, fragment))

        return infos


class MyStringIO(StringIO.StringIO):

    def __init__(self, url, info):
        self.__url = url
        self.__info = info
        StringIO.StringIO.__init__(self)

    def info(self):
        return self.__info

    def geturl(self):
        return self.__url


class MyURLopener(urllib.FancyURLopener):

    http_error_default = urllib.URLopener.http_error_default

    def __init__(*args):
        self = args[0]
        apply(urllib.FancyURLopener.__init__, args)
        self.addheaders = [
            ('User-agent', 'Python-webchecker/%s' % __version__),
            ]

    def http_error_401(self, url, fp, errcode, errmsg, headers):
        return None

    def open_file(self, url):
        path = urllib.url2pathname(urllib.unquote(url))
        if os.path.isdir(path):
            if path[-1] != os.sep:
                url = url + '/'
            indexpath = os.path.join(path, "index.html")
            if os.path.exists(indexpath):
                return self.open_file(url + "index.html")
            try:
                names = os.listdir(path)
            except os.error, msg:
                exc_type, exc_value, exc_tb = sys.exc_info()
                raise IOError, msg, exc_tb
            names.sort()
            s = MyStringIO("file:"+url, {'content-type': 'text/html'})
            s.write('<BASE HREF="file:%s">\n' %
                    urllib.quote(os.path.join(path, "")))
            for name in names:
                q = urllib.quote(name)
                s.write('<A HREF="%s">%s</A>\n' % (q, q))
            s.seek(0)
            return s
        return urllib.FancyURLopener.open_file(self, url)


class MyHTMLParser(sgmllib.SGMLParser):

    def __init__(self, url, verbose=VERBOSE, checker=None):
        self.myverbose = verbose # now unused
        self.checker = checker
        self.base = None
        self.links = {}
        self.names = []
        self.url = url
        sgmllib.SGMLParser.__init__(self)

    def check_name_id(self, attributes):
        """ Check the name or id attributes on an element.
        """
        # We must rescue the NAME or id (name is deprecated in XHTML)
        # attributes from the anchor, in order to
        # cache the internal anchors which are made
        # available in the page.
        for name, value in attributes:
            if name == "name" or name == "id":
                if value in self.names:
                    self.checker.message("WARNING: duplicate ID name %s in %s",
                                         value, self.url)
                else: self.names.append(value)
                break

    def unknown_starttag(self, tag, attributes):
        """ In XHTML, you can have id attributes on any element.
805 """ 806 self.check_name_id(attributes) 807 808 def start_a(self, attributes): 809 self.link_attr(attributes, 'href') 810 self.check_name_id(attributes) 811 812 def end_a(self): pass 813 814 def do_area(self, attributes): 815 self.link_attr(attributes, 'href') 816 self.check_name_id(attributes) 817 818 def do_body(self, attributes): 819 self.link_attr(attributes, 'background', 'bgsound') 820 self.check_name_id(attributes) 821 822 def do_img(self, attributes): 823 self.link_attr(attributes, 'src', 'lowsrc') 824 self.check_name_id(attributes) 825 826 def do_frame(self, attributes): 827 self.link_attr(attributes, 'src', 'longdesc') 828 self.check_name_id(attributes) 829 830 def do_iframe(self, attributes): 831 self.link_attr(attributes, 'src', 'longdesc') 832 self.check_name_id(attributes) 833 834 def do_link(self, attributes): 835 for name, value in attributes: 836 if name == "rel": 837 parts = value.lower().split() 838 if ( parts == ["stylesheet"] 839 or parts == ["alternate", "stylesheet"]): 840 self.link_attr(attributes, "href") 841 break 842 self.check_name_id(attributes) 843 844 def do_object(self, attributes): 845 self.link_attr(attributes, 'data', 'usemap') 846 self.check_name_id(attributes) 847 848 def do_script(self, attributes): 849 self.link_attr(attributes, 'src') 850 self.check_name_id(attributes) 851 852 def do_table(self, attributes): 853 self.link_attr(attributes, 'background') 854 self.check_name_id(attributes) 855 856 def do_td(self, attributes): 857 self.link_attr(attributes, 'background') 858 self.check_name_id(attributes) 859 860 def do_th(self, attributes): 861 self.link_attr(attributes, 'background') 862 self.check_name_id(attributes) 863 864 def do_tr(self, attributes): 865 self.link_attr(attributes, 'background') 866 self.check_name_id(attributes) 867 868 def link_attr(self, attributes, *args): 869 for name, value in attributes: 870 if name in args: 871 if value: value = value.strip() 872 if value: self.links[value] = None 873 874 def do_base(self, attributes): 875 for name, value in attributes: 876 if name == 'href': 877 if value: value = value.strip() 878 if value: 879 if self.checker: 880 self.checker.note(1, " Base %s", value) 881 self.base = value 882 self.check_name_id(attributes) 883 884 def getlinks(self): 885 return self.links.keys() 886 887 def getbase(self): 888 return self.base 889 890 891if __name__ == '__main__': 892 main() 893