1""" robotparser.py 2 3 Copyright (C) 2000 Bastian Kleineidam 4 5 You can choose between two licenses when using this package: 6 1) GNU GPLv2 7 2) PSF license for Python 2.2 8 9 The robots.txt Exclusion Protocol is implemented as specified in 10 http://www.robotstxt.org/norobots-rfc.txt 11 12""" 13import urlparse 14import urllib 15 16__all__ = ["RobotFileParser"] 17 18 19class RobotFileParser: 20 """ This class provides a set of methods to read, parse and answer 21 questions about a single robots.txt file. 22 23 """ 24 25 def __init__(self, url=''): 26 self.entries = [] 27 self.default_entry = None 28 self.disallow_all = False 29 self.allow_all = False 30 self.set_url(url) 31 self.last_checked = 0 32 33 def mtime(self): 34 """Returns the time the robots.txt file was last fetched. 35 36 This is useful for long-running web spiders that need to 37 check for new robots.txt files periodically. 38 39 """ 40 return self.last_checked 41 42 def modified(self): 43 """Sets the time the robots.txt file was last fetched to the 44 current time. 45 46 """ 47 import time 48 self.last_checked = time.time() 49 50 def set_url(self, url): 51 """Sets the URL referring to a robots.txt file.""" 52 self.url = url 53 self.host, self.path = urlparse.urlparse(url)[1:3] 54 55 def read(self): 56 """Reads the robots.txt URL and feeds it to the parser.""" 57 opener = URLopener() 58 f = opener.open(self.url) 59 lines = [line.strip() for line in f] 60 f.close() 61 self.errcode = opener.errcode 62 if self.errcode in (401, 403): 63 self.disallow_all = True 64 elif self.errcode >= 400 and self.errcode < 500: 65 self.allow_all = True 66 elif self.errcode == 200 and lines: 67 self.parse(lines) 68 69 def _add_entry(self, entry): 70 if "*" in entry.useragents: 71 # the default entry is considered last 72 if self.default_entry is None: 73 # the first default entry wins 74 self.default_entry = entry 75 else: 76 self.entries.append(entry) 77 78 def parse(self, lines): 79 """parse the input lines from a robots.txt file. 80 We allow that a user-agent: line is not preceded by 81 one or more blank lines.""" 82 # states: 83 # 0: start state 84 # 1: saw user-agent line 85 # 2: saw an allow or disallow line 86 state = 0 87 linenumber = 0 88 entry = Entry() 89 90 self.modified() 91 for line in lines: 92 linenumber += 1 93 if not line: 94 if state == 1: 95 entry = Entry() 96 state = 0 97 elif state == 2: 98 self._add_entry(entry) 99 entry = Entry() 100 state = 0 101 # remove optional comment and strip line 102 i = line.find('#') 103 if i >= 0: 104 line = line[:i] 105 line = line.strip() 106 if not line: 107 continue 108 line = line.split(':', 1) 109 if len(line) == 2: 110 line[0] = line[0].strip().lower() 111 line[1] = urllib.unquote(line[1].strip()) 112 if line[0] == "user-agent": 113 if state == 2: 114 self._add_entry(entry) 115 entry = Entry() 116 entry.useragents.append(line[1]) 117 state = 1 118 elif line[0] == "disallow": 119 if state != 0: 120 entry.rulelines.append(RuleLine(line[1], False)) 121 state = 2 122 elif line[0] == "allow": 123 if state != 0: 124 entry.rulelines.append(RuleLine(line[1], True)) 125 state = 2 126 if state == 2: 127 self._add_entry(entry) 128 129 130 def can_fetch(self, useragent, url): 131 """using the parsed robots.txt decide if useragent can fetch url""" 132 if self.disallow_all: 133 return False 134 if self.allow_all: 135 return True 136 137 # Until the robots.txt file has been read or found not 138 # to exist, we must assume that no url is allowable. 
139 # This prevents false positives when a user erroneously 140 # calls can_fetch() before calling read(). 141 if not self.last_checked: 142 return False 143 144 # search for given user agent matches 145 # the first match counts 146 parsed_url = urlparse.urlparse(urllib.unquote(url)) 147 url = urlparse.urlunparse(('', '', parsed_url.path, 148 parsed_url.params, parsed_url.query, parsed_url.fragment)) 149 url = urllib.quote(url) 150 if not url: 151 url = "/" 152 for entry in self.entries: 153 if entry.applies_to(useragent): 154 return entry.allowance(url) 155 # try the default entry last 156 if self.default_entry: 157 return self.default_entry.allowance(url) 158 # agent not found ==> access granted 159 return True 160 161 162 def __str__(self): 163 entries = self.entries 164 if self.default_entry is not None: 165 entries = entries + [self.default_entry] 166 return '\n'.join(map(str, entries)) + '\n' 167 168 169class RuleLine: 170 """A rule line is a single "Allow:" (allowance==True) or "Disallow:" 171 (allowance==False) followed by a path.""" 172 def __init__(self, path, allowance): 173 if path == '' and not allowance: 174 # an empty value means allow all 175 allowance = True 176 path = urlparse.urlunparse(urlparse.urlparse(path)) 177 self.path = urllib.quote(path) 178 self.allowance = allowance 179 180 def applies_to(self, filename): 181 return self.path == "*" or filename.startswith(self.path) 182 183 def __str__(self): 184 return (self.allowance and "Allow" or "Disallow") + ": " + self.path 185 186 187class Entry: 188 """An entry has one or more user-agents and zero or more rulelines""" 189 def __init__(self): 190 self.useragents = [] 191 self.rulelines = [] 192 193 def __str__(self): 194 ret = [] 195 for agent in self.useragents: 196 ret.extend(["User-agent: ", agent, "\n"]) 197 for line in self.rulelines: 198 ret.extend([str(line), "\n"]) 199 return ''.join(ret) 200 201 def applies_to(self, useragent): 202 """check if this entry applies to the specified agent""" 203 # split the name token and make it lower case 204 useragent = useragent.split("/")[0].lower() 205 for agent in self.useragents: 206 if agent == '*': 207 # we have the catch-all agent 208 return True 209 agent = agent.lower() 210 if agent in useragent: 211 return True 212 return False 213 214 def allowance(self, filename): 215 """Preconditions: 216 - our agent applies to this entry 217 - filename is URL decoded""" 218 for line in self.rulelines: 219 if line.applies_to(filename): 220 return line.allowance 221 return True 222 223class URLopener(urllib.FancyURLopener): 224 def __init__(self, *args): 225 urllib.FancyURLopener.__init__(self, *args) 226 self.errcode = 200 227 228 def prompt_user_passwd(self, host, realm): 229 ## If robots.txt file is accessible only with a password, 230 ## we act as if the file wasn't there. 231 return None, None 232 233 def http_error_default(self, url, fp, errcode, errmsg, headers): 234 self.errcode = errcode 235 return urllib.FancyURLopener.http_error_default(self, url, fp, errcode, 236 errmsg, headers) 237
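

# Illustrative usage sketch: robots.txt rules can be fed straight to
# parse() instead of being fetched over the network with read().
# The agent name, host and rules below are hypothetical examples.
if __name__ == '__main__':
    rp = RobotFileParser()
    rp.parse([
        "User-agent: *",
        "Allow: /private/public.html",
        "Disallow: /private/",
    ])
    # The first matching rule line wins, and the "*" entry acts as the
    # default for agents with no entry of their own.
    print rp.can_fetch("ExampleBot/1.0", "http://example.com/index.html")           # True
    print rp.can_fetch("ExampleBot/1.0", "http://example.com/private/data.html")    # False
    print rp.can_fetch("ExampleBot/1.0", "http://example.com/private/public.html")  # True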