1""" robotparser.py 2 3 Copyright (C) 2000 Bastian Kleineidam 4 5 You can choose between two licenses when using this package: 6 1) GNU GPLv2 7 2) PSF license for Python 2.2 8 9 The robots.txt Exclusion Protocol is implemented as specified in 10 http://www.robotstxt.org/norobots-rfc.txt 11""" 12 13import collections 14import urllib.parse 15import urllib.request 16 17__all__ = ["RobotFileParser"] 18 19RequestRate = collections.namedtuple("RequestRate", "requests seconds") 20 21 22class RobotFileParser: 23 """ This class provides a set of methods to read, parse and answer 24 questions about a single robots.txt file. 25 26 """ 27 28 def __init__(self, url=''): 29 self.entries = [] 30 self.sitemaps = [] 31 self.default_entry = None 32 self.disallow_all = False 33 self.allow_all = False 34 self.set_url(url) 35 self.last_checked = 0 36 37 def mtime(self): 38 """Returns the time the robots.txt file was last fetched. 39 40 This is useful for long-running web spiders that need to 41 check for new robots.txt files periodically. 42 43 """ 44 return self.last_checked 45 46 def modified(self): 47 """Sets the time the robots.txt file was last fetched to the 48 current time. 49 50 """ 51 import time 52 self.last_checked = time.time() 53 54 def set_url(self, url): 55 """Sets the URL referring to a robots.txt file.""" 56 self.url = url 57 self.host, self.path = urllib.parse.urlparse(url)[1:3] 58 59 def read(self): 60 """Reads the robots.txt URL and feeds it to the parser.""" 61 try: 62 f = urllib.request.urlopen(self.url) 63 except urllib.error.HTTPError as err: 64 if err.code in (401, 403): 65 self.disallow_all = True 66 elif err.code >= 400 and err.code < 500: 67 self.allow_all = True 68 else: 69 raw = f.read() 70 self.parse(raw.decode("utf-8").splitlines()) 71 72 def _add_entry(self, entry): 73 if "*" in entry.useragents: 74 # the default entry is considered last 75 if self.default_entry is None: 76 # the first default entry wins 77 self.default_entry = entry 78 else: 79 self.entries.append(entry) 80 81 def parse(self, lines): 82 """Parse the input lines from a robots.txt file. 83 84 We allow that a user-agent: line is not preceded by 85 one or more blank lines. 
86 """ 87 # states: 88 # 0: start state 89 # 1: saw user-agent line 90 # 2: saw an allow or disallow line 91 state = 0 92 entry = Entry() 93 94 self.modified() 95 for line in lines: 96 if not line: 97 if state == 1: 98 entry = Entry() 99 state = 0 100 elif state == 2: 101 self._add_entry(entry) 102 entry = Entry() 103 state = 0 104 # remove optional comment and strip line 105 i = line.find('#') 106 if i >= 0: 107 line = line[:i] 108 line = line.strip() 109 if not line: 110 continue 111 line = line.split(':', 1) 112 if len(line) == 2: 113 line[0] = line[0].strip().lower() 114 line[1] = urllib.parse.unquote(line[1].strip()) 115 if line[0] == "user-agent": 116 if state == 2: 117 self._add_entry(entry) 118 entry = Entry() 119 entry.useragents.append(line[1]) 120 state = 1 121 elif line[0] == "disallow": 122 if state != 0: 123 entry.rulelines.append(RuleLine(line[1], False)) 124 state = 2 125 elif line[0] == "allow": 126 if state != 0: 127 entry.rulelines.append(RuleLine(line[1], True)) 128 state = 2 129 elif line[0] == "crawl-delay": 130 if state != 0: 131 # before trying to convert to int we need to make 132 # sure that robots.txt has valid syntax otherwise 133 # it will crash 134 if line[1].strip().isdigit(): 135 entry.delay = int(line[1]) 136 state = 2 137 elif line[0] == "request-rate": 138 if state != 0: 139 numbers = line[1].split('/') 140 # check if all values are sane 141 if (len(numbers) == 2 and numbers[0].strip().isdigit() 142 and numbers[1].strip().isdigit()): 143 entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1])) 144 state = 2 145 elif line[0] == "sitemap": 146 # According to http://www.sitemaps.org/protocol.html 147 # "This directive is independent of the user-agent line, 148 # so it doesn't matter where you place it in your file." 149 # Therefore we do not change the state of the parser. 150 self.sitemaps.append(line[1]) 151 if state == 2: 152 self._add_entry(entry) 153 154 def can_fetch(self, useragent, url): 155 """using the parsed robots.txt decide if useragent can fetch url""" 156 if self.disallow_all: 157 return False 158 if self.allow_all: 159 return True 160 # Until the robots.txt file has been read or found not 161 # to exist, we must assume that no url is allowable. 162 # This prevents false positives when a user erroneously 163 # calls can_fetch() before calling read(). 
        if not self.last_checked:
            return False
        # search for given user agent matches
        # the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
                                       parsed_url.params, parsed_url.query,
                                       parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def crawl_delay(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.delay
        if self.default_entry:
            return self.default_entry.delay
        return None

    def request_rate(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.req_rate
        if self.default_entry:
            return self.default_entry.req_rate
        return None

    def site_maps(self):
        if not self.sitemaps:
            return None
        return self.sitemaps

    def __str__(self):
        entries = self.entries
        if self.default_entry is not None:
            entries = entries + [self.default_entry]
        return '\n\n'.join(map(str, entries))


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []
        self.delay = None
        self.req_rate = None

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.append(f"User-agent: {agent}")
        if self.delay is not None:
            ret.append(f"Crawl-delay: {self.delay}")
        if self.req_rate is not None:
            rate = self.req_rate
            ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
        ret.extend(map(str, self.rulelines))
        return '\n'.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True
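

# ----------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original
# module): it feeds an in-memory robots.txt to the parser instead of
# fetching one with read(), so the host name and rules below are
# assumptions made up for this example.
if __name__ == "__main__":
    rp = RobotFileParser("https://example.com/robots.txt")
    rp.parse([
        "User-agent: *",
        "Crawl-delay: 2",
        "Request-rate: 3/10",
        "Disallow: /private/",
        "Sitemap: https://example.com/sitemap.xml",
    ])
    # Expected output: False, True, 2, RequestRate(requests=3, seconds=10),
    # ['https://example.com/sitemap.xml']
    print(rp.can_fetch("MyBot/1.0", "https://example.com/private/page.html"))
    print(rp.can_fetch("MyBot/1.0", "https://example.com/index.html"))
    print(rp.crawl_delay("MyBot/1.0"))
    print(rp.request_rate("MyBot/1.0"))
    print(rp.site_maps())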