
Lines Matching +full:url +full:- +full:parse

10     http://www.robotstxt.org/norobots-rfc.txt
14 import urllib.parse
23 """ This class provides a set of methods to read, parse and answer
28 def __init__(self, url=''):
34 self.set_url(url)
40 This is useful for long-running web spiders that need to
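
Line 40 above is from the mtime() docstring; mtime() and modified() let a long-running spider decide when to re-fetch the file. A minimal sketch, assuming an example URL and an arbitrary one-day refresh policy:

    import time
    from urllib.robotparser import RobotFileParser

    rp = RobotFileParser("https://www.example.com/robots.txt")
    rp.read()
    # ... crawl for a while ...
    # Re-fetch the robots.txt if the cached copy is more than a day old.
    if time.time() - rp.mtime() > 24 * 60 * 60:
        rp.read()
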
54 def set_url(self, url):
55 """Sets the URL referring to a robots.txt file."""
56 self.url = url
57 self.host, self.path = urllib.parse.urlparse(url)[1:3]
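
The fragment at line 57 slices the result of urllib.parse.urlparse(), which behaves as a 6-tuple (scheme, netloc, path, params, query, fragment), so indices 1:3 are the host and path. A small sketch with a made-up URL:

    import urllib.parse

    parts = urllib.parse.urlparse("http://www.example.com/robots.txt")
    host, path = parts[1:3]
    # host == "www.example.com", path == "/robots.txt"
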
60 """Reads the robots.txt URL and feeds it to the parser."""
62 f = urllib.request.urlopen(self.url)
70 self.parse(raw.decode("utf-8").splitlines())
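
read() (lines 60-70) fetches the robots.txt URL and hands the decoded, split lines to parse(). Roughly the same steps by hand, assuming the example URL is reachable:

    import urllib.request

    with urllib.request.urlopen("http://www.example.com/robots.txt") as f:
        raw = f.read()
    lines = raw.decode("utf-8").splitlines()  # what parse() receives
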
81 def parse(self, lines):
82 """Parse the input lines from a robots.txt file.
84 We allow that a user-agent: line is not preceded by
89 # 1: saw user-agent line
114 line[1] = urllib.parse.unquote(line[1].strip())
115 if line[0] == "user-agent":
129 elif line[0] == "crawl-delay":
137 elif line[0] == "request-rate":
147 # "This directive is independent of the user-agent line,
154 def can_fetch(self, useragent, url):
155 """using the parsed robots.txt decide if useragent can fetch url"""
161 # to exist, we must assume that no url is allowable.
168 parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
169 url = urllib.parse.urlunparse(('','',parsed_url.path,
171 url = urllib.parse.quote(url)
172 if not url:
173 url = "/"
176 return entry.allowance(url)
179 return self.default_entry.allowance(url)
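
Lines 168-173 normalize the candidate URL before matching rules: unquote it, keep only the path (plus params, query, fragment), re-quote it, and fall back to "/" when empty. The same normalization in isolation, with a made-up URL:

    import urllib.parse

    url = "http://www.example.com/a%20dir/page?x=1"
    parsed = urllib.parse.urlparse(urllib.parse.unquote(url))
    # Drop scheme and host; keep only the path onward.
    url = urllib.parse.urlunparse(
        ("", "", parsed.path, parsed.params, parsed.query, parsed.fragment))
    url = urllib.parse.quote(url) or "/"
    # url == "/a%20dir/page%3Fx%3D1"
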
222 path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
223 self.path = urllib.parse.quote(path)
234 """An entry has one or more user-agents and zero or more rulelines"""
244 ret.append(f"User-agent: {agent}")
246 ret.append(f"Crawl-delay: {self.delay}")
249 ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
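
The Entry fragments at lines 244-249 are the other direction: a parsed entry printing itself back out as directive lines. A short sketch of that round trip (output shown as comments; spacing follows the f-strings above):

    from urllib.robotparser import RobotFileParser

    rp = RobotFileParser()
    rp.parse(["User-agent: *", "Crawl-delay: 2", "Disallow: /private/"])
    print(rp)
    # User-agent: *
    # Crawl-delay: 2
    # Disallow: /private/
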
259 # we have the catch-all agent
268 - our agent applies to this entry
269 - filename is URL decoded"""
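
Putting the matched pieces together, a minimal end-to-end sketch of the class these lines come from (the site URL and user-agent string are examples):

    from urllib.robotparser import RobotFileParser

    rp = RobotFileParser()
    rp.set_url("https://www.example.com/robots.txt")
    rp.read()  # fetch the file and feed it to parse()
    if rp.can_fetch("MyCrawler/1.0", "https://www.example.com/some/page.html"):
        pass  # allowed to fetch the page
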