
Lines Matching +full:url +full:- +full:parse

10     http://www.robotstxt.org/norobots-rfc.txt
14 import urllib.parse
23 """ This class provides a set of methods to read, parse and answer
28 def __init__(self, url=''):
34 self.set_url(url)
40 This is useful for long-running web spiders that need to
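
Line 40 above is from the mtime() docstring; mtime() and modified() let a long-running spider decide when to re-fetch the file. A minimal sketch, assuming an example URL and an arbitrary one-day refresh policy:

    import time
    from urllib.robotparser import RobotFileParser

    rp = RobotFileParser("https://www.example.com/robots.txt")
    rp.read()
    # ... crawl for a while ...
    # Re-fetch the robots.txt if the cached copy is more than a day old.
    if time.time() - rp.mtime() > 24 * 60 * 60:
        rp.read()
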
54 def set_url(self, url):
55 """Sets the URL referring to a robots.txt file."""
56 self.url = url
57 self.host, self.path = urllib.parse.urlparse(url)[1:3]
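
The fragment at line 57 slices the result of urllib.parse.urlparse(), which behaves as a 6-tuple (scheme, netloc, path, params, query, fragment), so indices 1:3 are the host and path. A small sketch with a made-up URL:

    import urllib.parse

    parts = urllib.parse.urlparse("http://www.example.com/robots.txt")
    host, path = parts[1:3]
    # host == "www.example.com", path == "/robots.txt"
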
60 """Reads the robots.txt URL and feeds it to the parser."""
62 f = urllib.request.urlopen(self.url)
70 self.parse(raw.decode("utf-8").splitlines())
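
read() (lines 60-70) fetches the robots.txt URL and hands the decoded, split lines to parse(). Roughly the same steps by hand, assuming the example URL is reachable:

    import urllib.request

    with urllib.request.urlopen("http://www.example.com/robots.txt") as f:
        raw = f.read()
    lines = raw.decode("utf-8").splitlines()  # what parse() receives
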
81 def parse(self, lines):
82 """Parse the input lines from a robots.txt file.
84 We allow that a user-agent: line is not preceded by
89 # 1: saw user-agent line
114 line[1] = urllib.parse.unquote(line[1].strip())
115 if line[0] == "user-agent":
129 elif line[0] == "crawl-delay":
137 elif line[0] == "request-rate":
147 # "This directive is independent of the user-agent line,
154 def can_fetch(self, useragent, url):
155 """using the parsed robots.txt decide if useragent can fetch url"""
161 # to exist, we must assume that no url is allowable.
168 parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
169 url = urllib.parse.urlunparse(('','',parsed_url.path,
171 url = urllib.parse.quote(url)
172 if not url:
173 url = "/"
176 return entry.allowance(url)
179 return self.default_entry.allowance(url)
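
Lines 168-173 normalize the candidate URL before matching rules: unquote it, keep only the path (plus params, query, fragment), re-quote it, and fall back to "/" when empty. The same normalization in isolation, with a made-up URL:

    import urllib.parse

    url = "http://www.example.com/a%20dir/page?x=1"
    parsed = urllib.parse.urlparse(urllib.parse.unquote(url))
    # Drop scheme and host; keep only the path onward.
    url = urllib.parse.urlunparse(
        ("", "", parsed.path, parsed.params, parsed.query, parsed.fragment))
    url = urllib.parse.quote(url) or "/"
    # url == "/a%20dir/page%3Fx%3D1"
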
222 path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
223 self.path = urllib.parse.quote(path)
234 """An entry has one or more user-agents and zero or more rulelines"""
244 ret.append(f"User-agent: {agent}")
246 ret.append(f"Crawl-delay: {self.delay}")
249 ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
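
The Entry fragments at lines 244-249 are the other direction: a parsed entry printing itself back out as directive lines. A short sketch of that round trip (output shown as comments; spacing follows the f-strings above):

    from urllib.robotparser import RobotFileParser

    rp = RobotFileParser()
    rp.parse(["User-agent: *", "Crawl-delay: 2", "Disallow: /private/"])
    print(rp)
    # User-agent: *
    # Crawl-delay: 2
    # Disallow: /private/
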
259 # we have the catch-all agent
268 - our agent applies to this entry
269 - filename is URL decoded"""
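
Putting the matched pieces together, a minimal end-to-end sketch of the class these lines come from (the site URL and user-agent string are examples):

    from urllib.robotparser import RobotFileParser

    rp = RobotFileParser()
    rp.set_url("https://www.example.com/robots.txt")
    rp.read()  # fetch the file and feed it to parse()
    if rp.can_fetch("MyCrawler/1.0", "https://www.example.com/some/page.html"):
        pass  # allowed to fetch the page
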