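"""Tests for the robotparser module.

A minimal sketch of the RobotFileParser calls exercised below (the agent
name and the example.com URLs are illustrative only):

    parser = robotparser.RobotFileParser()
    parser.set_url('http://example.com/robots.txt')
    parser.read()
    parser.can_fetch('ExampleBot', 'http://example.com/page.html')
"""
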
import os
import robotparser
import unittest
from test import support
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
import StringIO
try:
    import threading
except ImportError:
    threading = None


class BaseRobotTest:
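    """Shared harness: parse robots_txt and check can_fetch() results.

    Subclasses set robots_txt, an optional agent, and the good/bad URL
    lists; a list entry may be an (agent, url) tuple to override the
    default agent for that single check.
    """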
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []

    def setUp(self):
        lines = StringIO.StringIO(self.robots_txt).readlines()
        self.parser = robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            self.assertFalse(self.parser.can_fetch(agent, url))


class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # User-agent entries are honored in the order they appear.  Note that
    # this robots.txt is ill-formed: "Googlebot" is a substring of
    # "Googlebot-Mobile", so the first entry also captures the mobile bot.
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']


class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the ordering wrong: rules are matched first-match-wins,
    # so more specific URLs must be listed before more general ones.
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']


class DefaultEntryTest(BaseRobotTest, unittest.TestCase):
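    # the non-standard Crawl-delay and Request-rate lines should be
    # tolerated without breaking matching against the default (*) entry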
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


class StringFormattingTest(BaseRobotTest, unittest.TestCase):
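    # str(parser) is expected to list entries for specific agents first and
    # the default (*) entry last, dropping comments and unknown directives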
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow: /some/path
    """

    expected_output = """\
User-agent: cybermapper
Disallow: /some/path

User-agent: *
Disallow: /cyberworld/map/

"""

    def test_string_formatting(self):
        self.assertEqual(str(self.parser), self.expected_output)


class RobotHandler(BaseHTTPRequestHandler):
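    """Answer every GET with 403 and keep the test log quiet."""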

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


@unittest.skipUnless(threading, 'threading required for this test')
class PasswordProtectedSiteTestCase(unittest.TestCase):
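    """A 403 for robots.txt itself should make the parser disallow all URLs."""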

    def setUp(self):
        self.server = HTTPServer((support.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    @support.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = robotparser.RobotFileParser()
        parser.set_url(robots_url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))


class NetworkTestCase(unittest.TestCase):
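    """Exercise the parser against the live robots.txt on www.pythontest.net.

    Requires the 'network' test resource.
    """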

    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with support.transient_internet(cls.base_url):
            cls.parser = robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

    def url(self, path):
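        # Add a trailing slash unless the path already has a file extension.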
        return '{}{}{}'.format(
            self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
        )

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)

    def test_can_fetch(self):
        self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
        self.assertTrue(self.parser.can_fetch('*', self.base_url))

    def test_read_404(self):
        parser = robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)


def test_main():
    support.run_unittest(
        UserAgentWildcardTest,
        RejectAllRobotsTest,
        UserAgentOrderingTest,
        UserAgentGoogleMobileTest,
        GoogleURLOrderingTest,
        DisallowQueryStringTest,
        UseFirstUserAgentWildcardTest,
        EmptyQueryStringTest,
        DefaultEntryTest,
        StringFormattingTest,
        PasswordProtectedSiteTestCase,
        NetworkTestCase)


if __name__ == "__main__":
    test_main()