import os
import robotparser
import unittest
from test import support
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
import StringIO
try:
    import threading
except ImportError:
    threading = None


class BaseRobotTest:
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []

    def setUp(self):
        lines = StringIO.StringIO(self.robots_txt).readlines()
        self.parser = robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            self.assertFalse(self.parser.can_fetch(agent, url))


class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # the order of User-agent should be correct. note
    # that this file is incorrect because "Googlebot" is a
    # substring of "Googlebot-Mobile"
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']


class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the order wrong. You need
    # to specify the URLs from more specific to more general
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']


class DefaultEntryTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


class StringFormattingTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow: /some/path
    """

    expected_output = """\
User-agent: cybermapper
Disallow: /some/path

User-agent: *
Disallow: /cyberworld/map/

"""

    def test_string_formatting(self):
        self.assertEqual(str(self.parser), self.expected_output)


class RobotHandler(BaseHTTPRequestHandler):

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


@unittest.skipUnless(threading, 'threading required for this test')
class PasswordProtectedSiteTestCase(unittest.TestCase):

    def setUp(self):
        self.server = HTTPServer((support.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    @support.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))


class NetworkTestCase(unittest.TestCase):

    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with support.transient_internet(cls.base_url):
            cls.parser = robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

    def url(self, path):
        return '{}{}{}'.format(
            self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
        )

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)

    def test_can_fetch(self):
        self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
        self.assertTrue(self.parser.can_fetch('*', self.base_url))

    def test_read_404(self):
        parser = robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)


def test_main():
    support.run_unittest(
        UserAgentWildcardTest,
        RejectAllRobotsTest,
        UserAgentOrderingTest,
        UserAgentGoogleMobileTest,
        GoogleURLOrderingTest,
        DisallowQueryStringTest,
        UseFirstUserAgentWildcardTest,
        EmptyQueryStringTest,
        DefaultEntryTest,
        StringFormattingTest,
        PasswordProtectedSiteTestCase,
        NetworkTestCase)


if __name__ == "__main__":
    test_main()