import unittest, StringIO, robotparser
from test import test_support
from urllib2 import urlopen, HTTPError

HAVE_HTTPS = True
try:
    from urllib2 import HTTPSHandler
except ImportError:
    HAVE_HTTPS = False

class RobotTestCase(unittest.TestCase):
    def __init__(self, index, parser, url, good, agent):
        unittest.TestCase.__init__(self)
        if good:
            self.str = "RobotTest(%d, good, %s)" % (index, url)
        else:
            self.str = "RobotTest(%d, bad, %s)" % (index, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent

    def runTest(self):
        if isinstance(self.url, tuple):
            agent, url = self.url
        else:
            url = self.url
            agent = self.agent
        if self.good:
            self.assertTrue(self.parser.can_fetch(agent, url))
        else:
            self.assertFalse(self.parser.can_fetch(agent, url))

    def __str__(self):
        return self.str

tests = unittest.TestSuite()

def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = ['/','/test.html']
bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html']

RobotTest(1, doc, good, bad)

# 2.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:

"""

good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']

RobotTest(2, doc, good, bad)

# 3.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = ['/cyberworld/map/index.html','/','/tmp/']

RobotTest(3, doc, good, bad)

# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = [] # XFAIL '/a/b.html'
bad = ['/tmp','/tmp.html','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html','/a%2fb.html',
       '/~joe/index.html'
       ]

RobotTest(4, doc, good, bad, 'figtree')
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')

# 6.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = ['/tmp',] # XFAIL: '/a%2fb.html'
bad = ['/tmp/','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html',"/a/b.html",
       '/%7Ejoe/index.html']

RobotTest(6, doc, good, bad)

# From bug report #523041

# 7.
doc = """
User-Agent: *
Disallow: /.
"""

good = ['/foo.html']
bad = [] # Bug report says "/" should be denied, but that is not in the RFC

RobotTest(7, doc, good, bad)

# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364

# 8.
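#    The agent-specific Allow line comes before the broader Disallow, so
#    /folder1/myfile.html is expected to be fetchable by Googlebot.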
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(8, doc, good, bad, agent="Googlebot")

# 9.  This file is incorrect because "Googlebot" is a substring of
#     "Googlebot-Mobile", so test 10 works just like test 9.
doc = """
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(9, doc, good, bad, agent="Googlebot")

good = []
bad = ['/something.jpg']

RobotTest(10, doc, good, bad, agent="Googlebot-Mobile")

# 11.  Get the order correct.
doc = """
User-agent: Googlebot-Mobile
Allow: /

User-agent: Googlebot
Disallow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(11, doc, good, bad, agent="Googlebot")

good = ['/something.jpg']
bad = []

RobotTest(12, doc, good, bad, agent="Googlebot-Mobile")


# 13.  Google also got the order wrong in #8.  You need to specify the
#      URLs from more specific to more general.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(13, doc, good, bad, agent="googlebot")


# 14. For issue #6325 (query string support)
doc = """
User-agent: *
Disallow: /some/path?name=value
"""

good = ['/some/path']
bad = ['/some/path?name=value']

RobotTest(14, doc, good, bad)

# 15. For issue #4108 (obey first * entry)
doc = """
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
"""

good = ['/another/path']
bad = ['/some/path']

RobotTest(15, doc, good, bad)

# 16. Empty query (issue #17403).  The URL is normalized before matching.
doc = """
User-agent: *
Allow: /some/path?
Disallow: /another/path?
"""

good = ['/some/path?']
bad = ['/another/path?']

RobotTest(16, doc, good, bad)


class NetworkTestCase(unittest.TestCase):

    def testPasswordProtectedSite(self):
        test_support.requires('network')
        with test_support.transient_internet('mueblesmoraleda.com'):
            url = 'http://mueblesmoraleda.com'
            robots_url = url + "/robots.txt"
            # First check the URL is usable for our purposes, since the
            # test site is a bit flaky.
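            # A usable site is one where urlopen() raises HTTPError with
            # status 401 or 403; other responses cause the test to be skipped.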
            try:
                urlopen(robots_url)
            except HTTPError as e:
                if e.code not in {401, 403}:
                    self.skipTest(
                        "%r should return a 401 or 403 HTTP error, not %r"
                        % (robots_url, e.code))
            else:
                self.skipTest(
                    "%r should return a 401 or 403 HTTP error, not succeed"
                    % (robots_url))
            parser = robotparser.RobotFileParser()
            parser.set_url(url)
            try:
                parser.read()
            except IOError:
                self.skipTest('%s is unavailable' % url)
            self.assertEqual(parser.can_fetch("*", robots_url), False)

    @unittest.skipUnless(HAVE_HTTPS,
                         'need SSL support to access https://www.python.org')
    @test_support.system_must_validate_cert
    def testPythonOrg(self):
        test_support.requires('network')
        with test_support.transient_internet('www.python.org'):
            parser = robotparser.RobotFileParser(
                "https://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "https://www.python.org/robots.txt"))


def test_main():
    test_support.run_unittest(tests)
    test_support.run_unittest(NetworkTestCase)

if __name__=='__main__':
    test_support.verbose = 1
    test_main()