#!/usr/bin/env python
# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Does scraping for Firefox 2.0."""

import pywintypes
import sys
import time
import types

from drivers import keyboard
from drivers import mouse
from drivers import windowing

# Default version
version = "2.0.0.6"

DEFAULT_PATH = r"c:\program files\mozilla firefox\firefox.exe"

# Window class name used to find top-level Firefox windows.
_MAIN_WINDOW_CLASS = "MozillaUIWindowClass"

# Child-window class path, relative to the main window, of the content pane.
_RENDER_PANE_PATH = "MozillaWindowClass/MozillaWindowClass/MozillaWindowClass"

# Point clicked inside the main window to select the first tab.
_FIRST_TAB_POS = (30, 115)

# Region handed to windowing.WaitForThrobber().
# NOTE(review): presumably (left, top, right, bottom) of the throbber inside
# the main window for the default toolbar layout -- confirm against
# windowing.WaitForThrobber.
_THROBBER_RECT = (10, 96, 26, 112)

# TODO(jhaas): the Firefox scraper is a bit rickety at the moment. Known
# issues: 1) won't work if the default profile puts toolbars in different
# locations, 2) uses sleep() statements rather than more robust checks,
# 3) fails badly if an existing Firefox window is open when the scrape
# is invoked. This needs to be fortified at some point.


def _PathFromKwargs(kwargs):
  """Return kwargs["path"] when present and non-empty, else DEFAULT_PATH."""
  return kwargs.get("path") or DEFAULT_PATH


def _FindRenderPane(wnd):
  """Return the content pane child window of the main window |wnd|."""
  return windowing.FindChildWindow(wnd, _RENDER_PANE_PATH)


def _FirefoxWindows():
  """Return the top-level Firefox windows that currently exist."""
  return windowing.FindChildWindows(0, _MAIN_WINDOW_CLASS)


def _SelectFirstTab(wnd):
  """Make sure the browser has at least one tab and select the first one.

  Firefox is a bit of a pain: it doesn't use standard edit controls,
  and it doesn't display a throbber when there's no tab. So open a tab
  and then click on the first one.
  """
  mouse.ClickInWindow(wnd)
  keyboard.TypeString("[t]", True)
  mouse.ClickInWindow(wnd, _FIRST_TAB_POS)
  time.sleep(2)


def _LoadUrl(wnd, url, timeout):
  """Type |url| into the browser and wait for the page to finish loading.

  Args:
    wnd: main browser window
    url: URL to navigate to
    timeout: amount of time to wait for page to load

  Returns:
    The value returned by windowing.WaitForThrobber(); callers treat a
    negative value as a timeout.
  """
  # Use keyboard shortcuts
  keyboard.TypeString("{d}", True)
  keyboard.TypeString(url)
  keyboard.TypeString("\n")

  # Wait for the page to finish loading
  return windowing.WaitForThrobber(wnd, _THROBBER_RECT, timeout)


def _CloseTabs(max_attempts=None):
  """Send the close shortcut until no Firefox window is left.

  Args:
    max_attempts: maximum number of times to send the shortcut, or None
      to keep trying until every Firefox window is gone
  """
  attempts = 0
  while _FirefoxWindows() and (max_attempts is None or
                               attempts < max_attempts):
    keyboard.TypeString("[w]", True)
    time.sleep(1)
    attempts += 1


def GetBrowser(path):
  """Invoke the Firefox browser and return the process and window.

  Args:
    path: full path to browser

  Returns:
    A tuple of (process handle, main window, render pane)
  """
  if not path: path = DEFAULT_PATH

  # Invoke Firefox
  (proc, wnd) = windowing.InvokeAndWait(path)

  # Get the content pane
  render_pane = _FindRenderPane(wnd)

  return (proc, wnd, render_pane)


def InvokeBrowser(path):
  """Invoke the Firefox browser.

  Args:
    path: full path to browser

  Returns:
    A tuple of (main window, process handle, render pane). The process
    handle is None when an already-open Firefox window is reused.
  """
  # Reuse an existing instance of the browser if we can find one. This
  # may not work correctly, especially if the window is behind other windows.
  wnds = _FirefoxWindows()
  if wnds:
    wnd = wnds[0]
    proc = None
  else:
    # Fall back to the default install location, as GetBrowser() does.
    if not path: path = DEFAULT_PATH

    # Invoke Firefox
    (proc, wnd) = windowing.InvokeAndWait(path)

  # Get the content pane
  render_pane = _FindRenderPane(wnd)

  return (wnd, proc, render_pane)


def Scrape(urls, outdir, size, pos, timeout=20, **kwargs):
  """Invoke a browser, send it to a series of URLs, and save its output.

  Args:
    urls: list of URLs to scrape (a single URL string is also accepted)
    outdir: directory to place output
    size: size of browser window to use
    pos: position of browser window
    timeout: amount of time to wait for page to load
    kwargs: miscellaneous keyword args. Recognized keys:
      path: full path to browser; DEFAULT_PATH is used if missing or empty
      filename: output filename, or a callable that maps a URL to one;
        if missing, the name is derived from the URL and outdir

  Returns:
    None if success, else an error string
  """
  path = _PathFromKwargs(kwargs)

  # The process handle is not needed here; the browser is closed with
  # keyboard shortcuts below.
  (wnd, _, render_pane) = InvokeBrowser(path)

  # Resize and reposition the frame
  windowing.MoveAndSizeWindow(wnd, pos, size, render_pane)

  time.sleep(3)

  _SelectFirstTab(wnd)

  timedout = False

  # Visit each URL we're given
  if isinstance(urls, types.StringTypes): urls = [urls]

  for url in urls:
    load_time = _LoadUrl(wnd, url, timeout)
    timedout = load_time < 0

    # Give up on the remaining URLs as soon as one of them times out.
    if timedout:
      break

    # Scrape the page
    image = windowing.ScrapeWindow(render_pane)

    # Save to disk
    if "filename" in kwargs:
      if callable(kwargs["filename"]):
        filename = kwargs["filename"](url)
      else:
        filename = kwargs["filename"]
    else:
      filename = windowing.URLtoFilename(url, outdir, ".bmp")
    image.save(filename)

  # Close all the tabs, cheesily
  mouse.ClickInWindow(wnd)
  _CloseTabs()

  if timedout:
    return "timeout"
  return None


def Time(urls, size, timeout, **kwargs):
  """Measure how long it takes to load each of a series of URLs

  Args:
    urls: list of URLs to time (a single URL string is also accepted)
    size: size of browser window to use
    timeout: amount of time to wait for page to load
    kwargs: miscellaneous keyword args. Recognized keys:
      path: full path to browser; DEFAULT_PATH is used if missing or empty

  Returns:
    A list of tuples (url, time). "time" can be "crashed" or "timeout"
  """
  path = _PathFromKwargs(kwargs)
  proc = None

  # Visit each URL we're given
  if isinstance(urls, types.StringTypes): urls = [urls]

  ret = []
  for url in urls:
    try:
      # Invoke the browser if necessary
      if not proc:
        (wnd, proc, render_pane) = InvokeBrowser(path)

        # Resize and reposition the frame
        windowing.MoveAndSizeWindow(wnd, (0,0), size, render_pane)

        time.sleep(3)

        _SelectFirstTab(wnd)

      load_time = _LoadUrl(wnd, url, timeout)
      timedout = load_time < 0

      if timedout:
        load_time = "timeout"

      # Try to close the browser; if this fails it's probably a crash
      mouse.ClickInWindow(wnd)
      _CloseTabs(max_attempts=5)

      if _FirefoxWindows():
        # InvokeBrowser() returns no process handle when it reused an
        # existing window, so there may be nothing to terminate.
        if proc is not None:
          windowing.EndProcess(proc)
        load_time = "crashed"

      proc = None
    except pywintypes.error:
      proc = None
      load_time = "crashed"

    ret.append( (url, load_time) )

  # NOTE(review): proc is reset to None at the end of every iteration above,
  # so this cleanup looks unreachable; kept as a safety net -- confirm.
  if proc:
    _CloseTabs(max_attempts=5)
  return ret


def main():
  """Run a small self-test: scrape three sites into a fixed directory."""
  # We're being invoked rather than imported, so run some tests
  path = r"c:\sitecompare\scrapes\Firefox\2.0.0.6"
  windowing.PreparePath(path)

  # Scrape three sites and save the results
  Scrape(
    ["http://www.microsoft.com", "http://www.google.com",
     "http://www.sun.com"],
    path, (1024, 768), (0, 0))
  return 0


if __name__ == "__main__":
  sys.exit(main())