• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python
2# Copyright (c) 2011 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Does scraping for Firefox 2.0."""
7
8import pywintypes
9import time
10import types
11
12from drivers import keyboard
13from drivers import mouse
14from drivers import windowing
15
# Default Firefox version string for this scraper. It is not referenced
# anywhere else in this file.
version = "2.0.0.6"

# Browser executable used whenever a caller supplies no (or an empty) path.
DEFAULT_PATH = r"c:\program files\mozilla firefox\firefox.exe"
20
21# TODO(jhaas): the Firefox scraper is a bit rickety at the moment. Known
22# issues: 1) won't work if the default profile puts toolbars in different
23# locations, 2) uses sleep() statements rather than more robust checks,
24# 3) fails badly if an existing Firefox window is open when the scrape
25# is invoked. This needs to be fortified at some point.
26
def GetBrowser(path):
  """Launch Firefox and locate its content pane.

  Args:
    path: full path to the browser executable; a falsy value selects
      DEFAULT_PATH

  Returns:
    A tuple of (process handle, main window, render pane)
  """
  browser_path = path or DEFAULT_PATH

  # Start the browser and wait for its top-level window
  proc, wnd = windowing.InvokeAndWait(browser_path)

  # Look up the content pane below the top-level window. The second
  # argument is presumably a slash-separated chain of window class names.
  pane_classes = "MozillaWindowClass/MozillaWindowClass/MozillaWindowClass"
  render_pane = windowing.FindChildWindow(wnd, pane_classes)

  return (proc, wnd, render_pane)
47
48
def InvokeBrowser(path):
  """Find an open Firefox window, or start the browser if there is none.

  Args:
    path: full path to browser; only used when no existing window is found

  Returns:
    A tuple of (main window, process handle, render pane). The process
    handle is None when an existing window was reused.
  """
  # Prefer a browser instance that is already running. This may not work
  # correctly, especially if the window is behind other windows.
  existing = windowing.FindChildWindows(0, "MozillaUIWindowClass")
  if not len(existing):
    # Nothing to reuse, so invoke Firefox
    (proc, wnd) = windowing.InvokeAndWait(path)
  else:
    wnd = existing[0]
    proc = None

  # Get the content pane
  pane_classes = "MozillaWindowClass/MozillaWindowClass/MozillaWindowClass"
  render_pane = windowing.FindChildWindow(wnd, pane_classes)

  return (wnd, proc, render_pane)
74
75
def Scrape(urls, outdir, size, pos, timeout=20, **kwargs):
  """Invoke a browser, send it to a series of URLs, and save its output.

  Args:
    urls: list of URLs to scrape; a single URL string is also accepted
    outdir: directory to place output
    size: size of browser window to use
    pos: position of browser window
    timeout: amount of time to wait for page to load
    kwargs: miscellaneous keyword args. Recognized keys:
      path: full path to the browser; DEFAULT_PATH if missing or empty
      filename: output file name, or a callable mapping a URL to one;
        when absent the name is derived from the URL and outdir

  Returns:
    None if success, else an error string ("timeout")
  """
  path = kwargs.get("path") or DEFAULT_PATH

  (wnd, proc, render_pane) = InvokeBrowser(path)

  # Resize and reposition the frame
  windowing.MoveAndSizeWindow(wnd, pos, size, render_pane)

  time.sleep(3)

  # Firefox is a bit of a pain: it doesn't use standard edit controls,
  # and it doesn't display a throbber when there's no tab. Let's make
  # sure there's at least one tab, then select the first one.
  # The click position is a hard-coded window coordinate, so it depends on
  # the toolbar layout.
  mouse.ClickInWindow(wnd)
  keyboard.TypeString("[t]", True)
  mouse.ClickInWindow(wnd, (30, 115))
  time.sleep(2)

  timedout = False

  # Accept a bare URL string as a one-element list. isinstance() also
  # covers string subclasses, and the getattr() fallback keeps this working
  # where types.StringTypes does not exist.
  string_types = getattr(types, "StringTypes", (str,))
  if isinstance(urls, string_types): urls = [urls]

  # Visit each URL we're given
  for url in urls:

    # Use keyboard shortcuts to enter the URL and start the load
    keyboard.TypeString("{d}", True)
    keyboard.TypeString(url)
    keyboard.TypeString("\n")

    # Wait for the page to finish loading; a negative result means the
    # wait timed out. The rectangle is a hard-coded throbber location.
    load_time = windowing.WaitForThrobber(wnd, (10, 96, 26, 112), timeout)
    timedout = load_time < 0

    if timedout:
      # Give up on the remaining URLs as well
      break

    # Scrape the page
    image = windowing.ScrapeWindow(render_pane)

    # Save to disk
    if "filename" in kwargs:
      if callable(kwargs["filename"]):
        filename = kwargs["filename"](url)
      else:
        filename = kwargs["filename"]
    else:
      filename = windowing.URLtoFilename(url, outdir, ".bmp")
    image.save(filename)

  # Close all the tabs, cheesily: keep sending the close shortcut until no
  # Firefox top-level window is left
  mouse.ClickInWindow(wnd)

  while len(windowing.FindChildWindows(0, "MozillaUIWindowClass")):
    keyboard.TypeString("[w]", True)
    time.sleep(1)

  if timedout:
    return "timeout"
150
151
def Time(urls, size, timeout, **kwargs):
  """Measure how long it takes to load each of a series of URLs

  Args:
    urls: list of URLs to time; a single URL string is also accepted
    size: size of browser window to use
    timeout: amount of time to wait for page to load
    kwargs: miscellaneous keyword args. Recognized keys:
      path: full path to the browser; DEFAULT_PATH if missing or empty

  Returns:
    A list of tuples (url, time). "time" can be "crashed" or "timeout"
  """
  path = kwargs.get("path") or DEFAULT_PATH
  proc = None

  # Accept a bare URL string as a one-element list. isinstance() also
  # covers string subclasses, and the getattr() fallback keeps this working
  # where types.StringTypes does not exist.
  string_types = getattr(types, "StringTypes", (str,))
  if isinstance(urls, string_types): urls = [urls]

  # Visit each URL we're given
  ret = []
  for url in urls:
    try:
      # Invoke the browser if necessary.
      # NOTE(review): InvokeBrowser returns proc=None when it reuses an
      # existing window, so in that case this setup runs again for every URL.
      if not proc:
        (wnd, proc, render_pane) = InvokeBrowser(path)

        # Resize and reposition the frame
        windowing.MoveAndSizeWindow(wnd, (0,0), size, render_pane)

        time.sleep(3)

        # Firefox is a bit of a pain: it doesn't use standard edit controls,
        # and it doesn't display a throbber when there's no tab. Let's make
        # sure there's at least one tab, then select the first one

        mouse.ClickInWindow(wnd)
        keyboard.TypeString("[t]", True)
        mouse.ClickInWindow(wnd, (30, 115))
        time.sleep(2)

      # Use keyboard shortcuts to enter the URL and start the load
      keyboard.TypeString("{d}", True)
      keyboard.TypeString(url)
      keyboard.TypeString("\n")

      # Wait for the page to finish loading; a negative result means the
      # wait timed out
      load_time = windowing.WaitForThrobber(wnd, (10, 96, 26, 112), timeout)

      if load_time < 0:
        load_time = "timeout"

        # Try to close the browser; if this fails it's probably a crash
        mouse.ClickInWindow(wnd)

        count = 0
        while (len(windowing.FindChildWindows(0, "MozillaUIWindowClass"))
          and count < 5):
          keyboard.TypeString("[w]", True)
          time.sleep(1)
          count = count + 1

        # Still open after five attempts: kill it and report a crash.
        # NOTE(review): proc may be None here if an existing window was
        # reused -- confirm how windowing.EndProcess handles that.
        if len(windowing.FindChildWindows(0, "MozillaUIWindowClass")):
          windowing.EndProcess(proc)
          load_time = "crashed"

        # Force a fresh browser for the next URL
        proc = None
    except pywintypes.error:
      # A win32 error while driving the browser is reported as a crash
      proc = None
      load_time = "crashed"

    ret.append( (url, load_time) )

  # Close the browser left over from the last successful load
  if proc:
    count = 0
    while (len(windowing.FindChildWindows(0, "MozillaUIWindowClass"))
      and count < 5):
      keyboard.TypeString("[w]", True)
      time.sleep(1)
      count = count + 1
  return ret
233
234
def main():
  """Run a small self-test that scrapes three sites into a fixed directory.

  Returns:
    0, which the script entry point passes on as the exit status
  """
  # We're being invoked rather than imported, so run some tests
  outdir = r"c:\sitecompare\scrapes\Firefox\2.0.0.6"
  windowing.PreparePath(outdir)

  # Scrape three sites and save the results
  test_urls = [
    "http://www.microsoft.com",
    "http://www.google.com",
    "http://www.sun.com",
  ]
  window_size = (1024, 768)
  window_pos = (0, 0)
  Scrape(test_urls, outdir, window_size, window_pos)
  return 0
246
247
if __name__ == "__main__":
  # sys is not imported at the top of this file, so bring it in here;
  # without it the sys.exit() call below raises NameError.
  import sys
  sys.exit(main())
250