#!/usr/bin/env python
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Tool for seeing the real world impact of a patch.
#
# Layout Tests can tell you whether something has changed, but this can help
# you determine whether a subtle/controversial change is beneficial or not.
#
# It dumps the rendering of a large number of sites, both with and without a
# patch being evaluated, then sorts them by greatest difference in rendering,
# such that a human reviewer can quickly review the most impacted sites,
# rather than having to manually try sites to see if anything changes.
#
# In future it might be possible to extend this to other kinds of differences,
# e.g. page load times.
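#
# Typical workflow (mirroring the --help epilog below; the script name here is
# assumed from this file's location):
#   ./real_world_impact.py before 100
#   # ...apply the patch under evaluation and rebuild content_shell...
#   ./real_world_impact.py after 100
#   ./real_world_impact.py compare 100  # opens a report in your browser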

import argparse
from argparse import RawTextHelpFormatter
from contextlib import closing
import datetime
import errno
from distutils.spawn import find_executable
from operator import itemgetter
import multiprocessing
import os
import re
from cStringIO import StringIO
import subprocess
import sys
import textwrap
import time
from urllib2 import urlopen
from urlparse import urlparse
import webbrowser
from zipfile import ZipFile

from nsfw_urls import nsfw_urls

action = None
allow_js = False
additional_content_shell_flags = ""
chromium_src_root = ""
chromium_out_dir = ""
image_diff = ""
content_shell = ""
output_dir = ""
num_sites = 100
urls = []
print_lock = multiprocessing.Lock()


def MakeDirsIfNotExist(dir):
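  """Creates dir and any missing parent directories, ignoring EEXIST."""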
  try:
    os.makedirs(dir)
  except OSError as e:
    if e.errno != errno.EEXIST:
      raise


def SetupPathsAndOut():
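  """Locates the Chromium out dir, image_diff and content_shell binaries.

  Returns False if the out directory couldn't be found.
  """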
  global chromium_src_root, chromium_out_dir, output_dir
  global image_diff, content_shell
  chromium_src_root = os.path.abspath(os.path.join(os.path.dirname(__file__),
                                                   os.pardir,
                                                   os.pardir))
  # Find the out directory (it might be out_linux for users of cr).
  for out_suffix in ["_linux", ""]:
    out_dir = os.path.join(chromium_src_root, "out" + out_suffix)
    if os.path.exists(out_dir):
      chromium_out_dir = out_dir
      break
  if not chromium_out_dir:
    return False

  this_script_name = "real_world_impact"
  output_dir = os.path.join(chromium_out_dir,
                            "Release",
                            this_script_name)
  MakeDirsIfNotExist(output_dir)

  image_diff = os.path.join(chromium_out_dir, "Release", "image_diff")

  if sys.platform == 'darwin':
    content_shell = os.path.join(chromium_out_dir, "Release",
                    "Content Shell.app/Contents/MacOS/Content Shell")
  elif sys.platform.startswith('linux'):
    content_shell = os.path.join(chromium_out_dir, "Release",
                    "content_shell")
  elif sys.platform.startswith('win'):
    content_shell = os.path.join(chromium_out_dir, "Release",
                    "content_shell.exe")
  return True


def CheckPrerequisites():
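  """Checks that wget, image_diff and content_shell are all available.

  Prints instructions and returns False if anything is missing.
  """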
  if not find_executable("wget"):
    print "wget not found! Install wget and re-run this."
    return False
  if not os.path.exists(image_diff):
    print "image_diff not found (%s)!" % image_diff
    print "Build the image_diff target and re-run this."
    return False
  if not os.path.exists(content_shell):
    print "Content shell not found (%s)!" % content_shell
    print "Build Release/content_shell and re-run this."
    return False
  return True


def PickSampleUrls():
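  """Picks num_sites urls from the Alexa top 1,000,000 sites.

  Known-bad urls are excluded, and an existing sample of the same size is
  reused so that 'before N' and 'after N' runs test the same sites. Returns
  False on error.
  """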
  global urls
  data_dir = os.path.join(output_dir, "data")
  MakeDirsIfNotExist(data_dir)

  # Download Alexa top 1,000,000 sites
  # TODO(johnme): Should probably update this when it gets too stale...
  csv_path = os.path.join(data_dir, "top-1m.csv")
  if not os.path.exists(csv_path):
    print "Downloading list of top 1,000,000 sites from Alexa..."
    csv_url = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"
    with closing(urlopen(csv_url)) as stream:
      ZipFile(StringIO(stream.read())).extract("top-1m.csv", data_dir)

  bad_urls_path = os.path.join(data_dir, "bad_urls.txt")
  if os.path.exists(bad_urls_path):
    with open(bad_urls_path) as f:
      bad_urls = set(f.read().splitlines())
  else:
    bad_urls = set()

  # See if we've already selected a sample of size num_sites (this way, if you
  # call this script with arguments "before N" then "after N", where N is the
  # same number, we'll use the same sample, as expected!).
  urls_path = os.path.join(data_dir, "%06d_urls.txt" % num_sites)
  if not os.path.exists(urls_path):
    if action == 'compare':
      print ("Error: you must run 'before %d' and 'after %d' before "
             "running 'compare %d'") % (num_sites, num_sites, num_sites)
      return False
    print "Picking %d sample urls..." % num_sites

    # TODO(johnme): For now this just gets the top num_sites entries. In future
    # this should pick a weighted random sample. For example, it could fit a
    # power-law distribution, which is a good model of website popularity
    # (http://www.useit.com/alertbox/9704b.html).
    urls = []
    remaining_num_sites = num_sites
    with open(csv_path) as f:
      for entry in f:
        if remaining_num_sites <= 0:
          break
        remaining_num_sites -= 1
        hostname = entry.strip().split(',')[1]
        if '/' not in hostname:  # Skip Alexa 1,000,000 entries that have paths.
          url = "http://%s/" % hostname
          if url not in bad_urls:
            urls.append(url)
    # Don't write these to disk yet; we'll do that in SaveWorkingUrls below
    # once we have tried to download them and seen which ones fail.
  else:
    with open(urls_path) as f:
      urls = [u for u in f.read().splitlines() if u not in bad_urls]
  return True


def SaveWorkingUrls():
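  """Writes the sampled urls to disk once, after download failures are known."""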
  # TODO(johnme): Update the list if a url that used to work goes offline.
  urls_path = os.path.join(output_dir, "data", "%06d_urls.txt" % num_sites)
  if not os.path.exists(urls_path):
    with open(urls_path, 'w') as f:
      f.writelines(u + '\n' for u in urls)


def PrintElapsedTime(elapsed, detail=""):
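  """Prints elapsed (a duration in seconds) as minutes and seconds."""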
  elapsed = round(elapsed * 10) / 10.0
  m = elapsed / 60
  s = elapsed % 60
  print "Took %dm%.1fs" % (m, s), detail


def DownloadStaticCopyTask(url):
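  """Downloads a static copy of url (with page requisites) using wget.

  Pool.map worker; returns True if an index.html was produced.
  """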
  url_parts = urlparse(url)
  host_dir = os.path.join(output_dir, "data", url_parts.hostname)
  # Use wget for now, as it does a reasonable job of spidering page
  # dependencies (e.g. CSS, JS, images).
  success = True
  try:
    subprocess.check_call(["wget",
                           "--execute", "robots=off",
                           ("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS "
                            "X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) C"
                            "hrome/32.0.1700.14 Safari/537.36"),
                           "--page-requisites",
                           "--span-hosts",
                           "--adjust-extension",
                           "--convert-links",
                           "--directory-prefix=" + host_dir,
                           "--force-directories",
                           "--default-page=index.html",
                           "--no-check-certificate",
                           "--timeout=5",  # 5s timeout
                           "--tries=2",
                           "--quiet",
                           url])
  except KeyboardInterrupt:
    success = False
  except subprocess.CalledProcessError:
    # Ignoring these for now, as some sites have issues with their subresources
    # yet still produce a renderable index.html.
    pass  # success = False
  if success:
    download_path = os.path.join(host_dir, url_parts.hostname, "index.html")
    if not os.path.exists(download_path):
      success = False
    else:
      with print_lock:
        print "Downloaded:", url
  if not success:
    with print_lock:
      print "Failed to download:", url
    return False
  return True


def DownloadStaticCopies():
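  """Downloads static copies of any urls not already cached, in parallel.

  Failed urls are appended to bad_urls.txt and dropped from the sample.
  """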
  global urls
  new_urls = []
  for url in urls:
    url_parts = urlparse(url)
    host_dir = os.path.join(output_dir, "data", url_parts.hostname)
    download_path = os.path.join(host_dir, url_parts.hostname, "index.html")
    if not os.path.exists(download_path):
      new_urls.append(url)

  if new_urls:
    print "Downloading static copies of %d sites..." % len(new_urls)
    start_time = time.time()

    results = multiprocessing.Pool(20).map(DownloadStaticCopyTask, new_urls)
    failed_urls = [new_urls[i] for i, ret in enumerate(results) if not ret]
    if failed_urls:
      bad_urls_path = os.path.join(output_dir, "data", "bad_urls.txt")
      with open(bad_urls_path, 'a') as f:
        f.writelines(u + '\n' for u in failed_urls)
      failed_urls_set = set(failed_urls)
      urls = [u for u in urls if u not in failed_urls_set]

    PrintElapsedTime(time.time() - start_time)

  SaveWorkingUrls()


def RunDrtTask(url):
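  """Renders the cached copy of url in content_shell and saves the screenshot.

  Pool.map worker; returns an (elapsed_seconds, url) tuple on success, or
  False on failure.
  """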
  url_parts = urlparse(url)
  host_dir = os.path.join(output_dir, "data", url_parts.hostname)
  html_path = os.path.join(host_dir, url_parts.hostname, "index.html")

  if not allow_js:
    nojs_path = os.path.join(host_dir, url_parts.hostname, "index-nojs.html")
    if not os.path.exists(nojs_path):
      with open(html_path) as f:
        html = f.read()
      if not html:
        return False
      # These aren't intended to be XSS safe :)
      block_tags = (r'<\s*(script|object|video|audio|iframe|frameset|frame)'
                    r'\b.*?<\s*\/\s*\1\s*>')
      block_attrs = r'\s(onload|onerror)\s*=\s*(\'[^\']*\'|"[^"]*"|\S*)'
      html = re.sub(block_tags, '', html, flags=re.I|re.S)
      html = re.sub(block_attrs, '', html, flags=re.I)
      with open(nojs_path, 'w') as f:
        f.write(html)
    html_path = nojs_path

  start_time = time.time()

  with open(os.devnull, "w") as fnull:
    p = subprocess.Popen([content_shell,
                          "--dump-render-tree",
                          additional_content_shell_flags,
                          # The single quote is not a typo, it's a separator!
                          html_path + "'--pixel-test"
                         ],
                         shell=False,
                         stdout=subprocess.PIPE,
                         stderr=fnull)
  result = p.stdout.read()
  PNG_START = b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A"
  PNG_END = b"\x49\x45\x4E\x44\xAE\x42\x60\x82"
  try:
    start = result.index(PNG_START)
    end = result.rindex(PNG_END) + 8
  except ValueError:
    return False

  png_path = os.path.join(output_dir, action, url_parts.hostname + ".png")
  MakeDirsIfNotExist(os.path.dirname(png_path))
  with open(png_path, 'wb') as f:
    f.write(result[start:end])
  elapsed_time = (time.time() - start_time, url)
  return elapsed_time


def RunDrt():
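  """Takes screenshots of all urls in parallel and reports the slowest page."""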
  print "Taking screenshots of %d pages..." % len(urls)
  start_time = time.time()

  results = multiprocessing.Pool().map(RunDrtTask, urls, 1)

  max_time, url = max(t for t in results if t)
  elapsed_detail = "(slowest: %.2fs on %s)" % (max_time, url)
  PrintElapsedTime(time.time() - start_time, elapsed_detail)


def CompareResultsTask(url):
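  """Compares the before and after screenshots of url using image_diff.

  Pool.map worker; returns a (diff_percentage, url, image_path) tuple, with
  sentinel percentages of -100 and 200 marking missing screenshots.
  """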
  url_parts = urlparse(url)
  before_path = os.path.join(output_dir, "before", url_parts.hostname + ".png")
  after_path = os.path.join(output_dir, "after", url_parts.hostname + ".png")
  diff_path = os.path.join(output_dir, "diff", url_parts.hostname + ".png")
  MakeDirsIfNotExist(os.path.join(output_dir, "diff"))

  # TODO(johnme): Don't hardcode "real_world_impact".
  red_path = ("data:image/gif;base64,R0lGODlhAQABAPAAAP8AAP///yH5BAAAAAAALAAAAA"
              "ABAAEAAAICRAEAOw==")

  before_exists = os.path.exists(before_path)
  after_exists = os.path.exists(after_path)
  if not before_exists and not after_exists:
    # TODO(johnme): Make this more informative.
    return (-100, url, red_path)
  if before_exists != after_exists:
    # TODO(johnme): Make this more informative.
    return (200, url, red_path)

  # Get percentage difference.
  p = subprocess.Popen([image_diff, "--histogram",
                        before_path, after_path],
                       shell=False,
                       stdout=subprocess.PIPE)
  output, _ = p.communicate()
  if p.returncode == 0:
    return (0, url, before_path)
  diff_match = re.match(r'histogram diff: (\d+\.\d{2})% (?:passed|failed)\n'
                        r'exact diff: (\d+\.\d{2})% (?:passed|failed)', output)
  if not diff_match:
    raise Exception("image_diff output format changed")
  histogram_diff = float(diff_match.group(1))
  exact_diff = float(diff_match.group(2))
  combined_diff = max(histogram_diff + exact_diff / 8, 0.001)

  # Produce diff PNG.
  subprocess.call([image_diff, "--diff", before_path, after_path, diff_path])
  return (combined_diff, url, diff_path)


def CompareResults():
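  """Diffs all screenshots and opens an HTML report in the browser.

  Results are sorted by decreasing difference so the most affected sites
  appear first.
  """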
  print "Running image_diff on %d pages..." % len(urls)
  start_time = time.time()

  results = multiprocessing.Pool().map(CompareResultsTask, urls)
  results.sort(key=itemgetter(0), reverse=True)

  PrintElapsedTime(time.time() - start_time)

  now = datetime.datetime.today().strftime("%a %Y-%m-%d %H:%M")
  html_start = textwrap.dedent("""\
  <!DOCTYPE html>
  <html>
  <head>
  <title>Real World Impact report %s</title>
  <script>
    var togglingImg = null;
    var toggleTimer = null;

    var before = true;
    function toggle() {
      var newFolder = before ? "before" : "after";
      togglingImg.src = togglingImg.src.replace(/before|after|diff/, newFolder);
      before = !before;
      toggleTimer = setTimeout(toggle, 300);
    }

    function startToggle(img) {
      before = true;
      togglingImg = img;
      if (!img.origSrc)
        img.origSrc = img.src;
      toggle();
    }
    function stopToggle(img) {
      clearTimeout(toggleTimer);
      img.src = img.origSrc;
    }

    document.onkeydown = function(e) {
      e = e || window.event;
      var keyCode = e.keyCode || e.which;
      var newFolder;
      switch (keyCode) {
        case 49: //'1'
          newFolder = "before"; break;
        case 50: //'2'
          newFolder = "after"; break;
        case 51: //'3'
          newFolder = "diff"; break;
        default:
          return;
      }
      var imgs = document.getElementsByTagName("img");
      for (var i = 0; i < imgs.length; i++) {
        imgs[i].src = imgs[i].src.replace(/before|after|diff/, newFolder);
      }
    };
  </script>
  <style>
    h1 {
      font-family: sans-serif;
    }
    h2 {
      font-family: monospace;
      white-space: pre;
    }
    .nsfw-spacer {
      height: 50vh;
    }
    .nsfw-warning {
      background: yellow;
      border: 10px solid red;
    }
    .info {
      font-size: 1.2em;
      font-style: italic;
    }
    body:not(.details-supported) details {
      display: none;
    }
  </style>
  </head>
  <body>
    <script>
    if ('open' in document.createElement('details'))
      document.body.className = "details-supported";
    </script>
    <!--<div class="nsfw-spacer"></div>-->
    <p class="nsfw-warning">Warning: sites below are taken from the Alexa top %d
    and may be NSFW.</p>
    <!--<div class="nsfw-spacer"></div>-->
    <h1>Real World Impact report %s</h1>
    <p class="info">Press 1, 2 and 3 to switch between before, after and diff
    screenshots respectively; or hover over the images to rapidly alternate
    between before and after.</p>
  """ % (now, num_sites, now))

  html_same_row = """\
  <h2>No difference on <a href="%s">%s</a>.</h2>
  """

  html_diff_row = """\
  <h2>%7.3f%% difference on <a href="%s">%s</a>:</h2>
  <img src="%s" width="800" height="600"
       onmouseover="startToggle(this)" onmouseout="stopToggle(this)">
  """

  html_nsfw_diff_row = """\
  <h2>%7.3f%% difference on <a href="%s">%s</a>:</h2>
  <details>
    <summary>This site may be NSFW. Click to expand/collapse.</summary>
    <img src="%s" width="800" height="600"
         onmouseover="startToggle(this)" onmouseout="stopToggle(this)">
  </details>
  """

  html_end = textwrap.dedent("""\
  </body>
  </html>""")

  html_path = os.path.join(output_dir, "diff.html")
  with open(html_path, 'w') as f:
    f.write(html_start)
    for (diff_float, url, diff_path) in results:
      diff_path = os.path.relpath(diff_path, output_dir)
      if diff_float == 0:
        f.write(html_same_row % (url, url))
      elif url in nsfw_urls:
        f.write(html_nsfw_diff_row % (diff_float, url, url, diff_path))
      else:
        f.write(html_diff_row % (diff_float, url, url, diff_path))
    f.write(html_end)

  webbrowser.open_new_tab("file://" + html_path)


def main(argv):
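  """Parses arguments and runs the requested action. Returns an exit code."""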
  global num_sites, action, allow_js, additional_content_shell_flags

  parser = argparse.ArgumentParser(
      formatter_class=RawTextHelpFormatter,
      description="Compare the real world impact of a content shell change.",
      epilog=textwrap.dedent("""\
          Example usage:
            1. Build content_shell in out/Release without any changes.
            2. Run: %s before [num sites to test (default %d)].
            3. Either:
                 a. Apply your controversial patch and rebuild content_shell.
                 b. Pass --additional_flags="--enable_your_flag" in step 4.
            4. Run: %s after [num sites to test (default %d)].
            5. Run: %s compare [num sites to test (default %d)].
               This will open the results in your web browser.
          """ % (argv[0], num_sites, argv[0], num_sites, argv[0], num_sites)))
  parser.add_argument("--allow_js", help="Don't disable JavaScript",
                      action="store_true")
  parser.add_argument("--additional_flags",
                      help="Additional flags to pass to content shell")
  parser.add_argument("action",
                      help=textwrap.dedent("""\
                        Action to perform.
                          download - Just download the sites.
                          before - Run content shell and record 'before' result.
                          after - Run content shell and record 'after' result.
                          compare - Compare before and after results.
                      """),
                      choices=["download", "before", "after", "compare"])
  parser.add_argument("num_sites",
                      help="Number of sites (default %s)" % num_sites,
                      type=int, default=num_sites, nargs='?')
  args = parser.parse_args()

  action = args.action

  if args.num_sites:
    num_sites = args.num_sites

  if args.allow_js:
    allow_js = args.allow_js

  if args.additional_flags:
    additional_content_shell_flags = args.additional_flags

  if not SetupPathsAndOut() or not CheckPrerequisites() or not PickSampleUrls():
    return 1

  if action == 'compare':
    CompareResults()
  else:
    DownloadStaticCopies()
    if action != 'download':
      RunDrt()
  return 0


if __name__ == '__main__':
  sys.exit(main(sys.argv))