#!/usr/bin/env python
# Copyright (C) 2010 Google Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1.  Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
# 2.  Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""deduplicate_tests -- lists duplicated between platforms.

If platform/mac-leopard is missing an expected test output, we fall back on
platform/mac.  This means it's possible to grow redundant test outputs,
where we have the same expected data in both a platform directory and another
platform it falls back on.
"""

import collections
import errno
import fnmatch
import os
import subprocess
import sys
import re

from webkitpy.common.checkout import scm
from webkitpy.common.system import executive
from webkitpy.common.system import logutils
from webkitpy.common.system import ospath
from webkitpy.layout_tests.port import factory as port_factory

_log = logutils.get_logger(__file__)

_BASE_PLATFORM = 'base'

# Directory names under LayoutTests/platform whose corresponding platform
# name differs from the directory name.  Any directory not listed here is
# used as the platform name unchanged.
_DIRNAME_TO_PLATFORM = {
    'chromium-linux': 'chromium-linux-x86',
    'chromium-win': 'chromium-win-win7',
    'chromium-mac': 'chromium-mac-snowleopard',
}


def port_fallbacks():
    """Get the port fallback information.

    Returns:
        A dictionary mapping platform name to a list of other platforms to
        fall back on.  All platforms fall back on 'base'.
    """
    fallbacks = {_BASE_PLATFORM: []}
    for port_name in port_factory.all_port_names():
        try:
            platforms = port_factory.get(port_name).baseline_search_path()
        except NotImplementedError:
            _log.error("'%s' lacks baseline_search_path(), please fix."
                       % port_name)
            fallbacks[port_name] = [_BASE_PLATFORM]
            continue
        # Only the directory names are kept, and the first search-path entry
        # is dropped (presumably the port's own directory -- confirm against
        # baseline_search_path()).  'base' is always the last resort.
        fallbacks[port_name] = [os.path.basename(p) for p in platforms][1:]
        fallbacks[port_name].append(_BASE_PLATFORM)

    return fallbacks


def parse_git_output(git_output, glob_pattern):
    """Parses the output of git ls-tree and filters based on glob_pattern.

    Args:
        git_output: result of git ls-tree -r HEAD LayoutTests.
        glob_pattern: a pattern to filter the files.  It is matched against
            the full path, including the leading 'LayoutTests/'.
    Returns:
        A dictionary mapping (test name, hash of content) => set of paths.
        The paths are relative to LayoutTests; the test name is the path
        with any leading 'platform/<name>/' removed.
    """
    hashes = collections.defaultdict(set)
    for line in git_output.split('\n'):
        if not line:
            # Parsing stops at the first empty line; splitting output that
            # ends in a newline leaves an empty final element.
            break
        attrs, path = line.strip().split('\t')
        if not fnmatch.fnmatch(path, glob_pattern):
            continue
        path = path[len('LayoutTests/'):]
        # The platform prefix is optional, so this pattern always matches.
        match = re.match(r'^(platform/.*?/)?(.*)', path)
        test = match.group(2)
        # attrs has three space-separated fields; only the last one (the
        # content hash) is needed.
        _, _, content_hash = attrs.split(' ')
        hashes[(test, content_hash)].add(path)
    return hashes


def cluster_file_hashes(glob_pattern):
    """Get the hashes of all the test expectations in the tree.

    We cheat and use git's hashes.

    Args:
        glob_pattern: a pattern to filter the files.
    Returns:
        A dictionary mapping (test name, hash of content) => set of paths,
        as produced by parse_git_output.
    Raises:
        OSError: if running git fails for any reason other than the
            executable being missing.  When git itself cannot be found an
            error is logged and the process exits with status 1.
    """
    cmd = ('git', 'ls-tree', '-r', 'HEAD', 'LayoutTests')
    try:
        git_output = executive.Executive().run_command(
            cmd, cwd=scm.find_checkout_root())
    except OSError as e:
        if e.errno == errno.ENOENT:  # No such file or directory.
            _log.error("Error: 'No such file' when running git.")
            _log.error("This script requires git.")
            sys.exit(1)
        # Bare raise keeps the original traceback.
        raise
    return parse_git_output(git_output, glob_pattern)


def dirname_to_platform(dirname):
    """Maps a LayoutTests/platform directory name to its platform name.

    Args:
        dirname: the name of a directory under LayoutTests/platform.
    Returns:
        The platform name for the three chromium directories that have a
        longer platform name, otherwise dirname unchanged.
    """
    return _DIRNAME_TO_PLATFORM.get(dirname, dirname)


def extract_platforms(paths):
    """Extracts the platforms from a list of paths matching ^platform/(.*?)/.

    Args:
        paths: a list of paths.
    Returns:
        A dictionary mapping each platform found in paths to a path that
        belongs to it.  Paths without a platform prefix are filed under
        'base'.  If several paths share a platform, the last one wins.
    """
    platforms = {}
    for path in paths:
        match = re.match(r'^platform/(.*?)/', path)
        if match:
            platform = dirname_to_platform(match.group(1))
        else:
            platform = _BASE_PLATFORM
        platforms[platform] = path
    return platforms


def has_intermediate_results(test, fallbacks, matching_platform,
                             path_exists=os.path.exists):
    """Returns True if there is a test result that causes us to not delete
    this duplicate.

    For example, chromium-linux may be a duplicate of the checked in result,
    but chromium-win may have a different result checked in.  In this case,
    we need to keep the duplicate results.

    Args:
        test: The test name.
        fallbacks: A list of platforms we fall back on.
        matching_platform: The platform that we found the duplicate test
            result.  We can stop checking here.
        path_exists: Optional parameter that allows us to stub out
            os.path.exists for testing.
    Returns:
        True if a fallback that comes before matching_platform has a result
        for this test, False otherwise.
    """
    for dirname in fallbacks:
        platform = dirname_to_platform(dirname)
        if platform == matching_platform:
            return False
        # NOTE: this is a relative path, so the default os.path.exists
        # resolves it against the current working directory.
        test_path = os.path.join('LayoutTests', 'platform', dirname, test)
        if path_exists(test_path):
            return True
    return False


def get_relative_test_path(filename, relative_to, checkout_root=None):
    """Constructs a relative path to |filename| from |relative_to|.

    Args:
        filename: The test file we're trying to get a relative path to,
            given relative to the LayoutTests directory.
        relative_to: The absolute path we're relative to.
        checkout_root: Optional root of the checkout.  When omitted it is
            looked up with scm.find_checkout_root() at call time rather than
            when this module is imported.
    Returns:
        The result of ospath.relpath() for the file.  Callers treat a false
        value as "no usable path" (presumably returned when |filename| is
        not below |relative_to| -- confirm against ospath.relpath).
    """
    if checkout_root is None:
        checkout_root = scm.find_checkout_root()
    layout_test_dir = os.path.join(checkout_root, 'LayoutTests')
    abs_path = os.path.join(layout_test_dir, filename)
    return ospath.relpath(abs_path, relative_to)


def find_dups(hashes, port_fallbacks, relative_to):
    """Yields info about redundant test expectations.

    Args:
        hashes: a dictionary of hashes as returned by cluster_file_hashes.
        port_fallbacks: a dictionary of fallback information as returned by
            port_fallbacks().
        relative_to: the directory that we want the results relative to.
    Yields:
        A dictionary with the keys 'test', 'platform', 'fallback' and 'path'
        for every result that duplicates the result of one of its fallbacks.
    """
    # The set of known ports does not change while we iterate, so look it up
    # once instead of once per platform.
    known_ports = set(port_factory.all_port_names())

    for (test, _), cluster in hashes.items():
        if len(cluster) < 2:
            continue  # Common case: only one file with that hash.

        # Compute the list of platforms we have this particular hash for.
        platforms = extract_platforms(cluster)
        if len(platforms) == 1:
            continue

        # See if any of the platforms are redundant with each other.
        for platform, platform_path in platforms.items():
            if platform not in known_ports:
                continue
            for dirname in port_fallbacks[platform]:
                fallback = dirname_to_platform(dirname)
                if fallback not in platforms:
                    continue
                # We have to verify that there isn't an intermediate result
                # that causes this duplicate hash to exist.
                if has_intermediate_results(test, port_fallbacks[platform],
                                            fallback):
                    continue
                # We print the relative path so it's easy to pipe the results
                # to xargs rm.
                path = get_relative_test_path(platform_path, relative_to)
                if not path:
                    continue
                yield {
                    'test': test,
                    'platform': platform,
                    'fallback': dirname,
                    'path': path,
                }


def deduplicate(glob_pattern):
    """Traverses LayoutTests and returns information about duplicated files.

    Args:
        glob_pattern: glob pattern to filter the files in LayoutTests.
    Returns:
        A list of dictionaries, each containing test, path, platform and
        fallback.  Paths are relative to the current working directory.
    """
    fallbacks = port_fallbacks()
    hashes = cluster_file_hashes(glob_pattern)
    return list(find_dups(hashes, fallbacks, os.getcwd()))