"""Utilities for comparing files and directories.

Classes:
    dircmp

Functions:
    cmp(f1, f2, shallow=True) -> bool
    cmpfiles(a, b, common, shallow=True) -> ([], [], [])
    clear_cache()

"""

import os
import stat
from itertools import filterfalse
from types import GenericAlias

__all__ = ['clear_cache', 'cmp', 'dircmp', 'cmpfiles', 'DEFAULT_IGNORES']

# Results of past content comparisons, keyed by (f1, f2, sig1, sig2).
_cache = {}
# Chunk size, in bytes, used when reading files for a content comparison.
BUFSIZE = 8*1024

DEFAULT_IGNORES = [
    'RCS', 'CVS', 'tags', '.git', '.hg', '.bzr', '_darcs', '__pycache__']


def clear_cache():
    """Clear the filecmp cache."""
    _cache.clear()


def cmp(f1, f2, shallow=True):
    """Compare two files.

    Arguments:

    f1 -- First file name

    f2 -- Second file name

    shallow -- If true (the default), files whose stat signatures
               (file type, size, modification time) are equal are
               reported as equal without reading them.

    Return value:

    True if the files are the same, False otherwise.

    Raises OSError if either file cannot be stat'ed or opened.

    This function uses a cache for past comparisons and the results,
    with cache entries invalidated if their stat information
    changes.  The cache may be cleared by calling clear_cache().

    """
    s1 = _sig(os.stat(f1))
    s2 = _sig(os.stat(f2))
    # Only regular files can be equal.
    if s1[0] != stat.S_IFREG or s2[0] != stat.S_IFREG:
        return False
    if shallow and s1 == s2:
        return True
    # Files of different sizes cannot have identical contents.
    if s1[1] != s2[1]:
        return False

    # The signatures are part of the key, so a change in either file's
    # stat information results in a cache miss.
    key = (f1, f2, s1, s2)
    outcome = _cache.get(key)
    if outcome is None:
        outcome = _do_cmp(f1, f2)
        if len(_cache) > 100:      # limit the maximum size of the cache
            clear_cache()
        _cache[key] = outcome
    return outcome


def _sig(st):
    """Return the (file type, size, mtime) signature of a stat result."""
    return (stat.S_IFMT(st.st_mode),
            st.st_size,
            st.st_mtime)


def _do_cmp(f1, f2):
    """Return True if files f1 and f2 have byte-for-byte equal contents."""
    bufsize = BUFSIZE
    with open(f1, 'rb') as fp1, open(f2, 'rb') as fp2:
        while True:
            b1 = fp1.read(bufsize)
            b2 = fp2.read(bufsize)
            if b1 != b2:
                return False
            if not b1:
                # Both reads returned b'' (they compared equal above):
                # end of both files reached without a difference.
                return True


# Directory comparison class.
#
class dircmp:
    """A class that manages the comparison of 2 directories.

    dircmp(a, b, ignore=None, hide=None, *, shallow=True)
      A and B are directories.
      IGNORE is a list of names to ignore,
        defaults to DEFAULT_IGNORES.
      HIDE is a list of names to hide,
        defaults to [os.curdir, os.pardir].
      SHALLOW specifies whether to just check the stat signature (do not read
        the files) when comparing common files.
        defaults to True.

    High level usage:
      x = dircmp(dir1, dir2)
      x.report() -> prints a report on the differences between dir1 and dir2
       or
      x.report_partial_closure() -> prints report on differences between dir1
            and dir2, and reports on common immediate subdirectories.
      x.report_full_closure() -> like report_partial_closure,
            but fully recursive.

    Attributes:
     left_list, right_list: The files in dir1 and dir2,
        filtered by hide and ignore.
     common: a list of names in both dir1 and dir2.
     left_only, right_only: names only in dir1, dir2.
     common_dirs: subdirectories in both dir1 and dir2.
     common_files: files in both dir1 and dir2.
     common_funny: names in both dir1 and dir2 where the type differs between
        dir1 and dir2, or the name is not stat-able.
     same_files: list of identical files.
     diff_files: list of filenames which differ.
     funny_files: list of files which could not be compared.
     subdirs: a dictionary of dircmp objects, keyed by names in common_dirs.

    The attributes above are computed lazily: the first access to one of
    them runs the phase method that produces it (see methodmap).
    """

    def __init__(self, a, b, ignore=None, hide=None, *, shallow=True):
        """Record the two directories and the comparison options.

        No file system access happens here; see the class docstring for
        the meaning of the arguments.
        """
        self.left = a
        self.right = b
        if hide is None:
            self.hide = [os.curdir, os.pardir]  # Names never to be shown
        else:
            self.hide = hide
        if ignore is None:
            self.ignore = DEFAULT_IGNORES
        else:
            self.ignore = ignore
        self.shallow = shallow

    def phase0(self):
        """List both directories, dropping hidden and ignored names."""
        skipped = self.hide + self.ignore
        self.left_list = _filter(os.listdir(self.left), skipped)
        self.right_list = _filter(os.listdir(self.right), skipped)
        self.left_list.sort()
        self.right_list.sort()

    def phase1(self):
        """Compute the common, left-only and right-only names."""
        # Names are matched on their os.path.normcase() form; the reported
        # spelling is the one found in the respective directory (the left
        # one for common names).
        a = dict(zip(map(os.path.normcase, self.left_list), self.left_list))
        b = dict(zip(map(os.path.normcase, self.right_list), self.right_list))
        self.common = list(map(a.__getitem__, filter(b.__contains__, a)))
        self.left_only = list(map(a.__getitem__, filterfalse(b.__contains__, a)))
        self.right_only = list(map(b.__getitem__, filterfalse(a.__contains__, b)))

    def phase2(self):
        """Split the common names into directories, files and funnies."""
        self.common_dirs = []
        self.common_files = []
        self.common_funny = []

        for x in self.common:
            a_path = os.path.join(self.left, x)
            b_path = os.path.join(self.right, x)

            try:
                a_type = stat.S_IFMT(os.stat(a_path).st_mode)
                b_type = stat.S_IFMT(os.stat(b_path).st_mode)
            except OSError:
                # A name that cannot be stat'ed on either side is funny.
                self.common_funny.append(x)
                continue

            if a_type != b_type:
                self.common_funny.append(x)
            elif stat.S_ISDIR(a_type):
                self.common_dirs.append(x)
            elif stat.S_ISREG(a_type):
                self.common_files.append(x)
            else:
                self.common_funny.append(x)

    def phase3(self):
        """Find out differences between common files."""
        xx = cmpfiles(self.left, self.right, self.common_files, self.shallow)
        self.same_files, self.diff_files, self.funny_files = xx

    def phase4(self):
        """Find out differences between common subdirectories."""
        # A new comparison object is created for each common subdirectory,
        # these are stored in a dictionary indexed by filename.
        # The hide, ignore and shallow properties are inherited from the
        # parent, and so is the parent's class, so that behaviour overridden
        # in a subclass applies at every level of the tree.
        self.subdirs = {}
        for x in self.common_dirs:
            a_x = os.path.join(self.left, x)
            b_x = os.path.join(self.right, x)
            self.subdirs[x] = self.__class__(a_x, b_x, self.ignore, self.hide,
                                             shallow=self.shallow)

    def phase4_closure(self):
        """Recursively call phase4() on subdirectories."""
        self.phase4()
        for sd in self.subdirs.values():
            sd.phase4_closure()

    def report(self):
        """Print a report on the differences between a and b.

        The reported lists are sorted in place before being printed.
        """
        # Output format is purposely lousy
        print('diff', self.left, self.right)
        if self.left_only:
            self.left_only.sort()
            print('Only in', self.left, ':', self.left_only)
        if self.right_only:
            self.right_only.sort()
            print('Only in', self.right, ':', self.right_only)
        if self.same_files:
            self.same_files.sort()
            print('Identical files :', self.same_files)
        if self.diff_files:
            self.diff_files.sort()
            print('Differing files :', self.diff_files)
        if self.funny_files:
            self.funny_files.sort()
            print('Trouble with common files :', self.funny_files)
        if self.common_dirs:
            self.common_dirs.sort()
            print('Common subdirectories :', self.common_dirs)
        if self.common_funny:
            self.common_funny.sort()
            print('Common funny cases :', self.common_funny)

    def report_partial_closure(self):
        """Print reports on self and on the immediate common subdirs."""
        self.report()
        for sd in self.subdirs.values():
            print()
            sd.report()

    def report_full_closure(self):
        """Print reports on self and, recursively, on all common subdirs."""
        self.report()
        for sd in self.subdirs.values():
            print()
            sd.report_full_closure()

    # Maps each lazily computed attribute to the phase that sets it.
    methodmap = dict(subdirs=phase4,
                     same_files=phase3, diff_files=phase3, funny_files=phase3,
                     common_dirs=phase2, common_files=phase2, common_funny=phase2,
                     common=phase1, left_only=phase1, right_only=phase1,
                     left_list=phase0, right_list=phase0)

    def __getattr__(self, attr):
        """Compute a lazily evaluated attribute on first access."""
        # Only reached when normal lookup fails, i.e. before the phase that
        # sets the attribute has run.
        if attr not in self.methodmap:
            raise AttributeError(attr)
        self.methodmap[attr](self)
        return getattr(self, attr)

    __class_getitem__ = classmethod(GenericAlias)


def cmpfiles(a, b, common, shallow=True):
    """Compare common files in two directories.

    a, b -- directory names
    common -- list of file names found in both directories
    shallow -- if true, do comparison based solely on stat() information

    Returns a tuple of three lists:
      files that compare equal
      files that are different
      filenames that aren't regular files.

    """
    res = ([], [], [])
    for x in common:
        ax = os.path.join(a, x)
        bx = os.path.join(b, x)
        # _cmp() returns 0, 1 or 2, which selects the result list.
        res[_cmp(ax, bx, shallow)].append(x)
    return res


# Compare two files.
# Return:
#       0 for equal
#       1 for different
#       2 for funny cases (can't stat, etc.)
#
def _cmp(a, b, sh, abs=abs, cmp=cmp):
    """Classify the comparison of files a and b as 0, 1 or 2 (see above)."""
    try:
        # cmp() is true for equal files, so negate it: equal -> False (0),
        # different -> True (1).
        return not abs(cmp(a, b, sh))
    except OSError:
        return 2


# Return a copy with items that occur in skip removed.
#
def _filter(flist, skip):
    """Return a list of the items of flist that do not occur in skip."""
    return list(filterfalse(skip.__contains__, flist))


# Demonstration and testing.
#
def demo():
    """Compare the two directories named on the command line.

    With the -r option the report is fully recursive.
    """
    import sys
    import getopt
    options, args = getopt.getopt(sys.argv[1:], 'r')
    if len(args) != 2:
        raise getopt.GetoptError('need exactly two args', None)
    dd = dircmp(args[0], args[1])
    if ('-r', '') in options:
        dd.report_full_closure()
    else:
        dd.report()


if __name__ == '__main__':
    demo()