"""Utilities for comparing files and directories.

Classes:
    dircmp

Functions:
    cmp(f1, f2, shallow=True) -> int
    cmpfiles(a, b, common) -> ([], [], [])
    clear_cache()

"""

import os
import stat
from itertools import filterfalse
from types import GenericAlias

__all__ = ['clear_cache', 'cmp', 'dircmp', 'cmpfiles', 'DEFAULT_IGNORES']

# Results of content comparisons, keyed by (f1, f2, sig1, sig2).
_cache = {}
# Number of bytes read from each file per step of a content comparison.
BUFSIZE = 8*1024

# Names skipped by dircmp when no explicit ignore list is supplied.
DEFAULT_IGNORES = [
    'RCS', 'CVS', 'tags', '.git', '.hg', '.bzr', '_darcs', '__pycache__']


def clear_cache():
    """Forget every comparison result remembered by cmp()."""
    _cache.clear()


def cmp(f1, f2, shallow=True):
    """Report whether two files are the same.

    Arguments:

    f1 -- name of the first file

    f2 -- name of the second file

    shallow -- when true, two files whose stat signatures (type, size,
               mtime) are equal are accepted as identical without being
               read.  Otherwise, and whenever the signatures differ, the
               files are different if their sizes or contents differ.
               [default: True]

    Return value:

    True if the files are the same, False otherwise.  False is also
    returned when either name is not a regular file.

    Content comparison results are remembered in a cache keyed by both
    names and both stat signatures, so a change in stat information
    leads to a fresh comparison.  The cache is emptied once it holds
    more than 100 entries, and on demand through clear_cache().

    Errors raised by os.stat() or by opening the files propagate to the
    caller.

    """
    sig1 = _sig(os.stat(f1))
    sig2 = _sig(os.stat(f2))

    # Only a pair of regular files can ever compare equal.
    if not (sig1[0] == stat.S_IFREG and sig2[0] == stat.S_IFREG):
        return False
    # Shallow mode trusts matching signatures.
    if shallow and sig1 == sig2:
        return True
    # Different sizes: contents cannot match, no need to read.
    if sig1[1] != sig2[1]:
        return False

    key = (f1, f2, sig1, sig2)
    result = _cache.get(key)
    if result is None:
        result = _do_cmp(f1, f2)
        if len(_cache) > 100:      # limit the maximum size of the cache
            clear_cache()
        _cache[key] = result
    return result


def _sig(st):
    """Return the (file type, size, mtime) signature of a stat result."""
    file_type = stat.S_IFMT(st.st_mode)
    return file_type, st.st_size, st.st_mtime


def _do_cmp(f1, f2):
    """Return True if both files have identical contents, reading them
    in lockstep BUFSIZE bytes at a time."""
    step = BUFSIZE
    with open(f1, 'rb') as left, open(f2, 'rb') as right:
        # Keep reading while the chunks agree; an empty chunk means both
        # files ended at the same point without a mismatch.
        while (chunk := left.read(step)) == right.read(step):
            if not chunk:
                return True
        return False

# Directory comparison class.
#
class dircmp:
    """A class that manages the comparison of 2 directories.

    dircmp(a, b, ignore=None, hide=None, *, shallow=True)
      A and B are directories.
      IGNORE is a list of names to ignore,
        defaults to a copy of DEFAULT_IGNORES.
      HIDE is a list of names to hide,
        defaults to [os.curdir, os.pardir].
      SHALLOW specifies whether to just check the stat signature (do not read
        the files).
        defaults to True.

    High level usage:
      x = dircmp(dir1, dir2)
      x.report() -> prints a report on the differences between dir1 and dir2
       or
      x.report_partial_closure() -> prints report on differences between dir1
            and dir2, and reports on common immediate subdirectories.
      x.report_full_closure() -> like report_partial_closure,
            but fully recursive.

    Attributes:
     left_list, right_list: The files in dir1 and dir2,
        filtered by hide and ignore.
     common: a list of names in both dir1 and dir2.
     left_only, right_only: names only in dir1, dir2.
     common_dirs: subdirectories in both dir1 and dir2.
     common_files: files in both dir1 and dir2.
     common_funny: names in both dir1 and dir2 where the type differs between
        dir1 and dir2, or the name is not stat-able.
     same_files: list of identical files.
     diff_files: list of filenames which differ.
     funny_files: list of files which could not be compared.
     subdirs: a dictionary of dircmp instances (or MyDirCmp instances if this
       object is of type MyDirCmp, a subclass of dircmp), keyed by names
       in common_dirs.

    The attributes above are computed on first access: __getattr__ looks the
    name up in methodmap and runs the phase method that sets it.
    """

    def __init__(self, a, b, ignore=None, hide=None, *, shallow=True):
        """Record the two directories and the comparison options.

        No directory is read here; listings and comparisons happen when
        the corresponding attribute is first accessed.
        """
        self.left = a
        self.right = b
        if hide is None:
            self.hide = [os.curdir, os.pardir]  # Names never to be shown
        else:
            self.hide = hide
        if ignore is None:
            # Take a copy: storing the module-level list itself would let
            # a change to this instance's ignore list leak into the
            # default used by every later dircmp.
            self.ignore = list(DEFAULT_IGNORES)
        else:
            self.ignore = ignore
        self.shallow = shallow

    def phase0(self):  # List both directories, minus hidden/ignored names
        # Build the skip list once.  Unpacking (rather than hide+ignore)
        # also works when the two are different sequence types, e.g. a
        # list and a tuple.
        skip = [*self.hide, *self.ignore]
        self.left_list = _filter(os.listdir(self.left), skip)
        self.right_list = _filter(os.listdir(self.right), skip)
        self.left_list.sort()
        self.right_list.sort()

    def phase1(self):  # Compute common names
        # Names are matched on their os.path.normcase() form, but the
        # spelling actually found in each directory is what gets reported.
        a = dict(zip(map(os.path.normcase, self.left_list), self.left_list))
        b = dict(zip(map(os.path.normcase, self.right_list), self.right_list))
        self.common = list(map(a.__getitem__, filter(b.__contains__, a)))
        self.left_only = list(map(a.__getitem__, filterfalse(b.__contains__, a)))
        self.right_only = list(map(b.__getitem__, filterfalse(a.__contains__, b)))

    def phase2(self):  # Distinguish files, directories, funnies
        self.common_dirs = []
        self.common_files = []
        self.common_funny = []

        for x in self.common:
            a_path = os.path.join(self.left, x)
            b_path = os.path.join(self.right, x)

            ok = True
            try:
                a_stat = os.stat(a_path)
            except (OSError, ValueError):
                # See https://github.com/python/cpython/issues/122400
                # for the rationale for protecting against ValueError.
                ok = False
            try:
                b_stat = os.stat(b_path)
            except (OSError, ValueError):
                ok = False

            if ok:
                a_type = stat.S_IFMT(a_stat.st_mode)
                b_type = stat.S_IFMT(b_stat.st_mode)
                if a_type != b_type:
                    self.common_funny.append(x)
                elif stat.S_ISDIR(a_type):
                    self.common_dirs.append(x)
                elif stat.S_ISREG(a_type):
                    self.common_files.append(x)
                else:
                    self.common_funny.append(x)
            else:
                # At least one side could not be stat-ed.
                self.common_funny.append(x)

    def phase3(self):  # Find out differences between common files
        xx = cmpfiles(self.left, self.right, self.common_files, self.shallow)
        self.same_files, self.diff_files, self.funny_files = xx

    def phase4(self):  # Find out differences between common subdirectories
        # A new dircmp (or MyDirCmp if dircmp was subclassed) object is created
        # for each common subdirectory,
        # these are stored in a dictionary indexed by filename.
        # The hide and ignore properties are inherited from the parent
        self.subdirs = {}
        for x in self.common_dirs:
            a_x = os.path.join(self.left, x)
            b_x = os.path.join(self.right, x)
            self.subdirs[x] = self.__class__(a_x, b_x, self.ignore, self.hide,
                                             shallow=self.shallow)

    def phase4_closure(self):  # Recursively call phase4() on subdirectories
        self.phase4()
        for sd in self.subdirs.values():
            sd.phase4_closure()

    def report(self):  # Print a report on the differences between a and b
        # Output format is purposely lousy
        # Note: each non-empty result list is sorted in place before it
        # is printed.
        print('diff', self.left, self.right)
        if self.left_only:
            self.left_only.sort()
            print('Only in', self.left, ':', self.left_only)
        if self.right_only:
            self.right_only.sort()
            print('Only in', self.right, ':', self.right_only)
        if self.same_files:
            self.same_files.sort()
            print('Identical files :', self.same_files)
        if self.diff_files:
            self.diff_files.sort()
            print('Differing files :', self.diff_files)
        if self.funny_files:
            self.funny_files.sort()
            print('Trouble with common files :', self.funny_files)
        if self.common_dirs:
            self.common_dirs.sort()
            print('Common subdirectories :', self.common_dirs)
        if self.common_funny:
            self.common_funny.sort()
            print('Common funny cases :', self.common_funny)

    def report_partial_closure(self):  # Print reports on self and on subdirs
        self.report()
        for sd in self.subdirs.values():
            print()
            sd.report()

    def report_full_closure(self):  # Report on self and subdirs recursively
        self.report()
        for sd in self.subdirs.values():
            print()
            sd.report_full_closure()

    # Maps each lazily computed attribute to the phase method that sets it.
    methodmap = dict(subdirs=phase4,
                     same_files=phase3, diff_files=phase3, funny_files=phase3,
                     common_dirs=phase2, common_files=phase2, common_funny=phase2,
                     common=phase1, left_only=phase1, right_only=phase1,
                     left_list=phase0, right_list=phase0)

    def __getattr__(self, attr):
        """Compute a lazily evaluated attribute on first access.

        Raises AttributeError for any name not listed in methodmap.
        """
        if attr not in self.methodmap:
            raise AttributeError(attr)
        # Run the phase, which stores the attribute on the instance, then
        # fetch the freshly stored value.
        self.methodmap[attr](self)
        return getattr(self, attr)

    __class_getitem__ = classmethod(GenericAlias)


def cmpfiles(a, b, common, shallow=True):
    """Compare common files in two directories.

    a, b -- directory names
    common -- list of file names found in both directories
    shallow -- if true, do comparison based solely on stat() information

    Returns a tuple of three lists:
      files that compare equal
      files that are different
      files that could not be compared (the comparison raised OSError
      or ValueError, e.g. because a file is missing or unreadable).

    """
    res = ([], [], [])
    for x in common:
        ax = os.path.join(a, x)
        bx = os.path.join(b, x)
        # _cmp() returns 0, 1 or 2, which selects the result list.
        res[_cmp(ax, bx, shallow)].append(x)
    return res


# Compare two files.
# Return:
#       0 for equal
#       1 for different
#       2 for funny cases (can't stat, NUL bytes, etc.)
#
def _cmp(a, b, sh, abs=abs, cmp=cmp):
    try:
        # cmp() is true for equal files, so negate it to get index 0.
        return not abs(cmp(a, b, sh))
    except (OSError, ValueError):
        return 2


# Return a copy with items that occur in skip removed.
#
def _filter(flist, skip):
    return list(filterfalse(skip.__contains__, flist))


# Demonstration and testing.
#
def demo():
    """Compare the two directories named on the command line.

    With the -r option the report is fully recursive.
    """
    import sys
    import getopt
    options, args = getopt.getopt(sys.argv[1:], 'r')
    if len(args) != 2:
        raise getopt.GetoptError('need exactly two args', None)
    dd = dircmp(args[0], args[1])
    if ('-r', '') in options:
        dd.report_full_closure()
    else:
        dd.report()

if __name__ == '__main__':
    demo()