"""Utilities for comparing files and directories.

Classes:
    dircmp

Functions:
    cmp(f1, f2, shallow=True) -> bool
    cmpfiles(a, b, common, shallow=True) -> ([], [], [])
    clear_cache()

"""

import os
import stat
from itertools import filterfalse
from types import GenericAlias

__all__ = ['clear_cache', 'cmp', 'dircmp', 'cmpfiles', 'DEFAULT_IGNORES']

# Results of past content comparisons, keyed by (f1, f2, sig1, sig2).
_cache = {}
# Chunk size, in bytes, used when reading files for a content comparison.
BUFSIZE = 8*1024

DEFAULT_IGNORES = [
    'RCS', 'CVS', 'tags', '.git', '.hg', '.bzr', '_darcs', '__pycache__']


def clear_cache():
    """Clear the filecmp cache."""
    _cache.clear()


def cmp(f1, f2, shallow=True):
    """Compare two files.

    Arguments:

    f1 -- First file name

    f2 -- Second file name

    shallow -- treat files as identical if their stat signatures (type, size,
               mtime) are identical. Otherwise, files are considered different
               if their sizes or contents differ.  [default: True]

    Return value:

    True if the files are the same, False otherwise.

    Raises OSError if either file cannot be stat'ed or opened.

    This function uses a cache for past comparisons and the results,
    with cache entries invalidated if their stat information
    changes.  The cache may be cleared by calling clear_cache().

    """

    s1 = _sig(os.stat(f1))
    s2 = _sig(os.stat(f2))
    # Anything that is not a regular file never compares equal.
    if s1[0] != stat.S_IFREG or s2[0] != stat.S_IFREG:
        return False
    if shallow and s1 == s2:
        return True
    # Different sizes: the contents cannot match, so skip reading them.
    if s1[1] != s2[1]:
        return False

    # The signatures are part of the key, so an entry stops matching as soon
    # as either file's stat information changes.
    outcome = _cache.get((f1, f2, s1, s2))
    if outcome is None:
        outcome = _do_cmp(f1, f2)
        if len(_cache) > 100:      # limit the maximum size of the cache
            clear_cache()
        _cache[f1, f2, s1, s2] = outcome
    return outcome


def _sig(st):
    """Return the (file type, size, mtime) signature of a stat result."""
    return (stat.S_IFMT(st.st_mode),
            st.st_size,
            st.st_mtime)


def _do_cmp(f1, f2):
    """Return True if the two files have identical contents."""
    bufsize = BUFSIZE
    with open(f1, 'rb') as fp1, open(f2, 'rb') as fp2:
        while True:
            b1 = fp1.read(bufsize)
            b2 = fp2.read(bufsize)
            if b1 != b2:
                return False
            if not b1:
                # Both files hit EOF together without a mismatch.
                return True


# Directory comparison class.
#
class dircmp:
    """A class that manages the comparison of 2 directories.

    dircmp(a, b, ignore=None, hide=None, *, shallow=True)
      A and B are directories.
      IGNORE is a list of names to ignore,
        defaults to DEFAULT_IGNORES.
      HIDE is a list of names to hide,
        defaults to [os.curdir, os.pardir].
      SHALLOW specifies whether to just check the stat signature (do not read
        the files).
        defaults to True.

    High level usage:
      x = dircmp(dir1, dir2)
      x.report() -> prints a report on the differences between dir1 and dir2
       or
      x.report_partial_closure() -> prints report on differences between dir1
            and dir2, and reports on common immediate subdirectories.
      x.report_full_closure() -> like report_partial_closure,
            but fully recursive.

    Attributes:
     left_list, right_list: The files in dir1 and dir2,
        filtered by hide and ignore.
     common: a list of names in both dir1 and dir2.
     left_only, right_only: names only in dir1, dir2.
     common_dirs: subdirectories in both dir1 and dir2.
     common_files: files in both dir1 and dir2.
     common_funny: names in both dir1 and dir2 where the type differs between
        dir1 and dir2, or the name is not stat-able.
     same_files: list of identical files.
     diff_files: list of filenames which differ.
     funny_files: list of files which could not be compared.
     subdirs: a dictionary of dircmp instances (or MyDirCmp instances if this
       object is of type MyDirCmp, a subclass of dircmp), keyed by names
       in common_dirs.

    The attributes above are computed lazily: the first access runs the
    phase method listed for that name in ``methodmap``.
    """

    def __init__(self, a, b, ignore=None, hide=None, *, shallow=True):
        """Store the two directory names and the comparison options."""
        self.left = a
        self.right = b
        if hide is None:
            self.hide = [os.curdir, os.pardir]  # Names never to be shown
        else:
            self.hide = hide
        if ignore is None:
            self.ignore = DEFAULT_IGNORES
        else:
            self.ignore = ignore
        self.shallow = shallow

    def phase0(self):  # Compare everything except common subdirectories
        """List both directories, dropping hidden and ignored names."""
        self.left_list = _filter(os.listdir(self.left),
                                 self.hide+self.ignore)
        self.right_list = _filter(os.listdir(self.right),
                                  self.hide+self.ignore)
        self.left_list.sort()
        self.right_list.sort()

    def phase1(self):  # Compute common names
        """Split the listings into common, left-only and right-only names."""
        # Names are matched on their normcase'd form; the values keep the
        # original spelling from each side.
        a = dict(zip(map(os.path.normcase, self.left_list), self.left_list))
        b = dict(zip(map(os.path.normcase, self.right_list), self.right_list))
        self.common = list(map(a.__getitem__, filter(b.__contains__, a)))
        self.left_only = list(map(a.__getitem__, filterfalse(b.__contains__, a)))
        self.right_only = list(map(b.__getitem__, filterfalse(a.__contains__, b)))

    def phase2(self):  # Distinguish files, directories, funnies
        """Classify each common name as a directory, a file or a funny."""
        self.common_dirs = []
        self.common_files = []
        self.common_funny = []

        for x in self.common:
            a_path = os.path.join(self.left, x)
            b_path = os.path.join(self.right, x)

            try:
                a_stat = os.stat(a_path)
                b_stat = os.stat(b_path)
            except OSError:
                # A name that cannot be stat'ed on either side is "funny".
                self.common_funny.append(x)
                continue

            a_type = stat.S_IFMT(a_stat.st_mode)
            b_type = stat.S_IFMT(b_stat.st_mode)
            if a_type != b_type:
                self.common_funny.append(x)
            elif stat.S_ISDIR(a_type):
                self.common_dirs.append(x)
            elif stat.S_ISREG(a_type):
                self.common_files.append(x)
            else:
                self.common_funny.append(x)

    def phase3(self):  # Find out differences between common files
        """Compare the common files, honouring self.shallow."""
        xx = cmpfiles(self.left, self.right, self.common_files, self.shallow)
        self.same_files, self.diff_files, self.funny_files = xx

    def phase4(self):  # Find out differences between common subdirectories
        """Create a comparison object for every common subdirectory."""
        # A new dircmp (or MyDirCmp if dircmp was subclassed) object is created
        # for each common subdirectory,
        # these are stored in a dictionary indexed by filename.
        # The hide and ignore properties are inherited from the parent
        self.subdirs = {}
        # ``shallow`` is forwarded only when it differs from the default, so
        # subclasses whose __init__ still takes just (a, b, ignore, hide)
        # keep working.
        extra = {} if self.shallow else {'shallow': self.shallow}
        for x in self.common_dirs:
            a_x = os.path.join(self.left, x)
            b_x = os.path.join(self.right, x)
            self.subdirs[x] = self.__class__(a_x, b_x, self.ignore, self.hide,
                                             **extra)

    def phase4_closure(self):  # Recursively call phase4() on subdirectories
        """Run phase4() on this object and on all nested subdirectories."""
        self.phase4()
        for sd in self.subdirs.values():
            sd.phase4_closure()

    def report(self):  # Print a report on the differences between a and b
        """Print a summary of this comparison (sorts the lists in place)."""
        # Output format is purposely lousy
        print('diff', self.left, self.right)
        if self.left_only:
            self.left_only.sort()
            print('Only in', self.left, ':', self.left_only)
        if self.right_only:
            self.right_only.sort()
            print('Only in', self.right, ':', self.right_only)
        if self.same_files:
            self.same_files.sort()
            print('Identical files :', self.same_files)
        if self.diff_files:
            self.diff_files.sort()
            print('Differing files :', self.diff_files)
        if self.funny_files:
            self.funny_files.sort()
            print('Trouble with common files :', self.funny_files)
        if self.common_dirs:
            self.common_dirs.sort()
            print('Common subdirectories :', self.common_dirs)
        if self.common_funny:
            self.common_funny.sort()
            print('Common funny cases :', self.common_funny)

    def report_partial_closure(self):  # Print reports on self and on subdirs
        """Print report() for this object and each immediate subdirectory."""
        self.report()
        for sd in self.subdirs.values():
            print()
            sd.report()

    def report_full_closure(self):  # Report on self and subdirs recursively
        """Print report() for this object and, recursively, all subdirs."""
        self.report()
        for sd in self.subdirs.values():
            print()
            sd.report_full_closure()

    # Maps each lazily computed attribute to the phase that produces it.
    methodmap = dict(subdirs=phase4,
                     same_files=phase3, diff_files=phase3, funny_files=phase3,
                     common_dirs=phase2, common_files=phase2, common_funny=phase2,
                     common=phase1, left_only=phase1, right_only=phase1,
                     left_list=phase0, right_list=phase0)

    def __getattr__(self, attr):
        """Compute a lazy attribute on first access by running its phase."""
        if attr not in self.methodmap:
            raise AttributeError(attr)
        self.methodmap[attr](self)
        # The phase has now stored the value on the instance, so this lookup
        # no longer reaches __getattr__.
        return getattr(self, attr)

    __class_getitem__ = classmethod(GenericAlias)


def cmpfiles(a, b, common, shallow=True):
    """Compare common files in two directories.

    a, b -- directory names
    common -- list of file names found in both directories
    shallow -- if true, do comparison based solely on stat() information

    Returns a tuple of three lists:
      files that compare equal
      files that are different
      filenames that aren't regular files.

    """
    res = ([], [], [])
    for x in common:
        ax = os.path.join(a, x)
        bx = os.path.join(b, x)
        # _cmp() returns 0, 1 or 2, which selects the matching result list.
        res[_cmp(ax, bx, shallow)].append(x)
    return res


# Compare two files.
# Return:
#       0 for equal
#       1 for different
#       2 for funny cases (can't stat, etc.)
#
def _cmp(a, b, sh, abs=abs, cmp=cmp):
    try:
        # cmp() is True for equal files, so negating it yields False (0) for
        # "equal" and True (1) for "different".
        return not abs(cmp(a, b, sh))
    except OSError:
        return 2


# Return a copy with items that occur in skip removed.
#
def _filter(flist, skip):
    return list(filterfalse(skip.__contains__, flist))


# Demonstration and testing.
#
def demo():
    """Compare the two directories named on the command line (-r: recurse)."""
    import sys
    import getopt
    options, args = getopt.getopt(sys.argv[1:], 'r')
    if len(args) != 2:
        raise getopt.GetoptError('need exactly two args', None)
    dd = dircmp(args[0], args[1])
    if ('-r', '') in options:
        dd.report_full_closure()
    else:
        dd.report()


if __name__ == '__main__':
    demo()