"""Utilities for comparing files and directories.

Classes:
    dircmp

Functions:
    cmp(f1, f2, shallow=1) -> bool
    cmpfiles(a, b, common, shallow=1) -> ([], [], [])

"""

import os
import stat
try:
    # Python 2 only.  Nothing below uses these any more; the import is kept
    # so code that reached the names through this module keeps working.
    from itertools import ifilter, ifilterfalse, imap, izip
except ImportError:
    # Python 3: the lazy i* variants no longer exist.
    pass

__all__ = ["cmp", "dircmp", "cmpfiles"]

# Maps (f1, f2, sig1, sig2) -> result of a previous content comparison.
_cache = {}
# Chunk size, in bytes, used when comparing file contents.
BUFSIZE = 8*1024


def cmp(f1, f2, shallow=1):
    """Compare two files.

    Arguments:

    f1 -- First file name

    f2 -- Second file name

    shallow -- Just check stat signature (do not read the files).
               defaults to 1.

    Return value:

    True if the files are the same, False otherwise.  False is also
    returned when either name is not a regular file.

    Raises os.error if either file cannot be stat'ed.

    This function uses a cache for past comparisons and the results,
    with a cache invalidation mechanism relying on stale signatures.

    """

    s1 = _sig(os.stat(f1))
    s2 = _sig(os.stat(f2))
    if s1[0] != stat.S_IFREG or s2[0] != stat.S_IFREG:
        return False
    if shallow and s1 == s2:
        return True
    # Different sizes can never have equal contents; skip reading.
    if s1[1] != s2[1]:
        return False

    # The signatures are part of the key, so an entry recorded before either
    # file changed simply never matches again.
    outcome = _cache.get((f1, f2, s1, s2))
    if outcome is None:
        outcome = _do_cmp(f1, f2)
        if len(_cache) > 100:      # limit the maximum size of the cache
            _cache.clear()
        _cache[f1, f2, s1, s2] = outcome
    return outcome


def _sig(st):
    """Return the (file type, size, mtime) signature of a stat result."""
    return (stat.S_IFMT(st.st_mode),
            st.st_size,
            st.st_mtime)


def _do_cmp(f1, f2):
    """Return True if the two files have identical contents.

    Both files are read in binary mode, BUFSIZE bytes at a time.
    """
    bufsize = BUFSIZE
    with open(f1, 'rb') as fp1, open(f2, 'rb') as fp2:
        while True:
            b1 = fp1.read(bufsize)
            b2 = fp2.read(bufsize)
            if b1 != b2:
                return False
            if not b1:
                # Both files hit EOF together without a mismatch.
                return True


# Directory comparison class.
#
class dircmp:
    """A class that manages the comparison of 2 directories.

    dircmp(a,b,ignore=None,hide=None)
      A and B are directories.
      IGNORE is a list of names to ignore,
        defaults to ['RCS', 'CVS', 'tags'].
      HIDE is a list of names to hide,
        defaults to [os.curdir, os.pardir].

    High level usage:
      x = dircmp(dir1, dir2)
      x.report() -> prints a report on the differences between dir1 and dir2
       or
      x.report_partial_closure() -> prints report on differences between dir1
            and dir2, and reports on common immediate subdirectories.
      x.report_full_closure() -> like report_partial_closure,
            but fully recursive.

    Attributes:
     left_list, right_list: The files in dir1 and dir2,
        filtered by hide and ignore.
     common: a list of names in both dir1 and dir2.
     left_only, right_only: names only in dir1, dir2.
     common_dirs: subdirectories in both dir1 and dir2.
     common_files: files in both dir1 and dir2.
     common_funny: names in both dir1 and dir2 where the type differs between
        dir1 and dir2, or the name is not stat-able.
     same_files: list of identical files.
     diff_files: list of filenames which differ.
     funny_files: list of files which could not be compared.
     subdirs: a dictionary of dircmp objects, keyed by names in common_dirs.

    The attributes are computed lazily: the first access to one of them runs
    the phase that produces it (see methodmap and __getattr__).
    """

    def __init__(self, a, b, ignore=None, hide=None):  # Initialize
        self.left = a
        self.right = b
        if hide is None:
            self.hide = [os.curdir, os.pardir]  # Names never to be shown
        else:
            self.hide = hide
        if ignore is None:
            self.ignore = ['RCS', 'CVS', 'tags']  # Names ignored in comparison
        else:
            self.ignore = ignore

    def phase0(self):  # Compare everything except common subdirectories
        skip = self.hide + self.ignore
        self.left_list = _filter(os.listdir(self.left), skip)
        self.right_list = _filter(os.listdir(self.right), skip)
        self.left_list.sort()
        self.right_list.sort()

    def phase1(self):  # Compute common names
        # Names are matched on their normcase()d form, but the original
        # spelling is what gets reported.
        a = dict((os.path.normcase(name), name) for name in self.left_list)
        b = dict((os.path.normcase(name), name) for name in self.right_list)
        # Real lists (not lazy map objects): report() sorts them in place.
        self.common = [a[key] for key in a if key in b]
        self.left_only = [a[key] for key in a if key not in b]
        self.right_only = [b[key] for key in b if key not in a]

    def phase2(self):  # Distinguish files, directories, funnies
        self.common_dirs = []
        self.common_files = []
        self.common_funny = []

        for x in self.common:
            a_path = os.path.join(self.left, x)
            b_path = os.path.join(self.right, x)

            try:
                a_type = stat.S_IFMT(os.stat(a_path).st_mode)
                b_type = stat.S_IFMT(os.stat(b_path).st_mode)
            except os.error:
                # Not stat-able on at least one side.
                self.common_funny.append(x)
                continue

            if a_type != b_type:
                self.common_funny.append(x)
            elif stat.S_ISDIR(a_type):
                self.common_dirs.append(x)
            elif stat.S_ISREG(a_type):
                self.common_files.append(x)
            else:
                self.common_funny.append(x)

    def phase3(self):  # Find out differences between common files
        xx = cmpfiles(self.left, self.right, self.common_files)
        self.same_files, self.diff_files, self.funny_files = xx

    def phase4(self):  # Find out differences between common subdirectories
        # A new dircmp object is created for each common subdirectory,
        # these are stored in a dictionary indexed by filename.
        # The hide and ignore properties are inherited from the parent.
        # self.__class__ (rather than dircmp) is used so that subclasses
        # keep their overridden behaviour when recursing.
        self.subdirs = {}
        for x in self.common_dirs:
            a_x = os.path.join(self.left, x)
            b_x = os.path.join(self.right, x)
            self.subdirs[x] = self.__class__(a_x, b_x, self.ignore, self.hide)

    def phase4_closure(self):  # Recursively call phase4() on subdirectories
        self.phase4()
        for sd in self.subdirs.values():
            sd.phase4_closure()

    def report(self):  # Print a report on the differences between a and b
        # Output format is purposely lousy
        # Each line is formatted into one string so that print(...) emits
        # the same text under Python 2 and Python 3.
        print('diff %s %s' % (self.left, self.right))
        if self.left_only:
            self.left_only.sort()
            print('Only in %s : %s' % (self.left, self.left_only))
        if self.right_only:
            self.right_only.sort()
            print('Only in %s : %s' % (self.right, self.right_only))
        if self.same_files:
            self.same_files.sort()
            print('Identical files : %s' % (self.same_files,))
        if self.diff_files:
            self.diff_files.sort()
            print('Differing files : %s' % (self.diff_files,))
        if self.funny_files:
            self.funny_files.sort()
            print('Trouble with common files : %s' % (self.funny_files,))
        if self.common_dirs:
            self.common_dirs.sort()
            print('Common subdirectories : %s' % (self.common_dirs,))
        if self.common_funny:
            self.common_funny.sort()
            print('Common funny cases : %s' % (self.common_funny,))

    def report_partial_closure(self):  # Print reports on self and on subdirs
        self.report()
        for sd in self.subdirs.values():
            print('')
            sd.report()

    def report_full_closure(self):  # Report on self and subdirs recursively
        self.report()
        for sd in self.subdirs.values():
            print('')
            sd.report_full_closure()

    # Which phase computes which lazily-evaluated attribute.
    methodmap = dict(subdirs=phase4,
                     same_files=phase3, diff_files=phase3, funny_files=phase3,
                     common_dirs=phase2, common_files=phase2, common_funny=phase2,
                     common=phase1, left_only=phase1, right_only=phase1,
                     left_list=phase0, right_list=phase0)

    def __getattr__(self, attr):
        # Only reached when normal lookup fails, i.e. the attribute has not
        # been computed yet: run the phase that sets it, then fetch it.
        if attr not in self.methodmap:
            raise AttributeError(attr)
        self.methodmap[attr](self)
        return getattr(self, attr)


def cmpfiles(a, b, common, shallow=1):
    """Compare common files in two directories.

    a, b -- directory names
    common -- list of file names found in both directories
    shallow -- if true, do comparison based solely on stat() information

    Returns a tuple of three lists:
      files that compare equal
      files that are different
      filenames that aren't regular files.

    """
    res = ([], [], [])
    for x in common:
        ax = os.path.join(a, x)
        bx = os.path.join(b, x)
        # _cmp() yields 0, 1 or 2, which selects the matching result list.
        res[_cmp(ax, bx, shallow)].append(x)
    return res


# Compare two files.
# Return:
#       0 for equal
#       1 for different
#       2 for funny cases (can't stat, etc.)
#
def _cmp(a, b, sh, abs=abs, cmp=cmp):
    try:
        return not abs(cmp(a, b, sh))
    except (os.error, IOError):
        return 2


# Return a copy with items that occur in skip removed.
#
def _filter(flist, skip):
    return [item for item in flist if item not in skip]


# Demonstration and testing.
#
def demo():
    """Compare the two directories named on the command line.

    With -r the report recurses into common subdirectories.
    """
    import sys
    import getopt
    options, args = getopt.getopt(sys.argv[1:], 'r')
    if len(args) != 2:
        raise getopt.GetoptError('need exactly two args', None)
    dd = dircmp(args[0], args[1])
    if ('-r', '') in options:
        dd.report_full_closure()
    else:
        dd.report()


if __name__ == '__main__':
    demo()