"""Utilities for comparing files and directories.

Classes:
    dircmp

Functions:
    cmp(f1, f2, shallow=1) -> bool
    cmpfiles(a, b, common, shallow=1) -> ([], [], [])

"""

import os
import stat

try:
    from itertools import ifilter, ifilterfalse, imap, izip
except ImportError:
    # Python 3: filter/map/zip are already lazy, and ifilterfalse was
    # renamed to filterfalse.  Keep the historical names importable.
    from itertools import filterfalse as ifilterfalse
    ifilter, imap, izip = filter, map, zip

__all__ = ["cmp", "dircmp", "cmpfiles"]

# Maps (f1, f2) -> (signature of f1, signature of f2, comparison outcome).
_cache = {}
# Once the cache holds more than this many entries it is emptied, so that a
# long-running process comparing many distinct pairs cannot grow it forever.
_CACHE_LIMIT = 100
BUFSIZE = 8 * 1024


def cmp(f1, f2, shallow=1):
    """Compare two files.

    Arguments:

    f1 -- First file name

    f2 -- Second file name

    shallow -- Just check stat signature (do not read the files).
               defaults to 1.

    Return value:

    True if the files are the same, False otherwise.  False is also
    returned when either name does not refer to a regular file.

    Raises whatever os.stat() raises (os.error) when a file cannot be
    stat-ed.

    This function uses a cache for past comparisons and the results,
    with a cache invalidation mechanism relying on stale signatures.
    The cache is emptied whenever it grows past a fixed number of entries.

    """

    s1 = _sig(os.stat(f1))
    s2 = _sig(os.stat(f2))
    if s1[0] != stat.S_IFREG or s2[0] != stat.S_IFREG:
        return False
    if shallow and s1 == s2:
        return True
    if s1[1] != s2[1]:
        # Different sizes: the contents cannot be equal.
        return False

    cached = _cache.get((f1, f2))
    if cached and (s1, s2) == cached[:2]:
        # Both signatures are unchanged since the stored comparison.
        return cached[2]
    outcome = _do_cmp(f1, f2)
    if len(_cache) > _CACHE_LIMIT:
        _cache.clear()
    _cache[f1, f2] = s1, s2, outcome
    return outcome


def _sig(st):
    """Return the (file type, size, mtime) signature of a stat result."""
    return (stat.S_IFMT(st.st_mode),
            st.st_size,
            st.st_mtime)


def _do_cmp(f1, f2):
    """Compare the contents of two files, BUFSIZE bytes at a time."""
    bufsize = BUFSIZE
    with open(f1, 'rb') as fp1, open(f2, 'rb') as fp2:
        while True:
            b1 = fp1.read(bufsize)
            b2 = fp2.read(bufsize)
            if b1 != b2:
                return False
            if not b1:
                # Both files hit end-of-file together without a mismatch.
                return True


# Directory comparison class.
#
class dircmp:
    """A class that manages the comparison of 2 directories.

    dircmp(a,b,ignore=None,hide=None)
      A and B are directories.
      IGNORE is a list of names to ignore,
        defaults to ['RCS', 'CVS', 'tags'].
      HIDE is a list of names to hide,
        defaults to [os.curdir, os.pardir].

    High level usage:
      x = dircmp(dir1, dir2)
      x.report() -> prints a report on the differences between dir1 and dir2
       or
      x.report_partial_closure() -> prints report on differences between dir1
            and dir2, and reports on common immediate subdirectories.
      x.report_full_closure() -> like report_partial_closure,
            but fully recursive.

    Attributes:
     left_list, right_list: The files in dir1 and dir2,
        filtered by hide and ignore.
     common: a list of names in both dir1 and dir2.
     left_only, right_only: names only in dir1, dir2.
     common_dirs: subdirectories in both dir1 and dir2.
     common_files: files in both dir1 and dir2.
     common_funny: names in both dir1 and dir2 where the type differs between
        dir1 and dir2, or the name is not stat-able.
     same_files: list of identical files.
     diff_files: list of filenames which differ.
     funny_files: list of files which could not be compared.
     subdirs: a dictionary of dircmp objects, keyed by names in common_dirs.

    The attributes are computed lazily: the first access to one of them
    runs the phase method that produces it (see methodmap/__getattr__).
    """

    def __init__(self, a, b, ignore=None, hide=None):  # Initialize
        self.left = a
        self.right = b
        if hide is None:
            self.hide = [os.curdir, os.pardir]  # Names never to be shown
        else:
            self.hide = hide
        if ignore is None:
            self.ignore = ['RCS', 'CVS', 'tags']  # Names ignored in comparison
        else:
            self.ignore = ignore

    def phase0(self):  # Compare everything except common subdirectories
        skip = self.hide + self.ignore
        self.left_list = _filter(os.listdir(self.left), skip)
        self.right_list = _filter(os.listdir(self.right), skip)
        self.left_list.sort()
        self.right_list.sort()

    def phase1(self):  # Compute common names
        # Key each listing by its case-normalized name so that the
        # comparison follows the platform's os.path.normcase rules.
        a = dict(izip(imap(os.path.normcase, self.left_list), self.left_list))
        b = dict(izip(imap(os.path.normcase, self.right_list), self.right_list))
        # list() keeps these attributes real lists where map() is lazy.
        self.common = list(map(a.__getitem__, ifilter(b.__contains__, a)))
        self.left_only = list(map(a.__getitem__,
                                  ifilterfalse(b.__contains__, a)))
        self.right_only = list(map(b.__getitem__,
                                   ifilterfalse(a.__contains__, b)))

    def phase2(self):  # Distinguish files, directories, funnies
        self.common_dirs = []
        self.common_files = []
        self.common_funny = []

        for x in self.common:
            a_path = os.path.join(self.left, x)
            b_path = os.path.join(self.right, x)

            try:
                a_type = stat.S_IFMT(os.stat(a_path).st_mode)
                b_type = stat.S_IFMT(os.stat(b_path).st_mode)
            except os.error:
                # A name that cannot be stat-ed on either side is "funny".
                self.common_funny.append(x)
                continue

            if a_type != b_type:
                self.common_funny.append(x)
            elif stat.S_ISDIR(a_type):
                self.common_dirs.append(x)
            elif stat.S_ISREG(a_type):
                self.common_files.append(x)
            else:
                self.common_funny.append(x)

    def phase3(self):  # Find out differences between common files
        xx = cmpfiles(self.left, self.right, self.common_files)
        self.same_files, self.diff_files, self.funny_files = xx

    def phase4(self):  # Find out differences between common subdirectories
        # A new dircmp object is created for each common subdirectory,
        # these are stored in a dictionary indexed by filename.
        # The hide and ignore properties are inherited from the parent
        self.subdirs = {}
        for x in self.common_dirs:
            a_x = os.path.join(self.left, x)
            b_x = os.path.join(self.right, x)
            self.subdirs[x] = dircmp(a_x, b_x, self.ignore, self.hide)

    def phase4_closure(self):  # Recursively call phase4() on subdirectories
        self.phase4()
        for sd in self.subdirs.values():
            sd.phase4_closure()

    def report(self):  # Print a report on the differences between a and b
        # Output format is purposely lousy.
        # Each non-empty result list is sorted in place before printing.
        print('diff %s %s' % (self.left, self.right))
        if self.left_only:
            self.left_only.sort()
            print('Only in %s : %s' % (self.left, self.left_only))
        if self.right_only:
            self.right_only.sort()
            print('Only in %s : %s' % (self.right, self.right_only))
        if self.same_files:
            self.same_files.sort()
            print('Identical files : %s' % (self.same_files,))
        if self.diff_files:
            self.diff_files.sort()
            print('Differing files : %s' % (self.diff_files,))
        if self.funny_files:
            self.funny_files.sort()
            print('Trouble with common files : %s' % (self.funny_files,))
        if self.common_dirs:
            self.common_dirs.sort()
            print('Common subdirectories : %s' % (self.common_dirs,))
        if self.common_funny:
            self.common_funny.sort()
            print('Common funny cases : %s' % (self.common_funny,))

    def report_partial_closure(self):  # Print reports on self and on subdirs
        self.report()
        for sd in self.subdirs.values():
            print('')
            sd.report()

    def report_full_closure(self):  # Report on self and subdirs recursively
        self.report()
        for sd in self.subdirs.values():
            print('')
            sd.report_full_closure()

    # Which phase method computes each lazily evaluated attribute.
    methodmap = dict(subdirs=phase4,
                     same_files=phase3, diff_files=phase3, funny_files=phase3,
                     common_dirs=phase2, common_files=phase2,
                     common_funny=phase2,
                     common=phase1, left_only=phase1, right_only=phase1,
                     left_list=phase0, right_list=phase0)

    def __getattr__(self, attr):
        # Only reached when normal lookup fails, i.e. the attribute has not
        # been computed yet: run the phase that sets it, then look it up again.
        if attr not in self.methodmap:
            raise AttributeError(attr)
        self.methodmap[attr](self)
        return getattr(self, attr)


def cmpfiles(a, b, common, shallow=1):
    """Compare common files in two directories.

    a, b -- directory names
    common -- list of file names found in both directories
    shallow -- if true, do comparison based solely on stat() information

    Returns a tuple of three lists:
      files that compare equal
      files that are different
      filenames that aren't regular files.

    """
    res = ([], [], [])
    for x in common:
        ax = os.path.join(a, x)
        bx = os.path.join(b, x)
        res[_cmp(ax, bx, shallow)].append(x)
    return res


# Compare two files.
# Return:
#       0 for equal
#       1 for different
#       2 for funny cases (can't stat, etc.)
#
def _cmp(a, b, sh, abs=abs, cmp=cmp):
    try:
        # int() makes the result an explicit 0/1 index rather than a bool.
        return int(not abs(cmp(a, b, sh)))
    except os.error:
        return 2


# Return a copy with items that occur in skip removed.
#
def _filter(flist, skip):
    return list(ifilterfalse(skip.__contains__, flist))


# Demonstration and testing.
#
def demo():
    import sys
    import getopt
    options, args = getopt.getopt(sys.argv[1:], 'r')
    if len(args) != 2:
        raise getopt.GetoptError('need exactly two args', None)
    dd = dircmp(args[0], args[1])
    if ('-r', '') in options:
        dd.report_full_closure()
    else:
        dd.report()


if __name__ == '__main__':
    demo()