#!/usr/bin/python3
#
# Copyright (C) 2021 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for comparing two versions of a codebase."""

import argparse
import difflib
import filecmp
import os
import pathlib
import re
import sys


class FileStat:
    """Size and diff statistics for one version of a file.

    Attributes:
      file_name: The file path as a string ('' for a placeholder).
      size: The file size in bytes (0 for a placeholder).
      line_cnt: Number of text lines; filled in by the caller.
      group_cnt: Number of diff groups touching this version.
      add_line_cnt: Number of '+ ' lines in the context diff.
      remove_line_cnt: Number of '- ' lines in the context diff.
      replace_line_cnt: Number of '! ' lines in the context diff.
    """

    NON_TEXT = 0
    TEXT = 1

    def __init__(self, file_path):
        """Initializes from a path object.

        Args:
          file_path: A pathlib.Path of an existing file, or None (any falsy
            value) to build an empty placeholder statistic.
        """
        if file_path:
            self.file_name = str(file_path)
            self.size = file_path.stat().st_size
        else:
            self.file_name = ''
            self.size = 0

        self.line_cnt = 0
        self.group_cnt = 0
        self.add_line_cnt = 0
        self.remove_line_cnt = 0
        self.replace_line_cnt = 0

    @staticmethod
    def get_csv_header(prefix=None):
        """Returns the CSV header, each column optionally '<prefix>_'-ed."""
        cols = ['file', 'size', 'line', 'group', 'add', 'remove', 'replace']
        if prefix:
            return ','.join('{0}_{1}'.format(prefix, c) for c in cols)
        return ','.join(cols)

    def get_csv_str(self, strip_dir_len=0):
        """Returns the statistics as one CSV record (no trailing newline).

        Args:
          strip_dir_len: Number of leading characters to drop from file_name.
        """
        name = self.file_name[strip_dir_len:]
        fields = [
            FileStat.no_comma(name), self.size, self.line_cnt, self.group_cnt,
            self.add_line_cnt, self.remove_line_cnt, self.replace_line_cnt
        ]
        return ','.join(str(field) for field in fields)

    @staticmethod
    def no_comma(astr):
        """Replaces ',' with '_' so the value cannot break a CSV record."""
        return astr.replace(',', '_')


class DiffStat:
    """Diff statistics for the old and new versions of one file."""

    SAME = 0
    NEW = 1
    REMOVED = 2
    MODIFIED = 3
    INCOMPARABLE = 4

    # These align with the output of difflib.context_diff, see
    # https://github.com/python/cpython/blob/3.9/Lib/difflib.py
    _OLD_FILE_HEADER = re.compile(r'\*{3} (.*)')
    _NEW_FILE_HEADER = re.compile(r'-{3} (.*)')
    # difflib prints a bare line number ('*** 5 ****') instead of
    # 'start,end' when a range spans zero or one line, so the ',end' part
    # has to be optional.
    _OLD_GROUP_HEADER = re.compile(r'\*{3} (\d+)(?:,(\d+))? \*{4}')
    _NEW_GROUP_HEADER = re.compile(r'-{3} (\d+)(?:,(\d+))? -{4}')
    # Two-character change markers mapped to the FileStat counter they bump.
    _LINE_COUNTERS = {
        '! ': 'replace_line_cnt',
        '+ ': 'add_line_cnt',
        '- ': 'remove_line_cnt',
    }

    def __init__(self, common_name, old_file_stat, new_file_stat, state):
        """Initializes with the common name, both FileStats and the state.

        Args:
          common_name: The file path relative to the compared directory.
          old_file_stat: FileStat of the old version.
          new_file_stat: FileStat of the new version.
          state: One of SAME, NEW, REMOVED, MODIFIED, INCOMPARABLE.
        """
        self.old_file_stat = old_file_stat
        self.new_file_stat = new_file_stat
        self.name = common_name
        self.ext = os.path.splitext(self.name)[1].lstrip('.')
        self.state = state
        self.file_type = FileStat.NON_TEXT

    def add_diff_stat(self, diff_lines):
        """Accumulates group and line counters from context diff lines.

        Args:
          diff_lines: The list of lines produced by difflib.context_diff for
            the two versions. An error is printed and nothing is counted when
            the two file header lines are missing or malformed.
        """
        if len(diff_lines) < 2:
            print('ERROR: incomplete diff header: %s' % diff_lines)
            return

        if not self._OLD_FILE_HEADER.match(diff_lines[0]):
            print('ERROR: wrong diff header line 1: %s' % diff_lines[0])
            return

        if not self._NEW_FILE_HEADER.match(diff_lines[1]):
            print('ERROR: wrong diff header line 2: %s' % diff_lines[1])
            return

        # The FileStat whose section is being read; None until the first
        # group header has been seen.
        current = None
        for line in diff_lines[2:]:
            if current is not None:
                marker = line[:2]
                if marker == '  ':
                    # Unchanged context line.
                    continue
                counter = self._LINE_COUNTERS.get(marker)
                if counter:
                    setattr(current, counter, getattr(current, counter) + 1)
                    continue

            if self._OLD_GROUP_HEADER.match(line):
                current = self.old_file_stat
                current.group_cnt += 1
            elif self._NEW_GROUP_HEADER.match(line):
                current = self.new_file_stat
                current.group_cnt += 1
            # Anything else (e.g. the '***************' group separator)
            # carries no statistics and is ignored.


class ChangeReport:
    """Change report for the diff statistics on 2 versions of a codebase.

    Attributes:
      old_dir: The absolute old codebase dir path string.
      new_dir: The absolute new codebase dir path string.
      dircmp: The filecmp.dircmp object comparing old_dir with new_dir.
    """

    def __init__(self, old_dir, new_dir, ignores=None, state_filter=None):
        """Initializes with old & new dir path strings.

        Args:
          old_dir: The old codebase dir path string.
          new_dir: The new codebase dir path string.
          ignores: Optional comma separated names to ignore, in addition to
            filecmp.DEFAULT_IGNORES.
          state_filter: Optional comma separated DiffStat state numbers to
            report; all states are reported when omitted.
        """
        self.old_dir = os.path.abspath(old_dir)
        self._old_dir_prefix_len = len(self.old_dir) + 1
        self.new_dir = os.path.abspath(new_dir)
        self._new_dir_prefix_len = len(self.new_dir) + 1

        # Always work on a copy so the module-level default list in filecmp
        # is never shared with (or modified through) this instance.
        self._ignores = ignores.split(',') if ignores else []
        self._ignores.extend(filecmp.DEFAULT_IGNORES)

        if state_filter:
            self._state_filter = [int(s) for s in state_filter.split(',')]
        else:
            self._state_filter = [
                DiffStat.SAME, DiffStat.NEW, DiffStat.REMOVED,
                DiffStat.MODIFIED, DiffStat.INCOMPARABLE
            ]

        self._do_same = DiffStat.SAME in self._state_filter
        self._do_new = DiffStat.NEW in self._state_filter
        self._do_removed = DiffStat.REMOVED in self._state_filter
        self._do_modified = DiffStat.MODIFIED in self._state_filter
        self._do_incomparable = DiffStat.INCOMPARABLE in self._state_filter

        self.dircmp = filecmp.dircmp(
            self.old_dir, self.new_dir, ignore=self._ignores)
        self._diff_stats = []
        self._diff_stat_lines = []
        self._diff_lines = []
        self._processed_cnt = 0
        self._common_dir_len = ChangeReport.get_common_path_len(
            self.old_dir, self.new_dir)

    @staticmethod
    def get_common_path_len(dir1, dir2):
        """Gets the length of the common leading path of two dir strings.

        The result is the index just past the last path separator inside the
        longest common character prefix of dir1 and dir2 (1 when that prefix
        holds no separator beyond index 0).
        """
        sep = os.path.sep
        last_sep_pos = 0
        # zip() stops at the shorter string, so neither one is over-indexed.
        for pos, (char1, char2) in enumerate(zip(dir1, dir2)):
            if char1 != char2:
                break
            if char1 == sep:
                last_sep_pos = pos
        return last_sep_pos + 1

    @staticmethod
    def get_diff_stat_header():
        """Gets the diff statistic CSV header line."""
        return 'file,ext,text,state,{0},{1}\n'.format(
            FileStat.get_csv_header('new'), FileStat.get_csv_header('old'))

    def get_diff_stat_lines(self):
        """Gets the diff statistic CSV lines, comparing the dirs if needed."""
        self._ensure_processed()

        self._diff_stat_lines = [
            '{0},{1},{2},{3},{4},{5}\n'.format(
                FileStat.no_comma(diff_stat.name), diff_stat.ext,
                diff_stat.file_type, diff_stat.state,
                diff_stat.new_file_stat.get_csv_str(self._common_dir_len),
                diff_stat.old_file_stat.get_csv_str(self._common_dir_len))
            for diff_stat in self._diff_stats
        ]
        return self._diff_stat_lines

    def get_diff_lines(self):
        """Gets the diff output lines, comparing the dirs if needed."""
        self._ensure_processed()
        return self._diff_lines

    def _ensure_processed(self):
        """Walks the directory comparison once, on first use."""
        if self._processed_cnt < 1:
            self._process_dircmp(self.dircmp)
            self._processed_cnt += 1

    def _process_dircmp(self, dircmp):
        """Compares all files in a dircmp object for diff statistics."""
        if self._do_modified:
            self._process_diff_files(dircmp)

        for subdir_dircmp in dircmp.subdirs.values():
            rp = pathlib.Path(subdir_dircmp.right)
            lp = pathlib.Path(subdir_dircmp.left)
            if rp.is_symlink() or lp.is_symlink():
                print('SKIP: symlink: {0} or {1}'.format(subdir_dircmp.right,
                                                         subdir_dircmp.left))
                continue
            self._process_dircmp(subdir_dircmp)

        if self._do_new:
            self._process_others(dircmp.right_only, dircmp.right,
                                 self._new_dir_prefix_len, DiffStat.NEW)
        if self._do_same:
            self._process_others(dircmp.same_files, dircmp.right,
                                 self._new_dir_prefix_len, DiffStat.SAME)
        if self._do_incomparable:
            self._process_others(dircmp.funny_files, dircmp.right,
                                 self._new_dir_prefix_len,
                                 DiffStat.INCOMPARABLE)
        if self._do_removed:
            self._process_others(dircmp.left_only, dircmp.left,
                                 self._old_dir_prefix_len, DiffStat.REMOVED)

    def _process_others(self, files, adir, prefix_len, state):
        """Records statistics for files that are not modified.

        Args:
          files: File names relative to adir, or absolute paths.
          adir: The directory the names in files belong to.
          prefix_len: Number of leading characters to strip from the full
            path to obtain the common (relative) name.
          state: The DiffStat state to record for every file.
        """
        empty_stat = FileStat(None)
        for file in files:
            # When `file` is already absolute (the recursive call below),
            # pathlib discards `adir` and keeps `file` unchanged.
            file_path = pathlib.Path(adir, file)
            if file_path.is_symlink():
                print('SKIP: symlink: {0}, {1}'.format(state, file_path))
                continue

            if file_path.is_dir():
                flist = self._get_filtered_files(file_path)
                self._process_others(flist, adir, prefix_len, state)
                continue

            file_stat = FileStat(file_path)
            common_name = str(file_path)[prefix_len:]
            if state == DiffStat.REMOVED:
                diff_stat = DiffStat(common_name, file_stat, empty_stat, state)
            else:
                diff_stat = DiffStat(common_name, empty_stat, file_stat, state)

            # A file that does not decode as UTF-8 is treated as non-text.
            try:
                with open(file_path, encoding='utf-8') as f:
                    file_stat.line_cnt = sum(1 for _ in f)
            except UnicodeDecodeError:
                diff_stat.file_type = FileStat.NON_TEXT
            else:
                diff_stat.file_type = FileStat.TEXT

            self._diff_stats.append(diff_stat)

    def _process_diff_files(self, dircmp):
        """Diffs every file that dircmp reports as different."""
        for file in dircmp.diff_files:
            old_file_path = pathlib.Path(dircmp.left, file)
            new_file_path = pathlib.Path(dircmp.right, file)
            self._diff_files(old_file_path, new_file_path)

    def _diff_files(self, old_file_path, new_file_path):
        """Diffs the old & new versions of a file and records the result."""
        old_file_stat = FileStat(old_file_path)
        new_file_stat = FileStat(new_file_path)
        common_name = str(new_file_path)[self._new_dir_prefix_len:]
        diff_stat = DiffStat(common_name, old_file_stat, new_file_stat,
                             DiffStat.MODIFIED)

        try:
            with open(old_file_path, encoding='utf-8') as f1:
                old_lines = f1.readlines()
            old_file_stat.line_cnt = len(old_lines)
            with open(new_file_path, encoding='utf-8') as f2:
                new_lines = f2.readlines()
            new_file_stat.line_cnt = len(new_lines)
        except UnicodeDecodeError:
            # Either version failing to decode makes the pair non-text; no
            # diff is produced for it.
            diff_stat.file_type = FileStat.NON_TEXT
        else:
            diff_stat.file_type = FileStat.TEXT
            diff_lines = list(
                difflib.context_diff(old_lines, new_lines, old_file_path.name,
                                     new_file_path.name))
            if diff_lines:
                self._diff_lines.extend(diff_lines)
                diff_stat.add_diff_stat(diff_lines)
            else:
                print('WARNING: no diff lines on {0} {1}'.format(
                    old_file_path, new_file_path))

        self._diff_stats.append(diff_stat)

    def _get_filtered_files(self, dir_path):
        """Returns the entries of dir_path that are not ignored or symlinks."""
        flist = []
        for entry in dir_path.glob('*'):
            if entry.name in self._ignores:
                continue
            if entry.is_symlink():
                print('SKIP: symlink: %s' % entry)
                continue
            flist.append(entry)
        return flist


def write_file(file, lines, header=None):
    """Writes lines into a UTF-8 text file.

    Args:
      file: The output file path.
      lines: The list of lines to write; each carries its own line ending.
      header: Optional text written before the lines.
    """
    # The compared sources are decoded as UTF-8, so the output uses the same
    # encoding instead of the platform default.
    with open(file, 'w', encoding='utf-8') as f:
        if header:
            f.write(header)

        f.writelines(lines)
    print('OUTPUT: {0}, {1} lines'.format(file, len(lines)))


def main():
    """Parses the command line and writes the requested report files."""
    parser = argparse.ArgumentParser(
        description='Generate a diff stat csv file for 2 versions of a '
        'codebase')
    parser.add_argument(
        '--old_dir', required=True, help='the old version codebase dir')
    parser.add_argument(
        '--new_dir', required=True, help='the new version codebase dir')
    parser.add_argument(
        '--csv_file',
        required=False,
        help='the diff stat csv file if to create')
    parser.add_argument(
        '--diff_output_file',
        required=False,
        help='the diff output file if to create')
    parser.add_argument(
        '--ignores',
        required=False,
        default='.repo,.git,.github,.idea,__MACOSX,.prebuilt_info',
        help='names to ignore')
    parser.add_argument(
        '--state_filter',
        required=False,
        default='1,2,3',
        help='csv diff states to process, 0:SAME, 1:NEW, 2:REMOVED, '
        '3:MODIFIED, 4:INCOMPARABLE')

    args = parser.parse_args()

    # sys.exit(str) prints the message to stderr and exits with status 1.
    if not os.path.isdir(args.old_dir):
        sys.exit('ERROR: %s does not exist.' % args.old_dir)

    if not os.path.isdir(args.new_dir):
        sys.exit('ERROR: %s does not exist.' % args.new_dir)

    change_report = ChangeReport(args.old_dir, args.new_dir, args.ignores,
                                 args.state_filter)
    if args.csv_file:
        write_file(
            args.csv_file,
            change_report.get_diff_stat_lines(),
            header=ChangeReport.get_diff_stat_header())

    if args.diff_output_file:
        write_file(args.diff_output_file, change_report.get_diff_lines())


if __name__ == '__main__':
    main()