# Copyright 2024 The Pigweed Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
"""Utilities for file collection in a repository."""

import argparse
from collections import Counter, defaultdict
import logging
import os
from pathlib import Path
import re
from typing import Any, Collection, Iterable, Pattern, Sequence

from pw_cli.tool_runner import ToolRunner
from pw_cli.file_filter import exclude_paths
from pw_cli.git_repo import (
    describe_git_pattern,
    find_git_repo,
    GitError,
    TRACKING_BRANCH_ALIAS,
)
from pw_cli.plural import plural


_LOG = logging.getLogger(__name__)


def add_file_collection_arguments(parser: argparse.ArgumentParser) -> None:
    """Adds arguments required by ``collect_files()``.

    Registers the following on ``parser``:

    * ``paths``: zero or more positional Git pathspecs.
    * ``-b``/``--base``: the commit to diff against; defaults to
      ``TRACKING_BRANCH_ALIAS``.
    * ``--all``/``--full``: stores ``None`` into ``base``. It is mutually
      exclusive with ``--base``.
    * ``-e``/``--exclude``: repeatable; each value is compiled with
      ``re.compile`` and appended to a list.

    Args:
        parser: The argument parser to which the arguments are added.
    """

    parser.add_argument(
        'paths',
        metavar='pathspec',
        nargs='*',
        help=(
            'Paths or patterns to which to restrict the checks. These are '
            'interpreted as Git pathspecs. If --base is provided, only '
            'paths changed since that commit are checked.'
        ),
    )

    # --base and --all both write to ``base``, so only one may be given.
    base = parser.add_mutually_exclusive_group()
    base.add_argument(
        '-b',
        '--base',
        metavar='commit',
        default=TRACKING_BRANCH_ALIAS,
        help=(
            'Git revision against which to diff for changed files. '
            'Default is the tracking branch of the current branch: '
            f'{TRACKING_BRANCH_ALIAS}'
        ),
    )

    base.add_argument(
        '--all',
        '--full',
        dest='base',
        action='store_const',
        const=None,
        help='Run actions for all files, not just changed files.',
    )

    parser.add_argument(
        '-e',
        '--exclude',
        metavar='regular_expression',
        default=[],
        action='append',
        type=re.compile,  # type: ignore[arg-type]
        help=(
            'Exclude paths matching any of these regular expressions, '
            "which are interpreted relative to each Git repository's root."
        ),
    )


def collect_files_in_current_repo(
    pathspecs: Collection[Path | str],
    tool_runner: ToolRunner,
    modified_since_git_ref: str | None = None,
    exclude_patterns: Collection[Pattern] = tuple(),
    action_flavor_text: str = 'Collecting',
) -> Sequence[Path]:
    """Collects files matching a variety of pathspecs in the current repo.

    This is a relatively fuzzy file finder for projects tracked in a Git repo.
    It's designed to adhere to the following constraints:

      * If a pathspec is a real file, unconditionally return it.
      * If no pathspecs are passed, collect from the current working directory.
      * Return the path of any files modified since `modified_since_git_ref`
        (which may be a branch, tag, or commit) that match the provided
        pathspecs.
      * Passing no pathspecs has the same behavior as passing `.` (everything in
        the current directory).

    Args:
        pathspecs: Files or git pathspecs to collect files from. Wildcards (e.g.
          `pw_cl*`) are accepted.
        tool_runner: The ToolRunner to use for Git operations.
        modified_since_git_ref: If the passed pathspec is tracked by a git repo,
          it is excluded if unmodified since the specified Git ref. If the
          ref is `None`, no files are excluded. If the ref is
          `TRACKING_BRANCH_ALIAS` and the repo has no tracking branch,
          `HEAD~1` is used instead and a warning is logged.
        exclude_patterns: A collection of exclude patterns to exclude from the
          set of files listed by Git. Pathspecs that name existing files are
          not subject to these patterns.
        action_flavor_text: Replaces "Collecting" in the
          "Collecting all files in the foo repo" log message with a
          personalized string (e.g. "Formatting all files...").

    Returns:
        A sequence of paths. When the current working directory is inside a
        Git repo, this is the sorted, de-duplicated union of the files listed
        by Git (after applying `exclude_patterns`) and the resolved paths of
        any pathspecs that are existing files. Outside of a Git repo, only the
        resolved paths of pathspecs that are existing files are returned, in
        the order they were provided.
    """
    # TODO: https://pwbug.dev/391690594 - This is brittle and not covered by
    # tests. Someday it should be re-thought, particularly to better handle
    # multi-repo setups.
    files = [Path(path).resolve() for path in pathspecs if Path(path).is_file()]
    try:
        current_repo = find_git_repo(Path.cwd(), tool_runner)
    except GitError:
        current_repo = None

    # If this is a Git repo, list the original paths with git ls-files or diff.
    if current_repo is not None:
        # Implement a graceful fallback in case the tracking branch isn't
        # available.
        if (
            modified_since_git_ref == TRACKING_BRANCH_ALIAS
            and not current_repo.tracking_branch()
        ):
            _LOG.warning(
                'Failed to determine the tracking branch, using --base HEAD~1 '
                'instead of listing all files'
            )
            modified_since_git_ref = 'HEAD~1'

        _LOG.info(
            '%s %s',
            action_flavor_text,
            describe_git_pattern(
                Path.cwd(),
                modified_since_git_ref,
                pathspecs,
                exclude_patterns,
                tool_runner,
                current_repo.root(),
            ),
        )

        # Add files from Git and remove duplicates.
        files = sorted(
            set(
                exclude_paths(
                    exclude_patterns,
                    current_repo.list_files(modified_since_git_ref, pathspecs),
                )
            )
            | set(files)
        )
    elif modified_since_git_ref:
        # Not fatal: the explicitly named files collected above are still
        # returned.
        _LOG.critical(
            'A base commit may only be provided if running from a Git repo'
        )

    return files


def file_summary(
    paths: Iterable[Path],
    levels: int = 2,
    max_lines: int = 12,
    max_types: int = 3,
    pad: str = ' ',
    pad_start: str = ' ',
    pad_end: str = ' ',
) -> list[str]:
    """Summarizes a list of files by the file types in each directory.

    Args:
        paths: The file paths to summarize.
        levels: How many of a path's outermost ancestors (counting the anchor,
          such as `.` or `/`) to keep when choosing the directory under which
          the path is grouped. Values below 1 behave like 1.
        max_lines: Maximum number of lines to produce. If there are more
          directories than this, the directories with the fewest files are
          merged into a single "(N others)" line.
        max_types: Maximum number of file extensions to list per directory;
          the remainder is reported as "other".
        pad: Fill character used to align the directory column.
        pad_start: Text placed immediately after the directory name.
        pad_end: Text placed between the directory column and the file count.

    Returns:
        One formatted line per directory (or per condensed group), each
        giving the directory, its total file count and, when any files have
        extensions, a per-extension breakdown.
    """

    # Count the file types in each directory.
    all_counts: dict[Any, Counter] = defaultdict(Counter)

    for path in paths:
        parents = path.parents
        depth = len(parents)
        if depth:
            # Clamp the index so that shallow paths and levels < 1 select a
            # valid ancestor instead of raising IndexError.
            parent = parents[min(max(depth - levels, 0), depth - 1)]
        else:
            # Paths such as `.` or a filesystem root have no parents; group
            # them under themselves.
            parent = path
        all_counts[parent][path.suffix] += 1

    # If there are too many lines, condense directories with the fewest files.
    if len(all_counts) > max_lines:
        counts = sorted(
            all_counts.items(), key=lambda item: -sum(item[1].values())
        )
        counts, others = (
            sorted(counts[: max_lines - 1]),
            counts[max_lines - 1 :],
        )
        counts.append(
            (
                f'({plural(others, "other")})',
                sum((c for _, c in others), Counter()),
            )
        )
    else:
        counts = sorted(all_counts.items())

    width = max(len(str(d)) + len(os.sep) for d, _ in counts) if counts else 0
    width += len(pad_start)

    # Prepare the output.
    output = []
    for path, files in counts:
        # The total is taken before no-extension files are dropped, so they
        # still count towards the directory total and the "other" bucket.
        total = sum(files.values())
        files.pop('', None)  # Never display no-extension files individually.

        if files:
            extensions = files.most_common(max_types)
            other_extensions = total - sum(count for _, count in extensions)
            if other_extensions:
                extensions.append(('other', other_extensions))

            types = ' (' + ', '.join(f'{c} {e}' for e, c in extensions) + ')'
        else:
            types = ''

        root = f'{path}{os.sep}{pad_start}'.ljust(width, pad)
        output.append(f'{root}{pad_end}{plural(total, "file")}{types}')

    return output