• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright 2024 The Pigweed Authors
2#
3# Licensed under the Apache License, Version 2.0 (the "License"); you may not
4# use this file except in compliance with the License. You may obtain a copy of
5# the License at
6#
7#     https://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12# License for the specific language governing permissions and limitations under
13# the License.
14"""Utilities for file collection in a repository."""
15
16import argparse
17from collections import Counter, defaultdict
18import logging
19import os
20from pathlib import Path
21import re
22from typing import Any, Collection, Iterable, Pattern, Sequence
23
24from pw_cli.tool_runner import ToolRunner
25from pw_cli.file_filter import exclude_paths
26from pw_cli.git_repo import (
27    describe_git_pattern,
28    find_git_repo,
29    GitError,
30    TRACKING_BRANCH_ALIAS,
31)
32from pw_cli.plural import plural
33
34
35_LOG = logging.getLogger(__name__)
36
37
def add_file_collection_arguments(parser: argparse.ArgumentParser) -> None:
    """Registers the command-line options consumed by ``collect_files()``.

    Three groups of options are added:

      * A positional ``paths`` list (zero or more Git pathspecs).
      * A mutually exclusive ``--base`` / ``--all`` pair. Both store into the
        ``base`` destination; ``--all`` (alias ``--full``) stores ``None``.
      * A repeatable ``--exclude`` option; each value is compiled with
        ``re.compile`` and appended to a list.

    Args:
        parser: The parser that receives the new arguments.
    """
    paths_help = (
        'Paths or patterns to which to restrict the checks. These are '
        'interpreted as Git pathspecs. If --base is provided, only '
        'paths changed since that commit are checked.'
    )
    base_help = (
        'Git revision against which to diff for changed files. '
        'Default is the tracking branch of the current branch: '
        f'{TRACKING_BRANCH_ALIAS}'
    )
    all_help = 'Run actions for all files, not just changed files.'
    exclude_help = (
        'Exclude paths matching any of these regular expressions, '
        "which are interpreted relative to each Git repository's root."
    )

    parser.add_argument(
        'paths',
        metavar='pathspec',
        nargs='*',
        help=paths_help,
    )

    # --base and --all share a destination, so only one may be given.
    base_group = parser.add_mutually_exclusive_group()
    base_group.add_argument(
        '-b',
        '--base',
        metavar='commit',
        default=TRACKING_BRANCH_ALIAS,
        help=base_help,
    )
    base_group.add_argument(
        '--all',
        '--full',
        dest='base',
        action='store_const',
        const=None,
        help=all_help,
    )

    parser.add_argument(
        '-e',
        '--exclude',
        metavar='regular_expression',
        default=[],
        action='append',
        type=re.compile,  # type: ignore[arg-type]
        help=exclude_help,
    )
86
87
def collect_files_in_current_repo(
    pathspecs: Collection[Path | str],
    tool_runner: ToolRunner,
    modified_since_git_ref: str | None = None,
    exclude_patterns: Collection[Pattern] = tuple(),
    action_flavor_text: str = 'Collecting',
) -> Sequence[Path]:
    """Collects the files selected by a set of pathspecs.

    This is a relatively fuzzy file finder intended for projects tracked in a
    Git repo. It behaves as follows:

      * Every pathspec that names an existing file is resolved to an absolute
        path and always included in the result; ``exclude_patterns`` are not
        applied to these explicitly named files.
      * If the current working directory is inside a Git repo, the files
        reported by that repo's ``list_files(modified_since_git_ref,
        pathspecs)`` are filtered through ``exclude_patterns`` and merged with
        the explicitly named files. The merged result is de-duplicated and
        sorted.
      * If ``modified_since_git_ref`` is the tracking branch alias but the repo
        has no tracking branch, ``HEAD~1`` is used instead and a warning is
        logged.
      * If the current working directory is not inside a Git repo, only the
        explicitly named files are returned (in pathspec order). A critical
        message is logged when a base ref was supplied anyway.

    Args:
        pathspecs: Files or git pathspecs to collect files from. Wildcards
            (e.g. `pw_cl*`) are accepted.
        tool_runner: The ToolRunner to use for Git operations.
        modified_since_git_ref: Git ref (branch, tag, or commit) passed to the
            repo's ``list_files()`` to restrict the listing. ``None`` requests
            no restriction.
        exclude_patterns: Patterns handed to ``exclude_paths()`` to filter the
            files listed by Git.
        action_flavor_text: Replaces "Collecting" in the
            "Collecting all files in the foo repo" log message with a
            personalized string (e.g. "Formatting all files...").

    Returns:
        The collected file paths.
    """
    # TODO: https://pwbug.dev/391690594 - This is brittle and not covered by
    # tests. Someday it should be re-thought, particularly to better handle
    # multi-repo setups.
    explicit_files = [
        Path(spec).resolve() for spec in pathspecs if Path(spec).is_file()
    ]

    try:
        repo = find_git_repo(Path.cwd(), tool_runner)
    except GitError:
        repo = None

    # Outside of a Git repo there is nothing to list beyond the real files.
    if repo is None:
        if modified_since_git_ref:
            _LOG.critical(
                'A base commit may only be provided if running from a Git repo'
            )
        return explicit_files

    # Fall back gracefully when the tracking branch isn't available.
    tracking_branch_missing = (
        modified_since_git_ref == TRACKING_BRANCH_ALIAS
        and not repo.tracking_branch()
    )
    if tracking_branch_missing:
        _LOG.warning(
            'Failed to determine the tracking branch, using --base HEAD~1 '
            'instead of listing all files'
        )
        modified_since_git_ref = 'HEAD~1'

    description = describe_git_pattern(
        Path.cwd(),
        modified_since_git_ref,
        pathspecs,
        exclude_patterns,
        tool_runner,
        repo.root(),
    )
    _LOG.info('%s %s', action_flavor_text, description)

    # Merge the Git listing with the explicit files, dropping duplicates.
    listed = repo.list_files(modified_since_git_ref, pathspecs)
    unique_files = set(exclude_paths(exclude_patterns, listed))
    unique_files.update(explicit_files)
    return sorted(unique_files)
178
179
def file_summary(
    paths: Iterable[Path],
    levels: int = 2,
    max_lines: int = 12,
    max_types: int = 3,
    pad: str = ' ',
    pad_start: str = ' ',
    pad_end: str = ' ',
) -> list[str]:
    """Builds a per-directory summary of file counts and file extensions.

    Each path is grouped under one of its ancestors: the entry ``levels``
    positions from the outermost end of ``path.parents``, or the immediate
    parent when the path has fewer ancestors than that. One output line is
    produced per group, showing the directory, the number of files, and the
    most common extensions.

    Args:
        paths: The file paths to summarize.
        levels: How many ancestors, counted from the outermost one, to keep
            when picking the directory a file is grouped under.
        max_lines: Maximum number of output lines. When there are more
            directories than this, the ones with the fewest files are merged
            into a final "(N others)" line.
        max_types: Maximum number of extensions listed per line; the rest are
            reported together as "other".
        pad: Fill character used to align the directory column.
        pad_start: Text placed directly after the directory name.
        pad_end: Text placed directly before the file count.

    Returns:
        The summary lines, or an empty list if ``paths`` is empty.
    """

    def _negated_total(entry):
        # Negative so that an ascending sort puts the largest groups first.
        return -sum(entry[1].values())

    # Tally the suffixes seen under each grouping directory.
    suffixes_by_dir: dict[Any, Counter] = defaultdict(Counter)
    for file_path in paths:
        ancestors = file_path.parents
        group_dir = ancestors[max(len(ancestors) - levels, 0)]
        suffixes_by_dir[group_dir][file_path.suffix] += 1

    if len(suffixes_by_dir) <= max_lines:
        rows = sorted(suffixes_by_dir.items())
    else:
        # Too many directories: keep the biggest and fold the rest together.
        by_size = sorted(suffixes_by_dir.items(), key=_negated_total)
        keep = max_lines - 1
        rows = sorted(by_size[:keep])
        leftovers = by_size[keep:]

        merged = Counter()
        for _, suffix_counts in leftovers:
            merged = merged + suffix_counts
        rows.append((f'({plural(leftovers, "other")})', merged))

    # Column width: longest directory label plus separator plus pad_start.
    width = len(pad_start)
    if rows:
        width += max(len(str(label)) + len(os.sep) for label, _ in rows)

    # Prepare the output.
    lines = []
    for label, suffix_counts in rows:
        total = sum(suffix_counts.values())
        del suffix_counts['']  # Never display no-extension files individually.

        types = ''
        if suffix_counts:
            shown = suffix_counts.most_common(max_types)
            remainder = total - sum(number for _, number in shown)
            if remainder:
                shown.append(('other', remainder))

            listing = ', '.join(f'{number} {ext}' for ext, number in shown)
            types = ' (' + listing + ')'

        prefix = f'{label}{os.sep}{pad_start}'.ljust(width, pad)
        lines.append(f'{prefix}{pad_end}{plural(total, "file")}{types}')

    return lines
239