#!/usr/bin/env python3
# Copyright 2020 The Pigweed Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
"""Creates and manages token databases.

This module manages reading tokenized strings from ELF files and building and
maintaining token databases.
"""

import argparse
from datetime import datetime
import glob
import itertools
import json
import logging
import os
from pathlib import Path
import re
import struct
import sys
from typing import (
    Any,
    Callable,
    Iterable,
    Iterator,
    Pattern,
    Set,
    TextIO,
)

try:
    from pw_tokenizer import elf_reader, tokens
except ImportError:
    # Append this path to the module search path to allow running this module
    # without installing the pw_tokenizer package.
    sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    from pw_tokenizer import elf_reader, tokens

_LOG = logging.getLogger('pw_tokenizer')


def _elf_reader(elf) -> elf_reader.Elf:
    return elf if isinstance(elf, elf_reader.Elf) else elf_reader.Elf(elf)


# Magic number used to indicate the beginning of a tokenized string entry. This
# value MUST match the value of _PW_TOKENIZER_ENTRY_MAGIC in
# pw_tokenizer/public/pw_tokenizer/internal/tokenize_string.h.
_TOKENIZED_ENTRY_MAGIC = 0xBAA98DEE
_ENTRY = struct.Struct('<4I')
_TOKENIZED_ENTRY_SECTIONS = re.compile(r'^\.pw_tokenizer.entries(?:\.[_\d]+)?$')
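# Each tokenized entry in the section is the _ENTRY header (magic, token,
# domain length, string length; four little-endian uint32s) followed by the
# null-terminated domain and string. An illustrative, hand-built entry for the
# string "hello" in the default domain (not taken from a real ELF; the token
# value 0x1A2B3C4D is made up) would be:
#
#   struct.pack('<4I', _TOKENIZED_ENTRY_MAGIC, 0x1A2B3C4D, 1, 6)
#   + b'\0' + b'hello\0'
#
# where the lengths (1 and 6) include the null terminators.
# _read_tokenized_entries() below walks the section by repeatedly unpacking
# this header and slicing out the two null-terminated fields.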

_ERROR_HANDLER = 'surrogateescape'  # How to deal with UTF-8 decoding errors


class Error(Exception):
    """Failed to extract token entries from an ELF file."""


def _read_tokenized_entries(
    data: bytes, domain: Pattern[str]
) -> Iterator[tokens.TokenizedStringEntry]:
    index = 0

    while index + _ENTRY.size <= len(data):
        magic, token, domain_len, string_len = _ENTRY.unpack_from(data, index)

        if magic != _TOKENIZED_ENTRY_MAGIC:
            raise Error(
                f'Expected magic number 0x{_TOKENIZED_ENTRY_MAGIC:08x}, '
                f'found 0x{magic:08x}'
            )

        start = index + _ENTRY.size
        index = start + domain_len + string_len

        # Create the entries, trimming null terminators.
        entry = tokens.TokenizedStringEntry(
            token,
            data[start + domain_len : index - 1].decode(errors=_ERROR_HANDLER),
            data[start : start + domain_len - 1].decode(errors=_ERROR_HANDLER),
        )

        if data[start + domain_len - 1] != 0:
            raise Error(
                f'Domain {entry.domain} for {entry.string} not null terminated'
            )

        if data[index - 1] != 0:
            raise Error(f'String {entry.string} is not null terminated')

        if domain.fullmatch(entry.domain):
            yield entry


def _database_from_elf(elf, domain: Pattern[str]) -> tokens.Database:
    """Reads the tokenized strings from an elf_reader.Elf or ELF file object."""
    _LOG.debug('Reading tokenized strings in domain "%s" from %s', domain, elf)

    reader = _elf_reader(elf)

    # Read tokenized string entries.
    section_data = reader.dump_section_contents(_TOKENIZED_ENTRY_SECTIONS)
    if section_data is not None:
        return tokens.Database(_read_tokenized_entries(section_data, domain))

    return tokens.Database([])


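# For example (a hypothetical firmware image), an ELF whose strings are
# tokenized in the default domain and in a "BLUETOOTH" domain would yield ''
# and 'BLUETOOTH' (in no particular order) from the generator below:
#
#   list(tokenization_domains(open('firmware.elf', 'rb')))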
def tokenization_domains(elf) -> Iterator[str]:
    """Lists all tokenization domains in an ELF file."""
    reader = _elf_reader(elf)
    section_data = reader.dump_section_contents(_TOKENIZED_ENTRY_SECTIONS)
    if section_data is not None:
        yield from frozenset(
            e.domain
            for e in _read_tokenized_entries(section_data, re.compile('.*'))
        )


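# Each .pw_tokenizer.info entry is packed as a 12-byte, null-padded key
# followed by a uint32 value ('12sI'). An illustrative entry built by hand
# (the key and value here are made up, not a guaranteed metadata field):
#
#   struct.pack('12sI', b'hash_length', 128)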
def read_tokenizer_metadata(elf) -> dict[str, int]:
    """Reads the metadata entries from an ELF."""
    sections = _elf_reader(elf).dump_section_contents(r'\.pw_tokenizer\.info')

    metadata: dict[str, int] = {}
    if sections is not None:
        for key, value in struct.iter_unpack('12sI', sections):
            try:
                metadata[key.rstrip(b'\0').decode()] = value
            except UnicodeDecodeError as err:
                _LOG.error(
                    'Failed to decode metadata key %r: %s',
                    key.rstrip(b'\0'),
                    err,
                )

    return metadata


def _database_from_strings(strings: list[str]) -> tokens.Database:
    """Generates a C and C++ compatible database from untokenized strings."""
    # Generate a C-compatible database from the fixed length hash.
    c_db = tokens.Database.from_strings(strings, tokenize=tokens.c_hash)

    # Generate a C++ compatible database by allowing the hash to follow the
    # string length.
    cpp_db = tokens.Database.from_strings(
        strings, tokenize=tokens.pw_tokenizer_65599_hash
    )

    # Use a union of the C and C++ compatible databases.
    return tokens.Database.merged(c_db, cpp_db)


def _database_from_json(fd) -> tokens.Database:
    return _database_from_strings(json.load(fd))


def _load_token_database(  # pylint: disable=too-many-return-statements
    db, domain: Pattern[str]
) -> tokens.Database:
    """Loads a Database from supported database types.

    Supports Database objects, JSONs, ELFs, CSVs, and binary databases.
    """
    if db is None:
        return tokens.Database()

    if isinstance(db, tokens.Database):
        return db

    if isinstance(db, elf_reader.Elf):
        return _database_from_elf(db, domain)

    # If it's a str, it might be a path. Check if it's an ELF, CSV, or JSON.
    if isinstance(db, (str, Path)):
        if not os.path.exists(db):
            raise FileNotFoundError(f'"{db}" is not a path to a token database')

        if Path(db).is_dir():
            return tokens.DatabaseFile.load(Path(db))

        # Read the path as an ELF file.
        with open(db, 'rb') as fd:
            if elf_reader.compatible_file(fd):
                return _database_from_elf(fd, domain)

        # Generate a database from JSON.
        if str(db).endswith('.json'):
            with open(db, 'r', encoding='utf-8') as json_fd:
                return _database_from_json(json_fd)

        # Read the path as a packed binary or CSV file.
        return tokens.DatabaseFile.load(Path(db))

    # Assume that it's a file object and check if it's an ELF.
    if elf_reader.compatible_file(db):
        return _database_from_elf(db, domain)

    # Read the database as JSON, CSV, or packed binary from a file object's
    # path.
    if hasattr(db, 'name') and os.path.exists(db.name):
        if db.name.endswith('.json'):
            return _database_from_json(db)

        return tokens.DatabaseFile.load(Path(db.name))

    # Read CSV directly from the file object.
    return tokens.Database(tokens.parse_csv(db))


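# A minimal usage sketch for the function below (the file names are
# hypothetical); domain='.*' reads every tokenization domain from the ELF:
#
#   db = load_token_database('out/firmware.elf', 'legacy_tokens.csv',
#                            domain='.*')
#   print(len(db.entries()), 'entries')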
def load_token_database(
    *databases, domain: str | Pattern[str] = tokens.DEFAULT_DOMAIN
) -> tokens.Database:
    """Loads a Database from supported database types.

    Supports Database objects, JSONs, ELFs, CSVs, and binary databases.
    """
    domain = re.compile(domain)
    return tokens.Database.merged(
        *(_load_token_database(db, domain) for db in databases)
    )


def database_summary(db: tokens.Database) -> dict[str, Any]:
    """Returns a simple report of properties of the database."""
    present = [entry for entry in db.entries() if not entry.date_removed]
    collisions = {
        token: list(e.string for e in entries)
        for token, entries in db.collisions()
    }

    # Add 1 to each string's size to account for the null terminator.
    return dict(
        present_entries=len(present),
        present_size_bytes=sum(len(entry.string) + 1 for entry in present),
        total_entries=len(db.entries()),
        total_size_bytes=sum(len(entry.string) + 1 for entry in db.entries()),
        collisions=collisions,
    )


_DatabaseReport = dict[str, dict[str, dict[str, Any]]]


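# Reports are nested as {path: {domain: summary}}. An illustrative example of
# the JSON emitted by the 'report' command (path and values are made up):
#
#   {
#     "out/firmware.elf": {
#       "": {
#         "present_entries": 2,
#         "present_size_bytes": 42,
#         "total_entries": 3,
#         "total_size_bytes": 58,
#         "collisions": {}
#       }
#     }
#   }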
def generate_reports(paths: Iterable[Path]) -> _DatabaseReport:
    """Returns a dictionary with information about the provided databases."""
    reports: _DatabaseReport = {}

    for path in paths:
        domains = ['']
        if path.is_file():
            with path.open('rb') as file:
                if elf_reader.compatible_file(file):
                    domains = list(tokenization_domains(file))

        domain_reports = {}

        for domain in domains:
            domain_reports[domain] = database_summary(
                load_token_database(path, domain=domain)
            )

        reports[str(path)] = domain_reports

    return reports


def _handle_create(
    databases,
    database: Path,
    force: bool,
    output_type: str,
    include: list,
    exclude: list,
    replace: list,
) -> None:
    """Creates a token database file from one or more ELF files."""
    if not force and database.exists():
        raise FileExistsError(
            f'The file {database} already exists! Use --force to overwrite.'
        )

    if not database.parent.exists():
        database.parent.mkdir(parents=True)

    if output_type == 'directory':
        if str(database) == '-':
            raise ValueError(
                'Cannot specify "-" (stdout) for directory databases'
            )

        database.mkdir(exist_ok=True)
        database = database / f'database{tokens.DIR_DB_SUFFIX}'
        output_type = 'csv'

    if str(database) == '-':
        # Must write bytes to stdout; use sys.stdout.buffer.
        fd = sys.stdout.buffer
    else:
        fd = database.open('wb')

    db = tokens.Database.merged(*databases)
    db.filter(include, exclude, replace)

    with fd:
        if output_type == 'csv':
            tokens.write_csv(db, fd)
        elif output_type == 'binary':
            tokens.write_binary(db, fd)
        else:
            raise ValueError(f'Unknown database type "{output_type}"')

    _LOG.info(
        'Wrote database with %d entries to %s as %s',
        len(db),
        fd.name,
        output_type,
    )


def _handle_add(
    token_database: tokens.DatabaseFile,
    databases: list[tokens.Database],
    commit: str | None,
) -> None:
    initial = len(token_database)
    if commit:
        entries = itertools.chain.from_iterable(
            db.entries() for db in databases
        )
        token_database.add_and_discard_temporary(entries, commit)
    else:
        for source in databases:
            token_database.add(source.entries())

        token_database.write_to_file()

    number_of_changes = len(token_database) - initial

    if number_of_changes:
        _LOG.info(
            'Added %d entries to %s', number_of_changes, token_database.path
        )


def _handle_mark_removed(
    token_database: tokens.DatabaseFile,
    databases: list[tokens.Database],
    date: datetime | None,
):
    marked_removed = token_database.mark_removed(
        (
            entry
            for entry in tokens.Database.merged(*databases).entries()
            if not entry.date_removed
        ),
        date,
    )

    token_database.write_to_file(rewrite=True)

    _LOG.info(
        'Marked %d of %d entries as removed in %s',
        len(marked_removed),
        len(token_database),
        token_database.path,
    )


def _handle_purge(token_database: tokens.DatabaseFile, before: datetime | None):
    purged = token_database.purge(before)
    token_database.write_to_file(rewrite=True)

    _LOG.info('Removed %d entries from %s', len(purged), token_database.path)


def _handle_report(token_database_or_elf: list[Path], output: TextIO) -> None:
    json.dump(generate_reports(token_database_or_elf), output, indent=2)
    output.write('\n')


def expand_paths_or_globs(*paths_or_globs: str) -> Iterable[Path]:
    """Expands any globs in a list of paths; raises FileNotFoundError."""
    for path_or_glob in paths_or_globs:
        if os.path.exists(path_or_glob):
            # This is a valid path; yield it without evaluating it as a glob.
            yield Path(path_or_glob)
        else:
            paths = glob.glob(path_or_glob, recursive=True)

            # If no paths were found and the path is not a glob, raise a
            # FileNotFoundError.
            if not paths and not any(c in path_or_glob for c in '*?[]!'):
                raise FileNotFoundError(f'{path_or_glob} is not a valid path')

            for path in paths:
                # Resolve globs to JSON, CSV, or compatible binary files.
                if elf_reader.compatible_file(path) or path.endswith(
                    ('.csv', '.json')
                ):
                    yield Path(path)


class ExpandGlobs(argparse.Action):
    """Argparse action that expands and appends paths."""

    def __call__(self, parser, namespace, values, unused_option_string=None):
        setattr(namespace, self.dest, list(expand_paths_or_globs(*values)))


def _read_elf_with_domain(
    elf: str, domain: Pattern[str]
) -> Iterable[tokens.Database]:
    for path in expand_paths_or_globs(elf):
        with path.open('rb') as file:
            if not elf_reader.compatible_file(file):
                raise ValueError(
                    f'{elf} is not an ELF file, '
                    f'but the "{domain}" domain was specified'
                )

            yield _database_from_elf(file, domain)

class LoadTokenDatabases(argparse.Action):
    """Argparse action that reads token databases from paths or globs.

    ELF files may have #domain appended to them to specify a tokenization
    domain other than the default.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        databases: list[tokens.Database] = []
        paths: Set[Path] = set()

        try:
            for value in values:
                if value.count('#') == 1:
                    path, domain = value.split('#')
                    domain = re.compile(domain)
                    databases.extend(_read_elf_with_domain(path, domain))
                else:
                    paths.update(expand_paths_or_globs(value))

            for path in paths:
                databases.append(load_token_database(path))
        except tokens.DatabaseFormatError as err:
            parser.error(
                f'argument elf_or_token_database: {path} is not a supported '
                'token database file. Only ELF files or token databases (CSV '
                f'or binary format) are supported. {err}. '
            )
        except FileNotFoundError as err:
            parser.error(f'argument elf_or_token_database: {err}')
        except:  # pylint: disable=bare-except
            _LOG.exception('Failed to load token database %s', path)
            parser.error(
                'argument elf_or_token_database: '
                f'Error occurred while loading token database {path}'
            )

        setattr(namespace, self.dest, databases)


def token_databases_parser(nargs: str = '+') -> argparse.ArgumentParser:
    """Returns an argument parser for reading token databases.

    These arguments can be added to another parser using the parents arg.
    """
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument(
        'databases',
        metavar='elf_or_token_database',
        nargs=nargs,
        action=LoadTokenDatabases,
        help=(
            'ELF or token database files from which to read strings and '
            'tokens. For ELF files, the tokenization domain to read from '
            'may be specified after the path as #domain_name (e.g. '
            'foo.elf#TEST_DOMAIN). Unless specified, only the default '
            'domain ("") is read from ELF files; .* reads all domains. '
            'Globs are expanded to compatible database files.'
        ),
    )
    return parser


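# A sketch of typical command lines for this CLI (the paths, file names, and
# domain below are hypothetical; when the pw_tokenizer package is installed,
# the module can also be run as `python -m pw_tokenizer.database`):
#
#   python database.py create --database tokens.csv out/firmware.elf
#   python database.py add --database tokens.csv out/firmware.elf#MY_DOMAIN
#   python database.py mark_removed --database tokens.csv out/firmware.elf
#   python database.py purge --database tokens.csv --before 2020-01-01
#   python database.py report out/firmware.elf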
def _parse_args():
    """Parse and return command line arguments."""

    def year_month_day(value) -> datetime:
        if value == 'today':
            return datetime.now()

        return datetime.fromisoformat(value)

    year_month_day.__name__ = 'year-month-day (YYYY-MM-DD)'

    # Shared command line options.
    option_db = argparse.ArgumentParser(add_help=False)
    option_db.add_argument(
        '-d',
        '--database',
        dest='token_database',
        type=lambda arg: tokens.DatabaseFile.load(Path(arg)),
        required=True,
        help='The database file to update.',
    )

    option_tokens = token_databases_parser('*')

    # Top-level argument parser.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.set_defaults(handler=lambda **_: parser.print_help())

    subparsers = parser.add_subparsers(
        help='Tokenized string database management actions:'
    )

    # The 'create' command creates a database file.
    subparser = subparsers.add_parser(
        'create',
        parents=[option_tokens],
        help=(
            'Creates a database with tokenized strings from one or more '
            'sources.'
        ),
    )
    subparser.set_defaults(handler=_handle_create)
    subparser.add_argument(
        '-d',
        '--database',
        required=True,
        type=Path,
        help='Path to the database file to create; use - for stdout.',
    )
    subparser.add_argument(
        '-t',
        '--type',
        dest='output_type',
        choices=('csv', 'binary', 'directory'),
        default='csv',
        help='Which type of database to create. (default: csv)',
    )
    subparser.add_argument(
        '-f',
        '--force',
        action='store_true',
        help='Overwrite the database if it exists.',
    )
    subparser.add_argument(
        '-i',
        '--include',
        type=re.compile,
        default=[],
        action='append',
        help=(
            'If provided, at least one of these regular expressions must '
            'match for a string to be included in the database.'
        ),
    )
    subparser.add_argument(
        '-e',
        '--exclude',
        type=re.compile,
        default=[],
        action='append',
        help=(
            'If provided, none of these regular expressions may match for a '
            'string to be included in the database.'
        ),
    )

    unescaped_slash = re.compile(r'(?<!\\)/')

    def replacement(value: str) -> tuple[Pattern, str]:
        try:
            find, sub = unescaped_slash.split(value, 1)
        except ValueError as _err:
            raise argparse.ArgumentTypeError(
                'replacements must be specified as "search_regex/replacement"'
            )

        try:
            return re.compile(find.replace(r'\/', '/')), sub
        except re.error as err:
            raise argparse.ArgumentTypeError(
                f'"{value}" is not a valid regular expression: {err}'
            )

    subparser.add_argument(
        '--replace',
        type=replacement,
        default=[],
        action='append',
        help=(
            'If provided, replaces text that matches a regular expression. '
            'This can be used to replace sensitive terms in a token '
            'database that will be distributed publicly. The expression and '
            'replacement are specified as "search_regex/replacement". '
            'Plain slash characters in the regex must be escaped with a '
            r'backslash (\/). The replacement text may include '
            'backreferences for captured groups in the regex.'
        ),
    )
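    # For example (a hypothetical pattern), --replace 'user_\w+/<user>' would
    # rewrite anything matching user_\w+ as <user> before the database is
    # written, which keeps usernames out of a published database.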

    # The 'add' command adds strings to a database from a set of ELFs.
    subparser = subparsers.add_parser(
        'add',
        parents=[option_db, option_tokens],
        help=(
            'Adds new strings to a database with tokenized strings from a set '
            'of ELF files or other token databases. Missing entries are NOT '
            'marked as removed.'
        ),
    )
    subparser.set_defaults(handler=_handle_add)
    subparser.add_argument(
        '--discard-temporary',
        dest='commit',
        help=(
            'Deletes temporary tokens in memory and on disk when a CSV exists '
            'within a commit. Afterwards, new strings are added to the '
            'database from a set of ELF files or other token databases. '
            'Missing entries are NOT marked as removed.'
        ),
    )

    # The 'mark_removed' command marks removed entries to match a set of ELFs.
    subparser = subparsers.add_parser(
        'mark_removed',
        parents=[option_db, option_tokens],
        help=(
            'Updates a database with tokenized strings from a set of strings. '
            'Strings not present in the set remain in the database but are '
            'marked as removed. New strings are NOT added.'
        ),
    )
    subparser.set_defaults(handler=_handle_mark_removed)
    subparser.add_argument(
        '--date',
        type=year_month_day,
        help=(
            'The removal date to use for all strings. '
            'May be YYYY-MM-DD or "today". (default: today)'
        ),
    )

    # The 'purge' command removes old entries.
    subparser = subparsers.add_parser(
        'purge',
        parents=[option_db],
        help='Purges removed strings from a database.',
    )
    subparser.set_defaults(handler=_handle_purge)
    subparser.add_argument(
        '-b',
        '--before',
        type=year_month_day,
        help=(
            'Delete all entries removed on or before this date. '
            'May be YYYY-MM-DD or "today".'
        ),
    )

    # The 'report' command prints a report about a database.
    subparser = subparsers.add_parser(
        'report', help='Prints a report about a database.'
    )
    subparser.set_defaults(handler=_handle_report)
    subparser.add_argument(
        'token_database_or_elf',
        nargs='+',
        action=ExpandGlobs,
        help=(
            'The ELF files or token databases about which to generate '
            'reports.'
        ),
    )
    subparser.add_argument(
        '-o',
        '--output',
        type=argparse.FileType('w'),
        default=sys.stdout,
        help='The file to which to write the output; use - for stdout.',
    )

    args = parser.parse_args()

    handler = args.handler
    del args.handler

    return handler, args


def _init_logging(level: int) -> None:
    _LOG.setLevel(logging.DEBUG)
    log_to_stderr = logging.StreamHandler()
    log_to_stderr.setLevel(level)
    log_to_stderr.setFormatter(
        logging.Formatter(
            fmt='%(asctime)s.%(msecs)03d-%(levelname)s: %(message)s',
            datefmt='%H:%M:%S',
        )
    )

    _LOG.addHandler(log_to_stderr)


def _main(handler: Callable, args: argparse.Namespace) -> int:
    _init_logging(logging.INFO)
    handler(**vars(args))
    return 0


if __name__ == '__main__':
    sys.exit(_main(*_parse_args()))