#!/usr/bin/env python3
# Copyright 2020 The Pigweed Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
"""Creates and manages token databases.

This module manages reading tokenized strings from ELF files and building and
maintaining token databases.
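
Supported commands: create, add, mark_removed, purge, report.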
19"""
20
21import argparse
22from datetime import datetime
23import glob
24import itertools
25import json
26import logging
27import os
28from pathlib import Path
29import re
30import struct
31import sys
32from typing import (
33    Any,
34    Callable,
35    Dict,
36    Iterable,
37    Iterator,
38    List,
39    Optional,
40    Pattern,
41    Set,
42    TextIO,
43    Tuple,
44    Union,
45)
46
47try:
48    from pw_tokenizer import elf_reader, tokens
49except ImportError:
50    # Append this path to the module search path to allow running this module
51    # without installing the pw_tokenizer package.
52    sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
53    from pw_tokenizer import elf_reader, tokens
54
55_LOG = logging.getLogger('pw_tokenizer')
56
57
58def _elf_reader(elf) -> elf_reader.Elf:
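    """Returns an elf_reader.Elf, wrapping a plain file object if needed."""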
    return elf if isinstance(elf, elf_reader.Elf) else elf_reader.Elf(elf)


# Magic number used to indicate the beginning of a tokenized string entry. This
# value MUST match the value of _PW_TOKENIZER_ENTRY_MAGIC in
# pw_tokenizer/public/pw_tokenizer/internal/tokenize_string.h.
_TOKENIZED_ENTRY_MAGIC = 0xBAA98DEE
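# Each entry starts with a header of four little-endian uint32 values: the
# magic number, the token, the domain length, and the string length. The
# null-terminated domain and string bytes immediately follow the header.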
_ENTRY = struct.Struct('<4I')
_TOKENIZED_ENTRY_SECTIONS = re.compile(r'^\.pw_tokenizer.entries(?:\.[_\d]+)?$')

_ERROR_HANDLER = 'surrogateescape'  # How to deal with UTF-8 decoding errors


class Error(Exception):
    """Failed to extract token entries from an ELF file."""


def _read_tokenized_entries(
    data: bytes, domain: Pattern[str]
) -> Iterator[tokens.TokenizedStringEntry]:
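    """Parses tokenized string entries from raw section data.

    Only entries whose domain matches the given pattern are yielded.
    """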
    index = 0

    while index + _ENTRY.size <= len(data):
        magic, token, domain_len, string_len = _ENTRY.unpack_from(data, index)

        if magic != _TOKENIZED_ENTRY_MAGIC:
            raise Error(
                f'Expected magic number 0x{_TOKENIZED_ENTRY_MAGIC:08x}, '
                f'found 0x{magic:08x}'
            )

        start = index + _ENTRY.size
        index = start + domain_len + string_len

        # Create the entries, trimming null terminators.
        entry = tokens.TokenizedStringEntry(
            token,
            data[start + domain_len : index - 1].decode(errors=_ERROR_HANDLER),
            data[start : start + domain_len - 1].decode(errors=_ERROR_HANDLER),
        )

        if data[start + domain_len - 1] != 0:
            raise Error(
                f'Domain {entry.domain} for {entry.string} not null terminated'
            )

        if data[index - 1] != 0:
            raise Error(f'String {entry.string} is not null terminated')

        if domain.fullmatch(entry.domain):
            yield entry


def _database_from_elf(elf, domain: Pattern[str]) -> tokens.Database:
    """Reads the tokenized strings from an elf_reader.Elf or ELF file object."""
    _LOG.debug('Reading tokenized strings in domain "%s" from %s', domain, elf)

    reader = _elf_reader(elf)

    # Read tokenized string entries.
    section_data = reader.dump_section_contents(_TOKENIZED_ENTRY_SECTIONS)
    if section_data is not None:
        return tokens.Database(_read_tokenized_entries(section_data, domain))

    return tokens.Database([])


def tokenization_domains(elf) -> Iterator[str]:
    """Lists all tokenization domains in an ELF file."""
    reader = _elf_reader(elf)
    section_data = reader.dump_section_contents(_TOKENIZED_ENTRY_SECTIONS)
    if section_data is not None:
        yield from frozenset(
            e.domain
            for e in _read_tokenized_entries(section_data, re.compile('.*'))
        )


def read_tokenizer_metadata(elf) -> Dict[str, int]:
    """Reads the metadata entries from an ELF."""
    sections = _elf_reader(elf).dump_section_contents(r'\.pw_tokenizer\.info')

    metadata: Dict[str, int] = {}
    if sections is not None:
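        # Each metadata entry is a 12-byte, null-padded key followed by an
        # unsigned 32-bit integer value.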
        for key, value in struct.iter_unpack('12sI', sections):
            try:
                metadata[key.rstrip(b'\0').decode()] = value
            except UnicodeDecodeError as err:
                _LOG.error(
                    'Failed to decode metadata key %r: %s',
                    key.rstrip(b'\0'),
                    err,
                )

    return metadata


def _database_from_strings(strings: List[str]) -> tokens.Database:
    """Generates a C and C++ compatible database from untokenized strings."""
    # Generate a C-compatible database from the fixed length hash.
    c_db = tokens.Database.from_strings(strings, tokenize=tokens.c_hash)

    # Generate a C++ compatible database by allowing the hash to follow the
    # string length.
    cpp_db = tokens.Database.from_strings(
        strings, tokenize=tokens.pw_tokenizer_65599_hash
    )

    # Use a union of the C and C++ compatible databases.
    return tokens.Database.merged(c_db, cpp_db)


def _database_from_json(fd) -> tokens.Database:
    return _database_from_strings(json.load(fd))


def _load_token_database(  # pylint: disable=too-many-return-statements
    db, domain: Pattern[str]
) -> tokens.Database:
    """Loads a Database from supported database types.

    Supports Database objects, JSONs, ELFs, CSVs, and binary databases.
    """
    if db is None:
        return tokens.Database()

    if isinstance(db, tokens.Database):
        return db

    if isinstance(db, elf_reader.Elf):
        return _database_from_elf(db, domain)

    # If it's a str, it might be a path. Check if it's an ELF, CSV, or JSON.
    if isinstance(db, (str, Path)):
        if not os.path.exists(db):
            raise FileNotFoundError(f'"{db}" is not a path to a token database')

        if Path(db).is_dir():
            return tokens.DatabaseFile.load(Path(db))

        # Read the path as an ELF file.
        with open(db, 'rb') as fd:
            if elf_reader.compatible_file(fd):
                return _database_from_elf(fd, domain)

        # Generate a database from JSON.
        if str(db).endswith('.json'):
            with open(db, 'r', encoding='utf-8') as json_fd:
                return _database_from_json(json_fd)

        # Read the path as a packed binary or CSV file.
        return tokens.DatabaseFile.load(Path(db))

    # Assume that it's a file object and check if it's an ELF.
    if elf_reader.compatible_file(db):
        return _database_from_elf(db, domain)

    # Read the database as JSON, CSV, or packed binary from a file object's
    # path.
    if hasattr(db, 'name') and os.path.exists(db.name):
        if db.name.endswith('.json'):
            return _database_from_json(db)

        return tokens.DatabaseFile.load(Path(db.name))

    # Read CSV directly from the file object.
    return tokens.Database(tokens.parse_csv(db))


def load_token_database(
    *databases, domain: Union[str, Pattern[str]] = tokens.DEFAULT_DOMAIN
) -> tokens.Database:
    """Loads a Database from supported database types.

    Supports Database objects, JSONs, ELFs, CSVs, and binary databases.
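
    Example (the file paths here are illustrative):

        database = load_token_database('firmware.elf', 'strings.csv')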
    """
    domain = re.compile(domain)
    return tokens.Database.merged(
        *(_load_token_database(db, domain) for db in databases)
    )


def database_summary(db: tokens.Database) -> Dict[str, Any]:
    """Returns a simple report of properties of the database."""
    present = [entry for entry in db.entries() if not entry.date_removed]
    collisions = {
        token: list(e.string for e in entries)
        for token, entries in db.collisions()
    }

    # Add 1 to each string's size to account for the null terminator.
    return dict(
        present_entries=len(present),
        present_size_bytes=sum(len(entry.string) + 1 for entry in present),
        total_entries=len(db.entries()),
        total_size_bytes=sum(len(entry.string) + 1 for entry in db.entries()),
        collisions=collisions,
    )


_DatabaseReport = Dict[str, Dict[str, Dict[str, Any]]]


def generate_reports(paths: Iterable[Path]) -> _DatabaseReport:
    """Returns a dictionary with information about the provided databases."""
    reports: _DatabaseReport = {}

    for path in paths:
        domains = ['']
        if path.is_file():
            with path.open('rb') as file:
                if elf_reader.compatible_file(file):
                    domains = list(tokenization_domains(file))

        domain_reports = {}

        for domain in domains:
            domain_reports[domain] = database_summary(
                load_token_database(path, domain=domain)
            )

        reports[str(path)] = domain_reports

    return reports


def _handle_create(
    databases,
    database: Path,
    force: bool,
    output_type: str,
    include: list,
    exclude: list,
    replace: list,
) -> None:
    """Creates a token database file from one or more ELF files."""
    if not force and database.exists():
        raise FileExistsError(
            f'The file {database} already exists! Use --force to overwrite.'
        )

    if output_type == 'directory':
        if str(database) == '-':
            raise ValueError(
                'Cannot specify "-" (stdout) for directory databases'
            )

        database.mkdir(exist_ok=True)
        database = database / f'database{tokens.DIR_DB_SUFFIX}'
        output_type = 'csv'

    if str(database) == '-':
        # Must write bytes to stdout; use sys.stdout.buffer.
        fd = sys.stdout.buffer
    else:
        fd = database.open('wb')

    db = tokens.Database.merged(*databases)
    db.filter(include, exclude, replace)

    with fd:
        if output_type == 'csv':
            tokens.write_csv(db, fd)
        elif output_type == 'binary':
            tokens.write_binary(db, fd)
        else:
            raise ValueError(f'Unknown database type "{output_type}"')

    _LOG.info(
        'Wrote database with %d entries to %s as %s',
        len(db),
        fd.name,
        output_type,
    )


def _handle_add(
    token_database: tokens.DatabaseFile,
    databases: List[tokens.Database],
    commit: Optional[str],
) -> None:
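    """Adds new entries from the given databases to the database file."""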
    initial = len(token_database)
    if commit:
        entries = itertools.chain.from_iterable(
            db.entries() for db in databases
        )
        token_database.add_and_discard_temporary(entries, commit)
    else:
        for source in databases:
            token_database.add(source.entries())

        token_database.write_to_file()

    number_of_changes = len(token_database) - initial

    if number_of_changes:
        _LOG.info(
            'Added %d entries to %s', number_of_changes, token_database.path
        )


def _handle_mark_removed(
    token_database: tokens.DatabaseFile,
    databases: List[tokens.Database],
    date: Optional[datetime],
):
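    """Marks entries not present in the given databases as removed."""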
    marked_removed = token_database.mark_removed(
        (
            entry
            for entry in tokens.Database.merged(*databases).entries()
            if not entry.date_removed
        ),
        date,
    )

    token_database.write_to_file(rewrite=True)

    _LOG.info(
        'Marked %d of %d entries as removed in %s',
        len(marked_removed),
        len(token_database),
        token_database.path,
    )


def _handle_purge(
    token_database: tokens.DatabaseFile, before: Optional[datetime]
):
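    """Deletes entries that are marked as removed from the database file."""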
    purged = token_database.purge(before)
    token_database.write_to_file(rewrite=True)

    _LOG.info('Removed %d entries from %s', len(purged), token_database.path)


def _handle_report(token_database_or_elf: List[Path], output: TextIO) -> None:
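    """Writes a JSON report about the provided databases to the output."""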
    json.dump(generate_reports(token_database_or_elf), output, indent=2)
    output.write('\n')


def expand_paths_or_globs(*paths_or_globs: str) -> Iterable[Path]:
    """Expands any globs in a list of paths; raises FileNotFoundError."""
    for path_or_glob in paths_or_globs:
        if os.path.exists(path_or_glob):
            # This is a valid path; yield it without evaluating it as a glob.
            yield Path(path_or_glob)
        else:
            paths = glob.glob(path_or_glob, recursive=True)

            # If no paths were found and the path is not a glob, raise an Error.
            if not paths and not any(c in path_or_glob for c in '*?[]!'):
                raise FileNotFoundError(f'{path_or_glob} is not a valid path')

            for path in paths:
                # Resolve globs to JSON, CSV, or compatible binary files.
                if elf_reader.compatible_file(path) or path.endswith(
                    ('.csv', '.json')
                ):
                    yield Path(path)


class ExpandGlobs(argparse.Action):
    """Argparse action that expands and appends paths."""

    def __call__(self, parser, namespace, values, unused_option_string=None):
        setattr(namespace, self.dest, list(expand_paths_or_globs(*values)))


def _read_elf_with_domain(
    elf: str, domain: Pattern[str]
) -> Iterable[tokens.Database]:
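    """Reads tokenized strings for the given domain from an ELF path or glob."""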
    for path in expand_paths_or_globs(elf):
        with path.open('rb') as file:
            if not elf_reader.compatible_file(file):
                raise ValueError(
                    f'{elf} is not an ELF file, '
                    f'but the "{domain}" domain was specified'
                )

            yield _database_from_elf(file, domain)


class LoadTokenDatabases(argparse.Action):
    """Argparse action that reads token databases from paths or globs.

    ELF files may have #domain appended to them to specify a tokenization domain
    other than the default.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        databases: List[tokens.Database] = []
        paths: Set[Path] = set()

        try:
            for value in values:
                if value.count('#') == 1:
                    path, domain = value.split('#')
                    domain = re.compile(domain)
                    databases.extend(_read_elf_with_domain(path, domain))
                else:
                    paths.update(expand_paths_or_globs(value))

            for path in paths:
                databases.append(load_token_database(path))
        except tokens.DatabaseFormatError as err:
            parser.error(
                f'argument elf_or_token_database: {path} is not a supported '
                'token database file. Only ELF files or token databases (CSV '
                f'or binary format) are supported. {err}. '
            )
        except FileNotFoundError as err:
            parser.error(f'argument elf_or_token_database: {err}')
        except:  # pylint: disable=bare-except
            _LOG.exception('Failed to load token database %s', path)
            parser.error(
                'argument elf_or_token_database: '
                f'Error occurred while loading token database {path}'
            )

        setattr(namespace, self.dest, databases)


def token_databases_parser(nargs: str = '+') -> argparse.ArgumentParser:
    """Returns an argument parser for reading token databases.

    These arguments can be added to another parser using the parents arg.
    """
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument(
        'databases',
        metavar='elf_or_token_database',
        nargs=nargs,
        action=LoadTokenDatabases,
        help=(
            'ELF or token database files from which to read strings and '
            'tokens. For ELF files, the tokenization domain to read from '
            'may be specified after the path as #domain_name (e.g. '
            'foo.elf#TEST_DOMAIN). Unless specified, only the default '
            'domain ("") is read from ELF files; .* reads all domains. '
            'Globs are expanded to compatible database files.'
        ),
    )
    return parser


def _parse_args():
    """Parse and return command line arguments."""

    def year_month_day(value) -> datetime:
        if value == 'today':
            return datetime.now()

        return datetime.fromisoformat(value)

    year_month_day.__name__ = 'year-month-day (YYYY-MM-DD)'

    # Shared command line options.
    option_db = argparse.ArgumentParser(add_help=False)
    option_db.add_argument(
        '-d',
        '--database',
        dest='token_database',
        type=lambda arg: tokens.DatabaseFile.load(Path(arg)),
        required=True,
        help='The database file to update.',
    )

    option_tokens = token_databases_parser('*')

    # Top-level argument parser.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.set_defaults(handler=lambda **_: parser.print_help())

    subparsers = parser.add_subparsers(
        help='Tokenized string database management actions:'
    )

    # The 'create' command creates a database file.
    subparser = subparsers.add_parser(
        'create',
        parents=[option_tokens],
        help=(
            'Creates a database with tokenized strings from one or more '
            'sources.'
        ),
    )
    subparser.set_defaults(handler=_handle_create)
    subparser.add_argument(
        '-d',
        '--database',
        required=True,
        type=Path,
        help='Path to the database file to create; use - for stdout.',
    )
    subparser.add_argument(
        '-t',
        '--type',
        dest='output_type',
        choices=('csv', 'binary', 'directory'),
        default='csv',
        help='Which type of database to create. (default: csv)',
    )
    subparser.add_argument(
        '-f',
        '--force',
        action='store_true',
        help='Overwrite the database if it exists.',
    )
    subparser.add_argument(
        '-i',
        '--include',
        type=re.compile,
        default=[],
        action='append',
        help=(
            'If provided, at least one of these regular expressions must '
            'match for a string to be included in the database.'
        ),
    )
    subparser.add_argument(
        '-e',
        '--exclude',
        type=re.compile,
        default=[],
        action='append',
        help=(
            'If provided, none of these regular expressions may match for a '
            'string to be included in the database.'
        ),
    )

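    # Matches a '/' that is not preceded by a backslash. Used to split --replace
    # arguments of the form "search_regex/replacement" (for example, the
    # hypothetical "password=\w+/password=REDACTED") on the first unescaped
    # slash.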
    unescaped_slash = re.compile(r'(?<!\\)/')

    def replacement(value: str) -> Tuple[Pattern, 'str']:
        try:
            find, sub = unescaped_slash.split(value, 1)
        except ValueError as _err:
            raise argparse.ArgumentTypeError(
                'replacements must be specified as "search_regex/replacement"'
            )

        try:
            return re.compile(find.replace(r'\/', '/')), sub
        except re.error as err:
            raise argparse.ArgumentTypeError(
                f'"{value}" is not a valid regular expression: {err}'
            )

    subparser.add_argument(
        '--replace',
        type=replacement,
        default=[],
        action='append',
        help=(
            'If provided, replaces text that matches a regular expression. '
            'This can be used to replace sensitive terms in a token '
            'database that will be distributed publicly. The expression and '
            'replacement are specified as "search_regex/replacement". '
            'Plain slash characters in the regex must be escaped with a '
            r'backslash (\/). The replacement text may include '
            'backreferences for captured groups in the regex.'
        ),
    )

    # The 'add' command adds strings to a database from a set of ELFs.
    subparser = subparsers.add_parser(
        'add',
        parents=[option_db, option_tokens],
        help=(
            'Adds new strings to a database with tokenized strings from a set '
            'of ELF files or other token databases. Missing entries are NOT '
            'marked as removed.'
        ),
    )
    subparser.set_defaults(handler=_handle_add)
    subparser.add_argument(
        '--discard-temporary',
        dest='commit',
        help=(
            'Deletes temporary tokens in memory and on disk when a CSV exists '
            'within a commit. Afterwards, new strings are added to the '
            'database from a set of ELF files or other token databases. '
            'Missing entries are NOT marked as removed.'
        ),
    )

    # The 'mark_removed' command marks removed entries to match a set of ELFs.
    subparser = subparsers.add_parser(
        'mark_removed',
        parents=[option_db, option_tokens],
        help=(
            'Updates a database with tokenized strings from a set of strings. '
            'Strings not present in the set remain in the database but are '
            'marked as removed. New strings are NOT added.'
        ),
    )
    subparser.set_defaults(handler=_handle_mark_removed)
    subparser.add_argument(
        '--date',
        type=year_month_day,
        help=(
            'The removal date to use for all strings. '
            'May be YYYY-MM-DD or "today". (default: today)'
        ),
    )

    # The 'purge' command removes old entries.
    subparser = subparsers.add_parser(
        'purge',
        parents=[option_db],
        help='Purges removed strings from a database.',
    )
    subparser.set_defaults(handler=_handle_purge)
    subparser.add_argument(
        '-b',
        '--before',
        type=year_month_day,
        help=(
            'Delete all entries removed on or before this date. '
            'May be YYYY-MM-DD or "today".'
        ),
    )

    # The 'report' command prints a report about a database.
    subparser = subparsers.add_parser(
        'report', help='Prints a report about a database.'
    )
    subparser.set_defaults(handler=_handle_report)
    subparser.add_argument(
        'token_database_or_elf',
        nargs='+',
        action=ExpandGlobs,
        help=(
            'The ELF files or token databases about which to generate '
            'reports.'
        ),
    )
    subparser.add_argument(
        '-o',
        '--output',
        type=argparse.FileType('w'),
        default=sys.stdout,
        help='The file to which to write the output; use - for stdout.',
    )

    args = parser.parse_args()

    handler = args.handler
    del args.handler

    return handler, args


def _init_logging(level: int) -> None:
    _LOG.setLevel(logging.DEBUG)
    log_to_stderr = logging.StreamHandler()
    log_to_stderr.setLevel(level)
    log_to_stderr.setFormatter(
        logging.Formatter(
            fmt='%(asctime)s.%(msecs)03d-%(levelname)s: %(message)s',
            datefmt='%H:%M:%S',
        )
    )

    _LOG.addHandler(log_to_stderr)


def _main(handler: Callable, args: argparse.Namespace) -> int:
    _init_logging(logging.INFO)
    handler(**vars(args))
    return 0


if __name__ == '__main__':
    sys.exit(_main(*_parse_args()))