#!/usr/bin/env python3
# Copyright 2020 The Pigweed Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
"""Creates and manages token databases.

This module manages reading tokenized strings from ELF files and building and
maintaining token databases.
"""

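# Example command-line usage (a sketch only: "firmware.elf" is a hypothetical
# input, and this file is assumed to be saved as database.py):
#
#   python3 database.py create --database tokens.csv firmware.elf
#   python3 database.py report tokens.csv
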
import argparse
from datetime import datetime
import glob
import json
import logging
import os
from pathlib import Path
import re
import struct
import sys
from typing import (Any, Callable, Dict, Iterable, Iterator, List, Pattern,
                    Set, TextIO, Tuple, Union)

try:
    from pw_tokenizer import elf_reader, tokens
except ImportError:
    # Append this path to the module search path to allow running this module
    # without installing the pw_tokenizer package.
    sys.path.append(os.path.dirname(os.path.dirname(
        os.path.abspath(__file__))))
    from pw_tokenizer import elf_reader, tokens

_LOG = logging.getLogger('pw_tokenizer')


def _elf_reader(elf) -> elf_reader.Elf:
    return elf if isinstance(elf, elf_reader.Elf) else elf_reader.Elf(elf)


# Magic number used to indicate the beginning of a tokenized string entry. This
# value MUST match the value of _PW_TOKENIZER_ENTRY_MAGIC in
# pw_tokenizer/public/pw_tokenizer/internal/tokenize_string.h.
_TOKENIZED_ENTRY_MAGIC = 0xBAA98DEE
_ENTRY = struct.Struct('<4I')
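# Each tokenized entry in the ELF section begins with an _ENTRY header of four
# little-endian uint32s (magic, token, domain length, string length), followed
# by the null-terminated domain and string bytes. _read_tokenized_entries
# below parses this layout.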
_TOKENIZED_ENTRY_SECTIONS = re.compile(
    r'^\.pw_tokenizer.entries(?:\.[_\d]+)?$')

_LEGACY_STRING_SECTIONS = re.compile(
    r'^\.pw_tokenized\.(?P<domain>[^.]+)(?:\.\d+)?$')

_ERROR_HANDLER = 'surrogateescape'  # How to deal with UTF-8 decoding errors


class Error(Exception):
    """Failed to extract token entries from an ELF file."""


def _read_tokenized_entries(
        data: bytes,
        domain: Pattern[str]) -> Iterator[tokens.TokenizedStringEntry]:
    index = 0

    while index + _ENTRY.size <= len(data):
        magic, token, domain_len, string_len = _ENTRY.unpack_from(data, index)

        if magic != _TOKENIZED_ENTRY_MAGIC:
            raise Error(
                f'Expected magic number 0x{_TOKENIZED_ENTRY_MAGIC:08x}, '
                f'found 0x{magic:08x}')

        start = index + _ENTRY.size
        index = start + domain_len + string_len

        # Create the entries, trimming null terminators.
        entry = tokens.TokenizedStringEntry(
            token,
            data[start + domain_len:index - 1].decode(errors=_ERROR_HANDLER),
            data[start:start + domain_len - 1].decode(errors=_ERROR_HANDLER),
        )

        if data[start + domain_len - 1] != 0:
            raise Error(
                f'Domain {entry.domain} for {entry.string} not null terminated'
            )

        if data[index - 1] != 0:
            raise Error(f'String {entry.string} is not null terminated')

        if domain.fullmatch(entry.domain):
            yield entry


def _read_tokenized_strings(sections: Dict[str, bytes],
                            domain: Pattern[str]) -> Iterator[tokens.Database]:
    # Legacy ELF files used "default" as the default domain instead of "". Remap
    # the default if necessary.
    if domain.pattern == tokens.DEFAULT_DOMAIN:
        domain = re.compile('default')

    for section, data in sections.items():
        match = _LEGACY_STRING_SECTIONS.match(section)
        if match and domain.match(match.group('domain')):
            yield tokens.Database.from_strings(
                (s.decode(errors=_ERROR_HANDLER) for s in data.split(b'\0')),
                match.group('domain'))


def _database_from_elf(elf, domain: Pattern[str]) -> tokens.Database:
    """Reads the tokenized strings from an elf_reader.Elf or ELF file object."""
    _LOG.debug('Reading tokenized strings in domain "%s" from %s', domain, elf)

    reader = _elf_reader(elf)

    # Read tokenized string entries.
    section_data = reader.dump_section_contents(_TOKENIZED_ENTRY_SECTIONS)
    if section_data is not None:
        return tokens.Database(_read_tokenized_entries(section_data, domain))

    # Read legacy null-terminated string entries.
    sections = reader.dump_sections(_LEGACY_STRING_SECTIONS)
    if sections:
        return tokens.Database.merged(
            *_read_tokenized_strings(sections, domain))

    return tokens.Database([])


def tokenization_domains(elf) -> Iterator[str]:
    """Lists all tokenization domains in an ELF file."""
    reader = _elf_reader(elf)
    section_data = reader.dump_section_contents(_TOKENIZED_ENTRY_SECTIONS)
    if section_data is not None:
        yield from frozenset(
            e.domain
            for e in _read_tokenized_entries(section_data, re.compile('.*')))
    else:  # Check for the legacy domain sections
        for section in reader.sections:
            match = _LEGACY_STRING_SECTIONS.match(section.name)
            if match:
                yield match.group('domain')


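# As parsed below, each .pw_tokenizer.info record is a null-padded 12-byte key
# followed by a 32-bit value (the '12sI' struct format).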
def read_tokenizer_metadata(elf) -> Dict[str, int]:
    """Reads the metadata entries from an ELF."""
    sections = _elf_reader(elf).dump_section_contents(r'\.pw_tokenizer\.info')

    metadata: Dict[str, int] = {}
    if sections is not None:
        for key, value in struct.iter_unpack('12sI', sections):
            try:
                metadata[key.rstrip(b'\0').decode()] = value
            except UnicodeDecodeError as err:
                _LOG.error('Failed to decode metadata key %r: %s',
                           key.rstrip(b'\0'), err)

    return metadata


def _database_from_strings(strings: List[str]) -> tokens.Database:
    """Generates a C and C++ compatible database from untokenized strings."""
    # Generate a C compatible database from the fixed length hash.
    c_db = tokens.Database.from_strings(
        strings,
        tokenize=lambda string: tokens.pw_tokenizer_65599_hash(
            string, tokens.DEFAULT_C_HASH_LENGTH))

    # Generate a C++ compatible database by allowing the hash to follow the
    # string length.
    cpp_db = tokens.Database.from_strings(
        strings, tokenize=tokens.pw_tokenizer_65599_hash)

    # Use a union of the C and C++ compatible databases.
    return tokens.Database.merged(c_db, cpp_db)


def _database_from_json(fd) -> tokens.Database:
    return _database_from_strings(json.load(fd))


def _load_token_database(db, domain: Pattern[str]) -> tokens.Database:
    """Loads a Database from supported database types.

    Supports Database objects, JSONs, ELFs, CSVs, and binary databases.
    """
    if db is None:
        return tokens.Database()

    if isinstance(db, tokens.Database):
        return db

    if isinstance(db, elf_reader.Elf):
        return _database_from_elf(db, domain)

    # If it's a str, it might be a path. Check if it's an ELF, CSV, or JSON.
    if isinstance(db, (str, Path)):
        if not os.path.exists(db):
            raise FileNotFoundError(
                f'"{db}" is not a path to a token database')

        # Read the path as an ELF file.
        with open(db, 'rb') as fd:
            if elf_reader.compatible_file(fd):
                return _database_from_elf(fd, domain)

        # Generate a database from JSON.
        if str(db).endswith('.json'):
            with open(db, 'r') as json_fd:
                return _database_from_json(json_fd)

        # Read the path as a packed binary or CSV file.
        return tokens.DatabaseFile(db)

    # Assume that it's a file object and check if it's an ELF.
    if elf_reader.compatible_file(db):
        return _database_from_elf(db, domain)

    # Read the database as JSON, CSV, or packed binary from a file object's
    # path.
    if hasattr(db, 'name') and os.path.exists(db.name):
        if db.name.endswith('.json'):
            return _database_from_json(db)

        return tokens.DatabaseFile(db.name)

    # Read CSV directly from the file object.
    return tokens.Database(tokens.parse_csv(db))


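# Example usage (a sketch; the input paths are hypothetical):
#
#   db = load_token_database('firmware.elf', 'legacy_tokens.csv', domain='.*')
#   print(len(db.entries()), 'entries loaded')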
def load_token_database(
    *databases,
    domain: Union[str,
                  Pattern[str]] = tokens.DEFAULT_DOMAIN) -> tokens.Database:
    """Loads a Database from supported database types.

    Supports Database objects, JSONs, ELFs, CSVs, and binary databases.
    """
    domain = re.compile(domain)
    return tokens.Database.merged(*(_load_token_database(db, domain)
                                    for db in databases))


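# The summary below is a plain dict; an illustrative (made-up) result:
#
#   {'present_entries': 3, 'present_size_bytes': 41, 'total_entries': 4,
#    'total_size_bytes': 53, 'collisions': {}}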
def database_summary(db: tokens.Database) -> Dict[str, Any]:
    """Returns a simple report of properties of the database."""
    present = [entry for entry in db.entries() if not entry.date_removed]
    collisions = {
        token: list(e.string for e in entries)
        for token, entries in db.collisions()
    }

    # Add 1 to each string's size to account for the null terminator.
    return dict(
        present_entries=len(present),
        present_size_bytes=sum(len(entry.string) + 1 for entry in present),
        total_entries=len(db.entries()),
        total_size_bytes=sum(len(entry.string) + 1 for entry in db.entries()),
        collisions=collisions,
    )


_DatabaseReport = Dict[str, Dict[str, Dict[str, Any]]]


def generate_reports(paths: Iterable[Path]) -> _DatabaseReport:
    """Returns a dictionary with information about the provided databases."""
    reports: _DatabaseReport = {}

    for path in paths:
        with path.open('rb') as file:
            if elf_reader.compatible_file(file):
                domains = list(tokenization_domains(file))
            else:
                domains = ['']

        domain_reports = {}

        for domain in domains:
            domain_reports[domain] = database_summary(
                load_token_database(path, domain=domain))

        reports[str(path)] = domain_reports

    return reports


def _handle_create(databases, database, force, output_type, include, exclude,
                   replace):
    """Creates a token database file from one or more ELF files."""

    if database == '-':
        # Must write bytes to stdout; use sys.stdout.buffer.
        fd = sys.stdout.buffer
    elif not force and os.path.exists(database):
        raise FileExistsError(
            f'The file {database} already exists! Use --force to overwrite.')
    else:
        fd = open(database, 'wb')

    database = tokens.Database.merged(*databases)
    database.filter(include, exclude, replace)

    with fd:
        if output_type == 'csv':
            tokens.write_csv(database, fd)
        elif output_type == 'binary':
            tokens.write_binary(database, fd)
        else:
            raise ValueError(f'Unknown database type "{output_type}"')

    _LOG.info('Wrote database with %d entries to %s as %s', len(database),
              fd.name, output_type)


def _handle_add(token_database, databases):
    initial = len(token_database)

    for source in databases:
        token_database.add(source.entries())

    token_database.write_to_file()

    _LOG.info('Added %d entries to %s',
              len(token_database) - initial, token_database.path)


def _handle_mark_removed(token_database, databases, date):
    marked_removed = token_database.mark_removed(
        (entry for entry in tokens.Database.merged(*databases).entries()
         if not entry.date_removed), date)

    token_database.write_to_file()

    _LOG.info('Marked %d of %d entries as removed in %s', len(marked_removed),
              len(token_database), token_database.path)


def _handle_purge(token_database, before):
    purged = token_database.purge(before)
    token_database.write_to_file()

    _LOG.info('Removed %d entries from %s', len(purged), token_database.path)


def _handle_report(token_database_or_elf: List[Path], output: TextIO) -> None:
    json.dump(generate_reports(token_database_or_elf), output, indent=2)
    output.write('\n')


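# Example (hypothetical paths): expand_paths_or_globs('out/**/*.elf',
# 'tokens.csv') yields each matching ELF plus tokens.csv as Path objects.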
def expand_paths_or_globs(*paths_or_globs: str) -> Iterable[Path]:
    """Expands any globs in a list of paths; raises FileNotFoundError."""
    for path_or_glob in paths_or_globs:
        if os.path.exists(path_or_glob):
            # This is a valid path; yield it without evaluating it as a glob.
            yield Path(path_or_glob)
        else:
            paths = glob.glob(path_or_glob, recursive=True)

            # If no paths were found and the path is not a glob, raise an Error.
            if not paths and not any(c in path_or_glob for c in '*?[]!'):
                raise FileNotFoundError(f'{path_or_glob} is not a valid path')

            for path in paths:
                # Resolve globs to JSON, CSV, or compatible binary files.
                if elf_reader.compatible_file(path) or path.endswith(
                    ('.csv', '.json')):
                    yield Path(path)


class ExpandGlobs(argparse.Action):
    """Argparse action that expands and appends paths."""
    def __call__(self, parser, namespace, values, unused_option_string=None):
        setattr(namespace, self.dest, list(expand_paths_or_globs(*values)))


def _read_elf_with_domain(elf: str,
                          domain: Pattern[str]) -> Iterable[tokens.Database]:
    for path in expand_paths_or_globs(elf):
        with path.open('rb') as file:
            if not elf_reader.compatible_file(file):
                raise ValueError(f'{elf} is not an ELF file, '
                                 f'but the "{domain}" domain was specified')

            yield _database_from_elf(file, domain)


class LoadTokenDatabases(argparse.Action):
    """Argparse action that reads token databases from paths or globs.

    ELF files may have #domain appended to them to specify a tokenization domain
    other than the default.
    """
    def __call__(self, parser, namespace, values, option_string=None):
        databases: List[tokens.Database] = []
        paths: Set[Path] = set()

        try:
            for value in values:
                if value.count('#') == 1:
                    path, domain = value.split('#')
                    domain = re.compile(domain)
                    databases.extend(_read_elf_with_domain(path, domain))
                else:
                    paths.update(expand_paths_or_globs(value))

            for path in paths:
                databases.append(load_token_database(path))
        except tokens.DatabaseFormatError as err:
            parser.error(
                f'argument elf_or_token_database: {path} is not a supported '
                'token database file. Only ELF files or token databases (CSV '
                f'or binary format) are supported. {err}. ')
        except FileNotFoundError as err:
            parser.error(f'argument elf_or_token_database: {err}')
        except:  # pylint: disable=bare-except
            _LOG.exception('Failed to load token database %s', path)
            parser.error('argument elf_or_token_database: '
                         f'Error occurred while loading token database {path}')

        setattr(namespace, self.dest, databases)


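# Example of reusing these arguments in another tool's parser (a sketch; the
# input paths are hypothetical):
#
#   parser = argparse.ArgumentParser(parents=[token_databases_parser()])
#   args = parser.parse_args(['firmware.elf#TEST_DOMAIN', 'tokens.csv'])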
def token_databases_parser(nargs: str = '+') -> argparse.ArgumentParser:
    """Returns an argument parser for reading token databases.

    These arguments can be added to another parser using the parents arg.
    """
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument(
        'databases',
        metavar='elf_or_token_database',
        nargs=nargs,
        action=LoadTokenDatabases,
        help=('ELF or token database files from which to read strings and '
              'tokens. For ELF files, the tokenization domain to read from '
              'may be specified after the path as #domain_name (e.g. '
              'foo.elf#TEST_DOMAIN). Unless specified, only the default '
              'domain ("") is read from ELF files; .* reads all domains. '
              'Globs are expanded to compatible database files.'))
    return parser


def _parse_args():
    """Parse and return command line arguments."""
    def year_month_day(value) -> datetime:
        if value == 'today':
            return datetime.now()

        return datetime.strptime(value, tokens.DATE_FORMAT)

    year_month_day.__name__ = 'year-month-day (YYYY-MM-DD)'

    # Shared command line options.
    option_db = argparse.ArgumentParser(add_help=False)
    option_db.add_argument('-d',
                           '--database',
                           dest='token_database',
                           type=tokens.DatabaseFile,
                           required=True,
                           help='The database file to update.')

    option_tokens = token_databases_parser('*')

    # Top-level argument parser.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.set_defaults(handler=lambda **_: parser.print_help())

    subparsers = parser.add_subparsers(
        help='Tokenized string database management actions:')

    # The 'create' command creates a database file.
    subparser = subparsers.add_parser(
        'create',
        parents=[option_tokens],
        help=
        'Creates a database with tokenized strings from one or more sources.')
    subparser.set_defaults(handler=_handle_create)
    subparser.add_argument(
        '-d',
        '--database',
        required=True,
        help='Path to the database file to create; use - for stdout.')
    subparser.add_argument(
        '-t',
        '--type',
        dest='output_type',
        choices=('csv', 'binary'),
        default='csv',
        help='Which type of database to create. (default: csv)')
    subparser.add_argument('-f',
                           '--force',
                           action='store_true',
                           help='Overwrite the database if it exists.')
    subparser.add_argument(
        '-i',
        '--include',
        type=re.compile,
        default=[],
        action='append',
        help=('If provided, at least one of these regular expressions must '
              'match for a string to be included in the database.'))
    subparser.add_argument(
        '-e',
        '--exclude',
        type=re.compile,
        default=[],
        action='append',
        help=('If provided, none of these regular expressions may match for a '
              'string to be included in the database.'))

    unescaped_slash = re.compile(r'(?<!\\)/')

    def replacement(value: str) -> Tuple[Pattern, str]:
        try:
            find, sub = unescaped_slash.split(value, 1)
        except ValueError as err:
            raise argparse.ArgumentTypeError(
                'replacements must be specified as "search_regex/replacement"'
            ) from err

        try:
            return re.compile(find.replace(r'\/', '/')), sub
        except re.error as err:
            raise argparse.ArgumentTypeError(
                f'"{value}" is not a valid regular expression: {err}')

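    # Example (hypothetical): --replace '(?i)password/REDACTED' rewrites any
    # text matching the regex "(?i)password" to "REDACTED" in every string
    # before the database is written.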
    subparser.add_argument(
        '--replace',
        type=replacement,
        default=[],
        action='append',
        help=('If provided, replaces text that matches a regular expression. '
              'This can be used to replace sensitive terms in a token '
              'database that will be distributed publicly. The expression and '
              'replacement are specified as "search_regex/replacement". '
              'Plain slash characters in the regex must be escaped with a '
              r'backslash (\/). The replacement text may include '
              'backreferences for captured groups in the regex.'))

    # The 'add' command adds strings to a database from a set of ELFs.
    subparser = subparsers.add_parser(
        'add',
        parents=[option_db, option_tokens],
        help=(
            'Adds new strings to a database with tokenized strings from a set '
            'of ELF files or other token databases. Missing entries are NOT '
            'marked as removed.'))
    subparser.set_defaults(handler=_handle_add)

    # The 'mark_removed' command marks removed entries to match a set of ELFs.
    subparser = subparsers.add_parser(
        'mark_removed',
        parents=[option_db, option_tokens],
        help=(
            'Updates a database with tokenized strings from a set of strings. '
            'Strings not present in the set remain in the database but are '
            'marked as removed. New strings are NOT added.'))
    subparser.set_defaults(handler=_handle_mark_removed)
    subparser.add_argument(
        '--date',
        type=year_month_day,
        help=('The removal date to use for all strings. '
              'May be YYYY-MM-DD or "today". (default: today)'))

    # The 'purge' command removes old entries.
    subparser = subparsers.add_parser(
        'purge',
        parents=[option_db],
        help='Purges removed strings from a database.')
    subparser.set_defaults(handler=_handle_purge)
    subparser.add_argument(
        '-b',
        '--before',
        type=year_month_day,
        help=('Delete all entries removed on or before this date. '
              'May be YYYY-MM-DD or "today".'))

    # The 'report' command prints a report about a database.
    subparser = subparsers.add_parser('report',
                                      help='Prints a report about a database.')
    subparser.set_defaults(handler=_handle_report)
    subparser.add_argument(
        'token_database_or_elf',
        nargs='+',
        action=ExpandGlobs,
        help='The ELF files or token databases about which to generate reports.'
    )
    subparser.add_argument(
        '-o',
        '--output',
        type=argparse.FileType('w'),
        default=sys.stdout,
        help='The file to which to write the output; use - for stdout.')

    args = parser.parse_args()

    handler = args.handler
    del args.handler

    return handler, args


def _init_logging(level: int) -> None:
    _LOG.setLevel(logging.DEBUG)
    log_to_stderr = logging.StreamHandler()
    log_to_stderr.setLevel(level)
    log_to_stderr.setFormatter(
        logging.Formatter(
            fmt='%(asctime)s.%(msecs)03d-%(levelname)s: %(message)s',
            datefmt='%H:%M:%S'))

    _LOG.addHandler(log_to_stderr)


def _main(handler: Callable, args: argparse.Namespace) -> int:
    _init_logging(logging.INFO)
    handler(**vars(args))
    return 0


if __name__ == '__main__':
    sys.exit(_main(*_parse_args()))