#!/usr/bin/env python3
# Copyright 2020 The Pigweed Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
"""Creates and manages token databases.

This module manages reading tokenized strings from ELF files and building and
maintaining token databases.
"""

import argparse
from datetime import datetime
import glob
import itertools
import json
import logging
import os
from pathlib import Path
import re
import struct
import sys
from typing import (
    Any,
    Callable,
    Iterable,
    Iterator,
    Pattern,
    Set,
    TextIO,
)

try:
    from pw_tokenizer import elf_reader, tokens
except ImportError:
    # Append this path to the module search path to allow running this module
    # without installing the pw_tokenizer package.
    sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    from pw_tokenizer import elf_reader, tokens

_LOG = logging.getLogger('pw_tokenizer')


def _elf_reader(elf) -> elf_reader.Elf:
    return elf if isinstance(elf, elf_reader.Elf) else elf_reader.Elf(elf)


# Magic number used to indicate the beginning of a tokenized string entry. This
# value MUST match the value of _PW_TOKENIZER_ENTRY_MAGIC in
# pw_tokenizer/public/pw_tokenizer/internal/tokenize_string.h.
_TOKENIZED_ENTRY_MAGIC = 0xBAA98DEE
_ENTRY = struct.Struct('<4I')
_TOKENIZED_ENTRY_SECTIONS = re.compile(r'^\.pw_tokenizer.entries(?:\.[_\d]+)?$')

_ERROR_HANDLER = 'surrogateescape'  # How to deal with UTF-8 decoding errors


class Error(Exception):
    """Failed to extract token entries from an ELF file."""


def _read_tokenized_entries(
    data: bytes, domain: Pattern[str]
) -> Iterator[tokens.TokenizedStringEntry]:
    index = 0

    while index + _ENTRY.size <= len(data):
        magic, token, domain_len, string_len = _ENTRY.unpack_from(data, index)

        if magic != _TOKENIZED_ENTRY_MAGIC:
            raise Error(
                f'Expected magic number 0x{_TOKENIZED_ENTRY_MAGIC:08x}, '
                f'found 0x{magic:08x}'
            )

        start = index + _ENTRY.size
        index = start + domain_len + string_len

        # Create the entries, trimming null terminators.
        entry = tokens.TokenizedStringEntry(
            token,
            data[start + domain_len : index - 1].decode(errors=_ERROR_HANDLER),
            data[start : start + domain_len - 1].decode(errors=_ERROR_HANDLER),
        )

        if data[start + domain_len - 1] != 0:
            raise Error(
                f'Domain {entry.domain} for {entry.string} not null terminated'
            )

        if data[index - 1] != 0:
            raise Error(f'String {entry.string} is not null terminated')

        if domain.fullmatch(entry.domain):
            yield entry


def _database_from_elf(elf, domain: Pattern[str]) -> tokens.Database:
    """Reads the tokenized strings from an elf_reader.Elf or ELF file object."""
    _LOG.debug('Reading tokenized strings in domain "%s" from %s', domain, elf)

    reader = _elf_reader(elf)

    # Read tokenized string entries.
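    # Note on the section layout (summarizing what _read_tokenized_entries
    # above parses): each entry starts with an _ENTRY header of four
    # little-endian uint32s (magic, token, domain length, string length),
    # followed by the null-terminated domain and the null-terminated string.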
    section_data = reader.dump_section_contents(_TOKENIZED_ENTRY_SECTIONS)
    if section_data is not None:
        return tokens.Database(_read_tokenized_entries(section_data, domain))

    return tokens.Database([])


def tokenization_domains(elf) -> Iterator[str]:
    """Lists all tokenization domains in an ELF file."""
    reader = _elf_reader(elf)
    section_data = reader.dump_section_contents(_TOKENIZED_ENTRY_SECTIONS)
    if section_data is not None:
        yield from frozenset(
            e.domain
            for e in _read_tokenized_entries(section_data, re.compile('.*'))
        )


def read_tokenizer_metadata(elf) -> dict[str, int]:
    """Reads the metadata entries from an ELF."""
    sections = _elf_reader(elf).dump_section_contents(r'\.pw_tokenizer\.info')

    metadata: dict[str, int] = {}
    if sections is not None:
        for key, value in struct.iter_unpack('12sI', sections):
            try:
                metadata[key.rstrip(b'\0').decode()] = value
            except UnicodeDecodeError as err:
                _LOG.error(
                    'Failed to decode metadata key %r: %s',
                    key.rstrip(b'\0'),
                    err,
                )

    return metadata


def _database_from_strings(strings: list[str]) -> tokens.Database:
    """Generates a C and C++ compatible database from untokenized strings."""
    # Generate a C-compatible database from the fixed length hash.
    c_db = tokens.Database.from_strings(strings, tokenize=tokens.c_hash)

    # Generate a C++ compatible database by allowing the hash to follow the
    # string length.
    cpp_db = tokens.Database.from_strings(
        strings, tokenize=tokens.pw_tokenizer_65599_hash
    )

    # Use a union of the C and C++ compatible databases.
    return tokens.Database.merged(c_db, cpp_db)


def _database_from_json(fd) -> tokens.Database:
    return _database_from_strings(json.load(fd))


def _load_token_database(  # pylint: disable=too-many-return-statements
    db, domain: Pattern[str]
) -> tokens.Database:
    """Loads a Database from supported database types.

    Supports Database objects, JSONs, ELFs, CSVs, and binary databases.
    """
    if db is None:
        return tokens.Database()

    if isinstance(db, tokens.Database):
        return db

    if isinstance(db, elf_reader.Elf):
        return _database_from_elf(db, domain)

    # If it's a str, it might be a path. Check if it's an ELF, CSV, or JSON.
    if isinstance(db, (str, Path)):
        if not os.path.exists(db):
            raise FileNotFoundError(f'"{db}" is not a path to a token database')

        if Path(db).is_dir():
            return tokens.DatabaseFile.load(Path(db))

        # Read the path as an ELF file.
        with open(db, 'rb') as fd:
            if elf_reader.compatible_file(fd):
                return _database_from_elf(fd, domain)

        # Generate a database from JSON.
        if str(db).endswith('.json'):
            with open(db, 'r', encoding='utf-8') as json_fd:
                return _database_from_json(json_fd)

        # Read the path as a packed binary or CSV file.
        return tokens.DatabaseFile.load(Path(db))

    # Assume that it's a file object and check if it's an ELF.
    if elf_reader.compatible_file(db):
        return _database_from_elf(db, domain)

    # Read the database as JSON, CSV, or packed binary from a file object's
    # path.
    if hasattr(db, 'name') and os.path.exists(db.name):
        if db.name.endswith('.json'):
            return _database_from_json(db)

        return tokens.DatabaseFile.load(Path(db.name))

    # Read CSV directly from the file object.
    return tokens.Database(tokens.parse_csv(db))


def load_token_database(
    *databases, domain: str | Pattern[str] = tokens.DEFAULT_DOMAIN
) -> tokens.Database:
    """Loads a Database from supported database types.

    Supports Database objects, JSONs, ELFs, CSVs, and binary databases.
    """
    domain = re.compile(domain)
    return tokens.Database.merged(
        *(_load_token_database(db, domain) for db in databases)
    )


def database_summary(db: tokens.Database) -> dict[str, Any]:
    """Returns a simple report of properties of the database."""
    present = [entry for entry in db.entries() if not entry.date_removed]
    collisions = {
        token: list(e.string for e in entries)
        for token, entries in db.collisions()
    }

    # Add 1 to each string's size to account for the null terminator.
    return dict(
        present_entries=len(present),
        present_size_bytes=sum(len(entry.string) + 1 for entry in present),
        total_entries=len(db.entries()),
        total_size_bytes=sum(len(entry.string) + 1 for entry in db.entries()),
        collisions=collisions,
    )


_DatabaseReport = dict[str, dict[str, dict[str, Any]]]


def generate_reports(paths: Iterable[Path]) -> _DatabaseReport:
    """Returns a dictionary with information about the provided databases."""
    reports: _DatabaseReport = {}

    for path in paths:
        domains = ['']
        if path.is_file():
            with path.open('rb') as file:
                if elf_reader.compatible_file(file):
                    domains = list(tokenization_domains(file))

        domain_reports = {}

        for domain in domains:
            domain_reports[domain] = database_summary(
                load_token_database(path, domain=domain)
            )

        reports[str(path)] = domain_reports

    return reports


def _handle_create(
    databases,
    database: Path,
    force: bool,
    output_type: str,
    include: list,
    exclude: list,
    replace: list,
) -> None:
    """Creates a token database file from one or more ELF files."""
    if not force and database.exists():
        raise FileExistsError(
            f'The file {database} already exists! Use --force to overwrite.'
        )

    if not database.parent.exists():
        database.parent.mkdir(parents=True)

    if output_type == 'directory':
        if str(database) == '-':
            raise ValueError(
                'Cannot specify "-" (stdout) for directory databases'
            )

        database.mkdir(exist_ok=True)
        database = database / f'database{tokens.DIR_DB_SUFFIX}'
        output_type = 'csv'

    if str(database) == '-':
        # Must write bytes to stdout; use sys.stdout.buffer.
        fd = sys.stdout.buffer
    else:
        fd = database.open('wb')

    db = tokens.Database.merged(*databases)
    db.filter(include, exclude, replace)

    with fd:
        if output_type == 'csv':
            tokens.write_csv(db, fd)
        elif output_type == 'binary':
            tokens.write_binary(db, fd)
        else:
            raise ValueError(f'Unknown database type "{output_type}"')

    _LOG.info(
        'Wrote database with %d entries to %s as %s',
        len(db),
        fd.name,
        output_type,
    )


def _handle_add(
    token_database: tokens.DatabaseFile,
    databases: list[tokens.Database],
    commit: str | None,
) -> None:
    """Adds strings from ELF files or other token databases to a database."""
    initial = len(token_database)
    if commit:
        entries = itertools.chain.from_iterable(
            db.entries() for db in databases
        )
        token_database.add_and_discard_temporary(entries, commit)
    else:
        for source in databases:
            token_database.add(source.entries())

    token_database.write_to_file()

    number_of_changes = len(token_database) - initial

    if number_of_changes:
        _LOG.info(
            'Added %d entries to %s', number_of_changes, token_database.path
        )


def _handle_mark_removed(
    token_database: tokens.DatabaseFile,
    databases: list[tokens.Database],
    date: datetime | None,
):
    """Marks entries not present in the given databases as removed."""
    marked_removed = token_database.mark_removed(
        (
            entry
            for entry in tokens.Database.merged(*databases).entries()
            if not entry.date_removed
        ),
        date,
    )

    token_database.write_to_file(rewrite=True)

    _LOG.info(
        'Marked %d of %d entries as removed in %s',
        len(marked_removed),
        len(token_database),
        token_database.path,
    )


def _handle_purge(token_database: tokens.DatabaseFile, before: datetime | None):
    """Purges entries that are marked as removed from the database."""
    purged = token_database.purge(before)
    token_database.write_to_file(rewrite=True)

    _LOG.info('Removed %d entries from %s', len(purged), token_database.path)


def _handle_report(token_database_or_elf: list[Path], output: TextIO) -> None:
    """Writes a JSON report about the given databases to the output."""
    json.dump(generate_reports(token_database_or_elf), output, indent=2)
    output.write('\n')


def expand_paths_or_globs(*paths_or_globs: str) -> Iterable[Path]:
    """Expands any globs in a list of paths; raises FileNotFoundError."""
    for path_or_glob in paths_or_globs:
        if os.path.exists(path_or_glob):
            # This is a valid path; yield it without evaluating it as a glob.
            yield Path(path_or_glob)
        else:
            paths = glob.glob(path_or_glob, recursive=True)

            # If no paths were found and the path is not a glob, raise an Error.
            if not paths and not any(c in path_or_glob for c in '*?[]!'):
                raise FileNotFoundError(f'{path_or_glob} is not a valid path')

            for path in paths:
                # Resolve globs to JSON, CSV, or compatible binary files.
                if elf_reader.compatible_file(path) or path.endswith(
                    ('.csv', '.json')
                ):
                    yield Path(path)


class ExpandGlobs(argparse.Action):
    """Argparse action that expands and appends paths."""

    def __call__(self, parser, namespace, values, unused_option_string=None):
        setattr(namespace, self.dest, list(expand_paths_or_globs(*values)))


def _read_elf_with_domain(
    elf: str, domain: Pattern[str]
) -> Iterable[tokens.Database]:
    for path in expand_paths_or_globs(elf):
        with path.open('rb') as file:
            if not elf_reader.compatible_file(file):
                raise ValueError(
                    f'{elf} is not an ELF file, '
                    f'but the "{domain}" domain was specified'
                )

            yield _database_from_elf(file, domain)


class LoadTokenDatabases(argparse.Action):
    """Argparse action that reads token databases from paths or globs.

    ELF files may have #domain appended to them to specify a tokenization
    domain other than the default.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        databases: list[tokens.Database] = []
        paths: Set[Path] = set()

        try:
            for value in values:
                if value.count('#') == 1:
                    path, domain = value.split('#')
                    domain = re.compile(domain)
                    databases.extend(_read_elf_with_domain(path, domain))
                else:
                    paths.update(expand_paths_or_globs(value))

            for path in paths:
                databases.append(load_token_database(path))
        except tokens.DatabaseFormatError as err:
            parser.error(
                f'argument elf_or_token_database: {path} is not a supported '
                'token database file. Only ELF files or token databases (CSV '
                f'or binary format) are supported. {err}. '
            )
        except FileNotFoundError as err:
            parser.error(f'argument elf_or_token_database: {err}')
        except:  # pylint: disable=bare-except
            _LOG.exception('Failed to load token database %s', path)
            parser.error(
                'argument elf_or_token_database: '
                f'Error occurred while loading token database {path}'
            )

        setattr(namespace, self.dest, databases)


def token_databases_parser(nargs: str = '+') -> argparse.ArgumentParser:
    """Returns an argument parser for reading token databases.

    These arguments can be added to another parser using the parents arg.
    """
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument(
        'databases',
        metavar='elf_or_token_database',
        nargs=nargs,
        action=LoadTokenDatabases,
        help=(
            'ELF or token database files from which to read strings and '
            'tokens. For ELF files, the tokenization domain to read from '
            'may be specified after the path as #domain_name (e.g. '
            'foo.elf#TEST_DOMAIN). Unless specified, only the default '
            'domain ("") is read from ELF files; .* reads all domains. '
            'Globs are expanded to compatible database files.'
        ),
    )
    return parser


def _parse_args():
    """Parse and return command line arguments."""

    def year_month_day(value) -> datetime:
        if value == 'today':
            return datetime.now()

        return datetime.fromisoformat(value)

    year_month_day.__name__ = 'year-month-day (YYYY-MM-DD)'

    # Shared command line options.
    option_db = argparse.ArgumentParser(add_help=False)
    option_db.add_argument(
        '-d',
        '--database',
        dest='token_database',
        type=lambda arg: tokens.DatabaseFile.load(Path(arg)),
        required=True,
        help='The database file to update.',
    )

    option_tokens = token_databases_parser('*')

    # Top-level argument parser.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.set_defaults(handler=lambda **_: parser.print_help())

    subparsers = parser.add_subparsers(
        help='Tokenized string database management actions:'
    )

    # The 'create' command creates a database file.
    subparser = subparsers.add_parser(
        'create',
        parents=[option_tokens],
        help=(
            'Creates a database with tokenized strings from one or more '
            'sources.'
        ),
    )
    subparser.set_defaults(handler=_handle_create)
    subparser.add_argument(
        '-d',
        '--database',
        required=True,
        type=Path,
        help='Path to the database file to create; use - for stdout.',
    )
    subparser.add_argument(
        '-t',
        '--type',
        dest='output_type',
        choices=('csv', 'binary', 'directory'),
        default='csv',
        help='Which type of database to create. (default: csv)',
    )
    subparser.add_argument(
        '-f',
        '--force',
        action='store_true',
        help='Overwrite the database if it exists.',
    )
    subparser.add_argument(
        '-i',
        '--include',
        type=re.compile,
        default=[],
        action='append',
        help=(
            'If provided, at least one of these regular expressions must '
            'match for a string to be included in the database.'
        ),
    )
    subparser.add_argument(
        '-e',
        '--exclude',
        type=re.compile,
        default=[],
        action='append',
        help=(
            'If provided, none of these regular expressions may match for a '
            'string to be included in the database.'
        ),
    )

    unescaped_slash = re.compile(r'(?<!\\)/')

    def replacement(value: str) -> tuple[Pattern, str]:
        try:
            find, sub = unescaped_slash.split(value, 1)
        except ValueError as _err:
            raise argparse.ArgumentTypeError(
                'replacements must be specified as "search_regex/replacement"'
            )

        try:
            return re.compile(find.replace(r'\/', '/')), sub
        except re.error as err:
            raise argparse.ArgumentTypeError(
                f'"{value}" is not a valid regular expression: {err}'
            )

    subparser.add_argument(
        '--replace',
        type=replacement,
        default=[],
        action='append',
        help=(
            'If provided, replaces text that matches a regular expression. '
            'This can be used to replace sensitive terms in a token '
            'database that will be distributed publicly. The expression and '
            'replacement are specified as "search_regex/replacement". '
            'Plain slash characters in the regex must be escaped with a '
            r'backslash (\/). The replacement text may include '
            'backreferences for captured groups in the regex.'
        ),
    )

    # The 'add' command adds strings to a database from a set of ELFs.
    subparser = subparsers.add_parser(
        'add',
        parents=[option_db, option_tokens],
        help=(
            'Adds new strings to a database with tokenized strings from a set '
            'of ELF files or other token databases. Missing entries are NOT '
            'marked as removed.'
        ),
    )
    subparser.set_defaults(handler=_handle_add)
    subparser.add_argument(
        '--discard-temporary',
        dest='commit',
        help=(
            'Deletes temporary tokens in memory and on disk when a CSV exists '
            'within a commit. Afterwards, new strings are added to the '
            'database from a set of ELF files or other token databases. '
            'Missing entries are NOT marked as removed.'
        ),
    )

    # The 'mark_removed' command marks removed entries to match a set of ELFs.
    subparser = subparsers.add_parser(
        'mark_removed',
        parents=[option_db, option_tokens],
        help=(
            'Updates a database with tokenized strings from a set of strings. '
            'Strings not present in the set remain in the database but are '
            'marked as removed. New strings are NOT added.'
        ),
    )
    subparser.set_defaults(handler=_handle_mark_removed)
    subparser.add_argument(
        '--date',
        type=year_month_day,
        help=(
            'The removal date to use for all strings. '
            'May be YYYY-MM-DD or "today". (default: today)'
        ),
    )

    # The 'purge' command removes old entries.
    subparser = subparsers.add_parser(
        'purge',
        parents=[option_db],
        help='Purges removed strings from a database.',
    )
    subparser.set_defaults(handler=_handle_purge)
    subparser.add_argument(
        '-b',
        '--before',
        type=year_month_day,
        help=(
            'Delete all entries removed on or before this date. '
            'May be YYYY-MM-DD or "today".'
        ),
    )

    # The 'report' command prints a report about a database.
    subparser = subparsers.add_parser(
        'report', help='Prints a report about a database.'
    )
    subparser.set_defaults(handler=_handle_report)
    subparser.add_argument(
        'token_database_or_elf',
        nargs='+',
        action=ExpandGlobs,
        help=(
            'The ELF files or token databases about which to generate '
            'reports.'
        ),
    )
    subparser.add_argument(
        '-o',
        '--output',
        type=argparse.FileType('w'),
        default=sys.stdout,
        help='The file to which to write the output; use - for stdout.',
    )

    args = parser.parse_args()

    handler = args.handler
    del args.handler

    return handler, args


def _init_logging(level: int) -> None:
    _LOG.setLevel(logging.DEBUG)
    log_to_stderr = logging.StreamHandler()
    log_to_stderr.setLevel(level)
    log_to_stderr.setFormatter(
        logging.Formatter(
            fmt='%(asctime)s.%(msecs)03d-%(levelname)s: %(message)s',
            datefmt='%H:%M:%S',
        )
    )

    _LOG.addHandler(log_to_stderr)


def _main(handler: Callable, args: argparse.Namespace) -> int:
    _init_logging(logging.INFO)
    handler(**vars(args))
    return 0


if __name__ == '__main__':
    sys.exit(_main(*_parse_args()))
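
# Example invocations of the subcommands defined above (illustrative only;
# the ELF and database file names below are hypothetical):
#
#   database.py create --database tokens.csv --type csv firmware.elf
#   database.py add --database tokens.csv firmware.elf
#   database.py mark_removed --database tokens.csv --date today firmware.elf
#   database.py purge --database tokens.csv --before 2020-01-01
#   database.py report firmware.elf --output report.json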