#!/usr/bin/env python3
# Copyright 2020 The Pigweed Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
"""Creates and manages token databases.

This module manages reading tokenized strings from ELF files and building and
maintaining token databases.
"""

import argparse
from datetime import datetime
import glob
import itertools
import json
import logging
import os
from pathlib import Path
import re
import struct
import sys
from typing import (
    Any,
    Callable,
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Pattern,
    Set,
    TextIO,
    Tuple,
    Union,
)

try:
    from pw_tokenizer import elf_reader, tokens
except ImportError:
    # Append this path to the module search path to allow running this module
    # without installing the pw_tokenizer package.
    sys.path.append(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    )
    from pw_tokenizer import elf_reader, tokens

_LOG = logging.getLogger('pw_tokenizer')


def _elf_reader(elf) -> elf_reader.Elf:
    return elf if isinstance(elf, elf_reader.Elf) else elf_reader.Elf(elf)


# Magic number used to indicate the beginning of a tokenized string entry. This
# value MUST match the value of _PW_TOKENIZER_ENTRY_MAGIC in
# pw_tokenizer/public/pw_tokenizer/internal/tokenize_string.h.
_TOKENIZED_ENTRY_MAGIC = 0xBAA98DEE
_ENTRY = struct.Struct('<4I')
_TOKENIZED_ENTRY_SECTIONS = re.compile(
    r'^\.pw_tokenizer.entries(?:\.[_\d]+)?$'
)

_ERROR_HANDLER = 'surrogateescape'  # How to deal with UTF-8 decoding errors


class Error(Exception):
    """Failed to extract token entries from an ELF file."""


def _read_tokenized_entries(
    data: bytes, domain: Pattern[str]
) -> Iterator[tokens.TokenizedStringEntry]:
    index = 0

    while index + _ENTRY.size <= len(data):
        magic, token, domain_len, string_len = _ENTRY.unpack_from(data, index)

        if magic != _TOKENIZED_ENTRY_MAGIC:
            raise Error(
                f'Expected magic number 0x{_TOKENIZED_ENTRY_MAGIC:08x}, '
                f'found 0x{magic:08x}'
            )

        start = index + _ENTRY.size
        index = start + domain_len + string_len

        # Create the entries, trimming null terminators.
        entry = tokens.TokenizedStringEntry(
            token,
            data[start + domain_len : index - 1].decode(errors=_ERROR_HANDLER),
            data[start : start + domain_len - 1].decode(errors=_ERROR_HANDLER),
        )

        if data[start + domain_len - 1] != 0:
            raise Error(
                f'Domain {entry.domain} for {entry.string} not null terminated'
            )

        if data[index - 1] != 0:
            raise Error(f'String {entry.string} is not null terminated')

        if domain.fullmatch(entry.domain):
            yield entry


def _database_from_elf(elf, domain: Pattern[str]) -> tokens.Database:
    """Reads the tokenized strings from an elf_reader.Elf or ELF file object."""
    _LOG.debug('Reading tokenized strings in domain "%s" from %s', domain, elf)

    reader = _elf_reader(elf)

    # Read tokenized string entries.
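    # Each entry is four little-endian uint32s (magic, token, domain length,
    # string length) followed by the null-terminated domain and string bytes,
    # as parsed by _read_tokenized_entries above.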
    section_data = reader.dump_section_contents(_TOKENIZED_ENTRY_SECTIONS)
    if section_data is not None:
        return tokens.Database(_read_tokenized_entries(section_data, domain))

    return tokens.Database([])


def tokenization_domains(elf) -> Iterator[str]:
    """Lists all tokenization domains in an ELF file."""
    reader = _elf_reader(elf)
    section_data = reader.dump_section_contents(_TOKENIZED_ENTRY_SECTIONS)
    if section_data is not None:
        yield from frozenset(
            e.domain
            for e in _read_tokenized_entries(section_data, re.compile('.*'))
        )


def read_tokenizer_metadata(elf) -> Dict[str, int]:
    """Reads the metadata entries from an ELF."""
    sections = _elf_reader(elf).dump_section_contents(r'\.pw_tokenizer\.info')

    metadata: Dict[str, int] = {}
    if sections is not None:
        for key, value in struct.iter_unpack('12sI', sections):
            try:
                metadata[key.rstrip(b'\0').decode()] = value
            except UnicodeDecodeError as err:
                _LOG.error(
                    'Failed to decode metadata key %r: %s',
                    key.rstrip(b'\0'),
                    err,
                )

    return metadata


def _database_from_strings(strings: List[str]) -> tokens.Database:
    """Generates a C and C++ compatible database from untokenized strings."""
    # Generate a C-compatible database from the fixed length hash.
    c_db = tokens.Database.from_strings(strings, tokenize=tokens.c_hash)

    # Generate a C++ compatible database by allowing the hash to follow the
    # string length.
    cpp_db = tokens.Database.from_strings(
        strings, tokenize=tokens.pw_tokenizer_65599_hash
    )

    # Use a union of the C and C++ compatible databases.
    return tokens.Database.merged(c_db, cpp_db)


def _database_from_json(fd) -> tokens.Database:
    return _database_from_strings(json.load(fd))


def _load_token_database(  # pylint: disable=too-many-return-statements
    db, domain: Pattern[str]
) -> tokens.Database:
    """Loads a Database from supported database types.

    Supports Database objects, JSONs, ELFs, CSVs, and binary databases.
    """
    if db is None:
        return tokens.Database()

    if isinstance(db, tokens.Database):
        return db

    if isinstance(db, elf_reader.Elf):
        return _database_from_elf(db, domain)

    # If it's a str, it might be a path. Check if it's an ELF, CSV, or JSON.
    if isinstance(db, (str, Path)):
        if not os.path.exists(db):
            raise FileNotFoundError(f'"{db}" is not a path to a token database')

        if Path(db).is_dir():
            return tokens.DatabaseFile.load(Path(db))

        # Read the path as an ELF file.
        with open(db, 'rb') as fd:
            if elf_reader.compatible_file(fd):
                return _database_from_elf(fd, domain)

        # Generate a database from JSON.
        if str(db).endswith('.json'):
            with open(db, 'r', encoding='utf-8') as json_fd:
                return _database_from_json(json_fd)

        # Read the path as a packed binary or CSV file.
        return tokens.DatabaseFile.load(Path(db))

    # Assume that it's a file object and check if it's an ELF.
    if elf_reader.compatible_file(db):
        return _database_from_elf(db, domain)

    # Read the database as JSON, CSV, or packed binary from a file object's
    # path.
    if hasattr(db, 'name') and os.path.exists(db.name):
        if db.name.endswith('.json'):
            return _database_from_json(db)

        return tokens.DatabaseFile.load(Path(db.name))

    # Read CSV directly from the file object.
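    # Reached for file objects without a usable filesystem path (for example,
    # an in-memory stream); the stream is assumed to contain CSV token data.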
    return tokens.Database(tokens.parse_csv(db))


def load_token_database(
    *databases, domain: Union[str, Pattern[str]] = tokens.DEFAULT_DOMAIN
) -> tokens.Database:
    """Loads a Database from supported database types.

    Supports Database objects, JSONs, ELFs, CSVs, and binary databases.
    """
    domain = re.compile(domain)
    return tokens.Database.merged(
        *(_load_token_database(db, domain) for db in databases)
    )


def database_summary(db: tokens.Database) -> Dict[str, Any]:
    """Returns a simple report of properties of the database."""
    present = [entry for entry in db.entries() if not entry.date_removed]
    collisions = {
        token: list(e.string for e in entries)
        for token, entries in db.collisions()
    }

    # Add 1 to each string's size to account for the null terminator.
    return dict(
        present_entries=len(present),
        present_size_bytes=sum(len(entry.string) + 1 for entry in present),
        total_entries=len(db.entries()),
        total_size_bytes=sum(len(entry.string) + 1 for entry in db.entries()),
        collisions=collisions,
    )


_DatabaseReport = Dict[str, Dict[str, Dict[str, Any]]]


def generate_reports(paths: Iterable[Path]) -> _DatabaseReport:
    """Returns a dictionary with information about the provided databases."""
    reports: _DatabaseReport = {}

    for path in paths:
        domains = ['']
        if path.is_file():
            with path.open('rb') as file:
                if elf_reader.compatible_file(file):
                    domains = list(tokenization_domains(file))

        domain_reports = {}

        for domain in domains:
            domain_reports[domain] = database_summary(
                load_token_database(path, domain=domain)
            )

        reports[str(path)] = domain_reports

    return reports


def _handle_create(
    databases,
    database: Path,
    force: bool,
    output_type: str,
    include: list,
    exclude: list,
    replace: list,
) -> None:
    """Creates a token database file from one or more ELF files."""
    if not force and database.exists():
        raise FileExistsError(
            f'The file {database} already exists! Use --force to overwrite.'
        )

    if output_type == 'directory':
        if str(database) == '-':
            raise ValueError(
                'Cannot specify "-" (stdout) for directory databases'
            )

        database.mkdir(exist_ok=True)
        database = database / f'database{tokens.DIR_DB_SUFFIX}'
        output_type = 'csv'

    if str(database) == '-':
        # Must write bytes to stdout; use sys.stdout.buffer.
        fd = sys.stdout.buffer
    else:
        fd = database.open('wb')

    db = tokens.Database.merged(*databases)
    db.filter(include, exclude, replace)

    with fd:
        if output_type == 'csv':
            tokens.write_csv(db, fd)
        elif output_type == 'binary':
            tokens.write_binary(db, fd)
        else:
            raise ValueError(f'Unknown database type "{output_type}"')

    _LOG.info(
        'Wrote database with %d entries to %s as %s',
        len(db),
        fd.name,
        output_type,
    )


def _handle_add(
    token_database: tokens.DatabaseFile,
    databases: List[tokens.Database],
    commit: Optional[str],
) -> None:
    """Adds entries from the given databases to the token database."""
    initial = len(token_database)
    if commit:
        entries = itertools.chain.from_iterable(
            db.entries() for db in databases
        )
        token_database.add_and_discard_temporary(entries, commit)
    else:
        for source in databases:
            token_database.add(source.entries())

    token_database.write_to_file()

    number_of_changes = len(token_database) - initial

    if number_of_changes:
        _LOG.info(
            'Added %d entries to %s', number_of_changes, token_database.path
        )


def _handle_mark_removed(
    token_database: tokens.DatabaseFile,
    databases: List[tokens.Database],
    date: Optional[datetime],
):
    """Marks entries not present in the given databases as removed."""
    marked_removed = token_database.mark_removed(
        (
            entry
            for entry in tokens.Database.merged(*databases).entries()
            if not entry.date_removed
        ),
        date,
    )

    token_database.write_to_file(rewrite=True)

    _LOG.info(
        'Marked %d of %d entries as removed in %s',
        len(marked_removed),
        len(token_database),
        token_database.path,
    )


def _handle_purge(
    token_database: tokens.DatabaseFile, before: Optional[datetime]
):
    """Deletes entries removed on or before the provided date."""
    purged = token_database.purge(before)
    token_database.write_to_file(rewrite=True)

    _LOG.info('Removed %d entries from %s', len(purged), token_database.path)


def _handle_report(token_database_or_elf: List[Path], output: TextIO) -> None:
    """Writes a JSON report about the provided databases to the output."""
    json.dump(generate_reports(token_database_or_elf), output, indent=2)
    output.write('\n')


def expand_paths_or_globs(*paths_or_globs: str) -> Iterable[Path]:
    """Expands any globs in a list of paths; raises FileNotFoundError."""
    for path_or_glob in paths_or_globs:
        if os.path.exists(path_or_glob):
            # This is a valid path; yield it without evaluating it as a glob.
            yield Path(path_or_glob)
        else:
            paths = glob.glob(path_or_glob, recursive=True)

            # If no paths were found and the path is not a glob, raise an Error.
            if not paths and not any(c in path_or_glob for c in '*?[]!'):
                raise FileNotFoundError(f'{path_or_glob} is not a valid path')

            for path in paths:
                # Resolve globs to JSON, CSV, or compatible binary files.
                if elf_reader.compatible_file(path) or path.endswith(
                    ('.csv', '.json')
                ):
                    yield Path(path)


class ExpandGlobs(argparse.Action):
    """Argparse action that expands and appends paths."""

    def __call__(self, parser, namespace, values, unused_option_string=None):
        setattr(namespace, self.dest, list(expand_paths_or_globs(*values)))


def _read_elf_with_domain(
    elf: str, domain: Pattern[str]
) -> Iterable[tokens.Database]:
    for path in expand_paths_or_globs(elf):
        with path.open('rb') as file:
            if not elf_reader.compatible_file(file):
                raise ValueError(
                    f'{elf} is not an ELF file, '
                    f'but the "{domain}" domain was specified'
                )

            yield _database_from_elf(file, domain)


class LoadTokenDatabases(argparse.Action):
    """Argparse action that reads token databases from paths or globs.

    ELF files may have #domain appended to them to specify a tokenization
    domain other than the default.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        databases: List[tokens.Database] = []
        paths: Set[Path] = set()

        try:
            for value in values:
                if value.count('#') == 1:
                    path, domain = value.split('#')
                    domain = re.compile(domain)
                    databases.extend(_read_elf_with_domain(path, domain))
                else:
                    paths.update(expand_paths_or_globs(value))

            for path in paths:
                databases.append(load_token_database(path))
        except tokens.DatabaseFormatError as err:
            parser.error(
                f'argument elf_or_token_database: {path} is not a supported '
                'token database file. Only ELF files or token databases (CSV '
                f'or binary format) are supported. {err}. '
            )
        except FileNotFoundError as err:
            parser.error(f'argument elf_or_token_database: {err}')
        except:  # pylint: disable=bare-except
            _LOG.exception('Failed to load token database %s', path)
            parser.error(
                'argument elf_or_token_database: '
                f'Error occurred while loading token database {path}'
            )

        setattr(namespace, self.dest, databases)


def token_databases_parser(nargs: str = '+') -> argparse.ArgumentParser:
    """Returns an argument parser for reading token databases.

    These arguments can be added to another parser using the parents arg.
    """
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument(
        'databases',
        metavar='elf_or_token_database',
        nargs=nargs,
        action=LoadTokenDatabases,
        help=(
            'ELF or token database files from which to read strings and '
            'tokens. For ELF files, the tokenization domain to read from '
            'may be specified after the path as #domain_name (e.g. '
            'foo.elf#TEST_DOMAIN). Unless specified, only the default '
            'domain ("") is read from ELF files; .* reads all domains. '
            'Globs are expanded to compatible database files.'
        ),
    )
    return parser


def _parse_args():
    """Parse and return command line arguments."""

    def year_month_day(value) -> datetime:
        if value == 'today':
            return datetime.now()

        return datetime.fromisoformat(value)

    year_month_day.__name__ = 'year-month-day (YYYY-MM-DD)'

    # Shared command line options.
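    # These option-only parsers are attached to the subcommands below via
    # parents=[...] so shared arguments are defined in a single place.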
    option_db = argparse.ArgumentParser(add_help=False)
    option_db.add_argument(
        '-d',
        '--database',
        dest='token_database',
        type=lambda arg: tokens.DatabaseFile.load(Path(arg)),
        required=True,
        help='The database file to update.',
    )

    option_tokens = token_databases_parser('*')

    # Top-level argument parser.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.set_defaults(handler=lambda **_: parser.print_help())

    subparsers = parser.add_subparsers(
        help='Tokenized string database management actions:'
    )

    # The 'create' command creates a database file.
    subparser = subparsers.add_parser(
        'create',
        parents=[option_tokens],
        help=(
            'Creates a database with tokenized strings from one or more '
            'sources.'
        ),
    )
    subparser.set_defaults(handler=_handle_create)
    subparser.add_argument(
        '-d',
        '--database',
        required=True,
        type=Path,
        help='Path to the database file to create; use - for stdout.',
    )
    subparser.add_argument(
        '-t',
        '--type',
        dest='output_type',
        choices=('csv', 'binary', 'directory'),
        default='csv',
        help='Which type of database to create. (default: csv)',
    )
    subparser.add_argument(
        '-f',
        '--force',
        action='store_true',
        help='Overwrite the database if it exists.',
    )
    subparser.add_argument(
        '-i',
        '--include',
        type=re.compile,
        default=[],
        action='append',
        help=(
            'If provided, at least one of these regular expressions must '
            'match for a string to be included in the database.'
        ),
    )
    subparser.add_argument(
        '-e',
        '--exclude',
        type=re.compile,
        default=[],
        action='append',
        help=(
            'If provided, none of these regular expressions may match for a '
            'string to be included in the database.'
        ),
    )

    unescaped_slash = re.compile(r'(?<!\\)/')

    def replacement(value: str) -> Tuple[Pattern, str]:
        try:
            find, sub = unescaped_slash.split(value, 1)
        except ValueError as _err:
            raise argparse.ArgumentTypeError(
                'replacements must be specified as "search_regex/replacement"'
            )

        try:
            return re.compile(find.replace(r'\/', '/')), sub
        except re.error as err:
            raise argparse.ArgumentTypeError(
                f'"{value}" is not a valid regular expression: {err}'
            )

    subparser.add_argument(
        '--replace',
        type=replacement,
        default=[],
        action='append',
        help=(
            'If provided, replaces text that matches a regular expression. '
            'This can be used to replace sensitive terms in a token '
            'database that will be distributed publicly. The expression and '
            'replacement are specified as "search_regex/replacement". '
            'Plain slash characters in the regex must be escaped with a '
            r'backslash (\/). The replacement text may include '
            'backreferences for captured groups in the regex.'
        ),
    )

    # The 'add' command adds strings to a database from a set of ELFs.
    subparser = subparsers.add_parser(
        'add',
        parents=[option_db, option_tokens],
        help=(
            'Adds new strings to a database with tokenized strings from a set '
            'of ELF files or other token databases. Missing entries are NOT '
            'marked as removed.'
        ),
    )
    subparser.set_defaults(handler=_handle_add)
    subparser.add_argument(
        '--discard-temporary',
        dest='commit',
        help=(
            'Deletes temporary tokens in memory and on disk when a CSV exists '
            'within a commit. Afterwards, new strings are added to the '
            'database from a set of ELF files or other token databases. '
            'Missing entries are NOT marked as removed.'
        ),
    )

    # The 'mark_removed' command marks removed entries to match a set of ELFs.
    subparser = subparsers.add_parser(
        'mark_removed',
        parents=[option_db, option_tokens],
        help=(
            'Updates a database with tokenized strings from a set of strings. '
            'Strings not present in the set remain in the database but are '
            'marked as removed. New strings are NOT added.'
        ),
    )
    subparser.set_defaults(handler=_handle_mark_removed)
    subparser.add_argument(
        '--date',
        type=year_month_day,
        help=(
            'The removal date to use for all strings. '
            'May be YYYY-MM-DD or "today". (default: today)'
        ),
    )

    # The 'purge' command removes old entries.
    subparser = subparsers.add_parser(
        'purge',
        parents=[option_db],
        help='Purges removed strings from a database.',
    )
    subparser.set_defaults(handler=_handle_purge)
    subparser.add_argument(
        '-b',
        '--before',
        type=year_month_day,
        help=(
            'Delete all entries removed on or before this date. '
            'May be YYYY-MM-DD or "today".'
        ),
    )

    # The 'report' command prints a report about a database.
    subparser = subparsers.add_parser(
        'report', help='Prints a report about a database.'
    )
    subparser.set_defaults(handler=_handle_report)
    subparser.add_argument(
        'token_database_or_elf',
        nargs='+',
        action=ExpandGlobs,
        help=(
            'The ELF files or token databases about which to generate '
            'reports.'
        ),
    )
    subparser.add_argument(
        '-o',
        '--output',
        type=argparse.FileType('w'),
        default=sys.stdout,
        help='The file to which to write the output; use - for stdout.',
    )

    args = parser.parse_args()

    handler = args.handler
    del args.handler

    return handler, args


def _init_logging(level: int) -> None:
    _LOG.setLevel(logging.DEBUG)
    log_to_stderr = logging.StreamHandler()
    log_to_stderr.setLevel(level)
    log_to_stderr.setFormatter(
        logging.Formatter(
            fmt='%(asctime)s.%(msecs)03d-%(levelname)s: %(message)s',
            datefmt='%H:%M:%S',
        )
    )

    _LOG.addHandler(log_to_stderr)


def _main(handler: Callable, args: argparse.Namespace) -> int:
    _init_logging(logging.INFO)
    handler(**vars(args))
    return 0


if __name__ == '__main__':
    sys.exit(_main(*_parse_args()))
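

# Example programmatic use (a minimal sketch; assumes this module is importable
# as pw_tokenizer.database, and the file names below are hypothetical):
#
#   from pw_tokenizer import database
#
#   db = database.load_token_database('firmware.elf', 'strings.csv')
#   print(database.database_summary(db))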