#!/usr/bin/env python3
# Copyright 2020 The Pigweed Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
"""Creates and manages token databases.

This module manages reading tokenized strings from ELF files and building and
maintaining token databases.
"""

import argparse
from datetime import datetime
import glob
import json
import logging
import os
from pathlib import Path
import re
import struct
import sys
from typing import (Any, Callable, Dict, Iterable, Iterator, List, Pattern,
                    Set, TextIO, Tuple, Union)

try:
    from pw_tokenizer import elf_reader, tokens
except ImportError:
    # Append this path to the module search path to allow running this module
    # without installing the pw_tokenizer package.
    sys.path.append(os.path.dirname(os.path.dirname(
        os.path.abspath(__file__))))
    from pw_tokenizer import elf_reader, tokens

_LOG = logging.getLogger('pw_tokenizer')


def _elf_reader(elf) -> elf_reader.Elf:
    return elf if isinstance(elf, elf_reader.Elf) else elf_reader.Elf(elf)


# Magic number used to indicate the beginning of a tokenized string entry. This
# value MUST match the value of _PW_TOKENIZER_ENTRY_MAGIC in
# pw_tokenizer/public/pw_tokenizer/internal/tokenize_string.h.
_TOKENIZED_ENTRY_MAGIC = 0xBAA98DEE
_ENTRY = struct.Struct('<4I')
_TOKENIZED_ENTRY_SECTIONS = re.compile(
    r'^\.pw_tokenizer.entries(?:\.[_\d]+)?$')

_LEGACY_STRING_SECTIONS = re.compile(
    r'^\.pw_tokenized\.(?P<domain>[^.]+)(?:\.\d+)?$')

_ERROR_HANDLER = 'surrogateescape'  # How to deal with UTF-8 decoding errors


class Error(Exception):
    """Failed to extract token entries from an ELF file."""


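# For reference, this comment restates the entry layout that
# _read_tokenized_entries() below parses; the authoritative definition lives in
# pw_tokenizer/internal/tokenize_string.h. Each entry in a .pw_tokenizer.entries
# section is four little-endian uint32 values followed by two null-terminated
# byte strings:
#
#   uint32 magic        # must equal _TOKENIZED_ENTRY_MAGIC
#   uint32 token        # the 32-bit token value
#   uint32 domain_len   # length of the domain, including its trailing '\0'
#   uint32 string_len   # length of the string, including its trailing '\0'
#   char   domain[domain_len]
#   char   string[string_len]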
def _read_tokenized_entries(
        data: bytes,
        domain: Pattern[str]) -> Iterator[tokens.TokenizedStringEntry]:
    index = 0

    while index + _ENTRY.size <= len(data):
        magic, token, domain_len, string_len = _ENTRY.unpack_from(data, index)

        if magic != _TOKENIZED_ENTRY_MAGIC:
            raise Error(
                f'Expected magic number 0x{_TOKENIZED_ENTRY_MAGIC:08x}, '
                f'found 0x{magic:08x}')

        start = index + _ENTRY.size
        index = start + domain_len + string_len

        # Create the entries, trimming null terminators.
        entry = tokens.TokenizedStringEntry(
            token,
            data[start + domain_len:index - 1].decode(errors=_ERROR_HANDLER),
            data[start:start + domain_len - 1].decode(errors=_ERROR_HANDLER),
        )

        if data[start + domain_len - 1] != 0:
            raise Error(
                f'Domain {entry.domain} for {entry.string} not null terminated'
            )

        if data[index - 1] != 0:
            raise Error(f'String {entry.string} is not null terminated')

        if domain.fullmatch(entry.domain):
            yield entry


def _read_tokenized_strings(sections: Dict[str, bytes],
                            domain: Pattern[str]) -> Iterator[tokens.Database]:
    # Legacy ELF files used "default" as the default domain instead of "".
    # Remap the default if necessary.
    if domain.pattern == tokens.DEFAULT_DOMAIN:
        domain = re.compile('default')

    for section, data in sections.items():
        match = _LEGACY_STRING_SECTIONS.match(section)
        if match and domain.match(match.group('domain')):
            yield tokens.Database.from_strings(
                (s.decode(errors=_ERROR_HANDLER) for s in data.split(b'\0')),
                match.group('domain'))


def _database_from_elf(elf, domain: Pattern[str]) -> tokens.Database:
    """Reads the tokenized strings from an elf_reader.Elf or ELF file object."""
    _LOG.debug('Reading tokenized strings in domain "%s" from %s', domain, elf)

    reader = _elf_reader(elf)

    # Read tokenized string entries.
    section_data = reader.dump_section_contents(_TOKENIZED_ENTRY_SECTIONS)
    if section_data is not None:
        return tokens.Database(_read_tokenized_entries(section_data, domain))

    # Read legacy null-terminated string entries.
    sections = reader.dump_sections(_LEGACY_STRING_SECTIONS)
    if sections:
        return tokens.Database.merged(
            *_read_tokenized_strings(sections, domain))

    return tokens.Database([])


def tokenization_domains(elf) -> Iterator[str]:
    """Lists all tokenization domains in an ELF file."""
    reader = _elf_reader(elf)
    section_data = reader.dump_section_contents(_TOKENIZED_ENTRY_SECTIONS)
    if section_data is not None:
        yield from frozenset(
            e.domain
            for e in _read_tokenized_entries(section_data, re.compile('.*')))
    else:  # Check for the legacy domain sections
        for section in reader.sections:
            match = _LEGACY_STRING_SECTIONS.match(section.name)
            if match:
                yield match.group('domain')


def read_tokenizer_metadata(elf) -> Dict[str, int]:
    """Reads the metadata entries from an ELF."""
    sections = _elf_reader(elf).dump_section_contents(r'\.pw_tokenizer\.info')

    metadata: Dict[str, int] = {}
    if sections is not None:
        for key, value in struct.iter_unpack('12sI', sections):
            try:
                metadata[key.rstrip(b'\0').decode()] = value
            except UnicodeDecodeError as err:
                _LOG.error('Failed to decode metadata key %r: %s',
                           key.rstrip(b'\0'), err)

    return metadata


def _database_from_strings(strings: List[str]) -> tokens.Database:
    """Generates a C and C++ compatible database from untokenized strings."""
    # Generate a C compatible database from the fixed length hash.
    c_db = tokens.Database.from_strings(
        strings,
        tokenize=lambda string: tokens.pw_tokenizer_65599_hash(
            string, tokens.DEFAULT_C_HASH_LENGTH))

    # Generate a C++ compatible database by allowing the hash to follow the
    # string length.
    cpp_db = tokens.Database.from_strings(
        strings, tokenize=tokens.pw_tokenizer_65599_hash)

    # Use a union of the C and C++ compatible databases.
    return tokens.Database.merged(c_db, cpp_db)


def _database_from_json(fd) -> tokens.Database:
    return _database_from_strings(json.load(fd))


def _load_token_database(db, domain: Pattern[str]) -> tokens.Database:
    """Loads a Database from supported database types.

    Supports Database objects, JSONs, ELFs, CSVs, and binary databases.
    """
    if db is None:
        return tokens.Database()

    if isinstance(db, tokens.Database):
        return db

    if isinstance(db, elf_reader.Elf):
        return _database_from_elf(db, domain)

    # If it's a str, it might be a path. Check if it's an ELF, CSV, or JSON.
    if isinstance(db, (str, Path)):
        if not os.path.exists(db):
            raise FileNotFoundError(
                f'"{db}" is not a path to a token database')

        # Read the path as an ELF file.
        with open(db, 'rb') as fd:
            if elf_reader.compatible_file(fd):
                return _database_from_elf(fd, domain)

        # Generate a database from JSON.
        if str(db).endswith('.json'):
            with open(db, 'r') as json_fd:
                return _database_from_json(json_fd)

        # Read the path as a packed binary or CSV file.
        return tokens.DatabaseFile(db)

    # Assume that it's a file object and check if it's an ELF.
    if elf_reader.compatible_file(db):
        return _database_from_elf(db, domain)

    # Read the database as JSON, CSV, or packed binary from a file object's
    # path.
    if hasattr(db, 'name') and os.path.exists(db.name):
        if db.name.endswith('.json'):
            return _database_from_json(db)

        return tokens.DatabaseFile(db.name)

    # Read CSV directly from the file object.
    return tokens.Database(tokens.parse_csv(db))


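# Example usage of the public loader defined below (a sketch; the file names
# are hypothetical): merge the default-domain tokens read from an ELF with an
# existing CSV database, then summarize the result.
#
#   db = load_token_database('firmware.elf', 'tokens.csv')
#   print(database_summary(db))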
def load_token_database(
        *databases,
        domain: Union[str,
                      Pattern[str]] = tokens.DEFAULT_DOMAIN) -> tokens.Database:
    """Loads a Database from supported database types.

    Supports Database objects, JSONs, ELFs, CSVs, and binary databases.
    """
    domain = re.compile(domain)
    return tokens.Database.merged(*(_load_token_database(db, domain)
                                    for db in databases))


def database_summary(db: tokens.Database) -> Dict[str, Any]:
    """Returns a simple report of properties of the database."""
    present = [entry for entry in db.entries() if not entry.date_removed]
    collisions = {
        token: list(e.string for e in entries)
        for token, entries in db.collisions()
    }

    # Add 1 to each string's size to account for the null terminator.
    return dict(
        present_entries=len(present),
        present_size_bytes=sum(len(entry.string) + 1 for entry in present),
        total_entries=len(db.entries()),
        total_size_bytes=sum(len(entry.string) + 1 for entry in db.entries()),
        collisions=collisions,
    )


_DatabaseReport = Dict[str, Dict[str, Dict[str, Any]]]


def generate_reports(paths: Iterable[Path]) -> _DatabaseReport:
    """Returns a dictionary with information about the provided databases."""
    reports: _DatabaseReport = {}

    for path in paths:
        with path.open('rb') as file:
            if elf_reader.compatible_file(file):
                domains = list(tokenization_domains(file))
            else:
                domains = ['']

        domain_reports = {}

        for domain in domains:
            domain_reports[domain] = database_summary(
                load_token_database(path, domain=domain))

        reports[str(path)] = domain_reports

    return reports


def _handle_create(databases, database, force, output_type, include, exclude,
                   replace):
    """Creates a token database file from one or more ELF files."""

    if database == '-':
        # Must write bytes to stdout; use sys.stdout.buffer.
        fd = sys.stdout.buffer
    elif not force and os.path.exists(database):
        raise FileExistsError(
            f'The file {database} already exists! Use --force to overwrite.')
    else:
        fd = open(database, 'wb')

    database = tokens.Database.merged(*databases)
    database.filter(include, exclude, replace)

    with fd:
        if output_type == 'csv':
            tokens.write_csv(database, fd)
        elif output_type == 'binary':
            tokens.write_binary(database, fd)
        else:
            raise ValueError(f'Unknown database type "{output_type}"')

    _LOG.info('Wrote database with %d entries to %s as %s', len(database),
              fd.name, output_type)


def _handle_add(token_database, databases):
    initial = len(token_database)

    for source in databases:
        token_database.add(source.entries())

    token_database.write_to_file()

    _LOG.info('Added %d entries to %s',
              len(token_database) - initial, token_database.path)


def _handle_mark_removed(token_database, databases, date):
    marked_removed = token_database.mark_removed(
        (entry for entry in tokens.Database.merged(*databases).entries()
         if not entry.date_removed), date)

    token_database.write_to_file()

    _LOG.info('Marked %d of %d entries as removed in %s', len(marked_removed),
              len(token_database), token_database.path)


def _handle_purge(token_database, before):
    purged = token_database.purge(before)
    token_database.write_to_file()

    _LOG.info('Removed %d entries from %s', len(purged), token_database.path)


def _handle_report(token_database_or_elf: List[Path], output: TextIO) -> None:
    json.dump(generate_reports(token_database_or_elf), output, indent=2)
    output.write('\n')


def expand_paths_or_globs(*paths_or_globs: str) -> Iterable[Path]:
    """Expands any globs in a list of paths; raises FileNotFoundError."""
    for path_or_glob in paths_or_globs:
        if os.path.exists(path_or_glob):
            # This is a valid path; yield it without evaluating it as a glob.
            yield Path(path_or_glob)
        else:
            paths = glob.glob(path_or_glob, recursive=True)

            # If no paths were found and the path is not a glob, raise an error.
            if not paths and not any(c in path_or_glob for c in '*?[]!'):
                raise FileNotFoundError(f'{path_or_glob} is not a valid path')

            for path in paths:
                # Resolve globs to JSON, CSV, or compatible binary files.
                if elf_reader.compatible_file(path) or path.endswith(
                    ('.csv', '.json')):
                    yield Path(path)


class ExpandGlobs(argparse.Action):
    """Argparse action that expands and appends paths."""
    def __call__(self, parser, namespace, values, unused_option_string=None):
        setattr(namespace, self.dest, list(expand_paths_or_globs(*values)))


def _read_elf_with_domain(elf: str,
                          domain: Pattern[str]) -> Iterable[tokens.Database]:
    for path in expand_paths_or_globs(elf):
        with path.open('rb') as file:
            if not elf_reader.compatible_file(file):
                raise ValueError(f'{elf} is not an ELF file, '
                                 f'but the "{domain}" domain was specified')

            yield _database_from_elf(file, domain)


class LoadTokenDatabases(argparse.Action):
    """Argparse action that reads token databases from paths or globs.

    ELF files may have #domain appended to them to specify a tokenization
    domain other than the default.
    """
    def __call__(self, parser, namespace, values, option_string=None):
        databases: List[tokens.Database] = []
        paths: Set[Path] = set()

        try:
            for value in values:
                if value.count('#') == 1:
                    path, domain = value.split('#')
                    domain = re.compile(domain)
                    databases.extend(_read_elf_with_domain(path, domain))
                else:
                    paths.update(expand_paths_or_globs(value))

            for path in paths:
                databases.append(load_token_database(path))
        except tokens.DatabaseFormatError as err:
            parser.error(
                f'argument elf_or_token_database: {path} is not a supported '
                'token database file. Only ELF files or token databases (CSV '
                f'or binary format) are supported. {err}. ')
        except FileNotFoundError as err:
            parser.error(f'argument elf_or_token_database: {err}')
        except:  # pylint: disable=bare-except
            _LOG.exception('Failed to load token database %s', path)
            parser.error('argument elf_or_token_database: '
                         f'Error occurred while loading token database {path}')

        setattr(namespace, self.dest, databases)


def token_databases_parser(nargs: str = '+') -> argparse.ArgumentParser:
    """Returns an argument parser for reading token databases.

    These arguments can be added to another parser using the parents arg.
    """
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument(
        'databases',
        metavar='elf_or_token_database',
        nargs=nargs,
        action=LoadTokenDatabases,
        help=('ELF or token database files from which to read strings and '
              'tokens. For ELF files, the tokenization domain to read from '
              'may be specified after the path as #domain_name (e.g. '
              'foo.elf#TEST_DOMAIN). Unless specified, only the default '
              'domain ("") is read from ELF files; .* reads all domains. '
              'Globs are expanded to compatible database files.'))
    return parser


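# A sketch of reusing these arguments from another tool via argparse parents
# (the 'my_tool' program name is hypothetical; after parsing, args.databases
# holds the loaded tokens.Database objects):
#
#   parser = argparse.ArgumentParser(prog='my_tool',
#                                    parents=[token_databases_parser()])
#   args = parser.parse_args()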
def _parse_args():
    """Parse and return command line arguments."""
    def year_month_day(value) -> datetime:
        if value == 'today':
            return datetime.now()

        return datetime.strptime(value, tokens.DATE_FORMAT)

    year_month_day.__name__ = 'year-month-day (YYYY-MM-DD)'

    # Shared command line options.
    option_db = argparse.ArgumentParser(add_help=False)
    option_db.add_argument('-d',
                           '--database',
                           dest='token_database',
                           type=tokens.DatabaseFile,
                           required=True,
                           help='The database file to update.')

    option_tokens = token_databases_parser('*')

    # Top-level argument parser.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.set_defaults(handler=lambda **_: parser.print_help())

    subparsers = parser.add_subparsers(
        help='Tokenized string database management actions:')

    # The 'create' command creates a database file.
    subparser = subparsers.add_parser(
        'create',
        parents=[option_tokens],
        help=
        'Creates a database with tokenized strings from one or more sources.')
    subparser.set_defaults(handler=_handle_create)
    subparser.add_argument(
        '-d',
        '--database',
        required=True,
        help='Path to the database file to create; use - for stdout.')
    subparser.add_argument(
        '-t',
        '--type',
        dest='output_type',
        choices=('csv', 'binary'),
        default='csv',
        help='Which type of database to create. (default: csv)')
    subparser.add_argument('-f',
                           '--force',
                           action='store_true',
                           help='Overwrite the database if it exists.')
    subparser.add_argument(
        '-i',
        '--include',
        type=re.compile,
        default=[],
        action='append',
        help=('If provided, at least one of these regular expressions must '
              'match for a string to be included in the database.'))
    subparser.add_argument(
        '-e',
        '--exclude',
        type=re.compile,
        default=[],
        action='append',
        help=('If provided, none of these regular expressions may match for a '
              'string to be included in the database.'))

    unescaped_slash = re.compile(r'(?<!\\)/')

    def replacement(value: str) -> Tuple[Pattern, str]:
        try:
            find, sub = unescaped_slash.split(value, 1)
        except ValueError as err:
            raise argparse.ArgumentTypeError(
                'replacements must be specified as "search_regex/replacement"')

        try:
            return re.compile(find.replace(r'\/', '/')), sub
        except re.error as err:
            raise argparse.ArgumentTypeError(
                f'"{value}" is not a valid regular expression: {err}')

    subparser.add_argument(
        '--replace',
        type=replacement,
        default=[],
        action='append',
        help=('If provided, replaces text that matches a regular expression. '
              'This can be used to replace sensitive terms in a token '
              'database that will be distributed publicly. The expression and '
              'replacement are specified as "search_regex/replacement". '
              'Plain slash characters in the regex must be escaped with a '
              r'backslash (\/). The replacement text may include '
              'backreferences for captured groups in the regex.'))

    # The 'add' command adds strings to a database from a set of ELFs.
    subparser = subparsers.add_parser(
        'add',
        parents=[option_db, option_tokens],
        help=(
            'Adds new strings to a database with tokenized strings from a set '
            'of ELF files or other token databases. Missing entries are NOT '
            'marked as removed.'))
    subparser.set_defaults(handler=_handle_add)

    # The 'mark_removed' command marks removed entries to match a set of ELFs.
    subparser = subparsers.add_parser(
        'mark_removed',
        parents=[option_db, option_tokens],
        help=(
            'Updates a database with tokenized strings from a set of strings. '
            'Strings not present in the set remain in the database but are '
            'marked as removed. New strings are NOT added.'))
    subparser.set_defaults(handler=_handle_mark_removed)
    subparser.add_argument(
        '--date',
        type=year_month_day,
        help=('The removal date to use for all strings. '
              'May be YYYY-MM-DD or "today". (default: today)'))

    # The 'purge' command removes old entries.
    subparser = subparsers.add_parser(
        'purge',
        parents=[option_db],
        help='Purges removed strings from a database.')
    subparser.set_defaults(handler=_handle_purge)
    subparser.add_argument(
        '-b',
        '--before',
        type=year_month_day,
        help=('Delete all entries removed on or before this date. '
              'May be YYYY-MM-DD or "today".'))

    # The 'report' command prints a report about a database.
    subparser = subparsers.add_parser('report',
                                      help='Prints a report about a database.')
    subparser.set_defaults(handler=_handle_report)
    subparser.add_argument(
        'token_database_or_elf',
        nargs='+',
        action=ExpandGlobs,
        help='The ELF files or token databases about which to generate reports.'
    )
    subparser.add_argument(
        '-o',
        '--output',
        type=argparse.FileType('w'),
        default=sys.stdout,
        help='The file to which to write the output; use - for stdout.')

    args = parser.parse_args()

    handler = args.handler
    del args.handler

    return handler, args


def _init_logging(level: int) -> None:
    _LOG.setLevel(logging.DEBUG)
    log_to_stderr = logging.StreamHandler()
    log_to_stderr.setLevel(level)
    log_to_stderr.setFormatter(
        logging.Formatter(
            fmt='%(asctime)s.%(msecs)03d-%(levelname)s: %(message)s',
            datefmt='%H:%M:%S'))

    _LOG.addHandler(log_to_stderr)


def _main(handler: Callable, args: argparse.Namespace) -> int:
    _init_logging(logging.INFO)
    handler(**vars(args))
    return 0


if __name__ == '__main__':
    sys.exit(_main(*_parse_args()))
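
# Example invocations (a sketch based on the subcommands defined in
# _parse_args() above; firmware.elf and tokens.csv are hypothetical files):
#
#   python database.py create --database tokens.csv --type csv firmware.elf
#   python database.py add --database tokens.csv firmware.elf
#   python database.py mark_removed --database tokens.csv --date today firmware.elf
#   python database.py purge --database tokens.csv --before 2020-01-01
#   python database.py report firmware.elf --output report.json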