#!/usr/bin/python
# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Module for parsing TCG TPM2 library specification in HTML format.

This module processes parts 2 and 3 of the specification, extracting
information related to tables defined in the documents, feeding the
information into the Table object for further processing and creating the
appropriate TPM2 objects.
"""

from __future__ import print_function

import HTMLParser
import os
import re
import sys

import tpm_table

# Matches the beginning of a table title; group 1 captures the table number so
# that it can be extracted even when punctuation immediately follows it.
table_name = re.compile(r'^\s*Table\s+([0-9]+)')


class SpecParser(HTMLParser.HTMLParser):
    """A class for parsing TCG specifications in html format."""

    # The state machine of the parser could be in one of the following states.
    ANCHOR = 0       # Look for table title anchor
    TABLE_NAME = 1   # Look for table title in the data stream
    TABLE_BODY = 2   # Scraping the actual table body
    MAYBE_DONE = 3   # Could be over, unless a single spec table is split in
                     # multiple HTML tables (to continue on the next page)
    SKIP_HEADER = 4  # Ignore the header of the split tables

    def __init__(self):
        """Initialize a parser object to default state."""
        HTMLParser.HTMLParser.__init__(self)
        self._state = self.ANCHOR
        self._title = ''
        self._table = tpm_table.Table()
        # Used to check if there are skipped tables.
        self._previous_table_number = 0

    def _Normalize(self, data):
        """Normalize HTML data.

        HTML files generated from TCG specifications sometimes include utf8
        characters (like long dashes), which appear only in comments/table
        titles and can be safely ignored.

        Args:
          data: a string representing portion of data from the HTML being
                parsed.

        Returns:
          a string, a single space followed by the unescaped input data with
          all characters outside the 7-bit ASCII range excluded.
        """
        return ' ' + ''.join(x for x in self.unescape(data) if ord(x) < 128)

    def GetTable(self):
        """Return the Table object containing all information parsed so far."""
        return self._table

    def _SetState(self, new_state):
        """Switch the FSM to new_state.

        The accumulated title is dropped whenever the parser actually enters
        the TABLE_NAME state; setting the state the parser is already in is a
        no-op.

        Args:
          new_state: one of the state constants defined in this class.
        """
        if self._state != new_state:
            self._state = new_state
            if new_state == self.TABLE_NAME:
                self._title = ''

    def handle_starttag(self, tag, attrs):
        """Invoked each time a new HTML tag is opened.

        This method drives changes in the parser FSM states, its heuristics
        are derived from the format of the HTML files the TCG specs get
        converted to.

        Each specification table is preceded with a title. The title is
        wrapped in an anchor tag with a property 'name' set to 'bookmark#xxx'.
        The title text starts with ' Table [0-9]+ '. Once the table title is
        detected, the state machine switches to looking for the actual HTML
        table, i.e. tags 'table', 'tr' and 'td' (the generated specs do not
        use the 'th' tags).

        Large specification tables can be split into multiple HTML tables (so
        that they fit in a page). This is why the presence of the closing
        'table' tag is not enough to close the parsing of the current
        specification table.

        In some cases the next table is defined in the spec immediately after
        the current one - this is when the new anchor tag is used as a signal
        that the previous table has been completely consumed.

        Args:
          tag: a string, the HTML tag
          attrs: a list of zero or more two-element tuples, the first
                 element - the HTML tag's attribute, the second element - the
                 attribute value (None for attributes given without a value).
        """
        if tag == 'a':
            # Attributes without a value are reported with None as the value,
            # hence the truthiness check before calling startswith().
            is_bookmark = any(
                name == 'name' and value and value.startswith('bookmark')
                for name, value in attrs)
            if is_bookmark:
                if self._state == self.ANCHOR:
                    self._SetState(self.TABLE_NAME)
                elif self._state == self.MAYBE_DONE:
                    # Done indeed
                    self._table.ProcessTable()
                    self._table.Init()
                    self._SetState(self.TABLE_NAME)
                elif self._state == self.TABLE_NAME:
                    self._title = ''
        elif tag == 'p' and self._state == self.TABLE_NAME and not self._title:
            # This was not a valid table start, back to looking for the right
            # anchor.
            self._SetState(self.ANCHOR)
        elif self._state == self.TABLE_NAME and tag == 'table':
            title_match = table_name.search(self._title)
            if not title_match:
                # Table title does not match the expected format - back to
                # square one.
                self._SetState(self.ANCHOR)
                return  # will have to start over
            # Take the number from the regex match rather than from a
            # whitespace split, so that titles like 'Table 5: ...' do not
            # break the integer conversion.
            table_number = int(title_match.group(1))
            self._previous_table_number += 1
            if table_number > self._previous_table_number:
                print('Table(s) %s missing' % ' '.join(
                    '%d' % x for x in
                    range(self._previous_table_number, table_number)),
                      file=sys.stderr)
                self._previous_table_number = table_number
            self._table.Init(self._title)
            self._SetState(self.TABLE_BODY)
        elif self._state == self.MAYBE_DONE and tag == 'tr':
            # Another HTML table started right after the previous one was
            # closed: the spec table continues, its first row is a header.
            self._SetState(self.SKIP_HEADER)
        elif self._state == self.SKIP_HEADER and tag == 'tr':
            self._SetState(self.TABLE_BODY)
            self._table.NewRow()
        elif self._state == self.TABLE_BODY:
            if tag == 'tr':
                self._table.NewRow()
            elif tag == 'td':
                self._table.NewCell()

    def handle_endtag(self, tag):
        """Invoked each time an HTML tag is closed.

        Args:
          tag: a string, the HTML tag being closed.
        """
        if tag == 'table' and self._table.InProgress():
            self._SetState(self.MAYBE_DONE)

    def handle_data(self, data):
        """Process data outside HTML tags.

        Args:
          data: a string, the text found between HTML tags.
        """
        if self._state == self.TABLE_NAME:
            self._title += ' %s' % self._Normalize(data)
        elif self._state == self.TABLE_BODY:
            self._table.AddData(self._Normalize(data))
        elif self._state == self.MAYBE_DONE:
            # Done indeed
            self._table.ProcessTable()
            self._table.Init()
            self._SetState(self.ANCHOR)

    def close(self):
        """Finish processing of the HTML buffer."""
        # Let the base class process whatever is still buffered first; a
        # redefined close() is expected to always call the base class method.
        HTMLParser.HTMLParser.close(self)
        if self._state in (self.TABLE_BODY, self.MAYBE_DONE):
            self._table.ProcessTable()
        self._state = self.ANCHOR

    def handle_entityref(self, name):
        """Process HTML escape sequence.

        Only the entities listed in the map below are passed on, all others
        are dropped.

        Args:
          name: a string, the entity name without the '&' and ';' delimiters.
        """
        entmap = {
            'amp': '&',
            'gt': '>',
            'lt': '<',
            'quot': '"',
        }
        if name in entmap:
            if self._state == self.TABLE_BODY:
                self._table.AddData(entmap[name])
            elif self._state == self.TABLE_NAME:
                self._title += entmap[name]


def main(structs_html_file_name):
    """When invoked standalone - dump .h file on the console.

    Args:
      structs_html_file_name: a string, the name of the HTML file to parse.
    """
    parser = SpecParser()
    with open(structs_html_file_name) as input_file:
        html_content = input_file.read()
    parser.feed(html_content)
    parser.close()
    print(parser.GetTable().GetHFile())


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('%s: One parameter is required, the name of the html file '
              'which is the TPM2 library Part 2 specification' %
              os.path.basename(sys.argv[0]), file=sys.stderr)
        sys.exit(1)
    main(sys.argv[1])