#!/usr/bin/env python3
# Copyright 2020 The Pigweed Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
"""Tests for the database module."""

import copy
import json
import io
from pathlib import Path
import shutil
import sys
import tempfile
import unittest
from unittest import mock

from pw_tokenizer import database

# This is an ELF file with only the pw_tokenizer sections. It was created
# from a tokenize_test binary built for the STM32F429i Discovery board. The
# pw_tokenizer sections were extracted with this command:
#
#   arm-none-eabi-objcopy -S --only-section ".pw_tokenize*" <ELF> <OUTPUT>
#
TOKENIZED_ENTRIES_ELF = Path(
    __file__).parent / 'example_binary_with_tokenized_strings.elf'
LEGACY_PLAIN_STRING_ELF = Path(
    __file__).parent / 'example_legacy_binary_with_tokenized_strings.elf'

CSV_DEFAULT_DOMAIN = '''\
00000000, ,""
141c35d5, ,"The answer: ""%s"""
29aef586, ,"1234"
2b78825f, ,"[:-)"
2e668cd6, ,"Jello, world!"
31631781, ,"%d"
61fd1e26, ,"%ld"
68ab92da, ,"%s there are %x (%.2f) of them%c"
7b940e2a, ,"Hello %s! %hd %e"
7da55d52, ,">:-[]"
7f35a9a5, ,"TestName"
851beeb6, ,"%u %d"
881436a0, ,"The answer is: %s"
88808930, ,"%u%d%02x%X%hu%hhd%d%ld%lu%lld%llu%c%c%c"
92723f44, ,"???"
a09d6698, ,"won-won-won-wonderful"
aa9ffa66, ,"void pw::tokenizer::{anonymous}::TestName()"
ad002c97, ,"%llx"
b3653e13, ,"Jello!"
cc6d3131, ,"Jello?"
e13b0f94, ,"%llu"
e65aefef, ,"Won't fit : %s%d"
'''

CSV_TEST_DOMAIN = """\
17fa86d3, ,"hello"
18c5017c, ,"yes"
59b2701c, ,"The answer was: %s"
881436a0, ,"The answer is: %s"
d18ada0f, ,"something"
"""

CSV_ALL_DOMAINS = '''\
00000000, ,""
141c35d5, ,"The answer: ""%s"""
17fa86d3, ,"hello"
18c5017c, ,"yes"
29aef586, ,"1234"
2b78825f, ,"[:-)"
2e668cd6, ,"Jello, world!"
31631781, ,"%d"
59b2701c, ,"The answer was: %s"
61fd1e26, ,"%ld"
68ab92da, ,"%s there are %x (%.2f) of them%c"
7b940e2a, ,"Hello %s! %hd %e"
7da55d52, ,">:-[]"
7f35a9a5, ,"TestName"
851beeb6, ,"%u %d"
881436a0, ,"The answer is: %s"
88808930, ,"%u%d%02x%X%hu%hhd%d%ld%lu%lld%llu%c%c%c"
92723f44, ,"???"
a09d6698, ,"won-won-won-wonderful"
aa9ffa66, ,"void pw::tokenizer::{anonymous}::TestName()"
ad002c97, ,"%llx"
b3653e13, ,"Jello!"
cc6d3131, ,"Jello?"
d18ada0f, ,"something"
e13b0f94, ,"%llu"
e65aefef, ,"Won't fit : %s%d"
'''

JSON_SOURCE_STRINGS = '''\
[
  "pigweed/pw_polyfill/standard_library_public/pw_polyfill/standard_library/assert.h",
  "protocol_buffer/gen/pigweed/pw_protobuf/common_protos.proto_library/nanopb/pw_protobuf_protos/status.pb.h",
  "pigweed/pw_rpc/client_server.cc",
  "pigweed/pw_rpc/public/pw_rpc/client_server.h",
  "This is a very long string that will produce two tokens; one for C++ and one for C. This is because this string exceeds the default C hash length."
]
'''

CSV_STRINGS = '''\
2cbf627a, ,"pigweed/pw_rpc/client_server.cc"
666562a1, ,"protocol_buffer/gen/pigweed/pw_protobuf/common_protos.proto_library/nanopb/pw_protobuf_protos/status.pb.h"
6c1e6eb3, ,"pigweed/pw_rpc/public/pw_rpc/client_server.h"
b25a9932, ,"This is a very long string that will produce two tokens; one for C++ and one for C. This is because this string exceeds the default C hash length."
eadf017f, ,"pigweed/pw_polyfill/standard_library_public/pw_polyfill/standard_library/assert.h"
f815dc5c, ,"This is a very long string that will produce two tokens; one for C++ and one for C. This is because this string exceeds the default C hash length."
'''

# Report expected for TOKENIZED_ENTRIES_ELF. Tests must treat this as
# read-only; take a deep copy before adjusting any of the nested values.
EXPECTED_REPORT = {
    str(TOKENIZED_ENTRIES_ELF): {
        '': {
            'present_entries': 22,
            'present_size_bytes': 289,
            'total_entries': 22,
            'total_size_bytes': 289,
            'collisions': {}
        },
        'TEST_DOMAIN': {
            'present_entries': 5,
            'present_size_bytes': 57,
            'total_entries': 5,
            'total_size_bytes': 57,
            'collisions': {}
        }
    }
}


def run_cli(*args) -> None:
    """Invokes the database.py command line entry point with the given args.

    Each argument is converted with str() and installed in sys.argv after a
    fake program name, then database._main() is called with the parsed
    arguments. sys.argv is always restored, even if the command raises.
    """
    original_argv = sys.argv
    sys.argv = ['database.py', *(str(a) for a in args)]
    # pylint: disable=protected-access
    try:
        database._main(*database._parse_args())
    finally:
        # Remove the log handler added by _main to avoid duplicate logs.
        if database._LOG.handlers:
            database._LOG.handlers.pop()
        # pylint: enable=protected-access

        # Restore argv here so that commands that are expected to raise do
        # not leak the patched arguments into later tests.
        sys.argv = original_argv


def _mock_output() -> io.TextIOWrapper:
    """Creates a stand-in for sys.stdout backed by an in-memory buffer.

    The wrapper is write-through, so everything written to it can be read
    immediately from the underlying BytesIO via `.buffer.getvalue()`.
    """
    output = io.BytesIO()
    output.name = '<fake stdout>'
    return io.TextIOWrapper(output, write_through=True)


class DatabaseCommandLineTest(unittest.TestCase):
    """Tests the database.py command line interface."""
    def setUp(self):
        # Each test gets its own scratch directory for database files.
        self._dir = Path(tempfile.mkdtemp('_pw_tokenizer_test'))
        self._csv = self._dir / 'db.csv'
        self._elf = TOKENIZED_ENTRIES_ELF

        # Subclasses may override the ELF and the expected TEST_DOMAIN CSV.
        self._csv_test_domain = CSV_TEST_DOMAIN

    def tearDown(self):
        shutil.rmtree(self._dir)

    def test_create_csv(self):
        run_cli('create', '--database', self._csv, self._elf)

        self.assertEqual(CSV_DEFAULT_DOMAIN.splitlines(),
                         self._csv.read_text().splitlines())

    def test_create_csv_test_domain(self):
        run_cli('create', '--database', self._csv, f'{self._elf}#TEST_DOMAIN')

        self.assertEqual(self._csv_test_domain.splitlines(),
                         self._csv.read_text().splitlines())

    def test_create_csv_all_domains(self):
        run_cli('create', '--database', self._csv, f'{self._elf}#.*')

        self.assertEqual(CSV_ALL_DOMAINS.splitlines(),
                         self._csv.read_text().splitlines())

    def test_create_force(self):
        self._csv.write_text(CSV_ALL_DOMAINS)

        # Without --force, creating over an existing database must fail.
        with self.assertRaises(FileExistsError):
            run_cli('create', '--database', self._csv, self._elf)

        run_cli('create', '--force', '--database', self._csv, self._elf)

    def test_create_binary(self):
        binary = self._dir / 'db.bin'
        run_cli('create', '--type', 'binary', '--database', binary, self._elf)

        # Write the binary database as CSV to verify its contents.
        run_cli('create', '--database', self._csv, binary)

        self.assertEqual(CSV_DEFAULT_DOMAIN.splitlines(),
                         self._csv.read_text().splitlines())

    def test_add_does_not_recalculate_tokens(self):
        db_with_custom_token = '01234567, ,"hello"'

        to_add = self._dir / 'add_this.csv'
        to_add.write_text(db_with_custom_token + '\n')
        self._csv.touch()

        run_cli('add', '--database', self._csv, to_add)
        self.assertEqual(db_with_custom_token.splitlines(),
                         self._csv.read_text().splitlines())

    def test_mark_removed(self):
        self._csv.write_text(CSV_ALL_DOMAINS)

        run_cli('mark_removed', '--database', self._csv, '--date',
                '1998-09-04', self._elf)

        # Add the removal date to the four tokens not in the default domain
        new_csv = CSV_ALL_DOMAINS
        new_csv = new_csv.replace('17fa86d3, ,"hello"',
                                  '17fa86d3,1998-09-04,"hello"')
        new_csv = new_csv.replace('18c5017c, ,"yes"',
                                  '18c5017c,1998-09-04,"yes"')
        new_csv = new_csv.replace('59b2701c, ,"The answer was: %s"',
                                  '59b2701c,1998-09-04,"The answer was: %s"')
        new_csv = new_csv.replace('d18ada0f, ,"something"',
                                  'd18ada0f,1998-09-04,"something"')
        # Guard against the replacements silently matching nothing.
        self.assertNotEqual(CSV_ALL_DOMAINS, new_csv)

        self.assertEqual(new_csv.splitlines(),
                         self._csv.read_text().splitlines())

    def test_purge(self):
        self._csv.write_text(CSV_ALL_DOMAINS)

        # Mark everything not in TEST_DOMAIN as removed.
        run_cli('mark_removed', '--database', self._csv,
                f'{self._elf}#TEST_DOMAIN')

        # Delete all entries except those in TEST_DOMAIN.
        run_cli('purge', '--database', self._csv)

        self.assertEqual(self._csv_test_domain.splitlines(),
                         self._csv.read_text().splitlines())

    @mock.patch('sys.stdout', new_callable=_mock_output)
    def test_report(self, mock_stdout):
        run_cli('report', self._elf)

        self.assertEqual(json.loads(mock_stdout.buffer.getvalue()),
                         EXPECTED_REPORT)

    def test_replace(self):
        sub = 'replace/ment'
        run_cli('create', '--database', self._csv, self._elf, '--replace',
                r'(?i)\b[jh]ello\b/' + sub)
        self.assertEqual(
            CSV_DEFAULT_DOMAIN.replace('Jello', sub).replace('Hello', sub),
            self._csv.read_text())

    def test_json_strings(self):
        strings_file = self._dir / "strings.json"
        strings_file.write_text(JSON_SOURCE_STRINGS)

        run_cli('create', '--force', '--database', self._csv, strings_file)
        self.assertEqual(CSV_STRINGS.splitlines(),
                         self._csv.read_text().splitlines())


class LegacyDatabaseCommandLineTest(DatabaseCommandLineTest):
    """Test an ELF with the legacy plain string storage format."""
    def setUp(self):
        super().setUp()
        self._elf = LEGACY_PLAIN_STRING_ELF

        # The legacy approach for storing tokenized strings in an ELF always
        # adds an entry for "", even if the empty string was never tokenized.
        self._csv_test_domain = '00000000, ,""\n' + CSV_TEST_DOMAIN

    @mock.patch('sys.stdout', new_callable=_mock_output)
    def test_report(self, mock_stdout):
        run_cli('report', self._elf)

        # Deep copy: the nested per-domain dicts are modified below, and a
        # shallow copy would change the shared EXPECTED_REPORT constant that
        # DatabaseCommandLineTest.test_report compares against.
        report = copy.deepcopy(EXPECTED_REPORT[str(TOKENIZED_ENTRIES_ELF)])

        # Count the implicitly added "" entry in TEST_DOMAIN.
        report['TEST_DOMAIN']['present_entries'] += 1
        report['TEST_DOMAIN']['present_size_bytes'] += 1
        report['TEST_DOMAIN']['total_entries'] += 1
        report['TEST_DOMAIN']['total_size_bytes'] += 1

        # Rename "" to the legacy name "default"
        report['default'] = report['']
        del report['']

        self.assertEqual({str(LEGACY_PLAIN_STRING_ELF): report},
                         json.loads(mock_stdout.buffer.getvalue()))


if __name__ == '__main__':
    unittest.main()