pw_tokenizer/py/database_test.py

#!/usr/bin/env python3
# Copyright 2020 The Pigweed Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
"""Tests for the database module."""

import json
import io
from pathlib import Path
import shutil
import sys
import tempfile
import unittest
from unittest import mock

from pw_tokenizer import database

# This is an ELF file with only the pw_tokenizer sections. It was created
# from a tokenize_test binary built for the STM32F429i Discovery board. The
# pw_tokenizer sections were extracted with this command:
#
#   arm-none-eabi-objcopy -S --only-section ".pw_tokenize*" <ELF> <OUTPUT>
#
TOKENIZED_ENTRIES_ELF = Path(
    __file__).parent / 'example_binary_with_tokenized_strings.elf'
# Second example ELF, located next to this file. It is the binary used by
# LegacyDatabaseCommandLineTest, which describes it as using the legacy plain
# string storage format.
LEGACY_PLAIN_STRING_ELF = Path(
    __file__).parent / 'example_legacy_binary_with_tokenized_strings.elf'

# Expected CSV database when `create` is run on the example ELF without a
# domain suffix (see test_create_csv). Each row has three comma-separated
# fields: a hexadecimal token, a field that is blank here (test_mark_removed
# expects it to hold the removal date), and the quoted string.
CSV_DEFAULT_DOMAIN = '''\
00000000,          ,""
141c35d5,          ,"The answer: ""%s"""
29aef586,          ,"1234"
2b78825f,          ,"[:-)"
2e668cd6,          ,"Jello, world!"
31631781,          ,"%d"
61fd1e26,          ,"%ld"
68ab92da,          ,"%s there are %x (%.2f) of them%c"
7b940e2a,          ,"Hello %s! %hd %e"
7da55d52,          ,">:-[]"
7f35a9a5,          ,"TestName"
851beeb6,          ,"%u %d"
881436a0,          ,"The answer is: %s"
88808930,          ,"%u%d%02x%X%hu%hhd%d%ld%lu%lld%llu%c%c%c"
92723f44,          ,"???"
a09d6698,          ,"won-won-won-wonderful"
aa9ffa66,          ,"void pw::tokenizer::{anonymous}::TestName()"
ad002c97,          ,"%llx"
b3653e13,          ,"Jello!"
cc6d3131,          ,"Jello?"
e13b0f94,          ,"%llu"
e65aefef,          ,"Won't fit : %s%d"
'''

# Expected CSV database when only the TEST_DOMAIN entries of the example ELF
# are requested, i.e. the path is given as '<elf>#TEST_DOMAIN' (see
# test_create_csv_test_domain).
CSV_TEST_DOMAIN = """\
17fa86d3,          ,"hello"
18c5017c,          ,"yes"
59b2701c,          ,"The answer was: %s"
881436a0,          ,"The answer is: %s"
d18ada0f,          ,"something"
"""

# Expected CSV database when every domain is selected with '<elf>#.*' (see
# test_create_csv_all_domains). It is the union of the rows in
# CSV_DEFAULT_DOMAIN and CSV_TEST_DOMAIN, ordered by token; the row for
# 881436a0 is in both of those and appears once here.
CSV_ALL_DOMAINS = '''\
00000000,          ,""
141c35d5,          ,"The answer: ""%s"""
17fa86d3,          ,"hello"
18c5017c,          ,"yes"
29aef586,          ,"1234"
2b78825f,          ,"[:-)"
2e668cd6,          ,"Jello, world!"
31631781,          ,"%d"
59b2701c,          ,"The answer was: %s"
61fd1e26,          ,"%ld"
68ab92da,          ,"%s there are %x (%.2f) of them%c"
7b940e2a,          ,"Hello %s! %hd %e"
7da55d52,          ,">:-[]"
7f35a9a5,          ,"TestName"
851beeb6,          ,"%u %d"
881436a0,          ,"The answer is: %s"
88808930,          ,"%u%d%02x%X%hu%hhd%d%ld%lu%lld%llu%c%c%c"
92723f44,          ,"???"
a09d6698,          ,"won-won-won-wonderful"
aa9ffa66,          ,"void pw::tokenizer::{anonymous}::TestName()"
ad002c97,          ,"%llx"
b3653e13,          ,"Jello!"
cc6d3131,          ,"Jello?"
d18ada0f,          ,"something"
e13b0f94,          ,"%llu"
e65aefef,          ,"Won't fit : %s%d"
'''

# JSON array of strings that test_json_strings writes to a file and passes to
# `create` as the database source.
JSON_SOURCE_STRINGS = '''\
[
  "pigweed/pw_polyfill/standard_library_public/pw_polyfill/standard_library/assert.h",
  "protocol_buffer/gen/pigweed/pw_protobuf/common_protos.proto_library/nanopb/pw_protobuf_protos/status.pb.h",
  "pigweed/pw_rpc/client_server.cc",
  "pigweed/pw_rpc/public/pw_rpc/client_server.h",
  "This is a very long string that will produce two tokens; one for C++ and one for C. This is because this string exceeds the default C hash length."
]
'''

# Expected CSV database created from JSON_SOURCE_STRINGS (see
# test_json_strings). The long final JSON string is listed under two
# different tokens (b25a9932 and f815dc5c), so five source strings yield six
# rows.
CSV_STRINGS = '''\
2cbf627a,          ,"pigweed/pw_rpc/client_server.cc"
666562a1,          ,"protocol_buffer/gen/pigweed/pw_protobuf/common_protos.proto_library/nanopb/pw_protobuf_protos/status.pb.h"
6c1e6eb3,          ,"pigweed/pw_rpc/public/pw_rpc/client_server.h"
b25a9932,          ,"This is a very long string that will produce two tokens; one for C++ and one for C. This is because this string exceeds the default C hash length."
eadf017f,          ,"pigweed/pw_polyfill/standard_library_public/pw_polyfill/standard_library/assert.h"
f815dc5c,          ,"This is a very long string that will produce two tokens; one for C++ and one for C. This is because this string exceeds the default C hash length."
'''

# Expected JSON output of the `report` command for TOKENIZED_ENTRIES_ELF (see
# test_report). The outer key is the ELF path and the inner keys are domain
# names; the entry counts match the number of rows in CSV_DEFAULT_DOMAIN (22)
# and CSV_TEST_DOMAIN (5).
#
# NOTE: Tests that need a modified variant must copy the nested dicts before
# changing them, since this object is shared by every test in the module.
EXPECTED_REPORT = {
    str(TOKENIZED_ENTRIES_ELF): {
        '': {
            'present_entries': 22,
            'present_size_bytes': 289,
            'total_entries': 22,
            'total_size_bytes': 289,
            'collisions': {}
        },
        'TEST_DOMAIN': {
            'present_entries': 5,
            'present_size_bytes': 57,
            'total_entries': 5,
            'total_size_bytes': 57,
            'collisions': {}
        }
    }
}


def run_cli(*args) -> None:
    """Runs the database.py command line interface with the given arguments.

    sys.argv is temporarily replaced so that database._parse_args() parses
    the provided arguments. sys.argv and the handlers attached to the
    database module's logger are restored before returning, even if the
    command raises.

    Args:
      *args: Command line arguments. Each one is converted with str(), so
          Path objects may be passed directly.

    Raises:
      Any exception raised by database._parse_args() or database._main() is
      propagated to the caller.
    """
    original_argv = sys.argv
    # pylint: disable=protected-access
    # Snapshot the handlers so cleanup removes only what this call added. A
    # blind pop() would drop an unrelated handler if the command failed before
    # installing its own.
    original_handlers = list(database._LOG.handlers)
    sys.argv = ['database.py', *(str(a) for a in args)]
    try:
        database._main(*database._parse_args())
    finally:
        # Remove the log handler added by _main to avoid duplicate logs.
        for handler in list(database._LOG.handlers):
            if handler not in original_handlers:
                database._LOG.removeHandler(handler)
        # pylint: enable=protected-access

        sys.argv = original_argv


def _mock_output() -> io.TextIOWrapper:
    """Creates an in-memory text stream to use in place of sys.stdout.

    Text written to the returned stream is passed straight through to the
    underlying bytes buffer, which is reachable as the stream's .buffer
    attribute.
    """
    fake_buffer = io.BytesIO()
    # Give the buffer a name attribute; BytesIO objects do not have one.
    fake_buffer.name = '<fake stdout>'
    return io.TextIOWrapper(buffer=fake_buffer, write_through=True)


class DatabaseCommandLineTest(unittest.TestCase):
    """Tests the database.py command line interface.

    Each test drives the CLI through run_cli() using the ELF in self._elf and
    checks the database file written to a per-test temporary directory.
    Subclasses may override self._elf and self._csv_test_domain in setUp() to
    run the same tests against a different binary.
    """
    def setUp(self):
        # Every test gets its own scratch directory; tearDown() deletes it.
        self._dir = Path(tempfile.mkdtemp('_pw_tokenizer_test'))
        self._csv = self._dir / 'db.csv'
        self._elf = TOKENIZED_ENTRIES_ELF

        # Expected CSV for TEST_DOMAIN. It is stored on the instance so that
        # subclasses can substitute the contents expected for their ELF.
        self._csv_test_domain = CSV_TEST_DOMAIN

    def tearDown(self):
        shutil.rmtree(self._dir)

    def test_create_csv(self):
        """create without a domain suffix writes the default domain."""
        run_cli('create', '--database', self._csv, self._elf)

        self.assertEqual(CSV_DEFAULT_DOMAIN.splitlines(),
                         self._csv.read_text().splitlines())

    def test_create_csv_test_domain(self):
        """create with '#TEST_DOMAIN' writes only that domain's entries."""
        run_cli('create', '--database', self._csv, f'{self._elf}#TEST_DOMAIN')

        self.assertEqual(self._csv_test_domain.splitlines(),
                         self._csv.read_text().splitlines())

    def test_create_csv_all_domains(self):
        """create with the '#.*' suffix writes the entries of all domains."""
        run_cli('create', '--database', self._csv, f'{self._elf}#.*')

        self.assertEqual(CSV_ALL_DOMAINS.splitlines(),
                         self._csv.read_text().splitlines())

    def test_create_force(self):
        """create only overwrites an existing database with --force."""
        self._csv.write_text(CSV_ALL_DOMAINS)

        with self.assertRaises(FileExistsError):
            run_cli('create', '--database', self._csv, self._elf)

        run_cli('create', '--force', '--database', self._csv, self._elf)

        # Check that --force actually replaced the previous contents rather
        # than merely suppressing the error.
        self.assertEqual(CSV_DEFAULT_DOMAIN.splitlines(),
                         self._csv.read_text().splitlines())

    def test_create_binary(self):
        """A binary database round-trips to the same CSV contents."""
        binary = self._dir / 'db.bin'
        run_cli('create', '--type', 'binary', '--database', binary, self._elf)

        # Write the binary database as CSV to verify its contents.
        run_cli('create', '--database', self._csv, binary)

        self.assertEqual(CSV_DEFAULT_DOMAIN.splitlines(),
                         self._csv.read_text().splitlines())

    def test_add_does_not_recalculate_tokens(self):
        """add keeps the token stored in the source database as-is."""
        db_with_custom_token = '01234567,          ,"hello"'

        to_add = self._dir / 'add_this.csv'
        to_add.write_text(db_with_custom_token + '\n')
        # Start from an existing, empty database file.
        self._csv.touch()

        run_cli('add', '--database', self._csv, to_add)
        self.assertEqual(db_with_custom_token.splitlines(),
                         self._csv.read_text().splitlines())

    def test_mark_removed(self):
        """mark_removed dates the entries that are missing from the ELF."""
        self._csv.write_text(CSV_ALL_DOMAINS)

        run_cli('mark_removed', '--database', self._csv, '--date',
                '1998-09-04', self._elf)

        # Add the removal date to the four tokens not in the default domain
        new_csv = CSV_ALL_DOMAINS
        new_csv = new_csv.replace('17fa86d3,          ,"hello"',
                                  '17fa86d3,1998-09-04,"hello"')
        new_csv = new_csv.replace('18c5017c,          ,"yes"',
                                  '18c5017c,1998-09-04,"yes"')
        new_csv = new_csv.replace('59b2701c,          ,"The answer was: %s"',
                                  '59b2701c,1998-09-04,"The answer was: %s"')
        new_csv = new_csv.replace('d18ada0f,          ,"something"',
                                  'd18ada0f,1998-09-04,"something"')
        # Guard against the replacements above silently matching nothing.
        self.assertNotEqual(CSV_ALL_DOMAINS, new_csv)

        self.assertEqual(new_csv.splitlines(),
                         self._csv.read_text().splitlines())

    def test_purge(self):
        """purge deletes the entries that were marked as removed."""
        self._csv.write_text(CSV_ALL_DOMAINS)

        # Mark everything not in TEST_DOMAIN as removed.
        run_cli('mark_removed', '--database', self._csv,
                f'{self._elf}#TEST_DOMAIN')

        # Delete all entries except those in TEST_DOMAIN.
        run_cli('purge', '--database', self._csv)

        self.assertEqual(self._csv_test_domain.splitlines(),
                         self._csv.read_text().splitlines())

    @mock.patch('sys.stdout', new_callable=_mock_output)
    def test_report(self, mock_stdout):
        """report prints the expected JSON summary to stdout."""
        run_cli('report', self._elf)

        self.assertEqual(json.loads(mock_stdout.buffer.getvalue()),
                         EXPECTED_REPORT)

    def test_replace(self):
        """--replace substitutes regex matches in the database strings."""
        sub = 'replace/ment'
        # The argument has the form <regex>/<replacement>; the replacement
        # used here contains a '/' of its own.
        run_cli('create', '--database', self._csv, self._elf, '--replace',
                r'(?i)\b[jh]ello\b/' + sub)
        self.assertEqual(
            CSV_DEFAULT_DOMAIN.replace('Jello', sub).replace('Hello', sub),
            self._csv.read_text())

    def test_json_strings(self):
        """create accepts a JSON file containing a list of strings."""
        strings_file = self._dir / "strings.json"
        strings_file.write_text(JSON_SOURCE_STRINGS)

        run_cli('create', '--force', '--database', self._csv, strings_file)
        self.assertEqual(CSV_STRINGS.splitlines(),
                         self._csv.read_text().splitlines())


class LegacyDatabaseCommandLineTest(DatabaseCommandLineTest):
    """Test an ELF with the legacy plain string storage format.

    Runs every test inherited from DatabaseCommandLineTest against
    LEGACY_PLAIN_STRING_ELF and overrides the expectations that differ.
    """
    def setUp(self):
        super().setUp()
        self._elf = LEGACY_PLAIN_STRING_ELF

        # The legacy approach for storing tokenized strings in an ELF always
        # adds an entry for "", even if the empty string was never tokenized.
        self._csv_test_domain = '00000000,          ,""\n' + CSV_TEST_DOMAIN

    @mock.patch('sys.stdout', new_callable=_mock_output)
    def test_report(self, mock_stdout):
        """report output matches EXPECTED_REPORT plus the legacy changes."""
        run_cli('report', self._elf)

        # Copy the per-domain dicts as well as the outer mapping. A shallow
        # .copy() would share them with EXPECTED_REPORT, so the increments
        # below would modify the module-level constant used by other tests.
        report = {
            domain: dict(stats)
            for domain, stats in EXPECTED_REPORT[str(
                TOKENIZED_ENTRIES_ELF)].items()
        }

        # Count the implicitly added "" entry in TEST_DOMAIN.
        report['TEST_DOMAIN']['present_entries'] += 1
        report['TEST_DOMAIN']['present_size_bytes'] += 1
        report['TEST_DOMAIN']['total_entries'] += 1
        report['TEST_DOMAIN']['total_size_bytes'] += 1

        # Rename "" to the legacy name "default"
        report['default'] = report.pop('')

        self.assertEqual({str(LEGACY_PLAIN_STRING_ELF): report},
                         json.loads(mock_stdout.buffer.getvalue()))


# Run all tests in this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()