1 // Copyright 2020 The Pigweed Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not 4 // use this file except in compliance with the License. You may obtain a copy of 5 // the License at 6 // 7 // https://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 12 // License for the specific language governing permissions and limitations under 13 // the License. 14 15 // This file provides the Detokenizer class, which is used to decode tokenized 16 // strings. To use a Detokenizer, load a binary format token database into 17 // memory, construct a TokenDatabase, and pass it to a Detokenizer: 18 // 19 // std::vector data = ReadFile("my_tokenized_strings.db"); 20 // Detokenizer detok(TokenDatabase::Create(data)); 21 // 22 // DetokenizedString result = detok.Detokenize(my_data); 23 // std::cout << result.BestString() << '\n'; 24 // 25 #pragma once 26 27 #include <cstddef> 28 #include <cstdint> 29 #include <string> 30 #include <unordered_map> 31 #include <utility> 32 #include <vector> 33 34 #include "pw_span/span.h" 35 #include "pw_tokenizer/internal/decode.h" 36 #include "pw_tokenizer/token_database.h" 37 38 namespace pw::tokenizer { 39 40 using TokenizedStringEntry = std::pair<FormatString, uint32_t /*date removed*/>; 41 42 // A string that has been detokenized. This class tracks all possible results if 43 // there are token collisions. 44 class DetokenizedString { 45 public: 46 DetokenizedString(uint32_t token, 47 const span<const TokenizedStringEntry>& entries, 48 const span<const uint8_t>& arguments); 49 DetokenizedString()50 DetokenizedString() : has_token_(false) {} 51 52 // True if there was only one valid match and it decoded successfully. ok()53 bool ok() const { return matches_.size() == 1 && matches_[0].ok(); } 54 55 // Returns the strings that matched the token, with the best matches first. matches()56 const std::vector<DecodedFormatString>& matches() const { return matches_; } 57 token()58 const uint32_t& token() const { return token_; } 59 60 // Returns the detokenized string or an empty string if there were no matches. 61 // If there are multiple possible results, the DetokenizedString returns the 62 // first match. 63 std::string BestString() const; 64 65 // Returns the best match, with error messages inserted for arguments that 66 // failed to parse. 67 std::string BestStringWithErrors() const; 68 69 private: 70 uint32_t token_; 71 bool has_token_; 72 std::vector<DecodedFormatString> matches_; 73 }; 74 75 // Decodes and detokenizes strings from a TokenDatabase. This class builds a 76 // hash table from the TokenDatabase to give O(1) token lookups. 77 class Detokenizer { 78 public: 79 // Constructs a detokenizer from a TokenDatabase. The TokenDatabase is not 80 // referenced by the Detokenizer after construction; its memory can be freed. 81 Detokenizer(const TokenDatabase& database); 82 83 // Decodes and detokenizes the encoded message. Returns a DetokenizedString 84 // that stores all possible detokenized string results. 85 DetokenizedString Detokenize(const span<const uint8_t>& encoded) const; 86 Detokenize(const std::string_view & encoded)87 DetokenizedString Detokenize(const std::string_view& encoded) const { 88 return Detokenize(encoded.data(), encoded.size()); 89 } 90 Detokenize(const void * encoded,size_t size_bytes)91 DetokenizedString Detokenize(const void* encoded, size_t size_bytes) const { 92 return Detokenize(span(static_cast<const uint8_t*>(encoded), size_bytes)); 93 } 94 95 private: 96 std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database_; 97 }; 98 99 } // namespace pw::tokenizer 100