1 // Copyright 2020 The Pigweed Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not 4 // use this file except in compliance with the License. You may obtain a copy of 5 // the License at 6 // 7 // https://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 12 // License for the specific language governing permissions and limitations under 13 // the License. 14 15 // This file provides the Detokenizer class, which is used to decode tokenized 16 // strings. To use a Detokenizer, load a binary format token database into 17 // memory, construct a TokenDatabase, and pass it to a Detokenizer: 18 // 19 // std::vector data = ReadFile("my_tokenized_strings.db"); 20 // Detokenizer detok(TokenDatabase::Create(data)); 21 // 22 // DetokenizedString result = detok.Detokenize(my_data); 23 // std::cout << result.BestString() << '\n'; 24 // 25 #pragma once 26 27 #include <cstddef> 28 #include <cstdint> 29 #include <string> 30 #include <unordered_map> 31 #include <utility> 32 #include <vector> 33 34 #include "pw_result/result.h" 35 #include "pw_span/span.h" 36 #include "pw_stream/stream.h" 37 #include "pw_tokenizer/internal/decode.h" 38 #include "pw_tokenizer/token_database.h" 39 40 namespace pw::tokenizer { 41 42 /// @defgroup pw_tokenizer_detokenize 43 /// @{ 44 45 /// Token database entry. 46 using TokenizedStringEntry = std::pair<FormatString, uint32_t /*date removed*/>; 47 using DomainTokenEntriesMap = std::unordered_map< 48 std::string, 49 std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>>>; 50 51 /// A string that has been detokenized. This class tracks all possible results 52 /// if there are token collisions. 53 class DetokenizedString { 54 public: 55 DetokenizedString(uint32_t token, 56 const span<const TokenizedStringEntry>& entries, 57 const span<const std::byte>& arguments); 58 DetokenizedString()59 DetokenizedString() : has_token_(false) {} 60 61 /// True if there was only one valid match and it decoded successfully. ok()62 bool ok() const { return matches_.size() == 1 && matches_[0].ok(); } 63 64 /// Returns the strings that matched the token, with the best matches first. matches()65 const std::vector<DecodedFormatString>& matches() const { return matches_; } 66 token()67 const uint32_t& token() const { return token_; } 68 69 /// Returns the detokenized string or an empty string if there were no 70 /// matches. If there are multiple possible results, the `DetokenizedString` 71 /// returns the first match. 72 std::string BestString() const; 73 74 /// Returns the best match, with error messages inserted for arguments that 75 /// failed to parse. 76 std::string BestStringWithErrors() const; 77 78 private: 79 uint32_t token_; 80 bool has_token_; 81 std::vector<DecodedFormatString> matches_; 82 }; 83 84 /// Decodes and detokenizes from a token database. This class builds a hash 85 /// table of tokens to give `O(1)` token lookups. 86 class Detokenizer { 87 public: 88 /// Constructs a detokenizer from a `TokenDatabase`. The `TokenDatabase` is 89 /// not referenced by the `Detokenizer` after construction; its memory can be 90 /// freed. 91 explicit Detokenizer(const TokenDatabase& database); 92 93 /// Constructs a detokenizer by directly passing the parsed database. Detokenizer(std::unordered_map<std::string,std::unordered_map<uint32_t,std::vector<TokenizedStringEntry>>> && database)94 explicit Detokenizer( 95 std::unordered_map< 96 std::string, 97 std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>>>&& 98 database) 99 : database_(std::move(database)) {} 100 101 /// Constructs a detokenizer from the `.pw_tokenizer.entries` section of an 102 /// ELF binary. 103 static Result<Detokenizer> FromElfSection(span<const std::byte> elf_section); 104 105 /// Overload of `FromElfSection` for a `uint8_t` span. FromElfSection(span<const uint8_t> elf_section)106 static Result<Detokenizer> FromElfSection(span<const uint8_t> elf_section) { 107 return FromElfSection(as_bytes(elf_section)); 108 } 109 110 /// Constructs a detokenizer from the `.pw_tokenizer.entries` section of an 111 /// ELF binary. 112 static Result<Detokenizer> FromElfFile(stream::SeekableReader& stream); 113 114 /// Constructs a detokenizer from a parsed CSV database. 115 static Result<Detokenizer> FromCsv(std::string_view csv); 116 117 /// Decodes and detokenizes the binary encoded message. Returns a 118 /// `DetokenizedString` that stores all possible detokenized string results. 119 DetokenizedString Detokenize(const span<const std::byte>& encoded) const; 120 121 /// Overload of `Detokenize` for `span<const uint8_t>`. Detokenize(const span<const uint8_t> & encoded)122 DetokenizedString Detokenize(const span<const uint8_t>& encoded) const { 123 return Detokenize(as_bytes(encoded)); 124 } 125 126 /// Overload of `Detokenize` for `std::string_view`. Detokenize(std::string_view encoded)127 DetokenizedString Detokenize(std::string_view encoded) const { 128 return Detokenize(encoded.data(), encoded.size()); 129 } 130 131 /// Overload of `Detokenize` for a pointer and length. Detokenize(const void * encoded,size_t size_bytes)132 DetokenizedString Detokenize(const void* encoded, size_t size_bytes) const { 133 return Detokenize(span(static_cast<const std::byte*>(encoded), size_bytes)); 134 } 135 136 /// Decodes and detokenizes a Base64-encoded message. Returns a 137 /// `DetokenizedString` that stores all possible detokenized string results. 138 DetokenizedString DetokenizeBase64Message(std::string_view text) const; 139 140 /// Decodes and detokenizes nested tokenized messages in a string. 141 /// 142 /// This function currently only supports Base64 nested tokenized messages. 143 /// Support for hexadecimal-encoded string literals will be added. 144 /// 145 /// @param[in] text Text potentially containing tokenized messages. 146 /// 147 /// @param[in] max_passes `DetokenizeText` supports recursive detokenization. 148 /// Tokens can expand to other tokens. The maximum number of detokenization 149 /// passes is specified by `max_passes` (0 is equivalent to 1). 150 /// 151 /// @returns The original string with nested tokenized messages decoded in 152 /// context. Messages that fail to decode are left as-is. 153 std::string DetokenizeText(std::string_view text, 154 unsigned max_passes = 3) const; 155 156 /// Deprecated version of `DetokenizeText` with no recursive detokenization. 157 /// @deprecated Call `DetokenizeText` instead. DetokenizeBase64(std::string_view text)158 [[deprecated("Use DetokenizeText() instead")]] std::string DetokenizeBase64( 159 std::string_view text) const { 160 return DetokenizeText(text, 1); 161 } 162 163 /// Decodes data that may or may not be tokenized, such as proto fields marked 164 /// as optionally tokenized. 165 /// 166 /// This function currently only supports Base64 nested tokenized messages. 167 /// Support for hexadecimal-encoded string literals will be added. 168 /// 169 /// This function currently assumes when data is not tokenized it is printable 170 /// ASCII. Otherwise, the returned string will be base64-encoded. 171 /// 172 /// @param[in] optionally_tokenized_data Data optionally tokenized. 173 /// 174 /// @returns The decoded text if successfully detokenized or if the data is 175 /// printable, otherwise returns the data base64-encoded. 176 std::string DecodeOptionallyTokenizedData( 177 const span<const std::byte>& optionally_tokenized_data); 178 database()179 const DomainTokenEntriesMap& database() const { return database_; } 180 181 private: 182 DomainTokenEntriesMap database_; 183 }; 184 185 /// @} 186 187 } // namespace pw::tokenizer 188