• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2020 The Pigweed Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
4 // use this file except in compliance with the License. You may obtain a copy of
5 // the License at
6 //
7 //     https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12 // License for the specific language governing permissions and limitations under
13 // the License.
14 
15 // This file provides the Detokenizer class, which is used to decode tokenized
16 // strings.  To use a Detokenizer, load a binary format token database into
17 // memory, construct a TokenDatabase, and pass it to a Detokenizer:
18 //
19 //   std::vector data = ReadFile("my_tokenized_strings.db");
20 //   Detokenizer detok(TokenDatabase::Create(data));
21 //
22 //   DetokenizedString result = detok.Detokenize(my_data);
23 //   std::cout << result.BestString() << '\n';
24 //
25 #pragma once
26 
27 #include <cstddef>
28 #include <cstdint>
29 #include <string>
30 #include <unordered_map>
31 #include <utility>
32 #include <vector>
33 
34 #include "pw_result/result.h"
35 #include "pw_span/span.h"
36 #include "pw_stream/stream.h"
37 #include "pw_tokenizer/internal/decode.h"
38 #include "pw_tokenizer/token_database.h"
39 
40 namespace pw::tokenizer {
41 
42 /// @defgroup pw_tokenizer_detokenize
43 /// @{
44 
45 /// Token database entry.
46 using TokenizedStringEntry = std::pair<FormatString, uint32_t /*date removed*/>;
47 using DomainTokenEntriesMap = std::unordered_map<
48     std::string,
49     std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>>>;
50 
51 /// A string that has been detokenized. This class tracks all possible results
52 /// if there are token collisions.
53 class DetokenizedString {
54  public:
55   DetokenizedString(uint32_t token,
56                     const span<const TokenizedStringEntry>& entries,
57                     const span<const std::byte>& arguments);
58 
DetokenizedString()59   DetokenizedString() : has_token_(false) {}
60 
61   /// True if there was only one valid match and it decoded successfully.
ok()62   bool ok() const { return matches_.size() == 1 && matches_[0].ok(); }
63 
64   /// Returns the strings that matched the token, with the best matches first.
matches()65   const std::vector<DecodedFormatString>& matches() const { return matches_; }
66 
token()67   const uint32_t& token() const { return token_; }
68 
69   /// Returns the detokenized string or an empty string if there were no
70   /// matches. If there are multiple possible results, the `DetokenizedString`
71   /// returns the first match.
72   std::string BestString() const;
73 
74   /// Returns the best match, with error messages inserted for arguments that
75   /// failed to parse.
76   std::string BestStringWithErrors() const;
77 
78  private:
79   uint32_t token_;
80   bool has_token_;
81   std::vector<DecodedFormatString> matches_;
82 };
83 
84 /// Decodes and detokenizes from a token database. This class builds a hash
85 /// table of tokens to give `O(1)` token lookups.
86 class Detokenizer {
87  public:
88   /// Constructs a detokenizer from a `TokenDatabase`. The `TokenDatabase` is
89   /// not referenced by the `Detokenizer` after construction; its memory can be
90   /// freed.
91   explicit Detokenizer(const TokenDatabase& database);
92 
93   /// Constructs a detokenizer by directly passing the parsed database.
Detokenizer(std::unordered_map<std::string,std::unordered_map<uint32_t,std::vector<TokenizedStringEntry>>> && database)94   explicit Detokenizer(
95       std::unordered_map<
96           std::string,
97           std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>>>&&
98           database)
99       : database_(std::move(database)) {}
100 
101   /// Constructs a detokenizer from the `.pw_tokenizer.entries` section of an
102   /// ELF binary.
103   static Result<Detokenizer> FromElfSection(span<const std::byte> elf_section);
104 
105   /// Overload of `FromElfSection` for a `uint8_t` span.
FromElfSection(span<const uint8_t> elf_section)106   static Result<Detokenizer> FromElfSection(span<const uint8_t> elf_section) {
107     return FromElfSection(as_bytes(elf_section));
108   }
109 
110   /// Constructs a detokenizer from the `.pw_tokenizer.entries` section of an
111   /// ELF binary.
112   static Result<Detokenizer> FromElfFile(stream::SeekableReader& stream);
113 
114   /// Constructs a detokenizer from a parsed CSV database.
115   static Result<Detokenizer> FromCsv(std::string_view csv);
116 
117   /// Decodes and detokenizes the binary encoded message. Returns a
118   /// `DetokenizedString` that stores all possible detokenized string results.
119   DetokenizedString Detokenize(const span<const std::byte>& encoded) const;
120 
121   /// Overload of `Detokenize` for `span<const uint8_t>`.
Detokenize(const span<const uint8_t> & encoded)122   DetokenizedString Detokenize(const span<const uint8_t>& encoded) const {
123     return Detokenize(as_bytes(encoded));
124   }
125 
126   /// Overload of `Detokenize` for `std::string_view`.
Detokenize(std::string_view encoded)127   DetokenizedString Detokenize(std::string_view encoded) const {
128     return Detokenize(encoded.data(), encoded.size());
129   }
130 
131   /// Overload of `Detokenize` for a pointer and length.
Detokenize(const void * encoded,size_t size_bytes)132   DetokenizedString Detokenize(const void* encoded, size_t size_bytes) const {
133     return Detokenize(span(static_cast<const std::byte*>(encoded), size_bytes));
134   }
135 
136   /// Decodes and detokenizes a Base64-encoded message. Returns a
137   /// `DetokenizedString` that stores all possible detokenized string results.
138   DetokenizedString DetokenizeBase64Message(std::string_view text) const;
139 
140   /// Decodes and detokenizes nested tokenized messages in a string.
141   ///
142   /// This function currently only supports Base64 nested tokenized messages.
143   /// Support for hexadecimal-encoded string literals will be added.
144   ///
145   /// @param[in] text Text potentially containing tokenized messages.
146   ///
147   /// @param[in] max_passes `DetokenizeText` supports recursive detokenization.
148   /// Tokens can expand to other tokens. The maximum number of detokenization
149   /// passes is specified by `max_passes` (0 is equivalent to 1).
150   ///
151   /// @returns The original string with nested tokenized messages decoded in
152   ///     context. Messages that fail to decode are left as-is.
153   std::string DetokenizeText(std::string_view text,
154                              unsigned max_passes = 3) const;
155 
156   /// Deprecated version of `DetokenizeText` with no recursive detokenization.
157   /// @deprecated Call `DetokenizeText` instead.
DetokenizeBase64(std::string_view text)158   [[deprecated("Use DetokenizeText() instead")]] std::string DetokenizeBase64(
159       std::string_view text) const {
160     return DetokenizeText(text, 1);
161   }
162 
163   /// Decodes data that may or may not be tokenized, such as proto fields marked
164   /// as optionally tokenized.
165   ///
166   /// This function currently only supports Base64 nested tokenized messages.
167   /// Support for hexadecimal-encoded string literals will be added.
168   ///
169   /// This function currently assumes when data is not tokenized it is printable
170   /// ASCII. Otherwise, the returned string will be base64-encoded.
171   ///
172   /// @param[in] optionally_tokenized_data Data optionally tokenized.
173   ///
174   /// @returns The decoded text if successfully detokenized or if the data is
175   /// printable, otherwise returns the data base64-encoded.
176   std::string DecodeOptionallyTokenizedData(
177       const span<const std::byte>& optionally_tokenized_data);
178 
database()179   const DomainTokenEntriesMap& database() const { return database_; }
180 
181  private:
182   DomainTokenEntriesMap database_;
183 };
184 
185 /// @}
186 
187 }  // namespace pw::tokenizer
188