1 // Copyright 2020 The Pigweed Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
4 // use this file except in compliance with the License. You may obtain a copy of
5 // the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12 // License for the specific language governing permissions and limitations under
13 // the License.
14
15 #include "pw_tokenizer/detokenize.h"
16
17 #include <algorithm>
18 #include <cctype>
19 #include <cstring>
20 #include <string_view>
21 #include <vector>
22
23 #include "pw_bytes/bit.h"
24 #include "pw_bytes/endian.h"
25 #include "pw_result/result.h"
26 #include "pw_tokenizer/base64.h"
27 #include "pw_tokenizer/internal/decode.h"
28 #include "pw_tokenizer/nested_tokenization.h"
29
30 namespace pw::tokenizer {
31 namespace {
32
// Incrementally scans text for prefixed Base64-encoded tokenized messages and
// replaces each message that decodes successfully with its detokenized string.
// Characters outside of messages pass through to the output unchanged.
class NestedMessageDetokenizer {
 public:
  // The referenced detokenizer must outlive this object.
  NestedMessageDetokenizer(const Detokenizer& detokenizer)
      : detokenizer_(detokenizer) {}

  // Feeds a chunk of text through the character-at-a-time state machine.
  void Detokenize(std::string_view chunk) {
    for (char next_char : chunk) {
      Detokenize(next_char);
    }
  }

  // Returns true if any message decoded successfully since the last call.
  // Clears the flag as a side effect, so each change is reported only once.
  bool OutputChangedSinceLastCheck() {
    const bool changed = output_changed_;
    output_changed_ = false;
    return changed;
  }

  // Processes a single character, buffering potential messages and copying
  // all other characters straight to the output.
  void Detokenize(char next_char) {
    switch (state_) {
      case kNonMessage:
        // A prefix character starts a potential Base64 message.
        if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
          message_buffer_.push_back(next_char);
          state_ = kMessage;
        } else {
          output_.push_back(next_char);
        }
        break;
      case kMessage:
        if (base64::IsValidChar(next_char)) {
          message_buffer_.push_back(next_char);
        } else {
          // Any non-Base64 character terminates the current message.
          HandleEndOfMessage();
          if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
            // A prefix immediately starts a new message; stay in kMessage.
            message_buffer_.push_back(next_char);
          } else {
            output_.push_back(next_char);
            state_ = kNonMessage;
          }
        }
        break;
    }
  }

  // Finalizes any in-progress message and returns the accumulated output,
  // leaving this object ready for reuse.
  std::string Flush() {
    if (state_ == kMessage) {
      HandleEndOfMessage();
      state_ = kNonMessage;
    }
    std::string output(std::move(output_));
    output_.clear();  // Ensure a well-defined (empty) state after the move.
    return output;
  }

 private:
  // Attempts to detokenize the buffered message; on success appends the
  // decoded string, otherwise appends the original message text verbatim.
  void HandleEndOfMessage() {
    if (auto result = detokenizer_.DetokenizeBase64Message(message_buffer_);
        result.ok()) {
      output_ += result.BestString();
      output_changed_ = true;
    } else {
      output_ += message_buffer_;  // Keep the original if it doesn't decode.
    }
    message_buffer_.clear();
  }

  const Detokenizer& detokenizer_;
  std::string output_;          // Detokenized text accumulated so far.
  std::string message_buffer_;  // Candidate message, including its prefix.

  enum : uint8_t { kNonMessage, kMessage } state_ = kNonMessage;
  bool output_changed_ = false;  // Set when any message decodes successfully.
};
105
UnknownTokenMessage(uint32_t value)106 std::string UnknownTokenMessage(uint32_t value) {
107 std::string output(PW_TOKENIZER_ARG_DECODING_ERROR_PREFIX "unknown token ");
108
109 // Output a hexadecimal version of the token.
110 for (int shift = 28; shift >= 0; shift -= 4) {
111 output.push_back("0123456789abcdef"[(value >> shift) & 0xF]);
112 }
113
114 output.append(PW_TOKENIZER_ARG_DECODING_ERROR_SUFFIX);
115 return output;
116 }
117
// Pairs a decoding result with the entry's date_removed value so results can
// be sorted by collision-resolution preference.
119 using DecodingResult = std::pair<DecodedFormatString, uint32_t>;
120
121 // Determines if one result is better than the other if collisions occurred.
122 // Returns true if lhs is preferred over rhs. This logic should match the
123 // collision resolution logic in detokenize.py.
IsBetterResult(const DecodingResult & lhs,const DecodingResult & rhs)124 bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) {
125 // Favor the result for which decoding succeeded.
126 if (lhs.first.ok() != rhs.first.ok()) {
127 return lhs.first.ok();
128 }
129
130 // Favor the result for which all bytes were decoded.
131 if ((lhs.first.remaining_bytes() == 0u) !=
132 (rhs.first.remaining_bytes() == 0u)) {
133 return lhs.first.remaining_bytes() == 0u;
134 }
135
136 // Favor the result with fewer decoding errors.
137 if (lhs.first.decoding_errors() != rhs.first.decoding_errors()) {
138 return lhs.first.decoding_errors() < rhs.first.decoding_errors();
139 }
140
141 // Favor the result that successfully decoded the most arguments.
142 if (lhs.first.argument_count() != rhs.first.argument_count()) {
143 return lhs.first.argument_count() > rhs.first.argument_count();
144 }
145
146 // Favor the result that was removed from the database most recently.
147 return lhs.second > rhs.second;
148 }
149
// Returns true if all characters in data are printable or whitespace, or if
// the string is empty.
constexpr bool IsPrintableAscii(std::string_view data) {
  // This follows the logic in pw_tokenizer.decode_optionally_tokenized below:
  //
  //   if ''.join(text.split()).isprintable():
  //     return text
  //
  for (char letter : data) {
    // Cast to unsigned char before calling the <cctype> classification
    // functions: passing a negative value (possible when char is signed and
    // the byte is >= 0x80) is undefined behavior per the C standard.
    const auto c = static_cast<unsigned char>(letter);
    if (std::isprint(c) == 0 && std::isspace(c) == 0) {
      return false;
    }
  }
  return true;
}
165
166 } // namespace
167
DetokenizedString(uint32_t token,const span<const TokenizedStringEntry> & entries,const span<const std::byte> & arguments)168 DetokenizedString::DetokenizedString(
169 uint32_t token,
170 const span<const TokenizedStringEntry>& entries,
171 const span<const std::byte>& arguments)
172 : token_(token), has_token_(true) {
173 std::vector<DecodingResult> results;
174
175 for (const auto& [format, date_removed] : entries) {
176 results.push_back(DecodingResult{
177 format.Format(span(reinterpret_cast<const uint8_t*>(arguments.data()),
178 arguments.size())),
179 date_removed});
180 }
181
182 std::sort(results.begin(), results.end(), IsBetterResult);
183
184 for (auto& result : results) {
185 matches_.push_back(std::move(result.first));
186 }
187 }
188
BestString() const189 std::string DetokenizedString::BestString() const {
190 return matches_.empty() ? std::string() : matches_[0].value();
191 }
192
BestStringWithErrors() const193 std::string DetokenizedString::BestStringWithErrors() const {
194 if (matches_.empty()) {
195 return has_token_ ? UnknownTokenMessage(token_)
196 : PW_TOKENIZER_ARG_DECODING_ERROR("missing token");
197 }
198 return matches_[0].value_with_errors();
199 }
200
Detokenizer(const TokenDatabase & database)201 Detokenizer::Detokenizer(const TokenDatabase& database) {
202 for (const auto& entry : database) {
203 database_[entry.token].emplace_back(entry.string, entry.date_removed);
204 }
205 }
206
FromElfSection(span<const std::byte> elf_section)207 Result<Detokenizer> Detokenizer::FromElfSection(
208 span<const std::byte> elf_section) {
209 size_t index = 0;
210 std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database;
211
212 while (index + sizeof(_pw_tokenizer_EntryHeader) < elf_section.size()) {
213 _pw_tokenizer_EntryHeader header;
214 std::memcpy(
215 &header, elf_section.data() + index, sizeof(_pw_tokenizer_EntryHeader));
216 index += sizeof(_pw_tokenizer_EntryHeader);
217
218 if (header.magic != _PW_TOKENIZER_ENTRY_MAGIC) {
219 return Status::DataLoss();
220 }
221
222 index += header.domain_length;
223 if (index + header.string_length <= elf_section.size()) {
224 // TODO(b/326365218): Construct FormatString with string_view to avoid
225 // creating a copy here.
226 std::string entry(
227 reinterpret_cast<const char*>(elf_section.data() + index),
228 header.string_length);
229 index += header.string_length;
230 database[header.token].emplace_back(entry.c_str(),
231 TokenDatabase::kDateRemovedNever);
232 }
233 }
234 return Detokenizer(std::move(database));
235 }
236
Detokenize(const span<const std::byte> & encoded) const237 DetokenizedString Detokenizer::Detokenize(
238 const span<const std::byte>& encoded) const {
239 // The token is missing from the encoded data; there is nothing to do.
240 if (encoded.empty()) {
241 return DetokenizedString();
242 }
243
244 uint32_t token = bytes::ReadInOrder<uint32_t>(
245 endian::little, encoded.data(), encoded.size());
246
247 const auto result = database_.find(token);
248
249 return DetokenizedString(
250 token,
251 result == database_.end() ? span<TokenizedStringEntry>()
252 : span(result->second),
253 encoded.size() < sizeof(token) ? span<const std::byte>()
254 : encoded.subspan(sizeof(token)));
255 }
256
DetokenizeBase64Message(std::string_view text) const257 DetokenizedString Detokenizer::DetokenizeBase64Message(
258 std::string_view text) const {
259 std::string buffer(text);
260 buffer.resize(PrefixedBase64DecodeInPlace(buffer));
261 return Detokenize(buffer);
262 }
263
DetokenizeText(std::string_view text,const unsigned max_passes) const264 std::string Detokenizer::DetokenizeText(std::string_view text,
265 const unsigned max_passes) const {
266 NestedMessageDetokenizer detokenizer(*this);
267 detokenizer.Detokenize(text);
268
269 std::string result;
270 unsigned pass = 1;
271
272 while (true) {
273 result = detokenizer.Flush();
274 if (pass >= max_passes || !detokenizer.OutputChangedSinceLastCheck()) {
275 break;
276 }
277 detokenizer.Detokenize(result);
278 pass += 1;
279 }
280 return result;
281 }
282
DecodeOptionallyTokenizedData(const ConstByteSpan & optionally_tokenized_data)283 std::string Detokenizer::DecodeOptionallyTokenizedData(
284 const ConstByteSpan& optionally_tokenized_data) {
285 // Try detokenizing as binary using the best result if available, else use
286 // the input data as a string.
287 const auto result = Detokenize(optionally_tokenized_data);
288 const bool found_matches = !result.matches().empty();
289 // Note: unlike pw_tokenizer.proto.decode_optionally_tokenized, this decoding
290 // process does not encode and decode UTF8 format, it is sufficient to check
291 // if the data is printable ASCII.
292 const std::string data =
293 found_matches
294 ? result.BestString()
295 : std::string(
296 reinterpret_cast<const char*>(optionally_tokenized_data.data()),
297 optionally_tokenized_data.size());
298
299 const bool is_data_printable = IsPrintableAscii(data);
300 if (!found_matches && !is_data_printable) {
301 // Assume the token is unknown or the data is corrupt.
302 std::vector<char> base64_encoding_buffer(
303 Base64EncodedBufferSize(optionally_tokenized_data.size()));
304 const size_t encoded_length = PrefixedBase64Encode(
305 optionally_tokenized_data, span(base64_encoding_buffer));
306 return std::string{base64_encoding_buffer.data(), encoded_length};
307 }
308
309 // Successfully detokenized, check if the field has more prefixed
310 // base64-encoded tokens.
311 const std::string field = DetokenizeText(data);
312 // If anything detokenized successfully, use that.
313 if (field != data) {
314 return field;
315 }
316
317 // Attempt to determine whether this is an unknown token or plain text.
318 // Any string with only printable or whitespace characters is plain text.
319 if (found_matches || is_data_printable) {
320 return data;
321 }
322
323 // Assume this field is tokenized data that could not be decoded.
324 std::vector<char> base64_encoding_buffer(
325 Base64EncodedBufferSize(optionally_tokenized_data.size()));
326 const size_t encoded_length = PrefixedBase64Encode(
327 optionally_tokenized_data, span(base64_encoding_buffer));
328 return std::string{base64_encoding_buffer.data(), encoded_length};
329 }
330
331 } // namespace pw::tokenizer
332