1 // Copyright 2020 The Pigweed Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
4 // use this file except in compliance with the License. You may obtain a copy of
5 // the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12 // License for the specific language governing permissions and limitations under
13 // the License.
14
15 #include "pw_tokenizer/detokenize.h"
16
#include <algorithm>
#include <cctype>
#include <charconv>
#include <cstring>
#include <optional>
#include <string_view>
#include <vector>

#include "pw_bytes/bit.h"
#include "pw_bytes/endian.h"
#include "pw_elf/reader.h"
#include "pw_log/log.h"
#include "pw_result/result.h"
#include "pw_status/try.h"
#include "pw_tokenizer/base64.h"
#include "pw_tokenizer/internal/decode.h"
#include "pw_tokenizer/nested_tokenization.h"
#include "pw_tokenizer/tokenize.h"
#include "pw_tokenizer_private/csv.h"
34
35 namespace pw::tokenizer {
36 namespace {
37
38 class NestedMessageDetokenizer {
39 public:
NestedMessageDetokenizer(const Detokenizer & detokenizer)40 NestedMessageDetokenizer(const Detokenizer& detokenizer)
41 : detokenizer_(detokenizer) {}
42
Detokenize(std::string_view chunk)43 void Detokenize(std::string_view chunk) {
44 for (char next_char : chunk) {
45 Detokenize(next_char);
46 }
47 }
48
OutputChangedSinceLastCheck()49 bool OutputChangedSinceLastCheck() {
50 const bool changed = output_changed_;
51 output_changed_ = false;
52 return changed;
53 }
54
Detokenize(char next_char)55 void Detokenize(char next_char) {
56 switch (state_) {
57 case kNonMessage:
58 if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
59 message_buffer_.push_back(next_char);
60 state_ = kMessage;
61 } else {
62 output_.push_back(next_char);
63 }
64 break;
65 case kMessage:
66 if (base64::IsValidChar(next_char)) {
67 message_buffer_.push_back(next_char);
68 } else {
69 HandleEndOfMessage();
70 if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
71 message_buffer_.push_back(next_char);
72 } else {
73 output_.push_back(next_char);
74 state_ = kNonMessage;
75 }
76 }
77 break;
78 }
79 }
80
Flush()81 std::string Flush() {
82 if (state_ == kMessage) {
83 HandleEndOfMessage();
84 state_ = kNonMessage;
85 }
86 std::string output(std::move(output_));
87 output_.clear();
88 return output;
89 }
90
91 private:
HandleEndOfMessage()92 void HandleEndOfMessage() {
93 if (auto result = detokenizer_.DetokenizeBase64Message(message_buffer_);
94 result.ok()) {
95 output_ += result.BestString();
96 output_changed_ = true;
97 } else {
98 output_ += message_buffer_; // Keep the original if it doesn't decode.
99 }
100 message_buffer_.clear();
101 }
102
103 const Detokenizer& detokenizer_;
104 std::string output_;
105 std::string message_buffer_;
106
107 enum : uint8_t { kNonMessage, kMessage } state_ = kNonMessage;
108 bool output_changed_ = false;
109 };
110
UnknownTokenMessage(uint32_t value)111 std::string UnknownTokenMessage(uint32_t value) {
112 std::string output(PW_TOKENIZER_ARG_DECODING_ERROR_PREFIX "unknown token ");
113
114 // Output a hexadecimal version of the token.
115 for (int shift = 28; shift >= 0; shift -= 4) {
116 output.push_back("0123456789abcdef"[(value >> shift) & 0xF]);
117 }
118
119 output.append(PW_TOKENIZER_ARG_DECODING_ERROR_SUFFIX);
120 return output;
121 }
122
123 // Decoding result with the date removed, for sorting.
124 using DecodingResult = std::pair<DecodedFormatString, uint32_t>;
125
126 // Determines if one result is better than the other if collisions occurred.
127 // Returns true if lhs is preferred over rhs. This logic should match the
128 // collision resolution logic in detokenize.py.
IsBetterResult(const DecodingResult & lhs,const DecodingResult & rhs)129 bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) {
130 // Favor the result for which decoding succeeded.
131 if (lhs.first.ok() != rhs.first.ok()) {
132 return lhs.first.ok();
133 }
134
135 // Favor the result for which all bytes were decoded.
136 if ((lhs.first.remaining_bytes() == 0u) !=
137 (rhs.first.remaining_bytes() == 0u)) {
138 return lhs.first.remaining_bytes() == 0u;
139 }
140
141 // Favor the result with fewer decoding errors.
142 if (lhs.first.decoding_errors() != rhs.first.decoding_errors()) {
143 return lhs.first.decoding_errors() < rhs.first.decoding_errors();
144 }
145
146 // Favor the result that successfully decoded the most arguments.
147 if (lhs.first.argument_count() != rhs.first.argument_count()) {
148 return lhs.first.argument_count() > rhs.first.argument_count();
149 }
150
151 // Favor the result that was removed from the database most recently.
152 return lhs.second > rhs.second;
153 }
154
// Returns true if all characters in data are printable or whitespace, or if
// the string is empty.
//
// This follows the logic in pw_tokenizer.decode_optionally_tokenized:
//
//   if ''.join(text.split()).isprintable():
//       return text
//
constexpr bool IsPrintableAscii(std::string_view data) {
  for (char letter : data) {
    // Cast to unsigned char before calling <cctype> functions: passing a
    // negative value (possible when char is signed and the byte is >= 0x80)
    // is undefined behavior per the C standard.
    const auto c = static_cast<unsigned char>(letter);
    if (std::isprint(c) == 0 && std::isspace(c) == 0) {
      return false;
    }
  }
  return true;
}
170
171 } // namespace
172
DetokenizedString(uint32_t token,const span<const TokenizedStringEntry> & entries,const span<const std::byte> & arguments)173 DetokenizedString::DetokenizedString(
174 uint32_t token,
175 const span<const TokenizedStringEntry>& entries,
176 const span<const std::byte>& arguments)
177 : token_(token), has_token_(true) {
178 std::vector<DecodingResult> results;
179
180 for (const auto& [format, date_removed] : entries) {
181 results.push_back(DecodingResult{
182 format.Format(span(reinterpret_cast<const uint8_t*>(arguments.data()),
183 arguments.size())),
184 date_removed});
185 }
186
187 std::sort(results.begin(), results.end(), IsBetterResult);
188
189 for (auto& result : results) {
190 matches_.push_back(std::move(result.first));
191 }
192 }
193
BestString() const194 std::string DetokenizedString::BestString() const {
195 return matches_.empty() ? std::string() : matches_[0].value();
196 }
197
BestStringWithErrors() const198 std::string DetokenizedString::BestStringWithErrors() const {
199 if (matches_.empty()) {
200 return has_token_ ? UnknownTokenMessage(token_)
201 : PW_TOKENIZER_ARG_DECODING_ERROR("missing token");
202 }
203 return matches_[0].value_with_errors();
204 }
205
// Builds the domain/token lookup map from a flat TokenDatabase, storing
// every entry under the default domain. Note: the kDefaultDomain map entry
// is only created when the database contains at least one entry; Detokenize()
// checks for its presence with find(), so do not hoist the subscript.
Detokenizer::Detokenizer(const TokenDatabase& database) {
  for (const auto& entry : database) {
    database_[kDefaultDomain][entry.token].emplace_back(entry.string,
                                                        entry.date_removed);
  }
}
212
// Builds a detokenization database from the raw contents of an ELF token
// section. The section is a sequence of entries, each consisting of a
// _pw_tokenizer_EntryHeader (magic, token, domain_length, string_length)
// followed immediately by the domain text and then the format string.
// Returns DataLoss if an entry's magic number does not match.
Result<Detokenizer> Detokenizer::FromElfSection(
    span<const std::byte> elf_section) {
  size_t index = 0;
  DomainTokenEntriesMap database;

  // Stop once there is not enough data left for another full header.
  while (index + sizeof(_pw_tokenizer_EntryHeader) < elf_section.size()) {
    // memcpy rather than reinterpret_cast: the section data may not be
    // suitably aligned for the header type.
    _pw_tokenizer_EntryHeader header;
    std::memcpy(
        &header, elf_section.data() + index, sizeof(_pw_tokenizer_EntryHeader));
    index += sizeof(_pw_tokenizer_EntryHeader);

    // A bad magic number means the data is corrupt or misaligned with the
    // expected entry layout; abort rather than guess.
    if (header.magic != _PW_TOKENIZER_ENTRY_MAGIC) {
      return Status::DataLoss();
    }

    // Only consume the entry if both strings fit in the remaining data;
    // a truncated final entry is silently skipped.
    if (index + header.domain_length + header.string_length <=
        elf_section.size()) {
      // The -1 drops a trailing character from each field; the lengths
      // appear to include a NUL terminator — TODO(review): confirm against
      // the entry layout in pw_tokenizer's tokenize internals.
      std::string domain(
          reinterpret_cast<const char*>(elf_section.data() + index),
          header.domain_length - 1);
      index += header.domain_length;
      // TODO(b/326365218): Construct FormatString with string_view to avoid
      // creating a copy here.
      std::string entry(
          reinterpret_cast<const char*>(elf_section.data() + index),
          header.string_length - 1);
      index += header.string_length;
      // ELF-sourced entries have no removal date.
      database[std::move(domain)][header.token].emplace_back(
          entry.c_str(), TokenDatabase::kDateRemovedNever);
    }
  }
  return Detokenizer(std::move(database));
}
246
FromElfFile(stream::SeekableReader & stream)247 Result<Detokenizer> Detokenizer::FromElfFile(stream::SeekableReader& stream) {
248 PW_TRY_ASSIGN(auto reader, pw::elf::ElfReader::FromStream(stream));
249
250 constexpr auto kTokenSectionName = ".pw_tokenizer.entries";
251 PW_TRY_ASSIGN(std::vector<std::byte> section_data,
252 reader.ReadSection(kTokenSectionName));
253
254 return Detokenizer::FromElfSection(section_data);
255 }
256
FromCsv(std::string_view csv)257 Result<Detokenizer> Detokenizer::FromCsv(std::string_view csv) {
258 std::vector<std::vector<std::string>> parsed_csv = ParseCsv(csv);
259 DomainTokenEntriesMap database;
260
261 // CSV databases are in the format -> token, date, domain, string.
262 int invalid_row_count = 0;
263 for (const auto& row : parsed_csv) {
264 if (row.size() != 4) {
265 invalid_row_count++;
266 continue;
267 }
268 // Ignore whitespace in the domain.
269 std::string domain = "";
270 for (char c : row[2]) {
271 if (!std::isspace(c)) {
272 domain += c;
273 }
274 }
275
276 const std::string& token = row[0];
277 const std::string& date_removed = row[1];
278
279 // Validate length of token.
280 if (token.empty()) {
281 PW_LOG_ERROR("Corrupt database due to missing token");
282 return Status::DataLoss();
283 }
284
285 // Validate token contents.
286 for (char c : token) {
287 if (!std::isxdigit(c)) {
288 PW_LOG_ERROR("Corrupt database due to token format");
289 return Status::DataLoss();
290 }
291 }
292
293 // Validate date contents.
294 uint32_t date = TokenDatabase::kDateRemovedNever;
295 if (!date_removed.empty() &&
296 date_removed.find_first_not_of(' ') != std::string::npos) {
297 size_t first_dash = date_removed.find('-');
298 if (first_dash == std::string::npos || first_dash != 4) {
299 PW_LOG_ERROR("Wrong date format in database");
300 return Status::DataLoss();
301 }
302
303 size_t second_dash = date_removed.find('-', first_dash + 1);
304 if (second_dash == std::string::npos || second_dash != 7) {
305 PW_LOG_ERROR("Wrong date format in database");
306 return Status::DataLoss();
307 }
308
309 size_t pos;
310 int year = std::stoi(date_removed.substr(0, first_dash), &pos);
311 if (pos != first_dash) {
312 PW_LOG_ERROR("Wrong date format in database");
313 return Status::DataLoss();
314 }
315
316 int month = std::stoi(
317 date_removed.substr(first_dash + 1, second_dash - first_dash - 1),
318 &pos);
319 if (pos != second_dash - first_dash - 1) {
320 PW_LOG_ERROR("Wrong date format in database");
321 return Status::DataLoss();
322 }
323
324 int day = std::stoi(date_removed.substr(second_dash + 1), &pos);
325 if (pos != date_removed.size() - second_dash - 1) {
326 PW_LOG_ERROR("Wrong date format in database");
327 return Status::DataLoss();
328 }
329
330 date = (year << 16) | (month << 8) | day;
331 }
332
333 // Add to database.
334 database[std::move(domain)][std::stoul(token, nullptr, 16)].emplace_back(
335 row[3].c_str(), date);
336 }
337
338 // Log warning if any data lines were skipped.
339 if (invalid_row_count > 0) {
340 PW_LOG_WARN(
341 "Skipped %d of %zu lines because they did not have 4 columns as "
342 "expected.",
343 invalid_row_count,
344 parsed_csv.size());
345 }
346
347 return Detokenizer(std::move(database));
348 }
349
Detokenize(const span<const std::byte> & encoded) const350 DetokenizedString Detokenizer::Detokenize(
351 const span<const std::byte>& encoded) const {
352 // The token is missing from the encoded data; there is nothing to do.
353 if (encoded.empty()) {
354 return DetokenizedString();
355 }
356
357 uint32_t token = bytes::ReadInOrder<uint32_t>(
358 endian::little, encoded.data(), encoded.size());
359
360 const auto domain_it = database_.find(kDefaultDomain);
361 if (domain_it == database_.end()) {
362 return DetokenizedString();
363 }
364
365 const auto result = domain_it->second.find(token);
366
367 return DetokenizedString(
368 token,
369 result == domain_it->second.end() ? span<TokenizedStringEntry>()
370 : span(result->second),
371 encoded.size() < sizeof(token) ? span<const std::byte>()
372 : encoded.subspan(sizeof(token)));
373 }
374
DetokenizeBase64Message(std::string_view text) const375 DetokenizedString Detokenizer::DetokenizeBase64Message(
376 std::string_view text) const {
377 std::string buffer(text);
378 buffer.resize(PrefixedBase64DecodeInPlace(buffer));
379 return Detokenize(buffer);
380 }
381
DetokenizeText(std::string_view text,const unsigned max_passes) const382 std::string Detokenizer::DetokenizeText(std::string_view text,
383 const unsigned max_passes) const {
384 NestedMessageDetokenizer detokenizer(*this);
385 detokenizer.Detokenize(text);
386
387 std::string result;
388 unsigned pass = 1;
389
390 while (true) {
391 result = detokenizer.Flush();
392 if (pass >= max_passes || !detokenizer.OutputChangedSinceLastCheck()) {
393 break;
394 }
395 detokenizer.Detokenize(result);
396 pass += 1;
397 }
398 return result;
399 }
400
DecodeOptionallyTokenizedData(const ConstByteSpan & optionally_tokenized_data)401 std::string Detokenizer::DecodeOptionallyTokenizedData(
402 const ConstByteSpan& optionally_tokenized_data) {
403 // Try detokenizing as binary using the best result if available, else use
404 // the input data as a string.
405 const auto result = Detokenize(optionally_tokenized_data);
406 const bool found_matches = !result.matches().empty();
407 // Note: unlike pw_tokenizer.proto.decode_optionally_tokenized, this decoding
408 // process does not encode and decode UTF8 format, it is sufficient to check
409 // if the data is printable ASCII.
410 const std::string data =
411 found_matches
412 ? result.BestString()
413 : std::string(
414 reinterpret_cast<const char*>(optionally_tokenized_data.data()),
415 optionally_tokenized_data.size());
416
417 const bool is_data_printable = IsPrintableAscii(data);
418 if (!found_matches && !is_data_printable) {
419 // Assume the token is unknown or the data is corrupt.
420 std::vector<char> base64_encoding_buffer(
421 Base64EncodedBufferSize(optionally_tokenized_data.size()));
422 const size_t encoded_length = PrefixedBase64Encode(
423 optionally_tokenized_data, span(base64_encoding_buffer));
424 return std::string{base64_encoding_buffer.data(), encoded_length};
425 }
426
427 // Successfully detokenized, check if the field has more prefixed
428 // base64-encoded tokens.
429 const std::string field = DetokenizeText(data);
430 // If anything detokenized successfully, use that.
431 if (field != data) {
432 return field;
433 }
434
435 // Attempt to determine whether this is an unknown token or plain text.
436 // Any string with only printable or whitespace characters is plain text.
437 if (found_matches || is_data_printable) {
438 return data;
439 }
440
441 // Assume this field is tokenized data that could not be decoded.
442 std::vector<char> base64_encoding_buffer(
443 Base64EncodedBufferSize(optionally_tokenized_data.size()));
444 const size_t encoded_length = PrefixedBase64Encode(
445 optionally_tokenized_data, span(base64_encoding_buffer));
446 return std::string{base64_encoding_buffer.data(), encoded_length};
447 }
448
449 } // namespace pw::tokenizer
450