• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2020 The Pigweed Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
4 // use this file except in compliance with the License. You may obtain a copy of
5 // the License at
6 //
7 //     https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12 // License for the specific language governing permissions and limitations under
13 // the License.
14 
15 #include "pw_tokenizer/detokenize.h"
16 
17 #include <algorithm>
18 #include <cctype>
19 #include <cstring>
20 #include <string_view>
21 #include <vector>
22 
23 #include "pw_bytes/bit.h"
24 #include "pw_bytes/endian.h"
25 #include "pw_elf/reader.h"
26 #include "pw_log/log.h"
27 #include "pw_result/result.h"
28 #include "pw_status/try.h"
29 #include "pw_tokenizer/base64.h"
30 #include "pw_tokenizer/internal/decode.h"
31 #include "pw_tokenizer/nested_tokenization.h"
32 #include "pw_tokenizer/tokenize.h"
33 #include "pw_tokenizer_private/csv.h"
34 
35 namespace pw::tokenizer {
36 namespace {
37 
38 class NestedMessageDetokenizer {
39  public:
NestedMessageDetokenizer(const Detokenizer & detokenizer)40   NestedMessageDetokenizer(const Detokenizer& detokenizer)
41       : detokenizer_(detokenizer) {}
42 
Detokenize(std::string_view chunk)43   void Detokenize(std::string_view chunk) {
44     for (char next_char : chunk) {
45       Detokenize(next_char);
46     }
47   }
48 
OutputChangedSinceLastCheck()49   bool OutputChangedSinceLastCheck() {
50     const bool changed = output_changed_;
51     output_changed_ = false;
52     return changed;
53   }
54 
Detokenize(char next_char)55   void Detokenize(char next_char) {
56     switch (state_) {
57       case kNonMessage:
58         if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
59           message_buffer_.push_back(next_char);
60           state_ = kMessage;
61         } else {
62           output_.push_back(next_char);
63         }
64         break;
65       case kMessage:
66         if (base64::IsValidChar(next_char)) {
67           message_buffer_.push_back(next_char);
68         } else {
69           HandleEndOfMessage();
70           if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
71             message_buffer_.push_back(next_char);
72           } else {
73             output_.push_back(next_char);
74             state_ = kNonMessage;
75           }
76         }
77         break;
78     }
79   }
80 
Flush()81   std::string Flush() {
82     if (state_ == kMessage) {
83       HandleEndOfMessage();
84       state_ = kNonMessage;
85     }
86     std::string output(std::move(output_));
87     output_.clear();
88     return output;
89   }
90 
91  private:
HandleEndOfMessage()92   void HandleEndOfMessage() {
93     if (auto result = detokenizer_.DetokenizeBase64Message(message_buffer_);
94         result.ok()) {
95       output_ += result.BestString();
96       output_changed_ = true;
97     } else {
98       output_ += message_buffer_;  // Keep the original if it doesn't decode.
99     }
100     message_buffer_.clear();
101   }
102 
103   const Detokenizer& detokenizer_;
104   std::string output_;
105   std::string message_buffer_;
106 
107   enum : uint8_t { kNonMessage, kMessage } state_ = kNonMessage;
108   bool output_changed_ = false;
109 };
110 
UnknownTokenMessage(uint32_t value)111 std::string UnknownTokenMessage(uint32_t value) {
112   std::string output(PW_TOKENIZER_ARG_DECODING_ERROR_PREFIX "unknown token ");
113 
114   // Output a hexadecimal version of the token.
115   for (int shift = 28; shift >= 0; shift -= 4) {
116     output.push_back("0123456789abcdef"[(value >> shift) & 0xF]);
117   }
118 
119   output.append(PW_TOKENIZER_ARG_DECODING_ERROR_SUFFIX);
120   return output;
121 }
122 
123 // Decoding result with the date removed, for sorting.
124 using DecodingResult = std::pair<DecodedFormatString, uint32_t>;
125 
126 // Determines if one result is better than the other if collisions occurred.
127 // Returns true if lhs is preferred over rhs. This logic should match the
128 // collision resolution logic in detokenize.py.
IsBetterResult(const DecodingResult & lhs,const DecodingResult & rhs)129 bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) {
130   // Favor the result for which decoding succeeded.
131   if (lhs.first.ok() != rhs.first.ok()) {
132     return lhs.first.ok();
133   }
134 
135   // Favor the result for which all bytes were decoded.
136   if ((lhs.first.remaining_bytes() == 0u) !=
137       (rhs.first.remaining_bytes() == 0u)) {
138     return lhs.first.remaining_bytes() == 0u;
139   }
140 
141   // Favor the result with fewer decoding errors.
142   if (lhs.first.decoding_errors() != rhs.first.decoding_errors()) {
143     return lhs.first.decoding_errors() < rhs.first.decoding_errors();
144   }
145 
146   // Favor the result that successfully decoded the most arguments.
147   if (lhs.first.argument_count() != rhs.first.argument_count()) {
148     return lhs.first.argument_count() > rhs.first.argument_count();
149   }
150 
151   // Favor the result that was removed from the database most recently.
152   return lhs.second > rhs.second;
153 }
154 
// Returns true if all characters in data are printable, space, or if the
// string is empty.
constexpr bool IsPrintableAscii(std::string_view data) {
  // This follows the logic in pw_tokenizer.decode_optionally_tokenized below:
  //
  //   if ''.join(text.split()).isprintable():
  //     return text
  //
  for (char letter : data) {
    // Cast through unsigned char before calling the <cctype> classifiers:
    // passing a negative value (possible when char is signed and the byte is
    // >= 0x80) to std::isprint/std::isspace is undefined behavior.
    const int ch = static_cast<unsigned char>(letter);
    if (std::isprint(ch) == 0 && std::isspace(ch) == 0) {
      return false;
    }
  }
  return true;
}
170 
171 }  // namespace
172 
DetokenizedString(uint32_t token,const span<const TokenizedStringEntry> & entries,const span<const std::byte> & arguments)173 DetokenizedString::DetokenizedString(
174     uint32_t token,
175     const span<const TokenizedStringEntry>& entries,
176     const span<const std::byte>& arguments)
177     : token_(token), has_token_(true) {
178   std::vector<DecodingResult> results;
179 
180   for (const auto& [format, date_removed] : entries) {
181     results.push_back(DecodingResult{
182         format.Format(span(reinterpret_cast<const uint8_t*>(arguments.data()),
183                            arguments.size())),
184         date_removed});
185   }
186 
187   std::sort(results.begin(), results.end(), IsBetterResult);
188 
189   for (auto& result : results) {
190     matches_.push_back(std::move(result.first));
191   }
192 }
193 
BestString() const194 std::string DetokenizedString::BestString() const {
195   return matches_.empty() ? std::string() : matches_[0].value();
196 }
197 
BestStringWithErrors() const198 std::string DetokenizedString::BestStringWithErrors() const {
199   if (matches_.empty()) {
200     return has_token_ ? UnknownTokenMessage(token_)
201                       : PW_TOKENIZER_ARG_DECODING_ERROR("missing token");
202   }
203   return matches_[0].value_with_errors();
204 }
205 
// Copies every entry from a TokenDatabase into this detokenizer's default
// domain, preserving each entry's removal date for collision resolution.
Detokenizer::Detokenizer(const TokenDatabase& database) {
  for (const auto& entry : database) {
    database_[kDefaultDomain][entry.token].emplace_back(entry.string,
                                                        entry.date_removed);
  }
}
212 
FromElfSection(span<const std::byte> elf_section)213 Result<Detokenizer> Detokenizer::FromElfSection(
214     span<const std::byte> elf_section) {
215   size_t index = 0;
216   DomainTokenEntriesMap database;
217 
218   while (index + sizeof(_pw_tokenizer_EntryHeader) < elf_section.size()) {
219     _pw_tokenizer_EntryHeader header;
220     std::memcpy(
221         &header, elf_section.data() + index, sizeof(_pw_tokenizer_EntryHeader));
222     index += sizeof(_pw_tokenizer_EntryHeader);
223 
224     if (header.magic != _PW_TOKENIZER_ENTRY_MAGIC) {
225       return Status::DataLoss();
226     }
227 
228     if (index + header.domain_length + header.string_length <=
229         elf_section.size()) {
230       std::string domain(
231           reinterpret_cast<const char*>(elf_section.data() + index),
232           header.domain_length - 1);
233       index += header.domain_length;
234       // TODO(b/326365218): Construct FormatString with string_view to avoid
235       // creating a copy here.
236       std::string entry(
237           reinterpret_cast<const char*>(elf_section.data() + index),
238           header.string_length - 1);
239       index += header.string_length;
240       database[std::move(domain)][header.token].emplace_back(
241           entry.c_str(), TokenDatabase::kDateRemovedNever);
242     }
243   }
244   return Detokenizer(std::move(database));
245 }
246 
FromElfFile(stream::SeekableReader & stream)247 Result<Detokenizer> Detokenizer::FromElfFile(stream::SeekableReader& stream) {
248   PW_TRY_ASSIGN(auto reader, pw::elf::ElfReader::FromStream(stream));
249 
250   constexpr auto kTokenSectionName = ".pw_tokenizer.entries";
251   PW_TRY_ASSIGN(std::vector<std::byte> section_data,
252                 reader.ReadSection(kTokenSectionName));
253 
254   return Detokenizer::FromElfSection(section_data);
255 }
256 
FromCsv(std::string_view csv)257 Result<Detokenizer> Detokenizer::FromCsv(std::string_view csv) {
258   std::vector<std::vector<std::string>> parsed_csv = ParseCsv(csv);
259   DomainTokenEntriesMap database;
260 
261   // CSV databases are in the format -> token, date, domain, string.
262   int invalid_row_count = 0;
263   for (const auto& row : parsed_csv) {
264     if (row.size() != 4) {
265       invalid_row_count++;
266       continue;
267     }
268     // Ignore whitespace in the domain.
269     std::string domain = "";
270     for (char c : row[2]) {
271       if (!std::isspace(c)) {
272         domain += c;
273       }
274     }
275 
276     const std::string& token = row[0];
277     const std::string& date_removed = row[1];
278 
279     // Validate length of token.
280     if (token.empty()) {
281       PW_LOG_ERROR("Corrupt database due to missing token");
282       return Status::DataLoss();
283     }
284 
285     // Validate token contents.
286     for (char c : token) {
287       if (!std::isxdigit(c)) {
288         PW_LOG_ERROR("Corrupt database due to token format");
289         return Status::DataLoss();
290       }
291     }
292 
293     // Validate date contents.
294     uint32_t date = TokenDatabase::kDateRemovedNever;
295     if (!date_removed.empty() &&
296         date_removed.find_first_not_of(' ') != std::string::npos) {
297       size_t first_dash = date_removed.find('-');
298       if (first_dash == std::string::npos || first_dash != 4) {
299         PW_LOG_ERROR("Wrong date format in database");
300         return Status::DataLoss();
301       }
302 
303       size_t second_dash = date_removed.find('-', first_dash + 1);
304       if (second_dash == std::string::npos || second_dash != 7) {
305         PW_LOG_ERROR("Wrong date format in database");
306         return Status::DataLoss();
307       }
308 
309       size_t pos;
310       int year = std::stoi(date_removed.substr(0, first_dash), &pos);
311       if (pos != first_dash) {
312         PW_LOG_ERROR("Wrong date format in database");
313         return Status::DataLoss();
314       }
315 
316       int month = std::stoi(
317           date_removed.substr(first_dash + 1, second_dash - first_dash - 1),
318           &pos);
319       if (pos != second_dash - first_dash - 1) {
320         PW_LOG_ERROR("Wrong date format in database");
321         return Status::DataLoss();
322       }
323 
324       int day = std::stoi(date_removed.substr(second_dash + 1), &pos);
325       if (pos != date_removed.size() - second_dash - 1) {
326         PW_LOG_ERROR("Wrong date format in database");
327         return Status::DataLoss();
328       }
329 
330       date = (year << 16) | (month << 8) | day;
331     }
332 
333     // Add to database.
334     database[std::move(domain)][std::stoul(token, nullptr, 16)].emplace_back(
335         row[3].c_str(), date);
336   }
337 
338   // Log warning if any data lines were skipped.
339   if (invalid_row_count > 0) {
340     PW_LOG_WARN(
341         "Skipped %d of %zu lines because they did not have 4 columns as "
342         "expected.",
343         invalid_row_count,
344         parsed_csv.size());
345   }
346 
347   return Detokenizer(std::move(database));
348 }
349 
Detokenize(const span<const std::byte> & encoded) const350 DetokenizedString Detokenizer::Detokenize(
351     const span<const std::byte>& encoded) const {
352   // The token is missing from the encoded data; there is nothing to do.
353   if (encoded.empty()) {
354     return DetokenizedString();
355   }
356 
357   uint32_t token = bytes::ReadInOrder<uint32_t>(
358       endian::little, encoded.data(), encoded.size());
359 
360   const auto domain_it = database_.find(kDefaultDomain);
361   if (domain_it == database_.end()) {
362     return DetokenizedString();
363   }
364 
365   const auto result = domain_it->second.find(token);
366 
367   return DetokenizedString(
368       token,
369       result == domain_it->second.end() ? span<TokenizedStringEntry>()
370                                         : span(result->second),
371       encoded.size() < sizeof(token) ? span<const std::byte>()
372                                      : encoded.subspan(sizeof(token)));
373 }
374 
DetokenizeBase64Message(std::string_view text) const375 DetokenizedString Detokenizer::DetokenizeBase64Message(
376     std::string_view text) const {
377   std::string buffer(text);
378   buffer.resize(PrefixedBase64DecodeInPlace(buffer));
379   return Detokenize(buffer);
380 }
381 
DetokenizeText(std::string_view text,const unsigned max_passes) const382 std::string Detokenizer::DetokenizeText(std::string_view text,
383                                         const unsigned max_passes) const {
384   NestedMessageDetokenizer detokenizer(*this);
385   detokenizer.Detokenize(text);
386 
387   std::string result;
388   unsigned pass = 1;
389 
390   while (true) {
391     result = detokenizer.Flush();
392     if (pass >= max_passes || !detokenizer.OutputChangedSinceLastCheck()) {
393       break;
394     }
395     detokenizer.Detokenize(result);
396     pass += 1;
397   }
398   return result;
399 }
400 
DecodeOptionallyTokenizedData(const ConstByteSpan & optionally_tokenized_data)401 std::string Detokenizer::DecodeOptionallyTokenizedData(
402     const ConstByteSpan& optionally_tokenized_data) {
403   // Try detokenizing as binary using the best result if available, else use
404   // the input data as a string.
405   const auto result = Detokenize(optionally_tokenized_data);
406   const bool found_matches = !result.matches().empty();
407   // Note: unlike pw_tokenizer.proto.decode_optionally_tokenized, this decoding
408   // process does not encode and decode UTF8 format, it is sufficient to check
409   // if the data is printable ASCII.
410   const std::string data =
411       found_matches
412           ? result.BestString()
413           : std::string(
414                 reinterpret_cast<const char*>(optionally_tokenized_data.data()),
415                 optionally_tokenized_data.size());
416 
417   const bool is_data_printable = IsPrintableAscii(data);
418   if (!found_matches && !is_data_printable) {
419     // Assume the token is unknown or the data is corrupt.
420     std::vector<char> base64_encoding_buffer(
421         Base64EncodedBufferSize(optionally_tokenized_data.size()));
422     const size_t encoded_length = PrefixedBase64Encode(
423         optionally_tokenized_data, span(base64_encoding_buffer));
424     return std::string{base64_encoding_buffer.data(), encoded_length};
425   }
426 
427   // Successfully detokenized, check if the field has more prefixed
428   // base64-encoded tokens.
429   const std::string field = DetokenizeText(data);
430   // If anything detokenized successfully, use that.
431   if (field != data) {
432     return field;
433   }
434 
435   // Attempt to determine whether this is an unknown token or plain text.
436   // Any string with only printable or whitespace characters is plain text.
437   if (found_matches || is_data_printable) {
438     return data;
439   }
440 
441   // Assume this field is tokenized data that could not be decoded.
442   std::vector<char> base64_encoding_buffer(
443       Base64EncodedBufferSize(optionally_tokenized_data.size()));
444   const size_t encoded_length = PrefixedBase64Encode(
445       optionally_tokenized_data, span(base64_encoding_buffer));
446   return std::string{base64_encoding_buffer.data(), encoded_length};
447 }
448 
449 }  // namespace pw::tokenizer
450