• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2020 The Pigweed Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
4 // use this file except in compliance with the License. You may obtain a copy of
5 // the License at
6 //
7 //     https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12 // License for the specific language governing permissions and limitations under
13 // the License.
14 
15 #include "pw_tokenizer/detokenize.h"
16 
17 #include <algorithm>
18 #include <cctype>
19 #include <cstring>
20 #include <string_view>
21 #include <vector>
22 
23 #include "pw_bytes/bit.h"
24 #include "pw_bytes/endian.h"
25 #include "pw_result/result.h"
26 #include "pw_tokenizer/base64.h"
27 #include "pw_tokenizer/internal/decode.h"
28 #include "pw_tokenizer/nested_tokenization.h"
29 
30 namespace pw::tokenizer {
31 namespace {
32 
33 class NestedMessageDetokenizer {
34  public:
NestedMessageDetokenizer(const Detokenizer & detokenizer)35   NestedMessageDetokenizer(const Detokenizer& detokenizer)
36       : detokenizer_(detokenizer) {}
37 
Detokenize(std::string_view chunk)38   void Detokenize(std::string_view chunk) {
39     for (char next_char : chunk) {
40       Detokenize(next_char);
41     }
42   }
43 
OutputChangedSinceLastCheck()44   bool OutputChangedSinceLastCheck() {
45     const bool changed = output_changed_;
46     output_changed_ = false;
47     return changed;
48   }
49 
Detokenize(char next_char)50   void Detokenize(char next_char) {
51     switch (state_) {
52       case kNonMessage:
53         if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
54           message_buffer_.push_back(next_char);
55           state_ = kMessage;
56         } else {
57           output_.push_back(next_char);
58         }
59         break;
60       case kMessage:
61         if (base64::IsValidChar(next_char)) {
62           message_buffer_.push_back(next_char);
63         } else {
64           HandleEndOfMessage();
65           if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
66             message_buffer_.push_back(next_char);
67           } else {
68             output_.push_back(next_char);
69             state_ = kNonMessage;
70           }
71         }
72         break;
73     }
74   }
75 
Flush()76   std::string Flush() {
77     if (state_ == kMessage) {
78       HandleEndOfMessage();
79       state_ = kNonMessage;
80     }
81     std::string output(std::move(output_));
82     output_.clear();
83     return output;
84   }
85 
86  private:
HandleEndOfMessage()87   void HandleEndOfMessage() {
88     if (auto result = detokenizer_.DetokenizeBase64Message(message_buffer_);
89         result.ok()) {
90       output_ += result.BestString();
91       output_changed_ = true;
92     } else {
93       output_ += message_buffer_;  // Keep the original if it doesn't decode.
94     }
95     message_buffer_.clear();
96   }
97 
98   const Detokenizer& detokenizer_;
99   std::string output_;
100   std::string message_buffer_;
101 
102   enum : uint8_t { kNonMessage, kMessage } state_ = kNonMessage;
103   bool output_changed_ = false;
104 };
105 
UnknownTokenMessage(uint32_t value)106 std::string UnknownTokenMessage(uint32_t value) {
107   std::string output(PW_TOKENIZER_ARG_DECODING_ERROR_PREFIX "unknown token ");
108 
109   // Output a hexadecimal version of the token.
110   for (int shift = 28; shift >= 0; shift -= 4) {
111     output.push_back("0123456789abcdef"[(value >> shift) & 0xF]);
112   }
113 
114   output.append(PW_TOKENIZER_ARG_DECODING_ERROR_SUFFIX);
115   return output;
116 }
117 
118 // Decoding result with the date removed, for sorting.
119 using DecodingResult = std::pair<DecodedFormatString, uint32_t>;
120 
121 // Determines if one result is better than the other if collisions occurred.
122 // Returns true if lhs is preferred over rhs. This logic should match the
123 // collision resolution logic in detokenize.py.
IsBetterResult(const DecodingResult & lhs,const DecodingResult & rhs)124 bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) {
125   // Favor the result for which decoding succeeded.
126   if (lhs.first.ok() != rhs.first.ok()) {
127     return lhs.first.ok();
128   }
129 
130   // Favor the result for which all bytes were decoded.
131   if ((lhs.first.remaining_bytes() == 0u) !=
132       (rhs.first.remaining_bytes() == 0u)) {
133     return lhs.first.remaining_bytes() == 0u;
134   }
135 
136   // Favor the result with fewer decoding errors.
137   if (lhs.first.decoding_errors() != rhs.first.decoding_errors()) {
138     return lhs.first.decoding_errors() < rhs.first.decoding_errors();
139   }
140 
141   // Favor the result that successfully decoded the most arguments.
142   if (lhs.first.argument_count() != rhs.first.argument_count()) {
143     return lhs.first.argument_count() > rhs.first.argument_count();
144   }
145 
146   // Favor the result that was removed from the database most recently.
147   return lhs.second > rhs.second;
148 }
149 
// Returns true if all characters in data are printable, space, or if the
// string is empty.
constexpr bool IsPrintableAscii(std::string_view data) {
  // This follows the logic in pw_tokenizer.decode_optionally_tokenized below:
  //
  //   if ''.join(text.split()).isprintable():
  //     return text
  //
  for (char letter : data) {
    // Convert through unsigned char before calling the <cctype> functions:
    // passing a negative value (possible for bytes >= 0x80 when char is
    // signed) other than EOF is undefined behavior.
    const int ch = static_cast<unsigned char>(letter);
    if (std::isprint(ch) == 0 && std::isspace(ch) == 0) {
      return false;
    }
  }
  return true;
}
165 
166 }  // namespace
167 
DetokenizedString(uint32_t token,const span<const TokenizedStringEntry> & entries,const span<const std::byte> & arguments)168 DetokenizedString::DetokenizedString(
169     uint32_t token,
170     const span<const TokenizedStringEntry>& entries,
171     const span<const std::byte>& arguments)
172     : token_(token), has_token_(true) {
173   std::vector<DecodingResult> results;
174 
175   for (const auto& [format, date_removed] : entries) {
176     results.push_back(DecodingResult{
177         format.Format(span(reinterpret_cast<const uint8_t*>(arguments.data()),
178                            arguments.size())),
179         date_removed});
180   }
181 
182   std::sort(results.begin(), results.end(), IsBetterResult);
183 
184   for (auto& result : results) {
185     matches_.push_back(std::move(result.first));
186   }
187 }
188 
BestString() const189 std::string DetokenizedString::BestString() const {
190   return matches_.empty() ? std::string() : matches_[0].value();
191 }
192 
BestStringWithErrors() const193 std::string DetokenizedString::BestStringWithErrors() const {
194   if (matches_.empty()) {
195     return has_token_ ? UnknownTokenMessage(token_)
196                       : PW_TOKENIZER_ARG_DECODING_ERROR("missing token");
197   }
198   return matches_[0].value_with_errors();
199 }
200 
Detokenizer(const TokenDatabase & database)201 Detokenizer::Detokenizer(const TokenDatabase& database) {
202   for (const auto& entry : database) {
203     database_[entry.token].emplace_back(entry.string, entry.date_removed);
204   }
205 }
206 
FromElfSection(span<const std::byte> elf_section)207 Result<Detokenizer> Detokenizer::FromElfSection(
208     span<const std::byte> elf_section) {
209   size_t index = 0;
210   std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database;
211 
212   while (index + sizeof(_pw_tokenizer_EntryHeader) < elf_section.size()) {
213     _pw_tokenizer_EntryHeader header;
214     std::memcpy(
215         &header, elf_section.data() + index, sizeof(_pw_tokenizer_EntryHeader));
216     index += sizeof(_pw_tokenizer_EntryHeader);
217 
218     if (header.magic != _PW_TOKENIZER_ENTRY_MAGIC) {
219       return Status::DataLoss();
220     }
221 
222     index += header.domain_length;
223     if (index + header.string_length <= elf_section.size()) {
224       // TODO(b/326365218): Construct FormatString with string_view to avoid
225       // creating a copy here.
226       std::string entry(
227           reinterpret_cast<const char*>(elf_section.data() + index),
228           header.string_length);
229       index += header.string_length;
230       database[header.token].emplace_back(entry.c_str(),
231                                           TokenDatabase::kDateRemovedNever);
232     }
233   }
234   return Detokenizer(std::move(database));
235 }
236 
Detokenize(const span<const std::byte> & encoded) const237 DetokenizedString Detokenizer::Detokenize(
238     const span<const std::byte>& encoded) const {
239   // The token is missing from the encoded data; there is nothing to do.
240   if (encoded.empty()) {
241     return DetokenizedString();
242   }
243 
244   uint32_t token = bytes::ReadInOrder<uint32_t>(
245       endian::little, encoded.data(), encoded.size());
246 
247   const auto result = database_.find(token);
248 
249   return DetokenizedString(
250       token,
251       result == database_.end() ? span<TokenizedStringEntry>()
252                                 : span(result->second),
253       encoded.size() < sizeof(token) ? span<const std::byte>()
254                                      : encoded.subspan(sizeof(token)));
255 }
256 
DetokenizeBase64Message(std::string_view text) const257 DetokenizedString Detokenizer::DetokenizeBase64Message(
258     std::string_view text) const {
259   std::string buffer(text);
260   buffer.resize(PrefixedBase64DecodeInPlace(buffer));
261   return Detokenize(buffer);
262 }
263 
DetokenizeText(std::string_view text,const unsigned max_passes) const264 std::string Detokenizer::DetokenizeText(std::string_view text,
265                                         const unsigned max_passes) const {
266   NestedMessageDetokenizer detokenizer(*this);
267   detokenizer.Detokenize(text);
268 
269   std::string result;
270   unsigned pass = 1;
271 
272   while (true) {
273     result = detokenizer.Flush();
274     if (pass >= max_passes || !detokenizer.OutputChangedSinceLastCheck()) {
275       break;
276     }
277     detokenizer.Detokenize(result);
278     pass += 1;
279   }
280   return result;
281 }
282 
DecodeOptionallyTokenizedData(const ConstByteSpan & optionally_tokenized_data)283 std::string Detokenizer::DecodeOptionallyTokenizedData(
284     const ConstByteSpan& optionally_tokenized_data) {
285   // Try detokenizing as binary using the best result if available, else use
286   // the input data as a string.
287   const auto result = Detokenize(optionally_tokenized_data);
288   const bool found_matches = !result.matches().empty();
289   // Note: unlike pw_tokenizer.proto.decode_optionally_tokenized, this decoding
290   // process does not encode and decode UTF8 format, it is sufficient to check
291   // if the data is printable ASCII.
292   const std::string data =
293       found_matches
294           ? result.BestString()
295           : std::string(
296                 reinterpret_cast<const char*>(optionally_tokenized_data.data()),
297                 optionally_tokenized_data.size());
298 
299   const bool is_data_printable = IsPrintableAscii(data);
300   if (!found_matches && !is_data_printable) {
301     // Assume the token is unknown or the data is corrupt.
302     std::vector<char> base64_encoding_buffer(
303         Base64EncodedBufferSize(optionally_tokenized_data.size()));
304     const size_t encoded_length = PrefixedBase64Encode(
305         optionally_tokenized_data, span(base64_encoding_buffer));
306     return std::string{base64_encoding_buffer.data(), encoded_length};
307   }
308 
309   // Successfully detokenized, check if the field has more prefixed
310   // base64-encoded tokens.
311   const std::string field = DetokenizeText(data);
312   // If anything detokenized successfully, use that.
313   if (field != data) {
314     return field;
315   }
316 
317   // Attempt to determine whether this is an unknown token or plain text.
318   // Any string with only printable or whitespace characters is plain text.
319   if (found_matches || is_data_printable) {
320     return data;
321   }
322 
323   // Assume this field is tokenized data that could not be decoded.
324   std::vector<char> base64_encoding_buffer(
325       Base64EncodedBufferSize(optionally_tokenized_data.size()));
326   const size_t encoded_length = PrefixedBase64Encode(
327       optionally_tokenized_data, span(base64_encoding_buffer));
328   return std::string{base64_encoding_buffer.data(), encoded_length};
329 }
330 
331 }  // namespace pw::tokenizer
332