/proc/self/cwd/pw_tokenizer/detokenize.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2020 The Pigweed Authors |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
4 | | // use this file except in compliance with the License. You may obtain a copy of |
5 | | // the License at |
6 | | // |
7 | | // https://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
11 | | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
12 | | // License for the specific language governing permissions and limitations under |
13 | | // the License. |
14 | | |
15 | | #include "pw_tokenizer/detokenize.h" |
16 | | |
17 | | #include <algorithm> |
18 | | #include <cctype> |
19 | | #include <cstring> |
20 | | #include <string_view> |
21 | | #include <vector> |
22 | | |
23 | | #include "pw_bytes/bit.h" |
24 | | #include "pw_bytes/endian.h" |
25 | | #include "pw_result/result.h" |
26 | | #include "pw_tokenizer/base64.h" |
27 | | #include "pw_tokenizer/internal/decode.h" |
28 | | #include "pw_tokenizer/nested_tokenization.h" |
29 | | |
30 | | namespace pw::tokenizer { |
31 | | namespace { |
32 | | |
33 | | class NestedMessageDetokenizer { |
34 | | public: |
35 | | NestedMessageDetokenizer(const Detokenizer& detokenizer) |
36 | 0 | : detokenizer_(detokenizer) {} |
37 | | |
38 | 0 | void Detokenize(std::string_view chunk) { |
39 | 0 | for (char next_char : chunk) { |
40 | 0 | Detokenize(next_char); |
41 | 0 | } |
42 | 0 | } |
43 | | |
44 | 0 | bool OutputChangedSinceLastCheck() { |
45 | 0 | const bool changed = output_changed_; |
46 | 0 | output_changed_ = false; |
47 | 0 | return changed; |
48 | 0 | } |
49 | | |
50 | 0 | void Detokenize(char next_char) { |
51 | 0 | switch (state_) { |
52 | 0 | case kNonMessage: |
53 | 0 | if (next_char == PW_TOKENIZER_NESTED_PREFIX) { |
54 | 0 | message_buffer_.push_back(next_char); |
55 | 0 | state_ = kMessage; |
56 | 0 | } else { |
57 | 0 | output_.push_back(next_char); |
58 | 0 | } |
59 | 0 | break; |
60 | 0 | case kMessage: |
61 | 0 | if (base64::IsValidChar(next_char)) { |
62 | 0 | message_buffer_.push_back(next_char); |
63 | 0 | } else { |
64 | 0 | HandleEndOfMessage(); |
65 | 0 | if (next_char == PW_TOKENIZER_NESTED_PREFIX) { |
66 | 0 | message_buffer_.push_back(next_char); |
67 | 0 | } else { |
68 | 0 | output_.push_back(next_char); |
69 | 0 | state_ = kNonMessage; |
70 | 0 | } |
71 | 0 | } |
72 | 0 | break; |
73 | 0 | } |
74 | 0 | } |
75 | | |
76 | 0 | std::string Flush() { |
77 | 0 | if (state_ == kMessage) { |
78 | 0 | HandleEndOfMessage(); |
79 | 0 | state_ = kNonMessage; |
80 | 0 | } |
81 | 0 | std::string output(std::move(output_)); |
82 | 0 | output_.clear(); |
83 | 0 | return output; |
84 | 0 | } |
85 | | |
86 | | private: |
87 | 0 | void HandleEndOfMessage() { |
88 | 0 | if (auto result = detokenizer_.DetokenizeBase64Message(message_buffer_); |
89 | 0 | result.ok()) { |
90 | 0 | output_ += result.BestString(); |
91 | 0 | output_changed_ = true; |
92 | 0 | } else { |
93 | 0 | output_ += message_buffer_; // Keep the original if it doesn't decode. |
94 | 0 | } |
95 | 0 | message_buffer_.clear(); |
96 | 0 | } |
97 | | |
98 | | const Detokenizer& detokenizer_; |
99 | | std::string output_; |
100 | | std::string message_buffer_; |
101 | | |
102 | | enum : uint8_t { kNonMessage, kMessage } state_ = kNonMessage; |
103 | | bool output_changed_ = false; |
104 | | }; |
105 | | |
106 | 0 | std::string UnknownTokenMessage(uint32_t value) { |
107 | 0 | std::string output(PW_TOKENIZER_ARG_DECODING_ERROR_PREFIX "unknown token "); |
108 | | |
109 | | // Output a hexadecimal version of the token. |
110 | 0 | for (int shift = 28; shift >= 0; shift -= 4) { |
111 | 0 | output.push_back("0123456789abcdef"[(value >> shift) & 0xF]); |
112 | 0 | } |
113 | |
|
114 | 0 | output.append(PW_TOKENIZER_ARG_DECODING_ERROR_SUFFIX); |
115 | 0 | return output; |
116 | 0 | } |
117 | | |
118 | | // Decoding result with the date removed, for sorting. |
119 | | using DecodingResult = std::pair<DecodedFormatString, uint32_t>; |
120 | | |
121 | | // Determines if one result is better than the other if collisions occurred. |
122 | | // Returns true if lhs is preferred over rhs. This logic should match the |
123 | | // collision resolution logic in detokenize.py. |
124 | 0 | bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) { |
125 | | // Favor the result for which decoding succeeded. |
126 | 0 | if (lhs.first.ok() != rhs.first.ok()) { |
127 | 0 | return lhs.first.ok(); |
128 | 0 | } |
129 | | |
130 | | // Favor the result for which all bytes were decoded. |
131 | 0 | if ((lhs.first.remaining_bytes() == 0u) != |
132 | 0 | (rhs.first.remaining_bytes() == 0u)) { |
133 | 0 | return lhs.first.remaining_bytes() == 0u; |
134 | 0 | } |
135 | | |
136 | | // Favor the result with fewer decoding errors. |
137 | 0 | if (lhs.first.decoding_errors() != rhs.first.decoding_errors()) { |
138 | 0 | return lhs.first.decoding_errors() < rhs.first.decoding_errors(); |
139 | 0 | } |
140 | | |
141 | | // Favor the result that successfully decoded the most arguments. |
142 | 0 | if (lhs.first.argument_count() != rhs.first.argument_count()) { |
143 | 0 | return lhs.first.argument_count() > rhs.first.argument_count(); |
144 | 0 | } |
145 | | |
146 | | // Favor the result that was removed from the database most recently. |
147 | 0 | return lhs.second > rhs.second; |
148 | 0 | } |
149 | | |
// Returns true if all characters in data are printable or whitespace, or if
// the string is empty.
constexpr bool IsPrintableAscii(std::string_view data) {
  // This follows the logic in pw_tokenizer.decode_optionally_tokenized below:
  //
  //   if ''.join(text.split()).isprintable():
  //     return text
  //
  for (char letter : data) {
    // Convert to unsigned char before calling the <cctype> classification
    // functions: passing a negative value (possible for bytes >= 0x80 when
    // char is signed) other than EOF is undefined behavior per the C standard.
    const unsigned char c = static_cast<unsigned char>(letter);
    if (std::isprint(c) == 0 && std::isspace(c) == 0) {
      return false;
    }
  }
  return true;
}
165 | | |
166 | | } // namespace |
167 | | |
168 | | DetokenizedString::DetokenizedString( |
169 | | uint32_t token, |
170 | | const span<const TokenizedStringEntry>& entries, |
171 | | const span<const std::byte>& arguments) |
172 | 13.9k | : token_(token), has_token_(true) { |
173 | 13.9k | std::vector<DecodingResult> results; |
174 | | |
175 | 13.9k | for (const auto& [format, date_removed] : entries) { |
176 | 1.87k | results.push_back(DecodingResult{ |
177 | 1.87k | format.Format(span(reinterpret_cast<const uint8_t*>(arguments.data()), |
178 | 1.87k | arguments.size())), |
179 | 1.87k | date_removed}); |
180 | 1.87k | } |
181 | | |
182 | 13.9k | std::sort(results.begin(), results.end(), IsBetterResult); |
183 | | |
184 | 13.9k | for (auto& result : results) { |
185 | 1.87k | matches_.push_back(std::move(result.first)); |
186 | 1.87k | } |
187 | 13.9k | } |
188 | | |
189 | 0 | std::string DetokenizedString::BestString() const { |
190 | 0 | return matches_.empty() ? std::string() : matches_[0].value(); |
191 | 0 | } |
192 | | |
193 | 0 | std::string DetokenizedString::BestStringWithErrors() const { |
194 | 0 | if (matches_.empty()) { |
195 | 0 | return has_token_ ? UnknownTokenMessage(token_) |
196 | 0 | : PW_TOKENIZER_ARG_DECODING_ERROR("missing token"); |
197 | 0 | } |
198 | 0 | return matches_[0].value_with_errors(); |
199 | 0 | } |
200 | | |
201 | 1 | Detokenizer::Detokenizer(const TokenDatabase& database) { |
202 | 4 | for (const auto& entry : database) { |
203 | 4 | database_[entry.token].emplace_back(entry.string, entry.date_removed); |
204 | 4 | } |
205 | 1 | } |
206 | | |
// Builds a Detokenizer by parsing the raw bytes of the ELF section that
// stores tokenized string entries. Each entry is a fixed-size header followed
// by a domain string and a format string, with lengths given in the header.
// Returns DataLoss if an entry's magic number does not match.
Result<Detokenizer> Detokenizer::FromElfSection(
    span<const std::byte> elf_section) {
  size_t index = 0;
  std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database;

  while (index + sizeof(_pw_tokenizer_EntryHeader) < elf_section.size()) {
    _pw_tokenizer_EntryHeader header;
    // memcpy instead of a cast: the section data may not be suitably aligned.
    std::memcpy(
        &header, elf_section.data() + index, sizeof(_pw_tokenizer_EntryHeader));
    index += sizeof(_pw_tokenizer_EntryHeader);

    // A mismatched magic number means the section is corrupt or misaligned.
    if (header.magic != _PW_TOKENIZER_ENTRY_MAGIC) {
      return Status::DataLoss();
    }

    // The domain string is skipped; only the format string is stored.
    index += header.domain_length;
    if (index + header.string_length <= elf_section.size()) {
      // TODO(b/326365218): Construct FormatString with string_view to avoid
      // creating a copy here.
      std::string entry(
          reinterpret_cast<const char*>(elf_section.data() + index),
          header.string_length);
      index += header.string_length;
      database[header.token].emplace_back(entry.c_str(),
                                          TokenDatabase::kDateRemovedNever);
    }
    // NOTE(review): if the string does not fit, the entry is silently dropped
    // and index is left past the domain only — presumably the loop condition
    // then terminates on a truncated section; confirm this is intended.
  }
  return Detokenizer(std::move(database));
}
236 | | |
237 | | DetokenizedString Detokenizer::Detokenize( |
238 | 16.7k | const span<const std::byte>& encoded) const { |
239 | | // The token is missing from the encoded data; there is nothing to do. |
240 | 16.7k | if (encoded.empty()) { |
241 | 2.79k | return DetokenizedString(); |
242 | 2.79k | } |
243 | | |
244 | 13.9k | uint32_t token = bytes::ReadInOrder<uint32_t>( |
245 | 13.9k | endian::little, encoded.data(), encoded.size()); |
246 | | |
247 | 13.9k | const auto result = database_.find(token); |
248 | | |
249 | 13.9k | return DetokenizedString( |
250 | 13.9k | token, |
251 | 13.9k | result == database_.end() ? span<TokenizedStringEntry>() |
252 | 13.9k | : span(result->second), |
253 | 13.9k | encoded.size() < sizeof(token) ? span<const std::byte>() |
254 | 13.9k | : encoded.subspan(sizeof(token))); |
255 | 16.7k | } |
256 | | |
257 | | DetokenizedString Detokenizer::DetokenizeBase64Message( |
258 | 0 | std::string_view text) const { |
259 | 0 | std::string buffer(text); |
260 | 0 | buffer.resize(PrefixedBase64DecodeInPlace(buffer)); |
261 | 0 | return Detokenize(buffer); |
262 | 0 | } |
263 | | |
264 | | std::string Detokenizer::DetokenizeText(std::string_view text, |
265 | 0 | const unsigned max_passes) const { |
266 | 0 | NestedMessageDetokenizer detokenizer(*this); |
267 | 0 | detokenizer.Detokenize(text); |
268 | |
|
269 | 0 | std::string result; |
270 | 0 | unsigned pass = 1; |
271 | |
|
272 | 0 | while (true) { |
273 | 0 | result = detokenizer.Flush(); |
274 | 0 | if (pass >= max_passes || !detokenizer.OutputChangedSinceLastCheck()) { |
275 | 0 | break; |
276 | 0 | } |
277 | 0 | detokenizer.Detokenize(result); |
278 | 0 | pass += 1; |
279 | 0 | } |
280 | 0 | return result; |
281 | 0 | } |
282 | | |
283 | | std::string Detokenizer::DecodeOptionallyTokenizedData( |
284 | 0 | const ConstByteSpan& optionally_tokenized_data) { |
285 | | // Try detokenizing as binary using the best result if available, else use |
286 | | // the input data as a string. |
287 | 0 | const auto result = Detokenize(optionally_tokenized_data); |
288 | 0 | const bool found_matches = !result.matches().empty(); |
289 | | // Note: unlike pw_tokenizer.proto.decode_optionally_tokenized, this decoding |
290 | | // process does not encode and decode UTF8 format, it is sufficient to check |
291 | | // if the data is printable ASCII. |
292 | 0 | const std::string data = |
293 | 0 | found_matches |
294 | 0 | ? result.BestString() |
295 | 0 | : std::string( |
296 | 0 | reinterpret_cast<const char*>(optionally_tokenized_data.data()), |
297 | 0 | optionally_tokenized_data.size()); |
298 | |
|
299 | 0 | const bool is_data_printable = IsPrintableAscii(data); |
300 | 0 | if (!found_matches && !is_data_printable) { |
301 | | // Assume the token is unknown or the data is corrupt. |
302 | 0 | std::vector<char> base64_encoding_buffer( |
303 | 0 | Base64EncodedBufferSize(optionally_tokenized_data.size())); |
304 | 0 | const size_t encoded_length = PrefixedBase64Encode( |
305 | 0 | optionally_tokenized_data, span(base64_encoding_buffer)); |
306 | 0 | return std::string{base64_encoding_buffer.data(), encoded_length}; |
307 | 0 | } |
308 | | |
309 | | // Successfully detokenized, check if the field has more prefixed |
310 | | // base64-encoded tokens. |
311 | 0 | const std::string field = DetokenizeText(data); |
312 | | // If anything detokenized successfully, use that. |
313 | 0 | if (field != data) { |
314 | 0 | return field; |
315 | 0 | } |
316 | | |
317 | | // Attempt to determine whether this is an unknown token or plain text. |
318 | | // Any string with only printable or whitespace characters is plain text. |
319 | 0 | if (found_matches || is_data_printable) { |
320 | 0 | return data; |
321 | 0 | } |
322 | | |
323 | | // Assume this field is tokenized data that could not be decoded. |
324 | 0 | std::vector<char> base64_encoding_buffer( |
325 | 0 | Base64EncodedBufferSize(optionally_tokenized_data.size())); |
326 | 0 | const size_t encoded_length = PrefixedBase64Encode( |
327 | 0 | optionally_tokenized_data, span(base64_encoding_buffer)); |
328 | 0 | return std::string{base64_encoding_buffer.data(), encoded_length}; |
329 | 0 | } |
330 | | |
331 | | } // namespace pw::tokenizer |