/proc/self/cwd/pw_tokenizer/detokenize.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2025 The Pigweed Authors |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
4 | | // use this file except in compliance with the License. You may obtain a copy of |
5 | | // the License at |
6 | | // |
7 | | // https://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
11 | | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
12 | | // License for the specific language governing permissions and limitations under |
13 | | // the License. |
14 | | |
15 | | #include "pw_tokenizer/detokenize.h" |
16 | | |
17 | | #include <algorithm> |
18 | | #include <cctype> |
19 | | #include <charconv> |
20 | | #include <cstring> |
21 | | #include <string_view> |
22 | | #include <utility> |
23 | | #include <vector> |
24 | | |
25 | | #include "pw_base64/base64.h" |
26 | | #include "pw_bytes/bit.h" |
27 | | #include "pw_bytes/endian.h" |
28 | | #include "pw_elf/reader.h" |
29 | | #include "pw_log/log.h" |
30 | | #include "pw_result/result.h" |
31 | | #include "pw_status/try.h" |
32 | | #include "pw_tokenizer/base64.h" |
33 | | #include "pw_tokenizer/internal/decode.h" |
34 | | #include "pw_tokenizer/nested_tokenization.h" |
35 | | #include "pw_tokenizer/tokenize.h" |
36 | | #include "pw_tokenizer_private/csv.h" |
37 | | |
38 | | namespace pw::tokenizer { |
39 | | namespace { |
40 | | |
// Returns true if `ch` is a decimal digit ('0' through '9').
constexpr bool IsValidBase10(char ch) { return ch >= '0' && ch <= '9'; }
43 | | |
// Returns true if `ch` is a hexadecimal digit (either letter case).
constexpr bool IsValidBase16(char ch) {
  if (ch >= '0' && ch <= '9') {
    return true;
  }
  return ('a' <= ch && ch <= 'f') || ('A' <= ch && ch <= 'F');
}
49 | | |
// Incremental parser that scans a character stream for nested tokenized
// messages -- a '$' prefix, an optional "{domain}", an optional radix
// specifier ("10#", "16#", "64#", or bare "#"), and the token data -- and
// replaces each complete, recognized message in the output with its
// detokenized text. Malformed or unrecognized messages pass through
// unchanged.
class NestedMessageDetokenizer {
 public:
  NestedMessageDetokenizer(const Detokenizer& detokenizer)
      : detokenizer_(detokenizer),
        message_start_(0),
        domain_size_(0),
        data_start_(0) {}

  // Feeds a chunk of text through the state machine one character at a time.
  void Detokenize(std::string_view chunk) {
    for (char next_char : chunk) {
      Detokenize(next_char);
    }
  }

  // Returns whether any message was replaced since the last call, and clears
  // the flag.
  bool OutputChangedSinceLastCheck() {
    return std::exchange(output_changed_, false);
  }

  // Processes a single character. Every input character is appended to
  // output_; when a message completes, its characters are replaced in place
  // by the detokenized string.
  void Detokenize(char next_char) {
    if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
      // A new '$' always begins a fresh message, so finish (decode or
      // abandon) any message currently in progress first.
      HandleEndOfMessage();

      message_start_ = output_.size();
      state_ = kMessageStart;
      output_.push_back(next_char);
      return;
    }

    output_.push_back(next_char);
    switch (state_) {
      case kPassthrough:  // Not inside a message; nothing to track.
        break;
      case kMessageStart:
        if (next_char == '{') {
          state_ = kDomain;  // "${" starts an optional domain specifier.
        } else {
          HandleRadixOrBase64Data(next_char);
        }
        break;
      case kDomain:
        if (next_char == '}') {
          state_ = kRadixOrData;  // Domain complete; radix or data follows.
        } else if (internal::ValidDomainChar(next_char)) {
          domain_size_ += 1;
        } else {
          ResetMessage();  // Invalid domain character; abandon the message.
        }
        break;
      case kRadixOrData:
        HandleRadixOrBase64Data(next_char);
        break;
      case kRadix10Or16:
        // After "$1", a '0' or '6' may complete a "10" or "16" radix.
        if (next_char == '0' || next_char == '6') {
          state_ = kRadixEnd;
        } else {
          // Not a radix specifier; the "1" was actually Base64 data.
          state_ = kData64;
          HandleBase64Char(next_char);
        }
        break;
      case kRadix64:
        // After "$6", a '4' may complete a "64" radix.
        if (next_char == '4') {
          state_ = kRadixEnd;
        } else {
          // Not a radix specifier; the "6" was actually Base64 data.
          state_ = kData64;
          HandleBase64Char(next_char);
        }
        break;
      case kRadixEnd:
        if (next_char == '#') {
          // Check if the radix was 10, 16, or 64. The '#' was already
          // pushed, so the radix's final digit is two characters back.
          const char digit = output_[output_.size() - 2];
          state_ = digit == '0' ? kData10 : digit == '6' ? kData16 : kData64;
          data_start_ = output_.size();
        } else {
          // No '#' terminator, so "10"/"16"/"64" was really Base64 data.
          state_ = kData64;
          HandleBase64Char(next_char);
        }
        break;
      case kData10:
        HandleBase10Char(next_char);
        break;
      case kData16:
        HandleBase16Char(next_char);
        break;
      case kData64:
        HandleBase64Char(next_char);
        break;
      case kData64Padding:
        // One '=' padding character was seen; only a second '=' may follow.
        if (next_char == '=') {
          HandleEndOfMessageValidBase64();
        } else {
          ResetMessage();
        }
        break;
    }
  }

  // Finishes any in-progress message and returns all accumulated output,
  // leaving this object empty and ready for more input.
  std::string Flush() {
    HandleEndOfMessage();
    std::string output(std::move(output_));
    output_.clear();
    return output;
  }

 private:
  std::string_view domain() const {
    // The domain starts 2 characters after the message start ("${domain}").
    return std::string_view(output_.data() + message_start_ + 2, domain_size_);
  }

  // Handles the first character after "$" or "${domain}": either '#'
  // (base 16 with no radix digits), the start of a radix specifier, or
  // Base64 data.
  void HandleRadixOrBase64Data(char next_char) {
    if (next_char == '#') {
      state_ = kData16;              // $# or ${}# means base 16
      data_start_ = output_.size();  // data starts after the #
      return;
    }

    // If this is Base64 data, it includes this character.
    data_start_ = output_.size() - 1;
    if (next_char == '1') {
      state_ = kRadix10Or16;
    } else if (next_char == '6') {
      state_ = kRadix64;
    } else if (base64::IsValidChar(next_char)) {
      state_ = kData64;
    } else {
      ResetMessage();
    }
  }

  // Accumulates decimal token characters; decodes once 10 digits arrive.
  void HandleBase10Char(char next_char) {
    if (!IsValidBase10(next_char)) {
      ResetMessage();
      return;
    }

    // Base10 data must be 10 chars long.
    const size_t block_size = (output_.size() - data_start_);
    if (block_size == 10) {
      HandleEndOfMessageValidBase10OrBase16(10);
    }
  }

  // Accumulates hexadecimal token characters; decodes once 8 digits arrive.
  void HandleBase16Char(char next_char) {
    if (!IsValidBase16(next_char)) {
      ResetMessage();
      return;
    }

    // Base16 data must be 8 chars long.
    const size_t block_size = (output_.size() - data_start_);
    if (block_size == 8) {
      HandleEndOfMessageValidBase10OrBase16(16);
    }
  }

  // The first few characters after $ could be either a radix specification or
  // Base64 data (e.g. $16dAw5== versus $16#00000001).
  void HandleBase64Char(char next_char) {
    if (base64::IsValidChar(next_char)) {
      return;
    }

    // Base64 data must be in 4 char blocks, ending with padding if needed.
    const size_t block_size = (output_.size() - data_start_) % 4;
    if (block_size == 1) {
      // Got invalid character after a 4-byte block. Pop that char and decode.
      output_.pop_back();
      HandleEndOfMessageValidBase64();
      output_.push_back(next_char);
    } else if (block_size == 2 || next_char != '=') {
      // Invalid character not on a 4-char block boundary. Could try decoding at
      // the block boundary instead of resetting.
      ResetMessage();
    } else if (block_size == 3) {  // Found padding '=' character, need 1 more.
      state_ = kData64Padding;
    } else {  // The '=' was the final character of the block.
      HandleEndOfMessageValidBase64();
    }
  }

  // Called when the current message cannot continue (new '$' or Flush()).
  // Decodes the message if it is complete and valid, else abandons it.
  void HandleEndOfMessage() {
    if (state_ < kData10) {
      // It's not possible to have a complete token outside of the kData
      // states, even for the shortest possible messages ($10==).
      ResetMessage();
      return;
    }

    if (state_ >= kData64) {
      // Base64 data must come in 4-byte blocks.
      if ((output_.size() - data_start_) % 4 == 0) {
        HandleEndOfMessageValidBase64();
      } else {
        ResetMessage();
      }
      return;
    }

    if (state_ == kData10) {
      if (output_.size() - data_start_ == 10) {
        HandleEndOfMessageValidBase10OrBase16(10);
      }
    } else if (state_ == kData16) {
      if (output_.size() - data_start_ == 8) {
        HandleEndOfMessageValidBase10OrBase16(16);
      }
    }
    ResetMessage();
  }

  // Parses the accumulated digits as a token value and tries to replace the
  // message with the matching database entry.
  void HandleEndOfMessageValidBase10OrBase16(int base) {
    char* data_start = output_.data() + data_start_;
    char* data_end = output_.data() + output_.size();

    uint32_t token = 0;

    auto [_, ec] = std::from_chars(data_start, data_end, token, base);

    if (ec == std::errc()) {
      DetokenizeOnce(token);
    } else {
      ResetMessage();
    }
  }

  // Decodes the accumulated Base64 payload and attempts detokenization.
  void HandleEndOfMessageValidBase64() {
    std::string_view data(output_.data() + data_start_,
                          output_.size() - data_start_);
    std::vector<std::byte> bytes(base64::DecodedSize(data));
    base64::Decode(data, bytes.data());
    DetokenizeOnceBase64(bytes);
  }

  // Replaces the message text with the detokenized string, but only when
  // exactly one database entry matches (no misses, no collisions).
  void DetokenizeOnce(uint32_t token) {
    if (auto result = detokenizer_.DatabaseLookup(token, domain());
        result.size() == 1) {
      // Numeric tokens carry no encoded arguments, so format with an empty
      // argument span.
      std::string replacement =
          result.front().first.Format(span<const uint8_t>()).value();
      output_.replace(message_start_, output_.size(), replacement);
      output_changed_ = true;
    }
    ResetMessage();
  }

  // Replaces the message text with the best decoding of the Base64 payload,
  // if detokenization succeeds.
  void DetokenizeOnceBase64(span<const std::byte> bytes) {
    if (auto result = detokenizer_.Detokenize(bytes, domain()); result.ok()) {
      output_.replace(message_start_, output_.size(), result.BestString());
      output_changed_ = true;
    }
    ResetMessage();
  }

  // Returns to the passthrough state; output_ is left untouched.
  void ResetMessage() {
    message_start_ = 0;
    domain_size_ = 0;
    data_start_ = 0;
    state_ = kPassthrough;
  }

  const Detokenizer& detokenizer_;
  std::string output_;    // Accumulated (partially detokenized) output text.
  size_t message_start_;  // Index of the message prefix ($)
  size_t domain_size_;
  size_t data_start_;  // Index of the token data

  // Parser state. The enumerators are ordered so range comparisons (e.g.
  // state_ < kData10, state_ >= kData64) separate pre-data states from
  // data-accumulating states.
  enum : uint8_t {
    kPassthrough,  // not parsing a nested message
    kMessageStart,
    kDomain,
    kRadixOrData,
    kRadix10Or16,
    kRadix64,
    kRadixEnd,
    kData10,
    kData16,
    kData64,
    kData64Padding,
  } state_ = kPassthrough;
  bool output_changed_ = false;
};
331 | | |
332 | 0 | std::string UnknownTokenMessage(uint32_t value) { |
333 | 0 | std::string output(PW_TOKENIZER_ARG_DECODING_ERROR_PREFIX "unknown token "); |
334 | | |
335 | | // Output a hexadecimal version of the token. |
336 | 0 | for (int shift = 28; shift >= 0; shift -= 4) { |
337 | 0 | output.push_back("0123456789abcdef"[(value >> shift) & 0xF]); |
338 | 0 | } |
339 | |
|
340 | 0 | output.append(PW_TOKENIZER_ARG_DECODING_ERROR_SUFFIX); |
341 | 0 | return output; |
342 | 0 | } |
343 | | |
344 | | // Decoding result with the date removed, for sorting. |
345 | | using DecodingResult = std::pair<DecodedFormatString, uint32_t>; |
346 | | |
347 | | // Determines if one result is better than the other if collisions occurred. |
348 | | // Returns true if lhs is preferred over rhs. This logic should match the |
349 | | // collision resolution logic in detokenize.py. |
350 | 0 | bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) { |
351 | | // Favor the result for which decoding succeeded. |
352 | 0 | if (lhs.first.ok() != rhs.first.ok()) { |
353 | 0 | return lhs.first.ok(); |
354 | 0 | } |
355 | | |
356 | | // Favor the result for which all bytes were decoded. |
357 | 0 | if ((lhs.first.remaining_bytes() == 0u) != |
358 | 0 | (rhs.first.remaining_bytes() == 0u)) { |
359 | 0 | return lhs.first.remaining_bytes() == 0u; |
360 | 0 | } |
361 | | |
362 | | // Favor the result with fewer decoding errors. |
363 | 0 | if (lhs.first.decoding_errors() != rhs.first.decoding_errors()) { |
364 | 0 | return lhs.first.decoding_errors() < rhs.first.decoding_errors(); |
365 | 0 | } |
366 | | |
367 | | // Favor the result that successfully decoded the most arguments. |
368 | 0 | if (lhs.first.argument_count() != rhs.first.argument_count()) { |
369 | 0 | return lhs.first.argument_count() > rhs.first.argument_count(); |
370 | 0 | } |
371 | | |
372 | | // Favor the result that was removed from the database most recently. |
373 | 0 | return lhs.second > rhs.second; |
374 | 0 | } |
375 | | |
// Returns true if all characters in data are printable, space, or if the
// string is empty.
//
// This follows the logic in pw_tokenizer.decode_optionally_tokenized below:
//
//   if ''.join(text.split()).isprintable():
//     return text
//
constexpr bool IsPrintableAscii(std::string_view data) {
  for (char letter : data) {
    // Cast to unsigned char before calling <cctype> functions: passing a
    // value that is negative and not EOF (any byte >= 0x80 on platforms
    // where char is signed) is undefined behavior. Such bytes now safely
    // classify as non-printable.
    const int ch = static_cast<unsigned char>(letter);
    if (std::isprint(ch) == 0 && std::isspace(ch) == 0) {
      return false;
    }
  }
  return true;
}
391 | | |
392 | | void AddEntryIfUnique(std::vector<TokenizedStringEntry>& entries, |
393 | 0 | std::string_view new_entry) { |
394 | | // TODO(b/326365218): Construct FormatString with string_view to avoid |
395 | | // creating a copy here. |
396 | 0 | FormatString format_string(std::string(new_entry).c_str()); |
397 | 0 | for (const TokenizedStringEntry& entry : entries) { |
398 | 0 | if (format_string == entry.first) { |
399 | 0 | return; // An identical string is already present |
400 | 0 | } |
401 | 0 | } |
402 | | |
403 | 0 | entries.emplace_back(std::move(format_string), |
404 | 0 | TokenDatabase::kDateRemovedNever); |
405 | 0 | } |
406 | | |
407 | | } // namespace |
408 | | |
409 | | DetokenizedString::DetokenizedString( |
410 | | const Detokenizer& detokenizer, |
411 | | bool recursion, |
412 | | uint32_t token, |
413 | | const span<const TokenizedStringEntry>& entries, |
414 | | const span<const std::byte>& arguments) |
415 | 17.7k | : token_(token), has_token_(true) { |
416 | 17.7k | std::vector<DecodingResult> results; |
417 | | |
418 | 17.7k | for (const auto& [format, date_removed] : entries) { |
419 | 2.35k | results.push_back(DecodingResult{ |
420 | 2.35k | format.Format(span(reinterpret_cast<const uint8_t*>(arguments.data()), |
421 | 2.35k | arguments.size())), |
422 | 2.35k | date_removed}); |
423 | 2.35k | } |
424 | | |
425 | 17.7k | std::sort(results.begin(), results.end(), IsBetterResult); |
426 | 17.7k | for (auto& result : results) { |
427 | 2.35k | matches_.push_back(std::move(result.first)); |
428 | 2.35k | } |
429 | | |
430 | 17.7k | if (recursion && !matches_.empty()) { |
431 | 0 | best_string_ = detokenizer.DetokenizeText(matches_[0].value()); |
432 | 17.7k | } else if (!matches_.empty()) { |
433 | 2.35k | best_string_ = matches_[0].value(); |
434 | 15.4k | } else { |
435 | 15.4k | best_string_ = std::string(); |
436 | 15.4k | } |
437 | 17.7k | } |
438 | | |
439 | 0 | std::string DetokenizedString::BestStringWithErrors() const { |
440 | 0 | if (matches_.empty()) { |
441 | 0 | return has_token_ ? UnknownTokenMessage(token_) |
442 | 0 | : PW_TOKENIZER_ARG_DECODING_ERROR("missing token"); |
443 | 0 | } |
444 | 0 | return matches_[0].value_with_errors(); |
445 | 0 | } |
446 | | |
447 | 1 | Detokenizer::Detokenizer(const TokenDatabase& database) { |
448 | 4 | for (const auto& entry : database) { |
449 | 4 | database_[kDefaultDomain][entry.token].emplace_back(entry.string, |
450 | 4 | entry.date_removed); |
451 | 4 | } |
452 | 1 | } |
453 | | |
// Builds a detokenization database from the raw contents of an ELF
// .pw_tokenizer.entries section: a sequence of entry headers, each followed
// by a null-terminated domain string and a null-terminated format string.
Result<Detokenizer> Detokenizer::FromElfSection(
    span<const std::byte> elf_section) {
  size_t index = 0;
  DomainTokenEntriesMap database;

  // NOTE(review): the `<` bound means a header ending exactly at the section
  // end is not read; confirm that is intentional (such an entry could not
  // carry any strings anyway).
  while (index + sizeof(_pw_tokenizer_EntryHeader) < elf_section.size()) {
    // Copy the header out with memcpy since the section bytes may not be
    // suitably aligned for direct access.
    _pw_tokenizer_EntryHeader header;
    std::memcpy(
        &header, elf_section.data() + index, sizeof(_pw_tokenizer_EntryHeader));
    index += sizeof(_pw_tokenizer_EntryHeader);

    // A wrong magic number means the section is corrupt or misaligned.
    if (header.magic != _PW_TOKENIZER_ENTRY_MAGIC) {
      return Status::DataLoss();
    }

    // Only add the entry if both strings fit inside the section; a truncated
    // final entry is silently skipped.
    // NOTE(review): assumes domain_length and string_length are >= 1 (the
    // lengths include the trailing '\0'); a zero length would underflow in
    // the `- 1` below — confirm the entry writer guarantees this.
    if (index + header.domain_length + header.string_length <=
        elf_section.size()) {
      // Lengths include the null terminator, so subtract 1 for the text.
      std::string domain(
          reinterpret_cast<const char*>(elf_section.data() + index),
          header.domain_length - 1);
      index += header.domain_length;

      std::string_view entry(
          reinterpret_cast<const char*>(elf_section.data() + index),
          header.string_length - 1);
      index += header.string_length;

      AddEntryIfUnique(database[std::move(domain)][header.token], entry);
    }
  }
  return Detokenizer(std::move(database));
}
486 | | |
// Creates a Detokenizer by extracting the .pw_tokenizer.entries section from
// an ELF file read from the given stream. Propagates any read/parse failure
// via PW_TRY_ASSIGN.
Result<Detokenizer> Detokenizer::FromElfFile(stream::SeekableReader& stream) {
  PW_TRY_ASSIGN(auto reader, pw::elf::ElfReader::FromStream(stream));

  constexpr auto kTokenSectionName = ".pw_tokenizer.entries";
  PW_TRY_ASSIGN(std::vector<std::byte> section_data,
                reader.ReadSection(kTokenSectionName));

  return Detokenizer::FromElfSection(section_data);
}
496 | | |
497 | 0 | Result<Detokenizer> Detokenizer::FromCsv(std::string_view csv) { |
498 | 0 | std::vector<std::vector<std::string>> parsed_csv = ParseCsv(csv); |
499 | 0 | DomainTokenEntriesMap database; |
500 | | |
501 | | // CSV databases are in the format -> token, date, domain, string. |
502 | 0 | int invalid_row_count = 0; |
503 | 0 | for (const auto& row : parsed_csv) { |
504 | 0 | if (row.size() != 4) { |
505 | 0 | invalid_row_count++; |
506 | 0 | continue; |
507 | 0 | } |
508 | | // Ignore whitespace in the domain. |
509 | 0 | std::string domain = ""; |
510 | 0 | for (char c : row[2]) { |
511 | 0 | if (!std::isspace(c)) { |
512 | 0 | domain += c; |
513 | 0 | } |
514 | 0 | } |
515 | |
|
516 | 0 | const std::string& token = row[0]; |
517 | 0 | const std::string& date_removed = row[1]; |
518 | | |
519 | | // Validate length of token. |
520 | 0 | if (token.empty()) { |
521 | 0 | PW_LOG_ERROR("Corrupt database due to missing token"); |
522 | 0 | return Status::DataLoss(); |
523 | 0 | } |
524 | | |
525 | | // Validate token contents. |
526 | 0 | for (char c : token) { |
527 | 0 | if (!std::isxdigit(c)) { |
528 | 0 | PW_LOG_ERROR("Corrupt database due to token format"); |
529 | 0 | return Status::DataLoss(); |
530 | 0 | } |
531 | 0 | } |
532 | | |
533 | | // Validate date contents. |
534 | 0 | uint32_t date = TokenDatabase::kDateRemovedNever; |
535 | 0 | if (!date_removed.empty() && |
536 | 0 | date_removed.find_first_not_of(' ') != std::string::npos) { |
537 | 0 | size_t first_dash = date_removed.find('-'); |
538 | 0 | if (first_dash == std::string::npos || first_dash != 4) { |
539 | 0 | PW_LOG_ERROR("Wrong date format in database"); |
540 | 0 | return Status::DataLoss(); |
541 | 0 | } |
542 | | |
543 | 0 | size_t second_dash = date_removed.find('-', first_dash + 1); |
544 | 0 | if (second_dash == std::string::npos || second_dash != 7) { |
545 | 0 | PW_LOG_ERROR("Wrong date format in database"); |
546 | 0 | return Status::DataLoss(); |
547 | 0 | } |
548 | | |
549 | 0 | size_t pos; |
550 | 0 | int year = std::stoi(date_removed.substr(0, first_dash), &pos); |
551 | 0 | if (pos != first_dash) { |
552 | 0 | PW_LOG_ERROR("Wrong date format in database"); |
553 | 0 | return Status::DataLoss(); |
554 | 0 | } |
555 | | |
556 | 0 | int month = std::stoi( |
557 | 0 | date_removed.substr(first_dash + 1, second_dash - first_dash - 1), |
558 | 0 | &pos); |
559 | 0 | if (pos != second_dash - first_dash - 1) { |
560 | 0 | PW_LOG_ERROR("Wrong date format in database"); |
561 | 0 | return Status::DataLoss(); |
562 | 0 | } |
563 | | |
564 | 0 | int day = std::stoi(date_removed.substr(second_dash + 1), &pos); |
565 | 0 | if (pos != date_removed.size() - second_dash - 1) { |
566 | 0 | PW_LOG_ERROR("Wrong date format in database"); |
567 | 0 | return Status::DataLoss(); |
568 | 0 | } |
569 | | |
570 | 0 | date = static_cast<uint32_t>(year << 16) | |
571 | 0 | static_cast<uint32_t>(month << 8) | static_cast<uint32_t>(day); |
572 | 0 | } |
573 | | |
574 | | // Add to database. |
575 | 0 | database[std::move(domain)] |
576 | 0 | [static_cast<uint32_t>(std::stoul(token, nullptr, 16))] |
577 | 0 | .emplace_back(row[3].c_str(), date); |
578 | 0 | } |
579 | | |
580 | | // Log warning if any data lines were skipped. |
581 | 0 | if (invalid_row_count > 0) { |
582 | 0 | PW_LOG_WARN( |
583 | 0 | "Skipped %d of %zu lines because they did not have 4 columns as " |
584 | 0 | "expected.", |
585 | 0 | invalid_row_count, |
586 | 0 | parsed_csv.size()); |
587 | 0 | } |
588 | |
|
589 | 0 | return Detokenizer(std::move(database)); |
590 | 0 | } |
591 | | |
592 | | DetokenizedString Detokenizer::Detokenize(const span<const std::byte>& encoded, |
593 | | std::string_view domain, |
594 | 20.6k | bool recursion) const { |
595 | | // The token is missing from the encoded data; there is nothing to do. |
596 | 20.6k | if (encoded.empty()) { |
597 | 2.88k | return DetokenizedString(); |
598 | 2.88k | } |
599 | | |
600 | 17.7k | uint32_t token = bytes::ReadInOrder<uint32_t>( |
601 | 17.7k | endian::little, encoded.data(), encoded.size()); |
602 | | |
603 | 17.7k | const auto result = DatabaseLookup(token, domain); |
604 | | |
605 | 17.7k | return DetokenizedString(*this, |
606 | 17.7k | recursion, |
607 | 17.7k | token, |
608 | 17.7k | result, |
609 | 17.7k | encoded.size() < sizeof(token) |
610 | 17.7k | ? span<const std::byte>() |
611 | 17.7k | : encoded.subspan(sizeof(token))); |
612 | 20.6k | } |
613 | | |
614 | | DetokenizedString Detokenizer::DetokenizeBase64Message( |
615 | 0 | std::string_view text) const { |
616 | 0 | std::string buffer(text); |
617 | 0 | buffer.resize(PrefixedBase64DecodeInPlace(buffer)); |
618 | 0 | return Detokenize(buffer); |
619 | 0 | } |
620 | | |
621 | | span<const TokenizedStringEntry> Detokenizer::DatabaseLookup( |
622 | 17.7k | uint32_t token, std::string_view domain) const { |
623 | 17.7k | std::string canonical_domain; |
624 | 17.7k | for (char ch : domain) { |
625 | 0 | if (!std::isspace(ch)) { |
626 | 0 | canonical_domain.push_back(ch); |
627 | 0 | } |
628 | 0 | } |
629 | | |
630 | 17.7k | auto domain_it = database_.find(canonical_domain); |
631 | 17.7k | if (domain_it == database_.end()) { |
632 | 0 | return span<TokenizedStringEntry>(); |
633 | 0 | } |
634 | 17.7k | auto token_it = domain_it->second.find(token); |
635 | 17.7k | if (token_it == domain_it->second.end()) { |
636 | 15.4k | return span<TokenizedStringEntry>(); |
637 | 15.4k | } |
638 | | |
639 | 2.35k | return span(token_it->second); |
640 | 17.7k | } |
641 | | |
642 | | std::string Detokenizer::DetokenizeTextRecursive(std::string_view text, |
643 | 0 | unsigned max_passes) const { |
644 | 0 | NestedMessageDetokenizer detokenizer(*this); |
645 | 0 | detokenizer.Detokenize(text); |
646 | |
|
647 | 0 | std::string result; |
648 | 0 | unsigned pass = 1; |
649 | |
|
650 | 0 | while (true) { |
651 | 0 | result = detokenizer.Flush(); |
652 | 0 | if (pass >= max_passes || !detokenizer.OutputChangedSinceLastCheck()) { |
653 | 0 | break; |
654 | 0 | } |
655 | 0 | detokenizer.Detokenize(result); |
656 | 0 | pass += 1; |
657 | 0 | } |
658 | 0 | return result; |
659 | 0 | } |
660 | | |
661 | | std::string Detokenizer::DecodeOptionallyTokenizedData( |
662 | 0 | const ConstByteSpan& optionally_tokenized_data) { |
663 | | // Try detokenizing as binary using the best result if available, else use |
664 | | // the input data as a string. |
665 | 0 | const auto result = Detokenize(optionally_tokenized_data); |
666 | 0 | const bool found_matches = !result.matches().empty(); |
667 | | // Note: unlike pw_tokenizer.proto.decode_optionally_tokenized, this decoding |
668 | | // process does not encode and decode UTF8 format, it is sufficient to check |
669 | | // if the data is printable ASCII. |
670 | 0 | const std::string data = |
671 | 0 | found_matches |
672 | 0 | ? result.BestString() |
673 | 0 | : std::string( |
674 | 0 | reinterpret_cast<const char*>(optionally_tokenized_data.data()), |
675 | 0 | optionally_tokenized_data.size()); |
676 | |
|
677 | 0 | const bool is_data_printable = IsPrintableAscii(data); |
678 | 0 | if (!found_matches && !is_data_printable) { |
679 | | // Assume the token is unknown or the data is corrupt. |
680 | 0 | std::vector<char> base64_encoding_buffer( |
681 | 0 | Base64EncodedBufferSize(optionally_tokenized_data.size())); |
682 | 0 | const size_t encoded_length = PrefixedBase64Encode( |
683 | 0 | optionally_tokenized_data, span(base64_encoding_buffer)); |
684 | 0 | return std::string{base64_encoding_buffer.data(), encoded_length}; |
685 | 0 | } |
686 | | |
687 | | // Successfully detokenized, check if the field has more prefixed |
688 | | // base64-encoded tokens. |
689 | 0 | const std::string field = DetokenizeText(data); |
690 | | // If anything detokenized successfully, use that. |
691 | 0 | if (field != data) { |
692 | 0 | return field; |
693 | 0 | } |
694 | | |
695 | | // Attempt to determine whether this is an unknown token or plain text. |
696 | | // Any string with only printable or whitespace characters is plain text. |
697 | 0 | if (found_matches || is_data_printable) { |
698 | 0 | return data; |
699 | 0 | } |
700 | | |
701 | | // Assume this field is tokenized data that could not be decoded. |
702 | 0 | std::vector<char> base64_encoding_buffer( |
703 | 0 | Base64EncodedBufferSize(optionally_tokenized_data.size())); |
704 | 0 | const size_t encoded_length = PrefixedBase64Encode( |
705 | 0 | optionally_tokenized_data, span(base64_encoding_buffer)); |
706 | 0 | return std::string{base64_encoding_buffer.data(), encoded_length}; |
707 | 0 | } |
708 | | |
709 | | } // namespace pw::tokenizer |