/proc/self/cwd/pw_tokenizer/detokenize.cc
Line | Count | Source |
1 | | // Copyright 2025 The Pigweed Authors |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
4 | | // use this file except in compliance with the License. You may obtain a copy of |
5 | | // the License at |
6 | | // |
7 | | // https://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
11 | | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
12 | | // License for the specific language governing permissions and limitations under |
13 | | // the License. |
14 | | |
15 | | #include "pw_tokenizer/detokenize.h" |
16 | | |
17 | | #include <algorithm> |
18 | | #include <cctype> |
19 | | #include <charconv> |
20 | | #include <cstring> |
21 | | #include <string_view> |
22 | | #include <utility> |
23 | | #include <vector> |
24 | | |
25 | | #include "pw_base64/base64.h" |
26 | | #include "pw_bytes/bit.h" |
27 | | #include "pw_bytes/endian.h" |
28 | | #include "pw_elf/reader.h" |
29 | | #include "pw_log/log.h" |
30 | | #include "pw_preprocessor/compiler.h" |
31 | | #include "pw_result/result.h" |
32 | | #include "pw_status/try.h" |
33 | | #include "pw_tokenizer/base64.h" |
34 | | #include "pw_tokenizer/internal/decode.h" |
35 | | #include "pw_tokenizer/nested_tokenization.h" |
36 | | #include "pw_tokenizer/tokenize.h" |
37 | | #include "pw_tokenizer_private/csv.h" |
38 | | |
39 | | namespace pw::tokenizer { |
40 | | namespace { |
41 | | |
// Returns true for an ASCII decimal digit ('0'..'9').
constexpr bool IsValidBase10(char ch) { return ch >= '0' && ch <= '9'; }
44 | | |
// Returns true for an ASCII hexadecimal digit ('0'..'9', 'A'..'F', 'a'..'f').
constexpr bool IsValidBase16(char ch) {
  if ('0' <= ch && ch <= '9') {
    return true;
  }
  if ('A' <= ch && ch <= 'F') {
    return true;
  }
  return 'a' <= ch && ch <= 'f';
}
50 | | |
// Incrementally scans a character stream for nested tokenized messages of the
// form $[{domain}][base#]data and replaces each fully recognized message with
// its detokenized text. Characters that are not part of a recognized message
// pass through to the output unchanged. Messages may use Base10 ($10#...),
// Base16 ($# or $16#...), or Base64 (default) token data.
class NestedMessageDetokenizer {
 public:
  NestedMessageDetokenizer(const Detokenizer& detokenizer)
      : detokenizer_(detokenizer),
        message_start_(0),
        domain_size_(0),
        data_start_(0) {}

  // Feeds an entire chunk of text through the state machine, one character at
  // a time.
  void Detokenize(std::string_view chunk) {
    for (char next_char : chunk) {
      Detokenize(next_char);
    }
  }

  // Returns true if any message was replaced since the last call. Clears the
  // flag as a side effect.
  bool OutputChangedSinceLastCheck() {
    return std::exchange(output_changed_, false);
  }

  // Advances the state machine by one character.
  void Detokenize(char next_char) {
    if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
      // A new '$' always starts a fresh message; finish (and possibly decode)
      // any message that was in progress first.
      HandleEndOfMessage();

      message_start_ = output_.size();
      state_ = kMessageStart;
      output_.push_back(next_char);
      return;
    }

    // The character is appended before dispatch; handlers below rely on it
    // already being present in output_ (e.g. for size-based block checks).
    output_.push_back(next_char);
    switch (state_) {
      case kPassthrough:
        break;
      case kMessageStart:
        if (next_char == '{') {
          state_ = kDomain;
        } else {
          HandleRadixOrBase64Data(next_char);
        }
        break;
      case kDomain:
        if (next_char == '}') {
          state_ = kRadixOrData;
        } else if (internal::ValidDomainChar(next_char)) {
          domain_size_ += 1;
        } else {
          ResetMessage();
        }
        break;
      case kRadixOrData:
        HandleRadixOrBase64Data(next_char);
        break;
      case kRadix10Or16:
        // Saw "$1"; a following '0' or '6' may complete a "10" or "16" radix.
        if (next_char == '0' || next_char == '6') {
          state_ = kRadixEnd;
        } else {
          state_ = kData64;
          HandleBase64Char(next_char);
        }
        break;
      case kRadix64:
        // Saw "$6"; a following '4' may complete a "64" radix.
        if (next_char == '4') {
          state_ = kRadixEnd;
        } else {
          state_ = kData64;
          HandleBase64Char(next_char);
        }
        break;
      case kRadixEnd:
        if (next_char == '#') {
          // Check if the radix was 10, 16, or 64.
          const char digit = output_[output_.size() - 2];
          state_ = digit == '0' ? kData10 : digit == '6' ? kData16 : kData64;
          data_start_ = output_.size();
        } else {
          // Not a radix after all ("$16x..."); treat the run as Base64 data.
          state_ = kData64;
          HandleBase64Char(next_char);
        }
        break;
      case kData10:
        HandleBase10Char(next_char);
        break;
      case kData16:
        HandleBase16Char(next_char);
        break;
      case kData64:
        HandleBase64Char(next_char);
        break;
      case kData64Padding:
        if (next_char == '=') {
          HandleEndOfMessageValidBase64();
        } else {
          ResetMessage();
        }
        break;
    }
  }

  // Finalizes any in-progress message and returns the accumulated output,
  // leaving the detokenizer empty.
  std::string Flush() {
    HandleEndOfMessage();
    std::string output(std::move(output_));
    output_.clear();
    return output;
  }

 private:
  // View of the domain text captured for the current message.
  std::string_view domain() const {
    // The domain starts 2 characters after the message start ("${domain}").
    return std::string_view(output_.data() + message_start_ + 2, domain_size_);
  }

  // Handles the first character after "$" or "${domain}": either a '#'
  // (Base16 data follows), the start of a radix ("10", "16", "64"), or
  // Base64 data.
  void HandleRadixOrBase64Data(char next_char) {
    if (next_char == '#') {
      state_ = kData16;              // $# or ${}# means base 16
      data_start_ = output_.size();  // data starts after the #
      return;
    }

    // If this is Base64 data, it includes this character.
    data_start_ = output_.size() - 1;
    if (next_char == '1') {
      state_ = kRadix10Or16;
    } else if (next_char == '6') {
      state_ = kRadix64;
    } else if (base64::IsValidChar(next_char)) {
      state_ = kData64;
    } else {
      ResetMessage();
    }
  }

  // Accumulates Base10 token data; decodes once exactly 10 digits arrive.
  void HandleBase10Char(char next_char) {
    if (!IsValidBase10(next_char)) {
      ResetMessage();
      return;
    }

    // Base10 data must be 10 chars long.
    const size_t block_size = (output_.size() - data_start_);
    if (block_size == 10) {
      HandleEndOfMessageValidBase10OrBase16(10);
    }
  }

  // Accumulates Base16 token data; decodes once exactly 8 digits arrive.
  void HandleBase16Char(char next_char) {
    if (!IsValidBase16(next_char)) {
      ResetMessage();
      return;
    }

    // Base16 data must be 8 chars long.
    const size_t block_size = (output_.size() - data_start_);
    if (block_size == 8) {
      HandleEndOfMessageValidBase10OrBase16(16);
    }
  }

  // The first few characters after $ could be either a radix specification or
  // Base64 data (e.g. $16dAw5== versus $16#00000001).
  void HandleBase64Char(char next_char) {
    if (base64::IsValidChar(next_char)) {
      return;
    }

    // Base64 data must be in 4 char blocks, ending with padding if needed.
    const size_t block_size = (output_.size() - data_start_) % 4;
    if (block_size == 1) {
      // Got invalid character after a 4-byte block. Pop that char and decode.
      output_.pop_back();
      HandleEndOfMessageValidBase64();
      output_.push_back(next_char);
    } else if (block_size == 2 || next_char != '=') {
      // Invalid character not on a 4-char block boundary. Could try decoding at
      // the block boundary instead of resetting.
      ResetMessage();
    } else if (block_size == 3) {  // Found padding '=' character, need 1 more.
      state_ = kData64Padding;
    } else {  // The '=' was the final character of the block.
      HandleEndOfMessageValidBase64();
    }
  }

  // Called when the current message can no longer continue (new '$' seen or
  // the stream was flushed); decodes the message if it is complete.
  void HandleEndOfMessage() {
    if (state_ < kData10) {
      // It's not possible to have a complete token outside of the kData
      // states, even for the shortest possible messages ($10==).
      ResetMessage();
      return;
    }

    if (state_ >= kData64) {
      // Base64 data must come in 4-byte blocks.
      if ((output_.size() - data_start_) % 4 == 0) {
        HandleEndOfMessageValidBase64();
      } else {
        ResetMessage();
      }
      return;
    }

    if (state_ == kData10) {
      if (output_.size() - data_start_ == 10) {
        HandleEndOfMessageValidBase10OrBase16(10);
      }
    } else if (state_ == kData16) {
      if (output_.size() - data_start_ == 8) {
        HandleEndOfMessageValidBase10OrBase16(16);
      }
    }
    ResetMessage();
  }

  // Parses the numeric token data in the given base and attempts a lookup.
  void HandleEndOfMessageValidBase10OrBase16(int base) {
    char* data_start = output_.data() + data_start_;
    char* data_end = output_.data() + output_.size();

    uint32_t token = 0;

    auto [_, ec] = std::from_chars(data_start, data_end, token, base);

    if (ec == std::errc()) {
      DetokenizeOnce(token);
    } else {
      ResetMessage();
    }
  }

  // Decodes the Base64 token data (token + arguments) and attempts a lookup.
  void HandleEndOfMessageValidBase64() {
    std::string_view data(output_.data() + data_start_,
                          output_.size() - data_start_);
    std::vector<std::byte> bytes(base64::DecodedSize(data));
    base64::Decode(data, bytes.data());
    DetokenizeOnceBase64(bytes);
  }

  // Replaces the in-progress message with its detokenized text, but only when
  // the lookup is unambiguous (exactly one database match, no arguments).
  void DetokenizeOnce(uint32_t token) {
    if (auto result = detokenizer_.DatabaseLookup(token, domain());
        result.size() == 1) {
      std::string replacement =
          result.front().first.Format(span<const uint8_t>()).value();
      output_.replace(message_start_, output_.size(), replacement);
      output_changed_ = true;
    }
    ResetMessage();
  }

  // Replaces the in-progress message with the best detokenization of the
  // Base64-decoded bytes, if any match was found.
  void DetokenizeOnceBase64(span<const std::byte> bytes) {
    if (auto result = detokenizer_.Detokenize(bytes, domain()); result.ok()) {
      output_.replace(message_start_, output_.size(), result.BestString());
      output_changed_ = true;
    }
    ResetMessage();
  }

  // Abandons the current message; already-appended characters remain in
  // output_ as plain text.
  void ResetMessage() {
    message_start_ = 0;
    domain_size_ = 0;
    data_start_ = 0;
    state_ = kPassthrough;
  }

  const Detokenizer& detokenizer_;
  std::string output_;    // Accumulated output, including partial messages
  size_t message_start_;  // Index of the message prefix ($)
  size_t domain_size_;    // Length of the captured {domain}, if any
  size_t data_start_;     // Index of the token data

  // Parser states, ordered so that comparisons (state_ < kData10,
  // state_ >= kData64) partition them into pre-data and data phases.
  enum : uint8_t {
    kPassthrough,  // not parsing a nested message
    kMessageStart,
    kDomain,
    kRadixOrData,
    kRadix10Or16,
    kRadix64,
    kRadixEnd,
    kData10,
    kData16,
    kData64,
    kData64Padding,
  } state_ = kPassthrough;
  bool output_changed_ = false;  // Set when any message is replaced
};
332 | | |
333 | 0 | std::string UnknownTokenMessage(uint32_t value) { |
334 | 0 | std::string output(PW_TOKENIZER_ARG_DECODING_ERROR_PREFIX "unknown token "); |
335 | | |
336 | | // Output a hexadecimal version of the token. |
337 | 0 | for (int shift = 28; shift >= 0; shift -= 4) { |
338 | 0 | output.push_back("0123456789abcdef"[(value >> shift) & 0xF]); |
339 | 0 | } |
340 | |
|
341 | 0 | output.append(PW_TOKENIZER_ARG_DECODING_ERROR_SUFFIX); |
342 | 0 | return output; |
343 | 0 | } |
344 | | |
345 | | // Decoding result with the date removed, for sorting. |
346 | | using DecodingResult = std::pair<DecodedFormatString, uint32_t>; |
347 | | |
348 | | // Determines if one result is better than the other if collisions occurred. |
349 | | // Returns true if lhs is preferred over rhs. This logic should match the |
350 | | // collision resolution logic in detokenize.py. |
351 | 0 | bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) { |
352 | | // Favor the result for which decoding succeeded. |
353 | 0 | if (lhs.first.ok() != rhs.first.ok()) { |
354 | 0 | return lhs.first.ok(); |
355 | 0 | } |
356 | | |
357 | | // Favor the result for which all bytes were decoded. |
358 | 0 | if ((lhs.first.remaining_bytes() == 0u) != |
359 | 0 | (rhs.first.remaining_bytes() == 0u)) { |
360 | 0 | return lhs.first.remaining_bytes() == 0u; |
361 | 0 | } |
362 | | |
363 | | // Favor the result with fewer decoding errors. |
364 | 0 | if (lhs.first.decoding_errors() != rhs.first.decoding_errors()) { |
365 | 0 | return lhs.first.decoding_errors() < rhs.first.decoding_errors(); |
366 | 0 | } |
367 | | |
368 | | // Favor the result that successfully decoded the most arguments. |
369 | 0 | if (lhs.first.argument_count() != rhs.first.argument_count()) { |
370 | 0 | return lhs.first.argument_count() > rhs.first.argument_count(); |
371 | 0 | } |
372 | | |
373 | | // Favor the result that was removed from the database most recently. |
374 | 0 | return lhs.second > rhs.second; |
375 | 0 | } |
376 | | |
// Returns true if all characters in data are printable, space, or if the string
// is empty.
constexpr bool IsPrintableAscii(std::string_view data) {
  // This follows the logic in pw_tokenizer.decode_optionally_tokenized below:
  //
  //   if ''.join(text.split()).isprintable():
  //     return text
  //
  for (char ch : data) {
    // <cctype> classification functions require an argument representable as
    // unsigned char (or EOF); passing a negative char (e.g. a byte >= 0x80 on
    // platforms where char is signed) is undefined behavior, so cast first.
    const int letter = static_cast<unsigned char>(ch);
    if (std::isprint(letter) == 0 && std::isspace(letter) == 0) {
      return false;
    }
  }
  return true;
}
392 | | |
393 | | void AddEntryIfUnique(std::vector<TokenizedStringEntry>& entries, |
394 | 0 | std::string_view new_entry) { |
395 | | // TODO(b/326365218): Construct FormatString with string_view to avoid |
396 | | // creating a copy here. |
397 | 0 | FormatString format_string(std::string(new_entry).c_str()); |
398 | 0 | for (const TokenizedStringEntry& entry : entries) { |
399 | 0 | if (format_string == entry.first) { |
400 | 0 | return; // An identical string is already present |
401 | 0 | } |
402 | 0 | } |
403 | | |
404 | 0 | entries.emplace_back(std::move(format_string), |
405 | 0 | TokenDatabase::kDateRemovedNever); |
406 | 0 | } |
407 | | |
408 | | } // namespace |
409 | | |
410 | | DetokenizedString::DetokenizedString( |
411 | | const Detokenizer& detokenizer, |
412 | | bool recursion, |
413 | | uint32_t token, |
414 | | const span<const TokenizedStringEntry>& entries, |
415 | | const span<const std::byte>& arguments) |
416 | 11.2k | : token_(token), has_token_(true) { |
417 | 11.2k | std::vector<DecodingResult> results; |
418 | | |
419 | 11.2k | for (const auto& [format, date_removed] : entries) { |
420 | 1.75k | results.emplace_back( |
421 | 1.75k | format.Format(span(reinterpret_cast<const uint8_t*>(arguments.data()), |
422 | 1.75k | arguments.size())), |
423 | 1.75k | date_removed); |
424 | 1.75k | } |
425 | | |
426 | 11.2k | std::sort(results.begin(), results.end(), IsBetterResult); |
427 | 11.2k | for (auto& result : results) { |
428 | 1.75k | matches_.push_back(std::move(result.first)); |
429 | 1.75k | } |
430 | | |
431 | 11.2k | if (recursion && !matches_.empty()) { |
432 | 0 | best_string_ = detokenizer.DetokenizeText(matches_[0].value()); |
433 | 11.2k | } else if (!matches_.empty()) { |
434 | 1.75k | best_string_ = matches_[0].value(); |
435 | 9.48k | } else { |
436 | 9.48k | best_string_ = std::string(); |
437 | 9.48k | } |
438 | 11.2k | } |
439 | | |
440 | 0 | std::string DetokenizedString::BestStringWithErrors() const { |
441 | 0 | if (matches_.empty()) { |
442 | 0 | return has_token_ ? UnknownTokenMessage(token_) |
443 | 0 | : PW_TOKENIZER_ARG_DECODING_ERROR("missing token"); |
444 | 0 | } |
445 | 0 | return matches_[0].value_with_errors(); |
446 | 0 | } |
447 | | |
448 | 1 | Detokenizer::Detokenizer(const TokenDatabase& database) { |
449 | 4 | for (const auto& entry : database) { |
450 | 4 | database_[kDefaultDomain][entry.token].emplace_back(entry.string, |
451 | 4 | entry.date_removed); |
452 | 4 | } |
453 | 1 | } |
454 | | |
455 | | Result<Detokenizer> Detokenizer::FromElfSection( |
456 | 0 | span<const std::byte> elf_section) { |
457 | 0 | size_t index = 0; |
458 | 0 | DomainTokenEntriesMap database; |
459 | |
|
460 | 0 | while (index + sizeof(_pw_tokenizer_EntryHeader) < elf_section.size()) { |
461 | 0 | _pw_tokenizer_EntryHeader header; |
462 | 0 | std::memcpy( |
463 | 0 | &header, elf_section.data() + index, sizeof(_pw_tokenizer_EntryHeader)); |
464 | 0 | index += sizeof(_pw_tokenizer_EntryHeader); |
465 | |
|
466 | 0 | if (header.magic != _PW_TOKENIZER_ENTRY_MAGIC) { |
467 | 0 | return Status::DataLoss(); |
468 | 0 | } |
469 | | |
470 | 0 | if (index + header.domain_length + header.string_length <= |
471 | 0 | elf_section.size()) { |
472 | 0 | std::string domain( |
473 | 0 | reinterpret_cast<const char*>(elf_section.data() + index), |
474 | 0 | header.domain_length - 1); |
475 | 0 | index += header.domain_length; |
476 | |
|
477 | 0 | std::string_view entry( |
478 | 0 | reinterpret_cast<const char*>(elf_section.data() + index), |
479 | 0 | header.string_length - 1); |
480 | 0 | index += header.string_length; |
481 | |
|
482 | 0 | AddEntryIfUnique(database[std::move(domain)][header.token], entry); |
483 | 0 | } |
484 | 0 | } |
485 | 0 | return Detokenizer(std::move(database)); |
486 | 0 | } |
487 | | |
488 | 0 | Result<Detokenizer> Detokenizer::FromElfFile(stream::SeekableReader& stream) { |
489 | 0 | PW_TRY_ASSIGN(auto reader, pw::elf::ElfReader::FromStream(stream)); |
490 | |
|
491 | 0 | constexpr auto kTokenSectionName = ".pw_tokenizer.entries"; |
492 | 0 | PW_TRY_ASSIGN(std::vector<std::byte> section_data, |
493 | 0 | reader.ReadSection(kTokenSectionName)); |
494 | |
|
495 | 0 | return Detokenizer::FromElfSection(section_data); |
496 | 0 | } |
497 | | |
498 | 0 | Result<Detokenizer> Detokenizer::FromCsv(std::string_view csv) { |
499 | 0 | std::vector<std::vector<std::string>> parsed_csv = ParseCsv(csv); |
500 | 0 | DomainTokenEntriesMap database; |
501 | | |
502 | | // CSV databases are in the format -> token, date, domain, string. |
503 | 0 | int invalid_row_count = 0; |
504 | 0 | for (const auto& row : parsed_csv) { |
505 | 0 | if (row.size() != 4) { |
506 | 0 | invalid_row_count++; |
507 | 0 | continue; |
508 | 0 | } |
509 | | // Ignore whitespace in the domain. |
510 | 0 | std::string domain = ""; |
511 | 0 | for (char c : row[2]) { |
512 | 0 | if (!std::isspace(c)) { |
513 | 0 | domain += c; |
514 | 0 | } |
515 | 0 | } |
516 | |
|
517 | 0 | const std::string& token = row[0]; |
518 | 0 | const std::string& date_removed = row[1]; |
519 | | |
520 | | // Validate length of token. |
521 | 0 | if (token.empty()) { |
522 | 0 | PW_LOG_ERROR("Corrupt database due to missing token"); |
523 | 0 | return Status::DataLoss(); |
524 | 0 | } |
525 | | |
526 | | // Validate token contents. |
527 | 0 | for (char c : token) { |
528 | 0 | if (!std::isxdigit(c)) { |
529 | 0 | PW_LOG_ERROR("Corrupt database due to token format"); |
530 | 0 | return Status::DataLoss(); |
531 | 0 | } |
532 | 0 | } |
533 | | |
534 | | // Validate date contents. |
535 | 0 | uint32_t date = TokenDatabase::kDateRemovedNever; |
536 | 0 | if (!date_removed.empty() && |
537 | 0 | date_removed.find_first_not_of(' ') != std::string::npos) { |
538 | 0 | size_t first_dash = date_removed.find('-'); |
539 | 0 | if (first_dash == std::string::npos || first_dash != 4) { |
540 | 0 | PW_LOG_ERROR("Wrong date format in database"); |
541 | 0 | return Status::DataLoss(); |
542 | 0 | } |
543 | | |
544 | 0 | size_t second_dash = date_removed.find('-', first_dash + 1); |
545 | 0 | if (second_dash == std::string::npos || second_dash != 7) { |
546 | 0 | PW_LOG_ERROR("Wrong date format in database"); |
547 | 0 | return Status::DataLoss(); |
548 | 0 | } |
549 | | |
550 | 0 | size_t pos; |
551 | 0 | int year = std::stoi(date_removed.substr(0, first_dash), &pos); |
552 | 0 | if (pos != first_dash) { |
553 | 0 | PW_LOG_ERROR("Wrong date format in database"); |
554 | 0 | return Status::DataLoss(); |
555 | 0 | } |
556 | | |
557 | 0 | int month = std::stoi( |
558 | 0 | date_removed.substr(first_dash + 1, second_dash - first_dash - 1), |
559 | 0 | &pos); |
560 | 0 | if (pos != second_dash - first_dash - 1) { |
561 | 0 | PW_LOG_ERROR("Wrong date format in database"); |
562 | 0 | return Status::DataLoss(); |
563 | 0 | } |
564 | | |
565 | 0 | int day = std::stoi(date_removed.substr(second_dash + 1), &pos); |
566 | 0 | if (pos != date_removed.size() - second_dash - 1) { |
567 | 0 | PW_LOG_ERROR("Wrong date format in database"); |
568 | 0 | return Status::DataLoss(); |
569 | 0 | } |
570 | | |
571 | 0 | date = static_cast<uint32_t>(year << 16) | |
572 | 0 | static_cast<uint32_t>(month << 8) | static_cast<uint32_t>(day); |
573 | 0 | } |
574 | | |
575 | | // Add to database. |
576 | 0 | database[std::move(domain)] |
577 | 0 | [static_cast<uint32_t>(std::stoul(token, nullptr, 16))] |
578 | 0 | .emplace_back(row[3].c_str(), date); |
579 | 0 | } |
580 | | |
581 | | // Log warning if any data lines were skipped. |
582 | 0 | if (invalid_row_count > 0) { |
583 | 0 | PW_LOG_WARN( |
584 | 0 | "Skipped %d of %zu lines because they did not have 4 columns as " |
585 | 0 | "expected.", |
586 | 0 | invalid_row_count, |
587 | 0 | parsed_csv.size()); |
588 | 0 | } |
589 | |
|
590 | 0 | return Detokenizer(std::move(database)); |
591 | 0 | } |
592 | | |
593 | | DetokenizedString Detokenizer::Detokenize(const span<const std::byte>& encoded, |
594 | | std::string_view domain, |
595 | 13.3k | bool recursion) const { |
596 | | // The token is missing from the encoded data; there is nothing to do. |
597 | 13.3k | if (encoded.empty()) { |
598 | 2.09k | return DetokenizedString(); |
599 | 2.09k | } |
600 | | |
601 | 11.2k | uint32_t token = bytes::ReadInOrder<uint32_t>( |
602 | 11.2k | endian::little, encoded.data(), encoded.size()); |
603 | | |
604 | 11.2k | const auto result = DatabaseLookup(token, domain); |
605 | | |
606 | 11.2k | return DetokenizedString(*this, |
607 | 11.2k | recursion, |
608 | 11.2k | token, |
609 | 11.2k | result, |
610 | 11.2k | encoded.size() < sizeof(token) |
611 | 11.2k | ? span<const std::byte>() |
612 | 11.2k | : encoded.subspan(sizeof(token))); |
613 | 13.3k | } |
614 | | |
615 | | DetokenizedString Detokenizer::DetokenizeBase64Message( |
616 | 0 | std::string_view text) const { |
617 | 0 | std::string buffer(text); |
618 | 0 | buffer.resize(PrefixedBase64DecodeInPlace(buffer)); |
619 | 0 | return Detokenize(buffer); |
620 | 0 | } |
621 | | |
622 | | span<const TokenizedStringEntry> Detokenizer::DatabaseLookup( |
623 | 11.2k | uint32_t token, std::string_view domain) const { |
624 | 11.2k | std::string canonical_domain; |
625 | 11.2k | for (char ch : domain) { |
626 | 0 | if (!std::isspace(ch)) { |
627 | 0 | canonical_domain.push_back(ch); |
628 | 0 | } |
629 | 0 | } |
630 | | |
631 | 11.2k | auto domain_it = database_.find(canonical_domain); |
632 | 11.2k | if (domain_it == database_.end()) { |
633 | 0 | return span<TokenizedStringEntry>(); |
634 | 0 | } |
635 | 11.2k | auto token_it = domain_it->second.find(token); |
636 | 11.2k | if (token_it == domain_it->second.end()) { |
637 | 9.48k | return span<TokenizedStringEntry>(); |
638 | 9.48k | } |
639 | | |
640 | 1.75k | return span(token_it->second); |
641 | 11.2k | } |
642 | | |
643 | | std::string Detokenizer::DetokenizeTextRecursive(std::string_view text, |
644 | 0 | unsigned max_passes) const { |
645 | 0 | NestedMessageDetokenizer detokenizer(*this); |
646 | 0 | detokenizer.Detokenize(text); |
647 | |
|
648 | 0 | std::string result; |
649 | 0 | unsigned pass = 1; |
650 | |
|
651 | 0 | while (true) { |
652 | 0 | result = detokenizer.Flush(); |
653 | 0 | if (pass >= max_passes || !detokenizer.OutputChangedSinceLastCheck()) { |
654 | 0 | break; |
655 | 0 | } |
656 | 0 | detokenizer.Detokenize(result); |
657 | 0 | pass += 1; |
658 | 0 | } |
659 | 0 | return result; |
660 | 0 | } |
661 | | |
662 | | std::string Detokenizer::DecodeOptionallyTokenizedData( |
663 | 0 | span<const std::byte> optionally_tokenized_data) const { |
664 | | // Try detokenizing as binary using the best result if available, else use |
665 | | // the input data as a string. |
666 | 0 | const auto result = Detokenize(optionally_tokenized_data); |
667 | 0 | const bool found_matches = !result.matches().empty(); |
668 | | // Note: unlike pw_tokenizer.proto.decode_optionally_tokenized, this decoding |
669 | | // process does not encode and decode UTF8 format, it is sufficient to check |
670 | | // if the data is printable ASCII. |
671 | 0 | const std::string data = |
672 | 0 | found_matches |
673 | 0 | ? result.BestString() |
674 | 0 | : std::string( |
675 | 0 | reinterpret_cast<const char*>(optionally_tokenized_data.data()), |
676 | 0 | optionally_tokenized_data.size()); |
677 | |
|
678 | 0 | const bool is_data_printable = IsPrintableAscii(data); |
679 | 0 | if (!found_matches && !is_data_printable) { |
680 | | // Assume the token is unknown or the data is corrupt. |
681 | 0 | std::vector<char> base64_encoding_buffer( |
682 | 0 | Base64EncodedBufferSize(optionally_tokenized_data.size())); |
683 | 0 | const size_t encoded_length = PrefixedBase64Encode( |
684 | 0 | optionally_tokenized_data, span(base64_encoding_buffer)); |
685 | 0 | return std::string{base64_encoding_buffer.data(), encoded_length}; |
686 | 0 | } |
687 | | |
688 | | // Successfully detokenized, check if the field has more prefixed |
689 | | // base64-encoded tokens. |
690 | 0 | const std::string field = DetokenizeText(data); |
691 | | // If anything detokenized successfully, use that. |
692 | 0 | if (field != data) { |
693 | 0 | return field; |
694 | 0 | } |
695 | | |
696 | | // Attempt to determine whether this is an unknown token or plain text. |
697 | | // Any string with only printable or whitespace characters is plain text. |
698 | 0 | if (found_matches || is_data_printable) { |
699 | 0 | return data; |
700 | 0 | } |
701 | | |
702 | | // Assume this field is tokenized data that could not be decoded. |
703 | 0 | std::vector<char> base64_encoding_buffer( |
704 | 0 | Base64EncodedBufferSize(optionally_tokenized_data.size())); |
705 | 0 | const size_t encoded_length = PrefixedBase64Encode( |
706 | 0 | optionally_tokenized_data, span(base64_encoding_buffer)); |
707 | 0 | return std::string{base64_encoding_buffer.data(), encoded_length}; |
708 | 0 | } |
709 | | |
710 | | } // namespace pw::tokenizer |