Coverage Report

Created: 2026-02-14 07:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/proc/self/cwd/pw_tokenizer/detokenize.cc
Line
Count
Source
1
// Copyright 2025 The Pigweed Authors
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
4
// use this file except in compliance with the License. You may obtain a copy of
5
// the License at
6
//
7
//     https://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12
// License for the specific language governing permissions and limitations under
13
// the License.
14
15
#include "pw_tokenizer/detokenize.h"
16
17
#include <algorithm>
18
#include <cctype>
19
#include <charconv>
20
#include <cstring>
21
#include <string_view>
22
#include <utility>
23
#include <vector>
24
25
#include "pw_base64/base64.h"
26
#include "pw_bytes/bit.h"
27
#include "pw_bytes/endian.h"
28
#include "pw_elf/reader.h"
29
#include "pw_log/log.h"
30
#include "pw_preprocessor/compiler.h"
31
#include "pw_result/result.h"
32
#include "pw_status/try.h"
33
#include "pw_tokenizer/base64.h"
34
#include "pw_tokenizer/internal/decode.h"
35
#include "pw_tokenizer/nested_tokenization.h"
36
#include "pw_tokenizer/tokenize.h"
37
#include "pw_tokenizer_private/csv.h"
38
39
namespace pw::tokenizer {
40
namespace {
41
42
// True if a Base10 character.
43
0
constexpr bool IsValidBase10(char ch) { return ('0' <= ch && ch <= '9'); }
44
45
// True if a Base16 character.
46
0
constexpr bool IsValidBase16(char ch) {
47
0
  return ('0' <= ch && ch <= '9') || ('A' <= ch && ch <= 'F') ||
48
0
         ('a' <= ch && ch <= 'f');
49
0
}
50
51
class NestedMessageDetokenizer {
52
 public:
53
  NestedMessageDetokenizer(const Detokenizer& detokenizer)
54
0
      : detokenizer_(detokenizer),
55
0
        message_start_(0),
56
0
        domain_size_(0),
57
0
        data_start_(0) {}
58
59
0
  void Detokenize(std::string_view chunk) {
60
0
    for (char next_char : chunk) {
61
0
      Detokenize(next_char);
62
0
    }
63
0
  }
64
65
0
  bool OutputChangedSinceLastCheck() {
66
0
    return std::exchange(output_changed_, false);
67
0
  }
68
69
0
  void Detokenize(char next_char) {
70
0
    if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
71
0
      HandleEndOfMessage();
72
73
0
      message_start_ = output_.size();
74
0
      state_ = kMessageStart;
75
0
      output_.push_back(next_char);
76
0
      return;
77
0
    }
78
79
0
    output_.push_back(next_char);
80
0
    switch (state_) {
81
0
      case kPassthrough:
82
0
        break;
83
0
      case kMessageStart:
84
0
        if (next_char == '{') {
85
0
          state_ = kDomain;
86
0
        } else {
87
0
          HandleRadixOrBase64Data(next_char);
88
0
        }
89
0
        break;
90
0
      case kDomain:
91
0
        if (next_char == '}') {
92
0
          state_ = kRadixOrData;
93
0
        } else if (internal::ValidDomainChar(next_char)) {
94
0
          domain_size_ += 1;
95
0
        } else {
96
0
          ResetMessage();
97
0
        }
98
0
        break;
99
0
      case kRadixOrData:
100
0
        HandleRadixOrBase64Data(next_char);
101
0
        break;
102
0
      case kRadix10Or16:
103
0
        if (next_char == '0' || next_char == '6') {
104
0
          state_ = kRadixEnd;
105
0
        } else {
106
0
          state_ = kData64;
107
0
          HandleBase64Char(next_char);
108
0
        }
109
0
        break;
110
0
      case kRadix64:
111
0
        if (next_char == '4') {
112
0
          state_ = kRadixEnd;
113
0
        } else {
114
0
          state_ = kData64;
115
0
          HandleBase64Char(next_char);
116
0
        }
117
0
        break;
118
0
      case kRadixEnd:
119
0
        if (next_char == '#') {
120
          // Check if the radix was 10, 16, or 64.
121
0
          const char digit = output_[output_.size() - 2];
122
0
          state_ = digit == '0' ? kData10 : digit == '6' ? kData16 : kData64;
123
0
          data_start_ = output_.size();
124
0
        } else {
125
0
          state_ = kData64;
126
0
          HandleBase64Char(next_char);
127
0
        }
128
0
        break;
129
0
      case kData10:
130
0
        HandleBase10Char(next_char);
131
0
        break;
132
0
      case kData16:
133
0
        HandleBase16Char(next_char);
134
0
        break;
135
0
      case kData64:
136
0
        HandleBase64Char(next_char);
137
0
        break;
138
0
      case kData64Padding:
139
0
        if (next_char == '=') {
140
0
          HandleEndOfMessageValidBase64();
141
0
        } else {
142
0
          ResetMessage();
143
0
        }
144
0
        break;
145
0
    }
146
0
  }
147
148
0
  std::string Flush() {
149
0
    HandleEndOfMessage();
150
0
    std::string output(std::move(output_));
151
0
    output_.clear();
152
0
    return output;
153
0
  }
154
155
 private:
156
0
  std::string_view domain() const {
157
    // The domain starts 2 characters after the message start ("${domain}").
158
0
    return std::string_view(output_.data() + message_start_ + 2, domain_size_);
159
0
  }
160
161
0
  void HandleRadixOrBase64Data(char next_char) {
162
0
    if (next_char == '#') {
163
0
      state_ = kData16;              // $# or ${}# means base 16
164
0
      data_start_ = output_.size();  // data starts after the #
165
0
      return;
166
0
    }
167
168
    // If this is Base64 data, it includes this character.
169
0
    data_start_ = output_.size() - 1;
170
0
    if (next_char == '1') {
171
0
      state_ = kRadix10Or16;
172
0
    } else if (next_char == '6') {
173
0
      state_ = kRadix64;
174
0
    } else if (base64::IsValidChar(next_char)) {
175
0
      state_ = kData64;
176
0
    } else {
177
0
      ResetMessage();
178
0
    }
179
0
  }
180
181
0
  void HandleBase10Char(char next_char) {
182
0
    if (!IsValidBase10(next_char)) {
183
0
      ResetMessage();
184
0
      return;
185
0
    }
186
187
    // Base10 data must be 10 chars long.
188
0
    const size_t block_size = (output_.size() - data_start_);
189
0
    if (block_size == 10) {
190
0
      HandleEndOfMessageValidBase10OrBase16(10);
191
0
    }
192
0
  }
193
194
0
  void HandleBase16Char(char next_char) {
195
0
    if (!IsValidBase16(next_char)) {
196
0
      ResetMessage();
197
0
      return;
198
0
    }
199
200
    // Base16 data must be 8 chars long.
201
0
    const size_t block_size = (output_.size() - data_start_);
202
0
    if (block_size == 8) {
203
0
      HandleEndOfMessageValidBase10OrBase16(16);
204
0
    }
205
0
  }
206
207
  // The first few characters after $ could be either a radix specification or
208
  // Base64 data (e.g. $16dAw5== versus $16#00000001).
209
0
  void HandleBase64Char(char next_char) {
210
0
    if (base64::IsValidChar(next_char)) {
211
0
      return;
212
0
    }
213
214
    // Base64 data must be in 4 char blocks, ending with padding if needed.
215
0
    const size_t block_size = (output_.size() - data_start_) % 4;
216
0
    if (block_size == 1) {
217
      // Got invalid character after a 4-byte block. Pop that char and decode.
218
0
      output_.pop_back();
219
0
      HandleEndOfMessageValidBase64();
220
0
      output_.push_back(next_char);
221
0
    } else if (block_size == 2 || next_char != '=') {
222
      // Invalid character not on a 4-char block boundary. Could try decoding at
223
      // the block boundary instead of resetting.
224
0
      ResetMessage();
225
0
    } else if (block_size == 3) {  // Found padding '=' character, need 1 more.
226
0
      state_ = kData64Padding;
227
0
    } else {  // The '=' was the final character of the block.
228
0
      HandleEndOfMessageValidBase64();
229
0
    }
230
0
  }
231
232
0
  void HandleEndOfMessage() {
233
0
    if (state_ < kData10) {
234
      // It's not possible to have a complete token outside of the kData
235
      // states, even for the shortest possible messages ($10==).
236
0
      ResetMessage();
237
0
      return;
238
0
    }
239
240
0
    if (state_ >= kData64) {
241
      // Base64 data must come in 4-byte blocks.
242
0
      if ((output_.size() - data_start_) % 4 == 0) {
243
0
        HandleEndOfMessageValidBase64();
244
0
      } else {
245
0
        ResetMessage();
246
0
      }
247
0
      return;
248
0
    }
249
250
0
    if (state_ == kData10) {
251
0
      if (output_.size() - data_start_ == 10) {
252
0
        HandleEndOfMessageValidBase10OrBase16(10);
253
0
      }
254
0
    } else if (state_ == kData16) {
255
0
      if (output_.size() - data_start_ == 8) {
256
0
        HandleEndOfMessageValidBase10OrBase16(16);
257
0
      }
258
0
    }
259
0
    ResetMessage();
260
0
  }
261
262
0
  void HandleEndOfMessageValidBase10OrBase16(int base) {
263
0
    char* data_start = output_.data() + data_start_;
264
0
    char* data_end = output_.data() + output_.size();
265
266
0
    uint32_t token = 0;
267
268
0
    auto [_, ec] = std::from_chars(data_start, data_end, token, base);
269
270
0
    if (ec == std::errc()) {
271
0
      DetokenizeOnce(token);
272
0
    } else {
273
0
      ResetMessage();
274
0
    }
275
0
  }
276
277
0
  void HandleEndOfMessageValidBase64() {
278
0
    std::string_view data(output_.data() + data_start_,
279
0
                          output_.size() - data_start_);
280
0
    std::vector<std::byte> bytes(base64::DecodedSize(data));
281
0
    base64::Decode(data, bytes.data());
282
0
    DetokenizeOnceBase64(bytes);
283
0
  }
284
285
0
  void DetokenizeOnce(uint32_t token) {
286
0
    if (auto result = detokenizer_.DatabaseLookup(token, domain());
287
0
        result.size() == 1) {
288
0
      std::string replacement =
289
0
          result.front().first.Format(span<const uint8_t>()).value();
290
0
      output_.replace(message_start_, output_.size(), replacement);
291
0
      output_changed_ = true;
292
0
    }
293
0
    ResetMessage();
294
0
  }
295
296
0
  void DetokenizeOnceBase64(span<const std::byte> bytes) {
297
0
    if (auto result = detokenizer_.Detokenize(bytes, domain()); result.ok()) {
298
0
      output_.replace(message_start_, output_.size(), result.BestString());
299
0
      output_changed_ = true;
300
0
    }
301
0
    ResetMessage();
302
0
  }
303
304
0
  void ResetMessage() {
305
0
    message_start_ = 0;
306
0
    domain_size_ = 0;
307
0
    data_start_ = 0;
308
0
    state_ = kPassthrough;
309
0
  }
310
311
  const Detokenizer& detokenizer_;
312
  std::string output_;
313
  size_t message_start_;  // Index of the message prefix ($)
314
  size_t domain_size_;
315
  size_t data_start_;  // Index of the token data
316
317
  enum : uint8_t {
318
    kPassthrough,  // not parsing a nested message
319
    kMessageStart,
320
    kDomain,
321
    kRadixOrData,
322
    kRadix10Or16,
323
    kRadix64,
324
    kRadixEnd,
325
    kData10,
326
    kData16,
327
    kData64,
328
    kData64Padding,
329
  } state_ = kPassthrough;
330
  bool output_changed_ = false;
331
};
332
333
0
std::string UnknownTokenMessage(uint32_t value) {
334
0
  std::string output(PW_TOKENIZER_ARG_DECODING_ERROR_PREFIX "unknown token ");
335
336
  // Output a hexadecimal version of the token.
337
0
  for (int shift = 28; shift >= 0; shift -= 4) {
338
0
    output.push_back("0123456789abcdef"[(value >> shift) & 0xF]);
339
0
  }
340
341
0
  output.append(PW_TOKENIZER_ARG_DECODING_ERROR_SUFFIX);
342
0
  return output;
343
0
}
344
345
// Decoding result with the date removed, for sorting.
346
using DecodingResult = std::pair<DecodedFormatString, uint32_t>;
347
348
// Determines if one result is better than the other if collisions occurred.
349
// Returns true if lhs is preferred over rhs. This logic should match the
350
// collision resolution logic in detokenize.py.
351
0
bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) {
352
  // Favor the result for which decoding succeeded.
353
0
  if (lhs.first.ok() != rhs.first.ok()) {
354
0
    return lhs.first.ok();
355
0
  }
356
357
  // Favor the result for which all bytes were decoded.
358
0
  if ((lhs.first.remaining_bytes() == 0u) !=
359
0
      (rhs.first.remaining_bytes() == 0u)) {
360
0
    return lhs.first.remaining_bytes() == 0u;
361
0
  }
362
363
  // Favor the result with fewer decoding errors.
364
0
  if (lhs.first.decoding_errors() != rhs.first.decoding_errors()) {
365
0
    return lhs.first.decoding_errors() < rhs.first.decoding_errors();
366
0
  }
367
368
  // Favor the result that successfully decoded the most arguments.
369
0
  if (lhs.first.argument_count() != rhs.first.argument_count()) {
370
0
    return lhs.first.argument_count() > rhs.first.argument_count();
371
0
  }
372
373
  // Favor the result that was removed from the database most recently.
374
0
  return lhs.second > rhs.second;
375
0
}
376
377
// Returns true if all characters in data are printable, space, or if the string
378
// is empty.
379
0
constexpr bool IsPrintableAscii(std::string_view data) {
380
  // This follows the logic in pw_tokenizer.decode_optionally_tokenized below:
381
  //
382
  //   if ''.join(text.split()).isprintable():
383
  //     return text
384
  //
385
0
  for (int letter : data) {
386
0
    if (std::isprint(letter) == 0 && std::isspace(letter) == 0) {
387
0
      return false;
388
0
    }
389
0
  }
390
0
  return true;
391
0
}
392
393
void AddEntryIfUnique(std::vector<TokenizedStringEntry>& entries,
394
                      std::string_view new_entry,
395
0
                      uint32_t date_removed) {
396
  // TODO(b/326365218): Construct FormatString with string_view to avoid
397
  // creating a copy here.
398
0
  FormatString format_string(std::string(new_entry).c_str());
399
0
  for (TokenizedStringEntry& entry : entries) {
400
0
    if (format_string == entry.first) {
401
0
      if (date_removed > entry.second) {
402
0
        entry.second = date_removed;
403
0
      }
404
0
      return;
405
0
    }
406
0
  }
407
408
0
  entries.emplace_back(std::move(format_string), date_removed);
409
0
}
410
411
}  // namespace
412
413
DetokenizedString::DetokenizedString(
414
    const Detokenizer& detokenizer,
415
    bool recursion,
416
    uint32_t token,
417
    const span<const TokenizedStringEntry>& entries,
418
    const span<const std::byte>& arguments)
419
12.3k
    : token_(token), has_token_(true) {
420
12.3k
  std::vector<DecodingResult> results;
421
422
12.3k
  for (const auto& [format, date_removed] : entries) {
423
1.63k
    results.emplace_back(
424
1.63k
        format.Format(span(reinterpret_cast<const uint8_t*>(arguments.data()),
425
1.63k
                           arguments.size())),
426
1.63k
        date_removed);
427
1.63k
  }
428
429
12.3k
  std::sort(results.begin(), results.end(), IsBetterResult);
430
12.3k
  for (auto& result : results) {
431
1.63k
    matches_.push_back(std::move(result.first));
432
1.63k
  }
433
434
12.3k
  if (recursion && !matches_.empty()) {
435
0
    best_string_ = detokenizer.DetokenizeText(matches_[0].value());
436
12.3k
  } else if (!matches_.empty()) {
437
1.63k
    best_string_ = matches_[0].value();
438
10.6k
  } else {
439
10.6k
    best_string_ = std::string();
440
10.6k
  }
441
12.3k
}
442
443
0
std::string DetokenizedString::BestStringWithErrors() const {
444
0
  if (matches_.empty()) {
445
0
    return has_token_ ? UnknownTokenMessage(token_)
446
0
                      : PW_TOKENIZER_ARG_DECODING_ERROR("missing token");
447
0
  }
448
0
  return matches_[0].value_with_errors();
449
0
}
450
451
1
Detokenizer::Detokenizer(const TokenDatabase& database) {
452
4
  for (const auto& entry : database) {
453
4
    database_[kDefaultDomain][entry.token].emplace_back(entry.string,
454
4
                                                        entry.date_removed);
455
4
  }
456
1
}
457
458
Result<Detokenizer> Detokenizer::FromElfSection(
459
0
    span<const std::byte> elf_section) {
460
0
  size_t index = 0;
461
0
  DomainTokenEntriesMap database;
462
463
0
  while (index + sizeof(_pw_tokenizer_EntryHeader) < elf_section.size()) {
464
0
    _pw_tokenizer_EntryHeader header;
465
0
    std::memcpy(
466
0
        &header, elf_section.data() + index, sizeof(_pw_tokenizer_EntryHeader));
467
0
    index += sizeof(_pw_tokenizer_EntryHeader);
468
469
0
    if (header.magic != _PW_TOKENIZER_ENTRY_MAGIC) {
470
0
      return Status::DataLoss();
471
0
    }
472
473
0
    if (index + header.domain_length + header.string_length <=
474
0
        elf_section.size()) {
475
0
      std::string domain(
476
0
          reinterpret_cast<const char*>(elf_section.data() + index),
477
0
          header.domain_length - 1);
478
0
      index += header.domain_length;
479
480
0
      std::string_view entry(
481
0
          reinterpret_cast<const char*>(elf_section.data() + index),
482
0
          header.string_length - 1);
483
0
      index += header.string_length;
484
485
0
      AddEntryIfUnique(database[std::move(domain)][header.token],
486
0
                       entry,
487
0
                       TokenDatabase::kDateRemovedNever);
488
0
    }
489
0
  }
490
0
  return Detokenizer(std::move(database));
491
0
}
492
493
0
Result<Detokenizer> Detokenizer::FromElfFile(stream::SeekableReader& stream) {
494
0
  PW_TRY_ASSIGN(auto reader, pw::elf::ElfReader::FromStream(stream));
495
496
0
  constexpr auto kTokenSectionName = ".pw_tokenizer.entries";
497
0
  PW_TRY_ASSIGN(std::vector<std::byte> section_data,
498
0
                reader.ReadSection(kTokenSectionName));
499
500
0
  return Detokenizer::FromElfSection(section_data);
501
0
}
502
503
0
Result<Detokenizer> Detokenizer::FromCsv(std::string_view csv) {
504
0
  std::vector<std::vector<std::string>> parsed_csv = ParseCsv(csv);
505
0
  DomainTokenEntriesMap database;
506
507
  // CSV databases are in the format -> token, date, domain, string.
508
0
  int invalid_row_count = 0;
509
0
  for (const auto& row : parsed_csv) {
510
0
    if (row.size() != 4) {
511
0
      invalid_row_count++;
512
0
      continue;
513
0
    }
514
    // Ignore whitespace in the domain.
515
0
    std::string domain = "";
516
0
    for (char c : row[2]) {
517
0
      if (!std::isspace(c)) {
518
0
        domain += c;
519
0
      }
520
0
    }
521
522
0
    const std::string& token = row[0];
523
0
    const std::string& date_removed = row[1];
524
525
    // Validate length of token.
526
0
    if (token.empty()) {
527
0
      PW_LOG_ERROR("Corrupt database due to missing token");
528
0
      return Status::DataLoss();
529
0
    }
530
531
    // Validate token contents.
532
0
    for (char c : token) {
533
0
      if (!std::isxdigit(c)) {
534
0
        PW_LOG_ERROR("Corrupt database due to token format");
535
0
        return Status::DataLoss();
536
0
      }
537
0
    }
538
539
    // Validate date contents.
540
0
    uint32_t date = TokenDatabase::kDateRemovedNever;
541
0
    if (!date_removed.empty() &&
542
0
        date_removed.find_first_not_of(' ') != std::string::npos) {
543
0
      size_t first_dash = date_removed.find('-');
544
0
      if (first_dash == std::string::npos || first_dash != 4) {
545
0
        PW_LOG_ERROR("Wrong date format in database");
546
0
        return Status::DataLoss();
547
0
      }
548
549
0
      size_t second_dash = date_removed.find('-', first_dash + 1);
550
0
      if (second_dash == std::string::npos || second_dash != 7) {
551
0
        PW_LOG_ERROR("Wrong date format in database");
552
0
        return Status::DataLoss();
553
0
      }
554
555
0
      size_t pos;
556
0
      int year = std::stoi(date_removed.substr(0, first_dash), &pos);
557
0
      if (pos != first_dash) {
558
0
        PW_LOG_ERROR("Wrong date format in database");
559
0
        return Status::DataLoss();
560
0
      }
561
562
0
      int month = std::stoi(
563
0
          date_removed.substr(first_dash + 1, second_dash - first_dash - 1),
564
0
          &pos);
565
0
      if (pos != second_dash - first_dash - 1) {
566
0
        PW_LOG_ERROR("Wrong date format in database");
567
0
        return Status::DataLoss();
568
0
      }
569
570
0
      int day = std::stoi(date_removed.substr(second_dash + 1), &pos);
571
0
      if (pos != date_removed.size() - second_dash - 1) {
572
0
        PW_LOG_ERROR("Wrong date format in database");
573
0
        return Status::DataLoss();
574
0
      }
575
576
0
      date = static_cast<uint32_t>(year << 16) |
577
0
             static_cast<uint32_t>(month << 8) | static_cast<uint32_t>(day);
578
0
    }
579
580
    // Add to database.
581
0
    AddEntryIfUnique(database[std::move(domain)][static_cast<uint32_t>(
582
0
                         std::stoul(token, nullptr, 16))],
583
0
                     row[3],
584
0
                     date);
585
0
  }
586
587
  // Log warning if any data lines were skipped.
588
0
  if (invalid_row_count > 0) {
589
0
    PW_LOG_WARN(
590
0
        "Skipped %d of %zu lines because they did not have 4 columns as "
591
0
        "expected.",
592
0
        invalid_row_count,
593
0
        parsed_csv.size());
594
0
  }
595
596
0
  return Detokenizer(std::move(database));
597
0
}
598
599
DetokenizedString Detokenizer::Detokenize(const span<const std::byte>& encoded,
600
                                          std::string_view domain,
601
14.6k
                                          bool recursion) const {
602
  // The token is missing from the encoded data; there is nothing to do.
603
14.6k
  if (encoded.empty()) {
604
2.28k
    return DetokenizedString();
605
2.28k
  }
606
607
12.3k
  uint32_t token = bytes::ReadInOrder<uint32_t>(
608
12.3k
      endian::little, encoded.data(), encoded.size());
609
610
12.3k
  const auto result = DatabaseLookup(token, domain);
611
612
12.3k
  return DetokenizedString(*this,
613
12.3k
                           recursion,
614
12.3k
                           token,
615
12.3k
                           result,
616
12.3k
                           encoded.size() < sizeof(token)
617
12.3k
                               ? span<const std::byte>()
618
12.3k
                               : encoded.subspan(sizeof(token)));
619
14.6k
}
620
621
DetokenizedString Detokenizer::DetokenizeBase64Message(
622
0
    std::string_view text) const {
623
0
  std::string buffer(text);
624
0
  buffer.resize(PrefixedBase64DecodeInPlace(buffer));
625
0
  return Detokenize(buffer);
626
0
}
627
628
span<const TokenizedStringEntry> Detokenizer::DatabaseLookup(
629
12.3k
    uint32_t token, std::string_view domain) const {
630
12.3k
  std::string canonical_domain;
631
12.3k
  for (char ch : domain) {
632
0
    if (!std::isspace(ch)) {
633
0
      canonical_domain.push_back(ch);
634
0
    }
635
0
  }
636
637
12.3k
  auto domain_it = database_.find(canonical_domain);
638
12.3k
  if (domain_it == database_.end()) {
639
0
    return span<TokenizedStringEntry>();
640
0
  }
641
12.3k
  auto token_it = domain_it->second.find(token);
642
12.3k
  if (token_it == domain_it->second.end()) {
643
10.6k
    return span<TokenizedStringEntry>();
644
10.6k
  }
645
646
1.63k
  return span(token_it->second);
647
12.3k
}
648
649
std::string Detokenizer::DetokenizeTextRecursive(std::string_view text,
650
0
                                                 unsigned max_passes) const {
651
0
  NestedMessageDetokenizer detokenizer(*this);
652
0
  detokenizer.Detokenize(text);
653
654
0
  std::string result;
655
0
  unsigned pass = 1;
656
657
0
  while (true) {
658
0
    result = detokenizer.Flush();
659
0
    if (pass >= max_passes || !detokenizer.OutputChangedSinceLastCheck()) {
660
0
      break;
661
0
    }
662
0
    detokenizer.Detokenize(result);
663
0
    pass += 1;
664
0
  }
665
0
  return result;
666
0
}
667
668
std::string Detokenizer::DecodeOptionallyTokenizedData(
669
0
    span<const std::byte> optionally_tokenized_data) const {
670
  // Try detokenizing as binary using the best result if available, else use
671
  // the input data as a string.
672
0
  const auto result = Detokenize(optionally_tokenized_data);
673
0
  const bool found_matches = !result.matches().empty();
674
  // Note: unlike pw_tokenizer.proto.decode_optionally_tokenized, this decoding
675
  // process does not encode and decode UTF8 format, it is sufficient to check
676
  // if the data is printable ASCII.
677
0
  const std::string data =
678
0
      found_matches
679
0
          ? result.BestString()
680
0
          : std::string(
681
0
                reinterpret_cast<const char*>(optionally_tokenized_data.data()),
682
0
                optionally_tokenized_data.size());
683
684
0
  const bool is_data_printable = IsPrintableAscii(data);
685
0
  if (!found_matches && !is_data_printable) {
686
    // Assume the token is unknown or the data is corrupt.
687
0
    std::vector<char> base64_encoding_buffer(
688
0
        Base64EncodedBufferSize(optionally_tokenized_data.size()));
689
0
    const size_t encoded_length = PrefixedBase64Encode(
690
0
        optionally_tokenized_data, span(base64_encoding_buffer));
691
0
    return std::string{base64_encoding_buffer.data(), encoded_length};
692
0
  }
693
694
  // Successfully detokenized, check if the field has more prefixed
695
  // base64-encoded tokens.
696
0
  const std::string field = DetokenizeText(data);
697
  // If anything detokenized successfully, use that.
698
0
  if (field != data) {
699
0
    return field;
700
0
  }
701
702
  // Attempt to determine whether this is an unknown token or plain text.
703
  // Any string with only printable or whitespace characters is plain text.
704
0
  if (found_matches || is_data_printable) {
705
0
    return data;
706
0
  }
707
708
  // Assume this field is tokenized data that could not be decoded.
709
0
  std::vector<char> base64_encoding_buffer(
710
0
      Base64EncodedBufferSize(optionally_tokenized_data.size()));
711
0
  const size_t encoded_length = PrefixedBase64Encode(
712
0
      optionally_tokenized_data, span(base64_encoding_buffer));
713
0
  return std::string{base64_encoding_buffer.data(), encoded_length};
714
0
}
715
716
}  // namespace pw::tokenizer