Coverage Report

Created: 2024-07-09 06:09

/proc/self/cwd/pw_tokenizer/detokenize.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2020 The Pigweed Authors
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
4
// use this file except in compliance with the License. You may obtain a copy of
5
// the License at
6
//
7
//     https://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12
// License for the specific language governing permissions and limitations under
13
// the License.
14
15
#include "pw_tokenizer/detokenize.h"
16
17
#include <algorithm>
18
#include <cctype>
19
#include <cstring>
20
#include <string_view>
21
#include <vector>
22
23
#include "pw_bytes/bit.h"
24
#include "pw_bytes/endian.h"
25
#include "pw_result/result.h"
26
#include "pw_tokenizer/base64.h"
27
#include "pw_tokenizer/internal/decode.h"
28
#include "pw_tokenizer/nested_tokenization.h"
29
30
namespace pw::tokenizer {
31
namespace {
32
33
class NestedMessageDetokenizer {
34
 public:
35
  NestedMessageDetokenizer(const Detokenizer& detokenizer)
36
0
      : detokenizer_(detokenizer) {}
37
38
0
  void Detokenize(std::string_view chunk) {
39
0
    for (char next_char : chunk) {
40
0
      Detokenize(next_char);
41
0
    }
42
0
  }
43
44
0
  bool OutputChangedSinceLastCheck() {
45
0
    const bool changed = output_changed_;
46
0
    output_changed_ = false;
47
0
    return changed;
48
0
  }
49
50
0
  void Detokenize(char next_char) {
51
0
    switch (state_) {
52
0
      case kNonMessage:
53
0
        if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
54
0
          message_buffer_.push_back(next_char);
55
0
          state_ = kMessage;
56
0
        } else {
57
0
          output_.push_back(next_char);
58
0
        }
59
0
        break;
60
0
      case kMessage:
61
0
        if (base64::IsValidChar(next_char)) {
62
0
          message_buffer_.push_back(next_char);
63
0
        } else {
64
0
          HandleEndOfMessage();
65
0
          if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
66
0
            message_buffer_.push_back(next_char);
67
0
          } else {
68
0
            output_.push_back(next_char);
69
0
            state_ = kNonMessage;
70
0
          }
71
0
        }
72
0
        break;
73
0
    }
74
0
  }
75
76
0
  std::string Flush() {
77
0
    if (state_ == kMessage) {
78
0
      HandleEndOfMessage();
79
0
      state_ = kNonMessage;
80
0
    }
81
0
    std::string output(std::move(output_));
82
0
    output_.clear();
83
0
    return output;
84
0
  }
85
86
 private:
87
0
  void HandleEndOfMessage() {
88
0
    if (auto result = detokenizer_.DetokenizeBase64Message(message_buffer_);
89
0
        result.ok()) {
90
0
      output_ += result.BestString();
91
0
      output_changed_ = true;
92
0
    } else {
93
0
      output_ += message_buffer_;  // Keep the original if it doesn't decode.
94
0
    }
95
0
    message_buffer_.clear();
96
0
  }
97
98
  const Detokenizer& detokenizer_;
99
  std::string output_;
100
  std::string message_buffer_;
101
102
  enum : uint8_t { kNonMessage, kMessage } state_ = kNonMessage;
103
  bool output_changed_ = false;
104
};
105
106
0
std::string UnknownTokenMessage(uint32_t value) {
107
0
  std::string output(PW_TOKENIZER_ARG_DECODING_ERROR_PREFIX "unknown token ");
108
109
  // Output a hexadecimal version of the token.
110
0
  for (int shift = 28; shift >= 0; shift -= 4) {
111
0
    output.push_back("0123456789abcdef"[(value >> shift) & 0xF]);
112
0
  }
113
114
0
  output.append(PW_TOKENIZER_ARG_DECODING_ERROR_SUFFIX);
115
0
  return output;
116
0
}
117
118
// Decoding result with the date removed, for sorting.
119
// first: the decoded format string for one candidate database entry.
// second: that entry's date_removed value, which IsBetterResult compares as
// its final tie-breaker.
using DecodingResult = std::pair<DecodedFormatString, uint32_t>;
120
121
// Determines if one result is better than the other if collisions occurred.
122
// Returns true if lhs is preferred over rhs. This logic should match the
123
// collision resolution logic in detokenize.py.
124
0
bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) {
125
  // Favor the result for which decoding succeeded.
126
0
  if (lhs.first.ok() != rhs.first.ok()) {
127
0
    return lhs.first.ok();
128
0
  }
129
130
  // Favor the result for which all bytes were decoded.
131
0
  if ((lhs.first.remaining_bytes() == 0u) !=
132
0
      (rhs.first.remaining_bytes() == 0u)) {
133
0
    return lhs.first.remaining_bytes() == 0u;
134
0
  }
135
136
  // Favor the result with fewer decoding errors.
137
0
  if (lhs.first.decoding_errors() != rhs.first.decoding_errors()) {
138
0
    return lhs.first.decoding_errors() < rhs.first.decoding_errors();
139
0
  }
140
141
  // Favor the result that successfully decoded the most arguments.
142
0
  if (lhs.first.argument_count() != rhs.first.argument_count()) {
143
0
    return lhs.first.argument_count() > rhs.first.argument_count();
144
0
  }
145
146
  // Favor the result that was removed from the database most recently.
147
0
  return lhs.second > rhs.second;
148
0
}
149
150
// Returns true if every character in data is printable or whitespace, as
// classified by std::isprint and std::isspace. An empty string is considered
// printable.
constexpr bool IsPrintableAscii(std::string_view data) {
  // This follows the logic in pw_tokenizer.decode_optionally_tokenized:
  //
  //   if ''.join(text.split()).isprintable():
  //     return text
  //
  for (const char letter : data) {
    // The <cctype> classification functions have undefined behavior when
    // given a negative value other than EOF. Plain char may be signed, so
    // bytes >= 0x80 must be converted through unsigned char first.
    const int value = static_cast<unsigned char>(letter);
    if (std::isprint(value) == 0 && std::isspace(value) == 0) {
      return false;
    }
  }
  return true;
}
165
166
}  // namespace
167
168
// Decodes `arguments` against every candidate format string in `entries` and
// stores the decoded results in matches_, ordered from best to worst as
// ranked by IsBetterResult.
//
// token:     the token these entries were looked up with.
// entries:   candidate (format string, date_removed) pairs for the token; may
//            be empty, in which case matches_ is left empty.
// arguments: the encoded argument bytes that follow the token.
DetokenizedString::DetokenizedString(
    uint32_t token,
    const span<const TokenizedStringEntry>& entries,
    const span<const std::byte>& arguments)
    : token_(token), has_token_(true) {
  std::vector<DecodingResult> results;
  // The final size is known up front; avoid reallocating while appending.
  results.reserve(entries.size());

  // The decoder takes the arguments as uint8_t; the view is the same for
  // every entry, so build it once.
  const auto encoded_arguments = span(
      reinterpret_cast<const uint8_t*>(arguments.data()), arguments.size());

  for (const auto& [format, date_removed] : entries) {
    results.push_back(
        DecodingResult{format.Format(encoded_arguments), date_removed});
  }

  std::sort(results.begin(), results.end(), IsBetterResult);

  // Only the decoded strings are kept; the dates were needed just for sorting.
  for (auto& result : results) {
    matches_.push_back(std::move(result.first));
  }
}
188
189
0
std::string DetokenizedString::BestString() const {
190
0
  return matches_.empty() ? std::string() : matches_[0].value();
191
0
}
192
193
0
std::string DetokenizedString::BestStringWithErrors() const {
194
0
  if (matches_.empty()) {
195
0
    return has_token_ ? UnknownTokenMessage(token_)
196
0
                      : PW_TOKENIZER_ARG_DECODING_ERROR("missing token");
197
0
  }
198
0
  return matches_[0].value_with_errors();
199
0
}
200
201
1
// Builds the lookup table from a token database. Entries are grouped by
// token, so entries whose tokens collide end up in the same list.
Detokenizer::Detokenizer(const TokenDatabase& database) {
  for (const auto& entry : database) {
    // operator[] creates the list for a token the first time it is seen.
    auto& entries_for_token = database_[entry.token];
    entries_for_token.emplace_back(entry.string, entry.date_removed);
  }
}
206
207
// Builds a Detokenizer from the raw bytes of a tokenizer entry section.
//
// The section is read as a sequence of records, each consisting of a
// _pw_tokenizer_EntryHeader followed by domain_length bytes of domain and
// string_length bytes of format string. The domain bytes are skipped. Every
// entry is stored with TokenDatabase::kDateRemovedNever.
//
// Returns Status::DataLoss() if a header's magic value is wrong. A record
// whose string would extend past the end of the section is not added.
Result<Detokenizer> Detokenizer::FromElfSection(
    span<const std::byte> elf_section) {
  constexpr size_t kHeaderSize = sizeof(_pw_tokenizer_EntryHeader);

  std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> entries;
  size_t offset = 0;

  while (offset + kHeaderSize < elf_section.size()) {
    // Copy the header out rather than casting, so alignment does not matter.
    _pw_tokenizer_EntryHeader header;
    std::memcpy(&header, elf_section.data() + offset, kHeaderSize);
    offset += kHeaderSize;

    if (header.magic != _PW_TOKENIZER_ENTRY_MAGIC) {
      return Status::DataLoss();
    }

    // The domain is not used; step over it.
    offset += header.domain_length;

    if (offset + header.string_length > elf_section.size()) {
      // The string does not fit in the section; do not add this record.
      continue;
    }

    // TODO(b/326365218): Construct FormatString with string_view to avoid
    // creating a copy here.
    const std::string format_string(
        reinterpret_cast<const char*>(elf_section.data() + offset),
        header.string_length);
    offset += header.string_length;

    entries[header.token].emplace_back(format_string.c_str(),
                                       TokenDatabase::kDateRemovedNever);
  }

  return Detokenizer(std::move(entries));
}
236
237
// Detokenizes one binary-encoded message: a little-endian 32-bit token
// followed by the encoded arguments.
//
// Returns a default-constructed DetokenizedString when encoded is empty.
// Otherwise the result carries the token, the database entries found for it
// (none if the token is unknown), and the argument bytes after the token.
DetokenizedString Detokenizer::Detokenize(
    const span<const std::byte>& encoded) const {
  // The token is missing from the encoded data; there is nothing to do.
  if (encoded.empty()) {
    return DetokenizedString();
  }

  const uint32_t token = bytes::ReadInOrder<uint32_t>(
      endian::little, encoded.data(), encoded.size());

  // Candidate format strings for this token; stays empty for unknown tokens.
  span<const TokenizedStringEntry> candidates;
  if (const auto match = database_.find(token); match != database_.end()) {
    candidates = span(match->second);
  }

  // Argument bytes follow the token; there are none if the input is shorter
  // than a full token.
  span<const std::byte> arguments;
  if (encoded.size() >= sizeof(token)) {
    arguments = encoded.subspan(sizeof(token));
  }

  return DetokenizedString(token, candidates, arguments);
}
256
257
// Decodes a prefixed Base64 message to binary and detokenizes the result.
DetokenizedString Detokenizer::DetokenizeBase64Message(
    std::string_view text) const {
  // Decode in a private copy: the decoder overwrites its input, and the
  // decoded size it returns becomes the new length of the buffer.
  std::string decoded(text);
  const auto decoded_size = PrefixedBase64DecodeInPlace(decoded);
  decoded.resize(decoded_size);
  return Detokenize(decoded);
}
263
264
std::string Detokenizer::DetokenizeText(std::string_view text,
265
0
                                        const unsigned max_passes) const {
266
0
  NestedMessageDetokenizer detokenizer(*this);
267
0
  detokenizer.Detokenize(text);
268
269
0
  std::string result;
270
0
  unsigned pass = 1;
271
272
0
  while (true) {
273
0
    result = detokenizer.Flush();
274
0
    if (pass >= max_passes || !detokenizer.OutputChangedSinceLastCheck()) {
275
0
      break;
276
0
    }
277
0
    detokenizer.Detokenize(result);
278
0
    pass += 1;
279
0
  }
280
0
  return result;
281
0
}
282
283
std::string Detokenizer::DecodeOptionallyTokenizedData(
284
0
    const ConstByteSpan& optionally_tokenized_data) {
285
  // Try detokenizing as binary using the best result if available, else use
286
  // the input data as a string.
287
0
  const auto result = Detokenize(optionally_tokenized_data);
288
0
  const bool found_matches = !result.matches().empty();
289
  // Note: unlike pw_tokenizer.proto.decode_optionally_tokenized, this decoding
290
  // process does not encode and decode UTF8 format, it is sufficient to check
291
  // if the data is printable ASCII.
292
0
  const std::string data =
293
0
      found_matches
294
0
          ? result.BestString()
295
0
          : std::string(
296
0
                reinterpret_cast<const char*>(optionally_tokenized_data.data()),
297
0
                optionally_tokenized_data.size());
298
299
0
  const bool is_data_printable = IsPrintableAscii(data);
300
0
  if (!found_matches && !is_data_printable) {
301
    // Assume the token is unknown or the data is corrupt.
302
0
    std::vector<char> base64_encoding_buffer(
303
0
        Base64EncodedBufferSize(optionally_tokenized_data.size()));
304
0
    const size_t encoded_length = PrefixedBase64Encode(
305
0
        optionally_tokenized_data, span(base64_encoding_buffer));
306
0
    return std::string{base64_encoding_buffer.data(), encoded_length};
307
0
  }
308
309
  // Successfully detokenized, check if the field has more prefixed
310
  // base64-encoded tokens.
311
0
  const std::string field = DetokenizeText(data);
312
  // If anything detokenized successfully, use that.
313
0
  if (field != data) {
314
0
    return field;
315
0
  }
316
317
  // Attempt to determine whether this is an unknown token or plain text.
318
  // Any string with only printable or whitespace characters is plain text.
319
0
  if (found_matches || is_data_printable) {
320
0
    return data;
321
0
  }
322
323
  // Assume this field is tokenized data that could not be decoded.
324
0
  std::vector<char> base64_encoding_buffer(
325
0
      Base64EncodedBufferSize(optionally_tokenized_data.size()));
326
0
  const size_t encoded_length = PrefixedBase64Encode(
327
0
      optionally_tokenized_data, span(base64_encoding_buffer));
328
0
  return std::string{base64_encoding_buffer.data(), encoded_length};
329
0
}
330
331
}  // namespace pw::tokenizer