Coverage Report

Created: 2024-04-15 06:29

/proc/self/cwd/pw_tokenizer/detokenize.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2020 The Pigweed Authors
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
4
// use this file except in compliance with the License. You may obtain a copy of
5
// the License at
6
//
7
//     https://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12
// License for the specific language governing permissions and limitations under
13
// the License.
14
15
#include "pw_tokenizer/detokenize.h"
16
17
#include <algorithm>
18
#include <cstring>
19
20
#include "pw_bytes/bit.h"
21
#include "pw_bytes/endian.h"
22
#include "pw_result/result.h"
23
#include "pw_tokenizer/base64.h"
24
#include "pw_tokenizer/internal/decode.h"
25
#include "pw_tokenizer/nested_tokenization.h"
26
27
namespace pw::tokenizer {
28
namespace {
29
30
class NestedMessageDetokenizer {
31
 public:
32
  NestedMessageDetokenizer(const Detokenizer& detokenizer)
33
0
      : detokenizer_(detokenizer) {}
34
35
0
  void Detokenize(std::string_view chunk) {
36
0
    for (char next_char : chunk) {
37
0
      Detokenize(next_char);
38
0
    }
39
0
  }
40
41
0
  void Detokenize(char next_char) {
42
0
    switch (state_) {
43
0
      case kNonMessage:
44
0
        if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
45
0
          message_buffer_.push_back(next_char);
46
0
          state_ = kMessage;
47
0
        } else {
48
0
          output_.push_back(next_char);
49
0
        }
50
0
        break;
51
0
      case kMessage:
52
0
        if (base64::IsValidChar(next_char)) {
53
0
          message_buffer_.push_back(next_char);
54
0
        } else {
55
0
          HandleEndOfMessage();
56
0
          if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
57
0
            message_buffer_.push_back(next_char);
58
0
          } else {
59
0
            output_.push_back(next_char);
60
0
            state_ = kNonMessage;
61
0
          }
62
0
        }
63
0
        break;
64
0
    }
65
0
  }
66
67
0
  std::string Flush() {
68
0
    if (state_ == kMessage) {
69
0
      HandleEndOfMessage();
70
0
      state_ = kNonMessage;
71
0
    }
72
0
    return std::move(output_);
73
0
  }
74
75
 private:
76
0
  void HandleEndOfMessage() {
77
0
    if (auto result = detokenizer_.DetokenizeBase64Message(message_buffer_);
78
0
        result.ok()) {
79
0
      output_ += result.BestString();
80
0
    } else {
81
0
      output_ += message_buffer_;  // Keep the original if it doesn't decode.
82
0
    }
83
0
    message_buffer_.clear();
84
0
  }
85
86
  const Detokenizer& detokenizer_;
87
  std::string output_;
88
  std::string message_buffer_;
89
90
  enum { kNonMessage, kMessage } state_ = kNonMessage;
91
};
92
93
0
std::string UnknownTokenMessage(uint32_t value) {
94
0
  std::string output(PW_TOKENIZER_ARG_DECODING_ERROR_PREFIX "unknown token ");
95
96
  // Output a hexadecimal version of the token.
97
0
  for (int shift = 28; shift >= 0; shift -= 4) {
98
0
    output.push_back("0123456789abcdef"[(value >> shift) & 0xF]);
99
0
  }
100
101
0
  output.append(PW_TOKENIZER_ARG_DECODING_ERROR_SUFFIX);
102
0
  return output;
103
0
}
104
105
// Decoding result paired with its entry's date-removed value, for sorting.
106
using DecodingResult = std::pair<DecodedFormatString, uint32_t>;
107
108
// Determines if one result is better than the other if collisions occurred.
109
// Returns true if lhs is preferred over rhs. This logic should match the
110
// collision resolution logic in detokenize.py.
111
0
bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) {
112
  // Favor the result for which decoding succeeded.
113
0
  if (lhs.first.ok() != rhs.first.ok()) {
114
0
    return lhs.first.ok();
115
0
  }
116
117
  // Favor the result for which all bytes were decoded.
118
0
  if ((lhs.first.remaining_bytes() == 0u) !=
119
0
      (rhs.first.remaining_bytes() == 0u)) {
120
0
    return lhs.first.remaining_bytes() == 0u;
121
0
  }
122
123
  // Favor the result with fewer decoding errors.
124
0
  if (lhs.first.decoding_errors() != rhs.first.decoding_errors()) {
125
0
    return lhs.first.decoding_errors() < rhs.first.decoding_errors();
126
0
  }
127
128
  // Favor the result that successfully decoded the most arguments.
129
0
  if (lhs.first.argument_count() != rhs.first.argument_count()) {
130
0
    return lhs.first.argument_count() > rhs.first.argument_count();
131
0
  }
132
133
  // Favor the result that was removed from the database most recently.
134
0
  return lhs.second > rhs.second;
135
0
}
136
137
}  // namespace
138
139
// Decodes `arguments` against every candidate entry for `token` and stores
// the decoded strings in matches_, ordered from most to least preferred
// according to IsBetterResult.
//
//   token:     the token these entries were looked up with; recorded so an
//              "unknown token" message can be produced when there are no
//              matches.
//   entries:   candidate (format string, date removed) pairs for the token;
//              may be empty.
//   arguments: encoded argument bytes passed to each format string.
DetokenizedString::DetokenizedString(
    uint32_t token,
    const span<const TokenizedStringEntry>& entries,
    const span<const uint8_t>& arguments)
    : token_(token), has_token_(true) {
  std::vector<DecodingResult> results;
  // The final size is known up front; reserve to avoid repeated reallocation.
  results.reserve(entries.size());

  for (const auto& [format, date_removed] : entries) {
    results.emplace_back(format.Format(arguments), date_removed);
  }

  std::sort(results.begin(), results.end(), IsBetterResult);

  // Only the decoded strings are kept; the dates were needed just for sorting.
  for (auto& result : results) {
    matches_.push_back(std::move(result.first));
  }
}
156
157
0
std::string DetokenizedString::BestString() const {
158
0
  return matches_.empty() ? std::string() : matches_[0].value();
159
0
}
160
161
0
std::string DetokenizedString::BestStringWithErrors() const {
162
0
  if (matches_.empty()) {
163
0
    return has_token_ ? UnknownTokenMessage(token_)
164
0
                      : PW_TOKENIZER_ARG_DECODING_ERROR("missing token");
165
0
  }
166
0
  return matches_[0].value_with_errors();
167
0
}
168
169
1
// Builds the token lookup table from `database`: each entry's string and
// date-removed value are appended to the list stored under the entry's token,
// so colliding tokens keep all of their candidates.
Detokenizer::Detokenizer(const TokenDatabase& database) {
  for (const auto& db_entry : database) {
    // operator[] creates the candidate list the first time a token is seen.
    auto& candidates = database_[db_entry.token];
    candidates.emplace_back(db_entry.string, db_entry.date_removed);
  }
}
174
175
// Builds a Detokenizer from the raw bytes of an ELF section containing
// tokenized string entries.
//
// Each entry is read as a _pw_tokenizer_EntryHeader, followed by
// `domain_length` bytes (skipped here) and `string_length` bytes of string
// data. Every string is stored under its token with
// TokenDatabase::kDateRemovedNever.
//
// Returns Status::DataLoss() if a header's magic value does not match
// _PW_TOKENIZER_ENTRY_MAGIC, or if an entry's declared domain/string lengths
// extend past the end of the section. Trailing bytes too short to hold a
// header plus at least one byte of payload are ignored.
Result<Detokenizer> Detokenizer::FromElfSection(
    span<const uint8_t> elf_section) {
  size_t index = 0;
  std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database;

  // Invariant: index <= elf_section.size(), so the subtraction cannot wrap.
  while (elf_section.size() - index > sizeof(_pw_tokenizer_EntryHeader)) {
    _pw_tokenizer_EntryHeader header;
    std::memcpy(
        &header, elf_section.data() + index, sizeof(_pw_tokenizer_EntryHeader));
    index += sizeof(_pw_tokenizer_EntryHeader);

    if (header.magic != _PW_TOKENIZER_ENTRY_MAGIC) {
      return Status::DataLoss();
    }

    // Validate both lengths against the bytes that remain, using subtraction
    // rather than `index + length` so the check cannot overflow. A truncated
    // entry is reported instead of being skipped, since continuing would
    // reinterpret the entry's own bytes as the next header.
    const size_t remaining = elf_section.size() - index;
    if (header.domain_length > remaining ||
        header.string_length > remaining - header.domain_length) {
      return Status::DataLoss();
    }

    index += header.domain_length;

    // TODO(b/326365218): Construct FormatString with string_view to avoid
    // creating a copy here.
    std::string entry(
        reinterpret_cast<const char*>(elf_section.data() + index),
        header.string_length);
    index += header.string_length;

    // Passed as a C string, so the entry ends at its first NUL byte, if any.
    database[header.token].emplace_back(entry.c_str(),
                                        TokenDatabase::kDateRemovedNever);
  }
  return Detokenizer(std::move(database));
}
204
205
// Detokenizes one encoded message: reads the token from the start of
// `encoded`, looks up its candidate entries, and decodes the bytes after the
// token as arguments.
//
// Returns a default-constructed DetokenizedString when `encoded` is empty.
// When the token is not in the database the result has no candidate entries.
DetokenizedString Detokenizer::Detokenize(
    const span<const uint8_t>& encoded) const {
  // The token is missing from the encoded data; there is nothing to do.
  if (encoded.empty()) {
    return DetokenizedString();
  }

  // Read the token as little-endian from however many bytes are available.
  const uint32_t token = bytes::ReadInOrder<uint32_t>(
      endian::little, encoded.data(), encoded.size());

  // Candidate entries for this token; left empty when the token is unknown.
  span<const TokenizedStringEntry> candidates;
  if (const auto found = database_.find(token); found != database_.end()) {
    candidates = span(found->second);
  }

  // Argument bytes follow the token; there are none if the input is shorter
  // than a full token.
  span<const uint8_t> arguments;
  if (encoded.size() >= sizeof(token)) {
    arguments = encoded.subspan(sizeof(token));
  }

  return DetokenizedString(token, candidates, arguments);
}
224
225
// Decodes a single prefixed Base64 message and detokenizes the resulting
// bytes. `text` is copied so the Base64 decode can run in place.
DetokenizedString Detokenizer::DetokenizeBase64Message(
    std::string_view text) const {
  std::string decoded(text);

  // The in-place decode reports the decoded size; shrink the copy to it.
  const auto decoded_size = PrefixedBase64DecodeInPlace(decoded);
  decoded.resize(decoded_size);

  return Detokenize(decoded);
}
231
232
0
std::string Detokenizer::DetokenizeBase64(std::string_view text) const {
233
0
  NestedMessageDetokenizer nested_detokenizer(*this);
234
0
  nested_detokenizer.Detokenize(text);
235
0
  return nested_detokenizer.Flush();
236
0
}
237
238
}  // namespace pw::tokenizer