Coverage Report

Created: 2023-12-16 06:39

/proc/self/cwd/pw_tokenizer/detokenize.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2020 The Pigweed Authors
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
4
// use this file except in compliance with the License. You may obtain a copy of
5
// the License at
6
//
7
//     https://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12
// License for the specific language governing permissions and limitations under
13
// the License.
14
15
#include "pw_tokenizer/detokenize.h"
16
17
#include <algorithm>
18
#include <cstring>
19
20
#include "pw_bytes/bit.h"
21
#include "pw_bytes/endian.h"
22
#include "pw_tokenizer/base64.h"
23
#include "pw_tokenizer/internal/decode.h"
24
#include "pw_tokenizer/nested_tokenization.h"
25
26
namespace pw::tokenizer {
27
namespace {
28
29
class NestedMessageDetokenizer {
30
 public:
31
  NestedMessageDetokenizer(const Detokenizer& detokenizer)
32
0
      : detokenizer_(detokenizer) {}
33
34
0
  void Detokenize(std::string_view chunk) {
35
0
    for (char next_char : chunk) {
36
0
      Detokenize(next_char);
37
0
    }
38
0
  }
39
40
0
  void Detokenize(char next_char) {
41
0
    switch (state_) {
42
0
      case kNonMessage:
43
0
        if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
44
0
          message_buffer_.push_back(next_char);
45
0
          state_ = kMessage;
46
0
        } else {
47
0
          output_.push_back(next_char);
48
0
        }
49
0
        break;
50
0
      case kMessage:
51
0
        if (base64::IsValidChar(next_char)) {
52
0
          message_buffer_.push_back(next_char);
53
0
        } else {
54
0
          HandleEndOfMessage();
55
0
          if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
56
0
            message_buffer_.push_back(next_char);
57
0
          } else {
58
0
            output_.push_back(next_char);
59
0
            state_ = kNonMessage;
60
0
          }
61
0
        }
62
0
        break;
63
0
    }
64
0
  }
65
66
0
  std::string Flush() {
67
0
    if (state_ == kMessage) {
68
0
      HandleEndOfMessage();
69
0
      state_ = kNonMessage;
70
0
    }
71
0
    return std::move(output_);
72
0
  }
73
74
 private:
75
0
  void HandleEndOfMessage() {
76
0
    if (auto result = detokenizer_.DetokenizeBase64Message(message_buffer_);
77
0
        result.ok()) {
78
0
      output_ += result.BestString();
79
0
    } else {
80
0
      output_ += message_buffer_;  // Keep the original if it doesn't decode.
81
0
    }
82
0
    message_buffer_.clear();
83
0
  }
84
85
  const Detokenizer& detokenizer_;
86
  std::string output_;
87
  std::string message_buffer_;
88
89
  enum { kNonMessage, kMessage } state_ = kNonMessage;
90
};
91
92
0
std::string UnknownTokenMessage(uint32_t value) {
93
0
  std::string output(PW_TOKENIZER_ARG_DECODING_ERROR_PREFIX "unknown token ");
94
95
  // Output a hexadecimal version of the token.
96
0
  for (int shift = 28; shift >= 0; shift -= 4) {
97
0
    output.push_back("0123456789abcdef"[(value >> shift) & 0xF]);
98
0
  }
99
100
0
  output.append(PW_TOKENIZER_ARG_DECODING_ERROR_SUFFIX);
101
0
  return output;
102
0
}
103
104
// Decoding result with the date removed, for sorting.
105
using DecodingResult = std::pair<DecodedFormatString, uint32_t>;
106
107
// Determines if one result is better than the other if collisions occurred.
108
// Returns true if lhs is preferred over rhs. This logic should match the
109
// collision resolution logic in detokenize.py.
110
0
bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) {
111
  // Favor the result for which decoding succeeded.
112
0
  if (lhs.first.ok() != rhs.first.ok()) {
113
0
    return lhs.first.ok();
114
0
  }
115
116
  // Favor the result for which all bytes were decoded.
117
0
  if ((lhs.first.remaining_bytes() == 0u) !=
118
0
      (rhs.first.remaining_bytes() == 0u)) {
119
0
    return lhs.first.remaining_bytes() == 0u;
120
0
  }
121
122
  // Favor the result with fewer decoding errors.
123
0
  if (lhs.first.decoding_errors() != rhs.first.decoding_errors()) {
124
0
    return lhs.first.decoding_errors() < rhs.first.decoding_errors();
125
0
  }
126
127
  // Favor the result that successfully decoded the most arguments.
128
0
  if (lhs.first.argument_count() != rhs.first.argument_count()) {
129
0
    return lhs.first.argument_count() > rhs.first.argument_count();
130
0
  }
131
132
  // Favor the result that was removed from the database most recently.
133
0
  return lhs.second > rhs.second;
134
0
}
135
136
}  // namespace
137
138
// Decodes `arguments` with every candidate format string in `entries` and
// stores the decoded strings ordered from most to least preferred, as ranked
// by IsBetterResult.
DetokenizedString::DetokenizedString(
    uint32_t token,
    const span<const TokenizedStringEntry>& entries,
    const span<const uint8_t>& arguments)
    : token_(token), has_token_(true) {
  std::vector<DecodingResult> results;
  results.reserve(entries.size());  // Exactly one result per candidate.

  for (const auto& [format, date_removed] : entries) {
    results.emplace_back(format.Format(arguments), date_removed);
  }

  // A stable sort keeps equally ranked candidates in their original `entries`
  // order, so the preferred match does not depend on how a particular standard
  // library implements std::sort.
  std::stable_sort(results.begin(), results.end(), IsBetterResult);

  for (auto& result : results) {
    matches_.push_back(std::move(result.first));
  }
}
155
156
0
std::string DetokenizedString::BestString() const {
157
0
  return matches_.empty() ? std::string() : matches_[0].value();
158
0
}
159
160
0
std::string DetokenizedString::BestStringWithErrors() const {
161
0
  if (matches_.empty()) {
162
0
    return has_token_ ? UnknownTokenMessage(token_)
163
0
                      : PW_TOKENIZER_ARG_DECODING_ERROR("missing token");
164
0
  }
165
0
  return matches_[0].value_with_errors();
166
0
}
167
168
1
// Indexes every database entry by token. Entries that share a token are
// collected in the same list, in database iteration order.
Detokenizer::Detokenizer(const TokenDatabase& database) {
  for (const auto& db_entry : database) {
    auto& candidates = database_[db_entry.token];
    candidates.emplace_back(db_entry.string, db_entry.date_removed);
  }
}
173
174
// Detokenizes a binary-encoded message: reads the token from the start of
// `encoded` as a little-endian integer, looks up the candidate format strings
// for it, and decodes the remaining bytes as the arguments.
DetokenizedString Detokenizer::Detokenize(
    const span<const uint8_t>& encoded) const {
  // The token is missing from the encoded data; there is nothing to do.
  if (encoded.empty()) {
    return DetokenizedString();
  }

  const uint32_t token = bytes::ReadInOrder<uint32_t>(
      endian::little, encoded.data(), encoded.size());

  // Candidate format strings for this token; left empty if the token is not
  // in the database.
  span<const TokenizedStringEntry> candidates;
  if (const auto match = database_.find(token); match != database_.end()) {
    candidates = span(match->second);
  }

  // Argument bytes follow the token. When the input is shorter than a full
  // token there are no argument bytes.
  span<const uint8_t> arguments;
  if (encoded.size() >= sizeof(token)) {
    arguments = encoded.subspan(sizeof(token));
  }

  return DetokenizedString(token, candidates, arguments);
}
193
194
// Decodes a single prefixed Base64 message and detokenizes the resulting
// binary data.
DetokenizedString Detokenizer::DetokenizeBase64Message(
    std::string_view text) const {
  // Decode in a private copy of the text, then shrink the copy to the number
  // of bytes the decoder reports.
  std::string decoded(text);
  const auto decoded_size = PrefixedBase64DecodeInPlace(decoded);
  decoded.resize(decoded_size);
  return Detokenize(decoded);
}
200
201
0
std::string Detokenizer::DetokenizeBase64(std::string_view text) const {
202
0
  NestedMessageDetokenizer nested_detokenizer(*this);
203
0
  nested_detokenizer.Detokenize(text);
204
0
  return nested_detokenizer.Flush();
205
0
}
206
207
}  // namespace pw::tokenizer