/proc/self/cwd/pw_tokenizer/detokenize.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2020 The Pigweed Authors |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
4 | | // use this file except in compliance with the License. You may obtain a copy of |
5 | | // the License at |
6 | | // |
7 | | // https://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
11 | | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
12 | | // License for the specific language governing permissions and limitations under |
13 | | // the License. |
14 | | |
15 | | #include "pw_tokenizer/detokenize.h" |
16 | | |
17 | | #include <algorithm> |
18 | | #include <cstring> |
19 | | |
20 | | #include "pw_bytes/bit.h" |
21 | | #include "pw_bytes/endian.h" |
22 | | #include "pw_tokenizer/base64.h" |
23 | | #include "pw_tokenizer/internal/decode.h" |
24 | | #include "pw_tokenizer/nested_tokenization.h" |
25 | | |
26 | | namespace pw::tokenizer { |
27 | | namespace { |
28 | | |
29 | | class NestedMessageDetokenizer { |
30 | | public: |
31 | | NestedMessageDetokenizer(const Detokenizer& detokenizer) |
32 | 0 | : detokenizer_(detokenizer) {} |
33 | | |
34 | 0 | void Detokenize(std::string_view chunk) { |
35 | 0 | for (char next_char : chunk) { |
36 | 0 | Detokenize(next_char); |
37 | 0 | } |
38 | 0 | } |
39 | | |
40 | 0 | void Detokenize(char next_char) { |
41 | 0 | switch (state_) { |
42 | 0 | case kNonMessage: |
43 | 0 | if (next_char == PW_TOKENIZER_NESTED_PREFIX) { |
44 | 0 | message_buffer_.push_back(next_char); |
45 | 0 | state_ = kMessage; |
46 | 0 | } else { |
47 | 0 | output_.push_back(next_char); |
48 | 0 | } |
49 | 0 | break; |
50 | 0 | case kMessage: |
51 | 0 | if (base64::IsValidChar(next_char)) { |
52 | 0 | message_buffer_.push_back(next_char); |
53 | 0 | } else { |
54 | 0 | HandleEndOfMessage(); |
55 | 0 | if (next_char == PW_TOKENIZER_NESTED_PREFIX) { |
56 | 0 | message_buffer_.push_back(next_char); |
57 | 0 | } else { |
58 | 0 | output_.push_back(next_char); |
59 | 0 | state_ = kNonMessage; |
60 | 0 | } |
61 | 0 | } |
62 | 0 | break; |
63 | 0 | } |
64 | 0 | } |
65 | | |
66 | 0 | std::string Flush() { |
67 | 0 | if (state_ == kMessage) { |
68 | 0 | HandleEndOfMessage(); |
69 | 0 | state_ = kNonMessage; |
70 | 0 | } |
71 | 0 | return std::move(output_); |
72 | 0 | } |
73 | | |
74 | | private: |
75 | 0 | void HandleEndOfMessage() { |
76 | 0 | if (auto result = detokenizer_.DetokenizeBase64Message(message_buffer_); |
77 | 0 | result.ok()) { |
78 | 0 | output_ += result.BestString(); |
79 | 0 | } else { |
80 | 0 | output_ += message_buffer_; // Keep the original if it doesn't decode. |
81 | 0 | } |
82 | 0 | message_buffer_.clear(); |
83 | 0 | } |
84 | | |
85 | | const Detokenizer& detokenizer_; |
86 | | std::string output_; |
87 | | std::string message_buffer_; |
88 | | |
89 | | enum { kNonMessage, kMessage } state_ = kNonMessage; |
90 | | }; |
91 | | |
92 | 0 | std::string UnknownTokenMessage(uint32_t value) { |
93 | 0 | std::string output(PW_TOKENIZER_ARG_DECODING_ERROR_PREFIX "unknown token "); |
94 | | |
95 | | // Output a hexadecimal version of the token. |
96 | 0 | for (int shift = 28; shift >= 0; shift -= 4) { |
97 | 0 | output.push_back("0123456789abcdef"[(value >> shift) & 0xF]); |
98 | 0 | } |
99 | |
|
100 | 0 | output.append(PW_TOKENIZER_ARG_DECODING_ERROR_SUFFIX); |
101 | 0 | return output; |
102 | 0 | } |
103 | | |
104 | | // Decoding result with the date removed, for sorting. |
105 | | using DecodingResult = std::pair<DecodedFormatString, uint32_t>; |
106 | | |
107 | | // Determines if one result is better than the other if collisions occurred. |
108 | | // Returns true if lhs is preferred over rhs. This logic should match the |
109 | | // collision resolution logic in detokenize.py. |
110 | 0 | bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) { |
111 | | // Favor the result for which decoding succeeded. |
112 | 0 | if (lhs.first.ok() != rhs.first.ok()) { |
113 | 0 | return lhs.first.ok(); |
114 | 0 | } |
115 | | |
116 | | // Favor the result for which all bytes were decoded. |
117 | 0 | if ((lhs.first.remaining_bytes() == 0u) != |
118 | 0 | (rhs.first.remaining_bytes() == 0u)) { |
119 | 0 | return lhs.first.remaining_bytes() == 0u; |
120 | 0 | } |
121 | | |
122 | | // Favor the result with fewer decoding errors. |
123 | 0 | if (lhs.first.decoding_errors() != rhs.first.decoding_errors()) { |
124 | 0 | return lhs.first.decoding_errors() < rhs.first.decoding_errors(); |
125 | 0 | } |
126 | | |
127 | | // Favor the result that successfully decoded the most arguments. |
128 | 0 | if (lhs.first.argument_count() != rhs.first.argument_count()) { |
129 | 0 | return lhs.first.argument_count() > rhs.first.argument_count(); |
130 | 0 | } |
131 | | |
132 | | // Favor the result that was removed from the database most recently. |
133 | 0 | return lhs.second > rhs.second; |
134 | 0 | } |
135 | | |
136 | | } // namespace |
137 | | |
138 | | DetokenizedString::DetokenizedString( |
139 | | uint32_t token, |
140 | | const span<const TokenizedStringEntry>& entries, |
141 | | const span<const uint8_t>& arguments) |
142 | 10.7k | : token_(token), has_token_(true) { |
143 | 10.7k | std::vector<DecodingResult> results; |
144 | | |
145 | 10.7k | for (const auto& [format, date_removed] : entries) { |
146 | 2.18k | results.push_back(DecodingResult{format.Format(arguments), date_removed}); |
147 | 2.18k | } |
148 | | |
149 | 10.7k | std::sort(results.begin(), results.end(), IsBetterResult); |
150 | | |
151 | 10.7k | for (auto& result : results) { |
152 | 2.18k | matches_.push_back(std::move(result.first)); |
153 | 2.18k | } |
154 | 10.7k | } |
155 | | |
156 | 0 | std::string DetokenizedString::BestString() const { |
157 | 0 | return matches_.empty() ? std::string() : matches_[0].value(); |
158 | 0 | } |
159 | | |
160 | 0 | std::string DetokenizedString::BestStringWithErrors() const { |
161 | 0 | if (matches_.empty()) { |
162 | 0 | return has_token_ ? UnknownTokenMessage(token_) |
163 | 0 | : PW_TOKENIZER_ARG_DECODING_ERROR("missing token"); |
164 | 0 | } |
165 | 0 | return matches_[0].value_with_errors(); |
166 | 0 | } |
167 | | |
168 | 1 | Detokenizer::Detokenizer(const TokenDatabase& database) { |
169 | 4 | for (const auto& entry : database) { |
170 | 4 | database_[entry.token].emplace_back(entry.string, entry.date_removed); |
171 | 4 | } |
172 | 1 | } |
173 | | |
174 | | DetokenizedString Detokenizer::Detokenize( |
175 | 12.8k | const span<const uint8_t>& encoded) const { |
176 | | // The token is missing from the encoded data; there is nothing to do. |
177 | 12.8k | if (encoded.empty()) { |
178 | 2.05k | return DetokenizedString(); |
179 | 2.05k | } |
180 | | |
181 | 10.7k | uint32_t token = bytes::ReadInOrder<uint32_t>( |
182 | 10.7k | endian::little, encoded.data(), encoded.size()); |
183 | | |
184 | 10.7k | const auto result = database_.find(token); |
185 | | |
186 | 10.7k | return DetokenizedString( |
187 | 10.7k | token, |
188 | 10.7k | result == database_.end() ? span<TokenizedStringEntry>() |
189 | 10.7k | : span(result->second), |
190 | 10.7k | encoded.size() < sizeof(token) ? span<const uint8_t>() |
191 | 10.7k | : encoded.subspan(sizeof(token))); |
192 | 12.8k | } |
193 | | |
194 | | DetokenizedString Detokenizer::DetokenizeBase64Message( |
195 | 0 | std::string_view text) const { |
196 | 0 | std::string buffer(text); |
197 | 0 | buffer.resize(PrefixedBase64DecodeInPlace(buffer)); |
198 | 0 | return Detokenize(buffer); |
199 | 0 | } |
200 | | |
201 | 0 | std::string Detokenizer::DetokenizeBase64(std::string_view text) const { |
202 | 0 | NestedMessageDetokenizer nested_detokenizer(*this); |
203 | 0 | nested_detokenizer.Detokenize(text); |
204 | 0 | return nested_detokenizer.Flush(); |
205 | 0 | } |
206 | | |
207 | | } // namespace pw::tokenizer |