pw_tokenizer/detokenize.cc

// Copyright 2020 The Pigweed Authors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.

#include "pw_tokenizer/detokenize.h"

#include <algorithm>
#include <cstring>

#include "pw_bytes/bit.h"
#include "pw_bytes/endian.h"
#include "pw_result/result.h"
#include "pw_tokenizer/base64.h"
#include "pw_tokenizer/internal/decode.h"
#include "pw_tokenizer/nested_tokenization.h"

namespace pw::tokenizer {
namespace {

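// Scans a character stream for nested tokenized messages: runs of Base64
// characters that start with PW_TOKENIZER_NESTED_PREFIX. Messages that decode
// successfully are replaced in the output; all other text passes through
// unchanged.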
class NestedMessageDetokenizer {
 public:
  NestedMessageDetokenizer(const Detokenizer& detokenizer)
      : detokenizer_(detokenizer) {}

  void Detokenize(std::string_view chunk) {
    for (char next_char : chunk) {
      Detokenize(next_char);
    }
  }

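  // Advances the two-state machine by one character: a prefix character
  // starts buffering a potential message, and the first non-Base64 character
  // ends it.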
  void Detokenize(char next_char) {
    switch (state_) {
      case kNonMessage:
        if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
          message_buffer_.push_back(next_char);
          state_ = kMessage;
        } else {
          output_.push_back(next_char);
        }
        break;
      case kMessage:
        if (base64::IsValidChar(next_char)) {
          message_buffer_.push_back(next_char);
        } else {
          HandleEndOfMessage();
          if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
            message_buffer_.push_back(next_char);
          } else {
            output_.push_back(next_char);
            state_ = kNonMessage;
          }
        }
        break;
    }
  }

  std::string Flush() {
    if (state_ == kMessage) {
      HandleEndOfMessage();
      state_ = kNonMessage;
    }
    return std::move(output_);
  }

 private:
  void HandleEndOfMessage() {
    if (auto result = detokenizer_.DetokenizeBase64Message(message_buffer_);
        result.ok()) {
      output_ += result.BestString();
    } else {
      output_ += message_buffer_;  // Keep the original if it doesn't decode.
    }
    message_buffer_.clear();
  }

  const Detokenizer& detokenizer_;
  std::string output_;
  std::string message_buffer_;

  enum { kNonMessage, kMessage } state_ = kNonMessage;
};
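
// Illustrative use of this helper (a sketch; in this file it is only reached
// through Detokenizer::DetokenizeBase64, and the Base64 payload below is
// hypothetical):
//
//   NestedMessageDetokenizer nested(detokenizer);
//   nested.Detokenize("Log: $pEVTYQ==");
//   std::string output = nested.Flush();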

std::string UnknownTokenMessage(uint32_t value) {
  std::string output(PW_TOKENIZER_ARG_DECODING_ERROR_PREFIX "unknown token ");

  // Output a hexadecimal version of the token.
  for (int shift = 28; shift >= 0; shift -= 4) {
    output.push_back("0123456789abcdef"[(value >> shift) & 0xF]);
  }

  output.append(PW_TOKENIZER_ARG_DECODING_ERROR_SUFFIX);
  return output;
}

// A decoding result paired with the entry's date_removed value, for sorting.
using DecodingResult = std::pair<DecodedFormatString, uint32_t>;

// Determines which result is better when a token collision occurs. Returns
// true if lhs is preferred over rhs. This logic should match the collision
// resolution logic in detokenize.py.
bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) {
  // Favor the result for which decoding succeeded.
  if (lhs.first.ok() != rhs.first.ok()) {
    return lhs.first.ok();
  }

  // Favor the result for which all bytes were decoded.
  if ((lhs.first.remaining_bytes() == 0u) !=
      (rhs.first.remaining_bytes() == 0u)) {
    return lhs.first.remaining_bytes() == 0u;
  }

  // Favor the result with fewer decoding errors.
  if (lhs.first.decoding_errors() != rhs.first.decoding_errors()) {
    return lhs.first.decoding_errors() < rhs.first.decoding_errors();
  }

  // Favor the result that successfully decoded the most arguments.
  if (lhs.first.argument_count() != rhs.first.argument_count()) {
    return lhs.first.argument_count() > rhs.first.argument_count();
  }

  // Favor the result that was removed from the database most recently.
  return lhs.second > rhs.second;
}

}  // namespace

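// Formats the arguments with every entry that matched the token, sorted so
// that the most likely interpretation comes first.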
DetokenizedString::DetokenizedString(
    uint32_t token,
    const span<const TokenizedStringEntry>& entries,
    const span<const uint8_t>& arguments)
    : token_(token), has_token_(true) {
  std::vector<DecodingResult> results;

  for (const auto& [format, date_removed] : entries) {
    results.push_back(DecodingResult{format.Format(arguments), date_removed});
  }

  std::sort(results.begin(), results.end(), IsBetterResult);

  for (auto& result : results) {
    matches_.push_back(std::move(result.first));
  }
}

std::string DetokenizedString::BestString() const {
  return matches_.empty() ? std::string() : matches_[0].value();
}

std::string DetokenizedString::BestStringWithErrors() const {
  if (matches_.empty()) {
    return has_token_ ? UnknownTokenMessage(token_)
                      : PW_TOKENIZER_ARG_DECODING_ERROR("missing token");
  }
  return matches_[0].value_with_errors();
}

Detokenizer::Detokenizer(const TokenDatabase& database) {
  for (const auto& entry : database) {
    database_[entry.token].emplace_back(entry.string, entry.date_removed);
  }
}

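// Reads tokenized string entries from a tokenizer ELF section. Each entry is
// a _pw_tokenizer_EntryHeader (magic, token, domain length, string length)
// followed by the domain and the format string.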
Result<Detokenizer> Detokenizer::FromElfSection(
    span<const uint8_t> elf_section) {
  size_t index = 0;
  std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database;

  while (index + sizeof(_pw_tokenizer_EntryHeader) < elf_section.size()) {
    _pw_tokenizer_EntryHeader header;
    std::memcpy(
        &header, elf_section.data() + index, sizeof(_pw_tokenizer_EntryHeader));
    index += sizeof(_pw_tokenizer_EntryHeader);

    if (header.magic != _PW_TOKENIZER_ENTRY_MAGIC) {
      return Status::DataLoss();
    }

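    // Skip the domain; the database is keyed by token alone.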
    index += header.domain_length;
    if (index + header.string_length <= elf_section.size()) {
      // TODO(b/326365218): Construct FormatString with string_view to avoid
      // creating a copy here.
      std::string entry(
          reinterpret_cast<const char*>(elf_section.data() + index),
          header.string_length);
      index += header.string_length;
      database[header.token].emplace_back(entry.c_str(),
                                          TokenDatabase::kDateRemovedNever);
    }
  }
  return Detokenizer(std::move(database));
}

DetokenizedString Detokenizer::Detokenize(
    const span<const uint8_t>& encoded) const {
  // The token is missing from the encoded data; there is nothing to do.
  if (encoded.empty()) {
    return DetokenizedString();
  }

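  // The token is the first four bytes of the message, in little-endian order.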
  uint32_t token = bytes::ReadInOrder<uint32_t>(
      endian::little, encoded.data(), encoded.size());

  const auto result = database_.find(token);

  return DetokenizedString(
      token,
      result == database_.end() ? span<TokenizedStringEntry>()
                                : span(result->second),
      encoded.size() < sizeof(token) ? span<const uint8_t>()
                                     : encoded.subspan(sizeof(token)));
}

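// Decodes one prefixed Base64 message in place, then detokenizes the
// resulting binary message.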
DetokenizedString Detokenizer::DetokenizeBase64Message(
    std::string_view text) const {
  std::string buffer(text);
  buffer.resize(PrefixedBase64DecodeInPlace(buffer));
  return Detokenize(buffer);
}

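// Detokenizes every prefixed Base64 message found in the text; unrecognized
// messages and all other characters are passed through unchanged.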
std::string Detokenizer::DetokenizeBase64(std::string_view text) const {
  NestedMessageDetokenizer nested_detokenizer(*this);
  nested_detokenizer.Detokenize(text);
  return nested_detokenizer.Flush();
}
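
// Example usage (a sketch; `token_database` and `log_line` are assumed to
// exist elsewhere):
//
//   pw::tokenizer::Detokenizer detokenizer(token_database);
//   std::string readable = detokenizer.DetokenizeBase64(log_line);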

}  // namespace pw::tokenizer