/proc/self/cwd/pw_tokenizer/detokenize.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2020 The Pigweed Authors |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
4 | | // use this file except in compliance with the License. You may obtain a copy of |
5 | | // the License at |
6 | | // |
7 | | // https://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
11 | | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
12 | | // License for the specific language governing permissions and limitations under |
13 | | // the License. |
14 | | |
15 | | #include "pw_tokenizer/detokenize.h" |
16 | | |
17 | | #include <algorithm> |
18 | | #include <cctype> |
19 | | #include <cstring> |
20 | | #include <string_view> |
21 | | #include <vector> |
22 | | |
23 | | #include "pw_bytes/bit.h" |
24 | | #include "pw_bytes/endian.h" |
25 | | #include "pw_result/result.h" |
26 | | #include "pw_tokenizer/base64.h" |
27 | | #include "pw_tokenizer/internal/decode.h" |
28 | | #include "pw_tokenizer/nested_tokenization.h" |
29 | | |
30 | | namespace pw::tokenizer { |
31 | | namespace { |
32 | | |
33 | | class NestedMessageDetokenizer { |
34 | | public: |
35 | | NestedMessageDetokenizer(const Detokenizer& detokenizer) |
36 | 0 | : detokenizer_(detokenizer) {} |
37 | | |
38 | 0 | void Detokenize(std::string_view chunk) { |
39 | 0 | for (char next_char : chunk) { |
40 | 0 | Detokenize(next_char); |
41 | 0 | } |
42 | 0 | } |
43 | | |
44 | 0 | bool OutputChangedSinceLastCheck() { |
45 | 0 | const bool changed = output_changed_; |
46 | 0 | output_changed_ = false; |
47 | 0 | return changed; |
48 | 0 | } |
49 | | |
50 | 0 | void Detokenize(char next_char) { |
51 | 0 | switch (state_) { |
52 | 0 | case kNonMessage: |
53 | 0 | if (next_char == PW_TOKENIZER_NESTED_PREFIX) { |
54 | 0 | message_buffer_.push_back(next_char); |
55 | 0 | state_ = kMessage; |
56 | 0 | } else { |
57 | 0 | output_.push_back(next_char); |
58 | 0 | } |
59 | 0 | break; |
60 | 0 | case kMessage: |
61 | 0 | if (base64::IsValidChar(next_char)) { |
62 | 0 | message_buffer_.push_back(next_char); |
63 | 0 | } else { |
64 | 0 | HandleEndOfMessage(); |
65 | 0 | if (next_char == PW_TOKENIZER_NESTED_PREFIX) { |
66 | 0 | message_buffer_.push_back(next_char); |
67 | 0 | } else { |
68 | 0 | output_.push_back(next_char); |
69 | 0 | state_ = kNonMessage; |
70 | 0 | } |
71 | 0 | } |
72 | 0 | break; |
73 | 0 | } |
74 | 0 | } |
75 | | |
76 | 0 | std::string Flush() { |
77 | 0 | if (state_ == kMessage) { |
78 | 0 | HandleEndOfMessage(); |
79 | 0 | state_ = kNonMessage; |
80 | 0 | } |
81 | 0 | std::string output(std::move(output_)); |
82 | 0 | output_.clear(); |
83 | 0 | return output; |
84 | 0 | } |
85 | | |
86 | | private: |
87 | 0 | void HandleEndOfMessage() { |
88 | 0 | if (auto result = detokenizer_.DetokenizeBase64Message(message_buffer_); |
89 | 0 | result.ok()) { |
90 | 0 | output_ += result.BestString(); |
91 | 0 | output_changed_ = true; |
92 | 0 | } else { |
93 | 0 | output_ += message_buffer_; // Keep the original if it doesn't decode. |
94 | 0 | } |
95 | 0 | message_buffer_.clear(); |
96 | 0 | } |
97 | | |
98 | | const Detokenizer& detokenizer_; |
99 | | std::string output_; |
100 | | std::string message_buffer_; |
101 | | |
102 | | enum : uint8_t { kNonMessage, kMessage } state_ = kNonMessage; |
103 | | bool output_changed_ = false; |
104 | | }; |
105 | | |
106 | 0 | std::string UnknownTokenMessage(uint32_t value) { |
107 | 0 | std::string output(PW_TOKENIZER_ARG_DECODING_ERROR_PREFIX "unknown token "); |
108 | | |
109 | | // Output a hexadecimal version of the token. |
110 | 0 | for (int shift = 28; shift >= 0; shift -= 4) { |
111 | 0 | output.push_back("0123456789abcdef"[(value >> shift) & 0xF]); |
112 | 0 | } |
113 | |
|
114 | 0 | output.append(PW_TOKENIZER_ARG_DECODING_ERROR_SUFFIX); |
115 | 0 | return output; |
116 | 0 | } |
117 | | |
118 | | // Decoding result with the date removed, for sorting. |
119 | | using DecodingResult = std::pair<DecodedFormatString, uint32_t>; |
120 | | |
121 | | // Determines if one result is better than the other if collisions occurred. |
122 | | // Returns true if lhs is preferred over rhs. This logic should match the |
123 | | // collision resolution logic in detokenize.py. |
124 | 0 | bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) { |
125 | | // Favor the result for which decoding succeeded. |
126 | 0 | if (lhs.first.ok() != rhs.first.ok()) { |
127 | 0 | return lhs.first.ok(); |
128 | 0 | } |
129 | | |
130 | | // Favor the result for which all bytes were decoded. |
131 | 0 | if ((lhs.first.remaining_bytes() == 0u) != |
132 | 0 | (rhs.first.remaining_bytes() == 0u)) { |
133 | 0 | return lhs.first.remaining_bytes() == 0u; |
134 | 0 | } |
135 | | |
136 | | // Favor the result with fewer decoding errors. |
137 | 0 | if (lhs.first.decoding_errors() != rhs.first.decoding_errors()) { |
138 | 0 | return lhs.first.decoding_errors() < rhs.first.decoding_errors(); |
139 | 0 | } |
140 | | |
141 | | // Favor the result that successfully decoded the most arguments. |
142 | 0 | if (lhs.first.argument_count() != rhs.first.argument_count()) { |
143 | 0 | return lhs.first.argument_count() > rhs.first.argument_count(); |
144 | 0 | } |
145 | | |
146 | | // Favor the result that was removed from the database most recently. |
147 | 0 | return lhs.second > rhs.second; |
148 | 0 | } |
149 | | |
// Returns true if all characters in data are printable or whitespace, or if
// the string is empty.
constexpr bool IsPrintableAscii(std::string_view data) {
  // This follows the logic in pw_tokenizer.decode_optionally_tokenized below:
  //
  //   if ''.join(text.split()).isprintable():
  //     return text
  //
  for (char letter : data) {
    // Convert to unsigned char before calling the <cctype> classification
    // functions: passing a negative value (possible for bytes >= 0x80 when
    // char is signed) other than EOF is undefined behavior per the C standard.
    const unsigned char c = static_cast<unsigned char>(letter);
    if (std::isprint(c) == 0 && std::isspace(c) == 0) {
      return false;
    }
  }
  return true;
}
165 | | |
166 | | } // namespace |
167 | | |
168 | | DetokenizedString::DetokenizedString( |
169 | | uint32_t token, |
170 | | const span<const TokenizedStringEntry>& entries, |
171 | | const span<const std::byte>& arguments) |
172 | 13.9k | : token_(token), has_token_(true) { |
173 | 13.9k | std::vector<DecodingResult> results; |
174 | | |
175 | 13.9k | for (const auto& [format, date_removed] : entries) { |
176 | 1.87k | results.push_back(DecodingResult{ |
177 | 1.87k | format.Format(span(reinterpret_cast<const uint8_t*>(arguments.data()), |
178 | 1.87k | arguments.size())), |
179 | 1.87k | date_removed}); |
180 | 1.87k | } |
181 | | |
182 | 13.9k | std::sort(results.begin(), results.end(), IsBetterResult); |
183 | | |
184 | 13.9k | for (auto& result : results) { |
185 | 1.87k | matches_.push_back(std::move(result.first)); |
186 | 1.87k | } |
187 | 13.9k | } |
188 | | |
189 | 0 | std::string DetokenizedString::BestString() const { |
190 | 0 | return matches_.empty() ? std::string() : matches_[0].value(); |
191 | 0 | } |
192 | | |
193 | 0 | std::string DetokenizedString::BestStringWithErrors() const { |
194 | 0 | if (matches_.empty()) { |
195 | 0 | return has_token_ ? UnknownTokenMessage(token_) |
196 | 0 | : PW_TOKENIZER_ARG_DECODING_ERROR("missing token"); |
197 | 0 | } |
198 | 0 | return matches_[0].value_with_errors(); |
199 | 0 | } |
200 | | |
201 | 1 | Detokenizer::Detokenizer(const TokenDatabase& database) { |
202 | 4 | for (const auto& entry : database) { |
203 | 4 | database_[entry.token].emplace_back(entry.string, entry.date_removed); |
204 | 4 | } |
205 | 1 | } |
206 | | |
// Builds a Detokenizer by parsing the raw bytes of the ELF section that
// stores tokenized string entries. Each entry is a fixed-size header followed
// by a domain string and a format string, with lengths given in the header.
// Returns DataLoss if an entry's magic number does not match.
Result<Detokenizer> Detokenizer::FromElfSection(
    span<const std::byte> elf_section) {
  size_t index = 0;
  std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database;

  while (index + sizeof(_pw_tokenizer_EntryHeader) < elf_section.size()) {
    _pw_tokenizer_EntryHeader header;
    // memcpy instead of a cast: the section data may not be suitably aligned.
    std::memcpy(
        &header, elf_section.data() + index, sizeof(_pw_tokenizer_EntryHeader));
    index += sizeof(_pw_tokenizer_EntryHeader);

    // A mismatched magic number means the section is corrupt or misaligned.
    if (header.magic != _PW_TOKENIZER_ENTRY_MAGIC) {
      return Status::DataLoss();
    }

    // The domain string is skipped; only the format string is stored.
    index += header.domain_length;
    if (index + header.string_length <= elf_section.size()) {
      // TODO(b/326365218): Construct FormatString with string_view to avoid
      // creating a copy here.
      std::string entry(
          reinterpret_cast<const char*>(elf_section.data() + index),
          header.string_length);
      index += header.string_length;
      database[header.token].emplace_back(entry.c_str(),
                                          TokenDatabase::kDateRemovedNever);
    }
    // NOTE(review): if the string does not fit, the entry is silently dropped
    // and index is left past the domain only — presumably the loop condition
    // then terminates on a truncated section; confirm this is intended.
  }
  return Detokenizer(std::move(database));
}
236 | | |
237 | | DetokenizedString Detokenizer::Detokenize( |
238 | 16.7k | const span<const std::byte>& encoded) const { |
239 | | // The token is missing from the encoded data; there is nothing to do. |
240 | 16.7k | if (encoded.empty()) { |
241 | 2.79k | return DetokenizedString(); |
242 | 2.79k | } |
243 | | |
244 | 13.9k | uint32_t token = bytes::ReadInOrder<uint32_t>( |
245 | 13.9k | endian::little, encoded.data(), encoded.size()); |
246 | | |
247 | 13.9k | const auto result = database_.find(token); |
248 | | |
249 | 13.9k | return DetokenizedString( |
250 | 13.9k | token, |
251 | 13.9k | result == database_.end() ? span<TokenizedStringEntry>() |
252 | 13.9k | : span(result->second), |
253 | 13.9k | encoded.size() < sizeof(token) ? span<const std::byte>() |
254 | 13.9k | : encoded.subspan(sizeof(token))); |
255 | 16.7k | } |
256 | | |
257 | | DetokenizedString Detokenizer::DetokenizeBase64Message( |
258 | 0 | std::string_view text) const { |
259 | 0 | std::string buffer(text); |
260 | 0 | buffer.resize(PrefixedBase64DecodeInPlace(buffer)); |
261 | 0 | return Detokenize(buffer); |
262 | 0 | } |
263 | | |
264 | | std::string Detokenizer::DetokenizeText(std::string_view text, |
265 | 0 | const unsigned max_passes) const { |
266 | 0 | NestedMessageDetokenizer detokenizer(*this); |
267 | 0 | detokenizer.Detokenize(text); |
268 | |
|
269 | 0 | std::string result; |
270 | 0 | unsigned pass = 1; |
271 | |
|
272 | 0 | while (true) { |
273 | 0 | result = detokenizer.Flush(); |
274 | 0 | if (pass >= max_passes || !detokenizer.OutputChangedSinceLastCheck()) { |
275 | 0 | break; |
276 | 0 | } |
277 | 0 | detokenizer.Detokenize(result); |
278 | 0 | pass += 1; |
279 | 0 | } |
280 | 0 | return result; |
281 | 0 | } |
282 | | |
283 | | std::string Detokenizer::DecodeOptionallyTokenizedData( |
284 | 0 | const ConstByteSpan& optionally_tokenized_data) { |
285 | | // Try detokenizing as binary using the best result if available, else use |
286 | | // the input data as a string. |
287 | 0 | const auto result = Detokenize(optionally_tokenized_data); |
288 | 0 | const bool found_matches = !result.matches().empty(); |
289 | | // Note: unlike pw_tokenizer.proto.decode_optionally_tokenized, this decoding |
290 | | // process does not encode and decode UTF8 format, it is sufficient to check |
291 | | // if the data is printable ASCII. |
292 | 0 | const std::string data = |
293 | 0 | found_matches |
294 | 0 | ? result.BestString() |
295 | 0 | : std::string( |
296 | 0 | reinterpret_cast<const char*>(optionally_tokenized_data.data()), |
297 | 0 | optionally_tokenized_data.size()); |
298 | |
|
299 | 0 | const bool is_data_printable = IsPrintableAscii(data); |
300 | 0 | if (!found_matches && !is_data_printable) { |
301 | | // Assume the token is unknown or the data is corrupt. |
302 | 0 | std::vector<char> base64_encoding_buffer( |
303 | 0 | Base64EncodedBufferSize(optionally_tokenized_data.size())); |
304 | 0 | const size_t encoded_length = PrefixedBase64Encode( |
305 | 0 | optionally_tokenized_data, span(base64_encoding_buffer)); |
306 | 0 | return std::string{base64_encoding_buffer.data(), encoded_length}; |
307 | 0 | } |
308 | | |
309 | | // Successfully detokenized, check if the field has more prefixed |
310 | | // base64-encoded tokens. |
311 | 0 | const std::string field = DetokenizeText(data); |
312 | | // If anything detokenized successfully, use that. |
313 | 0 | if (field != data) { |
314 | 0 | return field; |
315 | 0 | } |
316 | | |
317 | | // Attempt to determine whether this is an unknown token or plain text. |
318 | | // Any string with only printable or whitespace characters is plain text. |
319 | 0 | if (found_matches || is_data_printable) { |
320 | 0 | return data; |
321 | 0 | } |
322 | | |
323 | | // Assume this field is tokenized data that could not be decoded. |
324 | 0 | std::vector<char> base64_encoding_buffer( |
325 | 0 | Base64EncodedBufferSize(optionally_tokenized_data.size())); |
326 | 0 | const size_t encoded_length = PrefixedBase64Encode( |
327 | 0 | optionally_tokenized_data, span(base64_encoding_buffer)); |
328 | 0 | return std::string{base64_encoding_buffer.data(), encoded_length}; |
329 | 0 | } |
330 | | |
331 | | } // namespace pw::tokenizer |