pw_tokenizer/detokenize.cc

// Copyright 2020 The Pigweed Authors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.

#include "pw_tokenizer/detokenize.h"

#include <algorithm>
#include <cstring>

#include "pw_bytes/bit.h"
#include "pw_bytes/endian.h"
#include "pw_result/result.h"
#include "pw_tokenizer/base64.h"
#include "pw_tokenizer/internal/decode.h"
#include "pw_tokenizer/nested_tokenization.h"

namespace pw::tokenizer {
namespace {

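// Scans a character stream for nested tokenized messages: runs of Base64
// characters that start with PW_TOKENIZER_NESTED_PREFIX. Messages that decode
// successfully are replaced in the output; all other text passes through
// unchanged.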
class NestedMessageDetokenizer {
 public:
  NestedMessageDetokenizer(const Detokenizer& detokenizer)
      : detokenizer_(detokenizer) {}

  void Detokenize(std::string_view chunk) {
    for (char next_char : chunk) {
      Detokenize(next_char);
    }
  }

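  // Advances the two-state machine by one character: a prefix character
  // starts buffering a potential message, and the first non-Base64 character
  // ends it.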
  void Detokenize(char next_char) {
    switch (state_) {
      case kNonMessage:
        if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
          message_buffer_.push_back(next_char);
          state_ = kMessage;
        } else {
          output_.push_back(next_char);
        }
        break;
      case kMessage:
        if (base64::IsValidChar(next_char)) {
          message_buffer_.push_back(next_char);
        } else {
          HandleEndOfMessage();
          if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
            message_buffer_.push_back(next_char);
          } else {
            output_.push_back(next_char);
            state_ = kNonMessage;
          }
        }
        break;
    }
  }

  std::string Flush() {
    if (state_ == kMessage) {
      HandleEndOfMessage();
      state_ = kNonMessage;
    }
    return std::move(output_);
  }

 private:
  void HandleEndOfMessage() {
    if (auto result = detokenizer_.DetokenizeBase64Message(message_buffer_);
        result.ok()) {
      output_ += result.BestString();
    } else {
      output_ += message_buffer_;  // Keep the original if it doesn't decode.
    }
    message_buffer_.clear();
  }

  const Detokenizer& detokenizer_;
  std::string output_;
  std::string message_buffer_;

  enum { kNonMessage, kMessage } state_ = kNonMessage;
};
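
// Illustrative use of this helper (a sketch; in this file it is only reached
// through Detokenizer::DetokenizeBase64, and the Base64 payload below is
// hypothetical):
//
//   NestedMessageDetokenizer nested(detokenizer);
//   nested.Detokenize("Log: $pEVTYQ==");
//   std::string output = nested.Flush();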

std::string UnknownTokenMessage(uint32_t value) {
  std::string output(PW_TOKENIZER_ARG_DECODING_ERROR_PREFIX "unknown token ");

  // Output a hexadecimal version of the token.
  for (int shift = 28; shift >= 0; shift -= 4) {
    output.push_back("0123456789abcdef"[(value >> shift) & 0xF]);
  }

  output.append(PW_TOKENIZER_ARG_DECODING_ERROR_SUFFIX);
  return output;
}

// A decoding result paired with the entry's date_removed value, for sorting.
using DecodingResult = std::pair<DecodedFormatString, uint32_t>;

// Determines which result is better when a token collision occurs. Returns
// true if lhs is preferred over rhs. This logic should match the collision
// resolution logic in detokenize.py.
bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) {
  // Favor the result for which decoding succeeded.
  if (lhs.first.ok() != rhs.first.ok()) {
    return lhs.first.ok();
  }

  // Favor the result for which all bytes were decoded.
  if ((lhs.first.remaining_bytes() == 0u) !=
      (rhs.first.remaining_bytes() == 0u)) {
    return lhs.first.remaining_bytes() == 0u;
  }

  // Favor the result with fewer decoding errors.
  if (lhs.first.decoding_errors() != rhs.first.decoding_errors()) {
    return lhs.first.decoding_errors() < rhs.first.decoding_errors();
  }

  // Favor the result that successfully decoded the most arguments.
  if (lhs.first.argument_count() != rhs.first.argument_count()) {
    return lhs.first.argument_count() > rhs.first.argument_count();
  }

  // Favor the result that was removed from the database most recently.
  return lhs.second > rhs.second;
}

}  // namespace

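// Formats the arguments with every entry that matched the token, sorted so
// that the most likely interpretation comes first.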
DetokenizedString::DetokenizedString(
    uint32_t token,
    const span<const TokenizedStringEntry>& entries,
    const span<const uint8_t>& arguments)
    : token_(token), has_token_(true) {
  std::vector<DecodingResult> results;

  for (const auto& [format, date_removed] : entries) {
    results.push_back(DecodingResult{format.Format(arguments), date_removed});
  }

  std::sort(results.begin(), results.end(), IsBetterResult);

  for (auto& result : results) {
    matches_.push_back(std::move(result.first));
  }
}

std::string DetokenizedString::BestString() const {
  return matches_.empty() ? std::string() : matches_[0].value();
}

std::string DetokenizedString::BestStringWithErrors() const {
  if (matches_.empty()) {
    return has_token_ ? UnknownTokenMessage(token_)
                      : PW_TOKENIZER_ARG_DECODING_ERROR("missing token");
  }
  return matches_[0].value_with_errors();
}

Detokenizer::Detokenizer(const TokenDatabase& database) {
  for (const auto& entry : database) {
    database_[entry.token].emplace_back(entry.string, entry.date_removed);
  }
}

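// Reads tokenized string entries from a tokenizer ELF section. Each entry is
// a _pw_tokenizer_EntryHeader (magic, token, domain length, string length)
// followed by the domain and the format string.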
Result<Detokenizer> Detokenizer::FromElfSection(
    span<const uint8_t> elf_section) {
  size_t index = 0;
  std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database;

  while (index + sizeof(_pw_tokenizer_EntryHeader) < elf_section.size()) {
    _pw_tokenizer_EntryHeader header;
    std::memcpy(
        &header, elf_section.data() + index, sizeof(_pw_tokenizer_EntryHeader));
    index += sizeof(_pw_tokenizer_EntryHeader);

    if (header.magic != _PW_TOKENIZER_ENTRY_MAGIC) {
      return Status::DataLoss();
    }

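    // Skip the domain; the database is keyed by token alone.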
    index += header.domain_length;
    if (index + header.string_length <= elf_section.size()) {
      // TODO(b/326365218): Construct FormatString with string_view to avoid
      // creating a copy here.
      std::string entry(
          reinterpret_cast<const char*>(elf_section.data() + index),
          header.string_length);
      index += header.string_length;
      database[header.token].emplace_back(entry.c_str(),
                                          TokenDatabase::kDateRemovedNever);
    }
  }
  return Detokenizer(std::move(database));
}

DetokenizedString Detokenizer::Detokenize(
    const span<const uint8_t>& encoded) const {
  // The token is missing from the encoded data; there is nothing to do.
  if (encoded.empty()) {
    return DetokenizedString();
  }

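  // The token is the first four bytes of the message, in little-endian order.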
  uint32_t token = bytes::ReadInOrder<uint32_t>(
      endian::little, encoded.data(), encoded.size());

  const auto result = database_.find(token);

  return DetokenizedString(
      token,
      result == database_.end() ? span<TokenizedStringEntry>()
                                : span(result->second),
      encoded.size() < sizeof(token) ? span<const uint8_t>()
                                     : encoded.subspan(sizeof(token)));
}

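// Decodes one prefixed Base64 message in place, then detokenizes the
// resulting binary message.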
DetokenizedString Detokenizer::DetokenizeBase64Message(
    std::string_view text) const {
  std::string buffer(text);
  buffer.resize(PrefixedBase64DecodeInPlace(buffer));
  return Detokenize(buffer);
}

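// Detokenizes every prefixed Base64 message found in the text; unrecognized
// messages and all other characters are passed through unchanged.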
std::string Detokenizer::DetokenizeBase64(std::string_view text) const {
  NestedMessageDetokenizer nested_detokenizer(*this);
  nested_detokenizer.Detokenize(text);
  return nested_detokenizer.Flush();
}
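
// Example usage (a sketch; `token_database` and `log_line` are assumed to
// exist elsewhere):
//
//   pw::tokenizer::Detokenizer detokenizer(token_database);
//   std::string readable = detokenizer.DetokenizeBase64(log_line);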

}  // namespace pw::tokenizer