/proc/self/cwd/pw_tokenizer/detokenize.cc

Source (jump to first uncovered line)
// Copyright 2020 The Pigweed Authors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.

#include "pw_tokenizer/detokenize.h"

#include <algorithm>
#include <cstring>

#include "pw_bytes/bit.h"
#include "pw_bytes/endian.h"
#include "pw_tokenizer/base64.h"
#include "pw_tokenizer/internal/decode.h"
#include "pw_tokenizer/nested_tokenization.h"

namespace pw::tokenizer {
namespace {

// Streaming detokenizer for text that may contain nested tokenized messages.
//
// Characters are fed in one at a time or in chunks. Text outside of a message
// is copied to the output unchanged. A message begins at
// PW_TOKENIZER_NESTED_PREFIX and continues for as long as the characters that
// follow satisfy base64::IsValidChar. When a message ends it is handed to
// Detokenizer::DetokenizeBase64Message: if the result is ok(), its best string
// replaces the message in the output; otherwise the original text is kept.
//
// The Detokenizer is held by reference and must outlive this object.
class NestedMessageDetokenizer {
 public:
  // `explicit` keeps a Detokenizer from converting to this helper implicitly.
  explicit NestedMessageDetokenizer(const Detokenizer& detokenizer)
      : detokenizer_(detokenizer) {}

  // Processes every character of `chunk` in order.
  void Detokenize(std::string_view chunk) {
    for (char next_char : chunk) {
      Detokenize(next_char);
    }
  }

  // Advances the state machine by a single character.
  void Detokenize(char next_char) {
    switch (state_) {
      case kNonMessage:
        if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
          // The prefix is buffered too, so the original text can be emitted
          // verbatim if the message turns out not to decode.
          message_buffer_.push_back(next_char);
          state_ = kMessage;
        } else {
          output_.push_back(next_char);
        }
        break;
      case kMessage:
        if (base64::IsValidChar(next_char)) {
          message_buffer_.push_back(next_char);
        } else {
          // Any other character terminates the message being collected.
          HandleEndOfMessage();
          if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
            // A prefix immediately starts the next message; stay in kMessage.
            message_buffer_.push_back(next_char);
          } else {
            output_.push_back(next_char);
            state_ = kNonMessage;
          }
        }
        break;
    }
  }

  // Finishes any message that is still being collected and returns all of the
  // output accumulated so far. The object is reset to an empty, non-message
  // state afterwards.
  std::string Flush() {
    if (state_ == kMessage) {
      HandleEndOfMessage();
      state_ = kNonMessage;
    }
    std::string result = std::move(output_);
    // A moved-from std::string is valid but unspecified; clear it so that any
    // later use of this object starts from a known-empty output.
    output_.clear();
    return result;
  }

 private:
  // Attempts to decode the buffered message, appends either the decoded text
  // or the original message to the output, and empties the buffer.
  void HandleEndOfMessage() {
    if (auto result = detokenizer_.DetokenizeBase64Message(message_buffer_);
        result.ok()) {
      output_ += result.BestString();
    } else {
      output_ += message_buffer_;  // Keep the original if it doesn't decode.
    }
    message_buffer_.clear();
  }

  const Detokenizer& detokenizer_;
  std::string output_;          // Text produced so far.
  std::string message_buffer_;  // Prefix plus characters of the open message.

  enum { kNonMessage, kMessage } state_ = kNonMessage;
};

// Builds the error text reported for a token that has no database entry: the
// decoding-error prefix, "unknown token ", the token as eight lowercase
// hexadecimal digits, and the decoding-error suffix.
std::string UnknownTokenMessage(uint32_t value) {
  static constexpr char kHexDigits[] = "0123456789abcdef";

  std::string message(PW_TOKENIZER_ARG_DECODING_ERROR_PREFIX "unknown token ");

  // Emit the eight nibbles of the token, most significant first.
  for (int nibble = 7; nibble >= 0; --nibble) {
    message.push_back(kHexDigits[(value >> (nibble * 4)) & 0xF]);
  }

  message.append(PW_TOKENIZER_ARG_DECODING_ERROR_SUFFIX);
  return message;
}

// Decoding result with the date removed, for sorting.
using DecodingResult = std::pair<DecodedFormatString, uint32_t>;

// Determines if one result is better than the other if collisions occurred.
// Returns true if lhs is preferred over rhs. This logic should match the
// collision resolution logic in detokenize.py.
bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) {
  // Favor the result for which decoding succeeded.
  if (lhs.first.ok() != rhs.first.ok()) {
    return lhs.first.ok();
  }

  // Favor the result for which all bytes were decoded.
  if ((lhs.first.remaining_bytes() == 0u) !=
      (rhs.first.remaining_bytes() == 0u)) {
    return lhs.first.remaining_bytes() == 0u;
  }

  // Favor the result with fewer decoding errors.
  if (lhs.first.decoding_errors() != rhs.first.decoding_errors()) {
    return lhs.first.decoding_errors() < rhs.first.decoding_errors();
  }

  // Favor the result that successfully decoded the most arguments.
  if (lhs.first.argument_count() != rhs.first.argument_count()) {
    return lhs.first.argument_count() > rhs.first.argument_count();
  }

  // Favor the result that was removed from the database most recently.
  return lhs.second > rhs.second;
}

}  // namespace

// Decodes `arguments` with every candidate entry for `token` and stores the
// decoded strings in matches_, ordered from best to worst per IsBetterResult.
//
//   token:     the token that was read from the encoded message
//   entries:   all database entries registered for that token (may be empty)
//   arguments: the encoded argument bytes that followed the token
DetokenizedString::DetokenizedString(
    uint32_t token,
    const span<const TokenizedStringEntry>& entries,
    const span<const uint8_t>& arguments)
    : token_(token), has_token_(true) {
  std::vector<DecodingResult> results;
  // The number of results is known up front; avoid reallocating while filling.
  results.reserve(entries.size());

  for (const auto& [format, date_removed] : entries) {
    results.emplace_back(format.Format(arguments), date_removed);
  }

  // std::stable_sort keeps results that tie on every criterion in the order
  // their entries were provided, so the ranking is deterministic rather than
  // dependent on the standard library's std::sort implementation.
  std::stable_sort(results.begin(), results.end(), IsBetterResult);

  // Only the decoded strings are kept; the dates were needed just for ranking.
  for (auto& result : results) {
    matches_.push_back(std::move(result.first));
  }
}

// Returns the value of the first match, or an empty string when there are no
// matches at all.
std::string DetokenizedString::BestString() const {
  if (matches_.empty()) {
    return std::string();
  }
  return matches_[0].value();
}

// Like BestString(), but never silently returns nothing: the first match is
// returned with its errors included, and when there are no matches an error
// message is produced instead ("unknown token" if a token was present,
// "missing token" otherwise).
std::string DetokenizedString::BestStringWithErrors() const {
  if (!matches_.empty()) {
    return matches_[0].value_with_errors();
  }

  if (has_token_) {
    return UnknownTokenMessage(token_);
  }
  return PW_TOKENIZER_ARG_DECODING_ERROR("missing token");
}

// Builds the token lookup table from `database`. Each entry's string and
// date-removed value are appended to the list stored under its token, so
// entries whose tokens collide are all retained.
Detokenizer::Detokenizer(const TokenDatabase& database) {
  for (const auto& entry : database) {
    auto& entries_for_token = database_[entry.token];
    entries_for_token.emplace_back(entry.string, entry.date_removed);
  }
}

// Decodes a binary tokenized message: a little-endian token followed by its
// encoded arguments. Returns a default-constructed DetokenizedString when the
// input is empty.
DetokenizedString Detokenizer::Detokenize(
    const span<const uint8_t>& encoded) const {
  // Without any bytes there is no token to look up.
  if (encoded.empty()) {
    return DetokenizedString();
  }

  // Read the token; inputs shorter than a full token are read as-is, with the
  // read limited to the bytes that are present.
  const uint32_t token = bytes::ReadInOrder<uint32_t>(
      endian::little, encoded.data(), encoded.size());

  // Candidate entries for this token; left empty if the token is unknown.
  span<const TokenizedStringEntry> entries;
  if (const auto match = database_.find(token); match != database_.end()) {
    entries = span(match->second);
  }

  // Everything after the token is argument data. If the message is shorter
  // than a token, there are no arguments.
  span<const uint8_t> arguments;
  if (encoded.size() >= sizeof(token)) {
    arguments = encoded.subspan(sizeof(token));
  }

  return DetokenizedString(token, entries, arguments);
}

// Decodes a single prefixed Base64 message. The text is copied, Base64-decoded
// in place, trimmed to the decoded size, and then detokenized as binary data.
DetokenizedString Detokenizer::DetokenizeBase64Message(
    std::string_view text) const {
  std::string decoded(text);

  // The in-place decode reports how many bytes of the buffer are now valid.
  const auto decoded_size = PrefixedBase64DecodeInPlace(decoded);
  decoded.resize(decoded_size);

  return Detokenize(decoded);
}

std::string Detokenizer::DetokenizeBase64(std::string_view text) const {
  NestedMessageDetokenizer nested_detokenizer(*this);
  nested_detokenizer.Detokenize(text);
  return nested_detokenizer.Flush();
}

}  // namespace pw::tokenizer

Coverage Report

Created: 2023-12-16 06:39

Line	Count	Source (jump to first uncovered line)
1		// Copyright 2020 The Pigweed Authors
2		//
3		// Licensed under the Apache License, Version 2.0 (the "License"); you may not
4		// use this file except in compliance with the License. You may obtain a copy of
5		// the License at
6		//
7		// https://www.apache.org/licenses/LICENSE-2.0
8		//
9		// Unless required by applicable law or agreed to in writing, software
10		// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11		// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12		// License for the specific language governing permissions and limitations under
13		// the License.
14
15		#include "pw_tokenizer/detokenize.h"
16
17		#include <algorithm>
18		#include <cstring>
19
20		#include "pw_bytes/bit.h"
21		#include "pw_bytes/endian.h"
22		#include "pw_tokenizer/base64.h"
23		#include "pw_tokenizer/internal/decode.h"
24		#include "pw_tokenizer/nested_tokenization.h"
25
26		namespace pw::tokenizer {
27		namespace {
28
29		class NestedMessageDetokenizer {
30		public:
31		NestedMessageDetokenizer(const Detokenizer& detokenizer)
32	0	: detokenizer_(detokenizer) {}
33
34	0	void Detokenize(std::string_view chunk) {
35	0	for (char next_char : chunk) {
36	0	Detokenize(next_char);
37	0	}
38	0	}
39
40	0	void Detokenize(char next_char) {
41	0	switch (state_) {
42	0	case kNonMessage:
43	0	if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
44	0	message_buffer_.push_back(next_char);
45	0	state_ = kMessage;
46	0	} else {
47	0	output_.push_back(next_char);
48	0	}
49	0	break;
50	0	case kMessage:
51	0	if (base64::IsValidChar(next_char)) {
52	0	message_buffer_.push_back(next_char);
53	0	} else {
54	0	HandleEndOfMessage();
55	0	if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
56	0	message_buffer_.push_back(next_char);
57	0	} else {
58	0	output_.push_back(next_char);
59	0	state_ = kNonMessage;
60	0	}
61	0	}
62	0	break;
63	0	}
64	0	}
65
66	0	std::string Flush() {
67	0	if (state_ == kMessage) {
68	0	HandleEndOfMessage();
69	0	state_ = kNonMessage;
70	0	}
71	0	return std::move(output_);
72	0	}
73
74		private:
75	0	void HandleEndOfMessage() {
76	0	if (auto result = detokenizer_.DetokenizeBase64Message(message_buffer_);
77	0	result.ok()) {
78	0	output_ += result.BestString();
79	0	} else {
80	0	output_ += message_buffer_; // Keep the original if it doesn't decode.
81	0	}
82	0	message_buffer_.clear();
83	0	}
84
85		const Detokenizer& detokenizer_;
86		std::string output_;
87		std::string message_buffer_;
88
89		enum { kNonMessage, kMessage } state_ = kNonMessage;
90		};
91
92	0	std::string UnknownTokenMessage(uint32_t value) {
93	0	std::string output(PW_TOKENIZER_ARG_DECODING_ERROR_PREFIX "unknown token ");
94
95		// Output a hexadecimal version of the token.
96	0	for (int shift = 28; shift >= 0; shift -= 4) {
97	0	output.push_back("0123456789abcdef"[(value >> shift) & 0xF]);
98	0	}
99
100	0	output.append(PW_TOKENIZER_ARG_DECODING_ERROR_SUFFIX);
101	0	return output;
102	0	}
103
104		// Decoding result with the date removed, for sorting.
105		using DecodingResult = std::pair<DecodedFormatString, uint32_t>;
106
107		// Determines if one result is better than the other if collisions occurred.
108		// Returns true if lhs is preferred over rhs. This logic should match the
109		// collision resolution logic in detokenize.py.
110	0	bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) {
111		// Favor the result for which decoding succeeded.
112	0	if (lhs.first.ok() != rhs.first.ok()) {
113	0	return lhs.first.ok();
114	0	}
115
116		// Favor the result for which all bytes were decoded.
117	0	if ((lhs.first.remaining_bytes() == 0u) !=
118	0	(rhs.first.remaining_bytes() == 0u)) {
119	0	return lhs.first.remaining_bytes() == 0u;
120	0	}
121
122		// Favor the result with fewer decoding errors.
123	0	if (lhs.first.decoding_errors() != rhs.first.decoding_errors()) {
124	0	return lhs.first.decoding_errors() < rhs.first.decoding_errors();
125	0	}
126
127		// Favor the result that successfully decoded the most arguments.
128	0	if (lhs.first.argument_count() != rhs.first.argument_count()) {
129	0	return lhs.first.argument_count() > rhs.first.argument_count();
130	0	}
131
132		// Favor the result that was removed from the database most recently.
133	0	return lhs.second > rhs.second;
134	0	}
135
136		} // namespace
137
138		DetokenizedString::DetokenizedString(
139		uint32_t token,
140		const span<const TokenizedStringEntry>& entries,
141		const span<const uint8_t>& arguments)
142	10.7k	: token_(token), has_token_(true) {
143	10.7k	std::vector<DecodingResult> results;
144
145	10.7k	for (const auto& [format, date_removed] : entries) {
146	2.18k	results.push_back(DecodingResult{format.Format(arguments), date_removed});
147	2.18k	}
148
149	10.7k	std::sort(results.begin(), results.end(), IsBetterResult);
150
151	10.7k	for (auto& result : results) {
152	2.18k	matches_.push_back(std::move(result.first));
153	2.18k	}
154	10.7k	}
155
156	0	std::string DetokenizedString::BestString() const {
157	0	return matches_.empty() ? std::string() : matches_[0].value();
158	0	}
159
160	0	std::string DetokenizedString::BestStringWithErrors() const {
161	0	if (matches_.empty()) {
162	0	return has_token_ ? UnknownTokenMessage(token_)
163	0	: PW_TOKENIZER_ARG_DECODING_ERROR("missing token");
164	0	}
165	0	return matches_[0].value_with_errors();
166	0	}
167
168	1	Detokenizer::Detokenizer(const TokenDatabase& database) {
169	4	for (const auto& entry : database) {
170	4	database_[entry.token].emplace_back(entry.string, entry.date_removed);
171	4	}
172	1	}
173
174		DetokenizedString Detokenizer::Detokenize(
175	12.8k	const span<const uint8_t>& encoded) const {
176		// The token is missing from the encoded data; there is nothing to do.
177	12.8k	if (encoded.empty()) {
178	2.05k	return DetokenizedString();
179	2.05k	}
180
181	10.7k	uint32_t token = bytes::ReadInOrder<uint32_t>(
182	10.7k	endian::little, encoded.data(), encoded.size());
183
184	10.7k	const auto result = database_.find(token);
185
186	10.7k	return DetokenizedString(
187	10.7k	token,
188	10.7k	result == database_.end() ? span<TokenizedStringEntry>()
189	10.7k	: span(result->second),
190	10.7k	encoded.size() < sizeof(token) ? span<const uint8_t>()
191	10.7k	: encoded.subspan(sizeof(token)));
192	12.8k	}
193
194		DetokenizedString Detokenizer::DetokenizeBase64Message(
195	0	std::string_view text) const {
196	0	std::string buffer(text);
197	0	buffer.resize(PrefixedBase64DecodeInPlace(buffer));
198	0	return Detokenize(buffer);
199	0	}
200
201	0	std::string Detokenizer::DetokenizeBase64(std::string_view text) const {
202	0	NestedMessageDetokenizer nested_detokenizer(*this);
203	0	nested_detokenizer.Detokenize(text);
204	0	return nested_detokenizer.Flush();
205	0	}
206
207		} // namespace pw::tokenizer