/src/abseil-cpp/absl/debugging/internal/decode_rust_punycode.cc

Source (jump to first uncovered line)
// Copyright 2024 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "absl/debugging/internal/decode_rust_punycode.h"

#include <cstddef>
#include <cstdint>
#include <cstring>

#include "absl/base/config.h"
#include "absl/base/nullability.h"
#include "absl/debugging/internal/bounded_utf8_length_sequence.h"
#include "absl/debugging/internal/utf8_for_code_point.h"

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace debugging_internal {

namespace {

// Decoding Punycode requires repeated random-access insertion into a stream of
// variable-length UTF-8 code-point encodings.  We need this to be tolerably
// fast (no N^2 slowdown for unfortunate inputs), and we can't allocate any data
// structures on the heap (async-signal-safety).
//
// It is pragmatic to impose a moderately low limit on the identifier length and
// bail out if we ever hit it.  Then BoundedUtf8LengthSequence efficiently
// determines where to insert the next code point, and memmove efficiently makes
// room for it.
//
// The chosen limit is a round number several times larger than identifiers
// expected in practice, yet still small enough that a memmove of this many
// UTF-8 characters is not much more expensive than the division and modulus
// operations that Punycode decoding requires.
constexpr uint32_t kMaxChars = 256;

// Constants from RFC 3492 section 5.
constexpr uint32_t kBase = 36, kTMin = 1, kTMax = 26, kSkew = 38, kDamp = 700;

constexpr uint32_t kMaxCodePoint = 0x10ffff;

// Overflow threshold in DecodeRustPunycode's inner loop; see comments there.
constexpr uint32_t kMaxI = 1 << 30;

// If punycode_begin .. punycode_end begins with a prefix matching the regular
// expression [0-9a-zA-Z_]+_, removes that prefix, copies all but the final
// underscore into out_begin .. out_end, sets num_ascii_chars to the number of
// bytes copied, and returns true.  (A prefix of this sort represents the
// nonempty subsequence of ASCII characters in the corresponding plaintext.)
//
// If punycode_begin .. punycode_end does not contain an underscore, sets
// num_ascii_chars to zero and returns true.  (The encoding of a plaintext
// without any ASCII characters does not carry such a prefix.)
//
// Returns false and zeroes num_ascii_chars on failure (either parse error or
// not enough space in the output buffer).
bool ConsumeOptionalAsciiPrefix(const char*& punycode_begin,
                                const char* const punycode_end,
                                char* const out_begin,
                                char* const out_end,
                                uint32_t& num_ascii_chars) {
  num_ascii_chars = 0;

  // Remember the last underscore if any.  Also use the same string scan to
  // reject any ASCII bytes that do not belong in an identifier, including NUL,
  // as well as non-ASCII bytes, which should have been delta-encoded instead.
  int last_underscore = -1;
  for (int i = 0; i < punycode_end - punycode_begin; ++i) {
    const char c = punycode_begin[i];
    if (c == '_') {
      last_underscore = i;
      continue;
    }
    // We write out the meaning of absl::ascii_isalnum rather than call that
    // function because its documentation does not promise it will remain
    // async-signal-safe under future development.
    if ('a' <= c && c <= 'z') continue;
    if ('A' <= c && c <= 'Z') continue;
    if ('0' <= c && c <= '9') continue;
    return false;
  }

  // If there was no underscore, that means there were no ASCII characters in
  // the plaintext, so there is no prefix to consume.  Our work is done.
  if (last_underscore < 0) return true;

  // Otherwise there will be an underscore delimiter somewhere.  It can't be
  // initial because then there would be no ASCII characters to its left, and no
  // delimiter would have been added in that case.
  if (last_underscore == 0) return false;

  // Any other position is reasonable.  Make sure there's room in the buffer.
  if (last_underscore + 1 > out_end - out_begin) return false;

  // Consume and write out the ASCII characters.
  num_ascii_chars = static_cast<uint32_t>(last_underscore);
  std::memcpy(out_begin, punycode_begin, num_ascii_chars);
  out_begin[num_ascii_chars] = '\0';
  punycode_begin += num_ascii_chars + 1;
  return true;
}

// Returns the value of `c` as a base-36 digit according to RFC 3492 section 5,
// or -1 if `c` is not such a digit.
int DigitValue(char c) {
  if ('0' <= c && c <= '9') return c - '0' + 26;
  if ('a' <= c && c <= 'z') return c - 'a';
  if ('A' <= c && c <= 'Z') return c - 'A';
  return -1;
}

// Consumes the next delta encoding from punycode_begin .. punycode_end,
// updating i accordingly.  Returns true on success.  Returns false on parse
// failure or arithmetic overflow.
bool ScanNextDelta(const char*& punycode_begin, const char* const punycode_end,
                   uint32_t bias, uint32_t& i) {
  uint64_t w = 1;  // 64 bits to prevent overflow in w *= kBase - t

  // "for k = base to infinity in steps of base do begin ... end" in RFC 3492
  // section 6.2.  Each loop iteration scans one digit of the delta.
  for (uint32_t k = kBase; punycode_begin != punycode_end; k += kBase) {
    const int digit_value = DigitValue(*punycode_begin++);
    if (digit_value < 0) return false;

    // Compute this in 64-bit arithmetic so we can check for overflow afterward.
    const uint64_t new_i = i + static_cast<uint64_t>(digit_value) * w;

    // Valid deltas are bounded by (#chars already emitted) * kMaxCodePoint, but
    // invalid input could encode an arbitrarily large delta.  Nip that in the
    // bud here.
    static_assert(
        kMaxI >= kMaxChars * kMaxCodePoint,
        "kMaxI is too small to prevent spurious failures on good input");
    if (new_i > kMaxI) return false;

    static_assert(
        kMaxI < (uint64_t{1} << 32),
        "Make kMaxI smaller or i 64 bits wide to prevent silent wraparound");
    i = static_cast<uint32_t>(new_i);

    // Compute the threshold that determines whether this is the last digit and
    // (if not) what the next digit's place value will be.  This logic from RFC
    // 3492 section 6.2 is explained in section 3.3.
    uint32_t t;
    if (k <= bias + kTMin) {
      t = kTMin;
    } else if (k >= bias + kTMax) {
      t = kTMax;
    } else {
      t = k - bias;
    }
    if (static_cast<uint32_t>(digit_value) < t) return true;

    // If this gets too large, the range check on new_i in the next iteration
    // will catch it.  We know this multiplication will not overwrap because w
    // is 64 bits wide.
    w *= kBase - t;
  }
  return false;
}

}  // namespace

absl::Nullable<char*> DecodeRustPunycode(DecodeRustPunycodeOptions options) {
  const char* punycode_begin = options.punycode_begin;
  const char* const punycode_end = options.punycode_end;
  char* const out_begin = options.out_begin;
  char* const out_end = options.out_end;

  // Write a NUL terminator first.  Later memcpy calls will keep bumping it
  // along to its new right place.
  const size_t out_size = static_cast<size_t>(out_end - out_begin);
  if (out_size == 0) return nullptr;
  *out_begin = '\0';

  // RFC 3492 section 6.2 begins here.  We retain the names of integer variables
  // appearing in that text.
  uint32_t n = 128, i = 0, bias = 72, num_chars = 0;

  // If there are any ASCII characters, consume them and their trailing
  // underscore delimiter.
  if (!ConsumeOptionalAsciiPrefix(punycode_begin, punycode_end,
                                  out_begin, out_end, num_chars)) {
    return nullptr;
  }
  uint32_t total_utf8_bytes = num_chars;

  BoundedUtf8LengthSequence<kMaxChars> utf8_lengths;

  // "while the input is not exhausted do begin ... end"
  while (punycode_begin != punycode_end) {
    if (num_chars >= kMaxChars) return nullptr;

    const uint32_t old_i = i;

    if (!ScanNextDelta(punycode_begin, punycode_end, bias, i)) return nullptr;

    // Update bias as in RFC 3492 section 6.1.  (We have inlined adapt.)
    uint32_t delta = i - old_i;
    delta /= (old_i == 0 ? kDamp : 2);
    delta += delta/(num_chars + 1);
    bias = 0;
    while (delta > ((kBase - kTMin) * kTMax)/2) {
      delta /= kBase - kTMin;
      bias += kBase;
    }
    bias += ((kBase - kTMin + 1) * delta)/(delta + kSkew);

    // Back in section 6.2, compute the new code point and insertion index.
    static_assert(
        kMaxI + kMaxCodePoint < (uint64_t{1} << 32),
        "Make kMaxI smaller or n 64 bits wide to prevent silent wraparound");
    n += i/(num_chars + 1);
    i %= num_chars + 1;

    // To actually insert, we need to convert the code point n to UTF-8 and the
    // character index i to an index into the byte stream emitted so far.  First
    // prepare the UTF-8 encoding for n, rejecting surrogates, overlarge values,
    // and anything that won't fit into the remaining output storage.
    Utf8ForCodePoint utf8_for_code_point(n);
    if (!utf8_for_code_point.ok()) return nullptr;
    if (total_utf8_bytes + utf8_for_code_point.length + 1 > out_size) {
      return nullptr;
    }

    // Now insert the new character into both our length map and the output.
    uint32_t n_index =
        utf8_lengths.InsertAndReturnSumOfPredecessors(
            i, utf8_for_code_point.length);
    std::memmove(
        out_begin + n_index + utf8_for_code_point.length, out_begin + n_index,
        total_utf8_bytes + 1 - n_index);
    std::memcpy(out_begin + n_index, utf8_for_code_point.bytes,
                utf8_for_code_point.length);
    total_utf8_bytes += utf8_for_code_point.length;
    ++num_chars;

    // Finally, advance to the next state before continuing.
    ++i;
  }

  return out_begin + total_utf8_bytes;
}

}  // namespace debugging_internal
ABSL_NAMESPACE_END
}  // namespace absl

Coverage Report

Created: 2024-09-23 06:29

Line	Count	Source (jump to first uncovered line)
1		// Copyright 2024 The Abseil Authors
2		//
3		// Licensed under the Apache License, Version 2.0 (the "License");
4		// you may not use this file except in compliance with the License.
5		// You may obtain a copy of the License at
6		//
7		// https://www.apache.org/licenses/LICENSE-2.0
8		//
9		// Unless required by applicable law or agreed to in writing, software
10		// distributed under the License is distributed on an "AS IS" BASIS,
11		// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12		// See the License for the specific language governing permissions and
13		// limitations under the License.
14
15		#include "absl/debugging/internal/decode_rust_punycode.h"
16
17		#include <cstddef>
18		#include <cstdint>
19		#include <cstring>
20
21		#include "absl/base/config.h"
22		#include "absl/base/nullability.h"
23		#include "absl/debugging/internal/bounded_utf8_length_sequence.h"
24		#include "absl/debugging/internal/utf8_for_code_point.h"
25
26		namespace absl {
27		ABSL_NAMESPACE_BEGIN
28		namespace debugging_internal {
29
30		namespace {
31
32		// Decoding Punycode requires repeated random-access insertion into a stream of
33		// variable-length UTF-8 code-point encodings. We need this to be tolerably
34		// fast (no N^2 slowdown for unfortunate inputs), and we can't allocate any data
35		// structures on the heap (async-signal-safety).
36		//
37		// It is pragmatic to impose a moderately low limit on the identifier length and
38		// bail out if we ever hit it. Then BoundedUtf8LengthSequence efficiently
39		// determines where to insert the next code point, and memmove efficiently makes
40		// room for it.
41		//
42		// The chosen limit is a round number several times larger than identifiers
43		// expected in practice, yet still small enough that a memmove of this many
44		// UTF-8 characters is not much more expensive than the division and modulus
45		// operations that Punycode decoding requires.
46		constexpr uint32_t kMaxChars = 256;
47
48		// Constants from RFC 3492 section 5.
49		constexpr uint32_t kBase = 36, kTMin = 1, kTMax = 26, kSkew = 38, kDamp = 700;
50
51		constexpr uint32_t kMaxCodePoint = 0x10ffff;
52
53		// Overflow threshold in DecodeRustPunycode's inner loop; see comments there.
54		constexpr uint32_t kMaxI = 1 << 30;
55
56		// If punycode_begin .. punycode_end begins with a prefix matching the regular
57		// expression [0-9a-zA-Z_]+_, removes that prefix, copies all but the final
58		// underscore into out_begin .. out_end, sets num_ascii_chars to the number of
59		// bytes copied, and returns true. (A prefix of this sort represents the
60		// nonempty subsequence of ASCII characters in the corresponding plaintext.)
61		//
62		// If punycode_begin .. punycode_end does not contain an underscore, sets
63		// num_ascii_chars to zero and returns true. (The encoding of a plaintext
64		// without any ASCII characters does not carry such a prefix.)
65		//
66		// Returns false and zeroes num_ascii_chars on failure (either parse error or
67		// not enough space in the output buffer).
68		bool ConsumeOptionalAsciiPrefix(const char*& punycode_begin,
69		const char* const punycode_end,
70		char* const out_begin,
71		char* const out_end,
72	0	uint32_t& num_ascii_chars) {
73	0	num_ascii_chars = 0;
74
75		// Remember the last underscore if any. Also use the same string scan to
76		// reject any ASCII bytes that do not belong in an identifier, including NUL,
77		// as well as non-ASCII bytes, which should have been delta-encoded instead.
78	0	int last_underscore = -1;
79	0	for (int i = 0; i < punycode_end - punycode_begin; ++i) {
80	0	const char c = punycode_begin[i];
81	0	if (c == '_') {
82	0	last_underscore = i;
83	0	continue;
84	0	}
85		// We write out the meaning of absl::ascii_isalnum rather than call that
86		// function because its documentation does not promise it will remain
87		// async-signal-safe under future development.
88	0	if ('a' <= c && c <= 'z') continue;
89	0	if ('A' <= c && c <= 'Z') continue;
90	0	if ('0' <= c && c <= '9') continue;
91	0	return false;
92	0	}
93
94		// If there was no underscore, that means there were no ASCII characters in
95		// the plaintext, so there is no prefix to consume. Our work is done.
96	0	if (last_underscore < 0) return true;
97
98		// Otherwise there will be an underscore delimiter somewhere. It can't be
99		// initial because then there would be no ASCII characters to its left, and no
100		// delimiter would have been added in that case.
101	0	if (last_underscore == 0) return false;
102
103		// Any other position is reasonable. Make sure there's room in the buffer.
104	0	if (last_underscore + 1 > out_end - out_begin) return false;
105
106		// Consume and write out the ASCII characters.
107	0	num_ascii_chars = static_cast<uint32_t>(last_underscore);
108	0	std::memcpy(out_begin, punycode_begin, num_ascii_chars);
109	0	out_begin[num_ascii_chars] = '\0';
110	0	punycode_begin += num_ascii_chars + 1;
111	0	return true;
112	0	}
113
114		// Returns the value of `c` as a base-36 digit according to RFC 3492 section 5,
115		// or -1 if `c` is not such a digit.
116	0	int DigitValue(char c) {
117	0	if ('0' <= c && c <= '9') return c - '0' + 26;
118	0	if ('a' <= c && c <= 'z') return c - 'a';
119	0	if ('A' <= c && c <= 'Z') return c - 'A';
120	0	return -1;
121	0	}
122
123		// Consumes the next delta encoding from punycode_begin .. punycode_end,
124		// updating i accordingly. Returns true on success. Returns false on parse
125		// failure or arithmetic overflow.
126		bool ScanNextDelta(const char& punycode_begin, const char const punycode_end,
127	0	uint32_t bias, uint32_t& i) {
128	0	uint64_t w = 1; // 64 bits to prevent overflow in w *= kBase - t
129
130		// "for k = base to infinity in steps of base do begin ... end" in RFC 3492
131		// section 6.2. Each loop iteration scans one digit of the delta.
132	0	for (uint32_t k = kBase; punycode_begin != punycode_end; k += kBase) {
133	0	const int digit_value = DigitValue(*punycode_begin++);
134	0	if (digit_value < 0) return false;
135
136		// Compute this in 64-bit arithmetic so we can check for overflow afterward.
137	0	const uint64_t new_i = i + static_cast<uint64_t>(digit_value) * w;
138
139		// Valid deltas are bounded by (#chars already emitted) * kMaxCodePoint, but
140		// invalid input could encode an arbitrarily large delta. Nip that in the
141		// bud here.
142	0	static_assert(
143	0	kMaxI >= kMaxChars * kMaxCodePoint,
144	0	"kMaxI is too small to prevent spurious failures on good input");
145	0	if (new_i > kMaxI) return false;
146
147	0	static_assert(
148	0	kMaxI < (uint64_t{1} << 32),
149	0	"Make kMaxI smaller or i 64 bits wide to prevent silent wraparound");
150	0	i = static_cast<uint32_t>(new_i);
151
152		// Compute the threshold that determines whether this is the last digit and
153		// (if not) what the next digit's place value will be. This logic from RFC
154		// 3492 section 6.2 is explained in section 3.3.
155	0	uint32_t t;
156	0	if (k <= bias + kTMin) {
157	0	t = kTMin;
158	0	} else if (k >= bias + kTMax) {
159	0	t = kTMax;
160	0	} else {
161	0	t = k - bias;
162	0	}
163	0	if (static_cast<uint32_t>(digit_value) < t) return true;
164
165		// If this gets too large, the range check on new_i in the next iteration
166		// will catch it. We know this multiplication will not overwrap because w
167		// is 64 bits wide.
168	0	w *= kBase - t;
169	0	}
170	0	return false;
171	0	}
172
173		} // namespace
174
175	0	absl::Nullable<char*> DecodeRustPunycode(DecodeRustPunycodeOptions options) {
176	0	const char* punycode_begin = options.punycode_begin;
177	0	const char* const punycode_end = options.punycode_end;
178	0	char* const out_begin = options.out_begin;
179	0	char* const out_end = options.out_end;
180
181		// Write a NUL terminator first. Later memcpy calls will keep bumping it
182		// along to its new right place.
183	0	const size_t out_size = static_cast<size_t>(out_end - out_begin);
184	0	if (out_size == 0) return nullptr;
185	0	*out_begin = '\0';
186
187		// RFC 3492 section 6.2 begins here. We retain the names of integer variables
188		// appearing in that text.
189	0	uint32_t n = 128, i = 0, bias = 72, num_chars = 0;
190
191		// If there are any ASCII characters, consume them and their trailing
192		// underscore delimiter.
193	0	if (!ConsumeOptionalAsciiPrefix(punycode_begin, punycode_end,
194	0	out_begin, out_end, num_chars)) {
195	0	return nullptr;
196	0	}
197	0	uint32_t total_utf8_bytes = num_chars;
198
199	0	BoundedUtf8LengthSequence<kMaxChars> utf8_lengths;
200
201		// "while the input is not exhausted do begin ... end"
202	0	while (punycode_begin != punycode_end) {
203	0	if (num_chars >= kMaxChars) return nullptr;
204
205	0	const uint32_t old_i = i;
206
207	0	if (!ScanNextDelta(punycode_begin, punycode_end, bias, i)) return nullptr;
208
209		// Update bias as in RFC 3492 section 6.1. (We have inlined adapt.)
210	0	uint32_t delta = i - old_i;
211	0	delta /= (old_i == 0 ? kDamp : 2);
212	0	delta += delta/(num_chars + 1);
213	0	bias = 0;
214	0	while (delta > ((kBase - kTMin) * kTMax)/2) {
215	0	delta /= kBase - kTMin;
216	0	bias += kBase;
217	0	}
218	0	bias += ((kBase - kTMin + 1) * delta)/(delta + kSkew);
219
220		// Back in section 6.2, compute the new code point and insertion index.
221	0	static_assert(
222	0	kMaxI + kMaxCodePoint < (uint64_t{1} << 32),
223	0	"Make kMaxI smaller or n 64 bits wide to prevent silent wraparound");
224	0	n += i/(num_chars + 1);
225	0	i %= num_chars + 1;
226
227		// To actually insert, we need to convert the code point n to UTF-8 and the
228		// character index i to an index into the byte stream emitted so far. First
229		// prepare the UTF-8 encoding for n, rejecting surrogates, overlarge values,
230		// and anything that won't fit into the remaining output storage.
231	0	Utf8ForCodePoint utf8_for_code_point(n);
232	0	if (!utf8_for_code_point.ok()) return nullptr;
233	0	if (total_utf8_bytes + utf8_for_code_point.length + 1 > out_size) {
234	0	return nullptr;
235	0	}
236
237		// Now insert the new character into both our length map and the output.
238	0	uint32_t n_index =
239	0	utf8_lengths.InsertAndReturnSumOfPredecessors(
240	0	i, utf8_for_code_point.length);
241	0	std::memmove(
242	0	out_begin + n_index + utf8_for_code_point.length, out_begin + n_index,
243	0	total_utf8_bytes + 1 - n_index);
244	0	std::memcpy(out_begin + n_index, utf8_for_code_point.bytes,
245	0	utf8_for_code_point.length);
246	0	total_utf8_bytes += utf8_for_code_point.length;
247	0	++num_chars;
248
249		// Finally, advance to the next state before continuing.
250	0	++i;
251	0	}
252
253	0	return out_begin + total_utf8_bytes;
254	0	}
255
256		} // namespace debugging_internal
257		ABSL_NAMESPACE_END
258		} // namespace absl