/src/abseil-cpp/absl/debugging/internal/decode_rust_punycode.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2024 The Abseil Authors |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | | // you may not use this file except in compliance with the License. |
5 | | // You may obtain a copy of the License at |
6 | | // |
7 | | // https://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | // See the License for the specific language governing permissions and |
13 | | // limitations under the License. |
14 | | |
15 | | #include "absl/debugging/internal/decode_rust_punycode.h" |
16 | | |
17 | | #include <cstddef> |
18 | | #include <cstdint> |
19 | | #include <cstring> |
20 | | |
21 | | #include "absl/base/config.h" |
22 | | #include "absl/base/nullability.h" |
23 | | #include "absl/debugging/internal/bounded_utf8_length_sequence.h" |
24 | | #include "absl/debugging/internal/utf8_for_code_point.h" |
25 | | |
26 | | namespace absl { |
27 | | ABSL_NAMESPACE_BEGIN |
28 | | namespace debugging_internal { |
29 | | |
30 | | namespace { |
31 | | |
32 | | // Decoding Punycode requires repeated random-access insertion into a stream of |
33 | | // variable-length UTF-8 code-point encodings. We need this to be tolerably |
34 | | // fast (no N^2 slowdown for unfortunate inputs), and we can't allocate any data |
35 | | // structures on the heap (async-signal-safety). |
36 | | // |
37 | | // It is pragmatic to impose a moderately low limit on the identifier length and |
38 | | // bail out if we ever hit it. Then BoundedUtf8LengthSequence efficiently |
39 | | // determines where to insert the next code point, and memmove efficiently makes |
40 | | // room for it. |
41 | | // |
42 | | // The chosen limit is a round number several times larger than identifiers |
43 | | // expected in practice, yet still small enough that a memmove of this many |
44 | | // UTF-8 characters is not much more expensive than the division and modulus |
45 | | // operations that Punycode decoding requires. |
46 | | constexpr uint32_t kMaxChars = 256; |
47 | | |
48 | | // Constants from RFC 3492 section 5. |
49 | | constexpr uint32_t kBase = 36, kTMin = 1, kTMax = 26, kSkew = 38, kDamp = 700; |
50 | | |
51 | | constexpr uint32_t kMaxCodePoint = 0x10ffff; |
52 | | |
53 | | // Overflow threshold in DecodeRustPunycode's inner loop; see comments there. |
54 | | constexpr uint32_t kMaxI = 1 << 30; |
55 | | |
56 | | // If punycode_begin .. punycode_end begins with a prefix matching the regular |
57 | | // expression [0-9a-zA-Z_]+_, removes that prefix, copies all but the final |
58 | | // underscore into out_begin .. out_end, sets num_ascii_chars to the number of |
59 | | // bytes copied, and returns true. (A prefix of this sort represents the |
60 | | // nonempty subsequence of ASCII characters in the corresponding plaintext.) |
61 | | // |
62 | | // If punycode_begin .. punycode_end does not contain an underscore, sets |
63 | | // num_ascii_chars to zero and returns true. (The encoding of a plaintext |
64 | | // without any ASCII characters does not carry such a prefix.) |
65 | | // |
66 | | // Returns false and zeroes num_ascii_chars on failure (either parse error or |
67 | | // not enough space in the output buffer). |
68 | | bool ConsumeOptionalAsciiPrefix(const char*& punycode_begin, |
69 | | const char* const punycode_end, |
70 | | char* const out_begin, |
71 | | char* const out_end, |
72 | 0 | uint32_t& num_ascii_chars) { |
73 | 0 | num_ascii_chars = 0; |
74 | | |
75 | | // Remember the last underscore if any. Also use the same string scan to |
76 | | // reject any ASCII bytes that do not belong in an identifier, including NUL, |
77 | | // as well as non-ASCII bytes, which should have been delta-encoded instead. |
78 | 0 | int last_underscore = -1; |
79 | 0 | for (int i = 0; i < punycode_end - punycode_begin; ++i) { |
80 | 0 | const char c = punycode_begin[i]; |
81 | 0 | if (c == '_') { |
82 | 0 | last_underscore = i; |
83 | 0 | continue; |
84 | 0 | } |
85 | | // We write out the meaning of absl::ascii_isalnum rather than call that |
86 | | // function because its documentation does not promise it will remain |
87 | | // async-signal-safe under future development. |
88 | 0 | if ('a' <= c && c <= 'z') continue; |
89 | 0 | if ('A' <= c && c <= 'Z') continue; |
90 | 0 | if ('0' <= c && c <= '9') continue; |
91 | 0 | return false; |
92 | 0 | } |
93 | | |
94 | | // If there was no underscore, that means there were no ASCII characters in |
95 | | // the plaintext, so there is no prefix to consume. Our work is done. |
96 | 0 | if (last_underscore < 0) return true; |
97 | | |
98 | | // Otherwise there will be an underscore delimiter somewhere. It can't be |
99 | | // initial because then there would be no ASCII characters to its left, and no |
100 | | // delimiter would have been added in that case. |
101 | 0 | if (last_underscore == 0) return false; |
102 | | |
103 | | // Any other position is reasonable. Make sure there's room in the buffer. |
104 | 0 | if (last_underscore + 1 > out_end - out_begin) return false; |
105 | | |
106 | | // Consume and write out the ASCII characters. |
107 | 0 | num_ascii_chars = static_cast<uint32_t>(last_underscore); |
108 | 0 | std::memcpy(out_begin, punycode_begin, num_ascii_chars); |
109 | 0 | out_begin[num_ascii_chars] = '\0'; |
110 | 0 | punycode_begin += num_ascii_chars + 1; |
111 | 0 | return true; |
112 | 0 | } |
113 | | |
114 | | // Returns the value of `c` as a base-36 digit according to RFC 3492 section 5, |
115 | | // or -1 if `c` is not such a digit. |
116 | 0 | int DigitValue(char c) { |
117 | 0 | if ('0' <= c && c <= '9') return c - '0' + 26; |
118 | 0 | if ('a' <= c && c <= 'z') return c - 'a'; |
119 | 0 | if ('A' <= c && c <= 'Z') return c - 'A'; |
120 | 0 | return -1; |
121 | 0 | } |
122 | | |
123 | | // Consumes the next delta encoding from punycode_begin .. punycode_end, |
124 | | // updating i accordingly. Returns true on success. Returns false on parse |
125 | | // failure or arithmetic overflow. |
126 | | bool ScanNextDelta(const char*& punycode_begin, const char* const punycode_end, |
127 | 0 | uint32_t bias, uint32_t& i) { |
128 | 0 | uint64_t w = 1; // 64 bits to prevent overflow in w *= kBase - t |
129 | | |
130 | | // "for k = base to infinity in steps of base do begin ... end" in RFC 3492 |
131 | | // section 6.2. Each loop iteration scans one digit of the delta. |
132 | 0 | for (uint32_t k = kBase; punycode_begin != punycode_end; k += kBase) { |
133 | 0 | const int digit_value = DigitValue(*punycode_begin++); |
134 | 0 | if (digit_value < 0) return false; |
135 | | |
136 | | // Compute this in 64-bit arithmetic so we can check for overflow afterward. |
137 | 0 | const uint64_t new_i = i + static_cast<uint64_t>(digit_value) * w; |
138 | | |
139 | | // Valid deltas are bounded by (#chars already emitted) * kMaxCodePoint, but |
140 | | // invalid input could encode an arbitrarily large delta. Nip that in the |
141 | | // bud here. |
142 | 0 | static_assert( |
143 | 0 | kMaxI >= kMaxChars * kMaxCodePoint, |
144 | 0 | "kMaxI is too small to prevent spurious failures on good input"); |
145 | 0 | if (new_i > kMaxI) return false; |
146 | | |
147 | 0 | static_assert( |
148 | 0 | kMaxI < (uint64_t{1} << 32), |
149 | 0 | "Make kMaxI smaller or i 64 bits wide to prevent silent wraparound"); |
150 | 0 | i = static_cast<uint32_t>(new_i); |
151 | | |
152 | | // Compute the threshold that determines whether this is the last digit and |
153 | | // (if not) what the next digit's place value will be. This logic from RFC |
154 | | // 3492 section 6.2 is explained in section 3.3. |
155 | 0 | uint32_t t; |
156 | 0 | if (k <= bias + kTMin) { |
157 | 0 | t = kTMin; |
158 | 0 | } else if (k >= bias + kTMax) { |
159 | 0 | t = kTMax; |
160 | 0 | } else { |
161 | 0 | t = k - bias; |
162 | 0 | } |
163 | 0 | if (static_cast<uint32_t>(digit_value) < t) return true; |
164 | | |
165 | | // If this gets too large, the range check on new_i in the next iteration |
166 | | // will catch it. We know this multiplication will not overwrap because w |
167 | | // is 64 bits wide. |
168 | 0 | w *= kBase - t; |
169 | 0 | } |
170 | 0 | return false; |
171 | 0 | } |
172 | | |
173 | | } // namespace |
174 | | |
175 | 0 | absl::Nullable<char*> DecodeRustPunycode(DecodeRustPunycodeOptions options) { |
176 | 0 | const char* punycode_begin = options.punycode_begin; |
177 | 0 | const char* const punycode_end = options.punycode_end; |
178 | 0 | char* const out_begin = options.out_begin; |
179 | 0 | char* const out_end = options.out_end; |
180 | | |
181 | | // Write a NUL terminator first. Later memcpy calls will keep bumping it |
182 | | // along to its new right place. |
183 | 0 | const size_t out_size = static_cast<size_t>(out_end - out_begin); |
184 | 0 | if (out_size == 0) return nullptr; |
185 | 0 | *out_begin = '\0'; |
186 | | |
187 | | // RFC 3492 section 6.2 begins here. We retain the names of integer variables |
188 | | // appearing in that text. |
189 | 0 | uint32_t n = 128, i = 0, bias = 72, num_chars = 0; |
190 | | |
191 | | // If there are any ASCII characters, consume them and their trailing |
192 | | // underscore delimiter. |
193 | 0 | if (!ConsumeOptionalAsciiPrefix(punycode_begin, punycode_end, |
194 | 0 | out_begin, out_end, num_chars)) { |
195 | 0 | return nullptr; |
196 | 0 | } |
197 | 0 | uint32_t total_utf8_bytes = num_chars; |
198 | |
|
199 | 0 | BoundedUtf8LengthSequence<kMaxChars> utf8_lengths; |
200 | | |
201 | | // "while the input is not exhausted do begin ... end" |
202 | 0 | while (punycode_begin != punycode_end) { |
203 | 0 | if (num_chars >= kMaxChars) return nullptr; |
204 | | |
205 | 0 | const uint32_t old_i = i; |
206 | |
|
207 | 0 | if (!ScanNextDelta(punycode_begin, punycode_end, bias, i)) return nullptr; |
208 | | |
209 | | // Update bias as in RFC 3492 section 6.1. (We have inlined adapt.) |
210 | 0 | uint32_t delta = i - old_i; |
211 | 0 | delta /= (old_i == 0 ? kDamp : 2); |
212 | 0 | delta += delta/(num_chars + 1); |
213 | 0 | bias = 0; |
214 | 0 | while (delta > ((kBase - kTMin) * kTMax)/2) { |
215 | 0 | delta /= kBase - kTMin; |
216 | 0 | bias += kBase; |
217 | 0 | } |
218 | 0 | bias += ((kBase - kTMin + 1) * delta)/(delta + kSkew); |
219 | | |
220 | | // Back in section 6.2, compute the new code point and insertion index. |
221 | 0 | static_assert( |
222 | 0 | kMaxI + kMaxCodePoint < (uint64_t{1} << 32), |
223 | 0 | "Make kMaxI smaller or n 64 bits wide to prevent silent wraparound"); |
224 | 0 | n += i/(num_chars + 1); |
225 | 0 | i %= num_chars + 1; |
226 | | |
227 | | // To actually insert, we need to convert the code point n to UTF-8 and the |
228 | | // character index i to an index into the byte stream emitted so far. First |
229 | | // prepare the UTF-8 encoding for n, rejecting surrogates, overlarge values, |
230 | | // and anything that won't fit into the remaining output storage. |
231 | 0 | Utf8ForCodePoint utf8_for_code_point(n); |
232 | 0 | if (!utf8_for_code_point.ok()) return nullptr; |
233 | 0 | if (total_utf8_bytes + utf8_for_code_point.length + 1 > out_size) { |
234 | 0 | return nullptr; |
235 | 0 | } |
236 | | |
237 | | // Now insert the new character into both our length map and the output. |
238 | 0 | uint32_t n_index = |
239 | 0 | utf8_lengths.InsertAndReturnSumOfPredecessors( |
240 | 0 | i, utf8_for_code_point.length); |
241 | 0 | std::memmove( |
242 | 0 | out_begin + n_index + utf8_for_code_point.length, out_begin + n_index, |
243 | 0 | total_utf8_bytes + 1 - n_index); |
244 | 0 | std::memcpy(out_begin + n_index, utf8_for_code_point.bytes, |
245 | 0 | utf8_for_code_point.length); |
246 | 0 | total_utf8_bytes += utf8_for_code_point.length; |
247 | 0 | ++num_chars; |
248 | | |
249 | | // Finally, advance to the next state before continuing. |
250 | 0 | ++i; |
251 | 0 | } |
252 | | |
253 | 0 | return out_begin + total_utf8_bytes; |
254 | 0 | } |
255 | | |
256 | | } // namespace debugging_internal |
257 | | ABSL_NAMESPACE_END |
258 | | } // namespace absl |