/src/abseil-cpp/absl/strings/ascii.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2017 The Abseil Authors. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | | // you may not use this file except in compliance with the License. |
5 | | // You may obtain a copy of the License at |
6 | | // |
7 | | // https://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | // See the License for the specific language governing permissions and |
13 | | // limitations under the License. |
14 | | |
15 | | #include "absl/strings/ascii.h" |
16 | | |
17 | | #include <climits> |
18 | | #include <cstddef> |
19 | | #include <cstring> |
20 | | #include <string> |
21 | | |
22 | | #include "absl/base/attributes.h" |
23 | | #include "absl/base/config.h" |
24 | | #include "absl/base/nullability.h" |
25 | | #include "absl/base/optimization.h" |
26 | | |
27 | | namespace absl { |
28 | | ABSL_NAMESPACE_BEGIN |
29 | | namespace ascii_internal { |
30 | | |
// TODO(mbar): Move the Python code that generates this table to BUILD and
// link to it from here.

// NOTE: The kPropertyBits table defined below was generated by Python 2 code
// of the following form. (Bit 0x02 is currently unused and available.)
37 | | // |
38 | | // def Hex2(n): |
39 | | // return '0x' + hex(n/16)[2:] + hex(n%16)[2:] |
40 | | // def IsPunct(ch): |
41 | | // return (ord(ch) >= 32 and ord(ch) < 127 and |
42 | | // not ch.isspace() and not ch.isalnum()) |
43 | | // def IsBlank(ch): |
44 | | // return ch in ' \t' |
45 | | // def IsCntrl(ch): |
46 | | // return ord(ch) < 32 or ord(ch) == 127 |
47 | | // def IsXDigit(ch): |
48 | | // return ch.isdigit() or ch.lower() in 'abcdef' |
49 | | // for i in range(128): |
50 | | // ch = chr(i) |
51 | | // mask = ((ch.isalpha() and 0x01 or 0) | |
52 | | // (ch.isalnum() and 0x04 or 0) | |
53 | | // (ch.isspace() and 0x08 or 0) | |
54 | | // (IsPunct(ch) and 0x10 or 0) | |
55 | | // (IsBlank(ch) and 0x20 or 0) | |
56 | | // (IsCntrl(ch) and 0x40 or 0) | |
57 | | // (IsXDigit(ch) and 0x80 or 0)) |
58 | | // print Hex2(mask) + ',', |
59 | | // if i % 16 == 7: |
60 | | // print ' //', Hex2(i & 0x78) |
61 | | // elif i % 16 == 15: |
62 | | // print |
63 | | |
64 | | // clang-format off |
65 | | // Array of bitfields holding character information. Each bit value corresponds |
66 | | // to a particular character feature. For readability, and because the value |
67 | | // of these bits is tightly coupled to this implementation, the individual bits |
68 | | // are not named. Note that bitfields for all characters above ASCII 127 are |
69 | | // zero-initialized. |
70 | | ABSL_DLL const unsigned char kPropertyBits[256] = { |
71 | | 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x00 |
72 | | 0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40, |
73 | | 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x10 |
74 | | 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, |
75 | | 0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, // 0x20 |
76 | | 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, |
77 | | 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, // 0x30 |
78 | | 0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, |
79 | | 0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05, // 0x40 |
80 | | 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, |
81 | | 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, // 0x50 |
82 | | 0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10, |
83 | | 0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05, // 0x60 |
84 | | 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, |
85 | | 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, // 0x70 |
86 | | 0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40, |
87 | | }; |
88 | | |
89 | | // Array of characters for the ascii_tolower() function. For values 'A' |
90 | | // through 'Z', return the lower-case character; otherwise, return the |
91 | | // identity of the passed character. |
92 | | ABSL_DLL const char kToLower[256] = { |
93 | | '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', |
94 | | '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f', |
95 | | '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17', |
96 | | '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f', |
97 | | '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27', |
98 | | '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f', |
99 | | '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37', |
100 | | '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f', |
101 | | '\x40', 'a', 'b', 'c', 'd', 'e', 'f', 'g', |
102 | | 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', |
103 | | 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', |
104 | | 'x', 'y', 'z', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f', |
105 | | '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67', |
106 | | '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f', |
107 | | '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77', |
108 | | '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f', |
109 | | '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87', |
110 | | '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f', |
111 | | '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97', |
112 | | '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f', |
113 | | '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7', |
114 | | '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf', |
115 | | '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7', |
116 | | '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf', |
117 | | '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7', |
118 | | '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf', |
119 | | '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7', |
120 | | '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf', |
121 | | '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7', |
122 | | '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef', |
123 | | '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7', |
124 | | '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff', |
125 | | }; |
126 | | |
127 | | // Array of characters for the ascii_toupper() function. For values 'a' |
128 | | // through 'z', return the upper-case character; otherwise, return the |
129 | | // identity of the passed character. |
130 | | ABSL_DLL const char kToUpper[256] = { |
131 | | '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', |
132 | | '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f', |
133 | | '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17', |
134 | | '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f', |
135 | | '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27', |
136 | | '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f', |
137 | | '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37', |
138 | | '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f', |
139 | | '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47', |
140 | | '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f', |
141 | | '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57', |
142 | | '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f', |
143 | | '\x60', 'A', 'B', 'C', 'D', 'E', 'F', 'G', |
144 | | 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', |
145 | | 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', |
146 | | 'X', 'Y', 'Z', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f', |
147 | | '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87', |
148 | | '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f', |
149 | | '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97', |
150 | | '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f', |
151 | | '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7', |
152 | | '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf', |
153 | | '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7', |
154 | | '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf', |
155 | | '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7', |
156 | | '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf', |
157 | | '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7', |
158 | | '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf', |
159 | | '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7', |
160 | | '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef', |
161 | | '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7', |
162 | | '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff', |
163 | | }; |
164 | | // clang-format on |
165 | | |
// Returns whether `c` is a letter whose case has to change for the requested
// conversion: a lower-case letter ('a'-'z') when `ToUpper` is true, an
// upper-case letter ('A'-'Z') when `ToUpper` is false.
//
// Implemented as a single comparison by:
//   1. Shifting the relevant letter range down to
//      [SCHAR_MIN, SCHAR_MIN + 26).
//   2. Comparing the shifted value against SCHAR_MIN + 26.
template <bool ToUpper>
constexpr bool AsciiInAZRange(unsigned char c) {
  // Amount to subtract so that the first letter of the range maps to
  // SCHAR_MIN (when the result is reinterpreted as a signed char).
  constexpr unsigned char sub = (ToUpper ? 'a' : 'A') - SCHAR_MIN;
  constexpr signed char threshold = SCHAR_MIN + 26;  // 26 = alphabet size.
  // Using unsigned arithmetic as overflows/underflows are well defined.
  unsigned char u = static_cast<unsigned char>(c - sub);
  // Using signed cmp, as SIMD unsigned cmp isn't available in many platforms.
  return static_cast<signed char>(u) < threshold;
}
179 | | |
180 | | // Force-inline so the compiler won't merge the short and long implementations. |
181 | | // `src` may be null iff `size` is zero. |
182 | | template <bool ToUpper> |
183 | | ABSL_ATTRIBUTE_ALWAYS_INLINE inline constexpr void AsciiStrCaseFoldImpl( |
184 | 0 | absl::Nonnull<char*> dst, absl::Nullable<const char*> src, size_t size) { |
185 | | // The upper- and lowercase versions of ASCII characters differ by only 1 bit. |
186 | | // When we need to flip the case, we can xor with this bit to achieve the |
187 | | // desired result. Note that the choice of 'a' and 'A' here is arbitrary. We |
188 | | // could have chosen 'z' and 'Z', or any other pair of characters as they all |
189 | | // have the same single bit difference. |
190 | 0 | constexpr unsigned char kAsciiCaseBitFlip = 'a' ^ 'A'; |
191 | |
|
192 | 0 | for (size_t i = 0; i < size; ++i) { |
193 | 0 | unsigned char v = static_cast<unsigned char>(src[i]); |
194 | 0 | v ^= AsciiInAZRange<ToUpper>(v) ? kAsciiCaseBitFlip : 0; |
195 | 0 | dst[i] = static_cast<char>(v); |
196 | 0 | } |
197 | 0 | } Unexecuted instantiation: void absl::ascii_internal::AsciiStrCaseFoldImpl<false>(char*, char const*, unsigned long) Unexecuted instantiation: void absl::ascii_internal::AsciiStrCaseFoldImpl<true>(char*, char const*, unsigned long) |
198 | | |
// The string size threshold for starting using the long string version:
// inputs shorter than this take the inlined short path, all others are routed
// to the out-of-line long path.
constexpr size_t kCaseFoldThreshold = 16;
201 | | |
202 | | // No-inline so the compiler won't merge the short and long implementations. |
203 | | // `src` may be null iff `size` is zero. |
204 | | template <bool ToUpper> |
205 | | ABSL_ATTRIBUTE_NOINLINE constexpr void AsciiStrCaseFoldLong( |
206 | 0 | absl::Nonnull<char*> dst, absl::Nullable<const char*> src, size_t size) { |
207 | 0 | ABSL_ASSUME(size >= kCaseFoldThreshold); |
208 | 0 | AsciiStrCaseFoldImpl<ToUpper>(dst, src, size); |
209 | 0 | } Unexecuted instantiation: void absl::ascii_internal::AsciiStrCaseFoldLong<false>(char*, char const*, unsigned long) Unexecuted instantiation: void absl::ascii_internal::AsciiStrCaseFoldLong<true>(char*, char const*, unsigned long) |
210 | | |
211 | | // Splitting to short and long strings to allow vectorization decisions |
212 | | // to be made separately in the long and short cases. |
213 | | // `src` may be null iff `size` is zero. |
214 | | template <bool ToUpper> |
215 | | constexpr void AsciiStrCaseFold(absl::Nonnull<char*> dst, |
216 | 0 | absl::Nullable<const char*> src, size_t size) { |
217 | 0 | size < kCaseFoldThreshold ? AsciiStrCaseFoldImpl<ToUpper>(dst, src, size) |
218 | 0 | : AsciiStrCaseFoldLong<ToUpper>(dst, src, size); |
219 | 0 | } Unexecuted instantiation: void absl::ascii_internal::AsciiStrCaseFold<false>(char*, char const*, unsigned long) Unexecuted instantiation: void absl::ascii_internal::AsciiStrCaseFold<true>(char*, char const*, unsigned long) |
220 | | |
221 | | void AsciiStrToLower(absl::Nonnull<char*> dst, absl::Nullable<const char*> src, |
222 | 0 | size_t n) { |
223 | 0 | return AsciiStrCaseFold<false>(dst, src, n); |
224 | 0 | } |
225 | | |
226 | | void AsciiStrToUpper(absl::Nonnull<char*> dst, absl::Nullable<const char*> src, |
227 | 0 | size_t n) { |
228 | 0 | return AsciiStrCaseFold<true>(dst, src, n); |
229 | 0 | } |
230 | | |
231 | 0 | static constexpr size_t ValidateAsciiCasefold() { |
232 | 0 | constexpr size_t num_chars = 1 + CHAR_MAX - CHAR_MIN; |
233 | 0 | size_t incorrect_index = 0; |
234 | 0 | char lowered[num_chars] = {}; |
235 | 0 | char uppered[num_chars] = {}; |
236 | 0 | for (unsigned int i = 0; i < num_chars; ++i) { |
237 | 0 | uppered[i] = lowered[i] = static_cast<char>(i); |
238 | 0 | } |
239 | 0 | AsciiStrCaseFold<false>(&lowered[0], &lowered[0], num_chars); |
240 | 0 | AsciiStrCaseFold<true>(&uppered[0], &uppered[0], num_chars); |
241 | 0 | for (size_t i = 0; i < num_chars; ++i) { |
242 | 0 | const char ch = static_cast<char>(i), |
243 | 0 | ch_upper = ('a' <= ch && ch <= 'z' ? 'A' + (ch - 'a') : ch), |
244 | 0 | ch_lower = ('A' <= ch && ch <= 'Z' ? 'a' + (ch - 'A') : ch); |
245 | 0 | if (uppered[i] != ch_upper || lowered[i] != ch_lower) { |
246 | 0 | incorrect_index = i > 0 ? i : num_chars; |
247 | 0 | break; |
248 | 0 | } |
249 | 0 | } |
250 | 0 | return incorrect_index; |
251 | 0 | } |
252 | | |
253 | | static_assert(ValidateAsciiCasefold() == 0, "error in case conversion"); |
254 | | |
255 | | } // namespace ascii_internal |
256 | | |
257 | 0 | void AsciiStrToLower(absl::Nonnull<std::string*> s) { |
258 | 0 | char* p = &(*s)[0]; |
259 | 0 | return ascii_internal::AsciiStrCaseFold<false>(p, p, s->size()); |
260 | 0 | } |
261 | | |
262 | 0 | void AsciiStrToUpper(absl::Nonnull<std::string*> s) { |
263 | 0 | char* p = &(*s)[0]; |
264 | 0 | return ascii_internal::AsciiStrCaseFold<true>(p, p, s->size()); |
265 | 0 | } |
266 | | |
267 | 0 | void RemoveExtraAsciiWhitespace(absl::Nonnull<std::string*> str) { |
268 | 0 | auto stripped = StripAsciiWhitespace(*str); |
269 | |
|
270 | 0 | if (stripped.empty()) { |
271 | 0 | str->clear(); |
272 | 0 | return; |
273 | 0 | } |
274 | | |
275 | 0 | auto input_it = stripped.begin(); |
276 | 0 | auto input_end = stripped.end(); |
277 | 0 | auto output_it = &(*str)[0]; |
278 | 0 | bool is_ws = false; |
279 | |
|
280 | 0 | for (; input_it < input_end; ++input_it) { |
281 | 0 | if (is_ws) { |
282 | | // Consecutive whitespace? Keep only the last. |
283 | 0 | is_ws = absl::ascii_isspace(static_cast<unsigned char>(*input_it)); |
284 | 0 | if (is_ws) --output_it; |
285 | 0 | } else { |
286 | 0 | is_ws = absl::ascii_isspace(static_cast<unsigned char>(*input_it)); |
287 | 0 | } |
288 | |
|
289 | 0 | *output_it = *input_it; |
290 | 0 | ++output_it; |
291 | 0 | } |
292 | |
|
293 | 0 | str->erase(static_cast<size_t>(output_it - &(*str)[0])); |
294 | 0 | } |
295 | | |
296 | | ABSL_NAMESPACE_END |
297 | | } // namespace absl |