Coverage Report

Created: 2025-07-11 06:37

/src/abseil-cpp/absl/strings/ascii.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2017 The Abseil Authors.
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//      https://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14
15
#include "absl/strings/ascii.h"
16
17
#include <climits>
18
#include <cstddef>
19
#include <cstring>
20
#include <string>
21
22
#include "absl/base/config.h"
23
#include "absl/base/nullability.h"
24
25
namespace absl {
26
ABSL_NAMESPACE_BEGIN
27
namespace ascii_internal {
28
29
// # Table generated by this Python code (bit 0x02 is currently unused):
30
// TODO(mbar) Move Python code for generation of table to BUILD and link here.
31
32
// NOTE: The kAsciiPropertyBits table used within this code was generated by
33
// Python code of the following form. (Bit 0x02 is currently unused and
34
// available.)
35
//
36
// def Hex2(n):
37
//   return '0x' + hex(n/16)[2:] + hex(n%16)[2:]
38
// def IsPunct(ch):
39
//   return (ord(ch) >= 32 and ord(ch) < 127 and
40
//           not ch.isspace() and not ch.isalnum())
41
// def IsBlank(ch):
42
//   return ch in ' \t'
43
// def IsCntrl(ch):
44
//   return ord(ch) < 32 or ord(ch) == 127
45
// def IsXDigit(ch):
46
//   return ch.isdigit() or ch.lower() in 'abcdef'
47
// for i in range(128):
48
//   ch = chr(i)
49
//   mask = ((ch.isalpha() and 0x01 or 0) |
50
//           (ch.isalnum() and 0x04 or 0) |
51
//           (ch.isspace() and 0x08 or 0) |
52
//           (IsPunct(ch) and 0x10 or 0) |
53
//           (IsBlank(ch) and 0x20 or 0) |
54
//           (IsCntrl(ch) and 0x40 or 0) |
55
//           (IsXDigit(ch) and 0x80 or 0))
56
//   print Hex2(mask) + ',',
57
//   if i % 16 == 7:
58
//     print ' //', Hex2(i & 0x78)
59
//   elif i % 16 == 15:
60
//     print
61
62
// clang-format off
63
// Array of bitfields holding character information. Each bit value corresponds
64
// to a particular character feature. For readability, and because the value
65
// of these bits is tightly coupled to this implementation, the individual bits
66
// are not named. Note that bitfields for all characters above ASCII 127 are
67
// zero-initialized.
68
ABSL_DLL const unsigned char kPropertyBits[256] = {
69
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x00
70
    0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40,
71
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x10
72
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
73
    0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x20
74
    0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
75
    0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,  // 0x30
76
    0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
77
    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x40
78
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
79
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x50
80
    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10,
81
    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x60
82
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
83
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x70
84
    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40,
85
};
86
87
// Array of characters for the ascii_tolower() function. For values 'A'
88
// through 'Z', return the lower-case character; otherwise, return the
89
// identity of the passed character.
90
ABSL_DLL const char kToLower[256] = {
91
  '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
92
  '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
93
  '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
94
  '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
95
  '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
96
  '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
97
  '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
98
  '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
99
  '\x40',    'a',    'b',    'c',    'd',    'e',    'f',    'g',
100
     'h',    'i',    'j',    'k',    'l',    'm',    'n',    'o',
101
     'p',    'q',    'r',    's',    't',    'u',    'v',    'w',
102
     'x',    'y',    'z', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
103
  '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',
104
  '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
105
  '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',
106
  '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
107
  '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
108
  '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
109
  '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
110
  '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
111
  '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
112
  '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
113
  '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
114
  '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
115
  '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
116
  '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
117
  '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
118
  '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
119
  '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
120
  '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
121
  '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
122
  '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
123
};
124
125
// Array of characters for the ascii_toupper() function. For values 'a'
126
// through 'z', return the upper-case character; otherwise, return the
127
// identity of the passed character.
128
ABSL_DLL const char kToUpper[256] = {
129
  '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
130
  '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
131
  '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
132
  '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
133
  '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
134
  '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
135
  '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
136
  '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
137
  '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47',
138
  '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f',
139
  '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57',
140
  '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
141
  '\x60',    'A',    'B',    'C',    'D',    'E',    'F',    'G',
142
     'H',    'I',    'J',    'K',    'L',    'M',    'N',    'O',
143
     'P',    'Q',    'R',    'S',    'T',    'U',    'V',    'W',
144
     'X',    'Y',    'Z', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
145
  '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
146
  '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
147
  '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
148
  '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
149
  '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
150
  '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
151
  '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
152
  '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
153
  '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
154
  '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
155
  '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
156
  '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
157
  '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
158
  '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
159
  '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
160
  '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
161
};
162
// clang-format on
163
164
// Returns whether `c` is in the a-z/A-Z range (w.r.t. `ToUpper`).
165
// Implemented by:
166
//  1. Pushing the a-z/A-Z range to [SCHAR_MIN, SCHAR_MIN + 26).
167
//  2. Comparing to SCHAR_MIN + 26.
168
template <bool ToUpper>
169
0
constexpr bool AsciiInAZRange(unsigned char c) {
170
0
  constexpr unsigned char sub = (ToUpper ? 'a' : 'A') - SCHAR_MIN;
171
0
  constexpr signed char threshold = SCHAR_MIN + 26;  // 26 = alphabet size.
172
  // Using unsigned arithmetic as overflows/underflows are well defined.
173
0
  unsigned char u = c - sub;
174
  // Using signed cmp, as SIMD unsigned cmp isn't available in many platforms.
175
0
  return static_cast<signed char>(u) < threshold;
176
0
}
Unexecuted instantiation: bool absl::ascii_internal::AsciiInAZRange<false>(unsigned char)
Unexecuted instantiation: bool absl::ascii_internal::AsciiInAZRange<true>(unsigned char)
177
178
template <bool ToUpper>
179
0
constexpr bool AsciiInAZRangeNaive(unsigned char c) {
180
0
  constexpr unsigned char a = (ToUpper ? 'a' : 'A');
181
0
  constexpr unsigned char z = (ToUpper ? 'z' : 'Z');
182
0
  return a <= c && c <= z;
183
0
}
Unexecuted instantiation: bool absl::ascii_internal::AsciiInAZRangeNaive<false>(unsigned char)
Unexecuted instantiation: bool absl::ascii_internal::AsciiInAZRangeNaive<true>(unsigned char)
184
185
template <bool ToUpper, bool Naive>
186
constexpr void AsciiStrCaseFoldImpl(char* absl_nonnull dst,
187
                                    const char* absl_nullable src,
188
0
                                    size_t size) {
189
  // The upper- and lowercase versions of ASCII characters differ by only 1 bit.
190
  // When we need to flip the case, we can xor with this bit to achieve the
191
  // desired result. Note that the choice of 'a' and 'A' here is arbitrary. We
192
  // could have chosen 'z' and 'Z', or any other pair of characters as they all
193
  // have the same single bit difference.
194
0
  constexpr unsigned char kAsciiCaseBitFlip = 'a' ^ 'A';
195
196
0
  for (size_t i = 0; i < size; ++i) {
197
0
    unsigned char v = static_cast<unsigned char>(src[i]);
198
0
    if ABSL_INTERNAL_CONSTEXPR_SINCE_CXX17 (Naive) {
199
0
      v ^= AsciiInAZRangeNaive<ToUpper>(v) ? kAsciiCaseBitFlip : 0;
200
0
    } else {
201
0
      v ^= AsciiInAZRange<ToUpper>(v) ? kAsciiCaseBitFlip : 0;
202
0
    }
203
0
    dst[i] = static_cast<char>(v);
204
0
  }
205
0
}
Unexecuted instantiation: void absl::ascii_internal::AsciiStrCaseFoldImpl<false, true>(char*, char const*, unsigned long)
Unexecuted instantiation: void absl::ascii_internal::AsciiStrCaseFoldImpl<false, false>(char*, char const*, unsigned long)
Unexecuted instantiation: void absl::ascii_internal::AsciiStrCaseFoldImpl<true, true>(char*, char const*, unsigned long)
Unexecuted instantiation: void absl::ascii_internal::AsciiStrCaseFoldImpl<true, false>(char*, char const*, unsigned long)
206
207
// Splitting to short and long strings to allow vectorization decisions
208
// to be made separately in the long and short cases.
209
// Using slightly different implementations so the compiler won't optimize them
210
// into the same code (the non-naive version is needed for SIMD, so for short
211
// strings it's not important).
212
// `src` may be null iff `size` is zero.
213
template <bool ToUpper>
214
constexpr void AsciiStrCaseFold(char* absl_nonnull dst,
215
0
                                const char* absl_nullable src, size_t size) {
216
0
  size < 16 ? AsciiStrCaseFoldImpl<ToUpper, /*Naive=*/true>(dst, src, size)
217
0
            : AsciiStrCaseFoldImpl<ToUpper, /*Naive=*/false>(dst, src, size);
218
0
}
Unexecuted instantiation: void absl::ascii_internal::AsciiStrCaseFold<false>(char*, char const*, unsigned long)
Unexecuted instantiation: void absl::ascii_internal::AsciiStrCaseFold<true>(char*, char const*, unsigned long)
219
220
void AsciiStrToLower(char* absl_nonnull dst, const char* absl_nullable src,
221
0
                     size_t n) {
222
0
  return AsciiStrCaseFold<false>(dst, src, n);
223
0
}
224
225
void AsciiStrToUpper(char* absl_nonnull dst, const char* absl_nullable src,
226
0
                     size_t n) {
227
0
  return AsciiStrCaseFold<true>(dst, src, n);
228
0
}
229
230
0
static constexpr size_t ValidateAsciiCasefold() {
231
0
  constexpr size_t num_chars = 1 + CHAR_MAX - CHAR_MIN;
232
0
  size_t incorrect_index = 0;
233
0
  char lowered[num_chars] = {};
234
0
  char uppered[num_chars] = {};
235
0
  for (unsigned int i = 0; i < num_chars; ++i) {
236
0
    uppered[i] = lowered[i] = static_cast<char>(i);
237
0
  }
238
0
  AsciiStrCaseFold<false>(&lowered[0], &lowered[0], num_chars);
239
0
  AsciiStrCaseFold<true>(&uppered[0], &uppered[0], num_chars);
240
0
  for (size_t i = 0; i < num_chars; ++i) {
241
0
    const char ch = static_cast<char>(i),
242
0
               ch_upper = ('a' <= ch && ch <= 'z' ? 'A' + (ch - 'a') : ch),
243
0
               ch_lower = ('A' <= ch && ch <= 'Z' ? 'a' + (ch - 'A') : ch);
244
0
    if (uppered[i] != ch_upper || lowered[i] != ch_lower) {
245
0
      incorrect_index = i > 0 ? i : num_chars;
246
0
      break;
247
0
    }
248
0
  }
249
0
  return incorrect_index;
250
0
}
251
252
static_assert(ValidateAsciiCasefold() == 0, "error in case conversion");
253
254
}  // namespace ascii_internal
255
256
0
void AsciiStrToLower(std::string* absl_nonnull s) {
257
0
  char* p = &(*s)[0];
258
0
  return ascii_internal::AsciiStrCaseFold<false>(p, p, s->size());
259
0
}
260
261
0
void AsciiStrToUpper(std::string* absl_nonnull s) {
262
0
  char* p = &(*s)[0];
263
0
  return ascii_internal::AsciiStrCaseFold<true>(p, p, s->size());
264
0
}
265
266
0
void RemoveExtraAsciiWhitespace(std::string* absl_nonnull str) {
267
0
  auto stripped = StripAsciiWhitespace(*str);
268
269
0
  if (stripped.empty()) {
270
0
    str->clear();
271
0
    return;
272
0
  }
273
274
0
  auto input_it = stripped.begin();
275
0
  auto input_end = stripped.end();
276
0
  auto output_it = &(*str)[0];
277
0
  bool is_ws = false;
278
279
0
  for (; input_it < input_end; ++input_it) {
280
0
    if (is_ws) {
281
      // Consecutive whitespace?  Keep only the last.
282
0
      is_ws = absl::ascii_isspace(static_cast<unsigned char>(*input_it));
283
0
      if (is_ws) --output_it;
284
0
    } else {
285
0
      is_ws = absl::ascii_isspace(static_cast<unsigned char>(*input_it));
286
0
    }
287
288
0
    *output_it = *input_it;
289
0
    ++output_it;
290
0
  }
291
292
0
  str->erase(static_cast<size_t>(output_it - &(*str)[0]));
293
0
}
294
295
ABSL_NAMESPACE_END
296
}  // namespace absl