Coverage Report

Created: 2023-09-25 06:27

/src/abseil-cpp/absl/strings/ascii.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2017 The Abseil Authors.
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//      https://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14
15
#include "absl/strings/ascii.h"
16
17
#include <climits>
18
#include <cstring>
19
#include <string>
20
21
#include "absl/base/config.h"
22
23
namespace absl {
24
ABSL_NAMESPACE_BEGIN
25
namespace ascii_internal {
26
27
// # Table generated by this Python code (bit 0x02 is currently unused):
28
// TODO(mbar) Move Python code for generation of table to BUILD and link here.
29
30
// NOTE: The kAsciiPropertyBits table used within this code was generated by
31
// Python code of the following form. (Bit 0x02 is currently unused and
32
// available.)
33
//
34
// def Hex2(n):
35
//   return '0x' + hex(n/16)[2:] + hex(n%16)[2:]
36
// def IsPunct(ch):
37
//   return (ord(ch) >= 32 and ord(ch) < 127 and
38
//           not ch.isspace() and not ch.isalnum())
39
// def IsBlank(ch):
40
//   return ch in ' \t'
41
// def IsCntrl(ch):
42
//   return ord(ch) < 32 or ord(ch) == 127
43
// def IsXDigit(ch):
44
//   return ch.isdigit() or ch.lower() in 'abcdef'
45
// for i in range(128):
46
//   ch = chr(i)
47
//   mask = ((ch.isalpha() and 0x01 or 0) |
48
//           (ch.isalnum() and 0x04 or 0) |
49
//           (ch.isspace() and 0x08 or 0) |
50
//           (IsPunct(ch) and 0x10 or 0) |
51
//           (IsBlank(ch) and 0x20 or 0) |
52
//           (IsCntrl(ch) and 0x40 or 0) |
53
//           (IsXDigit(ch) and 0x80 or 0))
54
//   print Hex2(mask) + ',',
55
//   if i % 16 == 7:
56
//     print ' //', Hex2(i & 0x78)
57
//   elif i % 16 == 15:
58
//     print
59
60
// clang-format off
61
// Array of bitfields holding character information. Each bit value corresponds
62
// to a particular character feature. For readability, and because the value
63
// of these bits is tightly coupled to this implementation, the individual bits
64
// are not named. Note that bitfields for all characters above ASCII 127 are
65
// zero-initialized.
66
ABSL_DLL const unsigned char kPropertyBits[256] = {
67
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x00
68
    0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40,
69
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x10
70
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
71
    0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x20
72
    0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
73
    0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,  // 0x30
74
    0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
75
    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x40
76
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
77
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x50
78
    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10,
79
    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x60
80
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
81
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x70
82
    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40,
83
};
84
85
// Array of characters for the ascii_tolower() function. For values 'A'
86
// through 'Z', return the lower-case character; otherwise, return the
87
// identity of the passed character.
88
ABSL_DLL const char kToLower[256] = {
89
  '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
90
  '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
91
  '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
92
  '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
93
  '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
94
  '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
95
  '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
96
  '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
97
  '\x40',    'a',    'b',    'c',    'd',    'e',    'f',    'g',
98
     'h',    'i',    'j',    'k',    'l',    'm',    'n',    'o',
99
     'p',    'q',    'r',    's',    't',    'u',    'v',    'w',
100
     'x',    'y',    'z', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
101
  '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',
102
  '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
103
  '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',
104
  '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
105
  '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
106
  '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
107
  '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
108
  '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
109
  '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
110
  '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
111
  '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
112
  '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
113
  '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
114
  '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
115
  '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
116
  '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
117
  '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
118
  '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
119
  '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
120
  '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
121
};
122
123
// Array of characters for the ascii_toupper() function. For values 'a'
124
// through 'z', return the upper-case character; otherwise, return the
125
// identity of the passed character.
126
ABSL_DLL const char kToUpper[256] = {
127
  '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
128
  '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
129
  '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
130
  '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
131
  '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
132
  '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
133
  '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
134
  '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
135
  '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47',
136
  '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f',
137
  '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57',
138
  '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
139
  '\x60',    'A',    'B',    'C',    'D',    'E',    'F',    'G',
140
     'H',    'I',    'J',    'K',    'L',    'M',    'N',    'O',
141
     'P',    'Q',    'R',    'S',    'T',    'U',    'V',    'W',
142
     'X',    'Y',    'Z', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
143
  '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
144
  '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
145
  '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
146
  '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
147
  '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
148
  '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
149
  '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
150
  '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
151
  '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
152
  '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
153
  '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
154
  '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
155
  '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
156
  '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
157
  '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
158
  '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
159
};
160
// clang-format on
161
162
template <bool ToUpper>
163
0
constexpr void AsciiStrCaseFold(char* p, char* end) {
164
  // The upper- and lowercase versions of ASCII characters differ by only 1 bit.
165
  // When we need to flip the case, we can xor with this bit to achieve the
166
  // desired result. Note that the choice of 'a' and 'A' here is arbitrary. We
167
  // could have chosen 'z' and 'Z', or any other pair of characters as they all
168
  // have the same single bit difference.
169
0
  constexpr unsigned char kAsciiCaseBitFlip = 'a' ^ 'A';
170
171
0
  constexpr char ch_a = ToUpper ? 'a' : 'A';
172
0
  constexpr char ch_z = ToUpper ? 'z' : 'Z';
173
0
  for (; p < end; ++p) {
174
0
    unsigned char v = static_cast<unsigned char>(*p);
175
    // We use & instead of && to ensure this always stays branchless
176
    // We use static_cast<int> to suppress -Wbitwise-instead-of-logical
177
0
    bool is_in_range = static_cast<bool>(static_cast<int>(ch_a <= v) &
178
0
                                         static_cast<int>(v <= ch_z));
179
0
    v ^= is_in_range ? kAsciiCaseBitFlip : 0;
180
0
    *p = static_cast<char>(v);
181
0
  }
182
0
}
Unexecuted instantiation: void absl::ascii_internal::AsciiStrCaseFold<false>(char*, char*)
Unexecuted instantiation: void absl::ascii_internal::AsciiStrCaseFold<true>(char*, char*)
183
184
0
static constexpr size_t ValidateAsciiCasefold() {
185
0
  constexpr size_t num_chars = 1 + CHAR_MAX - CHAR_MIN;
186
0
  size_t incorrect_index = 0;
187
0
  char lowered[num_chars] = {};
188
0
  char uppered[num_chars] = {};
189
0
  for (unsigned int i = 0; i < num_chars; ++i) {
190
0
    uppered[i] = lowered[i] = static_cast<char>(i);
191
0
  }
192
0
  AsciiStrCaseFold<false>(&lowered[0], &lowered[num_chars]);
193
0
  AsciiStrCaseFold<true>(&uppered[0], &uppered[num_chars]);
194
0
  for (size_t i = 0; i < num_chars; ++i) {
195
0
    const char ch = static_cast<char>(i),
196
0
               ch_upper = ('a' <= ch && ch <= 'z' ? 'A' + (ch - 'a') : ch),
197
0
               ch_lower = ('A' <= ch && ch <= 'Z' ? 'a' + (ch - 'A') : ch);
198
0
    if (uppered[i] != ch_upper || lowered[i] != ch_lower) {
199
0
      incorrect_index = i > 0 ? i : num_chars;
200
0
      break;
201
0
    }
202
0
  }
203
0
  return incorrect_index;
204
0
}
205
206
static_assert(ValidateAsciiCasefold() == 0, "error in case conversion");
207
208
}  // namespace ascii_internal
209
210
0
void AsciiStrToLower(std::string* s) {
211
0
  char* p = &(*s)[0];  // Guaranteed to be valid for empty strings
212
0
  return ascii_internal::AsciiStrCaseFold<false>(p, p + s->size());
213
0
}
214
215
0
void AsciiStrToUpper(std::string* s) {
216
0
  char* p = &(*s)[0];  // Guaranteed to be valid for empty strings
217
0
  return ascii_internal::AsciiStrCaseFold<true>(p, p + s->size());
218
0
}
219
220
0
void RemoveExtraAsciiWhitespace(std::string* str) {
221
0
  auto stripped = StripAsciiWhitespace(*str);
222
223
0
  if (stripped.empty()) {
224
0
    str->clear();
225
0
    return;
226
0
  }
227
228
0
  auto input_it = stripped.begin();
229
0
  auto input_end = stripped.end();
230
0
  auto output_it = &(*str)[0];
231
0
  bool is_ws = false;
232
233
0
  for (; input_it < input_end; ++input_it) {
234
0
    if (is_ws) {
235
      // Consecutive whitespace?  Keep only the last.
236
0
      is_ws = absl::ascii_isspace(static_cast<unsigned char>(*input_it));
237
0
      if (is_ws) --output_it;
238
0
    } else {
239
0
      is_ws = absl::ascii_isspace(static_cast<unsigned char>(*input_it));
240
0
    }
241
242
0
    *output_it = *input_it;
243
0
    ++output_it;
244
0
  }
245
246
0
  str->erase(static_cast<size_t>(output_it - &(*str)[0]));
247
0
}
248
249
ABSL_NAMESPACE_END
250
}  // namespace absl