Coverage Report

Created: 2023-05-25 06:18

/proc/self/cwd/internal/utf8.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2021 Google LLC
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//     https://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14
15
#include "internal/utf8.h"
16
17
#include <algorithm>
18
#include <cstdint>
19
#include <string>
20
21
#include "absl/base/macros.h"
22
#include "absl/base/optimization.h"
23
#include "internal/unicode.h"
24
25
// Implementation is based on
26
// https://go.googlesource.com/go/+/refs/heads/master/src/unicode/utf8/utf8.go
27
// but adapted for C++.
28
29
namespace cel::internal {
30
31
namespace {
32
33
// Bytes strictly below this value are handled as complete one-byte code
// points by every decoder/validator in this file.
constexpr uint8_t kUtf8RuneSelf = 0x80;
34
// Lower bound used by CordReader when reserving its scratch buffer.
constexpr size_t kUtf8Max = 4;
35
36
// Inclusive range that the third and fourth bytes of a sequence must fall in;
// it is also the default range for the second byte (see kAccept).
constexpr uint8_t kLow = 0x80;
37
constexpr uint8_t kHigh = 0xbf;
38
39
// Payload masks applied by Utf8Decode: kMaskX to every trailing byte, and
// kMask2/kMask3/kMask4 to the lead byte of a 2/3/4-byte sequence.
constexpr uint8_t kMaskX = 0x3f;
40
constexpr uint8_t kMask2 = 0x1f;
41
constexpr uint8_t kMask3 = 0xf;
42
constexpr uint8_t kMask4 = 0x7;
43
44
// Tag bits OR-ed in by Utf8Encode: kTX on every trailing byte, and
// kT2/kT3/kT4 on the lead byte of a 2/3/4-byte sequence.
constexpr uint8_t kTX = 0x80;
45
constexpr uint8_t kT2 = 0xc0;
46
constexpr uint8_t kT3 = 0xe0;
47
constexpr uint8_t kT4 = 0xf0;
48
49
// Values stored in kLeading. For the kS* entries the low three bits hold the
// total sequence length in bytes (callers subtract one to get the number of
// bytes that follow the lead byte) and the high four bits hold the index into
// kAccept that constrains the second byte.
// kXX marks a byte that cannot start a sequence; callers test for it before
// using either field.
constexpr uint8_t kXX = 0xf1;
50
// kAS fills the table slots below kUtf8RuneSelf; callers branch on
// `b < kUtf8RuneSelf` before consulting the table, so it is never decoded.
constexpr uint8_t kAS = 0xf0;
51
constexpr uint8_t kS1 = 0x02;
52
constexpr uint8_t kS2 = 0x13;
53
constexpr uint8_t kS3 = 0x03;
54
constexpr uint8_t kS4 = 0x23;
55
constexpr uint8_t kS5 = 0x34;
56
constexpr uint8_t kS6 = 0x04;
57
constexpr uint8_t kS7 = 0x44;
58
59
// NOLINTBEGIN
60
// clang-format off
61
// Classification of every possible first byte of a sequence, indexed by the
// byte value. Callers compare the entry against kXX (reject), use
// `(entry & 7) - 1` as the number of bytes that must follow, and use
// `entry >> 4` as the index into kAccept for validating the second byte.
constexpr uint8_t kLeading[256] = {
62
  //   1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
63
  kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, // 0x00-0x0F
64
  kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, // 0x10-0x1F
65
  kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, // 0x20-0x2F
66
  kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, // 0x30-0x3F
67
  kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, // 0x40-0x4F
68
  kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, // 0x50-0x5F
69
  kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, // 0x60-0x6F
70
  kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, kAS, // 0x70-0x7F
71
  //   1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
72
  kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, // 0x80-0x8F
73
  kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, // 0x90-0x9F
74
  kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, // 0xA0-0xAF
75
  kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, // 0xB0-0xBF
76
  kXX, kXX, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, // 0xC0-0xCF
77
  kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, kS1, // 0xD0-0xDF
78
  kS2, kS3, kS3, kS3, kS3, kS3, kS3, kS3, kS3, kS3, kS3, kS3, kS3, kS4, kS3, kS3, // 0xE0-0xEF
79
  kS5, kS6, kS6, kS6, kS7, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, kXX, // 0xF0-0xFF
80
};
81
// clang-format on
82
// NOLINTEND
83
84
// Inclusive {lowest, highest} value permitted for the byte immediately after
// the lead byte, selected by `kLeading[lead] >> 4`. Only indices 0-4 are
// referenced by the kS* constants; the remaining slots are {0, 0} filler.
constexpr std::pair<uint8_t, uint8_t> kAccept[16] = {
85
    {kLow, kHigh}, {0xa0, kHigh}, {kLow, 0x9f}, {0x90, kHigh},
86
    {kLow, 0x8f},  {0x0, 0x0},    {0x0, 0x0},   {0x0, 0x0},
87
    {0x0, 0x0},    {0x0, 0x0},    {0x0, 0x0},   {0x0, 0x0},
88
    {0x0, 0x0},    {0x0, 0x0},    {0x0, 0x0},   {0x0, 0x0},
89
};
90
91
class StringReader final {
92
 public:
93
495k
  constexpr explicit StringReader(absl::string_view input) : input_(input) {}
94
95
6.64k
  size_t Remaining() const { return input_.size(); }
96
97
5.35M
  bool HasRemaining() const { return !input_.empty(); }
98
99
6.64k
  absl::string_view Peek(size_t n) {
100
6.64k
    ABSL_ASSERT(n <= Remaining());
101
6.64k
    return input_.substr(0, n);
102
6.64k
  }
103
104
4.86M
  char Read() {
105
4.86M
    ABSL_ASSERT(HasRemaining());
106
4.86M
    char value = input_.front();
107
4.86M
    input_.remove_prefix(1);
108
4.86M
    return value;
109
4.86M
  }
110
111
6.64k
  void Advance(size_t n) {
112
6.64k
    ABSL_ASSERT(n <= Remaining());
113
6.64k
    input_.remove_prefix(n);
114
6.64k
  }
115
116
0
  void Reset(absl::string_view input) { input_ = input; }
117
118
 private:
119
  absl::string_view input_;
120
};
121
122
class CordReader final {
123
 public:
124
  explicit CordReader(const absl::Cord& input)
125
0
      : input_(input), size_(input_.size()), buffer_(), index_(0) {}
126
127
0
  size_t Remaining() const { return size_; }
128
129
0
  bool HasRemaining() const { return size_ != 0; }
130
131
0
  absl::string_view Peek(size_t n) {
132
0
    ABSL_ASSERT(n <= Remaining());
133
0
    if (n == 0) {
134
0
      return absl::string_view();
135
0
    }
136
0
    if (n <= buffer_.size() - index_) {
137
      // Enough data remaining in temporary buffer.
138
0
      return absl::string_view(buffer_.data() + index_, n);
139
0
    }
140
    // We do not have enough data. See if we can fit it without allocating by
141
    // shifting data back to the beginning of the buffer.
142
0
    if (buffer_.capacity() >= n) {
143
      // It will fit in the current capacity, see if we need to shift the
144
      // existing data to make it fit.
145
0
      if (buffer_.capacity() - buffer_.size() < n && index_ != 0) {
146
        // We need to shift.
147
0
        buffer_.erase(buffer_.begin(), buffer_.begin() + index_);
148
0
        index_ = 0;
149
0
      }
150
0
    }
151
    // Ensure we never reserve less than kUtf8Max.
152
0
    buffer_.reserve(std::max(buffer_.size() + n, kUtf8Max));
153
0
    size_t to_copy = n - (buffer_.size() - index_);
154
0
    absl::CopyCordToString(input_.Subcord(0, to_copy), &buffer_);
155
0
    input_.RemovePrefix(to_copy);
156
0
    return absl::string_view(buffer_.data() + index_, n);
157
0
  }
158
159
0
  char Read() {
160
0
    char value = Peek(1).front();
161
0
    Advance(1);
162
0
    return value;
163
0
  }
164
165
0
  void Advance(size_t n) {
166
0
    ABSL_ASSERT(n <= Remaining());
167
0
    if (n == 0) {
168
0
      return;
169
0
    }
170
0
    if (index_ < buffer_.size()) {
171
0
      size_t count = std::min(n, buffer_.size() - index_);
172
0
      index_ += count;
173
0
      n -= count;
174
0
      size_ -= count;
175
0
      if (index_ < buffer_.size()) {
176
0
        return;
177
0
      }
178
      // Temporary buffer is empty, clear it.
179
0
      buffer_.clear();
180
0
      index_ = 0;
181
0
    }
182
0
    input_.RemovePrefix(n);
183
0
    size_ -= n;
184
0
  }
185
186
0
  void Reset(const absl::Cord& input) {
187
0
    input_ = input;
188
0
    size_ = input_.size();
189
0
    buffer_.clear();
190
0
    index_ = 0;
191
0
  }
192
193
 private:
194
  absl::Cord input_;
195
  size_t size_;
196
  std::string buffer_;
197
  size_t index_;
198
};
199
200
template <typename BufferedByteReader>
201
495k
bool Utf8IsValidImpl(BufferedByteReader* reader) {
202
5.35M
  while (reader->HasRemaining()) {
203
4.86M
    const auto b = static_cast<uint8_t>(reader->Read());
204
4.86M
    if (b < kUtf8RuneSelf) {
205
4.85M
      continue;
206
4.85M
    }
207
6.64k
    const auto leading = kLeading[b];
208
6.64k
    if (leading == kXX) {
209
0
      return false;
210
0
    }
211
6.64k
    const auto size = static_cast<size_t>(leading & 7) - 1;
212
6.64k
    if (size > reader->Remaining()) {
213
0
      return false;
214
0
    }
215
6.64k
    const absl::string_view segment = reader->Peek(size);
216
6.64k
    const auto& accept = kAccept[leading >> 4];
217
6.64k
    if (static_cast<uint8_t>(segment[0]) < accept.first ||
218
6.64k
        static_cast<uint8_t>(segment[0]) > accept.second) {
219
0
      return false;
220
6.64k
    } else if (size == 1) {
221
5.16k
    } else if (static_cast<uint8_t>(segment[1]) < kLow ||
222
5.16k
               static_cast<uint8_t>(segment[1]) > kHigh) {
223
0
      return false;
224
5.16k
    } else if (size == 2) {
225
3.78k
    } else if (static_cast<uint8_t>(segment[2]) < kLow ||
226
1.38k
               static_cast<uint8_t>(segment[2]) > kHigh) {
227
0
      return false;
228
0
    }
229
6.64k
    reader->Advance(size);
230
6.64k
  }
231
495k
  return true;
232
495k
}
utf8.cc:bool cel::internal::(anonymous namespace)::Utf8IsValidImpl<cel::internal::(anonymous namespace)::StringReader>(cel::internal::(anonymous namespace)::StringReader*)
Line
Count
Source
201
495k
bool Utf8IsValidImpl(BufferedByteReader* reader) {
202
5.35M
  while (reader->HasRemaining()) {
203
4.86M
    const auto b = static_cast<uint8_t>(reader->Read());
204
4.86M
    if (b < kUtf8RuneSelf) {
205
4.85M
      continue;
206
4.85M
    }
207
6.64k
    const auto leading = kLeading[b];
208
6.64k
    if (leading == kXX) {
209
0
      return false;
210
0
    }
211
6.64k
    const auto size = static_cast<size_t>(leading & 7) - 1;
212
6.64k
    if (size > reader->Remaining()) {
213
0
      return false;
214
0
    }
215
6.64k
    const absl::string_view segment = reader->Peek(size);
216
6.64k
    const auto& accept = kAccept[leading >> 4];
217
6.64k
    if (static_cast<uint8_t>(segment[0]) < accept.first ||
218
6.64k
        static_cast<uint8_t>(segment[0]) > accept.second) {
219
0
      return false;
220
6.64k
    } else if (size == 1) {
221
5.16k
    } else if (static_cast<uint8_t>(segment[1]) < kLow ||
222
5.16k
               static_cast<uint8_t>(segment[1]) > kHigh) {
223
0
      return false;
224
5.16k
    } else if (size == 2) {
225
3.78k
    } else if (static_cast<uint8_t>(segment[2]) < kLow ||
226
1.38k
               static_cast<uint8_t>(segment[2]) > kHigh) {
227
0
      return false;
228
0
    }
229
6.64k
    reader->Advance(size);
230
6.64k
  }
231
495k
  return true;
232
495k
}
Unexecuted instantiation: utf8.cc:bool cel::internal::(anonymous namespace)::Utf8IsValidImpl<cel::internal::(anonymous namespace)::CordReader>(cel::internal::(anonymous namespace)::CordReader*)
233
234
template <typename BufferedByteReader>
235
0
size_t Utf8CodePointCountImpl(BufferedByteReader* reader) {
236
0
  size_t count = 0;
237
0
  while (reader->HasRemaining()) {
238
0
    count++;
239
0
    const auto b = static_cast<uint8_t>(reader->Read());
240
0
    if (b < kUtf8RuneSelf) {
241
0
      continue;
242
0
    }
243
0
    const auto leading = kLeading[b];
244
0
    if (leading == kXX) {
245
0
      continue;
246
0
    }
247
0
    auto size = static_cast<size_t>(leading & 7) - 1;
248
0
    if (size > reader->Remaining()) {
249
0
      continue;
250
0
    }
251
0
    const absl::string_view segment = reader->Peek(size);
252
0
    const auto& accept = kAccept[leading >> 4];
253
0
    if (static_cast<uint8_t>(segment[0]) < accept.first ||
254
0
        static_cast<uint8_t>(segment[0]) > accept.second) {
255
0
      size = 0;
256
0
    } else if (size == 1) {
257
0
    } else if (static_cast<uint8_t>(segment[1]) < kLow ||
258
0
               static_cast<uint8_t>(segment[1]) > kHigh) {
259
0
      size = 0;
260
0
    } else if (size == 2) {
261
0
    } else if (static_cast<uint8_t>(segment[2]) < kLow ||
262
0
               static_cast<uint8_t>(segment[2]) > kHigh) {
263
0
      size = 0;
264
0
    }
265
0
    reader->Advance(size);
266
0
  }
267
0
  return count;
268
0
}
Unexecuted instantiation: utf8.cc:unsigned long cel::internal::(anonymous namespace)::Utf8CodePointCountImpl<cel::internal::(anonymous namespace)::StringReader>(cel::internal::(anonymous namespace)::StringReader*)
Unexecuted instantiation: utf8.cc:unsigned long cel::internal::(anonymous namespace)::Utf8CodePointCountImpl<cel::internal::(anonymous namespace)::CordReader>(cel::internal::(anonymous namespace)::CordReader*)
269
270
template <typename BufferedByteReader>
271
0
std::pair<size_t, bool> Utf8ValidateImpl(BufferedByteReader* reader) {
272
0
  size_t count = 0;
273
0
  while (reader->HasRemaining()) {
274
0
    const auto b = static_cast<uint8_t>(reader->Read());
275
0
    if (b < kUtf8RuneSelf) {
276
0
      count++;
277
0
      continue;
278
0
    }
279
0
    const auto leading = kLeading[b];
280
0
    if (leading == kXX) {
281
0
      return {count, false};
282
0
    }
283
0
    const auto size = static_cast<size_t>(leading & 7) - 1;
284
0
    if (size > reader->Remaining()) {
285
0
      return {count, false};
286
0
    }
287
0
    const absl::string_view segment = reader->Peek(size);
288
0
    const auto& accept = kAccept[leading >> 4];
289
0
    if (static_cast<uint8_t>(segment[0]) < accept.first ||
290
0
        static_cast<uint8_t>(segment[0]) > accept.second) {
291
0
      return {count, false};
292
0
    } else if (size == 1) {
293
0
      count++;
294
0
    } else if (static_cast<uint8_t>(segment[1]) < kLow ||
295
0
               static_cast<uint8_t>(segment[1]) > kHigh) {
296
0
      return {count, false};
297
0
    } else if (size == 2) {
298
0
      count++;
299
0
    } else if (static_cast<uint8_t>(segment[2]) < kLow ||
300
0
               static_cast<uint8_t>(segment[2]) > kHigh) {
301
0
      return {count, false};
302
0
    } else {
303
0
      count++;
304
0
    }
305
0
    reader->Advance(size);
306
0
  }
307
0
  return {count, true};
308
0
}
Unexecuted instantiation: utf8.cc:std::__1::pair<unsigned long, bool> cel::internal::(anonymous namespace)::Utf8ValidateImpl<cel::internal::(anonymous namespace)::StringReader>(cel::internal::(anonymous namespace)::StringReader*)
Unexecuted instantiation: utf8.cc:std::__1::pair<unsigned long, bool> cel::internal::(anonymous namespace)::Utf8ValidateImpl<cel::internal::(anonymous namespace)::CordReader>(cel::internal::(anonymous namespace)::CordReader*)
309
310
}  // namespace
311
312
495k
// Returns whether Utf8IsValidImpl accepts all of `str`.
bool Utf8IsValid(absl::string_view str) {
  StringReader reader(str);
  const bool is_valid = Utf8IsValidImpl(&reader);
  // Where ABSL_ASSERT is active, rewind and confirm that the counting
  // validator reaches the same verdict.
  ABSL_ASSERT(
      (reader.Reset(str), is_valid == Utf8ValidateImpl(&reader).second));
  return is_valid;
}
318
319
0
// Returns whether Utf8IsValidImpl accepts all of `str`.
bool Utf8IsValid(const absl::Cord& str) {
  CordReader reader(str);
  const bool is_valid = Utf8IsValidImpl(&reader);
  // Where ABSL_ASSERT is active, rewind and confirm that the counting
  // validator reaches the same verdict.
  ABSL_ASSERT(
      (reader.Reset(str), is_valid == Utf8ValidateImpl(&reader).second));
  return is_valid;
}
325
326
0
// Returns the number of code points in `str` as computed by
// Utf8CodePointCountImpl.
size_t Utf8CodePointCount(absl::string_view str) {
  StringReader source(str);
  const size_t total = Utf8CodePointCountImpl(&source);
  return total;
}
330
331
0
// Returns the number of code points in `str` as computed by
// Utf8CodePointCountImpl.
size_t Utf8CodePointCount(const absl::Cord& str) {
  CordReader source(str);
  const size_t total = Utf8CodePointCountImpl(&source);
  return total;
}
335
336
0
// Validates `str` and counts its code points in one pass; returns the
// {count, ok} pair produced by Utf8ValidateImpl.
std::pair<size_t, bool> Utf8Validate(absl::string_view str) {
  StringReader reader(str);
  const std::pair<size_t, bool> outcome = Utf8ValidateImpl(&reader);
  // Where ABSL_ASSERT is active, rewind and confirm that the plain validator
  // reaches the same verdict.
  ABSL_ASSERT(
      (reader.Reset(str), outcome.second == Utf8IsValidImpl(&reader)));
  return outcome;
}
342
343
0
// Validates `str` and counts its code points in one pass; returns the
// {count, ok} pair produced by Utf8ValidateImpl.
std::pair<size_t, bool> Utf8Validate(const absl::Cord& str) {
  CordReader reader(str);
  const std::pair<size_t, bool> outcome = Utf8ValidateImpl(&reader);
  // Where ABSL_ASSERT is active, rewind and confirm that the plain validator
  // reaches the same verdict.
  ABSL_ASSERT(
      (reader.Reset(str), outcome.second == Utf8IsValidImpl(&reader)));
  return outcome;
}
349
350
100M
// Decodes the code point at the start of `str`, which must not be empty.
//
// Returns {code point, bytes consumed}. When the leading bytes are rejected
// by the kLeading/kAccept tables, or the input ends mid-sequence, the result
// is {kUnicodeReplacementCharacter, 1} so the caller can skip one byte and
// resume.
std::pair<char32_t, size_t> Utf8Decode(absl::string_view str) {
  ABSL_ASSERT(!str.empty());
  const auto lead = static_cast<uint8_t>(str.front());
  if (lead < kUtf8RuneSelf) {
    // Single-byte code point.
    return {static_cast<char32_t>(lead), 1};
  }
  const uint8_t info = kLeading[lead];
  if (info == kXX) {
    return {kUnicodeReplacementCharacter, 1};
  }
  // Everything after the lead byte, and how many of those bytes are needed.
  const absl::string_view rest = str.substr(1);
  const size_t trailing = static_cast<size_t>(info & 7) - 1;
  if (trailing > rest.size()) {
    return {kUnicodeReplacementCharacter, 1};
  }

  // Second byte: range depends on the lead byte.
  const auto& range = kAccept[info >> 4];
  const auto c1 = static_cast<uint8_t>(rest[0]);
  if (c1 < range.first || c1 > range.second) {
    return {kUnicodeReplacementCharacter, 1};
  }
  if (trailing <= 1) {
    const char32_t code_point = (static_cast<char32_t>(lead & kMask2) << 6) |
                                static_cast<char32_t>(c1 & kMaskX);
    return {code_point, 2};
  }

  // Third byte.
  const auto c2 = static_cast<uint8_t>(rest[1]);
  if (c2 < kLow || c2 > kHigh) {
    return {kUnicodeReplacementCharacter, 1};
  }
  if (trailing <= 2) {
    const char32_t code_point = (static_cast<char32_t>(lead & kMask3) << 12) |
                                (static_cast<char32_t>(c1 & kMaskX) << 6) |
                                static_cast<char32_t>(c2 & kMaskX);
    return {code_point, 3};
  }

  // Fourth byte.
  const auto c3 = static_cast<uint8_t>(rest[2]);
  if (c3 < kLow || c3 > kHigh) {
    return {kUnicodeReplacementCharacter, 1};
  }
  const char32_t code_point = (static_cast<char32_t>(lead & kMask4) << 18) |
                              (static_cast<char32_t>(c1 & kMaskX) << 12) |
                              (static_cast<char32_t>(c2 & kMaskX) << 6) |
                              static_cast<char32_t>(c3 & kMaskX);
  return {code_point, 4};
}
398
399
14.6M
std::string& Utf8Encode(std::string* buffer, char32_t code_point) {
400
14.6M
  ABSL_ASSERT(buffer != nullptr);
401
14.6M
  if (!UnicodeIsValid(code_point)) {
402
0
    code_point = kUnicodeReplacementCharacter;
403
0
  }
404
14.6M
  if (code_point <= 0x7f) {
405
14.6M
    buffer->push_back(static_cast<char>(static_cast<uint8_t>(code_point)));
406
14.6M
  } else if (code_point <= 0x7ff) {
407
7.12k
    buffer->push_back(
408
7.12k
        static_cast<char>(kT2 | static_cast<uint8_t>(code_point >> 6)));
409
7.12k
    buffer->push_back(
410
7.12k
        static_cast<char>(kTX | (static_cast<uint8_t>(code_point) & kMaskX)));
411
11.8k
  } else if (code_point <= 0xffff) {
412
7.09k
    buffer->push_back(
413
7.09k
        static_cast<char>(kT3 | static_cast<uint8_t>(code_point >> 12)));
414
7.09k
    buffer->push_back(static_cast<char>(
415
7.09k
        kTX | (static_cast<uint8_t>(code_point >> 6) & kMaskX)));
416
7.09k
    buffer->push_back(
417
7.09k
        static_cast<char>(kTX | (static_cast<uint8_t>(code_point) & kMaskX)));
418
7.09k
  } else {
419
4.73k
    buffer->push_back(
420
4.73k
        static_cast<char>(kT4 | static_cast<uint8_t>(code_point >> 18)));
421
4.73k
    buffer->push_back(static_cast<char>(
422
4.73k
        kTX | (static_cast<uint8_t>(code_point >> 12) & kMaskX)));
423
4.73k
    buffer->push_back(static_cast<char>(
424
4.73k
        kTX | (static_cast<uint8_t>(code_point >> 6) & kMaskX)));
425
4.73k
    buffer->push_back(
426
4.73k
        static_cast<char>(kTX | (static_cast<uint8_t>(code_point) & kMaskX)));
427
4.73k
  }
428
14.6M
  return *buffer;
429
14.6M
}
430
431
}  // namespace cel::internal