Coverage Report

Created: 2025-07-11 06:37

/src/abseil-cpp/absl/strings/internal/utf8.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2017 The Abseil Authors.
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//      https://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14
15
// UTF8 utilities, implemented to reduce dependencies.
16
17
#include "absl/strings/internal/utf8.h"
18
19
#include <cstddef>
20
#include <cstdint>
21
#include <limits>
22
23
#include "absl/base/config.h"
24
25
namespace absl {
26
ABSL_NAMESPACE_BEGIN
27
namespace strings_internal {
28
29
0
// Encodes the code point `utf8_char` as UTF-8 into `buffer` and returns the
// number of bytes written (1 to 4).
//
// `buffer` must have room for the longest form, i.e. 4 bytes; nothing is
// NUL-terminated. The sequence length is chosen purely from the magnitude of
// `utf8_char`:
//   <= 0x7F    -> 1 byte   0xxxxxxx
//   <= 0x7FF   -> 2 bytes  110xxxxx 10xxxxxx
//   <= 0xFFFF  -> 3 bytes  1110xxxx 10xxxxxx 10xxxxxx
//   otherwise  -> 4 bytes  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
//
// No validation is performed: surrogate code points (U+D800..U+DFFF) are
// encoded as ordinary 3-byte sequences, and values above U+10FFFF fall into
// the 4-byte branch, where the lead byte is truncated by the cast to `char`.
// Callers that need strictly well-formed UTF-8 must validate beforehand.
size_t EncodeUTF8Char(char* buffer, char32_t utf8_char) {
  if (utf8_char <= 0x7F) {
    *buffer = static_cast<char>(utf8_char);
    return 1;
  } else if (utf8_char <= 0x7FF) {
    // Continuation bytes are filled from the end so that each step only needs
    // the low six bits; the remaining high bits end up in the lead byte.
    buffer[1] = static_cast<char>(0x80 | (utf8_char & 0x3F));
    utf8_char >>= 6;
    buffer[0] = static_cast<char>(0xC0 | utf8_char);
    return 2;
  } else if (utf8_char <= 0xFFFF) {
    buffer[2] = static_cast<char>(0x80 | (utf8_char & 0x3F));
    utf8_char >>= 6;
    buffer[1] = static_cast<char>(0x80 | (utf8_char & 0x3F));
    utf8_char >>= 6;
    buffer[0] = static_cast<char>(0xE0 | utf8_char);
    return 3;
  } else {
    buffer[3] = static_cast<char>(0x80 | (utf8_char & 0x3F));
    utf8_char >>= 6;
    buffer[2] = static_cast<char>(0x80 | (utf8_char & 0x3F));
    utf8_char >>= 6;
    buffer[1] = static_cast<char>(0x80 | (utf8_char & 0x3F));
    utf8_char >>= 6;
    buffer[0] = static_cast<char>(0xF0 | utf8_char);
    return 4;
  }
}
56
57
0
size_t WideToUtf8(wchar_t wc, char* buf, ShiftState& s) {
58
  // Reinterpret the output buffer `buf` as `unsigned char*` for subsequent
59
  // bitwise operations. This ensures well-defined behavior for bit
60
  // manipulations (avoiding issues with signed `char`) and is safe under C++
61
  // aliasing rules, as `unsigned char` can alias any type.
62
0
  auto* ubuf = reinterpret_cast<unsigned char*>(buf);
63
0
  const uint32_t v = static_cast<uint32_t>(wc);
64
0
  constexpr size_t kError = static_cast<size_t>(-1);
65
66
0
  if (v <= 0x007F) {
67
    // 1-byte sequence (U+0000 to U+007F).
68
    // 0xxxxxxx.
69
0
    ubuf[0] = (0b0111'1111 & v);
70
0
    s = {};  // Reset surrogate state.
71
0
    return 1;
72
0
  } else if (0x0080 <= v && v <= 0x07FF) {
73
    // 2-byte sequence (U+0080 to U+07FF).
74
    // 110xxxxx 10xxxxxx.
75
0
    ubuf[0] = 0b1100'0000 | (0b0001'1111 & (v >> 6));
76
0
    ubuf[1] = 0b1000'0000 | (0b0011'1111 & v);
77
0
    s = {};  // Reset surrogate state.
78
0
    return 2;
79
0
  } else if ((0x0800 <= v && v <= 0xD7FF) || (0xE000 <= v && v <= 0xFFFF)) {
80
    // 3-byte sequence (U+0800 to U+D7FF or U+E000 to U+FFFF).
81
    // Excludes surrogate code points U+D800-U+DFFF.
82
    // 1110xxxx 10xxxxxx 10xxxxxx.
83
0
    ubuf[0] = 0b1110'0000 | (0b0000'1111 & (v >> 12));
84
0
    ubuf[1] = 0b1000'0000 | (0b0011'1111 & (v >> 6));
85
0
    ubuf[2] = 0b1000'0000 | (0b0011'1111 & v);
86
0
    s = {};  // Reset surrogate state.
87
0
    return 3;
88
0
  } else if (0xD800 <= v && v <= 0xDBFF) {
89
    // High Surrogate (U+D800 to U+DBFF).
90
    // This part forms the first two bytes of an eventual 4-byte UTF-8 sequence.
91
0
    const unsigned char high_bits_val = (0b0000'1111 & (v >> 6)) + 1;
92
93
    // First byte of the 4-byte UTF-8 sequence (11110xxx).
94
0
    ubuf[0] = 0b1111'0000 | (0b0000'0111 & (high_bits_val >> 2));
95
    // Second byte of the 4-byte UTF-8 sequence (10xxxxxx).
96
0
    ubuf[1] = 0b1000'0000 |                           //
97
0
              (0b0011'0000 & (high_bits_val << 4)) |  //
98
0
              (0b0000'1111 & (v >> 2));
99
    // Set state for high surrogate after writing to buffer.
100
0
    s = {true, static_cast<unsigned char>(0b0000'0011 & v)};
101
0
    return 2;  // Wrote 2 bytes, expecting 2 more from a low surrogate.
102
0
  } else if (0xDC00 <= v && v <= 0xDFFF) {
103
    // Low Surrogate (U+DC00 to U+DFFF).
104
    // This part forms the last two bytes of a 4-byte UTF-8 sequence,
105
    // using state from a preceding high surrogate.
106
0
    if (!s.saw_high_surrogate) {
107
      // Error: Isolated low surrogate without a preceding high surrogate.
108
      // s remains in its current (problematic) state.
109
      // Caller should handle error.
110
0
      return kError;
111
0
    }
112
113
    // Third byte of the 4-byte UTF-8 sequence (10xxxxxx).
114
0
    ubuf[0] = 0b1000'0000 |                    //
115
0
              (0b0011'0000 & (s.bits << 4)) |  //
116
0
              (0b0000'1111 & (v >> 6));
117
    // Fourth byte of the 4-byte UTF-8 sequence (10xxxxxx).
118
0
    ubuf[1] = 0b1000'0000 | (0b0011'1111 & v);
119
120
0
    s = {};    // Reset surrogate state, pair complete.
121
0
    return 2;  // Wrote 2 more bytes, completing the 4-byte sequence.
122
0
  } else if constexpr (0xFFFF < std::numeric_limits<wchar_t>::max()) {
123
    // Conditionally compile the 4-byte direct conversion branch.
124
    // This block is compiled only if wchar_t can represent values > 0xFFFF.
125
    // It's placed after surrogate checks to ensure surrogates are handled by
126
    // their specific logic. This inner 'if' is the runtime check for the 4-byte
127
    // range. At this point, v is known not to be in the 1, 2, or 3-byte BMP
128
    // ranges, nor is it a surrogate code point.
129
0
    if (0x10000 <= v && v <= 0x10FFFF) {
130
      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
131
0
      ubuf[0] = 0b1111'0000 | (0b0000'0111 & (v >> 18));
132
0
      ubuf[1] = 0b1000'0000 | (0b0011'1111 & (v >> 12));
133
0
      ubuf[2] = 0b1000'0000 | (0b0011'1111 & (v >> 6));
134
0
      ubuf[3] = 0b1000'0000 | (0b0011'1111 & v);
135
0
      s = {};  // Reset surrogate state.
136
0
      return 4;
137
0
    }
138
0
  }
139
140
  // Invalid wchar_t value (e.g., out of Unicode range, or unhandled after all
141
  // checks).
142
0
  s = {};  // Reset surrogate state.
143
0
  return kError;
144
0
}
145
146
}  // namespace strings_internal
147
ABSL_NAMESPACE_END
148
}  // namespace absl