/src/abseil-cpp/absl/strings/internal/utf8.cc

Source
// Copyright 2017 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// UTF8 utilities, implemented to reduce dependencies.

#include "absl/strings/internal/utf8.h"

#include <cstddef>
#include <cstdint>
#include <limits>

#include "absl/base/config.h"

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace strings_internal {

// Writes the UTF-8 encoding of `utf8_char` into `buffer` and returns the
// number of bytes written (1 to 4). The sequence length is chosen purely from
// the magnitude of the value:
//   <= 0x7F    -> 1 byte   0xxxxxxx
//   <= 0x7FF   -> 2 bytes  110xxxxx 10xxxxxx
//   <= 0xFFFF  -> 3 bytes  1110xxxx 10xxxxxx 10xxxxxx
//   otherwise  -> 4 bytes  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// No validation is performed on the input value, and the function writes up
// to four bytes through `buffer` without any bounds check.
size_t EncodeUTF8Char(char* buffer, char32_t utf8_char) {
  // Produces a continuation byte (10xxxxxx) from the six bits of the input
  // that start at bit position `shift`.
  const auto continuation = [utf8_char](int shift) {
    return static_cast<char>(0x80 | ((utf8_char >> shift) & 0x3F));
  };

  if (utf8_char <= 0x7F) {
    buffer[0] = static_cast<char>(utf8_char);
    return 1;
  }

  if (utf8_char <= 0x7FF) {
    buffer[0] = static_cast<char>(0xC0 | (utf8_char >> 6));
    buffer[1] = continuation(0);
    return 2;
  }

  if (utf8_char <= 0xFFFF) {
    buffer[0] = static_cast<char>(0xE0 | (utf8_char >> 12));
    buffer[1] = continuation(6);
    buffer[2] = continuation(0);
    return 3;
  }

  // Everything above 0xFFFF is emitted as a four-byte sequence.
  buffer[0] = static_cast<char>(0xF0 | (utf8_char >> 18));
  buffer[1] = continuation(12);
  buffer[2] = continuation(6);
  buffer[3] = continuation(0);
  return 4;
}

// Converts a single wide character `wc` to UTF-8 bytes written into `buf`,
// using `s` to carry state across calls so that a UTF-16 surrogate pair fed
// one unit at a time yields one 4-byte UTF-8 sequence (2 bytes per call).
//
// Returns the number of bytes written by this call (1, 2, 3 or 4), or
// static_cast<size_t>(-1) when `wc` cannot be encoded: a low surrogate that
// was not preceded by a high surrogate, or a value beyond U+10FFFF (or beyond
// U+FFFF when wchar_t cannot hold larger values).
//
// `s` is reset to its default value on every path except two: a high
// surrogate stores its pending bits in `s`, and an unpaired low surrogate
// leaves `s` untouched.
size_t WideToUtf8(wchar_t wc, char* buf, ShiftState& s) {
  // Work on unsigned bytes so the bit operations below never involve a
  // possibly-signed `char`.
  unsigned char* const out = reinterpret_cast<unsigned char*>(buf);
  const uint32_t cp = static_cast<uint32_t>(wc);
  constexpr size_t kFailure = static_cast<size_t>(-1);

  // Continuation byte (10xxxxxx) carrying the low six bits of `bits`.
  const auto trail = [](uint32_t bits) {
    return static_cast<unsigned char>(0x80u | (bits & 0x3Fu));
  };

  // U+0000..U+007F: single byte, 0xxxxxxx.
  if (cp <= 0x7F) {
    out[0] = static_cast<unsigned char>(cp & 0x7Fu);
    s = {};
    return 1;
  }

  // U+0080..U+07FF: 110xxxxx 10xxxxxx.
  if (cp <= 0x7FF) {
    out[0] = static_cast<unsigned char>(0xC0u | ((cp >> 6) & 0x1Fu));
    out[1] = trail(cp);
    s = {};
    return 2;
  }

  const bool is_high_surrogate = cp >= 0xD800 && cp <= 0xDBFF;
  const bool is_low_surrogate = cp >= 0xDC00 && cp <= 0xDFFF;

  if (is_high_surrogate) {
    // The high surrogate supplies the top bits of the final code point, which
    // is enough to emit bytes one and two of the 4-byte sequence
    // (11110uuu 10uuzzzz). Bits 6..9 of the surrogate, plus one, give the
    // five-bit `uuuuu` field.
    const uint32_t upper = ((cp >> 6) & 0x0Fu) + 1u;

    out[0] = static_cast<unsigned char>(0xF0u | ((upper >> 2) & 0x07u));
    out[1] = static_cast<unsigned char>(0x80u | ((upper << 4) & 0x30u) |
                                        ((cp >> 2) & 0x0Fu));

    // The two lowest bits belong in byte three; stash them until the low
    // surrogate arrives.
    s = {true, static_cast<unsigned char>(cp & 0x03u)};
    return 2;
  }

  if (is_low_surrogate) {
    // A low surrogate only makes sense right after a high surrogate. When
    // none is pending, report failure and leave `s` as it was.
    if (!s.saw_high_surrogate) {
      return kFailure;
    }

    // Bytes three and four (10yyyyyy 10xxxxxx): the two stashed bits followed
    // by the ten payload bits of the low surrogate.
    out[0] = static_cast<unsigned char>(0x80u | (0x30 & (s.bits << 4)) |
                                        ((cp >> 6) & 0x0Fu));
    out[1] = trail(cp);
    s = {};
    return 2;
  }

  // Remaining BMP values (U+0800..U+D7FF and U+E000..U+FFFF):
  // 1110xxxx 10xxxxxx 10xxxxxx.
  if (cp <= 0xFFFF) {
    out[0] = static_cast<unsigned char>(0xE0u | ((cp >> 12) & 0x0Fu));
    out[1] = trail(cp >> 6);
    out[2] = trail(cp);
    s = {};
    return 3;
  }

  // Direct 4-byte encoding is only compiled in when wchar_t is wide enough to
  // hold supplementary-plane values.
  if constexpr (0xFFFF < std::numeric_limits<wchar_t>::max()) {
    // Every value up to 0xFFFF has already returned above, so only the upper
    // bound needs checking here.
    if (cp <= 0x10FFFF) {
      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
      out[0] = static_cast<unsigned char>(0xF0u | ((cp >> 18) & 0x07u));
      out[1] = trail(cp >> 12);
      out[2] = trail(cp >> 6);
      out[3] = trail(cp);
      s = {};
      return 4;
    }
  }

  // Not encodable: the value lies outside the range handled above.
  s = {};
  return kFailure;
}

}  // namespace strings_internal
ABSL_NAMESPACE_END
}  // namespace absl

Coverage Report

Created: 2025-10-28 06:50

Line	Count	Source
1		// Copyright 2017 The Abseil Authors.
2		//
3		// Licensed under the Apache License, Version 2.0 (the "License");
4		// you may not use this file except in compliance with the License.
5		// You may obtain a copy of the License at
6		//
7		// https://www.apache.org/licenses/LICENSE-2.0
8		//
9		// Unless required by applicable law or agreed to in writing, software
10		// distributed under the License is distributed on an "AS IS" BASIS,
11		// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12		// See the License for the specific language governing permissions and
13		// limitations under the License.
14
15		// UTF8 utilities, implemented to reduce dependencies.
16
17		#include "absl/strings/internal/utf8.h"
18
19		#include <cstddef>
20		#include <cstdint>
21		#include <limits>
22
23		#include "absl/base/config.h"
24
25		namespace absl {
26		ABSL_NAMESPACE_BEGIN
27		namespace strings_internal {
28
29	0	size_t EncodeUTF8Char(char* buffer, char32_t utf8_char) {
30	0	if (utf8_char <= 0x7F) {
31	0	*buffer = static_cast<char>(utf8_char);
32	0	return 1;
33	0	} else if (utf8_char <= 0x7FF) {
34	0	buffer[1] = static_cast<char>(0x80 \| (utf8_char & 0x3F));
35	0	utf8_char >>= 6;
36	0	buffer[0] = static_cast<char>(0xC0 \| utf8_char);
37	0	return 2;
38	0	} else if (utf8_char <= 0xFFFF) {
39	0	buffer[2] = static_cast<char>(0x80 \| (utf8_char & 0x3F));
40	0	utf8_char >>= 6;
41	0	buffer[1] = static_cast<char>(0x80 \| (utf8_char & 0x3F));
42	0	utf8_char >>= 6;
43	0	buffer[0] = static_cast<char>(0xE0 \| utf8_char);
44	0	return 3;
45	0	} else {
46	0	buffer[3] = static_cast<char>(0x80 \| (utf8_char & 0x3F));
47	0	utf8_char >>= 6;
48	0	buffer[2] = static_cast<char>(0x80 \| (utf8_char & 0x3F));
49	0	utf8_char >>= 6;
50	0	buffer[1] = static_cast<char>(0x80 \| (utf8_char & 0x3F));
51	0	utf8_char >>= 6;
52	0	buffer[0] = static_cast<char>(0xF0 \| utf8_char);
53	0	return 4;
54	0	}
55	0	}
56
57	0	size_t WideToUtf8(wchar_t wc, char* buf, ShiftState& s) {
58		// Reinterpret the output buffer `buf` as `unsigned char*` for subsequent
59		// bitwise operations. This ensures well-defined behavior for bit
60		// manipulations (avoiding issues with signed `char`) and is safe under C++
61		// aliasing rules, as `unsigned char` can alias any type.
62	0	auto* ubuf = reinterpret_cast<unsigned char*>(buf);
63	0	const uint32_t v = static_cast<uint32_t>(wc);
64	0	constexpr size_t kError = static_cast<size_t>(-1);
65
66	0	if (v <= 0x007F) {
67		// 1-byte sequence (U+0000 to U+007F).
68		// 0xxxxxxx.
69	0	ubuf[0] = (0b0111'1111 & v);
70	0	s = {}; // Reset surrogate state.
71	0	return 1;
72	0	} else if (0x0080 <= v && v <= 0x07FF) {
73		// 2-byte sequence (U+0080 to U+07FF).
74		// 110xxxxx 10xxxxxx.
75	0	ubuf[0] = 0b1100'0000 \| (0b0001'1111 & (v >> 6));
76	0	ubuf[1] = 0b1000'0000 \| (0b0011'1111 & v);
77	0	s = {}; // Reset surrogate state.
78	0	return 2;
79	0	} else if ((0x0800 <= v && v <= 0xD7FF) \|\| (0xE000 <= v && v <= 0xFFFF)) {
80		// 3-byte sequence (U+0800 to U+D7FF or U+E000 to U+FFFF).
81		// Excludes surrogate code points U+D800-U+DFFF.
82		// 1110xxxx 10xxxxxx 10xxxxxx.
83	0	ubuf[0] = 0b1110'0000 \| (0b0000'1111 & (v >> 12));
84	0	ubuf[1] = 0b1000'0000 \| (0b0011'1111 & (v >> 6));
85	0	ubuf[2] = 0b1000'0000 \| (0b0011'1111 & v);
86	0	s = {}; // Reset surrogate state.
87	0	return 3;
88	0	} else if (0xD800 <= v && v <= 0xDBFF) {
89		// High Surrogate (U+D800 to U+DBFF).
90		// This part forms the first two bytes of an eventual 4-byte UTF-8 sequence.
91	0	const unsigned char high_bits_val = (0b0000'1111 & (v >> 6)) + 1;
92
93		// First byte of the 4-byte UTF-8 sequence (11110xxx).
94	0	ubuf[0] = 0b1111'0000 \| (0b0000'0111 & (high_bits_val >> 2));
95		// Second byte of the 4-byte UTF-8 sequence (10xxxxxx).
96	0	ubuf[1] = 0b1000'0000 \| //
97	0	(0b0011'0000 & (high_bits_val << 4)) \| //
98	0	(0b0000'1111 & (v >> 2));
99		// Set state for high surrogate after writing to buffer.
100	0	s = {true, static_cast<unsigned char>(0b0000'0011 & v)};
101	0	return 2; // Wrote 2 bytes, expecting 2 more from a low surrogate.
102	0	} else if (0xDC00 <= v && v <= 0xDFFF) {
103		// Low Surrogate (U+DC00 to U+DFFF).
104		// This part forms the last two bytes of a 4-byte UTF-8 sequence,
105		// using state from a preceding high surrogate.
106	0	if (!s.saw_high_surrogate) {
107		// Error: Isolated low surrogate without a preceding high surrogate.
108		// s remains in its current (problematic) state.
109		// Caller should handle error.
110	0	return kError;
111	0	}
112
113		// Third byte of the 4-byte UTF-8 sequence (10xxxxxx).
114	0	ubuf[0] = 0b1000'0000 \| //
115	0	(0b0011'0000 & (s.bits << 4)) \| //
116	0	(0b0000'1111 & (v >> 6));
117		// Fourth byte of the 4-byte UTF-8 sequence (10xxxxxx).
118	0	ubuf[1] = 0b1000'0000 \| (0b0011'1111 & v);
119
120	0	s = {}; // Reset surrogate state, pair complete.
121	0	return 2; // Wrote 2 more bytes, completing the 4-byte sequence.
122	0	} else if constexpr (0xFFFF < std::numeric_limits<wchar_t>::max()) {
123		// Conditionally compile the 4-byte direct conversion branch.
124		// This block is compiled only if wchar_t can represent values > 0xFFFF.
125		// It's placed after surrogate checks to ensure surrogates are handled by
126		// their specific logic. This inner 'if' is the runtime check for the 4-byte
127		// range. At this point, v is known not to be in the 1, 2, or 3-byte BMP
128		// ranges, nor is it a surrogate code point.
129	0	if (0x10000 <= v && v <= 0x10FFFF) {
130		// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
131	0	ubuf[0] = 0b1111'0000 \| (0b0000'0111 & (v >> 18));
132	0	ubuf[1] = 0b1000'0000 \| (0b0011'1111 & (v >> 12));
133	0	ubuf[2] = 0b1000'0000 \| (0b0011'1111 & (v >> 6));
134	0	ubuf[3] = 0b1000'0000 \| (0b0011'1111 & v);
135	0	s = {}; // Reset surrogate state.
136	0	return 4;
137	0	}
138	0	}
139
140		// Invalid wchar_t value (e.g., out of Unicode range, or unhandled after all
141		// checks).
142	0	s = {}; // Reset surrogate state.
143	0	return kError;
144	0	}
145
146		} // namespace strings_internal
147		ABSL_NAMESPACE_END
148		} // namespace absl