/proc/self/cwd/internal/utf8.h

Source
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef THIRD_PARTY_CEL_CPP_INTERNAL_UTF8_H_
#define THIRD_PARTY_CEL_CPP_INTERNAL_UTF8_H_

#include <cstddef>
#include <string>
#include <utility>

#include "absl/base/attributes.h"
#include "absl/base/nullability.h"
#include "absl/strings/cord.h"
#include "absl/strings/string_view.h"

namespace cel::internal {

// Reports whether `str` is well-formed UTF-8.
//
// Returns true if the given UTF-8 encoded string is not malformed, false
// otherwise. Overloads are provided for contiguous input
// (`absl::string_view`) and for `absl::Cord`.
bool Utf8IsValid(absl::string_view str);
bool Utf8IsValid(const absl::Cord& str);

// Returns the number of Unicode code points in the UTF-8 encoded string.
//
// If there are any invalid bytes, they will each be counted as an invalid code
// point. The `size_t` result carries no separate error indication, so
// malformed input cannot be distinguished from well-formed input through this
// function alone; use `Utf8Validate` when that distinction matters.
size_t Utf8CodePointCount(absl::string_view str);
size_t Utf8CodePointCount(const absl::Cord& str);

// Validates the given UTF-8 encoded string and returns a `{count, ok}` pair.
//
// The meaning of `count` (first element) depends on `ok` (second element):
//   - `ok == true`:  the entire string is not malformed and `count` is the
//     total number of code points in it.
//   - `ok == false`: the string is malformed and `count` is the number of code
//     points up until the malformed sequence was encountered.
std::pair<size_t, bool> Utf8Validate(absl::string_view str);
std::pair<size_t, bool> Utf8Validate(const absl::Cord& str);

// Decodes the next code point, yielding the decoded code point and the number
// of code units (a.k.a. bytes) consumed. In the event that an invalid code unit
// sequence is encountered, the replacement character, U+FFFD, is returned with
// a code unit count of 1. As U+FFFD requires 3 code units when encoded, this
// can be used to differentiate valid input from malformed input.
//
// For the two overloads below, the return value is the number of code units
// consumed and the decoded code point is delivered through `code_point`.
// NOTE(review): `code_point` is annotated `absl_nullable`; presumably passing
// nullptr skips storing the decoded value -- confirm against the
// implementation.
// NOTE(review): the `absl::Cord::CharIterator` overload presumably decodes
// starting at the iterator's current position and does not advance it (it is
// taken by const reference) -- confirm against the implementation.
size_t Utf8Decode(absl::string_view str, char32_t* absl_nullable code_point);
size_t Utf8Decode(const absl::Cord::CharIterator& it,
                  char32_t* absl_nullable code_point);
// Convenience overload: decodes the next code point of `str` via the
// out-parameter overload and packages the result as
// `{code point, code units consumed}`.
inline std::pair<char32_t, size_t> Utf8Decode(absl::string_view str) {
  char32_t decoded;
  const size_t consumed = Utf8Decode(str, &decoded);
  return {decoded, consumed};
}
// Convenience overload: decodes the next code point at `it` via the
// out-parameter overload and packages the result as
// `{code point, code units consumed}`.
inline std::pair<char32_t, size_t> Utf8Decode(
    const absl::Cord::CharIterator& it) {
  char32_t decoded;
  const size_t consumed = Utf8Decode(it, &decoded);
  return {decoded, consumed};
}

// Encodes the given code point and appends it to the buffer. If the code point
// is an unpaired surrogate or outside of the valid Unicode range it is replaced
// with the replacement character, U+FFFD.
//
// `buffer` must not be null (it is annotated `absl_nonnull`).
// NOTE(review): the `size_t` result is presumably the number of code units
// (bytes) written -- confirm against the implementation.
// NOTE(review): the `char*` overload cannot grow its destination; presumably
// the caller must supply room for the longest encoding (4 bytes) -- confirm
// against the implementation.
size_t Utf8Encode(char32_t code_point, std::string* absl_nonnull buffer);
size_t Utf8Encode(char32_t code_point, char* absl_nonnull buffer);
// Legacy overload taking the buffer first, by reference. Forwards to
// `Utf8Encode(char32_t, std::string*)` and returns its result unchanged.
ABSL_DEPRECATED("Use other overload")
inline size_t Utf8Encode(std::string& buffer, char32_t code_point) {
  std::string* const target = &buffer;
  return Utf8Encode(code_point, target);
}

}  // namespace cel::internal

#endif  // THIRD_PARTY_CEL_CPP_INTERNAL_UTF8_H_

Line	Count	Source
1		// Copyright 2021 Google LLC
2		//
3		// Licensed under the Apache License, Version 2.0 (the "License");
4		// you may not use this file except in compliance with the License.
5		// You may obtain a copy of the License at
6		//
7		// https://www.apache.org/licenses/LICENSE-2.0
8		//
9		// Unless required by applicable law or agreed to in writing, software
10		// distributed under the License is distributed on an "AS IS" BASIS,
11		// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12		// See the License for the specific language governing permissions and
13		// limitations under the License.
14
15		#ifndef THIRD_PARTY_CEL_CPP_INTERNAL_UTF8_H_
16		#define THIRD_PARTY_CEL_CPP_INTERNAL_UTF8_H_
17
18		#include <cstddef>
19		#include <string>
20		#include <utility>
21
22		#include "absl/base/attributes.h"
23		#include "absl/base/nullability.h"
24		#include "absl/strings/cord.h"
25		#include "absl/strings/string_view.h"
26
27		namespace cel::internal {
28
29		// Returns true if the given UTF-8 encoded string is not malformed, false
30		// otherwise.
31		bool Utf8IsValid(absl::string_view str);
32		bool Utf8IsValid(const absl::Cord& str);
33
34		// Returns the number of Unicode code points in the UTF-8 encoded string.
35		//
36		// If there are any invalid bytes, they will each be counted as an invalid code
37		// point.
38		size_t Utf8CodePointCount(absl::string_view str);
39		size_t Utf8CodePointCount(const absl::Cord& str);
40
41		// Validates the given UTF-8 encoded string. The first return value is the
42		// number of code points and its meaning depends on the second return value. If
43		// the second return value is true the entire string is not malformed and the
44		// first return value is the number of code points. If the second return value
45		// is false the string is malformed and the first return value is the number of
46		// code points up until the malformed sequence was encountered.
47		std::pair<size_t, bool> Utf8Validate(absl::string_view str);
48		std::pair<size_t, bool> Utf8Validate(const absl::Cord& str);
49
50		// Decodes the next code point, returning the decoded code point and the number
51		// of code units (a.k.a. bytes) consumed. In the event that an invalid code unit
52		// sequence is encountered the replacement character, U+FFFD, is returned with a
53		// code unit count of 1. As U+FFFD requires 3 code units when encoded, this can
54		// be used to differentiate valid input from malformed input.
55		size_t Utf8Decode(absl::string_view str, char32_t* absl_nullable code_point);
56		size_t Utf8Decode(const absl::Cord::CharIterator& it,
57		char32_t* absl_nullable code_point);
58	293M	inline std::pair<char32_t, size_t> Utf8Decode(absl::string_view str) {
59	293M	char32_t code_point;
60	293M	size_t code_units = Utf8Decode(str, &code_point);
61	293M	return std::pair{code_point, code_units};
62	293M	}
63		inline std::pair<char32_t, size_t> Utf8Decode(
64	0	const absl::Cord::CharIterator& it) {
65	0	char32_t code_point;
66	0	size_t code_units = Utf8Decode(it, &code_point);
67	0	return std::pair{code_point, code_units};
68	0	}
69
70		// Encodes the given code point and appends it to the buffer. If the code point
71		// is an unpaired surrogate or outside of the valid Unicode range it is replaced
72		// with the replacement character, U+FFFD.
73		size_t Utf8Encode(char32_t code_point, std::string* absl_nonnull buffer);
74		size_t Utf8Encode(char32_t code_point, char* absl_nonnull buffer);
75		ABSL_DEPRECATED("Use other overload")
76	608M	inline size_t Utf8Encode(std::string& buffer, char32_t code_point) {
77	608M	return Utf8Encode(code_point, &buffer);
78	608M	}
79
80		} // namespace cel::internal
81
82		#endif // THIRD_PARTY_CEL_CPP_INTERNAL_UTF8_H_

Coverage Report

Created: 2025-11-29 07:01