/src/serenity/AK/Utf16View.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2021-2023, Tim Flynn <trflynn89@serenityos.org> |
3 | | * |
4 | | * SPDX-License-Identifier: BSD-2-Clause |
5 | | */ |
6 | | |
7 | | #pragma once |
8 | | |
9 | | #include <AK/ByteString.h> |
10 | | #include <AK/Error.h> |
11 | | #include <AK/Format.h> |
12 | | #include <AK/Forward.h> |
13 | | #include <AK/Optional.h> |
14 | | #include <AK/Span.h> |
15 | | #include <AK/String.h> |
16 | | #include <AK/Types.h> |
17 | | #include <AK/Vector.h> |
18 | | |
19 | | namespace AK { |
20 | | |
21 | | using Utf16Data = Vector<u16, 1>; /* Buffer type for UTF-16 code units (u16 elements). NOTE(review): the template argument 1 is presumably Vector's inline capacity - confirm against AK::Vector. */ |
22 | | /* Conversion helpers: declared here only, defined elsewhere. Each returns an ErrorOr, so callers have to handle failure. */ |
23 | | ErrorOr<Utf16Data> utf8_to_utf16(StringView); |
24 | | ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const&); |
25 | | ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const&); |
26 | | ErrorOr<void> code_point_to_utf16(Utf16Data&, u32); /* Takes the destination buffer by mutable reference plus a u32; presumably appends the encoded code point - confirm in the definition. */ |
27 | | /* Returns a plain size_t (no ErrorOr); going by the name, the UTF-16 code unit count the UTF-8 input would need - implementation not visible here. */ |
28 | | size_t utf16_code_unit_length_from_utf8(StringView); |
29 | | /* Forward declaration so that Utf16CodePointIterator can name Utf16View as a friend before the class is defined. */ |
30 | | class Utf16View; |
31 | | |
32 | | class Utf16CodePointIterator { /* Iterator whose state is a pointer into u16 code units plus the number of code units left; dereferencing yields a u32. */ |
33 | | friend class Utf16View; /* Utf16View creates iterators through the private constructor below. */ |
34 | | |
35 | | public: |
36 | | Utf16CodePointIterator() = default; /* Default state comes from the member initializers: null pointer, zero remaining code units. */ |
37 | | ~Utf16CodePointIterator() = default; |
38 | | /* Two iterators compare equal only when both the pointer and the remaining code unit count match. */ |
39 | | bool operator==(Utf16CodePointIterator const& other) const |
40 | 0 | { |
41 | 0 | return (m_ptr == other.m_ptr) && (m_remaining_code_units == other.m_remaining_code_units); |
42 | 0 | } |
43 | | /* Increment, dereference and length_in_code_units() are declared only; their definitions live outside this header. */ |
44 | | Utf16CodePointIterator& operator++(); |
45 | | u32 operator*() const; |
46 | | /* NOTE(review): presumably the number of code units occupied by the current code point - confirm in the definition. */ |
47 | | size_t length_in_code_units() const; |
48 | | |
49 | | private: |
50 | | Utf16CodePointIterator(u16 const* ptr, size_t length) /* Private: start position and the number of code units available from it. */ |
51 | 0 | : m_ptr(ptr) |
52 | 0 | , m_remaining_code_units(length) |
53 | 0 | { |
54 | 0 | } |
55 | | |
56 | | u16 const* m_ptr { nullptr }; /* Current position within the code units. */ |
57 | | size_t m_remaining_code_units { 0 }; /* Code units left starting at m_ptr; Utf16View::end() constructs the iterator with 0 here. */ |
58 | | }; |
59 | | |
60 | | class Utf16View { /* View over UTF-16 code units: holds a ReadonlySpan<u16> and an optional code point count, nothing else. */ |
61 | | public: |
62 | | using Iterator = Utf16CodePointIterator; |
63 | | /* Static surrogate helpers; declared only, defined outside this header. */ |
64 | | static bool is_high_surrogate(u16); |
65 | | static bool is_low_surrogate(u16); |
66 | | static u32 decode_surrogate_pair(u16 high_surrogate, u16 low_surrogate); |
67 | | |
68 | 6 | Utf16View() = default; |
69 | | ~Utf16View() = default; |
70 | | /* Wraps an existing span of code units. Marked explicit, so a span never converts to a view implicitly. */ |
71 | | explicit Utf16View(ReadonlySpan<u16> code_units) |
72 | 146 | : m_code_units(code_units) |
73 | 146 | { |
74 | 146 | } |
75 | | /* Builds a view over a char16_t array. If the last element is u'\0' it is left out of the length, otherwise all Size elements are used. NOTE(review): not explicit, so arrays convert implicitly - confirm that is intended. */ |
76 | | template<size_t Size> |
77 | | Utf16View(char16_t const (&code_units)[Size]) |
78 | 0 | : m_code_units( |
79 | 0 | reinterpret_cast<u16 const*>(&code_units[0]), /* Reinterprets the char16_t storage as u16; assumes both types have the same size and representation - TODO confirm. */ |
80 | 0 | code_units[Size - 1] == u'\0' ? Size - 1 : Size) |
81 | 0 | { |
82 | 0 | } Unexecuted instantiation: AK::Utf16View::Utf16View<3ul>(char16_t const (&) [3ul]) Unexecuted instantiation: AK::Utf16View::Utf16View<2ul>(char16_t const (&) [2ul]) |
83 | | /* Equality is delegated to ReadonlySpan's operator==; the optional code point count takes no part in it. */ |
84 | 0 | bool operator==(Utf16View const& other) const { return m_code_units == other.m_code_units; } |
85 | | /* Option passed to to_byte_string() and to_utf8(); both default to No. How invalid code units are treated in each mode is decided by the definitions, which are not visible here. */ |
86 | | enum class AllowInvalidCodeUnits { |
87 | | Yes, |
88 | | No, |
89 | | }; |
90 | | /* Conversions out of UTF-16; both can fail and report that through ErrorOr. */ |
91 | | ErrorOr<ByteString> to_byte_string(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const; |
92 | | ErrorOr<String> to_utf8(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const; |
93 | | /* The three inline accessors below forward directly to the underlying span. */ |
94 | 0 | bool is_null() const { return m_code_units.is_null(); } |
95 | 0 | bool is_empty() const { return m_code_units.is_empty(); } |
96 | 4.81k | size_t length_in_code_units() const { return m_code_units.size(); } |
97 | | size_t length_in_code_points() const; /* Declared only. NOTE(review): presumably served from m_length_in_code_points once computed - confirm. */ |
98 | | /* begin() points at the first code unit with the full size remaining; end() points one past the last code unit with zero remaining. */ |
99 | 0 | Utf16CodePointIterator begin() const { return { begin_ptr(), m_code_units.size() }; } |
100 | 0 | Utf16CodePointIterator end() const { return { end_ptr(), 0 }; } |
101 | | /* data() exposes the span's raw pointer; the indexed accessors are declared only. */ |
102 | 0 | u16 const* data() const { return m_code_units.data(); } |
103 | | u16 code_unit_at(size_t index) const; |
104 | | u32 code_point_at(size_t index) const; |
105 | | /* Offset translation between code units and code points; declared only. */ |
106 | | size_t code_point_offset_of(size_t code_unit_offset) const; |
107 | | size_t code_unit_offset_of(size_t code_point_offset) const; |
108 | | size_t code_unit_offset_of(Utf16CodePointIterator const&) const; |
109 | | /* One-argument overload: the rest of the view from code_unit_offset. NOTE(review): the subtraction is unsigned and wraps if the offset exceeds the length; presumably the two-argument overload checks bounds - confirm. */ |
110 | | Utf16View substring_view(size_t code_unit_offset, size_t code_unit_length) const; |
111 | 0 | Utf16View substring_view(size_t code_unit_offset) const { return substring_view(code_unit_offset, length_in_code_units() - code_unit_offset); } |
112 | | /* Same pair of overloads, but measured in code points; the same unsigned wrap-around caveat applies to the one-argument form. */ |
113 | | Utf16View unicode_substring_view(size_t code_point_offset, size_t code_point_length) const; |
114 | 0 | Utf16View unicode_substring_view(size_t code_point_offset) const { return unicode_substring_view(code_point_offset, length_in_code_points() - code_point_offset); } |
115 | | |
116 | | bool starts_with(Utf16View const&) const; |
117 | | /* validate(size_t&) is declared only; the zero-argument form calls it with a scratch variable and returns just the bool. */ |
118 | | bool validate(size_t& valid_code_units) const; |
119 | | bool validate() const |
120 | 0 | { |
121 | 0 | size_t valid_code_units; /* Left uninitialized and passed as the reference argument; its value is never read in this function. */ |
122 | 0 | return validate(valid_code_units); |
123 | 0 | } |
124 | | |
125 | | bool equals_ignoring_case(Utf16View const&) const; |
126 | | |
127 | | private: |
128 | 0 | u16 const* begin_ptr() const { return m_code_units.data(); } /* First code unit. */ |
129 | 0 | u16 const* end_ptr() const { return begin_ptr() + m_code_units.size(); } /* One past the last code unit. */ |
130 | | |
131 | | size_t calculate_length_in_code_points() const; |
132 | | |
133 | | ReadonlySpan<u16> m_code_units; /* The code units being viewed. */ |
134 | | mutable Optional<size_t> m_length_in_code_points; /* mutable so const members may set it; presumably a lazily filled cache of the code point count - confirm in the definitions. */ |
135 | | }; |
136 | | |
137 | | } |
138 | | |
139 | | template<> /* Formatter specialization for Utf16View values; it derives from Formatter<FormatString>. */ |
140 | | struct AK::Formatter<AK::Utf16View> : Formatter<FormatString> { |
141 | | ErrorOr<void> format(FormatBuilder& builder, AK::Utf16View const& value) /* Appends the view to the FormatBuilder's underlying builder and returns the ErrorOr produced by try_append. */ |
142 | 0 | { |
143 | 0 | return builder.builder().try_append(value); |
144 | 0 | } |
145 | | }; |
146 | | |
147 | | #if USING_AK_GLOBALLY /* When this macro evaluates to non-zero, the two names below are also made available outside namespace AK. */ |
148 | | using AK::Utf16Data; |
149 | | using AK::Utf16View; |
150 | | #endif |