/src/CMake/Source/cm_codecvt.cxx
Line | Count | Source |
1 | | /* Distributed under the OSI-approved BSD 3-Clause License. See accompanying |
2 | | file LICENSE.rst or https://cmake.org/licensing for details. */ |
3 | | #include "cm_codecvt.hxx" |
4 | | |
5 | | #if defined(_WIN32) |
6 | | # include <cassert> |
7 | | # include <cstring> |
8 | | |
9 | | # include <windows.h> |
10 | | # undef max |
11 | | # include "cmsys/Encoding.hxx" |
12 | | |
13 | | # include "cm_utf8.h" |
14 | | #endif |
15 | | |
16 | | #include "cm_codecvt_Encoding.hxx" |
17 | | |
18 | | codecvt::codecvt(codecvt_Encoding e) |
19 | | #if defined(_WIN32) |
20 | | : m_codepage(0) |
21 | | #endif |
22 | 0 | { |
23 | 0 | switch (e) { |
24 | 0 | case codecvt_Encoding::ConsoleOutput: |
25 | | #if defined(_WIN32) |
26 | | m_noconv = false; |
27 | | m_codepage = GetConsoleOutputCP(); |
28 | | break; |
29 | | #endif |
30 | 0 | case codecvt_Encoding::ANSI: |
31 | | #if defined(_WIN32) |
32 | | m_noconv = false; |
33 | | m_codepage = CP_ACP; |
34 | | break; |
35 | | #endif |
36 | | // We don't know which ANSI encoding to use for other platforms than |
37 | | // Windows so we don't do any conversion there |
38 | 0 | case codecvt_Encoding::UTF8: |
39 | 0 | case codecvt_Encoding::UTF8_WITH_BOM: |
40 | | // Assume internal encoding is UTF-8 |
41 | 0 | case codecvt_Encoding::None: |
42 | | // No encoding |
43 | 0 | default: |
44 | 0 | this->m_noconv = true; |
45 | 0 | } |
46 | 0 | } |
47 | | |
// The destructor is compiler-generated; it is defaulted here, out of line,
// rather than in the class definition.
codecvt::~codecvt() = default;
49 | | |
50 | | bool codecvt::do_always_noconv() const noexcept |
51 | 0 | { |
52 | 0 | return this->m_noconv; |
53 | 0 | } |
54 | | |
55 | | std::codecvt_base::result codecvt::do_out(mbstate_t& state, char const* from, |
56 | | char const* from_end, |
57 | | char const*& from_next, char* to, |
58 | | char* to_end, char*& to_next) const |
59 | 0 | { |
60 | 0 | from_next = from; |
61 | 0 | to_next = to; |
62 | 0 | if (this->m_noconv) { |
63 | 0 | return std::codecvt_base::noconv; |
64 | 0 | } |
65 | | #if defined(_WIN32) |
66 | | // Use a const view of the state because we should not modify it until we |
67 | | // have fully processed and consume a byte (with sufficient space in the |
68 | | // output buffer). We call helpers to re-cast and modify the state |
69 | | State const& lstate = reinterpret_cast<State&>(state); |
70 | | |
71 | | while (from_next != from_end) { |
72 | | // Count leading ones in the bits of the next byte. |
73 | | unsigned char const ones = |
74 | | cm_utf8_ones[static_cast<unsigned char>(*from_next)]; |
75 | | |
76 | | if (ones != 1 && lstate.buffered != 0) { |
77 | | // We have a buffered partial codepoint that we never completed. |
78 | | return std::codecvt_base::error; |
79 | | } else if (ones == 1 && lstate.buffered == 0) { |
80 | | // This is a continuation of a codepoint that never started. |
81 | | return std::codecvt_base::error; |
82 | | } |
83 | | |
84 | | // Compute the number of bytes in the current codepoint. |
85 | | int need = 0; |
86 | | switch (ones) { |
87 | | case 0: // 0xxx xxxx: new codepoint of size 1 |
88 | | need = 1; |
89 | | break; |
90 | | case 1: // 10xx xxxx: continues a codepoint |
91 | | assert(lstate.size != 0); |
92 | | need = lstate.size; |
93 | | break; |
94 | | case 2: // 110x xxxx: new codepoint of size 2 |
95 | | need = 2; |
96 | | break; |
97 | | case 3: // 1110 xxxx: new codepoint of size 3 |
98 | | need = 3; |
99 | | break; |
100 | | case 4: // 1111 0xxx: new codepoint of size 4 |
101 | | need = 4; |
102 | | break; |
103 | | default: // invalid byte |
104 | | return std::codecvt_base::error; |
105 | | } |
106 | | assert(need > 0); |
107 | | |
108 | | if (lstate.buffered + 1 == need) { |
109 | | // This byte completes a codepoint. |
110 | | std::codecvt_base::result decode_result = |
111 | | this->Decode(state, need, from_next, to_next, to_end); |
112 | | if (decode_result != std::codecvt_base::ok) { |
113 | | return decode_result; |
114 | | } |
115 | | } else { |
116 | | // This byte does not complete a codepoint. |
117 | | this->BufferPartial(state, need, from_next); |
118 | | } |
119 | | } |
120 | | |
121 | | return std::codecvt_base::ok; |
122 | | #else |
123 | 0 | static_cast<void>(state); |
124 | 0 | static_cast<void>(from); |
125 | 0 | static_cast<void>(from_end); |
126 | 0 | static_cast<void>(from_next); |
127 | 0 | static_cast<void>(to); |
128 | 0 | static_cast<void>(to_end); |
129 | 0 | static_cast<void>(to_next); |
130 | 0 | return std::codecvt_base::noconv; |
131 | 0 | #endif |
132 | 0 | } |
133 | | |
134 | | std::codecvt_base::result codecvt::do_unshift(mbstate_t& state, char* to, |
135 | | char* to_end, |
136 | | char*& to_next) const |
137 | 0 | { |
138 | 0 | to_next = to; |
139 | 0 | if (this->m_noconv) { |
140 | 0 | return std::codecvt_base::noconv; |
141 | 0 | } |
142 | | #if defined(_WIN32) |
143 | | State& lstate = reinterpret_cast<State&>(state); |
144 | | if (lstate.buffered != 0) { |
145 | | return this->DecodePartial(state, to_next, to_end); |
146 | | } |
147 | | return std::codecvt_base::ok; |
148 | | #else |
149 | 0 | static_cast<void>(state); |
150 | 0 | static_cast<void>(to_end); |
151 | 0 | return std::codecvt_base::ok; |
152 | 0 | #endif |
153 | 0 | } |
154 | | |
155 | | #if defined(_WIN32) |
156 | | std::codecvt_base::result codecvt::Decode(mbstate_t& state, int size, |
157 | | char const*& from_next, |
158 | | char*& to_next, char* to_end) const |
159 | | { |
160 | | State& lstate = reinterpret_cast<State&>(state); |
161 | | |
162 | | // Collect all the bytes for this codepoint. |
163 | | char buf[4]; |
164 | | memcpy(buf, lstate.partial, lstate.buffered); |
165 | | buf[lstate.buffered] = *from_next; |
166 | | |
167 | | // Convert the encoding. |
168 | | wchar_t wbuf[2]; |
169 | | int wlen = |
170 | | MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, buf, size, wbuf, 2); |
171 | | if (wlen <= 0) { |
172 | | return std::codecvt_base::error; |
173 | | } |
174 | | |
175 | | int tlen = WideCharToMultiByte(m_codepage, 0, wbuf, wlen, to_next, |
176 | | to_end - to_next, nullptr, nullptr); |
177 | | if (tlen <= 0) { |
178 | | if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) { |
179 | | return std::codecvt_base::partial; |
180 | | } |
181 | | return std::codecvt_base::error; |
182 | | } |
183 | | |
184 | | // Move past the now-consumed byte in the input buffer. |
185 | | ++from_next; |
186 | | |
187 | | // Move past the converted codepoint in the output buffer. |
188 | | to_next += tlen; |
189 | | |
190 | | // Re-initialize the state for the next codepoint to start. |
191 | | lstate = State(); |
192 | | |
193 | | return std::codecvt_base::ok; |
194 | | } |
195 | | |
196 | | std::codecvt_base::result codecvt::DecodePartial(mbstate_t& state, |
197 | | char*& to_next, |
198 | | char* to_end) const |
199 | | { |
200 | | State& lstate = reinterpret_cast<State&>(state); |
201 | | |
202 | | // Try converting the partial codepoint. |
203 | | wchar_t wbuf[2]; |
204 | | int wlen = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, lstate.partial, |
205 | | lstate.buffered, wbuf, 2); |
206 | | if (wlen <= 0) { |
207 | | return std::codecvt_base::error; |
208 | | } |
209 | | |
210 | | int tlen = WideCharToMultiByte(m_codepage, 0, wbuf, wlen, to_next, |
211 | | to_end - to_next, nullptr, nullptr); |
212 | | if (tlen <= 0) { |
213 | | if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) { |
214 | | return std::codecvt_base::partial; |
215 | | } |
216 | | return std::codecvt_base::error; |
217 | | } |
218 | | |
219 | | // Move past the converted codepoint in the output buffer. |
220 | | to_next += tlen; |
221 | | |
222 | | // Re-initialize the state for the next codepoint to start. |
223 | | lstate = State(); |
224 | | |
225 | | return std::codecvt_base::ok; |
226 | | } |
227 | | |
228 | | void codecvt::BufferPartial(mbstate_t& state, int size, |
229 | | char const*& from_next) const |
230 | | { |
231 | | State& lstate = reinterpret_cast<State&>(state); |
232 | | |
233 | | // Save the byte in our buffer for later. |
234 | | lstate.partial[lstate.buffered++] = *from_next; |
235 | | lstate.size = size; |
236 | | |
237 | | // Move past the now-consumed byte in the input buffer. |
238 | | ++from_next; |
239 | | } |
240 | | #endif |
241 | | |
242 | | int codecvt::do_max_length() const noexcept |
243 | 0 | { |
244 | 0 | return 4; |
245 | 0 | } |
246 | | |
247 | | int codecvt::do_encoding() const noexcept |
248 | 0 | { |
249 | 0 | return 0; |
250 | 0 | } |