/src/CMake/Source/cm_codecvt.cxx

Source
/* Distributed under the OSI-approved BSD 3-Clause License.  See accompanying
   file LICENSE.rst or https://cmake.org/licensing for details.  */
#include "cm_codecvt.hxx"

#if defined(_WIN32)
#  include <cassert>
#  include <cstring>

#  include <windows.h>
#  undef max
#  include "cmsys/Encoding.hxx"

#  include "cm_utf8.h"
#endif

#include "cm_codecvt_Encoding.hxx"

// Configure the facet for the requested encoding.  Conversion is only ever
// enabled on Windows, and only for the ConsoleOutput and ANSI encodings;
// every other combination leaves the facet in pass-through mode.
codecvt::codecvt(codecvt_Encoding e)
#if defined(_WIN32)
  : m_codepage(0)
#endif
{
#if defined(_WIN32)
  if (e == codecvt_Encoding::ConsoleOutput) {
    // Convert to whatever code page the console currently uses for output.
    this->m_noconv = false;
    this->m_codepage = GetConsoleOutputCP();
    return;
  }
  if (e == codecvt_Encoding::ANSI) {
    // Convert to the system ANSI code page.
    this->m_noconv = false;
    this->m_codepage = CP_ACP;
    return;
  }
#else
  // We don't know which ANSI encoding to use for other platforms than
  // Windows so we don't do any conversion there.
  static_cast<void>(e);
#endif

  // UTF8 / UTF8_WITH_BOM (the internal encoding is assumed to be UTF-8),
  // None, and any unrecognized value: nothing to convert.
  this->m_noconv = true;
}

// Defaulted destructor, defined out of line; this class adds no explicit
// cleanup of its own.
codecvt::~codecvt() = default;

// Report whether this facet is in pass-through mode, i.e. whether the
// constructor decided that no conversion is performed for the chosen
// encoding.
bool codecvt::do_always_noconv() const noexcept
{
  bool const passThrough = this->m_noconv;
  return passThrough;
}

// Convert the bytes in [from, from_end) and write the result to
// [to, to_end).  On Windows the input is walked one byte at a time as UTF-8:
// bytes of an unfinished codepoint are stashed in the conversion state and
// the codepoint is converted once its final byte arrives.
//
// Returns noconv when the facet is in pass-through mode, error on a
// malformed byte sequence, whatever non-ok result Decode reports, and ok
// once all input has been consumed.  from_next / to_next are left at the
// first unconsumed input byte / first unwritten output byte.
std::codecvt_base::result codecvt::do_out(mbstate_t& state, char const* from,
                                          char const* from_end,
                                          char const*& from_next, char* to,
                                          char* to_end, char*& to_next) const
{
  from_next = from;
  to_next = to;
  if (this->m_noconv) {
    return std::codecvt_base::noconv;
  }
#if defined(_WIN32)
  // Keep only a read-only view of the state here: it must not change until
  // a byte has been fully processed and consumed (with enough room in the
  // output buffer).  The helpers re-cast the state and perform the updates.
  State const& pending = reinterpret_cast<State&>(state);

  while (from_next != from_end) {
    // Number of leading one bits in the next byte.
    unsigned char const leadingOnes =
      cm_utf8_ones[static_cast<unsigned char>(*from_next)];
    bool const isContinuation = (leadingOnes == 1);
    bool const midCodepoint = (pending.buffered != 0);

    // A continuation byte is only valid while a codepoint is in progress,
    // and a codepoint in progress may only be followed by a continuation
    // byte.  Any mismatch is a malformed sequence.
    if (isContinuation != midCodepoint) {
      return std::codecvt_base::error;
    }

    // Work out how many bytes the current codepoint occupies in total.
    int expected = 0;
    if (isContinuation) {
      // 10xx xxxx: the size was recorded when the codepoint started.
      assert(pending.size != 0);
      expected = pending.size;
    } else if (leadingOnes == 0) {
      // 0xxx xxxx: single-byte codepoint.
      expected = 1;
    } else if (leadingOnes <= 4) {
      // 110x xxxx / 1110 xxxx / 1111 0xxx: size equals the leading ones.
      expected = leadingOnes;
    } else {
      // Not a valid UTF-8 lead byte.
      return std::codecvt_base::error;
    }
    assert(expected > 0);

    if (pending.buffered + 1 != expected) {
      // More bytes are still to come: stash this one and move on.
      this->BufferPartial(state, expected, from_next);
      continue;
    }

    // This byte completes the codepoint: convert it now.
    std::codecvt_base::result const status =
      this->Decode(state, expected, from_next, to_next, to_end);
    if (status != std::codecvt_base::ok) {
      return status;
    }
  }

  return std::codecvt_base::ok;
#else
  // Only reached if conversion were enabled on a non-Windows platform;
  // nothing is converted here, so just silence unused-parameter warnings.
  static_cast<void>(state);
  static_cast<void>(from);
  static_cast<void>(from_end);
  static_cast<void>(from_next);
  static_cast<void>(to);
  static_cast<void>(to_end);
  static_cast<void>(to_next);
  return std::codecvt_base::noconv;
#endif
}

// Flush whatever is still held in the conversion state at the end of the
// output.  Returns noconv in pass-through mode.  On Windows, any bytes still
// buffered for an unfinished codepoint are handed to DecodePartial and its
// result is returned; with nothing buffered (and on other platforms) the
// result is ok.
std::codecvt_base::result codecvt::do_unshift(mbstate_t& state, char* to,
                                              char* to_end,
                                              char*& to_next) const
{
  to_next = to;
  if (this->m_noconv) {
    return std::codecvt_base::noconv;
  }
#if defined(_WIN32)
  State& pending = reinterpret_cast<State&>(state);
  return (pending.buffered != 0)
    ? this->DecodePartial(state, to_next, to_end)
    : std::codecvt_base::ok;
#else
  // No state is ever buffered on this platform.
  static_cast<void>(state);
  static_cast<void>(to_end);
  return std::codecvt_base::ok;
#endif
}

#if defined(_WIN32)
// Convert one complete UTF-8 codepoint to the target code page.
//
// The codepoint consists of the bytes already buffered in `state` followed
// by the byte at `*from_next`; `size` is its total length in bytes.
//
// On success the converted bytes are written at `to_next`, `to_next` is
// advanced past them, `from_next` is advanced past the consumed input byte,
// the state is reset, and ok is returned.  On failure nothing is modified:
//   - error   if the bytes are not valid UTF-8 or cannot be converted;
//   - partial if [to_next, to_end) has no room for the converted bytes.
std::codecvt_base::result codecvt::Decode(mbstate_t& state, int size,
                                          char const*& from_next,
                                          char*& to_next, char* to_end) const
{
  State& lstate = reinterpret_cast<State&>(state);

  // Collect all the bytes for this codepoint.
  char buf[4];
  std::memcpy(buf, lstate.partial, lstate.buffered);
  buf[lstate.buffered] = *from_next;

  // Convert the encoding: UTF-8 -> UTF-16 (at most one surrogate pair).
  wchar_t wbuf[2];
  int wlen =
    MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, buf, size, wbuf, 2);
  if (wlen <= 0) {
    return std::codecvt_base::error;
  }

  // WideCharToMultiByte treats an output size of 0 as a size query: it
  // writes nothing and returns the required length.  Without this guard a
  // full output buffer would be reported as a successful conversion and
  // to_next would be advanced past to_end.
  int const room = static_cast<int>(to_end - to_next);
  if (room <= 0) {
    return std::codecvt_base::partial;
  }

  // UTF-16 -> target code page, written directly into the output buffer.
  int tlen = WideCharToMultiByte(m_codepage, 0, wbuf, wlen, to_next, room,
                                 nullptr, nullptr);
  if (tlen <= 0) {
    if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
      return std::codecvt_base::partial;
    }
    return std::codecvt_base::error;
  }

  // Move past the now-consumed byte in the input buffer.
  ++from_next;

  // Move past the converted codepoint in the output buffer.
  to_next += tlen;

  // Re-initialize the state for the next codepoint to start.
  lstate = State();

  return std::codecvt_base::ok;
}

// Try to convert the bytes currently buffered in `state` (a codepoint whose
// remaining bytes never arrived) to the target code page.
//
// On success the converted bytes are written at `to_next`, `to_next` is
// advanced past them, the state is reset, and ok is returned.  On failure
// nothing is modified:
//   - error   if the buffered bytes are not valid UTF-8 on their own or
//             cannot be converted;
//   - partial if [to_next, to_end) has no room for the converted bytes.
std::codecvt_base::result codecvt::DecodePartial(mbstate_t& state,
                                                 char*& to_next,
                                                 char* to_end) const
{
  State& lstate = reinterpret_cast<State&>(state);

  // Try converting the partial codepoint.
  wchar_t wbuf[2];
  int wlen = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, lstate.partial,
                                 lstate.buffered, wbuf, 2);
  if (wlen <= 0) {
    return std::codecvt_base::error;
  }

  // WideCharToMultiByte treats an output size of 0 as a size query: it
  // writes nothing and returns the required length.  Without this guard a
  // full output buffer would be reported as a successful conversion and
  // to_next would be advanced past to_end.
  int const room = static_cast<int>(to_end - to_next);
  if (room <= 0) {
    return std::codecvt_base::partial;
  }

  int tlen = WideCharToMultiByte(m_codepage, 0, wbuf, wlen, to_next, room,
                                 nullptr, nullptr);
  if (tlen <= 0) {
    if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
      return std::codecvt_base::partial;
    }
    return std::codecvt_base::error;
  }

  // Move past the converted codepoint in the output buffer.
  to_next += tlen;

  // Re-initialize the state for the next codepoint to start.
  lstate = State();

  return std::codecvt_base::ok;
}

void codecvt::BufferPartial(mbstate_t& state, int size,
                            char const*& from_next) const
{
  State& lstate = reinterpret_cast<State&>(state);

  // Save the byte in our buffer for later.
  lstate.partial[lstate.buffered++] = *from_next;
  lstate.size = size;

  // Move past the now-consumed byte in the input buffer.
  ++from_next;
}
#endif

// Report the maximum length, in bytes, associated with a single character
// for this facet.
int codecvt::do_max_length() const noexcept
{
  // Matches the longest codepoint size that do_out accepts (4 bytes).
  constexpr int maxBytesPerCodepoint = 4;
  return maxBytesPerCodepoint;
}

// Report the encoding width for this facet.
// NOTE(review): presumably overrides std::codecvt::do_encoding, where 0
// denotes a variable-width encoding -- confirm against cm_codecvt.hxx.
int codecvt::do_encoding() const noexcept
{
  constexpr int variableWidth = 0;
  return variableWidth;
}

Coverage Report

Created: 2026-02-09 06:05

Line	Count	Source
1		/* Distributed under the OSI-approved BSD 3-Clause License. See accompanying
2		file LICENSE.rst or https://cmake.org/licensing for details. */
3		#include "cm_codecvt.hxx"
4
5		#if defined(_WIN32)
6		# include <cassert>
7		# include <cstring>
8
9		# include <windows.h>
10		# undef max
11		# include "cmsys/Encoding.hxx"
12
13		# include "cm_utf8.h"
14		#endif
15
16		#include "cm_codecvt_Encoding.hxx"
17
18		codecvt::codecvt(codecvt_Encoding e)
19		#if defined(_WIN32)
20		: m_codepage(0)
21		#endif
22	0	{
23	0	switch (e) {
24	0	case codecvt_Encoding::ConsoleOutput:
25		#if defined(_WIN32)
26		m_noconv = false;
27		m_codepage = GetConsoleOutputCP();
28		break;
29		#endif
30	0	case codecvt_Encoding::ANSI:
31		#if defined(_WIN32)
32		m_noconv = false;
33		m_codepage = CP_ACP;
34		break;
35		#endif
36		// We don't know which ANSI encoding to use for other platforms than
37		// Windows so we don't do any conversion there
38	0	case codecvt_Encoding::UTF8:
39	0	case codecvt_Encoding::UTF8_WITH_BOM:
40		// Assume internal encoding is UTF-8
41	0	case codecvt_Encoding::None:
42		// No encoding
43	0	default:
44	0	this->m_noconv = true;
45	0	}
46	0	}
47
48	0	codecvt::~codecvt() = default;
49
50		bool codecvt::do_always_noconv() const noexcept
51	0	{
52	0	return this->m_noconv;
53	0	}
54
55		std::codecvt_base::result codecvt::do_out(mbstate_t& state, char const* from,
56		char const* from_end,
57		char const*& from_next, char* to,
58		char* to_end, char*& to_next) const
59	0	{
60	0	from_next = from;
61	0	to_next = to;
62	0	if (this->m_noconv) {
63	0	return std::codecvt_base::noconv;
64	0	}
65		#if defined(_WIN32)
66		// Use a const view of the state because we should not modify it until we
67		// have fully processed and consume a byte (with sufficient space in the
68		// output buffer). We call helpers to re-cast and modify the state
69		State const& lstate = reinterpret_cast<State&>(state);
70
71		while (from_next != from_end) {
72		// Count leading ones in the bits of the next byte.
73		unsigned char const ones =
74		cm_utf8_ones[static_cast<unsigned char>(*from_next)];
75
76		if (ones != 1 && lstate.buffered != 0) {
77		// We have a buffered partial codepoint that we never completed.
78		return std::codecvt_base::error;
79		} else if (ones == 1 && lstate.buffered == 0) {
80		// This is a continuation of a codepoint that never started.
81		return std::codecvt_base::error;
82		}
83
84		// Compute the number of bytes in the current codepoint.
85		int need = 0;
86		switch (ones) {
87		case 0: // 0xxx xxxx: new codepoint of size 1
88		need = 1;
89		break;
90		case 1: // 10xx xxxx: continues a codepoint
91		assert(lstate.size != 0);
92		need = lstate.size;
93		break;
94		case 2: // 110x xxxx: new codepoint of size 2
95		need = 2;
96		break;
97		case 3: // 1110 xxxx: new codepoint of size 3
98		need = 3;
99		break;
100		case 4: // 1111 0xxx: new codepoint of size 4
101		need = 4;
102		break;
103		default: // invalid byte
104		return std::codecvt_base::error;
105		}
106		assert(need > 0);
107
108		if (lstate.buffered + 1 == need) {
109		// This byte completes a codepoint.
110		std::codecvt_base::result decode_result =
111		this->Decode(state, need, from_next, to_next, to_end);
112		if (decode_result != std::codecvt_base::ok) {
113		return decode_result;
114		}
115		} else {
116		// This byte does not complete a codepoint.
117		this->BufferPartial(state, need, from_next);
118		}
119		}
120
121		return std::codecvt_base::ok;
122		#else
123	0	static_cast<void>(state);
124	0	static_cast<void>(from);
125	0	static_cast<void>(from_end);
126	0	static_cast<void>(from_next);
127	0	static_cast<void>(to);
128	0	static_cast<void>(to_end);
129	0	static_cast<void>(to_next);
130	0	return std::codecvt_base::noconv;
131	0	#endif
132	0	}
133
134		std::codecvt_base::result codecvt::do_unshift(mbstate_t& state, char* to,
135		char* to_end,
136		char*& to_next) const
137	0	{
138	0	to_next = to;
139	0	if (this->m_noconv) {
140	0	return std::codecvt_base::noconv;
141	0	}
142		#if defined(_WIN32)
143		State& lstate = reinterpret_cast<State&>(state);
144		if (lstate.buffered != 0) {
145		return this->DecodePartial(state, to_next, to_end);
146		}
147		return std::codecvt_base::ok;
148		#else
149	0	static_cast<void>(state);
150	0	static_cast<void>(to_end);
151	0	return std::codecvt_base::ok;
152	0	#endif
153	0	}
154
155		#if defined(_WIN32)
156		std::codecvt_base::result codecvt::Decode(mbstate_t& state, int size,
157		char const*& from_next,
158		char*& to_next, char* to_end) const
159		{
160		State& lstate = reinterpret_cast<State&>(state);
161
162		// Collect all the bytes for this codepoint.
163		char buf[4];
164		memcpy(buf, lstate.partial, lstate.buffered);
165		buf[lstate.buffered] = *from_next;
166
167		// Convert the encoding.
168		wchar_t wbuf[2];
169		int wlen =
170		MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, buf, size, wbuf, 2);
171		if (wlen <= 0) {
172		return std::codecvt_base::error;
173		}
174
175		int tlen = WideCharToMultiByte(m_codepage, 0, wbuf, wlen, to_next,
176		to_end - to_next, nullptr, nullptr);
177		if (tlen <= 0) {
178		if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
179		return std::codecvt_base::partial;
180		}
181		return std::codecvt_base::error;
182		}
183
184		// Move past the now-consumed byte in the input buffer.
185		++from_next;
186
187		// Move past the converted codepoint in the output buffer.
188		to_next += tlen;
189
190		// Re-initialize the state for the next codepoint to start.
191		lstate = State();
192
193		return std::codecvt_base::ok;
194		}
195
196		std::codecvt_base::result codecvt::DecodePartial(mbstate_t& state,
197		char*& to_next,
198		char* to_end) const
199		{
200		State& lstate = reinterpret_cast<State&>(state);
201
202		// Try converting the partial codepoint.
203		wchar_t wbuf[2];
204		int wlen = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, lstate.partial,
205		lstate.buffered, wbuf, 2);
206		if (wlen <= 0) {
207		return std::codecvt_base::error;
208		}
209
210		int tlen = WideCharToMultiByte(m_codepage, 0, wbuf, wlen, to_next,
211		to_end - to_next, nullptr, nullptr);
212		if (tlen <= 0) {
213		if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
214		return std::codecvt_base::partial;
215		}
216		return std::codecvt_base::error;
217		}
218
219		// Move past the converted codepoint in the output buffer.
220		to_next += tlen;
221
222		// Re-initialize the state for the next codepoint to start.
223		lstate = State();
224
225		return std::codecvt_base::ok;
226		}
227
228		void codecvt::BufferPartial(mbstate_t& state, int size,
229		char const*& from_next) const
230		{
231		State& lstate = reinterpret_cast<State&>(state);
232
233		// Save the byte in our buffer for later.
234		lstate.partial[lstate.buffered++] = *from_next;
235		lstate.size = size;
236
237		// Move past the now-consumed byte in the input buffer.
238		++from_next;
239		}
240		#endif
241
242		int codecvt::do_max_length() const noexcept
243	0	{
244	0	return 4;
245	0	}
246
247		int codecvt::do_encoding() const noexcept
248	0	{
249	0	return 0;
250	0	}