/src/libtorrent/src/utf8.cpp

Source (jump to first uncovered line)
/*

Copyright (c) 2017, Andrei Kurushin
Copyright (c) 2012, 2015-2017, 2019-2020, Arvid Norberg
Copyright (c) 2017, Alden Torres
Copyright (c) 2021, AllSeeingEyeTolledEweSew
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the distribution.
    * Neither the name of the author nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

*/

#include "libtorrent/config.hpp"

#include <iterator>
#include "libtorrent/utf8.hpp"
#include "libtorrent/assert.hpp"
#include "libtorrent/error_code.hpp"
#include "libtorrent/aux_/throw.hpp"
#include "libtorrent/aux_/numeric_cast.hpp"


namespace libtorrent {

namespace {
  // Classifies the lead byte ``c`` of a UTF-8 sequence and returns the total
  // number of bytes in the sequence it introduces. Returns 0 if the lead
  // byte is invalid (i.e. it matches none of the bit patterns below).
  int utf8_sequence_length(char const c)
  {
    // work on the unsigned value so the high bit can be tested directly
    auto const lead = static_cast<std::uint8_t>(c);

    // 0xxxxxxx: single byte
    if ((lead & 0b10000000) == 0) return 1;
    // 110xxxxx
    if ((lead & 0b11100000) == 0b11000000) return 2;
    // 1110xxxx
    if ((lead & 0b11110000) == 0b11100000) return 3;
    // 11110xxx
    if ((lead & 0b11111000) == 0b11110000) return 4;
    // 111110xx: not a valid prefix, but the length is still reported so
    // that this many bytes can be skipped
    if ((lead & 0b11111100) == 0b11111000) return 5;

    return 0;
  }

} // anonymous namespace

  std::int32_t const max_codepoint = 0x10ffff;
  std::int32_t const surrogate_start = 0xd800;
  std::int32_t const surrogate_end = 0xdfff;

  // Encodes ``codepoint`` as UTF-8 and appends the resulting bytes (1 to 4
  // of them) to ``ret``.
  //
  // Values that cannot be encoded are replaced by '_' before encoding:
  //   * negative values
  //   * values in the surrogate range [surrogate_start, surrogate_end]
  //   * values greater than max_codepoint
  void append_utf8_codepoint(std::string& ret, std::int32_t codepoint)
  {
    bool const is_surrogate = codepoint >= surrogate_start
      && codepoint <= surrogate_end;

    // a negative value would otherwise be treated as a single-byte
    // codepoint and be truncated into an arbitrary (possibly non-ASCII) byte
    if (codepoint < 0 || is_surrogate || codepoint > max_codepoint)
      codepoint = '_';

    // single byte: the codepoint is its own encoding
    if (codepoint < 0x80)
    {
      ret.push_back(static_cast<char>(codepoint));
      return;
    }

    // the total number of bytes in the encoded sequence, and the marker
    // bits identifying the lead byte of a sequence of that length
    int seq_len = 0;
    int lead_marker = 0;
    if (codepoint < 0x800)
    {
      seq_len = 2;
      lead_marker = 0b11000000;
    }
    else if (codepoint < 0x10000)
    {
      seq_len = 3;
      lead_marker = 0b11100000;
    }
    else
    {
      seq_len = 4;
      lead_marker = 0b11110000;
    }

    // the lead byte carries the most significant bits, i.e. whatever is
    // left above the 6 bits stored in each of the continuation bytes
    int const continuation_bytes = seq_len - 1;
    ret.push_back(static_cast<char>(lead_marker
      | (codepoint >> (6 * continuation_bytes))));

    // continuation bytes, 10xxxxxx, most significant group first
    for (int i = continuation_bytes - 1; i >= 0; --i)
      ret.push_back(static_cast<char>(0b10000000 | ((codepoint >> (6 * i)) & 0b111111)));
  }

  // Decodes the UTF-8 sequence at the start of ``str``.
  //
  // Returns the unicode codepoint and the number of bytes of the utf8
  // sequence that was parsed (i.e. how far to advance to reach the next
  // sequence). The returned length never exceeds ``str.size()``.
  //
  // The codepoint is -1 if the sequence is invalid:
  //   * invalid lead byte: length is 1
  //   * the sequence is longer than what is left of ``str``: length is
  //     ``str.size()``
  //   * lead byte of a 5-byte sequence: length is 5
  //   * a byte following the lead byte is not a continuation byte, the
  //     encoding is overlong, the value is above max_codepoint or it is a
  //     surrogate: length is the sequence length indicated by the lead byte
  //
  // ``str`` must not be empty. An empty ``str`` trips the assert; should
  // execution continue past it, (-1, 0) is returned.
  std::pair<std::int32_t, int> parse_utf8_codepoint(string_view str)
  {
    TORRENT_ASSERT(!str.empty());
    if (str.empty()) return std::make_pair(-1, 0);

    int const sequence_len = utf8_sequence_length(str[0]);

    // this is likely the most common case
    if (sequence_len == 1) return std::make_pair(std::int32_t(str[0]), sequence_len);

    // if we find an invalid sequence length, skip one byte
    if (sequence_len == 0)
      return std::make_pair(-1, 1);

    // the sequence is truncated. This has to be checked before rejecting
    // 5-byte sequences, otherwise we could claim to have parsed more bytes
    // than there are in the string
    if (sequence_len > int(str.size()))
      return std::make_pair(-1, static_cast<int>(str.size()));

    // 5-byte sequences are not valid, skip all of it
    if (sequence_len > 4)
      return std::make_pair(-1, sequence_len);

    // at this point the sequence length is 2, 3 or 4

    std::int32_t ch = 0;
    // first byte, strip the bits encoding the sequence length
    switch (sequence_len)
    {
      case 2:
        ch = str[0] & 0b00011111;
        break;
      case 3:
        ch = str[0] & 0b00001111;
        break;
      case 4:
        ch = str[0] & 0b00000111;
        break;
    }
    for (int i = 1; i < sequence_len; ++i)
    {
      auto const b = static_cast<std::uint8_t>(str[static_cast<std::size_t>(i)]);
      // continuation bytes must start with 10xxxxxx
      if (b > 0b10111111 || b < 0b10000000)
        return std::make_pair(-1, sequence_len);
      // each continuation byte contributes its 6 low bits
      ch <<= 6;
      ch += b & 0b111111;
    }

    // check if the sequence is overlong, i.e. whether it has leading
    // (redundant) zeros and could have been encoded in fewer bytes
    switch (sequence_len)
    {
      case 2:
        if (ch < 0x80) return std::make_pair(-1, sequence_len);
        break;
      case 3:
        if (ch < 0x800) return std::make_pair(-1, sequence_len);
        break;
      case 4:
        if (ch < 0x10000) return std::make_pair(-1, sequence_len);
        break;
    }

    if (ch > max_codepoint)
      return std::make_pair(-1, sequence_len);

    // per RFC 3629, surrogates should not appear in utf-8
    if (ch >= surrogate_start && ch <= surrogate_end)
      return std::make_pair(-1, sequence_len);

    return std::make_pair(ch, sequence_len);
  }
}

Line	Count	Source (jump to first uncovered line)
1		/*
2
3		Copyright (c) 2017, Andrei Kurushin
4		Copyright (c) 2012, 2015-2017, 2019-2020, Arvid Norberg
5		Copyright (c) 2017, Alden Torres
6		Copyright (c) 2021, AllSeeingEyeTolledEweSew
7		All rights reserved.
8
9		Redistribution and use in source and binary forms, with or without
10		modification, are permitted provided that the following conditions
11		are met:
12
13		* Redistributions of source code must retain the above copyright
14		notice, this list of conditions and the following disclaimer.
15		* Redistributions in binary form must reproduce the above copyright
16		notice, this list of conditions and the following disclaimer in
17		the documentation and/or other materials provided with the distribution.
18		* Neither the name of the author nor the names of its
19		contributors may be used to endorse or promote products derived
20		from this software without specific prior written permission.
21
22		THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
23		AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24		IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25		ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
26		LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27		CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28		SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29		INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
30		CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31		ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32		POSSIBILITY OF SUCH DAMAGE.
33
34		*/
35
36		#include "libtorrent/config.hpp"
37
38		#include <iterator>
39		#include "libtorrent/utf8.hpp"
40		#include "libtorrent/assert.hpp"
41		#include "libtorrent/error_code.hpp"
42		#include "libtorrent/aux_/throw.hpp"
43		#include "libtorrent/aux_/numeric_cast.hpp"
44
45
46		namespace libtorrent {
47
48		namespace {
49		// return the number of bytes in the UTF-8 sequence starting with this
50		// character. Returns 0 if the lead by is invalid
51		int utf8_sequence_length(char const c)
52	165	{
53	165	auto const b = static_cast<std::uint8_t>(c);
54	165	if (b < 0b10000000) return 1;
55	0	if ((b >> 5) == 0b110) return 2;
56	0	if ((b >> 4) == 0b1110) return 3;
57	0	if ((b >> 3) == 0b11110) return 4;
58		// this is an invalid prefix, but we still parse it to skip this many
59		// bytes
60	0	if ((b >> 2) == 0b111110) return 5;
61	0	return 0;
62	0	}
63
64		} // anonymous namespace
65
66		std::int32_t const max_codepoint = 0x10ffff;
67		std::int32_t const surrogate_start = 0xd800;
68		std::int32_t const surrogate_end = 0xdfff;
69
70		void append_utf8_codepoint(std::string& ret, std::int32_t codepoint)
71	40	{
72	40	if (codepoint >= surrogate_start
73	40	&& codepoint <= surrogate_end)
74	0	codepoint = '_';
75
76	40	if (codepoint > max_codepoint)
77	0	codepoint = '_';
78
79	40	int seq_len = 0;
80	40	if (codepoint < 0x80) seq_len = 1;
81	0	else if (codepoint < 0x800) seq_len = 2;
82	0	else if (codepoint < 0x10000) seq_len = 3;
83	0	else seq_len = 4;
84
85	40	switch (seq_len)
86	40	{
87	40	case 1:
88	40	ret.push_back(static_cast<char>(codepoint));
89	40	break;
90	0	case 2:
91	0	ret.push_back(static_cast<char>(0b11000000 \| (codepoint >> 6)));
92	0	break;
93	0	case 3:
94	0	ret.push_back(static_cast<char>(0b11100000 \| (codepoint >> 12)));
95	0	break;
96	0	case 4:
97	0	ret.push_back(static_cast<char>(0b11110000 \| (codepoint >> 18)));
98	0	break;
99	40	}
100
101	40	for (int i = seq_len - 2; i >= 0; --i)
102	0	ret.push_back(static_cast<char>(0b10000000 \| ((codepoint >> (6 * i)) & 0b111111)));
103	40	}
104
105		// returns the unicode codepoint and the number of bytes of the utf8 sequence
106		// that was parsed. The codepoint is -1 if it's invalid
107		std::pair<std::int32_t, int> parse_utf8_codepoint(string_view str)
108	165	{
109	165	TORRENT_ASSERT(!str.empty());
110	165	if (str.empty()) return std::make_pair(-1, 0);
111
112	165	int const sequence_len = utf8_sequence_length(str[0]);
113
114		// this is likely the most common case
115	165	if (sequence_len == 1) return std::make_pair(std::int32_t(str[0]), sequence_len);
116
117		// if we find an invalid sequence length, skip one byte
118	0	if (sequence_len == 0)
119	0	return std::make_pair(-1, 1);
120
121	0	if (sequence_len > 4)
122	0	return std::make_pair(-1, sequence_len);
123
124	0	if (sequence_len > int(str.size()))
125	0	return std::make_pair(-1, static_cast<int>(str.size()));
126
127	0	std::int32_t ch = 0;
128		// first byte
129	0	switch (sequence_len)
130	0	{
131	0	case 1:
132	0	ch = str[0] & 0b01111111;
133	0	break;
134	0	case 2:
135	0	ch = str[0] & 0b00011111;
136	0	break;
137	0	case 3:
138	0	ch = str[0] & 0b00001111;
139	0	break;
140	0	case 4:
141	0	ch = str[0] & 0b00000111;
142	0	break;
143	0	}
144	0	for (int i = 1; i < sequence_len; ++i)
145	0	{
146	0	auto const b = static_cast<std::uint8_t>(str[static_cast<std::size_t>(i)]);
147		// continuation bytes must start with 10xxxxxx
148	0	if (b > 0b10111111 \|\| b < 0b10000000)
149	0	return std::make_pair(-1, sequence_len);
150	0	ch <<= 6;
151	0	ch += b & 0b111111;
152	0	}
153
154		// check if the sequence is overlong, i.e. whether it has leading
155		// (redundant) zeros
156	0	switch (sequence_len)
157	0	{
158	0	case 2:
159	0	if (ch < 0x80) return std::make_pair(-1, sequence_len);
160	0	break;
161	0	case 3:
162	0	if (ch < 0x800) return std::make_pair(-1, sequence_len);
163	0	break;
164	0	case 4:
165	0	if (ch < 0x10000) return std::make_pair(-1, sequence_len);
166	0	break;
167	0	}
168
169	0	if (ch > max_codepoint)
170	0	return std::make_pair(-1, sequence_len);
171
172		// per RFC 3629, surrogates should not appear in utf-8
173	0	if (ch >= surrogate_start && ch <= surrogate_end)
174	0	return std::make_pair(-1, sequence_len);
175
176	0	return std::make_pair(static_cast<std::int32_t>(ch), sequence_len);
177	0	}
178		}

Coverage Report

Created: 2025-08-28 06:21