Coverage Report

Created: 2025-08-28 06:21

/src/libtorrent/src/utf8.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
3
Copyright (c) 2017, Andrei Kurushin
4
Copyright (c) 2012, 2015-2017, 2019-2020, Arvid Norberg
5
Copyright (c) 2017, Alden Torres
6
Copyright (c) 2021, AllSeeingEyeTolledEweSew
7
All rights reserved.
8
9
Redistribution and use in source and binary forms, with or without
10
modification, are permitted provided that the following conditions
11
are met:
12
13
    * Redistributions of source code must retain the above copyright
14
      notice, this list of conditions and the following disclaimer.
15
    * Redistributions in binary form must reproduce the above copyright
16
      notice, this list of conditions and the following disclaimer in
17
      the documentation and/or other materials provided with the distribution.
18
    * Neither the name of the author nor the names of its
19
      contributors may be used to endorse or promote products derived
20
      from this software without specific prior written permission.
21
22
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
23
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
26
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
30
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32
POSSIBILITY OF SUCH DAMAGE.
33
34
*/
35
36
#include "libtorrent/config.hpp"
37
38
#include <iterator>
39
#include "libtorrent/utf8.hpp"
40
#include "libtorrent/assert.hpp"
41
#include "libtorrent/error_code.hpp"
42
#include "libtorrent/aux_/throw.hpp"
43
#include "libtorrent/aux_/numeric_cast.hpp"
44
45
46
namespace libtorrent {
47
48
namespace {
49
  // Classifies the lead byte of a UTF-8 sequence and reports how many bytes
  // the sequence it introduces occupies. Returns 0 if the lead byte is
  // invalid.
  int utf8_sequence_length(char const c)
  {
    unsigned const lead = static_cast<unsigned char>(c);

    // 0xxxxxxx: single byte
    if ((lead & 0x80u) == 0x00u) return 1;
    // 110xxxxx
    if ((lead & 0xe0u) == 0xc0u) return 2;
    // 1110xxxx
    if ((lead & 0xf0u) == 0xe0u) return 3;
    // 11110xxx
    if ((lead & 0xf8u) == 0xf0u) return 4;
    // 111110xx: this is an invalid prefix, but the length is still reported
    // so that this many bytes can be skipped
    if ((lead & 0xfcu) == 0xf8u) return 5;

    // anything else (10xxxxxx and 111111xx) cannot start a sequence
    return 0;
  }
63
64
} // anonymous namespace
65
66
  // codepoints above this value are rejected by the functions in this file
  std::int32_t const max_codepoint = 0x10ffff;
  // first and last codepoint of the surrogate range, which must not appear
  // in UTF-8
  std::int32_t const surrogate_start = 0xd800;
  std::int32_t const surrogate_end = 0xdfff;

  // Appends the UTF-8 encoding of codepoint to ret. Values that cannot be
  // encoded (negative values, surrogates and anything above max_codepoint)
  // are replaced by '_', so exactly one well-formed sequence is appended per
  // call.
  void append_utf8_codepoint(std::string& ret, std::int32_t codepoint)
  {
    if (codepoint >= surrogate_start
      && codepoint <= surrogate_end)
      codepoint = '_';

    // a negative value would otherwise take the single-byte path below and
    // append an arbitrary, possibly non-ASCII, byte
    if (codepoint < 0 || codepoint > max_codepoint)
      codepoint = '_';

    // number of bytes needed to encode the codepoint
    int seq_len = 0;
    if (codepoint < 0x80) seq_len = 1;
    else if (codepoint < 0x800) seq_len = 2;
    else if (codepoint < 0x10000) seq_len = 3;
    else seq_len = 4;

    // lead byte: the length prefix followed by the most significant bits
    switch (seq_len)
    {
      case 1:
        ret.push_back(static_cast<char>(codepoint));
        break;
      case 2:
        ret.push_back(static_cast<char>(0b11000000 | (codepoint >> 6)));
        break;
      case 3:
        ret.push_back(static_cast<char>(0b11100000 | (codepoint >> 12)));
        break;
      case 4:
        ret.push_back(static_cast<char>(0b11110000 | (codepoint >> 18)));
        break;
    }

    // continuation bytes, 6 bits each, most significant group first
    for (int i = seq_len - 2; i >= 0; --i)
      ret.push_back(static_cast<char>(0b10000000 | ((codepoint >> (6 * i)) & 0b111111)));
  }
104
105
  // returns the unicode codepoint and the number of bytes of the utf8 sequence
106
  // that was parsed. The codepoint is -1 if it's invalid
107
  std::pair<std::int32_t, int> parse_utf8_codepoint(string_view str)
108
165
  {
109
165
    TORRENT_ASSERT(!str.empty());
110
165
    if (str.empty()) return std::make_pair(-1, 0);
111
112
165
    int const sequence_len = utf8_sequence_length(str[0]);
113
114
    // this is likely the most common case
115
165
    if (sequence_len == 1) return std::make_pair(std::int32_t(str[0]), sequence_len);
116
117
    // if we find an invalid sequence length, skip one byte
118
0
    if (sequence_len == 0)
119
0
      return std::make_pair(-1, 1);
120
121
0
    if (sequence_len > 4)
122
0
      return std::make_pair(-1, sequence_len);
123
124
0
    if (sequence_len > int(str.size()))
125
0
      return std::make_pair(-1, static_cast<int>(str.size()));
126
127
0
    std::int32_t ch = 0;
128
    // first byte
129
0
    switch (sequence_len)
130
0
    {
131
0
      case 1:
132
0
        ch = str[0] & 0b01111111;
133
0
        break;
134
0
      case 2:
135
0
        ch = str[0] & 0b00011111;
136
0
        break;
137
0
      case 3:
138
0
        ch = str[0] & 0b00001111;
139
0
        break;
140
0
      case 4:
141
0
        ch = str[0] & 0b00000111;
142
0
        break;
143
0
    }
144
0
    for (int i = 1; i < sequence_len; ++i)
145
0
    {
146
0
      auto const b = static_cast<std::uint8_t>(str[static_cast<std::size_t>(i)]);
147
      // continuation bytes must start with 10xxxxxx
148
0
      if (b > 0b10111111 || b < 0b10000000)
149
0
        return std::make_pair(-1, sequence_len);
150
0
      ch <<= 6;
151
0
      ch += b & 0b111111;
152
0
    }
153
154
    // check if the sequence is overlong, i.e. whether it has leading
155
    // (redundant) zeros
156
0
    switch (sequence_len)
157
0
    {
158
0
      case 2:
159
0
        if (ch < 0x80) return std::make_pair(-1, sequence_len);
160
0
        break;
161
0
      case 3:
162
0
        if (ch < 0x800) return std::make_pair(-1, sequence_len);
163
0
        break;
164
0
      case 4:
165
0
        if (ch < 0x10000) return std::make_pair(-1, sequence_len);
166
0
        break;
167
0
    }
168
169
0
    if (ch > max_codepoint)
170
0
      return std::make_pair(-1, sequence_len);
171
172
    // per RFC 3629, surrogates should not appear in utf-8
173
0
    if (ch >= surrogate_start && ch <= surrogate_end)
174
0
      return std::make_pair(-1, sequence_len);
175
176
0
    return std::make_pair(static_cast<std::int32_t>(ch), sequence_len);
177
0
  }
178
}