/src/libtorrent/src/utf8.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | |
3 | | Copyright (c) 2017, Andrei Kurushin |
4 | | Copyright (c) 2012, 2015-2017, 2019-2020, Arvid Norberg |
5 | | Copyright (c) 2017, Alden Torres |
6 | | Copyright (c) 2021, AllSeeingEyeTolledEweSew |
7 | | All rights reserved. |
8 | | |
9 | | Redistribution and use in source and binary forms, with or without |
10 | | modification, are permitted provided that the following conditions |
11 | | are met: |
12 | | |
13 | | * Redistributions of source code must retain the above copyright |
14 | | notice, this list of conditions and the following disclaimer. |
15 | | * Redistributions in binary form must reproduce the above copyright |
16 | | notice, this list of conditions and the following disclaimer in |
17 | | the documentation and/or other materials provided with the distribution. |
18 | | * Neither the name of the author nor the names of its |
19 | | contributors may be used to endorse or promote products derived |
20 | | from this software without specific prior written permission. |
21 | | |
22 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
23 | | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
24 | | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
25 | | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
26 | | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
27 | | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
28 | | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
29 | | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
30 | | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
31 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
32 | | POSSIBILITY OF SUCH DAMAGE. |
33 | | |
34 | | */ |
35 | | |
36 | | #include "libtorrent/config.hpp" |
37 | | |
38 | | #include <iterator> |
39 | | #include "libtorrent/utf8.hpp" |
40 | | #include "libtorrent/assert.hpp" |
41 | | #include "libtorrent/error_code.hpp" |
42 | | #include "libtorrent/aux_/throw.hpp" |
43 | | #include "libtorrent/aux_/numeric_cast.hpp" |
44 | | |
45 | | |
46 | | namespace libtorrent { |
47 | | |
48 | | namespace { |
// Classifies a UTF-8 lead byte and returns the length, in bytes, of the
// sequence it introduces. Returns 0 if the lead byte is invalid (this
// includes continuation bytes, 10xxxxxx, and anything from 1111110x up).
int utf8_sequence_length(char const c)
{
	auto const lead = static_cast<std::uint8_t>(c);

	// 0xxxxxxx: plain 7-bit ASCII
	if ((lead & 0b10000000) == 0) return 1;
	// 110xxxxx
	if ((lead & 0b11100000) == 0b11000000) return 2;
	// 1110xxxx
	if ((lead & 0b11110000) == 0b11100000) return 3;
	// 11110xxx
	if ((lead & 0b11111000) == 0b11110000) return 4;
	// 111110xx: not a valid prefix, but the length is still reported so
	// that the caller can skip this many bytes
	if ((lead & 0b11111100) == 0b11111000) return 5;

	return 0;
}
63 | | |
64 | | } // anonymous namespace |
65 | | |
// the highest codepoint that may be encoded
std::int32_t const max_codepoint = 0x10ffff;
// the inclusive range of UTF-16 surrogates, which are never valid
// codepoints on their own
std::int32_t const surrogate_start = 0xd800;
std::int32_t const surrogate_end = 0xdfff;

// Appends the UTF-8 encoding of ``codepoint`` to ``ret``.
//
// A codepoint that cannot be encoded is replaced by '_'. That covers
// negative values, the surrogate range and anything above max_codepoint.
void append_utf8_codepoint(std::string& ret, std::int32_t codepoint)
{
	bool const is_surrogate = codepoint >= surrogate_start
		&& codepoint <= surrogate_end;

	// a negative value would otherwise be taken for a 1-byte sequence and
	// truncated to a single raw byte (-1 becomes 0xff), which is not valid
	// UTF-8
	if (codepoint < 0 || codepoint > max_codepoint || is_surrogate)
		codepoint = '_';

	int seq_len = 0;
	if (codepoint < 0x80) seq_len = 1;
	else if (codepoint < 0x800) seq_len = 2;
	else if (codepoint < 0x10000) seq_len = 3;
	else seq_len = 4;

	// the lead byte carries the length marker in its top bits, followed by
	// the most significant bits of the codepoint
	switch (seq_len)
	{
		case 1:
			ret.push_back(static_cast<char>(codepoint));
			break;
		case 2:
			ret.push_back(static_cast<char>(0b11000000 | (codepoint >> 6)));
			break;
		case 3:
			ret.push_back(static_cast<char>(0b11100000 | (codepoint >> 12)));
			break;
		case 4:
			ret.push_back(static_cast<char>(0b11110000 | (codepoint >> 18)));
			break;
	}

	// every continuation byte is 10xxxxxx and holds the next 6 bits of the
	// codepoint, most significant group first
	for (int i = seq_len - 2; i >= 0; --i)
		ret.push_back(static_cast<char>(0b10000000 | ((codepoint >> (6 * i)) & 0b111111)));
}
104 | | |
105 | | // returns the unicode codepoint and the number of bytes of the utf8 sequence |
106 | | // that was parsed. The codepoint is -1 if it's invalid |
107 | | std::pair<std::int32_t, int> parse_utf8_codepoint(string_view str) |
108 | 165 | { |
109 | 165 | TORRENT_ASSERT(!str.empty()); |
110 | 165 | if (str.empty()) return std::make_pair(-1, 0); |
111 | | |
112 | 165 | int const sequence_len = utf8_sequence_length(str[0]); |
113 | | |
114 | | // this is likely the most common case |
115 | 165 | if (sequence_len == 1) return std::make_pair(std::int32_t(str[0]), sequence_len); |
116 | | |
117 | | // if we find an invalid sequence length, skip one byte |
118 | 0 | if (sequence_len == 0) |
119 | 0 | return std::make_pair(-1, 1); |
120 | | |
121 | 0 | if (sequence_len > 4) |
122 | 0 | return std::make_pair(-1, sequence_len); |
123 | | |
124 | 0 | if (sequence_len > int(str.size())) |
125 | 0 | return std::make_pair(-1, static_cast<int>(str.size())); |
126 | | |
127 | 0 | std::int32_t ch = 0; |
128 | | // first byte |
129 | 0 | switch (sequence_len) |
130 | 0 | { |
131 | 0 | case 1: |
132 | 0 | ch = str[0] & 0b01111111; |
133 | 0 | break; |
134 | 0 | case 2: |
135 | 0 | ch = str[0] & 0b00011111; |
136 | 0 | break; |
137 | 0 | case 3: |
138 | 0 | ch = str[0] & 0b00001111; |
139 | 0 | break; |
140 | 0 | case 4: |
141 | 0 | ch = str[0] & 0b00000111; |
142 | 0 | break; |
143 | 0 | } |
144 | 0 | for (int i = 1; i < sequence_len; ++i) |
145 | 0 | { |
146 | 0 | auto const b = static_cast<std::uint8_t>(str[static_cast<std::size_t>(i)]); |
147 | | // continuation bytes must start with 10xxxxxx |
148 | 0 | if (b > 0b10111111 || b < 0b10000000) |
149 | 0 | return std::make_pair(-1, sequence_len); |
150 | 0 | ch <<= 6; |
151 | 0 | ch += b & 0b111111; |
152 | 0 | } |
153 | | |
154 | | // check if the sequence is overlong, i.e. whether it has leading |
155 | | // (redundant) zeros |
156 | 0 | switch (sequence_len) |
157 | 0 | { |
158 | 0 | case 2: |
159 | 0 | if (ch < 0x80) return std::make_pair(-1, sequence_len); |
160 | 0 | break; |
161 | 0 | case 3: |
162 | 0 | if (ch < 0x800) return std::make_pair(-1, sequence_len); |
163 | 0 | break; |
164 | 0 | case 4: |
165 | 0 | if (ch < 0x10000) return std::make_pair(-1, sequence_len); |
166 | 0 | break; |
167 | 0 | } |
168 | | |
169 | 0 | if (ch > max_codepoint) |
170 | 0 | return std::make_pair(-1, sequence_len); |
171 | | |
172 | | // per RFC 3629, surrogates should not appear in utf-8 |
173 | 0 | if (ch >= surrogate_start && ch <= surrogate_end) |
174 | 0 | return std::make_pair(-1, sequence_len); |
175 | | |
176 | 0 | return std::make_pair(static_cast<std::int32_t>(ch), sequence_len); |
177 | 0 | } |
178 | | } |