/src/solidity/libsolutil/UTF8.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | This file is part of solidity. |
3 | | |
4 | | solidity is free software: you can redistribute it and/or modify |
5 | | it under the terms of the GNU General Public License as published by |
6 | | the Free Software Foundation, either version 3 of the License, or |
7 | | (at your option) any later version. |
8 | | |
9 | | solidity is distributed in the hope that it will be useful, |
10 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 | | GNU General Public License for more details. |
13 | | |
14 | | You should have received a copy of the GNU General Public License |
15 | | along with solidity. If not, see <http://www.gnu.org/licenses/>. |
16 | | */ |
17 | | // SPDX-License-Identifier: GPL-3.0 |
18 | | /** @file UTF8.cpp |
19 | | * @author Alex Beregszaszi |
20 | | * @date 2016 |
21 | | * |
22 | | * UTF-8 related helpers |
23 | | */ |
24 | | |
25 | | #include <libsolutil/UTF8.h> |
26 | | |
27 | | namespace solidity::util |
28 | | { |
namespace
{

/// Checks the first two bytes of a multi-byte sequence against the
/// Unicode chapter 3 Table 3-7 ("Well-Formed UTF-8 Byte Sequences").
///
/// Only the restrictions that depend on the lead byte are checked here
/// (overlong forms, the surrogate range and code points above U+10FFFF).
/// Whether @a byte2 is a continuation byte at all (10xxxxxx) is checked by the caller.
///
/// @param byte1 lead byte of the sequence.
/// @param byte2 byte directly following the lead byte.
/// @returns true if the pair may start a well-formed sequence, false otherwise.
bool isWellFormed(unsigned char byte1, unsigned char byte2)
{
	// 0xc0 and 0xc1 could only encode overlong forms of ASCII.
	if (byte1 == 0xc0 || byte1 == 0xc1)
		return false;
	else if (byte1 >= 0xc2 && byte1 <= 0xdf)
		return true;
	else if (byte1 == 0xe0)
		// Second byte below 0xa0 would be an overlong three-byte form.
		return byte2 >= 0xa0;
	else if (byte1 >= 0xe1 && byte1 <= 0xec)
		return true;
	else if (byte1 == 0xed)
		// Second byte above 0x9f would encode a surrogate (U+D800..U+DFFF).
		return byte2 <= 0x9f;
	else if (byte1 == 0xee || byte1 == 0xef)
		return true;
	else if (byte1 == 0xf0)
		// Second byte below 0x90 would be an overlong four-byte form.
		return byte2 >= 0x90;
	else if (byte1 >= 0xf1 && byte1 <= 0xf3)
		return true;
	else if (byte1 == 0xf4)
		// Second byte above 0x8f would encode a code point beyond U+10FFFF.
		return byte2 <= 0x8f;

	// 0xf5 .. 0xf7 is disallowed.
	// Technically anything below 0xc0 or above 0xf7 is
	// not possible to encode using Table 3-6 anyway.
	return false;
}

/// Validates that the given byte range is well-formed UTF-8.
///
/// @param _input pointer to the first byte; it is not dereferenced when @a _length is zero.
/// @param _length number of bytes to inspect.
/// @param _invalidPosition set only on failure, to the offset of the byte at which
///        validation failed: the lead byte if it is not a valid lead byte or if the
///        sequence is truncated by the end of the input, otherwise the offending
///        trailing byte. Left untouched on success.
/// @returns true if the whole range is valid UTF-8, false otherwise.
bool validateUTF8(unsigned char const* _input, size_t _length, size_t& _invalidPosition)
{
	for (size_t i = 0; i < _length; i++)
	{
		// Check for Unicode Chapter 3 Table 3-6 conformity.
		if (_input[i] < 0x80)
			continue;

		// Number of continuation bytes announced by the lead byte.
		size_t count = 0;
		if (_input[i] >= 0xc0 && _input[i] <= 0xdf)
			count = 1;
		else if (_input[i] >= 0xe0 && _input[i] <= 0xef)
			count = 2;
		else if (_input[i] >= 0xf0 && _input[i] <= 0xf7)
			count = 3;

		// count == 0: stray continuation byte or a byte above 0xf7.
		// i + count >= _length: the sequence would run past the end of the input.
		if (count == 0 || (i + count) >= _length)
		{
			_invalidPosition = i;
			return false;
		}

		for (size_t j = 0; j < count; j++)
		{
			i++;
			bool const isContinuation = (_input[i] & 0xc0) == 0x80;

			// Check for Unicode Chapter 3 Table 3-7 conformity (depends only on the
			// lead byte and the first trailing byte, hence j == 0).
			// Return right away: merely leaving this inner loop would let the outer
			// scan continue and report a position past the actual error.
			if (!isContinuation || (j == 0 && !isWellFormed(_input[i - 1], _input[i])))
			{
				_invalidPosition = i;
				return false;
			}
		}
	}

	return true;
}

}
136 | | |
137 | | bool validateUTF8(std::string const& _input, size_t& _invalidPosition) |
138 | 0 | { |
139 | 0 | return validateUTF8(reinterpret_cast<unsigned char const*>(_input.c_str()), _input.length(), _invalidPosition); |
140 | 0 | } |
141 | | |
142 | | } |