/src/solidity/libsolutil/UTF8.cpp
Line | Count | Source |
1 | | /* |
2 | | This file is part of solidity. |
3 | | |
4 | | solidity is free software: you can redistribute it and/or modify |
5 | | it under the terms of the GNU General Public License as published by |
6 | | the Free Software Foundation, either version 3 of the License, or |
7 | | (at your option) any later version. |
8 | | |
9 | | solidity is distributed in the hope that it will be useful, |
10 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 | | GNU General Public License for more details. |
13 | | |
14 | | You should have received a copy of the GNU General Public License |
15 | | along with solidity. If not, see <http://www.gnu.org/licenses/>. |
16 | | */ |
17 | | // SPDX-License-Identifier: GPL-3.0 |
18 | | /** @file UTF8.cpp |
19 | | * @author Alex Beregszaszi |
20 | | * @date 2016 |
21 | | * |
22 | | * UTF-8 related helpers |
23 | | */ |
24 | | |
25 | | #include <libsolutil/UTF8.h> |
26 | | |
27 | | namespace solidity::util |
28 | | { |
namespace
{

/// Validate the first two bytes of a multi-byte sequence against
/// Unicode chapter 3 Table 3-7 (well-formed UTF-8 byte sequences).
///
/// Only the lead-byte specific restriction on the second byte is tested here;
/// the caller is expected to have verified already that @a byte2 has the
/// generic continuation form 10xxxxxx.
///
/// @param byte1 lead byte of the sequence.
/// @param byte2 byte immediately following the lead byte.
/// @returns true if the pair may start a well-formed sequence, false otherwise.
bool isWellFormed(unsigned char byte1, unsigned char byte2)
{
	switch (byte1)
	{
	case 0xe0:
		// A second byte below 0xa0 would be an overlong form of a shorter encoding.
		return byte2 >= 0xa0;
	case 0xed:
		// A second byte above 0x9f would encode a UTF-16 surrogate (U+D800..U+DFFF).
		return byte2 <= 0x9f;
	case 0xf0:
		// A second byte below 0x90 would be an overlong form of a shorter encoding.
		return byte2 >= 0x90;
	case 0xf4:
		// A second byte above 0x8f would encode a code point beyond U+10FFFF.
		return byte2 <= 0x8f;
	default:
		// 0xc0 and 0xc1 can only produce overlong encodings and 0xf5 .. 0xf7 is disallowed.
		// Technically anything below 0xc0 or above 0xf7 is
		// not possible to encode using Table 3-6 anyway.
		return byte1 >= 0xc2 && byte1 <= 0xf3;
	}
}

/// Check that a byte buffer is well-formed UTF-8.
///
/// @param _input pointer to the first byte of the buffer.
/// @param _length number of bytes in the buffer.
/// @param _invalidPosition on failure, receives the index of the first byte at which
///        the input stops being valid: the lead byte itself if it is not a legal lead
///        byte or if the sequence is truncated by the end of the buffer, otherwise the
///        offending continuation byte. Left untouched when the input is valid.
/// @returns true if the whole buffer is well-formed UTF-8, false otherwise.
bool validateUTF8(unsigned char const* _input, size_t _length, size_t& _invalidPosition)
{
	for (size_t i = 0; i < _length; i++)
	{
		unsigned char const lead = _input[i];

		// Check for Unicode Chapter 3 Table 3-6 conformity.
		if (lead < 0x80)
			continue;

		// Number of continuation bytes announced by the lead byte.
		size_t count = 0;
		if (lead >= 0xc0 && lead <= 0xdf)
			count = 1;
		else if (lead >= 0xe0 && lead <= 0xef)
			count = 2;
		else if (lead >= 0xf0 && lead <= 0xf7)
			count = 3;

		// Either not a lead byte at all (stray continuation byte or 0xf8 and above),
		// or the buffer ends before the announced sequence is complete.
		if (count == 0 || (i + count) >= _length)
		{
			_invalidPosition = i;
			return false;
		}

		for (size_t j = 0; j < count; j++)
		{
			i++;

			// Return right away instead of merely leaving this inner loop: otherwise the
			// outer loop would keep scanning and the reported position would drift past
			// the byte that is actually invalid.
			if ((_input[i] & 0xc0) != 0x80)
			{
				_invalidPosition = i;
				return false;
			}

			// Check for Unicode Chapter 3 Table 3-7 conformity.
			if ((j == 0) && !isWellFormed(lead, _input[i]))
			{
				_invalidPosition = i;
				return false;
			}
		}
	}

	return true;
}

}
136 | | |
137 | | bool validateUTF8(std::string const& _input, size_t& _invalidPosition) |
138 | 2.70k | { |
139 | 2.70k | return validateUTF8(reinterpret_cast<unsigned char const*>(_input.c_str()), _input.length(), _invalidPosition); |
140 | 2.70k | } |
141 | | |
142 | | } |