/src/botan/src/lib/utils/charset.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Character Set Handling |
3 | | * (C) 1999-2007 Jack Lloyd |
4 | | * |
5 | | * Botan is released under the Simplified BSD License (see license.txt) |
6 | | */ |
7 | | |
8 | | #include <botan/charset.h> |
9 | | #include <botan/exceptn.h> |
10 | | #include <botan/loadstor.h> |
11 | | #include <cctype> |
12 | | |
13 | | namespace Botan { |
14 | | |
15 | | namespace { |
16 | | |
17 | | void append_utf8_for(std::string& s, uint32_t c) |
18 | 30.0k | { |
19 | 30.0k | if(c >= 0xD800 && c < 0xE000) |
20 | 290 | throw Decoding_Error("Invalid Unicode character"); |
21 | 29.7k | |
22 | 29.7k | if(c <= 0x7F) |
23 | 7.62k | { |
24 | 7.62k | const uint8_t b0 = static_cast<uint8_t>(c); |
25 | 7.62k | s.push_back(static_cast<char>(b0)); |
26 | 7.62k | } |
27 | 22.1k | else if(c <= 0x7FF) |
28 | 4.82k | { |
29 | 4.82k | const uint8_t b0 = 0xC0 | static_cast<uint8_t>(c >> 6); |
30 | 4.82k | const uint8_t b1 = 0x80 | static_cast<uint8_t>(c & 0x3F); |
31 | 4.82k | s.push_back(static_cast<char>(b0)); |
32 | 4.82k | s.push_back(static_cast<char>(b1)); |
33 | 4.82k | } |
34 | 17.3k | else if(c <= 0xFFFF) |
35 | 16.1k | { |
36 | 16.1k | const uint8_t b0 = 0xE0 | static_cast<uint8_t>(c >> 12); |
37 | 16.1k | const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F); |
38 | 16.1k | const uint8_t b2 = 0x80 | static_cast<uint8_t>(c & 0x3F); |
39 | 16.1k | s.push_back(static_cast<char>(b0)); |
40 | 16.1k | s.push_back(static_cast<char>(b1)); |
41 | 16.1k | s.push_back(static_cast<char>(b2)); |
42 | 16.1k | } |
43 | 1.21k | else if(c <= 0x10FFFF) |
44 | 668 | { |
45 | 668 | const uint8_t b0 = 0xF0 | static_cast<uint8_t>(c >> 18); |
46 | 668 | const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 12) & 0x3F); |
47 | 668 | const uint8_t b2 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F); |
48 | 668 | const uint8_t b3 = 0x80 | static_cast<uint8_t>(c & 0x3F); |
49 | 668 | s.push_back(static_cast<char>(b0)); |
50 | 668 | s.push_back(static_cast<char>(b1)); |
51 | 668 | s.push_back(static_cast<char>(b2)); |
52 | 668 | s.push_back(static_cast<char>(b3)); |
53 | 668 | } |
54 | 548 | else |
55 | 548 | throw Decoding_Error("Invalid Unicode character"); |
56 | 29.7k | |
57 | 29.7k | } |
58 | | |
59 | | } |
60 | | |
61 | | std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len) |
62 | 4.06k | { |
63 | 4.06k | if(len % 2 != 0) |
64 | 234 | throw Decoding_Error("Invalid length for UCS-2 string"); |
65 | 3.82k | |
66 | 3.82k | const size_t chars = len / 2; |
67 | 3.82k | |
68 | 3.82k | std::string s; |
69 | 31.0k | for(size_t i = 0; i != chars; ++i) |
70 | 27.2k | { |
71 | 27.2k | const uint16_t c = load_be<uint16_t>(ucs2, i); |
72 | 27.2k | append_utf8_for(s, c); |
73 | 27.2k | } |
74 | 3.82k | |
75 | 3.82k | return s; |
76 | 3.82k | } |
77 | | |
78 | | std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len) |
79 | 2.44k | { |
80 | 2.44k | if(len % 4 != 0) |
81 | 238 | throw Decoding_Error("Invalid length for UCS-4 string"); |
82 | 2.20k | |
83 | 2.20k | const size_t chars = len / 4; |
84 | 2.20k | |
85 | 2.20k | std::string s; |
86 | 5.06k | for(size_t i = 0; i != chars; ++i) |
87 | 2.85k | { |
88 | 2.85k | const uint32_t c = load_be<uint32_t>(ucs4, i); |
89 | 2.85k | append_utf8_for(s, c); |
90 | 2.85k | } |
91 | 2.20k | |
92 | 2.20k | return s; |
93 | 2.20k | } |
94 | | |
95 | | /* |
96 | | * Convert from UTF-8 to ISO 8859-1 |
97 | | */ |
98 | | std::string utf8_to_latin1(const std::string& utf8) |
99 | 0 | { |
100 | 0 | std::string iso8859; |
101 | 0 |
|
102 | 0 | size_t position = 0; |
103 | 0 | while(position != utf8.size()) |
104 | 0 | { |
105 | 0 | const uint8_t c1 = static_cast<uint8_t>(utf8[position++]); |
106 | 0 |
|
107 | 0 | if(c1 <= 0x7F) |
108 | 0 | { |
109 | 0 | iso8859 += static_cast<char>(c1); |
110 | 0 | } |
111 | 0 | else if(c1 >= 0xC0 && c1 <= 0xC7) |
112 | 0 | { |
113 | 0 | if(position == utf8.size()) |
114 | 0 | throw Decoding_Error("UTF-8: sequence truncated"); |
115 | 0 | |
116 | 0 | const uint8_t c2 = static_cast<uint8_t>(utf8[position++]); |
117 | 0 | const uint8_t iso_char = ((c1 & 0x07) << 6) | (c2 & 0x3F); |
118 | 0 |
|
119 | 0 | if(iso_char <= 0x7F) |
120 | 0 | throw Decoding_Error("UTF-8: sequence longer than needed"); |
121 | 0 | |
122 | 0 | iso8859 += static_cast<char>(iso_char); |
123 | 0 | } |
124 | 0 | else |
125 | 0 | throw Decoding_Error("UTF-8: Unicode chars not in Latin1 used"); |
126 | 0 | } |
127 | 0 |
|
128 | 0 | return iso8859; |
129 | 0 | } |
130 | | |
131 | | namespace Charset { |
132 | | |
133 | | namespace { |
134 | | |
135 | | /* |
136 | | * Convert from UCS-2 to ISO 8859-1 |
137 | | */ |
138 | | std::string ucs2_to_latin1(const std::string& ucs2) |
139 | 0 | { |
140 | 0 | if(ucs2.size() % 2 == 1) |
141 | 0 | throw Decoding_Error("UCS-2 string has an odd number of bytes"); |
142 | 0 | |
143 | 0 | std::string latin1; |
144 | 0 |
|
145 | 0 | for(size_t i = 0; i != ucs2.size(); i += 2) |
146 | 0 | { |
147 | 0 | const uint8_t c1 = ucs2[i]; |
148 | 0 | const uint8_t c2 = ucs2[i+1]; |
149 | 0 |
|
150 | 0 | if(c1 != 0) |
151 | 0 | throw Decoding_Error("UCS-2 has non-Latin1 characters"); |
152 | 0 | |
153 | 0 | latin1 += static_cast<char>(c2); |
154 | 0 | } |
155 | 0 |
|
156 | 0 | return latin1; |
157 | 0 | } |
158 | | |
/*
* Convert from ISO 8859-1 to UTF-8
*
* Bytes up to 0x7F are copied unchanged; every other byte is
* expanded into a two byte sequence (110000xx 10xxxxxx).
*/
std::string latin1_to_utf8(const std::string& iso8859)
   {
   std::string utf8;

   for(const char raw : iso8859)
      {
      const uint8_t byte = static_cast<uint8_t>(raw);

      if(byte > 0x7F)
         {
         const uint8_t lead = static_cast<uint8_t>(0xC0 | (byte >> 6));
         const uint8_t trail = static_cast<uint8_t>(0x80 | (byte & 0x3F));
         utf8.push_back(static_cast<char>(lead));
         utf8.push_back(static_cast<char>(trail));
         }
      else
         {
         utf8.push_back(static_cast<char>(byte));
         }
      }

   return utf8;
   }
179 | | |
180 | | } |
181 | | |
182 | | /* |
183 | | * Perform character set transcoding |
184 | | */ |
185 | | std::string transcode(const std::string& str, |
186 | | Character_Set to, Character_Set from) |
187 | 0 | { |
188 | 0 | if(to == LOCAL_CHARSET) |
189 | 0 | to = LATIN1_CHARSET; |
190 | 0 | if(from == LOCAL_CHARSET) |
191 | 0 | from = LATIN1_CHARSET; |
192 | 0 |
|
193 | 0 | if(to == from) |
194 | 0 | return str; |
195 | 0 | |
196 | 0 | if(from == LATIN1_CHARSET && to == UTF8_CHARSET) |
197 | 0 | return latin1_to_utf8(str); |
198 | 0 | if(from == UTF8_CHARSET && to == LATIN1_CHARSET) |
199 | 0 | return utf8_to_latin1(str); |
200 | 0 | if(from == UCS2_CHARSET && to == LATIN1_CHARSET) |
201 | 0 | return ucs2_to_latin1(str); |
202 | 0 | |
203 | 0 | throw Invalid_Argument("Unknown transcoding operation from " + |
204 | 0 | std::to_string(from) + " to " + std::to_string(to)); |
205 | 0 | } |
206 | | |
/*
* Check if a character represents a digit
*
* Returns true only for the ten characters '0' through '9'.
*/
bool is_digit(char c)
   {
   // C++ guarantees '0'..'9' have consecutive values
   return (c >= '0' && c <= '9');
   }
217 | | |
/*
* Check if a character represents whitespace
*
* Only space, horizontal tab, line feed and carriage return count.
*/
bool is_space(char c)
   {
   switch(c)
      {
      case ' ':
      case '\t':
      case '\n':
      case '\r':
         return true;
      default:
         return false;
      }
   }
227 | | |
228 | | /* |
229 | | * Convert a character to a digit |
230 | | */ |
231 | | uint8_t char2digit(char c) |
232 | 0 | { |
233 | 0 | switch(c) |
234 | 0 | { |
235 | 0 | case '0': return 0; |
236 | 0 | case '1': return 1; |
237 | 0 | case '2': return 2; |
238 | 0 | case '3': return 3; |
239 | 0 | case '4': return 4; |
240 | 0 | case '5': return 5; |
241 | 0 | case '6': return 6; |
242 | 0 | case '7': return 7; |
243 | 0 | case '8': return 8; |
244 | 0 | case '9': return 9; |
245 | 0 | } |
246 | 0 | |
247 | 0 | throw Invalid_Argument("char2digit: Input is not a digit character"); |
248 | 0 | } |
249 | | |
250 | | /* |
251 | | * Convert a digit to a character |
252 | | */ |
253 | | char digit2char(uint8_t b) |
254 | 0 | { |
255 | 0 | switch(b) |
256 | 0 | { |
257 | 0 | case 0: return '0'; |
258 | 0 | case 1: return '1'; |
259 | 0 | case 2: return '2'; |
260 | 0 | case 3: return '3'; |
261 | 0 | case 4: return '4'; |
262 | 0 | case 5: return '5'; |
263 | 0 | case 6: return '6'; |
264 | 0 | case 7: return '7'; |
265 | 0 | case 8: return '8'; |
266 | 0 | case 9: return '9'; |
267 | 0 | } |
268 | 0 | |
269 | 0 | throw Invalid_Argument("digit2char: Input is not a digit"); |
270 | 0 | } |
271 | | |
/*
* Case-insensitive character comparison
*
* Both characters are converted to unsigned char, passed through
* std::tolower, and the results compared for equality.
*/
bool caseless_cmp(char a, char b)
   {
   const int lowered_a = std::tolower(static_cast<unsigned char>(a));
   const int lowered_b = std::tolower(static_cast<unsigned char>(b));
   return lowered_a == lowered_b;
   }
280 | | |
281 | | } |
282 | | |
283 | | } |