/src/botan/src/lib/utils/charset.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Character Set Handling |
3 | | * (C) 1999-2007 Jack Lloyd |
4 | | * |
5 | | * Botan is released under the Simplified BSD License (see license.txt) |
6 | | */ |
7 | | |
8 | | #include <botan/internal/charset.h> |
9 | | #include <botan/exceptn.h> |
10 | | #include <botan/internal/loadstor.h> |
11 | | #include <cctype> |
12 | | |
13 | | namespace Botan { |
14 | | |
15 | | namespace { |
16 | | |
17 | | void append_utf8_for(std::string& s, uint32_t c) |
18 | 59.2k | { |
19 | 59.2k | if(c >= 0xD800 && c < 0xE000) |
20 | 316 | throw Decoding_Error("Invalid Unicode character"); |
21 | | |
22 | 58.9k | if(c <= 0x7F) |
23 | 4.55k | { |
24 | 4.55k | const uint8_t b0 = static_cast<uint8_t>(c); |
25 | 4.55k | s.push_back(static_cast<char>(b0)); |
26 | 4.55k | } |
27 | 54.3k | else if(c <= 0x7FF) |
28 | 6.68k | { |
29 | 6.68k | const uint8_t b0 = 0xC0 | static_cast<uint8_t>(c >> 6); |
30 | 6.68k | const uint8_t b1 = 0x80 | static_cast<uint8_t>(c & 0x3F); |
31 | 6.68k | s.push_back(static_cast<char>(b0)); |
32 | 6.68k | s.push_back(static_cast<char>(b1)); |
33 | 6.68k | } |
34 | 47.6k | else if(c <= 0xFFFF) |
35 | 46.1k | { |
36 | 46.1k | const uint8_t b0 = 0xE0 | static_cast<uint8_t>(c >> 12); |
37 | 46.1k | const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F); |
38 | 46.1k | const uint8_t b2 = 0x80 | static_cast<uint8_t>(c & 0x3F); |
39 | 46.1k | s.push_back(static_cast<char>(b0)); |
40 | 46.1k | s.push_back(static_cast<char>(b1)); |
41 | 46.1k | s.push_back(static_cast<char>(b2)); |
42 | 46.1k | } |
43 | 1.58k | else if(c <= 0x10FFFF) |
44 | 964 | { |
45 | 964 | const uint8_t b0 = 0xF0 | static_cast<uint8_t>(c >> 18); |
46 | 964 | const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 12) & 0x3F); |
47 | 964 | const uint8_t b2 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F); |
48 | 964 | const uint8_t b3 = 0x80 | static_cast<uint8_t>(c & 0x3F); |
49 | 964 | s.push_back(static_cast<char>(b0)); |
50 | 964 | s.push_back(static_cast<char>(b1)); |
51 | 964 | s.push_back(static_cast<char>(b2)); |
52 | 964 | s.push_back(static_cast<char>(b3)); |
53 | 964 | } |
54 | 620 | else |
55 | 620 | throw Decoding_Error("Invalid Unicode character"); |
56 | | |
57 | 58.9k | } |
58 | | |
59 | | } |
60 | | |
61 | | std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len) |
62 | 4.13k | { |
63 | 4.13k | if(len % 2 != 0) |
64 | 238 | throw Decoding_Error("Invalid length for UCS-2 string"); |
65 | | |
66 | 3.89k | const size_t chars = len / 2; |
67 | | |
68 | 3.89k | std::string s; |
69 | 60.5k | for(size_t i = 0; i != chars; ++i) |
70 | 56.6k | { |
71 | 56.6k | const uint16_t c = load_be<uint16_t>(ucs2, i); |
72 | 56.6k | append_utf8_for(s, c); |
73 | 56.6k | } |
74 | | |
75 | 3.89k | return s; |
76 | 3.89k | } |
77 | | |
78 | | std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len) |
79 | 3.46k | { |
80 | 3.46k | if(len % 4 != 0) |
81 | 240 | throw Decoding_Error("Invalid length for UCS-4 string"); |
82 | | |
83 | 3.22k | const size_t chars = len / 4; |
84 | | |
85 | 3.22k | std::string s; |
86 | 5.76k | for(size_t i = 0; i != chars; ++i) |
87 | 2.54k | { |
88 | 2.54k | const uint32_t c = load_be<uint32_t>(ucs4, i); |
89 | 2.54k | append_utf8_for(s, c); |
90 | 2.54k | } |
91 | | |
92 | 3.22k | return s; |
93 | 3.22k | } |
94 | | |
95 | | /* |
96 | | * Convert from UTF-8 to ISO 8859-1 |
97 | | */ |
98 | | std::string utf8_to_latin1(const std::string& utf8) |
99 | 0 | { |
100 | 0 | std::string iso8859; |
101 | |
|
102 | 0 | size_t position = 0; |
103 | 0 | while(position != utf8.size()) |
104 | 0 | { |
105 | 0 | const uint8_t c1 = static_cast<uint8_t>(utf8[position++]); |
106 | |
|
107 | 0 | if(c1 <= 0x7F) |
108 | 0 | { |
109 | 0 | iso8859 += static_cast<char>(c1); |
110 | 0 | } |
111 | 0 | else if(c1 >= 0xC0 && c1 <= 0xC7) |
112 | 0 | { |
113 | 0 | if(position == utf8.size()) |
114 | 0 | throw Decoding_Error("UTF-8: sequence truncated"); |
115 | | |
116 | 0 | const uint8_t c2 = static_cast<uint8_t>(utf8[position++]); |
117 | 0 | const uint8_t iso_char = ((c1 & 0x07) << 6) | (c2 & 0x3F); |
118 | |
|
119 | 0 | if(iso_char <= 0x7F) |
120 | 0 | throw Decoding_Error("UTF-8: sequence longer than needed"); |
121 | | |
122 | 0 | iso8859 += static_cast<char>(iso_char); |
123 | 0 | } |
124 | 0 | else |
125 | 0 | throw Decoding_Error("UTF-8: Unicode chars not in Latin1 used"); |
126 | 0 | } |
127 | |
|
128 | 0 | return iso8859; |
129 | 0 | } |
130 | | |
131 | | /* |
132 | | * Convert from UCS-2 to ISO 8859-1 |
133 | | */ |
134 | | std::string ucs2_to_latin1(const std::string& ucs2) |
135 | 0 | { |
136 | 0 | if(ucs2.size() % 2 == 1) |
137 | 0 | throw Decoding_Error("UCS-2 string has an odd number of bytes"); |
138 | | |
139 | 0 | std::string latin1; |
140 | |
|
141 | 0 | for(size_t i = 0; i != ucs2.size(); i += 2) |
142 | 0 | { |
143 | 0 | const uint8_t c1 = ucs2[i]; |
144 | 0 | const uint8_t c2 = ucs2[i+1]; |
145 | |
|
146 | 0 | if(c1 != 0) |
147 | 0 | throw Decoding_Error("UCS-2 has non-Latin1 characters"); |
148 | | |
149 | 0 | latin1 += static_cast<char>(c2); |
150 | 0 | } |
151 | |
|
152 | 0 | return latin1; |
153 | 0 | } |
154 | | |
/*
* Convert from ISO 8859-1 to UTF-8
*
* Bytes up to 0x7F are copied as is. Every other byte becomes a two byte
* sequence (110000xx 10xxxxxx).
*/
std::string latin1_to_utf8(const std::string& iso8859)
   {
   std::string utf8;

   for(const char raw : iso8859)
      {
      const uint8_t byte = static_cast<uint8_t>(raw);

      if(byte > 0x7F)
         {
         // Top two bits go into the lead byte, low six into the trailer
         utf8.push_back(static_cast<char>(0xC0 | (byte >> 6)));
         utf8.push_back(static_cast<char>(0x80 | (byte & 0x3F)));
         }
      else
         {
         utf8.push_back(static_cast<char>(byte));
         }
      }

   return utf8;
   }
175 | | |
176 | | namespace Charset { |
177 | | |
/*
* Check if a character is one of the decimal digits '0' through '9'
*/
bool is_digit(char c)
   {
   // The decimal digits are guaranteed to be contiguous in the
   // execution character set, so a range check is sufficient
   return (c >= '0' && c <= '9');
   }
188 | | |
/*
* Check if a character is a space, tab, newline or carriage return
*/
bool is_space(char c)
   {
   switch(c)
      {
      case ' ':
      case '\t':
      case '\n':
      case '\r':
         return true;
      default:
         return false;
      }
   }
198 | | |
199 | | /* |
200 | | * Convert a character to a digit |
201 | | */ |
202 | | uint8_t char2digit(char c) |
203 | 0 | { |
204 | 0 | switch(c) |
205 | 0 | { |
206 | 0 | case '0': return 0; |
207 | 0 | case '1': return 1; |
208 | 0 | case '2': return 2; |
209 | 0 | case '3': return 3; |
210 | 0 | case '4': return 4; |
211 | 0 | case '5': return 5; |
212 | 0 | case '6': return 6; |
213 | 0 | case '7': return 7; |
214 | 0 | case '8': return 8; |
215 | 0 | case '9': return 9; |
216 | 0 | } |
217 | | |
218 | 0 | throw Invalid_Argument("char2digit: Input is not a digit character"); |
219 | 0 | } |
220 | | |
221 | | /* |
222 | | * Convert a digit to a character |
223 | | */ |
224 | | char digit2char(uint8_t b) |
225 | 0 | { |
226 | 0 | switch(b) |
227 | 0 | { |
228 | 0 | case 0: return '0'; |
229 | 0 | case 1: return '1'; |
230 | 0 | case 2: return '2'; |
231 | 0 | case 3: return '3'; |
232 | 0 | case 4: return '4'; |
233 | 0 | case 5: return '5'; |
234 | 0 | case 6: return '6'; |
235 | 0 | case 7: return '7'; |
236 | 0 | case 8: return '8'; |
237 | 0 | case 9: return '9'; |
238 | 0 | } |
239 | | |
240 | 0 | throw Invalid_Argument("digit2char: Input is not a digit"); |
241 | 0 | } |
242 | | |
/*
* Compare two characters for equality, ignoring case
*
* Both characters are lowered with std::tolower before comparing.
*/
bool caseless_cmp(char a, char b)
   {
   // std::tolower requires a value representable as unsigned char
   const int lowered_a = std::tolower(static_cast<unsigned char>(a));
   const int lowered_b = std::tolower(static_cast<unsigned char>(b));

   return lowered_a == lowered_b;
   }
251 | | |
252 | | } |
253 | | |
254 | | } |