Coverage Report

Created: 2019-09-11 14:12

/src/botan/src/lib/utils/charset.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
* Character Set Handling
3
* (C) 1999-2007 Jack Lloyd
4
*
5
* Botan is released under the Simplified BSD License (see license.txt)
6
*/
7
8
#include <botan/charset.h>
9
#include <botan/exceptn.h>
10
#include <botan/loadstor.h>
11
#include <cctype>
12
13
namespace Botan {
14
15
namespace {
16
17
void append_utf8_for(std::string& s, uint32_t c)
18
30.0k
   {
19
30.0k
   if(c >= 0xD800 && c < 0xE000)
20
290
      throw Decoding_Error("Invalid Unicode character");
21
29.7k
22
29.7k
   if(c <= 0x7F)
23
7.62k
      {
24
7.62k
      const uint8_t b0 = static_cast<uint8_t>(c);
25
7.62k
      s.push_back(static_cast<char>(b0));
26
7.62k
      }
27
22.1k
   else if(c <= 0x7FF)
28
4.82k
      {
29
4.82k
      const uint8_t b0 = 0xC0 | static_cast<uint8_t>(c >> 6);
30
4.82k
      const uint8_t b1 = 0x80 | static_cast<uint8_t>(c & 0x3F);
31
4.82k
      s.push_back(static_cast<char>(b0));
32
4.82k
      s.push_back(static_cast<char>(b1));
33
4.82k
      }
34
17.3k
   else if(c <= 0xFFFF)
35
16.1k
      {
36
16.1k
      const uint8_t b0 = 0xE0 | static_cast<uint8_t>(c >> 12);
37
16.1k
      const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
38
16.1k
      const uint8_t b2 = 0x80 | static_cast<uint8_t>(c & 0x3F);
39
16.1k
      s.push_back(static_cast<char>(b0));
40
16.1k
      s.push_back(static_cast<char>(b1));
41
16.1k
      s.push_back(static_cast<char>(b2));
42
16.1k
      }
43
1.21k
   else if(c <= 0x10FFFF)
44
668
      {
45
668
      const uint8_t b0 = 0xF0 | static_cast<uint8_t>(c >> 18);
46
668
      const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 12) & 0x3F);
47
668
      const uint8_t b2 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
48
668
      const uint8_t b3 = 0x80 | static_cast<uint8_t>(c & 0x3F);
49
668
      s.push_back(static_cast<char>(b0));
50
668
      s.push_back(static_cast<char>(b1));
51
668
      s.push_back(static_cast<char>(b2));
52
668
      s.push_back(static_cast<char>(b3));
53
668
      }
54
548
   else
55
548
      throw Decoding_Error("Invalid Unicode character");
56
29.7k
57
29.7k
   }
58
59
}
60
61
std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len)
62
4.06k
   {
63
4.06k
   if(len % 2 != 0)
64
234
      throw Decoding_Error("Invalid length for UCS-2 string");
65
3.82k
66
3.82k
   const size_t chars = len / 2;
67
3.82k
68
3.82k
   std::string s;
69
31.0k
   for(size_t i = 0; i != chars; ++i)
70
27.2k
      {
71
27.2k
      const uint16_t c = load_be<uint16_t>(ucs2, i);
72
27.2k
      append_utf8_for(s, c);
73
27.2k
      }
74
3.82k
75
3.82k
   return s;
76
3.82k
   }
77
78
std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len)
79
2.44k
   {
80
2.44k
   if(len % 4 != 0)
81
238
      throw Decoding_Error("Invalid length for UCS-4 string");
82
2.20k
83
2.20k
   const size_t chars = len / 4;
84
2.20k
85
2.20k
   std::string s;
86
5.06k
   for(size_t i = 0; i != chars; ++i)
87
2.85k
      {
88
2.85k
      const uint32_t c = load_be<uint32_t>(ucs4, i);
89
2.85k
      append_utf8_for(s, c);
90
2.85k
      }
91
2.20k
92
2.20k
   return s;
93
2.20k
   }
94
95
/*
96
* Convert from UTF-8 to ISO 8859-1
97
*/
98
std::string utf8_to_latin1(const std::string& utf8)
99
0
   {
100
0
   std::string iso8859;
101
0
102
0
   size_t position = 0;
103
0
   while(position != utf8.size())
104
0
      {
105
0
      const uint8_t c1 = static_cast<uint8_t>(utf8[position++]);
106
0
107
0
      if(c1 <= 0x7F)
108
0
         {
109
0
         iso8859 += static_cast<char>(c1);
110
0
         }
111
0
      else if(c1 >= 0xC0 && c1 <= 0xC7)
112
0
         {
113
0
         if(position == utf8.size())
114
0
            throw Decoding_Error("UTF-8: sequence truncated");
115
0
116
0
         const uint8_t c2 = static_cast<uint8_t>(utf8[position++]);
117
0
         const uint8_t iso_char = ((c1 & 0x07) << 6) | (c2 & 0x3F);
118
0
119
0
         if(iso_char <= 0x7F)
120
0
            throw Decoding_Error("UTF-8: sequence longer than needed");
121
0
122
0
         iso8859 += static_cast<char>(iso_char);
123
0
         }
124
0
      else
125
0
         throw Decoding_Error("UTF-8: Unicode chars not in Latin1 used");
126
0
      }
127
0
128
0
   return iso8859;
129
0
   }
130
131
namespace Charset {
132
133
namespace {
134
135
/*
136
* Convert from UCS-2 to ISO 8859-1
137
*/
138
std::string ucs2_to_latin1(const std::string& ucs2)
139
0
   {
140
0
   if(ucs2.size() % 2 == 1)
141
0
      throw Decoding_Error("UCS-2 string has an odd number of bytes");
142
0
143
0
   std::string latin1;
144
0
145
0
   for(size_t i = 0; i != ucs2.size(); i += 2)
146
0
      {
147
0
      const uint8_t c1 = ucs2[i];
148
0
      const uint8_t c2 = ucs2[i+1];
149
0
150
0
      if(c1 != 0)
151
0
         throw Decoding_Error("UCS-2 has non-Latin1 characters");
152
0
153
0
      latin1 += static_cast<char>(c2);
154
0
      }
155
0
156
0
   return latin1;
157
0
   }
158
159
/*
* Convert from ISO 8859-1 to UTF-8
*
* Bytes up to 0x7F are copied unchanged. Every byte above 0x7F is
* expanded into a two-byte UTF-8 sequence.
*/
std::string latin1_to_utf8(const std::string& iso8859)
   {
   std::string utf8;

   for(const char ch : iso8859)
      {
      const uint8_t byte = static_cast<uint8_t>(ch);

      if(byte > 0x7F)
         {
         // Lead byte holds the top two bits, continuation the low six
         utf8.push_back(static_cast<char>(0xC0 | (byte >> 6)));
         utf8.push_back(static_cast<char>(0x80 | (byte & 0x3F)));
         }
      else
         {
         utf8.push_back(static_cast<char>(byte));
         }
      }

   return utf8;
   }
179
180
}
181
182
/*
183
* Perform character set transcoding
184
*/
185
std::string transcode(const std::string& str,
186
                      Character_Set to, Character_Set from)
187
0
   {
188
0
   if(to == LOCAL_CHARSET)
189
0
      to = LATIN1_CHARSET;
190
0
   if(from == LOCAL_CHARSET)
191
0
      from = LATIN1_CHARSET;
192
0
193
0
   if(to == from)
194
0
      return str;
195
0
196
0
   if(from == LATIN1_CHARSET && to == UTF8_CHARSET)
197
0
      return latin1_to_utf8(str);
198
0
   if(from == UTF8_CHARSET && to == LATIN1_CHARSET)
199
0
      return utf8_to_latin1(str);
200
0
   if(from == UCS2_CHARSET && to == LATIN1_CHARSET)
201
0
      return ucs2_to_latin1(str);
202
0
203
0
   throw Invalid_Argument("Unknown transcoding operation from " +
204
0
                          std::to_string(from) + " to " + std::to_string(to));
205
0
   }
206
207
/*
* Check if a character represents a digit
*
* Returns true only for the ten characters '0' through '9'.
*/
bool is_digit(char c)
   {
   // The decimal digits are guaranteed to be contiguous and ascending
   // in the C++ execution character set
   return (c >= '0' && c <= '9');
   }
217
218
/*
* Check if a character represents whitespace
*
* Only space, horizontal tab, newline and carriage return count as
* whitespace here.
*/
bool is_space(char c)
   {
   switch(c)
      {
      case ' ':
      case '\t':
      case '\n':
      case '\r':
         return true;
      default:
         return false;
      }
   }
227
228
/*
229
* Convert a character to a digit
230
*/
231
uint8_t char2digit(char c)
232
0
   {
233
0
   switch(c)
234
0
      {
235
0
      case '0': return 0;
236
0
      case '1': return 1;
237
0
      case '2': return 2;
238
0
      case '3': return 3;
239
0
      case '4': return 4;
240
0
      case '5': return 5;
241
0
      case '6': return 6;
242
0
      case '7': return 7;
243
0
      case '8': return 8;
244
0
      case '9': return 9;
245
0
      }
246
0
247
0
   throw Invalid_Argument("char2digit: Input is not a digit character");
248
0
   }
249
250
/*
251
* Convert a digit to a character
252
*/
253
char digit2char(uint8_t b)
254
0
   {
255
0
   switch(b)
256
0
      {
257
0
      case 0: return '0';
258
0
      case 1: return '1';
259
0
      case 2: return '2';
260
0
      case 3: return '3';
261
0
      case 4: return '4';
262
0
      case 5: return '5';
263
0
      case 6: return '6';
264
0
      case 7: return '7';
265
0
      case 8: return '8';
266
0
      case 9: return '9';
267
0
      }
268
0
269
0
   throw Invalid_Argument("digit2char: Input is not a digit");
270
0
   }
271
272
/*
* Case-insensitive character comparison
*
* Returns true if both characters map to the same value under
* std::tolower.
*/
bool caseless_cmp(char a, char b)
   {
   // Convert via unsigned char first: std::tolower must not be given
   // a negative value other than EOF
   const int lower_a = std::tolower(static_cast<unsigned char>(a));
   const int lower_b = std::tolower(static_cast<unsigned char>(b));

   return lower_a == lower_b;
   }
280
281
}
282
283
}