Coverage Report

Created: 2020-11-21 08:34

/src/botan/src/lib/utils/charset.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
* Character Set Handling
3
* (C) 1999-2007 Jack Lloyd
4
*
5
* Botan is released under the Simplified BSD License (see license.txt)
6
*/
7
8
#include <botan/internal/charset.h>
9
#include <botan/exceptn.h>
10
#include <botan/internal/loadstor.h>
11
#include <cctype>
12
13
namespace Botan {
14
15
namespace {
16
17
void append_utf8_for(std::string& s, uint32_t c)
18
59.2k
   {
19
59.2k
   if(c >= 0xD800 && c < 0xE000)
20
316
      throw Decoding_Error("Invalid Unicode character");
21
22
58.9k
   if(c <= 0x7F)
23
4.55k
      {
24
4.55k
      const uint8_t b0 = static_cast<uint8_t>(c);
25
4.55k
      s.push_back(static_cast<char>(b0));
26
4.55k
      }
27
54.3k
   else if(c <= 0x7FF)
28
6.68k
      {
29
6.68k
      const uint8_t b0 = 0xC0 | static_cast<uint8_t>(c >> 6);
30
6.68k
      const uint8_t b1 = 0x80 | static_cast<uint8_t>(c & 0x3F);
31
6.68k
      s.push_back(static_cast<char>(b0));
32
6.68k
      s.push_back(static_cast<char>(b1));
33
6.68k
      }
34
47.6k
   else if(c <= 0xFFFF)
35
46.1k
      {
36
46.1k
      const uint8_t b0 = 0xE0 | static_cast<uint8_t>(c >> 12);
37
46.1k
      const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
38
46.1k
      const uint8_t b2 = 0x80 | static_cast<uint8_t>(c & 0x3F);
39
46.1k
      s.push_back(static_cast<char>(b0));
40
46.1k
      s.push_back(static_cast<char>(b1));
41
46.1k
      s.push_back(static_cast<char>(b2));
42
46.1k
      }
43
1.58k
   else if(c <= 0x10FFFF)
44
964
      {
45
964
      const uint8_t b0 = 0xF0 | static_cast<uint8_t>(c >> 18);
46
964
      const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 12) & 0x3F);
47
964
      const uint8_t b2 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
48
964
      const uint8_t b3 = 0x80 | static_cast<uint8_t>(c & 0x3F);
49
964
      s.push_back(static_cast<char>(b0));
50
964
      s.push_back(static_cast<char>(b1));
51
964
      s.push_back(static_cast<char>(b2));
52
964
      s.push_back(static_cast<char>(b3));
53
964
      }
54
620
   else
55
620
      throw Decoding_Error("Invalid Unicode character");
56
57
58.9k
   }
58
59
}
60
61
std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len)
62
4.13k
   {
63
4.13k
   if(len % 2 != 0)
64
238
      throw Decoding_Error("Invalid length for UCS-2 string");
65
66
3.89k
   const size_t chars = len / 2;
67
68
3.89k
   std::string s;
69
60.5k
   for(size_t i = 0; i != chars; ++i)
70
56.6k
      {
71
56.6k
      const uint16_t c = load_be<uint16_t>(ucs2, i);
72
56.6k
      append_utf8_for(s, c);
73
56.6k
      }
74
75
3.89k
   return s;
76
3.89k
   }
77
78
std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len)
79
3.46k
   {
80
3.46k
   if(len % 4 != 0)
81
240
      throw Decoding_Error("Invalid length for UCS-4 string");
82
83
3.22k
   const size_t chars = len / 4;
84
85
3.22k
   std::string s;
86
5.76k
   for(size_t i = 0; i != chars; ++i)
87
2.54k
      {
88
2.54k
      const uint32_t c = load_be<uint32_t>(ucs4, i);
89
2.54k
      append_utf8_for(s, c);
90
2.54k
      }
91
92
3.22k
   return s;
93
3.22k
   }
94
95
/*
96
* Convert from UTF-8 to ISO 8859-1
97
*/
98
std::string utf8_to_latin1(const std::string& utf8)
99
0
   {
100
0
   std::string iso8859;
101
102
0
   size_t position = 0;
103
0
   while(position != utf8.size())
104
0
      {
105
0
      const uint8_t c1 = static_cast<uint8_t>(utf8[position++]);
106
107
0
      if(c1 <= 0x7F)
108
0
         {
109
0
         iso8859 += static_cast<char>(c1);
110
0
         }
111
0
      else if(c1 >= 0xC0 && c1 <= 0xC7)
112
0
         {
113
0
         if(position == utf8.size())
114
0
            throw Decoding_Error("UTF-8: sequence truncated");
115
116
0
         const uint8_t c2 = static_cast<uint8_t>(utf8[position++]);
117
0
         const uint8_t iso_char = ((c1 & 0x07) << 6) | (c2 & 0x3F);
118
119
0
         if(iso_char <= 0x7F)
120
0
            throw Decoding_Error("UTF-8: sequence longer than needed");
121
122
0
         iso8859 += static_cast<char>(iso_char);
123
0
         }
124
0
      else
125
0
         throw Decoding_Error("UTF-8: Unicode chars not in Latin1 used");
126
0
      }
127
128
0
   return iso8859;
129
0
   }
130
131
/*
132
* Convert from UCS-2 to ISO 8859-1
133
*/
134
std::string ucs2_to_latin1(const std::string& ucs2)
135
0
   {
136
0
   if(ucs2.size() % 2 == 1)
137
0
      throw Decoding_Error("UCS-2 string has an odd number of bytes");
138
139
0
   std::string latin1;
140
141
0
   for(size_t i = 0; i != ucs2.size(); i += 2)
142
0
      {
143
0
      const uint8_t c1 = ucs2[i];
144
0
      const uint8_t c2 = ucs2[i+1];
145
146
0
      if(c1 != 0)
147
0
         throw Decoding_Error("UCS-2 has non-Latin1 characters");
148
149
0
      latin1 += static_cast<char>(c2);
150
0
      }
151
152
0
   return latin1;
153
0
   }
154
155
/*
* Convert from ISO 8859-1 to UTF-8
*
* Bytes up to 0x7F are copied unchanged. Every other byte becomes a
* two-byte sequence: 0xC0 | (top two bits), then 0x80 | (low six bits).
*/
std::string latin1_to_utf8(const std::string& iso8859)
   {
   std::string utf8;

   for(const char ch : iso8859)
      {
      const uint8_t byte = static_cast<uint8_t>(ch);

      if(byte > 0x7F)
         {
         const uint8_t lead = static_cast<uint8_t>(0xC0 | (byte >> 6));
         const uint8_t trail = static_cast<uint8_t>(0x80 | (byte & 0x3F));
         utf8.push_back(static_cast<char>(lead));
         utf8.push_back(static_cast<char>(trail));
         continue;
         }

      utf8.push_back(ch);
      }

   return utf8;
   }
175
176
namespace Charset {
177
178
/*
* Check if a character is one of the decimal digits '0' through '9'
*/
bool is_digit(char c)
   {
   // C++ guarantees '0'..'9' are contiguous and increasing, so a range
   // test accepts exactly the ten digit characters.
   return (c >= '0' && c <= '9');
   }
188
189
/*
* Check if a character is a space, tab, line feed or carriage return
*/
bool is_space(char c)
   {
   switch(c)
      {
      case ' ':
      case '\t':
      case '\n':
      case '\r':
         return true;
      default:
         return false;
      }
   }
198
199
/*
200
* Convert a character to a digit
201
*/
202
uint8_t char2digit(char c)
203
0
   {
204
0
   switch(c)
205
0
      {
206
0
      case '0': return 0;
207
0
      case '1': return 1;
208
0
      case '2': return 2;
209
0
      case '3': return 3;
210
0
      case '4': return 4;
211
0
      case '5': return 5;
212
0
      case '6': return 6;
213
0
      case '7': return 7;
214
0
      case '8': return 8;
215
0
      case '9': return 9;
216
0
      }
217
218
0
   throw Invalid_Argument("char2digit: Input is not a digit character");
219
0
   }
220
221
/*
222
* Convert a digit to a character
223
*/
224
char digit2char(uint8_t b)
225
0
   {
226
0
   switch(b)
227
0
      {
228
0
      case 0: return '0';
229
0
      case 1: return '1';
230
0
      case 2: return '2';
231
0
      case 3: return '3';
232
0
      case 4: return '4';
233
0
      case 5: return '5';
234
0
      case 6: return '6';
235
0
      case 7: return '7';
236
0
      case 8: return '8';
237
0
      case 9: return '9';
238
0
      }
239
240
0
   throw Invalid_Argument("digit2char: Input is not a digit");
241
0
   }
242
243
/*
* Case-insensitive character comparison
*
* Both characters are lowered with std::tolower and the results
* compared for equality.
*/
bool caseless_cmp(char a, char b)
   {
   // std::tolower requires a value representable as unsigned char
   const int lowered_a = std::tolower(static_cast<unsigned char>(a));
   const int lowered_b = std::tolower(static_cast<unsigned char>(b));

   return lowered_a == lowered_b;
   }
251
252
}
253
254
}