/src/botan/src/lib/utils/charset.cpp
Line | Count | Source |
1 | | /* |
2 | | * Character Set Handling |
3 | | * (C) 1999-2007,2021 Jack Lloyd |
4 | | * |
5 | | * Botan is released under the Simplified BSD License (see license.txt) |
6 | | */ |
7 | | |
8 | | #include <botan/internal/charset.h> |
9 | | #include <botan/internal/loadstor.h> |
10 | | #include <botan/exceptn.h> |
11 | | |
12 | | namespace Botan { |
13 | | |
14 | | namespace { |
15 | | |
16 | | void append_utf8_for(std::string& s, uint32_t c) |
17 | 147k | { |
18 | 147k | if(c >= 0xD800 && c < 0xE000) |
19 | 566 | throw Decoding_Error("Invalid Unicode character"); |
20 | | |
21 | 146k | if(c <= 0x7F) |
22 | 68.5k | { |
23 | 68.5k | const uint8_t b0 = static_cast<uint8_t>(c); |
24 | 68.5k | s.push_back(static_cast<char>(b0)); |
25 | 68.5k | } |
26 | 78.0k | else if(c <= 0x7FF) |
27 | 55.9k | { |
28 | 55.9k | const uint8_t b0 = 0xC0 | static_cast<uint8_t>(c >> 6); |
29 | 55.9k | const uint8_t b1 = 0x80 | static_cast<uint8_t>(c & 0x3F); |
30 | 55.9k | s.push_back(static_cast<char>(b0)); |
31 | 55.9k | s.push_back(static_cast<char>(b1)); |
32 | 55.9k | } |
33 | 22.0k | else if(c <= 0xFFFF) |
34 | 17.7k | { |
35 | 17.7k | const uint8_t b0 = 0xE0 | static_cast<uint8_t>(c >> 12); |
36 | 17.7k | const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F); |
37 | 17.7k | const uint8_t b2 = 0x80 | static_cast<uint8_t>(c & 0x3F); |
38 | 17.7k | s.push_back(static_cast<char>(b0)); |
39 | 17.7k | s.push_back(static_cast<char>(b1)); |
40 | 17.7k | s.push_back(static_cast<char>(b2)); |
41 | 17.7k | } |
42 | 4.34k | else if(c <= 0x10FFFF) |
43 | 3.26k | { |
44 | 3.26k | const uint8_t b0 = 0xF0 | static_cast<uint8_t>(c >> 18); |
45 | 3.26k | const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 12) & 0x3F); |
46 | 3.26k | const uint8_t b2 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F); |
47 | 3.26k | const uint8_t b3 = 0x80 | static_cast<uint8_t>(c & 0x3F); |
48 | 3.26k | s.push_back(static_cast<char>(b0)); |
49 | 3.26k | s.push_back(static_cast<char>(b1)); |
50 | 3.26k | s.push_back(static_cast<char>(b2)); |
51 | 3.26k | s.push_back(static_cast<char>(b3)); |
52 | 3.26k | } |
53 | 1.07k | else |
54 | 1.07k | throw Decoding_Error("Invalid Unicode character"); |
55 | | |
56 | 146k | } |
57 | | |
58 | | } |
59 | | |
60 | | std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len) |
61 | 4.57k | { |
62 | 4.57k | if(len % 2 != 0) |
63 | 255 | throw Decoding_Error("Invalid length for UCS-2 string"); |
64 | | |
65 | 4.32k | const size_t chars = len / 2; |
66 | | |
67 | 4.32k | std::string s; |
68 | 27.3k | for(size_t i = 0; i != chars; ++i) |
69 | 23.0k | { |
70 | 23.0k | const uint32_t c = load_be<uint16_t>(ucs2, i); |
71 | 23.0k | append_utf8_for(s, c); |
72 | 23.0k | } |
73 | | |
74 | 4.32k | return s; |
75 | 4.57k | } |
76 | | |
77 | | std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len) |
78 | 3.47k | { |
79 | 3.47k | if(len % 4 != 0) |
80 | 289 | throw Decoding_Error("Invalid length for UCS-4 string"); |
81 | | |
82 | 3.18k | const size_t chars = len / 4; |
83 | | |
84 | 3.18k | std::string s; |
85 | 8.61k | for(size_t i = 0; i != chars; ++i) |
86 | 5.43k | { |
87 | 5.43k | const uint32_t c = load_be<uint32_t>(ucs4, i); |
88 | 5.43k | append_utf8_for(s, c); |
89 | 5.43k | } |
90 | | |
91 | 3.18k | return s; |
92 | 3.47k | } |
93 | | |
94 | | /* |
95 | | * Convert from ISO 8859-1 to UTF-8 |
96 | | */ |
97 | | std::string latin1_to_utf8(const uint8_t chars[], size_t len) |
98 | 7.56k | { |
99 | 7.56k | std::string s; |
100 | 126k | for(size_t i = 0; i != len; ++i) |
101 | 118k | { |
102 | 118k | const uint32_t c = static_cast<uint8_t>(chars[i]); |
103 | 118k | append_utf8_for(s, c); |
104 | 118k | } |
105 | 7.56k | return s; |
106 | 7.56k | } |
107 | | |
108 | | } |
109 | | |