/src/botan/src/lib/utils/charset.cpp
Line | Count | Source |
1 | | /* |
2 | | * Character Set Handling |
3 | | * (C) 1999-2007,2021 Jack Lloyd |
4 | | * |
5 | | * Botan is released under the Simplified BSD License (see license.txt) |
6 | | */ |
7 | | |
8 | | #include <botan/internal/charset.h> |
9 | | #include <botan/internal/loadstor.h> |
10 | | #include <botan/exceptn.h> |
11 | | |
12 | | namespace Botan { |
13 | | |
14 | | namespace { |
15 | | |
16 | | void append_utf8_for(std::string& s, uint32_t c) |
17 | 112k | { |
18 | 112k | if(c >= 0xD800 && c < 0xE000) |
19 | 283 | throw Decoding_Error("Invalid Unicode character"); |
20 | | |
21 | 112k | if(c <= 0x7F) |
22 | 63.6k | { |
23 | 63.6k | const uint8_t b0 = static_cast<uint8_t>(c); |
24 | 63.6k | s.push_back(static_cast<char>(b0)); |
25 | 63.6k | } |
26 | 49.0k | else if(c <= 0x7FF) |
27 | 34.8k | { |
28 | 34.8k | const uint8_t b0 = 0xC0 | static_cast<uint8_t>(c >> 6); |
29 | 34.8k | const uint8_t b1 = 0x80 | static_cast<uint8_t>(c & 0x3F); |
30 | 34.8k | s.push_back(static_cast<char>(b0)); |
31 | 34.8k | s.push_back(static_cast<char>(b1)); |
32 | 34.8k | } |
33 | 14.1k | else if(c <= 0xFFFF) |
34 | 12.4k | { |
35 | 12.4k | const uint8_t b0 = 0xE0 | static_cast<uint8_t>(c >> 12); |
36 | 12.4k | const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F); |
37 | 12.4k | const uint8_t b2 = 0x80 | static_cast<uint8_t>(c & 0x3F); |
38 | 12.4k | s.push_back(static_cast<char>(b0)); |
39 | 12.4k | s.push_back(static_cast<char>(b1)); |
40 | 12.4k | s.push_back(static_cast<char>(b2)); |
41 | 12.4k | } |
42 | 1.74k | else if(c <= 0x10FFFF) |
43 | 1.16k | { |
44 | 1.16k | const uint8_t b0 = 0xF0 | static_cast<uint8_t>(c >> 18); |
45 | 1.16k | const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 12) & 0x3F); |
46 | 1.16k | const uint8_t b2 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F); |
47 | 1.16k | const uint8_t b3 = 0x80 | static_cast<uint8_t>(c & 0x3F); |
48 | 1.16k | s.push_back(static_cast<char>(b0)); |
49 | 1.16k | s.push_back(static_cast<char>(b1)); |
50 | 1.16k | s.push_back(static_cast<char>(b2)); |
51 | 1.16k | s.push_back(static_cast<char>(b3)); |
52 | 1.16k | } |
53 | 572 | else |
54 | 572 | throw Decoding_Error("Invalid Unicode character"); |
55 | | |
56 | 112k | } |
57 | | |
58 | | } |
59 | | |
60 | | std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len) |
61 | 4.31k | { |
62 | 4.31k | if(len % 2 != 0) |
63 | 239 | throw Decoding_Error("Invalid length for UCS-2 string"); |
64 | | |
65 | 4.07k | const size_t chars = len / 2; |
66 | | |
67 | 4.07k | std::string s; |
68 | 22.1k | for(size_t i = 0; i != chars; ++i) |
69 | 18.1k | { |
70 | 18.1k | const uint32_t c = load_be<uint16_t>(ucs2, i); |
71 | 18.1k | append_utf8_for(s, c); |
72 | 18.1k | } |
73 | | |
74 | 4.07k | return s; |
75 | 4.07k | } |
76 | | |
77 | | std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len) |
78 | 3.11k | { |
79 | 3.11k | if(len % 4 != 0) |
80 | 231 | throw Decoding_Error("Invalid length for UCS-4 string"); |
81 | | |
82 | 2.88k | const size_t chars = len / 4; |
83 | | |
84 | 2.88k | std::string s; |
85 | 5.38k | for(size_t i = 0; i != chars; ++i) |
86 | 2.50k | { |
87 | 2.50k | const uint32_t c = load_be<uint32_t>(ucs4, i); |
88 | 2.50k | append_utf8_for(s, c); |
89 | 2.50k | } |
90 | | |
91 | 2.88k | return s; |
92 | 2.88k | } |
93 | | |
94 | | /* |
95 | | * Convert from ISO 8859-1 to UTF-8 |
96 | | */ |
97 | | std::string latin1_to_utf8(const uint8_t chars[], size_t len) |
98 | 3.84k | { |
99 | 3.84k | std::string s; |
100 | 96.1k | for(size_t i = 0; i != len; ++i) |
101 | 92.3k | { |
102 | 92.3k | const uint32_t c = static_cast<uint8_t>(chars[i]); |
103 | 92.3k | append_utf8_for(s, c); |
104 | 92.3k | } |
105 | 3.84k | return s; |
106 | 3.84k | } |
107 | | |
108 | | } |
109 | | |