/src/wt/src/Wt/WStringUtil.C
Line | Count | Source |
1 | | /* |
2 | | * Copyright (C) 2008 Emweb bv, Herent, Belgium. |
3 | | * |
4 | | * See the LICENSE file for terms of use. |
5 | | */ |
6 | | |
7 | | #include "Wt/WLogger.h" |
8 | | #include "Wt/WStringUtil.h" |
9 | | |
10 | | #include "thirdparty/rapidxml/rapidxml.hpp" |
11 | | |
12 | | #include <locale> |
13 | | |
14 | | #if WCHAR_MAX == 0xFFFF |
15 | | #define TWO_BYTE_CHAR |
16 | | #else |
17 | | #define FOUR_BYTE_CHAR |
18 | | #endif |
19 | | |
20 | | namespace Wt { |
21 | | |
22 | | LOGGER("WString"); |
23 | | |
24 | | namespace { |
// Number of wide characters converted per codecvt call in do_widen(); the
// scratch array there is declared with one extra element beyond this size.
static const std::size_t stack_buffer_size = 512;
26 | | |
27 | | template<typename OutStrT> |
28 | | OutStrT do_widen(const std::string& s, const std::locale &loc) |
29 | 0 | { |
30 | 0 | typedef typename OutStrT::value_type OutCharT; |
31 | |
|
32 | 0 | typedef std::codecvt<wchar_t, char, std::mbstate_t> Cvt; |
33 | |
|
34 | 0 | OutStrT result; |
35 | 0 | result.reserve(s.length()); |
36 | |
|
37 | 0 | const Cvt& myfacet = std::use_facet<Cvt>(loc); |
38 | 0 | Cvt::result myresult; |
39 | 0 | std::mbstate_t mystate = std::mbstate_t(); |
40 | |
|
41 | 0 | wchar_t stack_buffer[stack_buffer_size + 1]; |
42 | 0 | const char* next_to_convert = s.c_str(); |
43 | 0 | const char* const to_convert_end = s.c_str() + s.length(); |
44 | |
|
45 | 0 | bool error = false; |
46 | |
|
47 | 0 | while (next_to_convert != to_convert_end) { |
48 | 0 | wchar_t* converted_end = stack_buffer; |
49 | 0 | myresult = myfacet.in(mystate, next_to_convert, to_convert_end, |
50 | 0 | next_to_convert, |
51 | 0 | stack_buffer, stack_buffer + stack_buffer_size, |
52 | 0 | converted_end); |
53 | |
|
54 | 0 | result.append((OutCharT*)stack_buffer, (OutCharT*)converted_end); |
55 | |
|
56 | 0 | if (myresult == Cvt::error) { |
57 | 0 | result += '?'; |
58 | 0 | ++ next_to_convert; |
59 | 0 | error = true; |
60 | 0 | } |
61 | 0 | } |
62 | |
|
63 | 0 | if (error) |
64 | 0 | LOG_ERROR("widen(): could not widen string: " << s); |
65 | |
|
66 | 0 | return result; |
67 | 0 | } Unexecuted instantiation: WStringUtil.C:std::__cxx11::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > Wt::(anonymous namespace)::do_widen<std::__cxx11::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > >(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::locale const&) Unexecuted instantiation: WStringUtil.C:std::__cxx11::basic_string<char32_t, std::char_traits<char32_t>, std::allocator<char32_t> > Wt::(anonymous namespace)::do_widen<std::__cxx11::basic_string<char32_t, std::char_traits<char32_t>, std::allocator<char32_t> > >(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::locale const&) |
68 | | |
69 | | template<typename InStrT> |
70 | | std::string do_narrow(const InStrT &s, const std::locale &loc) |
71 | 0 | { |
72 | 0 | typedef std::codecvt<wchar_t, char, std::mbstate_t> cvt; |
73 | |
|
74 | 0 | const cvt& myfacet = std::use_facet<cvt>(loc); |
75 | |
|
76 | 0 | cvt::result myresult; |
77 | |
|
78 | 0 | const wchar_t *pwstr = (const wchar_t*)(s.c_str()); |
79 | 0 | const wchar_t *pwend = (const wchar_t*)(s.c_str() + s.length()); |
80 | 0 | const wchar_t *pwc = pwstr; |
81 | |
|
82 | 0 | int size = s.length() + 1; |
83 | |
|
84 | 0 | char *pstr = (char*)std::malloc(size); |
85 | 0 | char *pc = pstr; |
86 | |
|
87 | 0 | std::mbstate_t mystate = std::mbstate_t(); |
88 | 0 | bool error = false; |
89 | |
|
90 | 0 | for (;;) { |
91 | 0 | myresult = myfacet.out(mystate, pwc, pwend, pwc, pc, pstr + size, pc); |
92 | |
|
93 | 0 | if (myresult == cvt::ok) { |
94 | 0 | break; |
95 | 0 | } else { |
96 | 0 | if (myresult == cvt::partial || pc >= pstr + size) { |
97 | 0 | size += s.length(); |
98 | 0 | std::size_t sofar = pc - pstr; |
99 | 0 | pstr = (char *)std::realloc(pstr, size); |
100 | 0 | pc = pstr + sofar; |
101 | 0 | } |
102 | |
|
103 | 0 | if (myresult == cvt::error) { |
104 | 0 | *pc++ = '?'; |
105 | 0 | error = true; |
106 | | #ifdef TWO_BYTE_CHAR |
107 | | if (*pwc >= 0xD800 && |
108 | | *pwc < 0xDC00) |
109 | | ++pwc; // skip low surrogate too |
110 | | if (pwc == pwend) |
111 | | break; // highly unusual |
112 | | #endif |
113 | 0 | ++pwc; |
114 | 0 | } |
115 | 0 | } |
116 | 0 | } |
117 | |
|
118 | 0 | std::string result(pstr, pc - pstr); |
119 | |
|
120 | 0 | if (error) |
121 | 0 | LOG_WARN("narrow(): loss of detail: " << result); |
122 | |
|
123 | 0 | std::free(pstr); |
124 | |
|
125 | 0 | return result; |
126 | 0 | } Unexecuted instantiation: WStringUtil.C:std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > Wt::(anonymous namespace)::do_narrow<std::__cxx11::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > >(std::__cxx11::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > const&, std::locale const&) Unexecuted instantiation: WStringUtil.C:std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > Wt::(anonymous namespace)::do_narrow<std::__cxx11::basic_string<char32_t, std::char_traits<char32_t>, std::allocator<char32_t> > >(std::__cxx11::basic_string<char32_t, std::char_traits<char32_t>, std::allocator<char32_t> > const&, std::locale const&) |
127 | | |
/*
 * Decodes the UTF-16 code units in s into one output character per code
 * point.
 *
 * Units outside the surrogate range 0xD800-0xDFFF are copied as-is. A high
 * surrogate immediately followed by a low surrogate is combined into a
 * single supplementary code point. Any other surrogate (unpaired, or a low
 * surrogate first) yields U+FFFD.
 */
template<typename OutStrT, typename InStrT>
OutStrT utf16_to_utf32(const InStrT &s)
{
  typedef typename InStrT::value_type InCharT;
  typedef typename OutStrT::value_type OutCharT;

  const std::size_t count = s.size();

  OutStrT decoded;
  decoded.reserve(count);

  std::size_t pos = 0;
  while (pos < count) {
    const InCharT unit = s[pos];
    ++pos;

    // Ordinary BMP character: pass through unchanged.
    if (unit < 0xD800 || unit > 0xDFFF) {
      decoded.push_back((OutCharT) unit);
      continue;
    }

    // unit is a surrogate here; only "high followed by low" is well-formed.
    if (unit < 0xDC00 && pos < count) {
      const InCharT low = s[pos];
      if (low >= 0xDC00 && low <= 0xDFFF) {
        decoded.push_back((OutCharT)
                          (0x10000 + ((unit - 0xD800) << 10) + (low - 0xDC00)));
        ++pos; // the low surrogate has been consumed as well
        continue;
      }
    }

    decoded.push_back((OutCharT) 0xFFFD); // invalid
  }

  return decoded;
}
149 | | |
/*
 * Encodes the code points in s as UTF-16 code units.
 *
 * Code points below 0x10000 become a single unit, code points in
 * 0x10000-0x10FFFF become a surrogate pair. Values that are not Unicode
 * scalar values -- surrogates (0xD800-0xDFFF) and anything above 0x10FFFF,
 * which includes negative values of a signed input character type -- are
 * replaced by U+FFFD instead of producing malformed surrogate units.
 */
template<typename OutStrT, typename InStrT>
OutStrT utf32_to_utf16(const InStrT &s)
{
  typedef typename OutStrT::value_type OutCharT;

  OutStrT result;
  result.reserve(s.size());

  for (std::size_t i = 0; i < s.size(); ++i) {
    // Work on an unsigned 32-bit value so that range checks are not fooled
    // by a signed wchar_t.
    const char32_t c = (char32_t) s[i];

    if (c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {
      result.push_back((OutCharT) 0xFFFD); // invalid
    } else if (c < 0x10000) {
      result.push_back((OutCharT) c);
    } else {
      const char32_t offset = c - 0x10000;
      result.push_back((OutCharT) ((offset >> 10) + 0xD800));
      result.push_back((OutCharT) ((offset & 0x3FF) + 0xDC00));
    }
  }

  return result;
}
169 | | |
/*
 * Decodes the UTF-8 string s into one output character per code point.
 *
 * Accepted input:
 *  - ASCII: TAB, LF, CR and 0x20-0x7F (other control characters are
 *    replaced by U+FFFD)
 *  - C2-DF 80-BF
 *  - E0 A0-BF 80-BF, E1-EC 80-BF 80-BF, ED 80-9F 80-BF, EE-EF 80-BF 80-BF
 *  - F0 90-BF 80-BF 80-BF, F1-F3 80-BF 80-BF 80-BF, F4 80-8F 80-BF 80-BF
 *
 * This excludes overlong encodings, encoded surrogates (ED A0-BF ..) and
 * values above U+10FFFF, while accepting the full U+100000-U+10FFFF range.
 *
 * A byte that does not start a well-formed sequence is replaced by U+FFFD
 * and decoding resumes at the very next byte, so valid characters following
 * a broken sequence are never dropped. A well-formed sequence whose code
 * point does not fit in OutStrT::value_type yields a single U+FFFD.
 */
template<typename OutStrT>
OutStrT utf8_to_utf32(const std::string &s)
{
  typedef typename OutStrT::value_type char_type;

  const char_type replacement = (char_type)0xFFFD;
  const std::size_t len = s.length();

  OutStrT result;
  result.reserve(len);

  std::size_t i = 0;
  while (i < len) {
    const unsigned char lead = (unsigned char)s[i];

    if (lead <= 0x7F) {
      if (lead == 0x09 || lead == 0x0A || lead == 0x0D || lead >= 0x20)
        result += (char_type)lead;
      else
        result += replacement;
      ++i;
      continue;
    }

    // Number of continuation bytes expected after the lead byte (0 when the
    // lead byte is itself invalid), and the allowed range of the first
    // continuation byte, which is narrower for some lead bytes.
    std::size_t trail = 0;
    unsigned char second_min = 0x80;
    unsigned char second_max = 0xBF;
    uint32_t cp = 0;

    if (0xC2 <= lead && lead <= 0xDF) {
      trail = 1;
      cp = lead & 0x1F;
    } else if (0xE0 <= lead && lead <= 0xEF) {
      trail = 2;
      cp = lead & 0x0F;
      if (lead == 0xE0)
        second_min = 0xA0; // below would be an overlong encoding
      else if (lead == 0xED)
        second_max = 0x9F; // above would encode a surrogate
    } else if (0xF0 <= lead && lead <= 0xF4) {
      trail = 3;
      cp = lead & 0x07;
      if (lead == 0xF0)
        second_min = 0x90; // below would be an overlong encoding
      else if (lead == 0xF4)
        second_max = 0x8F; // above would exceed U+10FFFF
    }

    // All continuation bytes must be present and in range.
    bool legal = trail != 0 && trail < len - i;
    for (std::size_t j = 1; legal && j <= trail; ++j) {
      const unsigned char b = (unsigned char)s[i + j];
      const unsigned char lo = (j == 1) ? second_min : 0x80;
      const unsigned char hi = (j == 1) ? second_max : 0xBF;
      if (lo <= b && b <= hi)
        cp = (cp << 6) | (b & 0x3F);
      else
        legal = false;
    }

    if (!legal) {
      result += replacement;
      ++i; // resynchronize on the next byte
      continue;
    }

    // Guard against output character types that cannot hold the code point.
    const char_type wc = (char_type)cp;
    result += ((uint32_t)wc == cp) ? wc : replacement;
    i += trail + 1;
  }

  return result;
}
279 | | |
280 | | template<typename InStrT> |
281 | | std::string utf32_to_utf8(const InStrT &s) |
282 | 0 | { |
283 | 0 | std::string result; |
284 | 0 | result.reserve(s.length() * 3); |
285 | |
|
286 | 0 | char buf[4]; |
287 | 0 | for (typename InStrT::const_iterator i = s.begin(); i != s.end(); ++i) { |
288 | 0 | char *end = buf; |
289 | 0 | try { |
290 | 0 | Wt::rapidxml::xml_document<>::insert_coded_character<0>(end, *i); |
291 | 0 | for (char *b = buf; b != end; ++b) |
292 | 0 | result += *b; |
293 | 0 | } catch (Wt::rapidxml::parse_error& e) { |
294 | 0 | LOG_ERROR("toUTF8(): " << e.what()); |
295 | 0 | } |
296 | 0 | } |
297 | |
|
298 | 0 | return result; |
299 | 0 | } Unexecuted instantiation: WStringUtil.C:std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > Wt::(anonymous namespace)::utf32_to_utf8<std::__cxx11::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > >(std::__cxx11::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > const&) Unexecuted instantiation: WStringUtil.C:std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > Wt::(anonymous namespace)::utf32_to_utf8<std::__cxx11::basic_string<char32_t, std::char_traits<char32_t>, std::allocator<char32_t> > >(std::__cxx11::basic_string<char32_t, std::char_traits<char32_t>, std::allocator<char32_t> > const&) |
300 | | } |
301 | | |
302 | | std::wstring widen(const std::string& s, const std::locale &loc) |
303 | 0 | { |
304 | 0 | return do_widen<std::wstring>(s, loc); |
305 | 0 | } |
306 | | |
307 | | std::u16string toUTF16(const std::string& s, const std::locale &loc) |
308 | 0 | { |
309 | | #ifdef TWO_BYTE_CHAR |
310 | | return do_widen<std::u16string>(s, loc); |
311 | | #else |
312 | 0 | return utf32_to_utf16<std::u16string>(do_widen<std::u32string>(s, loc)); |
313 | 0 | #endif |
314 | 0 | } |
315 | | |
316 | | std::u32string toUTF32(const std::string& s, const std::locale &loc) |
317 | 0 | { |
318 | | #ifdef TWO_BYTE_CHAR |
319 | | return utf16_to_utf32<std::u32string>(do_widen<std::u16string>(s, loc)); |
320 | | #else |
321 | 0 | return do_widen<std::u32string>(s, loc); |
322 | 0 | #endif |
323 | 0 | } |
324 | | |
325 | | std::string narrow(const std::wstring& s, const std::locale &loc) |
326 | 0 | { |
327 | 0 | return do_narrow(s, loc); |
328 | 0 | } |
329 | | |
330 | | std::string narrow(const std::u16string& s, const std::locale &loc) |
331 | 0 | { |
332 | | #ifdef TWO_BYTE_CHAR |
333 | | return do_narrow(s, loc); |
334 | | #else |
335 | 0 | return do_narrow(utf16_to_utf32<std::wstring>(s), loc); |
336 | 0 | #endif |
337 | 0 | } |
338 | | |
339 | | std::string narrow(const std::u32string& s, const std::locale &loc) |
340 | 0 | { |
341 | | #ifdef TWO_BYTE_CHAR |
342 | | return do_narrow(utf32_to_utf16<std::wstring>(s), loc); |
343 | | #else |
344 | 0 | return do_narrow(s, loc); |
345 | 0 | #endif |
346 | 0 | } |
347 | | |
348 | | std::string toUTF8(const std::wstring& s) |
349 | 0 | { |
350 | | #ifdef TWO_BYTE_CHAR |
351 | | return utf32_to_utf8(utf16_to_utf32<std::u32string>(s)); |
352 | | #else |
353 | 0 | return utf32_to_utf8(s); |
354 | 0 | #endif |
355 | 0 | } |
356 | | |
357 | | std::string toUTF8(const std::u16string& s) |
358 | 0 | { |
359 | 0 | return utf32_to_utf8(utf16_to_utf32<std::u32string>(s)); |
360 | 0 | } |
361 | | |
362 | | std::string toUTF8(const std::u32string& s) |
363 | 0 | { |
364 | 0 | return utf32_to_utf8(s); |
365 | 0 | } |
366 | | |
367 | | std::wstring fromUTF8(const std::string& s) |
368 | 0 | { |
369 | | #ifdef TWO_BYTE_CHAR |
370 | | return utf32_to_utf16<std::wstring>(utf8_to_utf32<std::u32string>(s)); |
371 | | #else |
372 | 0 | return utf8_to_utf32<std::wstring>(s); |
373 | 0 | #endif |
374 | 0 | } |
375 | | |
376 | | std::u16string utf8ToUTF16(const std::string &s) |
377 | 0 | { |
378 | 0 | return utf32_to_utf16<std::u16string>(utf8_to_utf32<std::u32string>(s)); |
379 | 0 | } |
380 | | |
381 | | std::u32string utf8ToUTF32(const std::string &s) |
382 | 0 | { |
383 | 0 | return utf8_to_utf32<std::u32string>(s); |
384 | 0 | } |
385 | | |
386 | | std::string fromUTF8(const std::string& s, const std::locale &loc) |
387 | 0 | { |
388 | 0 | return narrow(fromUTF8(s), loc); |
389 | 0 | } |
390 | | |
391 | | std::string toUTF8(const std::string& s, const std::locale &loc) |
392 | 0 | { |
393 | 0 | return toUTF8(widen(s, loc)); |
394 | 0 | } |
395 | | |
396 | | std::u16string toUTF16(const std::wstring& s) |
397 | 0 | { |
398 | | #ifdef TWO_BYTE_CHAR |
399 | | return std::u16string((const char16_t*)s.c_str()); |
400 | | #else |
401 | 0 | return utf32_to_utf16<std::u16string>(s); |
402 | 0 | #endif |
403 | 0 | } |
404 | | |
405 | | std::u16string toUTF16(const std::u32string& s) |
406 | 0 | { |
407 | 0 | return utf32_to_utf16<std::u16string>(s); |
408 | 0 | } |
409 | | |
/*
 * Converts the wide string s to UTF-32.
 *
 * Where wchar_t is two bytes wide, s is treated as UTF-16 and decoded.
 * Otherwise each wide character is taken to be one code point and copied
 * one by one. The copy covers the full length of s, so embedded NUL
 * characters no longer truncate the result.
 */
std::u32string toUTF32(const std::wstring& s)
{
#ifdef TWO_BYTE_CHAR
  return utf16_to_utf32<std::u32string>(s);
#else
  return std::u32string(s.begin(), s.end());
#endif
}
418 | | |
419 | | std::u32string toUTF32(const std::u16string& s) |
420 | 0 | { |
421 | 0 | return utf16_to_utf32<std::u32string>(s); |
422 | 0 | } |
423 | | |
424 | | std::wostream& streamUTF8(std::wostream &os, const std::string &s) |
425 | 0 | { |
426 | | #ifdef TWO_BYTE_CHAR |
427 | | os << utf32_to_utf16<std::wstring>(utf8_to_utf32<std::u32string>(s)); |
428 | | #else |
429 | 0 | os << utf8_to_utf32<std::wstring>(s); |
430 | 0 | #endif |
431 | 0 | return os; |
432 | 0 | } |
433 | | |
namespace {
  /*
   * Starting at byte offset pos in s, steps over at most count UTF-8
   * encoded characters and returns the resulting byte offset.
   *
   * The length of each character is derived from its lead byte only. Bytes
   * that are not a valid lead byte count as one character of one byte. The
   * returned offset never exceeds s.size(), even when the last character is
   * truncated.
   */
  std::size_t utf8_advance_bytes(const std::string &s, std::size_t pos,
                                 int count)
  {
    for (int i = 0; i < count && pos < s.size(); ++i) {
      const unsigned char c = s[pos];

      std::size_t step = 1; // ASCII, or invalid lead byte
      if ((c & 0xe0) == 0xc0)
        step = 2;
      else if ((c & 0xf0) == 0xe0)
        step = 3;
      else if ((c & 0xf8) == 0xf0)
        step = 4;

      // Clamp so that a truncated trailing sequence cannot move the offset
      // beyond the end of the string.
      pos = (s.size() - pos < step) ? s.size() : pos + step;
    }

    return pos;
  }
}

/*
 * Returns the part of the UTF-8 string s that starts after the first begin
 * characters and spans length characters, or runs to the end of s when
 * length is -1. Positions are counted in UTF-8 encoded characters, not
 * bytes.
 *
 * A begin at or beyond the end of s yields an empty string (this includes
 * strings ending in a truncated multibyte sequence, which previously made
 * std::string::substr throw std::out_of_range). A negative begin is treated
 * as 0, and a negative length other than -1 yields an empty string.
 */
std::string UTF8Substr(const std::string &s, int begin, int length)
{
  // beginPos and endPos are byte positions in s
  const std::size_t beginPos = utf8_advance_bytes(s, 0, begin);

  if (length == -1)
    return s.substr(beginPos);

  const std::size_t endPos = utf8_advance_bytes(s, beginPos, length);
  return s.substr(beginPos, endPos - beginPos);
}
468 | | |
469 | | } |