Coverage Report

Created: 2025-11-09 06:21

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/wt/src/Wt/WStringUtil.C
Line
Count
Source
1
/*
2
 * Copyright (C) 2008 Emweb bv, Herent, Belgium.
3
 *
4
 * See the LICENSE file for terms of use.
5
 */
6
7
#include "Wt/WLogger.h"
8
#include "Wt/WStringUtil.h"
9
10
#include "thirdparty/rapidxml/rapidxml.hpp"
11
12
#include <locale>
13
14
#if WCHAR_MAX == 0xFFFF
15
#define TWO_BYTE_CHAR
16
#else
17
#define FOUR_BYTE_CHAR
18
#endif
19
20
namespace Wt {
21
22
LOGGER("WString");
23
24
namespace {
25
  static const std::size_t stack_buffer_size = 512;
26
27
  template<typename OutStrT>
28
  OutStrT do_widen(const std::string& s, const std::locale &loc)
29
0
  {
30
0
    typedef typename OutStrT::value_type OutCharT;
31
32
0
    typedef std::codecvt<wchar_t, char, std::mbstate_t> Cvt;
33
34
0
    OutStrT result;
35
0
    result.reserve(s.length());
36
37
0
    const Cvt& myfacet = std::use_facet<Cvt>(loc);
38
0
    Cvt::result myresult;
39
0
    std::mbstate_t mystate = std::mbstate_t();
40
41
0
    wchar_t stack_buffer[stack_buffer_size + 1];
42
0
    const char* next_to_convert = s.c_str();
43
0
    const char* const to_convert_end = s.c_str() + s.length();
44
45
0
    bool error = false;
46
47
0
    while (next_to_convert != to_convert_end) {
48
0
      wchar_t* converted_end = stack_buffer;
49
0
      myresult = myfacet.in(mystate, next_to_convert, to_convert_end,
50
0
                            next_to_convert,
51
0
                            stack_buffer, stack_buffer + stack_buffer_size,
52
0
                            converted_end);
53
54
0
      result.append((OutCharT*)stack_buffer, (OutCharT*)converted_end);
55
56
0
      if (myresult == Cvt::error) {
57
0
        result += '?';
58
0
        ++ next_to_convert;
59
0
        error = true;
60
0
      }
61
0
    }
62
63
0
    if (error)
64
0
      LOG_ERROR("widen(): could not widen string: " << s);
65
66
0
    return result;
67
0
  }
Unexecuted instantiation: WStringUtil.C:std::__cxx11::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > Wt::(anonymous namespace)::do_widen<std::__cxx11::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > >(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::locale const&)
Unexecuted instantiation: WStringUtil.C:std::__cxx11::basic_string<char32_t, std::char_traits<char32_t>, std::allocator<char32_t> > Wt::(anonymous namespace)::do_widen<std::__cxx11::basic_string<char32_t, std::char_traits<char32_t>, std::allocator<char32_t> > >(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::locale const&)
68
69
  template<typename InStrT>
70
  std::string do_narrow(const InStrT &s, const std::locale &loc)
71
0
  {
72
0
    typedef std::codecvt<wchar_t, char, std::mbstate_t> cvt;
73
74
0
    const cvt& myfacet = std::use_facet<cvt>(loc);
75
76
0
    cvt::result myresult;
77
78
0
    const wchar_t *pwstr = (const wchar_t*)(s.c_str());
79
0
    const wchar_t *pwend = (const wchar_t*)(s.c_str() + s.length());
80
0
    const wchar_t *pwc = pwstr;
81
82
0
    int size = s.length() + 1;
83
84
0
    char *pstr = (char*)std::malloc(size);
85
0
    char *pc = pstr;
86
87
0
    std::mbstate_t mystate = std::mbstate_t();
88
0
    bool error = false;
89
90
0
    for (;;) {
91
0
      myresult = myfacet.out(mystate, pwc, pwend, pwc, pc, pstr + size, pc);
92
93
0
      if (myresult == cvt::ok) {
94
0
        break;
95
0
      } else {
96
0
        if (myresult == cvt::partial || pc >= pstr + size) {
97
0
          size += s.length();
98
0
          std::size_t sofar = pc - pstr;
99
0
          pstr = (char *)std::realloc(pstr, size);
100
0
          pc = pstr + sofar;
101
0
        }
102
103
0
        if (myresult == cvt::error) {
104
0
          *pc++ = '?';
105
0
          error = true;
106
#ifdef TWO_BYTE_CHAR
107
          if (*pwc >= 0xD800 &&
108
              *pwc < 0xDC00)
109
            ++pwc; // skip low surrogate too
110
          if (pwc == pwend)
111
            break; // highly unusual
112
#endif
113
0
          ++pwc;
114
0
        }
115
0
      }
116
0
    }
117
118
0
    std::string result(pstr, pc - pstr);
119
120
0
    if (error)
121
0
      LOG_WARN("narrow(): loss of detail: " << result);
122
123
0
    std::free(pstr);
124
125
0
    return result;
126
0
  }
Unexecuted instantiation: WStringUtil.C:std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > Wt::(anonymous namespace)::do_narrow<std::__cxx11::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > >(std::__cxx11::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > const&, std::locale const&)
Unexecuted instantiation: WStringUtil.C:std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > Wt::(anonymous namespace)::do_narrow<std::__cxx11::basic_string<char32_t, std::char_traits<char32_t>, std::allocator<char32_t> > >(std::__cxx11::basic_string<char32_t, std::char_traits<char32_t>, std::allocator<char32_t> > const&, std::locale const&)
127
128
  /*
   * Decodes the UTF-16 code units in s into one code point per output
   * element.
   *
   * A high surrogate followed by a low surrogate is combined into a single
   * supplementary code point. Any other surrogate (unpaired, or a low
   * surrogate first) is replaced by U+FFFD.
   */
  template<typename OutStrT, typename InStrT>
  OutStrT utf16_to_utf32(const InStrT &s)
  {
    typedef typename OutStrT::value_type out_char;
    typedef typename InStrT::value_type in_char;

    const std::size_t count = s.size();

    OutStrT decoded;
    decoded.reserve(count);

    std::size_t pos = 0;
    while (pos < count) {
      const in_char unit = s[pos];
      ++pos;

      // Not a surrogate: the unit is the code point.
      if (unit < 0xD800 || unit > 0xDFFF) {
        decoded.push_back((out_char) unit);
        continue;
      }

      const bool high = unit < 0xDC00;
      const bool low_follows =
        pos < count && s[pos] >= 0xDC00 && s[pos] <= 0xDFFF;

      if (high && low_follows) {
        decoded.push_back((out_char)
                          (0x10000 + ((unit - 0xD800) << 10) + (s[pos] - 0xDC00)));
        ++pos; // the low surrogate has been consumed as well
      } else {
        decoded.push_back((out_char) 0xFFFD); // invalid
      }
    }

    return decoded;
  }
Unexecuted instantiation: WStringUtil.C:std::__cxx11::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > Wt::(anonymous namespace)::utf16_to_utf32<std::__cxx11::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >, std::__cxx11::basic_string<char16_t, std::char_traits<char16_t>, std::allocator<char16_t> > >(std::__cxx11::basic_string<char16_t, std::char_traits<char16_t>, std::allocator<char16_t> > const&)
Unexecuted instantiation: WStringUtil.C:std::__cxx11::basic_string<char32_t, std::char_traits<char32_t>, std::allocator<char32_t> > Wt::(anonymous namespace)::utf16_to_utf32<std::__cxx11::basic_string<char32_t, std::char_traits<char32_t>, std::allocator<char32_t> >, std::__cxx11::basic_string<char16_t, std::char_traits<char16_t>, std::allocator<char16_t> > >(std::__cxx11::basic_string<char16_t, std::char_traits<char16_t>, std::allocator<char16_t> > const&)
149
150
  /*
   * Encodes the code points in s as UTF-16 code units.
   *
   * Code points in the Basic Multilingual Plane are copied as is, code
   * points in U+10000..U+10FFFF become a surrogate pair. Anything that is
   * not a Unicode scalar value (a surrogate, a value above U+10FFFF, or a
   * negative value when the input character type is signed) is replaced by
   * U+FFFD.
   */
  template<typename OutStrT, typename InStrT>
  OutStrT utf32_to_utf16(const InStrT &s)
  {
    typedef typename OutStrT::value_type out_char;

    OutStrT result;
    result.reserve(s.size());

    for (std::size_t i = 0; i < s.size(); ++i) {
      // Work on an unsigned value: with a signed input type (wchar_t on
      // some platforms) a negative element would otherwise pass the
      // "< 0x10000" test and be truncated into the output.
      const unsigned long c = static_cast<unsigned long>(s[i]);

      if (c < 0xD800UL || (c > 0xDFFFUL && c < 0x10000UL)) {
        result.push_back((out_char) c);
      } else if (c >= 0x10000UL && c <= 0x10FFFFUL) {
        const unsigned long v = c - 0x10000UL;
        result.push_back((out_char) ((v >> 10) + 0xD800UL));
        result.push_back((out_char) ((v & 0x3FFUL) + 0xDC00UL));
      } else {
        // Surrogate or out of range: a surrogate pair cannot encode it.
        result.push_back((out_char) 0xFFFD); // invalid
      }
    }

    return result;
  }
Unexecuted instantiation: WStringUtil.C:std::__cxx11::basic_string<char16_t, std::char_traits<char16_t>, std::allocator<char16_t> > Wt::(anonymous namespace)::utf32_to_utf16<std::__cxx11::basic_string<char16_t, std::char_traits<char16_t>, std::allocator<char16_t> >, std::__cxx11::basic_string<char32_t, std::char_traits<char32_t>, std::allocator<char32_t> > >(std::__cxx11::basic_string<char32_t, std::char_traits<char32_t>, std::allocator<char32_t> > const&)
Unexecuted instantiation: WStringUtil.C:std::__cxx11::basic_string<char16_t, std::char_traits<char16_t>, std::allocator<char16_t> > Wt::(anonymous namespace)::utf32_to_utf16<std::__cxx11::basic_string<char16_t, std::char_traits<char16_t>, std::allocator<char16_t> >, std::__cxx11::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > >(std::__cxx11::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > const&)
169
170
  /*
   * Decodes the UTF-8 string s into one code point per output element.
   *
   * Only well-formed sequences are accepted: no overlong forms, no encoded
   * surrogates, nothing above U+10FFFF. ASCII control characters other
   * than TAB, LF and CR are rejected as well. Each rejected character or
   * ill-formed sequence yields a single U+FFFD.
   *
   * An ill-formed sequence only consumes the bytes that actually belong to
   * it (the lead byte plus the continuation bytes that were valid so far),
   * so valid text following a stray byte is never swallowed.
   *
   * A code point that does not fit in OutStrT::value_type is also replaced
   * by U+FFFD.
   */
  template<typename OutStrT>
  OutStrT utf8_to_utf32(const std::string &s)
  {
    typedef typename OutStrT::value_type char_type;

    const char_type replacement = (char_type)0xFFFD;
    const std::size_t size = s.length();

    OutStrT result;
    result.reserve(size);

    std::size_t i = 0;
    while (i < size) {
      const unsigned char lead = (unsigned char)s[i];

      if (lead <= 0x7F) {
        const bool allowed =
          lead == 0x09 || lead == 0x0A || lead == 0x0D || lead >= 0x20;
        result += allowed ? (char_type)lead : replacement;
        ++i;
        continue;
      }

      // Expected sequence length, initial code point bits and the range
      // allowed for the first continuation byte. len stays 0 for bytes that
      // can never start a sequence (80-BF, C0, C1, F5-FF).
      std::size_t len = 0;
      unsigned char lo = 0x80;
      unsigned char hi = 0xBF;
      uint32_t cp = 0;

      if (lead >= 0xC2 && lead <= 0xDF) {
        len = 2;
        cp = lead & 0x1F;
      } else if (lead >= 0xE0 && lead <= 0xEF) {
        len = 3;
        cp = lead & 0x0F;
        if (lead == 0xE0)
          lo = 0xA0; // exclude overlong encodings
        else if (lead == 0xED)
          hi = 0x9F; // exclude surrogates U+D800..U+DFFF
      } else if (lead >= 0xF0 && lead <= 0xF4) {
        len = 4;
        cp = lead & 0x07;
        if (lead == 0xF0)
          lo = 0x90; // exclude overlong encodings
        else if (lead == 0xF4)
          hi = 0x8F; // exclude code points above U+10FFFF
      }

      std::size_t consumed = 1;
      while (consumed < len && i + consumed < size) {
        const unsigned char next = (unsigned char)s[i + consumed];
        if (next < lo || next > hi)
          break;
        cp = (cp << 6) | (next & 0x3F);
        ++consumed;
        // Only the first continuation byte has a restricted range.
        lo = 0x80;
        hi = 0xBF;
      }

      const bool complete = len != 0 && consumed == len;
      const char_type wc = (char_type)cp;

      if (complete && (uint32_t)wc == cp)
        result += wc;
      else
        result += replacement;

      i += consumed;
    }

    return result;
  }
Unexecuted instantiation: WStringUtil.C:std::__cxx11::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > Wt::(anonymous namespace)::utf8_to_utf32<std::__cxx11::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > >(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)
Unexecuted instantiation: WStringUtil.C:std::__cxx11::basic_string<char32_t, std::char_traits<char32_t>, std::allocator<char32_t> > Wt::(anonymous namespace)::utf8_to_utf32<std::__cxx11::basic_string<char32_t, std::char_traits<char32_t>, std::allocator<char32_t> > >(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)
279
280
  template<typename InStrT>
281
  std::string utf32_to_utf8(const InStrT &s)
282
0
  {
283
0
    std::string result;
284
0
    result.reserve(s.length() * 3);
285
286
0
    char buf[4];
287
0
    for (typename InStrT::const_iterator i = s.begin(); i != s.end(); ++i) {
288
0
      char *end = buf;
289
0
      try {
290
0
        Wt::rapidxml::xml_document<>::insert_coded_character<0>(end, *i);
291
0
        for (char *b = buf; b != end; ++b)
292
0
          result += *b;
293
0
      } catch (Wt::rapidxml::parse_error& e) {
294
0
        LOG_ERROR("toUTF8(): " << e.what());
295
0
      }
296
0
    }
297
298
0
    return result;
299
0
  }
Unexecuted instantiation: WStringUtil.C:std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > Wt::(anonymous namespace)::utf32_to_utf8<std::__cxx11::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > >(std::__cxx11::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > const&)
Unexecuted instantiation: WStringUtil.C:std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > Wt::(anonymous namespace)::utf32_to_utf8<std::__cxx11::basic_string<char32_t, std::char_traits<char32_t>, std::allocator<char32_t> > >(std::__cxx11::basic_string<char32_t, std::char_traits<char32_t>, std::allocator<char32_t> > const&)
300
}
301
302
std::wstring widen(const std::string& s, const std::locale &loc)
303
0
{
304
0
  return do_widen<std::wstring>(s, loc);
305
0
}
306
307
std::u16string toUTF16(const std::string& s, const std::locale &loc)
308
0
{
309
#ifdef TWO_BYTE_CHAR
310
  return do_widen<std::u16string>(s, loc);
311
#else
312
0
  return utf32_to_utf16<std::u16string>(do_widen<std::u32string>(s, loc));
313
0
#endif
314
0
}
315
316
std::u32string toUTF32(const std::string& s, const std::locale &loc)
317
0
{
318
#ifdef TWO_BYTE_CHAR
319
  return utf16_to_utf32<std::u32string>(do_widen<std::u16string>(s, loc));
320
#else
321
0
  return do_widen<std::u32string>(s, loc);
322
0
#endif
323
0
}
324
325
std::string narrow(const std::wstring& s, const std::locale &loc)
326
0
{
327
0
  return do_narrow(s, loc);
328
0
}
329
330
std::string narrow(const std::u16string& s, const std::locale &loc)
331
0
{
332
#ifdef TWO_BYTE_CHAR
333
  return do_narrow(s, loc);
334
#else
335
0
  return do_narrow(utf16_to_utf32<std::wstring>(s), loc);
336
0
#endif
337
0
}
338
339
std::string narrow(const std::u32string& s, const std::locale &loc)
340
0
{
341
#ifdef TWO_BYTE_CHAR
342
  return do_narrow(utf32_to_utf16<std::wstring>(s), loc);
343
#else
344
0
  return do_narrow(s, loc);
345
0
#endif
346
0
}
347
348
std::string toUTF8(const std::wstring& s)
349
0
{
350
#ifdef TWO_BYTE_CHAR
351
  return utf32_to_utf8(utf16_to_utf32<std::u32string>(s));
352
#else
353
0
  return utf32_to_utf8(s);
354
0
#endif
355
0
}
356
357
std::string toUTF8(const std::u16string& s)
358
0
{
359
0
  return utf32_to_utf8(utf16_to_utf32<std::u32string>(s));
360
0
}
361
362
std::string toUTF8(const std::u32string& s)
363
0
{
364
0
  return utf32_to_utf8(s);
365
0
}
366
367
std::wstring fromUTF8(const std::string& s)
368
0
{
369
#ifdef TWO_BYTE_CHAR
370
  return utf32_to_utf16<std::wstring>(utf8_to_utf32<std::u32string>(s));
371
#else
372
0
  return utf8_to_utf32<std::wstring>(s);
373
0
#endif
374
0
}
375
376
std::u16string utf8ToUTF16(const std::string &s)
377
0
{
378
0
  return utf32_to_utf16<std::u16string>(utf8_to_utf32<std::u32string>(s));
379
0
}
380
381
std::u32string utf8ToUTF32(const std::string &s)
382
0
{
383
0
  return utf8_to_utf32<std::u32string>(s);
384
0
}
385
386
std::string fromUTF8(const std::string& s, const std::locale &loc)
387
0
{
388
0
  return narrow(fromUTF8(s), loc);
389
0
}
390
391
std::string toUTF8(const std::string& s, const std::locale &loc)
392
0
{
393
0
  return toUTF8(widen(s, loc));
394
0
}
395
396
std::u16string toUTF16(const std::wstring& s)
397
0
{
398
#ifdef TWO_BYTE_CHAR
399
  return std::u16string((const char16_t*)s.c_str());
400
#else
401
0
  return utf32_to_utf16<std::u16string>(s);
402
0
#endif
403
0
}
404
405
std::u16string toUTF16(const std::u32string& s)
406
0
{
407
0
  return utf32_to_utf16<std::u16string>(s);
408
0
}
409
410
/*
 * Converts the wide string s to UTF-32.
 *
 * With a 16-bit wchar_t the string is treated as UTF-16 and decoded;
 * otherwise the elements are copied one-to-one.
 *
 * The one-to-one branch copies the full length of s rather than
 * constructing from a C string, so an embedded NUL character no longer
 * truncates the result.
 */
std::u32string toUTF32(const std::wstring& s)
{
#ifdef TWO_BYTE_CHAR
  return utf16_to_utf32<std::u32string>(s);
#else
  return std::u32string(s.begin(), s.end());
#endif
}
418
419
std::u32string toUTF32(const std::u16string& s)
420
0
{
421
0
  return utf16_to_utf32<std::u32string>(s);
422
0
}
423
424
std::wostream& streamUTF8(std::wostream &os, const std::string &s)
425
0
{
426
#ifdef TWO_BYTE_CHAR
427
  os << utf32_to_utf16<std::wstring>(utf8_to_utf32<std::u32string>(s));
428
#else
429
0
  os << utf8_to_utf32<std::wstring>(s);
430
0
#endif
431
0
  return os;
432
0
}
433
434
/*
 * Returns a substring of the UTF-8 string s, where begin and length are
 * counted in characters rather than in bytes.
 *
 * begin is the index of the first character to include. length is the
 * number of characters to include, or -1 to take everything up to the end
 * of s. A begin beyond the end of s yields an empty string.
 *
 * Character boundaries are derived from the lead byte only; a byte that
 * is not a valid lead byte counts as a character of its own. A multibyte
 * sequence that is cut off at the end of s is clamped to the end of s, so
 * the byte position never runs past the string (which previously made
 * substr() throw std::out_of_range).
 */
std::string UTF8Substr(const std::string &s, int begin, int length)
{
  const std::size_t size = s.size();

  // Returns the byte position reached after skipping at most count
  // characters, starting at byte position from.
  auto skip = [&s, size](std::size_t from, int count) -> std::size_t {
    std::size_t pos = from;
    for (int i = 0; i < count && pos < size; ++i) {
      const unsigned char c = s[pos];
      std::size_t step = 1; // ASCII, or an invalid lead byte
      if ((c & 0xe0) == 0xc0) step = 2;
      else if ((c & 0xf0) == 0xe0) step = 3;
      else if ((c & 0xf8) == 0xf0) step = 4;
      pos = (size - pos < step) ? size : pos + step;
    }
    return pos;
  };

  // beginPos and endPos are byte positions in s
  const std::size_t beginPos = skip(0, begin);

  if (length == -1)
    return s.substr(beginPos, std::string::npos);

  const std::size_t endPos = skip(beginPos, length);
  return s.substr(beginPos, endPos - beginPos);
}
468
469
}