Coverage Report

Created: 2025-03-11 06:49

/src/neomutt/mutt/mbyte.c
Line
Count
Source (jump to first uncovered line)
1
/**
2
 * @file
3
 * Multi-byte String manipulation functions
4
 *
5
 * @authors
6
 * Copyright (C) 2017-2023 Richard Russon <rich@flatcap.org>
7
 * Copyright (C) 2019 Pietro Cerutti <gahr@gahr.ch>
8
 *
9
 * @copyright
10
 * This program is free software: you can redistribute it and/or modify it under
11
 * the terms of the GNU General Public License as published by the Free Software
12
 * Foundation, either version 2 of the License, or (at your option) any later
13
 * version.
14
 *
15
 * This program is distributed in the hope that it will be useful, but WITHOUT
16
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17
 * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
18
 * details.
19
 *
20
 * You should have received a copy of the GNU General Public License along with
21
 * this program.  If not, see <http://www.gnu.org/licenses/>.
22
 */
23
24
/**
25
 * @page mutt_mbyte Multi-byte String manipulation functions
26
 *
27
 * Some commonly-used multi-byte string manipulation routines.
28
 */
29
30
#include "config.h"
31
#include <ctype.h>
32
#include <limits.h>
33
#include <stdbool.h>
34
#include <string.h>
35
#include <wchar.h>
36
#include <wctype.h>
37
#include "mbyte.h"
38
#include "buffer.h"
39
#include "charset.h"
40
#include "memory.h"
41
#include "pool.h"
42
#include "string2.h"
43
44
bool OptLocales; ///< (pseudo) Set if the user has a valid locale definition
45
46
/**
47
 * mutt_mb_charlen - Count the bytes in a (multibyte) character
48
 * @param[in]  s     String to be examined
49
 * @param[out] width Number of screen columns the character would use
50
 * @retval num Bytes in the first (multibyte) character of input consumes
51
 * @retval <0  Conversion error
52
 * @retval =0  End of input
53
 * @retval >0  Length (bytes)
54
 */
55
int mutt_mb_charlen(const char *s, int *width)
56
67.3k
{
57
67.3k
  if (!s || (*s == '\0'))
58
0
    return 0;
59
60
67.3k
  wchar_t wc = 0;
61
67.3k
  mbstate_t mbstate = { 0 };
62
63
67.3k
  size_t n = mutt_str_len(s);
64
67.3k
  size_t k = mbrtowc(&wc, s, n, &mbstate);
65
67.3k
  if (width)
66
0
    *width = wcwidth(wc);
67
67.3k
  return ((k == ICONV_ILLEGAL_SEQ) || (k == ICONV_BUF_TOO_SMALL)) ? -1 : k;
68
67.3k
}
69
70
/**
 * mutt_mb_get_initials - Turn a name into initials
 * @param name   String to be converted
 * @param buf    Buffer for the result
 * @param buflen Size of the buffer
 * @retval true  Success
 * @retval false Failure (bad arguments, conversion error, or buffer too small)
 *
 * Take a name, e.g. "John F. Kennedy" and reduce it to initials "JFK".
 * The function saves the first character from each word.  Words are delimited
 * by whitespace, or hyphens (so "Jean-Pierre" becomes "JP").
 */
bool mutt_mb_get_initials(const char *name, char *buf, size_t buflen)
{
  /* A zero-sized buffer cannot even hold the terminating NUL */
  if (!name || !buf || (buflen == 0))
    return false;

  while (*name)
  {
    /* Char's length in bytes */
    int clen = mutt_mb_charlen(name, NULL);
    if (clen < 1)
      return false;

    /* Ignore punctuation at the beginning of a word.
     * The <ctype.h> functions need a value representable as unsigned char,
     * so cast to avoid passing a negative char. */
    if ((clen == 1) && ispunct((unsigned char) *name))
    {
      name++;
      continue;
    }

    /* Always keep one byte spare for the terminating NUL */
    if ((size_t) clen >= buflen)
      return false;

    /* Copy one multibyte character */
    buflen -= clen;
    while (clen--)
      *buf++ = *name++;

    /* Skip to end-of-word (clen is re-read before it is used to advance) */
    for (; *name; name += clen)
    {
      clen = mutt_mb_charlen(name, NULL);
      if (clen < 1)
        return false;
      if ((clen == 1) && (isspace((unsigned char) *name) || (*name == '-')))
        break;
    }

    /* Skip any whitespace, or hyphens */
    while (*name && (isspace((unsigned char) *name) || (*name == '-')))
      name++;
  }

  *buf = '\0';
  return true;
}
127
128
/**
129
 * mutt_mb_width - Measure a string's display width (in screen columns)
130
 * @param str     String to measure
131
 * @param col     Display column (used for expanding tabs)
132
 * @param indent  If true, newline-space will be indented 8 chars
133
 * @retval num String's width in screen columns
134
 *
135
 * This is like wcwidth(), but gets const char* not wchar_t*.
136
 */
137
int mutt_mb_width(const char *str, int col, bool indent)
138
0
{
139
0
  if (!str || !*str)
140
0
    return 0;
141
142
0
  bool nl = false;
143
0
  int total_width = 0;
144
0
  mbstate_t mbstate = { 0 };
145
146
0
  size_t str_len = mutt_str_len(str);
147
148
0
  while (*str && (str_len > 0))
149
0
  {
150
0
    wchar_t wc = L'\0';
151
0
    size_t consumed = mbrtowc(&wc, str, str_len, &mbstate);
152
0
    if (consumed == 0)
153
0
      break;
154
155
0
    if (consumed == ICONV_ILLEGAL_SEQ)
156
0
    {
157
0
      memset(&mbstate, 0, sizeof(mbstate));
158
0
      wc = ReplacementChar;
159
0
      consumed = 1;
160
0
    }
161
0
    else if (consumed == ICONV_BUF_TOO_SMALL)
162
0
    {
163
0
      wc = ReplacementChar;
164
0
      consumed = str_len;
165
0
    }
166
167
0
    int wchar_width = wcwidth(wc);
168
0
    if (wchar_width < 0)
169
0
      wchar_width = 1;
170
171
0
    if ((wc == L'\t') || (nl && (wc == L' ')))
172
0
    {
173
      /* correctly calc tab stop, even for sending as the line should look
174
       * pretty on the receiving end */
175
0
      nl = false;
176
0
      wchar_width = 8 - (col % 8);
177
0
    }
178
0
    else if (indent && (wc == '\n'))
179
0
    {
180
      /* track newlines for display-case: if we have a space after a newline,
181
       * assume 8 spaces as for display we always tab-fold */
182
0
      nl = true;
183
0
    }
184
185
0
    total_width += wchar_width;
186
0
    str += consumed;
187
0
    str_len -= consumed;
188
0
  }
189
190
0
  return total_width;
191
0
}
192
193
/**
 * mutt_mb_wcwidth - Measure the screen width of a character
 * @param wc Character to examine
 * @retval num Width in screen columns
 *
 * Printable characters with a positive wcwidth() report that width.
 * Anything else reports a fixed width chosen by the size of the code point:
 * 2 (fits in 7 bits), 6 (fits in 16 bits) or 10 (everything else).
 * NOTE(review): these presumably match the width of an escaped rendering of
 * the character - confirm against the display code.
 */
int mutt_mb_wcwidth(wchar_t wc)
{
  const int cols = wcwidth(wc);
  if (IsWPrint(wc) && (cols > 0))
    return cols;

  if ((wc & ~0x7f) == 0)
    return 2;

  if ((wc & ~0xffff) == 0)
    return 6;

  return 10;
}
209
210
/**
 * mutt_mb_wcswidth - Measure the screen width of a string
 * @param s String to measure
 * @param n Length of string in characters
 * @retval num Width in screen columns (0 if s is NULL)
 *
 * Sums mutt_mb_wcwidth() over the first n wide characters of s.
 */
int mutt_mb_wcswidth(const wchar_t *s, size_t n)
{
  if (!s)
    return 0;

  int total = 0;
  for (size_t idx = 0; idx < n; idx++)
    total += mutt_mb_wcwidth(s[idx]);

  return total;
}
226
227
/**
 * mutt_mb_width_ceiling - Keep the end of the string on-screen
 * @param s  String being displayed
 * @param n  Length of string in characters
 * @param w1 Width limit
 * @retval num Chars to skip
 *
 * Walk the string from the start, adding up the width of each character, and
 * stop at the first character that pushes the running total past w1.  The
 * number of characters before that point is returned (n if the limit is never
 * exceeded, 0 if s is NULL).
 */
size_t mutt_mb_width_ceiling(const wchar_t *s, size_t n, int w1)
{
  if (!s)
    return 0;

  size_t idx = 0;
  int used = 0;

  while (idx < n)
  {
    used += mutt_mb_wcwidth(s[idx]);
    if (used > w1)
      break;
    idx++;
  }

  return idx;
}
249
250
/**
251
 * buf_mb_wcstombs - Convert a string from wide to multibyte characters
252
 * @param dest Buffer for the result
253
 * @param wstr Source wide string to convert
254
 * @param wlen Length of the wide string
255
 */
256
void buf_mb_wcstombs(struct Buffer *dest, const wchar_t *wstr, size_t wlen)
257
0
{
258
0
  if (!dest || !wstr)
259
0
    return;
260
261
  // Give ourselves 4 utf-8 bytes per wide character
262
0
  buf_alloc(dest, 4 * wlen);
263
264
0
  mbstate_t mbstate = { 0 };
265
0
  size_t k = 0;
266
267
0
  char *buf = dest->data;
268
0
  size_t buflen = dest->dsize;
269
270
0
  for (; (wlen > 0) && (buflen >= MB_LEN_MAX); buf += k, buflen -= k, wstr++, wlen--)
271
0
  {
272
0
    k = wcrtomb(buf, *wstr, &mbstate);
273
0
    if (k == ICONV_ILLEGAL_SEQ)
274
0
      break;
275
0
    if (*wstr == L'\0')
276
0
      break;
277
0
  }
278
279
0
  *buf = '\0';
280
0
  buf_fix_dptr(dest);
281
0
}
282
283
/**
284
 * mutt_mb_mbstowcs - Convert a string from multibyte to wide characters
285
 * @param[out] pwbuf    Buffer for the result
286
 * @param[out] pwbuflen Length of the result buffer
287
 * @param[in]  i        Starting index into the result buffer
288
 * @param[in]  buf      String to convert
289
 * @retval num First character after the result
290
 */
291
size_t mutt_mb_mbstowcs(wchar_t **pwbuf, size_t *pwbuflen, size_t i, const char *buf)
292
0
{
293
0
  if (!pwbuf || !pwbuflen || !buf)
294
0
    return 0;
295
296
0
  wchar_t wc = 0;
297
0
  mbstate_t mbstate = { 0 };
298
0
  size_t k;
299
0
  wchar_t *wbuf = *pwbuf;
300
0
  size_t wbuflen = *pwbuflen;
301
302
0
  while (*buf != '\0')
303
0
  {
304
0
    memset(&mbstate, 0, sizeof(mbstate));
305
0
    for (; (k = mbrtowc(&wc, buf, MB_LEN_MAX, &mbstate)) &&
306
0
           (k != ICONV_ILLEGAL_SEQ) && (k != ICONV_BUF_TOO_SMALL);
307
0
         buf += k)
308
0
    {
309
0
      if (i >= wbuflen)
310
0
      {
311
0
        wbuflen = i + 20;
312
0
        MUTT_MEM_REALLOC(&wbuf, wbuflen, wchar_t);
313
0
      }
314
0
      wbuf[i++] = wc;
315
0
    }
316
0
    if ((*buf != '\0') && ((k == ICONV_ILLEGAL_SEQ) || (k == ICONV_BUF_TOO_SMALL)))
317
0
    {
318
0
      if (i >= wbuflen)
319
0
      {
320
0
        wbuflen = i + 20;
321
0
        MUTT_MEM_REALLOC(&wbuf, wbuflen, wchar_t);
322
0
      }
323
0
      wbuf[i++] = ReplacementChar;
324
0
      buf++;
325
0
    }
326
0
  }
327
0
  *pwbuf = wbuf;
328
0
  *pwbuflen = wbuflen;
329
0
  return i;
330
0
}
331
332
/**
 * mutt_mb_is_shell_char - Is character not typically part of a pathname
 * @param ch Character to examine
 * @retval true  Character is not typically part of a pathname
 * @retval false Character is typically part of a pathname
 *
 * @note The name is very confusing.
 * @note The lookup uses wcschr(), which also matches the terminating NUL of
 *       the list, so L'\0' is reported as true.
 */
bool mutt_mb_is_shell_char(wchar_t ch)
{
  /* ! not included because it can be part of a pathname in NeoMutt */
  static const wchar_t non_path_chars[] = L"<>&()$?*;{}| ";

  const wchar_t *hit = wcschr(non_path_chars, ch);
  return hit != NULL;
}
345
346
/**
347
 * mutt_mb_is_lower - Does a multi-byte string contain only lowercase characters?
348
 * @param s String to check
349
 * @retval true  String contains no uppercase characters
350
 * @retval false Error, or contains some uppercase characters
351
 *
352
 * Non-alphabetic characters are considered lowercase.
353
 */
354
bool mutt_mb_is_lower(const char *s)
355
48.1k
{
356
48.1k
  if (!s)
357
0
    return false;
358
359
48.1k
  wchar_t wc = 0;
360
48.1k
  mbstate_t mbstate = { 0 };
361
48.1k
  size_t l;
362
363
48.1k
  memset(&mbstate, 0, sizeof(mbstate));
364
48.1k
  size_t n = mutt_str_len(s);
365
366
865k
  for (; (n > 0) && (*s != '\0') && (l = mbrtowc(&wc, s, n, &mbstate)) != 0; s += l, n -= l)
367
827k
  {
368
827k
    if ((l == ICONV_BUF_TOO_SMALL) || (l == ICONV_ILLEGAL_SEQ))
369
0
      return false; // error; assume upper-case
370
827k
    if (iswalpha((wint_t) wc) && iswupper((wint_t) wc))
371
9.62k
      return false; // upper-case
372
827k
  }
373
374
38.4k
  return true; // lower-case
375
48.1k
}
376
377
/**
 * mutt_mb_is_display_corrupting_utf8 - Will this character corrupt the display?
 * @param wc Character to examine
 * @retval true  Character would corrupt the display
 * @retval false Character is safe to display
 *
 * @note This list isn't complete.
 */
bool mutt_mb_is_display_corrupting_utf8(wchar_t wc)
{
  /* Inclusive code point ranges that are treated as display-corrupting */
  static const struct
  {
    wchar_t first;
    wchar_t last;
  } corrupting[] = {
    { (wchar_t) 0x00ad, (wchar_t) 0x00ad }, /* soft hyphen */
    { (wchar_t) 0x061c, (wchar_t) 0x061c }, /* arabic letter mark */
    { (wchar_t) 0x200e, (wchar_t) 0x200f }, /* left-to-right mark, right-to-left mark */
    { (wchar_t) 0x202a, (wchar_t) 0x202e }, /* left-to-right embedding, right-to-left
                                             * embedding, pop directional formatting,
                                             * left-to-right override, right-to-left
                                             * override */
    { (wchar_t) 0x2066, (wchar_t) 0x2069 }, /* left-to-right isolate, right-to-left
                                             * isolate, first strong isolate, pop
                                             * directional isolate */
    { (wchar_t) 0xfeff, (wchar_t) 0xfeff }, /* zero width no-break space */
  };

  for (size_t r = 0; r < (sizeof(corrupting) / sizeof(corrupting[0])); r++)
  {
    if ((wc >= corrupting[r].first) && (wc <= corrupting[r].last))
      return true;
  }

  return false;
}
411
412
/**
413
 * mutt_mb_filter_unprintable - Replace unprintable characters
414
 * @param[in,out] s String to modify
415
 * @retval  0 Success
416
 * @retval -1 Error
417
 *
418
 * Unprintable characters will be replaced with #ReplacementChar.
419
 *
420
 * @note The source string will be freed and a newly allocated string will be
421
 * returned in its place.  The caller should free the returned string.
422
 */
423
int mutt_mb_filter_unprintable(char **s)
424
6.43k
{
425
6.43k
  if (!s || !*s)
426
0
    return -1;
427
428
6.43k
  wchar_t wc = 0;
429
6.43k
  size_t k, k2;
430
6.43k
  char scratch[MB_LEN_MAX + 1];
431
6.43k
  char *p = *s;
432
6.43k
  mbstate_t mbstate1 = { 0 };
433
6.43k
  mbstate_t mbstate2 = { 0 };
434
435
6.43k
  struct Buffer *buf = buf_pool_get();
436
747k
  for (; (k = mbrtowc(&wc, p, MB_LEN_MAX, &mbstate1)); p += k)
437
741k
  {
438
741k
    if ((k == ICONV_ILLEGAL_SEQ) || (k == ICONV_BUF_TOO_SMALL))
439
284k
    {
440
284k
      k = 1;
441
284k
      memset(&mbstate1, 0, sizeof(mbstate1));
442
284k
      wc = ReplacementChar;
443
284k
    }
444
741k
    if (CharsetIsUtf8 && IsBOM(wc))
445
0
    {
446
0
      continue;
447
0
    }
448
741k
    if (!IsWPrint(wc))
449
10.7k
      wc = '?';
450
730k
    else if (CharsetIsUtf8 && mutt_mb_is_display_corrupting_utf8(wc))
451
0
      continue;
452
741k
    k2 = wcrtomb(scratch, wc, &mbstate2);
453
741k
    scratch[k2] = '\0';
454
741k
    buf_addstr(buf, scratch);
455
741k
  }
456
6.43k
  FREE(s);
457
458
6.43k
  if (buf_is_empty(buf))
459
2.43k
    *s = MUTT_MEM_CALLOC(1, char); // Fake empty string
460
4.00k
  else
461
4.00k
    *s = buf_strdup(buf);
462
463
6.43k
  buf_pool_release(&buf);
464
6.43k
  return 0;
465
6.43k
}