/src/neomutt/mutt/mbyte.c
Line | Count | Source (jump to first uncovered line) |
1 | | /** |
2 | | * @file |
3 | | * Multi-byte String manipulation functions |
4 | | * |
5 | | * @authors |
6 | | * Copyright (C) 2017-2023 Richard Russon <rich@flatcap.org> |
7 | | * Copyright (C) 2019 Pietro Cerutti <gahr@gahr.ch> |
8 | | * |
9 | | * @copyright |
10 | | * This program is free software: you can redistribute it and/or modify it under |
11 | | * the terms of the GNU General Public License as published by the Free Software |
12 | | * Foundation, either version 2 of the License, or (at your option) any later |
13 | | * version. |
14 | | * |
15 | | * This program is distributed in the hope that it will be useful, but WITHOUT |
16 | | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
17 | | * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
18 | | * details. |
19 | | * |
20 | | * You should have received a copy of the GNU General Public License along with |
21 | | * this program. If not, see <http://www.gnu.org/licenses/>. |
22 | | */ |
23 | | |
24 | | /** |
25 | | * @page mutt_mbyte Multi-byte String manipulation functions |
26 | | * |
27 | | * Some commonly-used multi-byte string manipulation routines. |
28 | | */ |
29 | | |
30 | | #include "config.h" |
31 | | #include <ctype.h> |
32 | | #include <limits.h> |
33 | | #include <stdbool.h> |
34 | | #include <string.h> |
35 | | #include <wchar.h> |
36 | | #include <wctype.h> |
37 | | #include "mbyte.h" |
38 | | #include "buffer.h" |
39 | | #include "charset.h" |
40 | | #include "memory.h" |
41 | | #include "pool.h" |
42 | | #include "string2.h" |
43 | | |
44 | | bool OptLocales; ///< (pseudo) Set if the user has a valid locale definition |
45 | | |
46 | | /** |
47 | | * mutt_mb_charlen - Count the bytes in a (multibyte) character |
48 | | * @param[in] s String to be examined |
49 | | * @param[out] width Number of screen columns the character would use |
50 | | * @retval num Bytes in the first (multibyte) character of input consumes |
51 | | * @retval <0 Conversion error |
52 | | * @retval =0 End of input |
53 | | * @retval >0 Length (bytes) |
54 | | */ |
55 | | int mutt_mb_charlen(const char *s, int *width) |
56 | 67.3k | { |
57 | 67.3k | if (!s || (*s == '\0')) |
58 | 0 | return 0; |
59 | | |
60 | 67.3k | wchar_t wc = 0; |
61 | 67.3k | mbstate_t mbstate = { 0 }; |
62 | | |
63 | 67.3k | size_t n = mutt_str_len(s); |
64 | 67.3k | size_t k = mbrtowc(&wc, s, n, &mbstate); |
65 | 67.3k | if (width) |
66 | 0 | *width = wcwidth(wc); |
67 | 67.3k | return ((k == ICONV_ILLEGAL_SEQ) || (k == ICONV_BUF_TOO_SMALL)) ? -1 : k; |
68 | 67.3k | } |
69 | | |
/**
 * mutt_mb_get_initials - Turn a name into initials
 * @param name   String to be converted
 * @param buf    Buffer for the result
 * @param buflen Size of the buffer
 * @retval true  Success
 * @retval false Failure: NULL argument, zero-sized buffer, conversion error,
 *               or the initials (plus terminator) don't fit in the buffer
 *
 * Take a name, e.g. "John F. Kennedy" and reduce it to initials "JFK".
 * The function saves the first character from each word.  Words are delimited
 * by whitespace, or hyphens (so "Jean-Pierre" becomes "JP").
 *
 * @note On failure, the contents of buf are not NUL-terminated by this function.
 */
bool mutt_mb_get_initials(const char *name, char *buf, size_t buflen)
{
  /* A zero-sized buffer can't even hold the terminating NUL */
  if (!name || !buf || (buflen == 0))
    return false;

  while (*name)
  {
    /* Char's length in bytes */
    int clen = mutt_mb_charlen(name, NULL);
    if (clen < 1)
      return false;

    /* Ignore punctuation at the beginning of a word.
     * The <ctype.h> functions need a value representable as unsigned char. */
    if ((clen == 1) && ispunct((unsigned char) *name))
    {
      name++;
      continue;
    }

    /* Always keep one byte spare for the terminating NUL */
    if ((size_t) clen >= buflen)
      return false;

    /* Copy one multibyte character */
    buflen -= clen;
    while (clen--)
      *buf++ = *name++;

    /* Skip to end-of-word.
     * clen is refreshed in the body before the increment uses it. */
    for (; *name; name += clen)
    {
      clen = mutt_mb_charlen(name, NULL);
      if (clen < 1)
        return false;
      if ((clen == 1) && (isspace((unsigned char) *name) || (*name == '-')))
        break;
    }

    /* Skip any whitespace, or hyphens */
    while (*name && (isspace((unsigned char) *name) || (*name == '-')))
      name++;
  }

  *buf = '\0';
  return true;
}
127 | | |
128 | | /** |
129 | | * mutt_mb_width - Measure a string's display width (in screen columns) |
130 | | * @param str String to measure |
131 | | * @param col Display column (used for expanding tabs) |
132 | | * @param indent If true, newline-space will be indented 8 chars |
133 | | * @retval num String's width in screen columns |
134 | | * |
135 | | * This is like wcwidth(), but gets const char* not wchar_t*. |
136 | | */ |
137 | | int mutt_mb_width(const char *str, int col, bool indent) |
138 | 0 | { |
139 | 0 | if (!str || !*str) |
140 | 0 | return 0; |
141 | | |
142 | 0 | bool nl = false; |
143 | 0 | int total_width = 0; |
144 | 0 | mbstate_t mbstate = { 0 }; |
145 | |
|
146 | 0 | size_t str_len = mutt_str_len(str); |
147 | |
|
148 | 0 | while (*str && (str_len > 0)) |
149 | 0 | { |
150 | 0 | wchar_t wc = L'\0'; |
151 | 0 | size_t consumed = mbrtowc(&wc, str, str_len, &mbstate); |
152 | 0 | if (consumed == 0) |
153 | 0 | break; |
154 | | |
155 | 0 | if (consumed == ICONV_ILLEGAL_SEQ) |
156 | 0 | { |
157 | 0 | memset(&mbstate, 0, sizeof(mbstate)); |
158 | 0 | wc = ReplacementChar; |
159 | 0 | consumed = 1; |
160 | 0 | } |
161 | 0 | else if (consumed == ICONV_BUF_TOO_SMALL) |
162 | 0 | { |
163 | 0 | wc = ReplacementChar; |
164 | 0 | consumed = str_len; |
165 | 0 | } |
166 | |
|
167 | 0 | int wchar_width = wcwidth(wc); |
168 | 0 | if (wchar_width < 0) |
169 | 0 | wchar_width = 1; |
170 | |
|
171 | 0 | if ((wc == L'\t') || (nl && (wc == L' '))) |
172 | 0 | { |
173 | | /* correctly calc tab stop, even for sending as the line should look |
174 | | * pretty on the receiving end */ |
175 | 0 | nl = false; |
176 | 0 | wchar_width = 8 - (col % 8); |
177 | 0 | } |
178 | 0 | else if (indent && (wc == '\n')) |
179 | 0 | { |
180 | | /* track newlines for display-case: if we have a space after a newline, |
181 | | * assume 8 spaces as for display we always tab-fold */ |
182 | 0 | nl = true; |
183 | 0 | } |
184 | |
|
185 | 0 | total_width += wchar_width; |
186 | 0 | str += consumed; |
187 | 0 | str_len -= consumed; |
188 | 0 | } |
189 | |
|
190 | 0 | return total_width; |
191 | 0 | } |
192 | | |
/**
 * mutt_mb_wcwidth - Measure the screen width of a character
 * @param wc Character to examine
 * @retval num Width in screen columns
 *
 * Printable characters with a positive wcwidth() use that width.
 * Everything else gets a fixed width chosen by code range: 2, 6 or 10.
 */
int mutt_mb_wcwidth(wchar_t wc)
{
  const int cols = wcwidth(wc);
  if (IsWPrint(wc) && (cols > 0))
    return cols;

  /* NOTE(review): 2/6/10 presumably match the length of an escaped rendering
   * of the character (short / 16-bit / wider) -- confirm against the caller */
  int fallback;
  if ((wc & ~0x7f) == 0)
    fallback = 2; /* fits in 7 bits */
  else if ((wc & ~0xffff) == 0)
    fallback = 6; /* fits in 16 bits */
  else
    fallback = 10; /* anything larger */

  return fallback;
}
209 | | |
/**
 * mutt_mb_wcswidth - Measure the screen width of a string
 * @param s String to measure
 * @param n Length of string in characters
 * @retval num Width in screen columns (0 if s is NULL)
 *
 * Sums mutt_mb_wcwidth() over exactly n characters; the string is not
 * checked for a terminator.
 */
int mutt_mb_wcswidth(const wchar_t *s, size_t n)
{
  if (!s)
    return 0;

  int total = 0;
  for (size_t idx = 0; idx < n; idx++)
    total += mutt_mb_wcwidth(s[idx]);

  return total;
}
226 | | |
/**
 * mutt_mb_width_ceiling - Keep the end of the string on-screen
 * @param s  String being displayed
 * @param n  Length of string in characters
 * @param w1 Width limit
 * @retval num Chars to skip
 *
 * Walks the string from its beginning, adding up mutt_mb_wcwidth() of each
 * character, and returns how many leading characters fit before the running
 * total would exceed w1.  Returns 0 if s is NULL.
 */
size_t mutt_mb_width_ceiling(const wchar_t *s, size_t n, int w1)
{
  if (!s)
    return 0;

  size_t count = 0;
  int used = 0;

  while (count < n)
  {
    used += mutt_mb_wcwidth(s[count]);
    if (used > w1)
      break; /* this character would overflow the limit; don't count it */
    count++;
  }

  return count;
}
249 | | |
250 | | /** |
251 | | * buf_mb_wcstombs - Convert a string from wide to multibyte characters |
252 | | * @param dest Buffer for the result |
253 | | * @param wstr Source wide string to convert |
254 | | * @param wlen Length of the wide string |
255 | | */ |
256 | | void buf_mb_wcstombs(struct Buffer *dest, const wchar_t *wstr, size_t wlen) |
257 | 0 | { |
258 | 0 | if (!dest || !wstr) |
259 | 0 | return; |
260 | | |
261 | | // Give ourselves 4 utf-8 bytes per wide character |
262 | 0 | buf_alloc(dest, 4 * wlen); |
263 | |
|
264 | 0 | mbstate_t mbstate = { 0 }; |
265 | 0 | size_t k = 0; |
266 | |
|
267 | 0 | char *buf = dest->data; |
268 | 0 | size_t buflen = dest->dsize; |
269 | |
|
270 | 0 | for (; (wlen > 0) && (buflen >= MB_LEN_MAX); buf += k, buflen -= k, wstr++, wlen--) |
271 | 0 | { |
272 | 0 | k = wcrtomb(buf, *wstr, &mbstate); |
273 | 0 | if (k == ICONV_ILLEGAL_SEQ) |
274 | 0 | break; |
275 | 0 | if (*wstr == L'\0') |
276 | 0 | break; |
277 | 0 | } |
278 | |
|
279 | 0 | *buf = '\0'; |
280 | 0 | buf_fix_dptr(dest); |
281 | 0 | } |
282 | | |
283 | | /** |
284 | | * mutt_mb_mbstowcs - Convert a string from multibyte to wide characters |
285 | | * @param[out] pwbuf Buffer for the result |
286 | | * @param[out] pwbuflen Length of the result buffer |
287 | | * @param[in] i Starting index into the result buffer |
288 | | * @param[in] buf String to convert |
289 | | * @retval num First character after the result |
290 | | */ |
291 | | size_t mutt_mb_mbstowcs(wchar_t **pwbuf, size_t *pwbuflen, size_t i, const char *buf) |
292 | 0 | { |
293 | 0 | if (!pwbuf || !pwbuflen || !buf) |
294 | 0 | return 0; |
295 | | |
296 | 0 | wchar_t wc = 0; |
297 | 0 | mbstate_t mbstate = { 0 }; |
298 | 0 | size_t k; |
299 | 0 | wchar_t *wbuf = *pwbuf; |
300 | 0 | size_t wbuflen = *pwbuflen; |
301 | |
|
302 | 0 | while (*buf != '\0') |
303 | 0 | { |
304 | 0 | memset(&mbstate, 0, sizeof(mbstate)); |
305 | 0 | for (; (k = mbrtowc(&wc, buf, MB_LEN_MAX, &mbstate)) && |
306 | 0 | (k != ICONV_ILLEGAL_SEQ) && (k != ICONV_BUF_TOO_SMALL); |
307 | 0 | buf += k) |
308 | 0 | { |
309 | 0 | if (i >= wbuflen) |
310 | 0 | { |
311 | 0 | wbuflen = i + 20; |
312 | 0 | MUTT_MEM_REALLOC(&wbuf, wbuflen, wchar_t); |
313 | 0 | } |
314 | 0 | wbuf[i++] = wc; |
315 | 0 | } |
316 | 0 | if ((*buf != '\0') && ((k == ICONV_ILLEGAL_SEQ) || (k == ICONV_BUF_TOO_SMALL))) |
317 | 0 | { |
318 | 0 | if (i >= wbuflen) |
319 | 0 | { |
320 | 0 | wbuflen = i + 20; |
321 | 0 | MUTT_MEM_REALLOC(&wbuf, wbuflen, wchar_t); |
322 | 0 | } |
323 | 0 | wbuf[i++] = ReplacementChar; |
324 | 0 | buf++; |
325 | 0 | } |
326 | 0 | } |
327 | 0 | *pwbuf = wbuf; |
328 | 0 | *pwbuflen = wbuflen; |
329 | 0 | return i; |
330 | 0 | } |
331 | | |
/**
 * mutt_mb_is_shell_char - Is character not typically part of a pathname
 * @param ch Character to examine
 * @retval true  Character is not typically part of a pathname
 * @retval false Character is typically part of a pathname
 *
 * The characters tested for are: `<>&()$?*;{}|` and space.
 *
 * @note The name is very confusing.
 * @note The lookup uses wcschr(), which also matches the string's terminator,
 *       so a NUL character is reported as true.
 */
bool mutt_mb_is_shell_char(wchar_t ch)
{
  /* '!' is not included because it can be part of a pathname in NeoMutt */
  static const wchar_t non_path_chars[] = L"<>&()$?*;{}| ";

  const wchar_t *hit = wcschr(non_path_chars, ch);
  return (hit != NULL);
}
345 | | |
346 | | /** |
347 | | * mutt_mb_is_lower - Does a multi-byte string contain only lowercase characters? |
348 | | * @param s String to check |
349 | | * @retval true String contains no uppercase characters |
350 | | * @retval false Error, or contains some uppercase characters |
351 | | * |
352 | | * Non-alphabetic characters are considered lowercase. |
353 | | */ |
354 | | bool mutt_mb_is_lower(const char *s) |
355 | 48.1k | { |
356 | 48.1k | if (!s) |
357 | 0 | return false; |
358 | | |
359 | 48.1k | wchar_t wc = 0; |
360 | 48.1k | mbstate_t mbstate = { 0 }; |
361 | 48.1k | size_t l; |
362 | | |
363 | 48.1k | memset(&mbstate, 0, sizeof(mbstate)); |
364 | 48.1k | size_t n = mutt_str_len(s); |
365 | | |
366 | 865k | for (; (n > 0) && (*s != '\0') && (l = mbrtowc(&wc, s, n, &mbstate)) != 0; s += l, n -= l) |
367 | 827k | { |
368 | 827k | if ((l == ICONV_BUF_TOO_SMALL) || (l == ICONV_ILLEGAL_SEQ)) |
369 | 0 | return false; // error; assume upper-case |
370 | 827k | if (iswalpha((wint_t) wc) && iswupper((wint_t) wc)) |
371 | 9.62k | return false; // upper-case |
372 | 827k | } |
373 | | |
374 | 38.4k | return true; // lower-case |
375 | 48.1k | } |
376 | | |
/**
 * mutt_mb_is_display_corrupting_utf8 - Will this character corrupt the display?
 * @param wc Character to examine
 * @retval true  Character would corrupt the display
 * @retval false Character is safe to display
 *
 * Matches the soft hyphen, the zero width no-break space and a set of
 * bidirectional-text control characters.
 *
 * @note This list isn't complete.
 */
bool mutt_mb_is_display_corrupting_utf8(wchar_t wc)
{
  /* Stand-alone code points */
  switch (wc)
  {
    case (wchar_t) 0x00ad: /* soft hyphen */
    case (wchar_t) 0x061c: /* arabic letter mark */
    case (wchar_t) 0x200e: /* left-to-right mark */
    case (wchar_t) 0x200f: /* right-to-left mark */
    case (wchar_t) 0xfeff: /* zero width no-break space */
      return true;
    default:
      break;
  }

  /* left-to-right embedding, right-to-left embedding, pop directional formatting,
   * left-to-right override, right-to-left override */
  const bool embed_or_override = (wc >= (wchar_t) 0x202a) && (wc <= (wchar_t) 0x202e);

  /* left-to-right isolate, right-to-left isolate, first strong isolate,
   * pop directional isolate */
  const bool isolate = (wc >= (wchar_t) 0x2066) && (wc <= (wchar_t) 0x2069);

  return embed_or_override || isolate;
}
411 | | |
412 | | /** |
413 | | * mutt_mb_filter_unprintable - Replace unprintable characters |
414 | | * @param[in,out] s String to modify |
415 | | * @retval 0 Success |
416 | | * @retval -1 Error |
417 | | * |
418 | | * Unprintable characters will be replaced with #ReplacementChar. |
419 | | * |
420 | | * @note The source string will be freed and a newly allocated string will be |
421 | | * returned in its place. The caller should free the returned string. |
422 | | */ |
423 | | int mutt_mb_filter_unprintable(char **s) |
424 | 6.43k | { |
425 | 6.43k | if (!s || !*s) |
426 | 0 | return -1; |
427 | | |
428 | 6.43k | wchar_t wc = 0; |
429 | 6.43k | size_t k, k2; |
430 | 6.43k | char scratch[MB_LEN_MAX + 1]; |
431 | 6.43k | char *p = *s; |
432 | 6.43k | mbstate_t mbstate1 = { 0 }; |
433 | 6.43k | mbstate_t mbstate2 = { 0 }; |
434 | | |
435 | 6.43k | struct Buffer *buf = buf_pool_get(); |
436 | 747k | for (; (k = mbrtowc(&wc, p, MB_LEN_MAX, &mbstate1)); p += k) |
437 | 741k | { |
438 | 741k | if ((k == ICONV_ILLEGAL_SEQ) || (k == ICONV_BUF_TOO_SMALL)) |
439 | 284k | { |
440 | 284k | k = 1; |
441 | 284k | memset(&mbstate1, 0, sizeof(mbstate1)); |
442 | 284k | wc = ReplacementChar; |
443 | 284k | } |
444 | 741k | if (CharsetIsUtf8 && IsBOM(wc)) |
445 | 0 | { |
446 | 0 | continue; |
447 | 0 | } |
448 | 741k | if (!IsWPrint(wc)) |
449 | 10.7k | wc = '?'; |
450 | 730k | else if (CharsetIsUtf8 && mutt_mb_is_display_corrupting_utf8(wc)) |
451 | 0 | continue; |
452 | 741k | k2 = wcrtomb(scratch, wc, &mbstate2); |
453 | 741k | scratch[k2] = '\0'; |
454 | 741k | buf_addstr(buf, scratch); |
455 | 741k | } |
456 | 6.43k | FREE(s); |
457 | | |
458 | 6.43k | if (buf_is_empty(buf)) |
459 | 2.43k | *s = MUTT_MEM_CALLOC(1, char); // Fake empty string |
460 | 4.00k | else |
461 | 4.00k | *s = buf_strdup(buf); |
462 | | |
463 | 6.43k | buf_pool_release(&buf); |
464 | 6.43k | return 0; |
465 | 6.43k | } |