Coverage Report

Created: 2025-09-04 07:15

/src/mpv/misc/charset_conv.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * This file is part of mpv.
3
 *
4
 * Based on code taken from libass (ISC license), which was originally part
5
 * of MPlayer (GPL).
6
 * Copyright (C) 2006 Evgeniy Stepanov <eugeni.stepanov@gmail.com>
7
 *
8
 * mpv is free software; you can redistribute it and/or
9
 * modify it under the terms of the GNU Lesser General Public
10
 * License as published by the Free Software Foundation; either
11
 * version 2.1 of the License, or (at your option) any later version.
12
 *
13
 * mpv is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
 * GNU Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public
19
 * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
20
 */
21
22
#include <stdlib.h>
23
#include <errno.h>
24
#include <assert.h>
25
26
#include "config.h"
27
28
#include "common/common.h"
29
#include "common/msg.h"
30
31
#if HAVE_UCHARDET
32
#include <uchardet.h>
33
#endif
34
35
#if HAVE_ICONV
36
#include <iconv.h>
37
#endif
38
39
#include "charset_conv.h"
40
41
// Returns true if user_cp names UTF-8, spelled either "utf8" or "utf-8"
// (compared without regard to case). A NULL user_cp yields false.
bool mp_charset_is_utf8(const char *user_cp)
{
    if (!user_cp)
        return false;
    if (strcasecmp(user_cp, "utf8") == 0)
        return true;
    return strcasecmp(user_cp, "utf-8") == 0;
}
46
47
bool mp_charset_is_utf16(const char *user_cp)
48
6.52k
{
49
6.52k
    bstr s = bstr0(user_cp);
50
6.52k
    return bstr_case_startswith(s, bstr0("utf16")) ||
51
6.52k
           bstr_case_startswith(s, bstr0("utf-16"));
52
6.52k
}
53
54
static const char *const utf_bom[3] = {"\xEF\xBB\xBF", "\xFF\xFE", "\xFE\xFF"};
55
static const char *const utf_enc[3] = {"utf-8",        "utf-16le", "utf-16be"};
56
57
static const char *ms_bom_guess(bstr buf)
58
1.32M
{
59
5.30M
    for (int n = 0; n < 3; n++) {
60
3.97M
        if (bstr_startswith0(buf, utf_bom[n]))
61
5.75k
            return utf_enc[n];
62
3.97M
    }
63
1.32M
    return NULL;
64
1.32M
}
65
66
#if HAVE_UCHARDET
67
// Asks libuchardet to guess the charset of buf.
// Returns the detected charset name, allocated under talloc_ctx, or NULL if
// the detector could not be created, rejected the data, reported an empty
// name, or named a charset that iconv cannot open for conversion to UTF-8.
static const char *mp_uchardet(void *talloc_ctx, struct mp_log *log, bstr buf)
{
    uchardet_t det = uchardet_new();
    if (!det)
        return NULL;

    char *res = NULL;
    if (uchardet_handle_data(det, buf.start, buf.len) != 0)
        goto done;
    uchardet_data_end(det);

    // Vet the name before duplicating it, so that a rejected candidate is
    // never left allocated under talloc_ctx.
    const char *name = uchardet_get_charset(det);
    if (!name || !name[0])
        goto done;

    mp_verbose(log, "libuchardet detected charset as %s\n", name);

    // Probe iconv only to learn whether it knows this charset; the descriptor
    // itself is not needed.
    iconv_t icdsc = iconv_open("UTF-8", name);
    if (icdsc == (iconv_t)(-1)) {
        mp_warn(log, "Charset '%s' not supported by iconv.\n", name);
        goto done;
    }
    iconv_close(icdsc);

    // Duplicate before uchardet_delete(), since the string comes from the
    // detector.
    res = talloc_strdup(talloc_ctx, name);

done:
    uchardet_delete(det);
    return res;
}
93
#endif
94
95
// Runs charset auto-detection on the input buffer, and returns the result.
96
// If auto-detection fails, NULL is returned.
97
// If user_cp doesn't refer to any known auto-detection (for example because
98
// it's a real iconv codepage), user_cp is returned without even looking at
99
// the buf data.
100
// The return value may (but doesn't have to) be allocated under talloc_ctx.
101
const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log,  bstr buf,
102
                             const char *user_cp, int flags)
103
1.32M
{
104
1.32M
    if (user_cp[0] == '+') {
105
0
        mp_verbose(log, "Forcing charset '%s'.\n", user_cp + 1);
106
0
        return user_cp + 1;
107
0
    }
108
109
1.32M
    const char *bom_cp = ms_bom_guess(buf);
110
1.32M
    if (bom_cp) {
111
5.75k
        mp_verbose(log, "Data has a BOM, assuming %s as charset.\n", bom_cp);
112
5.75k
        return bom_cp;
113
5.75k
    }
114
115
1.32M
    int r = bstr_validate_utf8(buf);
116
1.32M
    if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF))) {
117
906k
        if (strcmp(user_cp, "auto") != 0 && !mp_charset_is_utf8(user_cp))
118
12
            mp_verbose(log, "Data looks like UTF-8, ignoring user-provided charset.\n");
119
906k
        return "utf-8";
120
906k
    }
121
122
416k
    const char *res = NULL;
123
416k
    if (strcasecmp(user_cp, "auto") == 0) {
124
416k
#if HAVE_UCHARDET
125
416k
        res = mp_uchardet(talloc_ctx, log, buf);
126
416k
#endif
127
416k
        if (!res) {
128
55.0k
            mp_verbose(log, "Charset auto-detection failed.\n");
129
55.0k
            res = "UTF-8-BROKEN";
130
55.0k
        }
131
416k
    } else {
132
0
        res = user_cp;
133
0
    }
134
135
416k
    mp_verbose(log, "Using charset '%s'.\n", res);
136
416k
    return res;
137
1.32M
}
138
139
// Use iconv to convert buf to UTF-8.
140
// Returns buf.start==NULL on error. Returns buf if cp is NULL, or if there is
141
// obviously no conversion required (e.g. if cp is "UTF-8").
142
// Returns a newly allocated buffer if conversion is done and succeeds. The
143
// buffer will be terminated with 0 for convenience (the terminating 0 is not
144
// included in the returned length).
145
// Free the returned buffer with talloc_free().
146
//  buf: input data
147
//  cp: iconv codepage (or NULL)
148
//  flags: combination of MP_ICONV_* flags
149
//  returns: buf (no conversion), .start==NULL (error), or allocated buffer
150
bstr mp_iconv_to_utf8(struct mp_log *log, bstr buf, const char *cp, int flags)
151
525k
{
152
525k
#if HAVE_ICONV
153
525k
    if (!buf.len)
154
15.6k
        return buf;
155
156
509k
    if (!cp || !cp[0] || mp_charset_is_utf8(cp))
157
0
        return buf;
158
159
509k
    if (strcasecmp(cp, "ASCII") == 0)
160
0
        return buf;
161
162
509k
    if (strcasecmp(cp, "UTF-8-BROKEN") == 0)
163
62.6k
        return bstr_sanitize_utf8_latin1(NULL, buf);
164
165
    // Force CP949 over EUC-KR since iconv distinguishes them and
166
    // EUC-KR causes error on CP949 encoded data
167
447k
    if (strcasecmp(cp, "EUC-KR") == 0)
168
0
        cp = "CP949";
169
170
447k
    iconv_t icdsc;
171
447k
    if ((icdsc = iconv_open("UTF-8", cp)) == (iconv_t) (-1)) {
172
71.9k
        if (flags & MP_ICONV_VERBOSE)
173
0
            mp_err(log, "Error opening iconv with codepage '%s'\n", cp);
174
71.9k
        goto failure;
175
71.9k
    }
176
177
375k
    size_t size = buf.len;
178
375k
    size_t osize = size;
179
375k
    size_t ileft = size;
180
375k
    size_t oleft = size - 1;
181
182
375k
    char *outbuf = talloc_size(NULL, osize);
183
375k
    char *ip = buf.start;
184
375k
    char *op = outbuf;
185
186
1.19M
    while (1) {
187
1.19M
        int clear = 0;
188
1.19M
        size_t rc;
189
1.19M
        if (ileft)
190
843k
            rc = iconv(icdsc, &ip, &ileft, &op, &oleft);
191
352k
        else {
192
352k
            clear = 1; // clear the conversion state and leave
193
352k
            rc = iconv(icdsc, NULL, NULL, &op, &oleft);
194
352k
        }
195
1.19M
        if (rc == (size_t) (-1)) {
196
492k
            if (errno == E2BIG) {
197
469k
                size_t offset = op - outbuf;
198
469k
                outbuf = talloc_realloc_size(NULL, outbuf, osize + size);
199
469k
                op = outbuf + offset;
200
469k
                osize += size;
201
469k
                oleft += size;
202
469k
            } else {
203
23.4k
                if (errno == EINVAL && (flags & MP_ICONV_ALLOW_CUTOFF)) {
204
                    // This is intended for cases where the input buffer is cut
205
                    // at a random byte position. If this happens in the middle
206
                    // of the buffer, it should still be an error. We say it's
207
                    // fine if the error is within 10 bytes of the end.
208
0
                    if (ileft <= 10)
209
0
                        break;
210
0
                }
211
23.4k
                if (flags & MP_ICONV_VERBOSE) {
212
4.20k
                    mp_err(log, "Error recoding text with codepage '%s'\n", cp);
213
4.20k
                }
214
23.4k
                talloc_free(outbuf);
215
23.4k
                iconv_close(icdsc);
216
23.4k
                goto failure;
217
23.4k
            }
218
703k
        } else if (clear)
219
351k
            break;
220
1.19M
    }
221
222
351k
    iconv_close(icdsc);
223
224
351k
    outbuf[osize - oleft - 1] = 0;
225
351k
    return (bstr){outbuf, osize - oleft - 1};
226
227
95.3k
failure:
228
95.3k
#endif
229
230
95.3k
    if (flags & MP_NO_LATIN1_FALLBACK) {
231
71.9k
        return buf;
232
71.9k
    } else {
233
23.4k
        return bstr_sanitize_utf8_latin1(NULL, buf);
234
23.4k
    }
235
95.3k
}