/src/mpv/misc/charset_conv.c

Source (jump to first uncovered line)
/*
 * This file is part of mpv.
 *
 * Based on code taken from libass (ISC license), which was originally part
 * of MPlayer (GPL).
 * Copyright (C) 2006 Evgeniy Stepanov <eugeni.stepanov@gmail.com>
 *
 * mpv is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * mpv is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <stdlib.h>
#include <errno.h>
#include <assert.h>

#include "config.h"

#include "common/common.h"
#include "common/msg.h"

#if HAVE_UCHARDET
#include <uchardet.h>
#endif

#if HAVE_ICONV
#include <iconv.h>
#endif

#include "charset_conv.h"

// Returns true if user_cp names the UTF-8 charset, spelled either "utf8" or
// "utf-8" (compared case-insensitively). A NULL user_cp yields false.
bool mp_charset_is_utf8(const char *user_cp)
{
    if (!user_cp)
        return false;
    if (strcasecmp(user_cp, "utf8") == 0)
        return true;
    return strcasecmp(user_cp, "utf-8") == 0;
}

// Returns true if user_cp begins with "utf16" or "utf-16" (compared
// case-insensitively), so suffixed names such as "utf-16le" match as well.
bool mp_charset_is_utf16(const char *user_cp)
{
    bstr name = bstr0(user_cp);
    if (bstr_case_startswith(name, bstr0("utf16")))
        return true;
    return bstr_case_startswith(name, bstr0("utf-16"));
}

// Byte-order marks and the charset each one implies. Entries at the same
// index in both tables belong together.
static const char *const utf_bom[3] = {"\xEF\xBB\xBF", "\xFF\xFE", "\xFE\xFF"};
static const char *const utf_enc[3] = {"utf-8",        "utf-16le", "utf-16be"};

// Returns the charset name implied by a BOM at the start of buf, or NULL if
// buf does not begin with any of the BOMs listed in utf_bom.
static const char *ms_bom_guess(bstr buf)
{
    const size_t count = sizeof(utf_bom) / sizeof(utf_bom[0]);
    size_t idx = 0;
    while (idx < count) {
        if (bstr_startswith0(buf, utf_bom[idx]))
            return utf_enc[idx];
        idx++;
    }
    return NULL;
}

#if HAVE_UCHARDET
// Runs libuchardet over buf and returns the detected charset name, copied
// with talloc_strdup() under talloc_ctx.
// Returns NULL if the detector cannot be created, if feeding it the data
// fails, if the reported charset name is empty, or if iconv cannot open a
// conversion from the reported charset to UTF-8.
static const char *mp_uchardet(void *talloc_ctx, struct mp_log *log, bstr buf)
{
    uchardet_t detector = uchardet_new();
    if (!detector)
        return NULL;

    char *charset = NULL;
    if (uchardet_handle_data(detector, buf.start, buf.len) == 0) {
        uchardet_data_end(detector);
        charset = talloc_strdup(talloc_ctx, uchardet_get_charset(detector));
        // An empty name is treated the same as "nothing detected".
        if (charset && !charset[0])
            charset = NULL;
    }

    if (charset) {
        mp_verbose(log, "libuchardet detected charset as %s\n", charset);
        // Probe iconv so that only charsets it can actually open are reported.
        iconv_t probe = iconv_open("UTF-8", charset);
        if (probe != (iconv_t)(-1)) {
            iconv_close(probe);
        } else {
            mp_warn(log, "Charset '%s' not supported by iconv.\n", charset);
            charset = NULL;
        }
    }

    uchardet_delete(detector);
    return charset;
}
#endif

// Decides which charset should be used to decode buf. The checks are applied
// in this order, and the first one that matches determines the result:
//  1. If user_cp starts with '+', the name following the '+' is forced and
//     returned without looking at the buf data.
//  2. If buf starts with a UTF-8 or UTF-16 BOM, the matching charset name is
//     returned.
//  3. If bstr_validate_utf8() accepts buf, "utf-8" is returned regardless of
//     user_cp. With MP_ICONV_ALLOW_CUTOFF set, results greater than -8 are
//     accepted too (presumably these indicate a sequence cut off at the end
//     of the buffer; confirm against bstr_validate_utf8()).
//  4. If user_cp is "auto" (case-insensitive), libuchardet is consulted when
//     compiled in. If no charset can be detected, "UTF-8-BROKEN" is returned.
//  5. Otherwise user_cp itself is returned.
// This function never returns NULL. user_cp must not be NULL.
// The return value may (but doesn't have to) be allocated under talloc_ctx.
const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf,
                             const char *user_cp, int flags)
{
    if (user_cp[0] == '+') {
        mp_verbose(log, "Forcing charset '%s'.\n", user_cp + 1);
        return user_cp + 1;
    }

    const char *bom_cp = ms_bom_guess(buf);
    if (bom_cp) {
        mp_verbose(log, "Data has a BOM, assuming %s as charset.\n", bom_cp);
        return bom_cp;
    }

    // Evaluate the "auto" keyword once, case-insensitively, so that the
    // logging decision below and the detection branch further down always
    // agree on whether the user asked for auto-detection.
    bool want_auto = strcasecmp(user_cp, "auto") == 0;

    int r = bstr_validate_utf8(buf);
    if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF))) {
        if (!want_auto && !mp_charset_is_utf8(user_cp))
            mp_verbose(log, "Data looks like UTF-8, ignoring user-provided charset.\n");
        return "utf-8";
    }

    const char *res = NULL;
    if (want_auto) {
#if HAVE_UCHARDET
        res = mp_uchardet(talloc_ctx, log, buf);
#endif
        if (!res) {
            mp_verbose(log, "Charset auto-detection failed.\n");
            res = "UTF-8-BROKEN";
        }
    } else {
        res = user_cp;
    }

    mp_verbose(log, "Using charset '%s'.\n", res);
    return res;
}

// Convert buf to UTF-8 using iconv.
//  buf: input data
//  cp: iconv codepage (or NULL)
//  flags: combination of MP_ICONV_* flags
// Possible results:
//  - buf itself, if no conversion is needed: buf is empty, cp is NULL or
//    empty, or cp names UTF-8 or ASCII.
//  - the result of bstr_sanitize_utf8_latin1(NULL, buf), if cp is
//    "UTF-8-BROKEN".
//  - a newly allocated buffer, if the conversion is done and succeeds. The
//    buffer is terminated with 0 for convenience (the terminating 0 is not
//    included in the returned length). Free it with talloc_free().
//  - if iconv cannot be opened or the conversion fails (or iconv is not
//    compiled in): buf itself when MP_NO_LATIN1_FALLBACK is set, otherwise
//    the result of bstr_sanitize_utf8_latin1(NULL, buf).
bstr mp_iconv_to_utf8(struct mp_log *log, bstr buf, const char *cp, int flags)
{
#if HAVE_ICONV
    // Cases in which the input can be handed back untouched.
    if (!buf.len || !cp || !cp[0] || mp_charset_is_utf8(cp) ||
        strcasecmp(cp, "ASCII") == 0)
        return buf;

    if (strcasecmp(cp, "UTF-8-BROKEN") == 0)
        return bstr_sanitize_utf8_latin1(NULL, buf);

    // Force CP949 over EUC-KR since iconv distinguishes them and
    // EUC-KR causes error on CP949 encoded data
    const char *codepage = strcasecmp(cp, "EUC-KR") == 0 ? "CP949" : cp;

    iconv_t cd = iconv_open("UTF-8", codepage);
    if (cd == (iconv_t)(-1)) {
        if (flags & MP_ICONV_VERBOSE)
            mp_err(log, "Error opening iconv with codepage '%s'\n", codepage);
        goto failure;
    }

    // The output buffer starts at the input size and grows by one input
    // size each time iconv reports that it ran out of space.
    const size_t chunk = buf.len;
    size_t out_cap = chunk;
    size_t in_left = chunk;
    size_t out_left = chunk - 1; // one byte stays reserved for the final 0

    char *out = talloc_size(NULL, out_cap);
    char *in_pos = buf.start;
    char *out_pos = out;

    for (;;) {
        // Once all input is consumed, one more call with a NULL input
        // clears the conversion state; after it succeeds we are done.
        bool flushing = !in_left;
        size_t rc = flushing
            ? iconv(cd, NULL, NULL, &out_pos, &out_left)
            : iconv(cd, &in_pos, &in_left, &out_pos, &out_left);

        if (rc != (size_t)(-1)) {
            if (flushing)
                break;
            continue;
        }

        if (errno == E2BIG) {
            // Output buffer exhausted: enlarge it and retry.
            size_t used = out_pos - out;
            out = talloc_realloc_size(NULL, out, out_cap + chunk);
            out_pos = out + used;
            out_cap += chunk;
            out_left += chunk;
            continue;
        }

        // This is intended for cases where the input buffer is cut at a
        // random byte position. If this happens in the middle of the
        // buffer, it should still be an error. We say it's fine if the
        // error is within 10 bytes of the end.
        if (errno == EINVAL && (flags & MP_ICONV_ALLOW_CUTOFF) && in_left <= 10)
            break;

        if (flags & MP_ICONV_VERBOSE)
            mp_err(log, "Error recoding text with codepage '%s'\n", codepage);
        talloc_free(out);
        iconv_close(cd);
        goto failure;
    }

    iconv_close(cd);

    size_t out_len = out_cap - out_left - 1;
    out[out_len] = 0;
    return (bstr){out, out_len};

failure:
#endif

    if (flags & MP_NO_LATIN1_FALLBACK)
        return buf;
    return bstr_sanitize_utf8_latin1(NULL, buf);
}

Coverage Report

Created: 2025-09-04 07:15

Line	Count	Source (jump to first uncovered line)
1		/*
2		* This file is part of mpv.
3		*
4		* Based on code taken from libass (ISC license), which was originally part
5		* of MPlayer (GPL).
6		* Copyright (C) 2006 Evgeniy Stepanov <eugeni.stepanov@gmail.com>
7		*
8		* mpv is free software; you can redistribute it and/or
9		* modify it under the terms of the GNU Lesser General Public
10		* License as published by the Free Software Foundation; either
11		* version 2.1 of the License, or (at your option) any later version.
12		*
13		* mpv is distributed in the hope that it will be useful,
14		* but WITHOUT ANY WARRANTY; without even the implied warranty of
15		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16		* GNU Lesser General Public License for more details.
17		*
18		* You should have received a copy of the GNU Lesser General Public
19		* License along with mpv. If not, see <http://www.gnu.org/licenses/>.
20		*/
21
22		#include <stdlib.h>
23		#include <errno.h>
24		#include <assert.h>
25
26		#include "config.h"
27
28		#include "common/common.h"
29		#include "common/msg.h"
30
31		#if HAVE_UCHARDET
32		#include <uchardet.h>
33		#endif
34
35		#if HAVE_ICONV
36		#include <iconv.h>
37		#endif
38
39		#include "charset_conv.h"
40
41		bool mp_charset_is_utf8(const char *user_cp)
42	1.98M	{
43	1.98M	return user_cp && (strcasecmp(user_cp, "utf8") == 0 ||
44	1.98M	strcasecmp(user_cp, "utf-8") == 0);
45	1.98M	}
46
47		bool mp_charset_is_utf16(const char *user_cp)
48	6.52k	{
49	6.52k	bstr s = bstr0(user_cp);
50	6.52k	return bstr_case_startswith(s, bstr0("utf16")) ||
51	6.52k	bstr_case_startswith(s, bstr0("utf-16"));
52	6.52k	}
53
54		static const char *const utf_bom[3] = {"\xEF\xBB\xBF", "\xFF\xFE", "\xFE\xFF"};
55		static const char *const utf_enc[3] = {"utf-8", "utf-16le", "utf-16be"};
56
57		static const char *ms_bom_guess(bstr buf)
58	1.32M	{
59	5.30M	for (int n = 0; n < 3; n++) {
60	3.97M	if (bstr_startswith0(buf, utf_bom[n]))
61	5.75k	return utf_enc[n];
62	3.97M	}
63	1.32M	return NULL;
64	1.32M	}
65
66		#if HAVE_UCHARDET
67		static const char *mp_uchardet(void *talloc_ctx, struct mp_log *log, bstr buf)
68	416k	{
69	416k	uchardet_t det = uchardet_new();
70	416k	if (!det)
71	0	return NULL;
72	416k	if (uchardet_handle_data(det, buf.start, buf.len) != 0) {
73	0	uchardet_delete(det);
74	0	return NULL;
75	0	}
76	416k	uchardet_data_end(det);
77	416k	char *res = talloc_strdup(talloc_ctx, uchardet_get_charset(det));
78	416k	if (res && !res[0])
79	55.0k	res = NULL;
80	416k	if (res) {
81	360k	mp_verbose(log, "libuchardet detected charset as %s\n", res);
82	360k	iconv_t icdsc = iconv_open("UTF-8", res);
83	360k	if (icdsc == (iconv_t)(-1)) {
84	0	mp_warn(log, "Charset '%s' not supported by iconv.\n", res);
85	0	res = NULL;
86	360k	} else {
87	360k	iconv_close(icdsc);
88	360k	}
89	360k	}
90	416k	uchardet_delete(det);
91	416k	return res;
92	416k	}
93		#endif
94
95		// Runs charset auto-detection on the input buffer, and returns the result.
96		// If auto-detection fails, NULL is returned.
97		// If user_cp doesn't refer to any known auto-detection (for example because
98		// it's a real iconv codepage), user_cp is returned without even looking at
99		// the buf data.
100		// The return value may (but doesn't have to) be allocated under talloc_ctx.
101		const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf,
102		const char *user_cp, int flags)
103	1.32M	{
104	1.32M	if (user_cp[0] == '+') {
105	0	mp_verbose(log, "Forcing charset '%s'.\n", user_cp + 1);
106	0	return user_cp + 1;
107	0	}
108
109	1.32M	const char *bom_cp = ms_bom_guess(buf);
110	1.32M	if (bom_cp) {
111	5.75k	mp_verbose(log, "Data has a BOM, assuming %s as charset.\n", bom_cp);
112	5.75k	return bom_cp;
113	5.75k	}
114
115	1.32M	int r = bstr_validate_utf8(buf);
116	1.32M	if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF))) {
117	906k	if (strcmp(user_cp, "auto") != 0 && !mp_charset_is_utf8(user_cp))
118	12	mp_verbose(log, "Data looks like UTF-8, ignoring user-provided charset.\n");
119	906k	return "utf-8";
120	906k	}
121
122	416k	const char *res = NULL;
123	416k	if (strcasecmp(user_cp, "auto") == 0) {
124	416k	#if HAVE_UCHARDET
125	416k	res = mp_uchardet(talloc_ctx, log, buf);
126	416k	#endif
127	416k	if (!res) {
128	55.0k	mp_verbose(log, "Charset auto-detection failed.\n");
129	55.0k	res = "UTF-8-BROKEN";
130	55.0k	}
131	416k	} else {
132	0	res = user_cp;
133	0	}
134
135	416k	mp_verbose(log, "Using charset '%s'.\n", res);
136	416k	return res;
137	1.32M	}
138
139		// Use iconv to convert buf to UTF-8.
140		// Returns buf.start==NULL on error. Returns buf if cp is NULL, or if there is
141		// obviously no conversion required (e.g. if cp is "UTF-8").
142		// Returns a newly allocated buffer if conversion is done and succeeds. The
143		// buffer will be terminated with 0 for convenience (the terminating 0 is not
144		// included in the returned length).
145		// Free the returned buffer with talloc_free().
146		// buf: input data
147		// cp: iconv codepage (or NULL)
148		// flags: combination of MP_ICONV_* flags
149		// returns: buf (no conversion), .start==NULL (error), or allocated buffer
150		bstr mp_iconv_to_utf8(struct mp_log *log, bstr buf, const char *cp, int flags)
151	525k	{
152	525k	#if HAVE_ICONV
153	525k	if (!buf.len)
154	15.6k	return buf;
155
156	509k	if (!cp || !cp[0] || mp_charset_is_utf8(cp))
157	0	return buf;
158
159	509k	if (strcasecmp(cp, "ASCII") == 0)
160	0	return buf;
161
162	509k	if (strcasecmp(cp, "UTF-8-BROKEN") == 0)
163	62.6k	return bstr_sanitize_utf8_latin1(NULL, buf);
164
165		// Force CP949 over EUC-KR since iconv distinguishes them and
166		// EUC-KR causes error on CP949 encoded data
167	447k	if (strcasecmp(cp, "EUC-KR") == 0)
168	0	cp = "CP949";
169
170	447k	iconv_t icdsc;
171	447k	if ((icdsc = iconv_open("UTF-8", cp)) == (iconv_t) (-1)) {
172	71.9k	if (flags & MP_ICONV_VERBOSE)
173	0	mp_err(log, "Error opening iconv with codepage '%s'\n", cp);
174	71.9k	goto failure;
175	71.9k	}
176
177	375k	size_t size = buf.len;
178	375k	size_t osize = size;
179	375k	size_t ileft = size;
180	375k	size_t oleft = size - 1;
181
182	375k	char *outbuf = talloc_size(NULL, osize);
183	375k	char *ip = buf.start;
184	375k	char *op = outbuf;
185
186	1.19M	while (1) {
187	1.19M	int clear = 0;
188	1.19M	size_t rc;
189	1.19M	if (ileft)
190	843k	rc = iconv(icdsc, &ip, &ileft, &op, &oleft);
191	352k	else {
192	352k	clear = 1; // clear the conversion state and leave
193	352k	rc = iconv(icdsc, NULL, NULL, &op, &oleft);
194	352k	}
195	1.19M	if (rc == (size_t) (-1)) {
196	492k	if (errno == E2BIG) {
197	469k	size_t offset = op - outbuf;
198	469k	outbuf = talloc_realloc_size(NULL, outbuf, osize + size);
199	469k	op = outbuf + offset;
200	469k	osize += size;
201	469k	oleft += size;
202	469k	} else {
203	23.4k	if (errno == EINVAL && (flags & MP_ICONV_ALLOW_CUTOFF)) {
204		// This is intended for cases where the input buffer is cut
205		// at a random byte position. If this happens in the middle
206		// of the buffer, it should still be an error. We say it's
207		// fine if the error is within 10 bytes of the end.
208	0	if (ileft <= 10)
209	0	break;
210	0	}
211	23.4k	if (flags & MP_ICONV_VERBOSE) {
212	4.20k	mp_err(log, "Error recoding text with codepage '%s'\n", cp);
213	4.20k	}
214	23.4k	talloc_free(outbuf);
215	23.4k	iconv_close(icdsc);
216	23.4k	goto failure;
217	23.4k	}
218	703k	} else if (clear)
219	351k	break;
220	1.19M	}
221
222	351k	iconv_close(icdsc);
223
224	351k	outbuf[osize - oleft - 1] = 0;
225	351k	return (bstr){outbuf, osize - oleft - 1};
226
227	95.3k	failure:
228	95.3k	#endif
229
230	95.3k	if (flags & MP_NO_LATIN1_FALLBACK) {
231	71.9k	return buf;
232	71.9k	} else {
233	23.4k	return bstr_sanitize_utf8_latin1(NULL, buf);
234	23.4k	}
235	95.3k	}