/src/mpv/misc/charset_conv.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * This file is part of mpv. |
3 | | * |
4 | | * Based on code taken from libass (ISC license), which was originally part |
5 | | * of MPlayer (GPL). |
6 | | * Copyright (C) 2006 Evgeniy Stepanov <eugeni.stepanov@gmail.com> |
7 | | * |
8 | | * mpv is free software; you can redistribute it and/or |
9 | | * modify it under the terms of the GNU Lesser General Public |
10 | | * License as published by the Free Software Foundation; either |
11 | | * version 2.1 of the License, or (at your option) any later version. |
12 | | * |
13 | | * mpv is distributed in the hope that it will be useful, |
14 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | | * GNU Lesser General Public License for more details. |
17 | | * |
18 | | * You should have received a copy of the GNU Lesser General Public |
19 | | * License along with mpv. If not, see <http://www.gnu.org/licenses/>. |
20 | | */ |
21 | | |
22 | | #include <stdlib.h> |
23 | | #include <errno.h> |
24 | | #include <assert.h> |
25 | | |
26 | | #include "config.h" |
27 | | |
28 | | #include "common/common.h" |
29 | | #include "common/msg.h" |
30 | | |
31 | | #if HAVE_UCHARDET |
32 | | #include <uchardet.h> |
33 | | #endif |
34 | | |
35 | | #if HAVE_ICONV |
36 | | #include <iconv.h> |
37 | | #endif |
38 | | |
39 | | #include "charset_conv.h" |
40 | | |
// Returns true if user_cp is "utf8" or "utf-8", compared without regard to
// case. A NULL user_cp yields false.
bool mp_charset_is_utf8(const char *user_cp)
{
    if (!user_cp)
        return false;
    if (strcasecmp(user_cp, "utf8") == 0)
        return true;
    return strcasecmp(user_cp, "utf-8") == 0;
}
46 | | |
47 | | bool mp_charset_is_utf16(const char *user_cp) |
48 | 6.52k | { |
49 | 6.52k | bstr s = bstr0(user_cp); |
50 | 6.52k | return bstr_case_startswith(s, bstr0("utf16")) || |
51 | 6.52k | bstr_case_startswith(s, bstr0("utf-16")); |
52 | 6.52k | } |
53 | | |
54 | | static const char *const utf_bom[3] = {"\xEF\xBB\xBF", "\xFF\xFE", "\xFE\xFF"}; |
55 | | static const char *const utf_enc[3] = {"utf-8", "utf-16le", "utf-16be"}; |
56 | | |
57 | | static const char *ms_bom_guess(bstr buf) |
58 | 1.32M | { |
59 | 5.30M | for (int n = 0; n < 3; n++) { |
60 | 3.97M | if (bstr_startswith0(buf, utf_bom[n])) |
61 | 5.75k | return utf_enc[n]; |
62 | 3.97M | } |
63 | 1.32M | return NULL; |
64 | 1.32M | } |
65 | | |
#if HAVE_UCHARDET
// Runs libuchardet over buf and returns the detected charset name.
// Returns NULL if the detector cannot be created, rejects the data, reports
// an empty name, or reports a name that iconv_open() cannot convert to UTF-8.
// A non-NULL result is a copy allocated under talloc_ctx.
static const char *mp_uchardet(void *talloc_ctx, struct mp_log *log, bstr buf)
{
    uchardet_t det = uchardet_new();
    if (!det)
        return NULL;
    if (uchardet_handle_data(det, buf.start, buf.len) != 0) {
        uchardet_delete(det);
        return NULL;
    }
    uchardet_data_end(det);

    char *res = NULL;
    // The name is obtained from the detector and is only used while det is
    // still alive; it is copied below before uchardet_delete().
    const char *detected = uchardet_get_charset(det);
    if (detected && detected[0]) {
        mp_verbose(log, "libuchardet detected charset as %s\n", detected);
        // Probe iconv so that callers never get a name they cannot convert.
        iconv_t icdsc = iconv_open("UTF-8", detected);
        if (icdsc == (iconv_t)(-1)) {
            mp_warn(log, "Charset '%s' not supported by iconv.\n", detected);
        } else {
            iconv_close(icdsc);
            // Copy only after validation, so rejected or empty names leave
            // no unused allocation behind under talloc_ctx.
            res = talloc_strdup(talloc_ctx, detected);
        }
    }
    uchardet_delete(det);
    return res;
}
#endif
94 | | |
95 | | // Runs charset auto-detection on the input buffer, and returns the result. |
96 | | // If auto-detection fails, NULL is returned. |
97 | | // If user_cp doesn't refer to any known auto-detection (for example because |
98 | | // it's a real iconv codepage), user_cp is returned without even looking at |
99 | | // the buf data. |
100 | | // The return value may (but doesn't have to) be allocated under talloc_ctx. |
101 | | const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf, |
102 | | const char *user_cp, int flags) |
103 | 1.32M | { |
104 | 1.32M | if (user_cp[0] == '+') { |
105 | 0 | mp_verbose(log, "Forcing charset '%s'.\n", user_cp + 1); |
106 | 0 | return user_cp + 1; |
107 | 0 | } |
108 | | |
109 | 1.32M | const char *bom_cp = ms_bom_guess(buf); |
110 | 1.32M | if (bom_cp) { |
111 | 5.75k | mp_verbose(log, "Data has a BOM, assuming %s as charset.\n", bom_cp); |
112 | 5.75k | return bom_cp; |
113 | 5.75k | } |
114 | | |
115 | 1.32M | int r = bstr_validate_utf8(buf); |
116 | 1.32M | if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF))) { |
117 | 906k | if (strcmp(user_cp, "auto") != 0 && !mp_charset_is_utf8(user_cp)) |
118 | 12 | mp_verbose(log, "Data looks like UTF-8, ignoring user-provided charset.\n"); |
119 | 906k | return "utf-8"; |
120 | 906k | } |
121 | | |
122 | 416k | const char *res = NULL; |
123 | 416k | if (strcasecmp(user_cp, "auto") == 0) { |
124 | 416k | #if HAVE_UCHARDET |
125 | 416k | res = mp_uchardet(talloc_ctx, log, buf); |
126 | 416k | #endif |
127 | 416k | if (!res) { |
128 | 55.0k | mp_verbose(log, "Charset auto-detection failed.\n"); |
129 | 55.0k | res = "UTF-8-BROKEN"; |
130 | 55.0k | } |
131 | 416k | } else { |
132 | 0 | res = user_cp; |
133 | 0 | } |
134 | | |
135 | 416k | mp_verbose(log, "Using charset '%s'.\n", res); |
136 | 416k | return res; |
137 | 1.32M | } |
138 | | |
// Use iconv to convert buf to UTF-8.
// Returns buf itself (no conversion, nothing allocated here) if buf is empty,
// if cp is NULL or empty, or if cp names UTF-8 or ASCII.
// If cp is "UTF-8-BROKEN", returns bstr_sanitize_utf8_latin1(NULL, buf).
// Returns a newly allocated buffer if conversion is done and succeeds. The
// buffer will be terminated with 0 for convenience (the terminating 0 is not
// included in the returned length). It is allocated with talloc_size(NULL, ...),
// so free it with talloc_free().
// If iconv cannot be opened for cp, or the conversion fails (or iconv support
// is not compiled in), the result is buf itself when MP_NO_LATIN1_FALLBACK is
// set, and bstr_sanitize_utf8_latin1(NULL, buf) otherwise.
// NOTE(review): an earlier version of this comment said .start==NULL is
// returned on error; the code in this function does not produce that itself.
//  buf: input data
//  cp: iconv codepage (or NULL)
//  flags: combination of MP_ICONV_* flags
bstr mp_iconv_to_utf8(struct mp_log *log, bstr buf, const char *cp, int flags)
{
#if HAVE_ICONV
    if (!buf.len)
        return buf;

    if (!cp || !cp[0] || mp_charset_is_utf8(cp))
        return buf;

    if (strcasecmp(cp, "ASCII") == 0)
        return buf;

    // Pseudo-charset returned by mp_charset_guess() when detection fails.
    if (strcasecmp(cp, "UTF-8-BROKEN") == 0)
        return bstr_sanitize_utf8_latin1(NULL, buf);

    // Force CP949 over EUC-KR since iconv distinguishes them and
    // EUC-KR causes error on CP949 encoded data
    if (strcasecmp(cp, "EUC-KR") == 0)
        cp = "CP949";

    iconv_t icdsc;
    if ((icdsc = iconv_open("UTF-8", cp)) == (iconv_t) (-1)) {
        if (flags & MP_ICONV_VERBOSE)
            mp_err(log, "Error opening iconv with codepage '%s'\n", cp);
        goto failure;
    }

    // Start with an output buffer as large as the input. oleft is one less
    // than the buffer size so a byte always remains for the terminating 0.
    size_t size = buf.len;
    size_t osize = size;
    size_t ileft = size;
    size_t oleft = size - 1;

    char *outbuf = talloc_size(NULL, osize);
    char *ip = buf.start;
    char *op = outbuf;

    while (1) {
        int clear = 0;
        size_t rc;
        if (ileft)
            rc = iconv(icdsc, &ip, &ileft, &op, &oleft);
        else {
            clear = 1; // clear the conversion state and leave
            rc = iconv(icdsc, NULL, NULL, &op, &oleft);
        }
        if (rc == (size_t) (-1)) {
            if (errno == E2BIG) {
                // Output buffer full: grow it by another `size` bytes and
                // retry. op is recomputed because the realloc may move outbuf.
                size_t offset = op - outbuf;
                outbuf = talloc_realloc_size(NULL, outbuf, osize + size);
                op = outbuf + offset;
                osize += size;
                oleft += size;
            } else {
                if (errno == EINVAL && (flags & MP_ICONV_ALLOW_CUTOFF)) {
                    // This is intended for cases where the input buffer is cut
                    // at a random byte position. If this happens in the middle
                    // of the buffer, it should still be an error. We say it's
                    // fine if the error is within 10 bytes of the end.
                    if (ileft <= 10)
                        break;
                }
                if (flags & MP_ICONV_VERBOSE) {
                    mp_err(log, "Error recoding text with codepage '%s'\n", cp);
                }
                talloc_free(outbuf);
                iconv_close(icdsc);
                goto failure;
            }
        } else if (clear)
            break;
    }

    iconv_close(icdsc);

    // osize - oleft - 1 is the number of bytes written so far (oleft was
    // started one short of the buffer size), so this index is in bounds.
    outbuf[osize - oleft - 1] = 0;
    return (bstr){outbuf, osize - oleft - 1};

failure:
#endif

    // Reached on any iconv failure above, and unconditionally when iconv
    // support is not compiled in.
    if (flags & MP_NO_LATIN1_FALLBACK) {
        return buf;
    } else {
        return bstr_sanitize_utf8_latin1(NULL, buf);
    }
}