Line | Count | Source |
1 | | /* |
2 | | * This file is part of mpv. |
3 | | * |
4 | | * mpv is free software; you can redistribute it and/or |
5 | | * modify it under the terms of the GNU Lesser General Public |
6 | | * License as published by the Free Software Foundation; either |
7 | | * version 2.1 of the License, or (at your option) any later version. |
8 | | * |
9 | | * mpv is distributed in the hope that it will be useful, |
10 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 | | * GNU Lesser General Public License for more details. |
13 | | * |
14 | | * You should have received a copy of the GNU Lesser General Public |
15 | | * License along with mpv. If not, see <http://www.gnu.org/licenses/>. |
16 | | */ |
17 | | |
18 | | #include "language.h" |
19 | | |
20 | | #include <limits.h> |
21 | | #include <stdint.h> |
22 | | |
23 | | #include "common/common.h" |
24 | | #include "misc/ctype.h" |
25 | | |
26 | | #define L(s) { #s, sizeof(#s) - 1 } |
27 | | |
28 | | static const struct lang { |
29 | | struct { const char s[3] MP_NONSTRING; uint8_t l; } match; |
30 | | struct { const char s[3] MP_NONSTRING; uint8_t l; } canonical; |
31 | | } langmap[] = { |
32 | | {L(aa), L(aar)}, |
33 | | {L(ab), L(abk)}, |
34 | | {L(ae), L(ave)}, |
35 | | {L(af), L(afr)}, |
36 | | {L(ak), L(aka)}, |
37 | | {L(am), L(amh)}, |
38 | | {L(an), L(arg)}, |
39 | | {L(ar), L(ara)}, |
40 | | {L(as), L(asm)}, |
41 | | {L(av), L(ava)}, |
42 | | {L(ay), L(aym)}, |
43 | | {L(az), L(aze)}, |
44 | | {L(ba), L(bak)}, |
45 | | {L(be), L(bel)}, |
46 | | {L(bg), L(bul)}, |
47 | | {L(bh), L(bih)}, |
48 | | {L(bi), L(bis)}, |
49 | | {L(bm), L(bam)}, |
50 | | {L(bn), L(ben)}, |
51 | | {L(bo), L(tib)}, |
52 | | {L(bod), L(tib)}, |
53 | | {L(br), L(bre)}, |
54 | | {L(bs), L(bos)}, |
55 | | {L(ca), L(cat)}, |
56 | | {L(ce), L(che)}, |
57 | | {L(ces), L(cze)}, |
58 | | {L(ch), L(cha)}, |
59 | | {L(co), L(cos)}, |
60 | | {L(cr), L(cre)}, |
61 | | {L(cs), L(cze)}, |
62 | | {L(cu), L(chu)}, |
63 | | {L(cv), L(chv)}, |
64 | | {L(cy), L(wel)}, |
65 | | {L(cym), L(wel)}, |
66 | | {L(da), L(dan)}, |
67 | | {L(de), L(ger)}, |
68 | | {L(deu), L(ger)}, |
69 | | {L(dv), L(div)}, |
70 | | {L(dz), L(dzo)}, |
71 | | {L(ee), L(ewe)}, |
72 | | {L(el), L(gre)}, |
73 | | {L(ell), L(gre)}, |
74 | | {L(en), L(eng)}, |
75 | | {L(eo), L(epo)}, |
76 | | {L(es), L(spa)}, |
77 | | {L(et), L(est)}, |
78 | | {L(eu), L(baq)}, |
79 | | {L(eus), L(baq)}, |
80 | | {L(fa), L(per)}, |
81 | | {L(fas), L(per)}, |
82 | | {L(ff), L(ful)}, |
83 | | {L(fi), L(fin)}, |
84 | | {L(fj), L(fij)}, |
85 | | {L(fo), L(fao)}, |
86 | | {L(fr), L(fre)}, |
87 | | {L(fra), L(fre)}, |
88 | | {L(fy), L(fry)}, |
89 | | {L(ga), L(gle)}, |
90 | | {L(gd), L(gla)}, |
91 | | {L(gl), L(glg)}, |
92 | | {L(gn), L(grn)}, |
93 | | {L(gu), L(guj)}, |
94 | | {L(gv), L(glv)}, |
95 | | {L(ha), L(hau)}, |
96 | | {L(he), L(heb)}, |
97 | | {L(hi), L(hin)}, |
98 | | {L(ho), L(hmo)}, |
99 | | {L(hr), L(hrv)}, |
100 | | {L(ht), L(hat)}, |
101 | | {L(hu), L(hun)}, |
102 | | {L(hy), L(arm)}, |
103 | | {L(hye), L(arm)}, |
104 | | {L(hz), L(her)}, |
105 | | {L(ia), L(ina)}, |
106 | | {L(id), L(ind)}, |
107 | | {L(ie), L(ile)}, |
108 | | {L(ig), L(ibo)}, |
109 | | {L(ii), L(iii)}, |
110 | | {L(ik), L(ipk)}, |
111 | | {L(io), L(ido)}, |
112 | | {L(is), L(ice)}, |
113 | | {L(isl), L(ice)}, |
114 | | {L(it), L(ita)}, |
115 | | {L(iu), L(iku)}, |
116 | | {L(ja), L(jpn)}, |
117 | | {L(jv), L(jav)}, |
118 | | {L(ka), L(geo)}, |
119 | | {L(kat), L(geo)}, |
120 | | {L(kg), L(kon)}, |
121 | | {L(ki), L(kik)}, |
122 | | {L(kj), L(kua)}, |
123 | | {L(kk), L(kaz)}, |
124 | | {L(kl), L(kal)}, |
125 | | {L(km), L(khm)}, |
126 | | {L(kn), L(kan)}, |
127 | | {L(ko), L(kor)}, |
128 | | {L(kr), L(kau)}, |
129 | | {L(ks), L(kas)}, |
130 | | {L(ku), L(kur)}, |
131 | | {L(kv), L(kom)}, |
132 | | {L(kw), L(cor)}, |
133 | | {L(ky), L(kir)}, |
134 | | {L(la), L(lat)}, |
135 | | {L(lb), L(ltz)}, |
136 | | {L(lg), L(lug)}, |
137 | | {L(li), L(lim)}, |
138 | | {L(ln), L(lin)}, |
139 | | {L(lo), L(lao)}, |
140 | | {L(lt), L(lit)}, |
141 | | {L(lu), L(lub)}, |
142 | | {L(lv), L(lav)}, |
143 | | {L(mg), L(mlg)}, |
144 | | {L(mh), L(mah)}, |
145 | | {L(mi), L(mao)}, |
146 | | {L(mk), L(mac)}, |
147 | | {L(mkd), L(mac)}, |
148 | | {L(ml), L(mal)}, |
149 | | {L(mn), L(mon)}, |
150 | | {L(mr), L(mar)}, |
151 | | {L(mri), L(mao)}, |
152 | | {L(ms), L(may)}, |
153 | | {L(msa), L(may)}, |
154 | | {L(mt), L(mlt)}, |
155 | | {L(my), L(bur)}, |
156 | | {L(mya), L(bur)}, |
157 | | {L(na), L(nau)}, |
158 | | {L(nb), L(nob)}, |
159 | | {L(nd), L(nde)}, |
160 | | {L(ne), L(nep)}, |
161 | | {L(ng), L(ndo)}, |
162 | | {L(nl), L(dut)}, |
163 | | {L(nld), L(dut)}, |
164 | | {L(nn), L(nno)}, |
165 | | {L(no), L(nor)}, |
166 | | {L(nr), L(nbl)}, |
167 | | {L(nv), L(nav)}, |
168 | | {L(ny), L(nya)}, |
169 | | {L(oc), L(oci)}, |
170 | | {L(oj), L(oji)}, |
171 | | {L(om), L(orm)}, |
172 | | {L(or), L(ori)}, |
173 | | {L(os), L(oss)}, |
174 | | {L(pa), L(pan)}, |
175 | | {L(pi), L(pli)}, |
176 | | {L(pl), L(pol)}, |
177 | | {L(ps), L(pus)}, |
178 | | {L(pt), L(por)}, |
179 | | {L(qu), L(que)}, |
180 | | {L(rm), L(roh)}, |
181 | | {L(rn), L(run)}, |
182 | | {L(ro), L(rum)}, |
183 | | {L(ron), L(rum)}, |
184 | | {L(ru), L(rus)}, |
185 | | {L(rw), L(kin)}, |
186 | | {L(sa), L(san)}, |
187 | | {L(sc), L(srd)}, |
188 | | {L(sd), L(snd)}, |
189 | | {L(se), L(sme)}, |
190 | | {L(sg), L(sag)}, |
191 | | {L(si), L(sin)}, |
192 | | {L(sk), L(slo)}, |
193 | | {L(sl), L(slv)}, |
194 | | {L(slk), L(slo)}, |
195 | | {L(sm), L(smo)}, |
196 | | {L(sn), L(sna)}, |
197 | | {L(so), L(som)}, |
198 | | {L(sq), L(alb)}, |
199 | | {L(sqi), L(alb)}, |
200 | | {L(sr), L(srp)}, |
201 | | {L(ss), L(ssw)}, |
202 | | {L(st), L(sot)}, |
203 | | {L(su), L(sun)}, |
204 | | {L(sv), L(swe)}, |
205 | | {L(sw), L(swa)}, |
206 | | {L(ta), L(tam)}, |
207 | | {L(te), L(tel)}, |
208 | | {L(tg), L(tgk)}, |
209 | | {L(th), L(tha)}, |
210 | | {L(ti), L(tir)}, |
211 | | {L(tk), L(tuk)}, |
212 | | {L(tl), L(tgl)}, |
213 | | {L(tn), L(tsn)}, |
214 | | {L(to), L(ton)}, |
215 | | {L(tr), L(tur)}, |
216 | | {L(ts), L(tso)}, |
217 | | {L(tt), L(tat)}, |
218 | | {L(tw), L(twi)}, |
219 | | {L(ty), L(tah)}, |
220 | | {L(ug), L(uig)}, |
221 | | {L(uk), L(ukr)}, |
222 | | {L(ur), L(urd)}, |
223 | | {L(uz), L(uzb)}, |
224 | | {L(ve), L(ven)}, |
225 | | {L(vi), L(vie)}, |
226 | | {L(vo), L(vol)}, |
227 | | {L(wa), L(wln)}, |
228 | | {L(wo), L(wol)}, |
229 | | {L(xh), L(xho)}, |
230 | | {L(yi), L(yid)}, |
231 | | {L(yo), L(yor)}, |
232 | | {L(za), L(zha)}, |
233 | | {L(zh), L(chi)}, |
234 | | {L(zho), L(chi)}, |
235 | | {L(zu), L(zul)}, |
236 | | }; |
237 | | |
238 | | static int lang_compare(const void *key, const void *lang) |
239 | 49.2k | { |
240 | 49.2k | const struct lang *l = lang; |
241 | 49.2k | return bstrcasecmp(*(const bstr*)key, (bstr){(unsigned char *)l->match.s, l->match.l}); |
242 | 49.2k | } |
243 | | |
244 | | static bstr canonicalize(bstr lang) |
245 | 6.16k | { |
246 | 6.16k | const struct lang *l = bsearch(&lang, langmap, MP_ARRAY_SIZE(langmap), |
247 | 6.16k | sizeof(langmap[0]), &lang_compare); |
248 | 6.16k | return l ? (bstr){(unsigned char *)l->canonical.s, l->canonical.l} : lang; |
249 | 6.16k | } |
250 | | |
251 | | int mp_match_lang(char **langs, const char *lang) |
252 | 40.6k | { |
253 | 40.6k | if (!lang) |
254 | 12.4k | return 0; |
255 | | |
256 | 28.1k | void *ta_ctx = talloc_new(NULL); |
257 | 28.1k | int lang_parts_n = 0; |
258 | 28.1k | bstr *lang_parts = NULL; |
259 | 28.1k | bstr rest = bstr0(lang); |
260 | 56.3k | while (rest.len) { |
261 | 28.1k | bstr s = bstr_split(rest, "-", &rest); |
262 | 28.1k | MP_TARRAY_APPEND(ta_ctx, lang_parts, lang_parts_n, s); |
263 | 28.1k | } |
264 | | |
265 | 28.1k | int best_score = 0; |
266 | 28.1k | if (!lang_parts_n) |
267 | 13 | goto done; |
268 | | |
269 | 31.2k | for (int idx = 0; langs && langs[idx]; idx++) { |
270 | 3.08k | rest = bstr0(langs[idx]); |
271 | 3.08k | int part = 0; |
272 | 3.08k | int score = 0; |
273 | 4.62k | while (rest.len) { |
274 | 3.08k | bstr s = bstr_split(rest, "-", &rest); |
275 | 3.08k | if (!part) { |
276 | 3.08k | if (bstrcasecmp(canonicalize(lang_parts[0]), canonicalize(s))) |
277 | 1.54k | break; |
278 | 1.53k | score = INT_MAX - idx; |
279 | 1.53k | part++; |
280 | 1.53k | continue; |
281 | 3.08k | } |
282 | | |
283 | 0 | if (part >= lang_parts_n) |
284 | 0 | break; |
285 | | |
286 | 0 | if (bstrcasecmp(lang_parts[part], s)) |
287 | 0 | score -= 1000; |
288 | |
|
289 | 0 | part++; |
290 | 0 | } |
291 | 3.08k | score -= (lang_parts_n - part) * 1000; |
292 | 3.08k | best_score = MPMAX(best_score, score); |
293 | 3.08k | } |
294 | | |
295 | 28.1k | done: |
296 | 28.1k | talloc_free(ta_ctx); |
297 | 28.1k | return best_score; |
298 | 28.1k | } |
299 | | |
300 | | bstr mp_guess_lang_from_filename(bstr name, int *lang_start, enum track_flags *flags) |
301 | 0 | { |
302 | 0 | name = bstr_strip(bstr_strip_ext(name)); |
303 | |
|
304 | 0 | if (lang_start) |
305 | 0 | *lang_start = -1; |
306 | |
|
307 | 0 | if (flags) |
308 | 0 | *flags = 0; |
309 | |
|
310 | 0 | if (name.len < 2) |
311 | 0 | return (bstr){0}; |
312 | | |
313 | 0 | int lang_length = 0; |
314 | 0 | int i = name.len - 1; |
315 | 0 | int suffixes_length = 0; |
316 | |
|
317 | 0 | char delimiter = '.'; |
318 | 0 | if (name.start[i] == ')') { |
319 | 0 | delimiter = '('; |
320 | 0 | i--; |
321 | 0 | } |
322 | 0 | if (name.start[i] == ']') { |
323 | 0 | delimiter = '['; |
324 | 0 | i--; |
325 | 0 | } |
326 | |
|
327 | 0 | enum track_flags *f = flags ? flags : &(enum track_flags){0}; |
328 | |
|
329 | 0 | while (true) { |
330 | 0 | while (i >= 0 && mp_isalpha(name.start[i])) { |
331 | 0 | lang_length++; |
332 | 0 | i--; |
333 | 0 | } |
334 | |
|
335 | 0 | if (i >= 0 && lang_length >= 2 && name.start[i] == delimiter) { |
336 | 0 | bool matched = false; |
337 | 0 | static const char *const suffixes[] = { "sdh", "hi", "cc" }; |
338 | 0 | bstr tag = { name.start + i + 1, lang_length }; |
339 | 0 | for (int n = 0; n < MP_ARRAY_SIZE(suffixes); n++) { |
340 | 0 | if (!bstrcasecmp0(tag, suffixes[n])) { |
341 | 0 | *f |= TRACK_HEARING_IMPAIRED; |
342 | 0 | matched = true; |
343 | 0 | break; |
344 | 0 | } |
345 | 0 | } |
346 | 0 | if (!bstrcasecmp0(tag, "forced")) { |
347 | 0 | *f |= TRACK_FORCED; |
348 | 0 | matched = true; |
349 | 0 | } |
350 | 0 | if (!bstrcasecmp0(tag, "default")) { |
351 | 0 | *f |= TRACK_DEFAULT; |
352 | 0 | matched = true; |
353 | 0 | } |
354 | 0 | if (matched) { |
355 | 0 | lang_length = 0; |
356 | 0 | i -= (delimiter != '.') ? 2 : 1; |
357 | 0 | continue; |
358 | 0 | } |
359 | 0 | } |
360 | | |
361 | | // According to |
362 | | // https://en.wikipedia.org/wiki/IETF_language_tag#Syntax_of_language_tags |
363 | | // subtags after the first are composed of 1 to 8 letters. |
364 | 0 | if (lang_length < suffixes_length + 1 || lang_length > suffixes_length + 8) |
365 | 0 | return (bstr){0}; |
366 | | |
367 | 0 | if (i >= 0 && name.start[i] == '-') { |
368 | 0 | lang_length++; |
369 | 0 | i--; |
370 | 0 | suffixes_length = lang_length; |
371 | 0 | } else { |
372 | 0 | break; |
373 | 0 | } |
374 | 0 | } |
375 | | |
376 | | // The primary subtag can have 2 or 3 letters. |
377 | 0 | if (lang_length < suffixes_length + 2 || lang_length > suffixes_length + 3 || |
378 | 0 | i <= 0 || name.start[i] != delimiter) |
379 | 0 | return (bstr){0}; |
380 | | |
381 | 0 | if (lang_start) |
382 | 0 | *lang_start = i; |
383 | |
|
384 | 0 | return (bstr){name.start + i + 1, lang_length}; |
385 | 0 | } |