Coverage Report

Created: 2025-12-27 07:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/mpv/misc/language.c
Line
Count
Source
1
/*
2
 * This file is part of mpv.
3
 *
4
 * mpv is free software; you can redistribute it and/or
5
 * modify it under the terms of the GNU Lesser General Public
6
 * License as published by the Free Software Foundation; either
7
 * version 2.1 of the License, or (at your option) any later version.
8
 *
9
 * mpv is distributed in the hope that it will be useful,
10
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
 * GNU Lesser General Public License for more details.
13
 *
14
 * You should have received a copy of the GNU Lesser General Public
15
 * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
16
 */
17
18
#include "language.h"
19
20
#include <limits.h>
21
#include <stdint.h>
22
23
#include "common/common.h"
24
#include "misc/ctype.h"
25
26
#define L(s) { #s, sizeof(#s) - 1 }
27
28
static const struct lang {
29
    struct { const char s[3] MP_NONSTRING; uint8_t l; } match;
30
    struct { const char s[3] MP_NONSTRING; uint8_t l; } canonical;
31
} langmap[] = {
32
    {L(aa), L(aar)},
33
    {L(ab), L(abk)},
34
    {L(ae), L(ave)},
35
    {L(af), L(afr)},
36
    {L(ak), L(aka)},
37
    {L(am), L(amh)},
38
    {L(an), L(arg)},
39
    {L(ar), L(ara)},
40
    {L(as), L(asm)},
41
    {L(av), L(ava)},
42
    {L(ay), L(aym)},
43
    {L(az), L(aze)},
44
    {L(ba), L(bak)},
45
    {L(be), L(bel)},
46
    {L(bg), L(bul)},
47
    {L(bh), L(bih)},
48
    {L(bi), L(bis)},
49
    {L(bm), L(bam)},
50
    {L(bn), L(ben)},
51
    {L(bo), L(tib)},
52
    {L(bod), L(tib)},
53
    {L(br), L(bre)},
54
    {L(bs), L(bos)},
55
    {L(ca), L(cat)},
56
    {L(ce), L(che)},
57
    {L(ces), L(cze)},
58
    {L(ch), L(cha)},
59
    {L(co), L(cos)},
60
    {L(cr), L(cre)},
61
    {L(cs), L(cze)},
62
    {L(cu), L(chu)},
63
    {L(cv), L(chv)},
64
    {L(cy), L(wel)},
65
    {L(cym), L(wel)},
66
    {L(da), L(dan)},
67
    {L(de), L(ger)},
68
    {L(deu), L(ger)},
69
    {L(dv), L(div)},
70
    {L(dz), L(dzo)},
71
    {L(ee), L(ewe)},
72
    {L(el), L(gre)},
73
    {L(ell), L(gre)},
74
    {L(en), L(eng)},
75
    {L(eo), L(epo)},
76
    {L(es), L(spa)},
77
    {L(et), L(est)},
78
    {L(eu), L(baq)},
79
    {L(eus), L(baq)},
80
    {L(fa), L(per)},
81
    {L(fas), L(per)},
82
    {L(ff), L(ful)},
83
    {L(fi), L(fin)},
84
    {L(fj), L(fij)},
85
    {L(fo), L(fao)},
86
    {L(fr), L(fre)},
87
    {L(fra), L(fre)},
88
    {L(fy), L(fry)},
89
    {L(ga), L(gle)},
90
    {L(gd), L(gla)},
91
    {L(gl), L(glg)},
92
    {L(gn), L(grn)},
93
    {L(gu), L(guj)},
94
    {L(gv), L(glv)},
95
    {L(ha), L(hau)},
96
    {L(he), L(heb)},
97
    {L(hi), L(hin)},
98
    {L(ho), L(hmo)},
99
    {L(hr), L(hrv)},
100
    {L(ht), L(hat)},
101
    {L(hu), L(hun)},
102
    {L(hy), L(arm)},
103
    {L(hye), L(arm)},
104
    {L(hz), L(her)},
105
    {L(ia), L(ina)},
106
    {L(id), L(ind)},
107
    {L(ie), L(ile)},
108
    {L(ig), L(ibo)},
109
    {L(ii), L(iii)},
110
    {L(ik), L(ipk)},
111
    {L(io), L(ido)},
112
    {L(is), L(ice)},
113
    {L(isl), L(ice)},
114
    {L(it), L(ita)},
115
    {L(iu), L(iku)},
116
    {L(ja), L(jpn)},
117
    {L(jv), L(jav)},
118
    {L(ka), L(geo)},
119
    {L(kat), L(geo)},
120
    {L(kg), L(kon)},
121
    {L(ki), L(kik)},
122
    {L(kj), L(kua)},
123
    {L(kk), L(kaz)},
124
    {L(kl), L(kal)},
125
    {L(km), L(khm)},
126
    {L(kn), L(kan)},
127
    {L(ko), L(kor)},
128
    {L(kr), L(kau)},
129
    {L(ks), L(kas)},
130
    {L(ku), L(kur)},
131
    {L(kv), L(kom)},
132
    {L(kw), L(cor)},
133
    {L(ky), L(kir)},
134
    {L(la), L(lat)},
135
    {L(lb), L(ltz)},
136
    {L(lg), L(lug)},
137
    {L(li), L(lim)},
138
    {L(ln), L(lin)},
139
    {L(lo), L(lao)},
140
    {L(lt), L(lit)},
141
    {L(lu), L(lub)},
142
    {L(lv), L(lav)},
143
    {L(mg), L(mlg)},
144
    {L(mh), L(mah)},
145
    {L(mi), L(mao)},
146
    {L(mk), L(mac)},
147
    {L(mkd), L(mac)},
148
    {L(ml), L(mal)},
149
    {L(mn), L(mon)},
150
    {L(mr), L(mar)},
151
    {L(mri), L(mao)},
152
    {L(ms), L(may)},
153
    {L(msa), L(may)},
154
    {L(mt), L(mlt)},
155
    {L(my), L(bur)},
156
    {L(mya), L(bur)},
157
    {L(na), L(nau)},
158
    {L(nb), L(nob)},
159
    {L(nd), L(nde)},
160
    {L(ne), L(nep)},
161
    {L(ng), L(ndo)},
162
    {L(nl), L(dut)},
163
    {L(nld), L(dut)},
164
    {L(nn), L(nno)},
165
    {L(no), L(nor)},
166
    {L(nr), L(nbl)},
167
    {L(nv), L(nav)},
168
    {L(ny), L(nya)},
169
    {L(oc), L(oci)},
170
    {L(oj), L(oji)},
171
    {L(om), L(orm)},
172
    {L(or), L(ori)},
173
    {L(os), L(oss)},
174
    {L(pa), L(pan)},
175
    {L(pi), L(pli)},
176
    {L(pl), L(pol)},
177
    {L(ps), L(pus)},
178
    {L(pt), L(por)},
179
    {L(qu), L(que)},
180
    {L(rm), L(roh)},
181
    {L(rn), L(run)},
182
    {L(ro), L(rum)},
183
    {L(ron), L(rum)},
184
    {L(ru), L(rus)},
185
    {L(rw), L(kin)},
186
    {L(sa), L(san)},
187
    {L(sc), L(srd)},
188
    {L(sd), L(snd)},
189
    {L(se), L(sme)},
190
    {L(sg), L(sag)},
191
    {L(si), L(sin)},
192
    {L(sk), L(slo)},
193
    {L(sl), L(slv)},
194
    {L(slk), L(slo)},
195
    {L(sm), L(smo)},
196
    {L(sn), L(sna)},
197
    {L(so), L(som)},
198
    {L(sq), L(alb)},
199
    {L(sqi), L(alb)},
200
    {L(sr), L(srp)},
201
    {L(ss), L(ssw)},
202
    {L(st), L(sot)},
203
    {L(su), L(sun)},
204
    {L(sv), L(swe)},
205
    {L(sw), L(swa)},
206
    {L(ta), L(tam)},
207
    {L(te), L(tel)},
208
    {L(tg), L(tgk)},
209
    {L(th), L(tha)},
210
    {L(ti), L(tir)},
211
    {L(tk), L(tuk)},
212
    {L(tl), L(tgl)},
213
    {L(tn), L(tsn)},
214
    {L(to), L(ton)},
215
    {L(tr), L(tur)},
216
    {L(ts), L(tso)},
217
    {L(tt), L(tat)},
218
    {L(tw), L(twi)},
219
    {L(ty), L(tah)},
220
    {L(ug), L(uig)},
221
    {L(uk), L(ukr)},
222
    {L(ur), L(urd)},
223
    {L(uz), L(uzb)},
224
    {L(ve), L(ven)},
225
    {L(vi), L(vie)},
226
    {L(vo), L(vol)},
227
    {L(wa), L(wln)},
228
    {L(wo), L(wol)},
229
    {L(xh), L(xho)},
230
    {L(yi), L(yid)},
231
    {L(yo), L(yor)},
232
    {L(za), L(zha)},
233
    {L(zh), L(chi)},
234
    {L(zho), L(chi)},
235
    {L(zu), L(zul)},
236
};
237
238
static int lang_compare(const void *key, const void *lang)
239
49.2k
{
240
49.2k
    const struct lang *l = lang;
241
49.2k
    return bstrcasecmp(*(const bstr*)key, (bstr){(unsigned char *)l->match.s, l->match.l});
242
49.2k
}
243
244
// Returns the canonical code for `lang` if langmap has an entry whose
// `match` equals it (ignoring case); otherwise returns `lang` unchanged.
// A canonical result points into the static table, not into the input.
static bstr canonicalize(bstr lang)
{
    const struct lang *hit = bsearch(&lang, langmap, MP_ARRAY_SIZE(langmap),
                                     sizeof(langmap[0]), &lang_compare);
    if (!hit)
        return lang;

    return (bstr){(unsigned char *)hit->canonical.s, hit->canonical.l};
}
250
251
int mp_match_lang(char **langs, const char *lang)
252
40.6k
{
253
40.6k
    if (!lang)
254
12.4k
        return 0;
255
256
28.1k
    void *ta_ctx = talloc_new(NULL);
257
28.1k
    int lang_parts_n = 0;
258
28.1k
    bstr *lang_parts = NULL;
259
28.1k
    bstr rest = bstr0(lang);
260
56.3k
    while (rest.len) {
261
28.1k
        bstr s = bstr_split(rest, "-", &rest);
262
28.1k
        MP_TARRAY_APPEND(ta_ctx, lang_parts, lang_parts_n, s);
263
28.1k
    }
264
265
28.1k
    int best_score = 0;
266
28.1k
    if (!lang_parts_n)
267
13
        goto done;
268
269
31.2k
    for (int idx = 0; langs && langs[idx]; idx++) {
270
3.08k
        rest = bstr0(langs[idx]);
271
3.08k
        int part = 0;
272
3.08k
        int score = 0;
273
4.62k
        while (rest.len) {
274
3.08k
            bstr s = bstr_split(rest, "-", &rest);
275
3.08k
            if (!part) {
276
3.08k
                if (bstrcasecmp(canonicalize(lang_parts[0]), canonicalize(s)))
277
1.54k
                    break;
278
1.53k
                score = INT_MAX - idx;
279
1.53k
                part++;
280
1.53k
                continue;
281
3.08k
            }
282
283
0
            if (part >= lang_parts_n)
284
0
                break;
285
286
0
            if (bstrcasecmp(lang_parts[part], s))
287
0
                score -= 1000;
288
289
0
            part++;
290
0
        }
291
3.08k
        score -= (lang_parts_n - part) * 1000;
292
3.08k
        best_score = MPMAX(best_score, score);
293
3.08k
    }
294
295
28.1k
done:
296
28.1k
    talloc_free(ta_ctx);
297
28.1k
    return best_score;
298
28.1k
}
299
300
/*
 * Extracts a trailing language tag from a file name.
 *
 * The name is first passed through bstr_strip_ext() and bstr_strip()
 * (presumably removing the extension and surrounding whitespace - confirm
 * in bstr.h). The tag is then read backwards from the end of the name. It
 * must be introduced by '.', or be enclosed in "(...)" or "[...]", e.g.
 * "name.en", "name.pt-BR", "name(en)", "name[en]".
 *
 * Marker groups after the tag using the same delimiter style are consumed
 * and reported through `flags`: "sdh", "hi" and "cc" set
 * TRACK_HEARING_IMPAIRED, "forced" sets TRACK_FORCED and "default" sets
 * TRACK_DEFAULT (all case-insensitive), e.g. "name.en.forced" or
 * "name(en)(sdh)". Note that a trailing "hi" is therefore always read as a
 * marker, never as a language code.
 *
 * name:       file name to inspect.
 * lang_start: if non-NULL, receives the index (within the stripped name) of
 *             the delimiter that introduces the tag, or -1 if none is found.
 * flags:      if non-NULL, reset to 0 and then OR-ed with the markers found.
 *             Markers are reported even when no language tag is returned.
 *
 * Returns the tag as a bstr pointing into `name`, or an empty bstr if no
 * valid tag is found. The primary subtag must be 2 or 3 letters and every
 * further '-'-separated subtag 1 to 8 letters; the delimiter must not be the
 * first character of the name.
 */
bstr mp_guess_lang_from_filename(bstr name, int *lang_start, enum track_flags *flags)
{
    name = bstr_strip(bstr_strip_ext(name));

    if (lang_start)
        *lang_start = -1;

    if (flags)
        *flags = 0;

    if (name.len < 2)
        return (bstr){0};

    int lang_length = 0;
    int i = name.len - 1;
    int suffixes_length = 0;

    // Pick the delimiter style from the last character. Only one closing
    // bracket is consumed, so a name ending in "])" is not mistaken for a
    // "[...]" group.
    char delimiter = '.';
    char closer = '\0';
    if (name.start[i] == ')') {
        delimiter = '(';
        closer = ')';
        i--;
    } else if (name.start[i] == ']') {
        delimiter = '[';
        closer = ']';
        i--;
    }

    // Lets the code below set flags unconditionally, even if the caller
    // passed NULL.
    enum track_flags *f = flags ? flags : &(enum track_flags){0};

    while (true) {
        // Walk backwards over one run of letters.
        while (i >= 0 && mp_isalpha(name.start[i])) {
            lang_length++;
            i--;
        }

        if (i >= 0 && lang_length >= 2 && name.start[i] == delimiter) {
            bool matched = false;
            static const char *const suffixes[] = { "sdh", "hi", "cc" };
            bstr tag = { name.start + i + 1, lang_length };
            for (int n = 0; n < MP_ARRAY_SIZE(suffixes); n++) {
                if (!bstrcasecmp0(tag, suffixes[n])) {
                    *f |= TRACK_HEARING_IMPAIRED;
                    matched = true;
                    break;
                }
            }
            if (!bstrcasecmp0(tag, "forced")) {
                *f |= TRACK_FORCED;
                matched = true;
            }
            if (!bstrcasecmp0(tag, "default")) {
                *f |= TRACK_DEFAULT;
                matched = true;
            }
            if (matched) {
                // The group was a marker, not a language: restart the scan
                // in front of it.
                lang_length = 0;
                i--; // step over the opening delimiter
                if (closer) {
                    // In bracket style the previous group must be closed
                    // right here; do not silently skip an arbitrary
                    // character.
                    if (i < 0 || name.start[i] != closer)
                        return (bstr){0};
                    i--;
                }
                continue;
            }
        }

        // According to
        // https://en.wikipedia.org/wiki/IETF_language_tag#Syntax_of_language_tags
        // subtags after the first are composed of 1 to 8 letters.
        if (lang_length < suffixes_length + 1 || lang_length > suffixes_length + 8)
            return (bstr){0};

        if (i >= 0 && name.start[i] == '-') {
            // Include the '-' in the tag and remember how much of the tag
            // consists of trailing subtags.
            lang_length++;
            i--;
            suffixes_length = lang_length;
        } else {
            break;
        }
    }

    // The primary subtag can have 2 or 3 letters.
    if (lang_length < suffixes_length + 2 || lang_length > suffixes_length + 3 ||
        i <= 0 || name.start[i] != delimiter)
        return (bstr){0};

    if (lang_start)
        *lang_start = i;

    return (bstr){name.start + i + 1, lang_length};
}