Coverage Report

Created: 2023-03-26 07:08

/src/vlc/modules/codec/subsdec.c
Line
Count
Source (jump to first uncovered line)
1
/*****************************************************************************
2
 * subsdec.c : text subtitle decoder
3
 *****************************************************************************
4
 * Copyright (C) 2000-2006 VLC authors and VideoLAN
5
 *
6
 * Authors: Gildas Bazin <gbazin@videolan.org>
7
 *          Samuel Hocevar <sam@zoy.org>
8
 *          Derk-Jan Hartman <hartman at videolan dot org>
9
 *          Bernie Purcell <bitmap@videolan.org>
10
 *
11
 * This program is free software; you can redistribute it and/or modify it
12
 * under the terms of the GNU Lesser General Public License as published by
13
 * the Free Software Foundation; either version 2.1 of the License, or
14
 * (at your option) any later version.
15
 *
16
 * This program is distributed in the hope that it will be useful,
17
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19
 * GNU Lesser General Public License for more details.
20
 *
21
 * You should have received a copy of the GNU Lesser General Public License
22
 * along with this program; if not, write to the Free Software Foundation,
23
 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
24
 *****************************************************************************/
25
26
/*****************************************************************************
27
 * Preamble
28
 *****************************************************************************/
29
#ifdef HAVE_CONFIG_H
30
# include "config.h"
31
#endif
32
33
#include <limits.h>
34
#include <errno.h>
35
#include <ctype.h>
36
37
#include <vlc_common.h>
38
#include <vlc_plugin.h>
39
#include <vlc_codec.h>
40
#include <vlc_charset.h>
41
#include <vlc_xml.h>
42
43
#include "substext.h"
44
45
/*****************************************************************************
46
 * Module descriptor.
47
 *****************************************************************************/
48
/* Character set identifiers selectable through "subsdec-encoding".
 * The module descriptor hands this table to change_string_list() together
 * with ppsz_encoding_names, so both tables must keep the same length and
 * ordering. */
static const char *const ppsz_encodings[] = {
    /* Built-in default, system codeset, Unicode */
    "", "system",
    "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE",
    "GB18030",

    /* ISO 8859 parts and the matching code pages */
    "ISO-8859-15", "Windows-1252", "IBM850",
    "ISO-8859-2",  "Windows-1250",
    "ISO-8859-3",
    "ISO-8859-10",
    "Windows-1251", "KOI8-R", "KOI8-U",
    "ISO-8859-6",  "Windows-1256",
    "ISO-8859-7",  "Windows-1253",
    "ISO-8859-8",  "Windows-1255",
    "ISO-8859-9",  "Windows-1254",
    "ISO-8859-11", "Windows-874",
    "ISO-8859-13", "Windows-1257",
    "ISO-8859-14",
    "ISO-8859-16",

    /* CJK families */
    "ISO-2022-CN-EXT", "EUC-CN",
    "ISO-2022-JP-2", "EUC-JP", "Shift_JIS",
    "CP949", "ISO-2022-KR",
    "Big5", "ISO-2022-TW", "Big5-HKSCS",

    /* Other */
    "VISCII", "Windows-1258",
};
93
94
static const char *const ppsz_encoding_names[] = {
95
    /* xgettext:
96
      The character encoding name in parenthesis corresponds to that used for
97
      the GetACP translation. "Windows-1252" applies to Western European
98
      languages using the Latin alphabet. */
99
    N_("Default (Windows-1252)"),
100
    N_("System codeset"),
101
    N_("Universal (UTF-8)"),
102
    N_("Universal (UTF-16)"),
103
    N_("Universal (big endian UTF-16)"),
104
    N_("Universal (little endian UTF-16)"),
105
    N_("Universal, Chinese (GB18030)"),
106
107
  /* ISO 8859 and the likes */
108
    /* 1 */
109
    N_("Western European (Latin-9)"), /* mostly superset of Latin-1 */
110
    N_("Western European (Windows-1252)"),
111
    N_("Western European (IBM 00850)"),
112
    /* 2 */
113
    N_("Eastern European (Latin-2)"),
114
    N_("Eastern European (Windows-1250)"),
115
    /* 3 */
116
    N_("Esperanto (Latin-3)"),
117
    /* 4 */
118
    N_("Nordic (Latin-6)"), /* Latin 6 supersedes Latin 4 */
119
    /* 5 */
120
    N_("Cyrillic (Windows-1251)"), /* ISO 8859-5 is not practically used */
121
    N_("Russian (KOI8-R)"),
122
    N_("Ukrainian (KOI8-U)"),
123
    /* 6 */
124
    N_("Arabic (ISO 8859-6)"),
125
    N_("Arabic (Windows-1256)"),
126
    /* 7 */
127
    N_("Greek (ISO 8859-7)"),
128
    N_("Greek (Windows-1253)"),
129
    /* 8 */
130
    N_("Hebrew (ISO 8859-8)"),
131
    N_("Hebrew (Windows-1255)"),
132
    /* 9 */
133
    N_("Turkish (ISO 8859-9)"),
134
    N_("Turkish (Windows-1254)"),
135
    /* 10 -> 4 */
136
    /* 11 */
137
    N_("Thai (TIS 620-2533/ISO 8859-11)"),
138
    N_("Thai (Windows-874)"),
139
    /* 13 */
140
    N_("Baltic (Latin-7)"),
141
    N_("Baltic (Windows-1257)"),
142
    /* 12 -> /dev/null */
143
    /* 14 */
144
    N_("Celtic (Latin-8)"),
145
    /* 15 -> 1 */
146
    /* 16 */
147
    N_("South-Eastern European (Latin-10)"),
148
  /* CJK families */
149
    N_("Simplified Chinese (ISO-2022-CN-EXT)"),
150
    N_("Simplified Chinese Unix (EUC-CN)"),
151
    N_("Japanese (7-bits JIS/ISO-2022-JP-2)"),
152
    N_("Japanese Unix (EUC-JP)"),
153
    N_("Japanese (Shift JIS)"),
154
    N_("Korean (EUC-KR/CP949)"),
155
    N_("Korean (ISO-2022-KR)"),
156
    N_("Traditional Chinese (Big5)"),
157
    N_("Traditional Chinese Unix (EUC-TW)"),
158
    N_("Hong-Kong Supplementary (HKSCS)"),
159
  /* Other */
160
    N_("Vietnamese (VISCII)"),
161
    N_("Vietnamese (Windows-1258)"),
162
};
163
164
static const int  pi_justification[] = { -1, 0, 1, 2 };
165
static const char *const ppsz_justification_text[] = {
166
    N_("Auto"),N_("Center"),N_("Left"),N_("Right")
167
};
168
169
/* Short labels and long descriptions of the configuration options. */
#define ENCODING_TEXT \
    N_("Subtitle text encoding")
#define ENCODING_LONGTEXT \
    N_("Set the encoding used in text subtitles")

#define ALIGN_TEXT \
    N_("Subtitle justification")
#define ALIGN_LONGTEXT \
    N_("Set the justification of subtitles")

#define AUTODETECT_UTF8_TEXT \
    N_("UTF-8 subtitle autodetection")
#define AUTODETECT_UTF8_LONGTEXT \
    N_("This enables automatic detection of " \
       "UTF-8 encoding within subtitle files.")
176
177
static int  OpenDecoder   ( vlc_object_t * );
178
static void CloseDecoder  ( vlc_object_t * );
179
180
0
vlc_module_begin ()
181
0
    set_shortname( N_("Subtitles"))
182
0
    set_description( N_("Text subtitle decoder") )
183
0
    set_capability( "spu decoder", 50 )
184
0
    set_callbacks( OpenDecoder, CloseDecoder )
185
0
    set_subcategory( SUBCAT_INPUT_SCODEC )
186
187
0
    add_integer( "subsdec-align", -1, ALIGN_TEXT, ALIGN_LONGTEXT )
188
0
        change_integer_list( pi_justification, ppsz_justification_text )
189
0
    add_string( "subsdec-encoding", "",
190
0
                ENCODING_TEXT, ENCODING_LONGTEXT )
191
0
        change_string_list( ppsz_encodings, ppsz_encoding_names )
192
0
    add_bool( "subsdec-autodetect-utf8", true,
193
0
              AUTODETECT_UTF8_TEXT, AUTODETECT_UTF8_LONGTEXT )
194
0
vlc_module_end ()
195
196
/*****************************************************************************
197
 * Local prototypes
198
 *****************************************************************************/
199
#define NO_BREAKING_SPACE  "&#160;"
200
201
typedef struct
202
{
203
    int                 i_align;          /* Subtitles alignment on the vout */
204
205
    vlc_iconv_t         iconv_handle;            /* handle to iconv instance */
206
    bool                b_autodetect_utf8;
207
} decoder_sys_t;
208
209
210
static int             DecodeBlock   ( decoder_t *, block_t * );
211
static subpicture_t   *ParseText     ( decoder_t *, block_t * );
212
static text_segment_t *ParseSubtitles(int *pi_align, const char * );
213
214
/*****************************************************************************
215
 * OpenDecoder: probe the decoder and return score
216
 *****************************************************************************
217
 * Tries to launch a decoder and return score so that the interface is able
218
 * to choose.
219
 *****************************************************************************/
220
static int OpenDecoder( vlc_object_t *p_this )
{
    decoder_t *p_dec = (decoder_t *)p_this;

    /* Only plain text subtitles and ITU-T T.140 are handled here */
    if( p_dec->fmt_in->i_codec != VLC_CODEC_SUBT &&
        p_dec->fmt_in->i_codec != VLC_CODEC_ITU_T140 )
        return VLC_EGENERIC;

    /* Allocate the decoder's private state */
    decoder_sys_t *p_sys = calloc( 1, sizeof( *p_sys ) );
    p_dec->p_sys = p_sys;
    if( p_sys == NULL )
        return VLC_ENOMEM;

    p_dec->pf_decode = DecodeBlock;
    p_dec->fmt_out.i_codec = 0;

    p_sys->i_align = -1;
    p_sys->iconv_handle = (vlc_iconv_t)-1;
    p_sys->b_autodetect_utf8 = false;

    /* psz_charset is the source encoding; psz_configured holds the heap
     * copy of the configured value when psz_charset points into it. */
    const char *psz_charset;
    char *psz_configured = NULL;

    if( p_dec->fmt_in->i_codec == VLC_CODEC_ITU_T140 )
    {
        /* ITU-T T.140 always uses UTF-8 */
        psz_charset = "UTF-8";
    }
    else if( p_dec->fmt_in->subs.psz_encoding != NULL &&
             p_dec->fmt_in->subs.psz_encoding[0] != '\0' )
    {
        /* First choice: the encoding announced by the demuxer */
        psz_charset = p_dec->fmt_in->subs.psz_encoding;
        msg_Dbg( p_dec, "trying demuxer-specified character encoding: %s",
                 psz_charset );
    }
    else
    {
        psz_configured = var_InheritString( p_dec, "subsdec-encoding" );
        if( psz_configured == NULL )
        {
            /* Third choice: the "local" encoding */
        /* xgettext:
           The Windows ANSI code page most commonly used for this language.
           VLC uses this as a guess of the subtitle files character set
           (if UTF-8 and UTF-16 autodetection fails).
           Western European languages normally use "CP1252", which is a
           Microsoft-variant of ISO 8859-1. That suits the Latin alphabet.
           Other scripts use other code pages.

           This MUST be a valid iconv character set. If unsure, please refer
           the VideoLAN translators mailing list. */
            psz_charset = vlc_pgettext("GetACP", "CP1252");
            msg_Dbg( p_dec, "trying default character encoding: %s",
                     psz_charset );
        }
        else
        {
            /* Second choice: the configured encoding */
            msg_Dbg( p_dec, "trying configured character encoding: %s",
                     psz_configured );
            if( strcmp( psz_configured, "system" ) == 0 )
            {
                free( psz_configured );
                psz_configured = NULL;
                /* iconv() treats "" as nl_langinfo(CODESET) */
                psz_charset = "";
            }
            else
            {
                psz_charset = psz_configured;
            }
        }

        /* UTF-8 autodetection only applies when the encoding is a guess */
        if( var_InheritBool( p_dec, "subsdec-autodetect-utf8" ) )
        {
            msg_Dbg( p_dec, "using automatic UTF-8 detection" );
            p_sys->b_autodetect_utf8 = true;
        }
    }

    /* No converter is needed when the source is already UTF-8 */
    const bool b_source_is_utf8 = !strcasecmp( psz_charset, "UTF-8" ) ||
                                  !strcasecmp( psz_charset, "utf8" );
    if( !b_source_is_utf8 )
    {
        p_sys->iconv_handle = vlc_iconv_open( "UTF-8", psz_charset );
        /* A failure is only reported; the decoder still opens */
        if( p_sys->iconv_handle == (vlc_iconv_t)(-1) )
            msg_Err( p_dec, "cannot convert from %s: %s", psz_charset,
                     vlc_strerror_c(errno) );
    }
    free( psz_configured );

    p_sys->i_align = var_InheritInteger( p_dec, "subsdec-align" );

    return VLC_SUCCESS;
}
314
315
/****************************************************************************
316
 * DecodeBlock: the whole thing
317
 ****************************************************************************
318
 * This function must be fed with complete subtitles units.
319
 ****************************************************************************/
320
static int DecodeBlock( decoder_t *p_dec, block_t *p_block )
{
    /* A NULL block is a drain request: nothing to do */
    if( p_block == NULL )
        return VLCDEC_SUCCESS;

    /* Corrupted blocks are dropped without being parsed */
    subpicture_t *p_spu = NULL;
    if( !( p_block->i_flags & BLOCK_FLAG_CORRUPTED ) )
        p_spu = ParseText( p_dec, p_block );

    /* The block is released here whether or not it was parsed */
    block_Release( p_block );

    if( p_spu != NULL )
        decoder_QueueSub( p_dec, p_spu );
    return VLCDEC_SUCCESS;
}
340
341
/*****************************************************************************
342
 * CloseDecoder: clean up the decoder
343
 *****************************************************************************/
344
static void CloseDecoder( vlc_object_t *p_this )
{
    decoder_sys_t *p_sys = ( (decoder_t *)p_this )->p_sys;

    /* (vlc_iconv_t)-1 marks "no converter was opened" */
    const bool b_converter_open = p_sys->iconv_handle != (vlc_iconv_t)-1;
    if( b_converter_open )
        vlc_iconv_close( p_sys->iconv_handle );

    free( p_sys );
}
354
355
/*****************************************************************************
356
 * ParseText: parse a text subtitle packet and send it to the video output
357
 *****************************************************************************/
358
static subpicture_t *ParseText( decoder_t *p_dec, block_t *p_block )
{
    decoder_sys_t *p_sys = p_dec->p_sys;

    if( p_block->i_flags & BLOCK_FLAG_CORRUPTED )
        return NULL;

    /* A subpicture cannot be displayed without a date */
    if( p_block->i_pts == VLC_TICK_INVALID )
    {
        msg_Warn( p_dec, "subtitle without a date" );
        return NULL;
    }

    /* An "empty" line holding only a \0 is valid: it can be used to force
       an ephemeral picture off the screen. A zero-length one is not. */
    if( p_block->i_buffer < 1 )
    {
        msg_Warn( p_dec, "no subtitle data" );
        return NULL;
    }

    const bool b_no_converter = ( p_sys->iconv_handle == (vlc_iconv_t)-1 );
    char *psz_text = NULL;

    /* A NUL-terminated copy is needed whenever the bytes are inspected as
       UTF-8; this must be resilient against bad subtitles */
    if( b_no_converter || p_sys->b_autodetect_utf8 )
    {
        psz_text = malloc( p_block->i_buffer + 1 );
        if( psz_text == NULL )
            return NULL;
        memcpy( psz_text, p_block->p_buffer, p_block->i_buffer );
        psz_text[p_block->i_buffer] = '\0';
    }

    if( b_no_converter )
    {
        /* No converter: a failure is reported but the text is still used */
        if( EnsureUTF8( psz_text ) == NULL )
        {
            msg_Err( p_dec, "failed to convert subtitle encoding.\n"
                     "Try manually setting a character-encoding "
                     "before you open the file." );
        }
    }
    else
    {
        /* The first invalid sequence turns autodetection off for good */
        if( p_sys->b_autodetect_utf8 && IsUTF8( psz_text ) == NULL )
        {
            msg_Dbg( p_dec, "invalid UTF-8 sequence: "
                     "disabling UTF-8 subtitles autodetection" );
            p_sys->b_autodetect_utf8 = false;
        }

        if( !p_sys->b_autodetect_utf8 )
        {
            size_t i_in_left = p_block->i_buffer;
            size_t i_out_left = 6 * i_in_left;
            char *psz_converted = xmalloc( i_out_left + 1 );
            char *p_out = psz_converted;
            const char *p_in =
                    psz_text ? psz_text : (char *)p_block->p_buffer;

            size_t i_ret = vlc_iconv( p_sys->iconv_handle,
                                      &p_in, &i_in_left,
                                      &p_out, &i_out_left );

            *p_out++ = '\0';
            /* The unconverted copy, if any, is no longer needed */
            free( psz_text );

            /* Leftover input counts as a failure too */
            if( i_ret == (size_t)(-1) || i_in_left != 0 )
            {
                free( psz_converted );
                msg_Err( p_dec, "failed to convert subtitle encoding.\n"
                        "Try manually setting a character-encoding "
                                "before you open the file." );
                return NULL;
            }

            /* Shrink to fit; keep the larger buffer if that fails */
            psz_text = realloc( psz_converted, p_out - psz_converted );
            if( psz_text == NULL )
                psz_text = psz_converted;
        }
    }

    /* Create the subpicture unit */
    subpicture_t *p_spu = decoder_NewSubpictureText( p_dec );
    if( p_spu == NULL )
    {
        free( psz_text );
        return NULL;
    }
    p_spu->i_start    = p_block->i_pts;
    p_spu->i_stop     = p_block->i_pts + p_block->i_length;
    p_spu->b_ephemer  = ( p_block->i_length == VLC_TICK_INVALID );
    p_spu->b_absolute = false;

    subtext_updater_sys_t *p_spu_sys = p_spu->updater.p_sys;

    int i_inline_align = -1;
    p_spu_sys->region.p_segments = ParseSubtitles( &i_inline_align, psz_text );
    free( psz_text );

    if( p_sys->i_align >= 0 )
    {
        /* Configured alignment: bottom; left, right or centered */
        p_spu_sys->region.align = SUBPICTURE_ALIGN_BOTTOM | p_sys->i_align;
        p_spu_sys->region.inner_align = p_sys->i_align;
    }
    else if( i_inline_align >= 0 )
    {
        /* Alignment reported by ParseSubtitles() */
        p_spu_sys->region.align = i_inline_align;
        p_spu_sys->region.inner_align = i_inline_align;
    }
    else
    {
        /* Default: bottom; centered */
        p_spu_sys->region.align = SUBPICTURE_ALIGN_BOTTOM;
        p_spu_sys->region.inner_align = 0;
    }

    return p_spu;
}
483
484
/* Appends one character to the text of p_segment, reallocating it.
 * Returns false and leaves the segment untouched if formatting fails. */
static bool AppendCharacter( text_segment_t* p_segment, char c )
{
    const char* psz_current = p_segment->psz_text;
    if ( psz_current == NULL )
        psz_current = "";

    char* psz_joined;
    if ( asprintf( &psz_joined, "%s%c", psz_current, c ) < 0 )
        return false;

    free( p_segment->psz_text );
    p_segment->psz_text = psz_joined;
    return true;
}
493
494
/* Appends psz_str to the text of p_segment, reallocating it.
 * Returns false and leaves the segment untouched if formatting fails. */
static bool AppendString( text_segment_t* p_segment, const char* psz_str )
{
    const char* psz_current = p_segment->psz_text;
    if ( psz_current == NULL )
        psz_current = "";

    char* psz_joined;
    if ( asprintf( &psz_joined, "%s%s", psz_current, psz_str ) < 0 )
        return false;

    free( p_segment->psz_text );
    p_segment->psz_text = psz_joined;
    return true;
}
503
504
/* Parses one `name[=value]` attribute at *ppsz_subtitle.
 *
 * Leading spaces are skipped, then a run of alphabetic characters is taken
 * as the attribute name. If an '=' follows later in the string, the value
 * after it is read: either enclosed in ' or ", or running up to the next
 * whitespace or '>'.
 *
 * Returns a newly allocated copy of the name (to be freed by the caller),
 * or NULL when there is no name, when the name runs up to the end of the
 * string, or on allocation failure. *ppsz_attribute_value receives a newly
 * allocated copy of the value, or NULL when there is none.
 *
 * *ppsz_subtitle is only advanced when a name is returned. With an empty
 * value it is left where the value scan stopped (on the closing delimiter
 * for a quoted value).
 */
static char* ConsumeAttribute( const char** ppsz_subtitle, char** ppsz_attribute_value )
{
    const char* psz_subtitle = *ppsz_subtitle;
    *ppsz_attribute_value = NULL;

    while ( *psz_subtitle == ' ' )
        psz_subtitle++;

    /* Attribute name. The <ctype.h> classifiers are only defined for
     * unsigned char values and EOF, and subtitle text may hold bytes
     * >= 0x80, hence the casts throughout this function. */
    const char* psz_name_start = psz_subtitle;
    while ( isalpha( (unsigned char)*psz_subtitle ) )
        psz_subtitle++;
    size_t attr_len = (size_t)( psz_subtitle - psz_name_start );
    if ( *psz_subtitle == '\0' || attr_len == 0 )
        return NULL;

    char* psz_attribute_name = malloc( attr_len + 1 );
    if ( psz_attribute_name == NULL )
        return NULL;
    memcpy( psz_attribute_name, psz_name_start, attr_len );
    psz_attribute_name[attr_len] = '\0';

    // Skip over to the attribute value
    while ( *psz_subtitle != '\0' && *psz_subtitle != '=' )
        psz_subtitle++;
    if ( *psz_subtitle == '\0' )
    {
        // No '=' at all: a name without a value
        *ppsz_subtitle = psz_subtitle;
        return psz_attribute_name;
    }
    // Skip the '=' sign
    psz_subtitle++;

    while ( isspace( (unsigned char)*psz_subtitle ) )
        psz_subtitle++;

    // Acknowledge the delimiter, if any, and skip it
    char delimiter = 0;
    if ( *psz_subtitle == '\'' || *psz_subtitle == '"' )
        delimiter = *psz_subtitle++;

    // Skip spaces, just in case
    while ( isspace( (unsigned char)*psz_subtitle ) )
        psz_subtitle++;

    // A quoted value ends at its delimiter, a bare one at whitespace or '>'
    const char* psz_value_start = psz_subtitle;
    while ( *psz_subtitle != '\0' )
    {
        if ( delimiter != 0 )
        {
            if ( *psz_subtitle == delimiter )
                break;
        }
        else if ( isspace( (unsigned char)*psz_subtitle ) || *psz_subtitle == '>' )
            break;
        psz_subtitle++;
    }
    attr_len = (size_t)( psz_subtitle - psz_value_start );
    if ( attr_len == 0 )
    {
        *ppsz_subtitle = psz_subtitle;
        return psz_attribute_name;
    }

    *ppsz_attribute_value = malloc( attr_len + 1 );
    if ( *ppsz_attribute_value == NULL )
    {
        free( psz_attribute_name );
        return NULL;
    }
    memcpy( *ppsz_attribute_value, psz_value_start, attr_len );
    (*ppsz_attribute_value)[attr_len] = '\0';

    // Finally, skip over the final delimiter
    if ( delimiter != 0 && *psz_subtitle != '\0' )
        psz_subtitle++;
    *ppsz_subtitle = psz_subtitle;
    return psz_attribute_name;
}
582
583
// Returns the name of the tag opening at *ppsz_subtitle and consumes the
// string up to right after that name, or returns NULL without advancing if
// the angle bracket does not open a tag (or on allocation failure).
// A tag name is a letter followed by letters, digits or underscores; spaces
// between '<' (or "</") and the name are tolerated. A leading '/' is only
// accepted when b_closing is true.
// For instance, if *ppsz_subtitle == "<some_tag attribute=value>"
// GetTag will return "some_tag", and *ppsz_subtitle will point to
// " attribute=value>".
// The returned value must be freed.
static char* GetTag( const char** ppsz_subtitle, bool b_closing )
{
    const char* psz_subtitle = *ppsz_subtitle;
    if ( *psz_subtitle != '<' )
        return NULL;
    // Skip the '<'
    psz_subtitle++;
    if ( b_closing && *psz_subtitle == '/' )
        psz_subtitle++;
    // Skip potential spaces
    while ( *psz_subtitle == ' ' )
        psz_subtitle++;
    // Now we need to verify if what comes next is a valid tag.
    // The <ctype.h> classifiers are only defined for unsigned char values
    // and EOF; subtitle text may hold bytes >= 0x80, hence the casts.
    if ( !isalpha( (unsigned char)*psz_subtitle ) )
        return NULL;
    size_t tag_size = 1;
    while ( isalnum( (unsigned char)psz_subtitle[tag_size] ) ||
            psz_subtitle[tag_size] == '_' )
        tag_size++;
    char* psz_tagname = malloc( tag_size + 1 );
    if ( psz_tagname == NULL )
        return NULL;
    memcpy( psz_tagname, psz_subtitle, tag_size );
    psz_tagname[tag_size] = '\0';
    *ppsz_subtitle = psz_subtitle + tag_size;
    return psz_tagname;
}
615
616
/* Tells whether psz_subtitle contains a closing tag for psz_tagname, i.e.
 * "</", optional spaces, the tag name (case-insensitive), optional spaces
 * and '>'.
 *
 * Every "</" candidate is examined, so an earlier plain-text occurrence of
 * the tag name (as in "<foo>foo bar</foo>") does not hide the real closing
 * tag. The scan only moves forward from psz_subtitle, so nothing before
 * the start of the string is ever read.
 */
static bool IsClosed( const char* psz_subtitle, const char* psz_tagname )
{
    const size_t i_namelen = strlen( psz_tagname );
    if ( i_namelen == 0 )
        return false;

    for ( const char* psz_open = strchr( psz_subtitle, '<' );
          psz_open != NULL;
          psz_open = strchr( psz_open + 1, '<' ) )
    {
        const char* psz_cur = psz_open + 1;
        if ( *psz_cur != '/' )
            continue;
        psz_cur++;
        while ( *psz_cur == ' ' )
            psz_cur++;
        // strncasecmp() stops at the terminating NUL, so a short tail is safe
        if ( strncasecmp( psz_cur, psz_tagname, i_namelen ) != 0 )
            continue;
        psz_cur += i_namelen;
        while ( *psz_cur == ' ' )
            psz_cur++;
        if ( *psz_cur == '>' )
            return true;
    }
    return false;
}
637
638
/*
 * Stack of tag names, stored as a singly linked list whose head is the
 * most recently appended entry. Each entry owns its psz_tagname.
 */
typedef struct tag_stack tag_stack_t;
struct tag_stack
{
    char* psz_tagname;
    tag_stack_t *p_next;
};

/* Pushes psz_tagname on top of *pp_stack and takes ownership of it.
 *
 * The function cannot report failure, so the caller has to consider the
 * name handed over in every case: if the entry cannot be allocated the name
 * is freed here instead of being leaked, and the stack is left unchanged.
 */
static void AppendTag( tag_stack_t **pp_stack, char* psz_tagname )
{
    tag_stack_t* p_elem = malloc( sizeof( *p_elem ) );
    if ( p_elem == NULL )
    {
        free( psz_tagname );
        return;
    }
    p_elem->p_next = *pp_stack;
    p_elem->psz_tagname = psz_tagname;
    *pp_stack = p_elem;
}

/* Looks for psz_tagname (case-insensitive) in *pp_stack, starting from the
 * top. The first matching entry is unlinked and freed together with its
 * name, and true is returned. Returns false, leaving the stack untouched,
 * when no entry matches.
 */
static bool HasTag( tag_stack_t **pp_stack, const char* psz_tagname )
{
    /* pp_link always designates the pointer that leads to the entry under
     * examination, so unlinking needs no special case for the head. */
    for ( tag_stack_t **pp_link = pp_stack; *pp_link != NULL;
          pp_link = &(*pp_link)->p_next )
    {
        tag_stack_t *p_current = *pp_link;
        if ( strcasecmp( psz_tagname, p_current->psz_tagname ) != 0 )
            continue;

        *pp_link = p_current->p_next;
        free( p_current->psz_tagname );
        free( p_current );
        return true;
    }
    return false;
}
678
679
/*
680
 * mini style stack implementation
681
 */
682
typedef struct style_stack style_stack_t;
683
struct  style_stack
684
{
685
    text_style_t* p_style;
686
    style_stack_t* p_next;
687
};
688
689
/* Pushes a new style on *pp_stack and returns it: a duplicate of the
 * current top style, or a style created with STYLE_NO_DEFAULTS when the
 * stack is empty. Returns NULL, leaving the stack unchanged, if either
 * allocation fails. */
static text_style_t* DuplicateAndPushStyle(style_stack_t** pp_stack)
{
    style_stack_t* p_top = *pp_stack;

    text_style_t* p_style;
    if ( p_top != NULL )
        p_style = text_style_Duplicate( p_top->p_style );
    else
        p_style = text_style_Create( STYLE_NO_DEFAULTS );
    if ( unlikely( p_style == NULL ) )
        return NULL;

    style_stack_t* p_node = malloc( sizeof( *p_node ) );
    if ( unlikely( p_node == NULL ) )
    {
        text_style_Delete( p_style );
        return NULL;
    }

    // The returned pointer is the one kept in the stack entry; the stack
    // itself never deletes it (see PopStyle).
    p_node->p_style = p_style;
    p_node->p_next = p_top;
    *pp_stack = p_node;
    return p_style;
}
706
707
/* Removes the top entry of *pp_stack, if any. Only the entry is freed: the
 * style it points to is owned by a text_segment_t and is left alone. */
static void PopStyle(style_stack_t** pp_stack)
{
    style_stack_t* p_top = *pp_stack;
    if ( p_top != NULL )
    {
        *pp_stack = p_top->p_next;
        free( p_top );
    }
}
716
717
/* Creates an empty segment, chains it after p_segment and gives it a style
 * freshly pushed on *pp_stack. Returns the new segment, or NULL (with
 * p_segment and the stack untouched) if the segment cannot be allocated.
 *
 * NOTE(review): when the style cannot be allocated the new segment is still
 * chained and returned with a NULL style, and the visible call sites in
 * ParseSubtitles() dereference both the result and its style without a
 * check. */
static text_segment_t* NewTextSegmentPushStyle( text_segment_t* p_segment, style_stack_t** pp_stack )
{
    text_segment_t* p_created = text_segment_New( NULL );
    if ( unlikely( !p_created ) )
        return NULL;

    p_created->style = DuplicateAndPushStyle( pp_stack );
    p_segment->p_next = p_created;
    return p_created;
}
727
728
/* Creates an empty segment, chains it after p_segment, pops the top of
 * *pp_stack and gives the new segment a duplicate of the style that is now
 * on top (or a style created with STYLE_NO_DEFAULTS if the stack became
 * empty). Returns the new segment, or NULL (with p_segment and the stack
 * untouched) if the segment cannot be allocated. */
static text_segment_t* NewTextSegmentPopStyle( text_segment_t* p_segment, style_stack_t** pp_stack )
{
    text_segment_t* p_created = text_segment_New( NULL );
    if ( unlikely( !p_created ) )
        return NULL;

    // The stack should not be empty since this happens when closing a tag,
    // but PopStyle() tolerates it in case of a broken subtitle file.
    PopStyle( pp_stack );

    style_stack_t* p_top = *pp_stack;
    if ( p_top != NULL )
        p_created->style = text_style_Duplicate( p_top->p_style );
    else
        p_created->style = text_style_Create( STYLE_NO_DEFAULTS );

    p_segment->p_next = p_created;
    return p_created;
}
741
742
static text_segment_t* ParseSubtitles( int *pi_align, const char *psz_subtitle )
743
0
{
744
0
    text_segment_t* p_segment;
745
0
    text_segment_t* p_first_segment;
746
0
    style_stack_t* p_stack = NULL;
747
0
    tag_stack_t* p_tag_stack = NULL;
748
749
    //FIXME: Remove initial allocation? Might make the below code more complicated
750
0
    p_first_segment = p_segment = text_segment_New( "" );
751
752
0
    *pi_align = -1;
753
754
    /* */
755
0
    while( *psz_subtitle )
756
0
    {
757
        /* HTML extensions */
758
0
        if( *psz_subtitle == '<' )
759
0
        {
760
0
            char *psz_tagname = GetTag( &psz_subtitle, false );
761
0
            if ( psz_tagname != NULL )
762
0
            {
763
0
                if( !strcasecmp( psz_tagname, "br" ) )
764
0
                {
765
0
                    if ( !AppendCharacter( p_segment, '\n' ) )
766
0
                    {
767
0
                        free( psz_tagname );
768
0
                        goto fail;
769
0
                    }
770
0
                }
771
0
                else if( !strcasecmp( psz_tagname, "b" ) )
772
0
                {
773
0
                    p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
774
0
                    p_segment->style->i_style_flags |= STYLE_BOLD;
775
0
                    p_segment->style->i_features |= STYLE_HAS_FLAGS;
776
0
                }
777
0
                else if( !strcasecmp( psz_tagname, "i" ) )
778
0
                {
779
0
                    p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
780
0
                    p_segment->style->i_style_flags |= STYLE_ITALIC;
781
0
                    p_segment->style->i_features |= STYLE_HAS_FLAGS;
782
0
                }
783
0
                else if( !strcasecmp( psz_tagname, "u" ) )
784
0
                {
785
0
                    p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
786
0
                    p_segment->style->i_style_flags |= STYLE_UNDERLINE;
787
0
                    p_segment->style->i_features |= STYLE_HAS_FLAGS;
788
0
                }
789
0
                else if( !strcasecmp( psz_tagname, "s" ) )
790
0
                {
791
0
                    p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
792
0
                    p_segment->style->i_style_flags |= STYLE_STRIKEOUT;
793
0
                    p_segment->style->i_features |= STYLE_HAS_FLAGS;
794
0
                }
795
0
                else if( !strcasecmp( psz_tagname, "font" ) )
796
0
                {
797
0
                    p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
798
799
0
                    char* psz_attribute_name;
800
0
                    char* psz_attribute_value;
801
802
0
                    while( ( psz_attribute_name = ConsumeAttribute( &psz_subtitle, &psz_attribute_value ) ) )
803
0
                    {
804
0
                        if ( !psz_attribute_value )
805
0
                        {
806
0
                            free( psz_attribute_name );
807
0
                            continue;
808
0
                        }
809
0
                        if ( !strcasecmp( psz_attribute_name, "face" ) )
810
0
                        {
811
0
                            free(p_segment->style->psz_fontname);
812
0
                            p_segment->style->psz_fontname = psz_attribute_value;
813
                            // We don't want to free the attribute value since it has become our fontname
814
0
                            psz_attribute_value = NULL;
815
0
                        }
816
0
                        else if ( !strcasecmp( psz_attribute_name, "family" ) )
817
0
                        {
818
0
                            free(p_segment->style->psz_monofontname);
819
0
                            p_segment->style->psz_monofontname = psz_attribute_value;
820
0
                            psz_attribute_value = NULL;
821
0
                        }
822
0
                        else if ( !strcasecmp( psz_attribute_name, "size" ) )
823
0
                        {
824
0
                            int size = atoi( psz_attribute_value );
825
0
                            if( size )
826
0
                            {
827
0
                                p_segment->style->i_font_size = size;
828
0
                                p_segment->style->f_font_relsize = STYLE_DEFAULT_REL_FONT_SIZE *
829
0
                                        STYLE_DEFAULT_FONT_SIZE / p_segment->style->i_font_size;
830
0
                            }
831
0
                        }
832
0
                        else if ( !strcasecmp( psz_attribute_name, "color" ) )
833
0
                        {
834
0
                            p_segment->style->i_font_color = vlc_html_color( psz_attribute_value, NULL );
835
0
                            p_segment->style->i_features |= STYLE_HAS_FONT_COLOR;
836
0
                        }
837
0
                        else if ( !strcasecmp( psz_attribute_name, "outline-color" ) )
838
0
                        {
839
0
                            p_segment->style->i_outline_color = vlc_html_color( psz_attribute_value, NULL );
840
0
                            p_segment->style->i_features |= STYLE_HAS_OUTLINE_COLOR;
841
0
                        }
842
0
                        else if ( !strcasecmp( psz_attribute_name, "shadow-color" ) )
843
0
                        {
844
0
                            p_segment->style->i_shadow_color = vlc_html_color( psz_attribute_value, NULL );
845
0
                            p_segment->style->i_features |= STYLE_HAS_SHADOW_COLOR;
846
0
                        }
847
0
                        else if ( !strcasecmp( psz_attribute_name, "outline-level" ) )
848
0
                        {
849
0
                            p_segment->style->i_outline_width = atoi( psz_attribute_value );
850
0
                        }
851
0
                        else if ( !strcasecmp( psz_attribute_name, "shadow-level" ) )
852
0
                        {
853
0
                            p_segment->style->i_shadow_width = atoi( psz_attribute_value );
854
0
                        }
855
0
                        else if ( !strcasecmp( psz_attribute_name, "back-color" ) )
856
0
                        {
857
0
                            p_segment->style->i_background_color = vlc_html_color( psz_attribute_value, NULL );
858
0
                            p_segment->style->i_features |= STYLE_HAS_BACKGROUND_COLOR;
859
0
                        }
860
0
                        else if ( !strcasecmp( psz_attribute_name, "alpha" ) )
861
0
                        {
862
0
                            p_segment->style->i_font_alpha = atoi( psz_attribute_value );
863
0
                            p_segment->style->i_features |= STYLE_HAS_FONT_ALPHA;
864
0
                        }
865
866
0
                        free( psz_attribute_name );
867
0
                        free( psz_attribute_value );
868
0
                    }
869
0
                }
870
0
                else
871
0
                {
872
                    // This is an unknown tag. We need to hide it if it's properly closed, and display it otherwise
873
0
                    if ( !IsClosed( psz_subtitle, psz_tagname ) )
874
0
                    {
875
0
                        AppendCharacter( p_segment, '<' );
876
0
                        AppendString( p_segment, psz_tagname );
877
0
                        AppendCharacter( p_segment, '>' );
878
0
                    }
879
0
                    else
880
0
                    {
881
0
                        AppendTag( &p_tag_stack, psz_tagname );
882
                        // We don't want to free the tagname now, it will be freed when the tag
883
                        // gets poped from the stack.
884
0
                        psz_tagname = NULL;
885
0
                    }
886
                    // In any case, fall through and skip to the closing tag.
887
0
                }
888
                // Skip potential spaces & end tag
889
0
                while ( *psz_subtitle && *psz_subtitle != '>' )
890
0
                    psz_subtitle++;
891
0
                if ( *psz_subtitle == '>' )
892
0
                    psz_subtitle++;
893
894
0
                free( psz_tagname );
895
0
            }
896
0
            else if( !strncmp( psz_subtitle, "</", 2 ))
897
0
            {
898
0
                char* psz_closetagname = GetTag( &psz_subtitle, true );
899
0
                if ( psz_closetagname != NULL )
900
0
                {
901
0
                    if ( !strcasecmp( psz_closetagname, "b" ) ||
902
0
                         !strcasecmp( psz_closetagname, "i" ) ||
903
0
                         !strcasecmp( psz_closetagname, "u" ) ||
904
0
                         !strcasecmp( psz_closetagname, "s" ) ||
905
0
                         !strcasecmp( psz_closetagname, "font" ) )
906
0
                    {
907
                        // A closing tag for one of the tags we handle, meaning
908
                        // we pushed a style onto the stack earlier
909
0
                        p_segment = NewTextSegmentPopStyle( p_segment, &p_stack );
910
0
                    }
911
0
                    else
912
0
                    {
913
                        // Unknown closing tag. If it is closing an unknown tag, ignore it. Otherwise, display it
914
0
                        if ( !HasTag( &p_tag_stack, psz_closetagname ) )
915
0
                        {
916
0
                            AppendString( p_segment, "</" );
917
0
                            AppendString( p_segment, psz_closetagname );
918
0
                            AppendCharacter( p_segment, '>' );
919
0
                        }
920
0
                    }
921
0
                    while ( *psz_subtitle == ' ' )
922
0
                        psz_subtitle++;
923
0
                    if ( *psz_subtitle == '>' )
924
0
                        psz_subtitle++;
925
0
                    free( psz_closetagname );
926
0
                }
927
0
                else
928
0
                {
929
                    /**
930
                      * This doesn't appear to be a valid tag closing syntax.
931
                      * Simply append the text
932
                      */
933
0
                    AppendString( p_segment, "</" );
934
0
                    psz_subtitle += 2;
935
0
                }
936
0
            }
937
0
            else
938
0
            {
939
                /* We have an unknown tag, just append it, and move on.
940
                 * The rest of the string won't be recognized as a tag, and
941
                 * we will ignore unknown closing tag
942
                 */
943
0
                AppendCharacter( p_segment, '<' );
944
0
                psz_subtitle++;
945
0
            }
946
0
        }
947
        /* SSA extensions */
948
0
        else if( psz_subtitle[0] == '{' && psz_subtitle[1] == '\\' &&
949
0
                 strchr( psz_subtitle, '}' ) )
950
0
        {
951
            /* Check for forced alignment */
952
0
            if( *pi_align < 0 &&
953
0
                !strncmp( psz_subtitle, "{\\an", 4 ) && psz_subtitle[4] >= '1' && psz_subtitle[4] <= '9' && psz_subtitle[5] == '}' )
954
0
            {
955
0
                static const int pi_vertical[3] = { SUBPICTURE_ALIGN_BOTTOM, 0, SUBPICTURE_ALIGN_TOP };
956
0
                static const int pi_horizontal[3] = { SUBPICTURE_ALIGN_LEFT, 0, SUBPICTURE_ALIGN_RIGHT };
957
0
                const int i_id = psz_subtitle[4] - '1';
958
959
0
                *pi_align = pi_vertical[i_id/3] | pi_horizontal[i_id%3];
960
0
            }
961
            /* TODO fr -> rotation */
962
963
            /* Hide {\stupidity} */
964
0
            psz_subtitle = strchr( psz_subtitle, '}' ) + 1;
965
0
        }
966
        /* MicroDVD extensions */
967
        /* FIXME:
968
         *  - Currently, we don't do difference between X and x, and we should:
969
         *    Capital Letters applies to the whole text and not one line
970
         *  - We don't support Position and Coordinates
971
         *  - We don't support the DEFAULT flag (HEADER)
972
         */
973
974
0
        else if( psz_subtitle[0] == '{' && psz_subtitle[1] != 0 &&
975
0
                 psz_subtitle[2] == ':' && strchr( &psz_subtitle[2], '}' ) )
976
0
        {
977
0
            const char *psz_tag_end = strchr( &psz_subtitle[2], '}' );
978
0
            size_t i_len = psz_tag_end - &psz_subtitle[3];
979
980
0
            if( psz_subtitle[1] == 'Y' || psz_subtitle[1] == 'y' )
981
0
            {
982
0
                if( psz_subtitle[3] == 'i' )
983
0
                {
984
0
                    p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
985
0
                    p_segment->style->i_style_flags |= STYLE_ITALIC;
986
0
                    p_segment->style->i_features |= STYLE_HAS_FLAGS;
987
0
                    psz_subtitle++;
988
0
                }
989
0
                if( psz_subtitle[3] == 'b' )
990
0
                {
991
0
                    p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
992
0
                    p_segment->style->i_style_flags |= STYLE_BOLD;
993
0
                    p_segment->style->i_features |= STYLE_HAS_FLAGS;
994
0
                    psz_subtitle++;
995
0
                }
996
0
                if( psz_subtitle[3] == 'u' )
997
0
                {
998
0
                    p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
999
0
                    p_segment->style->i_style_flags |= STYLE_UNDERLINE;
1000
0
                    p_segment->style->i_features |= STYLE_HAS_FLAGS;
1001
0
                    psz_subtitle++;
1002
0
                }
1003
0
            }
1004
0
            else if( (psz_subtitle[1] == 'C' || psz_subtitle[1] == 'c' )
1005
0
                    && psz_subtitle[3] == '$' && i_len >= 7 )
1006
0
            {
1007
                /* Yes, they use BBGGRR, instead of RRGGBB */
1008
0
                char psz_color[7];
1009
0
                psz_color[0] = psz_subtitle[8]; psz_color[1] = psz_subtitle[9];
1010
0
                psz_color[2] = psz_subtitle[6]; psz_color[3] = psz_subtitle[7];
1011
0
                psz_color[4] = psz_subtitle[4]; psz_color[5] = psz_subtitle[5];
1012
0
                psz_color[6] = '\0';
1013
0
                p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
1014
0
                p_segment->style->i_font_color = vlc_html_color( psz_color, NULL );
1015
0
                p_segment->style->i_features |= STYLE_HAS_FONT_COLOR;
1016
0
            }
1017
0
            else if( psz_subtitle[1] == 'F' || psz_subtitle[1] == 'f' )
1018
0
            {
1019
0
                p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
1020
0
                free(p_segment->style->psz_fontname);
1021
0
                p_segment->style->psz_fontname = strndup( &psz_subtitle[3], i_len );
1022
0
            }
1023
0
            else if( psz_subtitle[1] == 'S' || psz_subtitle[1] == 's' )
1024
0
            {
1025
0
                int size = atoi( &psz_subtitle[3] );
1026
0
                if( size )
1027
0
                {
1028
0
                    p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
1029
0
                    p_segment->style->i_font_size = size;
1030
0
                    p_segment->style->f_font_relsize = STYLE_DEFAULT_REL_FONT_SIZE *
1031
0
                                STYLE_DEFAULT_FONT_SIZE / p_segment->style->i_font_size;
1032
1033
0
                }
1034
0
            }
1035
            /* Currently unsupported since we don't have access to the i_align flag here
1036
            else if( psz_subtitle[1] == 'P' )
1037
            {
1038
                if( psz_subtitle[3] == "1" )
1039
                    i_align = SUBPICTURE_ALIGN_TOP;
1040
                else if( psz_subtitle[3] == "0" )
1041
                    i_align = SUBPICTURE_ALIGN_BOTTOM;
1042
            } */
1043
            // Hide other {x:y} atrocities, notably {o:x}
1044
0
            psz_subtitle = psz_tag_end + 1;
1045
0
        }
1046
0
        else
1047
0
        {
1048
0
            if( *psz_subtitle == '\n' || !strncasecmp( psz_subtitle, "\\n", 2 ) )
1049
0
            {
1050
0
                if ( !AppendCharacter( p_segment, '\n' ) )
1051
0
                    goto fail;
1052
0
                if ( *psz_subtitle == '\n' )
1053
0
                    psz_subtitle++;
1054
0
                else
1055
0
                    psz_subtitle += 2;
1056
0
            }
1057
0
            else if( !strncasecmp( psz_subtitle, "\\h", 2 ) )
1058
0
            {
1059
0
                if ( !AppendString( p_segment, "\xC2\xA0" ) )
1060
0
                    goto fail;
1061
0
                psz_subtitle += 2;
1062
0
            }
1063
0
            else
1064
0
            {
1065
                //FIXME: Highly inneficient
1066
0
                AppendCharacter( p_segment, *psz_subtitle );
1067
0
                psz_subtitle++;
1068
0
            }
1069
0
        }
1070
0
    }
1071
0
    while ( p_stack )
1072
0
        PopStyle( &p_stack );
1073
0
    while ( p_tag_stack )
1074
0
    {
1075
0
        tag_stack_t *p_tag = p_tag_stack;
1076
0
        p_tag_stack = p_tag_stack->p_next;
1077
0
        free( p_tag->psz_tagname );
1078
0
        free( p_tag );
1079
0
    }
1080
1081
0
    return p_first_segment;
1082
1083
0
fail:
1084
0
    text_segment_ChainDelete( p_first_segment );
1085
0
    return NULL;
1086
0
}