/src/vlc/modules/demux/dvb-text.h

Source
/*****************************************************************************
 * dvb-text.h:
 *****************************************************************************
 * Copyright (C) 2007-2011 VLC authors and VideoLAN
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

/**
 * Converts a DVB SI text item to UTF-8.
 * Refer to ETSI EN 300 468 annex A.
 *
 * The first byte of the item selects the source character encoding:
 *  - 0x20 and above: there is no selector byte, the whole item is ISO 6937;
 *  - 0x01-0x07 and 0x09-0x0B: ISO 8859-(byte + 4);
 *  - 0x10 0x00 N with N in 1-11 or 13-15: ISO 8859-N (three-byte selector);
 *  - 0x11 and 0x14: UCS-2 big endian;
 *  - 0x12: assumed to be EUC-KR;
 *  - 0x13: GB2312;
 *  - 0x15: UTF-8;
 *  - anything else is rejected.
 *
 * After conversion, the DVB control codes are post-processed in place:
 * U+008A and U+E08A become CR/LF, and the character emphasis markers
 * U+0086, U+0087, U+E086 and U+E087 are removed.
 *
 * @param buf raw text item (not necessarily nul-terminated)
 * @param length size of the text item in bytes
 * @return a heap-allocated nul-terminated UTF-8 string or NULL on error
 *         (empty item, unsupported selector or allocation failure).
 */
static char *vlc_from_EIT (const void *buf, size_t length)
{
    if (unlikely(length == 0))
        return NULL;

    char encbuf[12];
    const char *encoding = encbuf;

    const char *in = buf;
    size_t offset = 1;
    unsigned char c = *in;

    if (c >= 0x20)
    {
        offset = 0;
        encoding = "ISO_6937";
    }
    /* The range test keeps the shift count small; the mask is unsigned so
     * that the shift never overflows a signed int. */
    else if (c <= 0x0B && ((1u << c) & 0x0EFEu)) /* 1-7, 9-11 -> ISO 8859-(c+4) */
    {
        snprintf (encbuf, sizeof (encbuf), "ISO_8859-%u", 4u + c);
    }
    else switch (c)
    {
        case 0x10: /* two more bytes */
            offset = 3;
            if (length < 3 || in[1] != 0x00)
                return NULL;

            c = (unsigned char)in[2];
            /* The third byte is arbitrary input: bound it before shifting,
             * as shifting by the width of the type or more is undefined. */
            if (c < 16 && ((1u << c) & 0xEFFEu)) /* 1-11, 13-15 -> ISO 8859-(c) */
                snprintf (encbuf, sizeof (encbuf), "ISO_8859-%hhu", c);
            else
                return NULL;
            break;
        case 0x11: /* the BMP */
        case 0x14: /* Big5 subset of the BMP */
            encoding = "UCS-2BE";
            break;
        case 0x12:
            /* DVB has no clue about Korean. KS X 1001 (a.k.a. KS C 5601) is a
             * character set, not a character encoding... So we assume EUC-KR.
             * It is an encoding of KS X 1001. In practice, I guess nobody uses
             * this in any real DVB system. */
            encoding = "EUC-KR";
            break;
        case 0x13: /* GB-2312-1980 */
            encoding = "GB2312";
            break;
        case 0x15:
            encoding = "UTF-8";
            break;
#if 0
        case 0x1F: /* operator-specific(?) */
            offset = 2;
#endif
        default:
            return NULL;
    }

    /* Skip the encoding selector, if any */
    in += offset;
    length -= offset;

    char *out = FromCharset (encoding, in, length);
    if (out == NULL)
    {   /* Fallback: keep the raw bytes and force them into valid UTF-8 */
        out = strndup (in, length);
        if (unlikely(out == NULL))
            return NULL;
        EnsureUTF8 (out);
    }

    /* From here on, length is the byte length of the output string */
    length = strlen (out);

    /* Convert control codes */
    char *p = strchr (out, '\xC2');
    while (p != NULL)
    {
        /* We have valid UTF-8, so 0xC2 is followed by a continuation byte. */
        /* 0x80-0x85,0x88-0x89 are reserved.
         * 0x86-0x87 are identical to Unicode and Latin-1.
         * 0x8A is CR/LF.
         * 0x8B-0x9F are unspecified. */
        if (p[1] == '\x8A')
            memcpy (p, "\r\n", 2);
        else if (p[1] == '\x86' || p[1] == '\x87')
        {
            /* Strip character emphasis: move the tail, including its
             * terminating nul and nothing more, over the 2-byte sequence. */
            const size_t n = p - out;
            memmove (p, p + 2, length - n - 1);
            length -= 2;
            /* p now holds the following character: rescan from p itself so
             * that back-to-back markers are all removed. */
            p = strchr (p, '\xC2');
            continue;
        }
        p = strchr (p + 1, '\xC2');
    }

    /* Private use area */
    p = strchr (out, '\xEE');
    while (p != NULL)
    {
        /* Within UTF-8, 0xEE is followed by two continuation bytes. */
        if (p[1] == '\x82')
        {
            if (p[2] == '\x8A')
                memcpy (p, "\r\r\n", 3); /* we need three bytes, so two CRs ;) */
            else if (p[2] == '\x86' || p[2] == '\x87')
            {
                /* Strip character emphasis: move the tail, including its
                 * terminating nul and nothing more, over the 3-byte
                 * sequence. */
                const size_t n = p - out;
                memmove (p, p + 3, length - n - 2);
                length -= 3;
                /* Rescan from p: the next character has moved there. */
                p = strchr (p, '\xEE');
                continue;
            }
        }
        p = strchr (p + 1, '\xEE');
    }

    return out;
}

Coverage Report

Created: 2026-04-12 07:27

Line	Count	Source
1		/*****************************************************************************
2		* dvb-text.h:
3		*****************************************************************************
4		* Copyright (C) 2007-2011 VLC authors and VideoLAN
5		*
6		* This program is free software; you can redistribute it and/or modify it
7		* under the terms of the GNU Lesser General Public License as published by
8		* the Free Software Foundation; either version 2.1 of the License, or
9		* (at your option) any later version.
10		*
11		* This program is distributed in the hope that it will be useful,
12		* but WITHOUT ANY WARRANTY; without even the implied warranty of
13		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		* GNU Lesser General Public License for more details.
15		*
16		* You should have received a copy of the GNU Lesser General Public License
17		* along with this program; if not, write to the Free Software Foundation,
18		* Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
19		*****************************************************************************/
20
21		/**
22		* Converts a DVB SI text item to UTF-8.
23		* Refer to ETSI EN 300 468 annex A.
24		* @return a heap-allocation nul-terminated UTF-8 string or NULL on error.
25		*/
26		static char *vlc_from_EIT (const void *buf, size_t length)
27	770	{
28	770	if (unlikely(length == 0))
29	0	return NULL;
30
31	770	char encbuf[12];
32	770	const char *encoding = encbuf;
33
34	770	const char *in = buf;
35	770	size_t offset = 1;
36	770	unsigned char c = *in;
37
38	770	if (c >= 0x20)
39	770	{
40	770	offset = 0;
41	770	encoding = "ISO_6937";
42	770	}
43	0	else if ((1 << c) & 0x0EFE) /* 1-7, 9-11 -> ISO 8859-(c+4) */
44	0	{
45	0	snprintf (encbuf, sizeof (encbuf), "ISO_8859-%u", 4u + c);
46	0	}
47	0	else switch (c)
48	0	{
49	0	case 0x10: /* two more bytes */
50	0	offset = 3;
51	0	if (length < 3 || in[1] != 0x00)
52	0	return NULL;
53
54	0	c = in[2];
55	0	if ((1 << c) & 0xEFFE) /* 1-11, 13-15 -> ISO 8859-(c) */
56	0	snprintf (encbuf, sizeof (encbuf), "ISO_8859-%hhu", c);
57	0	else
58	0	return NULL;
59	0	break;
60	0	case 0x11: /* the BMP */
61	0	case 0x14: /* Big5 subset of the BMP */
62	0	encoding = "UCS-2BE";
63	0	break;
64	0	case 0x12:
65		/* DVB has no clue about Korean. KS X 1001 (a.k.a. KS C 5601) is a
66		* character set, not a character encoding... So we assume EUC-KR.
67		* It is an encoding of KS X 1001. In practice, I guess nobody uses
68		* this in any real DVB system. */
69	0	encoding = "EUC-KR";
70	0	break;
71	0	case 0x13: /* GB-2312-1980 */
72	0	encoding = "GB2312";
73	0	break;
74	0	case 0x15:
75	0	encoding = "UTF-8";
76	0	break;
77		#if 0
78		case 0x1F: /* operator-specific(?) */
79		offset = 2;
80		#endif
81	0	default:
82	0	return NULL;
83	0	}
84
85	770	in += offset;
86	770	length -= offset;
87
88	770	char *out = FromCharset (encoding, in, length);
89	770	if (out == NULL)
90	0	{ /* Fallback... */
91	0	out = strndup (in, length);
92	0	if (unlikely(out == NULL))
93	0	return NULL;
94	0	EnsureUTF8 (out);
95	0	}
96
97	770	length = strlen(out);
98		/* Convert control codes */
99	770	for (char *p = strchr (out, '\xC2'); p; p = strchr (p + 1, '\xC2'))
100	0	{
101		/* We have valid UTF-8, to 0xC2 is followed by a continuation byte. */
102		/* 0x80-0x85,0x88-0x89 are reserved.
103		* 0x86-0x87 are identical to Unicode and Latin-1.
104		* 0x8A is CR/LF.
105		* 0x8B-0x9F are unspecified. */
106	0	if (p[1] == '\x8A')
107	0	memcpy (p, "\r\n", 2);
108
109		/* Strip character emphasis */
110	0	if (p[1] == '\x86' || p[1] == '\x87') {
111	0	const size_t n = p - out;
112	0	memmove (p, p+2, length - n);
113	0	length -= 2;
114	0	out[length] = '\0';
115	0	if (length == n)
116	0	break;
117	0	}
118	0	}
119
120		/* Private use area */
121	770	for (char *p = strchr (out, '\xEE'); p; p = strchr (p + 1, '\xEE'))
122	0	{
123		/* Within UTF-8, 0xEE is followed by a two continuation bytes. */
124	0	if (p[1] != '\x82')
125	0	continue;
126	0	if (p[2] == '\x8A')
127	0	memcpy (p, "\r\r\n", 3); /* we need three bytes, so to CRs ;) */
128
129		/* Strip character emphasis */
130	0	if (p[2] == '\x86' || p[2] == '\x87') {
131	0	const size_t n = p - out;
132	0	memmove (p, p+3, length - n);
133	0	length -= 3;
134	0	out[length] = '\0';
135	0	if (length == n)
136	0	break;
137	0	}
138	0	}
139
140	770	return out;
141	770	}