/src/vlc/modules/demux/dvb-text.h
Line | Count | Source |
1 | | /***************************************************************************** |
2 | | * dvb-text.h: |
3 | | ***************************************************************************** |
4 | | * Copyright (C) 2007-2011 VLC authors and VideoLAN |
5 | | * |
6 | | * This program is free software; you can redistribute it and/or modify it |
7 | | * under the terms of the GNU Lesser General Public License as published by |
8 | | * the Free Software Foundation; either version 2.1 of the License, or |
9 | | * (at your option) any later version. |
10 | | * |
11 | | * This program is distributed in the hope that it will be useful, |
12 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | | * GNU Lesser General Public License for more details. |
15 | | * |
16 | | * You should have received a copy of the GNU Lesser General Public License |
17 | | * along with this program; if not, write to the Free Software Foundation, |
18 | | * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA. |
19 | | *****************************************************************************/ |
20 | | |
/**
 * Converts a DVB SI text item to UTF-8.
 * Refer to ETSI EN 300 468 annex A.
 *
 * The first byte of the item selects the source character encoding:
 *  - 0x20 and above: no selector byte, the whole item is ISO 6937 text;
 *  - 0x01-0x07, 0x09-0x0B: ISO 8859-(selector + 4);
 *  - 0x10: two more bytes follow, 0x00 then N, selecting ISO 8859-N
 *          (N in 1-11 or 13-15);
 *  - 0x11, 0x14: UCS-2 big endian;
 *  - 0x12: EUC-KR;
 *  - 0x13: GB2312;
 *  - 0x15: UTF-8.
 * Any other selector is rejected.
 *
 * The payload is converted with FromCharset(). If that fails, the raw bytes
 * are duplicated and passed through EnsureUTF8() instead. DVB control codes
 * are then rewritten in place: U+008A and U+E08A become line breaks, and the
 * character emphasis codes U+0086, U+0087, U+E086 and U+E087 are removed.
 *
 * @param buf    DVB SI text item (selector byte(s) followed by the text)
 * @param length size of the item in bytes
 * @return a heap-allocated nul-terminated UTF-8 string (to be released by the
 *         caller with free()), or NULL on error (empty item, unsupported or
 *         malformed selector, or memory allocation failure).
 */
static char *vlc_from_EIT (const void *buf, size_t length)
{
    if (unlikely(length == 0))
        return NULL;

    char encbuf[12];
    const char *encoding = encbuf;

    const char *in = buf;
    size_t offset = 1;
    unsigned char c = *in;

    if (c >= 0x20)
    {
        /* No selector byte: the first byte is already text. */
        offset = 0;
        encoding = "ISO_6937";
    }
    /* Here c < 0x20. The shift is done on an unsigned operand so that
     * c == 0x1F does not overflow a signed int. */
    else if ((1u << c) & 0x0EFE) /* 1-7, 9-11 -> ISO 8859-(c+4) */
    {
        snprintf (encbuf, sizeof (encbuf), "ISO_8859-%u", 4u + c);
    }
    else switch (c)
    {
        case 0x10: /* two more bytes */
            offset = 3;
            if (length < 3 || in[1] != 0x00)
                return NULL;

            c = in[2];
            /* c comes straight from the stream and may be any value up to
             * 255: bound it before using it as a shift count. */
            if (c >= 16 || !((1u << c) & 0xEFFE))
                return NULL;
            /* 1-11, 13-15 -> ISO 8859-(c) */
            snprintf (encbuf, sizeof (encbuf), "ISO_8859-%hhu", c);
            break;
        case 0x11: /* the BMP */
        case 0x14: /* Big5 subset of the BMP */
            encoding = "UCS-2BE";
            break;
        case 0x12:
            /* DVB has no clue about Korean. KS X 1001 (a.k.a. KS C 5601) is a
             * character set, not a character encoding... So we assume EUC-KR.
             * It is an encoding of KS X 1001. In practice, I guess nobody uses
             * this in any real DVB system. */
            encoding = "EUC-KR";
            break;
        case 0x13: /* GB-2312-1980 */
            encoding = "GB2312";
            break;
        case 0x15:
            encoding = "UTF-8";
            break;
        default:
            /* This includes 0x1F (operator-specific?), which would carry
             * one extra selector byte and is not supported. */
            return NULL;
    }

    in += offset;
    length -= offset;

    char *out = FromCharset (encoding, in, length);
    if (out == NULL)
    {   /* Fallback: keep the raw bytes and let EnsureUTF8() fix them up. */
        out = strndup (in, length);
        if (unlikely(out == NULL))
            return NULL;
        EnsureUTF8 (out);
    }

    /* From here on, length is the byte length of the UTF-8 output. */
    length = strlen (out);

    /* Convert control codes (U+0080-U+009F, encoded as 0xC2 0x80-0x9F).
     * 0x80-0x85,0x88-0x89 are reserved.
     * 0x86-0x87 are character emphasis on/off.
     * 0x8A is CR/LF.
     * 0x8B-0x9F are unspecified. */
    for (char *p = strchr (out, '\xC2'); p != NULL; )
    {
        /* We assume valid UTF-8, so 0xC2 is followed by a continuation
         * byte and p[1] is within the string. */
        if (p[1] == '\x8A')
            memcpy (p, "\r\n", 2);
        else if (p[1] == '\x86' || p[1] == '\x87')
        {
            /* Strip character emphasis: move the tail, including the
             * terminating nul, over the two-byte sequence. */
            memmove (p, p + 2, length - (size_t)(p - out) - 1);
            length -= 2;
            /* Resume at p itself: the bytes that just moved there may start
             * another control code. */
            p = strchr (p, '\xC2');
            continue;
        }
        p = strchr (p + 1, '\xC2');
    }

    /* Private use area: same control codes at U+E080-U+E09F,
     * encoded as 0xEE 0x82 0x80-0x9F. */
    for (char *p = strchr (out, '\xEE'); p != NULL; )
    {
        /* Within UTF-8, 0xEE is followed by two continuation bytes. */
        if (p[1] == '\x82')
        {
            if (p[2] == '\x8A')
                memcpy (p, "\r\r\n", 3); /* we need three bytes, so two CRs ;) */
            else if (p[2] == '\x86' || p[2] == '\x87')
            {
                /* Strip character emphasis: move the tail, including the
                 * terminating nul, over the three-byte sequence. */
                memmove (p, p + 3, length - (size_t)(p - out) - 2);
                length -= 3;
                p = strchr (p, '\xEE');
                continue;
            }
        }
        p = strchr (p + 1, '\xEE');
    }

    return out;
}