/src/xpdf-4.06/xpdf/UTF8.cc
Line | Count | Source |
1 | | //======================================================================== |
2 | | // |
3 | | // UTF8.cc |
4 | | // |
5 | | // Copyright 2001-2017 Glyph & Cog, LLC |
6 | | // |
7 | | //======================================================================== |
8 | | |
9 | | #include <aconf.h> |
10 | | #include "UTF8.h" |
11 | | |
12 | 37.4k | int mapUTF8(Unicode u, char *buf, int bufSize) { |
13 | 37.4k | if (u <= 0x0000007f) { |
14 | 34.3k | if (bufSize < 1) { |
15 | 0 | return 0; |
16 | 0 | } |
17 | 34.3k | buf[0] = (char)u; |
18 | 34.3k | return 1; |
19 | 34.3k | } else if (u <= 0x000007ff) { |
20 | 948 | if (bufSize < 2) { |
21 | 0 | return 0; |
22 | 0 | } |
23 | 948 | buf[0] = (char)(0xc0 + (u >> 6)); |
24 | 948 | buf[1] = (char)(0x80 + (u & 0x3f)); |
25 | 948 | return 2; |
26 | 2.18k | } else if (u <= 0x0000ffff) { |
27 | 1.99k | if (bufSize < 3) { |
28 | 0 | return 0; |
29 | 0 | } |
30 | 1.99k | buf[0] = (char)(0xe0 + (u >> 12)); |
31 | 1.99k | buf[1] = (char)(0x80 + ((u >> 6) & 0x3f)); |
32 | 1.99k | buf[2] = (char)(0x80 + (u & 0x3f)); |
33 | 1.99k | return 3; |
34 | 1.99k | } else if (u <= 0x0010ffff) { |
35 | 83 | if (bufSize < 4) { |
36 | 0 | return 0; |
37 | 0 | } |
38 | 83 | buf[0] = (char)(0xf0 + (u >> 18)); |
39 | 83 | buf[1] = (char)(0x80 + ((u >> 12) & 0x3f)); |
40 | 83 | buf[2] = (char)(0x80 + ((u >> 6) & 0x3f)); |
41 | 83 | buf[3] = (char)(0x80 + (u & 0x3f)); |
42 | 83 | return 4; |
43 | 106 | } else { |
44 | 106 | return 0; |
45 | 106 | } |
46 | 37.4k | } |
47 | | |
48 | 0 | int mapUCS2(Unicode u, char *buf, int bufSize) { |
49 | 0 | if (u <= 0xffff) { |
50 | 0 | if (bufSize < 2) { |
51 | 0 | return 0; |
52 | 0 | } |
53 | 0 | buf[0] = (char)((u >> 8) & 0xff); |
54 | 0 | buf[1] = (char)(u & 0xff); |
55 | 0 | return 2; |
56 | 0 | } else { |
57 | 0 | return 0; |
58 | 0 | } |
59 | 0 | } |
60 | | |
61 | 3.41M | GBool getUTF8(GString *s, int *i, Unicode *u) { |
62 | 3.41M | Guchar c0, c1, c2, c3, c4, c5; |
63 | | |
64 | 3.41M | if (*i >= s->getLength()) { |
65 | 962 | return gFalse; |
66 | 962 | } |
67 | 3.41M | c0 = (Guchar)s->getChar((*i)++); |
68 | 3.41M | if (c0 < 0x80) { |
69 | 2.61M | *u = (Unicode)c0; |
70 | 2.61M | } else if (c0 < 0xe0) { |
71 | 521k | if (*i < s->getLength() && |
72 | 521k | ((c1 = (Guchar)s->getChar(*i)) & 0xc0) == 0x80) { |
73 | 190k | *i += 1; |
74 | 190k | *u = (Unicode)(((c0 & 0x1f) << 6) | |
75 | 190k | (c1 & 0x3f)); |
76 | 330k | } else { |
77 | 330k | *u = (Unicode)c0; |
78 | 330k | } |
79 | 521k | } else if (c0 < 0xf0) { |
80 | 100k | if (*i < s->getLength() - 1 && |
81 | 100k | ((c1 = (Guchar)s->getChar(*i)) & 0xc0) == 0x80 && |
82 | 14.5k | ((c2 = (Guchar)s->getChar(*i + 1)) & 0xc0) == 0x80) { |
83 | 5.36k | *i += 2; |
84 | 5.36k | *u = (Unicode)(((c0 & 0x0f) << 12) | |
85 | 5.36k | ((c1 & 0x3f) << 6) | |
86 | 5.36k | (c2 & 0x3f)); |
87 | 95.5k | } else { |
88 | 95.5k | *u = (Unicode)c0; |
89 | 95.5k | } |
90 | 174k | } else if (c0 < 0xf8) { |
91 | 24.6k | if (*i < s->getLength() - 2 && |
92 | 24.5k | ((c1 = (Guchar)s->getChar(*i)) & 0xc0) == 0x80 && |
93 | 8.47k | ((c2 = (Guchar)s->getChar(*i + 1)) & 0xc0) == 0x80 && |
94 | 3.63k | ((c3 = (Guchar)s->getChar(*i + 2)) & 0xc0) == 0x80) { |
95 | 1.67k | *i += 3; |
96 | 1.67k | *u = (Unicode)(((c0 & 0x07) << 18) | |
97 | 1.67k | ((c1 & 0x3f) << 12) | |
98 | 1.67k | ((c2 & 0x3f) << 6) | |
99 | 1.67k | (c3 & 0x3f)); |
100 | 23.0k | } else { |
101 | 23.0k | *u = (Unicode)c0; |
102 | 23.0k | } |
103 | 149k | } else if (c0 < 0xfc) { |
104 | 7.85k | if (*i < s->getLength() - 3 && |
105 | 7.77k | ((c1 = (Guchar)s->getChar(*i)) & 0xc0) == 0x80 && |
106 | 1.71k | ((c2 = (Guchar)s->getChar(*i + 1)) & 0xc0) == 0x80 && |
107 | 866 | ((c3 = (Guchar)s->getChar(*i + 2)) & 0xc0) == 0x80 && |
108 | 765 | ((c4 = (Guchar)s->getChar(*i + 3)) & 0xc0) == 0x80) { |
109 | 662 | *i += 4; |
110 | 662 | *u = (Unicode)(((c0 & 0x03) << 24) | |
111 | 662 | ((c1 & 0x3f) << 18) | |
112 | 662 | ((c2 & 0x3f) << 12) | |
113 | 662 | ((c3 & 0x3f) << 6) | |
114 | 662 | (c4 & 0x3f)); |
115 | 7.19k | } else { |
116 | 7.19k | *u = (Unicode)c0; |
117 | 7.19k | } |
118 | 141k | } else if (c0 < 0xfe) { |
119 | 23.2k | if (*i < s->getLength() - 4 && |
120 | 22.8k | ((c1 = (Guchar)s->getChar(*i)) & 0xc0) == 0x80 && |
121 | 3.60k | ((c2 = (Guchar)s->getChar(*i + 1)) & 0xc0) == 0x80 && |
122 | 2.90k | ((c3 = (Guchar)s->getChar(*i + 2)) & 0xc0) == 0x80 && |
123 | 2.71k | ((c4 = (Guchar)s->getChar(*i + 3)) & 0xc0) == 0x80 && |
124 | 2.49k | ((c5 = (Guchar)s->getChar(*i + 4)) & 0xc0) == 0x80) { |
125 | 2.11k | *i += 5; |
126 | 2.11k | *u = (Unicode)(((c0 & 0x01) << 30) | |
127 | 2.11k | ((c1 & 0x3f) << 24) | |
128 | 2.11k | ((c2 & 0x3f) << 18) | |
129 | 2.11k | ((c3 & 0x3f) << 12) | |
130 | 2.11k | ((c4 & 0x3f) << 6) | |
131 | 2.11k | (c5 & 0x3f)); |
132 | 21.1k | } else { |
133 | 21.1k | *u = (Unicode)c0; |
134 | 21.1k | } |
135 | 118k | } else { |
136 | 118k | *u = (Unicode)c0; |
137 | 118k | } |
138 | 3.41M | return gTrue; |
139 | 3.41M | } |
140 | | |
141 | 3.06M | GBool getUTF16BE(GString *s, int *i, Unicode *u) { |
142 | 3.06M | int w0, w1; |
143 | | |
144 | 3.06M | if (*i >= s->getLength() - 1) { |
145 | 14.0k | return gFalse; |
146 | 14.0k | } |
147 | 3.05M | w0 = ((s->getChar(*i) & 0xff) << 8) | (s->getChar(*i + 1) & 0xff); |
148 | 3.05M | *i += 2; |
149 | 3.05M | if (w0 < 0xd800 || w0 >= 0xe000) { |
150 | 2.96M | *u = (Unicode)w0; |
151 | 2.96M | } else { |
152 | 91.0k | if (*i < s->getLength() - 1) { |
153 | 90.6k | w1 = ((s->getChar(*i) & 0xff) << 8) | (s->getChar(*i + 1) & 0xff); |
154 | 90.6k | *i += 2; |
155 | 90.6k | *u = 0x10000 + ((w0 - 0xd800) << 10) + (w1 - 0xdc00); |
156 | 90.6k | } else { |
157 | 423 | *u = (Unicode)w0; |
158 | 423 | } |
159 | 91.0k | } |
160 | 3.05M | return gTrue; |
161 | 3.06M | } |
162 | | |
163 | 5.95M | GBool getUTF16LE(GString *s, int *i, Unicode *u) { |
164 | 5.95M | int w0, w1; |
165 | | |
166 | 5.95M | if (*i >= s->getLength() - 1) { |
167 | 1.45k | return gFalse; |
168 | 1.45k | } |
169 | 5.94M | w0 = (s->getChar(*i) & 0xff) | ((s->getChar(*i + 1) & 0xff) << 8); |
170 | 5.94M | *i += 2; |
171 | 5.94M | if (w0 < 0xd800 || w0 >= 0xe000) { |
172 | 5.94M | *u = (Unicode)w0; |
173 | 5.94M | } else { |
174 | 9.13k | if (*i < s->getLength() - 1) { |
175 | 8.91k | w1 = (s->getChar(*i) & 0xff) | ((s->getChar(*i + 1) & 0xff) << 8); |
176 | 8.91k | *i += 2; |
177 | 8.91k | *u = 0x10000 + ((w0 - 0xd800) << 10) + (w1 - 0xdc00); |
178 | 8.91k | } else { |
179 | 221 | *u = (Unicode)w0; |
180 | 221 | } |
181 | 9.13k | } |
182 | 5.94M | return gTrue; |
183 | 5.95M | } |