/src/assimp/code/AssetLib/STEPParser/STEPFileEncoding.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | Open Asset Import Library (assimp) |
3 | | ---------------------------------------------------------------------- |
4 | | |
5 | | Copyright (c) 2006-2025, assimp team |
6 | | |
7 | | All rights reserved. |
8 | | |
9 | | Redistribution and use of this software in source and binary forms, |
10 | | with or without modification, are permitted provided that the |
11 | | following conditions are met: |
12 | | |
13 | | * Redistributions of source code must retain the above |
14 | | copyright notice, this list of conditions and the |
15 | | following disclaimer. |
16 | | |
17 | | * Redistributions in binary form must reproduce the above |
18 | | copyright notice, this list of conditions and the |
19 | | following disclaimer in the documentation and/or other |
20 | | materials provided with the distribution. |
21 | | |
22 | | * Neither the name of the assimp team, nor the names of its |
23 | | contributors may be used to endorse or promote products |
24 | | derived from this software without specific prior |
25 | | written permission of the assimp team. |
26 | | |
27 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
28 | | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
29 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
30 | | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
31 | | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
32 | | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
33 | | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
34 | | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
35 | | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
36 | | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
37 | | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
38 | | |
39 | | ---------------------------------------------------------------------- |
40 | | */ |
41 | | |
42 | | /** @file STEPFileEncoding.cpp |
43 | | * @brief STEP character handling, string un-escaping |
44 | | */ |
45 | | #include "STEPFileEncoding.h" |
46 | | #include <assimp/fast_atof.h> |
47 | | #include "utf8.h" |
48 | | |
49 | | #include <memory> |
50 | | |
51 | | using namespace Assimp; |
52 | | |
// mac/roman ("roman1") to UTF-16 lookup table.
//
// The table is indexed by (source byte - 0x20): the first 0x20 byte values are
// unassigned / non-printable and have no entry here, so the array holds exactly
// 0x100 - 0x20 code units. Each row below lists eight consecutive source bytes,
// with the first source byte of the row given in the trailing comment.
static const uint16_t mac_codetable[] = {
    0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, // 0x20
    0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, // 0x28
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, // 0x30
    0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, // 0x38
    0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, // 0x40
    0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, // 0x48
    0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, // 0x50
    0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, // 0x58
    0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, // 0x60
    0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, // 0x68
    0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, // 0x70
    0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E,
    0x0000, // 0x7F: unassigned
    0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1, // 0x80
    0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8, // 0x88
    0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3, // 0x90
    0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC, // 0x98
    0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF, // 0xA0
    0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8, // 0xA8
    0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211, // 0xB0
    0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8, // 0xB8
    0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB, // 0xC0
    0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153, // 0xC8
    0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA, // 0xD0
    0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02, // 0xD8
    0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1, // 0xE0
    0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4, // 0xE8
    0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC, // 0xF0
    0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7  // 0xF8
};
281 | | |
282 | | // ------------------------------------------------------------------------------------------------ |
283 | | bool STEP::StringToUTF8(std::string& s) |
284 | 376 | { |
285 | | // very basic handling for escaped string sequences |
286 | | // http://doc.spatial.com/index.php?title=InterOp:Connect/STEP&redirect=no |
287 | | |
288 | 3.46k | for (size_t i = 0; i < s.size(); ) { |
289 | 3.09k | if (s[i] == '\\') { |
290 | | // \S\X - cp1252 (X is the character remapped to [0,127]) |
291 | 1.11k | if (i+3 < s.size() && s[i+1] == 'S' && s[i+2] == '\\') { |
292 | | // http://stackoverflow.com/questions/5586214/how-to-convert-char-from-iso-8859-1-to-utf-8-in-c-multiplatformly |
293 | 1 | ai_assert((uint8_t)s[i+3] < 0x80); |
294 | 1 | const uint8_t ch = s[i+3] + 0x80; |
295 | | |
296 | 1 | s[i] = 0xc0 | (ch & 0xc0) >> 6; |
297 | 1 | s[i+1] = 0x80 | (ch & 0x3f); |
298 | | |
299 | 1 | s.erase(i + 2,2); |
300 | 1 | ++i; |
301 | 1 | } |
302 | | // \X\xx - mac/roman (xx is a hex sequence) |
303 | 1.11k | else if (i+4 < s.size() && s[i+1] == 'X' && s[i+2] == '\\') { |
304 | | |
305 | 380 | const uint8_t macval = HexOctetToDecimal(s.c_str() + i + 3); |
306 | 380 | if(macval < 0x20) { |
307 | 0 | return false; |
308 | 0 | } |
309 | | |
310 | 380 | ai_assert(sizeof(mac_codetable) / sizeof(mac_codetable[0]) == 0x100-0x20); |
311 | | |
312 | 380 | const uint32_t unival = mac_codetable[macval - 0x20], *univalp = &unival; |
313 | | |
314 | 380 | unsigned char temp[5], *tempp = temp; |
315 | 380 | ai_assert(sizeof( unsigned char ) == 1); |
316 | | |
317 | 380 | utf8::utf32to8( univalp, univalp + 1, tempp ); |
318 | | |
319 | 380 | const size_t outcount = static_cast<size_t>(tempp-temp); |
320 | | |
321 | 380 | s.erase(i,5); |
322 | 380 | s.insert(i, reinterpret_cast<char*>(temp), outcount); |
323 | 380 | i += outcount; |
324 | 380 | } |
325 | | // \Xn\ .. \X0\ - various unicode encodings (n=2: utf16; n=4: utf32) |
326 | 738 | else if (i+3 < s.size() && s[i+1] == 'X' && s[i+2] >= '0' && s[i+2] <= '9') { |
327 | 10 | switch(s[i+2]) { |
328 | | // utf16 |
329 | 0 | case '2': |
330 | | // utf32 |
331 | 10 | case '4': |
332 | 10 | if (s[i+3] == '\\') { |
333 | 10 | const size_t basei = i+4; |
334 | 10 | size_t j = basei, jend = s.size()-3; |
335 | | |
336 | 242k | for (; j < jend; ++j) { |
337 | 242k | if (s[j] == '\\' && s[j+1] == 'X' && s[j+2] == '0' && s[j+3] == '\\') { |
338 | 8 | break; |
339 | 8 | } |
340 | 242k | } |
341 | 10 | if (j == jend) { |
342 | 2 | return false; |
343 | 2 | } |
344 | | |
345 | 8 | if (j == basei) { |
346 | 0 | s.erase(i,8); |
347 | 0 | continue; |
348 | 0 | } |
349 | | |
350 | 8 | if (s[i+2] == '2') { |
351 | 0 | if (((j - basei) % 4) != 0) { |
352 | 0 | return false; |
353 | 0 | } |
354 | | |
355 | 0 | const size_t count = (j-basei)/4; |
356 | 0 | std::unique_ptr<uint16_t[]> src(new uint16_t[count]); |
357 | |
|
358 | 0 | const char* cur = s.c_str() + basei; |
359 | 0 | for (size_t k = 0; k < count; ++k, cur += 4) { |
360 | 0 | src[k] = (static_cast<uint16_t>(HexOctetToDecimal(cur)) << 8u) | |
361 | 0 | static_cast<uint16_t>(HexOctetToDecimal(cur+2)); |
362 | 0 | } |
363 | |
|
364 | 0 | const size_t dcount = count * 3; // this is enough to hold all possible outputs |
365 | 0 | std::unique_ptr<unsigned char[]> dest(new unsigned char[dcount]); |
366 | |
|
367 | 0 | const uint16_t* srct = src.get(); |
368 | 0 | unsigned char* destt = dest.get(); |
369 | 0 | utf8::utf16to8( srct, srct + count, destt ); |
370 | |
|
371 | 0 | const size_t outcount = static_cast<size_t>(destt-dest.get()); |
372 | |
|
373 | 0 | s.erase(i,(j+4-i)); |
374 | |
|
375 | 0 | ai_assert(sizeof(unsigned char) == 1); |
376 | 0 | s.insert(i, reinterpret_cast<char*>(dest.get()), outcount); |
377 | |
|
378 | 0 | i += outcount; |
379 | 0 | continue; |
380 | 0 | } |
381 | 8 | else if (s[i+2] == '4') { |
382 | 8 | if (((j - basei) % 8) != 0) { |
383 | 4 | return false; |
384 | 4 | } |
385 | | |
386 | 4 | const size_t count = (j-basei)/8; |
387 | 4 | std::unique_ptr<uint32_t[]> src(new uint32_t[count]); |
388 | | |
389 | 4 | const char* cur = s.c_str() + basei; |
390 | 24 | for (size_t k = 0; k < count; ++k, cur += 8) { |
391 | 20 | src[k] = (static_cast<uint32_t>(HexOctetToDecimal(cur )) << 24u) | |
392 | 20 | (static_cast<uint32_t>(HexOctetToDecimal(cur+2)) << 16u) | |
393 | 20 | (static_cast<uint32_t>(HexOctetToDecimal(cur+4)) << 8u) | |
394 | 20 | (static_cast<uint32_t>(HexOctetToDecimal(cur+6))); |
395 | 20 | } |
396 | | |
397 | 4 | const size_t dcount = count * 5; // this is enough to hold all possible outputs |
398 | 4 | std::unique_ptr<unsigned char[]> dest(new unsigned char[dcount]); |
399 | | |
400 | 4 | const uint32_t* srct = src.get(); |
401 | 4 | unsigned char* destt = dest.get(); |
402 | 4 | utf8::utf32to8( srct, srct + count, destt ); |
403 | | |
404 | 4 | const size_t outcount = static_cast<size_t>(destt-dest.get()); |
405 | | |
406 | 4 | s.erase(i,(j+4-i)); |
407 | | |
408 | 4 | ai_assert(sizeof(unsigned char) == 1); |
409 | 4 | s.insert(i, reinterpret_cast<char*>(dest.get()), outcount); |
410 | | |
411 | 4 | i += outcount; |
412 | 4 | continue; |
413 | 8 | } |
414 | 8 | } |
415 | 0 | break; |
416 | | |
417 | | // TODO: other encoding patterns? |
418 | | |
419 | 0 | default: |
420 | 0 | return false; |
421 | 10 | } |
422 | 10 | } |
423 | 1.11k | } |
424 | 3.08k | ++i; |
425 | 3.08k | } |
426 | 370 | return true; |
427 | 376 | } |