Coverage Report

Created: 2025-06-22 07:30

/src/assimp/code/AssetLib/STEPParser/STEPFileEncoding.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
Open Asset Import Library (assimp)
3
----------------------------------------------------------------------
4
5
Copyright (c) 2006-2025, assimp team
6
7
All rights reserved.
8
9
Redistribution and use of this software in source and binary forms,
10
with or without modification, are permitted provided that the
11
following conditions are met:
12
13
* Redistributions of source code must retain the above
14
  copyright notice, this list of conditions and the
15
  following disclaimer.
16
17
* Redistributions in binary form must reproduce the above
18
  copyright notice, this list of conditions and the
19
  following disclaimer in the documentation and/or other
20
  materials provided with the distribution.
21
22
* Neither the name of the assimp team, nor the names of its
23
  contributors may be used to endorse or promote products
24
  derived from this software without specific prior
25
  written permission of the assimp team.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38
39
----------------------------------------------------------------------
40
*/
41
42
/** @file  STEPFileEncoding.cpp
43
 *  @brief STEP character handling, string un-escaping
44
 */
45
#include "STEPFileEncoding.h"
46
#include <assimp/fast_atof.h>
47
#include "utf8.h"
48
49
#include <memory>
50
51
using namespace Assimp;
52
53
// Mac OS Roman -> UTF-16 code unit lookup table.
// The table has no slots for the byte values 0x00..0x1F, so entry k holds the
// code unit for byte (k + 0x20); index it with (byte - 0x20).
// A value of 0x0000 marks a byte that has no assigned character.
static const uint16_t mac_codetable[] = {
    // 0x20 .. 0x2F
    0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
    0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
    // 0x30 .. 0x3F
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
    // 0x40 .. 0x4F
    0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
    // 0x50 .. 0x5F
    0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
    0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,
    // 0x60 .. 0x6F
    0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
    0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
    // 0x70 .. 0x7F (0x7F is unassigned)
    0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
    0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x0000,
    // 0x80 .. 0x8F
    0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
    0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,
    // 0x90 .. 0x9F
    0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
    0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,
    // 0xA0 .. 0xAF
    0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
    0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,
    // 0xB0 .. 0xBF
    0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
    0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,
    // 0xC0 .. 0xCF
    0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
    0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,
    // 0xD0 .. 0xDF
    0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
    0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,
    // 0xE0 .. 0xEF
    0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
    0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
    // 0xF0 .. 0xFF
    0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
    0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7
};
281
282
// ------------------------------------------------------------------------------------------------
283
bool STEP::StringToUTF8(std::string& s)
284
376
{
285
    // very basic handling for escaped string sequences
286
    // http://doc.spatial.com/index.php?title=InterOp:Connect/STEP&redirect=no
287
288
3.46k
    for (size_t i = 0; i < s.size(); ) {
289
3.09k
        if (s[i] == '\\') {
290
            // \S\X - cp1252 (X is the character remapped to [0,127])
291
1.11k
            if (i+3 < s.size() && s[i+1] == 'S' && s[i+2] == '\\') {
292
                // http://stackoverflow.com/questions/5586214/how-to-convert-char-from-iso-8859-1-to-utf-8-in-c-multiplatformly
293
1
                ai_assert((uint8_t)s[i+3] < 0x80);
294
1
                const uint8_t ch = s[i+3] + 0x80;
295
296
1
                s[i] = 0xc0 | (ch & 0xc0) >> 6;
297
1
                s[i+1] =  0x80 | (ch & 0x3f);
298
299
1
                s.erase(i + 2,2);
300
1
                ++i;
301
1
            }
302
            // \X\xx - mac/roman (xx is a hex sequence)
303
1.11k
            else if (i+4 < s.size() && s[i+1] == 'X' && s[i+2] == '\\') {
304
305
380
                const uint8_t macval = HexOctetToDecimal(s.c_str() + i + 3);
306
380
                if(macval < 0x20) {
307
0
                    return false;
308
0
                }
309
310
380
                ai_assert(sizeof(mac_codetable) / sizeof(mac_codetable[0]) == 0x100-0x20);
311
312
380
                const uint32_t unival = mac_codetable[macval - 0x20], *univalp = &unival;
313
314
380
                unsigned char temp[5], *tempp = temp;
315
380
                ai_assert(sizeof( unsigned char ) == 1);
316
317
380
                utf8::utf32to8( univalp, univalp + 1, tempp );
318
319
380
                const size_t outcount = static_cast<size_t>(tempp-temp);
320
321
380
                s.erase(i,5);
322
380
                s.insert(i, reinterpret_cast<char*>(temp), outcount);
323
380
                i += outcount;
324
380
            }
325
            // \Xn\ .. \X0\ - various unicode encodings (n=2: utf16; n=4: utf32)
326
738
            else if (i+3 < s.size() && s[i+1] == 'X' && s[i+2] >= '0' && s[i+2] <= '9') {
327
10
                switch(s[i+2]) {
328
                    // utf16
329
0
                case '2':
330
                    // utf32
331
10
                case '4':
332
10
                    if (s[i+3] == '\\') {
333
10
                        const size_t basei = i+4;
334
10
                        size_t j = basei, jend = s.size()-3;
335
336
242k
                        for (; j < jend; ++j) {
337
242k
                            if (s[j] == '\\' && s[j+1] == 'X' && s[j+2] == '0' && s[j+3] == '\\') {
338
8
                                break;
339
8
                            }
340
242k
                        }
341
10
                        if (j == jend) {
342
2
                            return false;
343
2
                        }
344
345
8
                        if (j == basei) {
346
0
                            s.erase(i,8);
347
0
                            continue;
348
0
                        }
349
350
8
                        if (s[i+2] == '2') {
351
0
                            if (((j - basei) % 4) != 0) {
352
0
                                return false;
353
0
                            }
354
355
0
                            const size_t count = (j-basei)/4;
356
0
                            std::unique_ptr<uint16_t[]> src(new uint16_t[count]);
357
358
0
                            const char* cur = s.c_str() + basei;
359
0
                            for (size_t k = 0; k < count; ++k, cur += 4) {
360
0
                                src[k] = (static_cast<uint16_t>(HexOctetToDecimal(cur)) << 8u)  |
361
0
                                     static_cast<uint16_t>(HexOctetToDecimal(cur+2));
362
0
                            }
363
364
0
                            const size_t dcount = count * 3; // this is enough to hold all possible outputs
365
0
                            std::unique_ptr<unsigned char[]> dest(new unsigned char[dcount]);
366
367
0
                            const uint16_t* srct = src.get();
368
0
                            unsigned char* destt = dest.get();
369
0
                            utf8::utf16to8( srct, srct + count, destt );
370
371
0
                            const size_t outcount = static_cast<size_t>(destt-dest.get());
372
373
0
                            s.erase(i,(j+4-i));
374
375
0
                            ai_assert(sizeof(unsigned char) == 1);
376
0
                            s.insert(i, reinterpret_cast<char*>(dest.get()), outcount);
377
378
0
                            i += outcount;
379
0
                            continue;
380
0
                        }
381
8
                        else if (s[i+2] == '4') {
382
8
                            if (((j - basei) % 8) != 0) {
383
4
                                return false;
384
4
                            }
385
386
4
                            const size_t count = (j-basei)/8;
387
4
                            std::unique_ptr<uint32_t[]> src(new uint32_t[count]);
388
389
4
                            const char* cur = s.c_str() + basei;
390
24
                            for (size_t k = 0; k < count; ++k, cur += 8) {
391
20
                                src[k] = (static_cast<uint32_t>(HexOctetToDecimal(cur  )) << 24u) |
392
20
                                         (static_cast<uint32_t>(HexOctetToDecimal(cur+2)) << 16u) |
393
20
                                         (static_cast<uint32_t>(HexOctetToDecimal(cur+4)) << 8u)  |
394
20
                                         (static_cast<uint32_t>(HexOctetToDecimal(cur+6)));
395
20
                            }
396
397
4
                            const size_t dcount = count * 5; // this is enough to hold all possible outputs
398
4
                            std::unique_ptr<unsigned char[]> dest(new unsigned char[dcount]);
399
400
4
                            const uint32_t* srct = src.get();
401
4
                            unsigned char* destt = dest.get();
402
4
                            utf8::utf32to8( srct, srct + count, destt );
403
404
4
                            const size_t outcount = static_cast<size_t>(destt-dest.get());
405
406
4
                            s.erase(i,(j+4-i));
407
408
4
                            ai_assert(sizeof(unsigned char) == 1);
409
4
                            s.insert(i, reinterpret_cast<char*>(dest.get()), outcount);
410
411
4
                            i += outcount;
412
4
                            continue;
413
8
                        }
414
8
                    }
415
0
                    break;
416
417
                    // TODO: other encoding patterns?
418
419
0
                default:
420
0
                    return false;
421
10
                }
422
10
            }
423
1.11k
        }
424
3.08k
        ++i;
425
3.08k
    }
426
370
    return true;
427
376
}