Coverage Report

Created: 2024-06-18 06:11

/src/meshoptimizer/src/vertexcodec.cpp
Line
Count
Source
1
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
2
#include "meshoptimizer.h"
3
4
#include <assert.h>
5
#include <string.h>
6
7
// The block below auto-detects SIMD ISA that can be used on the target platform
8
#ifndef MESHOPTIMIZER_NO_SIMD
9
10
// The SIMD implementation requires SSSE3, which can be enabled unconditionally through compiler settings
11
#if defined(__AVX__) || defined(__SSSE3__)
12
#define SIMD_SSE
13
#endif
14
15
// An experimental implementation using AVX512 instructions; it's only enabled when AVX512 is enabled through compiler settings
16
#if defined(__AVX512VBMI2__) && defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__POPCNT__)
17
#undef SIMD_SSE
18
#define SIMD_AVX
19
#endif
20
21
// MSVC supports compiling SSSE3 code regardless of compile options; we use a cpuid-based scalar fallback
22
#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
23
#define SIMD_SSE
24
#define SIMD_FALLBACK
25
#endif
26
27
// GCC 4.9+ and clang 3.8+ support targeting SIMD ISA from individual functions; we use a cpuid-based scalar fallback
28
#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && ((defined(__clang__) && __clang_major__ * 100 + __clang_minor__ >= 308) || (defined(__GNUC__) && __GNUC__ * 100 + __GNUC_MINOR__ >= 409)) && (defined(__i386__) || defined(__x86_64__))
29
#define SIMD_SSE
30
#define SIMD_FALLBACK
31
#define SIMD_TARGET __attribute__((target("ssse3")))
32
#endif
33
34
// GCC/clang define these when NEON support is available
35
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
36
#define SIMD_NEON
37
#endif
38
39
// On MSVC, we assume that ARM builds always target NEON-capable devices
40
#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
41
#define SIMD_NEON
42
#endif
43
44
// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
45
#if defined(__wasm_simd128__)
46
#define SIMD_WASM
47
// Prevent compiling other variant when wasm simd compilation is active
48
#undef SIMD_NEON
49
#undef SIMD_SSE
50
#undef SIMD_AVX
51
#endif
52
53
#ifndef SIMD_TARGET
54
#define SIMD_TARGET
55
#endif
56
57
// When targeting AArch64/x64, optimize for latency to allow decoding of individual 16-byte groups to overlap
58
// We don't do this for 32-bit systems because we need 64-bit math for this and this will hurt in-order CPUs
59
#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64)
60
#define SIMD_LATENCYOPT
61
#endif
62
63
#endif // !MESHOPTIMIZER_NO_SIMD
64
65
#ifdef SIMD_SSE
66
#include <tmmintrin.h>
67
#endif
68
69
#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
70
#ifdef _MSC_VER
71
#include <intrin.h> // __cpuid
72
#else
73
#include <cpuid.h> // __cpuid
74
#endif
75
#endif
76
77
#ifdef SIMD_AVX
78
#include <immintrin.h>
79
#endif
80
81
#ifdef SIMD_NEON
82
#if defined(_MSC_VER) && defined(_M_ARM64)
83
#include <arm64_neon.h>
84
#else
85
#include <arm_neon.h>
86
#endif
87
#endif
88
89
#ifdef SIMD_WASM
90
#include <wasm_simd128.h>
91
#endif
92
93
#ifdef SIMD_WASM
94
#define wasmx_splat_v32x4(v, i) wasm_i32x4_shuffle(v, v, i, i, i, i)
95
#define wasmx_unpacklo_v8x16(a, b) wasm_i8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
96
#define wasmx_unpackhi_v8x16(a, b) wasm_i8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
97
#define wasmx_unpacklo_v16x8(a, b) wasm_i16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
98
#define wasmx_unpackhi_v16x8(a, b) wasm_i16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
99
#define wasmx_unpacklo_v64x2(a, b) wasm_i64x2_shuffle(a, b, 0, 2)
100
#define wasmx_unpackhi_v64x2(a, b) wasm_i64x2_shuffle(a, b, 1, 3)
101
#endif
102
103
namespace meshopt
104
{
105
106
const unsigned char kVertexHeader = 0xa0;
107
108
static int gEncodeVertexVersion = 0;
109
110
const size_t kVertexBlockSizeBytes = 8192;
111
const size_t kVertexBlockMaxSize = 256;
112
const size_t kByteGroupSize = 16;
113
const size_t kByteGroupDecodeLimit = 24;
114
const size_t kTailMaxSize = 32;
115
116
static size_t getVertexBlockSize(size_t vertex_size)
117
1.01k
{
118
  // make sure the entire block fits into the scratch buffer
119
1.01k
  size_t result = kVertexBlockSizeBytes / vertex_size;
120
121
  // align to byte group size; we encode each byte as a byte group
122
  // if vertex block is misaligned, it results in wasted bytes, so just truncate the block size
123
1.01k
  result &= ~(kByteGroupSize - 1);
124
125
1.01k
  return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
126
1.01k
}
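
A minimal sketch (not part of the library; vertexBlockSizeSketch is an illustrative name) mirroring the block-size computation above, with two worked values: a 36-byte vertex gives 8192 / 36 = 227, aligned down to 224; a 16-byte vertex gives 512, clamped to kVertexBlockMaxSize = 256.

#include <cstddef>

// Mirrors getVertexBlockSize(): fit the block into the 8192-byte scratch
// buffer, align down to the 16-element byte group size, clamp to 256.
static size_t vertexBlockSizeSketch(size_t vertex_size)
{
    size_t result = 8192 / vertex_size; // kVertexBlockSizeBytes
    result &= ~size_t(15);              // kByteGroupSize - 1
    return result < 256 ? result : 256; // kVertexBlockMaxSize
}
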
127
128
inline unsigned char zigzag8(unsigned char v)
129
0
{
130
0
  return ((signed char)(v) >> 7) ^ (v << 1);
131
0
}
132
133
inline unsigned char unzigzag8(unsigned char v)
134
0
{
135
0
  return -(v & 1) ^ (v >> 1);
136
0
}
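
The two helpers above implement byte-wide zigzag: signed deltas -1, +1, -2, +2, ... map to 1, 2, 3, 4, ..., so small deltas of either sign become small unsigned codes that pack into few bits. A standalone sketch verifying the round trip (assumes nothing beyond the definitions above; the _sketch names are illustrative):

#include <cassert>

static unsigned char zigzag8_sketch(unsigned char v)
{
    return (unsigned char)(((signed char)(v) >> 7) ^ (v << 1));
}

static unsigned char unzigzag8_sketch(unsigned char v)
{
    return (unsigned char)(-(v & 1) ^ (v >> 1));
}

int main()
{
    assert(zigzag8_sketch((unsigned char)-1) == 1); // -1 -> 1
    assert(zigzag8_sketch((unsigned char)+1) == 2); // +1 -> 2

    for (int i = 0; i < 256; ++i)
        assert(unzigzag8_sketch(zigzag8_sketch((unsigned char)i)) == (unsigned char)i);

    return 0;
}
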
137
138
static bool encodeBytesGroupZero(const unsigned char* buffer)
139
0
{
140
0
  for (size_t i = 0; i < kByteGroupSize; ++i)
141
0
    if (buffer[i])
142
0
      return false;
143
144
0
  return true;
145
0
}
146
147
static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
148
0
{
149
0
  assert(bits >= 1 && bits <= 8);
150
151
0
  if (bits == 1)
152
0
    return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);
153
154
0
  if (bits == 8)
155
0
    return kByteGroupSize;
156
157
0
  size_t result = kByteGroupSize * bits / 8;
158
159
0
  unsigned char sentinel = (1 << bits) - 1;
160
161
0
  for (size_t i = 0; i < kByteGroupSize; ++i)
162
0
    result += buffer[i] >= sentinel;
163
164
0
  return result;
165
0
}
166
167
static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits)
168
0
{
169
0
  assert(bits >= 1 && bits <= 8);
170
171
0
  if (bits == 1)
172
0
    return data;
173
174
0
  if (bits == 8)
175
0
  {
176
0
    memcpy(data, buffer, kByteGroupSize);
177
0
    return data + kByteGroupSize;
178
0
  }
179
180
0
  size_t byte_size = 8 / bits;
181
0
  assert(kByteGroupSize % byte_size == 0);
182
183
  // fixed portion: bits bits for each value
184
  // variable portion: full byte for each out-of-range value (using 1...1 as sentinel)
185
0
  unsigned char sentinel = (1 << bits) - 1;
186
187
0
  for (size_t i = 0; i < kByteGroupSize; i += byte_size)
188
0
  {
189
0
    unsigned char byte = 0;
190
191
0
    for (size_t k = 0; k < byte_size; ++k)
192
0
    {
193
0
      unsigned char enc = (buffer[i + k] >= sentinel) ? sentinel : buffer[i + k];
194
195
0
      byte <<= bits;
196
0
      byte |= enc;
197
0
    }
198
199
0
    *data++ = byte;
200
0
  }
201
202
0
  for (size_t i = 0; i < kByteGroupSize; ++i)
203
0
  {
204
0
    if (buffer[i] >= sentinel)
205
0
    {
206
0
      *data++ = buffer[i];
207
0
    }
208
0
  }
209
210
0
  return data;
211
0
}
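
A standalone sketch of the group layout produced above, for the bits = 2 case only (packGroup2 is an illustrative name, not a library function): sixteen values are packed four per byte, most significant pair first; any value >= 3 is stored as the sentinel 3 in the packed portion and its full byte is appended to a variable-length tail, which the decoder consumes in the same order.

#include <cstddef>

static size_t packGroup2(const unsigned char values[16], unsigned char* out)
{
    unsigned char* tail = out + 4; // 16 values * 2 bits = 4 packed bytes

    for (size_t i = 0; i < 16; i += 4)
    {
        unsigned char byte = 0;
        for (size_t k = 0; k < 4; ++k)
            byte = (unsigned char)((byte << 2) | (values[i + k] >= 3 ? 3 : values[i + k]));
        out[i / 4] = byte;
    }

    for (size_t i = 0; i < 16; ++i)
        if (values[i] >= 3)
            *tail++ = values[i];

    return size_t(tail - out); // 4 fixed bytes plus one byte per escaped value
}
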
212
213
static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size)
214
0
{
215
0
  assert(buffer_size % kByteGroupSize == 0);
216
217
0
  unsigned char* header = data;
218
219
  // round number of groups to 4 to get number of header bytes
220
0
  size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
221
222
0
  if (size_t(data_end - data) < header_size)
223
0
    return NULL;
224
225
0
  data += header_size;
226
227
0
  memset(header, 0, header_size);
228
229
0
  for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
230
0
  {
231
0
    if (size_t(data_end - data) < kByteGroupDecodeLimit)
232
0
      return NULL;
233
234
0
    int best_bits = 8;
235
0
    size_t best_size = encodeBytesGroupMeasure(buffer + i, 8);
236
237
0
    for (int bits = 1; bits < 8; bits *= 2)
238
0
    {
239
0
      size_t size = encodeBytesGroupMeasure(buffer + i, bits);
240
241
0
      if (size < best_size)
242
0
      {
243
0
        best_bits = bits;
244
0
        best_size = size;
245
0
      }
246
0
    }
247
248
0
    int bitslog2 = (best_bits == 1) ? 0 : (best_bits == 2 ? 1 : (best_bits == 4 ? 2 : 3));
249
0
    assert((1 << bitslog2) == best_bits);
250
251
0
    size_t header_offset = i / kByteGroupSize;
252
253
0
    header[header_offset / 4] |= bitslog2 << ((header_offset % 4) * 2);
254
255
0
    unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);
256
257
0
    assert(data + best_size == next);
258
0
    data = next;
259
0
  }
260
261
0
  return data;
262
0
}
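
encodeBytes() stores one 2-bit code per 16-byte group ahead of the group data: 0 means the group is all zeroes, 1 and 2 mean 2-bit and 4-bit packing with byte escapes, and 3 means 16 raw bytes. Four codes share a header byte, lowest group in the lowest bits. A hedged sketch of the read side (readGroupCode is an illustrative name; the same expression appears in decodeBytes() below):

#include <cstddef>

static int readGroupCode(const unsigned char* header, size_t group_index)
{
    return (header[group_index / 4] >> ((group_index % 4) * 2)) & 3;
}

// Example: a header byte of 0xE4 (0b11100100) encodes codes 0, 1, 2, 3
// for groups 0..3 respectively.
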
263
264
static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
265
0
{
266
0
  assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
267
268
0
  unsigned char buffer[kVertexBlockMaxSize];
269
0
  assert(sizeof(buffer) % kByteGroupSize == 0);
270
271
  // we sometimes encode elements we didn't fill when rounding to kByteGroupSize
272
0
  memset(buffer, 0, sizeof(buffer));
273
274
0
  for (size_t k = 0; k < vertex_size; ++k)
275
0
  {
276
0
    size_t vertex_offset = k;
277
278
0
    unsigned char p = last_vertex[k];
279
280
0
    for (size_t i = 0; i < vertex_count; ++i)
281
0
    {
282
0
      buffer[i] = zigzag8(vertex_data[vertex_offset] - p);
283
284
0
      p = vertex_data[vertex_offset];
285
286
0
      vertex_offset += vertex_size;
287
0
    }
288
289
0
    data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1));
290
0
    if (!data)
291
0
      return NULL;
292
0
  }
293
294
0
  memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);
295
296
0
  return data;
297
0
}
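
encodeVertexBlock() never compresses whole vertices; it walks the block one byte column at a time, delta-encodes each byte against the same byte of the previous vertex (seeded from last_vertex), zigzags the delta, and hands the column to encodeBytes(). A simplified scalar sketch of that transform (deltaColumnSketch is illustrative, not part of the library):

#include <cstddef>

static void deltaColumnSketch(const unsigned char* vertex_data, size_t vertex_count,
                              size_t vertex_size, size_t k, unsigned char prev,
                              unsigned char* out)
{
    for (size_t i = 0; i < vertex_count; ++i)
    {
        unsigned char cur = vertex_data[i * vertex_size + k];
        unsigned char delta = (unsigned char)(cur - prev); // wraps modulo 256
        out[i] = (unsigned char)(((signed char)(delta) >> 7) ^ (delta << 1)); // zigzag8
        prev = cur;
    }
}
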
298
299
#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX) && !defined(SIMD_WASM))
300
static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2)
301
0
{
302
0
#define READ() byte = *data++
303
0
#define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? encv : enc, data_var += (enc == (1 << bits) - 1)
304
305
0
  unsigned char byte, enc, encv;
306
0
  const unsigned char* data_var;
307
308
0
  switch (bitslog2)
309
0
  {
310
0
  case 0:
311
0
    memset(buffer, 0, kByteGroupSize);
312
0
    return data;
313
0
  case 1:
314
0
    data_var = data + 4;
315
316
    // 4 groups with 4 2-bit values in each byte
317
0
    READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
318
0
    READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
319
0
    READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
320
0
    READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
321
322
0
    return data_var;
323
0
  case 2:
324
0
    data_var = data + 8;
325
326
    // 8 groups with 2 4-bit values in each byte
327
0
    READ(), NEXT(4), NEXT(4);
328
0
    READ(), NEXT(4), NEXT(4);
329
0
    READ(), NEXT(4), NEXT(4);
330
0
    READ(), NEXT(4), NEXT(4);
331
0
    READ(), NEXT(4), NEXT(4);
332
0
    READ(), NEXT(4), NEXT(4);
333
0
    READ(), NEXT(4), NEXT(4);
334
0
    READ(), NEXT(4), NEXT(4);
335
336
0
    return data_var;
337
0
  case 3:
338
0
    memcpy(buffer, data, kByteGroupSize);
339
0
    return data + kByteGroupSize;
340
0
  default:
341
0
    assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
342
0
    return data;
343
0
  }
344
345
0
#undef READ
346
0
#undef NEXT
347
0
}
348
349
static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
350
0
{
351
0
  assert(buffer_size % kByteGroupSize == 0);
352
353
0
  const unsigned char* header = data;
354
355
  // round number of groups to 4 to get number of header bytes
356
0
  size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
357
358
0
  if (size_t(data_end - data) < header_size)
359
0
    return NULL;
360
361
0
  data += header_size;
362
363
0
  for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
364
0
  {
365
0
    if (size_t(data_end - data) < kByteGroupDecodeLimit)
366
0
      return NULL;
367
368
0
    size_t header_offset = i / kByteGroupSize;
369
370
0
    int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;
371
372
0
    data = decodeBytesGroup(data, buffer + i, bitslog2);
373
0
  }
374
375
0
  return data;
376
0
}
377
378
static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
379
0
{
380
0
  assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
381
382
0
  unsigned char buffer[kVertexBlockMaxSize];
383
0
  unsigned char transposed[kVertexBlockSizeBytes];
384
385
0
  size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
386
387
0
  for (size_t k = 0; k < vertex_size; ++k)
388
0
  {
389
0
    data = decodeBytes(data, data_end, buffer, vertex_count_aligned);
390
0
    if (!data)
391
0
      return NULL;
392
393
0
    size_t vertex_offset = k;
394
395
0
    unsigned char p = last_vertex[k];
396
397
0
    for (size_t i = 0; i < vertex_count; ++i)
398
0
    {
399
0
      unsigned char v = unzigzag8(buffer[i]) + p;
400
401
0
      transposed[vertex_offset] = v;
402
0
      p = v;
403
404
0
      vertex_offset += vertex_size;
405
0
    }
406
0
  }
407
408
0
  memcpy(vertex_data, transposed, vertex_count * vertex_size);
409
410
0
  memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);
411
412
0
  return data;
413
0
}
414
#endif
415
416
#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
417
static unsigned char kDecodeBytesGroupShuffle[256][8];
418
static unsigned char kDecodeBytesGroupCount[256];
419
420
#ifdef __wasm__
421
__attribute__((cold)) // this saves 500 bytes in the output binary - we don't need to vectorize this loop!
422
#endif
423
static bool
424
decodeBytesGroupBuildTables()
425
2
{
426
514
  for (int mask = 0; mask < 256; ++mask)
427
512
  {
428
512
    unsigned char shuffle[8];
429
512
    unsigned char count = 0;
430
431
4.60k
    for (int i = 0; i < 8; ++i)
432
4.09k
    {
433
4.09k
      int maski = (mask >> i) & 1;
434
4.09k
      shuffle[i] = maski ? count : 0x80;
435
4.09k
      count += (unsigned char)(maski);
436
4.09k
    }
437
438
512
    memcpy(kDecodeBytesGroupShuffle[mask], shuffle, 8);
439
512
    kDecodeBytesGroupCount[mask] = count;
440
512
  }
441
442
2
  return true;
443
2
}
444
445
static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
446
#endif
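
For reference, one entry of the tables built above, reproduced by hand (the loop is the same as in decodeBytesGroupBuildTables(); exampleEntry is an illustrative name): for mask 0x25 (bits 0, 2 and 5 set) the set lanes receive consecutive source indices, the clear lanes receive 0x80, which pshufb/vtbl treat as "write zero", and the count is the popcount of the mask.

#include <cassert>

static void exampleEntry()
{
    const int mask = 0x25;
    unsigned char shuffle[8];
    unsigned char count = 0;

    for (int i = 0; i < 8; ++i)
    {
        int maski = (mask >> i) & 1;
        shuffle[i] = maski ? count : 0x80;
        count += (unsigned char)maski;
    }

    const unsigned char expected[8] = {0, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80};
    for (int i = 0; i < 8; ++i)
        assert(shuffle[i] == expected[i]);

    assert(count == 3);
}
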
447
448
#ifdef SIMD_SSE
449
SIMD_TARGET
450
static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
451
10.5k
{
452
10.5k
  __m128i sm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask0]));
453
10.5k
  __m128i sm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask1]));
454
10.5k
  __m128i sm1off = _mm_set1_epi8(kDecodeBytesGroupCount[mask0]);
455
456
10.5k
  __m128i sm1r = _mm_add_epi8(sm1, sm1off);
457
458
10.5k
  return _mm_unpacklo_epi64(sm0, sm1r);
459
10.5k
}
460
461
SIMD_TARGET
462
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
463
63.1k
{
464
63.1k
  switch (bitslog2)
465
63.1k
  {
466
47.0k
  case 0:
467
47.0k
  {
468
47.0k
    __m128i result = _mm_setzero_si128();
469
470
47.0k
    _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
471
472
47.0k
    return data;
473
0
  }
474
475
5.50k
  case 1:
476
5.50k
  {
477
5.50k
#ifdef __GNUC__
478
5.50k
    typedef int __attribute__((aligned(1))) unaligned_int;
479
#else
480
    typedef int unaligned_int;
481
#endif
482
483
5.50k
#ifdef SIMD_LATENCYOPT
484
5.50k
    unsigned int data32;
485
5.50k
    memcpy(&data32, data, 4);
486
5.50k
    data32 &= data32 >> 1;
487
488
    // arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
489
5.50k
    unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
490
491
    // adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
492
5.50k
    int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
493
5.50k
#endif
494
495
5.50k
    __m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
496
5.50k
    __m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));
497
498
5.50k
    __m128i sel22 = _mm_unpacklo_epi8(_mm_srli_epi16(sel2, 4), sel2);
499
5.50k
    __m128i sel2222 = _mm_unpacklo_epi8(_mm_srli_epi16(sel22, 2), sel22);
500
5.50k
    __m128i sel = _mm_and_si128(sel2222, _mm_set1_epi8(3));
501
502
5.50k
    __m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(3));
503
5.50k
    int mask16 = _mm_movemask_epi8(mask);
504
5.50k
    unsigned char mask0 = (unsigned char)(mask16 & 255);
505
5.50k
    unsigned char mask1 = (unsigned char)(mask16 >> 8);
506
507
5.50k
    __m128i shuf = decodeShuffleMask(mask0, mask1);
508
509
5.50k
    __m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));
510
511
5.50k
    _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
512
513
5.50k
#ifdef SIMD_LATENCYOPT
514
5.50k
    return data + 4 + datacnt;
515
#else
516
    return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
517
#endif
518
0
  }
519
520
5.01k
  case 2:
521
5.01k
  {
522
5.01k
#ifdef SIMD_LATENCYOPT
523
5.01k
    unsigned long long data64;
524
5.01k
    memcpy(&data64, data, 8);
525
5.01k
    data64 &= data64 >> 1;
526
5.01k
    data64 &= data64 >> 2;
527
528
    // adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
529
5.01k
    int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
530
5.01k
#endif
531
532
5.01k
    __m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
533
5.01k
    __m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));
534
535
5.01k
    __m128i sel44 = _mm_unpacklo_epi8(_mm_srli_epi16(sel4, 4), sel4);
536
5.01k
    __m128i sel = _mm_and_si128(sel44, _mm_set1_epi8(15));
537
538
5.01k
    __m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(15));
539
5.01k
    int mask16 = _mm_movemask_epi8(mask);
540
5.01k
    unsigned char mask0 = (unsigned char)(mask16 & 255);
541
5.01k
    unsigned char mask1 = (unsigned char)(mask16 >> 8);
542
543
5.01k
    __m128i shuf = decodeShuffleMask(mask0, mask1);
544
545
5.01k
    __m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));
546
547
5.01k
    _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
548
549
5.01k
#ifdef SIMD_LATENCYOPT
550
5.01k
    return data + 8 + datacnt;
551
#else
552
    return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
553
#endif
554
0
  }
555
556
5.57k
  case 3:
557
5.57k
  {
558
5.57k
    __m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
559
560
5.57k
    _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
561
562
5.57k
    return data + 16;
563
0
  }
564
565
0
  default:
566
0
    assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
567
0
    return data;
568
63.1k
  }
569
63.1k
}
570
#endif
571
572
#ifdef SIMD_AVX
573
static const __m128i decodeBytesGroupConfig[] = {
574
    _mm_set1_epi8(3),
575
    _mm_set1_epi8(15),
576
    _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24),
577
    _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56),
578
};
579
580
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
581
{
582
  switch (bitslog2)
583
  {
584
  case 0:
585
  {
586
    __m128i result = _mm_setzero_si128();
587
588
    _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
589
590
    return data;
591
  }
592
593
  case 1:
594
  case 2:
595
  {
596
    const unsigned char* skip = data + (bitslog2 << 2);
597
598
    __m128i selb = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
599
    __m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(skip));
600
601
    __m128i sent = decodeBytesGroupConfig[bitslog2 - 1];
602
    __m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1];
603
604
    __m128i selw = _mm_shuffle_epi32(selb, 0x44);
605
    __m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw));
606
    __mmask16 mask16 = _mm_cmp_epi8_mask(sel, sent, _MM_CMPINT_EQ);
607
608
    __m128i result = _mm_mask_expand_epi8(sel, mask16, rest);
609
610
    _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
611
612
    return skip + _mm_popcnt_u32(mask16);
613
  }
614
615
  case 3:
616
  {
617
    __m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
618
619
    _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
620
621
    return data + 16;
622
  }
623
624
  default:
625
    assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
626
    return data;
627
  }
628
}
629
#endif
630
631
#ifdef SIMD_NEON
632
static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
633
{
634
  uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]);
635
  uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]);
636
637
  uint8x8_t r0 = vtbl1_u8(rest0, sm0);
638
  uint8x8_t r1 = vtbl1_u8(rest1, sm1);
639
640
  return vcombine_u8(r0, r1);
641
}
642
643
static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
644
{
645
  // magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
646
  const uint64_t magic = 0x000103070f1f3f80ull;
647
648
  uint64x2_t mask2 = vreinterpretq_u64_u8(mask);
649
650
  mask0 = uint8_t((vgetq_lane_u64(mask2, 0) * magic) >> 56);
651
  mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56);
652
}
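
The multiply-and-shift above is a scalar substitute for _mm_movemask_epi8: when every byte of the 64-bit lane is either 0x00 or 0xff, multiplying by 0x000103070f1f3f80 accumulates one bit per byte into the top byte of the product. A standalone sketch with a spot check (moveMask8 is an illustrative name; the same constant is reused by wasmMoveMask() further down):

#include <cassert>
#include <cstdint>

static unsigned char moveMask8(uint64_t x)
{
    const uint64_t magic = 0x000103070f1f3f80ull;
    return (unsigned char)((x * magic) >> 56);
}

int main()
{
    // bytes 0 and 2 are 0xff -> bits 0 and 2 of the result are set
    assert(moveMask8(0x0000000000ff00ffull) == 0x05);
    assert(moveMask8(0xffffffffffffffffull) == 0xff);
    return 0;
}
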
653
654
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
655
{
656
  switch (bitslog2)
657
  {
658
  case 0:
659
  {
660
    uint8x16_t result = vdupq_n_u8(0);
661
662
    vst1q_u8(buffer, result);
663
664
    return data;
665
  }
666
667
  case 1:
668
  {
669
#ifdef SIMD_LATENCYOPT
670
    unsigned int data32;
671
    memcpy(&data32, data, 4);
672
    data32 &= data32 >> 1;
673
674
    // arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
675
    unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
676
677
    // adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
678
    int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
679
#endif
680
681
    uint8x8_t sel2 = vld1_u8(data);
682
    uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
683
    uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
684
    uint8x16_t sel = vandq_u8(vcombine_u8(sel2222.val[0], sel2222.val[1]), vdupq_n_u8(3));
685
686
    uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(3));
687
    unsigned char mask0, mask1;
688
    neonMoveMask(mask, mask0, mask1);
689
690
    uint8x8_t rest0 = vld1_u8(data + 4);
691
    uint8x8_t rest1 = vld1_u8(data + 4 + kDecodeBytesGroupCount[mask0]);
692
693
    uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);
694
695
    vst1q_u8(buffer, result);
696
697
#ifdef SIMD_LATENCYOPT
698
    return data + 4 + datacnt;
699
#else
700
    return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
701
#endif
702
  }
703
704
  case 2:
705
  {
706
#ifdef SIMD_LATENCYOPT
707
    unsigned long long data64;
708
    memcpy(&data64, data, 8);
709
    data64 &= data64 >> 1;
710
    data64 &= data64 >> 2;
711
712
    // adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
713
    int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
714
#endif
715
716
    uint8x8_t sel4 = vld1_u8(data);
717
    uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
718
    uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);
719
720
    uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(15));
721
    unsigned char mask0, mask1;
722
    neonMoveMask(mask, mask0, mask1);
723
724
    uint8x8_t rest0 = vld1_u8(data + 8);
725
    uint8x8_t rest1 = vld1_u8(data + 8 + kDecodeBytesGroupCount[mask0]);
726
727
    uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);
728
729
    vst1q_u8(buffer, result);
730
731
#ifdef SIMD_LATENCYOPT
732
    return data + 8 + datacnt;
733
#else
734
    return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
735
#endif
736
  }
737
738
  case 3:
739
  {
740
    uint8x16_t result = vld1q_u8(data);
741
742
    vst1q_u8(buffer, result);
743
744
    return data + 16;
745
  }
746
747
  default:
748
    assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
749
    return data;
750
  }
751
}
752
#endif
753
754
#ifdef SIMD_WASM
755
SIMD_TARGET
756
static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
757
{
758
  v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]);
759
  v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]);
760
761
  v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]);
762
  sm1off = wasm_i8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
763
764
  v128_t sm1r = wasm_i8x16_add(sm1, sm1off);
765
766
  return wasmx_unpacklo_v64x2(sm0, sm1r);
767
}
768
769
SIMD_TARGET
770
static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
771
{
772
  // magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
773
  const uint64_t magic = 0x000103070f1f3f80ull;
774
775
  mask0 = uint8_t((wasm_i64x2_extract_lane(mask, 0) * magic) >> 56);
776
  mask1 = uint8_t((wasm_i64x2_extract_lane(mask, 1) * magic) >> 56);
777
}
778
779
SIMD_TARGET
780
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
781
{
782
  switch (bitslog2)
783
  {
784
  case 0:
785
  {
786
    v128_t result = wasm_i8x16_splat(0);
787
788
    wasm_v128_store(buffer, result);
789
790
    return data;
791
  }
792
793
  case 1:
794
  {
795
    v128_t sel2 = wasm_v128_load(data);
796
    v128_t rest = wasm_v128_load(data + 4);
797
798
    v128_t sel22 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2);
799
    v128_t sel2222 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22);
800
    v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3));
801
802
    v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3));
803
804
    unsigned char mask0, mask1;
805
    wasmMoveMask(mask, mask0, mask1);
806
807
    v128_t shuf = decodeShuffleMask(mask0, mask1);
808
809
    v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);
810
811
    wasm_v128_store(buffer, result);
812
813
    return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
814
  }
815
816
  case 2:
817
  {
818
    v128_t sel4 = wasm_v128_load(data);
819
    v128_t rest = wasm_v128_load(data + 8);
820
821
    v128_t sel44 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4);
822
    v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15));
823
824
    v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15));
825
826
    unsigned char mask0, mask1;
827
    wasmMoveMask(mask, mask0, mask1);
828
829
    v128_t shuf = decodeShuffleMask(mask0, mask1);
830
831
    v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);
832
833
    wasm_v128_store(buffer, result);
834
835
    return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
836
  }
837
838
  case 3:
839
  {
840
    v128_t result = wasm_v128_load(data);
841
842
    wasm_v128_store(buffer, result);
843
844
    return data + 16;
845
  }
846
847
  default:
848
    assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
849
    return data;
850
  }
851
}
852
#endif
853
854
#if defined(SIMD_SSE) || defined(SIMD_AVX)
855
SIMD_TARGET
856
static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
857
14.9k
{
858
14.9k
  __m128i t0 = _mm_unpacklo_epi8(x0, x1);
859
14.9k
  __m128i t1 = _mm_unpackhi_epi8(x0, x1);
860
14.9k
  __m128i t2 = _mm_unpacklo_epi8(x2, x3);
861
14.9k
  __m128i t3 = _mm_unpackhi_epi8(x2, x3);
862
863
14.9k
  x0 = _mm_unpacklo_epi16(t0, t2);
864
14.9k
  x1 = _mm_unpackhi_epi16(t0, t2);
865
14.9k
  x2 = _mm_unpacklo_epi16(t1, t3);
866
14.9k
  x3 = _mm_unpackhi_epi16(t1, t3);
867
14.9k
}
868
869
SIMD_TARGET
870
static __m128i unzigzag8(__m128i v)
871
59.7k
{
872
59.7k
  __m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
873
59.7k
  __m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));
874
875
59.7k
  return _mm_xor_si128(xl, xr);
876
59.7k
}
877
#endif
878
879
#ifdef SIMD_NEON
880
static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
881
{
882
  uint8x16x2_t t01 = vzipq_u8(x0, x1);
883
  uint8x16x2_t t23 = vzipq_u8(x2, x3);
884
885
  uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0]));
886
  uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1]));
887
888
  x0 = vreinterpretq_u8_u16(x01.val[0]);
889
  x1 = vreinterpretq_u8_u16(x01.val[1]);
890
  x2 = vreinterpretq_u8_u16(x23.val[0]);
891
  x3 = vreinterpretq_u8_u16(x23.val[1]);
892
}
893
894
static uint8x16_t unzigzag8(uint8x16_t v)
895
{
896
  uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1)))));
897
  uint8x16_t xr = vshrq_n_u8(v, 1);
898
899
  return veorq_u8(xl, xr);
900
}
901
#endif
902
903
#ifdef SIMD_WASM
904
SIMD_TARGET
905
static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
906
{
907
  v128_t t0 = wasmx_unpacklo_v8x16(x0, x1);
908
  v128_t t1 = wasmx_unpackhi_v8x16(x0, x1);
909
  v128_t t2 = wasmx_unpacklo_v8x16(x2, x3);
910
  v128_t t3 = wasmx_unpackhi_v8x16(x2, x3);
911
912
  x0 = wasmx_unpacklo_v16x8(t0, t2);
913
  x1 = wasmx_unpackhi_v16x8(t0, t2);
914
  x2 = wasmx_unpacklo_v16x8(t1, t3);
915
  x3 = wasmx_unpackhi_v16x8(t1, t3);
916
}
917
918
SIMD_TARGET
919
static v128_t unzigzag8(v128_t v)
920
{
921
  v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1)));
922
  v128_t xr = wasm_u8x16_shr(v, 1);
923
924
  return wasm_v128_xor(xl, xr);
925
}
926
#endif
927
928
#if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
929
SIMD_TARGET
930
static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
931
12.9k
{
932
12.9k
  assert(buffer_size % kByteGroupSize == 0);
933
12.9k
  assert(kByteGroupSize == 16);
934
935
12.9k
  const unsigned char* header = data;
936
937
  // round number of groups to 4 to get number of header bytes
938
12.9k
  size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
939
940
12.9k
  if (size_t(data_end - data) < header_size)
941
10
    return NULL;
942
943
12.9k
  data += header_size;
944
945
12.9k
  size_t i = 0;
946
947
  // fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=24b
948
23.0k
  for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4)
949
10.1k
  {
950
10.1k
    size_t header_offset = i / kByteGroupSize;
951
10.1k
    unsigned char header_byte = header[header_offset / 4];
952
953
10.1k
    data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, (header_byte >> 0) & 3);
954
10.1k
    data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, (header_byte >> 2) & 3);
955
10.1k
    data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, (header_byte >> 4) & 3);
956
10.1k
    data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, (header_byte >> 6) & 3);
957
10.1k
  }
958
959
  // slow-path: process remaining groups
960
35.5k
  for (; i < buffer_size; i += kByteGroupSize)
961
23.0k
  {
962
23.0k
    if (size_t(data_end - data) < kByteGroupDecodeLimit)
963
433
      return NULL;
964
965
22.6k
    size_t header_offset = i / kByteGroupSize;
966
967
22.6k
    int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;
968
969
22.6k
    data = decodeBytesGroupSimd(data, buffer + i, bitslog2);
970
22.6k
  }
971
972
12.5k
  return data;
973
12.9k
}
974
975
SIMD_TARGET
976
static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
977
1.01k
{
978
1.01k
  assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
979
980
1.01k
  unsigned char buffer[kVertexBlockMaxSize * 4];
981
1.01k
  unsigned char transposed[kVertexBlockSizeBytes];
982
983
1.01k
  size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
984
985
3.99k
  for (size_t k = 0; k < vertex_size; k += 4)
986
3.43k
  {
987
15.9k
    for (size_t j = 0; j < 4; ++j)
988
12.9k
    {
989
12.9k
      data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned);
990
12.9k
      if (!data)
991
443
        return NULL;
992
12.9k
    }
993
994
2.98k
#if defined(SIMD_SSE) || defined(SIMD_AVX)
995
14.9k
#define TEMP __m128i
996
2.98k
#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex + k))
997
59.7k
#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
998
59.7k
#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
999
238k
#define FIXD(i) t##i = pi = _mm_add_epi8(pi, t##i)
1000
238k
#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
1001
2.98k
#endif
1002
1003
#ifdef SIMD_NEON
1004
#define TEMP uint8x8_t
1005
#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex + k), vdup_n_u32(0), 0))
1006
#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
1007
#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
1008
#define FIXD(i) t##i = pi = vadd_u8(pi, t##i)
1009
#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
1010
#endif
1011
1012
#ifdef SIMD_WASM
1013
#define TEMP v128_t
1014
#define PREP() v128_t pi = wasm_v128_load(last_vertex + k)
1015
#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
1016
#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
1017
#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i)
1018
#define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size
1019
#endif
1020
1021
2.98k
    PREP();
1022
1023
2.98k
    unsigned char* savep = transposed + k;
1024
1025
17.9k
    for (size_t j = 0; j < vertex_count_aligned; j += 16)
1026
14.9k
    {
1027
14.9k
      LOAD(0);
1028
14.9k
      LOAD(1);
1029
14.9k
      LOAD(2);
1030
14.9k
      LOAD(3);
1031
1032
14.9k
      r0 = unzigzag8(r0);
1033
14.9k
      r1 = unzigzag8(r1);
1034
14.9k
      r2 = unzigzag8(r2);
1035
14.9k
      r3 = unzigzag8(r3);
1036
1037
14.9k
      transpose8(r0, r1, r2, r3);
1038
1039
14.9k
      TEMP t0, t1, t2, t3;
1040
1041
14.9k
      GRP4(0);
1042
14.9k
      FIXD(0), FIXD(1), FIXD(2), FIXD(3);
1043
14.9k
      SAVE(0), SAVE(1), SAVE(2), SAVE(3);
1044
1045
14.9k
      GRP4(1);
1046
14.9k
      FIXD(0), FIXD(1), FIXD(2), FIXD(3);
1047
14.9k
      SAVE(0), SAVE(1), SAVE(2), SAVE(3);
1048
1049
14.9k
      GRP4(2);
1050
14.9k
      FIXD(0), FIXD(1), FIXD(2), FIXD(3);
1051
14.9k
      SAVE(0), SAVE(1), SAVE(2), SAVE(3);
1052
1053
14.9k
      GRP4(3);
1054
14.9k
      FIXD(0), FIXD(1), FIXD(2), FIXD(3);
1055
14.9k
      SAVE(0), SAVE(1), SAVE(2), SAVE(3);
1056
1057
14.9k
#undef TEMP
1058
14.9k
#undef PREP
1059
14.9k
#undef LOAD
1060
14.9k
#undef GRP4
1061
14.9k
#undef FIXD
1062
14.9k
#undef SAVE
1063
14.9k
    }
1064
2.98k
  }
1065
1066
567
  memcpy(vertex_data, transposed, vertex_count * vertex_size);
1067
1068
567
  memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);
1069
1070
567
  return data;
1071
1.01k
}
1072
#endif
1073
1074
#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
1075
static unsigned int getCpuFeatures()
1076
2
{
1077
2
  int cpuinfo[4] = {};
1078
#ifdef _MSC_VER
1079
  __cpuid(cpuinfo, 1);
1080
#else
1081
2
  __cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
1082
2
#endif
1083
2
  return cpuinfo[2];
1084
2
}
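
getCpuFeatures() returns ECX of CPUID leaf 1; the decoder only needs bit 9, which reports SSSE3, and that is the bit tested by the dispatch in meshopt_decodeVertexBuffer() below. A one-line sketch of that check (hasSSSE3 is an illustrative name):

static bool hasSSSE3(unsigned int cpuid_ecx)
{
    return (cpuid_ecx & (1u << 9)) != 0; // CPUID.1:ECX bit 9 = SSSE3
}
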
1085
1086
static unsigned int cpuid = getCpuFeatures();
1087
#endif
1088
1089
} // namespace meshopt
1090
1091
size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
1092
0
{
1093
0
  using namespace meshopt;
1094
1095
0
  assert(vertex_size > 0 && vertex_size <= 256);
1096
0
  assert(vertex_size % 4 == 0);
1097
1098
0
  const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);
1099
1100
0
  unsigned char* data = buffer;
1101
0
  unsigned char* data_end = buffer + buffer_size;
1102
1103
0
  if (size_t(data_end - data) < 1 + vertex_size)
1104
0
    return 0;
1105
1106
0
  int version = gEncodeVertexVersion;
1107
1108
0
  *data++ = (unsigned char)(kVertexHeader | version);
1109
1110
0
  unsigned char first_vertex[256] = {};
1111
0
  if (vertex_count > 0)
1112
0
    memcpy(first_vertex, vertex_data, vertex_size);
1113
1114
0
  unsigned char last_vertex[256] = {};
1115
0
  memcpy(last_vertex, first_vertex, vertex_size);
1116
1117
0
  size_t vertex_block_size = getVertexBlockSize(vertex_size);
1118
1119
0
  size_t vertex_offset = 0;
1120
1121
0
  while (vertex_offset < vertex_count)
1122
0
  {
1123
0
    size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
1124
1125
0
    data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
1126
0
    if (!data)
1127
0
      return 0;
1128
1129
0
    vertex_offset += block_size;
1130
0
  }
1131
1132
0
  size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
1133
1134
0
  if (size_t(data_end - data) < tail_size)
1135
0
    return 0;
1136
1137
  // write first vertex to the end of the stream and pad it to 32 bytes; this is important to simplify bounds checks in decoder
1138
0
  if (vertex_size < kTailMaxSize)
1139
0
  {
1140
0
    memset(data, 0, kTailMaxSize - vertex_size);
1141
0
    data += kTailMaxSize - vertex_size;
1142
0
  }
1143
1144
0
  memcpy(data, first_vertex, vertex_size);
1145
0
  data += vertex_size;
1146
1147
0
  assert(data >= buffer + tail_size);
1148
0
  assert(data <= buffer + buffer_size);
1149
1150
0
  return data - buffer;
1151
0
}
1152
1153
size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
1154
0
{
1155
0
  using namespace meshopt;
1156
1157
0
  assert(vertex_size > 0 && vertex_size <= 256);
1158
0
  assert(vertex_size % 4 == 0);
1159
1160
0
  size_t vertex_block_size = getVertexBlockSize(vertex_size);
1161
0
  size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size;
1162
1163
0
  size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4;
1164
0
  size_t vertex_block_data_size = vertex_block_size;
1165
1166
0
  size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
1167
1168
0
  return 1 + vertex_block_count * vertex_size * (vertex_block_header_size + vertex_block_data_size) + tail_size;
1169
0
}
1170
1171
void meshopt_encodeVertexVersion(int version)
1172
0
{
1173
0
  assert(unsigned(version) <= 0);
1174
1175
0
  meshopt::gEncodeVertexVersion = version;
1176
0
}
1177
1178
int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
1179
2.01k
{
1180
2.01k
  using namespace meshopt;
1181
1182
2.01k
  assert(vertex_size > 0 && vertex_size <= 256);
1183
2.01k
  assert(vertex_size % 4 == 0);
1184
1185
2.01k
  const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = NULL;
1186
1187
2.01k
#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
1188
2.01k
  decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
1189
#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
1190
  decode = decodeVertexBlockSimd;
1191
#else
1192
  decode = decodeVertexBlock;
1193
#endif
1194
1195
2.01k
#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
1196
2.01k
  assert(gDecodeBytesGroupInitialized);
1197
2.01k
  (void)gDecodeBytesGroupInitialized;
1198
2.01k
#endif
1199
1200
2.01k
  unsigned char* vertex_data = static_cast<unsigned char*>(destination);
1201
1202
2.01k
  const unsigned char* data = buffer;
1203
2.01k
  const unsigned char* data_end = buffer + buffer_size;
1204
1205
2.01k
  if (size_t(data_end - data) < 1 + vertex_size)
1206
110
    return -2;
1207
1208
1.90k
  unsigned char data_header = *data++;
1209
1210
1.90k
  if ((data_header & 0xf0) != kVertexHeader)
1211
880
    return -1;
1212
1213
1.02k
  int version = data_header & 0x0f;
1214
1.02k
  if (version > 0)
1215
16
    return -1;
1216
1217
1.01k
  unsigned char last_vertex[256];
1218
1.01k
  memcpy(last_vertex, data_end - vertex_size, vertex_size);
1219
1220
1.01k
  size_t vertex_block_size = getVertexBlockSize(vertex_size);
1221
1222
1.01k
  size_t vertex_offset = 0;
1223
1224
1.57k
  while (vertex_offset < vertex_count)
1225
1.01k
  {
1226
1.01k
    size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
1227
1228
1.01k
    data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
1229
1.01k
    if (!data)
1230
443
      return -2;
1231
1232
567
    vertex_offset += block_size;
1233
567
  }
1234
1235
567
  size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
1236
1237
567
  if (size_t(data_end - data) != tail_size)
1238
562
    return -3;
1239
1240
5
  return 0;
1241
567
}
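
A minimal round trip through the public entry points defined above (the 16-byte vertex layout and the count of 100 are arbitrary choices for the example; any vertex_size that is a multiple of 4 and at most 256 works):

#include <cassert>
#include <cstring>
#include <vector>

#include "meshoptimizer.h"

void roundTripExample()
{
    struct Vertex { float x, y, z, w; }; // 16 bytes, vertex_size % 4 == 0

    std::vector<Vertex> vertices(100); // value-initialized (all zeroes) for brevity

    std::vector<unsigned char> encoded(meshopt_encodeVertexBufferBound(vertices.size(), sizeof(Vertex)));
    encoded.resize(meshopt_encodeVertexBuffer(encoded.data(), encoded.size(), vertices.data(), vertices.size(), sizeof(Vertex)));
    assert(!encoded.empty()); // a zero return would mean the output buffer was too small

    std::vector<Vertex> decoded(vertices.size());
    int rc = meshopt_decodeVertexBuffer(decoded.data(), decoded.size(), sizeof(Vertex), encoded.data(), encoded.size());
    assert(rc == 0); // negative return values indicate a malformed or truncated stream

    assert(std::memcmp(decoded.data(), vertices.data(), vertices.size() * sizeof(Vertex)) == 0);
}
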
1242
1243
#undef SIMD_NEON
1244
#undef SIMD_SSE
1245
#undef SIMD_AVX
1246
#undef SIMD_WASM
1247
#undef SIMD_FALLBACK
1248
#undef SIMD_TARGET