Coverage Report

Created: 2025-12-14 06:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/meshoptimizer/src/vertexcodec.cpp
Line
Count
Source
1
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
2
#include "meshoptimizer.h"
3
4
#include <assert.h>
5
#include <string.h>
6
7
// The block below auto-detects SIMD ISA that can be used on the target platform
8
#ifndef MESHOPTIMIZER_NO_SIMD
9
10
// The SIMD implementation requires SSSE3, which can be enabled unconditionally through compiler settings
11
#if defined(__AVX__) || defined(__SSSE3__)
12
#define SIMD_SSE
13
#endif
14
15
// An experimental implementation using AVX512 instructions; it's only enabled when AVX512 is enabled through compiler settings
16
#if defined(__AVX512VBMI2__) && defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__POPCNT__)
17
#undef SIMD_SSE
18
#define SIMD_AVX
19
#endif
20
21
// MSVC supports compiling SSSE3 code regardless of compile options; we use a cpuid-based scalar fallback
22
#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
23
#define SIMD_SSE
24
#define SIMD_FALLBACK
25
#endif
26
27
// GCC 4.9+ and clang 3.8+ support targeting SIMD ISA from individual functions; we use a cpuid-based scalar fallback
28
#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && ((defined(__clang__) && __clang_major__ * 100 + __clang_minor__ >= 308) || (defined(__GNUC__) && __GNUC__ * 100 + __GNUC_MINOR__ >= 409)) && (defined(__i386__) || defined(__x86_64__))
29
#define SIMD_SSE
30
#define SIMD_FALLBACK
31
#define SIMD_TARGET __attribute__((target("ssse3")))
32
#endif
33
34
// GCC/clang define these when NEON support is available
35
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
36
#define SIMD_NEON
37
#endif
38
39
// On MSVC, we assume that ARM builds always target NEON-capable devices
40
#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
41
#define SIMD_NEON
42
#endif
43
44
// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
45
#if defined(__wasm_simd128__)
46
#define SIMD_WASM
47
// Prevent compiling other variant when wasm simd compilation is active
48
#undef SIMD_NEON
49
#undef SIMD_SSE
50
#undef SIMD_AVX
51
#endif
52
53
#ifndef SIMD_TARGET
54
#define SIMD_TARGET
55
#endif
56
57
// When targeting AArch64/x64, optimize for latency to allow decoding of individual 16-byte groups to overlap
58
// We don't do this for 32-bit systems because we need 64-bit math for this and this will hurt in-order CPUs
59
#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64)
60
#define SIMD_LATENCYOPT
61
#endif
62
63
// In switch dispatch, marking default case as unreachable allows to remove redundant bounds checks
64
#if defined(__GNUC__)
65
0
#define SIMD_UNREACHABLE() __builtin_unreachable()
66
#elif defined(_MSC_VER)
67
#define SIMD_UNREACHABLE() __assume(false)
68
#else
69
#define SIMD_UNREACHABLE() assert(!"Unreachable")
70
#endif
71
72
#endif // !MESHOPTIMIZER_NO_SIMD
73
74
#ifdef SIMD_SSE
75
#include <tmmintrin.h>
76
#endif
77
78
#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
79
#ifdef _MSC_VER
80
#include <intrin.h> // __cpuid
81
#else
82
#include <cpuid.h> // __cpuid
83
#endif
84
#endif
85
86
#ifdef SIMD_AVX
87
#include <immintrin.h>
88
#endif
89
90
#ifdef SIMD_NEON
91
#if defined(_MSC_VER) && defined(_M_ARM64)
92
#include <arm64_neon.h>
93
#else
94
#include <arm_neon.h>
95
#endif
96
#endif
97
98
#ifdef SIMD_WASM
99
#include <wasm_simd128.h>
100
#endif
101
102
#ifndef TRACE
103
#define TRACE 0
104
#endif
105
106
#if TRACE
107
#include <stdio.h>
108
#endif
109
110
#ifdef SIMD_WASM
111
#define wasmx_splat_v32x4(v, i) wasm_i32x4_shuffle(v, v, i, i, i, i)
112
#define wasmx_unpacklo_v8x16(a, b) wasm_i8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
113
#define wasmx_unpackhi_v8x16(a, b) wasm_i8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
114
#define wasmx_unpacklo_v16x8(a, b) wasm_i16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
115
#define wasmx_unpackhi_v16x8(a, b) wasm_i16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
116
#define wasmx_unpacklo_v64x2(a, b) wasm_i64x2_shuffle(a, b, 0, 2)
117
#define wasmx_unpackhi_v64x2(a, b) wasm_i64x2_shuffle(a, b, 1, 3)
118
#endif
119
120
namespace meshopt
121
{
122
123
const unsigned char kVertexHeader = 0xa0;
124
125
static int gEncodeVertexVersion = 1;
126
const int kDecodeVertexVersion = 1;
127
128
const size_t kVertexBlockSizeBytes = 8192;
129
const size_t kVertexBlockMaxSize = 256;
130
const size_t kByteGroupSize = 16;
131
const size_t kByteGroupDecodeLimit = 24;
132
const size_t kTailMinSizeV0 = 32;
133
const size_t kTailMinSizeV1 = 24;
134
135
static const int kBitsV0[4] = {0, 2, 4, 8};
136
static const int kBitsV1[5] = {0, 1, 2, 4, 8};
137
138
const int kEncodeDefaultLevel = 2;
139
140
static size_t getVertexBlockSize(size_t vertex_size)
141
25.6k
{
142
  // make sure the entire block fits into the scratch buffer and is aligned to byte group size
143
  // note: the block size is implicitly part of the format, so we can't change it without breaking compatibility
144
25.6k
  size_t result = (kVertexBlockSizeBytes / vertex_size) & ~(kByteGroupSize - 1);
145
146
25.6k
  return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
147
25.6k
}
148
149
inline unsigned int rotate(unsigned int v, int r)
150
144M
{
151
144M
  return (v << r) | (v >> ((32 - r) & 31));
152
144M
}
153
154
template <typename T>
155
inline T zigzag(T v)
156
952M
{
157
952M
  return (0 - (v >> (sizeof(T) * 8 - 1))) ^ (v << 1);
158
952M
}
unsigned char meshopt::zigzag<unsigned char>(unsigned char)
Line
Count
Source
156
806M
{
157
806M
  return (0 - (v >> (sizeof(T) * 8 - 1))) ^ (v << 1);
158
806M
}
unsigned short meshopt::zigzag<unsigned short>(unsigned short)
Line
Count
Source
156
146M
{
157
146M
  return (0 - (v >> (sizeof(T) * 8 - 1))) ^ (v << 1);
158
146M
}
Unexecuted instantiation: unsigned int meshopt::zigzag<unsigned int>(unsigned int)
159
160
template <typename T>
161
inline T unzigzag(T v)
162
0
{
163
0
  return (0 - (v & 1)) ^ (v >> 1);
164
0
}
Unexecuted instantiation: unsigned char meshopt::unzigzag<unsigned char>(unsigned char)
Unexecuted instantiation: unsigned short meshopt::unzigzag<unsigned short>(unsigned short)
Unexecuted instantiation: unsigned int meshopt::unzigzag<unsigned int>(unsigned int)
165
166
#if TRACE
167
struct Stats
168
{
169
  size_t size;
170
  size_t header;  // bytes for header
171
  size_t bitg[9]; // bytes for bit groups
172
  size_t bitc[8]; // bit consistency: how many bits are shared between all bytes in a group
173
  size_t ctrl[4]; // number of control groups
174
};
175
176
static Stats* bytestats = NULL;
177
static Stats vertexstats[256];
178
#endif
179
180
static bool encodeBytesGroupZero(const unsigned char* buffer)
181
38.3M
{
182
38.3M
  assert(kByteGroupSize == sizeof(unsigned long long) * 2);
183
184
38.3M
  unsigned long long v[2];
185
38.3M
  memcpy(v, buffer, sizeof(v));
186
187
38.3M
  return (v[0] | v[1]) == 0;
188
38.3M
}
189
190
static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
191
261M
{
192
261M
  assert(bits >= 0 && bits <= 8);
193
194
261M
  if (bits == 0)
195
22.8M
    return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);
196
197
239M
  if (bits == 8)
198
58.2M
    return kByteGroupSize;
199
200
180M
  size_t result = kByteGroupSize * bits / 8;
201
202
180M
  unsigned char sentinel = (1 << bits) - 1;
203
204
3.07G
  for (size_t i = 0; i < kByteGroupSize; ++i)
205
2.89G
    result += buffer[i] >= sentinel;
206
207
180M
  return result;
208
239M
}
209
210
static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits)
211
27.1M
{
212
27.1M
  assert(bits >= 0 && bits <= 8);
213
27.1M
  assert(kByteGroupSize % 8 == 0);
214
215
27.1M
  if (bits == 0)
216
3.14M
    return data;
217
218
23.9M
  if (bits == 8)
219
15.9M
  {
220
15.9M
    memcpy(data, buffer, kByteGroupSize);
221
15.9M
    return data + kByteGroupSize;
222
15.9M
  }
223
224
8.07M
  size_t byte_size = 8 / bits;
225
8.07M
  assert(kByteGroupSize % byte_size == 0);
226
227
  // fixed portion: bits bits for each value
228
  // variable portion: full byte for each out-of-range value (using 1...1 as sentinel)
229
8.07M
  unsigned char sentinel = (1 << bits) - 1;
230
231
31.6M
  for (size_t i = 0; i < kByteGroupSize; i += byte_size)
232
23.5M
  {
233
23.5M
    unsigned char byte = 0;
234
235
152M
    for (size_t k = 0; k < byte_size; ++k)
236
129M
    {
237
129M
      unsigned char enc = (buffer[i + k] >= sentinel) ? sentinel : buffer[i + k];
238
239
129M
      byte <<= bits;
240
129M
      byte |= enc;
241
129M
    }
242
243
    // encode 1-bit groups in reverse bit order
244
    // this makes them faster to decode alongside other groups
245
23.5M
    if (bits == 1)
246
9.78M
      byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);
247
248
23.5M
    *data++ = byte;
249
23.5M
  }
250
251
137M
  for (size_t i = 0; i < kByteGroupSize; ++i)
252
129M
  {
253
129M
    unsigned char v = buffer[i];
254
255
    // branchless append of out-of-range values
256
129M
    *data = v;
257
129M
    data += v >= sentinel;
258
129M
  }
259
260
8.07M
  return data;
261
8.07M
}
262
263
static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size, const int bits[4])
264
1.77M
{
265
1.77M
  assert(buffer_size % kByteGroupSize == 0);
266
267
1.77M
  unsigned char* header = data;
268
269
  // round number of groups to 4 to get number of header bytes
270
1.77M
  size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
271
272
1.77M
  if (size_t(data_end - data) < header_size)
273
0
    return NULL;
274
275
1.77M
  data += header_size;
276
277
1.77M
  memset(header, 0, header_size);
278
279
1.77M
  int last_bits = -1;
280
281
28.9M
  for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
282
27.1M
  {
283
27.1M
    if (size_t(data_end - data) < kByteGroupDecodeLimit)
284
0
      return NULL;
285
286
27.1M
    int best_bitk = 3;
287
27.1M
    size_t best_size = encodeBytesGroupMeasure(buffer + i, bits[best_bitk]);
288
289
108M
    for (int bitk = 0; bitk < 3; ++bitk)
290
81.4M
    {
291
81.4M
      size_t size = encodeBytesGroupMeasure(buffer + i, bits[bitk]);
292
293
      // favor consistent bit selection across groups, but never replace literals
294
81.4M
      if (size < best_size || (size == best_size && bits[bitk] == last_bits && bits[best_bitk] != 8))
295
13.4M
      {
296
13.4M
        best_bitk = bitk;
297
13.4M
        best_size = size;
298
13.4M
      }
299
81.4M
    }
300
301
27.1M
    size_t header_offset = i / kByteGroupSize;
302
27.1M
    header[header_offset / 4] |= best_bitk << ((header_offset % 4) * 2);
303
304
27.1M
    int best_bits = bits[best_bitk];
305
27.1M
    unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);
306
307
27.1M
    assert(data + best_size == next);
308
27.1M
    data = next;
309
27.1M
    last_bits = best_bits;
310
311
#if TRACE
312
    bytestats->bitg[best_bits] += best_size;
313
#endif
314
27.1M
  }
315
316
#if TRACE
317
  bytestats->header += header_size;
318
#endif
319
320
1.77M
  return data;
321
1.77M
}
322
323
template <typename T, bool Xor>
324
static void encodeDeltas1(unsigned char* buffer, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[256], size_t k, int rot)
325
4.42M
{
326
4.42M
  size_t k0 = k & ~(sizeof(T) - 1);
327
4.42M
  int ks = (k & (sizeof(T) - 1)) * 8;
328
329
4.42M
  T p = last_vertex[k0];
330
6.46M
  for (size_t j = 1; j < sizeof(T); ++j)
331
2.04M
    p |= T(last_vertex[k0 + j]) << (j * 8);
332
333
4.42M
  const unsigned char* vertex = vertex_data + k0;
334
335
1.07G
  for (size_t i = 0; i < vertex_count; ++i)
336
1.06G
  {
337
1.06G
    T v = vertex[0];
338
1.55G
    for (size_t j = 1; j < sizeof(T); ++j)
339
486M
      v |= vertex[j] << (j * 8);
340
341
1.06G
    T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p));
342
343
1.06G
    buffer[i] = (unsigned char)(d >> ks);
344
1.06G
    p = v;
345
1.06G
    vertex += vertex_size;
346
1.06G
  }
347
4.42M
}
vertexcodec.cpp:void meshopt::encodeDeltas1<unsigned char, false>(unsigned char*, unsigned char const*, unsigned long, unsigned long, unsigned char const*, unsigned long, int)
Line
Count
Source
325
3.33M
{
326
3.33M
  size_t k0 = k & ~(sizeof(T) - 1);
327
3.33M
  int ks = (k & (sizeof(T) - 1)) * 8;
328
329
3.33M
  T p = last_vertex[k0];
330
3.33M
  for (size_t j = 1; j < sizeof(T); ++j)
331
0
    p |= T(last_vertex[k0 + j]) << (j * 8);
332
333
3.33M
  const unsigned char* vertex = vertex_data + k0;
334
335
809M
  for (size_t i = 0; i < vertex_count; ++i)
336
806M
  {
337
806M
    T v = vertex[0];
338
806M
    for (size_t j = 1; j < sizeof(T); ++j)
339
0
      v |= vertex[j] << (j * 8);
340
341
806M
    T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p));
342
343
806M
    buffer[i] = (unsigned char)(d >> ks);
344
806M
    p = v;
345
806M
    vertex += vertex_size;
346
806M
  }
347
3.33M
}
vertexcodec.cpp:void meshopt::encodeDeltas1<unsigned short, false>(unsigned char*, unsigned char const*, unsigned long, unsigned long, unsigned char const*, unsigned long, int)
Line
Count
Source
325
610k
{
326
610k
  size_t k0 = k & ~(sizeof(T) - 1);
327
610k
  int ks = (k & (sizeof(T) - 1)) * 8;
328
329
610k
  T p = last_vertex[k0];
330
1.22M
  for (size_t j = 1; j < sizeof(T); ++j)
331
610k
    p |= T(last_vertex[k0 + j]) << (j * 8);
332
333
610k
  const unsigned char* vertex = vertex_data + k0;
334
335
146M
  for (size_t i = 0; i < vertex_count; ++i)
336
146M
  {
337
146M
    T v = vertex[0];
338
292M
    for (size_t j = 1; j < sizeof(T); ++j)
339
146M
      v |= vertex[j] << (j * 8);
340
341
146M
    T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p));
342
343
146M
    buffer[i] = (unsigned char)(d >> ks);
344
146M
    p = v;
345
146M
    vertex += vertex_size;
346
146M
  }
347
610k
}
vertexcodec.cpp:void meshopt::encodeDeltas1<unsigned int, true>(unsigned char*, unsigned char const*, unsigned long, unsigned long, unsigned char const*, unsigned long, int)
Line
Count
Source
325
478k
{
326
478k
  size_t k0 = k & ~(sizeof(T) - 1);
327
478k
  int ks = (k & (sizeof(T) - 1)) * 8;
328
329
478k
  T p = last_vertex[k0];
330
1.91M
  for (size_t j = 1; j < sizeof(T); ++j)
331
1.43M
    p |= T(last_vertex[k0 + j]) << (j * 8);
332
333
478k
  const unsigned char* vertex = vertex_data + k0;
334
335
114M
  for (size_t i = 0; i < vertex_count; ++i)
336
113M
  {
337
113M
    T v = vertex[0];
338
454M
    for (size_t j = 1; j < sizeof(T); ++j)
339
340M
      v |= vertex[j] << (j * 8);
340
341
113M
    T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p));
342
343
113M
    buffer[i] = (unsigned char)(d >> ks);
344
113M
    p = v;
345
113M
    vertex += vertex_size;
346
113M
  }
347
478k
}
348
349
static void encodeDeltas(unsigned char* buffer, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[256], size_t k, int channel)
350
4.42M
{
351
4.42M
  switch (channel & 3)
352
4.42M
  {
353
3.33M
  case 0:
354
3.33M
    return encodeDeltas1<unsigned char, false>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0);
355
610k
  case 1:
356
610k
    return encodeDeltas1<unsigned short, false>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0);
357
478k
  case 2:
358
478k
    return encodeDeltas1<unsigned int, true>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, channel >> 4);
359
0
  default:
360
0
    assert(!"Unsupported channel encoding"); // unreachable
361
4.42M
  }
362
4.42M
}
363
364
static int estimateBits(unsigned char v)
365
123M
{
366
123M
  return v <= 15 ? (v <= 3 ? (v == 0 ? 0 : 2) : 4) : 8;
367
123M
}
368
369
static int estimateRotate(const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, size_t k, size_t group_size)
370
12.9k
{
371
12.9k
  size_t sizes[8] = {};
372
373
12.9k
  const unsigned char* vertex = vertex_data + k;
374
12.9k
  unsigned int last = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24);
375
376
3.88M
  for (size_t i = 0; i < vertex_count; i += group_size)
377
3.87M
  {
378
3.87M
    unsigned int bitg = 0;
379
380
    // calculate bit consistency mask for the group
381
65.7M
    for (size_t j = 0; j < group_size && i + j < vertex_count; ++j)
382
61.8M
    {
383
61.8M
      unsigned int v = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24);
384
61.8M
      unsigned int d = v ^ last;
385
386
61.8M
      bitg |= d;
387
61.8M
      last = v;
388
61.8M
      vertex += vertex_size;
389
61.8M
    }
390
391
#if TRACE
392
    for (int j = 0; j < 32; ++j)
393
      vertexstats[k + (j / 8)].bitc[j % 8] += (i + group_size < vertex_count ? group_size : vertex_count - i) * (1 - ((bitg >> j) & 1));
394
#endif
395
396
34.8M
    for (int j = 0; j < 8; ++j)
397
30.9M
    {
398
30.9M
      unsigned int bitr = rotate(bitg, j);
399
400
30.9M
      sizes[j] += estimateBits((unsigned char)(bitr >> 0)) + estimateBits((unsigned char)(bitr >> 8));
401
30.9M
      sizes[j] += estimateBits((unsigned char)(bitr >> 16)) + estimateBits((unsigned char)(bitr >> 24));
402
30.9M
    }
403
3.87M
  }
404
405
12.9k
  int best_rot = 0;
406
103k
  for (int rot = 1; rot < 8; ++rot)
407
90.3k
    best_rot = (sizes[rot] < sizes[best_rot]) ? rot : best_rot;
408
409
12.9k
  return best_rot;
410
12.9k
}
411
412
static int estimateChannel(const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, size_t k, size_t vertex_block_size, size_t block_skip, int max_channel, int xor_rot)
413
14.6k
{
414
14.6k
  unsigned char block[kVertexBlockMaxSize];
415
14.6k
  assert(vertex_block_size <= kVertexBlockMaxSize);
416
417
14.6k
  unsigned char last_vertex[256] = {};
418
419
14.6k
  size_t sizes[3] = {};
420
14.6k
  assert(max_channel <= 3);
421
422
133k
  for (size_t i = 0; i < vertex_count; i += vertex_block_size * block_skip)
423
119k
  {
424
119k
    size_t block_size = i + vertex_block_size < vertex_count ? vertex_block_size : vertex_count - i;
425
119k
    size_t block_size_aligned = (block_size + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
426
427
119k
    memcpy(last_vertex, vertex_data + (i == 0 ? 0 : i - 1) * vertex_size, vertex_size);
428
429
    // we sometimes encode elements we didn't fill when rounding to kByteGroupSize
430
119k
    if (block_size < block_size_aligned)
431
9.35k
      memset(block + block_size, 0, block_size_aligned - block_size);
432
433
448k
    for (int channel = 0; channel < max_channel; ++channel)
434
1.64M
      for (size_t j = 0; j < 4; ++j)
435
1.31M
      {
436
1.31M
        encodeDeltas(block, vertex_data + i * vertex_size, block_size, vertex_size, last_vertex, k + j, channel | (xor_rot << 4));
437
438
20.9M
        for (size_t ig = 0; ig < block_size; ig += kByteGroupSize)
439
19.5M
        {
440
          // to maximize encoding performance we only evaluate 1/2/4/8 bit groups
441
19.5M
          size_t size1 = encodeBytesGroupMeasure(block + ig, 1);
442
19.5M
          size_t size2 = encodeBytesGroupMeasure(block + ig, 2);
443
19.5M
          size_t size4 = encodeBytesGroupMeasure(block + ig, 4);
444
19.5M
          size_t size8 = encodeBytesGroupMeasure(block + ig, 8);
445
446
19.5M
          size_t best_size = size1 < size2 ? size1 : size2;
447
19.5M
          best_size = best_size < size4 ? best_size : size4;
448
19.5M
          best_size = best_size < size8 ? best_size : size8;
449
450
19.5M
          sizes[channel] += best_size;
451
19.5M
        }
452
1.31M
      }
453
119k
  }
454
455
14.6k
  int best_channel = 0;
456
42.2k
  for (int channel = 1; channel < max_channel; ++channel)
457
27.5k
    best_channel = (sizes[channel] < sizes[best_channel]) ? channel : best_channel;
458
459
14.6k
  return best_channel == 2 ? best_channel | (xor_rot << 4) : best_channel;
460
14.6k
}
461
462
static bool estimateControlZero(const unsigned char* buffer, size_t vertex_count_aligned)
463
2.78M
{
464
16.3M
  for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize)
465
15.5M
    if (!encodeBytesGroupZero(buffer + i))
466
1.97M
      return false;
467
468
812k
  return true;
469
2.78M
}
470
471
static int estimateControl(const unsigned char* buffer, size_t vertex_count, size_t vertex_count_aligned, int level)
472
2.78M
{
473
2.78M
  if (estimateControlZero(buffer, vertex_count_aligned))
474
812k
    return 2; // zero encoding
475
476
1.97M
  if (level == 0)
477
980k
    return 1; // 1248 encoding in level 0 for encoding speed
478
479
  // round number of groups to 4 to get number of header bytes
480
992k
  size_t header_size = (vertex_count_aligned / kByteGroupSize + 3) / 4;
481
482
992k
  size_t est_bytes0 = header_size, est_bytes1 = header_size;
483
484
15.9M
  for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize)
485
14.9M
  {
486
    // assumes kBitsV1[] = {0, 1, 2, 4, 8} for performance
487
14.9M
    size_t size0 = encodeBytesGroupMeasure(buffer + i, 0);
488
14.9M
    size_t size1 = encodeBytesGroupMeasure(buffer + i, 1);
489
14.9M
    size_t size2 = encodeBytesGroupMeasure(buffer + i, 2);
490
14.9M
    size_t size4 = encodeBytesGroupMeasure(buffer + i, 4);
491
14.9M
    size_t size8 = encodeBytesGroupMeasure(buffer + i, 8);
492
493
    // both control modes have access to 1/2/4 bit encoding
494
14.9M
    size_t size12 = size1 < size2 ? size1 : size2;
495
14.9M
    size_t size124 = size12 < size4 ? size12 : size4;
496
497
    // each control mode has access to 0/8 bit encoding respectively
498
14.9M
    est_bytes0 += size124 < size0 ? size124 : size0;
499
14.9M
    est_bytes1 += size124 < size8 ? size124 : size8;
500
14.9M
  }
501
502
  // pick shortest control entry but prefer literal encoding
503
992k
  if (est_bytes0 < vertex_count || est_bytes1 < vertex_count)
504
479k
    return est_bytes0 < est_bytes1 ? 0 : 1;
505
513k
  else
506
513k
    return 3; // literal encoding
507
992k
}
508
509
static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version, int level)
510
292k
{
511
292k
  assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
512
292k
  assert(vertex_size % 4 == 0);
513
514
292k
  unsigned char buffer[kVertexBlockMaxSize];
515
292k
  assert(sizeof(buffer) % kByteGroupSize == 0);
516
517
292k
  size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
518
519
  // we sometimes encode elements we didn't fill when rounding to kByteGroupSize
520
292k
  memset(buffer, 0, sizeof(buffer));
521
522
292k
  size_t control_size = version == 0 ? 0 : vertex_size / 4;
523
292k
  if (size_t(data_end - data) < control_size)
524
0
    return NULL;
525
526
292k
  unsigned char* control = data;
527
292k
  data += control_size;
528
529
292k
  memset(control, 0, control_size);
530
531
3.39M
  for (size_t k = 0; k < vertex_size; ++k)
532
3.10M
  {
533
3.10M
    encodeDeltas(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, version == 0 ? 0 : channels[k / 4]);
534
535
#if TRACE
536
    const unsigned char* olddata = data;
537
    bytestats = &vertexstats[k];
538
#endif
539
540
3.10M
    int ctrl = 0;
541
542
3.10M
    if (version != 0)
543
2.78M
    {
544
2.78M
      ctrl = estimateControl(buffer, vertex_count, vertex_count_aligned, level);
545
546
2.78M
      assert(unsigned(ctrl) < 4);
547
2.78M
      control[k / 4] |= ctrl << ((k % 4) * 2);
548
549
#if TRACE
550
      vertexstats[k].ctrl[ctrl]++;
551
#endif
552
2.78M
    }
553
554
3.10M
    if (ctrl == 3)
555
513k
    {
556
      // literal encoding
557
513k
      if (size_t(data_end - data) < vertex_count)
558
0
        return NULL;
559
560
513k
      memcpy(data, buffer, vertex_count);
561
513k
      data += vertex_count;
562
513k
    }
563
2.58M
    else if (ctrl != 2) // non-zero encoding
564
1.77M
    {
565
1.77M
      data = encodeBytes(data, data_end, buffer, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl);
566
1.77M
      if (!data)
567
0
        return NULL;
568
1.77M
    }
569
570
#if TRACE
571
    bytestats = NULL;
572
    vertexstats[k].size += data - olddata;
573
#endif
574
3.10M
  }
575
576
292k
  memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);
577
578
292k
  return data;
579
292k
}
580
581
#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX) && !defined(SIMD_WASM))
582
static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bits)
583
0
{
584
0
#define READ() byte = *data++
585
0
#define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? encv : enc, data_var += (enc == (1 << bits) - 1)
586
587
0
  unsigned char byte, enc, encv;
588
0
  const unsigned char* data_var;
589
590
0
  switch (bits)
591
0
  {
592
0
  case 0:
593
0
    memset(buffer, 0, kByteGroupSize);
594
0
    return data;
595
0
  case 1:
596
0
    data_var = data + 2;
597
598
    // 2 groups with 8 1-bit values in each byte (reversed from the order in other groups)
599
0
    READ();
600
0
    byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);
601
0
    NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1);
602
0
    READ();
603
0
    byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);
604
0
    NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1);
605
606
0
    return data_var;
607
0
  case 2:
608
0
    data_var = data + 4;
609
610
    // 4 groups with 4 2-bit values in each byte
611
0
    READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
612
0
    READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
613
0
    READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
614
0
    READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
615
616
0
    return data_var;
617
0
  case 4:
618
0
    data_var = data + 8;
619
620
    // 8 groups with 2 4-bit values in each byte
621
0
    READ(), NEXT(4), NEXT(4);
622
0
    READ(), NEXT(4), NEXT(4);
623
0
    READ(), NEXT(4), NEXT(4);
624
0
    READ(), NEXT(4), NEXT(4);
625
0
    READ(), NEXT(4), NEXT(4);
626
0
    READ(), NEXT(4), NEXT(4);
627
0
    READ(), NEXT(4), NEXT(4);
628
0
    READ(), NEXT(4), NEXT(4);
629
630
0
    return data_var;
631
0
  case 8:
632
0
    memcpy(buffer, data, kByteGroupSize);
633
0
    return data + kByteGroupSize;
634
0
  default:
635
0
    assert(!"Unexpected bit length"); // unreachable
636
0
    return data;
637
0
  }
638
639
0
#undef READ
640
0
#undef NEXT
641
0
}
642
643
static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size, const int* bits)
644
0
{
645
0
  assert(buffer_size % kByteGroupSize == 0);
646
647
  // round number of groups to 4 to get number of header bytes
648
0
  size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
649
0
  if (size_t(data_end - data) < header_size)
650
0
    return NULL;
651
652
0
  const unsigned char* header = data;
653
0
  data += header_size;
654
655
0
  for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
656
0
  {
657
0
    if (size_t(data_end - data) < kByteGroupDecodeLimit)
658
0
      return NULL;
659
660
0
    size_t header_offset = i / kByteGroupSize;
661
0
    int bitsk = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;
662
663
0
    data = decodeBytesGroup(data, buffer + i, bits[bitsk]);
664
0
  }
665
666
0
  return data;
667
0
}
668
669
template <typename T, bool Xor>
670
static void decodeDeltas1(const unsigned char* buffer, unsigned char* transposed, size_t vertex_count, size_t vertex_size, const unsigned char* last_vertex, int rot)
671
0
{
672
0
  for (size_t k = 0; k < 4; k += sizeof(T))
673
0
  {
674
0
    size_t vertex_offset = k;
675
676
0
    T p = last_vertex[0];
677
0
    for (size_t j = 1; j < sizeof(T); ++j)
678
0
      p |= last_vertex[j] << (8 * j);
679
680
0
    for (size_t i = 0; i < vertex_count; ++i)
681
0
    {
682
0
      T v = buffer[i];
683
0
      for (size_t j = 1; j < sizeof(T); ++j)
684
0
        v |= buffer[i + vertex_count * j] << (8 * j);
685
686
0
      v = Xor ? T(rotate(v, rot)) ^ p : unzigzag(v) + p;
687
688
0
      for (size_t j = 0; j < sizeof(T); ++j)
689
0
        transposed[vertex_offset + j] = (unsigned char)(v >> (j * 8));
690
691
0
      p = v;
692
693
0
      vertex_offset += vertex_size;
694
0
    }
695
696
0
    buffer += vertex_count * sizeof(T);
697
0
    last_vertex += sizeof(T);
698
0
  }
699
0
}
Unexecuted instantiation: vertexcodec.cpp:void meshopt::decodeDeltas1<unsigned char, false>(unsigned char const*, unsigned char*, unsigned long, unsigned long, unsigned char const*, int)
Unexecuted instantiation: vertexcodec.cpp:void meshopt::decodeDeltas1<unsigned short, false>(unsigned char const*, unsigned char*, unsigned long, unsigned long, unsigned char const*, int)
Unexecuted instantiation: vertexcodec.cpp:void meshopt::decodeDeltas1<unsigned int, true>(unsigned char const*, unsigned char*, unsigned long, unsigned long, unsigned char const*, int)
700
701
static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version)
702
0
{
703
0
  assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
704
705
0
  unsigned char buffer[kVertexBlockMaxSize * 4];
706
0
  unsigned char transposed[kVertexBlockSizeBytes];
707
708
0
  size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
709
0
  assert(vertex_count <= vertex_count_aligned);
710
711
0
  size_t control_size = version == 0 ? 0 : vertex_size / 4;
712
0
  if (size_t(data_end - data) < control_size)
713
0
    return NULL;
714
715
0
  const unsigned char* control = data;
716
0
  data += control_size;
717
718
0
  for (size_t k = 0; k < vertex_size; k += 4)
719
0
  {
720
0
    unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4];
721
722
0
    for (size_t j = 0; j < 4; ++j)
723
0
    {
724
0
      int ctrl = (ctrl_byte >> (j * 2)) & 3;
725
726
0
      if (ctrl == 3)
727
0
      {
728
        // literal encoding
729
0
        if (size_t(data_end - data) < vertex_count)
730
0
          return NULL;
731
732
0
        memcpy(buffer + j * vertex_count, data, vertex_count);
733
0
        data += vertex_count;
734
0
      }
735
0
      else if (ctrl == 2)
736
0
      {
737
        // zero encoding
738
0
        memset(buffer + j * vertex_count, 0, vertex_count);
739
0
      }
740
0
      else
741
0
      {
742
0
        data = decodeBytes(data, data_end, buffer + j * vertex_count, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl);
743
0
        if (!data)
744
0
          return NULL;
745
0
      }
746
0
    }
747
748
0
    int channel = version == 0 ? 0 : channels[k / 4];
749
750
0
    switch (channel & 3)
751
0
    {
752
0
    case 0:
753
0
      decodeDeltas1<unsigned char, false>(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, 0);
754
0
      break;
755
0
    case 1:
756
0
      decodeDeltas1<unsigned short, false>(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, 0);
757
0
      break;
758
0
    case 2:
759
0
      decodeDeltas1<unsigned int, true>(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31);
760
0
      break;
761
0
    default:
762
0
      return NULL; // invalid channel type
763
0
    }
764
0
  }
765
766
0
  memcpy(vertex_data, transposed, vertex_count * vertex_size);
767
768
0
  memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);
769
770
0
  return data;
771
0
}
772
#endif
773
774
#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
775
static unsigned char kDecodeBytesGroupShuffle[256][8];
776
static unsigned char kDecodeBytesGroupCount[256];
777
778
#ifdef __wasm__
779
__attribute__((cold)) // this saves 500 bytes in the output binary - we don't need to vectorize this loop!
780
#endif
781
static bool
782
decodeBytesGroupBuildTables()
783
2
{
784
514
  for (int mask = 0; mask < 256; ++mask)
785
512
  {
786
512
    unsigned char shuffle[8];
787
512
    unsigned char count = 0;
788
789
4.60k
    for (int i = 0; i < 8; ++i)
790
4.09k
    {
791
4.09k
      int maski = (mask >> i) & 1;
792
4.09k
      shuffle[i] = maski ? count : 0x80;
793
4.09k
      count += (unsigned char)(maski);
794
4.09k
    }
795
796
512
    memcpy(kDecodeBytesGroupShuffle[mask], shuffle, 8);
797
512
    kDecodeBytesGroupCount[mask] = count;
798
512
  }
799
800
2
  return true;
801
2
}
802
803
static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
804
#endif
805
806
#ifdef SIMD_SSE
807
SIMD_TARGET
808
inline __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
809
4.05M
{
810
4.05M
  __m128i sm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask0]));
811
4.05M
  __m128i sm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask1]));
812
4.05M
  __m128i sm1off = _mm_set1_epi8(kDecodeBytesGroupCount[mask0]);
813
814
4.05M
  __m128i sm1r = _mm_add_epi8(sm1, sm1off);
815
816
4.05M
  return _mm_unpacklo_epi64(sm0, sm1r);
817
4.05M
}
818
819
SIMD_TARGET
820
inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
821
13.6M
{
822
13.6M
  switch (hbits)
823
13.6M
  {
824
276k
  case 0:
825
1.60M
  case 4:
826
1.60M
  {
827
1.60M
    __m128i result = _mm_setzero_si128();
828
829
1.60M
    _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
830
831
1.60M
    return data;
832
276k
  }
833
834
190k
  case 1:
835
1.46M
  case 6:
836
1.46M
  {
837
1.46M
#ifdef __GNUC__
838
1.46M
    typedef int __attribute__((aligned(1))) unaligned_int;
839
#else
840
    typedef int unaligned_int;
841
#endif
842
843
1.46M
#ifdef SIMD_LATENCYOPT
844
1.46M
    unsigned int data32;
845
1.46M
    memcpy(&data32, data, 4);
846
1.46M
    data32 &= data32 >> 1;
847
848
    // arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
849
1.46M
    unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
850
851
    // adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
852
1.46M
    int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
853
1.46M
#endif
854
855
1.46M
    __m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
856
1.46M
    __m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));
857
858
1.46M
    __m128i sel22 = _mm_unpacklo_epi8(_mm_srli_epi16(sel2, 4), sel2);
859
1.46M
    __m128i sel2222 = _mm_unpacklo_epi8(_mm_srli_epi16(sel22, 2), sel22);
860
1.46M
    __m128i sel = _mm_and_si128(sel2222, _mm_set1_epi8(3));
861
862
1.46M
    __m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(3));
863
1.46M
    int mask16 = _mm_movemask_epi8(mask);
864
1.46M
    unsigned char mask0 = (unsigned char)(mask16 & 255);
865
1.46M
    unsigned char mask1 = (unsigned char)(mask16 >> 8);
866
867
1.46M
    __m128i shuf = decodeShuffleMask(mask0, mask1);
868
1.46M
    __m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));
869
870
1.46M
    _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
871
872
1.46M
#ifdef SIMD_LATENCYOPT
873
1.46M
    return data + 4 + datacnt;
874
#else
875
    return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
876
#endif
877
190k
  }
878
879
18.2k
  case 2:
880
138k
  case 7:
881
138k
  {
882
138k
#ifdef SIMD_LATENCYOPT
883
138k
    unsigned long long data64;
884
138k
    memcpy(&data64, data, 8);
885
138k
    data64 &= data64 >> 1;
886
138k
    data64 &= data64 >> 2;
887
888
    // adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
889
138k
    int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
890
138k
#endif
891
892
138k
    __m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
893
138k
    __m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));
894
895
138k
    __m128i sel44 = _mm_unpacklo_epi8(_mm_srli_epi16(sel4, 4), sel4);
896
138k
    __m128i sel = _mm_and_si128(sel44, _mm_set1_epi8(15));
897
898
138k
    __m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(15));
899
138k
    int mask16 = _mm_movemask_epi8(mask);
900
138k
    unsigned char mask0 = (unsigned char)(mask16 & 255);
901
138k
    unsigned char mask1 = (unsigned char)(mask16 >> 8);
902
903
138k
    __m128i shuf = decodeShuffleMask(mask0, mask1);
904
138k
    __m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));
905
906
138k
    _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
907
908
138k
#ifdef SIMD_LATENCYOPT
909
138k
    return data + 8 + datacnt;
910
#else
911
    return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
912
#endif
913
18.2k
  }
914
915
1.73M
  case 3:
916
7.96M
  case 8:
917
7.96M
  {
918
7.96M
    __m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
919
920
7.96M
    _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
921
922
7.96M
    return data + 16;
923
1.73M
  }
924
925
2.45M
  case 5:
926
2.45M
  {
927
2.45M
    __m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 2));
928
929
2.45M
    unsigned char mask0 = data[0];
930
2.45M
    unsigned char mask1 = data[1];
931
932
2.45M
    __m128i shuf = decodeShuffleMask(mask0, mask1);
933
2.45M
    __m128i result = _mm_shuffle_epi8(rest, shuf);
934
935
2.45M
    _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
936
937
2.45M
    return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
938
1.73M
  }
939
940
0
  default:
941
0
    SIMD_UNREACHABLE(); // unreachable
942
13.6M
  }
943
13.6M
}
944
#endif
945
946
#ifdef SIMD_AVX
947
static const __m128i kDecodeBytesGroupConfig[8][2] = {
948
    {_mm_setzero_si128(), _mm_setzero_si128()},
949
    {_mm_set1_epi8(3), _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24)},
950
    {_mm_set1_epi8(15), _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56)},
951
    {_mm_setzero_si128(), _mm_setzero_si128()},
952
    {_mm_setzero_si128(), _mm_setzero_si128()},
953
    {_mm_set1_epi8(1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)},
954
    {_mm_set1_epi8(3), _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24)},
955
    {_mm_set1_epi8(15), _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56)},
956
};
957
958
SIMD_TARGET
959
inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
960
{
961
  switch (hbits)
962
  {
963
  case 0:
964
  case 4:
965
  {
966
    __m128i result = _mm_setzero_si128();
967
968
    _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
969
970
    return data;
971
  }
972
973
  case 5: // 1-bit
974
  case 1: // 2-bit
975
  case 6:
976
  case 2: // 4-bit
977
  case 7:
978
  {
979
    const unsigned char* skip = data + (2 << (hbits < 3 ? hbits : hbits - 5));
980
981
    __m128i selb = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
982
    __m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(skip));
983
984
    __m128i sent = kDecodeBytesGroupConfig[hbits][0];
985
    __m128i ctrl = kDecodeBytesGroupConfig[hbits][1];
986
987
    __m128i selw = _mm_shuffle_epi32(selb, 0x44);
988
    __m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw));
989
    __mmask16 mask16 = _mm_cmp_epi8_mask(sel, sent, _MM_CMPINT_EQ);
990
991
    __m128i result = _mm_mask_expand_epi8(sel, mask16, rest);
992
993
    _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
994
995
    return skip + _mm_popcnt_u32(mask16);
996
  }
997
998
  case 3:
999
  case 8:
1000
  {
1001
    __m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
1002
1003
    _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
1004
1005
    return data + 16;
1006
  }
1007
1008
  default:
1009
    SIMD_UNREACHABLE(); // unreachable
1010
  }
1011
}
1012
#endif
1013
1014
#ifdef SIMD_NEON
1015
SIMD_TARGET
1016
inline uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
1017
{
1018
  uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]);
1019
  uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]);
1020
1021
  uint8x8_t r0 = vtbl1_u8(rest0, sm0);
1022
  uint8x8_t r1 = vtbl1_u8(rest1, sm1);
1023
1024
  return vcombine_u8(r0, r1);
1025
}
1026
1027
SIMD_TARGET
1028
inline void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
1029
{
1030
  // magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
1031
  const uint64_t magic = 0x000103070f1f3f80ull;
1032
1033
  uint64x2_t mask2 = vreinterpretq_u64_u8(mask);
1034
1035
  mask0 = uint8_t((vgetq_lane_u64(mask2, 0) * magic) >> 56);
1036
  mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56);
1037
}
1038
1039
SIMD_TARGET
1040
inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
1041
{
1042
  switch (hbits)
1043
  {
1044
  case 0:
1045
  case 4:
1046
  {
1047
    uint8x16_t result = vdupq_n_u8(0);
1048
1049
    vst1q_u8(buffer, result);
1050
1051
    return data;
1052
  }
1053
1054
  case 1:
1055
  case 6:
1056
  {
1057
#ifdef SIMD_LATENCYOPT
1058
    unsigned int data32;
1059
    memcpy(&data32, data, 4);
1060
    data32 &= data32 >> 1;
1061
1062
    // arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
1063
    unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
1064
1065
    // adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
1066
    int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
1067
#endif
1068
1069
    uint8x8_t sel2 = vld1_u8(data);
1070
    uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
1071
    uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
1072
    uint8x16_t sel = vandq_u8(vcombine_u8(sel2222.val[0], sel2222.val[1]), vdupq_n_u8(3));
1073
1074
    uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(3));
1075
    unsigned char mask0, mask1;
1076
    neonMoveMask(mask, mask0, mask1);
1077
1078
    uint8x8_t rest0 = vld1_u8(data + 4);
1079
    uint8x8_t rest1 = vld1_u8(data + 4 + kDecodeBytesGroupCount[mask0]);
1080
1081
    uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);
1082
1083
    vst1q_u8(buffer, result);
1084
1085
#ifdef SIMD_LATENCYOPT
1086
    return data + 4 + datacnt;
1087
#else
1088
    return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
1089
#endif
1090
  }
1091
1092
  case 2:
1093
  case 7:
1094
  {
1095
#ifdef SIMD_LATENCYOPT
1096
    unsigned long long data64;
1097
    memcpy(&data64, data, 8);
1098
    data64 &= data64 >> 1;
1099
    data64 &= data64 >> 2;
1100
1101
    // adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
1102
    int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
1103
#endif
1104
1105
    uint8x8_t sel4 = vld1_u8(data);
1106
    uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
1107
    uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);
1108
1109
    uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(15));
1110
    unsigned char mask0, mask1;
1111
    neonMoveMask(mask, mask0, mask1);
1112
1113
    uint8x8_t rest0 = vld1_u8(data + 8);
1114
    uint8x8_t rest1 = vld1_u8(data + 8 + kDecodeBytesGroupCount[mask0]);
1115
1116
    uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);
1117
1118
    vst1q_u8(buffer, result);
1119
1120
#ifdef SIMD_LATENCYOPT
1121
    return data + 8 + datacnt;
1122
#else
1123
    return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
1124
#endif
1125
  }
1126
1127
  case 3:
1128
  case 8:
1129
  {
1130
    uint8x16_t result = vld1q_u8(data);
1131
1132
    vst1q_u8(buffer, result);
1133
1134
    return data + 16;
1135
  }
1136
1137
  case 5:
1138
  {
1139
    unsigned char mask0 = data[0];
1140
    unsigned char mask1 = data[1];
1141
1142
    uint8x8_t rest0 = vld1_u8(data + 2);
1143
    uint8x8_t rest1 = vld1_u8(data + 2 + kDecodeBytesGroupCount[mask0]);
1144
1145
    uint8x16_t result = shuffleBytes(mask0, mask1, rest0, rest1);
1146
1147
    vst1q_u8(buffer, result);
1148
1149
    return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
1150
  }
1151
1152
  default:
1153
    SIMD_UNREACHABLE(); // unreachable
1154
  }
1155
}
1156
#endif
1157
1158
#ifdef SIMD_WASM
1159
SIMD_TARGET
1160
inline v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
1161
{
1162
  v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]);
1163
  v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]);
1164
1165
  v128_t sm1off = wasm_v128_load8_splat(&kDecodeBytesGroupCount[mask0]);
1166
  v128_t sm1r = wasm_i8x16_add(sm1, sm1off);
1167
1168
  return wasmx_unpacklo_v64x2(sm0, sm1r);
1169
}
1170
1171
SIMD_TARGET
1172
inline void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
1173
{
1174
  // magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
1175
  const uint64_t magic = 0x000103070f1f3f80ull;
1176
1177
  mask0 = uint8_t((wasm_i64x2_extract_lane(mask, 0) * magic) >> 56);
1178
  mask1 = uint8_t((wasm_i64x2_extract_lane(mask, 1) * magic) >> 56);
1179
}
1180
1181
SIMD_TARGET
1182
inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
1183
{
1184
  switch (hbits)
1185
  {
1186
  case 0:
1187
  case 4:
1188
  {
1189
    v128_t result = wasm_i8x16_splat(0);
1190
1191
    wasm_v128_store(buffer, result);
1192
1193
    return data;
1194
  }
1195
1196
  case 1:
1197
  case 6:
1198
  {
1199
    v128_t sel2 = wasm_v128_load(data);
1200
    v128_t rest = wasm_v128_load(data + 4);
1201
1202
    v128_t sel22 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2);
1203
    v128_t sel2222 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22);
1204
    v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3));
1205
1206
    v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3));
1207
1208
    unsigned char mask0, mask1;
1209
    wasmMoveMask(mask, mask0, mask1);
1210
1211
    v128_t shuf = decodeShuffleMask(mask0, mask1);
1212
    v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);
1213
1214
    wasm_v128_store(buffer, result);
1215
1216
    return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
1217
  }
1218
1219
  case 2:
1220
  case 7:
1221
  {
1222
    v128_t sel4 = wasm_v128_load(data);
1223
    v128_t rest = wasm_v128_load(data + 8);
1224
1225
    v128_t sel44 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4);
1226
    v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15));
1227
1228
    v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15));
1229
1230
    unsigned char mask0, mask1;
1231
    wasmMoveMask(mask, mask0, mask1);
1232
1233
    v128_t shuf = decodeShuffleMask(mask0, mask1);
1234
    v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);
1235
1236
    wasm_v128_store(buffer, result);
1237
1238
    return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
1239
  }
1240
1241
  case 3:
1242
  case 8:
1243
  {
1244
    v128_t result = wasm_v128_load(data);
1245
1246
    wasm_v128_store(buffer, result);
1247
1248
    return data + 16;
1249
  }
1250
1251
  case 5:
1252
  {
1253
    v128_t rest = wasm_v128_load(data + 2);
1254
1255
    unsigned char mask0 = data[0];
1256
    unsigned char mask1 = data[1];
1257
1258
    v128_t shuf = decodeShuffleMask(mask0, mask1);
1259
    v128_t result = wasm_i8x16_swizzle(rest, shuf);
1260
1261
    wasm_v128_store(buffer, result);
1262
1263
    return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
1264
  }
1265
1266
  default:
1267
    SIMD_UNREACHABLE(); // unreachable
1268
  }
1269
}
1270
#endif
1271
1272
#if defined(SIMD_SSE) || defined(SIMD_AVX)
1273
SIMD_TARGET
1274
inline void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
1275
5.91M
{
1276
5.91M
  __m128i t0 = _mm_unpacklo_epi8(x0, x1);
1277
5.91M
  __m128i t1 = _mm_unpackhi_epi8(x0, x1);
1278
5.91M
  __m128i t2 = _mm_unpacklo_epi8(x2, x3);
1279
5.91M
  __m128i t3 = _mm_unpackhi_epi8(x2, x3);
1280
1281
5.91M
  x0 = _mm_unpacklo_epi16(t0, t2);
1282
5.91M
  x1 = _mm_unpackhi_epi16(t0, t2);
1283
5.91M
  x2 = _mm_unpacklo_epi16(t1, t3);
1284
5.91M
  x3 = _mm_unpackhi_epi16(t1, t3);
1285
5.91M
}
1286
1287
SIMD_TARGET
1288
inline __m128i unzigzag8(__m128i v)
1289
21.7M
{
1290
21.7M
  __m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
1291
21.7M
  __m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));
1292
1293
21.7M
  return _mm_xor_si128(xl, xr);
1294
21.7M
}
1295
1296
SIMD_TARGET
1297
inline __m128i unzigzag16(__m128i v)
1298
1.02M
{
1299
1.02M
  __m128i xl = _mm_sub_epi16(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi16(1)));
1300
1.02M
  __m128i xr = _mm_srli_epi16(v, 1);
1301
1302
1.02M
  return _mm_xor_si128(xl, xr);
1303
1.02M
}
1304
1305
SIMD_TARGET
1306
inline __m128i rotate32(__m128i v, int r)
1307
891k
{
1308
891k
  return _mm_or_si128(_mm_slli_epi32(v, r), _mm_srli_epi32(v, 32 - r));
1309
891k
}
1310
#endif
1311
1312
#ifdef SIMD_NEON
1313
SIMD_TARGET
1314
inline void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
1315
{
1316
  uint8x16x2_t t01 = vzipq_u8(x0, x1);
1317
  uint8x16x2_t t23 = vzipq_u8(x2, x3);
1318
1319
  uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0]));
1320
  uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1]));
1321
1322
  x0 = vreinterpretq_u8_u16(x01.val[0]);
1323
  x1 = vreinterpretq_u8_u16(x01.val[1]);
1324
  x2 = vreinterpretq_u8_u16(x23.val[0]);
1325
  x3 = vreinterpretq_u8_u16(x23.val[1]);
1326
}
1327
1328
SIMD_TARGET
1329
inline uint8x16_t unzigzag8(uint8x16_t v)
1330
{
1331
  uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1)))));
1332
  uint8x16_t xr = vshrq_n_u8(v, 1);
1333
1334
  return veorq_u8(xl, xr);
1335
}
1336
1337
SIMD_TARGET
1338
inline uint8x16_t unzigzag16(uint8x16_t v)
1339
{
1340
  uint16x8_t vv = vreinterpretq_u16_u8(v);
1341
  uint8x16_t xl = vreinterpretq_u8_s16(vnegq_s16(vreinterpretq_s16_u16(vandq_u16(vv, vdupq_n_u16(1)))));
1342
  uint8x16_t xr = vreinterpretq_u8_u16(vshrq_n_u16(vv, 1));
1343
1344
  return veorq_u8(xl, xr);
1345
}
1346
1347
SIMD_TARGET
1348
inline uint8x16_t rotate32(uint8x16_t v, int r)
1349
{
1350
  uint32x4_t v32 = vreinterpretq_u32_u8(v);
1351
  return vreinterpretq_u8_u32(vorrq_u32(vshlq_u32(v32, vdupq_n_s32(r)), vshlq_u32(v32, vdupq_n_s32(r - 32))));
1352
}
1353
1354
template <int Channel>
1355
SIMD_TARGET inline uint8x8_t rebase(uint8x8_t npi, uint8x16_t r0, uint8x16_t r1, uint8x16_t r2, uint8x16_t r3)
1356
{
1357
  switch (Channel)
1358
  {
1359
  case 0:
1360
  {
1361
    uint8x16_t rsum = vaddq_u8(vaddq_u8(r0, r1), vaddq_u8(r2, r3));
1362
    uint8x8_t rsumx = vadd_u8(vget_low_u8(rsum), vget_high_u8(rsum));
1363
    return vadd_u8(vadd_u8(npi, rsumx), vext_u8(rsumx, rsumx, 4));
1364
  }
1365
  case 1:
1366
  {
1367
    uint16x8_t rsum = vaddq_u16(vaddq_u16(vreinterpretq_u16_u8(r0), vreinterpretq_u16_u8(r1)), vaddq_u16(vreinterpretq_u16_u8(r2), vreinterpretq_u16_u8(r3)));
1368
    uint16x4_t rsumx = vadd_u16(vget_low_u16(rsum), vget_high_u16(rsum));
1369
    return vreinterpret_u8_u16(vadd_u16(vadd_u16(vreinterpret_u16_u8(npi), rsumx), vext_u16(rsumx, rsumx, 2)));
1370
  }
1371
  case 2:
1372
  {
1373
    uint8x16_t rsum = veorq_u8(veorq_u8(r0, r1), veorq_u8(r2, r3));
1374
    uint8x8_t rsumx = veor_u8(vget_low_u8(rsum), vget_high_u8(rsum));
1375
    return veor_u8(veor_u8(npi, rsumx), vext_u8(rsumx, rsumx, 4));
1376
  }
1377
  default:
1378
    return npi;
1379
  }
1380
}
1381
#endif
1382
1383
#ifdef SIMD_WASM
1384
SIMD_TARGET
1385
inline void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
1386
{
1387
  v128_t t0 = wasmx_unpacklo_v8x16(x0, x1);
1388
  v128_t t1 = wasmx_unpackhi_v8x16(x0, x1);
1389
  v128_t t2 = wasmx_unpacklo_v8x16(x2, x3);
1390
  v128_t t3 = wasmx_unpackhi_v8x16(x2, x3);
1391
1392
  x0 = wasmx_unpacklo_v16x8(t0, t2);
1393
  x1 = wasmx_unpackhi_v16x8(t0, t2);
1394
  x2 = wasmx_unpacklo_v16x8(t1, t3);
1395
  x3 = wasmx_unpackhi_v16x8(t1, t3);
1396
}
1397
1398
SIMD_TARGET
1399
inline v128_t unzigzag8(v128_t v)
1400
{
1401
  v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1)));
1402
  v128_t xr = wasm_u8x16_shr(v, 1);
1403
1404
  return wasm_v128_xor(xl, xr);
1405
}
1406
1407
SIMD_TARGET
1408
inline v128_t unzigzag16(v128_t v)
1409
{
1410
  v128_t xl = wasm_i16x8_neg(wasm_v128_and(v, wasm_i16x8_splat(1)));
1411
  v128_t xr = wasm_u16x8_shr(v, 1);
1412
1413
  return wasm_v128_xor(xl, xr);
1414
}
1415
1416
SIMD_TARGET
1417
inline v128_t rotate32(v128_t v, int r)
1418
{
1419
  return wasm_v128_or(wasm_i32x4_shl(v, r), wasm_i32x4_shr(v, 32 - r));
1420
}
1421
#endif
1422
1423
#if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
1424
SIMD_TARGET
1425
static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size, int hshift)
1426
899k
{
1427
899k
  assert(buffer_size % kByteGroupSize == 0);
1428
899k
  assert(kByteGroupSize == 16);
1429
1430
  // round number of groups to 4 to get number of header bytes
1431
899k
  size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
1432
899k
  if (size_t(data_end - data) < header_size)
1433
15
    return NULL;
1434
1435
899k
  const unsigned char* header = data;
1436
899k
  data += header_size;
1437
1438
899k
  size_t i = 0;
1439
1440
  // fast-path: process 4 groups at a time, do a shared bounds check
1441
4.27M
  for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4)
1442
3.37M
  {
1443
3.37M
    size_t header_offset = i / kByteGroupSize;
1444
3.37M
    unsigned char header_byte = header[header_offset / 4];
1445
1446
3.37M
    data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, hshift + ((header_byte >> 0) & 3));
1447
3.37M
    data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, hshift + ((header_byte >> 2) & 3));
1448
3.37M
    data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, hshift + ((header_byte >> 4) & 3));
1449
3.37M
    data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, hshift + ((header_byte >> 6) & 3));
1450
3.37M
  }
1451
1452
  // slow-path: process remaining groups
1453
1.00M
  for (; i < buffer_size; i += kByteGroupSize)
1454
109k
  {
1455
109k
    if (size_t(data_end - data) < kByteGroupDecodeLimit)
1456
660
      return NULL;
1457
1458
108k
    size_t header_offset = i / kByteGroupSize;
1459
108k
    unsigned char header_byte = header[header_offset / 4];
1460
1461
108k
    data = decodeBytesGroupSimd(data, buffer + i, hshift + ((header_byte >> ((header_offset % 4) * 2)) & 3));
1462
108k
  }
1463
1464
898k
  return data;
1465
899k
}
1466
1467
template <int Channel>
1468
SIMD_TARGET static void
1469
decodeDeltas4Simd(const unsigned char* buffer, unsigned char* transposed, size_t vertex_count_aligned, size_t vertex_size, unsigned char last_vertex[4], int rot)
1470
391k
{
1471
391k
#if defined(SIMD_SSE) || defined(SIMD_AVX)
1472
11.8M
#define TEMP __m128i
1473
391k
#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
1474
23.6M
#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
1475
23.6M
#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
1476
94.7M
#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
1477
94.7M
#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
1478
391k
#endif
1479
1480
#ifdef SIMD_NEON
1481
#define TEMP uint8x8_t
1482
#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex), vdup_n_u32(0), 0))
1483
#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
1484
#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
1485
#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i))
1486
#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
1487
#endif
1488
1489
#ifdef SIMD_WASM
1490
#define TEMP v128_t
1491
#define PREP() v128_t pi = wasm_v128_load(last_vertex)
1492
#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
1493
#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
1494
#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i))
1495
#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size
1496
#endif
1497
1498
23.6M
#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
1499
1500
391k
  PREP();
1501
1502
391k
  unsigned char* savep = transposed;
1503
1504
6.31M
  for (size_t j = 0; j < vertex_count_aligned; j += 16)
1505
5.91M
  {
1506
5.91M
    LOAD(0);
1507
5.91M
    LOAD(1);
1508
5.91M
    LOAD(2);
1509
5.91M
    LOAD(3);
1510
1511
5.91M
    transpose8(r0, r1, r2, r3);
1512
1513
5.91M
    TEMP t0, t1, t2, t3;
1514
5.91M
    TEMP npi = pi;
1515
1516
5.91M
    UNZR(0);
1517
5.91M
    GRP4(0);
1518
5.91M
    FIXD(0), FIXD(1), FIXD(2), FIXD(3);
1519
5.91M
    SAVE(0), SAVE(1), SAVE(2), SAVE(3);
1520
1521
5.91M
    UNZR(1);
1522
5.91M
    GRP4(1);
1523
5.91M
    FIXD(0), FIXD(1), FIXD(2), FIXD(3);
1524
5.91M
    SAVE(0), SAVE(1), SAVE(2), SAVE(3);
1525
1526
5.91M
    UNZR(2);
1527
5.91M
    GRP4(2);
1528
5.91M
    FIXD(0), FIXD(1), FIXD(2), FIXD(3);
1529
5.91M
    SAVE(0), SAVE(1), SAVE(2), SAVE(3);
1530
1531
5.91M
    UNZR(3);
1532
5.91M
    GRP4(3);
1533
5.91M
    FIXD(0), FIXD(1), FIXD(2), FIXD(3);
1534
5.91M
    SAVE(0), SAVE(1), SAVE(2), SAVE(3);
1535
1536
#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32))
1537
    // instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations
1538
    pi = rebase<Channel>(npi, r0, r1, r2, r3);
1539
#else
1540
5.91M
    (void)npi;
1541
5.91M
#endif
1542
1543
5.91M
#undef UNZR
1544
5.91M
#undef TEMP
1545
5.91M
#undef PREP
1546
5.91M
#undef LOAD
1547
5.91M
#undef GRP4
1548
5.91M
#undef FIXD
1549
5.91M
#undef SAVE
1550
5.91M
  }
1551
391k
}
vertexcodec.cpp:void meshopt::decodeDeltas4Simd<0>(unsigned char const*, unsigned char*, unsigned long, unsigned long, unsigned char*, int)
Line
Count
Source
1470
359k
{
1471
359k
#if defined(SIMD_SSE) || defined(SIMD_AVX)
1472
359k
#define TEMP __m128i
1473
359k
#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
1474
359k
#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
1475
359k
#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
1476
359k
#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
1477
359k
#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
1478
359k
#endif
1479
1480
#ifdef SIMD_NEON
1481
#define TEMP uint8x8_t
1482
#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex), vdup_n_u32(0), 0))
1483
#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
1484
#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
1485
#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i))
1486
#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
1487
#endif
1488
1489
#ifdef SIMD_WASM
1490
#define TEMP v128_t
1491
#define PREP() v128_t pi = wasm_v128_load(last_vertex)
1492
#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
1493
#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
1494
#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i))
1495
#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size
1496
#endif
1497
1498
359k
#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
1499
1500
359k
  PREP();
1501
1502
359k
  unsigned char* savep = transposed;
1503
1504
5.79M
  for (size_t j = 0; j < vertex_count_aligned; j += 16)
1505
5.43M
  {
1506
5.43M
    LOAD(0);
1507
5.43M
    LOAD(1);
1508
5.43M
    LOAD(2);
1509
5.43M
    LOAD(3);
1510
1511
5.43M
    transpose8(r0, r1, r2, r3);
1512
1513
5.43M
    TEMP t0, t1, t2, t3;
1514
5.43M
    TEMP npi = pi;
1515
1516
5.43M
    UNZR(0);
1517
5.43M
    GRP4(0);
1518
5.43M
    FIXD(0), FIXD(1), FIXD(2), FIXD(3);
1519
5.43M
    SAVE(0), SAVE(1), SAVE(2), SAVE(3);
1520
1521
5.43M
    UNZR(1);
1522
5.43M
    GRP4(1);
1523
5.43M
    FIXD(0), FIXD(1), FIXD(2), FIXD(3);
1524
5.43M
    SAVE(0), SAVE(1), SAVE(2), SAVE(3);
1525
1526
5.43M
    UNZR(2);
1527
5.43M
    GRP4(2);
1528
5.43M
    FIXD(0), FIXD(1), FIXD(2), FIXD(3);
1529
5.43M
    SAVE(0), SAVE(1), SAVE(2), SAVE(3);
1530
1531
5.43M
    UNZR(3);
1532
5.43M
    GRP4(3);
1533
5.43M
    FIXD(0), FIXD(1), FIXD(2), FIXD(3);
1534
5.43M
    SAVE(0), SAVE(1), SAVE(2), SAVE(3);
1535
1536
#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32))
1537
    // instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations
1538
    pi = rebase<Channel>(npi, r0, r1, r2, r3);
1539
#else
1540
5.43M
    (void)npi;
1541
5.43M
#endif
1542
1543
5.43M
#undef UNZR
1544
5.43M
#undef TEMP
1545
5.43M
#undef PREP
1546
5.43M
#undef LOAD
1547
5.43M
#undef GRP4
1548
5.43M
#undef FIXD
1549
5.43M
#undef SAVE
1550
5.43M
  }
1551
359k
}
vertexcodec.cpp:void meshopt::decodeDeltas4Simd<1>(unsigned char const*, unsigned char*, unsigned long, unsigned long, unsigned char*, int)
Line
Count
Source
1470
17.1k
{
1471
17.1k
#if defined(SIMD_SSE) || defined(SIMD_AVX)
1472
17.1k
#define TEMP __m128i
1473
17.1k
#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
1474
17.1k
#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
1475
17.1k
#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
1476
17.1k
#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
1477
17.1k
#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
1478
17.1k
#endif
1479
1480
#ifdef SIMD_NEON
1481
#define TEMP uint8x8_t
1482
#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex), vdup_n_u32(0), 0))
1483
#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
1484
#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
1485
#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i))
1486
#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
1487
#endif
1488
1489
#ifdef SIMD_WASM
1490
#define TEMP v128_t
1491
#define PREP() v128_t pi = wasm_v128_load(last_vertex)
1492
#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
1493
#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
1494
#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i))
1495
#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size
1496
#endif
1497
1498
17.1k
#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
1499
1500
17.1k
  PREP();
1501
1502
17.1k
  unsigned char* savep = transposed;
1503
1504
274k
  for (size_t j = 0; j < vertex_count_aligned; j += 16)
1505
257k
  {
1506
257k
    LOAD(0);
1507
257k
    LOAD(1);
1508
257k
    LOAD(2);
1509
257k
    LOAD(3);
1510
1511
257k
    transpose8(r0, r1, r2, r3);
1512
1513
257k
    TEMP t0, t1, t2, t3;
1514
257k
    TEMP npi = pi;
1515
1516
257k
    UNZR(0);
1517
257k
    GRP4(0);
1518
257k
    FIXD(0), FIXD(1), FIXD(2), FIXD(3);
1519
257k
    SAVE(0), SAVE(1), SAVE(2), SAVE(3);
1520
1521
257k
    UNZR(1);
1522
257k
    GRP4(1);
1523
257k
    FIXD(0), FIXD(1), FIXD(2), FIXD(3);
1524
257k
    SAVE(0), SAVE(1), SAVE(2), SAVE(3);
1525
1526
257k
    UNZR(2);
1527
257k
    GRP4(2);
1528
257k
    FIXD(0), FIXD(1), FIXD(2), FIXD(3);
1529
257k
    SAVE(0), SAVE(1), SAVE(2), SAVE(3);
1530
1531
257k
    UNZR(3);
1532
257k
    GRP4(3);
1533
257k
    FIXD(0), FIXD(1), FIXD(2), FIXD(3);
1534
257k
    SAVE(0), SAVE(1), SAVE(2), SAVE(3);
1535
1536
#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32))
1537
    // instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations
1538
    pi = rebase<Channel>(npi, r0, r1, r2, r3);
1539
#else
1540
257k
    (void)npi;
1541
257k
#endif
1542
1543
257k
#undef UNZR
1544
257k
#undef TEMP
1545
257k
#undef PREP
1546
257k
#undef LOAD
1547
257k
#undef GRP4
1548
257k
#undef FIXD
1549
257k
#undef SAVE
1550
257k
  }
1551
17.1k
}
vertexcodec.cpp:void meshopt::decodeDeltas4Simd<2>(unsigned char const*, unsigned char*, unsigned long, unsigned long, unsigned char*, int)
Line
Count
Source
1470
14.7k
{
1471
14.7k
#if defined(SIMD_SSE) || defined(SIMD_AVX)
1472
14.7k
#define TEMP __m128i
1473
14.7k
#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
1474
14.7k
#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
1475
14.7k
#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
1476
14.7k
#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
1477
14.7k
#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
1478
14.7k
#endif
1479
1480
#ifdef SIMD_NEON
1481
#define TEMP uint8x8_t
1482
#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex), vdup_n_u32(0), 0))
1483
#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
1484
#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
1485
#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i))
1486
#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
1487
#endif
1488
1489
#ifdef SIMD_WASM
1490
#define TEMP v128_t
1491
#define PREP() v128_t pi = wasm_v128_load(last_vertex)
1492
#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
1493
#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
1494
#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i))
1495
#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size
1496
#endif
1497
1498
14.7k
#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
1499
1500
14.7k
  PREP();
1501
1502
14.7k
  unsigned char* savep = transposed;
1503
1504
237k
  for (size_t j = 0; j < vertex_count_aligned; j += 16)
1505
222k
  {
1506
222k
    LOAD(0);
1507
222k
    LOAD(1);
1508
222k
    LOAD(2);
1509
222k
    LOAD(3);
1510
1511
222k
    transpose8(r0, r1, r2, r3);
1512
1513
222k
    TEMP t0, t1, t2, t3;
1514
222k
    TEMP npi = pi;
1515
1516
222k
    UNZR(0);
1517
222k
    GRP4(0);
1518
222k
    FIXD(0), FIXD(1), FIXD(2), FIXD(3);
1519
222k
    SAVE(0), SAVE(1), SAVE(2), SAVE(3);
1520
1521
222k
    UNZR(1);
1522
222k
    GRP4(1);
1523
222k
    FIXD(0), FIXD(1), FIXD(2), FIXD(3);
1524
222k
    SAVE(0), SAVE(1), SAVE(2), SAVE(3);
1525
1526
222k
    UNZR(2);
1527
222k
    GRP4(2);
1528
222k
    FIXD(0), FIXD(1), FIXD(2), FIXD(3);
1529
222k
    SAVE(0), SAVE(1), SAVE(2), SAVE(3);
1530
1531
222k
    UNZR(3);
1532
222k
    GRP4(3);
1533
222k
    FIXD(0), FIXD(1), FIXD(2), FIXD(3);
1534
222k
    SAVE(0), SAVE(1), SAVE(2), SAVE(3);
1535
1536
#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32))
1537
    // instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations
1538
    pi = rebase<Channel>(npi, r0, r1, r2, r3);
1539
#else
1540
222k
    (void)npi;
1541
222k
#endif
1542
1543
222k
#undef UNZR
1544
222k
#undef TEMP
1545
222k
#undef PREP
1546
222k
#undef LOAD
1547
222k
#undef GRP4
1548
222k
#undef FIXD
1549
222k
#undef SAVE
1550
222k
  }
1551
14.7k
}
1552
1553
// Decodes one block of vertex_count vertices from [data, data_end) into vertex_data (SIMD path).
// The vertex is processed in groups of 4 bytes (k); for each byte of the group (j) a 2-bit control
// value selects how its stream is materialized into `buffer`: 3 = literal copy, 2 = all zeros,
// otherwise decodeBytesSimd. decodeDeltas4Simd then consumes `buffer` and writes into `transposed`
// using the per-group channel mode.
// On success, copies the decoded block to vertex_data, stores the last decoded vertex of the block
// into last_vertex and returns the advanced data pointer.
// Returns NULL if the input is too short, decodeBytesSimd fails, or the channel type is invalid.
// For version 0 no control bytes are read and `channels` is never dereferenced.
SIMD_TARGET
1554
static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version)
1555
147k
{
1556
147k
  assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
1557
1558
147k
  // scratch storage: four byte streams of vertex_count_aligned bytes each (indexed by j below)
  unsigned char buffer[kVertexBlockMaxSize * 4];
1559
147k
  // decoded vertices for the whole block; copied to vertex_data once every group has been decoded
  unsigned char transposed[kVertexBlockSizeBytes];
1560
1561
147k
  // round vertex_count up to a multiple of kByteGroupSize (the mask trick assumes it is a power of two)
  size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
1562
1563
147k
  // non-zero versions prefix the block with one control byte per 4-byte group
  size_t control_size = version == 0 ? 0 : vertex_size / 4;
1564
147k
  if (size_t(data_end - data) < control_size)
1565
0
    return NULL;
1566
1567
147k
  const unsigned char* control = data;
1568
147k
  data += control_size;
1569
1570
538k
  for (size_t k = 0; k < vertex_size; k += 4)
1571
391k
  {
1572
391k
    // version 0 behaves as if every control value were 0 (always decodeBytesSimd with hshift 0)
    unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4];
1573
1574
1.95M
    for (size_t j = 0; j < 4; ++j)
1575
1.56M
    {
1576
1.56M
      // two control bits per byte of the group, lowest bits first
      int ctrl = (ctrl_byte >> (j * 2)) & 3;
1577
1578
1.56M
      if (ctrl == 3)
1579
257k
      {
1580
        // literal encoding; safe to over-copy due to tail
1581
257k
        if (size_t(data_end - data) < vertex_count_aligned)
1582
100
          return NULL;
1583
1584
257k
        // vertex_count_aligned bytes are copied, but the stream only advances by vertex_count
        memcpy(buffer + j * vertex_count_aligned, data, vertex_count_aligned);
1585
257k
        data += vertex_count;
1586
257k
      }
1587
1.30M
      else if (ctrl == 2)
1588
409k
      {
1589
        // zero encoding
1590
409k
        memset(buffer + j * vertex_count_aligned, 0, vertex_count_aligned);
1591
409k
      }
1592
899k
      else
1593
899k
      {
1594
        // for v0, headers are mapped to 0..3; for v1, headers are mapped to 4..8
1595
899k
        // only ctrl 0 or 1 reaches this branch, so hshift is 0 (v0) or 4/5 (v1)
        int hshift = version == 0 ? 0 : 4 + ctrl;
1596
1597
899k
        data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned, hshift);
1598
899k
        if (!data)
1599
675
          return NULL;
1600
899k
      }
1601
1.56M
    }
1602
1603
391k
    // one channel descriptor per 4-byte group; the low 2 bits select the delta mode
    int channel = version == 0 ? 0 : channels[k / 4];
1604
1605
391k
    switch (channel & 3)
1606
391k
    {
1607
359k
    case 0:
1608
359k
      decodeDeltas4Simd<0>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, 0);
1609
359k
      break;
1610
17.1k
    case 1:
1611
17.1k
      decodeDeltas4Simd<1>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, 0);
1612
17.1k
      break;
1613
14.7k
    case 2:
1614
14.7k
      // NOTE(review): channel >> 4 looks like the encoder's rotation amount; the decoder passes (32 - r) mod 32 - confirm against the encoder
      decodeDeltas4Simd<2>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31);
1615
14.7k
      break;
1616
143
    default:
1617
143
      return NULL; // invalid channel type
1618
391k
    }
1619
391k
  }
1620
1621
146k
  // only the first vertex_count vertices are copied out (vertex_count_aligned may be larger)
  memcpy(vertex_data, transposed, vertex_count * vertex_size);
1622
1623
146k
  // carry the final vertex of this block forward for the next call
  memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);
1624
1625
146k
  return data;
1626
147k
}
1627
#endif
1628
1629
#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
1630
// Executes CPUID with function id 1 and returns the third result word (cpuinfo[2]).
// Both the MSVC intrinsic (array form) and the GCC/clang macro (four-output form) fill the
// results in EAX, EBX, ECX, EDX order, so the returned value is ECX.
static unsigned int getCpuFeatures()
1631
2
{
1632
2
  int cpuinfo[4] = {};
1633
#ifdef _MSC_VER
1634
  __cpuid(cpuinfo, 1);
1635
#else
1636
2
  __cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
1637
2
#endif
1638
2
  return cpuinfo[2];
1639
2
}
1640
1641
// feature word captured once during static initialization; meshopt_decodeVertexBuffer tests bit 9 of it to pick the SIMD decoder
static unsigned int cpuid = getCpuFeatures();
1642
#endif
1643
1644
} // namespace meshopt
1645
1646
// Encodes vertex_count vertices of vertex_size bytes each from `vertices` into `buffer`.
// Output layout as written below: one header byte (kVertexHeader | version), the output of
// encodeVertexBlock for each block of vertices, then a tail consisting of optional zero padding,
// a copy of the first vertex, and (for version != 0) vertex_size / 4 channel bytes.
// vertex_size must be a non-zero multiple of 4 no larger than 256; level must be in 0..9.
// A negative version selects gEncodeVertexVersion.
// Returns the number of bytes written, or 0 if buffer_size is too small or encodeVertexBlock fails.
size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level, int version)
1647
12.0k
{
1648
12.0k
  using namespace meshopt;
1649
1650
12.0k
  assert(vertex_size > 0 && vertex_size <= 256);
1651
12.0k
  assert(vertex_size % 4 == 0);
1652
12.0k
  assert(level >= 0 && level <= 9); // only a subset of this range is used right now
1653
12.0k
  assert(version < 0 || unsigned(version) <= kDecodeVertexVersion);
1654
1655
12.0k
  version = version < 0 ? gEncodeVertexVersion : version;
1656
1657
#if TRACE
1658
  memset(vertexstats, 0, sizeof(vertexstats));
1659
#endif
1660
1661
12.0k
  const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);
1662
1663
12.0k
  unsigned char* data = buffer;
1664
12.0k
  unsigned char* data_end = buffer + buffer_size;
1665
1666
12.0k
  // need room for the header byte
  if (size_t(data_end - data) < 1)
1667
0
    return 0;
1668
1669
12.0k
  *data++ = (unsigned char)(kVertexHeader | version);
1670
1671
12.0k
  // stays all-zero when there are no vertices
  unsigned char first_vertex[256] = {};
1672
12.0k
  if (vertex_count > 0)
1673
10.8k
    memcpy(first_vertex, vertex_data, vertex_size);
1674
1675
12.0k
  // the first block is encoded relative to the first vertex; encodeVertexBlock receives this buffer for every block
  unsigned char last_vertex[256] = {};
1676
12.0k
  memcpy(last_vertex, first_vertex, vertex_size);
1677
1678
12.0k
  size_t vertex_block_size = getVertexBlockSize(vertex_size);
1679
1680
12.0k
  // one entry per 4-byte group; vertex_size <= 256 bounds this at 64 entries
  unsigned char channels[64] = {};
1681
12.0k
  // channels are only estimated for non-zero versions at level 2 and above; otherwise they stay 0
  if (version != 0 && level > 1 && vertex_count > 1)
1682
18.0k
    for (size_t k = 0; k < vertex_size; k += 4)
1683
14.6k
    {
1684
14.6k
      // rotation is only estimated at level 3 and above, which also raises max_channels from 2 to 3
      int rot = level >= 3 ? estimateRotate(vertex_data, vertex_count, vertex_size, k, /* group_size= */ 16) : 0;
1685
14.6k
      int channel = estimateChannel(vertex_data, vertex_count, vertex_size, k, vertex_block_size, /* block_skip= */ 3, /* max_channels= */ level >= 3 ? 3 : 2, rot);
1686
1687
14.6k
      // channel is either 0/1, or type 2 with a value below 8 stored in the bits above bit 3
      assert(unsigned(channel) < 2 || ((channel & 3) == 2 && unsigned(channel >> 4) < 8));
1688
14.6k
      channels[k / 4] = (unsigned char)channel;
1689
14.6k
    }
1690
1691
12.0k
  size_t vertex_offset = 0;
1692
1693
304k
  while (vertex_offset < vertex_count)
1694
292k
  {
1695
292k
    // full blocks of vertex_block_size vertices; the final block takes whatever remains
    size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
1696
1697
292k
    data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version, level);
1698
292k
    if (!data)
1699
0
      return 0;
1700
1701
292k
    vertex_offset += block_size;
1702
292k
  }
1703
1704
12.0k
  // tail payload: first vertex plus (non-zero versions only) the channel bytes, padded up to a per-version minimum
  size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
1705
12.0k
  size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1;
1706
12.0k
  size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
1707
1708
12.0k
  if (size_t(data_end - data) < tail_size_pad)
1709
0
    return 0;
1710
1711
12.0k
  // zero padding goes first so the payload sits at the very end of the stream
  if (tail_size < tail_size_pad)
1712
6.82k
  {
1713
6.82k
    memset(data, 0, tail_size_pad - tail_size);
1714
6.82k
    data += tail_size_pad - tail_size;
1715
6.82k
  }
1716
1717
12.0k
  memcpy(data, first_vertex, vertex_size);
1718
12.0k
  data += vertex_size;
1719
1720
12.0k
  if (version != 0)
1721
8.88k
  {
1722
8.88k
    memcpy(data, channels, vertex_size / 4);
1723
8.88k
    data += vertex_size / 4;
1724
8.88k
  }
1725
1726
12.0k
  assert(data >= buffer + tail_size);
1727
12.0k
  assert(data <= buffer + buffer_size);
1728
1729
  // optional diagnostics: one line of statistics per vertex byte, taken from vertexstats
#if TRACE
1730
  size_t total_size = data - buffer;
1731
1732
  for (size_t k = 0; k < vertex_size; ++k)
1733
  {
1734
    const Stats& vsk = vertexstats[k];
1735
1736
    printf("%2d: %7d bytes [%4.1f%%] %.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);
1737
1738
    size_t total_k = vsk.header + vsk.bitg[1] + vsk.bitg[2] + vsk.bitg[4] + vsk.bitg[8];
1739
    // reciprocal of the total, or 0 to avoid dividing by zero when nothing was recorded
    double total_kr = total_k ? 1.0 / double(total_k) : 0;
1740
1741
    if (version != 0)
1742
    {
1743
      int channel = channels[k / 4];
1744
1745
      if ((channel & 3) == 2 && k % 4 == 0)
1746
        printf(" | ^%d", channel >> 4);
1747
      else
1748
        printf(" | %2s", channel == 0 ? "1" : (channel == 1 && k % 2 == 0 ? "2" : "."));
1749
    }
1750
1751
    printf(" | hdr [%5.1f%%] bitg [1 %4.1f%% 2 %4.1f%% 4 %4.1f%% 8 %4.1f%%]",
1752
        double(vsk.header) * total_kr * 100,
1753
        double(vsk.bitg[1]) * total_kr * 100, double(vsk.bitg[2]) * total_kr * 100,
1754
        double(vsk.bitg[4]) * total_kr * 100, double(vsk.bitg[8]) * total_kr * 100);
1755
1756
    size_t total_ctrl = vsk.ctrl[0] + vsk.ctrl[1] + vsk.ctrl[2] + vsk.ctrl[3];
1757
1758
    if (total_ctrl)
1759
    {
1760
      printf(" | ctrl %3.0f%% %3.0f%% %3.0f%% %3.0f%%",
1761
          double(vsk.ctrl[0]) / double(total_ctrl) * 100, double(vsk.ctrl[1]) / double(total_ctrl) * 100,
1762
          double(vsk.ctrl[2]) / double(total_ctrl) * 100, double(vsk.ctrl[3]) / double(total_ctrl) * 100);
1763
    }
1764
1765
    if (level >= 3)
1766
      printf(" | bitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]",
1767
          double(vsk.bitc[0]) / double(vertex_count) * 100, double(vsk.bitc[1]) / double(vertex_count) * 100,
1768
          double(vsk.bitc[2]) / double(vertex_count) * 100, double(vsk.bitc[3]) / double(vertex_count) * 100,
1769
          double(vsk.bitc[4]) / double(vertex_count) * 100, double(vsk.bitc[5]) / double(vertex_count) * 100,
1770
          double(vsk.bitc[6]) / double(vertex_count) * 100, double(vsk.bitc[7]) / double(vertex_count) * 100);
1771
1772
    printf("\n");
1773
  }
1774
#endif
1775
1776
12.0k
  return data - buffer;
1777
12.0k
}
1778
1779
// Convenience wrapper: forwards to meshopt_encodeVertexBufferLevel using meshopt::kEncodeDefaultLevel
// as the level and the current meshopt::gEncodeVertexVersion as the version.
// Returns whatever meshopt_encodeVertexBufferLevel returns (bytes written, or 0 on failure).
size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
1780
0
{
1781
0
  return meshopt_encodeVertexBufferLevel(buffer, buffer_size, vertices, vertex_count, vertex_size, meshopt::kEncodeDefaultLevel, meshopt::gEncodeVertexVersion);
1782
0
}
1783
1784
// Computes a buffer size for encoding vertex_count vertices of vertex_size bytes:
// 1 header byte + per-block cost * number of blocks + padded tail size.
// The result does not depend on the encoding version: the tail always counts the channel bytes
// and uses the larger of the two per-version minimum tail sizes.
// vertex_size must be a non-zero multiple of 4 no larger than 256.
size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
1785
6.03k
{
1786
6.03k
  using namespace meshopt;
1787
1788
6.03k
  assert(vertex_size > 0 && vertex_size <= 256);
1789
6.03k
  assert(vertex_size % 4 == 0);
1790
1791
6.03k
  size_t vertex_block_size = getVertexBlockSize(vertex_size);
1792
6.03k
  // number of blocks, rounding up so a partial final block is counted as a full one
  size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size;
1793
1794
6.03k
  size_t vertex_block_control_size = vertex_size / 4;
1795
6.03k
  // NOTE(review): presumably 2 header bits per byte group, rounded up to whole bytes - confirm against the block encoder
  size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4;
1796
6.03k
  size_t vertex_block_data_size = vertex_block_size;
1797
1798
6.03k
  size_t tail_size = vertex_size + (vertex_size / 4);
1799
6.03k
  size_t tail_size_min = kTailMinSizeV0 > kTailMinSizeV1 ? kTailMinSizeV0 : kTailMinSizeV1;
1800
6.03k
  size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
1801
6.03k
  assert(tail_size_pad >= kByteGroupDecodeLimit);
1802
1803
6.03k
  // the per-block term (control + header + data) is multiplied by vertex_size as well as by the block count
  return 1 + vertex_block_count * vertex_size * (vertex_block_control_size + vertex_block_header_size + vertex_block_data_size) + tail_size_pad;
1804
6.03k
}
1805
1806
// Sets meshopt::gEncodeVertexVersion, the version used by meshopt_encodeVertexBuffer and by
// meshopt_encodeVertexBufferLevel when it is called with a negative version.
// version must be in 0..meshopt::kDecodeVertexVersion; this is only checked by assert.
void meshopt_encodeVertexVersion(int version)
1807
1.50k
{
1808
1.50k
  // the unsigned comparison also rejects negative values
  assert(unsigned(version) <= unsigned(meshopt::kDecodeVertexVersion));
1809
1810
1.50k
  meshopt::gEncodeVertexVersion = version;
1811
1.50k
}
1812
1813
// Inspects the first byte of an encoded vertex buffer and returns its version (low 4 bits).
// Returns -1 if the buffer is empty, if the high 4 bits do not match meshopt::kVertexHeader,
// or if the version exceeds meshopt::kDecodeVertexVersion. Only buffer[0] is read.
int meshopt_decodeVertexVersion(const unsigned char* buffer, size_t buffer_size)
1814
0
{
1815
0
  if (buffer_size < 1)
1816
0
    return -1;
1817
1818
0
  unsigned char header = buffer[0];
1819
1820
0
  // high nibble is the stream signature
  if ((header & 0xf0) != meshopt::kVertexHeader)
1821
0
    return -1;
1822
1823
0
  // low nibble is the version
  int version = header & 0x0f;
1824
0
  if (version > meshopt::kDecodeVertexVersion)
1825
0
    return -1;
1826
1827
0
  return version;
1828
0
}
1829
1830
// Decodes an encoded vertex buffer into `destination` (vertex_count vertices of vertex_size bytes).
// vertex_size must be a non-zero multiple of 4 no larger than 256.
// Return values:
//    0  success
//   -1  the header byte has the wrong signature or an unsupported version
//   -2  the input is too short, or a block failed to decode
//   -3  all blocks decoded, but the number of remaining bytes does not equal the padded tail size
int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
1831
12.0k
{
1832
12.0k
  using namespace meshopt;
1833
1834
12.0k
  assert(vertex_size > 0 && vertex_size <= 256);
1835
12.0k
  assert(vertex_size % 4 == 0);
1836
1837
12.0k
  // block decoder, chosen below between the SIMD and scalar implementations
  const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256], const unsigned char*, int) = NULL;
1838
1839
12.0k
#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
1840
12.0k
  // runtime dispatch: bit 9 of the CPUID feature word gates the SIMD path (SSSE3 per the CPUID leaf 1 ECX layout)
  decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
1841
#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
1842
  decode = decodeVertexBlockSimd;
1843
#else
1844
  decode = decodeVertexBlock;
1845
#endif
1846
1847
12.0k
#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
1848
12.0k
  assert(gDecodeBytesGroupInitialized);
1849
12.0k
  // presumably keeps the variable referenced when assert() compiles to nothing
  (void)gDecodeBytesGroupInitialized;
1850
12.0k
#endif
1851
1852
12.0k
  unsigned char* vertex_data = static_cast<unsigned char*>(destination);
1853
1854
12.0k
  const unsigned char* data = buffer;
1855
12.0k
  const unsigned char* data_end = buffer + buffer_size;
1856
1857
12.0k
  if (size_t(data_end - data) < 1)
1858
0
    return -2;
1859
1860
12.0k
  unsigned char data_header = *data++;
1861
1862
12.0k
  // high nibble: signature; low nibble: version
  if ((data_header & 0xf0) != kVertexHeader)
1863
4.25k
    return -1;
1864
1865
7.81k
  int version = data_header & 0x0f;
1866
7.81k
  if (version > kDecodeVertexVersion)
1867
108
    return -1;
1868
1869
7.70k
  // tail payload: one vertex, plus vertex_size / 4 channel bytes for non-zero versions, padded to a per-version minimum
  size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
1870
7.70k
  size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1;
1871
7.70k
  size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
1872
1873
7.70k
  if (size_t(data_end - data) < tail_size_pad)
1874
151
    return -2;
1875
1876
7.55k
  // the payload occupies the last tail_size bytes of the input; any padding precedes it
  const unsigned char* tail = data_end - tail_size;
1877
1878
7.55k
  // seed the running "last vertex" state from the vertex stored in the tail
  unsigned char last_vertex[256];
1879
7.55k
  memcpy(last_vertex, tail, vertex_size);
1880
1881
7.55k
  const unsigned char* channels = version == 0 ? NULL : tail + vertex_size;
1882
1883
7.55k
  size_t vertex_block_size = getVertexBlockSize(vertex_size);
1884
1885
7.55k
  size_t vertex_offset = 0;
1886
1887
154k
  while (vertex_offset < vertex_count)
1888
147k
  {
1889
147k
    // full blocks of vertex_block_size vertices; the final block takes whatever remains
    size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
1890
1891
147k
    data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version);
1892
147k
    if (!data)
1893
918
      return -2;
1894
1895
146k
    vertex_offset += block_size;
1896
146k
  }
1897
1898
6.63k
  // after the last block exactly the padded tail must remain; anything else is reported as malformed
  if (size_t(data_end - data) != tail_size_pad)
1899
600
    return -3;
1900
1901
6.03k
  return 0;
1902
6.63k
}
1903
1904
#undef SIMD_NEON
1905
#undef SIMD_SSE
1906
#undef SIMD_AVX
1907
#undef SIMD_WASM
1908
#undef SIMD_FALLBACK
1909
#undef SIMD_TARGET
1910
#undef SIMD_LATENCYOPT