/src/meshoptimizer/src/vertexcodec.cpp
Line | Count | Source |
1 | | // This file is part of meshoptimizer library; see meshoptimizer.h for version/license details |
2 | | #include "meshoptimizer.h" |
3 | | |
4 | | #include <assert.h> |
5 | | #include <string.h> |
6 | | |
7 | | // The block below auto-detects SIMD ISA that can be used on the target platform |
8 | | #ifndef MESHOPTIMIZER_NO_SIMD |
9 | | |
10 | | // The SIMD implementation requires SSSE3, which can be enabled unconditionally through compiler settings |
11 | | #if defined(__AVX__) || defined(__SSSE3__) |
12 | | #define SIMD_SSE |
13 | | #endif |
14 | | |
15 | | // An experimental implementation using AVX512 instructions; it's only enabled when AVX512 is enabled through compiler settings |
16 | | #if defined(__AVX512VBMI2__) && defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__POPCNT__) |
17 | | #undef SIMD_SSE |
18 | | #define SIMD_AVX |
19 | | #endif |
20 | | |
21 | | // MSVC supports compiling SSSE3 code regardless of compile options; we use a cpuid-based scalar fallback |
22 | | #if !defined(SIMD_SSE) && !defined(SIMD_AVX) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) |
23 | | #define SIMD_SSE |
24 | | #define SIMD_FALLBACK |
25 | | #endif |
26 | | |
27 | | // GCC 4.9+ and clang 3.8+ support targeting SIMD ISA from individual functions; we use a cpuid-based scalar fallback |
28 | | #if !defined(SIMD_SSE) && !defined(SIMD_AVX) && ((defined(__clang__) && __clang_major__ * 100 + __clang_minor__ >= 308) || (defined(__GNUC__) && __GNUC__ * 100 + __GNUC_MINOR__ >= 409)) && (defined(__i386__) || defined(__x86_64__)) |
29 | | #define SIMD_SSE |
30 | | #define SIMD_FALLBACK |
31 | | #define SIMD_TARGET __attribute__((target("ssse3"))) |
32 | | #endif |
33 | | |
34 | | // GCC/clang define these when NEON support is available |
35 | | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
36 | | #define SIMD_NEON |
37 | | #endif |
38 | | |
39 | | // On MSVC, we assume that ARM builds always target NEON-capable devices |
40 | | #if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64)) |
41 | | #define SIMD_NEON |
42 | | #endif |
43 | | |
44 | | // When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD |
45 | | #if defined(__wasm_simd128__) |
46 | | #define SIMD_WASM |
47 | | // Prevent compiling other variant when wasm simd compilation is active |
48 | | #undef SIMD_NEON |
49 | | #undef SIMD_SSE |
50 | | #undef SIMD_AVX |
51 | | #endif |
52 | | |
53 | | #ifndef SIMD_TARGET |
54 | | #define SIMD_TARGET |
55 | | #endif |
56 | | |
57 | | // When targeting AArch64/x64, optimize for latency to allow decoding of individual 16-byte groups to overlap |
58 | | // We don't do this for 32-bit systems because we need 64-bit math for this and this will hurt in-order CPUs |
59 | | #if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64) |
60 | | #define SIMD_LATENCYOPT |
61 | | #endif |
62 | | |
63 | | // In switch dispatch, marking default case as unreachable allows to remove redundant bounds checks |
64 | | #if defined(__GNUC__) |
65 | 0 | #define SIMD_UNREACHABLE() __builtin_unreachable() |
66 | | #elif defined(_MSC_VER) |
67 | | #define SIMD_UNREACHABLE() __assume(false) |
68 | | #else |
69 | | #define SIMD_UNREACHABLE() assert(!"Unreachable") |
70 | | #endif |
71 | | |
72 | | #endif // !MESHOPTIMIZER_NO_SIMD |
73 | | |
74 | | #ifdef SIMD_SSE |
75 | | #include <tmmintrin.h> |
76 | | #endif |
77 | | |
78 | | #if defined(SIMD_SSE) && defined(SIMD_FALLBACK) |
79 | | #ifdef _MSC_VER |
80 | | #include <intrin.h> // __cpuid |
81 | | #else |
82 | | #include <cpuid.h> // __cpuid |
83 | | #endif |
84 | | #endif |
85 | | |
86 | | #ifdef SIMD_AVX |
87 | | #include <immintrin.h> |
88 | | #endif |
89 | | |
90 | | #ifdef SIMD_NEON |
91 | | #if defined(_MSC_VER) && defined(_M_ARM64) |
92 | | #include <arm64_neon.h> |
93 | | #else |
94 | | #include <arm_neon.h> |
95 | | #endif |
96 | | #endif |
97 | | |
98 | | #ifdef SIMD_WASM |
99 | | #include <wasm_simd128.h> |
100 | | #endif |
101 | | |
102 | | #ifndef TRACE |
103 | | #define TRACE 0 |
104 | | #endif |
105 | | |
106 | | #if TRACE |
107 | | #include <stdio.h> |
108 | | #endif |
109 | | |
110 | | #ifdef SIMD_WASM |
111 | | #define wasmx_splat_v32x4(v, i) wasm_i32x4_shuffle(v, v, i, i, i, i) |
112 | | #define wasmx_unpacklo_v8x16(a, b) wasm_i8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23) |
113 | | #define wasmx_unpackhi_v8x16(a, b) wasm_i8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31) |
114 | | #define wasmx_unpacklo_v16x8(a, b) wasm_i16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11) |
115 | | #define wasmx_unpackhi_v16x8(a, b) wasm_i16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15) |
116 | | #define wasmx_unpacklo_v64x2(a, b) wasm_i64x2_shuffle(a, b, 0, 2) |
117 | | #define wasmx_unpackhi_v64x2(a, b) wasm_i64x2_shuffle(a, b, 1, 3) |
118 | | #endif |
119 | | |
120 | | namespace meshopt |
121 | | { |
122 | | |
// First byte of every encoded stream: 0xa0 marker (version handling not visible in this chunk)
const unsigned char kVertexHeader = 0xa0;

// Format version used for encoding; decoder supports up to kDecodeVertexVersion
static int gEncodeVertexVersion = 1;
const int kDecodeVertexVersion = 1;

const size_t kVertexBlockSizeBytes = 8192; // scratch budget for one vertex block (see getVertexBlockSize)
const size_t kVertexBlockMaxSize = 256;    // maximum vertices per block
const size_t kByteGroupSize = 16;          // bytes encoded per bit-width group
const size_t kByteGroupDecodeLimit = 24;   // minimum bytes required in the stream to decode one group
const size_t kTailMinSizeV0 = 32;          // minimum stream tail size, version 0
const size_t kTailMinSizeV1 = 24;          // minimum stream tail size, version 1

// Per-group bit widths; v1 control value selects a window of 4 entries out of kBitsV1
static const int kBitsV0[4] = {0, 2, 4, 8};
static const int kBitsV1[5] = {0, 1, 2, 4, 8};

const int kEncodeDefaultLevel = 2;
139 | | |
140 | | static size_t getVertexBlockSize(size_t vertex_size) |
141 | 25.6k | { |
142 | | // make sure the entire block fits into the scratch buffer and is aligned to byte group size |
143 | | // note: the block size is implicitly part of the format, so we can't change it without breaking compatibility |
144 | 25.6k | size_t result = (kVertexBlockSizeBytes / vertex_size) & ~(kByteGroupSize - 1); |
145 | | |
146 | 25.6k | return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize; |
147 | 25.6k | } |
148 | | |
// Rotates v left by r bits; the right-shift amount is masked so r == 0 is well-defined.
inline unsigned int rotate(unsigned int v, int r)
{
	unsigned int hi = v << r;
	unsigned int lo = v >> ((32 - r) & 31);

	return hi | lo;
}
153 | | |
// Zigzag-maps a two's-complement delta held in an unsigned type to a small code:
// 0, -1, 1, -2, 2, ... become 0, 1, 2, 3, 4, ... so small deltas use few bits.
template <typename T>
inline T zigzag(T v)
{
	// all-ones when the sign bit is set, zero otherwise
	T sign = T(0) - T(v >> (sizeof(T) * 8 - 1));

	return T(sign ^ T(v << 1));
}
159 | | |
// Inverse of zigzag: 0, 1, 2, 3, 4, ... become 0, -1, 1, -2, 2, ...
template <typename T>
inline T unzigzag(T v)
{
	// all-ones when the low (sign) bit is set, zero otherwise
	T sign = T(0) - T(v & 1);

	return T(sign ^ T(v >> 1));
}
165 | | |
#if TRACE
// Per-byte-offset encoding statistics, collected only in TRACE builds
struct Stats
{
	size_t size;    // total encoded bytes
	size_t header;  // bytes for header
	size_t bitg[9]; // bytes for bit groups
	size_t bitc[8]; // bit consistency: how many bits are shared between all bytes in a group
	size_t ctrl[4]; // number of control groups
};

// stats slot for the byte offset currently being encoded (set inside encodeVertexBlock)
static Stats* bytestats = NULL;
static Stats vertexstats[256];
#endif
179 | | |
180 | | static bool encodeBytesGroupZero(const unsigned char* buffer) |
181 | 38.3M | { |
182 | 38.3M | assert(kByteGroupSize == sizeof(unsigned long long) * 2); |
183 | | |
184 | 38.3M | unsigned long long v[2]; |
185 | 38.3M | memcpy(v, buffer, sizeof(v)); |
186 | | |
187 | 38.3M | return (v[0] | v[1]) == 0; |
188 | 38.3M | } |
189 | | |
190 | | static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits) |
191 | 261M | { |
192 | 261M | assert(bits >= 0 && bits <= 8); |
193 | | |
194 | 261M | if (bits == 0) |
195 | 22.8M | return encodeBytesGroupZero(buffer) ? 0 : size_t(-1); |
196 | | |
197 | 239M | if (bits == 8) |
198 | 58.2M | return kByteGroupSize; |
199 | | |
200 | 180M | size_t result = kByteGroupSize * bits / 8; |
201 | | |
202 | 180M | unsigned char sentinel = (1 << bits) - 1; |
203 | | |
204 | 3.07G | for (size_t i = 0; i < kByteGroupSize; ++i) |
205 | 2.89G | result += buffer[i] >= sentinel; |
206 | | |
207 | 180M | return result; |
208 | 239M | } |
209 | | |
// Encodes one kByteGroupSize group of bytes at the given bit width.
// Values that fit in 'bits' bits are packed; values >= the all-ones sentinel
// are stored as a full byte in a variable-length tail, with the sentinel
// acting as the escape marker. Returns the write position past the group.
static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits)
{
	assert(bits >= 0 && bits <= 8);
	assert(kByteGroupSize % 8 == 0);

	// 0-bit groups emit nothing; caller guarantees the group is all zero
	if (bits == 0)
		return data;

	// 8-bit groups are stored verbatim
	if (bits == 8)
	{
		memcpy(data, buffer, kByteGroupSize);
		return data + kByteGroupSize;
	}

	size_t byte_size = 8 / bits;
	assert(kByteGroupSize % byte_size == 0);

	// fixed portion: bits bits for each value
	// variable portion: full byte for each out-of-range value (using 1...1 as sentinel)
	unsigned char sentinel = (1 << bits) - 1;

	for (size_t i = 0; i < kByteGroupSize; i += byte_size)
	{
		unsigned char byte = 0;

		// pack byte_size values, most significant first
		for (size_t k = 0; k < byte_size; ++k)
		{
			unsigned char enc = (buffer[i + k] >= sentinel) ? sentinel : buffer[i + k];

			byte <<= bits;
			byte |= enc;
		}

		// encode 1-bit groups in reverse bit order
		// this makes them faster to decode alongside other groups
		if (bits == 1)
			byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);

		*data++ = byte;
	}

	for (size_t i = 0; i < kByteGroupSize; ++i)
	{
		unsigned char v = buffer[i];

		// branchless append of out-of-range values
		*data = v;
		data += v >= sentinel;
	}

	return data;
}
262 | | |
// Encodes buffer (buffer_size is a multiple of kByteGroupSize) as a header of
// 2-bit group selectors followed by each group's payload; every group uses the
// cheapest of the four candidate widths in bits[]. Returns the new write
// position, or NULL if [data, data_end) would overflow.
static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size, const int bits[4])
{
	assert(buffer_size % kByteGroupSize == 0);

	unsigned char* header = data;

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

	if (size_t(data_end - data) < header_size)
		return NULL;

	data += header_size;

	memset(header, 0, header_size);

	int last_bits = -1;

	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
	{
		// keep enough slack that the group encoder never needs its own bounds checks
		if (size_t(data_end - data) < kByteGroupDecodeLimit)
			return NULL;

		int best_bitk = 3;
		size_t best_size = encodeBytesGroupMeasure(buffer + i, bits[best_bitk]);

		for (int bitk = 0; bitk < 3; ++bitk)
		{
			size_t size = encodeBytesGroupMeasure(buffer + i, bits[bitk]);

			// favor consistent bit selection across groups, but never replace literals
			if (size < best_size || (size == best_size && bits[bitk] == last_bits && bits[best_bitk] != 8))
			{
				best_bitk = bitk;
				best_size = size;
			}
		}

		// record the 2-bit selector for this group in the header
		size_t header_offset = i / kByteGroupSize;
		header[header_offset / 4] |= best_bitk << ((header_offset % 4) * 2);

		int best_bits = bits[best_bitk];
		unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);

		// the measured size must match what the encoder actually produced
		assert(data + best_size == next);
		data = next;
		last_bits = best_bits;

#if TRACE
		bytestats->bitg[best_bits] += best_size;
#endif
	}

#if TRACE
	bytestats->header += header_size;
#endif

	return data;
}
322 | | |
// Produces the delta codes for byte lane k of the vertex stream into buffer:
// one output byte per vertex. T is the channel width (1/2/4 bytes); deltas are
// computed per T-sized channel starting at the T-aligned offset k0, and the
// byte at intra-channel bit offset ks is emitted. Xor selects rotated-xor
// deltas (rotated left by 'rot') instead of zigzag subtraction.
template <typename T, bool Xor>
static void encodeDeltas1(unsigned char* buffer, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[256], size_t k, int rot)
{
	size_t k0 = k & ~(sizeof(T) - 1);
	int ks = (k & (sizeof(T) - 1)) * 8;

	// seed the predictor with this channel of the previous block's last vertex
	T p = last_vertex[k0];
	for (size_t j = 1; j < sizeof(T); ++j)
		p |= T(last_vertex[k0 + j]) << (j * 8);

	const unsigned char* vertex = vertex_data + k0;

	for (size_t i = 0; i < vertex_count; ++i)
	{
		// assemble the little-endian channel value for this vertex
		T v = vertex[0];
		for (size_t j = 1; j < sizeof(T); ++j)
			v |= vertex[j] << (j * 8);

		T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p));

		buffer[i] = (unsigned char)(d >> ks);
		p = v;
		vertex += vertex_size;
	}
}
348 | | |
349 | | static void encodeDeltas(unsigned char* buffer, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[256], size_t k, int channel) |
350 | 4.42M | { |
351 | 4.42M | switch (channel & 3) |
352 | 4.42M | { |
353 | 3.33M | case 0: |
354 | 3.33M | return encodeDeltas1<unsigned char, false>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0); |
355 | 610k | case 1: |
356 | 610k | return encodeDeltas1<unsigned short, false>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0); |
357 | 478k | case 2: |
358 | 478k | return encodeDeltas1<unsigned int, true>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, channel >> 4); |
359 | 0 | default: |
360 | 0 | assert(!"Unsupported channel encoding"); // unreachable |
361 | 4.42M | } |
362 | 4.42M | } |
363 | | |
// Returns the smallest v0 bit width (0/2/4/8) that can represent v.
static int estimateBits(unsigned char v)
{
	if (v == 0)
		return 0;
	if (v <= 3)
		return 2;
	if (v <= 15)
		return 4;

	return 8;
}
368 | | |
// Estimates the best left-rotation (0..7) for xor-encoding the 4-byte channel
// at offset k: accumulates a per-group mask of bits that change between
// consecutive vertices, scores every candidate rotation of that mask with the
// v0 bit-width estimate, and returns the rotation with the smallest total.
static int estimateRotate(const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, size_t k, size_t group_size)
{
	size_t sizes[8] = {};

	const unsigned char* vertex = vertex_data + k;
	unsigned int last = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24);

	for (size_t i = 0; i < vertex_count; i += group_size)
	{
		unsigned int bitg = 0;

		// calculate bit consistency mask for the group
		for (size_t j = 0; j < group_size && i + j < vertex_count; ++j)
		{
			unsigned int v = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24);
			unsigned int d = v ^ last;

			bitg |= d;
			last = v;
			vertex += vertex_size;
		}

#if TRACE
		for (int j = 0; j < 32; ++j)
			vertexstats[k + (j / 8)].bitc[j % 8] += (i + group_size < vertex_count ? group_size : vertex_count - i) * (1 - ((bitg >> j) & 1));
#endif

		// score each candidate rotation by the cost of the rotated mask's four bytes
		for (int j = 0; j < 8; ++j)
		{
			unsigned int bitr = rotate(bitg, j);

			sizes[j] += estimateBits((unsigned char)(bitr >> 0)) + estimateBits((unsigned char)(bitr >> 8));
			sizes[j] += estimateBits((unsigned char)(bitr >> 16)) + estimateBits((unsigned char)(bitr >> 24));
		}
	}

	int best_rot = 0;
	for (int rot = 1; rot < 8; ++rot)
		best_rot = (sizes[rot] < sizes[best_rot]) ? rot : best_rot;

	return best_rot;
}
411 | | |
// Picks the best delta channel kind for the 4-byte lane at offset k
// (0: byte zigzag, 1: short zigzag, 2: 32-bit xor with rotation xor_rot) by
// trial-encoding every block_skip-th block and summing the best 1/2/4/8-bit
// group sizes per channel. For kind 2 the rotation is folded into bits 4+.
static int estimateChannel(const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, size_t k, size_t vertex_block_size, size_t block_skip, int max_channel, int xor_rot)
{
	unsigned char block[kVertexBlockMaxSize];
	assert(vertex_block_size <= kVertexBlockMaxSize);

	unsigned char last_vertex[256] = {};

	size_t sizes[3] = {};
	assert(max_channel <= 3);

	for (size_t i = 0; i < vertex_count; i += vertex_block_size * block_skip)
	{
		size_t block_size = i + vertex_block_size < vertex_count ? vertex_block_size : vertex_count - i;
		size_t block_size_aligned = (block_size + kByteGroupSize - 1) & ~(kByteGroupSize - 1);

		// seed the delta predictor from the vertex just before this block
		memcpy(last_vertex, vertex_data + (i == 0 ? 0 : i - 1) * vertex_size, vertex_size);

		// we sometimes encode elements we didn't fill when rounding to kByteGroupSize
		if (block_size < block_size_aligned)
			memset(block + block_size, 0, block_size_aligned - block_size);

		for (int channel = 0; channel < max_channel; ++channel)
			for (size_t j = 0; j < 4; ++j)
			{
				encodeDeltas(block, vertex_data + i * vertex_size, block_size, vertex_size, last_vertex, k + j, channel | (xor_rot << 4));

				for (size_t ig = 0; ig < block_size; ig += kByteGroupSize)
				{
					// to maximize encoding performance we only evaluate 1/2/4/8 bit groups
					size_t size1 = encodeBytesGroupMeasure(block + ig, 1);
					size_t size2 = encodeBytesGroupMeasure(block + ig, 2);
					size_t size4 = encodeBytesGroupMeasure(block + ig, 4);
					size_t size8 = encodeBytesGroupMeasure(block + ig, 8);

					size_t best_size = size1 < size2 ? size1 : size2;
					best_size = best_size < size4 ? best_size : size4;
					best_size = best_size < size8 ? best_size : size8;

					sizes[channel] += best_size;
				}
			}
	}

	int best_channel = 0;
	for (int channel = 1; channel < max_channel; ++channel)
		best_channel = (sizes[channel] < sizes[best_channel]) ? channel : best_channel;

	return best_channel == 2 ? best_channel | (xor_rot << 4) : best_channel;
}
461 | | |
462 | | static bool estimateControlZero(const unsigned char* buffer, size_t vertex_count_aligned) |
463 | 2.78M | { |
464 | 16.3M | for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize) |
465 | 15.5M | if (!encodeBytesGroupZero(buffer + i)) |
466 | 1.97M | return false; |
467 | | |
468 | 812k | return true; |
469 | 2.78M | } |
470 | | |
// Picks the v1 control mode for one delta-coded byte lane:
// 0: groups pick from 0/1/2/4-bit widths, 1: groups pick from 1/2/4/8-bit,
// 2: the whole lane is zero, 3: the lane is stored as literal bytes.
// level 0 skips the size estimation for encoding speed.
static int estimateControl(const unsigned char* buffer, size_t vertex_count, size_t vertex_count_aligned, int level)
{
	if (estimateControlZero(buffer, vertex_count_aligned))
		return 2; // zero encoding

	if (level == 0)
		return 1; // 1248 encoding in level 0 for encoding speed

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (vertex_count_aligned / kByteGroupSize + 3) / 4;

	size_t est_bytes0 = header_size, est_bytes1 = header_size;

	for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize)
	{
		// assumes kBitsV1[] = {0, 1, 2, 4, 8} for performance
		size_t size0 = encodeBytesGroupMeasure(buffer + i, 0);
		size_t size1 = encodeBytesGroupMeasure(buffer + i, 1);
		size_t size2 = encodeBytesGroupMeasure(buffer + i, 2);
		size_t size4 = encodeBytesGroupMeasure(buffer + i, 4);
		size_t size8 = encodeBytesGroupMeasure(buffer + i, 8);

		// both control modes have access to 1/2/4 bit encoding
		size_t size12 = size1 < size2 ? size1 : size2;
		size_t size124 = size12 < size4 ? size12 : size4;

		// each control mode has access to 0/8 bit encoding respectively
		est_bytes0 += size124 < size0 ? size124 : size0;
		est_bytes1 += size124 < size8 ? size124 : size8;
	}

	// pick shortest control entry but prefer literal encoding
	if (est_bytes0 < vertex_count || est_bytes1 < vertex_count)
		return est_bytes0 < est_bytes1 ? 0 : 1;
	else
		return 3; // literal encoding
}
508 | | |
// Encodes one block of up to kVertexBlockMaxSize vertices. For each byte lane
// k it computes delta codes, selects a control mode (v1 only), and emits
// either literal bytes, nothing (all-zero lane), or bit-packed groups via
// encodeBytes. Updates last_vertex to the block's final vertex so the next
// block's deltas chain correctly. Returns the new write position or NULL on
// output-buffer overflow.
static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version, int level)
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
	assert(vertex_size % 4 == 0);

	unsigned char buffer[kVertexBlockMaxSize];
	assert(sizeof(buffer) % kByteGroupSize == 0);

	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);

	// we sometimes encode elements we didn't fill when rounding to kByteGroupSize
	memset(buffer, 0, sizeof(buffer));

	// v1 streams store one control byte per 4 vertex bytes; v0 has no control data
	size_t control_size = version == 0 ? 0 : vertex_size / 4;
	if (size_t(data_end - data) < control_size)
		return NULL;

	unsigned char* control = data;
	data += control_size;

	memset(control, 0, control_size);

	for (size_t k = 0; k < vertex_size; ++k)
	{
		// v0 always uses byte-wise zigzag deltas (channel 0)
		encodeDeltas(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, version == 0 ? 0 : channels[k / 4]);

#if TRACE
		const unsigned char* olddata = data;
		bytestats = &vertexstats[k];
#endif

		int ctrl = 0;

		if (version != 0)
		{
			ctrl = estimateControl(buffer, vertex_count, vertex_count_aligned, level);

			assert(unsigned(ctrl) < 4);
			control[k / 4] |= ctrl << ((k % 4) * 2);

#if TRACE
			vertexstats[k].ctrl[ctrl]++;
#endif
		}

		if (ctrl == 3)
		{
			// literal encoding
			if (size_t(data_end - data) < vertex_count)
				return NULL;

			memcpy(data, buffer, vertex_count);
			data += vertex_count;
		}
		else if (ctrl != 2) // non-zero encoding
		{
			// for v1, ctrl selects a window of 4 bit widths out of kBitsV1
			data = encodeBytes(data, data_end, buffer, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl);
			if (!data)
				return NULL;
		}

#if TRACE
		bytestats = NULL;
		vertexstats[k].size += data - olddata;
#endif
	}

	memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}
580 | | |
581 | | #if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX) && !defined(SIMD_WASM)) |
// Scalar decoder for one kByteGroupSize group: unpacks 'bits'-wide values and
// substitutes escaped values (all-ones sentinel) from the variable-length
// tail at data_var. Mirrors encodeBytesGroup, including the reversed bit
// order of 1-bit groups. Returns the read position past the encoded group.
static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bits)
{
// READ fetches the next packed byte; NEXT extracts one value from its top
// 'bits' bits, replacing the sentinel with the next tail byte
#define READ() byte = *data++
#define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? encv : enc, data_var += (enc == (1 << bits) - 1)

	unsigned char byte, enc, encv;
	const unsigned char* data_var;

	switch (bits)
	{
	case 0:
		memset(buffer, 0, kByteGroupSize);
		return data;
	case 1:
		data_var = data + 2;

		// 2 groups with 8 1-bit values in each byte (reversed from the order in other groups)
		READ();
		byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);
		NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1);
		READ();
		byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);
		NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1);

		return data_var;
	case 2:
		data_var = data + 4;

		// 4 groups with 4 2-bit values in each byte
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);

		return data_var;
	case 4:
		data_var = data + 8;

		// 8 groups with 2 4-bit values in each byte
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);

		return data_var;
	case 8:
		memcpy(buffer, data, kByteGroupSize);
		return data + kByteGroupSize;
	default:
		assert(!"Unexpected bit length"); // unreachable
		return data;
	}

#undef READ
#undef NEXT
}
642 | | |
// Scalar counterpart of encodeBytes: reads the header of 2-bit group selectors
// and decodes each group at the width given by bits[selector]. Returns the
// new read position, or NULL if the input would be overrun.
static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size, const int* bits)
{
	assert(buffer_size % kByteGroupSize == 0);

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
	if (size_t(data_end - data) < header_size)
		return NULL;

	const unsigned char* header = data;
	data += header_size;

	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
	{
		// a single bounds check per group covers the worst-case group read
		if (size_t(data_end - data) < kByteGroupDecodeLimit)
			return NULL;

		size_t header_offset = i / kByteGroupSize;
		int bitsk = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;

		data = decodeBytesGroup(data, buffer + i, bits[bitsk]);
	}

	return data;
}
668 | | |
// Scalar counterpart of encodeDeltas1: reconstructs 4 bytes of each vertex
// from delta codes stored channel-planar in buffer, writing the results back
// in interleaved vertex layout via transposed. T is the channel width; Xor
// selects rotated-xor decoding (rot is the decode-side rotation amount)
// instead of zigzag addition.
template <typename T, bool Xor>
static void decodeDeltas1(const unsigned char* buffer, unsigned char* transposed, size_t vertex_count, size_t vertex_size, const unsigned char* last_vertex, int rot)
{
	for (size_t k = 0; k < 4; k += sizeof(T))
	{
		size_t vertex_offset = k;

		// seed the predictor with this channel of the previous block's last vertex
		T p = last_vertex[0];
		for (size_t j = 1; j < sizeof(T); ++j)
			p |= last_vertex[j] << (8 * j);

		for (size_t i = 0; i < vertex_count; ++i)
		{
			// assemble the encoded value from the channel-planar byte lanes
			T v = buffer[i];
			for (size_t j = 1; j < sizeof(T); ++j)
				v |= buffer[i + vertex_count * j] << (8 * j);

			v = Xor ? T(rotate(v, rot)) ^ p : unzigzag(v) + p;

			// scatter the decoded channel bytes back into the interleaved vertex
			for (size_t j = 0; j < sizeof(T); ++j)
				transposed[vertex_offset + j] = (unsigned char)(v >> (j * 8));

			p = v;

			vertex_offset += vertex_size;
		}

		buffer += vertex_count * sizeof(T);
		last_vertex += sizeof(T);
	}
}
700 | | |
// Decodes one block of up to kVertexBlockMaxSize vertices.
// For version >= 1, the block starts with one control byte per 4-byte channel group
// (2 bits per byte lane: 0/1 = bit-packed via decodeBytes, 2 = all zero, 3 = raw literal),
// and channels[] selects the per-group delta mode. For version 0, control/channels are
// implicit (bit-packed, byte deltas). Updates last_vertex to the last decoded vertex so
// the next block continues the delta chain. Returns NULL on malformed/truncated input.
static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version)
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	unsigned char buffer[kVertexBlockMaxSize * 4];
	unsigned char transposed[kVertexBlockSizeBytes];

	// decodeBytes needs whole byte groups; extra bytes past vertex_count are scratch
	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
	assert(vertex_count <= vertex_count_aligned);

	size_t control_size = version == 0 ? 0 : vertex_size / 4;
	if (size_t(data_end - data) < control_size)
		return NULL;

	const unsigned char* control = data;
	data += control_size;

	// decode each 4-byte channel group into transposed layout, then apply deltas
	for (size_t k = 0; k < vertex_size; k += 4)
	{
		unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4];

		for (size_t j = 0; j < 4; ++j)
		{
			int ctrl = (ctrl_byte >> (j * 2)) & 3;

			if (ctrl == 3)
			{
				// literal encoding
				if (size_t(data_end - data) < vertex_count)
					return NULL;

				memcpy(buffer + j * vertex_count, data, vertex_count);
				data += vertex_count;
			}
			else if (ctrl == 2)
			{
				// zero encoding
				memset(buffer + j * vertex_count, 0, vertex_count);
			}
			else
			{
				// bit-packed encoding; ctrl selects the v1 bit-width table variant
				data = decodeBytes(data, data_end, buffer + j * vertex_count, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl);
				if (!data)
					return NULL;
			}
		}

		int channel = version == 0 ? 0 : channels[k / 4];

		// low 2 bits select delta width; for xor mode, high nibble encodes the rotate amount
		switch (channel & 3)
		{
		case 0:
			decodeDeltas1<unsigned char, false>(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, 0);
			break;
		case 1:
			decodeDeltas1<unsigned short, false>(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, 0);
			break;
		case 2:
			decodeDeltas1<unsigned int, true>(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31);
			break;
		default:
			return NULL; // invalid channel type
		}
	}

	memcpy(vertex_data, transposed, vertex_count * vertex_size);

	// remember last decoded vertex to seed deltas for the next block
	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}
772 | | #endif |
773 | | |
774 | | #if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM) |
// For every 8-bit "escape" mask, kDecodeBytesGroupShuffle[mask] maps output lanes to the
// index of the corresponding literal byte (0x80 = lane not escaped, yields 0 in shuffles),
// and kDecodeBytesGroupCount[mask] is the number of set bits (= literal bytes consumed).
static unsigned char kDecodeBytesGroupShuffle[256][8];
static unsigned char kDecodeBytesGroupCount[256];

#ifdef __wasm__
__attribute__((cold)) // this saves 500 bytes in the output binary - we don't need to vectorize this loop!
#endif
static bool
decodeBytesGroupBuildTables()
{
	for (int mask = 0; mask < 256; ++mask)
	{
		unsigned char count = 0;

		// lane i takes the count-th literal if bit i is set, otherwise the 0x80 sentinel
		for (int bit = 0; bit < 8; ++bit)
			kDecodeBytesGroupShuffle[mask][bit] = (mask & (1 << bit)) ? count++ : 0x80;

		kDecodeBytesGroupCount[mask] = count;
	}

	return true;
}

// build tables once at startup
static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
804 | | #endif |
805 | | |
806 | | #ifdef SIMD_SSE |
SIMD_TARGET
// Builds a 16-lane pshufb control from the two 8-bit escape masks: low half gathers
// literals for mask0, high half gathers literals for mask1 offset by mask0's literal count.
inline __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{
	__m128i sm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask0]));
	__m128i sm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask1]));
	__m128i sm1off = _mm_set1_epi8(kDecodeBytesGroupCount[mask0]);

	// second half's literals start after the first half's literals in the input stream
	__m128i sm1r = _mm_add_epi8(sm1, sm1off);

	return _mm_unpacklo_epi64(sm0, sm1r);
}
818 | | |
SIMD_TARGET
// Decodes one 16-byte group (SSE path). hbits selects the encoding:
// 0/4 = all zeros, 1/6 = 2-bit selectors with sentinel 3 escaping to a literal byte,
// 2/7 = 4-bit nibbles with sentinel 15 escaping, 3/8 = 16 raw bytes,
// 5 = 1-bit escape mask (two mask bytes followed by literal bytes).
// Returns the advanced read pointer; caller guarantees kByteGroupDecodeLimit readable bytes.
inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
{
	switch (hbits)
	{
	case 0:
	case 4:
	{
		__m128i result = _mm_setzero_si128();

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data;
	}

	case 1:
	case 6:
	{
#ifdef __GNUC__
		typedef int __attribute__((aligned(1))) unaligned_int;
#else
		typedef int unaligned_int;
#endif

#ifdef SIMD_LATENCYOPT
		// compute the literal count from the selector bits directly, off the critical path
		unsigned int data32;
		memcpy(&data32, data, 4);
		data32 &= data32 >> 1;

		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);

		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
#endif

		__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));

		// expand 4 selector bytes into 16 byte lanes, 2 bits per lane
		__m128i sel22 = _mm_unpacklo_epi8(_mm_srli_epi16(sel2, 4), sel2);
		__m128i sel2222 = _mm_unpacklo_epi8(_mm_srli_epi16(sel22, 2), sel22);
		__m128i sel = _mm_and_si128(sel2222, _mm_set1_epi8(3));

		// lanes equal to the sentinel (3) take their value from the literal stream
		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(3));
		int mask16 = _mm_movemask_epi8(mask);
		unsigned char mask0 = (unsigned char)(mask16 & 255);
		unsigned char mask1 = (unsigned char)(mask16 >> 8);

		__m128i shuf = decodeShuffleMask(mask0, mask1);
		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

#ifdef SIMD_LATENCYOPT
		return data + 4 + datacnt;
#else
		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
#endif
	}

	case 2:
	case 7:
	{
#ifdef SIMD_LATENCYOPT
		// compute the literal count from the nibble bits directly, off the critical path
		unsigned long long data64;
		memcpy(&data64, data, 8);
		data64 &= data64 >> 1;
		data64 &= data64 >> 2;

		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
#endif

		__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));

		// expand 8 nibble bytes into 16 byte lanes, 4 bits per lane
		__m128i sel44 = _mm_unpacklo_epi8(_mm_srli_epi16(sel4, 4), sel4);
		__m128i sel = _mm_and_si128(sel44, _mm_set1_epi8(15));

		// lanes equal to the sentinel (15) take their value from the literal stream
		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(15));
		int mask16 = _mm_movemask_epi8(mask);
		unsigned char mask0 = (unsigned char)(mask16 & 255);
		unsigned char mask1 = (unsigned char)(mask16 >> 8);

		__m128i shuf = decodeShuffleMask(mask0, mask1);
		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

#ifdef SIMD_LATENCYOPT
		return data + 8 + datacnt;
#else
		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
#endif
	}

	case 3:
	case 8:
	{
		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 16;
	}

	case 5:
	{
		// 1-bit mode: escape masks are stored explicitly, non-escaped lanes are zero
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 2));

		unsigned char mask0 = data[0];
		unsigned char mask1 = data[1];

		__m128i shuf = decodeShuffleMask(mask0, mask1);
		__m128i result = _mm_shuffle_epi8(rest, shuf);

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	default:
		SIMD_UNREACHABLE(); // unreachable
	}
}
944 | | #endif |
945 | | |
946 | | #ifdef SIMD_AVX |
// Per-hbits configuration for the AVX-512 group decoder: [hbits][0] is the per-lane
// value mask / escape sentinel, [hbits][1] is the _mm_multishift_epi64_epi8 control that
// gathers each lane's packed bits into the low bits of the corresponding byte lane.
// Rows 1/2 serve the 2-bit/4-bit modes, rows 5-7 the 1/2/4-bit modes; rows 0/3/4 are
// unused (handled by dedicated switch cases in decodeBytesGroupSimd).
static const __m128i kDecodeBytesGroupConfig[8][2] = {
    {_mm_setzero_si128(), _mm_setzero_si128()},
    {_mm_set1_epi8(3), _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24)},
    {_mm_set1_epi8(15), _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56)},
    {_mm_setzero_si128(), _mm_setzero_si128()},
    {_mm_setzero_si128(), _mm_setzero_si128()},
    {_mm_set1_epi8(1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)},
    {_mm_set1_epi8(3), _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24)},
    {_mm_set1_epi8(15), _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56)},
};
957 | | |
SIMD_TARGET
// Decodes one 16-byte group (AVX-512 path); same format as the SSE version but uses
// multishift to unpack selector bits and mask-expand to splice in escaped literal bytes.
inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
{
	switch (hbits)
	{
	case 0:
	case 4:
	{
		__m128i result = _mm_setzero_si128();

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data;
	}

	case 5: // 1-bit
	case 1: // 2-bit
	case 6:
	case 2: // 4-bit
	case 7:
	{
		// selector block is 2/4/8 bytes for 1/2/4-bit modes; literals follow it
		const unsigned char* skip = data + (2 << (hbits < 3 ? hbits : hbits - 5));

		__m128i selb = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(skip));

		__m128i sent = kDecodeBytesGroupConfig[hbits][0];
		__m128i ctrl = kDecodeBytesGroupConfig[hbits][1];

		// broadcast selector qword to both halves, then gather each lane's bits
		__m128i selw = _mm_shuffle_epi32(selb, 0x44);
		__m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw));
		// lanes equal to the sentinel pull their value from the literal stream
		__mmask16 mask16 = _mm_cmp_epi8_mask(sel, sent, _MM_CMPINT_EQ);

		__m128i result = _mm_mask_expand_epi8(sel, mask16, rest);

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		// popcount of the escape mask = number of literal bytes consumed
		return skip + _mm_popcnt_u32(mask16);
	}

	case 3:
	case 8:
	{
		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 16;
	}

	default:
		SIMD_UNREACHABLE(); // unreachable
	}
}
1012 | | #endif |
1013 | | |
1014 | | #ifdef SIMD_NEON |
SIMD_TARGET
// Gathers literal bytes for both 8-lane halves using the precomputed shuffle tables;
// rest0/rest1 each hold the literal stream for their half (0x80 table entries yield 0).
inline uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
{
	uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]);
	uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]);

	uint8x8_t r0 = vtbl1_u8(rest0, sm0);
	uint8x8_t r1 = vtbl1_u8(rest1, sm1);

	return vcombine_u8(r0, r1);
}
1026 | | |
SIMD_TARGET
// Extracts one bit per byte lane (NEON equivalent of _mm_movemask_epi8), producing an
// 8-bit mask for each half of the vector.
inline void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
{
	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
	const uint64_t magic = 0x000103070f1f3f80ull;

	uint64x2_t mask2 = vreinterpretq_u64_u8(mask);

	mask0 = uint8_t((vgetq_lane_u64(mask2, 0) * magic) >> 56);
	mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56);
}
1038 | | |
SIMD_TARGET
// Decodes one 16-byte group (NEON path); same format and mode numbering as the SSE
// version: 0/4 zeros, 1/6 2-bit selectors, 2/7 4-bit nibbles, 3/8 raw, 5 1-bit masks.
inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
{
	switch (hbits)
	{
	case 0:
	case 4:
	{
		uint8x16_t result = vdupq_n_u8(0);

		vst1q_u8(buffer, result);

		return data;
	}

	case 1:
	case 6:
	{
#ifdef SIMD_LATENCYOPT
		// compute the literal count from the selector bits directly, off the critical path
		unsigned int data32;
		memcpy(&data32, data, 4);
		data32 &= data32 >> 1;

		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);

		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
#endif

		// expand 4 selector bytes into 16 byte lanes, 2 bits per lane
		uint8x8_t sel2 = vld1_u8(data);
		uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
		uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
		uint8x16_t sel = vandq_u8(vcombine_u8(sel2222.val[0], sel2222.val[1]), vdupq_n_u8(3));

		// lanes equal to the sentinel (3) take their value from the literal stream
		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(3));
		unsigned char mask0, mask1;
		neonMoveMask(mask, mask0, mask1);

		uint8x8_t rest0 = vld1_u8(data + 4);
		uint8x8_t rest1 = vld1_u8(data + 4 + kDecodeBytesGroupCount[mask0]);

		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);

		vst1q_u8(buffer, result);

#ifdef SIMD_LATENCYOPT
		return data + 4 + datacnt;
#else
		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
#endif
	}

	case 2:
	case 7:
	{
#ifdef SIMD_LATENCYOPT
		// compute the literal count from the nibble bits directly, off the critical path
		unsigned long long data64;
		memcpy(&data64, data, 8);
		data64 &= data64 >> 1;
		data64 &= data64 >> 2;

		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
#endif

		// expand 8 nibble bytes into 16 byte lanes, 4 bits per lane
		uint8x8_t sel4 = vld1_u8(data);
		uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
		uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);

		// lanes equal to the sentinel (15) take their value from the literal stream
		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(15));
		unsigned char mask0, mask1;
		neonMoveMask(mask, mask0, mask1);

		uint8x8_t rest0 = vld1_u8(data + 8);
		uint8x8_t rest1 = vld1_u8(data + 8 + kDecodeBytesGroupCount[mask0]);

		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);

		vst1q_u8(buffer, result);

#ifdef SIMD_LATENCYOPT
		return data + 8 + datacnt;
#else
		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
#endif
	}

	case 3:
	case 8:
	{
		uint8x16_t result = vld1q_u8(data);

		vst1q_u8(buffer, result);

		return data + 16;
	}

	case 5:
	{
		// 1-bit mode: escape masks are stored explicitly, non-escaped lanes are zero
		unsigned char mask0 = data[0];
		unsigned char mask1 = data[1];

		uint8x8_t rest0 = vld1_u8(data + 2);
		uint8x8_t rest1 = vld1_u8(data + 2 + kDecodeBytesGroupCount[mask0]);

		uint8x16_t result = shuffleBytes(mask0, mask1, rest0, rest1);

		vst1q_u8(buffer, result);

		return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	default:
		SIMD_UNREACHABLE(); // unreachable
	}
}
1156 | | #endif |
1157 | | |
1158 | | #ifdef SIMD_WASM |
SIMD_TARGET
// Builds a 16-lane swizzle control from the two 8-bit escape masks; high half literal
// indices are offset by mask0's literal count (see the SSE decodeShuffleMask).
inline v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{
	v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]);
	v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]);

	v128_t sm1off = wasm_v128_load8_splat(&kDecodeBytesGroupCount[mask0]);
	v128_t sm1r = wasm_i8x16_add(sm1, sm1off);

	return wasmx_unpacklo_v64x2(sm0, sm1r);
}
1170 | | |
SIMD_TARGET
// Extracts one bit per byte lane into two 8-bit masks (WASM movemask equivalent).
inline void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
{
	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
	const uint64_t magic = 0x000103070f1f3f80ull;

	mask0 = uint8_t((wasm_i64x2_extract_lane(mask, 0) * magic) >> 56);
	mask1 = uint8_t((wasm_i64x2_extract_lane(mask, 1) * magic) >> 56);
}
1180 | | |
SIMD_TARGET
// Decodes one 16-byte group (WASM SIMD path); same format and mode numbering as the SSE
// version: 0/4 zeros, 1/6 2-bit selectors, 2/7 4-bit nibbles, 3/8 raw, 5 1-bit masks.
inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
{
	switch (hbits)
	{
	case 0:
	case 4:
	{
		v128_t result = wasm_i8x16_splat(0);

		wasm_v128_store(buffer, result);

		return data;
	}

	case 1:
	case 6:
	{
		v128_t sel2 = wasm_v128_load(data);
		v128_t rest = wasm_v128_load(data + 4);

		// expand 4 selector bytes into 16 byte lanes, 2 bits per lane
		v128_t sel22 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2);
		v128_t sel2222 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22);
		v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3));

		// lanes equal to the sentinel (3) take their value from the literal stream
		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3));

		unsigned char mask0, mask1;
		wasmMoveMask(mask, mask0, mask1);

		v128_t shuf = decodeShuffleMask(mask0, mask1);
		v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);

		wasm_v128_store(buffer, result);

		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 2:
	case 7:
	{
		v128_t sel4 = wasm_v128_load(data);
		v128_t rest = wasm_v128_load(data + 8);

		// expand 8 nibble bytes into 16 byte lanes, 4 bits per lane
		v128_t sel44 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4);
		v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15));

		// lanes equal to the sentinel (15) take their value from the literal stream
		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15));

		unsigned char mask0, mask1;
		wasmMoveMask(mask, mask0, mask1);

		v128_t shuf = decodeShuffleMask(mask0, mask1);
		v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);

		wasm_v128_store(buffer, result);

		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 3:
	case 8:
	{
		v128_t result = wasm_v128_load(data);

		wasm_v128_store(buffer, result);

		return data + 16;
	}

	case 5:
	{
		// 1-bit mode: escape masks are stored explicitly, non-escaped lanes are zero
		v128_t rest = wasm_v128_load(data + 2);

		unsigned char mask0 = data[0];
		unsigned char mask1 = data[1];

		v128_t shuf = decodeShuffleMask(mask0, mask1);
		v128_t result = wasm_i8x16_swizzle(rest, shuf);

		wasm_v128_store(buffer, result);

		return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	default:
		SIMD_UNREACHABLE(); // unreachable
	}
}
1270 | | #endif |
1271 | | |
1272 | | #if defined(SIMD_SSE) || defined(SIMD_AVX) |
SIMD_TARGET
// Transposes a 4x4 matrix of 32-bit cells stored as byte lanes across the four vectors
// (in-place, via two rounds of unpack interleaves).
inline void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
{
	__m128i t0 = _mm_unpacklo_epi8(x0, x1);
	__m128i t1 = _mm_unpackhi_epi8(x0, x1);
	__m128i t2 = _mm_unpacklo_epi8(x2, x3);
	__m128i t3 = _mm_unpackhi_epi8(x2, x3);

	x0 = _mm_unpacklo_epi16(t0, t2);
	x1 = _mm_unpackhi_epi16(t0, t2);
	x2 = _mm_unpacklo_epi16(t1, t3);
	x3 = _mm_unpackhi_epi16(t1, t3);
}
1286 | | |
SIMD_TARGET
// Per-byte zigzag decode: v -> (v >> 1) ^ -(v & 1); the 127 mask discards bits shifted
// in from the neighboring byte by the 16-bit shift.
inline __m128i unzigzag8(__m128i v)
{
	__m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
	__m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));

	return _mm_xor_si128(xl, xr);
}
1295 | | |
SIMD_TARGET
// Per-16-bit-lane zigzag decode: v -> (v >> 1) ^ -(v & 1).
inline __m128i unzigzag16(__m128i v)
{
	__m128i xl = _mm_sub_epi16(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi16(1)));
	__m128i xr = _mm_srli_epi16(v, 1);

	return _mm_xor_si128(xl, xr);
}
1304 | | |
SIMD_TARGET
// Per-32-bit-lane rotate left by r bits; callers pass r in [0, 31] (r = 32 would shift by
// the full width, which _mm_srli_epi32 handles by producing 0 only for the left half).
inline __m128i rotate32(__m128i v, int r)
{
	return _mm_or_si128(_mm_slli_epi32(v, r), _mm_srli_epi32(v, 32 - r));
}
1310 | | #endif |
1311 | | |
1312 | | #ifdef SIMD_NEON |
SIMD_TARGET
// Transposes a 4x4 matrix of 32-bit cells stored as byte lanes across the four vectors
// (NEON variant; two rounds of zip interleaves).
inline void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
{
	uint8x16x2_t t01 = vzipq_u8(x0, x1);
	uint8x16x2_t t23 = vzipq_u8(x2, x3);

	uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0]));
	uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1]));

	x0 = vreinterpretq_u8_u16(x01.val[0]);
	x1 = vreinterpretq_u8_u16(x01.val[1]);
	x2 = vreinterpretq_u8_u16(x23.val[0]);
	x3 = vreinterpretq_u8_u16(x23.val[1]);
}
1327 | | |
SIMD_TARGET
// Per-byte zigzag decode: v -> (v >> 1) ^ -(v & 1) (NEON variant).
inline uint8x16_t unzigzag8(uint8x16_t v)
{
	uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1)))));
	uint8x16_t xr = vshrq_n_u8(v, 1);

	return veorq_u8(xl, xr);
}
1336 | | |
SIMD_TARGET
// Per-16-bit-lane zigzag decode: v -> (v >> 1) ^ -(v & 1) (NEON variant).
inline uint8x16_t unzigzag16(uint8x16_t v)
{
	uint16x8_t vv = vreinterpretq_u16_u8(v);
	uint8x16_t xl = vreinterpretq_u8_s16(vnegq_s16(vreinterpretq_s16_u16(vandq_u16(vv, vdupq_n_u16(1)))));
	uint8x16_t xr = vreinterpretq_u8_u16(vshrq_n_u16(vv, 1));

	return veorq_u8(xl, xr);
}
1346 | | |
SIMD_TARGET
// Per-32-bit-lane rotate left by r bits; implemented with a left shift by r and a left
// shift by the negative amount r-32 (vshlq with a negative count shifts right).
inline uint8x16_t rotate32(uint8x16_t v, int r)
{
	uint32x4_t v32 = vreinterpretq_u32_u8(v);
	return vreinterpretq_u8_u32(vorrq_u32(vshlq_u32(v32, vdupq_n_s32(r)), vshlq_u32(v32, vdupq_n_s32(r - 32))));
}
1353 | | |
// Folds four decoded delta-group vectors into the running previous-value vector npi,
// using the channel's combine operation (byte add / u16 add / xor — matching the FIXD
// ops in decodeDeltas4Simd). The lane-wise reduction (low+high halves, then a 4-byte
// vext) collapses each vector's groups so npi advances past all 16 decoded elements.
// NOTE(review): presumably used to carry the delta chain across iterations of the NEON
// delta decoder — the caller is outside this view; verify against decodeDeltas4Simd.
template <int Channel>
SIMD_TARGET inline uint8x8_t rebase(uint8x8_t npi, uint8x16_t r0, uint8x16_t r1, uint8x16_t r2, uint8x16_t r3)
{
	switch (Channel)
	{
	case 0:
	{
		// byte-delta channel: accumulate with 8-bit adds
		uint8x16_t rsum = vaddq_u8(vaddq_u8(r0, r1), vaddq_u8(r2, r3));
		uint8x8_t rsumx = vadd_u8(vget_low_u8(rsum), vget_high_u8(rsum));
		return vadd_u8(vadd_u8(npi, rsumx), vext_u8(rsumx, rsumx, 4));
	}
	case 1:
	{
		// 16-bit-delta channel: accumulate with 16-bit adds
		uint16x8_t rsum = vaddq_u16(vaddq_u16(vreinterpretq_u16_u8(r0), vreinterpretq_u16_u8(r1)), vaddq_u16(vreinterpretq_u16_u8(r2), vreinterpretq_u16_u8(r3)));
		uint16x4_t rsumx = vadd_u16(vget_low_u16(rsum), vget_high_u16(rsum));
		return vreinterpret_u8_u16(vadd_u16(vadd_u16(vreinterpret_u16_u8(npi), rsumx), vext_u16(rsumx, rsumx, 2)));
	}
	case 2:
	{
		// xor channel: accumulate with xor (its own inverse, so reduction order is free)
		uint8x16_t rsum = veorq_u8(veorq_u8(r0, r1), veorq_u8(r2, r3));
		uint8x8_t rsumx = veor_u8(vget_low_u8(rsum), vget_high_u8(rsum));
		return veor_u8(veor_u8(npi, rsumx), vext_u8(rsumx, rsumx, 4));
	}
	default:
		return npi;
	}
}
1381 | | #endif |
1382 | | |
1383 | | #ifdef SIMD_WASM |
SIMD_TARGET
// Transposes a 4x4 matrix of 32-bit cells stored as byte lanes across the four vectors
// (WASM variant; two rounds of unpack interleaves).
inline void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
{
	v128_t t0 = wasmx_unpacklo_v8x16(x0, x1);
	v128_t t1 = wasmx_unpackhi_v8x16(x0, x1);
	v128_t t2 = wasmx_unpacklo_v8x16(x2, x3);
	v128_t t3 = wasmx_unpackhi_v8x16(x2, x3);

	x0 = wasmx_unpacklo_v16x8(t0, t2);
	x1 = wasmx_unpackhi_v16x8(t0, t2);
	x2 = wasmx_unpacklo_v16x8(t1, t3);
	x3 = wasmx_unpackhi_v16x8(t1, t3);
}
1397 | | |
SIMD_TARGET
// Per-byte zigzag decode: v -> (v >> 1) ^ -(v & 1) (WASM variant).
inline v128_t unzigzag8(v128_t v)
{
	v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1)));
	v128_t xr = wasm_u8x16_shr(v, 1);

	return wasm_v128_xor(xl, xr);
}
1406 | | |
SIMD_TARGET
// Per-16-bit-lane zigzag decode: v -> (v >> 1) ^ -(v & 1) (WASM variant).
inline v128_t unzigzag16(v128_t v)
{
	v128_t xl = wasm_i16x8_neg(wasm_v128_and(v, wasm_i16x8_splat(1)));
	v128_t xr = wasm_u16x8_shr(v, 1);

	return wasm_v128_xor(xl, xr);
}
1415 | | |
SIMD_TARGET
// Per-32-bit-lane rotate left by r bits (WASM variant).
inline v128_t rotate32(v128_t v, int r)
{
	return wasm_v128_or(wasm_i32x4_shl(v, r), wasm_i32x4_shr(v, 32 - r));
}
1421 | | #endif |
1422 | | |
1423 | | #if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM) |
SIMD_TARGET
// SIMD counterpart of decodeBytes: decodes buffer_size bytes worth of 16-byte groups,
// reading a 2-bit mode per group from the header and adding hshift to select the
// version-specific mode range for decodeBytesGroupSimd. Returns the advanced read
// pointer, or NULL if the input is truncated.
static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size, int hshift)
{
	assert(buffer_size % kByteGroupSize == 0);
	assert(kByteGroupSize == 16);

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
	if (size_t(data_end - data) < header_size)
		return NULL;

	const unsigned char* header = data;
	data += header_size;

	size_t i = 0;

	// fast-path: process 4 groups at a time, do a shared bounds check
	for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4)
	{
		size_t header_offset = i / kByteGroupSize;
		unsigned char header_byte = header[header_offset / 4];

		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, hshift + ((header_byte >> 0) & 3));
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, hshift + ((header_byte >> 2) & 3));
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, hshift + ((header_byte >> 4) & 3));
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, hshift + ((header_byte >> 6) & 3));
	}

	// slow-path: process remaining groups
	for (; i < buffer_size; i += kByteGroupSize)
	{
		// each group may read up to kByteGroupDecodeLimit bytes; check before decoding
		if (size_t(data_end - data) < kByteGroupDecodeLimit)
			return NULL;

		size_t header_offset = i / kByteGroupSize;
		unsigned char header_byte = header[header_offset / 4];

		data = decodeBytesGroupSimd(data, buffer + i, hshift + ((header_byte >> ((header_offset % 4) * 2)) & 3));
	}

	return data;
}
1466 | | |
template <int Channel>
SIMD_TARGET static void
decodeDeltas4Simd(const unsigned char* buffer, unsigned char* transposed, size_t vertex_count_aligned, size_t vertex_size, unsigned char last_vertex[4], int rot)
{
	// Reconstructs one 4-byte channel group for a block of vertices from its
	// delta/xor-encoded transposed byte streams, writing results back into
	// interleaved vertex order at stride vertex_size.
	//
	// Channel selects the reconstruction mode at compile time:
	//   0 - per-byte zigzag deltas (unzigzag8 + 8-bit adds)
	//   1 - per-16-bit-word zigzag deltas (unzigzag16 + 16-bit adds)
	//   2 - 32-bit xor deltas with a per-channel bit rotation (rotate32 by `rot`)
	//
	// The macros below define the same micro-ops for each SIMD ISA so the loop
	// body is written once:
	//   TEMP    - register type holding one 4-byte group (per lane width of ISA)
	//   PREP()  - load the 4-byte running prefix `pi` from last_vertex
	//   LOAD(i) - load 16 bytes of stream i (streams are vertex_count_aligned apart)
	//   GRP4(i) - split register r##i into four 4-byte groups t0..t3
	//   FIXD(i) - fold the prefix into t##i (add or xor depending on Channel)
	//   SAVE(i) - store one reconstructed 4-byte group and advance by vertex_size
#if defined(SIMD_SSE) || defined(SIMD_AVX)
#define TEMP __m128i
#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
#endif

#ifdef SIMD_NEON
#define TEMP uint8x8_t
#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex), vdup_n_u32(0), 0))
#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i))
#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
#endif

#ifdef SIMD_WASM
#define TEMP v128_t
#define PREP() v128_t pi = wasm_v128_load(last_vertex)
#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i))
#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size
#endif

	// UNZR undoes the per-channel value transform before delta accumulation
#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))

	PREP();

	unsigned char* savep = transposed;

	// each iteration consumes 16 bytes from each of the 4 byte streams,
	// i.e. reconstructs 16 vertices worth of this 4-byte group
	for (size_t j = 0; j < vertex_count_aligned; j += 16)
	{
		LOAD(0);
		LOAD(1);
		LOAD(2);
		LOAD(3);

		// regroup stream-major bytes into vertex-major 4-byte groups
		transpose8(r0, r1, r2, r3);

		TEMP t0, t1, t2, t3;
		TEMP npi = pi;

		UNZR(0);
		GRP4(0);
		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
		SAVE(0), SAVE(1), SAVE(2), SAVE(3);

		UNZR(1);
		GRP4(1);
		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
		SAVE(0), SAVE(1), SAVE(2), SAVE(3);

		UNZR(2);
		GRP4(2);
		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
		SAVE(0), SAVE(1), SAVE(2), SAVE(3);

		UNZR(3);
		GRP4(3);
		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
		SAVE(0), SAVE(1), SAVE(2), SAVE(3);

#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32))
		// instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations
		pi = rebase<Channel>(npi, r0, r1, r2, r3);
#else
		(void)npi;
#endif

		// macros are scoped to this function; clean up so other ISA variants can redefine them
#undef UNZR
#undef TEMP
#undef PREP
#undef LOAD
#undef GRP4
#undef FIXD
#undef SAVE
	}
}
1552 | | |
SIMD_TARGET
static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version)
{
	// Decodes one vertex block: for every 4-byte group of the vertex it decodes
	// four byte streams (per the control bits), then reverses the delta
	// transform and transposes back into interleaved vertices.
	// Returns the advanced read pointer, or NULL on malformed/truncated input.
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	unsigned char buffer[kVertexBlockMaxSize * 4];   // 4 decoded byte streams for the current group
	unsigned char transposed[kVertexBlockSizeBytes]; // fully reconstructed vertices for this block

	// streams are decoded in kByteGroupSize granules, so round the count up
	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);

	// v1 streams are preceded by one control byte per 4-byte group; v0 has none
	size_t control_size = version == 0 ? 0 : vertex_size / 4;
	if (size_t(data_end - data) < control_size)
		return NULL;

	const unsigned char* control = data;
	data += control_size;

	for (size_t k = 0; k < vertex_size; k += 4)
	{
		unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4];

		// each of the 4 byte streams in this group has a 2-bit control code
		for (size_t j = 0; j < 4; ++j)
		{
			int ctrl = (ctrl_byte >> (j * 2)) & 3;

			if (ctrl == 3)
			{
				// literal encoding; safe to over-copy due to tail
				if (size_t(data_end - data) < vertex_count_aligned)
					return NULL;

				memcpy(buffer + j * vertex_count_aligned, data, vertex_count_aligned);
				data += vertex_count; // only vertex_count bytes were actually encoded
			}
			else if (ctrl == 2)
			{
				// zero encoding
				memset(buffer + j * vertex_count_aligned, 0, vertex_count_aligned);
			}
			else
			{
				// for v0, headers are mapped to 0..3; for v1, headers are mapped to 4..8
				int hshift = version == 0 ? 0 : 4 + ctrl;

				data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned, hshift);
				if (!data)
					return NULL;
			}
		}

		int channel = version == 0 ? 0 : channels[k / 4];

		// low 2 bits select the delta mode; for mode 2 the high nibble holds the
		// encode-side rotation, which is inverted here ((32 - r) & 31)
		switch (channel & 3)
		{
		case 0:
			decodeDeltas4Simd<0>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, 0);
			break;
		case 1:
			decodeDeltas4Simd<1>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, 0);
			break;
		case 2:
			decodeDeltas4Simd<2>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31);
			break;
		default:
			return NULL; // invalid channel type
		}
	}

	memcpy(vertex_data, transposed, vertex_count * vertex_size);

	// carry the final vertex forward as the delta baseline for the next block
	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}
1627 | | #endif |
1628 | | |
#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
// Returns the ECX feature-flag register from CPUID leaf 1; the decoder entry
// point tests bit 9 of this value to pick the SIMD or scalar path.
static unsigned int getCpuFeatures()
{
	int regs[4] = {};
#ifdef _MSC_VER
	__cpuid(regs, 1);
#else
	__cpuid(1, regs[0], regs[1], regs[2], regs[3]);
#endif
	return regs[2];
}

// queried once at static-init time and cached for all decode calls
static unsigned int cpuid = getCpuFeatures();
#endif
1643 | | |
1644 | | } // namespace meshopt |
1645 | | |
size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level, int version)
{
	// Encodes an interleaved vertex buffer into the compressed stream format:
	// 1 header byte, a sequence of encoded vertex blocks, and a padded tail
	// holding the first vertex (and, for v1, the per-group channel table).
	// Returns the number of bytes written, or 0 if the buffer is too small.
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);
	assert(level >= 0 && level <= 9); // only a subset of this range is used right now
	assert(version < 0 || unsigned(version) <= kDecodeVertexVersion);

	// negative version means "use the globally configured default"
	version = version < 0 ? gEncodeVertexVersion : version;

#if TRACE
	memset(vertexstats, 0, sizeof(vertexstats));
#endif

	const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);

	unsigned char* data = buffer;
	unsigned char* data_end = buffer + buffer_size;

	if (size_t(data_end - data) < 1)
		return 0;

	// header byte: magic in the high nibble, format version in the low nibble
	*data++ = (unsigned char)(kVertexHeader | version);

	unsigned char first_vertex[256] = {};
	if (vertex_count > 0)
		memcpy(first_vertex, vertex_data, vertex_size);

	// the first vertex seeds the delta chain; it is also stored in the tail so
	// the decoder can reproduce the same baseline
	unsigned char last_vertex[256] = {};
	memcpy(last_vertex, first_vertex, vertex_size);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);

	// per-4-byte-group channel selection (delta mode + optional rotation);
	// only estimated for v1 at higher effort levels, otherwise left at 0
	unsigned char channels[64] = {};
	if (version != 0 && level > 1 && vertex_count > 1)
		for (size_t k = 0; k < vertex_size; k += 4)
		{
			int rot = level >= 3 ? estimateRotate(vertex_data, vertex_count, vertex_size, k, /* group_size= */ 16) : 0;
			int channel = estimateChannel(vertex_data, vertex_count, vertex_size, k, vertex_block_size, /* block_skip= */ 3, /* max_channels= */ level >= 3 ? 3 : 2, rot);

			assert(unsigned(channel) < 2 || ((channel & 3) == 2 && unsigned(channel >> 4) < 8));
			channels[k / 4] = (unsigned char)channel;
		}

	size_t vertex_offset = 0;

	// encode full blocks; the last block may be short
	while (vertex_offset < vertex_count)
	{
		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;

		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version, level);
		if (!data)
			return 0;

		vertex_offset += block_size;
	}

	// the tail is padded to a minimum size so the decoder can safely over-read
	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
	size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1;
	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;

	if (size_t(data_end - data) < tail_size_pad)
		return 0;

	if (tail_size < tail_size_pad)
	{
		memset(data, 0, tail_size_pad - tail_size);
		data += tail_size_pad - tail_size;
	}

	memcpy(data, first_vertex, vertex_size);
	data += vertex_size;

	// v1 appends the channel table after the first vertex
	if (version != 0)
	{
		memcpy(data, channels, vertex_size / 4);
		data += vertex_size / 4;
	}

	assert(data >= buffer + tail_size);
	assert(data <= buffer + buffer_size);

#if TRACE
	// per-byte compression statistics, printed only in instrumented builds
	size_t total_size = data - buffer;

	for (size_t k = 0; k < vertex_size; ++k)
	{
		const Stats& vsk = vertexstats[k];

		printf("%2d: %7d bytes [%4.1f%%] %.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);

		size_t total_k = vsk.header + vsk.bitg[1] + vsk.bitg[2] + vsk.bitg[4] + vsk.bitg[8];
		double total_kr = total_k ? 1.0 / double(total_k) : 0;

		if (version != 0)
		{
			int channel = channels[k / 4];

			if ((channel & 3) == 2 && k % 4 == 0)
				printf(" | ^%d", channel >> 4);
			else
				printf(" | %2s", channel == 0 ? "1" : (channel == 1 && k % 2 == 0 ? "2" : "."));
		}

		printf(" | hdr [%5.1f%%] bitg [1 %4.1f%% 2 %4.1f%% 4 %4.1f%% 8 %4.1f%%]",
		    double(vsk.header) * total_kr * 100,
		    double(vsk.bitg[1]) * total_kr * 100, double(vsk.bitg[2]) * total_kr * 100,
		    double(vsk.bitg[4]) * total_kr * 100, double(vsk.bitg[8]) * total_kr * 100);

		size_t total_ctrl = vsk.ctrl[0] + vsk.ctrl[1] + vsk.ctrl[2] + vsk.ctrl[3];

		if (total_ctrl)
		{
			printf(" | ctrl %3.0f%% %3.0f%% %3.0f%% %3.0f%%",
			    double(vsk.ctrl[0]) / double(total_ctrl) * 100, double(vsk.ctrl[1]) / double(total_ctrl) * 100,
			    double(vsk.ctrl[2]) / double(total_ctrl) * 100, double(vsk.ctrl[3]) / double(total_ctrl) * 100);
		}

		if (level >= 3)
			printf(" | bitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]",
			    double(vsk.bitc[0]) / double(vertex_count) * 100, double(vsk.bitc[1]) / double(vertex_count) * 100,
			    double(vsk.bitc[2]) / double(vertex_count) * 100, double(vsk.bitc[3]) / double(vertex_count) * 100,
			    double(vsk.bitc[4]) / double(vertex_count) * 100, double(vsk.bitc[5]) / double(vertex_count) * 100,
			    double(vsk.bitc[6]) / double(vertex_count) * 100, double(vsk.bitc[7]) / double(vertex_count) * 100);

		printf("\n");
	}
#endif

	return data - buffer;
}
1778 | | |
1779 | | size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size) |
1780 | 0 | { |
1781 | 0 | return meshopt_encodeVertexBufferLevel(buffer, buffer_size, vertices, vertex_count, vertex_size, meshopt::kEncodeDefaultLevel, meshopt::gEncodeVertexVersion); |
1782 | 0 | } |
1783 | | |
1784 | | size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size) |
1785 | 6.03k | { |
1786 | 6.03k | using namespace meshopt; |
1787 | | |
1788 | 6.03k | assert(vertex_size > 0 && vertex_size <= 256); |
1789 | 6.03k | assert(vertex_size % 4 == 0); |
1790 | | |
1791 | 6.03k | size_t vertex_block_size = getVertexBlockSize(vertex_size); |
1792 | 6.03k | size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size; |
1793 | | |
1794 | 6.03k | size_t vertex_block_control_size = vertex_size / 4; |
1795 | 6.03k | size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4; |
1796 | 6.03k | size_t vertex_block_data_size = vertex_block_size; |
1797 | | |
1798 | 6.03k | size_t tail_size = vertex_size + (vertex_size / 4); |
1799 | 6.03k | size_t tail_size_min = kTailMinSizeV0 > kTailMinSizeV1 ? kTailMinSizeV0 : kTailMinSizeV1; |
1800 | 6.03k | size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size; |
1801 | 6.03k | assert(tail_size_pad >= kByteGroupDecodeLimit); |
1802 | | |
1803 | 6.03k | return 1 + vertex_block_count * vertex_size * (vertex_block_control_size + vertex_block_header_size + vertex_block_data_size) + tail_size_pad; |
1804 | 6.03k | } |
1805 | | |
1806 | | void meshopt_encodeVertexVersion(int version) |
1807 | 1.50k | { |
1808 | 1.50k | assert(unsigned(version) <= unsigned(meshopt::kDecodeVertexVersion)); |
1809 | | |
1810 | 1.50k | meshopt::gEncodeVertexVersion = version; |
1811 | 1.50k | } |
1812 | | |
1813 | | int meshopt_decodeVertexVersion(const unsigned char* buffer, size_t buffer_size) |
1814 | 0 | { |
1815 | 0 | if (buffer_size < 1) |
1816 | 0 | return -1; |
1817 | | |
1818 | 0 | unsigned char header = buffer[0]; |
1819 | |
|
1820 | 0 | if ((header & 0xf0) != meshopt::kVertexHeader) |
1821 | 0 | return -1; |
1822 | | |
1823 | 0 | int version = header & 0x0f; |
1824 | 0 | if (version > meshopt::kDecodeVertexVersion) |
1825 | 0 | return -1; |
1826 | | |
1827 | 0 | return version; |
1828 | 0 | } |
1829 | | |
int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
{
	// Decodes an encoded vertex stream into `destination`.
	// Returns 0 on success, -1 for an unrecognized header/version, -2 for
	// truncated or malformed data, -3 when the stream has trailing bytes
	// beyond the expected tail.
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256], const unsigned char*, int) = NULL;

	// pick SIMD or scalar block decoder; with SIMD_FALLBACK the choice is made
	// at runtime from CPUID ECX bit 9 (SSSE3)
#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
	decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
	decode = decodeVertexBlockSimd;
#else
	decode = decodeVertexBlock;
#endif

#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
	assert(gDecodeBytesGroupInitialized);
	(void)gDecodeBytesGroupInitialized;
#endif

	unsigned char* vertex_data = static_cast<unsigned char*>(destination);

	const unsigned char* data = buffer;
	const unsigned char* data_end = buffer + buffer_size;

	if (size_t(data_end - data) < 1)
		return -2;

	unsigned char data_header = *data++;

	// high nibble must match the magic; low nibble carries the format version
	if ((data_header & 0xf0) != kVertexHeader)
		return -1;

	int version = data_header & 0x0f;
	if (version > kDecodeVertexVersion)
		return -1;

	// the tail holds the first vertex (and for v1, the channel table), padded
	// to a version-specific minimum so block decoders can safely over-read
	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
	size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1;
	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;

	if (size_t(data_end - data) < tail_size_pad)
		return -2;

	const unsigned char* tail = data_end - tail_size;

	// the first vertex from the tail seeds the delta chain
	unsigned char last_vertex[256];
	memcpy(last_vertex, tail, vertex_size);

	const unsigned char* channels = version == 0 ? NULL : tail + vertex_size;

	size_t vertex_block_size = getVertexBlockSize(vertex_size);

	size_t vertex_offset = 0;

	while (vertex_offset < vertex_count)
	{
		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;

		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version);
		if (!data)
			return -2;

		vertex_offset += block_size;
	}

	// after all blocks, exactly the padded tail must remain
	if (size_t(data_end - data) != tail_size_pad)
		return -3;

	return 0;
}
1903 | | |
1904 | | #undef SIMD_NEON |
1905 | | #undef SIMD_SSE |
1906 | | #undef SIMD_AVX |
1907 | | #undef SIMD_WASM |
1908 | | #undef SIMD_FALLBACK |
1909 | | #undef SIMD_TARGET |
1910 | | #undef SIMD_LATENCYOPT |