/src/libjxl/lib/jxl/enc_fast_lossless.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/base/status.h" |
7 | | #ifndef FJXL_SELF_INCLUDE |
8 | | |
9 | | #include <assert.h> |
10 | | |
11 | | #include <algorithm> |
12 | | #include <array> |
13 | | #include <cstdint> |
14 | | #include <cstdlib> |
15 | | #include <cstring> |
16 | | #include <limits> |
17 | | #include <memory> |
18 | | #include <vector> |
19 | | |
20 | | #include "lib/jxl/enc_fast_lossless.h" |
21 | | |
22 | | #if FJXL_STANDALONE |
23 | | #if defined(_MSC_VER) |
24 | | using ssize_t = intptr_t; |
25 | | #endif |
26 | | #else // FJXL_STANDALONE |
27 | | #include "lib/jxl/encode_internal.h" |
28 | | #endif // FJXL_STANDALONE |
29 | | |
30 | | #if defined(__x86_64__) || defined(_M_X64) |
31 | | #define FJXL_ARCH_IS_X86_64 1 |
32 | | #else |
33 | | #define FJXL_ARCH_IS_X86_64 0 |
34 | | #endif |
35 | | |
36 | | #if defined(__i386__) || defined(_M_IX86) || FJXL_ARCH_IS_X86_64 |
37 | | #define FJXL_ARCH_IS_X86 1 |
38 | | #else |
39 | | #define FJXL_ARCH_IS_X86 0 |
40 | | #endif |
41 | | |
42 | | #if FJXL_ARCH_IS_X86 |
43 | | #if defined(_MSC_VER) |
44 | | #include <intrin.h> |
45 | | #else // _MSC_VER |
46 | | #include <cpuid.h> |
47 | | #endif // _MSC_VER |
48 | | #endif // FJXL_ARCH_IS_X86 |
49 | | |
50 | | // Enable NEON and AVX2/AVX512 if not asked to do otherwise and the compilers |
51 | | // support it. |
52 | | #if defined(__aarch64__) || defined(_M_ARM64) // ARCH |
53 | | #include <arm_neon.h> |
54 | | |
55 | | #if !defined(FJXL_ENABLE_NEON) |
56 | | #define FJXL_ENABLE_NEON 1 |
57 | | #endif // !defined(FJXL_ENABLE_NEON) |
58 | | |
59 | | #elif FJXL_ARCH_IS_X86_64 && !defined(_MSC_VER) // ARCH |
60 | | #include <immintrin.h> |
61 | | |
62 | | // manually add _mm512_cvtsi512_si32 definition if missing |
63 | | // (e.g. with Xcode on macOS Mojave) |
64 | | // copied from gcc 11.1.0 include/avx512fintrin.h line 14367-14373 |
65 | | #if defined(__clang__) && \ |
66 | | ((!defined(__apple_build_version__) && __clang_major__ < 10) || \ |
67 | | (defined(__apple_build_version__) && __apple_build_version__ < 12000032)) |
68 | | inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
69 | | _mm512_cvtsi512_si32(__m512i __A) { |
70 | | __v16si __B = (__v16si)__A; |
71 | | return __B[0]; |
72 | | } |
73 | | #endif |
74 | | |
75 | | #if !defined(FJXL_ENABLE_AVX2) |
76 | | #define FJXL_ENABLE_AVX2 1 |
77 | | #endif // !defined(FJXL_ENABLE_AVX2) |
78 | | |
79 | | #if !defined(FJXL_ENABLE_AVX512) |
80 | | // On clang-7 or earlier, and gcc-10 or earlier, AVX512 seems broken. |
81 | | #if (defined(__clang__) && \ |
82 | | (!defined(__apple_build_version__) && __clang_major__ > 7) || \ |
83 | | (defined(__apple_build_version__) && \ |
84 | | __apple_build_version__ > 10010046)) || \ |
85 | | (defined(__GNUC__) && __GNUC__ > 10) |
86 | | #define FJXL_ENABLE_AVX512 1 |
87 | | #endif |
88 | | #endif // !defined(FJXL_ENABLE_AVX512) |
89 | | |
90 | | #endif // ARCH |
91 | | |
92 | | #ifndef FJXL_ENABLE_NEON |
93 | | #define FJXL_ENABLE_NEON 0 |
94 | | #endif |
95 | | |
96 | | #ifndef FJXL_ENABLE_AVX2 |
97 | | #define FJXL_ENABLE_AVX2 0 |
98 | | #endif |
99 | | |
100 | | #ifndef FJXL_ENABLE_AVX512 |
101 | | #define FJXL_ENABLE_AVX512 0 |
102 | | #endif |
103 | | |
104 | | namespace { |
105 | | |
// Runtime-detectable CPU capabilities; each enumerator is a bit index into
// the feature mask produced by DetectCpuFeatures().
enum class CpuFeature : uint32_t {
  kAVX2 = 0,

  kAVX512F,
  kAVX512VL,
  kAVX512CD,
  kAVX512BW,

  kVBMI,
  kVBMI2
};

// Maps a CpuFeature enumerator to its single-bit mask within the feature
// word.
constexpr uint32_t CpuFeatureBit(CpuFeature feature) {
  const auto bit_index = static_cast<uint32_t>(feature);
  return uint32_t{1} << bit_index;
}
121 | | |
#if FJXL_ARCH_IS_X86
#if defined(_MSC_VER)
// Executes CPUID with the given leaf (`level`) and subleaf (`count`),
// returning EAX/EBX/ECX/EDX in abcd[0..3].
void Cpuid(const uint32_t level, const uint32_t count,
           std::array<uint32_t, 4>& abcd) {
  int regs[4];
  __cpuidex(regs, level, count);
  for (int i = 0; i < 4; ++i) {
    abcd[i] = regs[i];
  }
}
// Low 32 bits of extended control register 0 (OS-enabled state components).
uint32_t ReadXCR0() { return static_cast<uint32_t>(_xgetbv(0)); }
#else  // _MSC_VER
// Executes CPUID with the given leaf (`level`) and subleaf (`count`),
// returning EAX/EBX/ECX/EDX in abcd[0..3].
void Cpuid(const uint32_t level, const uint32_t count,
           std::array<uint32_t, 4>& abcd) {
  uint32_t a;
  uint32_t b;
  uint32_t c;
  uint32_t d;
  __cpuid_count(level, count, a, b, c, d);
  abcd[0] = a;
  abcd[1] = b;
  abcd[2] = c;
  abcd[3] = d;
}
// Low 32 bits of extended control register 0 (OS-enabled state components).
uint32_t ReadXCR0() {
  uint32_t xcr0;
  uint32_t xcr0_high;
  const uint32_t index = 0;
  // Raw opcode bytes for `xgetbv`, so this also assembles on toolchains whose
  // assembler does not know the mnemonic.
  asm volatile(".byte 0x0F, 0x01, 0xD0"
               : "=a"(xcr0), "=d"(xcr0_high)
               : "c"(index));
  return xcr0;
}
#endif  // _MSC_VER

// Probes CPUID for the features in CpuFeature and returns the corresponding
// bit mask. If the OS does not enable the full AVX/AVX-512 register state in
// XCR0, all features are reported as absent.
uint32_t DetectCpuFeatures() {
  uint32_t flags = 0;  // return value
  std::array<uint32_t, 4> abcd;
  Cpuid(0, 0, abcd);
  const uint32_t max_level = abcd[0];

  const auto check_bit = [](uint32_t v, uint32_t idx) -> bool {
    return (v & (1U << idx)) != 0;
  };

  // Extended features
  if (max_level >= 7) {
    Cpuid(7, 0, abcd);
    // abcd[1]/abcd[2] are EBX/ECX of leaf 7 subleaf 0; bit positions are per
    // the Intel SDM (EBX: 5=AVX2, 16=AVX512F, 28=AVX512CD, 30=AVX512BW,
    // 31=AVX512VL; ECX: 1=AVX512_VBMI, 6=AVX512_VBMI2).
    flags |= check_bit(abcd[1], 5) ? CpuFeatureBit(CpuFeature::kAVX2) : 0;

    flags |= check_bit(abcd[1], 16) ? CpuFeatureBit(CpuFeature::kAVX512F) : 0;
    flags |= check_bit(abcd[1], 28) ? CpuFeatureBit(CpuFeature::kAVX512CD) : 0;
    flags |= check_bit(abcd[1], 30) ? CpuFeatureBit(CpuFeature::kAVX512BW) : 0;
    flags |= check_bit(abcd[1], 31) ? CpuFeatureBit(CpuFeature::kAVX512VL) : 0;

    flags |= check_bit(abcd[2], 1) ? CpuFeatureBit(CpuFeature::kVBMI) : 0;
    flags |= check_bit(abcd[2], 6) ? CpuFeatureBit(CpuFeature::kVBMI2) : 0;
  }

  // Leaf 1 ECX bit 27 = OSXSAVE: only then is XGETBV legal to execute.
  Cpuid(1, 0, abcd);
  const bool os_has_xsave = check_bit(abcd[2], 27);
  if (os_has_xsave) {
    const uint32_t xcr0 = ReadXCR0();
    // Require SSE(1), AVX(2), opmask(5), ZMM_Hi256(6) and Hi16_ZMM(7) state
    // to be OS-enabled; otherwise using these ISAs would fault.
    if (!check_bit(xcr0, 1) || !check_bit(xcr0, 2) || !check_bit(xcr0, 5) ||
        !check_bit(xcr0, 6) || !check_bit(xcr0, 7)) {
      flags = 0;  // TODO(eustas): be more selective?
    }
  }

  return flags;
}
#else   // FJXL_ARCH_IS_X86
// Non-x86 builds have no runtime-dispatched features.
uint32_t DetectCpuFeatures() { return 0; }
#endif  // FJXL_ARCH_IS_X86
196 | | |
// Annotation to silence "defined but not used" warnings on helpers that only
// some build configurations reference; MSVC has no direct equivalent.
#if defined(_MSC_VER)
#define FJXL_UNUSED
#else
#define FJXL_UNUSED __attribute__((unused))
#endif

// Returns whether the current CPU supports `feature`. The feature mask is
// probed once (C++11 static-local initialization is thread-safe) and cached
// for all subsequent calls.
FJXL_UNUSED bool HasCpuFeature(CpuFeature feature) {
  static uint32_t cpu_features = DetectCpuFeatures();
  return (cpu_features & CpuFeatureBit(feature)) != 0;
}
207 | | |
208 | | #if defined(_MSC_VER) && !defined(__clang__) |
209 | | #define FJXL_INLINE __forceinline |
210 | | FJXL_INLINE uint32_t FloorLog2(uint32_t v) { |
211 | | unsigned long index; |
212 | | _BitScanReverse(&index, v); |
213 | | return index; |
214 | | } |
215 | | FJXL_INLINE uint32_t CtzNonZero(uint64_t v) { |
216 | | unsigned long index; |
217 | | _BitScanForward(&index, v); |
218 | | return index; |
219 | | } |
220 | | #else |
221 | | #define FJXL_INLINE inline __attribute__((always_inline)) |
222 | 0 | FJXL_INLINE uint32_t FloorLog2(uint32_t v) { |
223 | 0 | return v ? 31 - __builtin_clz(v) : 0; |
224 | 0 | } |
225 | 0 | FJXL_UNUSED FJXL_INLINE uint32_t CtzNonZero(uint64_t v) { |
226 | 0 | return __builtin_ctzll(v); |
227 | 0 | } |
228 | | #endif |
229 | | |
230 | | // Compiles to a memcpy on little-endian systems. |
// Stores the 64-bit value `data` at `tgt` in little-endian byte order.
// Compiles to a memcpy on little-endian systems.
FJXL_INLINE void StoreLE64(uint8_t* tgt, uint64_t data) {
#if (!defined(__BYTE_ORDER__) || (__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__))
  // Portable byte-by-byte fallback for big-endian or unknown-endian targets.
  for (int i = 0; i < 8; i++) {
    tgt[i] = (data >> (i * 8)) & 0xFF;
  }
#else
  memcpy(tgt, &data, 8);
#endif
}
240 | | |
// Appends `count` bits (LSB-first) to a bit stream: merges `bits` above the
// pending bits in `bit_buffer`, stores 8 bytes at `data_buf` (the caller's
// buffer must have padding past the write position), and returns how many
// whole bytes became final; the leftover (<8) bits stay in `bit_buffer`.
// Only guaranteed to work for count <= 56, since `bits` must fit above the
// up-to-7 pending bits in the 64-bit buffer (see BitWriter::WriteMultiple).
FJXL_INLINE size_t AddBits(uint32_t count, uint64_t bits, uint8_t* data_buf,
                           size_t& bits_in_buffer, uint64_t& bit_buffer) {
  bit_buffer |= bits << bits_in_buffer;
  bits_in_buffer += count;
  StoreLE64(data_buf, bit_buffer);
  size_t bytes_in_buffer = bits_in_buffer / 8;
  bits_in_buffer -= bytes_in_buffer * 8;
  bit_buffer >>= bytes_in_buffer * 8;
  return bytes_in_buffer;
}
251 | | |
// LSB-first bit sink backed by a malloc'd buffer. The caller must Allocate()
// sufficient space up front; writes perform no bounds checking.
struct BitWriter {
  // Reserves room for `maximum_bit_size` bits. The extra 64 bytes of padding
  // are required because AddBits/WriteMultiple store 8 bytes at a time, past
  // the current end of the stream.
  void Allocate(size_t maximum_bit_size) {
    assert(data == nullptr);
    // Leave some padding.
    data.reset(static_cast<uint8_t*>(malloc(maximum_bit_size / 8 + 64)));
  }

  // Appends `count` bits. Only guaranteed to work with count <= 56 (see
  // WriteMultiple for arbitrary 64-bit writes).
  void Write(uint32_t count, uint64_t bits) {
    bytes_written += AddBits(count, bits, data.get() + bytes_written,
                             bits_in_buffer, buffer);
  }

  // Pads the stream with zero bits up to the next byte boundary.
  void ZeroPadToByte() {
    if (bits_in_buffer != 0) {
      Write(8 - bits_in_buffer, 0);
    }
  }

  // Appends n (nbits[i], bits[i]) pairs, each of which may be up to 64 bits.
  FJXL_INLINE void WriteMultiple(const uint64_t* nbits, const uint64_t* bits,
                                 size_t n) {
    // Necessary because Write() is only guaranteed to work with <=56 bits.
    // Trying to SIMD-fy this code results in lower speed (and definitely less
    // clarity).
    {
      for (size_t i = 0; i < n; i++) {
        this->buffer |= bits[i] << this->bits_in_buffer;
        memcpy(this->data.get() + this->bytes_written, &this->buffer, 8);
        // How far bits[i] must be shifted down to recover the part that did
        // not fit in `buffer`; computed before updating bits_in_buffer so the
        // shift==64 (empty buffer) case can be detected below.
        uint64_t shift = 64 - this->bits_in_buffer;
        this->bits_in_buffer += nbits[i];
        // This `if` seems to be faster than using ternaries.
        if (this->bits_in_buffer >= 64) {
          // Shifting a uint64_t by >= 64 is UB, hence the explicit guard.
          uint64_t next_buffer = shift >= 64 ? 0 : bits[i] >> shift;
          this->buffer = next_buffer;
          this->bits_in_buffer -= 64;
          this->bytes_written += 8;
        }
      }
      // Flush the remaining whole bytes; leftover (<8) bits stay buffered.
      memcpy(this->data.get() + this->bytes_written, &this->buffer, 8);
      size_t bytes_in_buffer = this->bits_in_buffer / 8;
      this->bits_in_buffer -= bytes_in_buffer * 8;
      this->buffer >>= bytes_in_buffer * 8;
      this->bytes_written += bytes_in_buffer;
    }
  }

  // Owned output buffer (malloc/free); bytes [0, bytes_written) are final.
  std::unique_ptr<uint8_t[], void (*)(void*)> data = {nullptr, free};
  size_t bytes_written = 0;
  size_t bits_in_buffer = 0;  // pending bit count (< 8 between Write calls)
  uint64_t buffer = 0;        // pending bits, LSB-aligned
};
302 | | |
303 | 0 | size_t SectionSize(const std::array<BitWriter, 4>& group_data) { |
304 | 0 | size_t sz = 0; |
305 | 0 | for (size_t j = 0; j < 4; j++) { |
306 | 0 | const auto& writer = group_data[j]; |
307 | 0 | sz += writer.bytes_written * 8 + writer.bits_in_buffer; |
308 | 0 | } |
309 | 0 | sz = (sz + 7) / 8; |
310 | 0 | return sz; |
311 | 0 | } |
312 | | |
// Upper bound, in bytes, of the hand-written frame header.
constexpr size_t kMaxFrameHeaderSize = 5;

// Lower bound (in bytes) of each TOC size bucket; a group whose size falls in
// bucket i is encoded with kTOCBits[i] bits in the table of contents.
constexpr size_t kGroupSizeOffset[4] = {0, 1024, 17408, 4211712};
constexpr size_t kTOCBits[4] = {12, 16, 24, 32};

// Returns the TOC bucket (0..3) that `group_size` falls into, i.e. the
// largest bucket whose lower bound does not exceed `group_size`.
size_t TOCBucket(size_t group_size) {
  for (size_t bucket = 3; bucket > 0; --bucket) {
    if (group_size >= kGroupSizeOffset[bucket]) return bucket;
  }
  return 0;
}
328 | | |
#if !FJXL_STANDALONE
// Number of bytes the table of contents occupies for the given group sizes:
// each group contributes a variable-width size field (12..32 bits depending
// on its TOC bucket), and the total is rounded up to a whole byte.
size_t TOCSize(const std::vector<size_t>& group_sizes) {
  size_t toc_bits = 0;
  for (size_t group_size : group_sizes) {
    toc_bits += kTOCBits[TOCBucket(group_size)];
  }
  return (toc_bits + 7) / 8;
}
337 | | |
// Size in bytes of the hand-crafted frame header: 28 base bits, plus 4 bits
// when an alpha channel is present and 2 bits when the frame is not the last
// one, rounded up to a whole byte.
size_t FrameHeaderSize(bool have_alpha, bool is_last) {
  size_t nbits = 28;
  if (have_alpha) nbits += 4;
  if (!is_last) nbits += 2;
  return (nbits + 7) / 8;
}
342 | | #endif |
343 | | |
// Computes a fixed file offset at which AC group data can start, so that the
// DC global section can later be padded to land exactly on that offset.
// Outputs:
//  - min_dc_global_size: the DC global section size to assume, bumped up (to
//    the next bucket's lower bound) until worst-case padding can no longer
//    change its TOC bucket — otherwise the TOC size itself would shift.
//  - ac_group_offset: max frame header size + worst-case TOC size +
//    min_dc_global_size.
void ComputeAcGroupDataOffset(size_t dc_global_size, size_t num_dc_groups,
                              size_t num_ac_groups, size_t& min_dc_global_size,
                              size_t& ac_group_offset) {
  // Max AC group size is 768 kB, so max AC group TOC bits is 24.
  size_t ac_toc_max_bits = num_ac_groups * 24;
  size_t ac_toc_min_bits = num_ac_groups * 12;
  // Worst-case slack between the smallest and largest possible TOC, in bytes.
  size_t max_padding = 1 + (ac_toc_max_bits - ac_toc_min_bits + 7) / 8;
  min_dc_global_size = dc_global_size;
  size_t dc_global_bucket = TOCBucket(min_dc_global_size);
  // Iterate until adding the worst-case padding keeps the bucket stable.
  while (TOCBucket(min_dc_global_size + max_padding) > dc_global_bucket) {
    dc_global_bucket = TOCBucket(min_dc_global_size + max_padding);
    min_dc_global_size = kGroupSizeOffset[dc_global_bucket];
  }
  assert(TOCBucket(min_dc_global_size) == dc_global_bucket);
  assert(TOCBucket(min_dc_global_size + max_padding) == dc_global_bucket);
  // Worst-case TOC: DC global entry + 12 bits for each DC group and the AC
  // global section + maximal AC group entries.
  size_t max_toc_bits =
      kTOCBits[dc_global_bucket] + 12 * (1 + num_dc_groups) + ac_toc_max_bits;
  size_t max_toc_size = (max_toc_bits + 7) / 8;
  ac_group_offset = kMaxFrameHeaderSize + max_toc_size + min_dc_global_size;
}
364 | | |
#if !FJXL_STANDALONE
// Number of padding bytes to append to the DC global section so that the AC
// group data starts exactly at `ac_group_data_offset`. ComputeAcGroupDataOffset
// chose that offset assuming the padded DC global size `min_dc_global_size`,
// so the difference is non-negative by construction.
size_t ComputeDcGlobalPadding(const std::vector<size_t>& group_sizes,
                              size_t ac_group_data_offset,
                              size_t min_dc_global_size, bool have_alpha,
                              bool is_last) {
  // Compute the TOC size as if the DC global section already had its final
  // (bucket-stable) padded size.
  std::vector<size_t> new_group_sizes = group_sizes;
  new_group_sizes[0] = min_dc_global_size;
  size_t toc_size = TOCSize(new_group_sizes);
  size_t actual_offset =
      FrameHeaderSize(have_alpha, is_last) + toc_size + group_sizes[0];
  return ac_group_data_offset - actual_offset;
}
#endif
378 | | |
// Number of distinct tokens in the "raw" (non-LZ77) half of the alphabet.
constexpr size_t kNumRawSymbols = 19;
// Number of distinct LZ77 length tokens.
constexpr size_t kNumLZ77 = 33;
// Run lengths below this value get their fully assembled bit patterns
// precomputed in PrefixCode's lz77 cache.
constexpr size_t kLZ77CacheSize = 32;

// First symbol value used for LZ77 length tokens in the histogram alphabet.
constexpr size_t kLZ77Offset = 224;
// Minimum run length usable for LZ77 encoding — presumably; the consumers of
// this constant are outside this chunk, confirm against the group encoders.
constexpr size_t kLZ77MinLength = 7;
385 | | |
386 | | void EncodeHybridUintLZ77(uint32_t value, uint32_t* token, uint32_t* nbits, |
387 | 0 | uint32_t* bits) { |
388 | | // 400 config |
389 | 0 | uint32_t n = FloorLog2(value); |
390 | 0 | *token = value < 16 ? value : 16 + n - 4; |
391 | 0 | *nbits = value < 16 ? 0 : n; |
392 | 0 | *bits = value < 16 ? 0 : value - (1 << *nbits); |
393 | 0 | } |
394 | | |
// Two-level prefix (Huffman) code for entropy-coding residuals: a first level
// of up to kNumRawSymbols "raw" token codes plus one shared prefix that
// introduces the second level of kNumLZ77 LZ77 length codes. Also holds a
// cache of pre-assembled bit patterns for short LZ77 runs.
struct PrefixCode {
  uint8_t raw_nbits[kNumRawSymbols] = {};  // code length per raw token
  uint8_t raw_bits[kNumRawSymbols] = {};   // canonical code, bit-reversed

  uint8_t lz77_nbits[kNumLZ77] = {};   // full (level-1 + level-2) code length
  uint16_t lz77_bits[kNumLZ77] = {};   // canonical code, bit-reversed

  // For run lengths < kLZ77CacheSize: the complete pattern "LZ77 length token
  // + extra bits + raw symbol 0", concatenated ready for emission.
  uint64_t lz77_cache_bits[kLZ77CacheSize] = {};
  uint8_t lz77_cache_nbits[kLZ77CacheSize] = {};

  size_t numraw;  // number of raw tokens in use (trailing zero-count tokens cut)

  // Reverses the low `nbits` bits of `bits`; codes are emitted LSB-first.
  static uint16_t BitReverse(size_t nbits, uint16_t bits) {
    constexpr uint16_t kNibbleLookup[16] = {
        0b0000, 0b1000, 0b0100, 0b1100, 0b0010, 0b1010, 0b0110, 0b1110,
        0b0001, 0b1001, 0b0101, 0b1101, 0b0011, 0b1011, 0b0111, 0b1111,
    };
    uint16_t rev16 = (kNibbleLookup[bits & 0xF] << 12) |
                     (kNibbleLookup[(bits >> 4) & 0xF] << 8) |
                     (kNibbleLookup[(bits >> 8) & 0xF] << 4) |
                     (kNibbleLookup[bits >> 12]);
    return rev16 >> (16 - nbits);
  }

  // Create the prefix codes given the code lengths.
  // Supports the code lengths being split into two halves.
  static void ComputeCanonicalCode(const uint8_t* first_chunk_nbits,
                                   uint8_t* first_chunk_bits,
                                   size_t first_chunk_size,
                                   const uint8_t* second_chunk_nbits,
                                   uint16_t* second_chunk_bits,
                                   size_t second_chunk_size) {
    constexpr size_t kMaxCodeLength = 15;
    // Histogram the code lengths of both halves together.
    uint8_t code_length_counts[kMaxCodeLength + 1] = {};
    for (size_t i = 0; i < first_chunk_size; i++) {
      code_length_counts[first_chunk_nbits[i]]++;
      assert(first_chunk_nbits[i] <= kMaxCodeLength);
      assert(first_chunk_nbits[i] <= 8);
      assert(first_chunk_nbits[i] > 0);
    }
    for (size_t i = 0; i < second_chunk_size; i++) {
      code_length_counts[second_chunk_nbits[i]]++;
      assert(second_chunk_nbits[i] <= kMaxCodeLength);
    }

    // First code value for each length, per canonical-code construction.
    uint16_t next_code[kMaxCodeLength + 1] = {};

    uint16_t code = 0;
    for (size_t i = 1; i < kMaxCodeLength + 1; i++) {
      code = (code + code_length_counts[i - 1]) << 1;
      next_code[i] = code;
    }

    // Assign codes in symbol order, bit-reversed for LSB-first emission.
    for (size_t i = 0; i < first_chunk_size; i++) {
      first_chunk_bits[i] =
          BitReverse(first_chunk_nbits[i], next_code[first_chunk_nbits[i]]++);
    }
    for (size_t i = 0; i < second_chunk_size; i++) {
      second_chunk_bits[i] =
          BitReverse(second_chunk_nbits[i], next_code[second_chunk_nbits[i]]++);
    }
  }

  // Exact length-limited code-length assignment via dynamic programming.
  // d(sym, off) = minimum total cost (sum of freq * length) of coding symbols
  // [0, sym) while consuming `off` units of the Kraft budget, measured in
  // steps of 2^-precision (a code of length `bits` consumes
  // 2^(precision - bits) units). The optimum must consume the budget exactly;
  // the second loop walks the table backwards to recover each length.
  template <typename T>
  static void ComputeCodeLengthsNonZeroImpl(const uint64_t* freqs, size_t n,
                                            size_t precision, T infty,
                                            const uint8_t* min_limit,
                                            const uint8_t* max_limit,
                                            uint8_t* nbits) {
    assert(precision < 15);
    assert(n <= kMaxNumSymbols);
    std::vector<T> dynp(((1U << precision) + 1) * (n + 1), infty);
    auto d = [&](size_t sym, size_t off) -> T& {
      return dynp[sym * ((1 << precision) + 1) + off];
    };
    d(0, 0) = 0;
    for (size_t sym = 0; sym < n; sym++) {
      for (T bits = min_limit[sym]; bits <= max_limit[sym]; bits++) {
        size_t off_delta = 1U << (precision - bits);
        for (size_t off = 0; off + off_delta <= (1U << precision); off++) {
          d(sym + 1, off + off_delta) =
              std::min(d(sym, off) + static_cast<T>(freqs[sym]) * bits,
                       d(sym + 1, off + off_delta));
        }
      }
    }

    size_t sym = n;
    size_t off = 1U << precision;

    // The full budget must be reachable, or no valid code exists.
    assert(d(sym, off) != infty);

    // Backtrack: find, for each symbol, a length whose cost explains the
    // optimal table entry, and give back its budget.
    while (sym-- > 0) {
      assert(off > 0);
      for (size_t bits = min_limit[sym]; bits <= max_limit[sym]; bits++) {
        size_t off_delta = 1U << (precision - bits);
        if (off_delta <= off &&
            d(sym + 1, off) == d(sym, off - off_delta) + freqs[sym] * bits) {
          off -= off_delta;
          nbits[sym] = bits;
          break;
        }
      }
    }
  }

  // Computes nbits[i] for i <= n, subject to min_limit[i] <= nbits[i] <=
  // max_limit[i] and sum 2**-nbits[i] == 1, so to minimize sum(nbits[i] *
  // freqs[i]). All freqs[i] must be non-zero.
  static void ComputeCodeLengthsNonZero(const uint64_t* freqs, size_t n,
                                        uint8_t* min_limit, uint8_t* max_limit,
                                        uint8_t* nbits) {
    size_t precision = 0;
    size_t shortest_length = 255;
    uint64_t freqsum = 0;
    for (size_t i = 0; i < n; i++) {
      assert(freqs[i] != 0);
      freqsum += freqs[i];
      if (min_limit[i] < 1) min_limit[i] = 1;
      assert(min_limit[i] <= max_limit[i]);
      precision = std::max<size_t>(max_limit[i], precision);
      shortest_length = std::min<size_t>(min_limit[i], shortest_length);
    }
    // If all the minimum limits are greater than 1, shift precision so that we
    // behave as if the shortest was 1.
    precision -= shortest_length - 1;
    // Use the narrower 32-bit DP when the cost values cannot overflow it.
    uint64_t infty = freqsum * precision;
    if (infty < std::numeric_limits<uint32_t>::max() / 2) {
      ComputeCodeLengthsNonZeroImpl(freqs, n, precision,
                                    static_cast<uint32_t>(infty), min_limit,
                                    max_limit, nbits);
    } else {
      ComputeCodeLengthsNonZeroImpl(freqs, n, precision, infty, min_limit,
                                    max_limit, nbits);
    }
  }

  // Largest alphabet handled by ComputeCodeLengths (raw+merged vs. LZ77).
  static constexpr size_t kMaxNumSymbols =
      kNumRawSymbols + 1 < kNumLZ77 ? kNumLZ77 : kNumRawSymbols + 1;

  // Like ComputeCodeLengthsNonZero, but tolerates zero frequencies: symbols
  // with freqs[i] == 0 receive nbits[i] == 0 and are excluded from the DP.
  static void ComputeCodeLengths(const uint64_t* freqs, size_t n,
                                 const uint8_t* min_limit_in,
                                 const uint8_t* max_limit_in, uint8_t* nbits) {
    assert(n <= kMaxNumSymbols);
    // Compact the non-zero symbols to the front.
    uint64_t compact_freqs[kMaxNumSymbols];
    uint8_t min_limit[kMaxNumSymbols];
    uint8_t max_limit[kMaxNumSymbols];
    size_t ni = 0;
    for (size_t i = 0; i < n; i++) {
      if (freqs[i]) {
        compact_freqs[ni] = freqs[i];
        min_limit[ni] = min_limit_in[i];
        max_limit[ni] = max_limit_in[i];
        ni++;
      }
    }
    for (size_t i = ni; i < kMaxNumSymbols; ++i) {
      compact_freqs[i] = 0;
      min_limit[i] = 0;
      max_limit[i] = 0;
    }
    uint8_t num_bits[kMaxNumSymbols] = {};
    ComputeCodeLengthsNonZero(compact_freqs, ni, min_limit, max_limit,
                              num_bits);
    // Scatter the computed lengths back to the original symbol positions.
    ni = 0;
    for (size_t i = 0; i < n; i++) {
      nbits[i] = 0;
      if (freqs[i]) {
        nbits[i] = num_bits[ni++];
      }
    }
  }

  // Invalid code, used to construct arrays.
  PrefixCode() = default;

  // Builds the two-level code from symbol histograms. BitDepth supplies the
  // per-token length limits for the raw half (kMinRawLength/kMaxRawLength).
  template <typename BitDepth>
  PrefixCode(BitDepth /* bitdepth */, uint64_t* raw_counts,
             uint64_t* lz77_counts) {
    // "merge" together all the lz77 counts in a single symbol for the level 1
    // table (containing just the raw symbols, up to length 7).
    uint64_t level1_counts[kNumRawSymbols + 1];
    memcpy(level1_counts, raw_counts, kNumRawSymbols * sizeof(uint64_t));
    numraw = kNumRawSymbols;
    while (numraw > 0 && level1_counts[numraw - 1] == 0) numraw--;

    level1_counts[numraw] = 0;
    for (size_t i = 0; i < kNumLZ77; i++) {
      level1_counts[numraw] += lz77_counts[i];
    }
    uint8_t level1_nbits[kNumRawSymbols + 1] = {};
    ComputeCodeLengths(level1_counts, numraw + 1, BitDepth::kMinRawLength,
                       BitDepth::kMaxRawLength, level1_nbits);

    // Level-2 codes may use whatever length remains below the overall 15-bit
    // cap once the shared level-1 prefix is accounted for.
    uint8_t level2_nbits[kNumLZ77] = {};
    uint8_t min_lengths[kNumLZ77] = {};
    uint8_t l = 15 - level1_nbits[numraw];
    uint8_t max_lengths[kNumLZ77];
    for (uint8_t& max_length : max_lengths) {
      max_length = l;
    }
    size_t num_lz77 = kNumLZ77;
    while (num_lz77 > 0 && lz77_counts[num_lz77 - 1] == 0) num_lz77--;
    ComputeCodeLengths(lz77_counts, num_lz77, min_lengths, max_lengths,
                       level2_nbits);
    for (size_t i = 0; i < numraw; i++) {
      raw_nbits[i] = level1_nbits[i];
    }
    // A used LZ77 symbol's full length is the shared prefix plus its own
    // level-2 length; unused symbols keep length 0.
    for (size_t i = 0; i < num_lz77; i++) {
      lz77_nbits[i] =
          level2_nbits[i] ? level1_nbits[numraw] + level2_nbits[i] : 0;
    }

    ComputeCanonicalCode(raw_nbits, raw_bits, numraw, lz77_nbits, lz77_bits,
                         kNumLZ77);

    // Prepare lz77 cache
    for (size_t count = 0; count < kLZ77CacheSize; count++) {
      unsigned token, nbits, bits;
      EncodeHybridUintLZ77(count, &token, &nbits, &bits);
      // Length token, its extra bits, then raw symbol 0, concatenated
      // LSB-first in emission order.
      lz77_cache_nbits[count] = lz77_nbits[token] + nbits + raw_nbits[0];
      lz77_cache_bits[count] =
          (((bits << lz77_nbits[token]) | lz77_bits[token]) << raw_nbits[0]) |
          raw_bits[0];
    }
  }

  // Serializes the code-length table in the Brotli-style format used here.
  // Max bits written: 2 + 72 + 95 + 24 + 165 = 286
  void WriteTo(BitWriter* writer) const {
    uint64_t code_length_counts[18] = {};
    // Symbol 17 is the Brotli "repeat zeros" code-length symbol; account for
    // the runs emitted below to skip from symbol 19 to the LZ77 range at 224.
    code_length_counts[17] = 3 + 2 * (kNumLZ77 - 1);
    for (uint8_t raw_nbit : raw_nbits) {
      code_length_counts[raw_nbit]++;
    }
    for (uint8_t lz77_nbit : lz77_nbits) {
      code_length_counts[lz77_nbit]++;
    }
    uint8_t code_length_nbits[18] = {};
    uint8_t code_length_nbits_min[18] = {};
    uint8_t code_length_nbits_max[18] = {
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
    };
    ComputeCodeLengths(code_length_counts, 18, code_length_nbits_min,
                       code_length_nbits_max, code_length_nbits);
    writer->Write(2, 0b00);  // HSKIP = 0, i.e. don't skip code lengths.

    // As per Brotli RFC.
    uint8_t code_length_order[18] = {1, 2, 3, 4, 0, 5, 17, 6, 16,
                                     7, 8, 9, 10, 11, 12, 13, 14, 15};
    uint8_t code_length_length_nbits[] = {2, 4, 3, 2, 2, 4};
    uint8_t code_length_length_bits[] = {0, 7, 3, 2, 1, 15};

    // Encode lengths of code lengths.
    // Trailing zero-length entries (in transmission order) are omitted.
    size_t num_code_lengths = 18;
    while (code_length_nbits[code_length_order[num_code_lengths - 1]] == 0) {
      num_code_lengths--;
    }
    // Max bits written in this loop: 18 * 4 = 72
    for (size_t i = 0; i < num_code_lengths; i++) {
      int symbol = code_length_nbits[code_length_order[i]];
      writer->Write(code_length_length_nbits[symbol],
                    code_length_length_bits[symbol]);
    }

    // Compute the canonical codes for the codes that represent the lengths of
    // the actual codes for data.
    uint16_t code_length_bits[18] = {};
    ComputeCanonicalCode(nullptr, nullptr, 0, code_length_nbits,
                         code_length_bits, 18);
    // Encode raw bit code lengths.
    // Max bits written in this loop: 19 * 5 = 95
    for (uint8_t raw_nbit : raw_nbits) {
      writer->Write(code_length_nbits[raw_nbit], code_length_bits[raw_nbit]);
    }
    size_t num_lz77 = kNumLZ77;
    while (lz77_nbits[num_lz77 - 1] == 0) {
      num_lz77--;
    }
    // Encode 0s until 224 (start of LZ77 symbols). This is in total 224-19 =
    // 205.
    static_assert(kLZ77Offset == 224, "kLZ77Offset should be 224");
    static_assert(kNumRawSymbols == 19, "kNumRawSymbols should be 19");
    {
      // Max bits in this block: 24
      writer->Write(code_length_nbits[17], code_length_bits[17]);
      writer->Write(3, 0b010);  // 5
      writer->Write(code_length_nbits[17], code_length_bits[17]);
      writer->Write(3, 0b000);  // (5-2)*8 + 3 = 27
      writer->Write(code_length_nbits[17], code_length_bits[17]);
      writer->Write(3, 0b010);  // (27-2)*8 + 5 = 205
    }
    // Encode LZ77 symbols, with values 224+i.
    // Max bits written in this loop: 33 * 5 = 165
    for (size_t i = 0; i < num_lz77; i++) {
      writer->Write(code_length_nbits[lz77_nbits[i]],
                    code_length_bits[lz77_nbits[i]]);
    }
  }
};
693 | | |
694 | | } // namespace |
695 | | |
696 | | extern "C" { |
697 | | |
// State carried across the phases of a fast-lossless encode (setup, per-group
// processing, output); owned by the C API and passed back by the caller.
struct JxlFastLosslessFrameState {
  JxlChunkedFrameInputSource input;  // callbacks providing input pixel data
  size_t width;
  size_t height;
  size_t num_groups_x;
  size_t num_groups_y;
  size_t num_dc_groups_x;
  size_t num_dc_groups_y;
  size_t nb_chans;  // channel count; 2 or 4 implies an alpha channel
  size_t bitdepth;  // bits per sample (8/10/12 get dedicated header encodings)
  int big_endian;   // presumably: multi-byte input samples are big-endian — confirm with callers
  int effort;       // encoder effort knob; semantics defined by the callers
  bool collided;    // NOTE(review): set by code outside this chunk — confirm meaning
  PrefixCode hcode[4];          // per-channel prefix codes (up to 4 channels)
  std::vector<int16_t> lookup;  // NOTE(review): built elsewhere — likely a symbol lookup table, confirm
  BitWriter header;             // frame header (and TOC) bit stream
  std::vector<std::array<BitWriter, 4>> group_data;  // per-group, per-channel streams
  std::vector<size_t> group_sizes;  // byte size of each encoded section
  size_t ac_group_data_offset = 0;  // see ComputeAcGroupDataOffset
  size_t min_dc_global_size = 0;    // see ComputeAcGroupDataOffset
  // Streaming-output cursor: which writer / byte / pending bits have been
  // flushed to the caller so far.
  size_t current_bit_writer = 0;
  size_t bit_writer_byte_pos = 0;
  size_t bits_in_buffer = 0;
  uint64_t bit_buffer = 0;
  bool process_done = false;
};
724 | | |
725 | 0 | size_t JxlFastLosslessOutputSize(const JxlFastLosslessFrameState* frame) { |
726 | 0 | size_t total_size_groups = 0; |
727 | 0 | for (const auto& section : frame->group_data) { |
728 | 0 | total_size_groups += SectionSize(section); |
729 | 0 | } |
730 | 0 | return frame->header.bytes_written + total_size_groups; |
731 | 0 | } |
732 | | |
733 | | size_t JxlFastLosslessMaxRequiredOutput( |
734 | 0 | const JxlFastLosslessFrameState* frame) { |
735 | 0 | return JxlFastLosslessOutputSize(frame) + 32; |
736 | 0 | } |
737 | | |
// Writes the codestream header bits into frame->header: optionally (standalone
// builds only) the JXL signature, size header and hand-crafted ImageMetadata,
// then always the frame header and the TOC (one entry per group section).
// Every Write(n, bits) below mirrors a field of the JPEG XL codestream
// syntax; the order and widths are the behavior and must not be reordered.
void JxlFastLosslessPrepareHeader(JxlFastLosslessFrameState* frame,
                                  int add_image_header, int is_last) {
  BitWriter* output = &frame->header;
  // 1000 bytes is ample for the fixed-size fields; each TOC entry needs at
  // most 32 bits.
  output->Allocate(1000 + frame->group_sizes.size() * 32);

  bool have_alpha = (frame->nb_chans == 2 || frame->nb_chans == 4);

#if FJXL_STANDALONE
  if (add_image_header) {
    // Signature
    output->Write(16, 0x0AFF);

    // Size header, hand-crafted.
    // Not small
    output->Write(1, 0);

    // Writes one image dimension using the smallest of the four
    // variable-width encodings that fits (9/13/18/30 bits of size-1).
    auto wsz = [output](size_t size) {
      if (size - 1 < (1 << 9)) {
        output->Write(2, 0b00);
        output->Write(9, size - 1);
      } else if (size - 1 < (1 << 13)) {
        output->Write(2, 0b01);
        output->Write(13, size - 1);
      } else if (size - 1 < (1 << 18)) {
        output->Write(2, 0b10);
        output->Write(18, size - 1);
      } else {
        output->Write(2, 0b11);
        output->Write(30, size - 1);
      }
    };

    wsz(frame->height);

    // No special ratio.
    output->Write(3, 0);

    wsz(frame->width);

    // Hand-crafted ImageMetadata.
    output->Write(1, 0);  // all_default
    output->Write(1, 0);  // extra_fields
    output->Write(1, 0);  // bit_depth.floating_point_sample
    if (frame->bitdepth == 8) {
      output->Write(2, 0b00);  // bit_depth.bits_per_sample = 8
    } else if (frame->bitdepth == 10) {
      output->Write(2, 0b01);  // bit_depth.bits_per_sample = 10
    } else if (frame->bitdepth == 12) {
      output->Write(2, 0b10);  // bit_depth.bits_per_sample = 12
    } else {
      output->Write(2, 0b11);  // 1 + u(6)
      output->Write(6, frame->bitdepth - 1);
    }
    if (frame->bitdepth <= 14) {
      output->Write(1, 1);  // 16-bit-buffer sufficient
    } else {
      output->Write(1, 0);  // 16-bit-buffer NOT sufficient
    }
    if (have_alpha) {
      output->Write(2, 0b01);  // One extra channel
      if (frame->bitdepth == 8) {
        output->Write(1, 1);  // ... all_default (ie. 8-bit alpha)
      } else {
        // Non-8-bit alpha: spell out the ExtraChannelInfo fields.
        output->Write(1, 0);  // not d_alpha
        output->Write(2, 0);  // type = kAlpha
        output->Write(1, 0);  // not float
        if (frame->bitdepth == 10) {
          output->Write(2, 0b01);  // bit_depth.bits_per_sample = 10
        } else if (frame->bitdepth == 12) {
          output->Write(2, 0b10);  // bit_depth.bits_per_sample = 12
        } else {
          output->Write(2, 0b11);  // 1 + u(6)
          output->Write(6, frame->bitdepth - 1);
        }
        output->Write(2, 0);  // dim_shift = 0
        output->Write(2, 0);  // name_len = 0
        output->Write(1, 0);  // alpha_associated = 0
      }
    } else {
      output->Write(2, 0b00);  // No extra channel
    }
    output->Write(1, 0);  // Not XYB
    if (frame->nb_chans > 2) {
      output->Write(1, 1);  // color_encoding.all_default (sRGB)
    } else {
      // Grayscale: explicit sRGB-like color encoding.
      output->Write(1, 0);     // color_encoding.all_default false
      output->Write(1, 0);     // color_encoding.want_icc false
      output->Write(2, 1);     // grayscale
      output->Write(2, 1);     // D65
      output->Write(1, 0);     // no gamma transfer function
      output->Write(2, 0b10);  // tf: 2 + u(4)
      output->Write(4, 11);    // tf of sRGB
      output->Write(2, 1);     // relative rendering intent
    }
    output->Write(2, 0b00);  // No extensions.

    output->Write(1, 1);  // all_default transform data

    // No ICC, no preview. Frame should start at byte boundary.
    output->ZeroPadToByte();
  }
#else
  // In library builds the image header is produced elsewhere.
  assert(!add_image_header);
#endif
  // Handcrafted frame header.
  output->Write(1, 0);     // all_default
  output->Write(2, 0b00);  // regular frame
  output->Write(1, 1);     // modular
  output->Write(2, 0b00);  // default flags
  output->Write(1, 0);     // not YCbCr
  output->Write(2, 0b00);  // no upsampling
  if (have_alpha) {
    output->Write(2, 0b00);  // no alpha upsampling
  }
  output->Write(2, 0b01);  // default group size
  output->Write(2, 0b00);  // exactly one pass
  output->Write(1, 0);     // no custom size or origin
  output->Write(2, 0b00);  // kReplace blending mode
  if (have_alpha) {
    output->Write(2, 0b00);  // kReplace blending mode for alpha channel
  }
  output->Write(1, is_last);  // is_last
  if (!is_last) {
    output->Write(2, 0b00);  // can not be saved as reference
  }
  output->Write(2, 0b00);  // a frame has no name
  output->Write(1, 0);  // loop filter is not all_default
  output->Write(1, 0);  // no gaborish
  output->Write(2, 0);  // 0 EPF iters
  output->Write(2, 0b00);  // No LF extensions
  output->Write(2, 0b00);  // No FH extensions

  output->Write(1, 0);      // No TOC permutation
  output->ZeroPadToByte();  // TOC is byte-aligned.
  assert(add_image_header || output->bytes_written <= kMaxFrameHeaderSize);
  // TOC: each group size is stored with a 2-bit bucket selector followed by
  // the offset-adjusted size (TOCBucket/kTOCBits/kGroupSizeOffset are
  // defined earlier in this file).
  for (size_t group_size : frame->group_sizes) {
    size_t bucket = TOCBucket(group_size);
    output->Write(2, bucket);
    output->Write(kTOCBits[bucket] - 2, group_size - kGroupSizeOffset[bucket]);
  }
  output->ZeroPadToByte();  // Groups are byte-aligned.
}
880 | | |
881 | | #if !FJXL_STANDALONE |
882 | | bool JxlFastLosslessOutputAlignedSection( |
883 | 0 | const BitWriter& bw, JxlEncoderOutputProcessorWrapper* output_processor) { |
884 | 0 | assert(bw.bits_in_buffer == 0); |
885 | 0 | const uint8_t* data = bw.data.get(); |
886 | 0 | size_t remaining_len = bw.bytes_written; |
887 | 0 | while (remaining_len > 0) { |
888 | 0 | JXL_ASSIGN_OR_RETURN(auto buffer, |
889 | 0 | output_processor->GetBuffer(1, remaining_len)); |
890 | 0 | size_t n = std::min(buffer.size(), remaining_len); |
891 | 0 | if (n == 0) break; |
892 | 0 | memcpy(buffer.data(), data, n); |
893 | 0 | JXL_RETURN_IF_ERROR(buffer.advance(n)); |
894 | 0 | data += n; |
895 | 0 | remaining_len -= n; |
896 | 0 | }; |
897 | 0 | return true; |
898 | 0 | } |
899 | | |
// Emits the frame header followed by the first section (group_data[0][0] —
// presumably the DC-global section; confirm against the section layout) to
// the output processor. Both sections must already be byte-aligned.
// Returns false if either write fails.
bool JxlFastLosslessOutputHeaders(
    JxlFastLosslessFrameState* frame_state,
    JxlEncoderOutputProcessorWrapper* output_processor) {
  JXL_RETURN_IF_ERROR(JxlFastLosslessOutputAlignedSection(frame_state->header,
                                                          output_processor));
  JXL_RETURN_IF_ERROR(JxlFastLosslessOutputAlignedSection(
      frame_state->group_data[0][0], output_processor));
  return true;
}
909 | | #endif |
910 | | |
911 | | #if FJXL_ENABLE_AVX512 |
// AVX-512 VBMI2 fast path for appending `n` bytes of `data` into `output`
// at a non-zero bit offset: each 64-byte chunk is shifted left by
// `bit_buffer_nbits` with the carry from the previous chunk funneled in via
// _mm512_shrdv_epi64. Returns the number of bytes actually processed (a
// multiple of 64; 0 if n < 128) and updates `bit_buffer` with the leftover
// high bits of the last consumed byte. The caller handles the remainder.
// NOTE(review): the 64 - bit_buffer_nbits and 8 - bit_buffer_nbits shifts
// assume 0 < bit_buffer_nbits < 8 — callers appear to guarantee this
// (JxlFastLosslessWriteOutput only calls it when bits_in_buffer != 0);
// confirm.
__attribute__((target("avx512vbmi2"))) static size_t AppendBytesWithBitOffset(
    const uint8_t* data, size_t n, size_t bit_buffer_nbits,
    unsigned char* output, uint64_t& bit_buffer) {
  // Not worth the setup cost for short runs.
  if (n < 128) {
    return 0;
  }

  size_t i = 0;
  __m512i shift = _mm512_set1_epi64(64 - bit_buffer_nbits);
  // Seed the cross-chunk carry with the pending bits, pre-positioned at the
  // top of a 64-bit lane.
  __m512i carry = _mm512_set1_epi64(bit_buffer << (64 - bit_buffer_nbits));

  for (; i + 64 <= n; i += 64) {
    __m512i current = _mm512_loadu_si512(data + i);
    // Bring the last lane of the previous chunk in front of the current one.
    __m512i previous_u64 = _mm512_alignr_epi64(current, carry, 7);
    carry = current;
    __m512i out = _mm512_shrdv_epi64(previous_u64, current, shift);
    _mm512_storeu_si512(output + i, out);
  }

  // The top bits of the last byte we consumed have not been emitted yet;
  // hand them back to the caller as the new pending bit buffer.
  bit_buffer = data[i - 1] >> (8 - bit_buffer_nbits);

  return i;
}
935 | | #endif |
936 | | |
// Copies as much of the encoded frame as fits into `output` (capacity
// `output_size`, which must be >= 32) and returns the number of bytes
// written. Resumable: progress (current section, byte position, pending
// sub-byte bits) is stored in `frame`, so the caller invokes this repeatedly
// until it returns 0 / all sections are consumed. Sections are the header
// BitWriter followed by every group_data[g][c] BitWriter; they are
// bit-concatenated, with zero-padding to a byte boundary between groups.
size_t JxlFastLosslessWriteOutput(JxlFastLosslessFrameState* frame,
                                  unsigned char* output, size_t output_size) {
  assert(output_size >= 32);
  unsigned char* initial_output = output;
  // Optional SIMD fast path for the bit-offset copy, selected at runtime.
  size_t (*append_bytes_with_bit_offset)(const uint8_t*, size_t, size_t,
                                         unsigned char*, uint64_t&) = nullptr;

#if FJXL_ENABLE_AVX512
  if (HasCpuFeature(CpuFeature::kVBMI2)) {
    append_bytes_with_bit_offset = AppendBytesWithBitOffset;
  }
#endif

  while (true) {
    size_t& cur = frame->current_bit_writer;    // section index; 0 == header
    size_t& bw_pos = frame->bit_writer_byte_pos;
    if (cur >= 1 + frame->group_data.size() * frame->nb_chans) {
      return output - initial_output;  // all sections consumed
    }
    // Keep >= 9 spare bytes so the flush writes below can never overflow.
    if (output_size <= 9) {
      return output - initial_output;
    }
    size_t nbc = frame->nb_chans;
    const BitWriter& writer =
        cur == 0 ? frame->header
                 : frame->group_data[(cur - 1) / nbc][(cur - 1) % nbc];
    size_t full_byte_count =
        std::min(output_size - 9, writer.bytes_written - bw_pos);
    if (frame->bits_in_buffer == 0) {
      // Byte-aligned: plain copy.
      memcpy(output, writer.data.get() + bw_pos, full_byte_count);
    } else {
      // Mid-byte: every source byte must be shifted by bits_in_buffer.
      size_t i = 0;
      if (append_bytes_with_bit_offset) {
        i += append_bytes_with_bit_offset(
            writer.data.get() + bw_pos, full_byte_count, frame->bits_in_buffer,
            output, frame->bit_buffer);
      }
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
      // Copy 8 bytes at a time until we reach the border.
      for (; i + 8 < full_byte_count; i += 8) {
        uint64_t chunk;
        memcpy(&chunk, writer.data.get() + bw_pos + i, 8);
        uint64_t out = frame->bit_buffer | (chunk << frame->bits_in_buffer);
        memcpy(output + i, &out, 8);
        frame->bit_buffer = chunk >> (64 - frame->bits_in_buffer);
      }
#endif
      // Scalar tail: AddBits shifts each byte through the pending-bit buffer.
      for (; i < full_byte_count; i++) {
        AddBits(8, writer.data.get()[bw_pos + i], output + i,
                frame->bits_in_buffer, frame->bit_buffer);
      }
    }
    output += full_byte_count;
    output_size -= full_byte_count;
    bw_pos += full_byte_count;
    if (bw_pos == writer.bytes_written) {
      // Finished this section's whole bytes; flush its partial final byte
      // and advance to the next section.
      auto write = [&](size_t num, uint64_t bits) {
        size_t n = AddBits(num, bits, output, frame->bits_in_buffer,
                           frame->bit_buffer);
        output += n;
        output_size -= n;
      };
      if (writer.bits_in_buffer) {
        write(writer.bits_in_buffer, writer.buffer);
      }
      bw_pos = 0;
      cur++;
      // At a group boundary ((cur - 1) % nbc == 0), zero-pad to the next
      // byte boundary, matching the byte-aligned TOC sizes.
      if ((cur - 1) % nbc == 0 && frame->bits_in_buffer != 0) {
        write(8 - frame->bits_in_buffer, 0);
      }
    }
  }
}
1010 | | |
1011 | 0 | void JxlFastLosslessFreeFrameState(JxlFastLosslessFrameState* frame) { |
1012 | 0 | delete frame; |
1013 | 0 | } |
1014 | | |
1015 | | } // extern "C" |
1016 | | |
1017 | | #endif |
1018 | | |
1019 | | #ifdef FJXL_SELF_INCLUDE |
1020 | | |
1021 | | namespace { |
1022 | | |
// Pair of SIMD vectors returned by operations that produce two results,
// e.g. Interleave() and Upcast() below.
template <typename T>
struct VecPair {
  T low;  // result for the lower-indexed lanes
  T hi;   // result for the higher-indexed lanes
};
1028 | | |
1029 | | #ifdef FJXL_GENERIC_SIMD |
1030 | | #undef FJXL_GENERIC_SIMD |
1031 | | #endif |
1032 | | |
1033 | | #ifdef FJXL_AVX512 |
1034 | | #define FJXL_GENERIC_SIMD |
struct SIMDVec32;
// Per-lane boolean mask for 16 x 32-bit lanes (AVX-512 predicate register).
struct Mask32 {
  __mmask16 mask;
  // Per-lane select: if_true where the mask bit is set, else if_false.
  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
  // Number of consecutive set lanes starting from lane 0. Inverting the
  // 16-bit mask inside a 64-bit value guarantees a set bit, so CtzNonZero
  // is safe even for an all-ones mask.
  size_t CountPrefix() const {
    return CtzNonZero(~uint64_t{_cvtmask16_u32(mask)});
  }
};
1043 | | |
// AVX-512 wrapper around 16 x 32-bit unsigned lanes, exposing the small
// portable vector interface used by the encoder (same surface as the AVX2 /
// NEON variants selected by the FJXL_* macros).
struct SIMDVec32 {
  __m512i vec;

  static constexpr size_t kLanes = 16;

  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
    return SIMDVec32{_mm512_loadu_si512((__m512i*)data)};
  }
  FJXL_INLINE void Store(uint32_t* data) {
    _mm512_storeu_si512((__m512i*)data, vec);
  }
  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
    return SIMDVec32{_mm512_set1_epi32(v)};
  }
  // Per-lane bit length: 32 - lzcnt(v), i.e. index of the highest set bit
  // plus one (0 for a zero lane).
  FJXL_INLINE SIMDVec32 ValToToken() const {
    return SIMDVec32{
        _mm512_sub_epi32(_mm512_set1_epi32(32), _mm512_lzcnt_epi32(vec))};
  }
  // Unsigned saturating subtract, built from max: max(a,b) - b == a-b if
  // a >= b, else 0.
  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
    return SIMDVec32{_mm512_sub_epi32(_mm512_max_epu32(vec, to_subtract.vec),
                                      to_subtract.vec)};
  }
  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
    return SIMDVec32{_mm512_sub_epi32(vec, to_subtract.vec)};
  }
  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
    return SIMDVec32{_mm512_add_epi32(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
    return SIMDVec32{_mm512_xor_epi32(vec, oth.vec)};
  }
  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
    return Mask32{_mm512_cmpeq_epi32_mask(vec, oth.vec)};
  }
  // Signed per-lane greater-than comparison.
  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
    return Mask32{_mm512_cmpgt_epi32_mask(vec, oth.vec)};
  }
  // Per-lane 1 << v.
  FJXL_INLINE SIMDVec32 Pow2() const {
    return SIMDVec32{_mm512_sllv_epi32(_mm512_set1_epi32(1), vec)};
  }
  // Per-lane arithmetic right shift by compile-time amount i.
  template <size_t i>
  FJXL_INLINE SIMDVec32 SignedShiftRight() const {
    return SIMDVec32{_mm512_srai_epi32(vec, i)};
  }
};
1089 | | |
struct SIMDVec16;

// Per-lane boolean mask for 32 x 16-bit lanes (AVX-512 predicate register).
struct Mask16 {
  __mmask32 mask;
  // Per-lane select: if_true where the mask bit is set, else if_false.
  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
  Mask16 And(const Mask16& oth) const {
    return Mask16{_kand_mask32(mask, oth.mask)};
  }
  // Number of consecutive set lanes starting from lane 0; the inversion in
  // 64 bits guarantees CtzNonZero sees a nonzero value.
  size_t CountPrefix() const {
    return CtzNonZero(~uint64_t{_cvtmask32_u32(mask)});
  }
};
1102 | | |
// AVX-512 wrapper around 32 x 16-bit lanes. Besides the arithmetic surface
// shared with SIMDVec32, it provides the pixel loaders (LoadG8 ... LoadRGBA16)
// that de-interleave packed 8/16-bit gray/RGB/RGBA samples into one vector
// per channel.
struct SIMDVec16 {
  __m512i vec;

  static constexpr size_t kLanes = 32;

  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
    return SIMDVec16{_mm512_loadu_si512((__m512i*)data)};
  }
  FJXL_INLINE void Store(uint16_t* data) {
    _mm512_storeu_si512((__m512i*)data, vec);
  }
  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
    return SIMDVec16{_mm512_set1_epi16(v)};
  }
  // Narrows two 32-bit vectors into one 16-bit vector (unsigned saturation).
  // packus interleaves 128-bit blocks, so a 64-bit permute restores order.
  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
                                         const SIMDVec32& hi) {
    auto tmp = _mm512_packus_epi32(lo.vec, hi.vec);
    alignas(64) uint64_t perm[8] = {0, 2, 4, 6, 1, 3, 5, 7};
    return SIMDVec16{
        _mm512_permutex2var_epi64(tmp, _mm512_load_si512((__m512i*)perm), tmp)};
  }

  // Per-16-bit-lane bit length, computed on 32-bit halves: the high half's
  // lzcnt is clamped with min(16, .) so an empty high half yields 0, and the
  // low half is isolated with a mask before counting. Results are re-packed
  // into the two 16-bit halves of each 32-bit lane.
  FJXL_INLINE SIMDVec16 ValToToken() const {
    auto c16 = _mm512_set1_epi32(16);
    auto c32 = _mm512_set1_epi32(32);
    auto low16bit = _mm512_set1_epi32(0x0000FFFF);
    auto lzhi =
        _mm512_sub_epi32(c16, _mm512_min_epu32(c16, _mm512_lzcnt_epi32(vec)));
    auto lzlo = _mm512_sub_epi32(
        c32, _mm512_lzcnt_epi32(_mm512_and_si512(low16bit, vec)));
    return SIMDVec16{_mm512_or_si512(lzlo, _mm512_slli_epi32(lzhi, 16))};
  }

  // Unsigned saturating subtraction (native instruction at 16 bits).
  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
    return SIMDVec16{_mm512_subs_epu16(vec, to_subtract.vec)};
  }
  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
    return SIMDVec16{_mm512_sub_epi16(vec, to_subtract.vec)};
  }
  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
    return SIMDVec16{_mm512_add_epi16(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
    return SIMDVec16{_mm512_min_epu16(vec, oth.vec)};
  }
  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
    return Mask16{_mm512_cmpeq_epi16_mask(vec, oth.vec)};
  }
  // Signed per-lane greater-than comparison.
  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
    return Mask16{_mm512_cmpgt_epi16_mask(vec, oth.vec)};
  }
  // Per-lane 1 << v.
  FJXL_INLINE SIMDVec16 Pow2() const {
    return SIMDVec16{_mm512_sllv_epi16(_mm512_set1_epi16(1), vec)};
  }
  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
    return SIMDVec16{_mm512_or_si512(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
    return SIMDVec16{_mm512_xor_si512(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
    return SIMDVec16{_mm512_and_si512(vec, oth.vec)};
  }
  // Per-lane (a + b) >> 1 with arithmetic shift.
  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
    return SIMDVec16{_mm512_srai_epi16(_mm512_add_epi16(vec, oth.vec), 1)};
  }
  // Sets the high byte of every lane so that a following U8Lookup (which
  // uses shuffle_epi8, where a set top bit yields zero) only indexes with
  // the low byte.
  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
    return SIMDVec16{_mm512_or_si512(vec, _mm512_set1_epi16(0xFF00))};
  }
  // 16-entry byte-table lookup per byte lane via shuffle_epi8 on a
  // broadcast 128-bit table.
  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
    return SIMDVec16{_mm512_shuffle_epi8(
        _mm512_broadcast_i32x4(_mm_loadu_si128((__m128i*)table)), vec)};
  }
  // Interleaves `low` and *this lane-by-lane (low lane first); the 64-bit
  // permutes fix up the 128-bit-block order of unpacklo/unpackhi.
  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
    auto lo = _mm512_unpacklo_epi16(low.vec, vec);
    auto hi = _mm512_unpackhi_epi16(low.vec, vec);
    alignas(64) uint64_t perm1[8] = {0, 1, 8, 9, 2, 3, 10, 11};
    alignas(64) uint64_t perm2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
    return {SIMDVec16{_mm512_permutex2var_epi64(
                lo, _mm512_load_si512((__m512i*)perm1), hi)},
            SIMDVec16{_mm512_permutex2var_epi64(
                lo, _mm512_load_si512((__m512i*)perm2), hi)}};
  }
  // Zero-extends the 16-bit lanes to two 32-bit vectors, in lane order.
  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
    auto lo = _mm512_unpacklo_epi16(vec, _mm512_setzero_si512());
    auto hi = _mm512_unpackhi_epi16(vec, _mm512_setzero_si512());
    alignas(64) uint64_t perm1[8] = {0, 1, 8, 9, 2, 3, 10, 11};
    alignas(64) uint64_t perm2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
    return {SIMDVec32{_mm512_permutex2var_epi64(
                lo, _mm512_load_si512((__m512i*)perm1), hi)},
            SIMDVec32{_mm512_permutex2var_epi64(
                lo, _mm512_load_si512((__m512i*)perm2), hi)}};
  }
  // Per-lane arithmetic right shift by compile-time amount i.
  template <size_t i>
  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
    return SIMDVec16{_mm512_srai_epi16(vec, i)};
  }

  // --- Pixel loaders: each reads kLanes pixels and returns one vector per
  // --- channel, values zero-extended to 16 bits.
  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
    __m256i bytes = _mm256_loadu_si256((__m256i*)data);
    return {SIMDVec16{_mm512_cvtepu8_epi16(bytes)}};
  }
  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
    return {Load((const uint16_t*)data)};
  }

  // Gray+alpha, 8-bit: even bytes are gray, odd bytes alpha.
  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
    __m512i bytes = _mm512_loadu_si512((__m512i*)data);
    __m512i gray = _mm512_and_si512(bytes, _mm512_set1_epi16(0xFF));
    __m512i alpha = _mm512_srli_epi16(bytes, 8);
    return {SIMDVec16{gray}, SIMDVec16{alpha}};
  }
  // Gray+alpha, 16-bit: split the 32-bit (g,a) pairs via pack + permute.
  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
    __m512i bytes1 = _mm512_loadu_si512((__m512i*)data);
    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 64));
    __m512i g_mask = _mm512_set1_epi32(0xFFFF);
    __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
    __m512i g = _mm512_permutexvar_epi64(
        permuteidx, _mm512_packus_epi32(_mm512_and_si512(bytes1, g_mask),
                                        _mm512_and_si512(bytes2, g_mask)));
    __m512i a = _mm512_permutexvar_epi64(
        permuteidx, _mm512_packus_epi32(_mm512_srli_epi32(bytes1, 16),
                                        _mm512_srli_epi32(bytes2, 16)));
    return {SIMDVec16{g}, SIMDVec16{a}};
  }

  // RGB, 8-bit: 96 interleaved bytes gathered per channel with VBMI
  // two-source byte permutes.
  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
    __m512i bytes1 =
        _mm512_zextsi256_si512(_mm256_loadu_si256((__m256i*)(data + 64)));

    // 0x7A = element of upper half of second vector = 0 after lookup; still in
    // the upper half once we add 1 or 2.
    uint8_t z = 0x7A;
    __m512i ridx =
        _mm512_set_epi8(z, 93, z, 90, z, 87, z, 84, z, 81, z, 78, z, 75, z, 72,
                        z, 69, z, 66, z, 63, z, 60, z, 57, z, 54, z, 51, z, 48,
                        z, 45, z, 42, z, 39, z, 36, z, 33, z, 30, z, 27, z, 24,
                        z, 21, z, 18, z, 15, z, 12, z, 9, z, 6, z, 3, z, 0);
    __m512i gidx = _mm512_add_epi8(ridx, _mm512_set1_epi8(1));
    __m512i bidx = _mm512_add_epi8(gidx, _mm512_set1_epi8(1));
    __m512i r = _mm512_permutex2var_epi8(bytes0, ridx, bytes1);
    __m512i g = _mm512_permutex2var_epi8(bytes0, gidx, bytes1);
    __m512i b = _mm512_permutex2var_epi8(bytes0, bidx, bytes1);
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}};
  }
  // RGB, 16-bit: 192 bytes; the first two vectors are combined with a
  // two-source word permute, the third merged in under a mask.
  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
    __m512i bytes1 = _mm512_loadu_si512((__m512i*)(data + 64));
    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 128));

    __m512i ridx_lo = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 63, 60, 57,
                                       54, 51, 48, 45, 42, 39, 36, 33, 30, 27,
                                       24, 21, 18, 15, 12, 9, 6, 3, 0);
    // -1 is such that when adding 1 or 2, we get the correct index for
    // green/blue.
    __m512i ridx_hi =
        _mm512_set_epi16(29, 26, 23, 20, 17, 14, 11, 8, 5, 2, -1, 0, 0, 0, 0, 0,
                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    __m512i gidx_lo = _mm512_add_epi16(ridx_lo, _mm512_set1_epi16(1));
    __m512i gidx_hi = _mm512_add_epi16(ridx_hi, _mm512_set1_epi16(1));
    __m512i bidx_lo = _mm512_add_epi16(gidx_lo, _mm512_set1_epi16(1));
    __m512i bidx_hi = _mm512_add_epi16(gidx_hi, _mm512_set1_epi16(1));

    __mmask32 rmask = _cvtu32_mask32(0b11111111110000000000000000000000);
    __mmask32 gbmask = _cvtu32_mask32(0b11111111111000000000000000000000);

    __m512i rlo = _mm512_permutex2var_epi16(bytes0, ridx_lo, bytes1);
    __m512i glo = _mm512_permutex2var_epi16(bytes0, gidx_lo, bytes1);
    __m512i blo = _mm512_permutex2var_epi16(bytes0, bidx_lo, bytes1);
    __m512i r = _mm512_mask_permutexvar_epi16(rlo, rmask, ridx_hi, bytes2);
    __m512i g = _mm512_mask_permutexvar_epi16(glo, gbmask, gidx_hi, bytes2);
    __m512i b = _mm512_mask_permutexvar_epi16(blo, gbmask, bidx_hi, bytes2);
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}};
  }

  // RGBA, 8-bit: two pack/permute rounds split the 32-bit pixels into
  // (r,g) and (b,a) word pairs, then byte masks/shifts split each pair.
  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
    __m512i bytes1 = _mm512_loadu_si512((__m512i*)data);
    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 64));
    __m512i rg_mask = _mm512_set1_epi32(0xFFFF);
    __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
    __m512i rg = _mm512_permutexvar_epi64(
        permuteidx, _mm512_packus_epi32(_mm512_and_si512(bytes1, rg_mask),
                                        _mm512_and_si512(bytes2, rg_mask)));
    __m512i b_a = _mm512_permutexvar_epi64(
        permuteidx, _mm512_packus_epi32(_mm512_srli_epi32(bytes1, 16),
                                        _mm512_srli_epi32(bytes2, 16)));
    __m512i r = _mm512_and_si512(rg, _mm512_set1_epi16(0xFF));
    __m512i g = _mm512_srli_epi16(rg, 8);
    __m512i b = _mm512_and_si512(b_a, _mm512_set1_epi16(0xFF));
    __m512i a = _mm512_srli_epi16(b_a, 8);
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
  }
  // RGBA, 16-bit: 256 bytes; two levels of low/high 16-bit packing
  // de-interleave the four channels.
  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
    __m512i bytes1 = _mm512_loadu_si512((__m512i*)(data + 64));
    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 128));
    __m512i bytes3 = _mm512_loadu_si512((__m512i*)(data + 192));

    // Pack two vectors' 32-bit lanes to 16 bits, restoring lane order.
    auto pack32 = [](__m512i a, __m512i b) {
      __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
      return _mm512_permutexvar_epi64(permuteidx, _mm512_packus_epi32(a, b));
    };
    auto packlow32 = [&pack32](__m512i a, __m512i b) {
      __m512i mask = _mm512_set1_epi32(0xFFFF);
      return pack32(_mm512_and_si512(a, mask), _mm512_and_si512(b, mask));
    };
    auto packhi32 = [&pack32](__m512i a, __m512i b) {
      return pack32(_mm512_srli_epi32(a, 16), _mm512_srli_epi32(b, 16));
    };

    __m512i rb0 = packlow32(bytes0, bytes1);
    __m512i rb1 = packlow32(bytes2, bytes3);
    __m512i ga0 = packhi32(bytes0, bytes1);
    __m512i ga1 = packhi32(bytes2, bytes3);

    __m512i r = packlow32(rb0, rb1);
    __m512i g = packlow32(ga0, ga1);
    __m512i b = packhi32(rb0, rb1);
    __m512i a = packhi32(ga0, ga1);
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
  }

  // Byte-swaps every 16-bit lane (for big-endian input samples).
  void SwapEndian() {
    auto indices = _mm512_broadcast_i32x4(
        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
    vec = _mm512_shuffle_epi8(vec, indices);
  }
};
1332 | | |
// Per-lane select for 16-bit lanes: if_true where the mask bit is set.
SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
                             const SIMDVec16& if_false) {
  return SIMDVec16{_mm512_mask_blend_epi16(mask, if_false.vec, if_true.vec)};
}
1337 | | |
// Per-lane select for 32-bit lanes: if_true where the mask bit is set.
SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
                             const SIMDVec32& if_false) {
  return SIMDVec32{_mm512_mask_blend_epi32(mask, if_false.vec, if_true.vec)};
}
1342 | | |
// Final stage of the bit-packing pipeline: 8 lanes, each holding a 64-bit
// bit pattern (`bits`) and its length (`nbits`), ready to be flushed to the
// bitstream.
struct Bits64 {
  static constexpr size_t kLanes = 8;

  __m512i nbits;
  __m512i bits;

  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
    _mm512_storeu_si512((__m512i*)nbits_out, nbits);
    _mm512_storeu_si512((__m512i*)bits_out, bits);
  }
};
1354 | | |
// Intermediate stage of the bit-packing pipeline: 16 x 32-bit lanes of
// (bit pattern, length) that can be merged pairwise into Bits64 or
// interleaved with another Bits32.
struct Bits32 {
  __m512i nbits;
  __m512i bits;

  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
    return Bits32{nbits.vec, bits.vec};
  }

  // Concatenates each 64-bit lane's two 32-bit halves: the high half's bits
  // are appended after the low half's nbits, lengths are summed.
  Bits64 Merge() const {
    auto nbits_hi32 = _mm512_srli_epi64(nbits, 32);
    auto nbits_lo32 = _mm512_and_si512(nbits, _mm512_set1_epi64(0xFFFFFFFF));
    auto bits_hi32 = _mm512_srli_epi64(bits, 32);
    auto bits_lo32 = _mm512_and_si512(bits, _mm512_set1_epi64(0xFFFFFFFF));

    auto nbits64 = _mm512_add_epi64(nbits_hi32, nbits_lo32);
    auto bits64 =
        _mm512_or_si512(_mm512_sllv_epi64(bits_hi32, nbits_lo32), bits_lo32);
    return Bits64{nbits64, bits64};
  }

  // Appends this value's bits after `low`'s bits in every lane.
  void Interleave(const Bits32& low) {
    bits = _mm512_or_si512(_mm512_sllv_epi32(bits, low.nbits), low.bits);
    nbits = _mm512_add_epi32(nbits, low.nbits);
  }

  // Zeroes all lanes with index >= n (keep only the first n lanes).
  void ClipTo(size_t n) {
    n = std::min<size_t>(n, 16);
    constexpr uint32_t kMask[32] = {
        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
        ~0u, ~0u, ~0u, ~0u, ~0u, 0,   0,   0,   0,   0,   0,
        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
    };
    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 16 - n));
    nbits = _mm512_and_si512(mask, nbits);
    bits = _mm512_and_si512(mask, bits);
  }
  // Zeroes the first n lanes (keep only lanes with index >= n).
  void Skip(size_t n) {
    n = std::min<size_t>(n, 16);
    constexpr uint32_t kMask[32] = {
        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
        0,   0,   0,   0,   0,   ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
    };
    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 16 - n));
    nbits = _mm512_and_si512(mask, nbits);
    bits = _mm512_and_si512(mask, bits);
  }
};
1403 | | |
// First stage of the bit-packing pipeline: 32 x 16-bit lanes of
// (bit pattern, length); merged pairwise into Bits32.
struct Bits16 {
  __m512i nbits;
  __m512i bits;

  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
    return Bits16{nbits.vec, bits.vec};
  }

  // Concatenates each 32-bit lane's two 16-bit halves: the high half's bits
  // are appended after the low half's nbits, lengths are summed.
  Bits32 Merge() const {
    auto nbits_hi16 = _mm512_srli_epi32(nbits, 16);
    auto nbits_lo16 = _mm512_and_si512(nbits, _mm512_set1_epi32(0xFFFF));
    auto bits_hi16 = _mm512_srli_epi32(bits, 16);
    auto bits_lo16 = _mm512_and_si512(bits, _mm512_set1_epi32(0xFFFF));

    auto nbits32 = _mm512_add_epi32(nbits_hi16, nbits_lo16);
    auto bits32 =
        _mm512_or_si512(_mm512_sllv_epi32(bits_hi16, nbits_lo16), bits_lo16);
    return Bits32{nbits32, bits32};
  }

  // Appends this value's bits after `low`'s bits in every lane.
  void Interleave(const Bits16& low) {
    bits = _mm512_or_si512(_mm512_sllv_epi16(bits, low.nbits), low.bits);
    nbits = _mm512_add_epi16(nbits, low.nbits);
  }

  // Zeroes all lanes with index >= n (keep only the first n lanes).
  void ClipTo(size_t n) {
    n = std::min<size_t>(n, 32);
    constexpr uint16_t kMask[64] = {
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0,      0,      0,      0,      0,      0,      0,      0,
        0,      0,      0,      0,      0,      0,      0,      0,
        0,      0,      0,      0,      0,      0,      0,      0,
        0,      0,      0,      0,      0,      0,      0,      0,
    };
    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 32 - n));
    nbits = _mm512_and_si512(mask, nbits);
    bits = _mm512_and_si512(mask, bits);
  }
  // Zeroes the first n lanes (keep only lanes with index >= n).
  void Skip(size_t n) {
    n = std::min<size_t>(n, 32);
    constexpr uint16_t kMask[64] = {
        0,      0,      0,      0,      0,      0,      0,      0,
        0,      0,      0,      0,      0,      0,      0,      0,
        0,      0,      0,      0,      0,      0,      0,      0,
        0,      0,      0,      0,      0,      0,      0,      0,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
    };
    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 32 - n));
    nbits = _mm512_and_si512(mask, nbits);
    bits = _mm512_and_si512(mask, bits);
  }
};
1462 | | |
1463 | | #endif |
1464 | | |
1465 | | #ifdef FJXL_AVX2 |
1466 | | #define FJXL_GENERIC_SIMD |
1467 | | |
1468 | | struct SIMDVec32; |
1469 | | |
// Per-lane boolean mask for 8 x 32-bit lanes (AVX2: all-ones / all-zeros
// lanes rather than a predicate register).
struct Mask32 {
  __m256i mask;
  // Per-lane select: if_true where the lane is all-ones.
  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
  // Number of consecutive set lanes starting from lane 0: movemask extracts
  // the 8 lane sign bits; inverting inside 64 bits guarantees CtzNonZero
  // sees a nonzero value even when all 8 lanes are set.
  size_t CountPrefix() const {
    return CtzNonZero(~static_cast<uint64_t>(
        static_cast<uint8_t>(_mm256_movemask_ps(_mm256_castsi256_ps(mask)))));
  }
};
1478 | | |
// 8-lane vector of 32-bit values (AVX2 backend of the generic SIMD layer).
struct SIMDVec32 {
  __m256i vec;

  static constexpr size_t kLanes = 8;

  // Unaligned load of 8 uint32 values.
  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
    return SIMDVec32{_mm256_loadu_si256((__m256i*)data)};
  }
  // Unaligned store of all 8 lanes.
  FJXL_INLINE void Store(uint32_t* data) {
    _mm256_storeu_si256((__m256i*)data, vec);
  }
  // Broadcasts a single value to all lanes.
  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
    return SIMDVec32{_mm256_set1_epi32(v)};
  }
  // Per-lane bit width (index of highest set bit + 1; 0 for 0), computed by
  // converting to float and reading the biased exponent field (bits 23..30),
  // then subtracting 126 and clamping at 0.
  // NOTE(review): float conversion rounds; for values just below a power of
  // two with more than 24 significant bits the exponent could round up —
  // presumably callers keep inputs small enough. Confirm.
  FJXL_INLINE SIMDVec32 ValToToken() const {
    auto f32 = _mm256_castps_si256(_mm256_cvtepi32_ps(vec));
    return SIMDVec32{_mm256_max_epi32(
        _mm256_setzero_si256(),
        _mm256_sub_epi32(_mm256_srli_epi32(f32, 23), _mm256_set1_epi32(126)))};
  }
  // Unsigned saturating subtraction: max(vec - to_subtract, 0). AVX2 has no
  // 32-bit subs, so it is synthesized as max(vec, b) - b.
  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
    return SIMDVec32{_mm256_sub_epi32(_mm256_max_epu32(vec, to_subtract.vec),
                                      to_subtract.vec)};
  }
  // Wrapping per-lane subtraction.
  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
    return SIMDVec32{_mm256_sub_epi32(vec, to_subtract.vec)};
  }
  // Wrapping per-lane addition.
  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
    return SIMDVec32{_mm256_add_epi32(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
    return SIMDVec32{_mm256_xor_si256(vec, oth.vec)};
  }
  // Per-lane 1 << vec (variable shift of the constant 1).
  FJXL_INLINE SIMDVec32 Pow2() const {
    return SIMDVec32{_mm256_sllv_epi32(_mm256_set1_epi32(1), vec)};
  }
  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
    return Mask32{_mm256_cmpeq_epi32(vec, oth.vec)};
  }
  // NOTE: cmpgt_epi32 is a *signed* comparison of the lane bit patterns.
  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
    return Mask32{_mm256_cmpgt_epi32(vec, oth.vec)};
  }
  // Arithmetic (sign-extending) right shift by compile-time constant i.
  template <size_t i>
  FJXL_INLINE SIMDVec32 SignedShiftRight() const {
    return SIMDVec32{_mm256_srai_epi32(vec, i)};
  }
};
1526 | | |
1527 | | struct SIMDVec16; |
1528 | | |
1529 | | struct Mask16 { |
1530 | | __m256i mask; |
1531 | | SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false); |
1532 | 0 | Mask16 And(const Mask16& oth) const { |
1533 | 0 | return Mask16{_mm256_and_si256(mask, oth.mask)}; |
1534 | 0 | } |
1535 | 0 | size_t CountPrefix() const { |
1536 | 0 | return CtzNonZero(~static_cast<uint64_t>( |
1537 | 0 | static_cast<uint32_t>(_mm256_movemask_epi8(mask)))) / |
1538 | 0 | 2; |
1539 | 0 | } |
1540 | | }; |
1541 | | |
// 16-lane vector of 16-bit values (AVX2 backend of the generic SIMD layer).
// Also hosts the pixel loaders that deinterleave G/GA/RGB/RGBA rows into
// per-channel 16-bit vectors.
struct SIMDVec16 {
  __m256i vec;

  static constexpr size_t kLanes = 16;

  // Unaligned load of 16 uint16 values.
  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
    return SIMDVec16{_mm256_loadu_si256((__m256i*)data)};
  }
  // Unaligned store of all 16 lanes.
  FJXL_INLINE void Store(uint16_t* data) {
    _mm256_storeu_si256((__m256i*)data, vec);
  }
  // Broadcasts a single value to all lanes.
  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
    return SIMDVec16{_mm256_set1_epi16(v)};
  }
  // Narrows two 32-bit vectors into one 16-bit vector, `lo` in the lower
  // lanes. packus works within 128-bit halves, so a 64-bit permute restores
  // sequential lane order. NOTE: packus saturates at 0xFFFF — assumes the
  // inputs already fit in 16 bits.
  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
                                         const SIMDVec32& hi) {
    auto tmp = _mm256_packus_epi32(lo.vec, hi.vec);
    return SIMDVec16{_mm256_permute4x64_epi64(tmp, 0b11011000)};
  }

  // Per-lane bit width (highest set bit + 1; 0 for 0). Each of the four
  // nibbles is looked up in its own table of "bit width if this is the top
  // non-zero nibble"; the maximum over the four candidates is the answer.
  // The 0xFF00 OR makes the high byte of each lane an invalid pshufb index
  // (top bit set), which forces those bytes to 0 in the lookup result.
  FJXL_INLINE SIMDVec16 ValToToken() const {
    auto nibble0 =
        _mm256_or_si256(_mm256_and_si256(vec, _mm256_set1_epi16(0xF)),
                        _mm256_set1_epi16(0xFF00));
    auto nibble1 = _mm256_or_si256(
        _mm256_and_si256(_mm256_srli_epi16(vec, 4), _mm256_set1_epi16(0xF)),
        _mm256_set1_epi16(0xFF00));
    auto nibble2 = _mm256_or_si256(
        _mm256_and_si256(_mm256_srli_epi16(vec, 8), _mm256_set1_epi16(0xF)),
        _mm256_set1_epi16(0xFF00));
    auto nibble3 =
        _mm256_or_si256(_mm256_srli_epi16(vec, 12), _mm256_set1_epi16(0xFF00));

    // Table k maps nibble value v (at nibble position k) to the bit width
    // contributed if v is nonzero: 4*k + bitwidth(v).
    auto lut0 = _mm256_broadcastsi128_si256(
        _mm_setr_epi8(0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4));
    auto lut1 = _mm256_broadcastsi128_si256(
        _mm_setr_epi8(0, 5, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8));
    auto lut2 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
        0, 9, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12));
    auto lut3 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
        0, 13, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16));

    auto token0 = _mm256_shuffle_epi8(lut0, nibble0);
    auto token1 = _mm256_shuffle_epi8(lut1, nibble1);
    auto token2 = _mm256_shuffle_epi8(lut2, nibble2);
    auto token3 = _mm256_shuffle_epi8(lut3, nibble3);

    // The topmost nonzero nibble wins.
    auto token = _mm256_max_epi16(_mm256_max_epi16(token0, token1),
                                  _mm256_max_epi16(token2, token3));
    return SIMDVec16{token};
  }

  // Unsigned saturating subtraction: max(vec - to_subtract, 0).
  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
    return SIMDVec16{_mm256_subs_epu16(vec, to_subtract.vec)};
  }
  // Wrapping per-lane subtraction.
  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
    return SIMDVec16{_mm256_sub_epi16(vec, to_subtract.vec)};
  }
  // Wrapping per-lane addition.
  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
    return SIMDVec16{_mm256_add_epi16(vec, oth.vec)};
  }
  // Unsigned per-lane minimum.
  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
    return SIMDVec16{_mm256_min_epu16(vec, oth.vec)};
  }
  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
    return Mask16{_mm256_cmpeq_epi16(vec, oth.vec)};
  }
  // NOTE: cmpgt_epi16 is a *signed* comparison of the lane bit patterns.
  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
    return Mask16{_mm256_cmpgt_epi16(vec, oth.vec)};
  }
  // Per-lane 1 << vec. AVX2 lacks a 16-bit variable shift, so the power of
  // two is assembled from two byte-level pshufb lookups (low byte covers
  // shifts 0..7, high byte shifts 8..15) and OR-ed together.
  FJXL_INLINE SIMDVec16 Pow2() const {
    auto pow2_lo_lut = _mm256_broadcastsi128_si256(
        _mm_setr_epi8(1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6,
                      1u << 7, 0, 0, 0, 0, 0, 0, 0, 0));
    auto pow2_hi_lut = _mm256_broadcastsi128_si256(
        _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1 << 0, 1 << 1, 1 << 2, 1 << 3,
                      1 << 4, 1 << 5, 1 << 6, 1u << 7));

    // Make the high byte of every lane an invalid pshufb index so it
    // contributes 0.
    auto masked = _mm256_or_si256(vec, _mm256_set1_epi16(0xFF00));

    auto pow2_lo = _mm256_shuffle_epi8(pow2_lo_lut, masked);
    auto pow2_hi = _mm256_shuffle_epi8(pow2_hi_lut, masked);

    auto pow2 = _mm256_or_si256(_mm256_slli_epi16(pow2_hi, 8), pow2_lo);
    return SIMDVec16{pow2};
  }
  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
    return SIMDVec16{_mm256_or_si256(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
    return SIMDVec16{_mm256_xor_si256(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
    return SIMDVec16{_mm256_and_si256(vec, oth.vec)};
  }
  // (vec + oth) >> 1 with an arithmetic shift.
  // NOTE(review): the 16-bit add can wrap before the shift — the NEON path
  // uses a non-overflowing halving add, so inputs here are presumably
  // bounded; confirm callers.
  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
    return SIMDVec16{_mm256_srai_epi16(_mm256_add_epi16(vec, oth.vec), 1)};
  }
  // Sets the high byte of every lane to 0xFF so that a subsequent U8Lookup
  // (pshufb) maps those bytes to zero (top bit of index set).
  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
    return SIMDVec16{_mm256_or_si256(vec, _mm256_set1_epi16(0xFF00))};
  }
  // Byte-level table lookup of a 16-entry table; expects indices produced
  // via PrepareForU8Lookup so each lane's high byte yields 0.
  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
    return SIMDVec16{_mm256_shuffle_epi8(
        _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)table)), vec)};
  }
  // Element-wise interleave with `low` in the even positions; the 128-bit
  // permutes fix up unpack's within-half operation so the result is in
  // sequential order (first return value = lower half).
  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
    auto v02 = _mm256_unpacklo_epi16(low.vec, vec);
    auto v13 = _mm256_unpackhi_epi16(low.vec, vec);
    return {SIMDVec16{_mm256_permute2x128_si256(v02, v13, 0x20)},
            SIMDVec16{_mm256_permute2x128_si256(v02, v13, 0x31)}};
  }
  // Zero-extends the 16 lanes into two 32-bit vectors (low lanes first).
  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
    auto v02 = _mm256_unpacklo_epi16(vec, _mm256_setzero_si256());
    auto v13 = _mm256_unpackhi_epi16(vec, _mm256_setzero_si256());
    return {SIMDVec32{_mm256_permute2x128_si256(v02, v13, 0x20)},
            SIMDVec32{_mm256_permute2x128_si256(v02, v13, 0x31)}};
  }
  // Arithmetic (sign-extending) right shift by compile-time constant i.
  template <size_t i>
  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
    return SIMDVec16{_mm256_srai_epi16(vec, i)};
  }

  // Loads 16 8-bit grayscale samples, zero-extended to 16 bits.
  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
    __m128i bytes = _mm_loadu_si128((__m128i*)data);
    return {SIMDVec16{_mm256_cvtepu8_epi16(bytes)}};
  }
  // Loads 16 native-endian 16-bit grayscale samples.
  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
    return {Load((const uint16_t*)data)};
  }

  // Loads 16 interleaved 8-bit gray+alpha pixels; splits the channels by
  // masking/shifting each 16-bit (G,A) pair.
  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
    __m256i bytes = _mm256_loadu_si256((__m256i*)data);
    __m256i gray = _mm256_and_si256(bytes, _mm256_set1_epi16(0xFF));
    __m256i alpha = _mm256_srli_epi16(bytes, 8);
    return {SIMDVec16{gray}, SIMDVec16{alpha}};
  }
  // Loads 16 interleaved 16-bit gray+alpha pixels; packs even (gray) and
  // odd (alpha) 16-bit elements into separate vectors, with a permute to
  // undo packus's within-half ordering.
  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
    __m256i bytes1 = _mm256_loadu_si256((__m256i*)data);
    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 32));
    __m256i g_mask = _mm256_set1_epi32(0xFFFF);
    __m256i g = _mm256_permute4x64_epi64(
        _mm256_packus_epi32(_mm256_and_si256(bytes1, g_mask),
                            _mm256_and_si256(bytes2, g_mask)),
        0b11011000);
    __m256i a = _mm256_permute4x64_epi64(
        _mm256_packus_epi32(_mm256_srli_epi32(bytes1, 16),
                            _mm256_srli_epi32(bytes2, 16)),
        0b11011000);
    return {SIMDVec16{g}, SIMDVec16{a}};
  }

  // Loads 16 interleaved 8-bit RGB pixels (48 bytes) and deinterleaves the
  // three channels via a stride-3 shuffle per 16-byte chunk followed by
  // blends that gather each channel's pieces, then rotations (alignr) that
  // put the pieces in pixel order.
  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
    __m128i bytes0 = _mm_loadu_si128((__m128i*)data);
    __m128i bytes1 = _mm_loadu_si128((__m128i*)(data + 16));
    __m128i bytes2 = _mm_loadu_si128((__m128i*)(data + 32));

    // Gathers every 3rd byte: 6 of one channel, then 5 and 5 of the others.
    __m128i idx =
        _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);

    __m128i r6b5g5_0 = _mm_shuffle_epi8(bytes0, idx);
    __m128i g6r5b5_1 = _mm_shuffle_epi8(bytes1, idx);
    __m128i b6g5r5_2 = _mm_shuffle_epi8(bytes2, idx);

    __m128i mask010 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF,
                                    0xFF, 0, 0, 0, 0, 0);
    __m128i mask001 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF,
                                    0xFF, 0xFF, 0xFF);

    __m128i b2g2b1 = _mm_blendv_epi8(b6g5r5_2, g6r5b5_1, mask001);
    __m128i b2b0b1 = _mm_blendv_epi8(b2g2b1, r6b5g5_0, mask010);

    __m128i r0r1b1 = _mm_blendv_epi8(r6b5g5_0, g6r5b5_1, mask010);
    __m128i r0r1r2 = _mm_blendv_epi8(r0r1b1, b6g5r5_2, mask001);

    __m128i g1r1g0 = _mm_blendv_epi8(g6r5b5_1, r6b5g5_0, mask001);
    __m128i g1g2g0 = _mm_blendv_epi8(g1r1g0, b6g5r5_2, mask010);

    // Rotate the gathered pieces into pixel order.
    __m128i g0g1g2 = _mm_alignr_epi8(g1g2g0, g1g2g0, 11);
    __m128i b0b1b2 = _mm_alignr_epi8(b2b0b1, b2b0b1, 6);

    return {SIMDVec16{_mm256_cvtepu8_epi16(r0r1r2)},
            SIMDVec16{_mm256_cvtepu8_epi16(g0g1g2)},
            SIMDVec16{_mm256_cvtepu8_epi16(b0b1b2)}};
  }
  // Loads 16 interleaved 16-bit RGB pixels (96 bytes): splits each 32-byte
  // chunk into low/high byte planes, runs the same channel deinterleave as
  // LoadRGB8 on both planes at once, then recombines low and high bytes
  // into 16-bit channel values.
  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
    auto load_and_split_lohi = [](const unsigned char* data) {
      // LHLHLH...
      __m256i bytes = _mm256_loadu_si256((__m256i*)data);
      // L0L0L0...
      __m256i lo = _mm256_and_si256(bytes, _mm256_set1_epi16(0xFF));
      // H0H0H0...
      __m256i hi = _mm256_srli_epi16(bytes, 8);
      // LLLLLLLLHHHHHHHHLLLLLLLLHHHHHHHH
      __m256i packed = _mm256_packus_epi16(lo, hi);
      return _mm256_permute4x64_epi64(packed, 0b11011000);
    };
    __m256i bytes0 = load_and_split_lohi(data);
    __m256i bytes1 = load_and_split_lohi(data + 32);
    __m256i bytes2 = load_and_split_lohi(data + 64);

    __m256i idx = _mm256_broadcastsi128_si256(
        _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13));

    __m256i r6b5g5_0 = _mm256_shuffle_epi8(bytes0, idx);
    __m256i g6r5b5_1 = _mm256_shuffle_epi8(bytes1, idx);
    __m256i b6g5r5_2 = _mm256_shuffle_epi8(bytes2, idx);

    __m256i mask010 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
        0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0));
    __m256i mask001 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF));

    __m256i b2g2b1 = _mm256_blendv_epi8(b6g5r5_2, g6r5b5_1, mask001);
    __m256i b2b0b1 = _mm256_blendv_epi8(b2g2b1, r6b5g5_0, mask010);

    __m256i r0r1b1 = _mm256_blendv_epi8(r6b5g5_0, g6r5b5_1, mask010);
    __m256i r0r1r2 = _mm256_blendv_epi8(r0r1b1, b6g5r5_2, mask001);

    __m256i g1r1g0 = _mm256_blendv_epi8(g6r5b5_1, r6b5g5_0, mask001);
    __m256i g1g2g0 = _mm256_blendv_epi8(g1r1g0, b6g5r5_2, mask010);

    __m256i g0g1g2 = _mm256_alignr_epi8(g1g2g0, g1g2g0, 11);
    __m256i b0b1b2 = _mm256_alignr_epi8(b2b0b1, b2b0b1, 6);

    // Now r0r1r2, g0g1g2, b0b1b2 have the low bytes of the RGB pixels in their
    // lower half, and the high bytes in their upper half.

    auto combine_low_hi = [](__m256i v) {
      __m128i low = _mm256_extracti128_si256(v, 0);
      __m128i hi = _mm256_extracti128_si256(v, 1);
      __m256i low16 = _mm256_cvtepu8_epi16(low);
      __m256i hi16 = _mm256_cvtepu8_epi16(hi);
      return _mm256_or_si256(_mm256_slli_epi16(hi16, 8), low16);
    };

    return {SIMDVec16{combine_low_hi(r0r1r2)},
            SIMDVec16{combine_low_hi(g0g1g2)},
            SIMDVec16{combine_low_hi(b0b1b2)}};
  }

  // Loads 16 interleaved 8-bit RGBA pixels (64 bytes): first separates the
  // (R,G) and (B,A) 16-bit halves of each 32-bit pixel, then splits each
  // into its two byte channels.
  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
    __m256i bytes1 = _mm256_loadu_si256((__m256i*)data);
    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 32));
    __m256i rg_mask = _mm256_set1_epi32(0xFFFF);
    __m256i rg = _mm256_permute4x64_epi64(
        _mm256_packus_epi32(_mm256_and_si256(bytes1, rg_mask),
                            _mm256_and_si256(bytes2, rg_mask)),
        0b11011000);
    __m256i b_a = _mm256_permute4x64_epi64(
        _mm256_packus_epi32(_mm256_srli_epi32(bytes1, 16),
                            _mm256_srli_epi32(bytes2, 16)),
        0b11011000);
    __m256i r = _mm256_and_si256(rg, _mm256_set1_epi16(0xFF));
    __m256i g = _mm256_srli_epi16(rg, 8);
    __m256i b = _mm256_and_si256(b_a, _mm256_set1_epi16(0xFF));
    __m256i a = _mm256_srli_epi16(b_a, 8);
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
  }
  // Loads 16 interleaved 16-bit RGBA pixels (128 bytes) with two rounds of
  // even/odd 32-bit packing: first to (R,B)/(G,A) pairs, then to single
  // channels.
  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
    __m256i bytes0 = _mm256_loadu_si256((__m256i*)data);
    __m256i bytes1 = _mm256_loadu_si256((__m256i*)(data + 32));
    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 64));
    __m256i bytes3 = _mm256_loadu_si256((__m256i*)(data + 96));

    auto pack32 = [](__m256i a, __m256i b) {
      return _mm256_permute4x64_epi64(_mm256_packus_epi32(a, b), 0b11011000);
    };
    auto packlow32 = [&pack32](__m256i a, __m256i b) {
      __m256i mask = _mm256_set1_epi32(0xFFFF);
      return pack32(_mm256_and_si256(a, mask), _mm256_and_si256(b, mask));
    };
    auto packhi32 = [&pack32](__m256i a, __m256i b) {
      return pack32(_mm256_srli_epi32(a, 16), _mm256_srli_epi32(b, 16));
    };

    __m256i rb0 = packlow32(bytes0, bytes1);
    __m256i rb1 = packlow32(bytes2, bytes3);
    __m256i ga0 = packhi32(bytes0, bytes1);
    __m256i ga1 = packhi32(bytes2, bytes3);

    __m256i r = packlow32(rb0, rb1);
    __m256i g = packlow32(ga0, ga1);
    __m256i b = packhi32(rb0, rb1);
    __m256i a = packhi32(ga0, ga1);
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
  }

  // Byte-swaps each 16-bit lane (big-endian <-> little-endian samples).
  void SwapEndian() {
    auto indices = _mm256_broadcastsi128_si256(
        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
    vec = _mm256_shuffle_epi8(vec, indices);
  }
};
1835 | | |
1836 | | SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true, |
1837 | 0 | const SIMDVec16& if_false) { |
1838 | 0 | return SIMDVec16{_mm256_blendv_epi8(if_false.vec, if_true.vec, mask)}; |
1839 | 0 | } |
1840 | | |
1841 | | SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true, |
1842 | 0 | const SIMDVec32& if_false) { |
1843 | 0 | return SIMDVec32{_mm256_blendv_epi8(if_false.vec, if_true.vec, mask)}; |
1844 | 0 | } |
1845 | | |
1846 | | struct Bits64 { |
1847 | | static constexpr size_t kLanes = 4; |
1848 | | |
1849 | | __m256i nbits; |
1850 | | __m256i bits; |
1851 | | |
1852 | 0 | FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) { |
1853 | 0 | _mm256_storeu_si256((__m256i*)nbits_out, nbits); |
1854 | 0 | _mm256_storeu_si256((__m256i*)bits_out, bits); |
1855 | 0 | } |
1856 | | }; |
1857 | | |
// Eight 32-bit (bit-count, bit-pattern) pairs (AVX2).
struct Bits32 {
  __m256i nbits;
  __m256i bits;

  // Wraps raw (nbits, bits) vectors without modification.
  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
    return Bits32{nbits.vec, bits.vec};
  }

  // Combines each pair of adjacent 32-bit lanes into one 64-bit lane: the
  // high lane's bits are shifted above the low lane's bits and the counts
  // are summed.
  Bits64 Merge() const {
    auto nbits_hi32 = _mm256_srli_epi64(nbits, 32);
    auto nbits_lo32 = _mm256_and_si256(nbits, _mm256_set1_epi64x(0xFFFFFFFF));
    auto bits_hi32 = _mm256_srli_epi64(bits, 32);
    auto bits_lo32 = _mm256_and_si256(bits, _mm256_set1_epi64x(0xFFFFFFFF));

    auto nbits64 = _mm256_add_epi64(nbits_hi32, nbits_lo32);
    auto bits64 =
        _mm256_or_si256(_mm256_sllv_epi64(bits_hi32, nbits_lo32), bits_lo32);
    return Bits64{nbits64, bits64};
  }

  // Appends `low`'s bit sequence below the bits already stored per lane and
  // accumulates the counts.
  void Interleave(const Bits32& low) {
    bits = _mm256_or_si256(_mm256_sllv_epi32(bits, low.nbits), low.bits);
    nbits = _mm256_add_epi32(nbits, low.nbits);
  }

  // Keeps only the first n lanes; the mask is a sliding 8-lane window over
  // an all-ones/all-zeros boundary, positioned so exactly n lanes survive.
  void ClipTo(size_t n) {
    n = std::min<size_t>(n, 8);
    constexpr uint32_t kMask[16] = {
        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0, 0, 0, 0, 0,
    };
    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 8 - n));
    nbits = _mm256_and_si256(mask, nbits);
    bits = _mm256_and_si256(mask, bits);
  }
  // Zeroes the first n lanes (complement of ClipTo).
  void Skip(size_t n) {
    n = std::min<size_t>(n, 8);
    constexpr uint32_t kMask[16] = {
        0, 0, 0, 0, 0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
    };
    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 8 - n));
    nbits = _mm256_and_si256(mask, nbits);
    bits = _mm256_and_si256(mask, bits);
  }
};
1902 | | |
// Sixteen 16-bit (bit-count, bit-pattern) pairs (AVX2).
struct Bits16 {
  __m256i nbits;
  __m256i bits;

  // Wraps raw (nbits, bits) vectors without modification.
  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
    return Bits16{nbits.vec, bits.vec};
  }

  // Combines each pair of adjacent 16-bit lanes into one 32-bit lane: the
  // high lane's bits go above the low lane's bits; counts are summed.
  Bits32 Merge() const {
    auto nbits_hi16 = _mm256_srli_epi32(nbits, 16);
    auto nbits_lo16 = _mm256_and_si256(nbits, _mm256_set1_epi32(0xFFFF));
    auto bits_hi16 = _mm256_srli_epi32(bits, 16);
    auto bits_lo16 = _mm256_and_si256(bits, _mm256_set1_epi32(0xFFFF));

    auto nbits32 = _mm256_add_epi32(nbits_hi16, nbits_lo16);
    auto bits32 =
        _mm256_or_si256(_mm256_sllv_epi32(bits_hi16, nbits_lo16), bits_lo16);
    return Bits32{nbits32, bits32};
  }

  // Appends `low`'s bits below this lane's bits. AVX2 has no per-lane
  // variable 16-bit shift, so the shift is done as a multiply by
  // 1 << low.nbits, looked up via pshufb from a power-of-two table.
  // NOTE(review): the LUT only covers shifts 0..7 (indices 8..15 map to 0),
  // so low.nbits is presumably <= 7 at this stage — confirm callers.
  void Interleave(const Bits16& low) {
    auto pow2_lo_lut = _mm256_broadcastsi128_si256(
        _mm_setr_epi8(1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6,
                      1u << 7, 0, 0, 0, 0, 0, 0, 0, 0));
    // High byte of each lane becomes an invalid pshufb index (-> 0).
    auto low_nbits_masked =
        _mm256_or_si256(low.nbits, _mm256_set1_epi16(0xFF00));

    auto bits_shifted = _mm256_mullo_epi16(
        bits, _mm256_shuffle_epi8(pow2_lo_lut, low_nbits_masked));

    nbits = _mm256_add_epi16(nbits, low.nbits);
    bits = _mm256_or_si256(bits_shifted, low.bits);
  }

  // Keeps only the first n lanes via a sliding 16-lane window over the
  // 0xFFFF/0 boundary of the table below.
  void ClipTo(size_t n) {
    n = std::min<size_t>(n, 16);
    constexpr uint16_t kMask[32] = {
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
    };
    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 16 - n));
    nbits = _mm256_and_si256(mask, nbits);
    bits = _mm256_and_si256(mask, bits);
  }

  // Zeroes the first n lanes (complement of ClipTo).
  void Skip(size_t n) {
    n = std::min<size_t>(n, 16);
    constexpr uint16_t kMask[32] = {
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
    };
    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 16 - n));
    nbits = _mm256_and_si256(mask, nbits);
    bits = _mm256_and_si256(mask, bits);
  }
};
1963 | | |
1964 | | #endif |
1965 | | |
1966 | | #ifdef FJXL_NEON |
1967 | | #define FJXL_GENERIC_SIMD |
1968 | | |
1969 | | struct SIMDVec32; |
1970 | | |
// Per-lane boolean mask over 4 32-bit NEON lanes; lanes are all-ones
// (true) or all-zeros (false), as produced by the comparison intrinsics.
struct Mask32 {
  uint32x4_t mask;
  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
  // Lane-wise logical AND of two masks.
  Mask32 And(const Mask32& oth) const {
    return Mask32{vandq_u32(mask, oth.mask)};
  }
  // Returns the number of consecutive true lanes starting at lane 0.
  size_t CountPrefix() const {
    // Map each false lane to its own index and each true lane to 4
    // (= kLanes); the horizontal minimum is then the index of the first
    // false lane, or 4 when every lane is true.
    uint32_t lane_index[4] = {0, 1, 2, 3};
    uint32_t all_true[4] = {4, 4, 4, 4};
    uint32x4_t sel =
        vbslq_u32(mask, vld1q_u32(all_true), vld1q_u32(lane_index));
    return vminvq_u32(sel);
  }
};
1984 | | |
// 4-lane vector of 32-bit values (NEON backend of the generic SIMD layer).
struct SIMDVec32 {
  uint32x4_t vec;

  static constexpr size_t kLanes = 4;

  // Unaligned load of 4 uint32 values.
  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
    return SIMDVec32{vld1q_u32(data)};
  }
  // Stores all 4 lanes.
  FJXL_INLINE void Store(uint32_t* data) { vst1q_u32(data, vec); }
  // Broadcasts a single value to all lanes.
  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
    return SIMDVec32{vdupq_n_u32(v)};
  }
  // Per-lane bit width: 32 - clz(v) (highest set bit + 1; 0 for 0).
  FJXL_INLINE SIMDVec32 ValToToken() const {
    return SIMDVec32{vsubq_u32(vdupq_n_u32(32), vclzq_u32(vec))};
  }
  // Unsigned saturating subtraction: max(vec - to_subtract, 0).
  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
    return SIMDVec32{vqsubq_u32(vec, to_subtract.vec)};
  }
  // Wrapping per-lane subtraction.
  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
    return SIMDVec32{vsubq_u32(vec, to_subtract.vec)};
  }
  // Wrapping per-lane addition.
  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
    return SIMDVec32{vaddq_u32(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
    return SIMDVec32{veorq_u32(vec, oth.vec)};
  }
  // Per-lane 1 << vec (vshlq with the lane value as shift count).
  FJXL_INLINE SIMDVec32 Pow2() const {
    return SIMDVec32{vshlq_u32(vdupq_n_u32(1), vreinterpretq_s32_u32(vec))};
  }
  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
    return Mask32{vceqq_u32(vec, oth.vec)};
  }
  // NOTE: this is a *signed* comparison of the lane bit patterns.
  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
    return Mask32{
        vcgtq_s32(vreinterpretq_s32_u32(vec), vreinterpretq_s32_u32(oth.vec))};
  }
  // Arithmetic (sign-extending) right shift by compile-time constant i.
  template <size_t i>
  FJXL_INLINE SIMDVec32 SignedShiftRight() const {
    return SIMDVec32{
        vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(vec), i))};
  }
};
2028 | | |
2029 | | struct SIMDVec16; |
2030 | | |
// Per-lane boolean mask over 8 16-bit NEON lanes; lanes are all-ones
// (true) or all-zeros (false), as produced by the comparison intrinsics.
struct Mask16 {
  uint16x8_t mask;
  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
  // Lane-wise logical AND of two masks.
  Mask16 And(const Mask16& oth) const {
    return Mask16{vandq_u16(mask, oth.mask)};
  }
  // Returns the number of consecutive true lanes starting at lane 0.
  size_t CountPrefix() const {
    // Map each false lane to its own index and each true lane to 8
    // (= kLanes); the horizontal minimum is then the index of the first
    // false lane, or 8 when every lane is true.
    uint16_t lane_index[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    uint16_t all_true[8] = {8, 8, 8, 8, 8, 8, 8, 8};
    uint16x8_t sel =
        vbslq_u16(mask, vld1q_u16(all_true), vld1q_u16(lane_index));
    return vminvq_u16(sel);
  }
};
2044 | | |
// 8-lane vector of 16-bit values (NEON backend of the generic SIMD layer).
// Also hosts the pixel loaders; NEON's vld2/vld3/vld4 deinterleave the
// channels directly in the load.
struct SIMDVec16 {
  uint16x8_t vec;

  static constexpr size_t kLanes = 8;

  // Unaligned load of 8 uint16 values.
  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
    return SIMDVec16{vld1q_u16(data)};
  }
  // Stores all 8 lanes.
  FJXL_INLINE void Store(uint16_t* data) { vst1q_u16(data, vec); }
  // Broadcasts a single value to all lanes.
  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
    return SIMDVec16{vdupq_n_u16(v)};
  }
  // Narrows two 32-bit vectors into one 16-bit vector (`lo` in the lower
  // half). vmovn truncates to the low 16 bits of each lane.
  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
                                         const SIMDVec32& hi) {
    return SIMDVec16{vmovn_high_u32(vmovn_u32(lo.vec), hi.vec)};
  }

  // Per-lane bit width: 16 - clz(v) (highest set bit + 1; 0 for 0).
  FJXL_INLINE SIMDVec16 ValToToken() const {
    return SIMDVec16{vsubq_u16(vdupq_n_u16(16), vclzq_u16(vec))};
  }
  // Unsigned saturating subtraction: max(vec - to_subtract, 0).
  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
    return SIMDVec16{vqsubq_u16(vec, to_subtract.vec)};
  }
  // Wrapping per-lane subtraction.
  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
    return SIMDVec16{vsubq_u16(vec, to_subtract.vec)};
  }
  // Wrapping per-lane addition.
  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
    return SIMDVec16{vaddq_u16(vec, oth.vec)};
  }
  // Unsigned per-lane minimum.
  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
    return SIMDVec16{vminq_u16(vec, oth.vec)};
  }
  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
    return Mask16{vceqq_u16(vec, oth.vec)};
  }
  // NOTE: this is a *signed* comparison of the lane bit patterns.
  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
    return Mask16{
        vcgtq_s16(vreinterpretq_s16_u16(vec), vreinterpretq_s16_u16(oth.vec))};
  }
  // Per-lane 1 << vec.
  FJXL_INLINE SIMDVec16 Pow2() const {
    return SIMDVec16{vshlq_u16(vdupq_n_u16(1), vreinterpretq_s16_u16(vec))};
  }
  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
    return SIMDVec16{vorrq_u16(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
    return SIMDVec16{veorq_u16(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
    return SIMDVec16{vandq_u16(vec, oth.vec)};
  }
  // Halving add: (vec + oth) >> 1 computed without intermediate overflow.
  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
    return SIMDVec16{vhaddq_u16(vec, oth.vec)};
  }
  // Sets the high byte of each lane to 0xFF so U8Lookup's vqtbl1q maps
  // those bytes (out-of-range indices) to zero.
  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
    return SIMDVec16{vorrq_u16(vec, vdupq_n_u16(0xFF00))};
  }
  // Byte-level lookup in a 16-entry table; expects indices produced via
  // PrepareForU8Lookup so each lane's high byte yields 0.
  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
    uint8x16_t tbl = vld1q_u8(table);
    uint8x16_t indices = vreinterpretq_u8_u16(vec);
    return SIMDVec16{vreinterpretq_u16_u8(vqtbl1q_u8(tbl, indices))};
  }
  // Element-wise interleave with `low` in the even positions; first result
  // holds the lower half of the interleaving.
  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
    return {SIMDVec16{vzip1q_u16(low.vec, vec)},
            SIMDVec16{vzip2q_u16(low.vec, vec)}};
  }
  // Zero-extends the 8 lanes into two 32-bit vectors (low lanes first).
  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
    uint32x4_t lo = vmovl_u16(vget_low_u16(vec));
    uint32x4_t hi = vmovl_high_u16(vec);
    return {SIMDVec32{lo}, SIMDVec32{hi}};
  }
  // Arithmetic (sign-extending) right shift by compile-time constant i.
  template <size_t i>
  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
    return SIMDVec16{
        vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(vec), i))};
  }

  // Loads 8 8-bit grayscale samples, zero-extended to 16 bits.
  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
    uint8x8_t v = vld1_u8(data);
    return {SIMDVec16{vmovl_u8(v)}};
  }
  // Loads 8 native-endian 16-bit grayscale samples.
  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
    return {Load((const uint16_t*)data)};
  }

  // Loads 8 interleaved 8-bit gray+alpha pixels (vld2 deinterleaves).
  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
    uint8x8x2_t v = vld2_u8(data);
    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])}};
  }
  // Loads 8 interleaved 16-bit gray+alpha pixels.
  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
    uint16x8x2_t v = vld2q_u16((const uint16_t*)data);
    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}};
  }

  // Loads 8 interleaved 8-bit RGB pixels (vld3 deinterleaves).
  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
    uint8x8x3_t v = vld3_u8(data);
    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])},
            SIMDVec16{vmovl_u8(v.val[2])}};
  }
  // Loads 8 interleaved 16-bit RGB pixels.
  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
    uint16x8x3_t v = vld3q_u16((const uint16_t*)data);
    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}, SIMDVec16{v.val[2]}};
  }

  // Loads 8 interleaved 8-bit RGBA pixels (vld4 deinterleaves).
  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
    uint8x8x4_t v = vld4_u8(data);
    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])},
            SIMDVec16{vmovl_u8(v.val[2])}, SIMDVec16{vmovl_u8(v.val[3])}};
  }
  // Loads 8 interleaved 16-bit RGBA pixels.
  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
    uint16x8x4_t v = vld4q_u16((const uint16_t*)data);
    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}, SIMDVec16{v.val[2]},
            SIMDVec16{v.val[3]}};
  }

  // Byte-swaps each 16-bit lane (big-endian <-> little-endian samples).
  void SwapEndian() {
    vec = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(vec)));
  }
};
2164 | | |
// Lane-wise select: vbsl keeps `if_true` bits where the mask is set and
// `if_false` bits elsewhere; exact because mask lanes are uniform.
SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
                             const SIMDVec16& if_false) {
  const uint16x8_t selected = vbslq_u16(mask, if_true.vec, if_false.vec);
  return SIMDVec16{selected};
}
2169 | | |
// Lane-wise select: vbsl keeps `if_true` bits where the mask is set and
// `if_false` bits elsewhere; exact because mask lanes are uniform.
SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
                             const SIMDVec32& if_false) {
  const uint32x4_t selected = vbslq_u32(mask, if_true.vec, if_false.vec);
  return SIMDVec32{selected};
}
2174 | | |
// Two 64-bit (bit-count, bit-pattern) pairs, the final stage before the
// lanes are flushed to the bit writer (NEON).
struct Bits64 {
  static constexpr size_t kLanes = 2;

  uint64x2_t nbits;
  uint64x2_t bits;

  // Writes both lanes' bit counts and bit patterns to the output arrays.
  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
    vst1q_u64(nbits_out, nbits);
    vst1q_u64(bits_out, bits);
  }
};
2186 | | |
// Four lanes of (bit count, bit payload) pairs in 32-bit form, with helpers
// to concatenate bit sequences and widen them towards Bits64.
struct Bits32 {
  uint32x4_t nbits;
  uint32x4_t bits;

  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
    return Bits32{nbits.vec, bits.vec};
  }

  // Concatenates each pair of adjacent 32-bit lanes into one 64-bit lane:
  // the odd (high) lane's payload is shifted left by the even (low) lane's
  // bit count and OR-ed on top of it; the counts are summed.
  Bits64 Merge() const {
    // TODO(veluca): can probably be optimized.
    uint64x2_t nbits_lo32 =
        vandq_u64(vreinterpretq_u64_u32(nbits), vdupq_n_u64(0xFFFFFFFF));
    uint64x2_t bits_hi32 =
        vshlq_u64(vshrq_n_u64(vreinterpretq_u64_u32(bits), 32),
                  vreinterpretq_s64_u64(nbits_lo32));
    uint64x2_t bits_lo32 =
        vandq_u64(vreinterpretq_u64_u32(bits), vdupq_n_u64(0xFFFFFFFF));
    uint64x2_t nbits64 =
        vsraq_n_u64(nbits_lo32, vreinterpretq_u64_u32(nbits), 32);
    uint64x2_t bits64 = vorrq_u64(bits_hi32, bits_lo32);
    return Bits64{nbits64, bits64};
  }

  // Prepends `low` lane-wise: shifts our payload left by low.nbits, ORs in
  // low.bits, and accumulates the bit counts.
  void Interleave(const Bits32& low) {
    bits =
        vorrq_u32(vshlq_u32(bits, vreinterpretq_s32_u32(low.nbits)), low.bits);
    nbits = vaddq_u32(nbits, low.nbits);
  }

  // Zeroes out every lane at index >= n (keeps the first n lanes).
  void ClipTo(size_t n) {
    n = std::min<size_t>(n, 4);
    constexpr uint32_t kMask[8] = {
        ~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0,
    };
    uint32x4_t mask = vld1q_u32(kMask + 4 - n);
    nbits = vandq_u32(mask, nbits);
    bits = vandq_u32(mask, bits);
  }
  // Zeroes out the first n lanes (keeps the rest).
  void Skip(size_t n) {
    n = std::min<size_t>(n, 4);
    constexpr uint32_t kMask[8] = {
        0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u,
    };
    uint32x4_t mask = vld1q_u32(kMask + 4 - n);
    nbits = vandq_u32(mask, nbits);
    bits = vandq_u32(mask, bits);
  }
};
2235 | | |
// Eight lanes of (bit count, bit payload) pairs in 16-bit form; the 16-bit
// analogue of Bits32.
struct Bits16 {
  uint16x8_t nbits;
  uint16x8_t bits;

  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
    return Bits16{nbits.vec, bits.vec};
  }

  // Concatenates each pair of adjacent 16-bit lanes into one 32-bit lane:
  // the odd (high) lane's payload is shifted left by the even (low) lane's
  // bit count and OR-ed on top of it; the counts are summed.
  Bits32 Merge() const {
    // TODO(veluca): can probably be optimized.
    uint32x4_t nbits_lo16 =
        vandq_u32(vreinterpretq_u32_u16(nbits), vdupq_n_u32(0xFFFF));
    uint32x4_t bits_hi16 =
        vshlq_u32(vshrq_n_u32(vreinterpretq_u32_u16(bits), 16),
                  vreinterpretq_s32_u32(nbits_lo16));
    uint32x4_t bits_lo16 =
        vandq_u32(vreinterpretq_u32_u16(bits), vdupq_n_u32(0xFFFF));
    uint32x4_t nbits32 =
        vsraq_n_u32(nbits_lo16, vreinterpretq_u32_u16(nbits), 16);
    uint32x4_t bits32 = vorrq_u32(bits_hi16, bits_lo16);
    return Bits32{nbits32, bits32};
  }

  // Prepends `low` lane-wise: shifts our payload left by low.nbits, ORs in
  // low.bits, and accumulates the bit counts.
  void Interleave(const Bits16& low) {
    bits =
        vorrq_u16(vshlq_u16(bits, vreinterpretq_s16_u16(low.nbits)), low.bits);
    nbits = vaddq_u16(nbits, low.nbits);
  }

  // Zeroes out every lane at index >= n (keeps the first n lanes).
  void ClipTo(size_t n) {
    n = std::min<size_t>(n, 8);
    constexpr uint16_t kMask[16] = {
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0,      0,      0,      0,      0,      0,      0,      0,
    };
    uint16x8_t mask = vld1q_u16(kMask + 8 - n);
    nbits = vandq_u16(mask, nbits);
    bits = vandq_u16(mask, bits);
  }
  // Zeroes out the first n lanes (keeps the rest).
  void Skip(size_t n) {
    n = std::min<size_t>(n, 8);
    constexpr uint16_t kMask[16] = {
        0,      0,      0,      0,      0,      0,      0,      0,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
    };
    uint16x8_t mask = vld1q_u16(kMask + 8 - n);
    nbits = vandq_u16(mask, nbits);
    bits = vandq_u16(mask, bits);
  }
};
2286 | | |
2287 | | #endif |
2288 | | |
2289 | | #ifdef FJXL_GENERIC_SIMD |
// Out-of-class definitions for the static constexpr members, required for
// ODR-use under pre-C++17 semantics.
constexpr size_t SIMDVec32::kLanes;
constexpr size_t SIMDVec16::kLanes;
2292 | | |
// Each of these functions will process SIMDVec16::kLanes worth of values.
2294 | | |
// Computes the hybrid-uint decomposition (token, nbits, bits) of
// SIMDVec16::kLanes 16-bit residuals at once. Mirrors the scalar
// EncodeHybridUint000: token = 1 + floor(log2(res)) (0 for res == 0),
// nbits = token - 1 (saturating, so 0 stays 0), and bits = res minus the
// implicit leading one (saturating).
FJXL_INLINE void TokenizeSIMD(const uint16_t* residuals, uint16_t* token_out,
                              uint16_t* nbits_out, uint16_t* bits_out) {
  SIMDVec16 res = SIMDVec16::Load(residuals);
  SIMDVec16 token = res.ValToToken();
  SIMDVec16 nbits = token.SatSubU(SIMDVec16::Val(1));
  SIMDVec16 bits = res.SatSubU(nbits.Pow2());
  token.Store(token_out);
  nbits.Store(nbits_out);
  bits.Store(bits_out);
}
2305 | | |
// 32-bit variant of TokenizeSIMD: processes two 32-bit vectors of residuals
// so that the resulting tokens exactly fill one 16-bit vector; nbits/bits
// stay in 32-bit form.
FJXL_INLINE void TokenizeSIMD(const uint32_t* residuals, uint16_t* token_out,
                              uint32_t* nbits_out, uint32_t* bits_out) {
  static_assert(SIMDVec16::kLanes == 2 * SIMDVec32::kLanes,
                "There should be twice more 16-bit lanes than 32-bit lanes");
  SIMDVec32 res_lo = SIMDVec32::Load(residuals);
  SIMDVec32 res_hi = SIMDVec32::Load(residuals + SIMDVec32::kLanes);
  SIMDVec32 token_lo = res_lo.ValToToken();
  SIMDVec32 token_hi = res_hi.ValToToken();
  SIMDVec32 nbits_lo = token_lo.SatSubU(SIMDVec32::Val(1));
  SIMDVec32 nbits_hi = token_hi.SatSubU(SIMDVec32::Val(1));
  SIMDVec32 bits_lo = res_lo.SatSubU(nbits_lo.Pow2());
  SIMDVec32 bits_hi = res_hi.SatSubU(nbits_hi.Pow2());
  // Narrow the two 32-bit token vectors into a single 16-bit vector.
  SIMDVec16 token = SIMDVec16::FromTwo32(token_lo, token_hi);
  token.Store(token_out);
  nbits_lo.Store(nbits_out);
  nbits_hi.Store(nbits_out + SIMDVec32::kLanes);
  bits_lo.Store(bits_out);
  bits_hi.Store(bits_out + SIMDVec32::kLanes);
}
2325 | | |
// Huffman lookup for bit depths up to 13: all tokens index into 16-entry
// byte tables, so a single shuffle-based lookup yields both the code length
// and the code bits for every lane.
FJXL_INLINE void HuffmanSIMDUpTo13(const uint16_t* tokens,
                                   const uint8_t* raw_nbits_simd,
                                   const uint8_t* raw_bits_simd,
                                   uint16_t* nbits_out, uint16_t* bits_out) {
  SIMDVec16 tok = SIMDVec16::Load(tokens).PrepareForU8Lookup();
  tok.U8Lookup(raw_nbits_simd).Store(nbits_out);
  tok.U8Lookup(raw_bits_simd).Store(bits_out);
}
2334 | | |
// Huffman lookup for exactly-14-bit samples: token 16 does not fit the
// 16-entry lookup table, so it is capped to 15 for the lookup and its code
// is then patched via the shared-prefix property described below.
FJXL_INLINE void HuffmanSIMD14(const uint16_t* tokens,
                               const uint8_t* raw_nbits_simd,
                               const uint8_t* raw_bits_simd,
                               uint16_t* nbits_out, uint16_t* bits_out) {
  SIMDVec16 token_cap = SIMDVec16::Val(15);
  SIMDVec16 tok = SIMDVec16::Load(tokens);
  SIMDVec16 tok_index = tok.Min(token_cap).PrepareForU8Lookup();
  SIMDVec16 huff_bits_pre = tok_index.U8Lookup(raw_bits_simd);
  // Set the highest bit when token == 16; the Huffman code is constructed in
  // such a way that the code for token 15 is the same as the code for 16,
  // except for the highest bit.
  Mask16 needs_high_bit = tok.Eq(SIMDVec16::Val(16));
  SIMDVec16 huff_bits = needs_high_bit.IfThenElse(
      huff_bits_pre.Or(SIMDVec16::Val(128)), huff_bits_pre);
  huff_bits.Store(bits_out);
  tok_index.U8Lookup(raw_nbits_simd).Store(nbits_out);
}
2352 | | |
// Huffman lookup for bit depths above 14: tokens 13..18 are folded in pairs
// onto table slots 13..15 (via a halving add), and the odd member of each
// pair gets its code's highest bit set afterwards.
FJXL_INLINE void HuffmanSIMDAbove14(const uint16_t* tokens,
                                    const uint8_t* raw_nbits_simd,
                                    const uint8_t* raw_bits_simd,
                                    uint16_t* nbits_out, uint16_t* bits_out) {
  SIMDVec16 tok = SIMDVec16::Load(tokens);
  // We assume `tok` fits in a *signed* 16-bit integer.
  Mask16 above = tok.Gt(SIMDVec16::Val(12));
  // 13, 14 -> 13
  // 15, 16 -> 14
  // 17, 18 -> 15
  SIMDVec16 remap_tok = above.IfThenElse(tok.HAdd(SIMDVec16::Val(13)), tok);
  SIMDVec16 tok_index = remap_tok.PrepareForU8Lookup();
  SIMDVec16 huff_bits_pre = tok_index.U8Lookup(raw_bits_simd);
  // Set the highest bit when token == 14, 16, 18 (i.e. token is above 12 and
  // even).
  Mask16 needs_high_bit = above.And(tok.Eq(tok.And(SIMDVec16::Val(0xFFFE))));
  SIMDVec16 huff_bits = needs_high_bit.IfThenElse(
      huff_bits_pre.Or(SIMDVec16::Val(128)), huff_bits_pre);
  huff_bits.Store(bits_out);
  tok_index.U8Lookup(raw_nbits_simd).Store(nbits_out);
}
2373 | | |
// Combines the raw extra bits and the Huffman code bits of one 16-lane group
// (Huffman code emitted first), masks out lanes outside [skip, n), and merges
// the result into 32-bit lanes.
FJXL_INLINE void StoreSIMDUpTo8(const uint16_t* nbits_tok,
                                const uint16_t* bits_tok,
                                const uint16_t* nbits_huff,
                                const uint16_t* bits_huff, size_t n,
                                size_t skip, Bits32* bits_out) {
  Bits16 bits =
      Bits16::FromRaw(SIMDVec16::Load(nbits_tok), SIMDVec16::Load(bits_tok));
  Bits16 huff_bits =
      Bits16::FromRaw(SIMDVec16::Load(nbits_huff), SIMDVec16::Load(bits_huff));
  bits.Interleave(huff_bits);
  bits.ClipTo(n);
  bits.Skip(skip);
  bits_out[0] = bits.Merge();
}
2388 | | |
2389 | | // Huffman and raw bits don't necessarily fit in a single u16 here. |
2390 | | FJXL_INLINE void StoreSIMDUpTo14(const uint16_t* nbits_tok, |
2391 | | const uint16_t* bits_tok, |
2392 | | const uint16_t* nbits_huff, |
2393 | | const uint16_t* bits_huff, size_t n, |
2394 | 0 | size_t skip, Bits32* bits_out) { |
2395 | 0 | VecPair<SIMDVec16> bits = |
2396 | 0 | SIMDVec16::Load(bits_tok).Interleave(SIMDVec16::Load(bits_huff)); |
2397 | 0 | VecPair<SIMDVec16> nbits = |
2398 | 0 | SIMDVec16::Load(nbits_tok).Interleave(SIMDVec16::Load(nbits_huff)); |
2399 | 0 | Bits16 low = Bits16::FromRaw(nbits.low, bits.low); |
2400 | 0 | Bits16 hi = Bits16::FromRaw(nbits.hi, bits.hi); |
2401 | 0 | low.ClipTo(2 * n); |
2402 | 0 | low.Skip(2 * skip); |
2403 | 0 | hi.ClipTo(std::max(2 * n, SIMDVec16::kLanes) - SIMDVec16::kLanes); |
2404 | 0 | hi.Skip(std::max(2 * skip, SIMDVec16::kLanes) - SIMDVec16::kLanes); |
2405 | |
|
2406 | 0 | bits_out[0] = low.Merge(); |
2407 | 0 | bits_out[1] = hi.Merge(); |
2408 | 0 | } |
2409 | | |
// Above-14-bit variant: raw nbits/bits already live in 32-bit lanes, so the
// 16-bit Huffman codes are upcast to 32-bit and interleaved per half before
// masking lanes outside [skip, n).
FJXL_INLINE void StoreSIMDAbove14(const uint32_t* nbits_tok,
                                  const uint32_t* bits_tok,
                                  const uint16_t* nbits_huff,
                                  const uint16_t* bits_huff, size_t n,
                                  size_t skip, Bits32* bits_out) {
  static_assert(SIMDVec16::kLanes == 2 * SIMDVec32::kLanes,
                "There should be twice more 16-bit lanes than 32-bit lanes");
  Bits32 bits_low =
      Bits32::FromRaw(SIMDVec32::Load(nbits_tok), SIMDVec32::Load(bits_tok));
  Bits32 bits_hi =
      Bits32::FromRaw(SIMDVec32::Load(nbits_tok + SIMDVec32::kLanes),
                      SIMDVec32::Load(bits_tok + SIMDVec32::kLanes));

  VecPair<SIMDVec32> huff_bits = SIMDVec16::Load(bits_huff).Upcast();
  VecPair<SIMDVec32> huff_nbits = SIMDVec16::Load(nbits_huff).Upcast();

  Bits32 huff_low = Bits32::FromRaw(huff_nbits.low, huff_bits.low);
  Bits32 huff_hi = Bits32::FromRaw(huff_nbits.hi, huff_bits.hi);

  bits_low.Interleave(huff_low);
  bits_low.ClipTo(n);
  bits_low.Skip(skip);
  bits_out[0] = bits_low;
  bits_hi.Interleave(huff_hi);
  // The high half covers lanes starting at SIMDVec32::kLanes, hence the
  // rebased (and clamped-at-zero) clip/skip counts.
  bits_hi.ClipTo(std::max(n, SIMDVec32::kLanes) - SIMDVec32::kLanes);
  bits_hi.Skip(std::max(skip, SIMDVec32::kLanes) - SIMDVec32::kLanes);
  bits_out[1] = bits_hi;
}
2438 | | |
2439 | | #ifdef FJXL_AVX512 |
// Flushes one Bits32 group (16 lanes of (nbits, bits)) into the bit writer
// using AVX-512 lane compaction: bits are aligned to their final bit
// positions, OR-reduced per 64-bit output chunk, compressed, and stored; the
// leftover partial byte is carried in the writer's buffer.
FJXL_INLINE void StoreToWriterAVX512(const Bits32& bits32, BitWriter& output) {
  __m512i bits = bits32.bits;
  __m512i nbits = bits32.nbits;

  // Insert the leftover bits from the bit buffer at the bottom of the vector
  // and extract the top of the vector.
  uint64_t trail_bits =
      _mm512_cvtsi512_si32(_mm512_alignr_epi32(bits, bits, 15));
  uint64_t trail_nbits =
      _mm512_cvtsi512_si32(_mm512_alignr_epi32(nbits, nbits, 15));
  __m512i lead_bits = _mm512_set1_epi32(output.buffer);
  __m512i lead_nbits = _mm512_set1_epi32(output.bits_in_buffer);
  bits = _mm512_alignr_epi32(bits, lead_bits, 15);
  nbits = _mm512_alignr_epi32(nbits, lead_nbits, 15);

  // Merge 32 -> 64 bits.
  Bits32 b{nbits, bits};
  Bits64 b64 = b.Merge();
  bits = b64.bits;
  nbits = b64.nbits;

  __m512i zero = _mm512_setzero_si512();

  // Helpers that shift the vector up by 1/2/4 64-bit lanes, filling with 0.
  auto sh1 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 7); };
  auto sh2 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 6); };
  auto sh4 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 4); };

  // Compute first-past-end-bit-position (exclusive prefix sums of nbits,
  // shifted to be inclusive).
  __m512i end_intermediate0 = _mm512_add_epi64(nbits, sh1(nbits));
  __m512i end_intermediate1 =
      _mm512_add_epi64(end_intermediate0, sh2(end_intermediate0));
  __m512i end = _mm512_add_epi64(end_intermediate1, sh4(end_intermediate1));

  // Total number of bits produced by the SIMD lanes (last lane of `end`).
  uint64_t simd_nbits = _mm512_cvtsi512_si32(_mm512_alignr_epi64(end, end, 7));

  // Compute begin-bit-position.
  __m512i begin = _mm512_sub_epi64(end, nbits);

  // Index of the last bit in the chunk, or the end bit if nbits==0.
  __m512i last = _mm512_mask_sub_epi64(
      end, _mm512_cmpneq_epi64_mask(nbits, zero), end, _mm512_set1_epi64(1));

  __m512i lane_offset_mask = _mm512_set1_epi64(63);

  // Starting position of the chunk that each lane will ultimately belong to.
  __m512i chunk_start = _mm512_andnot_si512(lane_offset_mask, last);

  // For all lanes that contain bits belonging to two different 64-bit chunks,
  // compute the number of bits that belong to the first chunk.
  // total # of bits fit in a u16, so we can satsub_u16 here.
  __m512i first_chunk_nbits = _mm512_subs_epu16(chunk_start, begin);

  // Move all the previous-chunk-bits to the previous lane.
  __m512i negnbits = _mm512_sub_epi64(_mm512_set1_epi64(64), first_chunk_nbits);
  __m512i first_chunk_bits =
      _mm512_srlv_epi64(_mm512_sllv_epi64(bits, negnbits), negnbits);
  __m512i first_chunk_bits_down =
      _mm512_alignr_epi32(zero, first_chunk_bits, 2);
  bits = _mm512_srlv_epi64(bits, first_chunk_nbits);
  nbits = _mm512_sub_epi64(nbits, first_chunk_nbits);
  bits = _mm512_or_si512(bits, _mm512_sllv_epi64(first_chunk_bits_down, nbits));
  begin = _mm512_add_epi64(begin, first_chunk_nbits);

  // We now know that every lane should give bits to only one chunk. We can
  // shift the bits and then horizontally-or-reduce them within the same chunk.
  __m512i offset = _mm512_and_si512(begin, lane_offset_mask);
  __m512i aligned_bits = _mm512_sllv_epi64(bits, offset);
  // h-or-reduce within same chunk
  __m512i red0 = _mm512_mask_or_epi64(
      aligned_bits, _mm512_cmpeq_epi64_mask(sh1(chunk_start), chunk_start),
      sh1(aligned_bits), aligned_bits);
  __m512i red1 = _mm512_mask_or_epi64(
      red0, _mm512_cmpeq_epi64_mask(sh2(chunk_start), chunk_start), sh2(red0),
      red0);
  __m512i reduced = _mm512_mask_or_epi64(
      red1, _mm512_cmpeq_epi64_mask(sh4(chunk_start), chunk_start), sh4(red1),
      red1);
  // Extract the highest lane that belongs to each chunk (the lane that ends up
  // with the OR-ed value of all the other lanes of that chunk).
  __m512i next_chunk_start =
      _mm512_alignr_epi32(_mm512_set1_epi64(~0), chunk_start, 2);
  __m512i result = _mm512_maskz_compress_epi64(
      _mm512_cmpneq_epi64_mask(chunk_start, next_chunk_start), reduced);

  _mm512_storeu_si512((__m512i*)(output.data.get() + output.bytes_written),
                      result);

  // Update the bit writer and add the last 32-bit lane.
  // Note that since trail_nbits was at most 32 to begin with, operating on
  // trail_bits does not risk overflowing.
  output.bytes_written += simd_nbits / 8;
  // Here we are implicitly relying on the fact that simd_nbits < 512 to know
  // that the byte of bitreader data we access is initialized. This is
  // guaranteed because the remaining bits in the bitreader buffer are at most
  // 7, so simd_nbits <= 505 always.
  trail_bits = (trail_bits << (simd_nbits % 8)) +
               output.data.get()[output.bytes_written];
  trail_nbits += simd_nbits % 8;
  StoreLE64(output.data.get() + output.bytes_written, trail_bits);
  size_t trail_bytes = trail_nbits / 8;
  output.bits_in_buffer = trail_nbits % 8;
  output.buffer = trail_bits >> (trail_bytes * 8);
  output.bytes_written += trail_bytes;
}
2544 | | |
2545 | | #endif |
2546 | | |
2547 | | template <size_t n> |
2548 | 0 | FJXL_INLINE void StoreToWriter(const Bits32* bits, BitWriter& output) { |
2549 | | #ifdef FJXL_AVX512 |
2550 | | static_assert(n <= 2, "n should be less or 2 for AVX512"); |
2551 | | StoreToWriterAVX512(bits[0], output); |
2552 | | if (n == 2) { |
2553 | | StoreToWriterAVX512(bits[1], output); |
2554 | | } |
2555 | | return; |
2556 | | #endif |
2557 | 0 | static_assert(n <= 4, "n should be less or 4"); |
2558 | 0 | alignas(64) uint64_t nbits64[Bits64::kLanes * n]; |
2559 | 0 | alignas(64) uint64_t bits64[Bits64::kLanes * n]; |
2560 | 0 | bits[0].Merge().Store(nbits64, bits64); |
2561 | 0 | if (n > 1) { |
2562 | 0 | bits[1].Merge().Store(nbits64 + Bits64::kLanes, bits64 + Bits64::kLanes); |
2563 | 0 | } |
2564 | 0 | if (n > 2) { |
2565 | 0 | bits[2].Merge().Store(nbits64 + 2 * Bits64::kLanes, |
2566 | 0 | bits64 + 2 * Bits64::kLanes); |
2567 | 0 | } |
2568 | 0 | if (n > 3) { |
2569 | 0 | bits[3].Merge().Store(nbits64 + 3 * Bits64::kLanes, |
2570 | 0 | bits64 + 3 * Bits64::kLanes); |
2571 | 0 | } |
2572 | 0 | output.WriteMultiple(nbits64, bits64, Bits64::kLanes * n); |
2573 | 0 | } Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::StoreToWriter<1ul>(AVX2::(anonymous namespace)::Bits32 const*, (anonymous namespace)::BitWriter&) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::StoreToWriter<2ul>(AVX2::(anonymous namespace)::Bits32 const*, (anonymous namespace)::BitWriter&) |
2574 | | |
namespace detail {
// Maps a SIMD vector type to the signed/unsigned scalar types of its lanes.
template <typename T>
struct IntegerTypes;

template <>
struct IntegerTypes<SIMDVec16> {
  using signed_ = int16_t;
  using unsigned_ = uint16_t;
};

template <>
struct IntegerTypes<SIMDVec32> {
  using signed_ = int32_t;
  using unsigned_ = uint32_t;
};

// Inverse mapping: from a signed scalar lane type to its SIMD vector type.
template <typename T>
struct SIMDType;

template <>
struct SIMDType<int16_t> {
  using type = SIMDVec16;
};

template <>
struct SIMDType<int32_t> {
  using type = SIMDVec32;
};

}  // namespace detail
2605 | | |
// Convenience aliases over the detail:: type-mapping traits.
template <typename T>
using signed_t = typename detail::IntegerTypes<T>::signed_;

template <typename T>
using unsigned_t = typename detail::IntegerTypes<T>::unsigned_;

template <typename T>
using simd_t = typename detail::SIMDType<T>::type;
2614 | | |
// This function will process exactly one vector worth of pixels.
//
// Computes a clamped-gradient-style prediction from the left/top/topleft
// neighbors, subtracts it from the pixels, and stores the residuals in
// zig-zag ("pack signed") form: r >= 0 -> 2r, r < 0 -> -2r - 1.
// Returns the number of leading lanes whose residual is zero (presumably
// used by the caller for zero-run detection — CountPrefix is defined
// elsewhere).
template <typename T>
size_t PredictPixels(const signed_t<T>* pixels, const signed_t<T>* pixels_left,
                     const signed_t<T>* pixels_top,
                     const signed_t<T>* pixels_topleft,
                     unsigned_t<T>* residuals) {
  T px = T::Load((unsigned_t<T>*)pixels);
  T left = T::Load((unsigned_t<T>*)pixels_left);
  T top = T::Load((unsigned_t<T>*)pixels_top);
  T topleft = T::Load((unsigned_t<T>*)pixels_topleft);
  T ac = left.Sub(topleft);
  T ab = left.Sub(top);
  T bc = top.Sub(topleft);
  // grad = left + top - topleft, the classic gradient predictor.
  T grad = ac.Add(top);
  T d = ab.Xor(bc);
  T zero = T::Val(0);
  // Sign-based selection between top/left, then between gradient and the
  // chosen clamp value.
  T clamp = zero.Gt(d).IfThenElse(top, left);
  T s = ac.Xor(bc);
  T pred = zero.Gt(s).IfThenElse(grad, clamp);
  T res = px.Sub(pred);
  // Zig-zag-encode the signed residual into an unsigned value.
  T res_times_2 = res.Add(res);
  res = zero.Gt(res).IfThenElse(T::Val(-1).Sub(res_times_2), res_times_2);
  res.Store(residuals);
  return res.Eq(T::Val(0)).CountPrefix();
}
2641 | | |
2642 | | #endif |
2643 | | |
2644 | | void EncodeHybridUint000(uint32_t value, uint32_t* token, uint32_t* nbits, |
2645 | 0 | uint32_t* bits) { |
2646 | 0 | uint32_t n = FloorLog2(value); |
2647 | 0 | *token = value ? n + 1 : 0; |
2648 | 0 | *nbits = value ? n : 0; |
2649 | 0 | *bits = value ? value - (1 << n) : 0; |
2650 | 0 | } Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::EncodeHybridUint000(unsigned int, unsigned int*, unsigned int*, unsigned int*) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::EncodeHybridUint000(unsigned int, unsigned int*, unsigned int*, unsigned int*) |
2651 | | |
// log2 of the number of residuals processed per encoded chunk, chosen to
// match the SIMD width of the target instruction set.
#ifdef FJXL_AVX512
constexpr static size_t kLogChunkSize = 5;
#elif defined(FJXL_AVX2) || defined(FJXL_NEON)
// Even if NEON only has 128-bit lanes, it is still significantly (~1.3x) faster
// to process two vectors at a time.
constexpr static size_t kLogChunkSize = 4;
#else
constexpr static size_t kLogChunkSize = 3;
#endif

constexpr static size_t kChunkSize = 1 << kLogChunkSize;
2663 | | |
2664 | | template <typename Residual> |
2665 | | void GenericEncodeChunk(const Residual* residuals, size_t n, size_t skip, |
2666 | 0 | const PrefixCode& code, BitWriter& output) { |
2667 | 0 | for (size_t ix = skip; ix < n; ix++) { |
2668 | 0 | unsigned token, nbits, bits; |
2669 | 0 | EncodeHybridUint000(residuals[ix], &token, &nbits, &bits); |
2670 | 0 | output.Write(code.raw_nbits[token] + nbits, |
2671 | 0 | code.raw_bits[token] | bits << code.raw_nbits[token]); |
2672 | 0 | } |
2673 | 0 | } Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::GenericEncodeChunk<unsigned short>(unsigned short const*, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::GenericEncodeChunk<unsigned int>(unsigned int const*, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&) |
2674 | | |
// Bit-depth-specific encoding parameters and helpers for inputs with at most
// 8 bits per sample.
struct UpTo8Bits {
  size_t bitdepth;
  explicit UpTo8Bits(size_t bitdepth) : bitdepth(bitdepth) {
    assert(bitdepth <= 8);
  }
  // Here we can fit up to 9 extra bits + 7 Huffman bits in a u16; for all other
  // symbols, we could actually go up to 8 Huffman bits as we have at most 8
  // extra bits; however, the SIMD bit merging logic for AVX2 assumes that no
  // Huffman length is 8 or more, so we cap at 8 anyway. Last symbol is used for
  // LZ77 lengths and has no limitations except allowing to represent 32 symbols
  // in total.
  static constexpr uint8_t kMinRawLength[12] = {};
  static constexpr uint8_t kMaxRawLength[12] = {
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 10,
  };
  static size_t MaxEncodedBitsPerSample() { return 16; }
  static constexpr size_t kInputBytes = 1;
  using pixel_t = int16_t;
  using upixel_t = uint16_t;

  // Copies the (at most 16) per-token Huffman lengths and codes into the
  // fixed-size 16-byte tables used by the SIMD table lookups.
  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
                             size_t n, uint8_t* nbits_simd,
                             uint8_t* bits_simd) {
    assert(n <= 16);
    memcpy(nbits_simd, nbits, 16);
    memcpy(bits_simd, bits, 16);
  }

#ifdef FJXL_GENERIC_SIMD
  // Tokenizes, Huffman-codes and writes one chunk of residuals with SIMD,
  // masking out lanes outside [skip, n).
  static void EncodeChunkSimd(upixel_t* residuals, size_t n, size_t skip,
                              const uint8_t* raw_nbits_simd,
                              const uint8_t* raw_bits_simd, BitWriter& output) {
    Bits32 bits32[kChunkSize / SIMDVec16::kLanes];
    alignas(64) uint16_t bits[SIMDVec16::kLanes];
    alignas(64) uint16_t nbits[SIMDVec16::kLanes];
    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
    alignas(64) uint16_t token[SIMDVec16::kLanes];
    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
      TokenizeSIMD(residuals + i, token, nbits, bits);
      HuffmanSIMDUpTo13(token, raw_nbits_simd, raw_bits_simd, nbits_huff,
                        bits_huff);
      StoreSIMDUpTo8(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
                     std::max(skip, i) - i, bits32 + i / SIMDVec16::kLanes);
    }
    StoreToWriter<kChunkSize / SIMDVec16::kLanes>(bits32, output);
  }
#endif

  size_t NumSymbols(bool doing_ycocg_or_large_palette) const {
    // values gain 1 bit for YCoCg, 1 bit for prediction.
    // Maximum symbol is 1 + effective bit depth of residuals.
    if (doing_ycocg_or_large_palette) {
      return bitdepth + 3;
    } else {
      return bitdepth + 2;
    }
  }
};
// Out-of-class definitions for ODR-use (pre-C++17 semantics).
constexpr uint8_t UpTo8Bits::kMinRawLength[];
constexpr uint8_t UpTo8Bits::kMaxRawLength[];
2736 | | |
// Bit-depth-specific encoding parameters and helpers for inputs with 9 to 13
// bits per sample.
struct From9To13Bits {
  size_t bitdepth;
  explicit From9To13Bits(size_t bitdepth) : bitdepth(bitdepth) {
    assert(bitdepth <= 13 && bitdepth >= 9);
  }
  // Last symbol is used for LZ77 lengths and has no limitations except allowing
  // to represent 32 symbols in total.
  // We cannot fit all the bits in a u16, so do not even try and use up to 8
  // bits per raw symbol.
  // There are at most 16 raw symbols, so Huffman coding can be SIMDfied without
  // any special tricks.
  static constexpr uint8_t kMinRawLength[17] = {};
  static constexpr uint8_t kMaxRawLength[17] = {
      8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 10,
  };
  static size_t MaxEncodedBitsPerSample() { return 21; }
  static constexpr size_t kInputBytes = 2;
  using pixel_t = int16_t;
  using upixel_t = uint16_t;

  // Copies the (at most 16) per-token Huffman lengths and codes into the
  // fixed-size 16-byte tables used by the SIMD table lookups.
  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
                             size_t n, uint8_t* nbits_simd,
                             uint8_t* bits_simd) {
    assert(n <= 16);
    memcpy(nbits_simd, nbits, 16);
    memcpy(bits_simd, bits, 16);
  }

#ifdef FJXL_GENERIC_SIMD
  // Tokenizes, Huffman-codes and writes one chunk of residuals with SIMD;
  // each sample occupies two 16-bit lanes, hence the doubled Bits32 count.
  static void EncodeChunkSimd(upixel_t* residuals, size_t n, size_t skip,
                              const uint8_t* raw_nbits_simd,
                              const uint8_t* raw_bits_simd, BitWriter& output) {
    Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes];
    alignas(64) uint16_t bits[SIMDVec16::kLanes];
    alignas(64) uint16_t nbits[SIMDVec16::kLanes];
    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
    alignas(64) uint16_t token[SIMDVec16::kLanes];
    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
      TokenizeSIMD(residuals + i, token, nbits, bits);
      HuffmanSIMDUpTo13(token, raw_nbits_simd, raw_bits_simd, nbits_huff,
                        bits_huff);
      StoreSIMDUpTo14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
                      std::max(skip, i) - i,
                      bits32 + 2 * i / SIMDVec16::kLanes);
    }
    StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output);
  }
#endif

  size_t NumSymbols(bool doing_ycocg_or_large_palette) const {
    // values gain 1 bit for YCoCg, 1 bit for prediction.
    // Maximum symbol is 1 + effective bit depth of residuals.
    if (doing_ycocg_or_large_palette) {
      return bitdepth + 3;
    } else {
      return bitdepth + 2;
    }
  }
};
// Out-of-class definitions for ODR-use (pre-C++17 semantics).
constexpr uint8_t From9To13Bits::kMinRawLength[];
constexpr uint8_t From9To13Bits::kMaxRawLength[];
2799 | | |
// Debug check used when folding pairs of Huffman symbols into one SIMD table
// entry: both symbols must be exactly 8 bits long and their codes must be
// identical except for the top (eighth) bit.
void CheckHuffmanBitsSIMD(int bits1, int nbits1, int bits2, int nbits2) {
  assert(nbits1 == 8);
  assert(nbits2 == 8);
  assert(bits2 == (bits1 | 128));
  // Keep parameters "used" in NDEBUG builds.
  (void)bits1;
  (void)nbits1;
  (void)bits2;
  (void)nbits2;
}
2805 | | |
// Bit-depth policy for exactly-14-bit samples. Residuals fit in 16 bits and
// the symbol alphabet is fixed at 17 entries; the length constraints below
// make symbols 15 and 16 share an 8-bit code differing only in the top bit,
// so a 16-entry SIMD table suffices.
struct Exactly14Bits {
  explicit Exactly14Bits(size_t bitdepth_) { assert(bitdepth_ == 14); }
  // Force LZ77 symbols to have at least 8 bits, and raw symbols 15 and 16 to
  // have exactly 8, and no other symbol to have 8 or more. This ensures that
  // the representation for 15 and 16 is identical up to one bit.
  static constexpr uint8_t kMinRawLength[18] = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 7,
  };
  static constexpr uint8_t kMaxRawLength[18] = {
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 10,
  };
  static constexpr size_t bitdepth = 14;
  // Conservative per-sample bit budget, used for buffer sizing.
  static size_t MaxEncodedBitsPerSample() { return 22; }
  static constexpr size_t kInputBytes = 2;
  using pixel_t = int16_t;
  using upixel_t = uint16_t;

  // Packs the 17-entry Huffman tables into the 16-entry SIMD tables. Entry 16
  // is recoverable from entry 15 by setting the top code bit, which
  // CheckHuffmanBitsSIMD verifies before the entry is dropped.
  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
                             size_t n, uint8_t* nbits_simd,
                             uint8_t* bits_simd) {
    assert(n == 17);
    CheckHuffmanBitsSIMD(bits[15], nbits[15], bits[16], nbits[16]);
    memcpy(nbits_simd, nbits, 16);
    memcpy(bits_simd, bits, 16);
  }

#ifdef FJXL_GENERIC_SIMD
  // Vectorized chunk encoding: tokenize residuals, look up the 14-bit-depth
  // SIMD Huffman mapping, pack and flush to `output`. `n` = valid residuals,
  // `skip` = leading residuals already covered by an RLE run.
  static void EncodeChunkSimd(upixel_t* residuals, size_t n, size_t skip,
                              const uint8_t* raw_nbits_simd,
                              const uint8_t* raw_bits_simd, BitWriter& output) {
    Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes];
    alignas(64) uint16_t bits[SIMDVec16::kLanes];
    alignas(64) uint16_t nbits[SIMDVec16::kLanes];
    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
    alignas(64) uint16_t token[SIMDVec16::kLanes];
    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
      TokenizeSIMD(residuals + i, token, nbits, bits);
      HuffmanSIMD14(token, raw_nbits_simd, raw_bits_simd, nbits_huff,
                    bits_huff);
      // std::max(x, i) - i clamps the per-vector valid/skip counts at zero
      // once `i` has moved past them.
      StoreSIMDUpTo14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
                      std::max(skip, i) - i,
                      bits32 + 2 * i / SIMDVec16::kLanes);
    }
    StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output);
  }
#endif

  // Fixed alphabet: 14-bit depth always uses 17 symbols.
  size_t NumSymbols(bool) const { return 17; }
};
// Out-of-class definitions for ODR-use of the constexpr arrays (pre-C++17).
constexpr uint8_t Exactly14Bits::kMinRawLength[];
constexpr uint8_t Exactly14Bits::kMaxRawLength[];
2858 | | |
// Bit-depth policy for 15- and 16-bit samples. Residuals need 32-bit
// arithmetic; the 19-entry alphabet is folded into a 16-entry SIMD table by
// merging the symbol pairs (13,14), (15,16), (17,18), whose codes are forced
// to differ only in the top bit by the length constraints below.
struct MoreThan14Bits {
  size_t bitdepth;
  explicit MoreThan14Bits(size_t bitdepth) : bitdepth(bitdepth) {
    assert(bitdepth > 14);
    assert(bitdepth <= 16);
  }
  // Force LZ77 symbols to have at least 8 bits, and raw symbols 13 to 18 to
  // have exactly 8, and no other symbol to have 8 or more. This ensures that
  // the representation for (13, 14), (15, 16), (17, 18) is identical up to one
  // bit.
  static constexpr uint8_t kMinRawLength[20] = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 7,
  };
  static constexpr uint8_t kMaxRawLength[20] = {
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 10,
  };
  // Conservative per-sample bit budget, used for buffer sizing.
  static size_t MaxEncodedBitsPerSample() { return 24; }
  static constexpr size_t kInputBytes = 2;
  using pixel_t = int32_t;
  using upixel_t = uint32_t;

  // Folds the 19-entry Huffman tables into the 16-entry SIMD tables: entries
  // 0..13 are copied verbatim, entry 14 takes symbol 15 and entry 15 takes
  // symbol 17 (their pair partners 14/16/18 are recovered by setting the top
  // code bit, as verified by CheckHuffmanBitsSIMD).
  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
                             size_t n, uint8_t* nbits_simd,
                             uint8_t* bits_simd) {
    assert(n == 19);
    CheckHuffmanBitsSIMD(bits[13], nbits[13], bits[14], nbits[14]);
    CheckHuffmanBitsSIMD(bits[15], nbits[15], bits[16], nbits[16]);
    CheckHuffmanBitsSIMD(bits[17], nbits[17], bits[18], nbits[18]);
    for (size_t i = 0; i < 14; i++) {
      nbits_simd[i] = nbits[i];
      bits_simd[i] = bits[i];
    }
    nbits_simd[14] = nbits[15];
    bits_simd[14] = bits[15];
    nbits_simd[15] = nbits[17];
    bits_simd[15] = bits[17];
  }

#ifdef FJXL_GENERIC_SIMD
  // Vectorized chunk encoding for >14-bit depths; `bits`/`nbits` are 32-bit
  // here since residuals exceed 16 bits. `n` = valid residuals, `skip` =
  // leading residuals already covered by an RLE run.
  static void EncodeChunkSimd(upixel_t* residuals, size_t n, size_t skip,
                              const uint8_t* raw_nbits_simd,
                              const uint8_t* raw_bits_simd, BitWriter& output) {
    Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes];
    alignas(64) uint32_t bits[SIMDVec16::kLanes];
    alignas(64) uint32_t nbits[SIMDVec16::kLanes];
    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
    alignas(64) uint16_t token[SIMDVec16::kLanes];
    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
      TokenizeSIMD(residuals + i, token, nbits, bits);
      HuffmanSIMDAbove14(token, raw_nbits_simd, raw_bits_simd, nbits_huff,
                         bits_huff);
      // std::max(x, i) - i clamps the per-vector valid/skip counts at zero
      // once `i` has moved past them.
      StoreSIMDAbove14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
                       std::max(skip, i) - i,
                       bits32 + 2 * i / SIMDVec16::kLanes);
    }
    StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output);
  }
#endif
  // Fixed alphabet: >14-bit depths always use 19 symbols.
  size_t NumSymbols(bool) const { return 19; }
};
// Out-of-class definitions for ODR-use of the constexpr arrays (pre-C++17).
constexpr uint8_t MoreThan14Bits::kMinRawLength[];
constexpr uint8_t MoreThan14Bits::kMaxRawLength[];
2922 | | |
// Writes the shared part of the DC-global section header: global tree,
// LZ77 configuration, context map and the four channel prefix codes.
// Every Write() below emits a fixed field of the JPEG XL bitstream, so the
// order and widths must not change. When `is_single_group` is true the whole
// image lives in this section, so the buffer is sized for all pixels.
void PrepareDCGlobalCommon(bool is_single_group, size_t width, size_t height,
                           const PrefixCode code[4], BitWriter* output) {
  output->Allocate(100000 + (is_single_group ? width * height * 16 : 0));
  // No patches, spline or noise.
  output->Write(1, 1);  // default DC dequantization factors (?)
  output->Write(1, 1);  // use global tree / histograms
  output->Write(1, 0);  // no lz77 for the tree

  output->Write(1, 1);         // simple code for the tree's context map
  output->Write(2, 0);         // all contexts clustered together
  output->Write(1, 1);         // use prefix code for tree
  output->Write(4, 0);         // 000 hybrid uint
  output->Write(6, 0b100011);  // Alphabet size is 4 (var16)
  output->Write(2, 1);         // simple prefix code
  output->Write(2, 3);         // with 4 symbols
  output->Write(2, 0);
  output->Write(2, 1);
  output->Write(2, 2);
  output->Write(2, 3);
  output->Write(1, 0);  // First tree encoding option

  // Huffman table + extra bits for the tree.
  uint8_t symbol_bits[6] = {0b00, 0b10, 0b001, 0b101, 0b0011, 0b0111};
  uint8_t symbol_nbits[6] = {2, 2, 3, 3, 4, 4};
  // Write a tree with a leaf per channel, and gradient predictor for every
  // leaf.
  for (auto v : {1, 2, 1, 4, 1, 0, 0, 5, 0, 0, 0, 0, 5,
                 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0, 0, 0}) {
    output->Write(symbol_nbits[v], symbol_bits[v]);
  }

  output->Write(1, 1);     // Enable lz77 for the main bitstream
  output->Write(2, 0b00);  // lz77 offset 224
  static_assert(kLZ77Offset == 224, "kLZ77Offset should be 224");
  output->Write(4, 0b1010);  // lz77 min length 7
  // 400 hybrid uint config for lz77
  output->Write(4, 4);
  output->Write(3, 0);
  output->Write(3, 0);

  output->Write(1, 1);  // simple code for the context map
  output->Write(2, 3);  // 3 bits per entry
  output->Write(3, 4);  // channel 3
  output->Write(3, 3);  // channel 2
  output->Write(3, 2);  // channel 1
  output->Write(3, 1);  // channel 0
  output->Write(3, 0);  // distance histogram first

  output->Write(1, 1);  // use prefix codes
  output->Write(4, 0);  // 000 hybrid uint config for distances (only need 0)
  for (size_t i = 0; i < 4; i++) {
    output->Write(4, 0);  // 000 hybrid uint config for symbols (only <= 10)
  }

  // Distance alphabet size:
  output->Write(5, 0b00001);  // 2: just need 1 for RLE (i.e. distance 1)
  // Symbol + LZ77 alphabet size:
  for (size_t i = 0; i < 4; i++) {
    output->Write(1, 1);    // > 1
    output->Write(4, 8);    // <= 512
    output->Write(8, 256);  // == 512
  }

  // Distance histogram:
  output->Write(2, 1);  // simple prefix code
  output->Write(2, 0);  // with one symbol
  output->Write(1, 1);  // 1

  // Symbol + lz77 histogram:
  for (size_t i = 0; i < 4; i++) {
    code[i].WriteTo(output);
  }

  // Group header for global modular image.
  output->Write(1, 1);  // Global tree
  output->Write(1, 1);  // All default wp
}
3000 | | |
3001 | | void PrepareDCGlobal(bool is_single_group, size_t width, size_t height, |
3002 | | size_t nb_chans, const PrefixCode code[4], |
3003 | 0 | BitWriter* output) { |
3004 | 0 | PrepareDCGlobalCommon(is_single_group, width, height, code, output); |
3005 | 0 | if (nb_chans > 2) { |
3006 | 0 | output->Write(2, 0b01); // 1 transform |
3007 | 0 | output->Write(2, 0b00); // RCT |
3008 | 0 | output->Write(5, 0b00000); // Starting from ch 0 |
3009 | 0 | output->Write(2, 0b00); // YCoCg |
3010 | 0 | } else { |
3011 | 0 | output->Write(2, 0b00); // no transforms |
3012 | 0 | } |
3013 | 0 | if (!is_single_group) { |
3014 | 0 | output->ZeroPadToByte(); |
3015 | 0 | } |
3016 | 0 | } Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::PrepareDCGlobal(bool, unsigned long, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, (anonymous namespace)::BitWriter*) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::PrepareDCGlobal(bool, unsigned long, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, (anonymous namespace)::BitWriter*) |
3017 | | |
// Emits the actual bitstream for chunks of residuals, using the prefix code
// in `code`. BitDepth is one of the policy structs above (UpTo8Bits,
// From9To13Bits, Exactly14Bits, MoreThan14Bits).
template <typename BitDepth>
struct ChunkEncoder {
  // Builds the SIMD-friendly copies of the raw Huffman tables from `code`.
  void PrepareForSimd() {
    BitDepth::PrepareForSimd(code->raw_nbits, code->raw_bits, code->numraw,
                             raw_nbits_simd, raw_bits_simd);
  }
  // Emits an LZ77 run of `count` zeros (no-op for count == 0): one literal
  // zero symbol followed by the length token and its extra bits. Short
  // lengths use the precomputed lz77_cache_* fast path.
  FJXL_INLINE static void EncodeRle(size_t count, const PrefixCode& code,
                                    BitWriter& output) {
    if (count == 0) return;
    // The cached/encoded length excludes the minimum run and the literal.
    count -= kLZ77MinLength + 1;
    if (count < kLZ77CacheSize) {
      output.Write(code.lz77_cache_nbits[count], code.lz77_cache_bits[count]);
    } else {
      unsigned token, nbits, bits;
      EncodeHybridUintLZ77(count, &token, &nbits, &bits);
      // Assemble literal + length code + extra bits into one word so a
      // single Write() suffices.
      uint64_t wbits = bits;
      wbits = (wbits << code.lz77_nbits[token]) | code.lz77_bits[token];
      wbits = (wbits << code.raw_nbits[0]) | code.raw_bits[0];
      output.Write(code.lz77_nbits[token] + nbits + code.raw_nbits[0], wbits);
    }
  }

  // Emits a pending run of `run` zeros, then the chunk of residuals. The
  // first `skip` of the `n` valid residuals are part of the run and are not
  // emitted individually.
  FJXL_INLINE void Chunk(size_t run, typename BitDepth::upixel_t* residuals,
                         size_t skip, size_t n) {
    EncodeRle(run, *code, *output);
#ifdef FJXL_GENERIC_SIMD
    BitDepth::EncodeChunkSimd(residuals, n, skip, raw_nbits_simd, raw_bits_simd,
                              *output);
#else
    GenericEncodeChunk(residuals, n, skip, *code, *output);
#endif
  }

  // Flushes the trailing run at the end of a channel.
  inline void Finalize(size_t run) { EncodeRle(run, *code, *output); }

  // Non-owning; set by the caller before use.
  const PrefixCode* code;
  BitWriter* output;
  // SIMD copies of the raw Huffman tables, filled by PrepareForSimd().
  alignas(64) uint8_t raw_nbits_simd[16] = {};
  alignas(64) uint8_t raw_bits_simd[16] = {};
};
3058 | | |
3059 | | template <typename BitDepth> |
3060 | | struct ChunkSampleCollector { |
3061 | 0 | FJXL_INLINE void Rle(size_t count, uint64_t* lz77_counts_) { |
3062 | 0 | if (count == 0) return; |
3063 | 0 | raw_counts[0] += 1; |
3064 | 0 | count -= kLZ77MinLength + 1; |
3065 | 0 | unsigned token, nbits, bits; |
3066 | 0 | EncodeHybridUintLZ77(count, &token, &nbits, &bits); |
3067 | 0 | lz77_counts_[token]++; |
3068 | 0 | } Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>::Rle(unsigned long, unsigned long*) Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>::Rle(unsigned long, unsigned long*) Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>::Rle(unsigned long, unsigned long*) Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>::Rle(unsigned long, unsigned long*) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>::Rle(unsigned long, unsigned long*) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>::Rle(unsigned long, unsigned long*) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>::Rle(unsigned long, unsigned long*) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>::Rle(unsigned long, unsigned long*) |
3069 | | |
3070 | | FJXL_INLINE void Chunk(size_t run, typename BitDepth::upixel_t* residuals, |
3071 | 0 | size_t skip, size_t n) { |
3072 | | // Run is broken. Encode the run and encode the individual vector. |
3073 | 0 | Rle(run, lz77_counts); |
3074 | 0 | for (size_t ix = skip; ix < n; ix++) { |
3075 | 0 | unsigned token, nbits, bits; |
3076 | 0 | EncodeHybridUint000(residuals[ix], &token, &nbits, &bits); |
3077 | 0 | raw_counts[token]++; |
3078 | 0 | } |
3079 | 0 | } Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>::Chunk(unsigned long, unsigned int*, unsigned long, unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>::Chunk(unsigned long, unsigned int*, unsigned long, unsigned long) |
3080 | | |
3081 | | // don't count final run since we don't know how long it really is |
3082 | 0 | void Finalize(size_t run) {} Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>::Finalize(unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>::Finalize(unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>::Finalize(unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>::Finalize(unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>::Finalize(unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>::Finalize(unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>::Finalize(unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>::Finalize(unsigned long) |
3083 | | |
3084 | | uint64_t* raw_counts; |
3085 | | uint64_t* lz77_counts; |
3086 | | }; |
3087 | | |
// Zigzag-encodes a signed residual into an unsigned value so that small
// magnitudes of either sign map to small codes: 0 -> 0, -1 -> 1, 1 -> 2,
// -2 -> 3, ... All arithmetic is done on uint32_t to avoid signed-overflow
// UB; `0u - (u >> 31)` is all-ones exactly when `value` is negative.
constexpr uint32_t PackSigned(int32_t value) {
  return (static_cast<uint32_t>(value) << 1) ^
         (0u - (static_cast<uint32_t>(value) >> 31));
}
3092 | | |
3093 | | template <typename T, typename BitDepth> |
3094 | | struct ChannelRowProcessor { |
3095 | | using upixel_t = typename BitDepth::upixel_t; |
3096 | | using pixel_t = typename BitDepth::pixel_t; |
3097 | | T* t; |
  // Processes one chunk of kChunkSize pixels of a single channel: computes
  // prediction residuals, tracks runs of zero residuals across chunks (in
  // the member `run` — assumed to be declared later in this struct; confirm)
  // and forwards run + residuals to the sink `t` (a ChunkEncoder or
  // ChunkSampleCollector). `row` points at the current pixels; `row_left`,
  // `row_top`, `row_topleft` at the left/top/top-left neighbors. `n` is the
  // number of valid pixels in this chunk; the remainder is padding.
  void ProcessChunk(const pixel_t* row, const pixel_t* row_left,
                    const pixel_t* row_top, const pixel_t* row_topleft,
                    size_t n) {
    alignas(64) upixel_t residuals[kChunkSize] = {};
    // Length of the chunk's leading all-zero-residual prefix; only extended
    // while it is still contiguous from position 0.
    size_t prefix_size = 0;
    size_t required_prefix_size = 0;
#ifdef FJXL_GENERIC_SIMD
    constexpr size_t kNum =
        sizeof(pixel_t) == 2 ? SIMDVec16::kLanes : SIMDVec32::kLanes;
    for (size_t ix = 0; ix < kChunkSize; ix += kNum) {
      // PredictPixels fills residuals and returns the vector's count of
      // leading zero residuals (assumed from usage; confirm at definition).
      size_t c =
          PredictPixels<simd_t<pixel_t>>(row + ix, row_left + ix, row_top + ix,
                                         row_topleft + ix, residuals + ix);
      // Extend the prefix only if it reached the start of this vector.
      prefix_size =
          prefix_size == required_prefix_size ? prefix_size + c : prefix_size;
      required_prefix_size += kNum;
    }
#else
    for (size_t ix = 0; ix < kChunkSize; ix++) {
      pixel_t px = row[ix];
      pixel_t left = row_left[ix];
      pixel_t top = row_top[ix];
      pixel_t topleft = row_topleft[ix];
      pixel_t ac = left - topleft;
      pixel_t ab = left - top;
      pixel_t bc = top - topleft;
      // Gradient predictor left + top - topleft, summed in unsigned
      // arithmetic to avoid signed-overflow UB.
      pixel_t grad = static_cast<pixel_t>(static_cast<upixel_t>(ac) +
                                          static_cast<upixel_t>(top));
      // Branchless sign-bit selection between left, top and grad.
      pixel_t d = ab ^ bc;
      pixel_t clamp = d < 0 ? top : left;
      pixel_t s = ac ^ bc;
      pixel_t pred = s < 0 ? grad : clamp;
      residuals[ix] = PackSigned(px - pred);
      // Extend the zero prefix only while still contiguous from position 0.
      prefix_size = prefix_size == required_prefix_size
                        ? prefix_size + (residuals[ix] == 0)
                        : prefix_size;
      required_prefix_size += 1;
    }
#endif
    // Padding positions beyond `n` must not count towards the prefix.
    prefix_size = std::min(n, prefix_size);
    if (prefix_size == n && (run > 0 || prefix_size > kLZ77MinLength)) {
      // Run continues, nothing to do.
      run += prefix_size;
    } else if (prefix_size + run > kLZ77MinLength) {
      // Run is broken. Encode the run and encode the individual vector.
      t->Chunk(run + prefix_size, residuals, prefix_size, n);
      run = 0;
    } else {
      // There was no run to begin with.
      t->Chunk(0, residuals, 0, n);
    }
  }
3150 | | |
3151 | | void ProcessRow(const pixel_t* row, const pixel_t* row_left, |
3152 | | const pixel_t* row_top, const pixel_t* row_topleft, |
3153 | 0 | size_t xs) { |
3154 | 0 | for (size_t x = 0; x < xs; x += kChunkSize) { |
3155 | 0 | ProcessChunk(row + x, row_left + x, row_top + x, row_topleft + x, |
3156 | 0 | std::min(kChunkSize, xs - x)); |
3157 | 0 | } |
3158 | 0 | } Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::ProcessRow(int const*, int const*, int const*, int const*, unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long) Unexecuted instantiation: 
enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::ProcessRow(int const*, int const*, int const*, int const*, unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous 
namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::ProcessRow(int const*, int const*, int const*, int const*, unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::ProcessRow(int const*, int const*, int const*, int const*, unsigned long) |
3159 | | |
3160 | 0 | void Finalize() { t->Finalize(run); } Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::Finalize() Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::Finalize() Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::Finalize() Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::Finalize() Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::Finalize() Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::Finalize() Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::Finalize() Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous 
namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::Finalize() Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::Finalize() Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::Finalize() Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::Finalize() Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::Finalize() Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::Finalize() Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::Finalize() Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous 
namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::Finalize() Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::Finalize() |
3161 | | // Invariant: run == 0 or run > kLZ77MinLength. |
3162 | | size_t run = 0; |
3163 | | }; |
3164 | | |
// Reads a 16-bit little-endian value from `ptr`. Byte-wise loads make this
// both alignment-safe and independent of host endianness.
uint16_t LoadLE16(const unsigned char* ptr) {
  uint16_t lo = ptr[0];
  uint16_t hi = ptr[1];
  return static_cast<uint16_t>(lo | (hi << 8));
}
3168 | | |
// Swaps the two bytes of a 16-bit value (little-endian <-> big-endian).
uint16_t SwapEndian(uint16_t in) {
  return static_cast<uint16_t>((in << 8) | (in >> 8));
}
3170 | | |
3171 | | #ifdef FJXL_GENERIC_SIMD |
3172 | 0 | void StorePixels(SIMDVec16 p, int16_t* dest) { p.Store((uint16_t*)dest); } |
3173 | | |
3174 | 0 | void StorePixels(SIMDVec16 p, int32_t* dest) { |
3175 | 0 | VecPair<SIMDVec32> p_up = p.Upcast(); |
3176 | 0 | p_up.low.Store((uint32_t*)dest); |
3177 | 0 | p_up.hi.Store((uint32_t*)dest + SIMDVec32::kLanes); |
3178 | 0 | } |
3179 | | #endif |
3180 | | |
// Copies a row of `oxs` 8-bit grayscale samples into `luma`, widening each
// byte to pixel_t. Full vectors go through the SIMD path (when available);
// the remaining tail is handled with scalar code.
template <typename pixel_t>
void FillRowG8(const unsigned char* rgba, size_t oxs, pixel_t* luma) {
  size_t x = 0;
#ifdef FJXL_GENERIC_SIMD
  while (x + SIMDVec16::kLanes <= oxs) {
    auto px = SIMDVec16::LoadG8(rgba + x);
    StorePixels(px[0], luma + x);
    x += SIMDVec16::kLanes;
  }
#endif
  while (x < oxs) {
    luma[x] = rgba[x];
    ++x;
  }
}
3194 | | |
3195 | | template <bool big_endian, typename pixel_t> |
3196 | 0 | void FillRowG16(const unsigned char* rgba, size_t oxs, pixel_t* luma) { |
3197 | 0 | size_t x = 0; |
3198 | | #ifdef FJXL_GENERIC_SIMD |
3199 | 0 | for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) { |
3200 | 0 | auto rgb = SIMDVec16::LoadG16(rgba + 2 * x); |
3201 | 0 | if (big_endian) { |
3202 | 0 | rgb[0].SwapEndian(); |
3203 | 0 | } |
3204 | 0 | StorePixels(rgb[0], luma + x); |
3205 | 0 | } |
3206 | | #endif |
3207 | 0 | for (; x < oxs; x++) { |
3208 | 0 | uint16_t val = LoadLE16(rgba + 2 * x); |
3209 | 0 | if (big_endian) { |
3210 | 0 | val = SwapEndian(val); |
3211 | 0 | } |
3212 | 0 | luma[x] = val; |
3213 | 0 | } |
3214 | 0 | } Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG16<true, short>(unsigned char const*, unsigned long, short*) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG16<false, short>(unsigned char const*, unsigned long, short*) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG16<true, int>(unsigned char const*, unsigned long, int*) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG16<false, int>(unsigned char const*, unsigned long, int*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG16<true, short>(unsigned char const*, unsigned long, short*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG16<false, short>(unsigned char const*, unsigned long, short*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG16<true, int>(unsigned char const*, unsigned long, int*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG16<false, int>(unsigned char const*, unsigned long, int*) |
3215 | | |
// De-interleaves a row of `oxs` 8-bit gray+alpha pairs into separate luma
// and alpha planes, widening each byte to pixel_t.
template <typename pixel_t>
void FillRowGA8(const unsigned char* rgba, size_t oxs, pixel_t* luma,
                pixel_t* alpha) {
  size_t x = 0;
#ifdef FJXL_GENERIC_SIMD
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
    auto ga = SIMDVec16::LoadGA8(rgba + 2 * x);
    StorePixels(ga[0], luma + x);
    StorePixels(ga[1], alpha + x);
  }
#endif
  for (; x < oxs; x++) {
    const unsigned char* px = rgba + 2 * x;
    luma[x] = px[0];
    alpha[x] = px[1];
  }
}
3232 | | |
3233 | | template <bool big_endian, typename pixel_t> |
3234 | | void FillRowGA16(const unsigned char* rgba, size_t oxs, pixel_t* luma, |
3235 | 0 | pixel_t* alpha) { |
3236 | 0 | size_t x = 0; |
3237 | | #ifdef FJXL_GENERIC_SIMD |
3238 | 0 | for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) { |
3239 | 0 | auto rgb = SIMDVec16::LoadGA16(rgba + 4 * x); |
3240 | 0 | if (big_endian) { |
3241 | 0 | rgb[0].SwapEndian(); |
3242 | 0 | rgb[1].SwapEndian(); |
3243 | 0 | } |
3244 | 0 | StorePixels(rgb[0], luma + x); |
3245 | 0 | StorePixels(rgb[1], alpha + x); |
3246 | 0 | } |
3247 | | #endif |
3248 | 0 | for (; x < oxs; x++) { |
3249 | 0 | uint16_t l = LoadLE16(rgba + 4 * x); |
3250 | 0 | uint16_t a = LoadLE16(rgba + 4 * x + 2); |
3251 | 0 | if (big_endian) { |
3252 | 0 | l = SwapEndian(l); |
3253 | 0 | a = SwapEndian(a); |
3254 | 0 | } |
3255 | 0 | luma[x] = l; |
3256 | 0 | alpha[x] = a; |
3257 | 0 | } |
3258 | 0 | } Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA16<true, short>(unsigned char const*, unsigned long, short*, short*) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA16<false, short>(unsigned char const*, unsigned long, short*, short*) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA16<true, int>(unsigned char const*, unsigned long, int*, int*) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA16<false, int>(unsigned char const*, unsigned long, int*, int*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA16<true, short>(unsigned char const*, unsigned long, short*, short*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA16<false, short>(unsigned char const*, unsigned long, short*, short*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA16<true, int>(unsigned char const*, unsigned long, int*, int*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA16<false, int>(unsigned char const*, unsigned long, int*, int*) |
3259 | | |
// Forward YCoCg-R lifting transform: maps (r, g, b) to (y, co, cg) using
// only subtractions and arithmetic right shifts, so the mapping is exactly
// invertible by the decoder.
template <typename pixel_t>
void StoreYCoCg(pixel_t r, pixel_t g, pixel_t b, pixel_t* y, pixel_t* co,
                pixel_t* cg) {
  pixel_t chroma_orange = r - b;
  pixel_t base = b + (chroma_orange >> 1);
  pixel_t chroma_green = g - base;
  *co = chroma_orange;
  *cg = chroma_green;
  *y = base + (chroma_green >> 1);
}
3268 | | |
3269 | | #ifdef FJXL_GENERIC_SIMD |
3270 | | void StoreYCoCg(SIMDVec16 r, SIMDVec16 g, SIMDVec16 b, int16_t* y, int16_t* co, |
3271 | 0 | int16_t* cg) { |
3272 | 0 | SIMDVec16 co_v = r.Sub(b); |
3273 | 0 | SIMDVec16 tmp = b.Add(co_v.SignedShiftRight<1>()); |
3274 | 0 | SIMDVec16 cg_v = g.Sub(tmp); |
3275 | 0 | SIMDVec16 y_v = tmp.Add(cg_v.SignedShiftRight<1>()); |
3276 | 0 | y_v.Store(reinterpret_cast<uint16_t*>(y)); |
3277 | 0 | co_v.Store(reinterpret_cast<uint16_t*>(co)); |
3278 | 0 | cg_v.Store(reinterpret_cast<uint16_t*>(cg)); |
3279 | 0 | } |
3280 | | |
3281 | | void StoreYCoCg(SIMDVec16 r, SIMDVec16 g, SIMDVec16 b, int32_t* y, int32_t* co, |
3282 | 0 | int32_t* cg) { |
3283 | 0 | VecPair<SIMDVec32> r_up = r.Upcast(); |
3284 | 0 | VecPair<SIMDVec32> g_up = g.Upcast(); |
3285 | 0 | VecPair<SIMDVec32> b_up = b.Upcast(); |
3286 | 0 | SIMDVec32 co_lo_v = r_up.low.Sub(b_up.low); |
3287 | 0 | SIMDVec32 tmp_lo = b_up.low.Add(co_lo_v.SignedShiftRight<1>()); |
3288 | 0 | SIMDVec32 cg_lo_v = g_up.low.Sub(tmp_lo); |
3289 | 0 | SIMDVec32 y_lo_v = tmp_lo.Add(cg_lo_v.SignedShiftRight<1>()); |
3290 | 0 | SIMDVec32 co_hi_v = r_up.hi.Sub(b_up.hi); |
3291 | 0 | SIMDVec32 tmp_hi = b_up.hi.Add(co_hi_v.SignedShiftRight<1>()); |
3292 | 0 | SIMDVec32 cg_hi_v = g_up.hi.Sub(tmp_hi); |
3293 | 0 | SIMDVec32 y_hi_v = tmp_hi.Add(cg_hi_v.SignedShiftRight<1>()); |
3294 | 0 | y_lo_v.Store(reinterpret_cast<uint32_t*>(y)); |
3295 | 0 | co_lo_v.Store(reinterpret_cast<uint32_t*>(co)); |
3296 | 0 | cg_lo_v.Store(reinterpret_cast<uint32_t*>(cg)); |
3297 | 0 | y_hi_v.Store(reinterpret_cast<uint32_t*>(y) + SIMDVec32::kLanes); |
3298 | 0 | co_hi_v.Store(reinterpret_cast<uint32_t*>(co) + SIMDVec32::kLanes); |
3299 | 0 | cg_hi_v.Store(reinterpret_cast<uint32_t*>(cg) + SIMDVec32::kLanes); |
3300 | 0 | } |
3301 | | #endif |
3302 | | |
3303 | | template <typename pixel_t> |
3304 | | void FillRowRGB8(const unsigned char* rgba, size_t oxs, pixel_t* y, pixel_t* co, |
3305 | 0 | pixel_t* cg) { |
3306 | 0 | size_t x = 0; |
3307 | | #ifdef FJXL_GENERIC_SIMD |
3308 | 0 | for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) { |
3309 | 0 | auto rgb = SIMDVec16::LoadRGB8(rgba + 3 * x); |
3310 | 0 | StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x); |
3311 | 0 | } |
3312 | | #endif |
3313 | 0 | for (; x < oxs; x++) { |
3314 | 0 | uint16_t r = rgba[3 * x]; |
3315 | 0 | uint16_t g = rgba[3 * x + 1]; |
3316 | 0 | uint16_t b = rgba[3 * x + 2]; |
3317 | 0 | StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x); |
3318 | 0 | } |
3319 | 0 | } Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB8<short>(unsigned char const*, unsigned long, short*, short*, short*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB8<short>(unsigned char const*, unsigned long, short*, short*, short*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB8<int>(unsigned char const*, unsigned long, int*, int*, int*) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB8<int>(unsigned char const*, unsigned long, int*, int*, int*) |
3320 | | |
3321 | | template <bool big_endian, typename pixel_t> |
3322 | | void FillRowRGB16(const unsigned char* rgba, size_t oxs, pixel_t* y, |
3323 | 0 | pixel_t* co, pixel_t* cg) { |
3324 | 0 | size_t x = 0; |
3325 | | #ifdef FJXL_GENERIC_SIMD |
3326 | 0 | for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) { |
3327 | 0 | auto rgb = SIMDVec16::LoadRGB16(rgba + 6 * x); |
3328 | 0 | if (big_endian) { |
3329 | 0 | rgb[0].SwapEndian(); |
3330 | 0 | rgb[1].SwapEndian(); |
3331 | 0 | rgb[2].SwapEndian(); |
3332 | 0 | } |
3333 | 0 | StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x); |
3334 | 0 | } |
3335 | | #endif |
3336 | 0 | for (; x < oxs; x++) { |
3337 | 0 | uint16_t r = LoadLE16(rgba + 6 * x); |
3338 | 0 | uint16_t g = LoadLE16(rgba + 6 * x + 2); |
3339 | 0 | uint16_t b = LoadLE16(rgba + 6 * x + 4); |
3340 | 0 | if (big_endian) { |
3341 | 0 | r = SwapEndian(r); |
3342 | 0 | g = SwapEndian(g); |
3343 | 0 | b = SwapEndian(b); |
3344 | 0 | } |
3345 | 0 | StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x); |
3346 | 0 | } |
3347 | 0 | } Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB16<true, short>(unsigned char const*, unsigned long, short*, short*, short*) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB16<false, short>(unsigned char const*, unsigned long, short*, short*, short*) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB16<true, int>(unsigned char const*, unsigned long, int*, int*, int*) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB16<false, int>(unsigned char const*, unsigned long, int*, int*, int*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB16<true, short>(unsigned char const*, unsigned long, short*, short*, short*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB16<false, short>(unsigned char const*, unsigned long, short*, short*, short*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB16<true, int>(unsigned char const*, unsigned long, int*, int*, int*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB16<false, int>(unsigned char const*, unsigned long, int*, int*, int*) |
3348 | | |
3349 | | template <typename pixel_t> |
3350 | | void FillRowRGBA8(const unsigned char* rgba, size_t oxs, pixel_t* y, |
3351 | 0 | pixel_t* co, pixel_t* cg, pixel_t* alpha) { |
3352 | 0 | size_t x = 0; |
3353 | | #ifdef FJXL_GENERIC_SIMD |
3354 | 0 | for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) { |
3355 | 0 | auto rgb = SIMDVec16::LoadRGBA8(rgba + 4 * x); |
3356 | 0 | StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x); |
3357 | 0 | StorePixels(rgb[3], alpha + x); |
3358 | 0 | } |
3359 | | #endif |
3360 | 0 | for (; x < oxs; x++) { |
3361 | 0 | uint16_t r = rgba[4 * x]; |
3362 | 0 | uint16_t g = rgba[4 * x + 1]; |
3363 | 0 | uint16_t b = rgba[4 * x + 2]; |
3364 | 0 | uint16_t a = rgba[4 * x + 3]; |
3365 | 0 | StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x); |
3366 | 0 | alpha[x] = a; |
3367 | 0 | } |
3368 | 0 | } Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA8<short>(unsigned char const*, unsigned long, short*, short*, short*, short*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA8<short>(unsigned char const*, unsigned long, short*, short*, short*, short*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA8<int>(unsigned char const*, unsigned long, int*, int*, int*, int*) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA8<int>(unsigned char const*, unsigned long, int*, int*, int*, int*) |
3369 | | |
3370 | | template <bool big_endian, typename pixel_t> |
3371 | | void FillRowRGBA16(const unsigned char* rgba, size_t oxs, pixel_t* y, |
3372 | 0 | pixel_t* co, pixel_t* cg, pixel_t* alpha) { |
3373 | 0 | size_t x = 0; |
3374 | | #ifdef FJXL_GENERIC_SIMD |
3375 | 0 | for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) { |
3376 | 0 | auto rgb = SIMDVec16::LoadRGBA16(rgba + 8 * x); |
3377 | 0 | if (big_endian) { |
3378 | 0 | rgb[0].SwapEndian(); |
3379 | 0 | rgb[1].SwapEndian(); |
3380 | 0 | rgb[2].SwapEndian(); |
3381 | 0 | rgb[3].SwapEndian(); |
3382 | 0 | } |
3383 | 0 | StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x); |
3384 | 0 | StorePixels(rgb[3], alpha + x); |
3385 | 0 | } |
3386 | | #endif |
3387 | 0 | for (; x < oxs; x++) { |
3388 | 0 | uint16_t r = LoadLE16(rgba + 8 * x); |
3389 | 0 | uint16_t g = LoadLE16(rgba + 8 * x + 2); |
3390 | 0 | uint16_t b = LoadLE16(rgba + 8 * x + 4); |
3391 | 0 | uint16_t a = LoadLE16(rgba + 8 * x + 6); |
3392 | 0 | if (big_endian) { |
3393 | 0 | r = SwapEndian(r); |
3394 | 0 | g = SwapEndian(g); |
3395 | 0 | b = SwapEndian(b); |
3396 | 0 | a = SwapEndian(a); |
3397 | 0 | } |
3398 | 0 | StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x); |
3399 | 0 | alpha[x] = a; |
3400 | 0 | } |
3401 | 0 | } Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA16<true, short>(unsigned char const*, unsigned long, short*, short*, short*, short*) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA16<false, short>(unsigned char const*, unsigned long, short*, short*, short*, short*) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA16<true, int>(unsigned char const*, unsigned long, int*, int*, int*, int*) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA16<false, int>(unsigned char const*, unsigned long, int*, int*, int*, int*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA16<true, short>(unsigned char const*, unsigned long, short*, short*, short*, short*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA16<false, short>(unsigned char const*, unsigned long, short*, short*, short*, short*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA16<true, int>(unsigned char const*, unsigned long, int*, int*, int*, int*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA16<false, int>(unsigned char const*, unsigned long, int*, int*, int*, int*) |
3402 | | |
3403 | | template <typename Processor, typename BitDepth> |
3404 | | void ProcessImageArea(const unsigned char* rgba, size_t x0, size_t y0, |
3405 | | size_t xs, size_t yskip, size_t ys, size_t row_stride, |
3406 | | BitDepth bitdepth, size_t nb_chans, bool big_endian, |
3407 | 0 | Processor* processors) { |
3408 | 0 | constexpr size_t kPadding = 32; |
3409 | |
|
3410 | 0 | using pixel_t = typename BitDepth::pixel_t; |
3411 | |
|
3412 | 0 | constexpr size_t kAlign = 64; |
3413 | 0 | constexpr size_t kAlignPixels = kAlign / sizeof(pixel_t); |
3414 | |
|
3415 | 0 | auto align = [=](pixel_t* ptr) { |
3416 | 0 | size_t offset = reinterpret_cast<uintptr_t>(ptr) % kAlign; |
3417 | 0 | if (offset) { |
3418 | 0 | ptr += offset / sizeof(pixel_t); |
3419 | 0 | } |
3420 | 0 | return ptr; |
3421 | 0 | }; Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)::{lambda(short*)#1}::operator()(short*) const Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>*)::{lambda(short*)#1}::operator()(short*) const Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX2::(anonymous 
namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>*)::{lambda(short*)#1}::operator()(short*) const Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>*)::{lambda(int*)#1}::operator()(int*) const Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)::{lambda(short*)#1}::operator()(short*) const Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous 
namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>*)::{lambda(short*)#1}::operator()(short*) const Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>*)::{lambda(short*)#1}::operator()(short*) const Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous 
namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>*)::{lambda(int*)#1}::operator()(int*) const Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)::{lambda(short*)#1}::operator()(short*) const Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, 
AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>*)::{lambda(short*)#1}::operator()(short*) const Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>*)::{lambda(short*)#1}::operator()(short*) const Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>*)::{lambda(int*)#1}::operator()(int*) const Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, 
default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)::{lambda(short*)#1}::operator()(short*) const Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>*)::{lambda(short*)#1}::operator()(short*) const Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, 
unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>*)::{lambda(short*)#1}::operator()(short*) const Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>*)::{lambda(int*)#1}::operator()(int*) const |
3422 | |
|
3423 | 0 | constexpr size_t kNumPx = |
3424 | 0 | (256 + kPadding * 2 + kAlignPixels + kAlignPixels - 1) / kAlignPixels * |
3425 | 0 | kAlignPixels; |
3426 | |
|
3427 | 0 | std::vector<std::array<std::array<pixel_t, kNumPx>, 2>> group_data(nb_chans); |
3428 | |
|
3429 | 0 | for (size_t y = 0; y < ys; y++) { |
3430 | 0 | const auto rgba_row = |
3431 | 0 | rgba + row_stride * (y0 + y) + x0 * nb_chans * BitDepth::kInputBytes; |
3432 | 0 | pixel_t* crow[4] = {}; |
3433 | 0 | pixel_t* prow[4] = {}; |
3434 | 0 | for (size_t i = 0; i < nb_chans; i++) { |
3435 | 0 | crow[i] = align(&group_data[i][y & 1][kPadding]); |
3436 | 0 | prow[i] = align(&group_data[i][(y - 1) & 1][kPadding]); |
3437 | 0 | } |
3438 | | |
3439 | | // Pre-fill rows with YCoCg converted pixels. |
3440 | 0 | if (nb_chans == 1) { |
3441 | 0 | if (BitDepth::kInputBytes == 1) { |
3442 | 0 | FillRowG8(rgba_row, xs, crow[0]); |
3443 | 0 | } else if (big_endian) { |
3444 | 0 | FillRowG16</*big_endian=*/true>(rgba_row, xs, crow[0]); |
3445 | 0 | } else { |
3446 | 0 | FillRowG16</*big_endian=*/false>(rgba_row, xs, crow[0]); |
3447 | 0 | } |
3448 | 0 | } else if (nb_chans == 2) { |
3449 | 0 | if (BitDepth::kInputBytes == 1) { |
3450 | 0 | FillRowGA8(rgba_row, xs, crow[0], crow[1]); |
3451 | 0 | } else if (big_endian) { |
3452 | 0 | FillRowGA16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1]); |
3453 | 0 | } else { |
3454 | 0 | FillRowGA16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1]); |
3455 | 0 | } |
3456 | 0 | } else if (nb_chans == 3) { |
3457 | 0 | if (BitDepth::kInputBytes == 1) { |
3458 | 0 | FillRowRGB8(rgba_row, xs, crow[0], crow[1], crow[2]); |
3459 | 0 | } else if (big_endian) { |
3460 | 0 | FillRowRGB16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1], |
3461 | 0 | crow[2]); |
3462 | 0 | } else { |
3463 | 0 | FillRowRGB16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1], |
3464 | 0 | crow[2]); |
3465 | 0 | } |
3466 | 0 | } else { |
3467 | 0 | if (BitDepth::kInputBytes == 1) { |
3468 | 0 | FillRowRGBA8(rgba_row, xs, crow[0], crow[1], crow[2], crow[3]); |
3469 | 0 | } else if (big_endian) { |
3470 | 0 | FillRowRGBA16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1], |
3471 | 0 | crow[2], crow[3]); |
3472 | 0 | } else { |
3473 | 0 | FillRowRGBA16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1], |
3474 | 0 | crow[2], crow[3]); |
3475 | 0 | } |
3476 | 0 | } |
3477 | | // Deal with x == 0. |
3478 | 0 | for (size_t c = 0; c < nb_chans; c++) { |
3479 | 0 | *(crow[c] - 1) = y > 0 ? *(prow[c]) : 0; |
3480 | | // Fix topleft. |
3481 | 0 | *(prow[c] - 1) = y > 0 ? *(prow[c]) : 0; |
3482 | 0 | } |
3483 | 0 | if (y < yskip) continue; |
3484 | 0 | for (size_t c = 0; c < nb_chans; c++) { |
3485 | | // Get pointers to px/left/top/topleft data to speedup loop. |
3486 | 0 | const pixel_t* row = crow[c]; |
3487 | 0 | const pixel_t* row_left = crow[c] - 1; |
3488 | 0 | const pixel_t* row_top = y == 0 ? row_left : prow[c]; |
3489 | 0 | const pixel_t* row_topleft = y == 0 ? row_left : prow[c] - 1; |
3490 | |
|
3491 | 0 | processors[c].ProcessRow(row, row_left, row_top, row_topleft, xs); |
3492 | 0 | } |
3493 | 0 | } |
3494 | 0 | for (size_t c = 0; c < nb_chans; c++) { |
3495 | 0 | processors[c].Finalize(); |
3496 | 0 | } |
3497 | 0 | } Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>*) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous 
namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>*) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous 
namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, 
unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>*) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>*) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous 
namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>*) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*) Unexecuted 
instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous 
namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>*) |
3498 | | |
3499 | | template <typename BitDepth> |
3500 | | void WriteACSection(const unsigned char* rgba, size_t x0, size_t y0, size_t xs, |
3501 | | size_t ys, size_t row_stride, bool is_single_group, |
3502 | | BitDepth bitdepth, size_t nb_chans, bool big_endian, |
3503 | | const PrefixCode code[4], |
3504 | 0 | std::array<BitWriter, 4>& output) { |
3505 | 0 | for (size_t i = 0; i < nb_chans; i++) { |
3506 | 0 | if (is_single_group && i == 0) continue; |
3507 | 0 | output[i].Allocate(xs * ys * bitdepth.MaxEncodedBitsPerSample() + 4); |
3508 | 0 | } |
3509 | 0 | if (!is_single_group) { |
3510 | | // Group header for modular image. |
3511 | | // When the image is single-group, the global modular image is the one |
3512 | | // that contains the pixel data, and there is no group header. |
3513 | 0 | output[0].Write(1, 1); // Global tree |
3514 | 0 | output[0].Write(1, 1); // All default wp |
3515 | 0 | output[0].Write(2, 0b00); // 0 transforms |
3516 | 0 | } |
3517 | |
|
3518 | 0 | ChunkEncoder<BitDepth> encoders[4]; |
3519 | 0 | ChannelRowProcessor<ChunkEncoder<BitDepth>, BitDepth> row_encoders[4]; |
3520 | 0 | for (size_t c = 0; c < nb_chans; c++) { |
3521 | 0 | row_encoders[c].t = &encoders[c]; |
3522 | 0 | encoders[c].output = &output[c]; |
3523 | 0 | encoders[c].code = &code[c]; |
3524 | 0 | encoders[c].PrepareForSimd(); |
3525 | 0 | } |
3526 | 0 | ProcessImageArea<ChannelRowProcessor<ChunkEncoder<BitDepth>, BitDepth>>( |
3527 | 0 | rgba, x0, y0, xs, 0, ys, row_stride, bitdepth, nb_chans, big_endian, |
3528 | 0 | row_encoders); |
3529 | 0 | } Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::WriteACSection<AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::WriteACSection<AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::WriteACSection<AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::WriteACSection<AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::WriteACSection<default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, (anonymous 
namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::WriteACSection<default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::WriteACSection<default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::WriteACSection<default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&) |
3530 | | |
3531 | | constexpr int kHashExp = 16; |
3532 | | constexpr uint32_t kHashSize = 1 << kHashExp; |
3533 | | constexpr uint32_t kHashMultiplier = 2654435761; |
3534 | | constexpr int kMaxColors = 512; |
3535 | | |
3536 | | // can be any function that returns a value in 0 .. kHashSize-1 |
3537 | | // has to map 0 to 0 |
3538 | 0 | inline uint32_t pixel_hash(uint32_t p) { |
3539 | 0 | return (p * kHashMultiplier) >> (32 - kHashExp); |
3540 | 0 | } Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::pixel_hash(unsigned int) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::pixel_hash(unsigned int) |
3541 | | |
3542 | | template <size_t nb_chans> |
3543 | | void FillRowPalette(const unsigned char* inrow, size_t xs, |
3544 | 0 | const int16_t* lookup, int16_t* out) { |
3545 | 0 | for (size_t x = 0; x < xs; x++) { |
3546 | 0 | uint32_t p = 0; |
3547 | 0 | for (size_t i = 0; i < nb_chans; ++i) { |
3548 | 0 | p |= inrow[x * nb_chans + i] << (8 * i); |
3549 | 0 | } |
3550 | 0 | out[x] = lookup[pixel_hash(p)]; |
3551 | 0 | } |
3552 | 0 | } Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowPalette<1ul>(unsigned char const*, unsigned long, short const*, short*) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowPalette<2ul>(unsigned char const*, unsigned long, short const*, short*) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowPalette<3ul>(unsigned char const*, unsigned long, short const*, short*) Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowPalette<4ul>(unsigned char const*, unsigned long, short const*, short*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowPalette<1ul>(unsigned char const*, unsigned long, short const*, short*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowPalette<2ul>(unsigned char const*, unsigned long, short const*, short*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowPalette<3ul>(unsigned char const*, unsigned long, short const*, short*) Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowPalette<4ul>(unsigned char const*, unsigned long, short const*, short*) |
3553 | | |
// Runs the single channel processor in `processors[0]` over the x0/y0/xs/ys
// area of `rgba`, converting each row to palette indices first and handing it
// to ProcessRow together with its left/top/topleft neighbor context.
// The double-buffered row storage below holds 256 samples, so xs <= 256.
// NOTE(review): `yskip` is accepted for signature parity with
// ProcessImageArea but appears unused in this body — confirm with callers.
template <typename Processor>
void ProcessImageAreaPalette(const unsigned char* rgba, size_t x0, size_t y0,
                             size_t xs, size_t yskip, size_t ys,
                             size_t row_stride, const int16_t* lookup,
                             size_t nb_chans, Processor* processors) {
  constexpr size_t kPadding = 32;

  // Two rows (current = y&1, previous = (y-1)&1), padded on both sides so
  // neighbor loads at x == 0 and SIMD overreads stay in bounds.
  std::vector<std::array<int16_t, 256 + kPadding * 2>> group_data(2);
  Processor& row_encoder = processors[0];

  for (size_t y = 0; y < ys; y++) {
    // Pre-fill rows with palette converted pixels.
    const unsigned char* inrow = rgba + row_stride * (y0 + y) + x0 * nb_chans;
    int16_t* outrow = &group_data[y & 1][kPadding];
    if (nb_chans == 1) {
      FillRowPalette<1>(inrow, xs, lookup, outrow);
    } else if (nb_chans == 2) {
      FillRowPalette<2>(inrow, xs, lookup, outrow);
    } else if (nb_chans == 3) {
      FillRowPalette<3>(inrow, xs, lookup, outrow);
    } else if (nb_chans == 4) {
      FillRowPalette<4>(inrow, xs, lookup, outrow);
    }
    // Deal with x == 0: the "left" neighbor of the first pixel is the first
    // pixel of the previous row (0 on the very first row).
    group_data[y & 1][kPadding - 1] =
        y > 0 ? group_data[(y - 1) & 1][kPadding] : 0;
    // Fix topleft.
    // (For y == 0, (y - 1) & 1 wraps to 1, writing the yet-unused buffer.)
    group_data[(y - 1) & 1][kPadding - 1] =
        y > 0 ? group_data[(y - 1) & 1][kPadding] : 0;
    // Get pointers to px/left/top/topleft data to speedup loop.
    // On the first row, top and topleft alias the left pointer.
    const int16_t* row = &group_data[y & 1][kPadding];
    const int16_t* row_left = &group_data[y & 1][kPadding - 1];
    const int16_t* row_top =
        y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding];
    const int16_t* row_topleft =
        y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding - 1];

    row_encoder.ProcessRow(row, row_left, row_top, row_topleft, xs);
  }
  row_encoder.Finalize();
}
3595 | | |
3596 | | void WriteACSectionPalette(const unsigned char* rgba, size_t x0, size_t y0, |
3597 | | size_t xs, size_t ys, size_t row_stride, |
3598 | | bool is_single_group, const PrefixCode code[4], |
3599 | | const int16_t* lookup, size_t nb_chans, |
3600 | 0 | BitWriter& output) { |
3601 | 0 | if (!is_single_group) { |
3602 | 0 | output.Allocate(16 * xs * ys + 4); |
3603 | | // Group header for modular image. |
3604 | | // When the image is single-group, the global modular image is the one |
3605 | | // that contains the pixel data, and there is no group header. |
3606 | 0 | output.Write(1, 1); // Global tree |
3607 | 0 | output.Write(1, 1); // All default wp |
3608 | 0 | output.Write(2, 0b00); // 0 transforms |
3609 | 0 | } |
3610 | |
|
3611 | 0 | ChunkEncoder<UpTo8Bits> encoder; |
3612 | 0 | ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits> row_encoder; |
3613 | |
|
3614 | 0 | row_encoder.t = &encoder; |
3615 | 0 | encoder.output = &output; |
3616 | 0 | encoder.code = &code[is_single_group ? 1 : 0]; |
3617 | 0 | encoder.PrepareForSimd(); |
3618 | 0 | ProcessImageAreaPalette< |
3619 | 0 | ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits>>( |
3620 | 0 | rgba, x0, y0, xs, 0, ys, row_stride, lookup, nb_chans, &row_encoder); |
3621 | 0 | } Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::WriteACSectionPalette(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, (anonymous namespace)::PrefixCode const*, short const*, unsigned long, (anonymous namespace)::BitWriter&) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::WriteACSectionPalette(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, (anonymous namespace)::PrefixCode const*, short const*, unsigned long, (anonymous namespace)::BitWriter&) |
3622 | | |
// Gathers symbol statistics (raw residual counts and LZ77 counts) over
// `row_count` rows starting at (x0, y0); the histograms are later used to
// build the prefix codes. In palette mode samples come from palette indices
// and all channels accumulate into the single histogram slot selected by
// `is_single_group`; otherwise each of the nb_chans channels keeps its own
// raw_counts/lz77_counts row.
template <typename BitDepth>
void CollectSamples(const unsigned char* rgba, size_t x0, size_t y0, size_t xs,
                    size_t row_stride, size_t row_count,
                    uint64_t raw_counts[4][kNumRawSymbols],
                    uint64_t lz77_counts[4][kNumLZ77], bool is_single_group,
                    bool palette, BitDepth bitdepth, size_t nb_chans,
                    bool big_endian, const int16_t* lookup) {
  if (palette) {
    ChunkSampleCollector<UpTo8Bits> sample_collectors[4];
    ChannelRowProcessor<ChunkSampleCollector<UpTo8Bits>, UpTo8Bits>
        row_sample_collectors[4];
    for (size_t c = 0; c < nb_chans; c++) {
      row_sample_collectors[c].t = &sample_collectors[c];
      // Slot choice mirrors WriteACSectionPalette, which encodes with
      // code[is_single_group ? 1 : 0].
      sample_collectors[c].raw_counts = raw_counts[is_single_group ? 1 : 0];
      sample_collectors[c].lz77_counts = lz77_counts[is_single_group ? 1 : 0];
    }
    // yskip == 1 and ys == 1 + row_count: one extra leading row of context.
    ProcessImageAreaPalette<
        ChannelRowProcessor<ChunkSampleCollector<UpTo8Bits>, UpTo8Bits>>(
        rgba, x0, y0, xs, 1, 1 + row_count, row_stride, lookup, nb_chans,
        row_sample_collectors);
  } else {
    ChunkSampleCollector<BitDepth> sample_collectors[4];
    ChannelRowProcessor<ChunkSampleCollector<BitDepth>, BitDepth>
        row_sample_collectors[4];
    for (size_t c = 0; c < nb_chans; c++) {
      row_sample_collectors[c].t = &sample_collectors[c];
      sample_collectors[c].raw_counts = raw_counts[c];
      sample_collectors[c].lz77_counts = lz77_counts[c];
    }
    ProcessImageArea<
        ChannelRowProcessor<ChunkSampleCollector<BitDepth>, BitDepth>>(
        rgba, x0, y0, xs, 1, 1 + row_count, row_stride, bitdepth, nb_chans,
        big_endian, row_sample_collectors);
  }
}
3658 | | |
// Writes the DC global section for a palettized image: the Palette transform
// header followed by the palette colors themselves, encoded as a small
// modular image of nb_chans rows and pcolors columns with the left/top
// predictor context set up by hand below.
void PrepareDCGlobalPalette(bool is_single_group, size_t width, size_t height,
                            size_t nb_chans, const PrefixCode code[4],
                            const std::vector<uint32_t>& palette,
                            size_t pcolors, BitWriter* output) {
  PrepareDCGlobalCommon(is_single_group, width, height, code, output);
  output->Write(2, 0b01);     // 1 transform
  output->Write(2, 0b01);     // Palette
  output->Write(5, 0b00000);  // Starting from ch 0
  if (nb_chans == 1) {
    output->Write(2, 0b00);  // 1-channel palette (Gray)
  } else if (nb_chans == 3) {
    output->Write(2, 0b01);  // 3-channel palette (RGB)
  } else if (nb_chans == 4) {
    output->Write(2, 0b10);  // 4-channel palette (RGBA)
  } else {
    output->Write(2, 0b11);  // explicit channel count follows
    output->Write(13, nb_chans - 1);
  }
  // pcolors <= kMaxColors + kChunkSize - 1
  static_assert(kMaxColors + kChunkSize < 1281,
                "add code to signal larger palette sizes");
  // Variable-length palette-size encoding: 8 bits for < 256 colors,
  // otherwise 10 bits biased by 256.
  if (pcolors < 256) {
    output->Write(2, 0b00);
    output->Write(8, pcolors);
  } else {
    output->Write(2, 0b01);
    output->Write(10, pcolors - 256);
  }

  output->Write(2, 0b00);  // nb_deltas == 0
  output->Write(4, 0);     // Zero predictor for delta palette
  // Encode palette
  ChunkEncoder<UpTo8Bits> encoder;
  ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits> row_encoder;
  row_encoder.t = &encoder;
  encoder.output = output;
  encoder.code = &code[0];
  encoder.PrepareForSimd();
  // p[c] is the palette row for byte/channel c; 16 slots of left padding
  // (slot 15 serves as the out-of-bounds "left" neighbor) plus entry space.
  std::vector<std::array<int16_t, 32 + 1024>> p(4);
  size_t i = 0;
  size_t have_zero = 1;  // shift entries by one so slot 16 stays all-zero
  // NOTE(review): the final write lands at index 16 + pcolors but only
  // pcolors entries (starting at 16) are encoded below — confirm the
  // palette/pcolors convention against LLPrepare.
  for (; i < pcolors; i++) {
    p[0][16 + i + have_zero] = palette[i] & 0xFF;
    p[1][16 + i + have_zero] = (palette[i] >> 8) & 0xFF;
    p[2][16 + i + have_zero] = (palette[i] >> 16) & 0xFF;
    p[3][16 + i + have_zero] = (palette[i] >> 24) & 0xFF;
  }
  // Row 0: left, top, and topleft neighbors all read the same 0-filled slot.
  p[0][15] = 0;
  row_encoder.ProcessRow(p[0].data() + 16, p[0].data() + 15, p[0].data() + 15,
                         p[0].data() + 15, pcolors);
  // Each following row uses the previous row as its top context; slot 15 of
  // both rows is set to the previous row's first sample to fake the x == -1
  // neighbors.
  p[1][15] = p[0][16];
  p[0][15] = p[0][16];
  if (nb_chans > 1) {
    row_encoder.ProcessRow(p[1].data() + 16, p[1].data() + 15, p[0].data() + 16,
                           p[0].data() + 15, pcolors);
  }
  p[2][15] = p[1][16];
  p[1][15] = p[1][16];
  if (nb_chans > 2) {
    row_encoder.ProcessRow(p[2].data() + 16, p[2].data() + 15, p[1].data() + 16,
                           p[1].data() + 15, pcolors);
  }
  p[3][15] = p[2][16];
  p[2][15] = p[2][16];
  if (nb_chans > 3) {
    row_encoder.ProcessRow(p[3].data() + 16, p[3].data() + 15, p[2].data() + 16,
                           p[2].data() + 15, pcolors);
  }
  row_encoder.Finalize();

  if (!is_single_group) {
    // Sections other than the single-group stream are byte-aligned.
    output->ZeroPadToByte();
  }
}
3733 | | |
// Scans one row of `width` pixels, inserting each packed pixel value into the
// hash table `palette` (value 0 marks an empty slot; pixel_hash maps 0 to 0,
// so the all-zero pixel occupies slot 0 consistently). Returns true if two
// distinct pixel values hashed to the same slot, in which case the caller
// abandons the palette attempt.
template <size_t nb_chans>
bool detect_palette(const unsigned char* r, size_t width,
                    std::vector<uint32_t>& palette) {
  size_t x = 0;
  bool collided = false;
  // this is just an unrolling of the next loop
  // The unrolled body always loads 4 bytes per pixel and masks afterwards, so
  // it can read up to 3 bytes past pixel x+7; look_ahead keeps those reads
  // inside the row (3 extra pixels' worth for 1 channel, 1 for 2-3 channels,
  // 0 for 4 channels).
  size_t look_ahead = 7 + ((nb_chans == 1) ? 3 : ((nb_chans < 4) ? 1 : 0));
  for (; x + look_ahead < width; x += 8) {
    uint32_t p[8] = {}, index[8];
    for (int i = 0; i < 8; i++) {
      for (int j = 0; j < 4; ++j) {
        p[i] |= r[(x + i) * nb_chans + j] << (8 * j);
      }
    }
    // Drop the over-read bytes when nb_chans < 4.
    for (int i = 0; i < 8; i++) p[i] &= ((1llu << (8 * nb_chans)) - 1);
    for (int i = 0; i < 8; i++) index[i] = pixel_hash(p[i]);
    for (int i = 0; i < 8; i++) {
      // Collision: the slot already holds a different nonzero pixel value.
      collided |= (palette[index[i]] != 0 && p[i] != palette[index[i]]);
    }
    for (int i = 0; i < 8; i++) palette[index[i]] = p[i];
  }
  // Scalar tail: packs exactly nb_chans bytes per pixel, no over-read.
  for (; x < width; x++) {
    uint32_t p = 0;
    for (size_t i = 0; i < nb_chans; ++i) {
      p |= r[x * nb_chans + i] << (8 * i);
    }
    uint32_t index = pixel_hash(p);
    collided |= (palette[index] != 0 && p != palette[index]);
    palette[index] = p;
  }
  return collided;
}
3766 | | |
3767 | | template <typename BitDepth> |
3768 | | JxlFastLosslessFrameState* LLPrepare(JxlChunkedFrameInputSource input, |
3769 | | size_t width, size_t height, |
3770 | | BitDepth bitdepth, size_t nb_chans, |
3771 | 0 | bool big_endian, int effort, int oneshot) { |
3772 | 0 | assert(width != 0); |
3773 | 0 | assert(height != 0); |
3774 | | |
3775 | | // Count colors to try palette |
3776 | 0 | std::vector<uint32_t> palette(kHashSize); |
3777 | 0 | std::vector<int16_t> lookup(kHashSize); |
3778 | 0 | lookup[0] = 0; |
3779 | 0 | int pcolors = 0; |
3780 | 0 | bool collided = effort < 2 || bitdepth.bitdepth != 8 || !oneshot; |
3781 | 0 | for (size_t y0 = 0; y0 < height && !collided; y0 += 256) { |
3782 | 0 | size_t ys = std::min<size_t>(height - y0, 256); |
3783 | 0 | for (size_t x0 = 0; x0 < width && !collided; x0 += 256) { |
3784 | 0 | size_t xs = std::min<size_t>(width - x0, 256); |
3785 | 0 | size_t stride; |
3786 | | // TODO(szabadka): Add RAII wrapper around this. |
3787 | 0 | const void* buffer = input.get_color_channel_data_at(input.opaque, x0, y0, |
3788 | 0 | xs, ys, &stride); |
3789 | 0 | auto rgba = reinterpret_cast<const unsigned char*>(buffer); |
3790 | 0 | for (size_t y = 0; y < ys && !collided; y++) { |
3791 | 0 | const unsigned char* r = rgba + stride * y; |
3792 | 0 | if (nb_chans == 1) collided = detect_palette<1>(r, xs, palette); |
3793 | 0 | if (nb_chans == 2) collided = detect_palette<2>(r, xs, palette); |
3794 | 0 | if (nb_chans == 3) collided = detect_palette<3>(r, xs, palette); |
3795 | 0 | if (nb_chans == 4) collided = detect_palette<4>(r, xs, palette); |
3796 | 0 | } |
3797 | 0 | input.release_buffer(input.opaque, buffer); |
3798 | 0 | } |
3799 | 0 | } |
3800 | 0 | int nb_entries = 0; |
3801 | 0 | if (!collided) { |
3802 | 0 | pcolors = 1; // always have all-zero as a palette color |
3803 | 0 | bool have_color = false; |
3804 | 0 | uint8_t minG = 255, maxG = 0; |
3805 | 0 | for (uint32_t k = 0; k < kHashSize; k++) { |
3806 | 0 | if (palette[k] == 0) continue; |
3807 | 0 | uint8_t p[4]; |
3808 | 0 | for (int i = 0; i < 4; ++i) { |
3809 | 0 | p[i] = (palette[k] >> (8 * i)) & 0xFF; |
3810 | 0 | } |
3811 | | // move entries to front so sort has less work |
3812 | 0 | palette[nb_entries] = palette[k]; |
3813 | 0 | if (p[0] != p[1] || p[0] != p[2]) have_color = true; |
3814 | 0 | if (p[1] < minG) minG = p[1]; |
3815 | 0 | if (p[1] > maxG) maxG = p[1]; |
3816 | 0 | nb_entries++; |
3817 | | // don't do palette if too many colors are needed |
3818 | 0 | if (nb_entries + pcolors > kMaxColors) { |
3819 | 0 | collided = true; |
3820 | 0 | break; |
3821 | 0 | } |
3822 | 0 | } |
3823 | 0 | if (!have_color) { |
3824 | | // don't do palette if it's just grayscale without many holes |
3825 | 0 | if (maxG - minG < nb_entries * 1.4f) collided = true; |
3826 | 0 | } |
3827 | 0 | } |
3828 | 0 | if (!collided) { |
3829 | 0 | std::sort( |
3830 | 0 | palette.begin(), palette.begin() + nb_entries, |
3831 | 0 | [&nb_chans](uint32_t ap, uint32_t bp) { |
3832 | 0 | if (ap == 0) return false; |
3833 | 0 | if (bp == 0) return true; |
3834 | 0 | uint8_t a[4], b[4]; |
3835 | 0 | for (int i = 0; i < 4; ++i) { |
3836 | 0 | a[i] = (ap >> (8 * i)) & 0xFF; |
3837 | 0 | b[i] = (bp >> (8 * i)) & 0xFF; |
3838 | 0 | } |
3839 | 0 | float ay, by; |
3840 | 0 | if (nb_chans == 4) { |
3841 | 0 | ay = (0.299f * a[0] + 0.587f * a[1] + 0.114f * a[2] + 0.01f) * a[3]; |
3842 | 0 | by = (0.299f * b[0] + 0.587f * b[1] + 0.114f * b[2] + 0.01f) * b[3]; |
3843 | 0 | } else { |
3844 | 0 | ay = (0.299f * a[0] + 0.587f * a[1] + 0.114f * a[2] + 0.01f); |
3845 | 0 | by = (0.299f * b[0] + 0.587f * b[1] + 0.114f * b[2] + 0.01f); |
3846 | 0 | } |
3847 | 0 | return ay < by; // sort on alpha*luma |
3848 | 0 | }); Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, 
unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const |
3849 | 0 | for (int k = 0; k < nb_entries; k++) { |
3850 | 0 | if (palette[k] == 0) break; |
3851 | 0 | lookup[pixel_hash(palette[k])] = pcolors++; |
3852 | 0 | } |
3853 | 0 | } |
3854 | |
|
3855 | 0 | size_t num_groups_x = (width + 255) / 256; |
3856 | 0 | size_t num_groups_y = (height + 255) / 256; |
3857 | 0 | size_t num_dc_groups_x = (width + 2047) / 2048; |
3858 | 0 | size_t num_dc_groups_y = (height + 2047) / 2048; |
3859 | |
|
3860 | 0 | uint64_t raw_counts[4][kNumRawSymbols] = {}; |
3861 | 0 | uint64_t lz77_counts[4][kNumLZ77] = {}; |
3862 | |
|
3863 | 0 | bool onegroup = num_groups_x == 1 && num_groups_y == 1; |
3864 | |
|
3865 | 0 | auto sample_rows = [&](size_t xg, size_t yg, size_t num_rows) { |
3866 | 0 | size_t y0 = yg * 256; |
3867 | 0 | size_t x0 = xg * 256; |
3868 | 0 | size_t ys = std::min<size_t>(height - y0, 256); |
3869 | 0 | size_t xs = std::min<size_t>(width - x0, 256); |
3870 | 0 | size_t stride; |
3871 | 0 | const void* buffer = |
3872 | 0 | input.get_color_channel_data_at(input.opaque, x0, y0, xs, ys, &stride); |
3873 | 0 | auto rgba = reinterpret_cast<const unsigned char*>(buffer); |
3874 | 0 | int y_begin_group = |
3875 | 0 | std::max<ssize_t>( |
3876 | 0 | 0, static_cast<ssize_t>(ys) - static_cast<ssize_t>(num_rows)) / |
3877 | 0 | 2; |
3878 | 0 | int y_count = std::min<int>(num_rows, ys - y_begin_group); |
3879 | 0 | int x_max = xs / kChunkSize * kChunkSize; |
3880 | 0 | CollectSamples(rgba, 0, y_begin_group, x_max, stride, y_count, raw_counts, |
3881 | 0 | lz77_counts, onegroup, !collided, bitdepth, nb_chans, |
3882 | 0 | big_endian, lookup.data()); |
3883 | 0 | input.release_buffer(input.opaque, buffer); |
3884 | 0 | }; Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const Unexecuted instantiation: 
enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const |
3885 | | |
3886 | | // TODO(veluca): that `64` is an arbitrary constant, meant to correspond to |
3887 | | // the point where the number of processed rows is large enough that loading |
3888 | | // the entire image is cost-effective. |
3889 | 0 | if (oneshot || effort >= 64) { |
3890 | 0 | for (size_t g = 0; g < num_groups_y * num_groups_x; g++) { |
3891 | 0 | size_t xg = g % num_groups_x; |
3892 | 0 | size_t yg = g / num_groups_x; |
3893 | 0 | size_t y0 = yg * 256; |
3894 | 0 | size_t ys = std::min<size_t>(height - y0, 256); |
3895 | 0 | size_t num_rows = 2 * effort * ys / 256; |
3896 | 0 | sample_rows(xg, yg, num_rows); |
3897 | 0 | } |
3898 | 0 | } else { |
3899 | | // sample the middle (effort * 2 * num_groups) rows of the center group |
3900 | | // (possibly all of them). |
3901 | 0 | sample_rows((num_groups_x - 1) / 2, (num_groups_y - 1) / 2, |
3902 | 0 | 2 * effort * num_groups_x * num_groups_y); |
3903 | 0 | } |
3904 | | |
3905 | | // TODO(veluca): can probably improve this and make it bitdepth-dependent. |
3906 | 0 | uint64_t base_raw_counts[kNumRawSymbols] = { |
3907 | 0 | 3843, 852, 1270, 1214, 1014, 727, 481, 300, 159, 51, |
3908 | 0 | 5, 1, 1, 1, 1, 1, 1, 1, 1}; |
3909 | |
|
3910 | 0 | bool doing_ycocg = nb_chans > 2 && collided; |
3911 | 0 | bool large_palette = !collided || pcolors >= 256; |
3912 | 0 | for (size_t i = bitdepth.NumSymbols(doing_ycocg || large_palette); |
3913 | 0 | i < kNumRawSymbols; i++) { |
3914 | 0 | base_raw_counts[i] = 0; |
3915 | 0 | } |
3916 | |
|
3917 | 0 | for (size_t c = 0; c < 4; c++) { |
3918 | 0 | for (size_t i = 0; i < kNumRawSymbols; i++) { |
3919 | 0 | raw_counts[c][i] = (raw_counts[c][i] << 8) + base_raw_counts[i]; |
3920 | 0 | } |
3921 | 0 | } |
3922 | |
|
3923 | 0 | if (!collided) { |
3924 | 0 | unsigned token, nbits, bits; |
3925 | 0 | EncodeHybridUint000(PackSigned(pcolors - 1), &token, &nbits, &bits); |
3926 | | // ensure all palette indices can actually be encoded |
3927 | 0 | for (size_t i = 0; i < token + 1; i++) |
3928 | 0 | raw_counts[0][i] = std::max<uint64_t>(raw_counts[0][i], 1); |
3929 | | // these tokens are only used for the palette itself so they can get a bad |
3930 | | // code |
3931 | 0 | for (size_t i = token + 1; i < 10; i++) raw_counts[0][i] = 1; |
3932 | 0 | } |
3933 | |
|
3934 | 0 | uint64_t base_lz77_counts[kNumLZ77] = { |
3935 | 0 | 29, 27, 25, 23, 21, 21, 19, 18, 21, 17, 16, 15, 15, 14, |
3936 | 0 | 13, 13, 137, 98, 61, 34, 1, 1, 1, 1, 1, 1, 1, 1, |
3937 | 0 | }; |
3938 | |
|
3939 | 0 | for (size_t c = 0; c < 4; c++) { |
3940 | 0 | for (size_t i = 0; i < kNumLZ77; i++) { |
3941 | 0 | lz77_counts[c][i] = (lz77_counts[c][i] << 8) + base_lz77_counts[i]; |
3942 | 0 | } |
3943 | 0 | } |
3944 | |
|
3945 | 0 | JxlFastLosslessFrameState* frame_state = new JxlFastLosslessFrameState(); |
3946 | 0 | for (size_t i = 0; i < 4; i++) { |
3947 | 0 | frame_state->hcode[i] = PrefixCode(bitdepth, raw_counts[i], lz77_counts[i]); |
3948 | 0 | } |
3949 | |
|
3950 | 0 | size_t num_dc_groups = num_dc_groups_x * num_dc_groups_y; |
3951 | 0 | size_t num_ac_groups = num_groups_x * num_groups_y; |
3952 | 0 | size_t num_groups = onegroup ? 1 : (2 + num_dc_groups + num_ac_groups); |
3953 | 0 | frame_state->input = input; |
3954 | 0 | frame_state->width = width; |
3955 | 0 | frame_state->height = height; |
3956 | 0 | frame_state->num_groups_x = num_groups_x; |
3957 | 0 | frame_state->num_groups_y = num_groups_y; |
3958 | 0 | frame_state->num_dc_groups_x = num_dc_groups_x; |
3959 | 0 | frame_state->num_dc_groups_y = num_dc_groups_y; |
3960 | 0 | frame_state->nb_chans = nb_chans; |
3961 | 0 | frame_state->bitdepth = bitdepth.bitdepth; |
3962 | 0 | frame_state->big_endian = big_endian; |
3963 | 0 | frame_state->effort = effort; |
3964 | 0 | frame_state->collided = collided; |
3965 | 0 | frame_state->lookup = lookup; |
3966 | |
|
3967 | 0 | frame_state->group_data = std::vector<std::array<BitWriter, 4>>(num_groups); |
3968 | 0 | frame_state->group_sizes.resize(num_groups); |
3969 | 0 | if (collided) { |
3970 | 0 | PrepareDCGlobal(onegroup, width, height, nb_chans, frame_state->hcode, |
3971 | 0 | &frame_state->group_data[0][0]); |
3972 | 0 | } else { |
3973 | 0 | PrepareDCGlobalPalette(onegroup, width, height, nb_chans, |
3974 | 0 | frame_state->hcode, palette, pcolors, |
3975 | 0 | &frame_state->group_data[0][0]); |
3976 | 0 | } |
3977 | 0 | frame_state->group_sizes[0] = SectionSize(frame_state->group_data[0]); |
3978 | 0 | if (!onegroup) { |
3979 | 0 | ComputeAcGroupDataOffset(frame_state->group_sizes[0], num_dc_groups, |
3980 | 0 | num_ac_groups, frame_state->min_dc_global_size, |
3981 | 0 | frame_state->ac_group_data_offset); |
3982 | 0 | } |
3983 | |
|
3984 | 0 | return frame_state; |
3985 | 0 | } Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int) Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int) Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int) Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int) Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int) Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int) Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* default_implementation::(anonymous 
namespace)::LLPrepare<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int) Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int) |
3986 | | |
3987 | | template <typename BitDepth> |
3988 | | jxl::Status LLProcess(JxlFastLosslessFrameState* frame_state, bool is_last, |
3989 | | BitDepth bitdepth, void* runner_opaque, |
3990 | | FJxlParallelRunner runner, |
3991 | 0 | JxlEncoderOutputProcessorWrapper* output_processor) { |
3992 | 0 | #if !FJXL_STANDALONE |
3993 | 0 | if (frame_state->process_done) { |
3994 | 0 | JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/0, is_last); |
3995 | 0 | if (output_processor) { |
3996 | 0 | JXL_RETURN_IF_ERROR( |
3997 | 0 | JxlFastLosslessOutputFrame(frame_state, output_processor)); |
3998 | 0 | } |
3999 | 0 | return true; |
4000 | 0 | } |
4001 | 0 | #endif |
4002 | | // The maximum number of groups that we process concurrently here. |
4003 | | // TODO(szabadka) Use the number of threads or some outside parameter for the |
4004 | | // maximum memory usage instead. |
4005 | 0 | constexpr size_t kMaxLocalGroups = 16; |
4006 | 0 | bool onegroup = frame_state->group_sizes.size() == 1; |
4007 | 0 | bool streaming = !onegroup && output_processor; |
4008 | 0 | size_t total_groups = frame_state->num_groups_x * frame_state->num_groups_y; |
4009 | 0 | size_t max_groups = streaming ? kMaxLocalGroups : total_groups; |
4010 | 0 | #if !FJXL_STANDALONE |
4011 | 0 | size_t start_pos = 0; |
4012 | 0 | if (streaming) { |
4013 | 0 | start_pos = output_processor->CurrentPosition(); |
4014 | 0 | JXL_RETURN_IF_ERROR( |
4015 | 0 | output_processor->Seek(start_pos + frame_state->ac_group_data_offset)); |
4016 | 0 | } |
4017 | 0 | #endif |
4018 | 0 | for (size_t offset = 0; offset < total_groups; offset += max_groups) { |
4019 | 0 | size_t num_groups = std::min(max_groups, total_groups - offset); |
4020 | 0 | JxlFastLosslessFrameState local_frame_state; |
4021 | 0 | if (streaming) { |
4022 | 0 | local_frame_state.group_data = |
4023 | 0 | std::vector<std::array<BitWriter, 4>>(num_groups); |
4024 | 0 | } |
4025 | 0 | auto run_one = [&](size_t i) { |
4026 | 0 | size_t g = offset + i; |
4027 | 0 | size_t xg = g % frame_state->num_groups_x; |
4028 | 0 | size_t yg = g / frame_state->num_groups_x; |
4029 | 0 | size_t num_dc_groups = |
4030 | 0 | frame_state->num_dc_groups_x * frame_state->num_dc_groups_y; |
4031 | 0 | size_t group_id = onegroup ? 0 : (2 + num_dc_groups + g); |
4032 | 0 | size_t xs = std::min<size_t>(frame_state->width - xg * 256, 256); |
4033 | 0 | size_t ys = std::min<size_t>(frame_state->height - yg * 256, 256); |
4034 | 0 | size_t x0 = xg * 256; |
4035 | 0 | size_t y0 = yg * 256; |
4036 | 0 | size_t stride; |
4037 | 0 | JxlChunkedFrameInputSource input = frame_state->input; |
4038 | 0 | const void* buffer = input.get_color_channel_data_at(input.opaque, x0, y0, |
4039 | 0 | xs, ys, &stride); |
4040 | 0 | const unsigned char* rgba = |
4041 | 0 | reinterpret_cast<const unsigned char*>(buffer); |
4042 | |
|
4043 | 0 | auto& gd = streaming ? local_frame_state.group_data[i] |
4044 | 0 | : frame_state->group_data[group_id]; |
4045 | 0 | if (frame_state->collided) { |
4046 | 0 | WriteACSection(rgba, 0, 0, xs, ys, stride, onegroup, bitdepth, |
4047 | 0 | frame_state->nb_chans, frame_state->big_endian, |
4048 | 0 | frame_state->hcode, gd); |
4049 | 0 | } else { |
4050 | 0 | WriteACSectionPalette(rgba, 0, 0, xs, ys, stride, onegroup, |
4051 | 0 | frame_state->hcode, frame_state->lookup.data(), |
4052 | 0 | frame_state->nb_chans, gd[0]); |
4053 | 0 | } |
4054 | 0 | frame_state->group_sizes[group_id] = SectionSize(gd); |
4055 | 0 | input.release_buffer(input.opaque, buffer); |
4056 | 0 | }; Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const Unexecuted instantiation: 
enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const |
4057 | 0 | runner( |
4058 | 0 | runner_opaque, &run_one, |
4059 | 0 | +[](void* r, size_t i) { |
4060 | 0 | (*reinterpret_cast<decltype(&run_one)>(r))(i); |
4061 | 0 | }, Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned 
long)#1}::operator()(void*, unsigned long) const Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const |
4062 | 0 | num_groups); |
4063 | 0 | #if !FJXL_STANDALONE |
4064 | 0 | if (streaming) { |
4065 | 0 | local_frame_state.nb_chans = frame_state->nb_chans; |
4066 | 0 | local_frame_state.current_bit_writer = 1; |
4067 | 0 | JXL_RETURN_IF_ERROR( |
4068 | 0 | JxlFastLosslessOutputFrame(&local_frame_state, output_processor)); |
4069 | 0 | } |
4070 | 0 | #endif |
4071 | 0 | } |
4072 | 0 | #if !FJXL_STANDALONE |
4073 | 0 | if (streaming) { |
4074 | 0 | size_t end_pos = output_processor->CurrentPosition(); |
4075 | 0 | JXL_RETURN_IF_ERROR(output_processor->Seek(start_pos)); |
4076 | 0 | frame_state->group_data.resize(1); |
4077 | 0 | bool have_alpha = frame_state->nb_chans == 2 || frame_state->nb_chans == 4; |
4078 | 0 | size_t padding = ComputeDcGlobalPadding( |
4079 | 0 | frame_state->group_sizes, frame_state->ac_group_data_offset, |
4080 | 0 | frame_state->min_dc_global_size, have_alpha, is_last); |
4081 | |
|
4082 | 0 | for (size_t i = 0; i < padding; ++i) { |
4083 | 0 | frame_state->group_data[0][0].Write(8, 0); |
4084 | 0 | } |
4085 | 0 | frame_state->group_sizes[0] += padding; |
4086 | 0 | JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/0, is_last); |
4087 | 0 | assert(frame_state->ac_group_data_offset == |
4088 | 0 | JxlFastLosslessOutputSize(frame_state)); |
4089 | 0 | JXL_RETURN_IF_ERROR( |
4090 | 0 | JxlFastLosslessOutputHeaders(frame_state, output_processor)); |
4091 | 0 | JXL_RETURN_IF_ERROR(output_processor->Seek(end_pos)); |
4092 | 0 | } else if (output_processor) { |
4093 | 0 | assert(onegroup); |
4094 | 0 | JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/0, is_last); |
4095 | 0 | if (output_processor) { |
4096 | 0 | JXL_RETURN_IF_ERROR( |
4097 | 0 | JxlFastLosslessOutputFrame(frame_state, output_processor)); |
4098 | 0 | } |
4099 | 0 | } |
4100 | 0 | frame_state->process_done = true; |
4101 | 0 | #endif |
4102 | 0 | return true; |
4103 | 0 | } Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*) Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*) Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*) Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*) Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*) Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::From9To13Bits, void*, 
void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*) Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*) Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*) |
4104 | | |
4105 | | JxlFastLosslessFrameState* JxlFastLosslessPrepareImpl( |
4106 | | JxlChunkedFrameInputSource input, size_t width, size_t height, |
4107 | | size_t nb_chans, size_t bitdepth, bool big_endian, int effort, |
4108 | 0 | int oneshot) { |
4109 | 0 | assert(bitdepth > 0); |
4110 | 0 | assert(nb_chans <= 4); |
4111 | 0 | assert(nb_chans != 0); |
4112 | 0 | if (bitdepth <= 8) { |
4113 | 0 | return LLPrepare(input, width, height, UpTo8Bits(bitdepth), nb_chans, |
4114 | 0 | big_endian, effort, oneshot); |
4115 | 0 | } |
4116 | 0 | if (bitdepth <= 13) { |
4117 | 0 | return LLPrepare(input, width, height, From9To13Bits(bitdepth), nb_chans, |
4118 | 0 | big_endian, effort, oneshot); |
4119 | 0 | } |
4120 | 0 | if (bitdepth == 14) { |
4121 | 0 | return LLPrepare(input, width, height, Exactly14Bits(bitdepth), nb_chans, |
4122 | 0 | big_endian, effort, oneshot); |
4123 | 0 | } |
4124 | 0 | return LLPrepare(input, width, height, MoreThan14Bits(bitdepth), nb_chans, |
4125 | 0 | big_endian, effort, oneshot); |
4126 | 0 | } Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::JxlFastLosslessPrepareImpl(JxlChunkedFrameInputSource, unsigned long, unsigned long, unsigned long, unsigned long, bool, int, int) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::JxlFastLosslessPrepareImpl(JxlChunkedFrameInputSource, unsigned long, unsigned long, unsigned long, unsigned long, bool, int, int) |
4127 | | |
4128 | | jxl::Status JxlFastLosslessProcessFrameImpl( |
4129 | | JxlFastLosslessFrameState* frame_state, bool is_last, void* runner_opaque, |
4130 | | FJxlParallelRunner runner, |
4131 | 0 | JxlEncoderOutputProcessorWrapper* output_processor) { |
4132 | 0 | const size_t bitdepth = frame_state->bitdepth; |
4133 | 0 | if (bitdepth <= 8) { |
4134 | 0 | JXL_RETURN_IF_ERROR(LLProcess(frame_state, is_last, UpTo8Bits(bitdepth), |
4135 | 0 | runner_opaque, runner, output_processor)); |
4136 | 0 | } else if (bitdepth <= 13) { |
4137 | 0 | JXL_RETURN_IF_ERROR(LLProcess(frame_state, is_last, From9To13Bits(bitdepth), |
4138 | 0 | runner_opaque, runner, output_processor)); |
4139 | 0 | } else if (bitdepth == 14) { |
4140 | 0 | JXL_RETURN_IF_ERROR(LLProcess(frame_state, is_last, Exactly14Bits(bitdepth), |
4141 | 0 | runner_opaque, runner, output_processor)); |
4142 | 0 | } else { |
4143 | 0 | JXL_RETURN_IF_ERROR(LLProcess(frame_state, is_last, |
4144 | 0 | MoreThan14Bits(bitdepth), runner_opaque, |
4145 | 0 | runner, output_processor)); |
4146 | 0 | } |
4147 | 0 | return true; |
4148 | 0 | } Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::JxlFastLosslessProcessFrameImpl(JxlFastLosslessFrameState*, bool, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*) Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::JxlFastLosslessProcessFrameImpl(JxlFastLosslessFrameState*, bool, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*) |
4149 | | |
4150 | | } // namespace |
4151 | | |
#endif  // FJXL_SELF_INCLUDE

#ifndef FJXL_SELF_INCLUDE

#define FJXL_SELF_INCLUDE

// This file includes itself once per SIMD target: each inclusion below
// re-compiles the implementation (guarded by FJXL_SELF_INCLUDE) inside a
// target-specific namespace, with FJXL_NEON/FJXL_AVX2/FJXL_AVX512 selecting
// the code paths and compiler pragmas enabling the matching instruction sets.

// If we have NEON enabled, it is the default target.
#if FJXL_ENABLE_NEON

namespace default_implementation {
#define FJXL_NEON
#include "lib/jxl/enc_fast_lossless.cc"
#undef FJXL_NEON
}  // namespace default_implementation

#else  // FJXL_ENABLE_NEON

// Scalar build: no target macro defined for the default implementation.
namespace default_implementation {
#include "lib/jxl/enc_fast_lossless.cc"  // NOLINT
}

#if FJXL_ENABLE_AVX2
#ifdef __clang__
#pragma clang attribute push(__attribute__((target("avx,avx2"))), \
                             apply_to = function)
// Causes spurious warnings on clang5.
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wmissing-braces"
#elif defined(__GNUC__)
#pragma GCC push_options
// Seems to cause spurious errors on GCC8.
#pragma GCC diagnostic ignored "-Wpsabi"
#pragma GCC target "avx,avx2"
#endif

namespace AVX2 {
#define FJXL_AVX2
#include "lib/jxl/enc_fast_lossless.cc"  // NOLINT
#undef FJXL_AVX2
}  // namespace AVX2

#ifdef __clang__
#pragma clang attribute pop
#pragma clang diagnostic pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif
#endif  // FJXL_ENABLE_AVX2

#if FJXL_ENABLE_AVX512
#ifdef __clang__
#pragma clang attribute push( \
    __attribute__((target("avx512cd,avx512bw,avx512vl,avx512f,avx512vbmi"))), \
    apply_to = function)
#elif defined(__GNUC__)
#pragma GCC push_options
#pragma GCC target "avx512cd,avx512bw,avx512vl,avx512f,avx512vbmi"
#endif

namespace AVX512 {
#define FJXL_AVX512
#include "lib/jxl/enc_fast_lossless.cc"
#undef FJXL_AVX512
}  // namespace AVX512

#ifdef __clang__
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif
#endif  // FJXL_ENABLE_AVX512

#endif
4226 | | extern "C" { |
4227 | | |
4228 | | #if FJXL_STANDALONE |
4229 | | class FJxlFrameInput { |
4230 | | public: |
4231 | | FJxlFrameInput(const unsigned char* rgba, size_t row_stride, size_t nb_chans, |
4232 | | size_t bitdepth) |
4233 | | : rgba_(rgba), |
4234 | | row_stride_(row_stride), |
4235 | | bytes_per_pixel_(bitdepth <= 8 ? nb_chans : 2 * nb_chans) {} |
4236 | | |
4237 | | JxlChunkedFrameInputSource GetInputSource() { |
4238 | | return JxlChunkedFrameInputSource{this, GetDataAt, |
4239 | | [](void*, const void*) {}}; |
4240 | | } |
4241 | | |
4242 | | private: |
4243 | | static const void* GetDataAt(void* opaque, size_t xpos, size_t ypos, |
4244 | | size_t xsize, size_t ysize, size_t* row_offset) { |
4245 | | FJxlFrameInput* self = static_cast<FJxlFrameInput*>(opaque); |
4246 | | *row_offset = self->row_stride_; |
4247 | | return self->rgba_ + ypos * (*row_offset) + xpos * self->bytes_per_pixel_; |
4248 | | } |
4249 | | |
4250 | | const uint8_t* rgba_; |
4251 | | size_t row_stride_; |
4252 | | size_t bytes_per_pixel_; |
4253 | | }; |
4254 | | |
4255 | | size_t JxlFastLosslessEncode(const unsigned char* rgba, size_t width, |
4256 | | size_t row_stride, size_t height, size_t nb_chans, |
4257 | | size_t bitdepth, bool big_endian, int effort, |
4258 | | unsigned char** output, void* runner_opaque, |
4259 | | FJxlParallelRunner runner) { |
4260 | | FJxlFrameInput input(rgba, row_stride, nb_chans, bitdepth); |
4261 | | auto* frame_state = JxlFastLosslessPrepareFrame( |
4262 | | input.GetInputSource(), width, height, nb_chans, bitdepth, big_endian, |
4263 | | effort, /*oneshot=*/true); |
4264 | | if (!JxlFastLosslessProcessFrame(frame_state, /*is_last=*/true, runner_opaque, |
4265 | | runner, nullptr)) { |
4266 | | return 0; |
4267 | | } |
4268 | | JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/1, |
4269 | | /*is_last=*/1); |
4270 | | size_t output_size = JxlFastLosslessMaxRequiredOutput(frame_state); |
4271 | | *output = (unsigned char*)malloc(output_size); |
4272 | | size_t written = 0; |
4273 | | size_t total = 0; |
4274 | | while ((written = JxlFastLosslessWriteOutput(frame_state, *output + total, |
4275 | | output_size - total)) != 0) { |
4276 | | total += written; |
4277 | | } |
4278 | | JxlFastLosslessFreeFrameState(frame_state); |
4279 | | return total; |
4280 | | } |
4281 | | #endif |
4282 | | |
4283 | | JxlFastLosslessFrameState* JxlFastLosslessPrepareFrame( |
4284 | | JxlChunkedFrameInputSource input, size_t width, size_t height, |
4285 | | size_t nb_chans, size_t bitdepth, bool big_endian, int effort, |
4286 | 0 | int oneshot) { |
4287 | | #if FJXL_ENABLE_AVX512 |
4288 | | if (HasCpuFeature(CpuFeature::kAVX512CD) && |
4289 | | HasCpuFeature(CpuFeature::kVBMI) && |
4290 | | HasCpuFeature(CpuFeature::kAVX512BW) && |
4291 | | HasCpuFeature(CpuFeature::kAVX512F) && |
4292 | | HasCpuFeature(CpuFeature::kAVX512VL)) { |
4293 | | return AVX512::JxlFastLosslessPrepareImpl( |
4294 | | input, width, height, nb_chans, bitdepth, big_endian, effort, oneshot); |
4295 | | } |
4296 | | #endif |
4297 | 0 | #if FJXL_ENABLE_AVX2 |
4298 | 0 | if (HasCpuFeature(CpuFeature::kAVX2)) { |
4299 | 0 | return AVX2::JxlFastLosslessPrepareImpl( |
4300 | 0 | input, width, height, nb_chans, bitdepth, big_endian, effort, oneshot); |
4301 | 0 | } |
4302 | 0 | #endif |
4303 | | |
4304 | 0 | return default_implementation::JxlFastLosslessPrepareImpl( |
4305 | 0 | input, width, height, nb_chans, bitdepth, big_endian, effort, oneshot); |
4306 | 0 | } |
4307 | | |
4308 | | bool JxlFastLosslessProcessFrame( |
4309 | | JxlFastLosslessFrameState* frame_state, bool is_last, void* runner_opaque, |
4310 | | FJxlParallelRunner runner, |
4311 | 0 | JxlEncoderOutputProcessorWrapper* output_processor) { |
4312 | 0 | auto trivial_runner = |
4313 | 0 | +[](void*, void* opaque, void fun(void*, size_t), size_t count) { |
4314 | 0 | for (size_t i = 0; i < count; i++) { |
4315 | 0 | fun(opaque, i); |
4316 | 0 | } |
4317 | 0 | }; |
4318 | |
|
4319 | 0 | if (runner == nullptr) { |
4320 | 0 | runner = trivial_runner; |
4321 | 0 | } |
4322 | |
|
4323 | | #if FJXL_ENABLE_AVX512 |
4324 | | if (HasCpuFeature(CpuFeature::kAVX512CD) && |
4325 | | HasCpuFeature(CpuFeature::kVBMI) && |
4326 | | HasCpuFeature(CpuFeature::kAVX512BW) && |
4327 | | HasCpuFeature(CpuFeature::kAVX512F) && |
4328 | | HasCpuFeature(CpuFeature::kAVX512VL)) { |
4329 | | JXL_RETURN_IF_ERROR(AVX512::JxlFastLosslessProcessFrameImpl( |
4330 | | frame_state, is_last, runner_opaque, runner, output_processor)); |
4331 | | return true; |
4332 | | } |
4333 | | #endif |
4334 | 0 | #if FJXL_ENABLE_AVX2 |
4335 | 0 | if (HasCpuFeature(CpuFeature::kAVX2)) { |
4336 | 0 | JXL_RETURN_IF_ERROR(AVX2::JxlFastLosslessProcessFrameImpl( |
4337 | 0 | frame_state, is_last, runner_opaque, runner, output_processor)); |
4338 | 0 | return true; |
4339 | 0 | } |
4340 | 0 | #endif |
4341 | | |
4342 | 0 | JXL_RETURN_IF_ERROR(default_implementation::JxlFastLosslessProcessFrameImpl( |
4343 | 0 | frame_state, is_last, runner_opaque, runner, output_processor)); |
4344 | 0 | return true; |
4345 | 0 | } |
4346 | | |
4347 | | } // extern "C" |
4348 | | |
4349 | | #if !FJXL_STANDALONE |
4350 | | bool JxlFastLosslessOutputFrame( |
4351 | | JxlFastLosslessFrameState* frame_state, |
4352 | 0 | JxlEncoderOutputProcessorWrapper* output_processor) { |
4353 | 0 | size_t fl_size = JxlFastLosslessOutputSize(frame_state); |
4354 | 0 | size_t written = 0; |
4355 | 0 | while (written < fl_size) { |
4356 | 0 | JXL_ASSIGN_OR_RETURN(auto buffer, |
4357 | 0 | output_processor->GetBuffer(32, fl_size - written)); |
4358 | 0 | size_t n = |
4359 | 0 | JxlFastLosslessWriteOutput(frame_state, buffer.data(), buffer.size()); |
4360 | 0 | if (n == 0) break; |
4361 | 0 | JXL_RETURN_IF_ERROR(buffer.advance(n)); |
4362 | 0 | written += n; |
4363 | 0 | }; |
4364 | 0 | return true; |
4365 | 0 | } |
4366 | | #endif |
4367 | | |
4368 | | #endif // FJXL_SELF_INCLUDE |