Coverage Report

Created: 2026-05-24 07:45

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libjxl/lib/jxl/enc_fast_lossless.cc
Line
Count
Source
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/base/status.h"
7
#ifndef FJXL_SELF_INCLUDE
8
9
#include <assert.h>
10
11
#include <algorithm>
12
#include <array>
13
#include <atomic>
14
#include <cstdint>
15
#include <cstdlib>
16
#include <cstring>
17
#include <limits>
18
#include <memory>
19
#include <vector>
20
21
#include "lib/jxl/enc_fast_lossless.h"
22
23
#if !FJXL_STANDALONE
24
#include "lib/jxl/encode_internal.h"
25
#endif  // FJXL_STANDALONE
26
27
#if defined(__x86_64__) || defined(_M_X64)
28
#define FJXL_ARCH_IS_X86_64 1
29
#else
30
#define FJXL_ARCH_IS_X86_64 0
31
#endif
32
33
#if defined(__i386__) || defined(_M_IX86) || FJXL_ARCH_IS_X86_64
34
#define FJXL_ARCH_IS_X86 1
35
#else
36
#define FJXL_ARCH_IS_X86 0
37
#endif
38
39
#if FJXL_ARCH_IS_X86
40
#if defined(_MSC_VER)
41
#include <intrin.h>
42
#else  // _MSC_VER
43
#include <cpuid.h>
44
#endif  // _MSC_VER
45
#endif  // FJXL_ARCH_IS_X86
46
47
// Enable NEON and AVX2/AVX512 if not asked to do otherwise and the compilers
48
// support it.
49
#if defined(__aarch64__) || defined(_M_ARM64)  // ARCH
50
#include <arm_neon.h>
51
52
#if !defined(FJXL_ENABLE_NEON)
53
#define FJXL_ENABLE_NEON 1
54
#endif  // !defined(FJXL_ENABLE_NEON)
55
56
#elif FJXL_ARCH_IS_X86_64 && !defined(_MSC_VER)  // ARCH
57
#include <immintrin.h>
58
59
// manually add _mm512_cvtsi512_si32 definition if missing
60
// (e.g. with Xcode on macOS Mojave)
61
// copied from gcc 11.1.0 include/avx512fintrin.h line 14367-14373
62
#if defined(__clang__) &&                                           \
63
    ((!defined(__apple_build_version__) && __clang_major__ < 10) || \
64
     (defined(__apple_build_version__) && __apple_build_version__ < 12000032))
65
inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
66
_mm512_cvtsi512_si32(__m512i __A) {
67
  __v16si __B = (__v16si)__A;
68
  return __B[0];
69
}
70
#endif
71
72
#if !defined(FJXL_ENABLE_AVX2)
73
#define FJXL_ENABLE_AVX2 1
74
#endif  // !defined(FJXL_ENABLE_AVX2)
75
76
#if !defined(FJXL_ENABLE_AVX512)
77
// On clang-7 or earlier, and gcc-10 or earlier, AVX512 seems broken.
78
#if (defined(__clang__) &&                                             \
79
         (!defined(__apple_build_version__) && __clang_major__ > 7) || \
80
     (defined(__apple_build_version__) &&                              \
81
      __apple_build_version__ > 10010046)) ||                          \
82
    (defined(__GNUC__) && __GNUC__ > 10)
83
#define FJXL_ENABLE_AVX512 1
84
#endif
85
#endif  // !defined(FJXL_ENABLE_AVX512)
86
87
#endif  // ARCH
88
89
#ifndef FJXL_ENABLE_NEON
90
#define FJXL_ENABLE_NEON 0
91
#endif
92
93
#ifndef FJXL_ENABLE_AVX2
94
#define FJXL_ENABLE_AVX2 0
95
#endif
96
97
#ifndef FJXL_ENABLE_AVX512
98
#define FJXL_ENABLE_AVX512 0
99
#endif
100
101
namespace {
102
103
// CPU capabilities tracked by the runtime dispatcher. The numeric value of each
// enumerator is the index of its bit in the packed feature word (see
// CpuFeatureBit()).
enum class CpuFeature : uint32_t {
  kAVX2 = 0,

  kAVX512F,
  kAVX512VL,
  kAVX512CD,
  kAVX512BW,

  kVBMI,
  kVBMI2
};

// Returns the single-bit mask that represents `feature` in the packed feature
// word.
constexpr uint32_t CpuFeatureBit(CpuFeature feature) {
  return 1u << static_cast<uint32_t>(feature);
}
118
119
#if FJXL_ARCH_IS_X86
120
#if defined(_MSC_VER)
121
void Cpuid(const uint32_t level, const uint32_t count,
122
           std::array<uint32_t, 4>& abcd) {
123
  int regs[4];
124
  __cpuidex(regs, level, count);
125
  for (int i = 0; i < 4; ++i) {
126
    abcd[i] = regs[i];
127
  }
128
}
129
// Returns the low 32 bits of the value _xgetbv reports for register index 0.
uint32_t ReadXCR0() { return static_cast<uint32_t>(_xgetbv(0)); }
130
#else   // _MSC_VER
131
void Cpuid(const uint32_t level, const uint32_t count,
132
0
           std::array<uint32_t, 4>& abcd) {
133
0
  uint32_t a;
134
0
  uint32_t b;
135
0
  uint32_t c;
136
0
  uint32_t d;
137
0
  __cpuid_count(level, count, a, b, c, d);
138
0
  abcd[0] = a;
139
0
  abcd[1] = b;
140
0
  abcd[2] = c;
141
0
  abcd[3] = d;
142
0
}
143
0
// Returns the low 32 bits of extended control register 0.
uint32_t ReadXCR0() {
  uint32_t xcr0;
  // The instruction also writes the d register; it has to be listed as an
  // output even though the upper half is not used.
  uint32_t xcr0_high;
  const uint32_t index = 0;
  // 0F 01 D0 is the byte encoding of XGETBV, emitted raw so that assemblers
  // without the mnemonic still accept it. The register index goes in c.
  asm volatile(".byte 0x0F, 0x01, 0xD0"
               : "=a"(xcr0), "=d"(xcr0_high)
               : "c"(index));
  return xcr0;
}
152
#endif  // _MSC_VER
153
154
0
uint32_t DetectCpuFeatures() {
155
0
  uint32_t flags = 0;  // return value
156
0
  std::array<uint32_t, 4> abcd;
157
0
  Cpuid(0, 0, abcd);
158
0
  const uint32_t max_level = abcd[0];
159
160
0
  const auto check_bit = [](uint32_t v, uint32_t idx) -> bool {
161
0
    return (v & (1U << idx)) != 0;
162
0
  };
163
164
  // Extended features
165
0
  if (max_level >= 7) {
166
0
    Cpuid(7, 0, abcd);
167
0
    flags |= check_bit(abcd[1], 5) ? CpuFeatureBit(CpuFeature::kAVX2) : 0;
168
169
0
    flags |= check_bit(abcd[1], 16) ? CpuFeatureBit(CpuFeature::kAVX512F) : 0;
170
0
    flags |= check_bit(abcd[1], 28) ? CpuFeatureBit(CpuFeature::kAVX512CD) : 0;
171
0
    flags |= check_bit(abcd[1], 30) ? CpuFeatureBit(CpuFeature::kAVX512BW) : 0;
172
0
    flags |= check_bit(abcd[1], 31) ? CpuFeatureBit(CpuFeature::kAVX512VL) : 0;
173
174
0
    flags |= check_bit(abcd[2], 1) ? CpuFeatureBit(CpuFeature::kVBMI) : 0;
175
0
    flags |= check_bit(abcd[2], 6) ? CpuFeatureBit(CpuFeature::kVBMI2) : 0;
176
0
  }
177
178
0
  Cpuid(1, 0, abcd);
179
0
  const bool os_has_xsave = check_bit(abcd[2], 27);
180
0
  if (os_has_xsave) {
181
0
    const uint32_t xcr0 = ReadXCR0();
182
0
    if (!check_bit(xcr0, 1) || !check_bit(xcr0, 2)) {
183
0
      flags = 0;
184
0
    } else if (!check_bit(xcr0, 5) || !check_bit(xcr0, 6) ||
185
0
               !check_bit(xcr0, 7)) {
186
      // No AVX-512; disable everything but AVX2 if present
187
0
      flags &= CpuFeatureBit(CpuFeature::kAVX2);
188
0
    }
189
0
  }
190
191
0
  return flags;
192
0
}
193
#else   // FJXL_ARCH_IS_X86
194
// Non-x86 build: there is no CPUID to query, so no feature bit is reported.
uint32_t DetectCpuFeatures() { return 0; }
195
#endif  // FJXL_ARCH_IS_X86
196
197
#if defined(_MSC_VER)
198
#define FJXL_UNUSED
199
#else
200
#define FJXL_UNUSED __attribute__((unused))
201
#endif
202
203
0
// Returns whether DetectCpuFeatures() reported `feature`. Detection runs on the
// first call only; later calls reuse the stored result.
FJXL_UNUSED bool HasCpuFeature(CpuFeature feature) {
  static const uint32_t cpu_features = DetectCpuFeatures();
  return (cpu_features & CpuFeatureBit(feature)) != 0;
}
207
208
#if defined(_MSC_VER) && !defined(__clang__)
209
#define FJXL_INLINE __forceinline
210
// Returns floor(log2(v)). Returns 0 for v == 0, like the non-MSVC
// implementation of this function.
FJXL_INLINE uint32_t FloorLog2(uint32_t v) {
  unsigned long index;
  // _BitScanReverse returns 0 and leaves `index` undefined when the mask is 0.
  if (_BitScanReverse(&index, v) == 0) return 0;
  return static_cast<uint32_t>(index);
}
215
// Returns the number of trailing zero bits of `v`. `v` must be non-zero.
FJXL_INLINE uint32_t CtzNonZero(uint64_t v) {
  unsigned long index;
#if defined(_M_X64) || defined(_M_ARM64)
  // _BitScanForward only inspects 32 bits; the 64-bit variant is required to
  // see set bits in the upper half of `v`.
  _BitScanForward64(&index, v);
#else
  // No 64-bit scan on 32-bit targets: look at the low half first, then the
  // high half.
  const unsigned long low = static_cast<unsigned long>(v & 0xFFFFFFFFu);
  if (low != 0) {
    _BitScanForward(&index, low);
  } else {
    _BitScanForward(&index, static_cast<unsigned long>(v >> 32));
    index += 32;
  }
#endif
  return static_cast<uint32_t>(index);
}
220
#else
221
#define FJXL_INLINE inline __attribute__((always_inline))
// Returns floor(log2(v)); returns 0 for v == 0, where __builtin_clz would be
// undefined.
FJXL_INLINE uint32_t FloorLog2(uint32_t v) {
  return v ? 31 - __builtin_clz(v) : 0;
}
225
0
// Returns the number of trailing zero bits of `v`. `v` must be non-zero:
// __builtin_ctzll(0) is undefined.
FJXL_UNUSED FJXL_INLINE uint32_t CtzNonZero(uint64_t v) {
  return static_cast<uint32_t>(__builtin_ctzll(v));
}
228
#endif
229
230
// Compiles to a memcpy on little-endian systems.
231
0
// Stores `data` at `tgt` as 8 bytes, least-significant byte first. Always
// writes exactly 8 bytes.
FJXL_INLINE void StoreLE64(uint8_t* tgt, uint64_t data) {
#if (!defined(__BYTE_ORDER__) || (__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__))
  // Unknown or non-little-endian host: emit the bytes one at a time.
  for (int i = 0; i < 8; i++) {
    tgt[i] = static_cast<uint8_t>((data >> (i * 8)) & 0xFF);
  }
#else
  memcpy(tgt, &data, 8);
#endif
}
240
241
// Appends `count` bits taken from `bits` above the `bits_in_buffer` bits that
// are already pending in `bit_buffer`, stores the whole 8-byte buffer at
// `data_buf`, then drops the completed bytes from the buffer.
//
// Returns the number of completed bytes; the caller advances its write
// position by that amount. On return `bits_in_buffer` is below 8.
// `bits_in_buffer + count` has to stay below 64 or the shifts here overflow.
FJXL_INLINE size_t AddBits(uint32_t count, uint64_t bits, uint8_t* data_buf,
                           size_t& bits_in_buffer, uint64_t& bit_buffer) {
  bit_buffer |= bits << bits_in_buffer;
  bits_in_buffer += count;
  StoreLE64(data_buf, bit_buffer);
  const size_t bytes_in_buffer = bits_in_buffer / 8;
  bits_in_buffer -= bytes_in_buffer * 8;
  bit_buffer >>= bytes_in_buffer * 8;
  return bytes_in_buffer;
}
251
252
struct BitWriter {
253
0
  bool Allocate(size_t maximum_bit_size) {
254
0
    assert(data == nullptr);
255
    // Leave some padding.
256
0
    data.reset(static_cast<uint8_t*>(malloc(maximum_bit_size / 8 + 64)));
257
0
    return data != nullptr;
258
0
  }
259
260
0
  void Write(uint32_t count, uint64_t bits) {
261
0
    bytes_written += AddBits(count, bits, data.get() + bytes_written,
262
0
                             bits_in_buffer, buffer);
263
0
  }
264
265
0
  void ZeroPadToByte() {
266
0
    if (bits_in_buffer != 0) {
267
0
      Write(8 - bits_in_buffer, 0);
268
0
    }
269
0
  }
270
271
  FJXL_INLINE void WriteMultiple(const uint64_t* nbits, const uint64_t* bits,
272
0
                                 size_t n) {
273
    // Necessary because Write() is only guaranteed to work with <=56 bits.
274
    // Trying to SIMD-fy this code results in lower speed (and definitely less
275
    // clarity).
276
0
    {
277
0
      for (size_t i = 0; i < n; i++) {
278
0
        this->buffer |= bits[i] << this->bits_in_buffer;
279
0
        memcpy(this->data.get() + this->bytes_written, &this->buffer, 8);
280
0
        uint64_t shift = 64 - this->bits_in_buffer;
281
0
        this->bits_in_buffer += nbits[i];
282
        // This `if` seems to be faster than using ternaries.
283
0
        if (this->bits_in_buffer >= 64) {
284
0
          uint64_t next_buffer = shift >= 64 ? 0 : bits[i] >> shift;
285
0
          this->buffer = next_buffer;
286
0
          this->bits_in_buffer -= 64;
287
0
          this->bytes_written += 8;
288
0
        }
289
0
      }
290
0
      memcpy(this->data.get() + this->bytes_written, &this->buffer, 8);
291
0
      size_t bytes_in_buffer = this->bits_in_buffer / 8;
292
0
      this->bits_in_buffer -= bytes_in_buffer * 8;
293
0
      this->buffer >>= bytes_in_buffer * 8;
294
0
      this->bytes_written += bytes_in_buffer;
295
0
    }
296
0
  }
297
298
  std::unique_ptr<uint8_t[], void (*)(void*)> data = {nullptr, free};
299
  size_t bytes_written = 0;
300
  size_t bits_in_buffer = 0;
301
  uint64_t buffer = 0;
302
};
303
304
0
size_t SectionSize(const std::array<BitWriter, 4>& group_data) {
305
0
  size_t sz = 0;
306
0
  for (size_t j = 0; j < 4; j++) {
307
0
    const auto& writer = group_data[j];
308
0
    sz += writer.bytes_written * 8 + writer.bits_in_buffer;
309
0
  }
310
0
  sz = (sz + 7) / 8;
311
0
  return sz;
312
0
}
313
314
// Bytes reserved for the frame header when the AC group data offset is
// computed.
constexpr size_t kMaxFrameHeaderSize = 5;
315
316
// Smallest group size that falls into each of the four TOC buckets.
constexpr size_t kGroupSizeOffset[4] = {
    static_cast<size_t>(0),
    static_cast<size_t>(1024),
    static_cast<size_t>(17408),
    static_cast<size_t>(4211712),
};
// Number of TOC bits accounted for a group in each bucket.
constexpr size_t kTOCBits[4] = {12, 16, 24, 32};

// Returns the bucket of `group_size`: the largest index b such that
// kGroupSizeOffset[b] <= group_size.
size_t TOCBucket(size_t group_size) {
  size_t bucket = 0;
  while (bucket < 3 && group_size >= kGroupSizeOffset[bucket + 1]) ++bucket;
  return bucket;
}
329
330
#if !FJXL_STANDALONE
331
0
size_t TOCSize(const std::vector<size_t>& group_sizes) {
332
0
  size_t toc_bits = 0;
333
0
  for (size_t group_size : group_sizes) {
334
0
    toc_bits += kTOCBits[TOCBucket(group_size)];
335
0
  }
336
0
  return (toc_bits + 7) / 8;
337
0
}
338
339
0
// Returns the frame header size in bytes: 28 bits, plus 4 when an alpha
// channel is present, plus 2 when this is not the last frame, rounded up to a
// whole byte.
size_t FrameHeaderSize(bool have_alpha, bool is_last) {
  const size_t nbits = 28 + (have_alpha ? 4 : 0) + (is_last ? 0 : 2);
  return (nbits + 7) / 8;
}
343
#endif
344
345
void ComputeAcGroupDataOffset(size_t dc_global_size, size_t num_dc_groups,
346
                              size_t num_ac_groups, size_t& min_dc_global_size,
347
0
                              size_t& ac_group_offset) {
348
  // Max AC group size is 768 kB, so max AC group TOC bits is 24.
349
0
  size_t ac_toc_max_bits = num_ac_groups * 24;
350
0
  size_t ac_toc_min_bits = num_ac_groups * 12;
351
0
  size_t max_padding = 1 + (ac_toc_max_bits - ac_toc_min_bits + 7) / 8;
352
0
  min_dc_global_size = dc_global_size;
353
0
  size_t dc_global_bucket = TOCBucket(min_dc_global_size);
354
0
  while (TOCBucket(min_dc_global_size + max_padding) > dc_global_bucket) {
355
0
    dc_global_bucket = TOCBucket(min_dc_global_size + max_padding);
356
0
    min_dc_global_size = kGroupSizeOffset[dc_global_bucket];
357
0
  }
358
0
  assert(TOCBucket(min_dc_global_size) == dc_global_bucket);
359
0
  assert(TOCBucket(min_dc_global_size + max_padding) == dc_global_bucket);
360
0
  size_t max_toc_bits =
361
0
      kTOCBits[dc_global_bucket] + 12 * (1 + num_dc_groups) + ac_toc_max_bits;
362
0
  size_t max_toc_size = (max_toc_bits + 7) / 8;
363
0
  ac_group_offset = kMaxFrameHeaderSize + max_toc_size + min_dc_global_size;
364
0
}
365
366
#if !FJXL_STANDALONE
367
size_t ComputeDcGlobalPadding(const std::vector<size_t>& group_sizes,
368
                              size_t ac_group_data_offset,
369
                              size_t min_dc_global_size, bool have_alpha,
370
0
                              bool is_last) {
371
0
  std::vector<size_t> new_group_sizes = group_sizes;
372
0
  new_group_sizes[0] = min_dc_global_size;
373
0
  size_t toc_size = TOCSize(new_group_sizes);
374
0
  size_t actual_offset =
375
0
      FrameHeaderSize(have_alpha, is_last) + toc_size + group_sizes[0];
376
0
  return ac_group_data_offset - actual_offset;
377
0
}
378
#endif
379
380
// Number of raw symbols in the prefix code.
constexpr size_t kNumRawSymbols = 19;
// Number of LZ77 symbols in the prefix code.
constexpr size_t kNumLZ77 = 33;
// Number of counts for which a complete LZ77 code is precomputed.
constexpr size_t kLZ77CacheSize = 32;

// Symbol value of the first LZ77 symbol.
constexpr size_t kLZ77Offset = 224;
// NOTE(review): presumably the shortest run encoded through LZ77; the use site
// is not part of this excerpt, confirm there.
constexpr size_t kLZ77MinLength = 7;
386
387
void EncodeHybridUintLZ77(uint32_t value, uint32_t* token, uint32_t* nbits,
388
0
                          uint32_t* bits) {
389
  // 400 config
390
0
  uint32_t n = FloorLog2(value);
391
0
  *token = value < 16 ? value : 16 + n - 4;
392
0
  *nbits = value < 16 ? 0 : n;
393
0
  *bits = value < 16 ? 0 : value - (1 << *nbits);
394
0
}
395
396
struct PrefixCode {
397
  uint8_t raw_nbits[kNumRawSymbols] = {};
398
  uint8_t raw_bits[kNumRawSymbols] = {};
399
400
  uint8_t lz77_nbits[kNumLZ77] = {};
401
  uint16_t lz77_bits[kNumLZ77] = {};
402
403
  uint64_t lz77_cache_bits[kLZ77CacheSize] = {};
404
  uint8_t lz77_cache_nbits[kLZ77CacheSize] = {};
405
406
  size_t numraw;
407
408
0
  static uint16_t BitReverse(size_t nbits, uint16_t bits) {
409
0
    constexpr uint16_t kNibbleLookup[16] = {
410
0
        0b0000, 0b1000, 0b0100, 0b1100, 0b0010, 0b1010, 0b0110, 0b1110,
411
0
        0b0001, 0b1001, 0b0101, 0b1101, 0b0011, 0b1011, 0b0111, 0b1111,
412
0
    };
413
0
    uint16_t rev16 = (kNibbleLookup[bits & 0xF] << 12) |
414
0
                     (kNibbleLookup[(bits >> 4) & 0xF] << 8) |
415
0
                     (kNibbleLookup[(bits >> 8) & 0xF] << 4) |
416
0
                     (kNibbleLookup[bits >> 12]);
417
0
    return rev16 >> (16 - nbits);
418
0
  }
419
420
  // Create the prefix codes given the code lengths.
421
  // Supports the code lengths being split into two halves.
422
  static void ComputeCanonicalCode(const uint8_t* first_chunk_nbits,
423
                                   uint8_t* first_chunk_bits,
424
                                   size_t first_chunk_size,
425
                                   const uint8_t* second_chunk_nbits,
426
                                   uint16_t* second_chunk_bits,
427
0
                                   size_t second_chunk_size) {
428
0
    constexpr size_t kMaxCodeLength = 15;
429
0
    uint8_t code_length_counts[kMaxCodeLength + 1] = {};
430
0
    for (size_t i = 0; i < first_chunk_size; i++) {
431
0
      code_length_counts[first_chunk_nbits[i]]++;
432
0
      assert(first_chunk_nbits[i] <= kMaxCodeLength);
433
0
      assert(first_chunk_nbits[i] <= 8);
434
0
      assert(first_chunk_nbits[i] > 0);
435
0
    }
436
0
    for (size_t i = 0; i < second_chunk_size; i++) {
437
0
      code_length_counts[second_chunk_nbits[i]]++;
438
0
      assert(second_chunk_nbits[i] <= kMaxCodeLength);
439
0
    }
440
441
0
    uint16_t next_code[kMaxCodeLength + 1] = {};
442
443
0
    uint16_t code = 0;
444
0
    for (size_t i = 1; i < kMaxCodeLength + 1; i++) {
445
0
      code = (code + code_length_counts[i - 1]) << 1;
446
0
      next_code[i] = code;
447
0
    }
448
449
0
    for (size_t i = 0; i < first_chunk_size; i++) {
450
0
      first_chunk_bits[i] =
451
0
          BitReverse(first_chunk_nbits[i], next_code[first_chunk_nbits[i]]++);
452
0
    }
453
0
    for (size_t i = 0; i < second_chunk_size; i++) {
454
0
      second_chunk_bits[i] =
455
0
          BitReverse(second_chunk_nbits[i], next_code[second_chunk_nbits[i]]++);
456
0
    }
457
0
  }
458
459
  template <typename T>
460
  static void ComputeCodeLengthsNonZeroImpl(const uint64_t* freqs, size_t n,
461
                                            size_t precision, T infty,
462
                                            const uint8_t* min_limit,
463
                                            const uint8_t* max_limit,
464
0
                                            uint8_t* nbits) {
465
0
    assert(precision < 15);
466
0
    assert(n <= kMaxNumSymbols);
467
0
    std::vector<T> dynp(((1U << precision) + 1) * (n + 1), infty);
468
0
    auto d = [&](size_t sym, size_t off) -> T& {
469
0
      return dynp[sym * ((1 << precision) + 1) + off];
470
0
    };
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::ComputeCodeLengthsNonZeroImpl<unsigned int>(unsigned long const*, unsigned long, unsigned long, unsigned int, unsigned char const*, unsigned char const*, unsigned char*)::{lambda(unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::ComputeCodeLengthsNonZeroImpl<unsigned long>(unsigned long const*, unsigned long, unsigned long, unsigned long, unsigned char const*, unsigned char const*, unsigned char*)::{lambda(unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long) const
471
0
    d(0, 0) = 0;
472
0
    for (size_t sym = 0; sym < n; sym++) {
473
0
      for (T bits = min_limit[sym]; bits <= max_limit[sym]; bits++) {
474
0
        size_t off_delta = 1U << (precision - bits);
475
0
        for (size_t off = 0; off + off_delta <= (1U << precision); off++) {
476
0
          d(sym + 1, off + off_delta) =
477
0
              std::min(d(sym, off) + static_cast<T>(freqs[sym]) * bits,
478
0
                       d(sym + 1, off + off_delta));
479
0
        }
480
0
      }
481
0
    }
482
483
0
    size_t sym = n;
484
0
    size_t off = 1U << precision;
485
486
0
    assert(d(sym, off) != infty);
487
488
0
    while (sym-- > 0) {
489
0
      assert(off > 0);
490
0
      for (size_t bits = min_limit[sym]; bits <= max_limit[sym]; bits++) {
491
0
        size_t off_delta = 1U << (precision - bits);
492
0
        if (off_delta <= off &&
493
0
            d(sym + 1, off) == d(sym, off - off_delta) + freqs[sym] * bits) {
494
0
          off -= off_delta;
495
0
          nbits[sym] = bits;
496
0
          break;
497
0
        }
498
0
      }
499
0
    }
500
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:void (anonymous namespace)::PrefixCode::ComputeCodeLengthsNonZeroImpl<unsigned int>(unsigned long const*, unsigned long, unsigned long, unsigned int, unsigned char const*, unsigned char const*, unsigned char*)
Unexecuted instantiation: enc_fast_lossless.cc:void (anonymous namespace)::PrefixCode::ComputeCodeLengthsNonZeroImpl<unsigned long>(unsigned long const*, unsigned long, unsigned long, unsigned long, unsigned char const*, unsigned char const*, unsigned char*)
501
502
  // Computes nbits[i] for i <= n, subject to min_limit[i] <= nbits[i] <=
503
  // max_limit[i] and sum 2**-nbits[i] == 1, so to minimize sum(nbits[i] *
504
  // freqs[i]).
505
  static void ComputeCodeLengthsNonZero(const uint64_t* freqs, size_t n,
506
                                        uint8_t* min_limit, uint8_t* max_limit,
507
0
                                        uint8_t* nbits) {
508
0
    size_t precision = 0;
509
0
    size_t shortest_length = 255;
510
0
    uint64_t freqsum = 0;
511
0
    for (size_t i = 0; i < n; i++) {
512
0
      assert(freqs[i] != 0);
513
0
      freqsum += freqs[i];
514
0
      if (min_limit[i] < 1) min_limit[i] = 1;
515
0
      assert(min_limit[i] <= max_limit[i]);
516
0
      precision = std::max<size_t>(max_limit[i], precision);
517
0
      shortest_length = std::min<size_t>(min_limit[i], shortest_length);
518
0
    }
519
    // If all the minimum limits are greater than 1, shift precision so that we
520
    // behave as if the shortest was 1.
521
0
    precision -= shortest_length - 1;
522
0
    uint64_t infty = freqsum * precision;
523
0
    if (infty < std::numeric_limits<uint32_t>::max() / 2) {
524
0
      ComputeCodeLengthsNonZeroImpl(freqs, n, precision,
525
0
                                    static_cast<uint32_t>(infty), min_limit,
526
0
                                    max_limit, nbits);
527
0
    } else {
528
0
      ComputeCodeLengthsNonZeroImpl(freqs, n, precision, infty, min_limit,
529
0
                                    max_limit, nbits);
530
0
    }
531
0
  }
532
533
  static constexpr size_t kMaxNumSymbols =
534
      kNumRawSymbols + 1 < kNumLZ77 ? kNumLZ77 : kNumRawSymbols + 1;
535
  static void ComputeCodeLengths(const uint64_t* freqs, size_t n,
536
                                 const uint8_t* min_limit_in,
537
0
                                 const uint8_t* max_limit_in, uint8_t* nbits) {
538
0
    assert(n <= kMaxNumSymbols);
539
0
    uint64_t compact_freqs[kMaxNumSymbols];
540
0
    uint8_t min_limit[kMaxNumSymbols];
541
0
    uint8_t max_limit[kMaxNumSymbols];
542
0
    size_t ni = 0;
543
0
    for (size_t i = 0; i < n; i++) {
544
0
      if (freqs[i]) {
545
0
        compact_freqs[ni] = freqs[i];
546
0
        min_limit[ni] = min_limit_in[i];
547
0
        max_limit[ni] = max_limit_in[i];
548
0
        ni++;
549
0
      }
550
0
    }
551
0
    for (size_t i = ni; i < kMaxNumSymbols; ++i) {
552
0
      compact_freqs[i] = 0;
553
0
      min_limit[i] = 0;
554
0
      max_limit[i] = 0;
555
0
    }
556
0
    uint8_t num_bits[kMaxNumSymbols] = {};
557
0
    ComputeCodeLengthsNonZero(compact_freqs, ni, min_limit, max_limit,
558
0
                              num_bits);
559
0
    ni = 0;
560
0
    for (size_t i = 0; i < n; i++) {
561
0
      nbits[i] = 0;
562
0
      if (freqs[i]) {
563
0
        nbits[i] = num_bits[ni++];
564
0
      }
565
0
    }
566
0
  }
567
568
  // Invalid code, used to construct arrays.
569
0
  PrefixCode() = default;
570
571
  template <typename BitDepth>
572
  PrefixCode(BitDepth /* bitdepth */, uint64_t* raw_counts,
573
0
             uint64_t* lz77_counts) {
574
    // "merge" together all the lz77 counts in a single symbol for the level 1
575
    // table (containing just the raw symbols, up to length 7).
576
0
    uint64_t level1_counts[kNumRawSymbols + 1];
577
0
    memcpy(level1_counts, raw_counts, kNumRawSymbols * sizeof(uint64_t));
578
0
    numraw = kNumRawSymbols;
579
0
    while (numraw > 0 && level1_counts[numraw - 1] == 0) numraw--;
580
581
0
    level1_counts[numraw] = 0;
582
0
    for (size_t i = 0; i < kNumLZ77; i++) {
583
0
      level1_counts[numraw] += lz77_counts[i];
584
0
    }
585
0
    uint8_t level1_nbits[kNumRawSymbols + 1] = {};
586
0
    ComputeCodeLengths(level1_counts, numraw + 1, BitDepth::kMinRawLength,
587
0
                       BitDepth::kMaxRawLength, level1_nbits);
588
589
0
    uint8_t level2_nbits[kNumLZ77] = {};
590
0
    uint8_t min_lengths[kNumLZ77] = {};
591
0
    uint8_t l = 15 - level1_nbits[numraw];
592
0
    uint8_t max_lengths[kNumLZ77];
593
0
    for (uint8_t& max_length : max_lengths) {
594
0
      max_length = l;
595
0
    }
596
0
    size_t num_lz77 = kNumLZ77;
597
0
    while (num_lz77 > 0 && lz77_counts[num_lz77 - 1] == 0) num_lz77--;
598
0
    ComputeCodeLengths(lz77_counts, num_lz77, min_lengths, max_lengths,
599
0
                       level2_nbits);
600
0
    for (size_t i = 0; i < numraw; i++) {
601
0
      raw_nbits[i] = level1_nbits[i];
602
0
    }
603
0
    for (size_t i = 0; i < num_lz77; i++) {
604
0
      lz77_nbits[i] =
605
0
          level2_nbits[i] ? level1_nbits[numraw] + level2_nbits[i] : 0;
606
0
    }
607
608
0
    ComputeCanonicalCode(raw_nbits, raw_bits, numraw, lz77_nbits, lz77_bits,
609
0
                         kNumLZ77);
610
611
    // Prepare lz77 cache
612
0
    for (size_t count = 0; count < kLZ77CacheSize; count++) {
613
0
      unsigned token, nbits, bits;
614
0
      EncodeHybridUintLZ77(count, &token, &nbits, &bits);
615
0
      lz77_cache_nbits[count] = lz77_nbits[token] + nbits + raw_nbits[0];
616
0
      lz77_cache_bits[count] =
617
0
          (((bits << lz77_nbits[token]) | lz77_bits[token]) << raw_nbits[0]) |
618
0
          raw_bits[0];
619
0
    }
620
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<AVX512::(anonymous namespace)::UpTo8Bits>(AVX512::(anonymous namespace)::UpTo8Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<AVX512::(anonymous namespace)::From9To13Bits>(AVX512::(anonymous namespace)::From9To13Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<AVX512::(anonymous namespace)::Exactly14Bits>(AVX512::(anonymous namespace)::Exactly14Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<AVX512::(anonymous namespace)::MoreThan14Bits>(AVX512::(anonymous namespace)::MoreThan14Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<AVX2::(anonymous namespace)::UpTo8Bits>(AVX2::(anonymous namespace)::UpTo8Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<AVX2::(anonymous namespace)::From9To13Bits>(AVX2::(anonymous namespace)::From9To13Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<AVX2::(anonymous namespace)::Exactly14Bits>(AVX2::(anonymous namespace)::Exactly14Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<AVX2::(anonymous namespace)::MoreThan14Bits>(AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<default_implementation::(anonymous namespace)::UpTo8Bits>(default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<default_implementation::(anonymous namespace)::From9To13Bits>(default_implementation::(anonymous namespace)::From9To13Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<default_implementation::(anonymous namespace)::Exactly14Bits>(default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<default_implementation::(anonymous namespace)::MoreThan14Bits>(default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long*, unsigned long*)
621
622
  // Max bits written: 2 + 72 + 95 + 24 + 165 = 286
623
0
  void WriteTo(BitWriter* writer) const {
624
0
    uint64_t code_length_counts[18] = {};
625
0
    code_length_counts[17] = 3 + 2 * (kNumLZ77 - 1);
626
0
    for (uint8_t raw_nbit : raw_nbits) {
627
0
      code_length_counts[raw_nbit]++;
628
0
    }
629
0
    for (uint8_t lz77_nbit : lz77_nbits) {
630
0
      code_length_counts[lz77_nbit]++;
631
0
    }
632
0
    uint8_t code_length_nbits[18] = {};
633
0
    uint8_t code_length_nbits_min[18] = {};
634
0
    uint8_t code_length_nbits_max[18] = {
635
0
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
636
0
    };
637
0
    ComputeCodeLengths(code_length_counts, 18, code_length_nbits_min,
638
0
                       code_length_nbits_max, code_length_nbits);
639
0
    writer->Write(2, 0b00);  // HSKIP = 0, i.e. don't skip code lengths.
640
641
    // As per Brotli RFC.
642
0
    uint8_t code_length_order[18] = {1, 2, 3, 4,  0,  5,  17, 6,  16,
643
0
                                     7, 8, 9, 10, 11, 12, 13, 14, 15};
644
0
    uint8_t code_length_length_nbits[] = {2, 4, 3, 2, 2, 4};
645
0
    uint8_t code_length_length_bits[] = {0, 7, 3, 2, 1, 15};
646
647
    // Encode lengths of code lengths.
648
0
    size_t num_code_lengths = 18;
649
0
    while (code_length_nbits[code_length_order[num_code_lengths - 1]] == 0) {
650
0
      num_code_lengths--;
651
0
    }
652
    // Max bits written in this loop: 18 * 4 = 72
653
0
    for (size_t i = 0; i < num_code_lengths; i++) {
654
0
      int symbol = code_length_nbits[code_length_order[i]];
655
0
      writer->Write(code_length_length_nbits[symbol],
656
0
                    code_length_length_bits[symbol]);
657
0
    }
658
659
    // Compute the canonical codes for the codes that represent the lengths of
660
    // the actual codes for data.
661
0
    uint16_t code_length_bits[18] = {};
662
0
    ComputeCanonicalCode(nullptr, nullptr, 0, code_length_nbits,
663
0
                         code_length_bits, 18);
664
    // Encode raw bit code lengths.
665
    // Max bits written in this loop: 19 * 5 = 95
666
0
    for (uint8_t raw_nbit : raw_nbits) {
667
0
      writer->Write(code_length_nbits[raw_nbit], code_length_bits[raw_nbit]);
668
0
    }
669
0
    size_t num_lz77 = kNumLZ77;
670
0
    while (lz77_nbits[num_lz77 - 1] == 0) {
671
0
      num_lz77--;
672
0
    }
673
    // Encode 0s until 224 (start of LZ77 symbols). This is in total 224-19 =
674
    // 205.
675
0
    static_assert(kLZ77Offset == 224, "kLZ77Offset should be 224");
676
0
    static_assert(kNumRawSymbols == 19, "kNumRawSymbols should be 19");
677
0
    {
678
      // Max bits in this block: 24
679
0
      writer->Write(code_length_nbits[17], code_length_bits[17]);
680
0
      writer->Write(3, 0b010);  // 5
681
0
      writer->Write(code_length_nbits[17], code_length_bits[17]);
682
0
      writer->Write(3, 0b000);  // (5-2)*8 + 3 = 27
683
0
      writer->Write(code_length_nbits[17], code_length_bits[17]);
684
0
      writer->Write(3, 0b010);  // (27-2)*8 + 5 = 205
685
0
    }
686
    // Encode LZ77 symbols, with values 224+i.
687
    // Max bits written in this loop: 33 * 5 = 165
688
0
    for (size_t i = 0; i < num_lz77; i++) {
689
0
      writer->Write(code_length_nbits[lz77_nbits[i]],
690
0
                    code_length_bits[lz77_nbits[i]]);
691
0
    }
692
0
  }
693
};
694
695
}  // namespace
696
697
extern "C" {
698
699
struct JxlFastLosslessFrameState {
700
  JxlChunkedFrameInputSource input;
701
  size_t width;
702
  size_t height;
703
  size_t num_groups_x;
704
  size_t num_groups_y;
705
  size_t num_dc_groups_x;
706
  size_t num_dc_groups_y;
707
  size_t nb_chans;
708
  size_t bitdepth;
709
  int big_endian;
710
  int effort;
711
  bool collided;
712
  PrefixCode hcode[4];
713
  std::vector<int16_t> lookup;
714
  BitWriter header;
715
  std::vector<std::array<BitWriter, 4>> group_data;
716
  std::vector<size_t> group_sizes;
717
  size_t ac_group_data_offset = 0;
718
  size_t min_dc_global_size = 0;
719
  size_t current_bit_writer = 0;
720
  size_t bit_writer_byte_pos = 0;
721
  size_t bits_in_buffer = 0;
722
  uint64_t bit_buffer = 0;
723
  bool process_done = false;
724
};
725
726
0
size_t JxlFastLosslessOutputSize(const JxlFastLosslessFrameState* frame) {
727
0
  size_t total_size_groups = 0;
728
0
  for (const auto& section : frame->group_data) {
729
0
    total_size_groups += SectionSize(section);
730
0
  }
731
0
  return frame->header.bytes_written + total_size_groups;
732
0
}
733
734
size_t JxlFastLosslessMaxRequiredOutput(
735
0
    const JxlFastLosslessFrameState* frame) {
736
0
  return JxlFastLosslessOutputSize(frame) + 32;
737
0
}
738
739
// Writes the hand-crafted frame header and the TOC (one entry per element of
// frame->group_sizes) into frame->header.
//
// frame:            state whose `header` writer is (re)allocated and filled.
// add_image_header: when non-zero, the codestream signature, size header and
//                   image metadata are emitted first. Only supported in
//                   FJXL_STANDALONE builds; other builds assert it is zero.
// is_last:          any non-zero value marks the frame as the last one.
//
// Returns false if the header buffer could not be allocated, true otherwise.
bool JxlFastLosslessPrepareHeader(JxlFastLosslessFrameState* frame,
                                  int add_image_header, int is_last) {
  BitWriter* output = &frame->header;
  if (!output->Allocate(1000 + frame->group_sizes.size() * 32)) return false;

  bool have_alpha = (frame->nb_chans == 2 || frame->nb_chans == 4);

#if FJXL_STANDALONE
  if (add_image_header) {
    // Signature
    output->Write(16, 0x0AFF);

    // Size header, hand-crafted.
    // Not small
    output->Write(1, 0);

    // Writes `size - 1` using the shortest of the four available widths.
    auto wsz = [output](size_t size) {
      if (size - 1 < (1 << 9)) {
        output->Write(2, 0b00);
        output->Write(9, size - 1);
      } else if (size - 1 < (1 << 13)) {
        output->Write(2, 0b01);
        output->Write(13, size - 1);
      } else if (size - 1 < (1 << 18)) {
        output->Write(2, 0b10);
        output->Write(18, size - 1);
      } else {
        output->Write(2, 0b11);
        output->Write(30, size - 1);
      }
    };

    wsz(frame->height);

    // No special ratio.
    output->Write(3, 0);

    wsz(frame->width);

    // Hand-crafted ImageMetadata.
    output->Write(1, 0);  // all_default
    output->Write(1, 0);  // extra_fields
    output->Write(1, 0);  // bit_depth.floating_point_sample
    if (frame->bitdepth == 8) {
      output->Write(2, 0b00);  // bit_depth.bits_per_sample = 8
    } else if (frame->bitdepth == 10) {
      output->Write(2, 0b01);  // bit_depth.bits_per_sample = 10
    } else if (frame->bitdepth == 12) {
      output->Write(2, 0b10);  // bit_depth.bits_per_sample = 12
    } else {
      output->Write(2, 0b11);  // 1 + u(6)
      output->Write(6, frame->bitdepth - 1);
    }
    if (frame->bitdepth <= 14) {
      output->Write(1, 1);  // 16-bit-buffer sufficient
    } else {
      output->Write(1, 0);  // 16-bit-buffer NOT sufficient
    }
    if (have_alpha) {
      output->Write(2, 0b01);  // One extra channel
      if (frame->bitdepth == 8) {
        output->Write(1, 1); // ... all_default (ie. 8-bit alpha)
      } else {
        output->Write(1, 0); // not d_alpha
        output->Write(2, 0); // type = kAlpha
        output->Write(1, 0); // not float
        if (frame->bitdepth == 10) {
          output->Write(2, 0b01); // bit_depth.bits_per_sample = 10
        } else if (frame->bitdepth == 12) {
          output->Write(2, 0b10); // bit_depth.bits_per_sample = 12
        } else {
          output->Write(2, 0b11); // 1 + u(6)
          output->Write(6, frame->bitdepth - 1);
        }
        output->Write(2, 0); // dim_shift = 0
        output->Write(2, 0); // name_len = 0
        output->Write(1, 0); // alpha_associated = 0
      }
    } else {
      output->Write(2, 0b00);  // No extra channel
    }
    output->Write(1, 0);  // Not XYB
    if (frame->nb_chans > 2) {
      output->Write(1, 1);  // color_encoding.all_default (sRGB)
    } else {
      output->Write(1, 0);     // color_encoding.all_default false
      output->Write(1, 0);     // color_encoding.want_icc false
      output->Write(2, 1);     // grayscale
      output->Write(2, 1);     // D65
      output->Write(1, 0);     // no gamma transfer function
      output->Write(2, 0b10);  // tf: 2 + u(4)
      output->Write(4, 11);    // tf of sRGB
      output->Write(2, 1);     // relative rendering intent
    }
    output->Write(2, 0b00);  // No extensions.

    output->Write(1, 1);  // all_default transform data

    // No ICC, no preview. Frame should start at byte boundary.
    output->ZeroPadToByte();
  }
#else
  assert(!add_image_header);
#endif
  // Handcrafted frame header.
  output->Write(1, 0);     // all_default
  output->Write(2, 0b00);  // regular frame
  output->Write(1, 1);     // modular
  output->Write(2, 0b00);  // default flags
  output->Write(1, 0);     // not YCbCr
  output->Write(2, 0b00);  // no upsampling
  if (have_alpha) {
    output->Write(2, 0b00);  // no alpha upsampling
  }
  output->Write(2, 0b01);  // default group size
  output->Write(2, 0b00);  // exactly one pass
  output->Write(1, 0);     // no custom size or origin
  output->Write(2, 0b00);  // kReplace blending mode
  if (have_alpha) {
    output->Write(2, 0b00);  // kReplace blending mode for alpha channel
  }
  // `is_last` arrives as a C-style boolean int. Normalize it so that exactly
  // one bit is emitted and a truthy value other than 1 cannot leak into the
  // neighbouring header bits.
  output->Write(1, is_last ? 1 : 0);  // is_last
  if (!is_last) {
    output->Write(2, 0b00);  // can not be saved as reference
  }
  output->Write(2, 0b00);  // a frame has no name
  output->Write(1, 0);     // loop filter is not all_default
  output->Write(1, 0);     // no gaborish
  output->Write(2, 0);     // 0 EPF iters
  output->Write(2, 0b00);  // No LF extensions
  output->Write(2, 0b00);  // No FH extensions

  output->Write(1, 0);      // No TOC permutation
  output->ZeroPadToByte();  // TOC is byte-aligned.
  assert(add_image_header || output->bytes_written <= kMaxFrameHeaderSize);
  // One TOC entry per section: a 2-bit bucket selector followed by the size
  // relative to that bucket's offset.
  for (size_t group_size : frame->group_sizes) {
    size_t bucket = TOCBucket(group_size);
    output->Write(2, bucket);
    output->Write(kTOCBits[bucket] - 2, group_size - kGroupSizeOffset[bucket]);
  }
  output->ZeroPadToByte();  // Groups are byte-aligned.
  return true;
}
882
883
#if !FJXL_STANDALONE
884
bool JxlFastLosslessOutputAlignedSection(
885
0
    const BitWriter& bw, JxlEncoderOutputProcessorWrapper* output_processor) {
886
0
  assert(bw.bits_in_buffer == 0);
887
0
  const uint8_t* data = bw.data.get();
888
0
  size_t remaining_len = bw.bytes_written;
889
0
  while (remaining_len > 0) {
890
0
    JXL_ASSIGN_OR_RETURN(auto buffer,
891
0
                         output_processor->GetBuffer(1, remaining_len));
892
0
    size_t n = std::min(buffer.size(), remaining_len);
893
0
    if (n == 0) break;
894
0
    memcpy(buffer.data(), data, n);
895
0
    JXL_RETURN_IF_ERROR(buffer.advance(n));
896
0
    data += n;
897
0
    remaining_len -= n;
898
0
  };
899
0
  return true;
900
0
}
901
902
bool JxlFastLosslessOutputHeaders(
903
    JxlFastLosslessFrameState* frame_state,
904
0
    JxlEncoderOutputProcessorWrapper* output_processor) {
905
0
  JXL_RETURN_IF_ERROR(JxlFastLosslessOutputAlignedSection(frame_state->header,
906
0
                                                          output_processor));
907
0
  JXL_RETURN_IF_ERROR(JxlFastLosslessOutputAlignedSection(
908
0
      frame_state->group_data[0][0], output_processor));
909
0
  return true;
910
0
}
911
#endif
912
913
#if FJXL_ENABLE_AVX512
914
__attribute__((target("avx512vbmi2"))) static size_t AppendBytesWithBitOffset(
915
    const uint8_t* data, size_t n, size_t bit_buffer_nbits,
916
0
    unsigned char* output, uint64_t& bit_buffer) {
917
0
  if (n < 128) {
918
0
    return 0;
919
0
  }
920
921
0
  size_t i = 0;
922
0
  __m512i shift = _mm512_set1_epi64(64 - bit_buffer_nbits);
923
0
  __m512i carry = _mm512_set1_epi64(bit_buffer << (64 - bit_buffer_nbits));
924
925
0
  for (; i + 64 <= n; i += 64) {
926
0
    __m512i current = _mm512_loadu_si512(data + i);
927
0
    __m512i previous_u64 = _mm512_alignr_epi64(current, carry, 7);
928
0
    carry = current;
929
0
    __m512i out = _mm512_shrdv_epi64(previous_u64, current, shift);
930
0
    _mm512_storeu_si512(output + i, out);
931
0
  }
932
933
0
  bit_buffer = data[i - 1] >> (8 - bit_buffer_nbits);
934
935
0
  return i;
936
0
}
937
#endif
938
939
size_t JxlFastLosslessWriteOutput(JxlFastLosslessFrameState* frame,
940
0
                                  unsigned char* output, size_t output_size) {
941
0
  assert(output_size >= 32);
942
0
  unsigned char* initial_output = output;
943
0
  size_t (*append_bytes_with_bit_offset)(const uint8_t*, size_t, size_t,
944
0
                                         unsigned char*, uint64_t&) = nullptr;
945
946
0
#if FJXL_ENABLE_AVX512
947
0
  if (HasCpuFeature(CpuFeature::kVBMI2)) {
948
0
    append_bytes_with_bit_offset = AppendBytesWithBitOffset;
949
0
  }
950
0
#endif
951
952
0
  while (true) {
953
0
    size_t& cur = frame->current_bit_writer;
954
0
    size_t& bw_pos = frame->bit_writer_byte_pos;
955
0
    if (cur >= 1 + frame->group_data.size() * frame->nb_chans) {
956
0
      return output - initial_output;
957
0
    }
958
0
    if (output_size <= 9) {
959
0
      return output - initial_output;
960
0
    }
961
0
    size_t nbc = frame->nb_chans;
962
0
    const BitWriter& writer =
963
0
        cur == 0 ? frame->header
964
0
                 : frame->group_data[(cur - 1) / nbc][(cur - 1) % nbc];
965
0
    size_t full_byte_count =
966
0
        std::min(output_size - 9, writer.bytes_written - bw_pos);
967
0
    if (frame->bits_in_buffer == 0) {
968
0
      memcpy(output, writer.data.get() + bw_pos, full_byte_count);
969
0
    } else {
970
0
      size_t i = 0;
971
0
      if (append_bytes_with_bit_offset) {
972
0
        i += append_bytes_with_bit_offset(
973
0
            writer.data.get() + bw_pos, full_byte_count, frame->bits_in_buffer,
974
0
            output, frame->bit_buffer);
975
0
      }
976
0
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
977
      // Copy 8 bytes at a time until we reach the border.
978
0
      for (; i + 8 < full_byte_count; i += 8) {
979
0
        uint64_t chunk;
980
0
        memcpy(&chunk, writer.data.get() + bw_pos + i, 8);
981
0
        uint64_t out = frame->bit_buffer | (chunk << frame->bits_in_buffer);
982
0
        memcpy(output + i, &out, 8);
983
0
        frame->bit_buffer = chunk >> (64 - frame->bits_in_buffer);
984
0
      }
985
0
#endif
986
0
      for (; i < full_byte_count; i++) {
987
0
        AddBits(8, writer.data.get()[bw_pos + i], output + i,
988
0
                frame->bits_in_buffer, frame->bit_buffer);
989
0
      }
990
0
    }
991
0
    output += full_byte_count;
992
0
    output_size -= full_byte_count;
993
0
    bw_pos += full_byte_count;
994
0
    if (bw_pos == writer.bytes_written) {
995
0
      auto write = [&](size_t num, uint64_t bits) {
996
0
        size_t n = AddBits(num, bits, output, frame->bits_in_buffer,
997
0
                           frame->bit_buffer);
998
0
        output += n;
999
0
        output_size -= n;
1000
0
      };
1001
0
      if (writer.bits_in_buffer) {
1002
0
        write(writer.bits_in_buffer, writer.buffer);
1003
0
      }
1004
0
      bw_pos = 0;
1005
0
      cur++;
1006
0
      if ((cur - 1) % nbc == 0 && frame->bits_in_buffer != 0) {
1007
0
        write(8 - frame->bits_in_buffer, 0);
1008
0
      }
1009
0
    }
1010
0
  }
1011
0
}
1012
1013
0
// Destroys a frame state previously allocated with `new`. Passing nullptr is
// allowed and does nothing.
void JxlFastLosslessFreeFrameState(JxlFastLosslessFrameState* frame) {
  if (frame == nullptr) return;
  delete frame;
}
1016
1017
}  // extern "C"
1018
1019
#endif
1020
1021
#ifdef FJXL_SELF_INCLUDE
1022
1023
namespace {
1024
1025
// Plain aggregate holding two values of the same type: `low` first, then `hi`.
// Used by the SIMD wrappers to return the two halves of a widening or
// interleaving operation.
template <class T>
struct VecPair {
  T low;  // first half
  T hi;   // second half
};
1030
1031
#ifdef FJXL_GENERIC_SIMD
1032
#undef FJXL_GENERIC_SIMD
1033
#endif
1034
1035
#ifdef FJXL_AVX512
1036
#define FJXL_GENERIC_SIMD
1037
struct SIMDVec32;
1038
struct Mask32 {
1039
  __mmask16 mask;
1040
  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
1041
0
  size_t CountPrefix() const {
1042
0
    return CtzNonZero(~uint64_t{_cvtmask16_u32(mask)});
1043
0
  }
1044
};
1045
1046
struct SIMDVec32 {
1047
  __m512i vec;
1048
1049
  static constexpr size_t kLanes = 16;
1050
1051
0
  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
1052
0
    return SIMDVec32{_mm512_loadu_si512((__m512i*)data)};
1053
0
  }
1054
0
  FJXL_INLINE void Store(uint32_t* data) {
1055
0
    _mm512_storeu_si512((__m512i*)data, vec);
1056
0
  }
1057
0
  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
1058
0
    return SIMDVec32{_mm512_set1_epi32(v)};
1059
0
  }
1060
0
  FJXL_INLINE SIMDVec32 ValToToken() const {
1061
0
    return SIMDVec32{
1062
0
        _mm512_sub_epi32(_mm512_set1_epi32(32), _mm512_lzcnt_epi32(vec))};
1063
0
  }
1064
0
  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
1065
0
    return SIMDVec32{_mm512_sub_epi32(_mm512_max_epu32(vec, to_subtract.vec),
1066
0
                                      to_subtract.vec)};
1067
0
  }
1068
0
  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
1069
0
    return SIMDVec32{_mm512_sub_epi32(vec, to_subtract.vec)};
1070
0
  }
1071
0
  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
1072
0
    return SIMDVec32{_mm512_add_epi32(vec, oth.vec)};
1073
0
  }
1074
0
  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
1075
0
    return SIMDVec32{_mm512_xor_epi32(vec, oth.vec)};
1076
0
  }
1077
0
  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
1078
0
    return Mask32{_mm512_cmpeq_epi32_mask(vec, oth.vec)};
1079
0
  }
1080
0
  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
1081
0
    return Mask32{_mm512_cmpgt_epi32_mask(vec, oth.vec)};
1082
0
  }
1083
0
  FJXL_INLINE SIMDVec32 Pow2() const {
1084
0
    return SIMDVec32{_mm512_sllv_epi32(_mm512_set1_epi32(1), vec)};
1085
0
  }
1086
  template <size_t i>
1087
0
  FJXL_INLINE SIMDVec32 SignedShiftRight() const {
1088
0
    return SIMDVec32{_mm512_srai_epi32(vec, i)};
1089
0
  }
1090
};
1091
1092
struct SIMDVec16;
1093
1094
struct Mask16 {
1095
  __mmask32 mask;
1096
  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
1097
0
  Mask16 And(const Mask16& oth) const {
1098
0
    return Mask16{_kand_mask32(mask, oth.mask)};
1099
0
  }
1100
0
  size_t CountPrefix() const {
1101
0
    return CtzNonZero(~uint64_t{_cvtmask32_u32(mask)});
1102
0
  }
1103
};
1104
1105
struct SIMDVec16 {
1106
  __m512i vec;
1107
1108
  static constexpr size_t kLanes = 32;
1109
1110
0
  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
1111
0
    return SIMDVec16{_mm512_loadu_si512((__m512i*)data)};
1112
0
  }
1113
0
  FJXL_INLINE void Store(uint16_t* data) {
1114
0
    _mm512_storeu_si512((__m512i*)data, vec);
1115
0
  }
1116
0
  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
1117
0
    return SIMDVec16{_mm512_set1_epi16(v)};
1118
0
  }
1119
  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
1120
0
                                         const SIMDVec32& hi) {
1121
0
    auto tmp = _mm512_packus_epi32(lo.vec, hi.vec);
1122
0
    alignas(64) uint64_t perm[8] = {0, 2, 4, 6, 1, 3, 5, 7};
1123
0
    return SIMDVec16{
1124
0
        _mm512_permutex2var_epi64(tmp, _mm512_load_si512((__m512i*)perm), tmp)};
1125
0
  }
1126
1127
0
  FJXL_INLINE SIMDVec16 ValToToken() const {
1128
0
    auto c16 = _mm512_set1_epi32(16);
1129
0
    auto c32 = _mm512_set1_epi32(32);
1130
0
    auto low16bit = _mm512_set1_epi32(0x0000FFFF);
1131
0
    auto lzhi =
1132
0
        _mm512_sub_epi32(c16, _mm512_min_epu32(c16, _mm512_lzcnt_epi32(vec)));
1133
0
    auto lzlo = _mm512_sub_epi32(
1134
0
        c32, _mm512_lzcnt_epi32(_mm512_and_si512(low16bit, vec)));
1135
0
    return SIMDVec16{_mm512_or_si512(lzlo, _mm512_slli_epi32(lzhi, 16))};
1136
0
  }
1137
1138
0
  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
1139
0
    return SIMDVec16{_mm512_subs_epu16(vec, to_subtract.vec)};
1140
0
  }
1141
0
  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
1142
0
    return SIMDVec16{_mm512_sub_epi16(vec, to_subtract.vec)};
1143
0
  }
1144
0
  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
1145
0
    return SIMDVec16{_mm512_add_epi16(vec, oth.vec)};
1146
0
  }
1147
0
  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
1148
0
    return SIMDVec16{_mm512_min_epu16(vec, oth.vec)};
1149
0
  }
1150
0
  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
1151
0
    return Mask16{_mm512_cmpeq_epi16_mask(vec, oth.vec)};
1152
0
  }
1153
0
  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
1154
0
    return Mask16{_mm512_cmpgt_epi16_mask(vec, oth.vec)};
1155
0
  }
1156
0
  FJXL_INLINE SIMDVec16 Pow2() const {
1157
0
    return SIMDVec16{_mm512_sllv_epi16(_mm512_set1_epi16(1), vec)};
1158
0
  }
1159
0
  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
1160
0
    return SIMDVec16{_mm512_or_si512(vec, oth.vec)};
1161
0
  }
1162
0
  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
1163
0
    return SIMDVec16{_mm512_xor_si512(vec, oth.vec)};
1164
0
  }
1165
0
  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
1166
0
    return SIMDVec16{_mm512_and_si512(vec, oth.vec)};
1167
0
  }
1168
0
  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
1169
0
    return SIMDVec16{_mm512_srai_epi16(_mm512_add_epi16(vec, oth.vec), 1)};
1170
0
  }
1171
0
  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
1172
0
    return SIMDVec16{_mm512_or_si512(vec, _mm512_set1_epi16(0xFF00))};
1173
0
  }
1174
0
  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
1175
0
    return SIMDVec16{_mm512_shuffle_epi8(
1176
0
        _mm512_broadcast_i32x4(_mm_loadu_si128((__m128i*)table)), vec)};
1177
0
  }
1178
0
  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
1179
0
    auto lo = _mm512_unpacklo_epi16(low.vec, vec);
1180
0
    auto hi = _mm512_unpackhi_epi16(low.vec, vec);
1181
0
    alignas(64) uint64_t perm1[8] = {0, 1, 8, 9, 2, 3, 10, 11};
1182
0
    alignas(64) uint64_t perm2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
1183
0
    return {SIMDVec16{_mm512_permutex2var_epi64(
1184
0
                lo, _mm512_load_si512((__m512i*)perm1), hi)},
1185
0
            SIMDVec16{_mm512_permutex2var_epi64(
1186
0
                lo, _mm512_load_si512((__m512i*)perm2), hi)}};
1187
0
  }
1188
0
  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
1189
0
    auto lo = _mm512_unpacklo_epi16(vec, _mm512_setzero_si512());
1190
0
    auto hi = _mm512_unpackhi_epi16(vec, _mm512_setzero_si512());
1191
0
    alignas(64) uint64_t perm1[8] = {0, 1, 8, 9, 2, 3, 10, 11};
1192
0
    alignas(64) uint64_t perm2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
1193
0
    return {SIMDVec32{_mm512_permutex2var_epi64(
1194
0
                lo, _mm512_load_si512((__m512i*)perm1), hi)},
1195
0
            SIMDVec32{_mm512_permutex2var_epi64(
1196
0
                lo, _mm512_load_si512((__m512i*)perm2), hi)}};
1197
0
  }
1198
  template <size_t i>
1199
0
  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
1200
0
    return SIMDVec16{_mm512_srai_epi16(vec, i)};
1201
0
  }
1202
1203
0
  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
1204
0
    __m256i bytes = _mm256_loadu_si256((__m256i*)data);
1205
0
    return {SIMDVec16{_mm512_cvtepu8_epi16(bytes)}};
1206
0
  }
1207
0
  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
1208
0
    return {Load((const uint16_t*)data)};
1209
0
  }
1210
1211
0
  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
1212
0
    __m512i bytes = _mm512_loadu_si512((__m512i*)data);
1213
0
    __m512i gray = _mm512_and_si512(bytes, _mm512_set1_epi16(0xFF));
1214
0
    __m512i alpha = _mm512_srli_epi16(bytes, 8);
1215
0
    return {SIMDVec16{gray}, SIMDVec16{alpha}};
1216
0
  }
1217
0
  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
1218
0
    __m512i bytes1 = _mm512_loadu_si512((__m512i*)data);
1219
0
    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 64));
1220
0
    __m512i g_mask = _mm512_set1_epi32(0xFFFF);
1221
0
    __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
1222
0
    __m512i g = _mm512_permutexvar_epi64(
1223
0
        permuteidx, _mm512_packus_epi32(_mm512_and_si512(bytes1, g_mask),
1224
0
                                        _mm512_and_si512(bytes2, g_mask)));
1225
0
    __m512i a = _mm512_permutexvar_epi64(
1226
0
        permuteidx, _mm512_packus_epi32(_mm512_srli_epi32(bytes1, 16),
1227
0
                                        _mm512_srli_epi32(bytes2, 16)));
1228
0
    return {SIMDVec16{g}, SIMDVec16{a}};
1229
0
  }
1230
1231
0
  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
1232
0
    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
1233
0
    __m512i bytes1 =
1234
0
        _mm512_zextsi256_si512(_mm256_loadu_si256((__m256i*)(data + 64)));
1235
1236
    // 0x7A = element of upper half of second vector = 0 after lookup; still in
1237
    // the upper half once we add 1 or 2.
1238
0
    uint8_t z = 0x7A;
1239
0
    __m512i ridx =
1240
0
        _mm512_set_epi8(z, 93, z, 90, z, 87, z, 84, z, 81, z, 78, z, 75, z, 72,
1241
0
                        z, 69, z, 66, z, 63, z, 60, z, 57, z, 54, z, 51, z, 48,
1242
0
                        z, 45, z, 42, z, 39, z, 36, z, 33, z, 30, z, 27, z, 24,
1243
0
                        z, 21, z, 18, z, 15, z, 12, z, 9, z, 6, z, 3, z, 0);
1244
0
    __m512i gidx = _mm512_add_epi8(ridx, _mm512_set1_epi8(1));
1245
0
    __m512i bidx = _mm512_add_epi8(gidx, _mm512_set1_epi8(1));
1246
0
    __m512i r = _mm512_permutex2var_epi8(bytes0, ridx, bytes1);
1247
0
    __m512i g = _mm512_permutex2var_epi8(bytes0, gidx, bytes1);
1248
0
    __m512i b = _mm512_permutex2var_epi8(bytes0, bidx, bytes1);
1249
0
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}};
1250
0
  }
1251
0
  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
1252
0
    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
1253
0
    __m512i bytes1 = _mm512_loadu_si512((__m512i*)(data + 64));
1254
0
    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 128));
1255
1256
0
    __m512i ridx_lo = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 63, 60, 57,
1257
0
                                       54, 51, 48, 45, 42, 39, 36, 33, 30, 27,
1258
0
                                       24, 21, 18, 15, 12, 9, 6, 3, 0);
1259
    // -1 is such that when adding 1 or 2, we get the correct index for
1260
    // green/blue.
1261
0
    __m512i ridx_hi =
1262
0
        _mm512_set_epi16(29, 26, 23, 20, 17, 14, 11, 8, 5, 2, -1, 0, 0, 0, 0, 0,
1263
0
                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
1264
0
    __m512i gidx_lo = _mm512_add_epi16(ridx_lo, _mm512_set1_epi16(1));
1265
0
    __m512i gidx_hi = _mm512_add_epi16(ridx_hi, _mm512_set1_epi16(1));
1266
0
    __m512i bidx_lo = _mm512_add_epi16(gidx_lo, _mm512_set1_epi16(1));
1267
0
    __m512i bidx_hi = _mm512_add_epi16(gidx_hi, _mm512_set1_epi16(1));
1268
1269
0
    __mmask32 rmask = _cvtu32_mask32(0b11111111110000000000000000000000);
1270
0
    __mmask32 gbmask = _cvtu32_mask32(0b11111111111000000000000000000000);
1271
1272
0
    __m512i rlo = _mm512_permutex2var_epi16(bytes0, ridx_lo, bytes1);
1273
0
    __m512i glo = _mm512_permutex2var_epi16(bytes0, gidx_lo, bytes1);
1274
0
    __m512i blo = _mm512_permutex2var_epi16(bytes0, bidx_lo, bytes1);
1275
0
    __m512i r = _mm512_mask_permutexvar_epi16(rlo, rmask, ridx_hi, bytes2);
1276
0
    __m512i g = _mm512_mask_permutexvar_epi16(glo, gbmask, gidx_hi, bytes2);
1277
0
    __m512i b = _mm512_mask_permutexvar_epi16(blo, gbmask, bidx_hi, bytes2);
1278
0
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}};
1279
0
  }
1280
1281
0
  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
1282
0
    __m512i bytes1 = _mm512_loadu_si512((__m512i*)data);
1283
0
    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 64));
1284
0
    __m512i rg_mask = _mm512_set1_epi32(0xFFFF);
1285
0
    __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
1286
0
    __m512i rg = _mm512_permutexvar_epi64(
1287
0
        permuteidx, _mm512_packus_epi32(_mm512_and_si512(bytes1, rg_mask),
1288
0
                                        _mm512_and_si512(bytes2, rg_mask)));
1289
0
    __m512i b_a = _mm512_permutexvar_epi64(
1290
0
        permuteidx, _mm512_packus_epi32(_mm512_srli_epi32(bytes1, 16),
1291
0
                                        _mm512_srli_epi32(bytes2, 16)));
1292
0
    __m512i r = _mm512_and_si512(rg, _mm512_set1_epi16(0xFF));
1293
0
    __m512i g = _mm512_srli_epi16(rg, 8);
1294
0
    __m512i b = _mm512_and_si512(b_a, _mm512_set1_epi16(0xFF));
1295
0
    __m512i a = _mm512_srli_epi16(b_a, 8);
1296
0
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
1297
0
  }
1298
0
  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
1299
0
    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
1300
0
    __m512i bytes1 = _mm512_loadu_si512((__m512i*)(data + 64));
1301
0
    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 128));
1302
0
    __m512i bytes3 = _mm512_loadu_si512((__m512i*)(data + 192));
1303
1304
0
    auto pack32 = [](__m512i a, __m512i b) {
1305
0
      __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
1306
0
      return _mm512_permutexvar_epi64(permuteidx, _mm512_packus_epi32(a, b));
1307
0
    };
1308
0
    auto packlow32 = [&pack32](__m512i a, __m512i b) {
1309
0
      __m512i mask = _mm512_set1_epi32(0xFFFF);
1310
0
      return pack32(_mm512_and_si512(a, mask), _mm512_and_si512(b, mask));
1311
0
    };
1312
0
    auto packhi32 = [&pack32](__m512i a, __m512i b) {
1313
0
      return pack32(_mm512_srli_epi32(a, 16), _mm512_srli_epi32(b, 16));
1314
0
    };
1315
1316
0
    __m512i rb0 = packlow32(bytes0, bytes1);
1317
0
    __m512i rb1 = packlow32(bytes2, bytes3);
1318
0
    __m512i ga0 = packhi32(bytes0, bytes1);
1319
0
    __m512i ga1 = packhi32(bytes2, bytes3);
1320
1321
0
    __m512i r = packlow32(rb0, rb1);
1322
0
    __m512i g = packlow32(ga0, ga1);
1323
0
    __m512i b = packhi32(rb0, rb1);
1324
0
    __m512i a = packhi32(ga0, ga1);
1325
0
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
1326
0
  }
1327
1328
0
  void SwapEndian() {
1329
0
    auto indices = _mm512_broadcast_i32x4(
1330
0
        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
1331
0
    vec = _mm512_shuffle_epi8(vec, indices);
1332
0
  }
1333
};
1334
1335
// Lane-wise select on 16-bit lanes: `if_true` where the mask bit is set,
// `if_false` elsewhere.
SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
                             const SIMDVec16& if_false) {
  // The blend takes its second vector operand for lanes whose bit is set.
  const __m512i selected =
      _mm512_mask_blend_epi16(mask, if_false.vec, if_true.vec);
  return SIMDVec16{selected};
}
1339
1340
// Lane-wise select on 32-bit lanes: `if_true` where the mask bit is set,
// `if_false` elsewhere.
SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
                             const SIMDVec32& if_false) {
  // The blend takes its second vector operand for lanes whose bit is set.
  const __m512i selected =
      _mm512_mask_blend_epi32(mask, if_false.vec, if_true.vec);
  return SIMDVec32{selected};
}
1344
1345
struct Bits64 {
1346
  static constexpr size_t kLanes = 8;
1347
1348
  __m512i nbits;
1349
  __m512i bits;
1350
1351
0
  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
1352
0
    _mm512_storeu_si512((__m512i*)nbits_out, nbits);
1353
0
    _mm512_storeu_si512((__m512i*)bits_out, bits);
1354
0
  }
1355
};
1356
1357
struct Bits32 {
1358
  __m512i nbits;
1359
  __m512i bits;
1360
1361
0
  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
1362
0
    return Bits32{nbits.vec, bits.vec};
1363
0
  }
1364
1365
0
  Bits64 Merge() const {
1366
0
    auto nbits_hi32 = _mm512_srli_epi64(nbits, 32);
1367
0
    auto nbits_lo32 = _mm512_and_si512(nbits, _mm512_set1_epi64(0xFFFFFFFF));
1368
0
    auto bits_hi32 = _mm512_srli_epi64(bits, 32);
1369
0
    auto bits_lo32 = _mm512_and_si512(bits, _mm512_set1_epi64(0xFFFFFFFF));
1370
1371
0
    auto nbits64 = _mm512_add_epi64(nbits_hi32, nbits_lo32);
1372
0
    auto bits64 =
1373
0
        _mm512_or_si512(_mm512_sllv_epi64(bits_hi32, nbits_lo32), bits_lo32);
1374
0
    return Bits64{nbits64, bits64};
1375
0
  }
1376
1377
0
  void Interleave(const Bits32& low) {
1378
0
    bits = _mm512_or_si512(_mm512_sllv_epi32(bits, low.nbits), low.bits);
1379
0
    nbits = _mm512_add_epi32(nbits, low.nbits);
1380
0
  }
1381
1382
0
  void ClipTo(size_t n) {
1383
0
    n = std::min<size_t>(n, 16);
1384
0
    constexpr uint32_t kMask[32] = {
1385
0
        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
1386
0
        ~0u, ~0u, ~0u, ~0u, ~0u, 0,   0,   0,   0,   0,   0,
1387
0
        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
1388
0
    };
1389
0
    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 16 - n));
1390
0
    nbits = _mm512_and_si512(mask, nbits);
1391
0
    bits = _mm512_and_si512(mask, bits);
1392
0
  }
1393
0
  void Skip(size_t n) {
1394
0
    n = std::min<size_t>(n, 16);
1395
0
    constexpr uint32_t kMask[32] = {
1396
0
        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
1397
0
        0,   0,   0,   0,   0,   ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
1398
0
        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
1399
0
    };
1400
0
    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 16 - n));
1401
0
    nbits = _mm512_and_si512(mask, nbits);
1402
0
    bits = _mm512_and_si512(mask, bits);
1403
0
  }
1404
};
1405
1406
struct Bits16 {
1407
  __m512i nbits;
1408
  __m512i bits;
1409
1410
0
  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
1411
0
    return Bits16{nbits.vec, bits.vec};
1412
0
  }
1413
1414
0
  Bits32 Merge() const {
1415
0
    auto nbits_hi16 = _mm512_srli_epi32(nbits, 16);
1416
0
    auto nbits_lo16 = _mm512_and_si512(nbits, _mm512_set1_epi32(0xFFFF));
1417
0
    auto bits_hi16 = _mm512_srli_epi32(bits, 16);
1418
0
    auto bits_lo16 = _mm512_and_si512(bits, _mm512_set1_epi32(0xFFFF));
1419
1420
0
    auto nbits32 = _mm512_add_epi32(nbits_hi16, nbits_lo16);
1421
0
    auto bits32 =
1422
0
        _mm512_or_si512(_mm512_sllv_epi32(bits_hi16, nbits_lo16), bits_lo16);
1423
0
    return Bits32{nbits32, bits32};
1424
0
  }
1425
1426
0
  void Interleave(const Bits16& low) {
1427
0
    bits = _mm512_or_si512(_mm512_sllv_epi16(bits, low.nbits), low.bits);
1428
0
    nbits = _mm512_add_epi16(nbits, low.nbits);
1429
0
  }
1430
1431
0
  void ClipTo(size_t n) {
1432
0
    n = std::min<size_t>(n, 32);
1433
0
    constexpr uint16_t kMask[64] = {
1434
0
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1435
0
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1436
0
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1437
0
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1438
0
        0,      0,      0,      0,      0,      0,      0,      0,
1439
0
        0,      0,      0,      0,      0,      0,      0,      0,
1440
0
        0,      0,      0,      0,      0,      0,      0,      0,
1441
0
        0,      0,      0,      0,      0,      0,      0,      0,
1442
0
    };
1443
0
    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 32 - n));
1444
0
    nbits = _mm512_and_si512(mask, nbits);
1445
0
    bits = _mm512_and_si512(mask, bits);
1446
0
  }
1447
0
  void Skip(size_t n) {
1448
0
    n = std::min<size_t>(n, 32);
1449
0
    constexpr uint16_t kMask[64] = {
1450
0
        0,      0,      0,      0,      0,      0,      0,      0,
1451
0
        0,      0,      0,      0,      0,      0,      0,      0,
1452
0
        0,      0,      0,      0,      0,      0,      0,      0,
1453
0
        0,      0,      0,      0,      0,      0,      0,      0,
1454
0
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1455
0
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1456
0
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1457
0
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1458
0
    };
1459
0
    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 32 - n));
1460
0
    nbits = _mm512_and_si512(mask, nbits);
1461
0
    bits = _mm512_and_si512(mask, bits);
1462
0
  }
1463
};
1464
1465
#endif
1466
1467
#ifdef FJXL_AVX2
1468
#define FJXL_GENERIC_SIMD
1469
1470
struct SIMDVec32;
1471
1472
struct Mask32 {
1473
  __m256i mask;
1474
  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
1475
0
  size_t CountPrefix() const {
1476
0
    return CtzNonZero(~static_cast<uint64_t>(
1477
0
        static_cast<uint8_t>(_mm256_movemask_ps(_mm256_castsi256_ps(mask)))));
1478
0
  }
1479
};
1480
1481
// AVX2 vector of 8 x 32-bit lanes held in a single __m256i.
struct SIMDVec32 {
1482
  __m256i vec;
1483
1484
  static constexpr size_t kLanes = 8;
1485
1486
0
  // Unaligned load of 8 consecutive uint32_t values.
  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
1487
0
    return SIMDVec32{_mm256_loadu_si256((__m256i*)data)};
1488
0
  }
1489
0
  // Unaligned store of all 8 lanes.
  FJXL_INLINE void Store(uint32_t* data) {
1490
0
    _mm256_storeu_si256((__m256i*)data, vec);
1491
0
  }
1492
0
  // Broadcasts `v` to every lane.
  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
1493
0
    return SIMDVec32{_mm256_set1_epi32(v)};
1494
0
  }
1495
0
  // Converts each lane (interpreted as a signed int32) to float, takes the
  // float's bits from position 23 upwards (sign + biased exponent), subtracts
  // 126 and clamps at 0. For a lane value v that converts exactly this is
  // floor(log2(v)) + 1 for v > 0, and 0 for v == 0.
  // NOTE(review): values that the int->float conversion has to round may land
  // on the next exponent; confirm the value range callers pass in.
  FJXL_INLINE SIMDVec32 ValToToken() const {
1496
0
    auto f32 = _mm256_castps_si256(_mm256_cvtepi32_ps(vec));
1497
0
    return SIMDVec32{_mm256_max_epi32(
1498
0
        _mm256_setzero_si256(),
1499
0
        _mm256_sub_epi32(_mm256_srli_epi32(f32, 23), _mm256_set1_epi32(126)))};
1500
0
  }
1501
0
  // Unsigned saturating subtraction, computed as max_u(vec, s) - s.
  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
1502
0
    return SIMDVec32{_mm256_sub_epi32(_mm256_max_epu32(vec, to_subtract.vec),
1503
0
                                      to_subtract.vec)};
1504
0
  }
1505
0
  // Lane-wise wrapping subtraction.
  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
1506
0
    return SIMDVec32{_mm256_sub_epi32(vec, to_subtract.vec)};
1507
0
  }
1508
0
  // Lane-wise wrapping addition.
  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
1509
0
    return SIMDVec32{_mm256_add_epi32(vec, oth.vec)};
1510
0
  }
1511
0
  // Bitwise XOR.
  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
1512
0
    return SIMDVec32{_mm256_xor_si256(vec, oth.vec)};
1513
0
  }
1514
0
  // Returns 1 << lane for each lane (variable per-lane left shift of 1).
  FJXL_INLINE SIMDVec32 Pow2() const {
1515
0
    return SIMDVec32{_mm256_sllv_epi32(_mm256_set1_epi32(1), vec)};
1516
0
  }
1517
0
  // Lane-wise equality mask.
  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
1518
0
    return Mask32{_mm256_cmpeq_epi32(vec, oth.vec)};
1519
0
  }
1520
0
  // Lane-wise greater-than mask; the comparison treats lanes as signed.
  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
1521
0
    return Mask32{_mm256_cmpgt_epi32(vec, oth.vec)};
1522
0
  }
1523
  // Arithmetic (sign-propagating) right shift of every lane by `i` bits.
  template <size_t i>
1524
0
  FJXL_INLINE SIMDVec32 SignedShiftRight() const {
1525
0
    return SIMDVec32{_mm256_srai_epi32(vec, i)};
1526
0
  }
1527
};
1528
1529
struct SIMDVec16;
1530
1531
// AVX2 per-lane mask for SIMDVec16 (16 lanes of 16 bits), as returned by
// SIMDVec16::Eq / SIMDVec16::Gt.
struct Mask16 {
1532
  __m256i mask;
1533
  // Per-lane select between `if_true` and `if_false`; defined after SIMDVec16.
  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
1534
0
  // Bitwise AND of two masks.
  Mask16 And(const Mask16& oth) const {
1535
0
    return Mask16{_mm256_and_si256(mask, oth.mask)};
1536
0
  }
1537
0
  // movemask_epi8 produces one bit per byte, i.e. two bits per 16-bit lane;
  // the inverted bit pattern is passed to CtzNonZero and the result halved to
  // convert from bytes to lanes. Presumably CtzNonZero counts trailing zero
  // bits, so this is the number of leading lanes whose mask is set (0..16) -
  // TODO confirm against CtzNonZero's definition, which is outside this block.
  size_t CountPrefix() const {
1538
0
    return CtzNonZero(~static_cast<uint64_t>(
1539
0
               static_cast<uint32_t>(_mm256_movemask_epi8(mask)))) /
1540
0
           2;
1541
0
  }
1542
};
1543
1544
// AVX2 vector of 16 x 16-bit lanes held in a single __m256i, plus loaders that
// de-interleave 8- and 16-bit pixel data into one vector per channel.
struct SIMDVec16 {
1545
  __m256i vec;
1546
1547
  static constexpr size_t kLanes = 16;
1548
1549
0
  // Unaligned load of 16 consecutive uint16_t values.
  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
1550
0
    return SIMDVec16{_mm256_loadu_si256((__m256i*)data)};
1551
0
  }
1552
0
  // Unaligned store of all 16 lanes.
  FJXL_INLINE void Store(uint16_t* data) {
1553
0
    _mm256_storeu_si256((__m256i*)data, vec);
1554
0
  }
1555
0
  // Broadcasts `v` to every lane.
  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
1556
0
    return SIMDVec16{_mm256_set1_epi16(v)};
1557
0
  }
1558
  // Narrows two 8x32-bit vectors into one 16x16-bit vector (lo lanes first).
  // The pack saturates to the unsigned 16-bit range and works independently
  // on each 128-bit half, so the 64-bit quarters are reordered (0, 2, 1, 3)
  // afterwards to restore lane order.
  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
1559
0
                                         const SIMDVec32& hi) {
1560
0
    auto tmp = _mm256_packus_epi32(lo.vec, hi.vec);
1561
0
    return SIMDVec16{_mm256_permute4x64_epi64(tmp, 0b11011000)};
1562
0
  }
1563
1564
0
  // Computes the bit length of each lane (0 for a zero lane) using four
  // 16-entry byte lookups, one per nibble.
  FJXL_INLINE SIMDVec16 ValToToken() const {
1565
0
    // Each nibbleK holds nibble K in the low byte of the lane. The high byte
    // is forced to 0xFF so the byte shuffle below writes 0 there.
    auto nibble0 =
1566
0
        _mm256_or_si256(_mm256_and_si256(vec, _mm256_set1_epi16(0xF)),
1567
0
                        _mm256_set1_epi16(0xFF00));
1568
0
    auto nibble1 = _mm256_or_si256(
1569
0
        _mm256_and_si256(_mm256_srli_epi16(vec, 4), _mm256_set1_epi16(0xF)),
1570
0
        _mm256_set1_epi16(0xFF00));
1571
0
    auto nibble2 = _mm256_or_si256(
1572
0
        _mm256_and_si256(_mm256_srli_epi16(vec, 8), _mm256_set1_epi16(0xF)),
1573
0
        _mm256_set1_epi16(0xFF00));
1574
0
    auto nibble3 =
1575
0
        _mm256_or_si256(_mm256_srli_epi16(vec, 12), _mm256_set1_epi16(0xFF00));
1576
1577
0
    // lutK[j] is 0 for j == 0, otherwise 4 * K + (bit length of j).
    auto lut0 = _mm256_broadcastsi128_si256(
1578
0
        _mm_setr_epi8(0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4));
1579
0
    auto lut1 = _mm256_broadcastsi128_si256(
1580
0
        _mm_setr_epi8(0, 5, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8));
1581
0
    auto lut2 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
1582
0
        0, 9, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12));
1583
0
    auto lut3 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
1584
0
        0, 13, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16));
1585
1586
0
    auto token0 = _mm256_shuffle_epi8(lut0, nibble0);
1587
0
    auto token1 = _mm256_shuffle_epi8(lut1, nibble1);
1588
0
    auto token2 = _mm256_shuffle_epi8(lut2, nibble2);
1589
0
    auto token3 = _mm256_shuffle_epi8(lut3, nibble3);
1590
1591
0
    // The highest non-zero nibble yields the largest value.
    auto token = _mm256_max_epi16(_mm256_max_epi16(token0, token1),
1592
0
                                  _mm256_max_epi16(token2, token3));
1593
0
    return SIMDVec16{token};
1594
0
  }
1595
1596
0
  // Unsigned saturating subtraction.
  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
1597
0
    return SIMDVec16{_mm256_subs_epu16(vec, to_subtract.vec)};
1598
0
  }
1599
0
  // Lane-wise wrapping subtraction.
  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
1600
0
    return SIMDVec16{_mm256_sub_epi16(vec, to_subtract.vec)};
1601
0
  }
1602
0
  // Lane-wise wrapping addition.
  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
1603
0
    return SIMDVec16{_mm256_add_epi16(vec, oth.vec)};
1604
0
  }
1605
0
  // Lane-wise unsigned minimum.
  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
1606
0
    return SIMDVec16{_mm256_min_epu16(vec, oth.vec)};
1607
0
  }
1608
0
  // Lane-wise equality mask.
  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
1609
0
    return Mask16{_mm256_cmpeq_epi16(vec, oth.vec)};
1610
0
  }
1611
0
  // Lane-wise greater-than mask; the comparison treats lanes as signed.
  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
1612
0
    return Mask16{_mm256_cmpgt_epi16(vec, oth.vec)};
1613
0
  }
1614
0
  // Returns 1 << lane for lane values 0..15, built from two byte lookups
  // because AVX2 has no per-lane variable shift for 16-bit lanes.
  FJXL_INLINE SIMDVec16 Pow2() const {
1615
0
    // Non-zero only for exponents 0..7: the low byte of the result.
    auto pow2_lo_lut = _mm256_broadcastsi128_si256(
1616
0
        _mm_setr_epi8(1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6,
1617
0
                      1u << 7, 0, 0, 0, 0, 0, 0, 0, 0));
1618
0
    // Non-zero only for exponents 8..15: the high byte of the result.
    auto pow2_hi_lut = _mm256_broadcastsi128_si256(
1619
0
        _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1 << 0, 1 << 1, 1 << 2, 1 << 3,
1620
0
                      1 << 4, 1 << 5, 1 << 6, 1u << 7));
1621
1622
0
    // Force the high byte of every lane to 0xFF so the shuffle writes 0 there.
    auto masked = _mm256_or_si256(vec, _mm256_set1_epi16(0xFF00));
1623
1624
0
    auto pow2_lo = _mm256_shuffle_epi8(pow2_lo_lut, masked);
1625
0
    auto pow2_hi = _mm256_shuffle_epi8(pow2_hi_lut, masked);
1626
1627
0
    auto pow2 = _mm256_or_si256(_mm256_slli_epi16(pow2_hi, 8), pow2_lo);
1628
0
    return SIMDVec16{pow2};
1629
0
  }
1630
0
  // Bitwise OR.
  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
1631
0
    return SIMDVec16{_mm256_or_si256(vec, oth.vec)};
1632
0
  }
1633
0
  // Bitwise XOR.
  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
1634
0
    return SIMDVec16{_mm256_xor_si256(vec, oth.vec)};
1635
0
  }
1636
0
  // Bitwise AND.
  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
1637
0
    return SIMDVec16{_mm256_and_si256(vec, oth.vec)};
1638
0
  }
1639
0
  // Half-sum: wrapping 16-bit add followed by an arithmetic right shift by 1.
  // NOTE(review): exact only while the 16-bit sum does not reach 0x8000;
  // presumably guaranteed by the callers' value range - confirm.
  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
1640
0
    return SIMDVec16{_mm256_srai_epi16(_mm256_add_epi16(vec, oth.vec), 1)};
1641
0
  }
1642
0
  // Sets the high byte of every lane to 0xFF so that a following U8Lookup
  // yields 0 in that byte.
  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
1643
0
    return SIMDVec16{_mm256_or_si256(vec, _mm256_set1_epi16(0xFF00))};
1644
0
  }
1645
0
  // Byte shuffle that uses the bytes of `vec` as indices into the 16 bytes
  // read from `table` (the table is replicated into both 128-bit halves).
  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
1646
0
    return SIMDVec16{_mm256_shuffle_epi8(
1647
0
        _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)table)), vec)};
1648
0
  }
1649
0
  // Interleaves lanes of `low` and of this vector (low[0], this[0], low[1],
  // this[1], ...), returning the first and second 16 lanes of that sequence.
  // The unpacks operate per 128-bit half, hence the 128-bit permutes.
  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
1650
0
    auto v02 = _mm256_unpacklo_epi16(low.vec, vec);
1651
0
    auto v13 = _mm256_unpackhi_epi16(low.vec, vec);
1652
0
    return {SIMDVec16{_mm256_permute2x128_si256(v02, v13, 0x20)},
1653
0
            SIMDVec16{_mm256_permute2x128_si256(v02, v13, 0x31)}};
1654
0
  }
1655
0
  // Zero-extends the 16 lanes to 32 bits: lanes 0..7 in the first vector,
  // lanes 8..15 in the second.
  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
1656
0
    auto v02 = _mm256_unpacklo_epi16(vec, _mm256_setzero_si256());
1657
0
    auto v13 = _mm256_unpackhi_epi16(vec, _mm256_setzero_si256());
1658
0
    return {SIMDVec32{_mm256_permute2x128_si256(v02, v13, 0x20)},
1659
0
            SIMDVec32{_mm256_permute2x128_si256(v02, v13, 0x31)}};
1660
0
  }
1661
  // Arithmetic (sign-propagating) right shift of every lane by `i` bits.
  template <size_t i>
1662
0
  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
1663
0
    return SIMDVec16{_mm256_srai_epi16(vec, i)};
1664
0
  }
1665
1666
0
  // Reads 16 bytes and zero-extends each byte to a 16-bit lane.
  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
1667
0
    __m128i bytes = _mm_loadu_si128((__m128i*)data);
1668
0
    return {SIMDVec16{_mm256_cvtepu8_epi16(bytes)}};
1669
0
  }
1670
0
  // Reads 32 bytes as 16 lanes without any conversion.
  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
1671
0
    return {Load((const uint16_t*)data)};
1672
0
  }
1673
1674
0
  // Reads 32 bytes of interleaved byte pairs; the first byte of each pair goes
  // to element 0 of the result, the second to element 1.
  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
1675
0
    __m256i bytes = _mm256_loadu_si256((__m256i*)data);
1676
0
    __m256i gray = _mm256_and_si256(bytes, _mm256_set1_epi16(0xFF));
1677
0
    __m256i alpha = _mm256_srli_epi16(bytes, 8);
1678
0
    return {SIMDVec16{gray}, SIMDVec16{alpha}};
1679
0
  }
1680
0
  // Reads 64 bytes of interleaved 16-bit pairs and splits them into the first
  // and second element of every pair.
  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
1681
0
    __m256i bytes1 = _mm256_loadu_si256((__m256i*)data);
1682
0
    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 32));
1683
0
    __m256i g_mask = _mm256_set1_epi32(0xFFFF);
1684
0
    // Low 16 bits of every 32-bit pair; the permute undoes the per-128-bit
    // behaviour of the pack.
    __m256i g = _mm256_permute4x64_epi64(
1685
0
        _mm256_packus_epi32(_mm256_and_si256(bytes1, g_mask),
1686
0
                            _mm256_and_si256(bytes2, g_mask)),
1687
0
        0b11011000);
1688
0
    // High 16 bits of every 32-bit pair.
    __m256i a = _mm256_permute4x64_epi64(
1689
0
        _mm256_packus_epi32(_mm256_srli_epi32(bytes1, 16),
1690
0
                            _mm256_srli_epi32(bytes2, 16)),
1691
0
        0b11011000);
1692
0
    return {SIMDVec16{g}, SIMDVec16{a}};
1693
0
  }
1694
1695
0
  // Reads 48 bytes of interleaved byte triples and returns three vectors, one
  // per position within the triple, zero-extended to 16-bit lanes.
  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
1696
0
    __m128i bytes0 = _mm_loadu_si128((__m128i*)data);
1697
0
    __m128i bytes1 = _mm_loadu_si128((__m128i*)(data + 16));
1698
0
    __m128i bytes2 = _mm_loadu_si128((__m128i*)(data + 32));
1699
1700
0
    // Groups every third byte of a 16-byte chunk together:
    // offsets 0,3,..,15 | 2,5,..,14 | 1,4,..,13.
    __m128i idx =
1701
0
        _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
1702
1703
0
    __m128i r6b5g5_0 = _mm_shuffle_epi8(bytes0, idx);
1704
0
    __m128i g6r5b5_1 = _mm_shuffle_epi8(bytes1, idx);
1705
0
    __m128i b6g5r5_2 = _mm_shuffle_epi8(bytes2, idx);
1706
1707
0
    // Blend masks selecting bytes 6..10 and bytes 11..15 respectively.
    __m128i mask010 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF,
1708
0
                                    0xFF, 0, 0, 0, 0, 0);
1709
0
    __m128i mask001 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF,
1710
0
                                    0xFF, 0xFF, 0xFF);
1711
1712
0
    __m128i b2g2b1 = _mm_blendv_epi8(b6g5r5_2, g6r5b5_1, mask001);
1713
0
    __m128i b2b0b1 = _mm_blendv_epi8(b2g2b1, r6b5g5_0, mask010);
1714
1715
0
    __m128i r0r1b1 = _mm_blendv_epi8(r6b5g5_0, g6r5b5_1, mask010);
1716
0
    __m128i r0r1r2 = _mm_blendv_epi8(r0r1b1, b6g5r5_2, mask001);
1717
1718
0
    __m128i g1r1g0 = _mm_blendv_epi8(g6r5b5_1, r6b5g5_0, mask001);
1719
0
    __m128i g1g2g0 = _mm_blendv_epi8(g1r1g0, b6g5r5_2, mask010);
1720
1721
0
    // Byte rotations that bring the blended groups into pixel order.
    __m128i g0g1g2 = _mm_alignr_epi8(g1g2g0, g1g2g0, 11);
1722
0
    __m128i b0b1b2 = _mm_alignr_epi8(b2b0b1, b2b0b1, 6);
1723
1724
0
    return {SIMDVec16{_mm256_cvtepu8_epi16(r0r1r2)},
1725
0
            SIMDVec16{_mm256_cvtepu8_epi16(g0g1g2)},
1726
0
            SIMDVec16{_mm256_cvtepu8_epi16(b0b1b2)}};
1727
0
  }
1728
0
  // Reads 96 bytes of interleaved 16-bit triples and returns three vectors,
  // one per position within the triple. Low and high bytes are separated
  // first so the byte-level de-interleave used for 8-bit data can be applied
  // to each 128-bit half, and are recombined at the end.
  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
1729
0
    auto load_and_split_lohi = [](const unsigned char* data) {
1730
      // LHLHLH...
1731
0
      __m256i bytes = _mm256_loadu_si256((__m256i*)data);
1732
      // L0L0L0...
1733
0
      __m256i lo = _mm256_and_si256(bytes, _mm256_set1_epi16(0xFF));
1734
      // H0H0H0...
1735
0
      __m256i hi = _mm256_srli_epi16(bytes, 8);
1736
      // LLLLLLLLHHHHHHHHLLLLLLLLHHHHHHHH
1737
0
      __m256i packed = _mm256_packus_epi16(lo, hi);
1738
0
      return _mm256_permute4x64_epi64(packed, 0b11011000);
1739
0
    };
1740
0
    __m256i bytes0 = load_and_split_lohi(data);
1741
0
    __m256i bytes1 = load_and_split_lohi(data + 32);
1742
0
    __m256i bytes2 = load_and_split_lohi(data + 64);
1743
1744
0
    __m256i idx = _mm256_broadcastsi128_si256(
1745
0
        _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13));
1746
1747
0
    __m256i r6b5g5_0 = _mm256_shuffle_epi8(bytes0, idx);
1748
0
    __m256i g6r5b5_1 = _mm256_shuffle_epi8(bytes1, idx);
1749
0
    __m256i b6g5r5_2 = _mm256_shuffle_epi8(bytes2, idx);
1750
1751
0
    __m256i mask010 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
1752
0
        0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0));
1753
0
    __m256i mask001 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
1754
0
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF));
1755
1756
0
    __m256i b2g2b1 = _mm256_blendv_epi8(b6g5r5_2, g6r5b5_1, mask001);
1757
0
    __m256i b2b0b1 = _mm256_blendv_epi8(b2g2b1, r6b5g5_0, mask010);
1758
1759
0
    __m256i r0r1b1 = _mm256_blendv_epi8(r6b5g5_0, g6r5b5_1, mask010);
1760
0
    __m256i r0r1r2 = _mm256_blendv_epi8(r0r1b1, b6g5r5_2, mask001);
1761
1762
0
    __m256i g1r1g0 = _mm256_blendv_epi8(g6r5b5_1, r6b5g5_0, mask001);
1763
0
    __m256i g1g2g0 = _mm256_blendv_epi8(g1r1g0, b6g5r5_2, mask010);
1764
1765
0
    __m256i g0g1g2 = _mm256_alignr_epi8(g1g2g0, g1g2g0, 11);
1766
0
    __m256i b0b1b2 = _mm256_alignr_epi8(b2b0b1, b2b0b1, 6);
1767
1768
    // Now r0r1r2, g0g1g2, b0b1b2 have the low bytes of the RGB pixels in their
1769
    // lower half, and the high bytes in their upper half.
1770
1771
0
    // Rebuilds 16-bit lanes as (high byte << 8) | low byte.
    auto combine_low_hi = [](__m256i v) {
1772
0
      __m128i low = _mm256_extracti128_si256(v, 0);
1773
0
      __m128i hi = _mm256_extracti128_si256(v, 1);
1774
0
      __m256i low16 = _mm256_cvtepu8_epi16(low);
1775
0
      __m256i hi16 = _mm256_cvtepu8_epi16(hi);
1776
0
      return _mm256_or_si256(_mm256_slli_epi16(hi16, 8), low16);
1777
0
    };
1778
1779
0
    return {SIMDVec16{combine_low_hi(r0r1r2)},
1780
0
            SIMDVec16{combine_low_hi(g0g1g2)},
1781
0
            SIMDVec16{combine_low_hi(b0b1b2)}};
1782
0
  }
1783
1784
0
  // Reads 64 bytes of interleaved byte quadruples and returns four vectors,
  // one per position within the quadruple, zero-extended to 16-bit lanes.
  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
1785
0
    __m256i bytes1 = _mm256_loadu_si256((__m256i*)data);
1786
0
    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 32));
1787
0
    __m256i rg_mask = _mm256_set1_epi32(0xFFFF);
1788
0
    // Low 16 bits of each 32-bit quadruple (bytes 0 and 1).
    __m256i rg = _mm256_permute4x64_epi64(
1789
0
        _mm256_packus_epi32(_mm256_and_si256(bytes1, rg_mask),
1790
0
                            _mm256_and_si256(bytes2, rg_mask)),
1791
0
        0b11011000);
1792
0
    // High 16 bits of each 32-bit quadruple (bytes 2 and 3).
    __m256i b_a = _mm256_permute4x64_epi64(
1793
0
        _mm256_packus_epi32(_mm256_srli_epi32(bytes1, 16),
1794
0
                            _mm256_srli_epi32(bytes2, 16)),
1795
0
        0b11011000);
1796
0
    __m256i r = _mm256_and_si256(rg, _mm256_set1_epi16(0xFF));
1797
0
    __m256i g = _mm256_srli_epi16(rg, 8);
1798
0
    __m256i b = _mm256_and_si256(b_a, _mm256_set1_epi16(0xFF));
1799
0
    __m256i a = _mm256_srli_epi16(b_a, 8);
1800
0
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
1801
0
  }
1802
0
  // Reads 128 bytes of interleaved 16-bit quadruples and returns four vectors,
  // one per position within the quadruple, using two rounds of 32->16 packing.
  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
1803
0
    __m256i bytes0 = _mm256_loadu_si256((__m256i*)data);
1804
0
    __m256i bytes1 = _mm256_loadu_si256((__m256i*)(data + 32));
1805
0
    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 64));
1806
0
    __m256i bytes3 = _mm256_loadu_si256((__m256i*)(data + 96));
1807
1808
0
    // Pack with the 64-bit reorder that restores lane order across halves.
    auto pack32 = [](__m256i a, __m256i b) {
1809
0
      return _mm256_permute4x64_epi64(_mm256_packus_epi32(a, b), 0b11011000);
1810
0
    };
1811
0
    // Keeps the low 16 bits of every 32-bit element of a and b.
    auto packlow32 = [&pack32](__m256i a, __m256i b) {
1812
0
      __m256i mask = _mm256_set1_epi32(0xFFFF);
1813
0
      return pack32(_mm256_and_si256(a, mask), _mm256_and_si256(b, mask));
1814
0
    };
1815
0
    // Keeps the high 16 bits of every 32-bit element of a and b.
    auto packhi32 = [&pack32](__m256i a, __m256i b) {
1816
0
      return pack32(_mm256_srli_epi32(a, 16), _mm256_srli_epi32(b, 16));
1817
0
    };
1818
1819
0
    // First round: even 16-bit elements (r, b alternating) and odd ones
    // (g, a alternating).
    __m256i rb0 = packlow32(bytes0, bytes1);
1820
0
    __m256i rb1 = packlow32(bytes2, bytes3);
1821
0
    __m256i ga0 = packhi32(bytes0, bytes1);
1822
0
    __m256i ga1 = packhi32(bytes2, bytes3);
1823
1824
0
    // Second round separates the alternating pairs.
    __m256i r = packlow32(rb0, rb1);
1825
0
    __m256i g = packlow32(ga0, ga1);
1826
0
    __m256i b = packhi32(rb0, rb1);
1827
0
    __m256i a = packhi32(ga0, ga1);
1828
0
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
1829
0
  }
1830
1831
0
  // Swaps the two bytes of every 16-bit lane in place.
  void SwapEndian() {
1832
0
    auto indices = _mm256_broadcastsi128_si256(
1833
0
        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
1834
0
    vec = _mm256_shuffle_epi8(vec, indices);
1835
0
  }
1836
};
1837
1838
// Byte-wise blend: takes the byte from `if_true` where the top bit of the
// corresponding mask byte is set, otherwise from `if_false`.
SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
1839
0
                             const SIMDVec16& if_false) {
1840
0
  return SIMDVec16{_mm256_blendv_epi8(if_false.vec, if_true.vec, mask)};
1841
0
}
1842
1843
// Byte-wise blend: takes the byte from `if_true` where the top bit of the
// corresponding mask byte is set, otherwise from `if_false`.
SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
1844
0
                             const SIMDVec32& if_false) {
1845
0
  return SIMDVec32{_mm256_blendv_epi8(if_false.vec, if_true.vec, mask)};
1846
0
}
1847
1848
// AVX2 pair of vectors with 4 x 64-bit lanes each: `nbits` and `bits`, as
// produced by Bits32::Merge (a bit count and the corresponding bit pattern).
struct Bits64 {
1849
  static constexpr size_t kLanes = 4;
1850
1851
  __m256i nbits;
1852
  __m256i bits;
1853
1854
0
  // Unaligned store of the 4 lanes of `nbits` and of `bits` to the two
  // output arrays.
  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
1855
0
    _mm256_storeu_si256((__m256i*)nbits_out, nbits);
1856
0
    _mm256_storeu_si256((__m256i*)bits_out, bits);
1857
0
  }
1858
};
1859
1860
// AVX2 pair of vectors with 8 x 32-bit lanes each: a per-lane bit count
// (`nbits`) and the corresponding bit pattern (`bits`).
struct Bits32 {
1861
  __m256i nbits;
1862
  __m256i bits;
1863
1864
0
  // Wraps two SIMDVec32 values without modifying them.
  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
1865
0
    return Bits32{nbits.vec, bits.vec};
1866
0
  }
1867
1868
0
  // Combines each pair of adjacent 32-bit lanes into one 64-bit lane: the
  // counts are added and the odd lane's bits are placed above the even
  // lane's bits (shifted left by the even lane's count).
  Bits64 Merge() const {
1869
0
    auto nbits_hi32 = _mm256_srli_epi64(nbits, 32);
1870
0
    auto nbits_lo32 = _mm256_and_si256(nbits, _mm256_set1_epi64x(0xFFFFFFFF));
1871
0
    auto bits_hi32 = _mm256_srli_epi64(bits, 32);
1872
0
    auto bits_lo32 = _mm256_and_si256(bits, _mm256_set1_epi64x(0xFFFFFFFF));
1873
1874
0
    auto nbits64 = _mm256_add_epi64(nbits_hi32, nbits_lo32);
1875
0
    auto bits64 =
1876
0
        _mm256_or_si256(_mm256_sllv_epi64(bits_hi32, nbits_lo32), bits_lo32);
1877
0
    return Bits64{nbits64, bits64};
1878
0
  }
1879
1880
0
  // Per lane: places this object's bits above `low`'s bits
  // (bits = (bits << low.nbits) | low.bits) and adds the counts.
  void Interleave(const Bits32& low) {
1881
0
    bits = _mm256_or_si256(_mm256_sllv_epi32(bits, low.nbits), low.bits);
1882
0
    nbits = _mm256_add_epi32(nbits, low.nbits);
1883
0
  }
1884
1885
0
  // Keeps the first min(n, 8) lanes of `nbits` and `bits` and zeroes the rest.
  void ClipTo(size_t n) {
1886
0
    n = std::min<size_t>(n, 8);
1887
0
    constexpr uint32_t kMask[16] = {
1888
0
        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0, 0, 0, 0, 0,
1889
0
    };
1890
0
    // Reading 8 entries from offset (8 - n) gives n all-ones lanes followed
    // by zero lanes.
    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 8 - n));
1891
0
    nbits = _mm256_and_si256(mask, nbits);
1892
0
    bits = _mm256_and_si256(mask, bits);
1893
0
  }
1894
0
  // Zeroes the first min(n, 8) lanes of `nbits` and `bits` and keeps the rest.
  void Skip(size_t n) {
1895
0
    n = std::min<size_t>(n, 8);
1896
0
    constexpr uint32_t kMask[16] = {
1897
0
        0, 0, 0, 0, 0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
1898
0
    };
1899
0
    // Reading 8 entries from offset (8 - n) gives n zero lanes followed by
    // all-ones lanes.
    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 8 - n));
1900
0
    nbits = _mm256_and_si256(mask, nbits);
1901
0
    bits = _mm256_and_si256(mask, bits);
1902
0
  }
1903
};
1904
1905
// AVX2 pair of vectors with 16 x 16-bit lanes each: a per-lane bit count
// (`nbits`) and the corresponding bit pattern (`bits`).
struct Bits16 {
1906
  __m256i nbits;
1907
  __m256i bits;
1908
1909
0
  // Wraps two SIMDVec16 values without modifying them.
  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
1910
0
    return Bits16{nbits.vec, bits.vec};
1911
0
  }
1912
1913
0
  // Combines each pair of adjacent 16-bit lanes into one 32-bit lane: the
  // counts are added and the odd lane's bits are placed above the even
  // lane's bits (shifted left by the even lane's count).
  Bits32 Merge() const {
1914
0
    auto nbits_hi16 = _mm256_srli_epi32(nbits, 16);
1915
0
    auto nbits_lo16 = _mm256_and_si256(nbits, _mm256_set1_epi32(0xFFFF));
1916
0
    auto bits_hi16 = _mm256_srli_epi32(bits, 16);
1917
0
    auto bits_lo16 = _mm256_and_si256(bits, _mm256_set1_epi32(0xFFFF));
1918
1919
0
    auto nbits32 = _mm256_add_epi32(nbits_hi16, nbits_lo16);
1920
0
    auto bits32 =
1921
0
        _mm256_or_si256(_mm256_sllv_epi32(bits_hi16, nbits_lo16), bits_lo16);
1922
0
    return Bits32{nbits32, bits32};
1923
0
  }
1924
1925
0
  // Per lane: places this object's bits above `low`'s bits and adds the
  // counts. The left shift by low.nbits is done as a multiplication by
  // 2^low.nbits taken from a byte lookup table.
  // NOTE(review): the table is non-zero only for indices 0..7, so a
  // low.nbits lane of 8 or more multiplies by 0; presumably callers keep
  // low.nbits below 8 - confirm.
  void Interleave(const Bits16& low) {
1926
0
    auto pow2_lo_lut = _mm256_broadcastsi128_si256(
1927
0
        _mm_setr_epi8(1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6,
1928
0
                      1u << 7, 0, 0, 0, 0, 0, 0, 0, 0));
1929
0
    // High byte forced to 0xFF so the shuffle writes 0 into it.
    auto low_nbits_masked =
1930
0
        _mm256_or_si256(low.nbits, _mm256_set1_epi16(0xFF00));
1931
1932
0
    auto bits_shifted = _mm256_mullo_epi16(
1933
0
        bits, _mm256_shuffle_epi8(pow2_lo_lut, low_nbits_masked));
1934
1935
0
    nbits = _mm256_add_epi16(nbits, low.nbits);
1936
0
    bits = _mm256_or_si256(bits_shifted, low.bits);
1937
0
  }
1938
1939
0
  // Keeps the first min(n, 16) lanes of `nbits` and `bits` and zeroes the
  // rest.
  void ClipTo(size_t n) {
1940
0
    n = std::min<size_t>(n, 16);
1941
0
    constexpr uint16_t kMask[32] = {
1942
0
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1943
0
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1944
0
        0,      0,      0,      0,      0,      0,      0,      0,
1945
0
        0,      0,      0,      0,      0,      0,      0,      0,
1946
0
    };
1947
0
    // Reading 16 entries from offset (16 - n) gives n all-ones lanes followed
    // by zero lanes.
    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 16 - n));
1948
0
    nbits = _mm256_and_si256(mask, nbits);
1949
0
    bits = _mm256_and_si256(mask, bits);
1950
0
  }
1951
1952
0
  // Zeroes the first min(n, 16) lanes of `nbits` and `bits` and keeps the
  // rest.
  void Skip(size_t n) {
1953
0
    n = std::min<size_t>(n, 16);
1954
0
    constexpr uint16_t kMask[32] = {
1955
0
        0,      0,      0,      0,      0,      0,      0,      0,
1956
0
        0,      0,      0,      0,      0,      0,      0,      0,
1957
0
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1958
0
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1959
0
    };
1960
0
    // Reading 16 entries from offset (16 - n) gives n zero lanes followed by
    // all-ones lanes.
    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 16 - n));
1961
0
    nbits = _mm256_and_si256(mask, nbits);
1962
0
    bits = _mm256_and_si256(mask, bits);
1963
0
  }
1964
};
1965
1966
#endif
1967
1968
#ifdef FJXL_NEON
1969
#define FJXL_GENERIC_SIMD
1970
1971
struct SIMDVec32;
1972
1973
// NEON per-lane mask for SIMDVec32 (4 lanes of 32 bits), as returned by
// SIMDVec32::Eq / SIMDVec32::Gt.
struct Mask32 {
1974
  uint32x4_t mask;
1975
  // Per-lane select between `if_true` and `if_false`; defined after SIMDVec32.
  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
1976
  // Bitwise AND of two masks.
  Mask32 And(const Mask32& oth) const {
1977
    return Mask32{vandq_u32(mask, oth.mask)};
1978
  }
1979
  // Returns the index of the first lane whose mask is not set, or 4 if every
  // lane is set: set lanes are replaced by 4, unset lanes by their own index,
  // and the minimum across lanes is taken. Assumes each mask lane is all-ones
  // or all-zeros, as the comparison functions produce.
  size_t CountPrefix() const {
1980
    uint32_t val_unset[4] = {0, 1, 2, 3};
1981
    uint32_t val_set[4] = {4, 4, 4, 4};
1982
    uint32x4_t val = vbslq_u32(mask, vld1q_u32(val_set), vld1q_u32(val_unset));
1983
    return vminvq_u32(val);
1984
  }
1985
};
1986
1987
// NEON vector of 4 x 32-bit lanes held in a uint32x4_t.
struct SIMDVec32 {
1988
  uint32x4_t vec;
1989
1990
  static constexpr size_t kLanes = 4;
1991
1992
  // Loads 4 consecutive uint32_t values.
  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
1993
    return SIMDVec32{vld1q_u32(data)};
1994
  }
1995
  // Stores all 4 lanes.
  FJXL_INLINE void Store(uint32_t* data) { vst1q_u32(data, vec); }
1996
  // Broadcasts `v` to every lane.
  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
1997
    return SIMDVec32{vdupq_n_u32(v)};
1998
  }
1999
  // Bit length of each lane: 32 minus the number of leading zero bits
  // (0 for a zero lane).
  FJXL_INLINE SIMDVec32 ValToToken() const {
2000
    return SIMDVec32{vsubq_u32(vdupq_n_u32(32), vclzq_u32(vec))};
2001
  }
2002
  // Unsigned saturating subtraction.
  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
2003
    return SIMDVec32{vqsubq_u32(vec, to_subtract.vec)};
2004
  }
2005
  // Lane-wise wrapping subtraction.
  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
2006
    return SIMDVec32{vsubq_u32(vec, to_subtract.vec)};
2007
  }
2008
  // Lane-wise wrapping addition.
  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
2009
    return SIMDVec32{vaddq_u32(vec, oth.vec)};
2010
  }
2011
  // Bitwise XOR.
  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
2012
    return SIMDVec32{veorq_u32(vec, oth.vec)};
2013
  }
2014
  // Returns 1 shifted left by each lane's value (the lane is reinterpreted as
  // a signed shift amount).
  FJXL_INLINE SIMDVec32 Pow2() const {
2015
    return SIMDVec32{vshlq_u32(vdupq_n_u32(1), vreinterpretq_s32_u32(vec))};
2016
  }
2017
  // Lane-wise equality mask.
  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
2018
    return Mask32{vceqq_u32(vec, oth.vec)};
2019
  }
2020
  // Lane-wise greater-than mask; lanes are compared as signed 32-bit values.
  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
2021
    return Mask32{
2022
        vcgtq_s32(vreinterpretq_s32_u32(vec), vreinterpretq_s32_u32(oth.vec))};
2023
  }
2024
  // Arithmetic (sign-propagating) right shift of every lane by `i` bits.
  template <size_t i>
2025
  FJXL_INLINE SIMDVec32 SignedShiftRight() const {
2026
    return SIMDVec32{
2027
        vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(vec), i))};
2028
  }
2029
};
2030
2031
struct SIMDVec16;
2032
2033
// NEON per-lane mask for SIMDVec16 (8 lanes of 16 bits), as returned by
// SIMDVec16::Eq / SIMDVec16::Gt.
struct Mask16 {
2034
  uint16x8_t mask;
2035
  // Per-lane select between `if_true` and `if_false`; defined after SIMDVec16.
  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
2036
  // Bitwise AND of two masks.
  Mask16 And(const Mask16& oth) const {
2037
    return Mask16{vandq_u16(mask, oth.mask)};
2038
  }
2039
  // Returns the index of the first lane whose mask is not set, or 8 if every
  // lane is set: set lanes are replaced by 8, unset lanes by their own index,
  // and the minimum across lanes is taken. Assumes each mask lane is all-ones
  // or all-zeros, as the comparison functions produce.
  size_t CountPrefix() const {
2040
    uint16_t val_unset[8] = {0, 1, 2, 3, 4, 5, 6, 7};
2041
    uint16_t val_set[8] = {8, 8, 8, 8, 8, 8, 8, 8};
2042
    uint16x8_t val = vbslq_u16(mask, vld1q_u16(val_set), vld1q_u16(val_unset));
2043
    return vminvq_u16(val);
2044
  }
2045
};
2046
2047
// NEON vector of 8 x 16-bit lanes held in a uint16x8_t, plus loaders that
// de-interleave 8- and 16-bit pixel data into one vector per channel.
struct SIMDVec16 {
2048
  uint16x8_t vec;
2049
2050
  static constexpr size_t kLanes = 8;
2051
2052
  // Loads 8 consecutive uint16_t values.
  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
2053
    return SIMDVec16{vld1q_u16(data)};
2054
  }
2055
  // Stores all 8 lanes.
  FJXL_INLINE void Store(uint16_t* data) { vst1q_u16(data, vec); }
2056
  // Broadcasts `v` to every lane.
  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
2057
    return SIMDVec16{vdupq_n_u16(v)};
2058
  }
2059
  // Narrows two 4x32-bit vectors into one 8x16-bit vector (lo lanes first),
  // keeping the low 16 bits of each lane (no saturation).
  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
2060
                                         const SIMDVec32& hi) {
2061
    return SIMDVec16{vmovn_high_u32(vmovn_u32(lo.vec), hi.vec)};
2062
  }
2063
2064
  // Bit length of each lane: 16 minus the number of leading zero bits
  // (0 for a zero lane).
  FJXL_INLINE SIMDVec16 ValToToken() const {
2065
    return SIMDVec16{vsubq_u16(vdupq_n_u16(16), vclzq_u16(vec))};
2066
  }
2067
  // Unsigned saturating subtraction.
  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
2068
    return SIMDVec16{vqsubq_u16(vec, to_subtract.vec)};
2069
  }
2070
  // Lane-wise wrapping subtraction.
  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
2071
    return SIMDVec16{vsubq_u16(vec, to_subtract.vec)};
2072
  }
2073
  // Lane-wise wrapping addition.
  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
2074
    return SIMDVec16{vaddq_u16(vec, oth.vec)};
2075
  }
2076
  // Lane-wise unsigned minimum.
  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
2077
    return SIMDVec16{vminq_u16(vec, oth.vec)};
2078
  }
2079
  // Lane-wise equality mask.
  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
2080
    return Mask16{vceqq_u16(vec, oth.vec)};
2081
  }
2082
  // Lane-wise greater-than mask; lanes are compared as signed 16-bit values.
  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
2083
    return Mask16{
2084
        vcgtq_s16(vreinterpretq_s16_u16(vec), vreinterpretq_s16_u16(oth.vec))};
2085
  }
2086
  // Returns 1 shifted left by each lane's value (the lane is reinterpreted as
  // a signed shift amount).
  FJXL_INLINE SIMDVec16 Pow2() const {
2087
    return SIMDVec16{vshlq_u16(vdupq_n_u16(1), vreinterpretq_s16_u16(vec))};
2088
  }
2089
  // Bitwise OR.
  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
2090
    return SIMDVec16{vorrq_u16(vec, oth.vec)};
2091
  }
2092
  // Bitwise XOR.
  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
2093
    return SIMDVec16{veorq_u16(vec, oth.vec)};
2094
  }
2095
  // Bitwise AND.
  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
2096
    return SIMDVec16{vandq_u16(vec, oth.vec)};
2097
  }
2098
  // Unsigned halving add: (vec + oth) >> 1 per lane.
  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
2099
    return SIMDVec16{vhaddq_u16(vec, oth.vec)};
2100
  }
2101
  // Sets the high byte of every lane to 0xFF so that a following U8Lookup
  // sees an out-of-range index there and yields 0 in that byte.
  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
2102
    return SIMDVec16{vorrq_u16(vec, vdupq_n_u16(0xFF00))};
2103
  }
2104
  // Byte table lookup that uses the bytes of `vec` as indices into the 16
  // bytes read from `table`.
  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
2105
    uint8x16_t tbl = vld1q_u8(table);
2106
    uint8x16_t indices = vreinterpretq_u8_u16(vec);
2107
    return SIMDVec16{vreinterpretq_u16_u8(vqtbl1q_u8(tbl, indices))};
2108
  }
2109
  // Interleaves lanes of `low` and of this vector (low[0], this[0], low[1],
  // this[1], ...), returning the first and second 8 lanes of that sequence.
  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
2110
    return {SIMDVec16{vzip1q_u16(low.vec, vec)},
2111
            SIMDVec16{vzip2q_u16(low.vec, vec)}};
2112
  }
2113
  // Zero-extends the 8 lanes to 32 bits: lanes 0..3 in the first vector,
  // lanes 4..7 in the second.
  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
2114
    uint32x4_t lo = vmovl_u16(vget_low_u16(vec));
2115
    uint32x4_t hi = vmovl_high_u16(vec);
2116
    return {SIMDVec32{lo}, SIMDVec32{hi}};
2117
  }
2118
  // Arithmetic (sign-propagating) right shift of every lane by `i` bits.
  template <size_t i>
2119
  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
2120
    return SIMDVec16{
2121
        vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(vec), i))};
2122
  }
2123
2124
  // Reads 8 bytes and zero-extends each byte to a 16-bit lane.
  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
2125
    uint8x8_t v = vld1_u8(data);
2126
    return {SIMDVec16{vmovl_u8(v)}};
2127
  }
2128
  // Reads 16 bytes as 8 lanes without any conversion.
  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
2129
    return {Load((const uint16_t*)data)};
2130
  }
2131
2132
  // De-interleaving load of 8 byte pairs; each element is zero-extended to
  // 16 bits.
  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
2133
    uint8x8x2_t v = vld2_u8(data);
2134
    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])}};
2135
  }
2136
  // De-interleaving load of 8 pairs of 16-bit values.
  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
2137
    uint16x8x2_t v = vld2q_u16((const uint16_t*)data);
2138
    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}};
2139
  }
2140
2141
  // De-interleaving load of 8 byte triples; each element is zero-extended to
  // 16 bits.
  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
2142
    uint8x8x3_t v = vld3_u8(data);
2143
    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])},
2144
            SIMDVec16{vmovl_u8(v.val[2])}};
2145
  }
2146
  // De-interleaving load of 8 triples of 16-bit values.
  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
2147
    uint16x8x3_t v = vld3q_u16((const uint16_t*)data);
2148
    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}, SIMDVec16{v.val[2]}};
2149
  }
2150
2151
  // De-interleaving load of 8 byte quadruples; each element is zero-extended
  // to 16 bits.
  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
2152
    uint8x8x4_t v = vld4_u8(data);
2153
    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])},
2154
            SIMDVec16{vmovl_u8(v.val[2])}, SIMDVec16{vmovl_u8(v.val[3])}};
2155
  }
2156
  // De-interleaving load of 8 quadruples of 16-bit values.
  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
2157
    uint16x8x4_t v = vld4q_u16((const uint16_t*)data);
2158
    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}, SIMDVec16{v.val[2]},
2159
            SIMDVec16{v.val[3]}};
2160
  }
2161
2162
  // Swaps the two bytes of every 16-bit lane in place.
  void SwapEndian() {
2163
    vec = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(vec)));
2164
  }
2165
};
2166
2167
// Bitwise select: result bits come from `if_true` where the mask bit is 1 and
// from `if_false` where it is 0.
SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
2168
                             const SIMDVec16& if_false) {
2169
  return SIMDVec16{vbslq_u16(mask, if_true.vec, if_false.vec)};
2170
}
2171
2172
// Bitwise select: result bits come from `if_true` where the mask bit is 1 and
// from `if_false` where it is 0.
SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
2173
                             const SIMDVec32& if_false) {
2174
  return SIMDVec32{vbslq_u32(mask, if_true.vec, if_false.vec)};
2175
}
2176
2177
// NEON pair of vectors with 2 x 64-bit lanes each: `nbits` and `bits`, as
// produced by Bits32::Merge (a bit count and the corresponding bit pattern).
struct Bits64 {
2178
  static constexpr size_t kLanes = 2;
2179
2180
  uint64x2_t nbits;
2181
  uint64x2_t bits;
2182
2183
  // Stores the 2 lanes of `nbits` and of `bits` to the two output arrays.
  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
2184
    vst1q_u64(nbits_out, nbits);
2185
    vst1q_u64(bits_out, bits);
2186
  }
2187
};
2188
2189
// NEON pair of vectors with 4 x 32-bit lanes each: a per-lane bit count
// (`nbits`) and the corresponding bit pattern (`bits`).
struct Bits32 {
2190
  uint32x4_t nbits;
2191
  uint32x4_t bits;
2192
2193
  // Wraps two SIMDVec32 values without modifying them.
  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
2194
    return Bits32{nbits.vec, bits.vec};
2195
  }
2196
2197
  // Combines each pair of adjacent 32-bit lanes into one 64-bit lane: the
  // counts are added and the odd lane's bits are placed above the even
  // lane's bits (shifted left by the even lane's count).
  Bits64 Merge() const {
2198
    // TODO(veluca): can probably be optimized.
2199
    uint64x2_t nbits_lo32 =
2200
        vandq_u64(vreinterpretq_u64_u32(nbits), vdupq_n_u64(0xFFFFFFFF));
2201
    uint64x2_t bits_hi32 =
2202
        vshlq_u64(vshrq_n_u64(vreinterpretq_u64_u32(bits), 32),
2203
                  vreinterpretq_s64_u64(nbits_lo32));
2204
    uint64x2_t bits_lo32 =
2205
        vandq_u64(vreinterpretq_u64_u32(bits), vdupq_n_u64(0xFFFFFFFF));
2206
    // Shift-right-and-accumulate: nbits_lo32 + (nbits >> 32).
    uint64x2_t nbits64 =
2207
        vsraq_n_u64(nbits_lo32, vreinterpretq_u64_u32(nbits), 32);
2208
    uint64x2_t bits64 = vorrq_u64(bits_hi32, bits_lo32);
2209
    return Bits64{nbits64, bits64};
2210
  }
2211
2212
  // Per lane: places this object's bits above `low`'s bits
  // (bits = (bits << low.nbits) | low.bits) and adds the counts.
  void Interleave(const Bits32& low) {
2213
    bits =
2214
        vorrq_u32(vshlq_u32(bits, vreinterpretq_s32_u32(low.nbits)), low.bits);
2215
    nbits = vaddq_u32(nbits, low.nbits);
2216
  }
2217
2218
  // Keeps the first min(n, 4) lanes of `nbits` and `bits` and zeroes the rest.
  void ClipTo(size_t n) {
2219
    n = std::min<size_t>(n, 4);
2220
    constexpr uint32_t kMask[8] = {
2221
        ~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0,
2222
    };
2223
    // Reading 4 entries from offset (4 - n) gives n all-ones lanes followed
    // by zero lanes.
    uint32x4_t mask = vld1q_u32(kMask + 4 - n);
2224
    nbits = vandq_u32(mask, nbits);
2225
    bits = vandq_u32(mask, bits);
2226
  }
2227
  // Zeroes the first min(n, 4) lanes of `nbits` and `bits` and keeps the rest.
  void Skip(size_t n) {
2228
    n = std::min<size_t>(n, 4);
2229
    constexpr uint32_t kMask[8] = {
2230
        0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u,
2231
    };
2232
    // Reading 4 entries from offset (4 - n) gives n zero lanes followed by
    // all-ones lanes.
    uint32x4_t mask = vld1q_u32(kMask + 4 - n);
2233
    nbits = vandq_u32(mask, nbits);
2234
    bits = vandq_u32(mask, bits);
2235
  }
2236
};
2237
2238
struct Bits16 {
2239
  uint16x8_t nbits;
2240
  uint16x8_t bits;
2241
2242
  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
2243
    return Bits16{nbits.vec, bits.vec};
2244
  }
2245
2246
  Bits32 Merge() const {
2247
    // TODO(veluca): can probably be optimized.
2248
    uint32x4_t nbits_lo16 =
2249
        vandq_u32(vreinterpretq_u32_u16(nbits), vdupq_n_u32(0xFFFF));
2250
    uint32x4_t bits_hi16 =
2251
        vshlq_u32(vshrq_n_u32(vreinterpretq_u32_u16(bits), 16),
2252
                  vreinterpretq_s32_u32(nbits_lo16));
2253
    uint32x4_t bits_lo16 =
2254
        vandq_u32(vreinterpretq_u32_u16(bits), vdupq_n_u32(0xFFFF));
2255
    uint32x4_t nbits32 =
2256
        vsraq_n_u32(nbits_lo16, vreinterpretq_u32_u16(nbits), 16);
2257
    uint32x4_t bits32 = vorrq_u32(bits_hi16, bits_lo16);
2258
    return Bits32{nbits32, bits32};
2259
  }
2260
2261
  void Interleave(const Bits16& low) {
2262
    bits =
2263
        vorrq_u16(vshlq_u16(bits, vreinterpretq_s16_u16(low.nbits)), low.bits);
2264
    nbits = vaddq_u16(nbits, low.nbits);
2265
  }
2266
2267
  void ClipTo(size_t n) {
2268
    n = std::min<size_t>(n, 8);
2269
    constexpr uint16_t kMask[16] = {
2270
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
2271
        0,      0,      0,      0,      0,      0,      0,      0,
2272
    };
2273
    uint16x8_t mask = vld1q_u16(kMask + 8 - n);
2274
    nbits = vandq_u16(mask, nbits);
2275
    bits = vandq_u16(mask, bits);
2276
  }
2277
  void Skip(size_t n) {
2278
    n = std::min<size_t>(n, 8);
2279
    constexpr uint16_t kMask[16] = {
2280
        0,      0,      0,      0,      0,      0,      0,      0,
2281
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
2282
    };
2283
    uint16x8_t mask = vld1q_u16(kMask + 8 - n);
2284
    nbits = vandq_u16(mask, nbits);
2285
    bits = vandq_u16(mask, bits);
2286
  }
2287
};
2288
2289
#endif
2290
2291
#ifdef FJXL_GENERIC_SIMD
2292
constexpr size_t SIMDVec32::kLanes;
2293
constexpr size_t SIMDVec16::kLanes;
2294
2295
//  Each of these functions will process SIMDVec16::kLanes worth of values.
2296
2297
// Tokenizes one SIMDVec16 worth of 16-bit residuals. For every lane it stores
// the token, the number of extra bits (token - 1, saturating at 0) and the
// extra bits themselves (residual - 2^nbits, saturating at 0).
FJXL_INLINE void TokenizeSIMD(const uint16_t* residuals, uint16_t* token_out,
                              uint16_t* nbits_out, uint16_t* bits_out) {
  SIMDVec16 values = SIMDVec16::Load(residuals);
  SIMDVec16 tokens = values.ValToToken();
  SIMDVec16 one = SIMDVec16::Val(1);
  SIMDVec16 extra_nbits = tokens.SatSubU(one);
  SIMDVec16 leading_pow2 = extra_nbits.Pow2();
  SIMDVec16 extra_bits = values.SatSubU(leading_pow2);

  tokens.Store(token_out);
  extra_nbits.Store(nbits_out);
  extra_bits.Store(bits_out);
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::TokenizeSIMD(unsigned short const*, unsigned short*, unsigned short*, unsigned short*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::TokenizeSIMD(unsigned short const*, unsigned short*, unsigned short*, unsigned short*)
2307
2308
// Tokenizes two SIMDVec32 vectors (= one SIMDVec16 worth of lanes) of 32-bit
// residuals. Tokens are narrowed to 16 bits and stored as a single vector;
// extra-bit counts and extra bits stay 32-bit and are stored as two
// consecutive vectors.
FJXL_INLINE void TokenizeSIMD(const uint32_t* residuals, uint16_t* token_out,
                              uint32_t* nbits_out, uint32_t* bits_out) {
  static_assert(SIMDVec16::kLanes == 2 * SIMDVec32::kLanes,
                "There should be twice more 16-bit lanes than 32-bit lanes");
  constexpr size_t kHalf = SIMDVec32::kLanes;
  SIMDVec32 one = SIMDVec32::Val(1);

  // First half of the lanes.
  SIMDVec32 values_a = SIMDVec32::Load(residuals);
  SIMDVec32 tokens_a = values_a.ValToToken();
  SIMDVec32 extra_nbits_a = tokens_a.SatSubU(one);
  SIMDVec32 extra_bits_a = values_a.SatSubU(extra_nbits_a.Pow2());

  // Second half of the lanes.
  SIMDVec32 values_b = SIMDVec32::Load(residuals + kHalf);
  SIMDVec32 tokens_b = values_b.ValToToken();
  SIMDVec32 extra_nbits_b = tokens_b.SatSubU(one);
  SIMDVec32 extra_bits_b = values_b.SatSubU(extra_nbits_b.Pow2());

  SIMDVec16 tokens = SIMDVec16::FromTwo32(tokens_a, tokens_b);
  tokens.Store(token_out);
  extra_nbits_a.Store(nbits_out);
  extra_nbits_b.Store(nbits_out + kHalf);
  extra_bits_a.Store(bits_out);
  extra_bits_b.Store(bits_out + kHalf);
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::TokenizeSIMD(unsigned int const*, unsigned short*, unsigned int*, unsigned int*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::TokenizeSIMD(unsigned int const*, unsigned short*, unsigned int*, unsigned int*)
2327
2328
// Looks up the Huffman code of one vector of tokens: each token is used
// directly as an index into the two 8-bit tables, and the resulting lengths
// and code bits are stored as 16-bit lanes.
FJXL_INLINE void HuffmanSIMDUpTo13(const uint16_t* tokens,
                                   const uint8_t* raw_nbits_simd,
                                   const uint8_t* raw_bits_simd,
                                   uint16_t* nbits_out, uint16_t* bits_out) {
  SIMDVec16 index = SIMDVec16::Load(tokens).PrepareForU8Lookup();
  SIMDVec16 code_lengths = index.U8Lookup(raw_nbits_simd);
  code_lengths.Store(nbits_out);
  SIMDVec16 code_bits = index.U8Lookup(raw_bits_simd);
  code_bits.Store(bits_out);
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::HuffmanSIMDUpTo13(unsigned short const*, unsigned char const*, unsigned char const*, unsigned short*, unsigned short*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::HuffmanSIMDUpTo13(unsigned short const*, unsigned char const*, unsigned char const*, unsigned short*, unsigned short*)
2336
2337
// Huffman lookup for one vector of tokens where token values go up to 16 but
// the tables only hold 16 entries: tokens are capped at 15 for the lookup and
// token 16 is reconstructed from the entry of token 15.
FJXL_INLINE void HuffmanSIMD14(const uint16_t* tokens,
                               const uint8_t* raw_nbits_simd,
                               const uint8_t* raw_bits_simd,
                               uint16_t* nbits_out, uint16_t* bits_out) {
  SIMDVec16 raw_tokens = SIMDVec16::Load(tokens);
  SIMDVec16 max_index = SIMDVec16::Val(15);
  SIMDVec16 index = raw_tokens.Min(max_index).PrepareForU8Lookup();
  SIMDVec16 looked_up_bits = index.U8Lookup(raw_bits_simd);

  // The Huffman code is built so that the codes of tokens 15 and 16 differ
  // only in the highest bit; set that bit (128) in the lanes holding token 16.
  Mask16 is_token_16 = raw_tokens.Eq(SIMDVec16::Val(16));
  SIMDVec16 with_high_bit = looked_up_bits.Or(SIMDVec16::Val(128));
  SIMDVec16 code_bits = is_token_16.IfThenElse(with_high_bit, looked_up_bits);

  code_bits.Store(bits_out);
  SIMDVec16 code_lengths = index.U8Lookup(raw_nbits_simd);
  code_lengths.Store(nbits_out);
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::HuffmanSIMD14(unsigned short const*, unsigned char const*, unsigned char const*, unsigned short*, unsigned short*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::HuffmanSIMD14(unsigned short const*, unsigned char const*, unsigned char const*, unsigned short*, unsigned short*)
2354
2355
// Huffman lookup for one vector of tokens where tokens above 12 are folded in
// pairs onto a shared table entry; the member of each pair with an even token
// value gets bit 128 OR-ed into its code bits.
FJXL_INLINE void HuffmanSIMDAbove14(const uint16_t* tokens,
                                    const uint8_t* raw_nbits_simd,
                                    const uint8_t* raw_bits_simd,
                                    uint16_t* nbits_out, uint16_t* bits_out) {
  SIMDVec16 raw_tokens = SIMDVec16::Load(tokens);
  // We assume the tokens fit in a *signed* 16-bit integer.
  Mask16 is_folded = raw_tokens.Gt(SIMDVec16::Val(12));

  // Folding of the table index (applied only to tokens above 12):
  //   13, 14 -> 13
  //   15, 16 -> 14
  //   17, 18 -> 15
  SIMDVec16 folded_tokens = raw_tokens.HAdd(SIMDVec16::Val(13));
  SIMDVec16 table_tokens = is_folded.IfThenElse(folded_tokens, raw_tokens);
  SIMDVec16 index = table_tokens.PrepareForU8Lookup();
  SIMDVec16 looked_up_bits = index.U8Lookup(raw_bits_simd);

  // Tokens 14, 16 and 18 (folded and with a clear lowest bit) need the highest
  // bit set.
  SIMDVec16 lowest_bit_cleared = raw_tokens.And(SIMDVec16::Val(0xFFFE));
  Mask16 is_even = raw_tokens.Eq(lowest_bit_cleared);
  Mask16 wants_high_bit = is_folded.And(is_even);
  SIMDVec16 with_high_bit = looked_up_bits.Or(SIMDVec16::Val(128));
  SIMDVec16 code_bits = wants_high_bit.IfThenElse(with_high_bit, looked_up_bits);

  code_bits.Store(bits_out);
  SIMDVec16 code_lengths = index.U8Lookup(raw_nbits_simd);
  code_lengths.Store(nbits_out);
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::HuffmanSIMDAbove14(unsigned short const*, unsigned char const*, unsigned char const*, unsigned short*, unsigned short*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::HuffmanSIMDAbove14(unsigned short const*, unsigned char const*, unsigned char const*, unsigned short*, unsigned short*)
2375
2376
// Combines, per 16-bit lane, the Huffman code (low part) with the extra bits
// of the token (high part), blanks every lane outside [skip, n) and writes the
// pairwise-merged result to bits_out[0].
FJXL_INLINE void StoreSIMDUpTo8(const uint16_t* nbits_tok,
                                const uint16_t* bits_tok,
                                const uint16_t* nbits_huff,
                                const uint16_t* bits_huff, size_t n,
                                size_t skip, Bits32* bits_out) {
  Bits16 huffman =
      Bits16::FromRaw(SIMDVec16::Load(nbits_huff), SIMDVec16::Load(bits_huff));
  Bits16 combined =
      Bits16::FromRaw(SIMDVec16::Load(nbits_tok), SIMDVec16::Load(bits_tok));
  combined.Interleave(huffman);
  combined.ClipTo(n);
  combined.Skip(skip);
  *bits_out = combined.Merge();
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::StoreSIMDUpTo8(unsigned short const*, unsigned short const*, unsigned short const*, unsigned short const*, unsigned long, unsigned long, AVX512::(anonymous namespace)::Bits32*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::StoreSIMDUpTo8(unsigned short const*, unsigned short const*, unsigned short const*, unsigned short const*, unsigned long, unsigned long, AVX2::(anonymous namespace)::Bits32*)
2390
2391
// Huffman and raw bits don't necessarily fit in a single u16 here.
2392
// Variant for codes where Huffman bits plus extra bits may exceed 16 bits:
// the token and Huffman vectors are interleaved lane by lane (giving two
// vectors with two 16-bit lanes per sample), lanes belonging to samples
// outside [skip, n) are blanked, and each vector is merged pairwise into
// bits_out[0] and bits_out[1].
FJXL_INLINE void StoreSIMDUpTo14(const uint16_t* nbits_tok,
                                 const uint16_t* bits_tok,
                                 const uint16_t* nbits_huff,
                                 const uint16_t* bits_huff, size_t n,
                                 size_t skip, Bits32* bits_out) {
  constexpr size_t kVecLanes = SIMDVec16::kLanes;
  // Every sample occupies two 16-bit lanes after interleaving.
  const size_t end_lane = 2 * n;
  const size_t begin_lane = 2 * skip;

  VecPair<SIMDVec16> payload =
      SIMDVec16::Load(bits_tok).Interleave(SIMDVec16::Load(bits_huff));
  VecPair<SIMDVec16> lengths =
      SIMDVec16::Load(nbits_tok).Interleave(SIMDVec16::Load(nbits_huff));

  Bits16 first = Bits16::FromRaw(lengths.low, payload.low);
  Bits16 second = Bits16::FromRaw(lengths.hi, payload.hi);

  first.ClipTo(end_lane);
  first.Skip(begin_lane);
  // Lane limits relative to the second vector, clamped at zero.
  second.ClipTo(end_lane > kVecLanes ? end_lane - kVecLanes : 0);
  second.Skip(begin_lane > kVecLanes ? begin_lane - kVecLanes : 0);

  bits_out[0] = first.Merge();
  bits_out[1] = second.Merge();
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::StoreSIMDUpTo14(unsigned short const*, unsigned short const*, unsigned short const*, unsigned short const*, unsigned long, unsigned long, AVX512::(anonymous namespace)::Bits32*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::StoreSIMDUpTo14(unsigned short const*, unsigned short const*, unsigned short const*, unsigned short const*, unsigned long, unsigned long, AVX2::(anonymous namespace)::Bits32*)
2411
2412
// Variant working on 32-bit token data: the 16-bit Huffman vectors are widened
// to two 32-bit vectors, combined with the two vectors of token extra bits,
// blanked outside [skip, n) and written unmerged to bits_out[0] and
// bits_out[1].
FJXL_INLINE void StoreSIMDAbove14(const uint32_t* nbits_tok,
                                  const uint32_t* bits_tok,
                                  const uint16_t* nbits_huff,
                                  const uint16_t* bits_huff, size_t n,
                                  size_t skip, Bits32* bits_out) {
  static_assert(SIMDVec16::kLanes == 2 * SIMDVec32::kLanes,
                "There should be twice more 16-bit lanes than 32-bit lanes");
  constexpr size_t kHalf = SIMDVec32::kLanes;

  Bits32 first =
      Bits32::FromRaw(SIMDVec32::Load(nbits_tok), SIMDVec32::Load(bits_tok));
  Bits32 second = Bits32::FromRaw(SIMDVec32::Load(nbits_tok + kHalf),
                                  SIMDVec32::Load(bits_tok + kHalf));

  VecPair<SIMDVec32> wide_huff_bits = SIMDVec16::Load(bits_huff).Upcast();
  VecPair<SIMDVec32> wide_huff_nbits = SIMDVec16::Load(nbits_huff).Upcast();
  Bits32 huff_first = Bits32::FromRaw(wide_huff_nbits.low, wide_huff_bits.low);
  Bits32 huff_second = Bits32::FromRaw(wide_huff_nbits.hi, wide_huff_bits.hi);

  first.Interleave(huff_first);
  first.ClipTo(n);
  first.Skip(skip);
  bits_out[0] = first;

  second.Interleave(huff_second);
  // Lane limits relative to the second vector, clamped at zero.
  second.ClipTo(n > kHalf ? n - kHalf : 0);
  second.Skip(skip > kHalf ? skip - kHalf : 0);
  bits_out[1] = second;
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::StoreSIMDAbove14(unsigned int const*, unsigned int const*, unsigned short const*, unsigned short const*, unsigned long, unsigned long, AVX512::(anonymous namespace)::Bits32*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::StoreSIMDAbove14(unsigned int const*, unsigned int const*, unsigned short const*, unsigned short const*, unsigned long, unsigned long, AVX2::(anonymous namespace)::Bits32*)
2440
2441
#ifdef FJXL_AVX512
2442
0
// Appends the sixteen 32-bit (nbits, bits) lanes of `bits32` to `output`.
// The pending contents of the writer's bit buffer are prepended as an extra
// lowest lane, which pushes the topmost lane out of the vector; the remaining
// lanes are packed into contiguous 64-bit chunks with SIMD and stored with one
// unaligned 64-byte store at `output.data + output.bytes_written`, and the
// pushed-out lane is then appended with scalar code. On return `bytes_written`,
// `bits_in_buffer` and `buffer` of `output` describe the new writer state.
// NOTE(review): the 64-byte store and StoreLE64 do not check capacity here;
// presumably the caller reserves enough space - confirm.
FJXL_INLINE void StoreToWriterAVX512(const Bits32& bits32, BitWriter& output) {
2443
0
  __m512i bits = bits32.bits;
2444
0
  __m512i nbits = bits32.nbits;
2445
2446
  // Insert the leftover bits from the bit buffer at the bottom of the vector
2447
  // and extract the top of the vector.
2448
0
  // NOTE(review): _mm512_cvtsi512_si32 returns a signed int, so the conversion
  // to uint64_t sign-extends when bit 31 of the extracted lane is set -
  // confirm that lanes never use bit 31.
  uint64_t trail_bits =
2449
0
      _mm512_cvtsi512_si32(_mm512_alignr_epi32(bits, bits, 15));
2450
0
  uint64_t trail_nbits =
2451
0
      _mm512_cvtsi512_si32(_mm512_alignr_epi32(nbits, nbits, 15));
2452
0
  __m512i lead_bits = _mm512_set1_epi32(output.buffer);
2453
0
  __m512i lead_nbits = _mm512_set1_epi32(output.bits_in_buffer);
2454
0
  bits = _mm512_alignr_epi32(bits, lead_bits, 15);
2455
0
  nbits = _mm512_alignr_epi32(nbits, lead_nbits, 15);
2456
2457
  // Merge 32 -> 64 bits.
2458
0
  Bits32 b{nbits, bits};
2459
0
  Bits64 b64 = b.Merge();
2460
0
  bits = b64.bits;
2461
0
  nbits = b64.nbits;
2462
2463
0
  __m512i zero = _mm512_setzero_si512();
2464
2465
0
  // sh1/sh2/sh4 move every 64-bit lane up by 1/2/4 positions, filling the
  // vacated low lanes with zero.
  auto sh1 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 7); };
2466
0
  auto sh2 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 6); };
2467
0
  auto sh4 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 4); };
2468
2469
  // Compute first-past-end-bit-position.
2470
0
  // (Inclusive prefix sum of nbits over the eight lanes, in three steps.)
  __m512i end_intermediate0 = _mm512_add_epi64(nbits, sh1(nbits));
2471
0
  __m512i end_intermediate1 =
2472
0
      _mm512_add_epi64(end_intermediate0, sh2(end_intermediate0));
2473
0
  __m512i end = _mm512_add_epi64(end_intermediate1, sh4(end_intermediate1));
2474
2475
0
  // Total number of bits held by the vector: the prefix sum in the top lane.
  uint64_t simd_nbits = _mm512_cvtsi512_si32(_mm512_alignr_epi64(end, end, 7));
2476
2477
  // Compute begin-bit-position.
2478
0
  __m512i begin = _mm512_sub_epi64(end, nbits);
2479
2480
  // Index of the last bit in the chunk, or the end bit if nbits==0.
2481
0
  __m512i last = _mm512_mask_sub_epi64(
2482
0
      end, _mm512_cmpneq_epi64_mask(nbits, zero), end, _mm512_set1_epi64(1));
2483
2484
0
  __m512i lane_offset_mask = _mm512_set1_epi64(63);
2485
2486
  // Starting position of the chunk that each lane will ultimately belong to.
2487
0
  __m512i chunk_start = _mm512_andnot_si512(lane_offset_mask, last);
2488
2489
  // For all lanes that contain bits belonging to two different 64-bit chunks,
2490
  // compute the number of bits that belong to the first chunk.
2491
  // total # of bits fit in a u16, so we can satsub_u16 here.
2492
0
  __m512i first_chunk_nbits = _mm512_subs_epu16(chunk_start, begin);
2493
2494
  // Move all the previous-chunk-bits to the previous lane.
2495
0
  __m512i negnbits = _mm512_sub_epi64(_mm512_set1_epi64(64), first_chunk_nbits);
2496
0
  __m512i first_chunk_bits =
2497
0
      _mm512_srlv_epi64(_mm512_sllv_epi64(bits, negnbits), negnbits);
2498
0
  __m512i first_chunk_bits_down =
2499
0
      _mm512_alignr_epi32(zero, first_chunk_bits, 2);
2500
0
  bits = _mm512_srlv_epi64(bits, first_chunk_nbits);
2501
0
  nbits = _mm512_sub_epi64(nbits, first_chunk_nbits);
2502
0
  bits = _mm512_or_si512(bits, _mm512_sllv_epi64(first_chunk_bits_down, nbits));
2503
0
  begin = _mm512_add_epi64(begin, first_chunk_nbits);
2504
2505
  // We now know that every lane should give bits to only one chunk. We can
2506
  // shift the bits and then horizontally-or-reduce them within the same chunk.
2507
0
  __m512i offset = _mm512_and_si512(begin, lane_offset_mask);
2508
0
  __m512i aligned_bits = _mm512_sllv_epi64(bits, offset);
2509
  // h-or-reduce within same chunk
2510
0
  __m512i red0 = _mm512_mask_or_epi64(
2511
0
      aligned_bits, _mm512_cmpeq_epi64_mask(sh1(chunk_start), chunk_start),
2512
0
      sh1(aligned_bits), aligned_bits);
2513
0
  __m512i red1 = _mm512_mask_or_epi64(
2514
0
      red0, _mm512_cmpeq_epi64_mask(sh2(chunk_start), chunk_start), sh2(red0),
2515
0
      red0);
2516
0
  __m512i reduced = _mm512_mask_or_epi64(
2517
0
      red1, _mm512_cmpeq_epi64_mask(sh4(chunk_start), chunk_start), sh4(red1),
2518
0
      red1);
2519
  // Extract the highest lane that belongs to each chunk (the lane that ends up
2520
  // with the OR-ed value of all the other lanes of that chunk).
2521
0
  __m512i next_chunk_start =
2522
0
      _mm512_alignr_epi32(_mm512_set1_epi64(~0), chunk_start, 2);
2523
0
  __m512i result = _mm512_maskz_compress_epi64(
2524
0
      _mm512_cmpneq_epi64_mask(chunk_start, next_chunk_start), reduced);
2525
2526
0
  _mm512_storeu_si512((__m512i*)(output.data.get() + output.bytes_written),
2527
0
                      result);
2528
2529
  // Update the bit writer and add the last 32-bit lane.
2530
  // Note that since trail_nbits was at most 32 to begin with, operating on
2531
  // trail_bits does not risk overflowing.
2532
0
  output.bytes_written += simd_nbits / 8;
2533
  // Here we are implicitly relying on the fact that simd_nbits < 512 to know
2534
  // that the byte of bit writer data we access is initialized. This is
2535
  // guaranteed because the remaining bits in the bit writer buffer are at most
2536
  // 7, so simd_nbits <= 505 always.
2537
0
  trail_bits = (trail_bits << (simd_nbits % 8)) +
2538
0
               output.data.get()[output.bytes_written];
2539
0
  trail_nbits += simd_nbits % 8;
2540
0
  StoreLE64(output.data.get() + output.bytes_written, trail_bits);
2541
0
  size_t trail_bytes = trail_nbits / 8;
2542
0
  output.bits_in_buffer = trail_nbits % 8;
2543
0
  output.buffer = trail_bits >> (trail_bytes * 8);
2544
0
  output.bytes_written += trail_bytes;
2545
0
}
2546
2547
#endif
2548
2549
template <size_t n>
2550
0
FJXL_INLINE void StoreToWriter(const Bits32* bits, BitWriter& output) {
2551
#ifdef FJXL_AVX512
2552
  static_assert(n <= 2, "n should be less or 2 for AVX512");
2553
  StoreToWriterAVX512(bits[0], output);
2554
0
  if (n == 2) {
2555
0
    StoreToWriterAVX512(bits[1], output);
2556
0
  }
2557
  return;
2558
0
#endif
2559
0
  static_assert(n <= 4, "n should be less or 4");
2560
0
  alignas(64) uint64_t nbits64[Bits64::kLanes * n];
2561
0
  alignas(64) uint64_t bits64[Bits64::kLanes * n];
2562
0
  bits[0].Merge().Store(nbits64, bits64);
2563
0
  if (n > 1) {
2564
0
    bits[1].Merge().Store(nbits64 + Bits64::kLanes, bits64 + Bits64::kLanes);
2565
0
  }
2566
0
  if (n > 2) {
2567
0
    bits[2].Merge().Store(nbits64 + 2 * Bits64::kLanes,
2568
0
                          bits64 + 2 * Bits64::kLanes);
2569
0
  }
2570
0
  if (n > 3) {
2571
0
    bits[3].Merge().Store(nbits64 + 3 * Bits64::kLanes,
2572
0
                          bits64 + 3 * Bits64::kLanes);
2573
0
  }
2574
0
  output.WriteMultiple(nbits64, bits64, Bits64::kLanes * n);
2575
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::StoreToWriter<1ul>(AVX512::(anonymous namespace)::Bits32 const*, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::StoreToWriter<1ul>(AVX2::(anonymous namespace)::Bits32 const*, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::StoreToWriter<2ul>(AVX512::(anonymous namespace)::Bits32 const*, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::StoreToWriter<2ul>(AVX2::(anonymous namespace)::Bits32 const*, (anonymous namespace)::BitWriter&)
2576
2577
namespace detail {
2578
template <typename T>
2579
struct IntegerTypes;
2580
2581
template <>
2582
struct IntegerTypes<SIMDVec16> {
2583
  using signed_ = int16_t;
2584
  using unsigned_ = uint16_t;
2585
};
2586
2587
template <>
2588
struct IntegerTypes<SIMDVec32> {
2589
  using signed_ = int32_t;
2590
  using unsigned_ = uint32_t;
2591
};
2592
2593
template <typename T>
2594
struct SIMDType;
2595
2596
template <>
2597
struct SIMDType<int16_t> {
2598
  using type = SIMDVec16;
2599
};
2600
2601
template <>
2602
struct SIMDType<int32_t> {
2603
  using type = SIMDVec32;
2604
};
2605
2606
}  // namespace detail
2607
2608
template <typename T>
2609
using signed_t = typename detail::IntegerTypes<T>::signed_;
2610
2611
template <typename T>
2612
using unsigned_t = typename detail::IntegerTypes<T>::unsigned_;
2613
2614
template <typename T>
2615
using simd_t = typename detail::SIMDType<T>::type;
2616
2617
// This function will process exactly one vector worth of pixels.
2618
2619
template <typename T>
2620
size_t PredictPixels(const signed_t<T>* pixels, const signed_t<T>* pixels_left,
2621
                     const signed_t<T>* pixels_top,
2622
                     const signed_t<T>* pixels_topleft,
2623
0
                     unsigned_t<T>* residuals) {
2624
0
  T px = T::Load((unsigned_t<T>*)pixels);
2625
0
  T left = T::Load((unsigned_t<T>*)pixels_left);
2626
0
  T top = T::Load((unsigned_t<T>*)pixels_top);
2627
0
  T topleft = T::Load((unsigned_t<T>*)pixels_topleft);
2628
0
  T ac = left.Sub(topleft);
2629
0
  T ab = left.Sub(top);
2630
0
  T bc = top.Sub(topleft);
2631
0
  T grad = ac.Add(top);
2632
0
  T d = ab.Xor(bc);
2633
0
  T zero = T::Val(0);
2634
0
  T clamp = zero.Gt(d).IfThenElse(top, left);
2635
0
  T s = ac.Xor(bc);
2636
0
  T pred = zero.Gt(s).IfThenElse(grad, clamp);
2637
0
  T res = px.Sub(pred);
2638
0
  T res_times_2 = res.Add(res);
2639
0
  res = zero.Gt(res).IfThenElse(T::Val(-1).Sub(res_times_2), res_times_2);
2640
0
  res.Store(residuals);
2641
0
  return res.Eq(T::Val(0)).CountPrefix();
2642
0
}
Unexecuted instantiation: enc_fast_lossless.cc:unsigned long AVX512::(anonymous namespace)::PredictPixels<AVX512::(anonymous namespace)::SIMDVec16>(AVX512::(anonymous namespace)::detail::IntegerTypes<AVX512::(anonymous namespace)::SIMDVec16>::signed_ const*, AVX512::(anonymous namespace)::detail::IntegerTypes<AVX512::(anonymous namespace)::SIMDVec16>::signed_ const*, AVX512::(anonymous namespace)::detail::IntegerTypes<AVX512::(anonymous namespace)::SIMDVec16>::signed_ const*, AVX512::(anonymous namespace)::detail::IntegerTypes<AVX512::(anonymous namespace)::SIMDVec16>::signed_ const*, AVX512::(anonymous namespace)::detail::IntegerTypes<AVX512::(anonymous namespace)::SIMDVec16>::unsigned_*)
Unexecuted instantiation: enc_fast_lossless.cc:unsigned long AVX512::(anonymous namespace)::PredictPixels<AVX512::(anonymous namespace)::SIMDVec32>(AVX512::(anonymous namespace)::detail::IntegerTypes<AVX512::(anonymous namespace)::SIMDVec32>::signed_ const*, AVX512::(anonymous namespace)::detail::IntegerTypes<AVX512::(anonymous namespace)::SIMDVec32>::signed_ const*, AVX512::(anonymous namespace)::detail::IntegerTypes<AVX512::(anonymous namespace)::SIMDVec32>::signed_ const*, AVX512::(anonymous namespace)::detail::IntegerTypes<AVX512::(anonymous namespace)::SIMDVec32>::signed_ const*, AVX512::(anonymous namespace)::detail::IntegerTypes<AVX512::(anonymous namespace)::SIMDVec32>::unsigned_*)
Unexecuted instantiation: enc_fast_lossless.cc:unsigned long AVX2::(anonymous namespace)::PredictPixels<AVX2::(anonymous namespace)::SIMDVec16>(AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::unsigned_*)
Unexecuted instantiation: enc_fast_lossless.cc:unsigned long AVX2::(anonymous namespace)::PredictPixels<AVX2::(anonymous namespace)::SIMDVec32>(AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::unsigned_*)
2643
2644
#endif
2645
2646
void EncodeHybridUint000(uint32_t value, uint32_t* token, uint32_t* nbits,
2647
0
                         uint32_t* bits) {
2648
0
  uint32_t n = FloorLog2(value);
2649
0
  *token = value ? n + 1 : 0;
2650
0
  *nbits = value ? n : 0;
2651
0
  *bits = value ? value - (1 << n) : 0;
2652
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::EncodeHybridUint000(unsigned int, unsigned int*, unsigned int*, unsigned int*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::EncodeHybridUint000(unsigned int, unsigned int*, unsigned int*, unsigned int*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::EncodeHybridUint000(unsigned int, unsigned int*, unsigned int*, unsigned int*)
2653
2654
// Base-2 logarithm of the number of samples handled per encoding chunk; it
// depends on the SIMD flavour this translation unit is compiled for.
#if defined(FJXL_AVX512)
static constexpr size_t kLogChunkSize = 5;
#elif defined(FJXL_AVX2) || defined(FJXL_NEON)
// NEON vectors are only 128 bits wide, but handling two of them per chunk is
// still significantly (~1.3x) faster.
static constexpr size_t kLogChunkSize = 4;
#else
static constexpr size_t kLogChunkSize = 3;
#endif

// Number of samples handled per encoding chunk.
static constexpr size_t kChunkSize = 1 << kLogChunkSize;
2665
2666
template <typename Residual>
2667
void GenericEncodeChunk(const Residual* residuals, size_t n, size_t skip,
2668
0
                        const PrefixCode& code, BitWriter& output) {
2669
0
  for (size_t ix = skip; ix < n; ix++) {
2670
0
    unsigned token, nbits, bits;
2671
0
    EncodeHybridUint000(residuals[ix], &token, &nbits, &bits);
2672
0
    output.Write(code.raw_nbits[token] + nbits,
2673
0
                 code.raw_bits[token] | bits << code.raw_nbits[token]);
2674
0
  }
2675
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::GenericEncodeChunk<unsigned short>(unsigned short const*, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::GenericEncodeChunk<unsigned int>(unsigned int const*, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
2676
2677
// Encoding parameters and SIMD entry points for samples of at most 8 bits.
struct UpTo8Bits {
  size_t bitdepth;

  explicit UpTo8Bits(size_t bitdepth) : bitdepth(bitdepth) {
    assert(bitdepth <= 8);
  }

  // Up to 9 extra bits + 7 Huffman bits fit in a u16. All other symbols have
  // at most 8 extra bits, so 8 Huffman bits would be possible for them, but
  // the AVX2 SIMD bit merging logic assumes that no Huffman length is 8 or
  // more, so the cap is applied anyway. The last symbol is used for LZ77
  // lengths and has no limitations except allowing to represent 32 symbols in
  // total.
  static constexpr uint8_t kMinRawLength[12] = {};
  static constexpr uint8_t kMaxRawLength[12] = {
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 10,
  };

  static size_t MaxEncodedBitsPerSample() { return 16; }

  static constexpr size_t kInputBytes = 1;
  using pixel_t = int16_t;
  using upixel_t = uint16_t;

  // Copies the 16-entry Huffman length and bit tables into the buffers used by
  // the SIMD lookups. `n` is only checked (in debug builds) to be at most 16.
  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
                             size_t n, uint8_t* nbits_simd,
                             uint8_t* bits_simd) {
    assert(n <= 16);
    constexpr size_t kTableBytes = 16;
    memcpy(nbits_simd, nbits, kTableBytes);
    memcpy(bits_simd, bits, kTableBytes);
  }

#ifdef FJXL_GENERIC_SIMD
  // Encodes one chunk of kChunkSize residuals, one SIMDVec16 at a time; only
  // residuals with index in [skip, n) contribute bits to `output`.
  static void EncodeChunkSimd(upixel_t* residuals, size_t n, size_t skip,
                              const uint8_t* raw_nbits_simd,
                              const uint8_t* raw_bits_simd, BitWriter& output) {
    constexpr size_t kVecLanes = SIMDVec16::kLanes;
    Bits32 merged[kChunkSize / kVecLanes];
    alignas(64) uint16_t tok_bits[kVecLanes];
    alignas(64) uint16_t tok_nbits[kVecLanes];
    alignas(64) uint16_t huff_bits[kVecLanes];
    alignas(64) uint16_t huff_nbits[kVecLanes];
    alignas(64) uint16_t tokens[kVecLanes];
    for (size_t i = 0; i < kChunkSize; i += kVecLanes) {
      TokenizeSIMD(residuals + i, tokens, tok_nbits, tok_bits);
      HuffmanSIMDUpTo13(tokens, raw_nbits_simd, raw_bits_simd, huff_nbits,
                        huff_bits);
      // Limits relative to the current vector, clamped at zero.
      const size_t local_n = n > i ? n - i : 0;
      const size_t local_skip = skip > i ? skip - i : 0;
      StoreSIMDUpTo8(tok_nbits, tok_bits, huff_nbits, huff_bits, local_n,
                     local_skip, merged + i / kVecLanes);
    }
    StoreToWriter<kChunkSize / SIMDVec16::kLanes>(merged, output);
  }
#endif

  // Values gain 1 bit for YCoCg and 1 bit for prediction; the maximum symbol
  // is 1 + the effective bit depth of the residuals.
  size_t NumSymbols(bool doing_ycocg_or_large_palette) const {
    return bitdepth + (doing_ycocg_or_large_palette ? 3 : 2);
  }
};
constexpr uint8_t UpTo8Bits::kMinRawLength[];
constexpr uint8_t UpTo8Bits::kMaxRawLength[];
2738
2739
// Bit-depth policy used for samples of 9 to 13 bits (inclusive).
struct From9To13Bits {
  size_t bitdepth;

  // `bitdepth` must lie in [9, 13]; this is only checked in assert-enabled
  // builds.
  explicit From9To13Bits(size_t bitdepth) : bitdepth(bitdepth) {
    assert(bitdepth <= 13 && bitdepth >= 9);
  }

  // Last symbol is used for LZ77 lengths and has no limitations except allowing
  // to represent 32 symbols in total.
  // We cannot fit all the bits in a u16, so do not even try and use up to 8
  // bits per raw symbol.
  // There are at most 16 raw symbols, so Huffman coding can be SIMDfied without
  // any special tricks.
  static constexpr uint8_t kMinRawLength[17] = {};
  static constexpr uint8_t kMaxRawLength[17] = {
      8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 10,
  };

  static size_t MaxEncodedBitsPerSample() { return 21; }
  static constexpr size_t kInputBytes = 2;
  using pixel_t = int16_t;
  using upixel_t = uint16_t;

  // Copies the first 16 entries of `nbits` / `bits` into the 16-entry tables
  // consumed by the SIMD Huffman lookup. `n` (the number of raw symbols) is
  // only validated, never used to size the copy, so both inputs must hold at
  // least 16 readable bytes.
  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
                             size_t n, uint8_t* nbits_simd,
                             uint8_t* bits_simd) {
    (void)n;  // only read by the assert below; keeps NDEBUG builds clean
    assert(n <= 16);
    memcpy(nbits_simd, nbits, 16);
    memcpy(bits_simd, bits, 16);
  }

#ifdef FJXL_GENERIC_SIMD
  // Encodes one chunk of kChunkSize residuals, one SIMD vector at a time, and
  // hands the packed result to `output`. `n` and `skip` are rebased to the
  // current vector (clamped at 0) before being passed to StoreSIMDUpTo14.
  static void EncodeChunkSimd(upixel_t* residuals, size_t n, size_t skip,
                              const uint8_t* raw_nbits_simd,
                              const uint8_t* raw_bits_simd, BitWriter& output) {
    // Two Bits32 slots are reserved per vector of samples.
    Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes];
    alignas(64) uint16_t bits[SIMDVec16::kLanes];
    alignas(64) uint16_t nbits[SIMDVec16::kLanes];
    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
    alignas(64) uint16_t token[SIMDVec16::kLanes];
    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
      TokenizeSIMD(residuals + i, token, nbits, bits);
      HuffmanSIMDUpTo13(token, raw_nbits_simd, raw_bits_simd, nbits_huff,
                        bits_huff);
      StoreSIMDUpTo14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
                      std::max(skip, i) - i,
                      bits32 + 2 * i / SIMDVec16::kLanes);
    }
    StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output);
  }
#endif

  // Number of raw symbols needed for this bit depth.
  // Values gain 1 bit for YCoCg and 1 bit for prediction; the maximum symbol
  // is 1 + the effective bit depth of the residuals.
  size_t NumSymbols(bool doing_ycocg_or_large_palette) const {
    return bitdepth + (doing_ycocg_or_large_palette ? 3 : 2);
  }
};
// Namespace-scope definitions (required before C++17 when ODR-used).
constexpr uint8_t From9To13Bits::kMinRawLength[];
constexpr uint8_t From9To13Bits::kMaxRawLength[];
2801
2802
0
// Debug-only sanity check for a pair of Huffman codes that the SIMD path
// treats as one table entry: both codes must be exactly 8 bits long, and the
// second must equal the first with bit 7 (value 128) set.
// In builds with NDEBUG the function does nothing.
void CheckHuffmanBitsSIMD(int bits1, int nbits1, int bits2, int nbits2) {
  // The arguments are only read inside assert(); mark them as used so that
  // NDEBUG builds do not see four unused parameters.
  (void)bits1;
  (void)nbits1;
  (void)bits2;
  (void)nbits2;
  assert(nbits1 == 8);
  assert(nbits2 == 8);
  assert(bits2 == (bits1 | 128));
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::CheckHuffmanBitsSIMD(int, int, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::CheckHuffmanBitsSIMD(int, int, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::CheckHuffmanBitsSIMD(int, int, int, int)
2807
2808
struct Exactly14Bits {
2809
0
  explicit Exactly14Bits(size_t bitdepth_) { assert(bitdepth_ == 14); }
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::Exactly14Bits::Exactly14Bits(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::Exactly14Bits::Exactly14Bits(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::Exactly14Bits::Exactly14Bits(unsigned long)
2810
  // Force LZ77 symbols to have at least 8 bits, and raw symbols 15 and 16 to
2811
  // have exactly 8, and no other symbol to have 8 or more. This ensures that
2812
  // the representation for 15 and 16 is identical up to one bit.
2813
  static constexpr uint8_t kMinRawLength[18] = {
2814
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 7,
2815
  };
2816
  static constexpr uint8_t kMaxRawLength[18] = {
2817
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 10,
2818
  };
2819
  static constexpr size_t bitdepth = 14;
2820
0
  static size_t MaxEncodedBitsPerSample() { return 22; }
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::Exactly14Bits::MaxEncodedBitsPerSample()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::Exactly14Bits::MaxEncodedBitsPerSample()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::Exactly14Bits::MaxEncodedBitsPerSample()
2821
  static constexpr size_t kInputBytes = 2;
2822
  using pixel_t = int16_t;
2823
  using upixel_t = uint16_t;
2824
2825
  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
2826
                             size_t n, uint8_t* nbits_simd,
2827
0
                             uint8_t* bits_simd) {
2828
0
    assert(n == 17);
2829
0
    CheckHuffmanBitsSIMD(bits[15], nbits[15], bits[16], nbits[16]);
2830
0
    memcpy(nbits_simd, nbits, 16);
2831
0
    memcpy(bits_simd, bits, 16);
2832
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::Exactly14Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::Exactly14Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::Exactly14Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
2833
2834
#ifdef FJXL_GENERIC_SIMD
2835
  static void EncodeChunkSimd(upixel_t* residuals, size_t n, size_t skip,
2836
                              const uint8_t* raw_nbits_simd,
2837
0
                              const uint8_t* raw_bits_simd, BitWriter& output) {
2838
0
    Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes];
2839
0
    alignas(64) uint16_t bits[SIMDVec16::kLanes];
2840
0
    alignas(64) uint16_t nbits[SIMDVec16::kLanes];
2841
0
    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
2842
0
    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
2843
0
    alignas(64) uint16_t token[SIMDVec16::kLanes];
2844
0
    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
2845
0
      TokenizeSIMD(residuals + i, token, nbits, bits);
2846
0
      HuffmanSIMD14(token, raw_nbits_simd, raw_bits_simd, nbits_huff,
2847
0
                    bits_huff);
2848
0
      StoreSIMDUpTo14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
2849
0
                      std::max(skip, i) - i,
2850
0
                      bits32 + 2 * i / SIMDVec16::kLanes);
2851
0
    }
2852
0
    StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output);
2853
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::Exactly14Bits::EncodeChunkSimd(unsigned short*, unsigned long, unsigned long, unsigned char const*, unsigned char const*, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::Exactly14Bits::EncodeChunkSimd(unsigned short*, unsigned long, unsigned long, unsigned char const*, unsigned char const*, (anonymous namespace)::BitWriter&)
2854
#endif
2855
2856
0
  size_t NumSymbols(bool) const { return 17; }
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::Exactly14Bits::NumSymbols(bool) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::Exactly14Bits::NumSymbols(bool) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::Exactly14Bits::NumSymbols(bool) const
2857
};
2858
constexpr uint8_t Exactly14Bits::kMinRawLength[];
2859
constexpr uint8_t Exactly14Bits::kMaxRawLength[];
2860
2861
struct MoreThan14Bits {
2862
  size_t bitdepth;
2863
0
  explicit MoreThan14Bits(size_t bitdepth) : bitdepth(bitdepth) {
2864
0
    assert(bitdepth > 14);
2865
0
    assert(bitdepth <= 16);
2866
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::MoreThan14Bits::MoreThan14Bits(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::MoreThan14Bits::MoreThan14Bits(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::MoreThan14Bits::MoreThan14Bits(unsigned long)
2867
  // Force LZ77 symbols to have at least 8 bits, and raw symbols 13 to 18 to
2868
  // have exactly 8, and no other symbol to have 8 or more. This ensures that
2869
  // the representation for (13, 14), (15, 16), (17, 18) is identical up to one
2870
  // bit.
2871
  static constexpr uint8_t kMinRawLength[20] = {
2872
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 7,
2873
  };
2874
  static constexpr uint8_t kMaxRawLength[20] = {
2875
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 10,
2876
  };
2877
0
  static size_t MaxEncodedBitsPerSample() { return 24; }
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::MoreThan14Bits::MaxEncodedBitsPerSample()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::MoreThan14Bits::MaxEncodedBitsPerSample()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::MoreThan14Bits::MaxEncodedBitsPerSample()
2878
  static constexpr size_t kInputBytes = 2;
2879
  using pixel_t = int32_t;
2880
  using upixel_t = uint32_t;
2881
2882
  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
2883
                             size_t n, uint8_t* nbits_simd,
2884
0
                             uint8_t* bits_simd) {
2885
0
    assert(n == 19);
2886
0
    CheckHuffmanBitsSIMD(bits[13], nbits[13], bits[14], nbits[14]);
2887
0
    CheckHuffmanBitsSIMD(bits[15], nbits[15], bits[16], nbits[16]);
2888
0
    CheckHuffmanBitsSIMD(bits[17], nbits[17], bits[18], nbits[18]);
2889
0
    for (size_t i = 0; i < 14; i++) {
2890
0
      nbits_simd[i] = nbits[i];
2891
0
      bits_simd[i] = bits[i];
2892
0
    }
2893
0
    nbits_simd[14] = nbits[15];
2894
0
    bits_simd[14] = bits[15];
2895
0
    nbits_simd[15] = nbits[17];
2896
0
    bits_simd[15] = bits[17];
2897
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::MoreThan14Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::MoreThan14Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::MoreThan14Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
2898
2899
#ifdef FJXL_GENERIC_SIMD
2900
  static void EncodeChunkSimd(upixel_t* residuals, size_t n, size_t skip,
2901
                              const uint8_t* raw_nbits_simd,
2902
0
                              const uint8_t* raw_bits_simd, BitWriter& output) {
2903
0
    Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes];
2904
0
    alignas(64) uint32_t bits[SIMDVec16::kLanes];
2905
0
    alignas(64) uint32_t nbits[SIMDVec16::kLanes];
2906
0
    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
2907
0
    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
2908
0
    alignas(64) uint16_t token[SIMDVec16::kLanes];
2909
0
    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
2910
0
      TokenizeSIMD(residuals + i, token, nbits, bits);
2911
0
      HuffmanSIMDAbove14(token, raw_nbits_simd, raw_bits_simd, nbits_huff,
2912
0
                         bits_huff);
2913
0
      StoreSIMDAbove14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
2914
0
                       std::max(skip, i) - i,
2915
0
                       bits32 + 2 * i / SIMDVec16::kLanes);
2916
0
    }
2917
0
    StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output);
2918
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::MoreThan14Bits::EncodeChunkSimd(unsigned int*, unsigned long, unsigned long, unsigned char const*, unsigned char const*, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::MoreThan14Bits::EncodeChunkSimd(unsigned int*, unsigned long, unsigned long, unsigned char const*, unsigned char const*, (anonymous namespace)::BitWriter&)
2919
#endif
2920
0
  size_t NumSymbols(bool) const { return 19; }
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::MoreThan14Bits::NumSymbols(bool) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::MoreThan14Bits::NumSymbols(bool) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::MoreThan14Bits::NumSymbols(bool) const
2921
};
2922
constexpr uint8_t MoreThan14Bits::kMinRawLength[];
2923
constexpr uint8_t MoreThan14Bits::kMaxRawLength[];
2924
2925
bool PrepareDCGlobalCommon(bool is_single_group, size_t width, size_t height,
2926
0
                           const PrefixCode code[4], BitWriter* output) {
2927
0
  if (!output->Allocate(100000 + (is_single_group ? width * height * 16 : 0))) {
2928
0
    return false;
2929
0
  }
2930
  // No patches, spline or noise.
2931
0
  output->Write(1, 1);  // default DC dequantization factors (?)
2932
0
  output->Write(1, 1);  // use global tree / histograms
2933
0
  output->Write(1, 0);  // no lz77 for the tree
2934
2935
0
  output->Write(1, 1);         // simple code for the tree's context map
2936
0
  output->Write(2, 0);         // all contexts clustered together
2937
0
  output->Write(1, 1);         // use prefix code for tree
2938
0
  output->Write(4, 0);         // 000 hybrid uint
2939
0
  output->Write(6, 0b100011);  // Alphabet size is 4 (var16)
2940
0
  output->Write(2, 1);         // simple prefix code
2941
0
  output->Write(2, 3);         // with 4 symbols
2942
0
  output->Write(2, 0);
2943
0
  output->Write(2, 1);
2944
0
  output->Write(2, 2);
2945
0
  output->Write(2, 3);
2946
0
  output->Write(1, 0);  // First tree encoding option
2947
2948
  // Huffman table + extra bits for the tree.
2949
0
  uint8_t symbol_bits[6] = {0b00, 0b10, 0b001, 0b101, 0b0011, 0b0111};
2950
0
  uint8_t symbol_nbits[6] = {2, 2, 3, 3, 4, 4};
2951
  // Write a tree with a leaf per channel, and gradient predictor for every
2952
  // leaf.
2953
0
  for (auto v : {1, 2, 1, 4, 1, 0, 0, 5, 0, 0, 0, 0, 5,
2954
0
                 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0, 0, 0}) {
2955
0
    output->Write(symbol_nbits[v], symbol_bits[v]);
2956
0
  }
2957
2958
0
  output->Write(1, 1);     // Enable lz77 for the main bitstream
2959
0
  output->Write(2, 0b00);  // lz77 offset 224
2960
0
  static_assert(kLZ77Offset == 224, "kLZ77Offset should be 224");
2961
0
  output->Write(4, 0b1010);  // lz77 min length 7
2962
  // 400 hybrid uint config for lz77
2963
0
  output->Write(4, 4);
2964
0
  output->Write(3, 0);
2965
0
  output->Write(3, 0);
2966
2967
0
  output->Write(1, 1);  // simple code for the context map
2968
0
  output->Write(2, 3);  // 3 bits per entry
2969
0
  output->Write(3, 4);  // channel 3
2970
0
  output->Write(3, 3);  // channel 2
2971
0
  output->Write(3, 2);  // channel 1
2972
0
  output->Write(3, 1);  // channel 0
2973
0
  output->Write(3, 0);  // distance histogram first
2974
2975
0
  output->Write(1, 1);  // use prefix codes
2976
0
  output->Write(4, 0);  // 000 hybrid uint config for distances (only need 0)
2977
0
  for (size_t i = 0; i < 4; i++) {
2978
0
    output->Write(4, 0);  // 000 hybrid uint config for symbols (only <= 10)
2979
0
  }
2980
2981
  // Distance alphabet size:
2982
0
  output->Write(5, 0b00001);  // 2: just need 1 for RLE (i.e. distance 1)
2983
  // Symbol + LZ77 alphabet size:
2984
0
  for (size_t i = 0; i < 4; i++) {
2985
0
    output->Write(1, 1);    // > 1
2986
0
    output->Write(4, 8);    // <= 512
2987
0
    output->Write(8, 256);  // == 512
2988
0
  }
2989
2990
  // Distance histogram:
2991
0
  output->Write(2, 1);  // simple prefix code
2992
0
  output->Write(2, 0);  // with one symbol
2993
0
  output->Write(1, 1);  // 1
2994
2995
  // Symbol + lz77 histogram:
2996
0
  for (size_t i = 0; i < 4; i++) {
2997
0
    code[i].WriteTo(output);
2998
0
  }
2999
3000
  // Group header for global modular image.
3001
0
  output->Write(1, 1);  // Global tree
3002
0
  output->Write(1, 1);  // All default wp
3003
0
  return true;
3004
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::PrepareDCGlobalCommon(bool, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, (anonymous namespace)::BitWriter*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::PrepareDCGlobalCommon(bool, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, (anonymous namespace)::BitWriter*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::PrepareDCGlobalCommon(bool, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, (anonymous namespace)::BitWriter*)
3005
3006
bool PrepareDCGlobal(bool is_single_group, size_t width, size_t height,
3007
                     size_t nb_chans, const PrefixCode code[4],
3008
0
                     BitWriter* output) {
3009
0
  if (!PrepareDCGlobalCommon(is_single_group, width, height, code, output)) {
3010
0
    return false;
3011
0
  }
3012
0
  if (nb_chans > 2) {
3013
0
    output->Write(2, 0b01);     // 1 transform
3014
0
    output->Write(2, 0b00);     // RCT
3015
0
    output->Write(5, 0b00000);  // Starting from ch 0
3016
0
    output->Write(2, 0b00);     // YCoCg
3017
0
  } else {
3018
0
    output->Write(2, 0b00);  // no transforms
3019
0
  }
3020
0
  if (!is_single_group) {
3021
0
    output->ZeroPadToByte();
3022
0
  }
3023
0
  return true;
3024
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::PrepareDCGlobal(bool, unsigned long, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, (anonymous namespace)::BitWriter*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::PrepareDCGlobal(bool, unsigned long, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, (anonymous namespace)::BitWriter*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::PrepareDCGlobal(bool, unsigned long, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, (anonymous namespace)::BitWriter*)
3025
3026
template <typename BitDepth>
3027
struct ChunkEncoder {
3028
0
  void PrepareForSimd() {
3029
0
    BitDepth::PrepareForSimd(code->raw_nbits, code->raw_bits, code->numraw,
3030
0
                             raw_nbits_simd, raw_bits_simd);
3031
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::UpTo8Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::From9To13Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::Exactly14Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::MoreThan14Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>::PrepareForSimd()
3032
  FJXL_INLINE static void EncodeRle(size_t count, const PrefixCode& code,
3033
0
                                    BitWriter& output) {
3034
0
    if (count == 0) return;
3035
0
    count -= kLZ77MinLength + 1;
3036
0
    if (count < kLZ77CacheSize) {
3037
0
      output.Write(code.lz77_cache_nbits[count], code.lz77_cache_bits[count]);
3038
0
    } else {
3039
0
      unsigned token, nbits, bits;
3040
0
      EncodeHybridUintLZ77(count, &token, &nbits, &bits);
3041
0
      uint64_t wbits = bits;
3042
0
      wbits = (wbits << code.lz77_nbits[token]) | code.lz77_bits[token];
3043
0
      wbits = (wbits << code.raw_nbits[0]) | code.raw_bits[0];
3044
0
      output.Write(code.lz77_nbits[token] + nbits + code.raw_nbits[0], wbits);
3045
0
    }
3046
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::UpTo8Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::From9To13Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::Exactly14Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::MoreThan14Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
3047
3048
  FJXL_INLINE void Chunk(size_t run, typename BitDepth::upixel_t* residuals,
3049
0
                         size_t skip, size_t n) {
3050
0
    EncodeRle(run, *code, *output);
3051
#ifdef FJXL_GENERIC_SIMD
3052
    BitDepth::EncodeChunkSimd(residuals, n, skip, raw_nbits_simd, raw_bits_simd,
3053
                              *output);
3054
#else
3055
    GenericEncodeChunk(residuals, n, skip, *code, *output);
3056
#endif
3057
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::UpTo8Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::From9To13Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::Exactly14Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::MoreThan14Bits>::Chunk(unsigned long, unsigned int*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>::Chunk(unsigned long, unsigned int*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>::Chunk(unsigned long, unsigned int*, unsigned long, unsigned long)
3058
3059
0
  inline void Finalize(size_t run) { EncodeRle(run, *code, *output); }
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::UpTo8Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::From9To13Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::Exactly14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::MoreThan14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>::Finalize(unsigned long)
3060
3061
  const PrefixCode* code;
3062
  BitWriter* output;
3063
  alignas(64) uint8_t raw_nbits_simd[16] = {};
3064
  alignas(64) uint8_t raw_bits_simd[16] = {};
3065
};
3066
3067
// Statistics-only sink offering the Rle / Chunk / Finalize interface that
// ChannelRowProcessor invokes on its `t` member. The methods visible here
// write no output; they only increment the histograms reached through
// `raw_counts` and `lz77_counts`.
// BitDepth supplies `upixel_t`, the unsigned residual type accepted by Chunk().
template <typename BitDepth>
3068
struct ChunkSampleCollector {
3069
0
  // Records one run of length `count` in the histograms.
  //   count        : run length; 0 means "no run" and records nothing.
  //   lz77_counts_ : histogram incremented at the token of the run length.
  // `count` is unsigned, so the subtraction below would wrap for
  // 0 < count <= kLZ77MinLength; presumably callers only pass 0 or a value
  // greater than kLZ77MinLength (see the invariant stated on
  // ChannelRowProcessor::run) -- TODO confirm.
  FJXL_INLINE void Rle(size_t count, uint64_t* lz77_counts_) {
3070
0
    if (count == 0) return;
3071
0
    // Every recorded run also bumps bucket 0 of the raw histogram once.
    raw_counts[0] += 1;
3072
0
    // The value that gets tokenized is the length in excess of
    // kLZ77MinLength + 1.
    count -= kLZ77MinLength + 1;
3073
0
    // Only `token` is consumed here; nbits/bits are filled in by the helper
    // and ignored.
    unsigned token, nbits, bits;
3074
0
    EncodeHybridUintLZ77(count, &token, &nbits, &bits);
3075
0
    lz77_counts_[token]++;
3076
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::UpTo8Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::From9To13Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::Exactly14Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::MoreThan14Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>::Rle(unsigned long, unsigned long*)
3077
3078
  // Records a finished run followed by the literal residuals of one chunk.
  //   run       : length of the run that just ended (0 if there was none);
  //               forwarded to Rle() together with `lz77_counts`.
  //   residuals : residual values of the chunk.
  //   skip      : index of the first residual to count; entries before it are
  //               not counted individually.
  //   n         : one past the last residual index to count.
  FJXL_INLINE void Chunk(size_t run, typename BitDepth::upixel_t* residuals,
3079
0
                         size_t skip, size_t n) {
3080
    // Run is broken. Encode the run and encode the individual vector.
3081
0
    Rle(run, lz77_counts);
3082
0
    for (size_t ix = skip; ix < n; ix++) {
3083
0
      // Only the token index feeds the histogram; nbits/bits are unused here.
      unsigned token, nbits, bits;
3084
0
      EncodeHybridUint000(residuals[ix], &token, &nbits, &bits);
3085
0
      raw_counts[token]++;
3086
0
    }
3087
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::UpTo8Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::From9To13Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::Exactly14Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::MoreThan14Bits>::Chunk(unsigned long, unsigned int*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>::Chunk(unsigned long, unsigned int*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>::Chunk(unsigned long, unsigned int*, unsigned long, unsigned long)
3088
3089
  // Receives the run length that is still pending when processing ends.
  // Intentionally empty: don't count final run since we don't know how long it
  // really is. `run` is accepted only so the signature matches what
  // ChannelRowProcessor::Finalize() calls.
3090
0
  void Finalize(size_t run) {}
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::UpTo8Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::From9To13Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::Exactly14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::MoreThan14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>::Finalize(unsigned long)
3091
3092
  // Histogram indexed by the token from EncodeHybridUint000 (see Chunk());
  // bucket 0 is additionally incremented once per recorded run (see Rle()).
  // Has no initializer here -- must be pointed at storage before use.
  uint64_t* raw_counts;
3093
  // Histogram indexed by the token from EncodeHybridUintLZ77 for run lengths;
  // Chunk() passes it to Rle(). Has no initializer here either.
  uint64_t* lz77_counts;
3094
};
3095
3096
0
// Maps a signed value to an unsigned one by interleaving signs:
// 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, ... (v >= 0 gives 2*v, v < 0 gives -2*v-1).
// All arithmetic is done on uint32_t, so it wraps instead of overflowing.
constexpr uint32_t PackSigned(int32_t value) {
3097
0
  return (static_cast<uint32_t>(value) << 1) ^
3098
0
         // (~value >> 31) is 1 for value >= 0 and 0 for value < 0; subtracting 1
         // yields a mask of all zeros resp. all ones that flips the bits of the
         // doubled value only for negative inputs.
         ((static_cast<uint32_t>(~value) >> 31) - 1);
3099
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::PackSigned(int)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::PackSigned(int)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::PackSigned(int)
3100
3101
// Turns rows of samples into prediction residuals, tracks runs of leading
// zero residuals across chunks, and forwards runs plus literal residuals to a
// sink of type T.
//   T        : sink type; must provide Chunk(run, residuals, skip, n) and
//              Finalize(run), as called below.
//   BitDepth : supplies the signed sample type `pixel_t` and the unsigned
//              residual type `upixel_t`.
template <typename T, typename BitDepth>
3102
struct ChannelRowProcessor {
3103
  using upixel_t = typename BitDepth::upixel_t;
3104
  using pixel_t = typename BitDepth::pixel_t;
3105
  // Sink receiving the output. Has no initializer here and is dereferenced
  // without a null check, so it must be set before ProcessChunk()/Finalize().
  T* t;
3106
  // Predicts one chunk of samples, computes the residuals and either extends
  // the pending run or hands run + residuals to the sink `t`.
  //   row         : current samples of the chunk.
  //   row_left    : left neighbours (same indexing as `row`).
  //   row_top     : top neighbours.
  //   row_topleft : top-left neighbours.
  //   n           : number of valid samples in this chunk.
  // Both prediction loops below always iterate over kChunkSize positions
  // regardless of `n`, so all four inputs are read for kChunkSize elements;
  // presumably callers provide padded rows -- TODO confirm.
  void ProcessChunk(const pixel_t* row, const pixel_t* row_left,
3107
                    const pixel_t* row_top, const pixel_t* row_topleft,
3108
0
                    size_t n) {
3109
0
    alignas(64) upixel_t residuals[kChunkSize] = {};
3110
0
    // prefix_size counts the leading residuals accepted into the zero prefix.
    // It only keeps growing while it equals required_prefix_size, i.e. while
    // everything processed so far was part of the prefix.
    size_t prefix_size = 0;
3111
0
    size_t required_prefix_size = 0;
3112
#ifdef FJXL_GENERIC_SIMD
3113
    // Vector path: kNum lanes per step, chosen by the sample width.
    constexpr size_t kNum =
3114
0
        sizeof(pixel_t) == 2 ? SIMDVec16::kLanes : SIMDVec32::kLanes;
3115
0
    for (size_t ix = 0; ix < kChunkSize; ix += kNum) {
3116
0
      // PredictPixels writes residuals[ix...] and returns `c`, presumably the
      // number of leading zero residuals in this vector -- TODO confirm
      // against PredictPixels.
      size_t c =
3117
0
          PredictPixels<simd_t<pixel_t>>(row + ix, row_left + ix, row_top + ix,
3118
0
                                         row_topleft + ix, residuals + ix);
3119
0
      prefix_size =
3120
0
          prefix_size == required_prefix_size ? prefix_size + c : prefix_size;
3121
0
      required_prefix_size += kNum;
3122
0
    }
3123
#else
3124
0
    // Scalar path: one sample per iteration.
    for (size_t ix = 0; ix < kChunkSize; ix++) {
3125
0
      pixel_t px = row[ix];
3126
0
      pixel_t left = row_left[ix];
3127
0
      pixel_t top = row_top[ix];
3128
0
      pixel_t topleft = row_topleft[ix];
3129
0
      pixel_t ac = left - topleft;
3130
0
      pixel_t ab = left - top;
3131
0
      pixel_t bc = top - topleft;
3132
0
      // Gradient candidate left + top - topleft, summed via the unsigned type.
      pixel_t grad = static_cast<pixel_t>(static_cast<upixel_t>(ac) +
3133
0
                                          static_cast<upixel_t>(top));
3134
0
      // XOR is negative exactly when the two operands differ in sign, so the
      // selections below are driven by sign comparisons.
      pixel_t d = ab ^ bc;
3135
0
      pixel_t clamp = d < 0 ? top : left;
3136
0
      pixel_t s = ac ^ bc;
3137
0
      // NOTE(review): this looks like a clamped-gradient style predictor --
      // confirm against the format's predictor definition.
      pixel_t pred = s < 0 ? grad : clamp;
3138
0
      residuals[ix] = PackSigned(px - pred);
3139
0
      // Extend the prefix only if every earlier residual was zero too.
      prefix_size = prefix_size == required_prefix_size
3140
0
                        ? prefix_size + (residuals[ix] == 0)
3141
0
                        : prefix_size;
3142
0
      required_prefix_size += 1;
3143
0
    }
3144
#endif
3145
0
    // Positions at or beyond n are not valid samples; cap the prefix there.
    prefix_size = std::min(n, prefix_size);
3146
0
    // Case 1: the whole valid chunk belongs to the prefix and either a run is
    // already open or the prefix alone exceeds kLZ77MinLength.
    if (prefix_size == n && (run > 0 || prefix_size > kLZ77MinLength)) {
3147
      // Run continues, nothing to do.
3148
0
      run += prefix_size;
3149
0
    } else if (prefix_size + run > kLZ77MinLength) {
3150
      // Run is broken. Encode the run and encode the individual vector.
3151
0
      // The prefix is folded into the run, so the sink skips those residuals.
      t->Chunk(run + prefix_size, residuals, prefix_size, n);
3152
0
      run = 0;
3153
0
    } else {
3154
      // There was no run to begin with.
3155
0
      // `run` is not reset here: given the invariant on `run`, reaching this
      // branch implies run == 0 already.
      t->Chunk(0, residuals, 0, n);
3156
0
    }
3157
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::UpTo8Bits>, AVX512::(anonymous namespace)::UpTo8Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::UpTo8Bits>, AVX512::(anonymous namespace)::UpTo8Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::From9To13Bits>, AVX512::(anonymous namespace)::From9To13Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::Exactly14Bits>, AVX512::(anonymous namespace)::Exactly14Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::MoreThan14Bits>, AVX512::(anonymous namespace)::MoreThan14Bits>::ProcessChunk(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::ProcessChunk(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::ProcessChunk(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::From9To13Bits>, AVX512::(anonymous namespace)::From9To13Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::Exactly14Bits>, AVX512::(anonymous namespace)::Exactly14Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::MoreThan14Bits>, AVX512::(anonymous namespace)::MoreThan14Bits>::ProcessChunk(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::ProcessChunk(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::ProcessChunk(int const*, int const*, int const*, int const*, unsigned long)
3158
3159
  // Processes a row of `xs` samples by calling ProcessChunk() on consecutive
  // pieces of kChunkSize samples.
  //   row / row_left / row_top / row_topleft : start of the current samples
  //       and of their left, top and top-left neighbours; all four pointers
  //       are advanced by the same offset per piece.
  //   xs : number of valid samples in the row.
  // Nothing is processed when xs == 0.
  void ProcessRow(const pixel_t* row, const pixel_t* row_left,
3160
                  const pixel_t* row_top, const pixel_t* row_topleft,
3161
0
                  size_t xs) {
3162
0
    for (size_t x = 0; x < xs; x += kChunkSize) {
3163
0
      ProcessChunk(row + x, row_left + x, row_top + x, row_topleft + x,
3164
0
                   // The last piece may be shorter than kChunkSize.
                   std::min(kChunkSize, xs - x));
3165
0
    }
3166
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::UpTo8Bits>, AVX512::(anonymous namespace)::UpTo8Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::UpTo8Bits>, AVX512::(anonymous namespace)::UpTo8Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::From9To13Bits>, AVX512::(anonymous namespace)::From9To13Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::Exactly14Bits>, AVX512::(anonymous namespace)::Exactly14Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::MoreThan14Bits>, AVX512::(anonymous namespace)::MoreThan14Bits>::ProcessRow(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::ProcessRow(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::ProcessRow(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::From9To13Bits>, AVX512::(anonymous namespace)::From9To13Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::Exactly14Bits>, AVX512::(anonymous namespace)::Exactly14Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::MoreThan14Bits>, AVX512::(anonymous namespace)::MoreThan14Bits>::ProcessRow(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::ProcessRow(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::ProcessRow(int const*, int const*, int const*, int const*, unsigned long)
3167
3168
0
  // Hands the run length still held in `run` to the sink's Finalize().
  void Finalize() {
    auto* sink = t;
    sink->Finalize(run);
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::UpTo8Bits>, AVX512::(anonymous namespace)::UpTo8Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::UpTo8Bits>, AVX512::(anonymous namespace)::UpTo8Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::From9To13Bits>, AVX512::(anonymous namespace)::From9To13Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::Exactly14Bits>, AVX512::(anonymous namespace)::Exactly14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::MoreThan14Bits>, AVX512::(anonymous namespace)::MoreThan14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::From9To13Bits>, AVX512::(anonymous namespace)::From9To13Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::Exactly14Bits>, AVX512::(anonymous namespace)::Exactly14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::MoreThan14Bits>, AVX512::(anonymous namespace)::MoreThan14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::Finalize()
3169
  // Length of the run carried over between ProcessChunk() calls that has not
  // been handed to `t` yet; it grows by prefix_size while chunks consist
  // entirely of the zero prefix and is reset to 0 once the run is emitted.
  // Invariant: run == 0 or run > kLZ77MinLength.
3170
  size_t run = 0;
3171
};
3172
3173
0
// Reads a 16-bit little-endian value from the two bytes at `ptr`:
// ptr[0] is the low byte, ptr[1] the high byte. Assembled byte by byte, so
// the result does not depend on host byte order or on the alignment of `ptr`.
uint16_t LoadLE16(const unsigned char* ptr) {
3174
0
  return uint16_t{ptr[0]} | (uint16_t{ptr[1]} << 8);
3175
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::LoadLE16(unsigned char const*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LoadLE16(unsigned char const*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LoadLE16(unsigned char const*)
3176
3177
0
// Returns `in` with its two bytes exchanged (0x1234 -> 0x3412).
uint16_t SwapEndian(uint16_t in) {
  const unsigned widened = in;
  const unsigned high_to_low = widened >> 8;
  const unsigned low_to_high = widened << 8;
  // Bits above bit 15 produced by the left shift are dropped by the narrowing.
  return static_cast<uint16_t>(high_to_low | low_to_high);
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::SwapEndian(unsigned short)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::SwapEndian(unsigned short)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::SwapEndian(unsigned short)
3178
3179
#ifdef FJXL_GENERIC_SIMD
3180
0
void StorePixels(SIMDVec16 p, int16_t* dest) { p.Store((uint16_t*)dest); }
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::StorePixels(AVX512::(anonymous namespace)::SIMDVec16, short*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::StorePixels(AVX2::(anonymous namespace)::SIMDVec16, short*)
3181
3182
0
void StorePixels(SIMDVec16 p, int32_t* dest) {
3183
0
  VecPair<SIMDVec32> p_up = p.Upcast();
3184
0
  p_up.low.Store((uint32_t*)dest);
3185
0
  p_up.hi.Store((uint32_t*)dest + SIMDVec32::kLanes);
3186
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::StorePixels(AVX512::(anonymous namespace)::SIMDVec16, int*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::StorePixels(AVX2::(anonymous namespace)::SIMDVec16, int*)
3187
#endif
3188
3189
template <typename pixel_t>
3190
0
void FillRowG8(const unsigned char* rgba, size_t oxs, pixel_t* luma) {
3191
0
  size_t x = 0;
3192
#ifdef FJXL_GENERIC_SIMD
3193
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3194
0
    auto rgb = SIMDVec16::LoadG8(rgba + x);
3195
0
    StorePixels(rgb[0], luma + x);
3196
0
  }
3197
#endif
3198
0
  for (; x < oxs; x++) {
3199
0
    luma[x] = rgba[x];
3200
0
  }
3201
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowG8<short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG8<short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG8<short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG8<int>(unsigned char const*, unsigned long, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG8<int>(unsigned char const*, unsigned long, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowG8<int>(unsigned char const*, unsigned long, int*)
3202
3203
template <bool big_endian, typename pixel_t>
3204
0
void FillRowG16(const unsigned char* rgba, size_t oxs, pixel_t* luma) {
3205
0
  size_t x = 0;
3206
#ifdef FJXL_GENERIC_SIMD
3207
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3208
0
    auto rgb = SIMDVec16::LoadG16(rgba + 2 * x);
3209
0
    if (big_endian) {
3210
0
      rgb[0].SwapEndian();
3211
0
    }
3212
0
    StorePixels(rgb[0], luma + x);
3213
0
  }
3214
#endif
3215
0
  for (; x < oxs; x++) {
3216
0
    uint16_t val = LoadLE16(rgba + 2 * x);
3217
0
    if (big_endian) {
3218
0
      val = SwapEndian(val);
3219
0
    }
3220
0
    luma[x] = val;
3221
0
  }
3222
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowG16<true, short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowG16<false, short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowG16<true, int>(unsigned char const*, unsigned long, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowG16<false, int>(unsigned char const*, unsigned long, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG16<true, short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG16<false, short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG16<true, int>(unsigned char const*, unsigned long, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG16<false, int>(unsigned char const*, unsigned long, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG16<true, short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG16<false, short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG16<true, int>(unsigned char const*, unsigned long, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG16<false, int>(unsigned char const*, unsigned long, int*)
3223
3224
template <typename pixel_t>
3225
void FillRowGA8(const unsigned char* rgba, size_t oxs, pixel_t* luma,
3226
0
                pixel_t* alpha) {
3227
0
  size_t x = 0;
3228
#ifdef FJXL_GENERIC_SIMD
3229
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3230
0
    auto rgb = SIMDVec16::LoadGA8(rgba + 2 * x);
3231
0
    StorePixels(rgb[0], luma + x);
3232
0
    StorePixels(rgb[1], alpha + x);
3233
0
  }
3234
#endif
3235
0
  for (; x < oxs; x++) {
3236
0
    luma[x] = rgba[2 * x];
3237
0
    alpha[x] = rgba[2 * x + 1];
3238
0
  }
3239
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowGA8<short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA8<short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA8<short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA8<int>(unsigned char const*, unsigned long, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA8<int>(unsigned char const*, unsigned long, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowGA8<int>(unsigned char const*, unsigned long, int*, int*)
3240
3241
template <bool big_endian, typename pixel_t>
3242
void FillRowGA16(const unsigned char* rgba, size_t oxs, pixel_t* luma,
3243
0
                 pixel_t* alpha) {
3244
0
  size_t x = 0;
3245
#ifdef FJXL_GENERIC_SIMD
3246
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3247
0
    auto rgb = SIMDVec16::LoadGA16(rgba + 4 * x);
3248
0
    if (big_endian) {
3249
0
      rgb[0].SwapEndian();
3250
0
      rgb[1].SwapEndian();
3251
0
    }
3252
0
    StorePixels(rgb[0], luma + x);
3253
0
    StorePixels(rgb[1], alpha + x);
3254
0
  }
3255
#endif
3256
0
  for (; x < oxs; x++) {
3257
0
    uint16_t l = LoadLE16(rgba + 4 * x);
3258
0
    uint16_t a = LoadLE16(rgba + 4 * x + 2);
3259
0
    if (big_endian) {
3260
0
      l = SwapEndian(l);
3261
0
      a = SwapEndian(a);
3262
0
    }
3263
0
    luma[x] = l;
3264
0
    alpha[x] = a;
3265
0
  }
3266
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowGA16<true, short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowGA16<false, short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowGA16<true, int>(unsigned char const*, unsigned long, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowGA16<false, int>(unsigned char const*, unsigned long, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA16<true, short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA16<false, short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA16<true, int>(unsigned char const*, unsigned long, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA16<false, int>(unsigned char const*, unsigned long, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA16<true, short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA16<false, short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA16<true, int>(unsigned char const*, unsigned long, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA16<false, int>(unsigned char const*, unsigned long, int*, int*)
3267
3268
template <typename pixel_t>
3269
void StoreYCoCg(pixel_t r, pixel_t g, pixel_t b, pixel_t* y, pixel_t* co,
3270
0
                pixel_t* cg) {
3271
0
  *co = r - b;
3272
0
  pixel_t tmp = b + (*co >> 1);
3273
0
  *cg = g - tmp;
3274
0
  *y = tmp + (*cg >> 1);
3275
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::StoreYCoCg<short>(short, short, short, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::StoreYCoCg<int>(int, int, int, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::StoreYCoCg<short>(short, short, short, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::StoreYCoCg<int>(int, int, int, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::StoreYCoCg<short>(short, short, short, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::StoreYCoCg<int>(int, int, int, int*, int*, int*)
3276
3277
#ifdef FJXL_GENERIC_SIMD
3278
void StoreYCoCg(SIMDVec16 r, SIMDVec16 g, SIMDVec16 b, int16_t* y, int16_t* co,
3279
0
                int16_t* cg) {
3280
0
  SIMDVec16 co_v = r.Sub(b);
3281
0
  SIMDVec16 tmp = b.Add(co_v.SignedShiftRight<1>());
3282
0
  SIMDVec16 cg_v = g.Sub(tmp);
3283
0
  SIMDVec16 y_v = tmp.Add(cg_v.SignedShiftRight<1>());
3284
0
  y_v.Store(reinterpret_cast<uint16_t*>(y));
3285
0
  co_v.Store(reinterpret_cast<uint16_t*>(co));
3286
0
  cg_v.Store(reinterpret_cast<uint16_t*>(cg));
3287
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::StoreYCoCg(AVX512::(anonymous namespace)::SIMDVec16, AVX512::(anonymous namespace)::SIMDVec16, AVX512::(anonymous namespace)::SIMDVec16, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::StoreYCoCg(AVX2::(anonymous namespace)::SIMDVec16, AVX2::(anonymous namespace)::SIMDVec16, AVX2::(anonymous namespace)::SIMDVec16, short*, short*, short*)
3288
3289
void StoreYCoCg(SIMDVec16 r, SIMDVec16 g, SIMDVec16 b, int32_t* y, int32_t* co,
3290
0
                int32_t* cg) {
3291
0
  VecPair<SIMDVec32> r_up = r.Upcast();
3292
0
  VecPair<SIMDVec32> g_up = g.Upcast();
3293
0
  VecPair<SIMDVec32> b_up = b.Upcast();
3294
0
  SIMDVec32 co_lo_v = r_up.low.Sub(b_up.low);
3295
0
  SIMDVec32 tmp_lo = b_up.low.Add(co_lo_v.SignedShiftRight<1>());
3296
0
  SIMDVec32 cg_lo_v = g_up.low.Sub(tmp_lo);
3297
0
  SIMDVec32 y_lo_v = tmp_lo.Add(cg_lo_v.SignedShiftRight<1>());
3298
0
  SIMDVec32 co_hi_v = r_up.hi.Sub(b_up.hi);
3299
0
  SIMDVec32 tmp_hi = b_up.hi.Add(co_hi_v.SignedShiftRight<1>());
3300
0
  SIMDVec32 cg_hi_v = g_up.hi.Sub(tmp_hi);
3301
0
  SIMDVec32 y_hi_v = tmp_hi.Add(cg_hi_v.SignedShiftRight<1>());
3302
0
  y_lo_v.Store(reinterpret_cast<uint32_t*>(y));
3303
0
  co_lo_v.Store(reinterpret_cast<uint32_t*>(co));
3304
0
  cg_lo_v.Store(reinterpret_cast<uint32_t*>(cg));
3305
0
  y_hi_v.Store(reinterpret_cast<uint32_t*>(y) + SIMDVec32::kLanes);
3306
0
  co_hi_v.Store(reinterpret_cast<uint32_t*>(co) + SIMDVec32::kLanes);
3307
0
  cg_hi_v.Store(reinterpret_cast<uint32_t*>(cg) + SIMDVec32::kLanes);
3308
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::StoreYCoCg(AVX512::(anonymous namespace)::SIMDVec16, AVX512::(anonymous namespace)::SIMDVec16, AVX512::(anonymous namespace)::SIMDVec16, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::StoreYCoCg(AVX2::(anonymous namespace)::SIMDVec16, AVX2::(anonymous namespace)::SIMDVec16, AVX2::(anonymous namespace)::SIMDVec16, int*, int*, int*)
3309
#endif
3310
3311
template <typename pixel_t>
3312
void FillRowRGB8(const unsigned char* rgba, size_t oxs, pixel_t* y, pixel_t* co,
3313
0
                 pixel_t* cg) {
3314
0
  size_t x = 0;
3315
#ifdef FJXL_GENERIC_SIMD
3316
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3317
0
    auto rgb = SIMDVec16::LoadRGB8(rgba + 3 * x);
3318
0
    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
3319
0
  }
3320
#endif
3321
0
  for (; x < oxs; x++) {
3322
0
    uint16_t r = rgba[3 * x];
3323
0
    uint16_t g = rgba[3 * x + 1];
3324
0
    uint16_t b = rgba[3 * x + 2];
3325
0
    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
3326
0
  }
3327
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowRGB8<short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB8<short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB8<short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB8<int>(unsigned char const*, unsigned long, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB8<int>(unsigned char const*, unsigned long, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowRGB8<int>(unsigned char const*, unsigned long, int*, int*, int*)
3328
3329
template <bool big_endian, typename pixel_t>
3330
void FillRowRGB16(const unsigned char* rgba, size_t oxs, pixel_t* y,
3331
0
                  pixel_t* co, pixel_t* cg) {
3332
0
  size_t x = 0;
3333
#ifdef FJXL_GENERIC_SIMD
3334
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3335
0
    auto rgb = SIMDVec16::LoadRGB16(rgba + 6 * x);
3336
0
    if (big_endian) {
3337
0
      rgb[0].SwapEndian();
3338
0
      rgb[1].SwapEndian();
3339
0
      rgb[2].SwapEndian();
3340
0
    }
3341
0
    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
3342
0
  }
3343
#endif
3344
0
  for (; x < oxs; x++) {
3345
0
    uint16_t r = LoadLE16(rgba + 6 * x);
3346
0
    uint16_t g = LoadLE16(rgba + 6 * x + 2);
3347
0
    uint16_t b = LoadLE16(rgba + 6 * x + 4);
3348
0
    if (big_endian) {
3349
0
      r = SwapEndian(r);
3350
0
      g = SwapEndian(g);
3351
0
      b = SwapEndian(b);
3352
0
    }
3353
0
    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
3354
0
  }
3355
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowRGB16<true, short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowRGB16<false, short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowRGB16<true, int>(unsigned char const*, unsigned long, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowRGB16<false, int>(unsigned char const*, unsigned long, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB16<true, short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB16<false, short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB16<true, int>(unsigned char const*, unsigned long, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB16<false, int>(unsigned char const*, unsigned long, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB16<true, short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB16<false, short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB16<true, int>(unsigned char const*, unsigned long, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB16<false, int>(unsigned char const*, unsigned long, int*, int*, int*)
3356
3357
template <typename pixel_t>
3358
void FillRowRGBA8(const unsigned char* rgba, size_t oxs, pixel_t* y,
3359
0
                  pixel_t* co, pixel_t* cg, pixel_t* alpha) {
3360
0
  size_t x = 0;
3361
#ifdef FJXL_GENERIC_SIMD
3362
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3363
0
    auto rgb = SIMDVec16::LoadRGBA8(rgba + 4 * x);
3364
0
    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
3365
0
    StorePixels(rgb[3], alpha + x);
3366
0
  }
3367
#endif
3368
0
  for (; x < oxs; x++) {
3369
0
    uint16_t r = rgba[4 * x];
3370
0
    uint16_t g = rgba[4 * x + 1];
3371
0
    uint16_t b = rgba[4 * x + 2];
3372
0
    uint16_t a = rgba[4 * x + 3];
3373
0
    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
3374
0
    alpha[x] = a;
3375
0
  }
3376
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowRGBA8<short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA8<short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA8<short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA8<int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA8<int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowRGBA8<int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
3377
3378
template <bool big_endian, typename pixel_t>
3379
void FillRowRGBA16(const unsigned char* rgba, size_t oxs, pixel_t* y,
3380
0
                   pixel_t* co, pixel_t* cg, pixel_t* alpha) {
3381
0
  size_t x = 0;
3382
#ifdef FJXL_GENERIC_SIMD
3383
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3384
0
    auto rgb = SIMDVec16::LoadRGBA16(rgba + 8 * x);
3385
0
    if (big_endian) {
3386
0
      rgb[0].SwapEndian();
3387
0
      rgb[1].SwapEndian();
3388
0
      rgb[2].SwapEndian();
3389
0
      rgb[3].SwapEndian();
3390
0
    }
3391
0
    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
3392
0
    StorePixels(rgb[3], alpha + x);
3393
0
  }
3394
#endif
3395
0
  for (; x < oxs; x++) {
3396
0
    uint16_t r = LoadLE16(rgba + 8 * x);
3397
0
    uint16_t g = LoadLE16(rgba + 8 * x + 2);
3398
0
    uint16_t b = LoadLE16(rgba + 8 * x + 4);
3399
0
    uint16_t a = LoadLE16(rgba + 8 * x + 6);
3400
0
    if (big_endian) {
3401
0
      r = SwapEndian(r);
3402
0
      g = SwapEndian(g);
3403
0
      b = SwapEndian(b);
3404
0
      a = SwapEndian(a);
3405
0
    }
3406
0
    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
3407
0
    alpha[x] = a;
3408
0
  }
3409
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowRGBA16<true, short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowRGBA16<false, short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowRGBA16<true, int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowRGBA16<false, int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA16<true, short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA16<false, short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA16<true, int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA16<false, int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA16<true, short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA16<false, short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA16<true, int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA16<false, int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
3410
3411
template <typename Processor, typename BitDepth>
3412
void ProcessImageArea(const unsigned char* rgba, size_t x0, size_t y0,
3413
                      size_t xs, size_t yskip, size_t ys, size_t row_stride,
3414
                      BitDepth bitdepth, size_t nb_chans, bool big_endian,
3415
0
                      Processor* processors) {
3416
0
  constexpr size_t kPadding = 32;
3417
3418
0
  using pixel_t = typename BitDepth::pixel_t;
3419
3420
0
  constexpr size_t kAlign = 64;
3421
0
  constexpr size_t kAlignPixels = kAlign / sizeof(pixel_t);
3422
3423
0
  auto align = [=](pixel_t* ptr) {
3424
0
    size_t offset = reinterpret_cast<size_t>(ptr) % kAlign;
3425
0
    if (offset) {
3426
0
      ptr += offset / sizeof(pixel_t);
3427
0
    }
3428
0
    return ptr;
3429
0
  };
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ProcessImageArea<AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::UpTo8Bits>, AVX512::(anonymous namespace)::UpTo8Bits>, AVX512::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX512::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::UpTo8Bits>, AVX512::(anonymous namespace)::UpTo8Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ProcessImageArea<AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::From9To13Bits>, AVX512::(anonymous namespace)::From9To13Bits>, AVX512::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX512::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::From9To13Bits>, AVX512::(anonymous namespace)::From9To13Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ProcessImageArea<AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::Exactly14Bits>, AVX512::(anonymous namespace)::Exactly14Bits>, AVX512::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX512::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::Exactly14Bits>, AVX512::(anonymous namespace)::Exactly14Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ProcessImageArea<AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::MoreThan14Bits>, AVX512::(anonymous namespace)::MoreThan14Bits>, AVX512::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX512::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::MoreThan14Bits>, AVX512::(anonymous namespace)::MoreThan14Bits>*)::{lambda(int*)#1}::operator()(int*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>*)::{lambda(int*)#1}::operator()(int*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>*)::{lambda(int*)#1}::operator()(int*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ProcessImageArea<AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::UpTo8Bits>, AVX512::(anonymous namespace)::UpTo8Bits>, AVX512::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX512::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::UpTo8Bits>, AVX512::(anonymous namespace)::UpTo8Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ProcessImageArea<AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::From9To13Bits>, AVX512::(anonymous namespace)::From9To13Bits>, AVX512::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX512::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::From9To13Bits>, AVX512::(anonymous namespace)::From9To13Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ProcessImageArea<AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::Exactly14Bits>, AVX512::(anonymous namespace)::Exactly14Bits>, AVX512::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX512::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::Exactly14Bits>, AVX512::(anonymous namespace)::Exactly14Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::ProcessImageArea<AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::MoreThan14Bits>, AVX512::(anonymous namespace)::MoreThan14Bits>, AVX512::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX512::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::MoreThan14Bits>, AVX512::(anonymous namespace)::MoreThan14Bits>*)::{lambda(int*)#1}::operator()(int*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>*)::{lambda(int*)#1}::operator()(int*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>*)::{lambda(int*)#1}::operator()(int*) const
3430
3431
0
  constexpr size_t kNumPx =
3432
0
      (256 + kPadding * 2 + kAlignPixels + kAlignPixels - 1) / kAlignPixels *
3433
0
      kAlignPixels;
3434
3435
0
  std::vector<std::array<std::array<pixel_t, kNumPx>, 2>> group_data(nb_chans);
3436
3437
0
  for (size_t y = 0; y < ys; y++) {
3438
0
    const auto rgba_row =
3439
0
        rgba + row_stride * (y0 + y) + x0 * nb_chans * BitDepth::kInputBytes;
3440
0
    pixel_t* crow[4] = {};
3441
0
    pixel_t* prow[4] = {};
3442
0
    for (size_t i = 0; i < nb_chans; i++) {
3443
0
      crow[i] = align(&group_data[i][y & 1][kPadding]);
3444
0
      prow[i] = align(&group_data[i][(y - 1) & 1][kPadding]);
3445
0
    }
3446
3447
    // Pre-fill rows with YCoCg converted pixels.
3448
0
    if (nb_chans == 1) {
3449
0
      if (BitDepth::kInputBytes == 1) {
3450
0
        FillRowG8(rgba_row, xs, crow[0]);
3451
0
      } else if (big_endian) {
3452
0
        FillRowG16</*big_endian=*/true>(rgba_row, xs, crow[0]);
3453
0
      } else {
3454
0
        FillRowG16</*big_endian=*/false>(rgba_row, xs, crow[0]);
3455
0
      }
3456
0
    } else if (nb_chans == 2) {
3457
0
      if (BitDepth::kInputBytes == 1) {
3458
0
        FillRowGA8(rgba_row, xs, crow[0], crow[1]);
3459
0
      } else if (big_endian) {
3460
0
        FillRowGA16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1]);
3461
0
      } else {
3462
0
        FillRowGA16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1]);
3463
0
      }
3464
0
    } else if (nb_chans == 3) {
3465
0
      if (BitDepth::kInputBytes == 1) {
3466
0
        FillRowRGB8(rgba_row, xs, crow[0], crow[1], crow[2]);
3467
0
      } else if (big_endian) {
3468
0
        FillRowRGB16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1],
3469
0
                                          crow[2]);
3470
0
      } else {
3471
0
        FillRowRGB16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1],
3472
0
                                           crow[2]);
3473
0
      }
3474
0
    } else {
3475
0
      if (BitDepth::kInputBytes == 1) {
3476
0
        FillRowRGBA8(rgba_row, xs, crow[0], crow[1], crow[2], crow[3]);
3477
0
      } else if (big_endian) {
3478
0
        FillRowRGBA16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1],
3479
0
                                           crow[2], crow[3]);
3480
0
      } else {
3481
0
        FillRowRGBA16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1],
3482
0
                                            crow[2], crow[3]);
3483
0
      }
3484
0
    }
3485
    // Deal with x == 0.
3486
0
    for (size_t c = 0; c < nb_chans; c++) {
3487
0
      *(crow[c] - 1) = y > 0 ? *(prow[c]) : 0;
3488
      // Fix topleft.
3489
0
      *(prow[c] - 1) = y > 0 ? *(prow[c]) : 0;
3490
0
    }
3491
0
    if (y < yskip) continue;
3492
0
    for (size_t c = 0; c < nb_chans; c++) {
3493
      // Get pointers to px/left/top/topleft data to speedup loop.
3494
0
      const pixel_t* row = crow[c];
3495
0
      const pixel_t* row_left = crow[c] - 1;
3496
0
      const pixel_t* row_top = y == 0 ? row_left : prow[c];
3497
0
      const pixel_t* row_topleft = y == 0 ? row_left : prow[c] - 1;
3498
3499
0
      processors[c].ProcessRow(row, row_left, row_top, row_topleft, xs);
3500
0
    }
3501
0
  }
3502
0
  for (size_t c = 0; c < nb_chans; c++) {
3503
0
    processors[c].Finalize();
3504
0
  }
3505
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::ProcessImageArea<AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::UpTo8Bits>, AVX512::(anonymous namespace)::UpTo8Bits>, AVX512::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX512::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::UpTo8Bits>, AVX512::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::ProcessImageArea<AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::From9To13Bits>, AVX512::(anonymous namespace)::From9To13Bits>, AVX512::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX512::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::From9To13Bits>, AVX512::(anonymous namespace)::From9To13Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::ProcessImageArea<AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::Exactly14Bits>, AVX512::(anonymous namespace)::Exactly14Bits>, AVX512::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX512::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::Exactly14Bits>, AVX512::(anonymous namespace)::Exactly14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::ProcessImageArea<AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::MoreThan14Bits>, AVX512::(anonymous namespace)::MoreThan14Bits>, AVX512::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX512::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::MoreThan14Bits>, AVX512::(anonymous namespace)::MoreThan14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::ProcessImageArea<AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::UpTo8Bits>, AVX512::(anonymous namespace)::UpTo8Bits>, AVX512::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX512::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::UpTo8Bits>, AVX512::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::ProcessImageArea<AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::From9To13Bits>, AVX512::(anonymous namespace)::From9To13Bits>, AVX512::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX512::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::From9To13Bits>, AVX512::(anonymous namespace)::From9To13Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::ProcessImageArea<AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::Exactly14Bits>, AVX512::(anonymous namespace)::Exactly14Bits>, AVX512::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX512::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::Exactly14Bits>, AVX512::(anonymous namespace)::Exactly14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::ProcessImageArea<AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::MoreThan14Bits>, AVX512::(anonymous namespace)::MoreThan14Bits>, AVX512::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX512::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::MoreThan14Bits>, AVX512::(anonymous namespace)::MoreThan14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>*)
3506
3507
template <typename BitDepth>
3508
bool WriteACSection(const unsigned char* rgba, size_t x0, size_t y0, size_t xs,
3509
                    size_t ys, size_t row_stride, bool is_single_group,
3510
                    BitDepth bitdepth, size_t nb_chans, bool big_endian,
3511
                    const PrefixCode code[4],
3512
0
                    std::array<BitWriter, 4>& output) {
3513
0
  for (size_t i = 0; i < nb_chans; i++) {
3514
0
    if (is_single_group && i == 0) continue;
3515
0
    if (!output[i].Allocate(xs * ys * bitdepth.MaxEncodedBitsPerSample() + 4)) {
3516
0
      return false;
3517
0
    }
3518
0
  }
3519
0
  if (!is_single_group) {
3520
    // Group header for modular image.
3521
    // When the image is single-group, the global modular image is the one
3522
    // that contains the pixel data, and there is no group header.
3523
0
    output[0].Write(1, 1);     // Global tree
3524
0
    output[0].Write(1, 1);     // All default wp
3525
0
    output[0].Write(2, 0b00);  // 0 transforms
3526
0
  }
3527
3528
0
  ChunkEncoder<BitDepth> encoders[4];
3529
0
  ChannelRowProcessor<ChunkEncoder<BitDepth>, BitDepth> row_encoders[4];
3530
0
  for (size_t c = 0; c < nb_chans; c++) {
3531
0
    row_encoders[c].t = &encoders[c];
3532
0
    encoders[c].output = &output[c];
3533
0
    encoders[c].code = &code[c];
3534
0
    encoders[c].PrepareForSimd();
3535
0
  }
3536
0
  ProcessImageArea<ChannelRowProcessor<ChunkEncoder<BitDepth>, BitDepth>>(
3537
0
      rgba, x0, y0, xs, 0, ys, row_stride, bitdepth, nb_chans, big_endian,
3538
0
      row_encoders);
3539
0
  return true;
3540
0
}
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX512::(anonymous namespace)::WriteACSection<AVX512::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX512::(anonymous namespace)::UpTo8Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX512::(anonymous namespace)::WriteACSection<AVX512::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX512::(anonymous namespace)::From9To13Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX512::(anonymous namespace)::WriteACSection<AVX512::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX512::(anonymous namespace)::Exactly14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX512::(anonymous namespace)::WriteACSection<AVX512::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX512::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX2::(anonymous namespace)::WriteACSection<AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX2::(anonymous namespace)::WriteACSection<AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX2::(anonymous namespace)::WriteACSection<AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX2::(anonymous namespace)::WriteACSection<AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:bool default_implementation::(anonymous namespace)::WriteACSection<default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:bool default_implementation::(anonymous namespace)::WriteACSection<default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:bool default_implementation::(anonymous namespace)::WriteACSection<default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:bool default_implementation::(anonymous namespace)::WriteACSection<default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
3541
3542
// Number of bits produced by pixel_hash(); the hash range is
// [0, kHashSize).
constexpr int kHashExp = 16;
constexpr uint32_t kHashSize = 1 << kHashExp;
// Multiplier for the multiplicative hash (0x9E3779B1, a prime close to
// 2^32 divided by the golden ratio).
constexpr uint32_t kHashMultiplier = 2654435761;
// Colour-count limit; PrepareDCGlobalPalette static_asserts on
// kMaxColors + kChunkSize when signalling the palette size.
constexpr int kMaxColors = 512;

// pixel_hash() shifts by (32 - kHashExp); that shift is only defined while
// the amount stays within 1..31.
static_assert(kHashExp > 0 && kHashExp < 32,
              "kHashExp must keep the hash shift within 1..31 bits");

// can be any function that returns a value in 0 .. kHashSize-1
// has to map 0 to 0
//
// Multiplies modulo 2^32 and keeps the top kHashExp bits of the product.
// constexpr so the two requirements above can be checked at compile time.
inline constexpr uint32_t pixel_hash(uint32_t p) {
  return (p * kHashMultiplier) >> (32 - kHashExp);
}

static_assert(pixel_hash(0) == 0, "pixel_hash has to map 0 to 0");
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::pixel_hash(unsigned int)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::pixel_hash(unsigned int)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::pixel_hash(unsigned int)
3552
3553
template <size_t nb_chans>
3554
void FillRowPalette(const unsigned char* inrow, size_t xs,
3555
0
                    const int16_t* lookup, int16_t* out) {
3556
0
  for (size_t x = 0; x < xs; x++) {
3557
0
    uint32_t p = 0;
3558
0
    for (size_t i = 0; i < nb_chans; ++i) {
3559
0
      p |= inrow[x * nb_chans + i] << (8 * i);
3560
0
    }
3561
0
    out[x] = lookup[pixel_hash(p)];
3562
0
  }
3563
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowPalette<1ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowPalette<2ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowPalette<3ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::FillRowPalette<4ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowPalette<1ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowPalette<2ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowPalette<3ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowPalette<4ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowPalette<1ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowPalette<2ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowPalette<3ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowPalette<4ul>(unsigned char const*, unsigned long, short const*, short*)
3564
3565
template <typename Processor>
3566
void ProcessImageAreaPalette(const unsigned char* rgba, size_t x0, size_t y0,
3567
                             size_t xs, size_t yskip, size_t ys,
3568
                             size_t row_stride, const int16_t* lookup,
3569
0
                             size_t nb_chans, Processor* processors) {
3570
0
  constexpr size_t kPadding = 32;
3571
3572
0
  std::vector<std::array<int16_t, 256 + kPadding * 2>> group_data(2);
3573
0
  Processor& row_encoder = processors[0];
3574
3575
0
  for (size_t y = 0; y < ys; y++) {
3576
    // Pre-fill rows with palette converted pixels.
3577
0
    const unsigned char* inrow = rgba + row_stride * (y0 + y) + x0 * nb_chans;
3578
0
    int16_t* outrow = &group_data[y & 1][kPadding];
3579
0
    if (nb_chans == 1) {
3580
0
      FillRowPalette<1>(inrow, xs, lookup, outrow);
3581
0
    } else if (nb_chans == 2) {
3582
0
      FillRowPalette<2>(inrow, xs, lookup, outrow);
3583
0
    } else if (nb_chans == 3) {
3584
0
      FillRowPalette<3>(inrow, xs, lookup, outrow);
3585
0
    } else if (nb_chans == 4) {
3586
0
      FillRowPalette<4>(inrow, xs, lookup, outrow);
3587
0
    }
3588
    // Deal with x == 0.
3589
0
    group_data[y & 1][kPadding - 1] =
3590
0
        y > 0 ? group_data[(y - 1) & 1][kPadding] : 0;
3591
    // Fix topleft.
3592
0
    group_data[(y - 1) & 1][kPadding - 1] =
3593
0
        y > 0 ? group_data[(y - 1) & 1][kPadding] : 0;
3594
    // Get pointers to px/left/top/topleft data to speedup loop.
3595
0
    const int16_t* row = &group_data[y & 1][kPadding];
3596
0
    const int16_t* row_left = &group_data[y & 1][kPadding - 1];
3597
0
    const int16_t* row_top =
3598
0
        y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding];
3599
0
    const int16_t* row_topleft =
3600
0
        y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding - 1];
3601
3602
0
    row_encoder.ProcessRow(row, row_left, row_top, row_topleft, xs);
3603
0
  }
3604
0
  row_encoder.Finalize();
3605
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::ProcessImageAreaPalette<AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::UpTo8Bits>, AVX512::(anonymous namespace)::UpTo8Bits> >(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, short const*, unsigned long, AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkSampleCollector<AVX512::(anonymous namespace)::UpTo8Bits>, AVX512::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageAreaPalette<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits> >(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, short const*, unsigned long, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageAreaPalette<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits> >(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, short const*, unsigned long, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::ProcessImageAreaPalette<AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::UpTo8Bits>, AVX512::(anonymous namespace)::UpTo8Bits> >(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, short const*, unsigned long, AVX512::(anonymous namespace)::ChannelRowProcessor<AVX512::(anonymous namespace)::ChunkEncoder<AVX512::(anonymous namespace)::UpTo8Bits>, AVX512::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageAreaPalette<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits> >(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, short const*, unsigned long, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageAreaPalette<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits> >(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, short const*, unsigned long, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)
3606
3607
bool WriteACSectionPalette(const unsigned char* rgba, size_t x0, size_t y0,
3608
                           size_t xs, size_t ys, size_t row_stride,
3609
                           bool is_single_group, const PrefixCode code[4],
3610
                           const int16_t* lookup, size_t nb_chans,
3611
0
                           BitWriter& output) {
3612
0
  if (!is_single_group) {
3613
0
    if (!output.Allocate(16 * xs * ys + 4)) return false;
3614
    // Group header for modular image.
3615
    // When the image is single-group, the global modular image is the one
3616
    // that contains the pixel data, and there is no group header.
3617
0
    output.Write(1, 1);     // Global tree
3618
0
    output.Write(1, 1);     // All default wp
3619
0
    output.Write(2, 0b00);  // 0 transforms
3620
0
  }
3621
3622
0
  ChunkEncoder<UpTo8Bits> encoder;
3623
0
  ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits> row_encoder;
3624
3625
0
  row_encoder.t = &encoder;
3626
0
  encoder.output = &output;
3627
0
  encoder.code = &code[is_single_group ? 1 : 0];
3628
0
  encoder.PrepareForSimd();
3629
0
  ProcessImageAreaPalette<
3630
0
      ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits>>(
3631
0
      rgba, x0, y0, xs, 0, ys, row_stride, lookup, nb_chans, &row_encoder);
3632
0
  return true;
3633
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::WriteACSectionPalette(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, (anonymous namespace)::PrefixCode const*, short const*, unsigned long, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::WriteACSectionPalette(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, (anonymous namespace)::PrefixCode const*, short const*, unsigned long, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::WriteACSectionPalette(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, (anonymous namespace)::PrefixCode const*, short const*, unsigned long, (anonymous namespace)::BitWriter&)
3634
3635
template <typename BitDepth>
3636
void CollectSamples(const unsigned char* rgba, size_t x0, size_t y0, size_t xs,
3637
                    size_t row_stride, size_t row_count,
3638
                    uint64_t raw_counts[4][kNumRawSymbols],
3639
                    uint64_t lz77_counts[4][kNumLZ77], bool is_single_group,
3640
                    bool palette, BitDepth bitdepth, size_t nb_chans,
3641
0
                    bool big_endian, const int16_t* lookup) {
3642
0
  if (palette) {
3643
0
    ChunkSampleCollector<UpTo8Bits> sample_collectors[4];
3644
0
    ChannelRowProcessor<ChunkSampleCollector<UpTo8Bits>, UpTo8Bits>
3645
0
        row_sample_collectors[4];
3646
0
    for (size_t c = 0; c < nb_chans; c++) {
3647
0
      row_sample_collectors[c].t = &sample_collectors[c];
3648
0
      sample_collectors[c].raw_counts = raw_counts[is_single_group ? 1 : 0];
3649
0
      sample_collectors[c].lz77_counts = lz77_counts[is_single_group ? 1 : 0];
3650
0
    }
3651
0
    ProcessImageAreaPalette<
3652
0
        ChannelRowProcessor<ChunkSampleCollector<UpTo8Bits>, UpTo8Bits>>(
3653
0
        rgba, x0, y0, xs, 1, 1 + row_count, row_stride, lookup, nb_chans,
3654
0
        row_sample_collectors);
3655
0
  } else {
3656
0
    ChunkSampleCollector<BitDepth> sample_collectors[4];
3657
0
    ChannelRowProcessor<ChunkSampleCollector<BitDepth>, BitDepth>
3658
0
        row_sample_collectors[4];
3659
0
    for (size_t c = 0; c < nb_chans; c++) {
3660
0
      row_sample_collectors[c].t = &sample_collectors[c];
3661
0
      sample_collectors[c].raw_counts = raw_counts[c];
3662
0
      sample_collectors[c].lz77_counts = lz77_counts[c];
3663
0
    }
3664
0
    ProcessImageArea<
3665
0
        ChannelRowProcessor<ChunkSampleCollector<BitDepth>, BitDepth>>(
3666
0
        rgba, x0, y0, xs, 1, 1 + row_count, row_stride, bitdepth, nb_chans,
3667
0
        big_endian, row_sample_collectors);
3668
0
  }
3669
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::CollectSamples<AVX512::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, AVX512::(anonymous namespace)::UpTo8Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::CollectSamples<AVX512::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, AVX512::(anonymous namespace)::From9To13Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::CollectSamples<AVX512::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, AVX512::(anonymous namespace)::Exactly14Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX512::(anonymous namespace)::CollectSamples<AVX512::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, AVX512::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::CollectSamples<AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::CollectSamples<AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::CollectSamples<AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::CollectSamples<AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::CollectSamples<default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::CollectSamples<default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::CollectSamples<default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::CollectSamples<default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, short const*)
3670
3671
bool PrepareDCGlobalPalette(bool is_single_group, size_t width, size_t height,
3672
                            size_t nb_chans, const PrefixCode code[4],
3673
                            const std::vector<uint32_t>& palette,
3674
0
                            size_t pcolors, BitWriter* output) {
3675
0
  if (!PrepareDCGlobalCommon(is_single_group, width, height, code, output)) {
3676
0
    return false;
3677
0
  }
3678
0
  output->Write(2, 0b01);     // 1 transform
3679
0
  output->Write(2, 0b01);     // Palette
3680
0
  output->Write(5, 0b00000);  // Starting from ch 0
3681
0
  if (nb_chans == 1) {
3682
0
    output->Write(2, 0b00);  // 1-channel palette (Gray)
3683
0
  } else if (nb_chans == 3) {
3684
0
    output->Write(2, 0b01);  // 3-channel palette (RGB)
3685
0
  } else if (nb_chans == 4) {
3686
0
    output->Write(2, 0b10);  // 4-channel palette (RGBA)
3687
0
  } else {
3688
0
    output->Write(2, 0b11);
3689
0
    output->Write(13, nb_chans - 1);
3690
0
  }
3691
  // pcolors <= kMaxColors + kChunkSize - 1
3692
0
  static_assert(kMaxColors + kChunkSize < 1281,
3693
0
                "add code to signal larger palette sizes");
3694
0
  if (pcolors < 256) {
3695
0
    output->Write(2, 0b00);
3696
0
    output->Write(8, pcolors);
3697
0
  } else {
3698
0
    output->Write(2, 0b01);
3699
0
    output->Write(10, pcolors - 256);
3700
0
  }
3701
3702
0
  output->Write(2, 0b00);  // nb_deltas == 0
3703
0
  output->Write(4, 0);     // Zero predictor for delta palette
3704
  // Encode palette
3705
0
  ChunkEncoder<UpTo8Bits> encoder;
3706
0
  ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits> row_encoder;
3707
0
  row_encoder.t = &encoder;
3708
0
  encoder.output = output;
3709
0
  encoder.code = &code[0];
3710
0
  encoder.PrepareForSimd();
3711
0
  std::vector<std::array<int16_t, 32 + 1024>> p(4);
3712
0
  size_t i = 0;
3713
0
  size_t have_zero = 1;
3714
0
  for (; i < pcolors; i++) {
3715
0
    p[0][16 + i + have_zero] = palette[i] & 0xFF;
3716
0
    p[1][16 + i + have_zero] = (palette[i] >> 8) & 0xFF;
3717
0
    p[2][16 + i + have_zero] = (palette[i] >> 16) & 0xFF;
3718
0
    p[3][16 + i + have_zero] = (palette[i] >> 24) & 0xFF;
3719
0
  }
3720
0
  p[0][15] = 0;
3721
0
  row_encoder.ProcessRow(p[0].data() + 16, p[0].data() + 15, p[0].data() + 15,
3722
0
                         p[0].data() + 15, pcolors);
3723
0
  p[1][15] = p[0][16];
3724
0
  p[0][15] = p[0][16];
3725
0
  if (nb_chans > 1) {
3726
0
    row_encoder.ProcessRow(p[1].data() + 16, p[1].data() + 15, p[0].data() + 16,
3727
0
                           p[0].data() + 15, pcolors);
3728
0
  }
3729
0
  p[2][15] = p[1][16];
3730
0
  p[1][15] = p[1][16];
3731
0
  if (nb_chans > 2) {
3732
0
    row_encoder.ProcessRow(p[2].data() + 16, p[2].data() + 15, p[1].data() + 16,
3733
0
                           p[1].data() + 15, pcolors);
3734
0
  }
3735
0
  p[3][15] = p[2][16];
3736
0
  p[2][15] = p[2][16];
3737
0
  if (nb_chans > 3) {
3738
0
    row_encoder.ProcessRow(p[3].data() + 16, p[3].data() + 15, p[2].data() + 16,
3739
0
                           p[2].data() + 15, pcolors);
3740
0
  }
3741
0
  row_encoder.Finalize();
3742
3743
0
  if (!is_single_group) {
3744
0
    output->ZeroPadToByte();
3745
0
  }
3746
0
  return true;
3747
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::PrepareDCGlobalPalette(bool, unsigned long, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> > const&, unsigned long, (anonymous namespace)::BitWriter*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::PrepareDCGlobalPalette(bool, unsigned long, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> > const&, unsigned long, (anonymous namespace)::BitWriter*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::PrepareDCGlobalPalette(bool, unsigned long, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> > const&, unsigned long, (anonymous namespace)::BitWriter*)
3748
3749
template <size_t nb_chans>
3750
bool detect_palette(const unsigned char* r, size_t width,
3751
0
                    std::vector<uint32_t>& palette) {
3752
0
  size_t x = 0;
3753
0
  bool collided = false;
3754
  // this is just an unrolling of the next loop
3755
0
  size_t look_ahead = 7 + ((nb_chans == 1) ? 3 : ((nb_chans < 4) ? 1 : 0));
3756
0
  for (; x + look_ahead < width; x += 8) {
3757
0
    uint32_t p[8] = {}, index[8];
3758
0
    for (int i = 0; i < 8; i++) {
3759
0
      for (int j = 0; j < 4; ++j) {
3760
0
        p[i] |= r[(x + i) * nb_chans + j] << (8 * j);
3761
0
      }
3762
0
    }
3763
0
    for (int i = 0; i < 8; i++) p[i] &= ((1llu << (8 * nb_chans)) - 1);
3764
0
    for (int i = 0; i < 8; i++) index[i] = pixel_hash(p[i]);
3765
0
    for (int i = 0; i < 8; i++) {
3766
0
      collided |= (palette[index[i]] != 0 && p[i] != palette[index[i]]);
3767
0
      palette[index[i]] = p[i];
3768
0
    }
3769
0
  }
3770
0
  for (; x < width; x++) {
3771
0
    uint32_t p = 0;
3772
0
    for (size_t i = 0; i < nb_chans; ++i) {
3773
0
      p |= r[x * nb_chans + i] << (8 * i);
3774
0
    }
3775
0
    uint32_t index = pixel_hash(p);
3776
0
    collided |= (palette[index] != 0 && p != palette[index]);
3777
0
    palette[index] = p;
3778
0
  }
3779
0
  return collided;
3780
0
}
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX512::(anonymous namespace)::detect_palette<1ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX512::(anonymous namespace)::detect_palette<2ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX512::(anonymous namespace)::detect_palette<3ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX512::(anonymous namespace)::detect_palette<4ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX2::(anonymous namespace)::detect_palette<1ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX2::(anonymous namespace)::detect_palette<2ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX2::(anonymous namespace)::detect_palette<3ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX2::(anonymous namespace)::detect_palette<4ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool default_implementation::(anonymous namespace)::detect_palette<1ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool default_implementation::(anonymous namespace)::detect_palette<2ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool default_implementation::(anonymous namespace)::detect_palette<3ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool default_implementation::(anonymous namespace)::detect_palette<4ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
3781
3782
template <typename BitDepth>
3783
JxlFastLosslessFrameState* LLPrepare(JxlChunkedFrameInputSource input,
3784
                                     size_t width, size_t height,
3785
                                     BitDepth bitdepth, size_t nb_chans,
3786
0
                                     bool big_endian, int effort, int oneshot) {
3787
0
  assert(width != 0);
3788
0
  assert(height != 0);
3789
3790
  // Count colors to try palette
3791
0
  std::vector<uint32_t> palette(kHashSize);
3792
0
  std::vector<int16_t> lookup(kHashSize);
3793
0
  lookup[0] = 0;
3794
0
  int pcolors = 0;
3795
0
  bool collided = effort < 2 || bitdepth.bitdepth != 8 || !oneshot;
3796
0
  for (size_t y0 = 0; y0 < height && !collided; y0 += 256) {
3797
0
    size_t ys = std::min<size_t>(height - y0, 256);
3798
0
    for (size_t x0 = 0; x0 < width && !collided; x0 += 256) {
3799
0
      size_t xs = std::min<size_t>(width - x0, 256);
3800
0
      size_t stride;
3801
      // TODO(szabadka): Add RAII wrapper around this.
3802
0
      const void* buffer = input.get_color_channel_data_at(input.opaque, x0, y0,
3803
0
                                                           xs, ys, &stride);
3804
0
      if (buffer == nullptr) return nullptr;
3805
0
      auto rgba = reinterpret_cast<const unsigned char*>(buffer);
3806
0
      for (size_t y = 0; y < ys && !collided; y++) {
3807
0
        const unsigned char* r = rgba + stride * y;
3808
0
        if (nb_chans == 1) collided = detect_palette<1>(r, xs, palette);
3809
0
        if (nb_chans == 2) collided = detect_palette<2>(r, xs, palette);
3810
0
        if (nb_chans == 3) collided = detect_palette<3>(r, xs, palette);
3811
0
        if (nb_chans == 4) collided = detect_palette<4>(r, xs, palette);
3812
0
      }
3813
0
      input.release_buffer(input.opaque, buffer);
3814
0
    }
3815
0
  }
3816
0
  int nb_entries = 0;
3817
0
  if (!collided) {
3818
0
    pcolors = 1;  // always have all-zero as a palette color
3819
0
    bool have_color = false;
3820
0
    uint8_t minG = 255, maxG = 0;
3821
0
    for (uint32_t k = 0; k < kHashSize; k++) {
3822
0
      if (palette[k] == 0) continue;
3823
0
      uint8_t p[4];
3824
0
      for (int i = 0; i < 4; ++i) {
3825
0
        p[i] = (palette[k] >> (8 * i)) & 0xFF;
3826
0
      }
3827
      // move entries to front so sort has less work
3828
0
      palette[nb_entries] = palette[k];
3829
0
      if (p[0] != p[1] || p[0] != p[2]) have_color = true;
3830
0
      if (p[1] < minG) minG = p[1];
3831
0
      if (p[1] > maxG) maxG = p[1];
3832
0
      nb_entries++;
3833
      // don't do palette if too many colors are needed
3834
0
      if (nb_entries + pcolors > kMaxColors) {
3835
0
        collided = true;
3836
0
        break;
3837
0
      }
3838
0
    }
3839
0
    if (!have_color) {
3840
      // don't do palette if it's just grayscale without many holes
3841
0
      if (maxG - minG < nb_entries * 1.4f) collided = true;
3842
0
    }
3843
0
  }
3844
0
  if (!collided) {
3845
0
    std::sort(
3846
0
        palette.begin(), palette.begin() + nb_entries,
3847
0
        [&nb_chans](uint32_t ap, uint32_t bp) {
3848
0
          if (ap == 0) return false;
3849
0
          if (bp == 0) return true;
3850
0
          uint8_t a[4], b[4];
3851
0
          for (int i = 0; i < 4; ++i) {
3852
0
            a[i] = (ap >> (8 * i)) & 0xFF;
3853
0
            b[i] = (bp >> (8 * i)) & 0xFF;
3854
0
          }
3855
0
          float ay, by;
3856
0
          if (nb_chans == 4) {
3857
0
            ay = (0.299f * a[0] + 0.587f * a[1] + 0.114f * a[2] + 0.01f) * a[3];
3858
0
            by = (0.299f * b[0] + 0.587f * b[1] + 0.114f * b[2] + 0.01f) * b[3];
3859
0
          } else {
3860
0
            ay = (0.299f * a[0] + 0.587f * a[1] + 0.114f * a[2] + 0.01f);
3861
0
            by = (0.299f * b[0] + 0.587f * b[1] + 0.114f * b[2] + 0.01f);
3862
0
          }
3863
0
          return ay < by;  // sort on alpha*luma
3864
0
        });
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::LLPrepare<AVX512::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX512::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::LLPrepare<AVX512::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX512::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::LLPrepare<AVX512::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX512::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::LLPrepare<AVX512::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX512::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
3865
0
    for (int k = 0; k < nb_entries; k++) {
3866
0
      if (palette[k] == 0) break;
3867
0
      lookup[pixel_hash(palette[k])] = pcolors++;
3868
0
    }
3869
0
  }
3870
3871
0
  size_t num_groups_x = (width + 255) / 256;
3872
0
  size_t num_groups_y = (height + 255) / 256;
3873
0
  size_t num_dc_groups_x = (width + 2047) / 2048;
3874
0
  size_t num_dc_groups_y = (height + 2047) / 2048;
3875
3876
0
  uint64_t raw_counts[4][kNumRawSymbols] = {};
3877
0
  uint64_t lz77_counts[4][kNumLZ77] = {};
3878
3879
0
  bool onegroup = num_groups_x == 1 && num_groups_y == 1;
3880
3881
0
  auto sample_rows = [&](size_t xg, size_t yg, size_t num_rows) {
3882
0
    size_t y0 = yg * 256;
3883
0
    size_t x0 = xg * 256;
3884
0
    size_t ys = std::min<size_t>(height - y0, 256);
3885
0
    size_t xs = std::min<size_t>(width - x0, 256);
3886
0
    size_t stride;
3887
0
    const void* buffer =
3888
0
        input.get_color_channel_data_at(input.opaque, x0, y0, xs, ys, &stride);
3889
0
    if (buffer == nullptr) {
3890
0
      return false;
3891
0
    }
3892
0
    auto rgba = reinterpret_cast<const unsigned char*>(buffer);
3893
0
    int y_begin_group =
3894
0
        std::max<ptrdiff_t>(
3895
0
            0, static_cast<ptrdiff_t>(ys) - static_cast<ptrdiff_t>(num_rows)) /
3896
0
        2;
3897
0
    int y_count =
3898
0
        std::max<int>(0, std::min<int>(num_rows, ys - y_begin_group - 1));
3899
0
    int x_max = xs / kChunkSize * kChunkSize;
3900
0
    CollectSamples(rgba, 0, y_begin_group, x_max, stride, y_count, raw_counts,
3901
0
                   lz77_counts, onegroup, !collided, bitdepth, nb_chans,
3902
0
                   big_endian, lookup.data());
3903
0
    input.release_buffer(input.opaque, buffer);
3904
0
    return true;
3905
0
  };
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::LLPrepare<AVX512::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX512::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::LLPrepare<AVX512::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX512::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::LLPrepare<AVX512::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX512::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::LLPrepare<AVX512::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX512::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
3906
3907
  // TODO(veluca): that `64` is an arbitrary constant, meant to correspond to
3908
  // the point where the number of processed rows is large enough that loading
3909
  // the entire image is cost-effective.
3910
0
  if (oneshot || effort >= 64) {
3911
0
    for (size_t g = 0; g < num_groups_y * num_groups_x; g++) {
3912
0
      size_t xg = g % num_groups_x;
3913
0
      size_t yg = g / num_groups_x;
3914
0
      size_t y0 = yg * 256;
3915
0
      size_t ys = std::min<size_t>(height - y0, 256);
3916
0
      size_t num_rows = 2 * effort * ys / 256;
3917
0
      if (!sample_rows(xg, yg, num_rows)) {
3918
0
        return nullptr;
3919
0
      }
3920
0
    }
3921
0
  } else {
3922
    // sample the middle (effort * 2 * num_groups) rows of the center group
3923
    // (possibly all of them).
3924
0
    if (!sample_rows((num_groups_x - 1) / 2, (num_groups_y - 1) / 2,
3925
0
                     2 * effort * num_groups_x * num_groups_y)) {
3926
0
      return nullptr;
3927
0
    }
3928
0
  }
3929
3930
  // TODO(veluca): can probably improve this and make it bitdepth-dependent.
3931
0
  uint64_t base_raw_counts[kNumRawSymbols] = {
3932
0
      3843, 852, 1270, 1214, 1014, 727, 481, 300, 159, 51,
3933
0
      5,    1,   1,    1,    1,    1,   1,   1,   1};
3934
3935
0
  bool doing_ycocg = nb_chans > 2 && collided;
3936
0
  bool large_palette = !collided || pcolors >= 256;
3937
0
  for (size_t i = bitdepth.NumSymbols(doing_ycocg || large_palette);
3938
0
       i < kNumRawSymbols; i++) {
3939
0
    base_raw_counts[i] = 0;
3940
0
  }
3941
3942
0
  for (size_t c = 0; c < 4; c++) {
3943
0
    for (size_t i = 0; i < kNumRawSymbols; i++) {
3944
0
      raw_counts[c][i] = (raw_counts[c][i] << 8) + base_raw_counts[i];
3945
0
    }
3946
0
  }
3947
3948
0
  if (!collided) {
3949
0
    unsigned token, nbits, bits;
3950
0
    EncodeHybridUint000(PackSigned(pcolors - 1), &token, &nbits, &bits);
3951
    // ensure all palette indices can actually be encoded
3952
0
    for (size_t i = 0; i < token + 1; i++)
3953
0
      raw_counts[0][i] = std::max<uint64_t>(raw_counts[0][i], 1);
3954
    // these tokens are only used for the palette itself so they can get a bad
3955
    // code
3956
0
    for (size_t i = token + 1; i < 10; i++) raw_counts[0][i] = 1;
3957
0
  }
3958
3959
0
  uint64_t base_lz77_counts[kNumLZ77] = {
3960
0
      29, 27, 25,  23, 21, 21, 19, 18, 21, 17, 16, 15, 15, 14,
3961
0
      13, 13, 137, 98, 61, 34, 1,  1,  1,  1,  1,  1,  1,  1,
3962
0
  };
3963
3964
0
  for (size_t c = 0; c < 4; c++) {
3965
0
    for (size_t i = 0; i < kNumLZ77; i++) {
3966
0
      lz77_counts[c][i] = (lz77_counts[c][i] << 8) + base_lz77_counts[i];
3967
0
    }
3968
0
  }
3969
3970
0
  JxlFastLosslessFrameState* frame_state = new JxlFastLosslessFrameState();
3971
0
  if (!frame_state) return nullptr;
3972
0
  for (size_t i = 0; i < 4; i++) {
3973
0
    frame_state->hcode[i] = PrefixCode(bitdepth, raw_counts[i], lz77_counts[i]);
3974
0
  }
3975
3976
0
  size_t num_dc_groups = num_dc_groups_x * num_dc_groups_y;
3977
0
  size_t num_ac_groups = num_groups_x * num_groups_y;
3978
0
  size_t num_groups = onegroup ? 1 : (2 + num_dc_groups + num_ac_groups);
3979
0
  frame_state->input = input;
3980
0
  frame_state->width = width;
3981
0
  frame_state->height = height;
3982
0
  frame_state->num_groups_x = num_groups_x;
3983
0
  frame_state->num_groups_y = num_groups_y;
3984
0
  frame_state->num_dc_groups_x = num_dc_groups_x;
3985
0
  frame_state->num_dc_groups_y = num_dc_groups_y;
3986
0
  frame_state->nb_chans = nb_chans;
3987
0
  frame_state->bitdepth = bitdepth.bitdepth;
3988
0
  frame_state->big_endian = big_endian;
3989
0
  frame_state->effort = effort;
3990
0
  frame_state->collided = collided;
3991
0
  frame_state->lookup = lookup;
3992
3993
0
  frame_state->group_data = std::vector<std::array<BitWriter, 4>>(num_groups);
3994
0
  frame_state->group_sizes.resize(num_groups);
3995
0
  if (collided) {
3996
0
    if (!PrepareDCGlobal(onegroup, width, height, nb_chans, frame_state->hcode,
3997
0
                         &frame_state->group_data[0][0])) {
3998
0
      delete frame_state;
3999
0
      return nullptr;
4000
0
    }
4001
0
  } else {
4002
0
    if (!PrepareDCGlobalPalette(onegroup, width, height, nb_chans,
4003
0
                                frame_state->hcode, palette, pcolors,
4004
0
                                &frame_state->group_data[0][0])) {
4005
0
      delete frame_state;
4006
0
      return nullptr;
4007
0
    }
4008
0
  }
4009
0
  frame_state->group_sizes[0] = SectionSize(frame_state->group_data[0]);
4010
0
  if (!onegroup) {
4011
0
    ComputeAcGroupDataOffset(frame_state->group_sizes[0], num_dc_groups,
4012
0
                             num_ac_groups, frame_state->min_dc_global_size,
4013
0
                             frame_state->ac_group_data_offset);
4014
0
  }
4015
4016
0
  return frame_state;
4017
0
}
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX512::(anonymous namespace)::LLPrepare<AVX512::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX512::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX512::(anonymous namespace)::LLPrepare<AVX512::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX512::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX512::(anonymous namespace)::LLPrepare<AVX512::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX512::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX512::(anonymous namespace)::LLPrepare<AVX512::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX512::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)
4018
4019
template <typename BitDepth>
4020
jxl::Status LLProcess(JxlFastLosslessFrameState* frame_state, bool is_last,
4021
                      BitDepth bitdepth, void* runner_opaque,
4022
                      FJxlParallelRunner runner,
4023
0
                      JxlEncoderOutputProcessorWrapper* output_processor) {
4024
0
#if !FJXL_STANDALONE
4025
0
  if (frame_state->process_done) {
4026
0
    if (!JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/0,
4027
0
                                      is_last)) {
4028
0
      return JXL_FAILURE("Allocation failed");
4029
0
    };
4030
0
    if (output_processor) {
4031
0
      JXL_RETURN_IF_ERROR(
4032
0
          JxlFastLosslessOutputFrame(frame_state, output_processor));
4033
0
    }
4034
0
    return true;
4035
0
  }
4036
0
#endif
4037
  // The maximum number of groups that we process concurrently here.
4038
  // TODO(szabadka) Use the number of threads or some outside parameter for the
4039
  // maximum memory usage instead.
4040
0
  constexpr size_t kMaxLocalGroups = 16;
4041
0
  bool onegroup = frame_state->group_sizes.size() == 1;
4042
0
  bool streaming = !onegroup && output_processor;
4043
0
  size_t total_groups = frame_state->num_groups_x * frame_state->num_groups_y;
4044
0
  size_t max_groups = streaming ? kMaxLocalGroups : total_groups;
4045
0
#if !FJXL_STANDALONE
4046
0
  size_t start_pos = 0;
4047
0
  if (streaming) {
4048
0
    start_pos = output_processor->CurrentPosition();
4049
0
    JXL_RETURN_IF_ERROR(
4050
0
        output_processor->Seek(start_pos + frame_state->ac_group_data_offset));
4051
0
  }
4052
0
#endif
4053
0
  for (size_t offset = 0; offset < total_groups; offset += max_groups) {
4054
0
    size_t num_groups = std::min(max_groups, total_groups - offset);
4055
0
    JxlFastLosslessFrameState local_frame_state;
4056
0
    if (streaming) {
4057
0
      local_frame_state.group_data =
4058
0
          std::vector<std::array<BitWriter, 4>>(num_groups);
4059
0
    }
4060
0
    std::atomic<uint32_t> has_error{0};
4061
0
    auto run_one = [&](size_t i) {
4062
0
      size_t g = offset + i;
4063
0
      size_t xg = g % frame_state->num_groups_x;
4064
0
      size_t yg = g / frame_state->num_groups_x;
4065
0
      size_t num_dc_groups =
4066
0
          frame_state->num_dc_groups_x * frame_state->num_dc_groups_y;
4067
0
      size_t group_id = onegroup ? 0 : (2 + num_dc_groups + g);
4068
0
      size_t xs = std::min<size_t>(frame_state->width - xg * 256, 256);
4069
0
      size_t ys = std::min<size_t>(frame_state->height - yg * 256, 256);
4070
0
      size_t x0 = xg * 256;
4071
0
      size_t y0 = yg * 256;
4072
0
      size_t stride;
4073
0
      JxlChunkedFrameInputSource input = frame_state->input;
4074
0
      const void* buffer = input.get_color_channel_data_at(input.opaque, x0, y0,
4075
0
                                                           xs, ys, &stride);
4076
0
      if (buffer == nullptr) {
4077
0
        has_error = 1;
4078
0
        return;
4079
0
      }
4080
0
      const unsigned char* rgba =
4081
0
          reinterpret_cast<const unsigned char*>(buffer);
4082
4083
0
      auto& gd = streaming ? local_frame_state.group_data[i]
4084
0
                           : frame_state->group_data[group_id];
4085
0
      bool ok;
4086
0
      if (frame_state->collided) {
4087
0
        ok = WriteACSection(rgba, 0, 0, xs, ys, stride, onegroup, bitdepth,
4088
0
                            frame_state->nb_chans, frame_state->big_endian,
4089
0
                            frame_state->hcode, gd);
4090
0
      } else {
4091
0
        ok = WriteACSectionPalette(
4092
0
            rgba, 0, 0, xs, ys, stride, onegroup, frame_state->hcode,
4093
0
            frame_state->lookup.data(), frame_state->nb_chans, gd[0]);
4094
0
      }
4095
0
      if (ok) {
4096
0
        frame_state->group_sizes[group_id] = SectionSize(gd);
4097
0
      } else {
4098
0
        has_error = 1;
4099
0
      }
4100
0
      input.release_buffer(input.opaque, buffer);
4101
0
    };
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::LLProcess<AVX512::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, AVX512::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::LLProcess<AVX512::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, AVX512::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::LLProcess<AVX512::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, AVX512::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::LLProcess<AVX512::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, AVX512::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
4102
0
    runner(
4103
0
        runner_opaque, &run_one,
4104
0
        +[](void* r, size_t i) {
4105
0
          (*reinterpret_cast<decltype(&run_one)>(r))(i);
4106
0
        },
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::LLProcess<AVX512::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, AVX512::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::LLProcess<AVX512::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, AVX512::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::LLProcess<AVX512::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, AVX512::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::LLProcess<AVX512::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, AVX512::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
4107
0
        num_groups);
4108
0
    if (has_error) return JXL_FAILURE("Allocation failed");
4109
0
#if !FJXL_STANDALONE
4110
0
    if (streaming) {
4111
0
      local_frame_state.nb_chans = frame_state->nb_chans;
4112
0
      local_frame_state.current_bit_writer = 1;
4113
0
      JXL_RETURN_IF_ERROR(
4114
0
          JxlFastLosslessOutputFrame(&local_frame_state, output_processor));
4115
0
    }
4116
0
#endif
4117
0
  }
4118
0
#if !FJXL_STANDALONE
4119
0
  if (streaming) {
4120
0
    size_t end_pos = output_processor->CurrentPosition();
4121
0
    JXL_RETURN_IF_ERROR(output_processor->Seek(start_pos));
4122
0
    frame_state->group_data.resize(1);
4123
0
    bool have_alpha = frame_state->nb_chans == 2 || frame_state->nb_chans == 4;
4124
0
    size_t padding = ComputeDcGlobalPadding(
4125
0
        frame_state->group_sizes, frame_state->ac_group_data_offset,
4126
0
        frame_state->min_dc_global_size, have_alpha, is_last);
4127
4128
0
    for (size_t i = 0; i < padding; ++i) {
4129
0
      frame_state->group_data[0][0].Write(8, 0);
4130
0
    }
4131
0
    frame_state->group_sizes[0] += padding;
4132
0
    if (!JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/0,
4133
0
                                      is_last)) {
4134
0
      return JXL_FAILURE("Allocation failed");
4135
0
    }
4136
0
    assert(frame_state->ac_group_data_offset ==
4137
0
           JxlFastLosslessOutputSize(frame_state));
4138
0
    JXL_RETURN_IF_ERROR(
4139
0
        JxlFastLosslessOutputHeaders(frame_state, output_processor));
4140
0
    JXL_RETURN_IF_ERROR(output_processor->Seek(end_pos));
4141
0
  } else if (output_processor) {
4142
0
    assert(onegroup);
4143
0
    if (!JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/0,
4144
0
                                      is_last)) {
4145
0
      return JXL_FAILURE("Allocation failed");
4146
0
    }
4147
0
    if (output_processor) {
4148
0
      JXL_RETURN_IF_ERROR(
4149
0
          JxlFastLosslessOutputFrame(frame_state, output_processor));
4150
0
    }
4151
0
  }
4152
0
  frame_state->process_done = true;
4153
0
#endif
4154
0
  return true;
4155
0
}
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX512::(anonymous namespace)::LLProcess<AVX512::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, AVX512::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX512::(anonymous namespace)::LLProcess<AVX512::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, AVX512::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX512::(anonymous namespace)::LLProcess<AVX512::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, AVX512::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX512::(anonymous namespace)::LLProcess<AVX512::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, AVX512::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
4156
4157
// Selects the bit-depth specialisation of LLPrepare for `bitdepth` and
// forwards all arguments to it:
//   bitdepth <= 8        -> UpTo8Bits
//   9 <= bitdepth <= 13  -> From9To13Bits
//   bitdepth == 14       -> Exactly14Bits
//   bitdepth > 14        -> MoreThan14Bits
// Returns whatever LLPrepare returns. `bitdepth` must be non-zero and
// `nb_chans` must be in [1, 4] (checked with assert only).
JxlFastLosslessFrameState* JxlFastLosslessPrepareImpl(
    JxlChunkedFrameInputSource input, size_t width, size_t height,
    size_t nb_chans, size_t bitdepth, bool big_endian, int effort,
    int oneshot) {
  assert(bitdepth > 0);
  assert(nb_chans <= 4);
  assert(nb_chans != 0);

  JxlFastLosslessFrameState* state = nullptr;
  if (bitdepth > 14) {
    state = LLPrepare(input, width, height, MoreThan14Bits(bitdepth), nb_chans,
                      big_endian, effort, oneshot);
  } else if (bitdepth == 14) {
    state = LLPrepare(input, width, height, Exactly14Bits(bitdepth), nb_chans,
                      big_endian, effort, oneshot);
  } else if (bitdepth > 8) {
    state = LLPrepare(input, width, height, From9To13Bits(bitdepth), nb_chans,
                      big_endian, effort, oneshot);
  } else {
    state = LLPrepare(input, width, height, UpTo8Bits(bitdepth), nb_chans,
                      big_endian, effort, oneshot);
  }
  return state;
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::JxlFastLosslessPrepareImpl(JxlChunkedFrameInputSource, unsigned long, unsigned long, unsigned long, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::JxlFastLosslessPrepareImpl(JxlChunkedFrameInputSource, unsigned long, unsigned long, unsigned long, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::JxlFastLosslessPrepareImpl(JxlChunkedFrameInputSource, unsigned long, unsigned long, unsigned long, unsigned long, bool, int, int)
4179
4180
// Runs LLProcess with the bit-depth specialisation that matches
// `frame_state->bitdepth` (same ranges as JxlFastLosslessPrepareImpl):
// <= 8, 9..13, exactly 14, and above 14. Any error reported by LLProcess is
// propagated to the caller; otherwise returns true.
jxl::Status JxlFastLosslessProcessFrameImpl(
    JxlFastLosslessFrameState* frame_state, bool is_last, void* runner_opaque,
    FJxlParallelRunner runner,
    JxlEncoderOutputProcessorWrapper* output_processor) {
  const size_t depth = frame_state->bitdepth;

  if (depth > 14) {
    JXL_RETURN_IF_ERROR(LLProcess(frame_state, is_last, MoreThan14Bits(depth),
                                  runner_opaque, runner, output_processor));
    return true;
  }
  if (depth == 14) {
    JXL_RETURN_IF_ERROR(LLProcess(frame_state, is_last, Exactly14Bits(depth),
                                  runner_opaque, runner, output_processor));
    return true;
  }
  if (depth > 8) {
    JXL_RETURN_IF_ERROR(LLProcess(frame_state, is_last, From9To13Bits(depth),
                                  runner_opaque, runner, output_processor));
    return true;
  }
  JXL_RETURN_IF_ERROR(LLProcess(frame_state, is_last, UpTo8Bits(depth),
                                runner_opaque, runner, output_processor));
  return true;
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX512::(anonymous namespace)::JxlFastLosslessProcessFrameImpl(JxlFastLosslessFrameState*, bool, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::JxlFastLosslessProcessFrameImpl(JxlFastLosslessFrameState*, bool, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::JxlFastLosslessProcessFrameImpl(JxlFastLosslessFrameState*, bool, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
4201
4202
}  // namespace
4203
4204
#endif  // FJXL_SELF_INCLUDE
4205
4206
#ifndef FJXL_SELF_INCLUDE
4207
4208
#define FJXL_SELF_INCLUDE
4209
4210
// If we have NEON enabled, it is the default target.
4211
// Multi-target build: this file textually includes itself, once per target,
// each time inside a target-specific namespace. The SIMD variants additionally
// define a target macro (FJXL_NEON / FJXL_AVX2 / FJXL_AVX512) around the
// include, so each namespace gets its own copy of the implementation.
#if FJXL_ENABLE_NEON
4212
4213
// NEON build: the NEON copy is the only one compiled here and it is exposed
// as `default_implementation`.
namespace default_implementation {
4214
#define FJXL_NEON
4215
#include "lib/jxl/enc_fast_lossless.cc"
4216
#undef FJXL_NEON
4217
}  // namespace default_implementation
4218
4219
#else                                    // FJXL_ENABLE_NEON
4220
4221
// Non-NEON build: `default_implementation` is included without defining any
// of the target macros here.
namespace default_implementation {
4222
#include "lib/jxl/enc_fast_lossless.cc"  // NOLINT
4223
}  // namespace default_implementation
4224
4225
// AVX2 copy. The pragmas below apply the "avx,avx2" target to the functions
// defined until the matching pop, so this copy can be compiled even when the
// translation unit itself is not built with AVX2 enabled.
#if FJXL_ENABLE_AVX2
4226
#ifdef __clang__
4227
#pragma clang attribute push(__attribute__((target("avx,avx2"))), \
4228
                             apply_to = function)
4229
// Causes spurious warnings on clang5.
4230
#pragma clang diagnostic push
4231
#pragma clang diagnostic ignored "-Wmissing-braces"
4232
#elif defined(__GNUC__)
4233
#pragma GCC push_options
4234
// Seems to cause spurious errors on GCC8.
4235
#pragma GCC diagnostic ignored "-Wpsabi"
4236
#pragma GCC target "avx,avx2"
4237
#endif
4238
4239
namespace AVX2 {
4240
#define FJXL_AVX2
4241
#include "lib/jxl/enc_fast_lossless.cc"  // NOLINT
4242
#undef FJXL_AVX2
4243
}  // namespace AVX2
4244
4245
// Undo the target attribute / option changes made for the AVX2 copy.
#ifdef __clang__
4246
#pragma clang attribute pop
4247
#pragma clang diagnostic pop
4248
#elif defined(__GNUC__)
4249
#pragma GCC pop_options
4250
#endif
4251
#endif  // FJXL_ENABLE_AVX2
4252
4253
// AVX512 copy, compiled with the avx512cd/bw/vl/f/vbmi targets enabled for
// the functions defined until the matching pop.
#if FJXL_ENABLE_AVX512
4254
#ifdef __clang__
4255
#pragma clang attribute push(                                                 \
4256
    __attribute__((target("avx512cd,avx512bw,avx512vl,avx512f,avx512vbmi"))), \
4257
    apply_to = function)
4258
#elif defined(__GNUC__)
4259
#pragma GCC push_options
4260
#pragma GCC target "avx512cd,avx512bw,avx512vl,avx512f,avx512vbmi"
4261
#endif
4262
4263
namespace AVX512 {
4264
#define FJXL_AVX512
4265
#include "lib/jxl/enc_fast_lossless.cc"
4266
#undef FJXL_AVX512
4267
}  // namespace AVX512
4268
4269
// Undo the target attribute / option changes made for the AVX512 copy.
#ifdef __clang__
4270
#pragma clang attribute pop
4271
#elif defined(__GNUC__)
4272
#pragma GCC pop_options
4273
#endif
4274
#endif  // FJXL_ENABLE_AVX512
4275
4276
#endif  // FJXL_ENABLE_NEON
4277
4278
extern "C" {
4279
4280
#if FJXL_STANDALONE
4281
class FJxlFrameInput {
4282
 public:
4283
  FJxlFrameInput(const unsigned char* rgba, size_t row_stride, size_t nb_chans,
4284
                 size_t bitdepth)
4285
      : rgba_(rgba),
4286
        row_stride_(row_stride),
4287
        bytes_per_pixel_(bitdepth <= 8 ? nb_chans : 2 * nb_chans) {}
4288
4289
  JxlChunkedFrameInputSource GetInputSource() {
4290
    return JxlChunkedFrameInputSource{this, GetDataAt,
4291
                                      [](void*, const void*) {}};
4292
  }
4293
4294
 private:
4295
  static const void* GetDataAt(void* opaque, size_t xpos, size_t ypos,
4296
                               size_t xsize, size_t ysize, size_t* row_offset) {
4297
    FJxlFrameInput* self = static_cast<FJxlFrameInput*>(opaque);
4298
    *row_offset = self->row_stride_;
4299
    return self->rgba_ + ypos * (*row_offset) + xpos * self->bytes_per_pixel_;
4300
  }
4301
4302
  const uint8_t* rgba_;
4303
  size_t row_stride_;
4304
  size_t bytes_per_pixel_;
4305
};
4306
4307
size_t JxlFastLosslessEncode(const unsigned char* rgba, size_t width,
4308
                             size_t row_stride, size_t height, size_t nb_chans,
4309
                             size_t bitdepth, bool big_endian, int effort,
4310
                             unsigned char** output, void* runner_opaque,
4311
                             FJxlParallelRunner runner) {
4312
  FJxlFrameInput input(rgba, row_stride, nb_chans, bitdepth);
4313
  auto* frame_state = JxlFastLosslessPrepareFrame(
4314
      input.GetInputSource(), width, height, nb_chans, bitdepth, big_endian,
4315
      effort, /*oneshot=*/true);
4316
  if (!frame_state) return 0;
4317
  if (!JxlFastLosslessProcessFrame(frame_state, /*is_last=*/true, runner_opaque,
4318
                                   runner, nullptr)) {
4319
    JxlFastLosslessFreeFrameState(frame_state);
4320
    return 0;
4321
  }
4322
  if (!JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/1,
4323
                                    /*is_last=*/1)) {
4324
    JxlFastLosslessFreeFrameState(frame_state);
4325
    return 0;
4326
  }
4327
  size_t output_size = JxlFastLosslessMaxRequiredOutput(frame_state);
4328
  *output = (unsigned char*)malloc(output_size);
4329
  if (*output == NULL) {
4330
    return JXL_FAILURE("Memory allocation failed");
4331
  }
4332
  size_t written = 0;
4333
  size_t total = 0;
4334
  while ((written = JxlFastLosslessWriteOutput(frame_state, *output + total,
4335
                                               output_size - total)) != 0) {
4336
    total += written;
4337
  }
4338
  JxlFastLosslessFreeFrameState(frame_state);
4339
  return total;
4340
}
4341
#endif
4342
4343
// Public entry point for frame preparation. Picks, at run time, the most
// capable implementation that was compiled in and that the CPU reports
// support for (AVX512 first, then AVX2, then the default implementation) and
// forwards all arguments to its JxlFastLosslessPrepareImpl.
JxlFastLosslessFrameState* JxlFastLosslessPrepareFrame(
    JxlChunkedFrameInputSource input, size_t width, size_t height,
    size_t nb_chans, size_t bitdepth, bool big_endian, int effort,
    int oneshot) {
#if FJXL_ENABLE_AVX512
  // The AVX512 build needs all of these extensions at once.
  const bool cpu_has_avx512 = HasCpuFeature(CpuFeature::kAVX512CD) &&
                              HasCpuFeature(CpuFeature::kVBMI) &&
                              HasCpuFeature(CpuFeature::kAVX512BW) &&
                              HasCpuFeature(CpuFeature::kAVX512F) &&
                              HasCpuFeature(CpuFeature::kAVX512VL);
  if (cpu_has_avx512) {
    return AVX512::JxlFastLosslessPrepareImpl(
        input, width, height, nb_chans, bitdepth, big_endian, effort, oneshot);
  }
#endif
#if FJXL_ENABLE_AVX2
  const bool cpu_has_avx2 = HasCpuFeature(CpuFeature::kAVX2);
  if (cpu_has_avx2) {
    return AVX2::JxlFastLosslessPrepareImpl(
        input, width, height, nb_chans, bitdepth, big_endian, effort, oneshot);
  }
#endif

  return default_implementation::JxlFastLosslessPrepareImpl(
      input, width, height, nb_chans, bitdepth, big_endian, effort, oneshot);
}
4367
4368
bool JxlFastLosslessProcessFrame(
4369
    JxlFastLosslessFrameState* frame_state, bool is_last, void* runner_opaque,
4370
    FJxlParallelRunner runner,
4371
0
    JxlEncoderOutputProcessorWrapper* output_processor) {
4372
0
  auto trivial_runner =
4373
0
      +[](void*, void* opaque, void fun(void*, size_t), size_t count) {
4374
0
        for (size_t i = 0; i < count; i++) {
4375
0
          fun(opaque, i);
4376
0
        }
4377
0
      };
4378
4379
0
  if (runner == nullptr) {
4380
0
    runner = trivial_runner;
4381
0
  }
4382
4383
0
#if FJXL_ENABLE_AVX512
4384
0
  if (HasCpuFeature(CpuFeature::kAVX512CD) &&
4385
0
      HasCpuFeature(CpuFeature::kVBMI) &&
4386
0
      HasCpuFeature(CpuFeature::kAVX512BW) &&
4387
0
      HasCpuFeature(CpuFeature::kAVX512F) &&
4388
0
      HasCpuFeature(CpuFeature::kAVX512VL)) {
4389
0
    JXL_RETURN_IF_ERROR(AVX512::JxlFastLosslessProcessFrameImpl(
4390
0
        frame_state, is_last, runner_opaque, runner, output_processor));
4391
0
    return true;
4392
0
  }
4393
0
#endif
4394
0
#if FJXL_ENABLE_AVX2
4395
0
  if (HasCpuFeature(CpuFeature::kAVX2)) {
4396
0
    JXL_RETURN_IF_ERROR(AVX2::JxlFastLosslessProcessFrameImpl(
4397
0
        frame_state, is_last, runner_opaque, runner, output_processor));
4398
0
    return true;
4399
0
  }
4400
0
#endif
4401
4402
0
  JXL_RETURN_IF_ERROR(default_implementation::JxlFastLosslessProcessFrameImpl(
4403
0
      frame_state, is_last, runner_opaque, runner, output_processor));
4404
0
  return true;
4405
0
}
4406
4407
}  // extern "C"
4408
4409
#if !FJXL_STANDALONE
4410
bool JxlFastLosslessOutputFrame(
4411
    JxlFastLosslessFrameState* frame_state,
4412
0
    JxlEncoderOutputProcessorWrapper* output_processor) {
4413
0
  size_t fl_size = JxlFastLosslessOutputSize(frame_state);
4414
0
  size_t written = 0;
4415
0
  while (written < fl_size) {
4416
0
    JXL_ASSIGN_OR_RETURN(auto buffer,
4417
0
                         output_processor->GetBuffer(32, fl_size - written));
4418
0
    size_t n =
4419
0
        JxlFastLosslessWriteOutput(frame_state, buffer.data(), buffer.size());
4420
0
    if (n == 0) break;
4421
0
    JXL_RETURN_IF_ERROR(buffer.advance(n));
4422
0
    written += n;
4423
0
  };
4424
0
  return true;
4425
0
}
4426
#endif
4427
4428
#endif  // FJXL_SELF_INCLUDE