Coverage Report

Created: 2026-05-16 07:22

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libjxl/lib/jxl/enc_fast_lossless.cc
Line
Count
Source
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/base/status.h"
7
#ifndef FJXL_SELF_INCLUDE
8
9
#include <assert.h>
10
11
#include <algorithm>
12
#include <array>
13
#include <atomic>
14
#include <cstdint>
15
#include <cstdlib>
16
#include <cstring>
17
#include <limits>
18
#include <memory>
19
#include <vector>
20
21
#include "lib/jxl/enc_fast_lossless.h"
22
23
#if !FJXL_STANDALONE
24
#include "lib/jxl/encode_internal.h"
25
#endif  // FJXL_STANDALONE
26
27
#if defined(__x86_64__) || defined(_M_X64)
28
#define FJXL_ARCH_IS_X86_64 1
29
#else
30
#define FJXL_ARCH_IS_X86_64 0
31
#endif
32
33
#if defined(__i386__) || defined(_M_IX86) || FJXL_ARCH_IS_X86_64
34
#define FJXL_ARCH_IS_X86 1
35
#else
36
#define FJXL_ARCH_IS_X86 0
37
#endif
38
39
#if FJXL_ARCH_IS_X86
40
#if defined(_MSC_VER)
41
#include <intrin.h>
42
#else  // _MSC_VER
43
#include <cpuid.h>
44
#endif  // _MSC_VER
45
#endif  // FJXL_ARCH_IS_X86
46
47
// Enable NEON and AVX2/AVX512 if not asked to do otherwise and the compilers
48
// support it.
49
#if defined(__aarch64__) || defined(_M_ARM64)  // ARCH
50
#include <arm_neon.h>
51
52
#if !defined(FJXL_ENABLE_NEON)
53
#define FJXL_ENABLE_NEON 1
54
#endif  // !defined(FJXL_ENABLE_NEON)
55
56
#elif FJXL_ARCH_IS_X86_64 && !defined(_MSC_VER)  // ARCH
57
#include <immintrin.h>
58
59
// manually add _mm512_cvtsi512_si32 definition if missing
60
// (e.g. with Xcode on macOS Mojave)
61
// copied from gcc 11.1.0 include/avx512fintrin.h line 14367-14373
62
#if defined(__clang__) &&                                           \
63
    ((!defined(__apple_build_version__) && __clang_major__ < 10) || \
64
     (defined(__apple_build_version__) && __apple_build_version__ < 12000032))
65
inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
66
_mm512_cvtsi512_si32(__m512i __A) {
67
  __v16si __B = (__v16si)__A;
68
  return __B[0];
69
}
70
#endif
71
72
#if !defined(FJXL_ENABLE_AVX2)
73
#define FJXL_ENABLE_AVX2 1
74
#endif  // !defined(FJXL_ENABLE_AVX2)
75
76
#if !defined(FJXL_ENABLE_AVX512)
77
// On clang-7 or earlier, and gcc-10 or earlier, AVX512 seems broken.
78
#if (defined(__clang__) &&                                             \
79
         (!defined(__apple_build_version__) && __clang_major__ > 7) || \
80
     (defined(__apple_build_version__) &&                              \
81
      __apple_build_version__ > 10010046)) ||                          \
82
    (defined(__GNUC__) && __GNUC__ > 10)
83
#define FJXL_ENABLE_AVX512 1
84
#endif
85
#endif  // !defined(FJXL_ENABLE_AVX512)
86
87
#endif  // ARCH
88
89
#ifndef FJXL_ENABLE_NEON
90
#define FJXL_ENABLE_NEON 0
91
#endif
92
93
#ifndef FJXL_ENABLE_AVX2
94
#define FJXL_ENABLE_AVX2 0
95
#endif
96
97
#ifndef FJXL_ENABLE_AVX512
98
#define FJXL_ENABLE_AVX512 0
99
#endif
100
101
namespace {
102
103
// CPU extensions that can be detected at runtime. The numeric value of each
// enumerator is the bit position it occupies in the mask produced by
// CpuFeatureBit().
enum class CpuFeature : uint32_t {
  kAVX2 = 0,

  kAVX512F = 1,
  kAVX512VL = 2,
  kAVX512CD = 3,
  kAVX512BW = 4,

  kVBMI = 5,
  kVBMI2 = 6,
};

// Returns the single-bit mask that represents `feature`.
constexpr uint32_t CpuFeatureBit(CpuFeature feature) {
  return uint32_t{1} << static_cast<uint32_t>(feature);
}
118
119
#if FJXL_ARCH_IS_X86
120
#if defined(_MSC_VER)
121
void Cpuid(const uint32_t level, const uint32_t count,
122
           std::array<uint32_t, 4>& abcd) {
123
  int regs[4];
124
  __cpuidex(regs, level, count);
125
  for (int i = 0; i < 4; ++i) {
126
    abcd[i] = regs[i];
127
  }
128
}
129
// MSVC flavour: returns the low 32 bits of extended control register 0, read
// through the _xgetbv intrinsic.
uint32_t ReadXCR0() {
  const auto xcr0 = _xgetbv(0);
  return static_cast<uint32_t>(xcr0);
}
130
#else   // _MSC_VER
131
void Cpuid(const uint32_t level, const uint32_t count,
132
0
           std::array<uint32_t, 4>& abcd) {
133
0
  uint32_t a;
134
0
  uint32_t b;
135
0
  uint32_t c;
136
0
  uint32_t d;
137
0
  __cpuid_count(level, count, a, b, c, d);
138
0
  abcd[0] = a;
139
0
  abcd[1] = b;
140
0
  abcd[2] = c;
141
0
  abcd[3] = d;
142
0
}
143
0
// GCC/clang flavour: returns the low 32 bits of extended control register 0.
uint32_t ReadXCR0() {
  uint32_t low_half;
  uint32_t high_half;  // Written by the instruction; intentionally unused.
  // The opcode bytes 0F 01 D0 (XGETBV) are emitted directly, with the
  // register index 0 passed in ECX and the result read from EDX:EAX.
  asm volatile(".byte 0x0F, 0x01, 0xD0"
               : "=a"(low_half), "=d"(high_half)
               : "c"(0u));
  return low_half;
}
152
#endif  // _MSC_VER
153
154
0
uint32_t DetectCpuFeatures() {
155
0
  uint32_t flags = 0;  // return value
156
0
  std::array<uint32_t, 4> abcd;
157
0
  Cpuid(0, 0, abcd);
158
0
  const uint32_t max_level = abcd[0];
159
160
0
  const auto check_bit = [](uint32_t v, uint32_t idx) -> bool {
161
0
    return (v & (1U << idx)) != 0;
162
0
  };
163
164
  // Extended features
165
0
  if (max_level >= 7) {
166
0
    Cpuid(7, 0, abcd);
167
0
    flags |= check_bit(abcd[1], 5) ? CpuFeatureBit(CpuFeature::kAVX2) : 0;
168
169
0
    flags |= check_bit(abcd[1], 16) ? CpuFeatureBit(CpuFeature::kAVX512F) : 0;
170
0
    flags |= check_bit(abcd[1], 28) ? CpuFeatureBit(CpuFeature::kAVX512CD) : 0;
171
0
    flags |= check_bit(abcd[1], 30) ? CpuFeatureBit(CpuFeature::kAVX512BW) : 0;
172
0
    flags |= check_bit(abcd[1], 31) ? CpuFeatureBit(CpuFeature::kAVX512VL) : 0;
173
174
0
    flags |= check_bit(abcd[2], 1) ? CpuFeatureBit(CpuFeature::kVBMI) : 0;
175
0
    flags |= check_bit(abcd[2], 6) ? CpuFeatureBit(CpuFeature::kVBMI2) : 0;
176
0
  }
177
178
0
  Cpuid(1, 0, abcd);
179
0
  const bool os_has_xsave = check_bit(abcd[2], 27);
180
0
  if (os_has_xsave) {
181
0
    const uint32_t xcr0 = ReadXCR0();
182
0
    if (!check_bit(xcr0, 1) || !check_bit(xcr0, 2)) {
183
0
      flags = 0;
184
0
    } else if (!check_bit(xcr0, 5) || !check_bit(xcr0, 6) ||
185
0
               !check_bit(xcr0, 7)) {
186
      // No AVX-512; disable everything but AVX2 if present
187
0
      flags &= CpuFeatureBit(CpuFeature::kAVX2);
188
0
    }
189
0
  }
190
191
0
  return flags;
192
0
}
193
#else   // FJXL_ARCH_IS_X86
194
// Non-x86 build: none of the x86 feature bits can be set.
uint32_t DetectCpuFeatures() {
  return 0;
}
195
#endif  // FJXL_ARCH_IS_X86
196
197
#if defined(_MSC_VER)
198
#define FJXL_UNUSED
199
#else
200
#define FJXL_UNUSED __attribute__((unused))
201
#endif
202
203
0
// Returns true when `feature` is present in the mask reported by
// DetectCpuFeatures(). Detection runs once, on the first call; later calls
// reuse the stored mask.
FJXL_UNUSED bool HasCpuFeature(CpuFeature feature) {
  static const uint32_t detected_mask = DetectCpuFeatures();
  const uint32_t wanted_bit = CpuFeatureBit(feature);
  return (detected_mask & wanted_bit) != 0;
}
207
208
#if defined(_MSC_VER) && !defined(__clang__)
#define FJXL_INLINE __forceinline
// Returns floor(log2(v)); returns 0 for v == 0, matching the GCC/clang
// implementation below.
FJXL_INLINE uint32_t FloorLog2(uint32_t v) {
  unsigned long index;
  // _BitScanReverse returns 0 and leaves `index` undefined for a zero mask.
  return _BitScanReverse(&index, v) ? static_cast<uint32_t>(index) : 0;
}
// Returns the number of trailing zero bits of `v`. `v` must be non-zero.
FJXL_INLINE uint32_t CtzNonZero(uint64_t v) {
  unsigned long index;
#if defined(_M_X64) || defined(_M_ARM64)
  // The 32-bit _BitScanForward would silently drop the upper half of `v`.
  _BitScanForward64(&index, v);
  return static_cast<uint32_t>(index);
#else
  // No 64-bit scan on 32-bit targets: try the low half, then the high half.
  if (_BitScanForward(&index, static_cast<unsigned long>(v & 0xFFFFFFFFu))) {
    return static_cast<uint32_t>(index);
  }
  _BitScanForward(&index, static_cast<unsigned long>(v >> 32));
  return static_cast<uint32_t>(index) + 32;
#endif
}
#else
#define FJXL_INLINE inline __attribute__((always_inline))
// Returns floor(log2(v)); returns 0 for v == 0 (__builtin_clz is undefined
// for a zero argument, hence the explicit check).
FJXL_INLINE uint32_t FloorLog2(uint32_t v) {
  return v ? 31 - __builtin_clz(v) : 0;
}
// Returns the number of trailing zero bits of `v`. `v` must be non-zero.
FJXL_UNUSED FJXL_INLINE uint32_t CtzNonZero(uint64_t v) {
  return __builtin_ctzll(v);
}
#endif
229
230
// Compiles to a memcpy on little-endian systems.
231
0
// Writes `data` to tgt[0..7] with the least significant byte first.
FJXL_INLINE void StoreLE64(uint8_t* tgt, uint64_t data) {
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
  // Host byte order already matches the output order.
  memcpy(tgt, &data, sizeof(data));
#else
  // Unknown or big-endian host: emit the bytes one at a time.
  for (int byte = 0; byte < 8; ++byte) {
    tgt[byte] = static_cast<uint8_t>(data >> (8 * byte));
  }
#endif
}
240
241
// ORs `bits` into `bit_buffer` at bit position `bits_in_buffer`, advances the
// position by `count`, and stores the whole 64-bit buffer to `data_buf`
// (8 bytes are always written). Complete bytes are then dropped from the
// buffer, leaving fewer than 8 pending bits. Returns the number of complete
// bytes, i.e. how far the caller should advance its output position.
FJXL_INLINE size_t AddBits(uint32_t count, uint64_t bits, uint8_t* data_buf,
                           size_t& bits_in_buffer, uint64_t& bit_buffer) {
  bit_buffer |= bits << bits_in_buffer;
  bits_in_buffer += count;
  StoreLE64(data_buf, bit_buffer);

  const size_t whole_bytes = bits_in_buffer / 8;
  const size_t flushed_bits = whole_bytes * 8;
  bits_in_buffer -= flushed_bits;
  bit_buffer >>= flushed_bits;
  return whole_bytes;
}
251
252
struct BitWriter {
253
0
  bool Allocate(size_t maximum_bit_size) {
254
0
    assert(data == nullptr);
255
    // Leave some padding.
256
0
    data.reset(static_cast<uint8_t*>(malloc(maximum_bit_size / 8 + 64)));
257
0
    return data != nullptr;
258
0
  }
259
260
0
  void Write(uint32_t count, uint64_t bits) {
261
0
    bytes_written += AddBits(count, bits, data.get() + bytes_written,
262
0
                             bits_in_buffer, buffer);
263
0
  }
264
265
0
  void ZeroPadToByte() {
266
0
    if (bits_in_buffer != 0) {
267
0
      Write(8 - bits_in_buffer, 0);
268
0
    }
269
0
  }
270
271
  FJXL_INLINE void WriteMultiple(const uint64_t* nbits, const uint64_t* bits,
272
0
                                 size_t n) {
273
    // Necessary because Write() is only guaranteed to work with <=56 bits.
274
    // Trying to SIMD-fy this code results in lower speed (and definitely less
275
    // clarity).
276
0
    {
277
0
      for (size_t i = 0; i < n; i++) {
278
0
        this->buffer |= bits[i] << this->bits_in_buffer;
279
0
        memcpy(this->data.get() + this->bytes_written, &this->buffer, 8);
280
0
        uint64_t shift = 64 - this->bits_in_buffer;
281
0
        this->bits_in_buffer += nbits[i];
282
        // This `if` seems to be faster than using ternaries.
283
0
        if (this->bits_in_buffer >= 64) {
284
0
          uint64_t next_buffer = shift >= 64 ? 0 : bits[i] >> shift;
285
0
          this->buffer = next_buffer;
286
0
          this->bits_in_buffer -= 64;
287
0
          this->bytes_written += 8;
288
0
        }
289
0
      }
290
0
      memcpy(this->data.get() + this->bytes_written, &this->buffer, 8);
291
0
      size_t bytes_in_buffer = this->bits_in_buffer / 8;
292
0
      this->bits_in_buffer -= bytes_in_buffer * 8;
293
0
      this->buffer >>= bytes_in_buffer * 8;
294
0
      this->bytes_written += bytes_in_buffer;
295
0
    }
296
0
  }
297
298
  std::unique_ptr<uint8_t[], void (*)(void*)> data = {nullptr, free};
299
  size_t bytes_written = 0;
300
  size_t bits_in_buffer = 0;
301
  uint64_t buffer = 0;
302
};
303
304
0
size_t SectionSize(const std::array<BitWriter, 4>& group_data) {
305
0
  size_t sz = 0;
306
0
  for (size_t j = 0; j < 4; j++) {
307
0
    const auto& writer = group_data[j];
308
0
    sz += writer.bytes_written * 8 + writer.bits_in_buffer;
309
0
  }
310
0
  sz = (sz + 7) / 8;
311
0
  return sz;
312
0
}
313
314
constexpr size_t kMaxFrameHeaderSize = 5;
315
316
// kGroupSizeOffset[b] is the smallest group size that falls into TOC bucket
// b; kTOCBits[b] is the number of TOC bits accounted for an entry in bucket b.
constexpr size_t kGroupSizeOffset[4] = {0, 1024, 17408, 4211712};
constexpr size_t kTOCBits[4] = {12, 16, 24, 32};

// Returns the index (0..3) of the largest bucket whose lower bound does not
// exceed `group_size`.
size_t TOCBucket(size_t group_size) {
  for (size_t bucket = 3; bucket > 0; --bucket) {
    if (group_size >= kGroupSizeOffset[bucket]) return bucket;
  }
  return 0;
}
329
330
#if !FJXL_STANDALONE
331
0
size_t TOCSize(const std::vector<size_t>& group_sizes) {
332
0
  size_t toc_bits = 0;
333
0
  for (size_t group_size : group_sizes) {
334
0
    toc_bits += kTOCBits[TOCBucket(group_size)];
335
0
  }
336
0
  return (toc_bits + 7) / 8;
337
0
}
338
339
0
// Returns the frame header size in bytes (rounded up): 28 bits, plus 4 when
// an alpha channel is present, plus 2 when this is not the last frame.
size_t FrameHeaderSize(bool have_alpha, bool is_last) {
  size_t header_bits = 28;
  if (have_alpha) header_bits += 4;
  if (!is_last) header_bits += 2;
  return (header_bits + 7) / 8;
}
343
#endif
344
345
void ComputeAcGroupDataOffset(size_t dc_global_size, size_t num_dc_groups,
346
                              size_t num_ac_groups, size_t& min_dc_global_size,
347
0
                              size_t& ac_group_offset) {
348
  // Max AC group size is 768 kB, so max AC group TOC bits is 24.
349
0
  size_t ac_toc_max_bits = num_ac_groups * 24;
350
0
  size_t ac_toc_min_bits = num_ac_groups * 12;
351
0
  size_t max_padding = 1 + (ac_toc_max_bits - ac_toc_min_bits + 7) / 8;
352
0
  min_dc_global_size = dc_global_size;
353
0
  size_t dc_global_bucket = TOCBucket(min_dc_global_size);
354
0
  while (TOCBucket(min_dc_global_size + max_padding) > dc_global_bucket) {
355
0
    dc_global_bucket = TOCBucket(min_dc_global_size + max_padding);
356
0
    min_dc_global_size = kGroupSizeOffset[dc_global_bucket];
357
0
  }
358
0
  assert(TOCBucket(min_dc_global_size) == dc_global_bucket);
359
0
  assert(TOCBucket(min_dc_global_size + max_padding) == dc_global_bucket);
360
0
  size_t max_toc_bits =
361
0
      kTOCBits[dc_global_bucket] + 12 * (1 + num_dc_groups) + ac_toc_max_bits;
362
0
  size_t max_toc_size = (max_toc_bits + 7) / 8;
363
0
  ac_group_offset = kMaxFrameHeaderSize + max_toc_size + min_dc_global_size;
364
0
}
365
366
#if !FJXL_STANDALONE
367
size_t ComputeDcGlobalPadding(const std::vector<size_t>& group_sizes,
368
                              size_t ac_group_data_offset,
369
                              size_t min_dc_global_size, bool have_alpha,
370
0
                              bool is_last) {
371
0
  std::vector<size_t> new_group_sizes = group_sizes;
372
0
  new_group_sizes[0] = min_dc_global_size;
373
0
  size_t toc_size = TOCSize(new_group_sizes);
374
0
  size_t actual_offset =
375
0
      FrameHeaderSize(have_alpha, is_last) + toc_size + group_sizes[0];
376
0
  return ac_group_data_offset - actual_offset;
377
0
}
378
#endif
379
380
constexpr size_t kNumRawSymbols = 19;
381
constexpr size_t kNumLZ77 = 33;
382
constexpr size_t kLZ77CacheSize = 32;
383
384
constexpr size_t kLZ77Offset = 224;
385
constexpr size_t kLZ77MinLength = 7;
386
387
void EncodeHybridUintLZ77(uint32_t value, uint32_t* token, uint32_t* nbits,
388
0
                          uint32_t* bits) {
389
  // 400 config
390
0
  uint32_t n = FloorLog2(value);
391
0
  *token = value < 16 ? value : 16 + n - 4;
392
0
  *nbits = value < 16 ? 0 : n;
393
0
  *bits = value < 16 ? 0 : value - (1 << *nbits);
394
0
}
395
396
struct PrefixCode {
397
  uint8_t raw_nbits[kNumRawSymbols] = {};
398
  uint8_t raw_bits[kNumRawSymbols] = {};
399
400
  uint8_t lz77_nbits[kNumLZ77] = {};
401
  uint16_t lz77_bits[kNumLZ77] = {};
402
403
  uint64_t lz77_cache_bits[kLZ77CacheSize] = {};
404
  uint8_t lz77_cache_nbits[kLZ77CacheSize] = {};
405
406
  size_t numraw;
407
408
0
  static uint16_t BitReverse(size_t nbits, uint16_t bits) {
409
0
    constexpr uint16_t kNibbleLookup[16] = {
410
0
        0b0000, 0b1000, 0b0100, 0b1100, 0b0010, 0b1010, 0b0110, 0b1110,
411
0
        0b0001, 0b1001, 0b0101, 0b1101, 0b0011, 0b1011, 0b0111, 0b1111,
412
0
    };
413
0
    uint16_t rev16 = (kNibbleLookup[bits & 0xF] << 12) |
414
0
                     (kNibbleLookup[(bits >> 4) & 0xF] << 8) |
415
0
                     (kNibbleLookup[(bits >> 8) & 0xF] << 4) |
416
0
                     (kNibbleLookup[bits >> 12]);
417
0
    return rev16 >> (16 - nbits);
418
0
  }
419
420
  // Create the prefix codes given the code lengths.
421
  // Supports the code lengths being split into two halves.
422
  static void ComputeCanonicalCode(const uint8_t* first_chunk_nbits,
423
                                   uint8_t* first_chunk_bits,
424
                                   size_t first_chunk_size,
425
                                   const uint8_t* second_chunk_nbits,
426
                                   uint16_t* second_chunk_bits,
427
0
                                   size_t second_chunk_size) {
428
0
    constexpr size_t kMaxCodeLength = 15;
429
0
    uint8_t code_length_counts[kMaxCodeLength + 1] = {};
430
0
    for (size_t i = 0; i < first_chunk_size; i++) {
431
0
      code_length_counts[first_chunk_nbits[i]]++;
432
0
      assert(first_chunk_nbits[i] <= kMaxCodeLength);
433
0
      assert(first_chunk_nbits[i] <= 8);
434
0
      assert(first_chunk_nbits[i] > 0);
435
0
    }
436
0
    for (size_t i = 0; i < second_chunk_size; i++) {
437
0
      code_length_counts[second_chunk_nbits[i]]++;
438
0
      assert(second_chunk_nbits[i] <= kMaxCodeLength);
439
0
    }
440
441
0
    uint16_t next_code[kMaxCodeLength + 1] = {};
442
443
0
    uint16_t code = 0;
444
0
    for (size_t i = 1; i < kMaxCodeLength + 1; i++) {
445
0
      code = (code + code_length_counts[i - 1]) << 1;
446
0
      next_code[i] = code;
447
0
    }
448
449
0
    for (size_t i = 0; i < first_chunk_size; i++) {
450
0
      first_chunk_bits[i] =
451
0
          BitReverse(first_chunk_nbits[i], next_code[first_chunk_nbits[i]]++);
452
0
    }
453
0
    for (size_t i = 0; i < second_chunk_size; i++) {
454
0
      second_chunk_bits[i] =
455
0
          BitReverse(second_chunk_nbits[i], next_code[second_chunk_nbits[i]]++);
456
0
    }
457
0
  }
458
459
  template <typename T>
460
  static void ComputeCodeLengthsNonZeroImpl(const uint64_t* freqs, size_t n,
461
                                            size_t precision, T infty,
462
                                            const uint8_t* min_limit,
463
                                            const uint8_t* max_limit,
464
0
                                            uint8_t* nbits) {
465
0
    assert(precision < 15);
466
0
    assert(n <= kMaxNumSymbols);
467
0
    std::vector<T> dynp(((1U << precision) + 1) * (n + 1), infty);
468
0
    auto d = [&](size_t sym, size_t off) -> T& {
469
0
      return dynp[sym * ((1 << precision) + 1) + off];
470
0
    };
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::ComputeCodeLengthsNonZeroImpl<unsigned int>(unsigned long const*, unsigned long, unsigned long, unsigned int, unsigned char const*, unsigned char const*, unsigned char*)::{lambda(unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::ComputeCodeLengthsNonZeroImpl<unsigned long>(unsigned long const*, unsigned long, unsigned long, unsigned long, unsigned char const*, unsigned char const*, unsigned char*)::{lambda(unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long) const
471
0
    d(0, 0) = 0;
472
0
    for (size_t sym = 0; sym < n; sym++) {
473
0
      for (T bits = min_limit[sym]; bits <= max_limit[sym]; bits++) {
474
0
        size_t off_delta = 1U << (precision - bits);
475
0
        for (size_t off = 0; off + off_delta <= (1U << precision); off++) {
476
0
          d(sym + 1, off + off_delta) =
477
0
              std::min(d(sym, off) + static_cast<T>(freqs[sym]) * bits,
478
0
                       d(sym + 1, off + off_delta));
479
0
        }
480
0
      }
481
0
    }
482
483
0
    size_t sym = n;
484
0
    size_t off = 1U << precision;
485
486
0
    assert(d(sym, off) != infty);
487
488
0
    while (sym-- > 0) {
489
0
      assert(off > 0);
490
0
      for (size_t bits = min_limit[sym]; bits <= max_limit[sym]; bits++) {
491
0
        size_t off_delta = 1U << (precision - bits);
492
0
        if (off_delta <= off &&
493
0
            d(sym + 1, off) == d(sym, off - off_delta) + freqs[sym] * bits) {
494
0
          off -= off_delta;
495
0
          nbits[sym] = bits;
496
0
          break;
497
0
        }
498
0
      }
499
0
    }
500
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:void (anonymous namespace)::PrefixCode::ComputeCodeLengthsNonZeroImpl<unsigned int>(unsigned long const*, unsigned long, unsigned long, unsigned int, unsigned char const*, unsigned char const*, unsigned char*)
Unexecuted instantiation: enc_fast_lossless.cc:void (anonymous namespace)::PrefixCode::ComputeCodeLengthsNonZeroImpl<unsigned long>(unsigned long const*, unsigned long, unsigned long, unsigned long, unsigned char const*, unsigned char const*, unsigned char*)
501
502
  // Computes nbits[i] for i <= n, subject to min_limit[i] <= nbits[i] <=
503
  // max_limit[i] and sum 2**-nbits[i] == 1, so to minimize sum(nbits[i] *
504
  // freqs[i]).
505
  static void ComputeCodeLengthsNonZero(const uint64_t* freqs, size_t n,
506
                                        uint8_t* min_limit, uint8_t* max_limit,
507
0
                                        uint8_t* nbits) {
508
0
    size_t precision = 0;
509
0
    size_t shortest_length = 255;
510
0
    uint64_t freqsum = 0;
511
0
    for (size_t i = 0; i < n; i++) {
512
0
      assert(freqs[i] != 0);
513
0
      freqsum += freqs[i];
514
0
      if (min_limit[i] < 1) min_limit[i] = 1;
515
0
      assert(min_limit[i] <= max_limit[i]);
516
0
      precision = std::max<size_t>(max_limit[i], precision);
517
0
      shortest_length = std::min<size_t>(min_limit[i], shortest_length);
518
0
    }
519
    // If all the minimum limits are greater than 1, shift precision so that we
520
    // behave as if the shortest was 1.
521
0
    precision -= shortest_length - 1;
522
0
    uint64_t infty = freqsum * precision;
523
0
    if (infty < std::numeric_limits<uint32_t>::max() / 2) {
524
0
      ComputeCodeLengthsNonZeroImpl(freqs, n, precision,
525
0
                                    static_cast<uint32_t>(infty), min_limit,
526
0
                                    max_limit, nbits);
527
0
    } else {
528
0
      ComputeCodeLengthsNonZeroImpl(freqs, n, precision, infty, min_limit,
529
0
                                    max_limit, nbits);
530
0
    }
531
0
  }
532
533
  static constexpr size_t kMaxNumSymbols =
534
      kNumRawSymbols + 1 < kNumLZ77 ? kNumLZ77 : kNumRawSymbols + 1;
535
  static void ComputeCodeLengths(const uint64_t* freqs, size_t n,
536
                                 const uint8_t* min_limit_in,
537
0
                                 const uint8_t* max_limit_in, uint8_t* nbits) {
538
0
    assert(n <= kMaxNumSymbols);
539
0
    uint64_t compact_freqs[kMaxNumSymbols];
540
0
    uint8_t min_limit[kMaxNumSymbols];
541
0
    uint8_t max_limit[kMaxNumSymbols];
542
0
    size_t ni = 0;
543
0
    for (size_t i = 0; i < n; i++) {
544
0
      if (freqs[i]) {
545
0
        compact_freqs[ni] = freqs[i];
546
0
        min_limit[ni] = min_limit_in[i];
547
0
        max_limit[ni] = max_limit_in[i];
548
0
        ni++;
549
0
      }
550
0
    }
551
0
    for (size_t i = ni; i < kMaxNumSymbols; ++i) {
552
0
      compact_freqs[i] = 0;
553
0
      min_limit[i] = 0;
554
0
      max_limit[i] = 0;
555
0
    }
556
0
    uint8_t num_bits[kMaxNumSymbols] = {};
557
0
    ComputeCodeLengthsNonZero(compact_freqs, ni, min_limit, max_limit,
558
0
                              num_bits);
559
0
    ni = 0;
560
0
    for (size_t i = 0; i < n; i++) {
561
0
      nbits[i] = 0;
562
0
      if (freqs[i]) {
563
0
        nbits[i] = num_bits[ni++];
564
0
      }
565
0
    }
566
0
  }
567
568
  // Invalid code, used to construct arrays.
569
0
  PrefixCode() = default;
570
571
  template <typename BitDepth>
572
  PrefixCode(BitDepth /* bitdepth */, uint64_t* raw_counts,
573
0
             uint64_t* lz77_counts) {
574
    // "merge" together all the lz77 counts in a single symbol for the level 1
575
    // table (containing just the raw symbols, up to length 7).
576
0
    uint64_t level1_counts[kNumRawSymbols + 1];
577
0
    memcpy(level1_counts, raw_counts, kNumRawSymbols * sizeof(uint64_t));
578
0
    numraw = kNumRawSymbols;
579
0
    while (numraw > 0 && level1_counts[numraw - 1] == 0) numraw--;
580
581
0
    level1_counts[numraw] = 0;
582
0
    for (size_t i = 0; i < kNumLZ77; i++) {
583
0
      level1_counts[numraw] += lz77_counts[i];
584
0
    }
585
0
    uint8_t level1_nbits[kNumRawSymbols + 1] = {};
586
0
    ComputeCodeLengths(level1_counts, numraw + 1, BitDepth::kMinRawLength,
587
0
                       BitDepth::kMaxRawLength, level1_nbits);
588
589
0
    uint8_t level2_nbits[kNumLZ77] = {};
590
0
    uint8_t min_lengths[kNumLZ77] = {};
591
0
    uint8_t l = 15 - level1_nbits[numraw];
592
0
    uint8_t max_lengths[kNumLZ77];
593
0
    for (uint8_t& max_length : max_lengths) {
594
0
      max_length = l;
595
0
    }
596
0
    size_t num_lz77 = kNumLZ77;
597
0
    while (num_lz77 > 0 && lz77_counts[num_lz77 - 1] == 0) num_lz77--;
598
0
    ComputeCodeLengths(lz77_counts, num_lz77, min_lengths, max_lengths,
599
0
                       level2_nbits);
600
0
    for (size_t i = 0; i < numraw; i++) {
601
0
      raw_nbits[i] = level1_nbits[i];
602
0
    }
603
0
    for (size_t i = 0; i < num_lz77; i++) {
604
0
      lz77_nbits[i] =
605
0
          level2_nbits[i] ? level1_nbits[numraw] + level2_nbits[i] : 0;
606
0
    }
607
608
0
    ComputeCanonicalCode(raw_nbits, raw_bits, numraw, lz77_nbits, lz77_bits,
609
0
                         kNumLZ77);
610
611
    // Prepare lz77 cache
612
0
    for (size_t count = 0; count < kLZ77CacheSize; count++) {
613
0
      unsigned token, nbits, bits;
614
0
      EncodeHybridUintLZ77(count, &token, &nbits, &bits);
615
0
      lz77_cache_nbits[count] = lz77_nbits[token] + nbits + raw_nbits[0];
616
0
      lz77_cache_bits[count] =
617
0
          (((bits << lz77_nbits[token]) | lz77_bits[token]) << raw_nbits[0]) |
618
0
          raw_bits[0];
619
0
    }
620
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<AVX2::(anonymous namespace)::UpTo8Bits>(AVX2::(anonymous namespace)::UpTo8Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<AVX2::(anonymous namespace)::From9To13Bits>(AVX2::(anonymous namespace)::From9To13Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<AVX2::(anonymous namespace)::Exactly14Bits>(AVX2::(anonymous namespace)::Exactly14Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<AVX2::(anonymous namespace)::MoreThan14Bits>(AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<default_implementation::(anonymous namespace)::UpTo8Bits>(default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<default_implementation::(anonymous namespace)::From9To13Bits>(default_implementation::(anonymous namespace)::From9To13Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<default_implementation::(anonymous namespace)::Exactly14Bits>(default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<default_implementation::(anonymous namespace)::MoreThan14Bits>(default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long*, unsigned long*)
621
622
  // Max bits written: 2 + 72 + 95 + 24 + 165 = 286
623
0
  void WriteTo(BitWriter* writer) const {
624
0
    uint64_t code_length_counts[18] = {};
625
0
    code_length_counts[17] = 3 + 2 * (kNumLZ77 - 1);
626
0
    for (uint8_t raw_nbit : raw_nbits) {
627
0
      code_length_counts[raw_nbit]++;
628
0
    }
629
0
    for (uint8_t lz77_nbit : lz77_nbits) {
630
0
      code_length_counts[lz77_nbit]++;
631
0
    }
632
0
    uint8_t code_length_nbits[18] = {};
633
0
    uint8_t code_length_nbits_min[18] = {};
634
0
    uint8_t code_length_nbits_max[18] = {
635
0
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
636
0
    };
637
0
    ComputeCodeLengths(code_length_counts, 18, code_length_nbits_min,
638
0
                       code_length_nbits_max, code_length_nbits);
639
0
    writer->Write(2, 0b00);  // HSKIP = 0, i.e. don't skip code lengths.
640
641
    // As per Brotli RFC.
642
0
    uint8_t code_length_order[18] = {1, 2, 3, 4,  0,  5,  17, 6,  16,
643
0
                                     7, 8, 9, 10, 11, 12, 13, 14, 15};
644
0
    uint8_t code_length_length_nbits[] = {2, 4, 3, 2, 2, 4};
645
0
    uint8_t code_length_length_bits[] = {0, 7, 3, 2, 1, 15};
646
647
    // Encode lengths of code lengths.
648
0
    size_t num_code_lengths = 18;
649
0
    while (code_length_nbits[code_length_order[num_code_lengths - 1]] == 0) {
650
0
      num_code_lengths--;
651
0
    }
652
    // Max bits written in this loop: 18 * 4 = 72
653
0
    for (size_t i = 0; i < num_code_lengths; i++) {
654
0
      int symbol = code_length_nbits[code_length_order[i]];
655
0
      writer->Write(code_length_length_nbits[symbol],
656
0
                    code_length_length_bits[symbol]);
657
0
    }
658
659
    // Compute the canonical codes for the codes that represent the lengths of
660
    // the actual codes for data.
661
0
    uint16_t code_length_bits[18] = {};
662
0
    ComputeCanonicalCode(nullptr, nullptr, 0, code_length_nbits,
663
0
                         code_length_bits, 18);
664
    // Encode raw bit code lengths.
665
    // Max bits written in this loop: 19 * 5 = 95
666
0
    for (uint8_t raw_nbit : raw_nbits) {
667
0
      writer->Write(code_length_nbits[raw_nbit], code_length_bits[raw_nbit]);
668
0
    }
669
0
    size_t num_lz77 = kNumLZ77;
670
0
    while (lz77_nbits[num_lz77 - 1] == 0) {
671
0
      num_lz77--;
672
0
    }
673
    // Encode 0s until 224 (start of LZ77 symbols). This is in total 224-19 =
674
    // 205.
675
0
    static_assert(kLZ77Offset == 224, "kLZ77Offset should be 224");
676
0
    static_assert(kNumRawSymbols == 19, "kNumRawSymbols should be 19");
677
0
    {
678
      // Max bits in this block: 24
679
0
      writer->Write(code_length_nbits[17], code_length_bits[17]);
680
0
      writer->Write(3, 0b010);  // 5
681
0
      writer->Write(code_length_nbits[17], code_length_bits[17]);
682
0
      writer->Write(3, 0b000);  // (5-2)*8 + 3 = 27
683
0
      writer->Write(code_length_nbits[17], code_length_bits[17]);
684
0
      writer->Write(3, 0b010);  // (27-2)*8 + 5 = 205
685
0
    }
686
    // Encode LZ77 symbols, with values 224+i.
687
    // Max bits written in this loop: 33 * 5 = 165
688
0
    for (size_t i = 0; i < num_lz77; i++) {
689
0
      writer->Write(code_length_nbits[lz77_nbits[i]],
690
0
                    code_length_bits[lz77_nbits[i]]);
691
0
    }
692
0
  }
693
};
694
695
}  // namespace
696
697
extern "C" {
698
699
struct JxlFastLosslessFrameState {
700
  JxlChunkedFrameInputSource input;
701
  size_t width;
702
  size_t height;
703
  size_t num_groups_x;
704
  size_t num_groups_y;
705
  size_t num_dc_groups_x;
706
  size_t num_dc_groups_y;
707
  size_t nb_chans;
708
  size_t bitdepth;
709
  int big_endian;
710
  int effort;
711
  bool collided;
712
  PrefixCode hcode[4];
713
  std::vector<int16_t> lookup;
714
  BitWriter header;
715
  std::vector<std::array<BitWriter, 4>> group_data;
716
  std::vector<size_t> group_sizes;
717
  size_t ac_group_data_offset = 0;
718
  size_t min_dc_global_size = 0;
719
  size_t current_bit_writer = 0;
720
  size_t bit_writer_byte_pos = 0;
721
  size_t bits_in_buffer = 0;
722
  uint64_t bit_buffer = 0;
723
  bool process_done = false;
724
};
725
726
0
size_t JxlFastLosslessOutputSize(const JxlFastLosslessFrameState* frame) {
727
0
  size_t total_size_groups = 0;
728
0
  for (const auto& section : frame->group_data) {
729
0
    total_size_groups += SectionSize(section);
730
0
  }
731
0
  return frame->header.bytes_written + total_size_groups;
732
0
}
733
734
size_t JxlFastLosslessMaxRequiredOutput(
735
0
    const JxlFastLosslessFrameState* frame) {
736
0
  return JxlFastLosslessOutputSize(frame) + 32;
737
0
}
738
739
// Writes the (optional) image header, the frame header and the TOC into
// `frame->header`.
//
// `add_image_header`: when non-zero (FJXL_STANDALONE builds only) the
//   codestream signature, size header and image metadata are emitted first.
// `is_last`: any non-zero value marks the frame as the last one.
//
// Returns false if the header buffer could not be allocated, true otherwise.
bool JxlFastLosslessPrepareHeader(JxlFastLosslessFrameState* frame,
                                  int add_image_header, int is_last) {
  BitWriter* output = &frame->header;
  if (!output->Allocate(1000 + frame->group_sizes.size() * 32)) return false;

  bool have_alpha = (frame->nb_chans == 2 || frame->nb_chans == 4);

#if FJXL_STANDALONE
  if (add_image_header) {
    // Signature
    output->Write(16, 0x0AFF);

    // Size header, hand-crafted.
    // Not small
    output->Write(1, 0);

    auto wsz = [output](size_t size) {
      if (size - 1 < (1 << 9)) {
        output->Write(2, 0b00);
        output->Write(9, size - 1);
      } else if (size - 1 < (1 << 13)) {
        output->Write(2, 0b01);
        output->Write(13, size - 1);
      } else if (size - 1 < (1 << 18)) {
        output->Write(2, 0b10);
        output->Write(18, size - 1);
      } else {
        output->Write(2, 0b11);
        output->Write(30, size - 1);
      }
    };

    wsz(frame->height);

    // No special ratio.
    output->Write(3, 0);

    wsz(frame->width);

    // Hand-crafted ImageMetadata.
    output->Write(1, 0);  // all_default
    output->Write(1, 0);  // extra_fields
    output->Write(1, 0);  // bit_depth.floating_point_sample
    if (frame->bitdepth == 8) {
      output->Write(2, 0b00);  // bit_depth.bits_per_sample = 8
    } else if (frame->bitdepth == 10) {
      output->Write(2, 0b01);  // bit_depth.bits_per_sample = 10
    } else if (frame->bitdepth == 12) {
      output->Write(2, 0b10);  // bit_depth.bits_per_sample = 12
    } else {
      output->Write(2, 0b11);  // 1 + u(6)
      output->Write(6, frame->bitdepth - 1);
    }
    if (frame->bitdepth <= 14) {
      output->Write(1, 1);  // 16-bit-buffer sufficient
    } else {
      output->Write(1, 0);  // 16-bit-buffer NOT sufficient
    }
    if (have_alpha) {
      output->Write(2, 0b01);  // One extra channel
      if (frame->bitdepth == 8) {
        output->Write(1, 1);  // ... all_default (ie. 8-bit alpha)
      } else {
        output->Write(1, 0);  // not d_alpha
        output->Write(2, 0);  // type = kAlpha
        output->Write(1, 0);  // not float
        if (frame->bitdepth == 10) {
          output->Write(2, 0b01);  // bit_depth.bits_per_sample = 10
        } else if (frame->bitdepth == 12) {
          output->Write(2, 0b10);  // bit_depth.bits_per_sample = 12
        } else {
          output->Write(2, 0b11);  // 1 + u(6)
          output->Write(6, frame->bitdepth - 1);
        }
        output->Write(2, 0);  // dim_shift = 0
        output->Write(2, 0);  // name_len = 0
        output->Write(1, 0);  // alpha_associated = 0
      }
    } else {
      output->Write(2, 0b00);  // No extra channel
    }
    output->Write(1, 0);  // Not XYB
    if (frame->nb_chans > 2) {
      output->Write(1, 1);  // color_encoding.all_default (sRGB)
    } else {
      output->Write(1, 0);     // color_encoding.all_default false
      output->Write(1, 0);     // color_encoding.want_icc false
      output->Write(2, 1);     // grayscale
      output->Write(2, 1);     // D65
      output->Write(1, 0);     // no gamma transfer function
      output->Write(2, 0b10);  // tf: 2 + u(4)
      output->Write(4, 11);    // tf of sRGB
      output->Write(2, 1);     // relative rendering intent
    }
    output->Write(2, 0b00);  // No extensions.

    output->Write(1, 1);  // all_default transform data

    // No ICC, no preview. Frame should start at byte boundary.
    output->ZeroPadToByte();
  }
#else
  assert(!add_image_header);
#endif
  // Only read by assert() in some configurations; keep NDEBUG builds free of
  // unused-parameter warnings.
  (void)add_image_header;

  // The flag arrives as a C `int`; collapse it to 0/1 so that exactly one bit
  // of payload is handed to the 1-bit field below.
  const uint64_t is_last_bit = is_last ? 1 : 0;

  // Handcrafted frame header.
  output->Write(1, 0);     // all_default
  output->Write(2, 0b00);  // regular frame
  output->Write(1, 1);     // modular
  output->Write(2, 0b00);  // default flags
  output->Write(1, 0);     // not YCbCr
  output->Write(2, 0b00);  // no upsampling
  if (have_alpha) {
    output->Write(2, 0b00);  // no alpha upsampling
  }
  output->Write(2, 0b01);  // default group size
  output->Write(2, 0b00);  // exactly one pass
  output->Write(1, 0);     // no custom size or origin
  output->Write(2, 0b00);  // kReplace blending mode
  if (have_alpha) {
    output->Write(2, 0b00);  // kReplace blending mode for alpha channel
  }
  output->Write(1, is_last_bit);  // is_last
  if (!is_last) {
    output->Write(2, 0b00);  // can not be saved as reference
  }
  output->Write(2, 0b00);  // a frame has no name
  output->Write(1, 0);     // loop filter is not all_default
  output->Write(1, 0);     // no gaborish
  output->Write(2, 0);     // 0 EPF iters
  output->Write(2, 0b00);  // No LF extensions
  output->Write(2, 0b00);  // No FH extensions

  output->Write(1, 0);      // No TOC permutation
  output->ZeroPadToByte();  // TOC is byte-aligned.
  assert(add_image_header || output->bytes_written <= kMaxFrameHeaderSize);
  // One TOC entry per section: a 2-bit bucket selector followed by the size
  // relative to that bucket's offset.
  for (size_t group_size : frame->group_sizes) {
    size_t bucket = TOCBucket(group_size);
    output->Write(2, bucket);
    output->Write(kTOCBits[bucket] - 2, group_size - kGroupSizeOffset[bucket]);
  }
  output->ZeroPadToByte();  // Groups are byte-aligned.
  return true;
}
882
883
#if !FJXL_STANDALONE
884
bool JxlFastLosslessOutputAlignedSection(
885
0
    const BitWriter& bw, JxlEncoderOutputProcessorWrapper* output_processor) {
886
0
  assert(bw.bits_in_buffer == 0);
887
0
  const uint8_t* data = bw.data.get();
888
0
  size_t remaining_len = bw.bytes_written;
889
0
  while (remaining_len > 0) {
890
0
    JXL_ASSIGN_OR_RETURN(auto buffer,
891
0
                         output_processor->GetBuffer(1, remaining_len));
892
0
    size_t n = std::min(buffer.size(), remaining_len);
893
0
    if (n == 0) break;
894
0
    memcpy(buffer.data(), data, n);
895
0
    JXL_RETURN_IF_ERROR(buffer.advance(n));
896
0
    data += n;
897
0
    remaining_len -= n;
898
0
  };
899
0
  return true;
900
0
}
901
902
bool JxlFastLosslessOutputHeaders(
903
    JxlFastLosslessFrameState* frame_state,
904
0
    JxlEncoderOutputProcessorWrapper* output_processor) {
905
0
  JXL_RETURN_IF_ERROR(JxlFastLosslessOutputAlignedSection(frame_state->header,
906
0
                                                          output_processor));
907
0
  JXL_RETURN_IF_ERROR(JxlFastLosslessOutputAlignedSection(
908
0
      frame_state->group_data[0][0], output_processor));
909
0
  return true;
910
0
}
911
#endif
912
913
#if FJXL_ENABLE_AVX512
914
// Appends bytes from `data` to `output` at a bit offset, 64 bytes at a time.
//
// `bit_buffer` holds `bit_buffer_nbits` pending bits that precede `data` in
// the stream. Inputs shorter than 128 bytes are not handled (returns 0 and
// leaves everything untouched). Otherwise processes the largest multiple of
// 64 bytes that fits in `n`, stores the leftover high bits of the last
// consumed byte back into `bit_buffer`, and returns the bytes consumed
// (equal to the bytes written).
__attribute__((target("avx512vbmi2"))) static size_t AppendBytesWithBitOffset(
    const uint8_t* data, size_t n, size_t bit_buffer_nbits,
    unsigned char* output, uint64_t& bit_buffer) {
  constexpr size_t kBlockBytes = 64;
  constexpr size_t kMinInputBytes = 128;
  if (n < kMinInputBytes) {
    return 0;
  }

  const size_t right_shift = 64 - bit_buffer_nbits;
  const __m512i shift_v = _mm512_set1_epi64(right_shift);
  // The pending bits sit in the top of every qword so that they act as the
  // "previous" qword for the very first output qword.
  __m512i prev_block = _mm512_set1_epi64(bit_buffer << right_shift);

  size_t done = 0;
  while (done + kBlockBytes <= n) {
    const __m512i cur_block = _mm512_loadu_si512(data + done);
    // Each lane sees the qword that precedes it in the stream.
    const __m512i preceding = _mm512_alignr_epi64(cur_block, prev_block, 7);
    // Funnel shift: (cur:preceding) >> right_shift, low 64 bits per lane.
    const __m512i merged = _mm512_shrdv_epi64(preceding, cur_block, shift_v);
    _mm512_storeu_si512(output + done, merged);
    prev_block = cur_block;
    done += kBlockBytes;
  }

  bit_buffer = data[done - 1] >> (8 - bit_buffer_nbits);

  return done;
}
937
#endif
938
939
size_t JxlFastLosslessWriteOutput(JxlFastLosslessFrameState* frame,
940
0
                                  unsigned char* output, size_t output_size) {
941
0
  assert(output_size >= 32);
942
0
  unsigned char* initial_output = output;
943
0
  size_t (*append_bytes_with_bit_offset)(const uint8_t*, size_t, size_t,
944
0
                                         unsigned char*, uint64_t&) = nullptr;
945
946
#if FJXL_ENABLE_AVX512
947
  if (HasCpuFeature(CpuFeature::kVBMI2)) {
948
    append_bytes_with_bit_offset = AppendBytesWithBitOffset;
949
  }
950
#endif
951
952
0
  while (true) {
953
0
    size_t& cur = frame->current_bit_writer;
954
0
    size_t& bw_pos = frame->bit_writer_byte_pos;
955
0
    if (cur >= 1 + frame->group_data.size() * frame->nb_chans) {
956
0
      return output - initial_output;
957
0
    }
958
0
    if (output_size <= 9) {
959
0
      return output - initial_output;
960
0
    }
961
0
    size_t nbc = frame->nb_chans;
962
0
    const BitWriter& writer =
963
0
        cur == 0 ? frame->header
964
0
                 : frame->group_data[(cur - 1) / nbc][(cur - 1) % nbc];
965
0
    size_t full_byte_count =
966
0
        std::min(output_size - 9, writer.bytes_written - bw_pos);
967
0
    if (frame->bits_in_buffer == 0) {
968
0
      memcpy(output, writer.data.get() + bw_pos, full_byte_count);
969
0
    } else {
970
0
      size_t i = 0;
971
0
      if (append_bytes_with_bit_offset) {
972
0
        i += append_bytes_with_bit_offset(
973
0
            writer.data.get() + bw_pos, full_byte_count, frame->bits_in_buffer,
974
0
            output, frame->bit_buffer);
975
0
      }
976
0
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
977
      // Copy 8 bytes at a time until we reach the border.
978
0
      for (; i + 8 < full_byte_count; i += 8) {
979
0
        uint64_t chunk;
980
0
        memcpy(&chunk, writer.data.get() + bw_pos + i, 8);
981
0
        uint64_t out = frame->bit_buffer | (chunk << frame->bits_in_buffer);
982
0
        memcpy(output + i, &out, 8);
983
0
        frame->bit_buffer = chunk >> (64 - frame->bits_in_buffer);
984
0
      }
985
0
#endif
986
0
      for (; i < full_byte_count; i++) {
987
0
        AddBits(8, writer.data.get()[bw_pos + i], output + i,
988
0
                frame->bits_in_buffer, frame->bit_buffer);
989
0
      }
990
0
    }
991
0
    output += full_byte_count;
992
0
    output_size -= full_byte_count;
993
0
    bw_pos += full_byte_count;
994
0
    if (bw_pos == writer.bytes_written) {
995
0
      auto write = [&](size_t num, uint64_t bits) {
996
0
        size_t n = AddBits(num, bits, output, frame->bits_in_buffer,
997
0
                           frame->bit_buffer);
998
0
        output += n;
999
0
        output_size -= n;
1000
0
      };
1001
0
      if (writer.bits_in_buffer) {
1002
0
        write(writer.bits_in_buffer, writer.buffer);
1003
0
      }
1004
0
      bw_pos = 0;
1005
0
      cur++;
1006
0
      if ((cur - 1) % nbc == 0 && frame->bits_in_buffer != 0) {
1007
0
        write(8 - frame->bits_in_buffer, 0);
1008
0
      }
1009
0
    }
1010
0
  }
1011
0
}
1012
1013
0
// Destroys a frame state. Passing nullptr is harmless because `delete` on a
// null pointer does nothing.
void JxlFastLosslessFreeFrameState(JxlFastLosslessFrameState* frame) {
  delete frame;
}
1016
1017
}  // extern "C"
1018
1019
#endif
1020
1021
#ifdef FJXL_SELF_INCLUDE
1022
1023
namespace {
1024
1025
// Two values of the same type, returned by operations that produce a lower
// and an upper half (e.g. Interleave / Upcast).
template <typename T>
struct VecPair {
  T low;  // first (lower) half
  T hi;   // second (upper) half
};
1030
1031
#ifdef FJXL_GENERIC_SIMD
1032
#undef FJXL_GENERIC_SIMD
1033
#endif
1034
1035
#ifdef FJXL_AVX512
1036
#define FJXL_GENERIC_SIMD
1037
struct SIMDVec32;
1038
struct Mask32 {
1039
  __mmask16 mask;
1040
  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
1041
  size_t CountPrefix() const {
1042
    return CtzNonZero(~uint64_t{_cvtmask16_u32(mask)});
1043
  }
1044
};
1045
1046
struct SIMDVec32 {
1047
  __m512i vec;
1048
1049
  static constexpr size_t kLanes = 16;
1050
1051
  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
1052
    return SIMDVec32{_mm512_loadu_si512((__m512i*)data)};
1053
  }
1054
  FJXL_INLINE void Store(uint32_t* data) {
1055
    _mm512_storeu_si512((__m512i*)data, vec);
1056
  }
1057
  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
1058
    return SIMDVec32{_mm512_set1_epi32(v)};
1059
  }
1060
  FJXL_INLINE SIMDVec32 ValToToken() const {
1061
    return SIMDVec32{
1062
        _mm512_sub_epi32(_mm512_set1_epi32(32), _mm512_lzcnt_epi32(vec))};
1063
  }
1064
  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
1065
    return SIMDVec32{_mm512_sub_epi32(_mm512_max_epu32(vec, to_subtract.vec),
1066
                                      to_subtract.vec)};
1067
  }
1068
  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
1069
    return SIMDVec32{_mm512_sub_epi32(vec, to_subtract.vec)};
1070
  }
1071
  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
1072
    return SIMDVec32{_mm512_add_epi32(vec, oth.vec)};
1073
  }
1074
  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
1075
    return SIMDVec32{_mm512_xor_epi32(vec, oth.vec)};
1076
  }
1077
  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
1078
    return Mask32{_mm512_cmpeq_epi32_mask(vec, oth.vec)};
1079
  }
1080
  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
1081
    return Mask32{_mm512_cmpgt_epi32_mask(vec, oth.vec)};
1082
  }
1083
  FJXL_INLINE SIMDVec32 Pow2() const {
1084
    return SIMDVec32{_mm512_sllv_epi32(_mm512_set1_epi32(1), vec)};
1085
  }
1086
  template <size_t i>
1087
  FJXL_INLINE SIMDVec32 SignedShiftRight() const {
1088
    return SIMDVec32{_mm512_srai_epi32(vec, i)};
1089
  }
1090
};
1091
1092
struct SIMDVec16;
1093
1094
struct Mask16 {
1095
  __mmask32 mask;
1096
  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
1097
  Mask16 And(const Mask16& oth) const {
1098
    return Mask16{_kand_mask32(mask, oth.mask)};
1099
  }
1100
  size_t CountPrefix() const {
1101
    return CtzNonZero(~uint64_t{_cvtmask32_u32(mask)});
1102
  }
1103
};
1104
1105
struct SIMDVec16 {
1106
  __m512i vec;
1107
1108
  static constexpr size_t kLanes = 32;
1109
1110
  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
1111
    return SIMDVec16{_mm512_loadu_si512((__m512i*)data)};
1112
  }
1113
  FJXL_INLINE void Store(uint16_t* data) {
1114
    _mm512_storeu_si512((__m512i*)data, vec);
1115
  }
1116
  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
1117
    return SIMDVec16{_mm512_set1_epi16(v)};
1118
  }
1119
  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
1120
                                         const SIMDVec32& hi) {
1121
    auto tmp = _mm512_packus_epi32(lo.vec, hi.vec);
1122
    alignas(64) uint64_t perm[8] = {0, 2, 4, 6, 1, 3, 5, 7};
1123
    return SIMDVec16{
1124
        _mm512_permutex2var_epi64(tmp, _mm512_load_si512((__m512i*)perm), tmp)};
1125
  }
1126
1127
  FJXL_INLINE SIMDVec16 ValToToken() const {
1128
    auto c16 = _mm512_set1_epi32(16);
1129
    auto c32 = _mm512_set1_epi32(32);
1130
    auto low16bit = _mm512_set1_epi32(0x0000FFFF);
1131
    auto lzhi =
1132
        _mm512_sub_epi32(c16, _mm512_min_epu32(c16, _mm512_lzcnt_epi32(vec)));
1133
    auto lzlo = _mm512_sub_epi32(
1134
        c32, _mm512_lzcnt_epi32(_mm512_and_si512(low16bit, vec)));
1135
    return SIMDVec16{_mm512_or_si512(lzlo, _mm512_slli_epi32(lzhi, 16))};
1136
  }
1137
1138
  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
1139
    return SIMDVec16{_mm512_subs_epu16(vec, to_subtract.vec)};
1140
  }
1141
  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
1142
    return SIMDVec16{_mm512_sub_epi16(vec, to_subtract.vec)};
1143
  }
1144
  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
1145
    return SIMDVec16{_mm512_add_epi16(vec, oth.vec)};
1146
  }
1147
  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
1148
    return SIMDVec16{_mm512_min_epu16(vec, oth.vec)};
1149
  }
1150
  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
1151
    return Mask16{_mm512_cmpeq_epi16_mask(vec, oth.vec)};
1152
  }
1153
  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
1154
    return Mask16{_mm512_cmpgt_epi16_mask(vec, oth.vec)};
1155
  }
1156
  FJXL_INLINE SIMDVec16 Pow2() const {
1157
    return SIMDVec16{_mm512_sllv_epi16(_mm512_set1_epi16(1), vec)};
1158
  }
1159
  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
1160
    return SIMDVec16{_mm512_or_si512(vec, oth.vec)};
1161
  }
1162
  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
1163
    return SIMDVec16{_mm512_xor_si512(vec, oth.vec)};
1164
  }
1165
  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
1166
    return SIMDVec16{_mm512_and_si512(vec, oth.vec)};
1167
  }
1168
  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
1169
    return SIMDVec16{_mm512_srai_epi16(_mm512_add_epi16(vec, oth.vec), 1)};
1170
  }
1171
  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
1172
    return SIMDVec16{_mm512_or_si512(vec, _mm512_set1_epi16(0xFF00))};
1173
  }
1174
  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
1175
    return SIMDVec16{_mm512_shuffle_epi8(
1176
        _mm512_broadcast_i32x4(_mm_loadu_si128((__m128i*)table)), vec)};
1177
  }
1178
  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
1179
    auto lo = _mm512_unpacklo_epi16(low.vec, vec);
1180
    auto hi = _mm512_unpackhi_epi16(low.vec, vec);
1181
    alignas(64) uint64_t perm1[8] = {0, 1, 8, 9, 2, 3, 10, 11};
1182
    alignas(64) uint64_t perm2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
1183
    return {SIMDVec16{_mm512_permutex2var_epi64(
1184
                lo, _mm512_load_si512((__m512i*)perm1), hi)},
1185
            SIMDVec16{_mm512_permutex2var_epi64(
1186
                lo, _mm512_load_si512((__m512i*)perm2), hi)}};
1187
  }
1188
  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
1189
    auto lo = _mm512_unpacklo_epi16(vec, _mm512_setzero_si512());
1190
    auto hi = _mm512_unpackhi_epi16(vec, _mm512_setzero_si512());
1191
    alignas(64) uint64_t perm1[8] = {0, 1, 8, 9, 2, 3, 10, 11};
1192
    alignas(64) uint64_t perm2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
1193
    return {SIMDVec32{_mm512_permutex2var_epi64(
1194
                lo, _mm512_load_si512((__m512i*)perm1), hi)},
1195
            SIMDVec32{_mm512_permutex2var_epi64(
1196
                lo, _mm512_load_si512((__m512i*)perm2), hi)}};
1197
  }
1198
  template <size_t i>
1199
  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
1200
    return SIMDVec16{_mm512_srai_epi16(vec, i)};
1201
  }
1202
1203
  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
1204
    __m256i bytes = _mm256_loadu_si256((__m256i*)data);
1205
    return {SIMDVec16{_mm512_cvtepu8_epi16(bytes)}};
1206
  }
1207
  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
1208
    return {Load((const uint16_t*)data)};
1209
  }
1210
1211
  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
1212
    __m512i bytes = _mm512_loadu_si512((__m512i*)data);
1213
    __m512i gray = _mm512_and_si512(bytes, _mm512_set1_epi16(0xFF));
1214
    __m512i alpha = _mm512_srli_epi16(bytes, 8);
1215
    return {SIMDVec16{gray}, SIMDVec16{alpha}};
1216
  }
1217
  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
1218
    __m512i bytes1 = _mm512_loadu_si512((__m512i*)data);
1219
    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 64));
1220
    __m512i g_mask = _mm512_set1_epi32(0xFFFF);
1221
    __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
1222
    __m512i g = _mm512_permutexvar_epi64(
1223
        permuteidx, _mm512_packus_epi32(_mm512_and_si512(bytes1, g_mask),
1224
                                        _mm512_and_si512(bytes2, g_mask)));
1225
    __m512i a = _mm512_permutexvar_epi64(
1226
        permuteidx, _mm512_packus_epi32(_mm512_srli_epi32(bytes1, 16),
1227
                                        _mm512_srli_epi32(bytes2, 16)));
1228
    return {SIMDVec16{g}, SIMDVec16{a}};
1229
  }
1230
1231
  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
1232
    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
1233
    __m512i bytes1 =
1234
        _mm512_zextsi256_si512(_mm256_loadu_si256((__m256i*)(data + 64)));
1235
1236
    // 0x7A = element of upper half of second vector = 0 after lookup; still in
1237
    // the upper half once we add 1 or 2.
1238
    uint8_t z = 0x7A;
1239
    __m512i ridx =
1240
        _mm512_set_epi8(z, 93, z, 90, z, 87, z, 84, z, 81, z, 78, z, 75, z, 72,
1241
                        z, 69, z, 66, z, 63, z, 60, z, 57, z, 54, z, 51, z, 48,
1242
                        z, 45, z, 42, z, 39, z, 36, z, 33, z, 30, z, 27, z, 24,
1243
                        z, 21, z, 18, z, 15, z, 12, z, 9, z, 6, z, 3, z, 0);
1244
    __m512i gidx = _mm512_add_epi8(ridx, _mm512_set1_epi8(1));
1245
    __m512i bidx = _mm512_add_epi8(gidx, _mm512_set1_epi8(1));
1246
    __m512i r = _mm512_permutex2var_epi8(bytes0, ridx, bytes1);
1247
    __m512i g = _mm512_permutex2var_epi8(bytes0, gidx, bytes1);
1248
    __m512i b = _mm512_permutex2var_epi8(bytes0, bidx, bytes1);
1249
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}};
1250
  }
1251
  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
1252
    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
1253
    __m512i bytes1 = _mm512_loadu_si512((__m512i*)(data + 64));
1254
    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 128));
1255
1256
    __m512i ridx_lo = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 63, 60, 57,
1257
                                       54, 51, 48, 45, 42, 39, 36, 33, 30, 27,
1258
                                       24, 21, 18, 15, 12, 9, 6, 3, 0);
1259
    // -1 is such that when adding 1 or 2, we get the correct index for
1260
    // green/blue.
1261
    __m512i ridx_hi =
1262
        _mm512_set_epi16(29, 26, 23, 20, 17, 14, 11, 8, 5, 2, -1, 0, 0, 0, 0, 0,
1263
                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
1264
    __m512i gidx_lo = _mm512_add_epi16(ridx_lo, _mm512_set1_epi16(1));
1265
    __m512i gidx_hi = _mm512_add_epi16(ridx_hi, _mm512_set1_epi16(1));
1266
    __m512i bidx_lo = _mm512_add_epi16(gidx_lo, _mm512_set1_epi16(1));
1267
    __m512i bidx_hi = _mm512_add_epi16(gidx_hi, _mm512_set1_epi16(1));
1268
1269
    __mmask32 rmask = _cvtu32_mask32(0b11111111110000000000000000000000);
1270
    __mmask32 gbmask = _cvtu32_mask32(0b11111111111000000000000000000000);
1271
1272
    __m512i rlo = _mm512_permutex2var_epi16(bytes0, ridx_lo, bytes1);
1273
    __m512i glo = _mm512_permutex2var_epi16(bytes0, gidx_lo, bytes1);
1274
    __m512i blo = _mm512_permutex2var_epi16(bytes0, bidx_lo, bytes1);
1275
    __m512i r = _mm512_mask_permutexvar_epi16(rlo, rmask, ridx_hi, bytes2);
1276
    __m512i g = _mm512_mask_permutexvar_epi16(glo, gbmask, gidx_hi, bytes2);
1277
    __m512i b = _mm512_mask_permutexvar_epi16(blo, gbmask, bidx_hi, bytes2);
1278
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}};
1279
  }
1280
1281
  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
1282
    __m512i bytes1 = _mm512_loadu_si512((__m512i*)data);
1283
    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 64));
1284
    __m512i rg_mask = _mm512_set1_epi32(0xFFFF);
1285
    __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
1286
    __m512i rg = _mm512_permutexvar_epi64(
1287
        permuteidx, _mm512_packus_epi32(_mm512_and_si512(bytes1, rg_mask),
1288
                                        _mm512_and_si512(bytes2, rg_mask)));
1289
    __m512i b_a = _mm512_permutexvar_epi64(
1290
        permuteidx, _mm512_packus_epi32(_mm512_srli_epi32(bytes1, 16),
1291
                                        _mm512_srli_epi32(bytes2, 16)));
1292
    __m512i r = _mm512_and_si512(rg, _mm512_set1_epi16(0xFF));
1293
    __m512i g = _mm512_srli_epi16(rg, 8);
1294
    __m512i b = _mm512_and_si512(b_a, _mm512_set1_epi16(0xFF));
1295
    __m512i a = _mm512_srli_epi16(b_a, 8);
1296
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
1297
  }
1298
  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
1299
    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
1300
    __m512i bytes1 = _mm512_loadu_si512((__m512i*)(data + 64));
1301
    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 128));
1302
    __m512i bytes3 = _mm512_loadu_si512((__m512i*)(data + 192));
1303
1304
    auto pack32 = [](__m512i a, __m512i b) {
1305
      __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
1306
      return _mm512_permutexvar_epi64(permuteidx, _mm512_packus_epi32(a, b));
1307
    };
1308
    auto packlow32 = [&pack32](__m512i a, __m512i b) {
1309
      __m512i mask = _mm512_set1_epi32(0xFFFF);
1310
      return pack32(_mm512_and_si512(a, mask), _mm512_and_si512(b, mask));
1311
    };
1312
    auto packhi32 = [&pack32](__m512i a, __m512i b) {
1313
      return pack32(_mm512_srli_epi32(a, 16), _mm512_srli_epi32(b, 16));
1314
    };
1315
1316
    __m512i rb0 = packlow32(bytes0, bytes1);
1317
    __m512i rb1 = packlow32(bytes2, bytes3);
1318
    __m512i ga0 = packhi32(bytes0, bytes1);
1319
    __m512i ga1 = packhi32(bytes2, bytes3);
1320
1321
    __m512i r = packlow32(rb0, rb1);
1322
    __m512i g = packlow32(ga0, ga1);
1323
    __m512i b = packhi32(rb0, rb1);
1324
    __m512i a = packhi32(ga0, ga1);
1325
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
1326
  }
1327
1328
  void SwapEndian() {
1329
    auto indices = _mm512_broadcast_i32x4(
1330
        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
1331
    vec = _mm512_shuffle_epi8(vec, indices);
1332
  }
1333
};
1334
1335
// Per lane: `if_true` where the mask bit is set, `if_false` elsewhere.
SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
                             const SIMDVec16& if_false) {
  const __m512i selected =
      _mm512_mask_blend_epi16(mask, if_false.vec, if_true.vec);
  return SIMDVec16{selected};
}
1339
1340
// Per lane: `if_true` where the mask bit is set, `if_false` elsewhere.
SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
                             const SIMDVec32& if_false) {
  const __m512i selected =
      _mm512_mask_blend_epi32(mask, if_false.vec, if_true.vec);
  return SIMDVec32{selected};
}
1344
1345
struct Bits64 {
1346
  static constexpr size_t kLanes = 8;
1347
1348
  __m512i nbits;
1349
  __m512i bits;
1350
1351
  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
1352
    _mm512_storeu_si512((__m512i*)nbits_out, nbits);
1353
    _mm512_storeu_si512((__m512i*)bits_out, bits);
1354
  }
1355
};
1356
1357
struct Bits32 {
1358
  __m512i nbits;
1359
  __m512i bits;
1360
1361
  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
1362
    return Bits32{nbits.vec, bits.vec};
1363
  }
1364
1365
  Bits64 Merge() const {
1366
    auto nbits_hi32 = _mm512_srli_epi64(nbits, 32);
1367
    auto nbits_lo32 = _mm512_and_si512(nbits, _mm512_set1_epi64(0xFFFFFFFF));
1368
    auto bits_hi32 = _mm512_srli_epi64(bits, 32);
1369
    auto bits_lo32 = _mm512_and_si512(bits, _mm512_set1_epi64(0xFFFFFFFF));
1370
1371
    auto nbits64 = _mm512_add_epi64(nbits_hi32, nbits_lo32);
1372
    auto bits64 =
1373
        _mm512_or_si512(_mm512_sllv_epi64(bits_hi32, nbits_lo32), bits_lo32);
1374
    return Bits64{nbits64, bits64};
1375
  }
1376
1377
  void Interleave(const Bits32& low) {
1378
    bits = _mm512_or_si512(_mm512_sllv_epi32(bits, low.nbits), low.bits);
1379
    nbits = _mm512_add_epi32(nbits, low.nbits);
1380
  }
1381
1382
  void ClipTo(size_t n) {
1383
    n = std::min<size_t>(n, 16);
1384
    constexpr uint32_t kMask[32] = {
1385
        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
1386
        ~0u, ~0u, ~0u, ~0u, ~0u, 0,   0,   0,   0,   0,   0,
1387
        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
1388
    };
1389
    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 16 - n));
1390
    nbits = _mm512_and_si512(mask, nbits);
1391
    bits = _mm512_and_si512(mask, bits);
1392
  }
1393
  void Skip(size_t n) {
1394
    n = std::min<size_t>(n, 16);
1395
    constexpr uint32_t kMask[32] = {
1396
        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
1397
        0,   0,   0,   0,   0,   ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
1398
        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
1399
    };
1400
    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 16 - n));
1401
    nbits = _mm512_and_si512(mask, nbits);
1402
    bits = _mm512_and_si512(mask, bits);
1403
  }
1404
};
1405
1406
struct Bits16 {
1407
  __m512i nbits;
1408
  __m512i bits;
1409
1410
  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
1411
    return Bits16{nbits.vec, bits.vec};
1412
  }
1413
1414
  Bits32 Merge() const {
1415
    auto nbits_hi16 = _mm512_srli_epi32(nbits, 16);
1416
    auto nbits_lo16 = _mm512_and_si512(nbits, _mm512_set1_epi32(0xFFFF));
1417
    auto bits_hi16 = _mm512_srli_epi32(bits, 16);
1418
    auto bits_lo16 = _mm512_and_si512(bits, _mm512_set1_epi32(0xFFFF));
1419
1420
    auto nbits32 = _mm512_add_epi32(nbits_hi16, nbits_lo16);
1421
    auto bits32 =
1422
        _mm512_or_si512(_mm512_sllv_epi32(bits_hi16, nbits_lo16), bits_lo16);
1423
    return Bits32{nbits32, bits32};
1424
  }
1425
1426
  void Interleave(const Bits16& low) {
1427
    bits = _mm512_or_si512(_mm512_sllv_epi16(bits, low.nbits), low.bits);
1428
    nbits = _mm512_add_epi16(nbits, low.nbits);
1429
  }
1430
1431
  void ClipTo(size_t n) {
1432
    n = std::min<size_t>(n, 32);
1433
    constexpr uint16_t kMask[64] = {
1434
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1435
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1436
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1437
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1438
        0,      0,      0,      0,      0,      0,      0,      0,
1439
        0,      0,      0,      0,      0,      0,      0,      0,
1440
        0,      0,      0,      0,      0,      0,      0,      0,
1441
        0,      0,      0,      0,      0,      0,      0,      0,
1442
    };
1443
    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 32 - n));
1444
    nbits = _mm512_and_si512(mask, nbits);
1445
    bits = _mm512_and_si512(mask, bits);
1446
  }
1447
  void Skip(size_t n) {
1448
    n = std::min<size_t>(n, 32);
1449
    constexpr uint16_t kMask[64] = {
1450
        0,      0,      0,      0,      0,      0,      0,      0,
1451
        0,      0,      0,      0,      0,      0,      0,      0,
1452
        0,      0,      0,      0,      0,      0,      0,      0,
1453
        0,      0,      0,      0,      0,      0,      0,      0,
1454
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1455
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1456
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1457
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1458
    };
1459
    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 32 - n));
1460
    nbits = _mm512_and_si512(mask, nbits);
1461
    bits = _mm512_and_si512(mask, bits);
1462
  }
1463
};
1464
1465
#endif
1466
1467
#ifdef FJXL_AVX2
1468
#define FJXL_GENERIC_SIMD
1469
1470
struct SIMDVec32;
1471
1472
struct Mask32 {
1473
  __m256i mask;
1474
  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
1475
0
  size_t CountPrefix() const {
1476
0
    return CtzNonZero(~static_cast<uint64_t>(
1477
0
        static_cast<uint8_t>(_mm256_movemask_ps(_mm256_castsi256_ps(mask)))));
1478
0
  }
1479
};
1480
1481
struct SIMDVec32 {
1482
  __m256i vec;
1483
1484
  static constexpr size_t kLanes = 8;
1485
1486
0
  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
1487
0
    return SIMDVec32{_mm256_loadu_si256((__m256i*)data)};
1488
0
  }
1489
0
  FJXL_INLINE void Store(uint32_t* data) {
1490
0
    _mm256_storeu_si256((__m256i*)data, vec);
1491
0
  }
1492
0
  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
1493
0
    return SIMDVec32{_mm256_set1_epi32(v)};
1494
0
  }
1495
0
  FJXL_INLINE SIMDVec32 ValToToken() const {
1496
0
    auto f32 = _mm256_castps_si256(_mm256_cvtepi32_ps(vec));
1497
0
    return SIMDVec32{_mm256_max_epi32(
1498
0
        _mm256_setzero_si256(),
1499
0
        _mm256_sub_epi32(_mm256_srli_epi32(f32, 23), _mm256_set1_epi32(126)))};
1500
0
  }
1501
0
  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
1502
0
    return SIMDVec32{_mm256_sub_epi32(_mm256_max_epu32(vec, to_subtract.vec),
1503
0
                                      to_subtract.vec)};
1504
0
  }
1505
0
  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
1506
0
    return SIMDVec32{_mm256_sub_epi32(vec, to_subtract.vec)};
1507
0
  }
1508
0
  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
1509
0
    return SIMDVec32{_mm256_add_epi32(vec, oth.vec)};
1510
0
  }
1511
0
  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
1512
0
    return SIMDVec32{_mm256_xor_si256(vec, oth.vec)};
1513
0
  }
1514
0
  FJXL_INLINE SIMDVec32 Pow2() const {
1515
0
    return SIMDVec32{_mm256_sllv_epi32(_mm256_set1_epi32(1), vec)};
1516
0
  }
1517
0
  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
1518
0
    return Mask32{_mm256_cmpeq_epi32(vec, oth.vec)};
1519
0
  }
1520
0
  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
1521
0
    return Mask32{_mm256_cmpgt_epi32(vec, oth.vec)};
1522
0
  }
1523
  template <size_t i>
1524
0
  FJXL_INLINE SIMDVec32 SignedShiftRight() const {
1525
0
    return SIMDVec32{_mm256_srai_epi32(vec, i)};
1526
0
  }
1527
};
1528
1529
struct SIMDVec16;
1530
1531
struct Mask16 {
1532
  __m256i mask;
1533
  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
1534
0
  Mask16 And(const Mask16& oth) const {
1535
0
    return Mask16{_mm256_and_si256(mask, oth.mask)};
1536
0
  }
1537
0
  size_t CountPrefix() const {
1538
0
    return CtzNonZero(~static_cast<uint64_t>(
1539
0
               static_cast<uint32_t>(_mm256_movemask_epi8(mask)))) /
1540
0
           2;
1541
0
  }
1542
};
1543
1544
struct SIMDVec16 {
1545
  __m256i vec;
1546
1547
  static constexpr size_t kLanes = 16;
1548
1549
0
  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
1550
0
    return SIMDVec16{_mm256_loadu_si256((__m256i*)data)};
1551
0
  }
1552
0
  FJXL_INLINE void Store(uint16_t* data) {
1553
0
    _mm256_storeu_si256((__m256i*)data, vec);
1554
0
  }
1555
0
  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
1556
0
    return SIMDVec16{_mm256_set1_epi16(v)};
1557
0
  }
1558
  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
1559
0
                                         const SIMDVec32& hi) {
1560
0
    auto tmp = _mm256_packus_epi32(lo.vec, hi.vec);
1561
0
    return SIMDVec16{_mm256_permute4x64_epi64(tmp, 0b11011000)};
1562
0
  }
1563
1564
0
  FJXL_INLINE SIMDVec16 ValToToken() const {
1565
0
    auto nibble0 =
1566
0
        _mm256_or_si256(_mm256_and_si256(vec, _mm256_set1_epi16(0xF)),
1567
0
                        _mm256_set1_epi16(0xFF00));
1568
0
    auto nibble1 = _mm256_or_si256(
1569
0
        _mm256_and_si256(_mm256_srli_epi16(vec, 4), _mm256_set1_epi16(0xF)),
1570
0
        _mm256_set1_epi16(0xFF00));
1571
0
    auto nibble2 = _mm256_or_si256(
1572
0
        _mm256_and_si256(_mm256_srli_epi16(vec, 8), _mm256_set1_epi16(0xF)),
1573
0
        _mm256_set1_epi16(0xFF00));
1574
0
    auto nibble3 =
1575
0
        _mm256_or_si256(_mm256_srli_epi16(vec, 12), _mm256_set1_epi16(0xFF00));
1576
1577
0
    auto lut0 = _mm256_broadcastsi128_si256(
1578
0
        _mm_setr_epi8(0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4));
1579
0
    auto lut1 = _mm256_broadcastsi128_si256(
1580
0
        _mm_setr_epi8(0, 5, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8));
1581
0
    auto lut2 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
1582
0
        0, 9, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12));
1583
0
    auto lut3 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
1584
0
        0, 13, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16));
1585
1586
0
    auto token0 = _mm256_shuffle_epi8(lut0, nibble0);
1587
0
    auto token1 = _mm256_shuffle_epi8(lut1, nibble1);
1588
0
    auto token2 = _mm256_shuffle_epi8(lut2, nibble2);
1589
0
    auto token3 = _mm256_shuffle_epi8(lut3, nibble3);
1590
1591
0
    auto token = _mm256_max_epi16(_mm256_max_epi16(token0, token1),
1592
0
                                  _mm256_max_epi16(token2, token3));
1593
0
    return SIMDVec16{token};
1594
0
  }
1595
1596
0
  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
1597
0
    return SIMDVec16{_mm256_subs_epu16(vec, to_subtract.vec)};
1598
0
  }
1599
0
  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
1600
0
    return SIMDVec16{_mm256_sub_epi16(vec, to_subtract.vec)};
1601
0
  }
1602
0
  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
1603
0
    return SIMDVec16{_mm256_add_epi16(vec, oth.vec)};
1604
0
  }
1605
0
  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
1606
0
    return SIMDVec16{_mm256_min_epu16(vec, oth.vec)};
1607
0
  }
1608
0
  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
1609
0
    return Mask16{_mm256_cmpeq_epi16(vec, oth.vec)};
1610
0
  }
1611
0
  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
1612
0
    return Mask16{_mm256_cmpgt_epi16(vec, oth.vec)};
1613
0
  }
1614
0
  FJXL_INLINE SIMDVec16 Pow2() const {
1615
0
    auto pow2_lo_lut = _mm256_broadcastsi128_si256(
1616
0
        _mm_setr_epi8(1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6,
1617
0
                      1u << 7, 0, 0, 0, 0, 0, 0, 0, 0));
1618
0
    auto pow2_hi_lut = _mm256_broadcastsi128_si256(
1619
0
        _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1 << 0, 1 << 1, 1 << 2, 1 << 3,
1620
0
                      1 << 4, 1 << 5, 1 << 6, 1u << 7));
1621
1622
0
    auto masked = _mm256_or_si256(vec, _mm256_set1_epi16(0xFF00));
1623
1624
0
    auto pow2_lo = _mm256_shuffle_epi8(pow2_lo_lut, masked);
1625
0
    auto pow2_hi = _mm256_shuffle_epi8(pow2_hi_lut, masked);
1626
1627
0
    auto pow2 = _mm256_or_si256(_mm256_slli_epi16(pow2_hi, 8), pow2_lo);
1628
0
    return SIMDVec16{pow2};
1629
0
  }
1630
0
  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
1631
0
    return SIMDVec16{_mm256_or_si256(vec, oth.vec)};
1632
0
  }
1633
0
  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
1634
0
    return SIMDVec16{_mm256_xor_si256(vec, oth.vec)};
1635
0
  }
1636
0
  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
1637
0
    return SIMDVec16{_mm256_and_si256(vec, oth.vec)};
1638
0
  }
1639
0
  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
1640
0
    return SIMDVec16{_mm256_srai_epi16(_mm256_add_epi16(vec, oth.vec), 1)};
1641
0
  }
1642
0
  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
1643
0
    return SIMDVec16{_mm256_or_si256(vec, _mm256_set1_epi16(0xFF00))};
1644
0
  }
1645
0
  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
1646
0
    return SIMDVec16{_mm256_shuffle_epi8(
1647
0
        _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)table)), vec)};
1648
0
  }
1649
0
  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
1650
0
    auto v02 = _mm256_unpacklo_epi16(low.vec, vec);
1651
0
    auto v13 = _mm256_unpackhi_epi16(low.vec, vec);
1652
0
    return {SIMDVec16{_mm256_permute2x128_si256(v02, v13, 0x20)},
1653
0
            SIMDVec16{_mm256_permute2x128_si256(v02, v13, 0x31)}};
1654
0
  }
1655
0
  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
1656
0
    auto v02 = _mm256_unpacklo_epi16(vec, _mm256_setzero_si256());
1657
0
    auto v13 = _mm256_unpackhi_epi16(vec, _mm256_setzero_si256());
1658
0
    return {SIMDVec32{_mm256_permute2x128_si256(v02, v13, 0x20)},
1659
0
            SIMDVec32{_mm256_permute2x128_si256(v02, v13, 0x31)}};
1660
0
  }
1661
  template <size_t i>
1662
0
  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
1663
0
    return SIMDVec16{_mm256_srai_epi16(vec, i)};
1664
0
  }
1665
1666
0
  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
1667
0
    __m128i bytes = _mm_loadu_si128((__m128i*)data);
1668
0
    return {SIMDVec16{_mm256_cvtepu8_epi16(bytes)}};
1669
0
  }
1670
0
  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
1671
0
    return {Load((const uint16_t*)data)};
1672
0
  }
1673
1674
0
  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
1675
0
    __m256i bytes = _mm256_loadu_si256((__m256i*)data);
1676
0
    __m256i gray = _mm256_and_si256(bytes, _mm256_set1_epi16(0xFF));
1677
0
    __m256i alpha = _mm256_srli_epi16(bytes, 8);
1678
0
    return {SIMDVec16{gray}, SIMDVec16{alpha}};
1679
0
  }
1680
0
  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
1681
0
    __m256i bytes1 = _mm256_loadu_si256((__m256i*)data);
1682
0
    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 32));
1683
0
    __m256i g_mask = _mm256_set1_epi32(0xFFFF);
1684
0
    __m256i g = _mm256_permute4x64_epi64(
1685
0
        _mm256_packus_epi32(_mm256_and_si256(bytes1, g_mask),
1686
0
                            _mm256_and_si256(bytes2, g_mask)),
1687
0
        0b11011000);
1688
0
    __m256i a = _mm256_permute4x64_epi64(
1689
0
        _mm256_packus_epi32(_mm256_srli_epi32(bytes1, 16),
1690
0
                            _mm256_srli_epi32(bytes2, 16)),
1691
0
        0b11011000);
1692
0
    return {SIMDVec16{g}, SIMDVec16{a}};
1693
0
  }
1694
1695
0
  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
1696
0
    __m128i bytes0 = _mm_loadu_si128((__m128i*)data);
1697
0
    __m128i bytes1 = _mm_loadu_si128((__m128i*)(data + 16));
1698
0
    __m128i bytes2 = _mm_loadu_si128((__m128i*)(data + 32));
1699
1700
0
    __m128i idx =
1701
0
        _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
1702
1703
0
    __m128i r6b5g5_0 = _mm_shuffle_epi8(bytes0, idx);
1704
0
    __m128i g6r5b5_1 = _mm_shuffle_epi8(bytes1, idx);
1705
0
    __m128i b6g5r5_2 = _mm_shuffle_epi8(bytes2, idx);
1706
1707
0
    __m128i mask010 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF,
1708
0
                                    0xFF, 0, 0, 0, 0, 0);
1709
0
    __m128i mask001 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF,
1710
0
                                    0xFF, 0xFF, 0xFF);
1711
1712
0
    __m128i b2g2b1 = _mm_blendv_epi8(b6g5r5_2, g6r5b5_1, mask001);
1713
0
    __m128i b2b0b1 = _mm_blendv_epi8(b2g2b1, r6b5g5_0, mask010);
1714
1715
0
    __m128i r0r1b1 = _mm_blendv_epi8(r6b5g5_0, g6r5b5_1, mask010);
1716
0
    __m128i r0r1r2 = _mm_blendv_epi8(r0r1b1, b6g5r5_2, mask001);
1717
1718
0
    __m128i g1r1g0 = _mm_blendv_epi8(g6r5b5_1, r6b5g5_0, mask001);
1719
0
    __m128i g1g2g0 = _mm_blendv_epi8(g1r1g0, b6g5r5_2, mask010);
1720
1721
0
    __m128i g0g1g2 = _mm_alignr_epi8(g1g2g0, g1g2g0, 11);
1722
0
    __m128i b0b1b2 = _mm_alignr_epi8(b2b0b1, b2b0b1, 6);
1723
1724
0
    return {SIMDVec16{_mm256_cvtepu8_epi16(r0r1r2)},
1725
0
            SIMDVec16{_mm256_cvtepu8_epi16(g0g1g2)},
1726
0
            SIMDVec16{_mm256_cvtepu8_epi16(b0b1b2)}};
1727
0
  }
1728
0
  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
1729
0
    auto load_and_split_lohi = [](const unsigned char* data) {
1730
      // LHLHLH...
1731
0
      __m256i bytes = _mm256_loadu_si256((__m256i*)data);
1732
      // L0L0L0...
1733
0
      __m256i lo = _mm256_and_si256(bytes, _mm256_set1_epi16(0xFF));
1734
      // H0H0H0...
1735
0
      __m256i hi = _mm256_srli_epi16(bytes, 8);
1736
      // LLLLLLLLHHHHHHHHLLLLLLLLHHHHHHHH
1737
0
      __m256i packed = _mm256_packus_epi16(lo, hi);
1738
0
      return _mm256_permute4x64_epi64(packed, 0b11011000);
1739
0
    };
1740
0
    __m256i bytes0 = load_and_split_lohi(data);
1741
0
    __m256i bytes1 = load_and_split_lohi(data + 32);
1742
0
    __m256i bytes2 = load_and_split_lohi(data + 64);
1743
1744
0
    __m256i idx = _mm256_broadcastsi128_si256(
1745
0
        _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13));
1746
1747
0
    __m256i r6b5g5_0 = _mm256_shuffle_epi8(bytes0, idx);
1748
0
    __m256i g6r5b5_1 = _mm256_shuffle_epi8(bytes1, idx);
1749
0
    __m256i b6g5r5_2 = _mm256_shuffle_epi8(bytes2, idx);
1750
1751
0
    __m256i mask010 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
1752
0
        0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0));
1753
0
    __m256i mask001 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
1754
0
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF));
1755
1756
0
    __m256i b2g2b1 = _mm256_blendv_epi8(b6g5r5_2, g6r5b5_1, mask001);
1757
0
    __m256i b2b0b1 = _mm256_blendv_epi8(b2g2b1, r6b5g5_0, mask010);
1758
1759
0
    __m256i r0r1b1 = _mm256_blendv_epi8(r6b5g5_0, g6r5b5_1, mask010);
1760
0
    __m256i r0r1r2 = _mm256_blendv_epi8(r0r1b1, b6g5r5_2, mask001);
1761
1762
0
    __m256i g1r1g0 = _mm256_blendv_epi8(g6r5b5_1, r6b5g5_0, mask001);
1763
0
    __m256i g1g2g0 = _mm256_blendv_epi8(g1r1g0, b6g5r5_2, mask010);
1764
1765
0
    __m256i g0g1g2 = _mm256_alignr_epi8(g1g2g0, g1g2g0, 11);
1766
0
    __m256i b0b1b2 = _mm256_alignr_epi8(b2b0b1, b2b0b1, 6);
1767
1768
    // Now r0r1r2, g0g1g2, b0b1b2 have the low bytes of the RGB pixels in their
1769
    // lower half, and the high bytes in their upper half.
1770
1771
0
    auto combine_low_hi = [](__m256i v) {
1772
0
      __m128i low = _mm256_extracti128_si256(v, 0);
1773
0
      __m128i hi = _mm256_extracti128_si256(v, 1);
1774
0
      __m256i low16 = _mm256_cvtepu8_epi16(low);
1775
0
      __m256i hi16 = _mm256_cvtepu8_epi16(hi);
1776
0
      return _mm256_or_si256(_mm256_slli_epi16(hi16, 8), low16);
1777
0
    };
1778
1779
0
    return {SIMDVec16{combine_low_hi(r0r1r2)},
1780
0
            SIMDVec16{combine_low_hi(g0g1g2)},
1781
0
            SIMDVec16{combine_low_hi(b0b1b2)}};
1782
0
  }
1783
1784
0
  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
1785
0
    __m256i bytes1 = _mm256_loadu_si256((__m256i*)data);
1786
0
    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 32));
1787
0
    __m256i rg_mask = _mm256_set1_epi32(0xFFFF);
1788
0
    __m256i rg = _mm256_permute4x64_epi64(
1789
0
        _mm256_packus_epi32(_mm256_and_si256(bytes1, rg_mask),
1790
0
                            _mm256_and_si256(bytes2, rg_mask)),
1791
0
        0b11011000);
1792
0
    __m256i b_a = _mm256_permute4x64_epi64(
1793
0
        _mm256_packus_epi32(_mm256_srli_epi32(bytes1, 16),
1794
0
                            _mm256_srli_epi32(bytes2, 16)),
1795
0
        0b11011000);
1796
0
    __m256i r = _mm256_and_si256(rg, _mm256_set1_epi16(0xFF));
1797
0
    __m256i g = _mm256_srli_epi16(rg, 8);
1798
0
    __m256i b = _mm256_and_si256(b_a, _mm256_set1_epi16(0xFF));
1799
0
    __m256i a = _mm256_srli_epi16(b_a, 8);
1800
0
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
1801
0
  }
1802
0
  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
1803
0
    __m256i bytes0 = _mm256_loadu_si256((__m256i*)data);
1804
0
    __m256i bytes1 = _mm256_loadu_si256((__m256i*)(data + 32));
1805
0
    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 64));
1806
0
    __m256i bytes3 = _mm256_loadu_si256((__m256i*)(data + 96));
1807
1808
0
    auto pack32 = [](__m256i a, __m256i b) {
1809
0
      return _mm256_permute4x64_epi64(_mm256_packus_epi32(a, b), 0b11011000);
1810
0
    };
1811
0
    auto packlow32 = [&pack32](__m256i a, __m256i b) {
1812
0
      __m256i mask = _mm256_set1_epi32(0xFFFF);
1813
0
      return pack32(_mm256_and_si256(a, mask), _mm256_and_si256(b, mask));
1814
0
    };
1815
0
    auto packhi32 = [&pack32](__m256i a, __m256i b) {
1816
0
      return pack32(_mm256_srli_epi32(a, 16), _mm256_srli_epi32(b, 16));
1817
0
    };
1818
1819
0
    __m256i rb0 = packlow32(bytes0, bytes1);
1820
0
    __m256i rb1 = packlow32(bytes2, bytes3);
1821
0
    __m256i ga0 = packhi32(bytes0, bytes1);
1822
0
    __m256i ga1 = packhi32(bytes2, bytes3);
1823
1824
0
    __m256i r = packlow32(rb0, rb1);
1825
0
    __m256i g = packlow32(ga0, ga1);
1826
0
    __m256i b = packhi32(rb0, rb1);
1827
0
    __m256i a = packhi32(ga0, ga1);
1828
0
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
1829
0
  }
1830
1831
0
  void SwapEndian() {
1832
0
    auto indices = _mm256_broadcastsi128_si256(
1833
0
        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
1834
0
    vec = _mm256_shuffle_epi8(vec, indices);
1835
0
  }
1836
};
1837
1838
// Lane-wise select: takes `if_true` where the mask is set, `if_false`
// elsewhere. The blend is byte-granular and keys on the top bit of each mask
// byte.
SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
                             const SIMDVec16& if_false) {
  const __m256i selected =
      _mm256_blendv_epi8(if_false.vec, if_true.vec, mask);
  return SIMDVec16{selected};
}
1842
1843
// Lane-wise select: takes `if_true` where the mask is set, `if_false`
// elsewhere. The blend is byte-granular and keys on the top bit of each mask
// byte.
SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
                             const SIMDVec32& if_false) {
  const __m256i selected =
      _mm256_blendv_epi8(if_false.vec, if_true.vec, mask);
  return SIMDVec32{selected};
}
1847
1848
struct Bits64 {
1849
  static constexpr size_t kLanes = 4;
1850
1851
  __m256i nbits;
1852
  __m256i bits;
1853
1854
0
  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
1855
0
    _mm256_storeu_si256((__m256i*)nbits_out, nbits);
1856
0
    _mm256_storeu_si256((__m256i*)bits_out, bits);
1857
0
  }
1858
};
1859
1860
struct Bits32 {
1861
  __m256i nbits;
1862
  __m256i bits;
1863
1864
0
  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
1865
0
    return Bits32{nbits.vec, bits.vec};
1866
0
  }
1867
1868
0
  Bits64 Merge() const {
1869
0
    auto nbits_hi32 = _mm256_srli_epi64(nbits, 32);
1870
0
    auto nbits_lo32 = _mm256_and_si256(nbits, _mm256_set1_epi64x(0xFFFFFFFF));
1871
0
    auto bits_hi32 = _mm256_srli_epi64(bits, 32);
1872
0
    auto bits_lo32 = _mm256_and_si256(bits, _mm256_set1_epi64x(0xFFFFFFFF));
1873
1874
0
    auto nbits64 = _mm256_add_epi64(nbits_hi32, nbits_lo32);
1875
0
    auto bits64 =
1876
0
        _mm256_or_si256(_mm256_sllv_epi64(bits_hi32, nbits_lo32), bits_lo32);
1877
0
    return Bits64{nbits64, bits64};
1878
0
  }
1879
1880
0
  void Interleave(const Bits32& low) {
1881
0
    bits = _mm256_or_si256(_mm256_sllv_epi32(bits, low.nbits), low.bits);
1882
0
    nbits = _mm256_add_epi32(nbits, low.nbits);
1883
0
  }
1884
1885
0
  void ClipTo(size_t n) {
1886
0
    n = std::min<size_t>(n, 8);
1887
0
    constexpr uint32_t kMask[16] = {
1888
0
        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0, 0, 0, 0, 0,
1889
0
    };
1890
0
    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 8 - n));
1891
0
    nbits = _mm256_and_si256(mask, nbits);
1892
0
    bits = _mm256_and_si256(mask, bits);
1893
0
  }
1894
0
  void Skip(size_t n) {
1895
0
    n = std::min<size_t>(n, 8);
1896
0
    constexpr uint32_t kMask[16] = {
1897
0
        0, 0, 0, 0, 0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
1898
0
    };
1899
0
    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 8 - n));
1900
0
    nbits = _mm256_and_si256(mask, nbits);
1901
0
    bits = _mm256_and_si256(mask, bits);
1902
0
  }
1903
};
1904
1905
struct Bits16 {
1906
  __m256i nbits;
1907
  __m256i bits;
1908
1909
0
  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
1910
0
    return Bits16{nbits.vec, bits.vec};
1911
0
  }
1912
1913
0
  Bits32 Merge() const {
1914
0
    auto nbits_hi16 = _mm256_srli_epi32(nbits, 16);
1915
0
    auto nbits_lo16 = _mm256_and_si256(nbits, _mm256_set1_epi32(0xFFFF));
1916
0
    auto bits_hi16 = _mm256_srli_epi32(bits, 16);
1917
0
    auto bits_lo16 = _mm256_and_si256(bits, _mm256_set1_epi32(0xFFFF));
1918
1919
0
    auto nbits32 = _mm256_add_epi32(nbits_hi16, nbits_lo16);
1920
0
    auto bits32 =
1921
0
        _mm256_or_si256(_mm256_sllv_epi32(bits_hi16, nbits_lo16), bits_lo16);
1922
0
    return Bits32{nbits32, bits32};
1923
0
  }
1924
1925
0
  void Interleave(const Bits16& low) {
1926
0
    auto pow2_lo_lut = _mm256_broadcastsi128_si256(
1927
0
        _mm_setr_epi8(1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6,
1928
0
                      1u << 7, 0, 0, 0, 0, 0, 0, 0, 0));
1929
0
    auto low_nbits_masked =
1930
0
        _mm256_or_si256(low.nbits, _mm256_set1_epi16(0xFF00));
1931
1932
0
    auto bits_shifted = _mm256_mullo_epi16(
1933
0
        bits, _mm256_shuffle_epi8(pow2_lo_lut, low_nbits_masked));
1934
1935
0
    nbits = _mm256_add_epi16(nbits, low.nbits);
1936
0
    bits = _mm256_or_si256(bits_shifted, low.bits);
1937
0
  }
1938
1939
0
  void ClipTo(size_t n) {
1940
0
    n = std::min<size_t>(n, 16);
1941
0
    constexpr uint16_t kMask[32] = {
1942
0
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1943
0
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1944
0
        0,      0,      0,      0,      0,      0,      0,      0,
1945
0
        0,      0,      0,      0,      0,      0,      0,      0,
1946
0
    };
1947
0
    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 16 - n));
1948
0
    nbits = _mm256_and_si256(mask, nbits);
1949
0
    bits = _mm256_and_si256(mask, bits);
1950
0
  }
1951
1952
0
  void Skip(size_t n) {
1953
0
    n = std::min<size_t>(n, 16);
1954
0
    constexpr uint16_t kMask[32] = {
1955
0
        0,      0,      0,      0,      0,      0,      0,      0,
1956
0
        0,      0,      0,      0,      0,      0,      0,      0,
1957
0
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1958
0
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1959
0
    };
1960
0
    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 16 - n));
1961
0
    nbits = _mm256_and_si256(mask, nbits);
1962
0
    bits = _mm256_and_si256(mask, bits);
1963
0
  }
1964
};
1965
1966
#endif
1967
1968
#ifdef FJXL_NEON
1969
#define FJXL_GENERIC_SIMD
1970
1971
struct SIMDVec32;
1972
1973
// Lane mask for the 32-bit NEON vectors: one 32-bit lane per element, as
// produced by the comparison members of SIMDVec32.
struct Mask32 {
  uint32x4_t mask;
  // Lane-wise select between the two vectors; defined after SIMDVec32.
  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
  // Lane-wise conjunction of two masks.
  Mask32 And(const Mask32& oth) const {
    const uint32x4_t both = vandq_u32(mask, oth.mask);
    return Mask32{both};
  }
  // Number of leading lanes (starting at lane 0) whose mask is set.
  size_t CountPrefix() const {
    // Set lanes are replaced by the lane count (4) and unset lanes by their
    // own index, so the horizontal minimum is the index of the first unset
    // lane, or 4 when every lane is set.
    const uint32_t lane_index[4] = {0, 1, 2, 3};
    const uint32x4_t when_set = vdupq_n_u32(4);
    const uint32x4_t when_unset = vld1q_u32(lane_index);
    const uint32x4_t candidates = vbslq_u32(mask, when_set, when_unset);
    return vminvq_u32(candidates);
  }
};
1986
1987
// NEON wrapper around four 32-bit lanes held in a single uint32x4_t.
struct SIMDVec32 {
  uint32x4_t vec;

  static constexpr size_t kLanes = 4;

  // Loads kLanes consecutive values starting at `data`.
  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
    return SIMDVec32{vld1q_u32(data)};
  }
  // Stores all lanes to `data`. Leaves the vector untouched, so it is const
  // like every other non-mutating member.
  FJXL_INLINE void Store(uint32_t* data) const { vst1q_u32(data, vec); }
  // Broadcasts `v` to every lane.
  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
    return SIMDVec32{vdupq_n_u32(v)};
  }
  // Per lane: number of significant bits of the value, computed as
  // 32 - count-leading-zeros (0 for a zero lane).
  FJXL_INLINE SIMDVec32 ValToToken() const {
    return SIMDVec32{vsubq_u32(vdupq_n_u32(32), vclzq_u32(vec))};
  }
  // Unsigned saturating subtraction (clamps at 0).
  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
    return SIMDVec32{vqsubq_u32(vec, to_subtract.vec)};
  }
  // Lane-wise wrapping subtraction.
  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
    return SIMDVec32{vsubq_u32(vec, to_subtract.vec)};
  }
  // Lane-wise wrapping addition.
  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
    return SIMDVec32{vaddq_u32(vec, oth.vec)};
  }
  // Bitwise exclusive or.
  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
    return SIMDVec32{veorq_u32(vec, oth.vec)};
  }
  // Per lane: 1 shifted left by the lane value (the shift count operand of
  // vshlq is a signed vector, hence the reinterpret).
  FJXL_INLINE SIMDVec32 Pow2() const {
    return SIMDVec32{vshlq_u32(vdupq_n_u32(1), vreinterpretq_s32_u32(vec))};
  }
  // Lane-wise equality mask.
  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
    return Mask32{vceqq_u32(vec, oth.vec)};
  }
  // Lane-wise greater-than mask; lanes are compared as signed integers.
  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
    return Mask32{
        vcgtq_s32(vreinterpretq_s32_u32(vec), vreinterpretq_s32_u32(oth.vec))};
  }
  // Arithmetic (sign-propagating) right shift of every lane by `i` bits.
  template <size_t i>
  FJXL_INLINE SIMDVec32 SignedShiftRight() const {
    return SIMDVec32{
        vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(vec), i))};
  }
};
2030
2031
struct SIMDVec16;
2032
2033
// Lane mask for the 16-bit NEON vectors: one 16-bit lane per element, as
// produced by the comparison members of SIMDVec16.
struct Mask16 {
  uint16x8_t mask;
  // Lane-wise select between the two vectors; defined after SIMDVec16.
  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
  // Lane-wise conjunction of two masks.
  Mask16 And(const Mask16& oth) const {
    const uint16x8_t both = vandq_u16(mask, oth.mask);
    return Mask16{both};
  }
  // Number of leading lanes (starting at lane 0) whose mask is set.
  size_t CountPrefix() const {
    // Set lanes are replaced by the lane count (8) and unset lanes by their
    // own index, so the horizontal minimum is the index of the first unset
    // lane, or 8 when every lane is set.
    const uint16_t lane_index[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    const uint16x8_t when_set = vdupq_n_u16(8);
    const uint16x8_t when_unset = vld1q_u16(lane_index);
    const uint16x8_t candidates = vbslq_u16(mask, when_set, when_unset);
    return vminvq_u16(candidates);
  }
};
2046
2047
// NEON wrapper around eight 16-bit lanes held in a single uint16x8_t.
struct SIMDVec16 {
  uint16x8_t vec;

  static constexpr size_t kLanes = 8;

  // Loads kLanes consecutive values starting at `data`.
  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
    return SIMDVec16{vld1q_u16(data)};
  }
  // Stores all lanes to `data`. Leaves the vector untouched, so it is const
  // like every other non-mutating member.
  FJXL_INLINE void Store(uint16_t* data) const { vst1q_u16(data, vec); }
  // Broadcasts `v` to every lane.
  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
    return SIMDVec16{vdupq_n_u16(v)};
  }
  // Narrows two 32-bit vectors into one 16-bit vector (`lo` first) by keeping
  // the low 16 bits of every lane.
  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
                                         const SIMDVec32& hi) {
    return SIMDVec16{vmovn_high_u32(vmovn_u32(lo.vec), hi.vec)};
  }

  // Per lane: number of significant bits of the value, computed as
  // 16 - count-leading-zeros (0 for a zero lane).
  FJXL_INLINE SIMDVec16 ValToToken() const {
    return SIMDVec16{vsubq_u16(vdupq_n_u16(16), vclzq_u16(vec))};
  }
  // Unsigned saturating subtraction (clamps at 0).
  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
    return SIMDVec16{vqsubq_u16(vec, to_subtract.vec)};
  }
  // Lane-wise wrapping subtraction.
  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
    return SIMDVec16{vsubq_u16(vec, to_subtract.vec)};
  }
  // Lane-wise wrapping addition.
  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
    return SIMDVec16{vaddq_u16(vec, oth.vec)};
  }
  // Lane-wise unsigned minimum.
  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
    return SIMDVec16{vminq_u16(vec, oth.vec)};
  }
  // Lane-wise equality mask.
  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
    return Mask16{vceqq_u16(vec, oth.vec)};
  }
  // Lane-wise greater-than mask; lanes are compared as signed integers.
  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
    return Mask16{
        vcgtq_s16(vreinterpretq_s16_u16(vec), vreinterpretq_s16_u16(oth.vec))};
  }
  // Per lane: 1 shifted left by the lane value (the shift count operand of
  // vshlq is a signed vector, hence the reinterpret).
  FJXL_INLINE SIMDVec16 Pow2() const {
    return SIMDVec16{vshlq_u16(vdupq_n_u16(1), vreinterpretq_s16_u16(vec))};
  }
  // Bitwise or.
  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
    return SIMDVec16{vorrq_u16(vec, oth.vec)};
  }
  // Bitwise exclusive or.
  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
    return SIMDVec16{veorq_u16(vec, oth.vec)};
  }
  // Bitwise and.
  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
    return SIMDVec16{vandq_u16(vec, oth.vec)};
  }
  // Unsigned halving add: (a + b) >> 1 without intermediate overflow.
  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
    return SIMDVec16{vhaddq_u16(vec, oth.vec)};
  }
  // Sets the high byte of every lane to 0xFF; such an index is out of range
  // for the 16-entry table lookup in U8Lookup, which then yields 0 there.
  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
    return SIMDVec16{vorrq_u16(vec, vdupq_n_u16(0xFF00))};
  }
  // Byte-wise lookup into the 16 bytes at `table`; every byte of this vector
  // is an index, and out-of-range indices yield 0.
  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
    uint8x16_t tbl = vld1q_u8(table);
    uint8x16_t indices = vreinterpretq_u8_u16(vec);
    return SIMDVec16{vreinterpretq_u16_u8(vqtbl1q_u8(tbl, indices))};
  }
  // Interleaves lanes as (low[0], this[0], low[1], this[1], ...): pairs built
  // from lanes 0..3 in the first result, from lanes 4..7 in the second.
  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
    return {SIMDVec16{vzip1q_u16(low.vec, vec)},
            SIMDVec16{vzip2q_u16(low.vec, vec)}};
  }
  // Zero-extends to 32 bits: lanes 0..3 in the first result, 4..7 in the
  // second.
  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
    uint32x4_t lo = vmovl_u16(vget_low_u16(vec));
    uint32x4_t hi = vmovl_high_u16(vec);
    return {SIMDVec32{lo}, SIMDVec32{hi}};
  }
  // Arithmetic (sign-propagating) right shift of every lane by `i` bits.
  template <size_t i>
  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
    return SIMDVec16{
        vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(vec), i))};
  }

  // Loads 8 single-byte samples and zero-extends them to 16 bits.
  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
    uint8x8_t v = vld1_u8(data);
    return {SIMDVec16{vmovl_u8(v)}};
  }
  // Loads 8 two-byte samples as they are stored in memory.
  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
    return {Load(reinterpret_cast<const uint16_t*>(data))};
  }

  // Loads 8 byte pairs with a de-interleaving load and zero-extends each
  // channel.
  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
    uint8x8x2_t v = vld2_u8(data);
    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])}};
  }
  // Loads 8 pairs of 16-bit samples with a de-interleaving load.
  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
    uint16x8x2_t v = vld2q_u16(reinterpret_cast<const uint16_t*>(data));
    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}};
  }

  // Loads 8 three-byte pixels with a de-interleaving load and zero-extends
  // each channel.
  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
    uint8x8x3_t v = vld3_u8(data);
    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])},
            SIMDVec16{vmovl_u8(v.val[2])}};
  }
  // Loads 8 pixels of three 16-bit samples with a de-interleaving load.
  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
    uint16x8x3_t v = vld3q_u16(reinterpret_cast<const uint16_t*>(data));
    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}, SIMDVec16{v.val[2]}};
  }

  // Loads 8 four-byte pixels with a de-interleaving load and zero-extends
  // each channel.
  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
    uint8x8x4_t v = vld4_u8(data);
    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])},
            SIMDVec16{vmovl_u8(v.val[2])}, SIMDVec16{vmovl_u8(v.val[3])}};
  }
  // Loads 8 pixels of four 16-bit samples with a de-interleaving load.
  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
    uint16x8x4_t v = vld4q_u16(reinterpret_cast<const uint16_t*>(data));
    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}, SIMDVec16{v.val[2]},
            SIMDVec16{v.val[3]}};
  }

  // Swaps the two bytes of every lane in place.
  void SwapEndian() {
    vec = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(vec)));
  }
};
2166
2167
// Bitwise select: result bits come from `if_true` where the mask bit is 1 and
// from `if_false` where it is 0.
SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
                             const SIMDVec16& if_false) {
  const uint16x8_t selected = vbslq_u16(mask, if_true.vec, if_false.vec);
  return SIMDVec16{selected};
}
2171
2172
// Bitwise select: result bits come from `if_true` where the mask bit is 1 and
// from `if_false` where it is 0.
SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
                             const SIMDVec32& if_false) {
  const uint32x4_t selected = vbslq_u32(mask, if_true.vec, if_false.vec);
  return SIMDVec32{selected};
}
2176
2177
// Two 64-bit lanes of (bit count, bit pattern) pairs, stored as two parallel
// NEON vectors.
struct Bits64 {
  static constexpr size_t kLanes = 2;

  uint64x2_t nbits;
  uint64x2_t bits;

  // Stores the kLanes bit counts to `nbits_out` and the kLanes bit patterns
  // to `bits_out`. Does not modify the object, hence const.
  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) const {
    vst1q_u64(nbits_out, nbits);
    vst1q_u64(bits_out, bits);
  }
};
2188
2189
// Four 32-bit lanes of (bit count, bit pattern) pairs, stored as two parallel
// NEON vectors.
struct Bits32 {
  uint32x4_t nbits;
  uint32x4_t bits;

  // Wraps already-computed count and pattern vectors without changing them.
  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
    return Bits32{nbits.vec, bits.vec};
  }

  // Fuses each pair of neighbouring 32-bit lanes into one 64-bit lane: the
  // odd lane's pattern is shifted above the even lane's pattern by the even
  // lane's bit count, and the two counts are summed.
  Bits64 Merge() const {
    // TODO(veluca): can probably be optimized.
    const uint64x2_t low_word = vdupq_n_u64(0xFFFFFFFF);
    const uint64x2_t counts = vreinterpretq_u64_u32(nbits);
    const uint64x2_t patterns = vreinterpretq_u64_u32(bits);

    const uint64x2_t count_lower = vandq_u64(counts, low_word);
    const uint64x2_t shifted_upper = vshlq_u64(
        vshrq_n_u64(patterns, 32), vreinterpretq_s64_u64(count_lower));
    const uint64x2_t pattern_lower = vandq_u64(patterns, low_word);
    // Shift-right-and-accumulate: count_lower + (counts >> 32).
    const uint64x2_t merged_count = vsraq_n_u64(count_lower, counts, 32);
    const uint64x2_t merged_pattern = vorrq_u64(shifted_upper, pattern_lower);
    return Bits64{merged_count, merged_pattern};
  }

  // Appends this object's bits above those of `low`, lane by lane: the
  // pattern is shifted left by low's count and or-ed with low's pattern; the
  // counts are added.
  void Interleave(const Bits32& low) {
    const uint32x4_t moved_up =
        vshlq_u32(bits, vreinterpretq_s32_u32(low.nbits));
    bits = vorrq_u32(moved_up, low.bits);
    nbits = vaddq_u32(nbits, low.nbits);
  }

  // Keeps the first min(n, 4) lanes and zeroes the remaining ones.
  void ClipTo(size_t n) {
    const size_t kept = std::min<size_t>(n, 4);
    constexpr uint32_t kMask[8] = {
        ~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0,
    };
    // A window starting `kept` entries before the zero half begins with
    // exactly `kept` all-ones lanes.
    const uint32x4_t lane_mask = vld1q_u32(&kMask[4 - kept]);
    nbits = vandq_u32(lane_mask, nbits);
    bits = vandq_u32(lane_mask, bits);
  }
  // Zeroes the first min(n, 4) lanes and keeps the remaining ones.
  void Skip(size_t n) {
    const size_t dropped = std::min<size_t>(n, 4);
    constexpr uint32_t kMask[8] = {
        0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u,
    };
    // A window starting `dropped` entries before the all-ones half begins
    // with exactly `dropped` zero lanes.
    const uint32x4_t lane_mask = vld1q_u32(&kMask[4 - dropped]);
    nbits = vandq_u32(lane_mask, nbits);
    bits = vandq_u32(lane_mask, bits);
  }
};
2237
2238
// Eight 16-bit lanes of (bit count, bit pattern) pairs, stored as two parallel
// NEON vectors.
struct Bits16 {
  uint16x8_t nbits;
  uint16x8_t bits;

  // Wraps already-computed count and pattern vectors without changing them.
  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
    return Bits16{nbits.vec, bits.vec};
  }

  // Fuses each pair of neighbouring 16-bit lanes into one 32-bit lane: the
  // odd lane's pattern is shifted above the even lane's pattern by the even
  // lane's bit count, and the two counts are summed.
  Bits32 Merge() const {
    // TODO(veluca): can probably be optimized.
    const uint32x4_t low_half = vdupq_n_u32(0xFFFF);
    const uint32x4_t counts = vreinterpretq_u32_u16(nbits);
    const uint32x4_t patterns = vreinterpretq_u32_u16(bits);

    const uint32x4_t count_lower = vandq_u32(counts, low_half);
    const uint32x4_t shifted_upper = vshlq_u32(
        vshrq_n_u32(patterns, 16), vreinterpretq_s32_u32(count_lower));
    const uint32x4_t pattern_lower = vandq_u32(patterns, low_half);
    // Shift-right-and-accumulate: count_lower + (counts >> 16).
    const uint32x4_t merged_count = vsraq_n_u32(count_lower, counts, 16);
    const uint32x4_t merged_pattern = vorrq_u32(shifted_upper, pattern_lower);
    return Bits32{merged_count, merged_pattern};
  }

  // Appends this object's bits above those of `low`, lane by lane: the
  // pattern is shifted left by low's count and or-ed with low's pattern; the
  // counts are added.
  void Interleave(const Bits16& low) {
    const uint16x8_t moved_up =
        vshlq_u16(bits, vreinterpretq_s16_u16(low.nbits));
    bits = vorrq_u16(moved_up, low.bits);
    nbits = vaddq_u16(nbits, low.nbits);
  }

  // Keeps the first min(n, 8) lanes and zeroes the remaining ones.
  void ClipTo(size_t n) {
    const size_t kept = std::min<size_t>(n, 8);
    constexpr uint16_t kMask[16] = {
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0,      0,      0,      0,      0,      0,      0,      0,
    };
    // A window starting `kept` entries before the zero half begins with
    // exactly `kept` all-ones lanes.
    const uint16x8_t lane_mask = vld1q_u16(&kMask[8 - kept]);
    nbits = vandq_u16(lane_mask, nbits);
    bits = vandq_u16(lane_mask, bits);
  }
  // Zeroes the first min(n, 8) lanes and keeps the remaining ones.
  void Skip(size_t n) {
    const size_t dropped = std::min<size_t>(n, 8);
    constexpr uint16_t kMask[16] = {
        0,      0,      0,      0,      0,      0,      0,      0,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
    };
    // A window starting `dropped` entries before the all-ones half begins
    // with exactly `dropped` zero lanes.
    const uint16x8_t lane_mask = vld1q_u16(&kMask[8 - dropped]);
    nbits = vandq_u16(lane_mask, nbits);
    bits = vandq_u16(lane_mask, bits);
  }
};
2288
2289
#endif
2290
2291
#ifdef FJXL_GENERIC_SIMD
2292
// Namespace-scope definitions of the static constexpr lane counts. Before
// C++17 a static constexpr data member still needs such a definition when it
// is odr-used, e.g. when bound to the const-reference parameters of std::max.
constexpr size_t SIMDVec32::kLanes;
2293
constexpr size_t SIMDVec16::kLanes;
2294
2295
//  Each of these functions will process SIMDVec16::kLanes worth of values.
2296
2297
// Tokenizes one SIMDVec16 worth of 16-bit residuals. For every lane this
// writes the token, the number of extra bits (token - 1, saturating at 0) and
// the extra bits themselves (residual - 2^nbits, saturating at 0).
FJXL_INLINE void TokenizeSIMD(const uint16_t* residuals, uint16_t* token_out,
                              uint16_t* nbits_out, uint16_t* bits_out) {
  SIMDVec16 values = SIMDVec16::Load(residuals);
  SIMDVec16 tokens = values.ValToToken();
  SIMDVec16 one = SIMDVec16::Val(1);
  SIMDVec16 extra_nbits = tokens.SatSubU(one);
  SIMDVec16 extra_bits = values.SatSubU(extra_nbits.Pow2());
  tokens.Store(token_out);
  extra_nbits.Store(nbits_out);
  extra_bits.Store(bits_out);
}
2307
2308
// 32-bit variant: tokenizes SIMDVec16::kLanes residuals held as two
// consecutive SIMDVec32 vectors. Tokens are narrowed into one 16-bit vector;
// the extra-bit counts and extra bits stay 32-bit wide.
FJXL_INLINE void TokenizeSIMD(const uint32_t* residuals, uint16_t* token_out,
                              uint32_t* nbits_out, uint32_t* bits_out) {
  static_assert(SIMDVec16::kLanes == 2 * SIMDVec32::kLanes,
                "There should be twice more 16-bit lanes than 32-bit lanes");
  // Load both halves up front, before anything is written.
  SIMDVec32 lo_values = SIMDVec32::Load(residuals);
  SIMDVec32 hi_values = SIMDVec32::Load(residuals + SIMDVec32::kLanes);

  // Lower half of the lanes.
  SIMDVec32 lo_tokens = lo_values.ValToToken();
  SIMDVec32 lo_nbits = lo_tokens.SatSubU(SIMDVec32::Val(1));
  SIMDVec32 lo_bits = lo_values.SatSubU(lo_nbits.Pow2());

  // Upper half of the lanes.
  SIMDVec32 hi_tokens = hi_values.ValToToken();
  SIMDVec32 hi_nbits = hi_tokens.SatSubU(SIMDVec32::Val(1));
  SIMDVec32 hi_bits = hi_values.SatSubU(hi_nbits.Pow2());

  SIMDVec16 tokens = SIMDVec16::FromTwo32(lo_tokens, hi_tokens);
  tokens.Store(token_out);
  lo_nbits.Store(nbits_out);
  hi_nbits.Store(nbits_out + SIMDVec32::kLanes);
  lo_bits.Store(bits_out);
  hi_bits.Store(bits_out + SIMDVec32::kLanes);
}
2327
2328
// Looks up the Huffman code length and code bits of one vector of tokens in
// the two byte tables and stores them as 16-bit lanes.
FJXL_INLINE void HuffmanSIMDUpTo13(const uint16_t* tokens,
                                   const uint8_t* raw_nbits_simd,
                                   const uint8_t* raw_bits_simd,
                                   uint16_t* nbits_out, uint16_t* bits_out) {
  SIMDVec16 table_index = SIMDVec16::Load(tokens).PrepareForU8Lookup();
  SIMDVec16 code_lengths = table_index.U8Lookup(raw_nbits_simd);
  code_lengths.Store(nbits_out);
  SIMDVec16 code_bits = table_index.U8Lookup(raw_bits_simd);
  code_bits.Store(bits_out);
}
2336
2337
// Huffman lookup for the 14-bit case, where tokens go up to 16 but the byte
// tables only have 16 entries. Token 16 reuses the entry of token 15.
FJXL_INLINE void HuffmanSIMD14(const uint16_t* tokens,
                               const uint8_t* raw_nbits_simd,
                               const uint8_t* raw_bits_simd,
                               uint16_t* nbits_out, uint16_t* bits_out) {
  SIMDVec16 tok = SIMDVec16::Load(tokens);
  // Clamp to 15 so that token 16 indexes the same table slot as token 15.
  SIMDVec16 table_index = tok.Min(SIMDVec16::Val(15)).PrepareForU8Lookup();
  SIMDVec16 base_code = table_index.U8Lookup(raw_bits_simd);
  // The Huffman code is built so that the codes of tokens 15 and 16 differ
  // only in the highest bit (value 128), which is set here for token 16.
  Mask16 is_token_16 = tok.Eq(SIMDVec16::Val(16));
  SIMDVec16 code_with_high_bit = base_code.Or(SIMDVec16::Val(128));
  SIMDVec16 code = is_token_16.IfThenElse(code_with_high_bit, base_code);
  code.Store(bits_out);
  SIMDVec16 code_lengths = table_index.U8Lookup(raw_nbits_simd);
  code_lengths.Store(nbits_out);
}
2354
2355
// Huffman lookup for bit depths above 14, where tokens go up to 18. Tokens
// above 12 are folded pairwise onto table slots 13..15 and the two members of
// a pair are told apart by the highest code bit.
FJXL_INLINE void HuffmanSIMDAbove14(const uint16_t* tokens,
                                    const uint8_t* raw_nbits_simd,
                                    const uint8_t* raw_bits_simd,
                                    uint16_t* nbits_out, uint16_t* bits_out) {
  SIMDVec16 tok = SIMDVec16::Load(tokens);
  // We assume `tok` fits in a *signed* 16-bit integer.
  Mask16 is_folded = tok.Gt(SIMDVec16::Val(12));
  // Folding of the large tokens onto table slots:
  //   13, 14 -> 13
  //   15, 16 -> 14
  //   17, 18 -> 15
  SIMDVec16 folded = tok.HAdd(SIMDVec16::Val(13));
  SIMDVec16 slot = is_folded.IfThenElse(folded, tok);
  SIMDVec16 table_index = slot.PrepareForU8Lookup();
  SIMDVec16 base_code = table_index.U8Lookup(raw_bits_simd);
  // The highest bit is set for the even folded tokens (14, 16, 18); a token
  // is even when clearing its lowest bit leaves it unchanged.
  Mask16 is_even = tok.Eq(tok.And(SIMDVec16::Val(0xFFFE)));
  Mask16 wants_high_bit = is_folded.And(is_even);
  SIMDVec16 code_with_high_bit = base_code.Or(SIMDVec16::Val(128));
  SIMDVec16 code = wants_high_bit.IfThenElse(code_with_high_bit, base_code);
  code.Store(bits_out);
  SIMDVec16 code_lengths = table_index.U8Lookup(raw_nbits_simd);
  code_lengths.Store(nbits_out);
}
2375
2376
// Combines the Huffman code and the extra bits of one vector of samples in
// 16-bit lanes (Huffman bits lowest), blanks the lanes outside [skip, n) and
// writes the result, merged to 32-bit lanes, to bits_out[0].
FJXL_INLINE void StoreSIMDUpTo8(const uint16_t* nbits_tok,
                                const uint16_t* bits_tok,
                                const uint16_t* nbits_huff,
                                const uint16_t* bits_huff, size_t n,
                                size_t skip, Bits32* bits_out) {
  SIMDVec16 tok_nbits = SIMDVec16::Load(nbits_tok);
  SIMDVec16 tok_bits = SIMDVec16::Load(bits_tok);
  SIMDVec16 huff_nbits = SIMDVec16::Load(nbits_huff);
  SIMDVec16 huff_bits = SIMDVec16::Load(bits_huff);

  Bits16 combined = Bits16::FromRaw(tok_nbits, tok_bits);
  Bits16 huffman = Bits16::FromRaw(huff_nbits, huff_bits);
  combined.Interleave(huffman);
  combined.ClipTo(n);
  combined.Skip(skip);
  *bits_out = combined.Merge();
}
2390
2391
// Huffman and raw bits don't necessarily fit in a single u16 here.
2392
FJXL_INLINE void StoreSIMDUpTo14(const uint16_t* nbits_tok,
2393
                                 const uint16_t* bits_tok,
2394
                                 const uint16_t* nbits_huff,
2395
                                 const uint16_t* bits_huff, size_t n,
2396
0
                                 size_t skip, Bits32* bits_out) {
2397
0
  VecPair<SIMDVec16> bits =
2398
0
      SIMDVec16::Load(bits_tok).Interleave(SIMDVec16::Load(bits_huff));
2399
0
  VecPair<SIMDVec16> nbits =
2400
0
      SIMDVec16::Load(nbits_tok).Interleave(SIMDVec16::Load(nbits_huff));
2401
0
  Bits16 low = Bits16::FromRaw(nbits.low, bits.low);
2402
0
  Bits16 hi = Bits16::FromRaw(nbits.hi, bits.hi);
2403
0
  low.ClipTo(2 * n);
2404
0
  low.Skip(2 * skip);
2405
0
  hi.ClipTo(std::max(2 * n, SIMDVec16::kLanes) - SIMDVec16::kLanes);
2406
0
  hi.Skip(std::max(2 * skip, SIMDVec16::kLanes) - SIMDVec16::kLanes);
2407
2408
0
  bits_out[0] = low.Merge();
2409
0
  bits_out[1] = hi.Merge();
2410
0
}
2411
2412
// 32-bit variant: the extra bits arrive as two SIMDVec32 vectors and the
// Huffman codes as one SIMDVec16 vector that is widened to 32 bits. Each half
// is combined (Huffman bits lowest), restricted to the samples in [skip, n)
// and written to bits_out[0] / bits_out[1].
FJXL_INLINE void StoreSIMDAbove14(const uint32_t* nbits_tok,
                                  const uint32_t* bits_tok,
                                  const uint16_t* nbits_huff,
                                  const uint16_t* bits_huff, size_t n,
                                  size_t skip, Bits32* bits_out) {
  static_assert(SIMDVec16::kLanes == 2 * SIMDVec32::kLanes,
                "There should be twice more 16-bit lanes than 32-bit lanes");
  const size_t half = SIMDVec32::kLanes;

  Bits32 lower =
      Bits32::FromRaw(SIMDVec32::Load(nbits_tok), SIMDVec32::Load(bits_tok));
  Bits32 upper = Bits32::FromRaw(SIMDVec32::Load(nbits_tok + half),
                                 SIMDVec32::Load(bits_tok + half));

  VecPair<SIMDVec32> huff_bits = SIMDVec16::Load(bits_huff).Upcast();
  VecPair<SIMDVec32> huff_nbits = SIMDVec16::Load(nbits_huff).Upcast();
  Bits32 huff_lower = Bits32::FromRaw(huff_nbits.low, huff_bits.low);
  Bits32 huff_upper = Bits32::FromRaw(huff_nbits.hi, huff_bits.hi);

  // Samples [0, half).
  lower.Interleave(huff_lower);
  lower.ClipTo(n);
  lower.Skip(skip);
  bits_out[0] = lower;

  // Samples [half, 2 * half): limits rebased and clamped at zero.
  upper.Interleave(huff_upper);
  upper.ClipTo(n > half ? n - half : 0);
  upper.Skip(skip > half ? skip - half : 0);
  bits_out[1] = upper;
}
2440
2441
#ifdef FJXL_AVX512
2442
// Appends the sixteen (nbits, bits) 32-bit lanes of `bits32` to `output`
// using AVX-512 only.
//
// Outline: the writer's pending partial byte is rotated in as lane 0 while the
// former top lane (index 15) is pulled out as a scalar "trail"; the sixteen
// lanes are merged to eight 64-bit lanes; a prefix sum of the bit counts
// gives every lane its begin/end bit position; lanes straddling a 64-bit
// boundary hand their leading bits to the previous lane; lanes of the same
// 64-bit chunk are OR-reduced and the per-chunk results are compressed and
// stored. Finally the trail lane is appended with scalar code and the writer
// state (bytes_written, buffer, bits_in_buffer) is updated.
//
// NOTE(review): the full 64-byte vector store and the 8-byte StoreLE64 are
// issued regardless of how many bits are valid; this presumably relies on the
// output buffer being over-allocated -- confirm against BitWriter.
FJXL_INLINE void StoreToWriterAVX512(const Bits32& bits32, BitWriter& output) {
2443
  __m512i bits = bits32.bits;
2444
  __m512i nbits = bits32.nbits;
2445
2446
  // Insert the leftover bits from the bit buffer at the bottom of the vector
2447
  // and extract the top of the vector.
2448
  uint64_t trail_bits =
2449
      _mm512_cvtsi512_si32(_mm512_alignr_epi32(bits, bits, 15));
2450
  uint64_t trail_nbits =
2451
      _mm512_cvtsi512_si32(_mm512_alignr_epi32(nbits, nbits, 15));
2452
  __m512i lead_bits = _mm512_set1_epi32(output.buffer);
2453
  __m512i lead_nbits = _mm512_set1_epi32(output.bits_in_buffer);
2454
  bits = _mm512_alignr_epi32(bits, lead_bits, 15);
2455
  nbits = _mm512_alignr_epi32(nbits, lead_nbits, 15);
2456
2457
  // Merge 32 -> 64 bits.
2458
  Bits32 b{nbits, bits};
2459
  Bits64 b64 = b.Merge();
2460
  bits = b64.bits;
2461
  nbits = b64.nbits;
2462
2463
  __m512i zero = _mm512_setzero_si512();
2464
2465
  // shK moves every 64-bit lane up by K positions, filling with zeros.
  auto sh1 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 7); };
2466
  auto sh2 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 6); };
2467
  auto sh4 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 4); };
2468
2469
  // Compute first-past-end-bit-position.
  // (Inclusive prefix sum of nbits in three shift-and-add steps.)
2470
  __m512i end_intermediate0 = _mm512_add_epi64(nbits, sh1(nbits));
2471
  __m512i end_intermediate1 =
2472
      _mm512_add_epi64(end_intermediate0, sh2(end_intermediate0));
2473
  __m512i end = _mm512_add_epi64(end_intermediate1, sh4(end_intermediate1));
2474
2475
  // Total number of bits in the vector: the top lane of the prefix sum.
  uint64_t simd_nbits = _mm512_cvtsi512_si32(_mm512_alignr_epi64(end, end, 7));
2476
2477
  // Compute begin-bit-position.
2478
  __m512i begin = _mm512_sub_epi64(end, nbits);
2479
2480
  // Index of the last bit in the chunk, or the end bit if nbits==0.
2481
  __m512i last = _mm512_mask_sub_epi64(
2482
      end, _mm512_cmpneq_epi64_mask(nbits, zero), end, _mm512_set1_epi64(1));
2483
2484
  __m512i lane_offset_mask = _mm512_set1_epi64(63);
2485
2486
  // Starting position of the chunk that each lane will ultimately belong to.
2487
  __m512i chunk_start = _mm512_andnot_si512(lane_offset_mask, last);
2488
2489
  // For all lanes that contain bits belonging to two different 64-bit chunks,
2490
  // compute the number of bits that belong to the first chunk.
2491
  // total # of bits fit in a u16, so we can satsub_u16 here.
2492
  __m512i first_chunk_nbits = _mm512_subs_epu16(chunk_start, begin);
2493
2494
  // Move all the previous-chunk-bits to the previous lane.
2495
  __m512i negnbits = _mm512_sub_epi64(_mm512_set1_epi64(64), first_chunk_nbits);
2496
  __m512i first_chunk_bits =
2497
      _mm512_srlv_epi64(_mm512_sllv_epi64(bits, negnbits), negnbits);
2498
  __m512i first_chunk_bits_down =
2499
      _mm512_alignr_epi32(zero, first_chunk_bits, 2);
2500
  bits = _mm512_srlv_epi64(bits, first_chunk_nbits);
2501
  nbits = _mm512_sub_epi64(nbits, first_chunk_nbits);
2502
  bits = _mm512_or_si512(bits, _mm512_sllv_epi64(first_chunk_bits_down, nbits));
2503
  begin = _mm512_add_epi64(begin, first_chunk_nbits);
2504
2505
  // We now know that every lane should give bits to only one chunk. We can
2506
  // shift the bits and then horizontally-or-reduce them within the same chunk.
2507
  __m512i offset = _mm512_and_si512(begin, lane_offset_mask);
2508
  __m512i aligned_bits = _mm512_sllv_epi64(bits, offset);
2509
  // h-or-reduce within same chunk
2510
  __m512i red0 = _mm512_mask_or_epi64(
2511
      aligned_bits, _mm512_cmpeq_epi64_mask(sh1(chunk_start), chunk_start),
2512
      sh1(aligned_bits), aligned_bits);
2513
  __m512i red1 = _mm512_mask_or_epi64(
2514
      red0, _mm512_cmpeq_epi64_mask(sh2(chunk_start), chunk_start), sh2(red0),
2515
      red0);
2516
  __m512i reduced = _mm512_mask_or_epi64(
2517
      red1, _mm512_cmpeq_epi64_mask(sh4(chunk_start), chunk_start), sh4(red1),
2518
      red1);
2519
  // Extract the highest lane that belongs to each chunk (the lane that ends up
2520
  // with the OR-ed value of all the other lanes of that chunk).
2521
  __m512i next_chunk_start =
2522
      _mm512_alignr_epi32(_mm512_set1_epi64(~0), chunk_start, 2);
2523
  __m512i result = _mm512_maskz_compress_epi64(
2524
      _mm512_cmpneq_epi64_mask(chunk_start, next_chunk_start), reduced);
2525
2526
  _mm512_storeu_si512((__m512i*)(output.data.get() + output.bytes_written),
2527
                      result);
2528
2529
  // Update the bit writer and add the last 32-bit lane.
2530
  // Note that since trail_nbits was at most 32 to begin with, operating on
2531
  // trail_bits does not risk overflowing.
2532
  output.bytes_written += simd_nbits / 8;
2533
  // Here we are implicitly relying on the fact that simd_nbits < 512 to know
2534
  // that the byte of bit writer data we access is initialized. This is
2535
  // guaranteed because the remaining bits in the bit writer buffer are at most
2536
  // 7, so simd_nbits <= 505 always.
2537
  trail_bits = (trail_bits << (simd_nbits % 8)) +
2538
               output.data.get()[output.bytes_written];
2539
  trail_nbits += simd_nbits % 8;
2540
  StoreLE64(output.data.get() + output.bytes_written, trail_bits);
2541
  size_t trail_bytes = trail_nbits / 8;
2542
  output.bits_in_buffer = trail_nbits % 8;
2543
  output.buffer = trail_bits >> (trail_bytes * 8);
2544
  output.bytes_written += trail_bytes;
2545
}
2546
2547
#endif
2548
2549
template <size_t n>
2550
0
FJXL_INLINE void StoreToWriter(const Bits32* bits, BitWriter& output) {
2551
#ifdef FJXL_AVX512
2552
  static_assert(n <= 2, "n should be less or 2 for AVX512");
2553
  StoreToWriterAVX512(bits[0], output);
2554
  if (n == 2) {
2555
    StoreToWriterAVX512(bits[1], output);
2556
  }
2557
  return;
2558
#endif
2559
0
  static_assert(n <= 4, "n should be less or 4");
2560
0
  alignas(64) uint64_t nbits64[Bits64::kLanes * n];
2561
0
  alignas(64) uint64_t bits64[Bits64::kLanes * n];
2562
0
  bits[0].Merge().Store(nbits64, bits64);
2563
0
  if (n > 1) {
2564
0
    bits[1].Merge().Store(nbits64 + Bits64::kLanes, bits64 + Bits64::kLanes);
2565
0
  }
2566
0
  if (n > 2) {
2567
0
    bits[2].Merge().Store(nbits64 + 2 * Bits64::kLanes,
2568
0
                          bits64 + 2 * Bits64::kLanes);
2569
0
  }
2570
0
  if (n > 3) {
2571
0
    bits[3].Merge().Store(nbits64 + 3 * Bits64::kLanes,
2572
0
                          bits64 + 3 * Bits64::kLanes);
2573
0
  }
2574
0
  output.WriteMultiple(nbits64, bits64, Bits64::kLanes * n);
2575
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::StoreToWriter<1ul>(AVX2::(anonymous namespace)::Bits32 const*, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::StoreToWriter<2ul>(AVX2::(anonymous namespace)::Bits32 const*, (anonymous namespace)::BitWriter&)
2576
2577
namespace detail {
2578
template <typename T>
2579
struct IntegerTypes;
2580
2581
template <>
2582
struct IntegerTypes<SIMDVec16> {
2583
  using signed_ = int16_t;
2584
  using unsigned_ = uint16_t;
2585
};
2586
2587
template <>
2588
struct IntegerTypes<SIMDVec32> {
2589
  using signed_ = int32_t;
2590
  using unsigned_ = uint32_t;
2591
};
2592
2593
template <typename T>
2594
struct SIMDType;
2595
2596
template <>
2597
struct SIMDType<int16_t> {
2598
  using type = SIMDVec16;
2599
};
2600
2601
template <>
2602
struct SIMDType<int32_t> {
2603
  using type = SIMDVec32;
2604
};
2605
2606
}  // namespace detail
2607
2608
template <typename T>
2609
using signed_t = typename detail::IntegerTypes<T>::signed_;
2610
2611
template <typename T>
2612
using unsigned_t = typename detail::IntegerTypes<T>::unsigned_;
2613
2614
template <typename T>
2615
using simd_t = typename detail::SIMDType<T>::type;
2616
2617
// This function will process exactly one vector worth of pixels.
2618
2619
template <typename T>
2620
size_t PredictPixels(const signed_t<T>* pixels, const signed_t<T>* pixels_left,
2621
                     const signed_t<T>* pixels_top,
2622
                     const signed_t<T>* pixels_topleft,
2623
0
                     unsigned_t<T>* residuals) {
2624
0
  T px = T::Load((unsigned_t<T>*)pixels);
2625
0
  T left = T::Load((unsigned_t<T>*)pixels_left);
2626
0
  T top = T::Load((unsigned_t<T>*)pixels_top);
2627
0
  T topleft = T::Load((unsigned_t<T>*)pixels_topleft);
2628
0
  T ac = left.Sub(topleft);
2629
0
  T ab = left.Sub(top);
2630
0
  T bc = top.Sub(topleft);
2631
0
  T grad = ac.Add(top);
2632
0
  T d = ab.Xor(bc);
2633
0
  T zero = T::Val(0);
2634
0
  T clamp = zero.Gt(d).IfThenElse(top, left);
2635
0
  T s = ac.Xor(bc);
2636
0
  T pred = zero.Gt(s).IfThenElse(grad, clamp);
2637
0
  T res = px.Sub(pred);
2638
0
  T res_times_2 = res.Add(res);
2639
0
  res = zero.Gt(res).IfThenElse(T::Val(-1).Sub(res_times_2), res_times_2);
2640
0
  res.Store(residuals);
2641
0
  return res.Eq(T::Val(0)).CountPrefix();
2642
0
}
Unexecuted instantiation: enc_fast_lossless.cc:unsigned long AVX2::(anonymous namespace)::PredictPixels<AVX2::(anonymous namespace)::SIMDVec16>(AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::unsigned_*)
Unexecuted instantiation: enc_fast_lossless.cc:unsigned long AVX2::(anonymous namespace)::PredictPixels<AVX2::(anonymous namespace)::SIMDVec32>(AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::unsigned_*)
2643
2644
#endif
2645
2646
void EncodeHybridUint000(uint32_t value, uint32_t* token, uint32_t* nbits,
2647
0
                         uint32_t* bits) {
2648
0
  uint32_t n = FloorLog2(value);
2649
0
  *token = value ? n + 1 : 0;
2650
0
  *nbits = value ? n : 0;
2651
0
  *bits = value ? value - (1 << n) : 0;
2652
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::EncodeHybridUint000(unsigned int, unsigned int*, unsigned int*, unsigned int*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::EncodeHybridUint000(unsigned int, unsigned int*, unsigned int*, unsigned int*)
2653
2654
// Base-2 logarithm of the number of samples encoded per chunk, selected by
// the SIMD flavour this code is compiled for.
#if defined(FJXL_AVX512)
constexpr static size_t kLogChunkSize = 5;
#elif defined(FJXL_AVX2) || defined(FJXL_NEON)
// NEON lanes are only 128 bits wide, yet handling two vectors at a time is
// still significantly (~1.3x) faster there.
constexpr static size_t kLogChunkSize = 4;
#else
constexpr static size_t kLogChunkSize = 3;
#endif

// Number of samples per chunk.
constexpr static size_t kChunkSize = size_t{1} << kLogChunkSize;
2665
2666
template <typename Residual>
2667
void GenericEncodeChunk(const Residual* residuals, size_t n, size_t skip,
2668
0
                        const PrefixCode& code, BitWriter& output) {
2669
0
  for (size_t ix = skip; ix < n; ix++) {
2670
0
    unsigned token, nbits, bits;
2671
0
    EncodeHybridUint000(residuals[ix], &token, &nbits, &bits);
2672
0
    output.Write(code.raw_nbits[token] + nbits,
2673
0
                 code.raw_bits[token] | bits << code.raw_nbits[token]);
2674
0
  }
2675
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::GenericEncodeChunk<unsigned short>(unsigned short const*, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::GenericEncodeChunk<unsigned int>(unsigned int const*, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
2676
2677
// Encoding parameters and SIMD chunk encoder for bit depths of at most 8.
struct UpTo8Bits {
  size_t bitdepth;
  explicit UpTo8Bits(size_t bitdepth) : bitdepth(bitdepth) {
    assert(bitdepth <= 8);
  }

  // Per-symbol limits on the Huffman code length. A u16 can hold up to 9
  // extra bits plus 7 Huffman bits. Symbols with at most 8 extra bits could
  // take 8 Huffman bits, but the AVX2 SIMD bit-merging logic assumes that no
  // Huffman length is 8 or more, so every raw symbol is limited to 7. The
  // last entry is the symbol used for LZ77 lengths; its only constraint is
  // that 32 symbols can be represented in total.
  static constexpr uint8_t kMinRawLength[12] = {};
  static constexpr uint8_t kMaxRawLength[12] = {
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 10,
  };
  static size_t MaxEncodedBitsPerSample() { return 16; }
  static constexpr size_t kInputBytes = 1;
  using pixel_t = int16_t;
  using upixel_t = uint16_t;

  // Copies the first 16 entries of the code-length and code-bit tables into
  // the buffers used by the SIMD lookups. `n` is only checked by the assert.
  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
                             size_t n, uint8_t* nbits_simd,
                             uint8_t* bits_simd) {
    constexpr size_t kTableBytes = 16;
    assert(n <= 16);
    (void)n;
    memcpy(nbits_simd, nbits, kTableBytes);
    memcpy(bits_simd, bits, kTableBytes);
  }

#ifdef FJXL_GENERIC_SIMD
  // Encodes one chunk of kChunkSize residuals, one SIMDVec16 at a time; only
  // the samples in [skip, n) produce bits.
  static void EncodeChunkSimd(upixel_t* residuals, size_t n, size_t skip,
                              const uint8_t* raw_nbits_simd,
                              const uint8_t* raw_bits_simd, BitWriter& output) {
    constexpr size_t kLanes = SIMDVec16::kLanes;
    constexpr size_t kVectors = kChunkSize / kLanes;
    alignas(64) uint16_t token[kLanes];
    alignas(64) uint16_t nbits[kLanes];
    alignas(64) uint16_t bits[kLanes];
    alignas(64) uint16_t nbits_huff[kLanes];
    alignas(64) uint16_t bits_huff[kLanes];
    Bits32 bits32[kVectors];
    for (size_t offset = 0; offset < kChunkSize; offset += kLanes) {
      TokenizeSIMD(residuals + offset, token, nbits, bits);
      HuffmanSIMDUpTo13(token, raw_nbits_simd, raw_bits_simd, nbits_huff,
                        bits_huff);
      // Limits relative to this vector, clamped at zero.
      const size_t local_n = n > offset ? n - offset : 0;
      const size_t local_skip = skip > offset ? skip - offset : 0;
      StoreSIMDUpTo8(nbits, bits, nbits_huff, bits_huff, local_n, local_skip,
                     bits32 + offset / kLanes);
    }
    StoreToWriter<kVectors>(bits32, output);
  }
#endif

  // Number of raw symbols: values gain 1 bit for YCoCg and 1 bit for
  // prediction, and the maximum symbol is 1 + the effective bit depth of the
  // residuals.
  size_t NumSymbols(bool doing_ycocg_or_large_palette) const {
    return bitdepth + (doing_ycocg_or_large_palette ? 3 : 2);
  }
};
constexpr uint8_t UpTo8Bits::kMinRawLength[];
constexpr uint8_t UpTo8Bits::kMaxRawLength[];
2738
2739
// Encoding parameters and SIMD chunk encoder for bit depths from 9 to 13.
struct From9To13Bits {
  size_t bitdepth;
  explicit From9To13Bits(size_t bitdepth) : bitdepth(bitdepth) {
    assert(bitdepth <= 13 && bitdepth >= 9);
  }

  // Per-symbol limits on the Huffman code length. Huffman bits plus extra
  // bits cannot all fit in a u16 here, so no attempt is made and raw symbols
  // may use up to 8 bits. There are at most 16 raw symbols, so the Huffman
  // lookup can be done with SIMD without any special tricks. The last entry
  // is the symbol used for LZ77 lengths; its only constraint is that 32
  // symbols can be represented in total.
  static constexpr uint8_t kMinRawLength[17] = {};
  static constexpr uint8_t kMaxRawLength[17] = {
      8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 10,
  };
  static size_t MaxEncodedBitsPerSample() { return 21; }
  static constexpr size_t kInputBytes = 2;
  using pixel_t = int16_t;
  using upixel_t = uint16_t;

  // Copies the first 16 entries of the code-length and code-bit tables into
  // the buffers used by the SIMD lookups. `n` is only checked by the assert.
  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
                             size_t n, uint8_t* nbits_simd,
                             uint8_t* bits_simd) {
    constexpr size_t kTableBytes = 16;
    assert(n <= 16);
    (void)n;
    memcpy(nbits_simd, nbits, kTableBytes);
    memcpy(bits_simd, bits, kTableBytes);
  }

#ifdef FJXL_GENERIC_SIMD
  // Encodes one chunk of kChunkSize residuals, one SIMDVec16 at a time; only
  // the samples in [skip, n) produce bits. Every input vector yields two
  // Bits32 outputs.
  static void EncodeChunkSimd(upixel_t* residuals, size_t n, size_t skip,
                              const uint8_t* raw_nbits_simd,
                              const uint8_t* raw_bits_simd, BitWriter& output) {
    constexpr size_t kLanes = SIMDVec16::kLanes;
    constexpr size_t kOutputs = 2 * kChunkSize / kLanes;
    alignas(64) uint16_t token[kLanes];
    alignas(64) uint16_t nbits[kLanes];
    alignas(64) uint16_t bits[kLanes];
    alignas(64) uint16_t nbits_huff[kLanes];
    alignas(64) uint16_t bits_huff[kLanes];
    Bits32 bits32[kOutputs];
    for (size_t offset = 0; offset < kChunkSize; offset += kLanes) {
      TokenizeSIMD(residuals + offset, token, nbits, bits);
      HuffmanSIMDUpTo13(token, raw_nbits_simd, raw_bits_simd, nbits_huff,
                        bits_huff);
      // Limits relative to this vector, clamped at zero.
      const size_t local_n = n > offset ? n - offset : 0;
      const size_t local_skip = skip > offset ? skip - offset : 0;
      StoreSIMDUpTo14(nbits, bits, nbits_huff, bits_huff, local_n, local_skip,
                      bits32 + 2 * offset / kLanes);
    }
    StoreToWriter<kOutputs>(bits32, output);
  }
#endif

  // Number of raw symbols: values gain 1 bit for YCoCg and 1 bit for
  // prediction, and the maximum symbol is 1 + the effective bit depth of the
  // residuals.
  size_t NumSymbols(bool doing_ycocg_or_large_palette) const {
    return bitdepth + (doing_ycocg_or_large_palette ? 3 : 2);
  }
};
constexpr uint8_t From9To13Bits::kMinRawLength[];
constexpr uint8_t From9To13Bits::kMaxRawLength[];
2801
2802
0
// Debug-only check of the property the SIMD Huffman lookups depend on for a
// pair of symbols sharing one table slot: both codes are 8 bits long and the
// second equals the first with the highest bit (128) set. Does nothing when
// asserts are compiled out.
void CheckHuffmanBitsSIMD(int bits1, int nbits1, int bits2, int nbits2) {
  // Mark the parameters as used so builds without asserts do not warn.
  (void)bits1;
  (void)nbits1;
  (void)bits2;
  (void)nbits2;
  assert(nbits1 == 8);
  assert(nbits2 == 8);
  assert(bits2 == (bits1 | 128));
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::CheckHuffmanBitsSIMD(int, int, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::CheckHuffmanBitsSIMD(int, int, int, int)
2807
2808
struct Exactly14Bits {
2809
0
  explicit Exactly14Bits(size_t bitdepth_) { assert(bitdepth_ == 14); }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::Exactly14Bits::Exactly14Bits(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::Exactly14Bits::Exactly14Bits(unsigned long)
2810
  // Force LZ77 symbols to have at least 8 bits, and raw symbols 15 and 16 to
2811
  // have exactly 8, and no other symbol to have 8 or more. This ensures that
2812
  // the representation for 15 and 16 is identical up to one bit.
2813
  static constexpr uint8_t kMinRawLength[18] = {
2814
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 7,
2815
  };
2816
  static constexpr uint8_t kMaxRawLength[18] = {
2817
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 10,
2818
  };
2819
  static constexpr size_t bitdepth = 14;
2820
0
  static size_t MaxEncodedBitsPerSample() { return 22; }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::Exactly14Bits::MaxEncodedBitsPerSample()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::Exactly14Bits::MaxEncodedBitsPerSample()
2821
  static constexpr size_t kInputBytes = 2;
2822
  using pixel_t = int16_t;
2823
  using upixel_t = uint16_t;
2824
2825
  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
2826
                             size_t n, uint8_t* nbits_simd,
2827
0
                             uint8_t* bits_simd) {
2828
0
    assert(n == 17);
2829
0
    CheckHuffmanBitsSIMD(bits[15], nbits[15], bits[16], nbits[16]);
2830
0
    memcpy(nbits_simd, nbits, 16);
2831
0
    memcpy(bits_simd, bits, 16);
2832
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::Exactly14Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::Exactly14Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
2833
2834
#ifdef FJXL_GENERIC_SIMD
2835
  static void EncodeChunkSimd(upixel_t* residuals, size_t n, size_t skip,
2836
                              const uint8_t* raw_nbits_simd,
2837
0
                              const uint8_t* raw_bits_simd, BitWriter& output) {
2838
0
    Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes];
2839
0
    alignas(64) uint16_t bits[SIMDVec16::kLanes];
2840
0
    alignas(64) uint16_t nbits[SIMDVec16::kLanes];
2841
0
    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
2842
0
    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
2843
0
    alignas(64) uint16_t token[SIMDVec16::kLanes];
2844
0
    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
2845
0
      TokenizeSIMD(residuals + i, token, nbits, bits);
2846
0
      HuffmanSIMD14(token, raw_nbits_simd, raw_bits_simd, nbits_huff,
2847
0
                    bits_huff);
2848
0
      StoreSIMDUpTo14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
2849
0
                      std::max(skip, i) - i,
2850
0
                      bits32 + 2 * i / SIMDVec16::kLanes);
2851
0
    }
2852
0
    StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output);
2853
0
  }
2854
#endif
2855
2856
0
  // Symbol count for the 14-bit case (the same 17 that PrepareForSimd expects
  // as n); the bool argument is ignored.
  size_t NumSymbols(bool /*unused*/) const {
    constexpr size_t kNumSymbols = 17;
    return kNumSymbols;
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::Exactly14Bits::NumSymbols(bool) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::Exactly14Bits::NumSymbols(bool) const
2857
};
2858
// Namespace-scope definition of the static constexpr member array; needed
// before C++17 if the array is odr-used.
constexpr uint8_t Exactly14Bits::kMinRawLength[];
2859
// Namespace-scope definition of the static constexpr member array; needed
// before C++17 if the array is odr-used.
constexpr uint8_t Exactly14Bits::kMaxRawLength[];
2860
2861
struct MoreThan14Bits {
2862
  size_t bitdepth;
2863
0
  explicit MoreThan14Bits(size_t bitdepth) : bitdepth(bitdepth) {
2864
0
    assert(bitdepth > 14);
2865
0
    assert(bitdepth <= 16);
2866
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::MoreThan14Bits::MoreThan14Bits(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::MoreThan14Bits::MoreThan14Bits(unsigned long)
2867
  // Force LZ77 symbols to have at least 8 bits, and raw symbols 13 to 18 to
2868
  // have exactly 8, and no other symbol to have 8 or more. This ensures that
2869
  // the representation for (13, 14), (15, 16), (17, 18) is identical up to one
2870
  // bit.
2871
  static constexpr uint8_t kMinRawLength[20] = {
2872
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 7,
2873
  };
2874
  static constexpr uint8_t kMaxRawLength[20] = {
2875
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 10,
2876
  };
2877
0
  static size_t MaxEncodedBitsPerSample() { return 24; }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::MoreThan14Bits::MaxEncodedBitsPerSample()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::MoreThan14Bits::MaxEncodedBitsPerSample()
2878
  static constexpr size_t kInputBytes = 2;
2879
  using pixel_t = int32_t;
2880
  using upixel_t = uint32_t;
2881
2882
  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
2883
                             size_t n, uint8_t* nbits_simd,
2884
0
                             uint8_t* bits_simd) {
2885
0
    assert(n == 19);
2886
0
    CheckHuffmanBitsSIMD(bits[13], nbits[13], bits[14], nbits[14]);
2887
0
    CheckHuffmanBitsSIMD(bits[15], nbits[15], bits[16], nbits[16]);
2888
0
    CheckHuffmanBitsSIMD(bits[17], nbits[17], bits[18], nbits[18]);
2889
0
    for (size_t i = 0; i < 14; i++) {
2890
0
      nbits_simd[i] = nbits[i];
2891
0
      bits_simd[i] = bits[i];
2892
0
    }
2893
0
    nbits_simd[14] = nbits[15];
2894
0
    bits_simd[14] = bits[15];
2895
0
    nbits_simd[15] = nbits[17];
2896
0
    bits_simd[15] = bits[17];
2897
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::MoreThan14Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::MoreThan14Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
2898
2899
#ifdef FJXL_GENERIC_SIMD
2900
  static void EncodeChunkSimd(upixel_t* residuals, size_t n, size_t skip,
2901
                              const uint8_t* raw_nbits_simd,
2902
0
                              const uint8_t* raw_bits_simd, BitWriter& output) {
2903
0
    Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes];
2904
0
    alignas(64) uint32_t bits[SIMDVec16::kLanes];
2905
0
    alignas(64) uint32_t nbits[SIMDVec16::kLanes];
2906
0
    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
2907
0
    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
2908
0
    alignas(64) uint16_t token[SIMDVec16::kLanes];
2909
0
    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
2910
0
      TokenizeSIMD(residuals + i, token, nbits, bits);
2911
0
      HuffmanSIMDAbove14(token, raw_nbits_simd, raw_bits_simd, nbits_huff,
2912
0
                         bits_huff);
2913
0
      StoreSIMDAbove14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
2914
0
                       std::max(skip, i) - i,
2915
0
                       bits32 + 2 * i / SIMDVec16::kLanes);
2916
0
    }
2917
0
    StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output);
2918
0
  }
2919
#endif
2920
0
  size_t NumSymbols(bool) const { return 19; }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::MoreThan14Bits::NumSymbols(bool) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::MoreThan14Bits::NumSymbols(bool) const
2921
};
2922
// Namespace-scope definition of the static constexpr member array; needed
// before C++17 if the array is odr-used.
constexpr uint8_t MoreThan14Bits::kMinRawLength[];
2923
// Namespace-scope definition of the static constexpr member array; needed
// before C++17 if the array is odr-used.
constexpr uint8_t MoreThan14Bits::kMaxRawLength[];
2924
2925
bool PrepareDCGlobalCommon(bool is_single_group, size_t width, size_t height,
2926
0
                           const PrefixCode code[4], BitWriter* output) {
2927
0
  if (!output->Allocate(100000 + (is_single_group ? width * height * 16 : 0))) {
2928
0
    return false;
2929
0
  }
2930
  // No patches, spline or noise.
2931
0
  output->Write(1, 1);  // default DC dequantization factors (?)
2932
0
  output->Write(1, 1);  // use global tree / histograms
2933
0
  output->Write(1, 0);  // no lz77 for the tree
2934
2935
0
  output->Write(1, 1);         // simple code for the tree's context map
2936
0
  output->Write(2, 0);         // all contexts clustered together
2937
0
  output->Write(1, 1);         // use prefix code for tree
2938
0
  output->Write(4, 0);         // 000 hybrid uint
2939
0
  output->Write(6, 0b100011);  // Alphabet size is 4 (var16)
2940
0
  output->Write(2, 1);         // simple prefix code
2941
0
  output->Write(2, 3);         // with 4 symbols
2942
0
  output->Write(2, 0);
2943
0
  output->Write(2, 1);
2944
0
  output->Write(2, 2);
2945
0
  output->Write(2, 3);
2946
0
  output->Write(1, 0);  // First tree encoding option
2947
2948
  // Huffman table + extra bits for the tree.
2949
0
  uint8_t symbol_bits[6] = {0b00, 0b10, 0b001, 0b101, 0b0011, 0b0111};
2950
0
  uint8_t symbol_nbits[6] = {2, 2, 3, 3, 4, 4};
2951
  // Write a tree with a leaf per channel, and gradient predictor for every
2952
  // leaf.
2953
0
  for (auto v : {1, 2, 1, 4, 1, 0, 0, 5, 0, 0, 0, 0, 5,
2954
0
                 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0, 0, 0}) {
2955
0
    output->Write(symbol_nbits[v], symbol_bits[v]);
2956
0
  }
2957
2958
0
  output->Write(1, 1);     // Enable lz77 for the main bitstream
2959
0
  output->Write(2, 0b00);  // lz77 offset 224
2960
0
  static_assert(kLZ77Offset == 224, "kLZ77Offset should be 224");
2961
0
  output->Write(4, 0b1010);  // lz77 min length 7
2962
  // 400 hybrid uint config for lz77
2963
0
  output->Write(4, 4);
2964
0
  output->Write(3, 0);
2965
0
  output->Write(3, 0);
2966
2967
0
  output->Write(1, 1);  // simple code for the context map
2968
0
  output->Write(2, 3);  // 3 bits per entry
2969
0
  output->Write(3, 4);  // channel 3
2970
0
  output->Write(3, 3);  // channel 2
2971
0
  output->Write(3, 2);  // channel 1
2972
0
  output->Write(3, 1);  // channel 0
2973
0
  output->Write(3, 0);  // distance histogram first
2974
2975
0
  output->Write(1, 1);  // use prefix codes
2976
0
  output->Write(4, 0);  // 000 hybrid uint config for distances (only need 0)
2977
0
  for (size_t i = 0; i < 4; i++) {
2978
0
    output->Write(4, 0);  // 000 hybrid uint config for symbols (only <= 10)
2979
0
  }
2980
2981
  // Distance alphabet size:
2982
0
  output->Write(5, 0b00001);  // 2: just need 1 for RLE (i.e. distance 1)
2983
  // Symbol + LZ77 alphabet size:
2984
0
  for (size_t i = 0; i < 4; i++) {
2985
0
    output->Write(1, 1);    // > 1
2986
0
    output->Write(4, 8);    // <= 512
2987
0
    output->Write(8, 256);  // == 512
2988
0
  }
2989
2990
  // Distance histogram:
2991
0
  output->Write(2, 1);  // simple prefix code
2992
0
  output->Write(2, 0);  // with one symbol
2993
0
  output->Write(1, 1);  // 1
2994
2995
  // Symbol + lz77 histogram:
2996
0
  for (size_t i = 0; i < 4; i++) {
2997
0
    code[i].WriteTo(output);
2998
0
  }
2999
3000
  // Group header for global modular image.
3001
0
  output->Write(1, 1);  // Global tree
3002
0
  output->Write(1, 1);  // All default wp
3003
0
  return true;
3004
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::PrepareDCGlobalCommon(bool, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, (anonymous namespace)::BitWriter*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::PrepareDCGlobalCommon(bool, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, (anonymous namespace)::BitWriter*)
3005
3006
bool PrepareDCGlobal(bool is_single_group, size_t width, size_t height,
3007
                     size_t nb_chans, const PrefixCode code[4],
3008
0
                     BitWriter* output) {
3009
0
  if (!PrepareDCGlobalCommon(is_single_group, width, height, code, output)) {
3010
0
    return false;
3011
0
  }
3012
0
  if (nb_chans > 2) {
3013
0
    output->Write(2, 0b01);     // 1 transform
3014
0
    output->Write(2, 0b00);     // RCT
3015
0
    output->Write(5, 0b00000);  // Starting from ch 0
3016
0
    output->Write(2, 0b00);     // YCoCg
3017
0
  } else {
3018
0
    output->Write(2, 0b00);  // no transforms
3019
0
  }
3020
0
  if (!is_single_group) {
3021
0
    output->ZeroPadToByte();
3022
0
  }
3023
0
  return true;
3024
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::PrepareDCGlobal(bool, unsigned long, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, (anonymous namespace)::BitWriter*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::PrepareDCGlobal(bool, unsigned long, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, (anonymous namespace)::BitWriter*)
3025
3026
template <typename BitDepth>
3027
struct ChunkEncoder {
3028
0
  void PrepareForSimd() {
3029
0
    BitDepth::PrepareForSimd(code->raw_nbits, code->raw_bits, code->numraw,
3030
0
                             raw_nbits_simd, raw_bits_simd);
3031
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>::PrepareForSimd()
3032
  FJXL_INLINE static void EncodeRle(size_t count, const PrefixCode& code,
3033
0
                                    BitWriter& output) {
3034
0
    if (count == 0) return;
3035
0
    count -= kLZ77MinLength + 1;
3036
0
    if (count < kLZ77CacheSize) {
3037
0
      output.Write(code.lz77_cache_nbits[count], code.lz77_cache_bits[count]);
3038
0
    } else {
3039
0
      unsigned token, nbits, bits;
3040
0
      EncodeHybridUintLZ77(count, &token, &nbits, &bits);
3041
0
      uint64_t wbits = bits;
3042
0
      wbits = (wbits << code.lz77_nbits[token]) | code.lz77_bits[token];
3043
0
      wbits = (wbits << code.raw_nbits[0]) | code.raw_bits[0];
3044
0
      output.Write(code.lz77_nbits[token] + nbits + code.raw_nbits[0], wbits);
3045
0
    }
3046
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
3047
3048
  FJXL_INLINE void Chunk(size_t run, typename BitDepth::upixel_t* residuals,
3049
0
                         size_t skip, size_t n) {
3050
0
    EncodeRle(run, *code, *output);
3051
#ifdef FJXL_GENERIC_SIMD
3052
    BitDepth::EncodeChunkSimd(residuals, n, skip, raw_nbits_simd, raw_bits_simd,
3053
                              *output);
3054
#else
3055
    GenericEncodeChunk(residuals, n, skip, *code, *output);
3056
#endif
3057
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>::Chunk(unsigned long, unsigned int*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>::Chunk(unsigned long, unsigned int*, unsigned long, unsigned long)
3058
3059
0
  inline void Finalize(size_t run) { EncodeRle(run, *code, *output); }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>::Finalize(unsigned long)
3060
3061
  const PrefixCode* code;
3062
  BitWriter* output;
3063
  alignas(64) uint8_t raw_nbits_simd[16] = {};
3064
  alignas(64) uint8_t raw_bits_simd[16] = {};
3065
};
3066
3067
template <typename BitDepth>
3068
struct ChunkSampleCollector {
3069
0
  FJXL_INLINE void Rle(size_t count, uint64_t* lz77_counts_) {
3070
0
    if (count == 0) return;
3071
0
    raw_counts[0] += 1;
3072
0
    count -= kLZ77MinLength + 1;
3073
0
    unsigned token, nbits, bits;
3074
0
    EncodeHybridUintLZ77(count, &token, &nbits, &bits);
3075
0
    lz77_counts_[token]++;
3076
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>::Rle(unsigned long, unsigned long*)
3077
3078
  FJXL_INLINE void Chunk(size_t run, typename BitDepth::upixel_t* residuals,
3079
0
                         size_t skip, size_t n) {
3080
    // Run is broken. Encode the run and encode the individual vector.
3081
0
    Rle(run, lz77_counts);
3082
0
    for (size_t ix = skip; ix < n; ix++) {
3083
0
      unsigned token, nbits, bits;
3084
0
      EncodeHybridUint000(residuals[ix], &token, &nbits, &bits);
3085
0
      raw_counts[token]++;
3086
0
    }
3087
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>::Chunk(unsigned long, unsigned int*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>::Chunk(unsigned long, unsigned int*, unsigned long, unsigned long)
3088
3089
  // don't count final run since we don't know how long it really is
3090
0
  void Finalize(size_t run) {}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>::Finalize(unsigned long)
3091
3092
  uint64_t* raw_counts;
3093
  uint64_t* lz77_counts;
3094
};
3095
3096
0
// Maps a signed value to an unsigned one so that small magnitudes stay small:
// v >= 0 becomes 2 * v and v < 0 becomes -2 * v - 1 (modulo 2^32).
constexpr uint32_t PackSigned(int32_t value) {
  return (static_cast<uint32_t>(value) << 1) ^
         (value < 0 ? ~uint32_t{0} : uint32_t{0});
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::PackSigned(int)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::PackSigned(int)
3100
3101
template <typename T, typename BitDepth>
3102
struct ChannelRowProcessor {
3103
  using upixel_t = typename BitDepth::upixel_t;
3104
  using pixel_t = typename BitDepth::pixel_t;
3105
  T* t;
3106
  void ProcessChunk(const pixel_t* row, const pixel_t* row_left,
3107
                    const pixel_t* row_top, const pixel_t* row_topleft,
3108
0
                    size_t n) {
3109
0
    alignas(64) upixel_t residuals[kChunkSize] = {};
3110
0
    size_t prefix_size = 0;
3111
0
    size_t required_prefix_size = 0;
3112
#ifdef FJXL_GENERIC_SIMD
3113
    constexpr size_t kNum =
3114
0
        sizeof(pixel_t) == 2 ? SIMDVec16::kLanes : SIMDVec32::kLanes;
3115
0
    for (size_t ix = 0; ix < kChunkSize; ix += kNum) {
3116
0
      size_t c =
3117
0
          PredictPixels<simd_t<pixel_t>>(row + ix, row_left + ix, row_top + ix,
3118
0
                                         row_topleft + ix, residuals + ix);
3119
0
      prefix_size =
3120
0
          prefix_size == required_prefix_size ? prefix_size + c : prefix_size;
3121
0
      required_prefix_size += kNum;
3122
0
    }
3123
#else
3124
0
    for (size_t ix = 0; ix < kChunkSize; ix++) {
3125
0
      pixel_t px = row[ix];
3126
0
      pixel_t left = row_left[ix];
3127
0
      pixel_t top = row_top[ix];
3128
0
      pixel_t topleft = row_topleft[ix];
3129
0
      pixel_t ac = left - topleft;
3130
0
      pixel_t ab = left - top;
3131
0
      pixel_t bc = top - topleft;
3132
0
      pixel_t grad = static_cast<pixel_t>(static_cast<upixel_t>(ac) +
3133
0
                                          static_cast<upixel_t>(top));
3134
0
      pixel_t d = ab ^ bc;
3135
0
      pixel_t clamp = d < 0 ? top : left;
3136
0
      pixel_t s = ac ^ bc;
3137
0
      pixel_t pred = s < 0 ? grad : clamp;
3138
0
      residuals[ix] = PackSigned(px - pred);
3139
0
      prefix_size = prefix_size == required_prefix_size
3140
0
                        ? prefix_size + (residuals[ix] == 0)
3141
0
                        : prefix_size;
3142
0
      required_prefix_size += 1;
3143
0
    }
3144
#endif
3145
0
    prefix_size = std::min(n, prefix_size);
3146
0
    if (prefix_size == n && (run > 0 || prefix_size > kLZ77MinLength)) {
3147
      // Run continues, nothing to do.
3148
0
      run += prefix_size;
3149
0
    } else if (prefix_size + run > kLZ77MinLength) {
3150
      // Run is broken. Encode the run and encode the individual vector.
3151
0
      t->Chunk(run + prefix_size, residuals, prefix_size, n);
3152
0
      run = 0;
3153
0
    } else {
3154
      // There was no run to begin with.
3155
0
      t->Chunk(0, residuals, 0, n);
3156
0
    }
3157
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::ProcessChunk(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::ProcessChunk(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::ProcessChunk(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::ProcessChunk(int const*, int const*, int const*, int const*, unsigned long)
3158
3159
  void ProcessRow(const pixel_t* row, const pixel_t* row_left,
3160
                  const pixel_t* row_top, const pixel_t* row_topleft,
3161
0
                  size_t xs) {
3162
0
    for (size_t x = 0; x < xs; x += kChunkSize) {
3163
0
      ProcessChunk(row + x, row_left + x, row_top + x, row_topleft + x,
3164
0
                   std::min(kChunkSize, xs - x));
3165
0
    }
3166
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::ProcessRow(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::ProcessRow(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::ProcessRow(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::ProcessRow(int const*, int const*, int const*, int const*, unsigned long)
3167
3168
0
  void Finalize() { t->Finalize(run); }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::Finalize()
3169
  // Invariant: run == 0 or run > kLZ77MinLength.
3170
  size_t run = 0;
3171
};
3172
3173
0
uint16_t LoadLE16(const unsigned char* ptr) {
3174
0
  return uint16_t{ptr[0]} | (uint16_t{ptr[1]} << 8);
3175
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LoadLE16(unsigned char const*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LoadLE16(unsigned char const*)
3176
3177
0
uint16_t SwapEndian(uint16_t in) { return (in >> 8) | (in << 8); }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::SwapEndian(unsigned short)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::SwapEndian(unsigned short)
3178
3179
#ifdef FJXL_GENERIC_SIMD
3180
0
void StorePixels(SIMDVec16 p, int16_t* dest) { p.Store((uint16_t*)dest); }
3181
3182
0
void StorePixels(SIMDVec16 p, int32_t* dest) {
3183
0
  VecPair<SIMDVec32> p_up = p.Upcast();
3184
0
  p_up.low.Store((uint32_t*)dest);
3185
0
  p_up.hi.Store((uint32_t*)dest + SIMDVec32::kLanes);
3186
0
}
3187
#endif
3188
3189
template <typename pixel_t>
3190
0
void FillRowG8(const unsigned char* rgba, size_t oxs, pixel_t* luma) {
3191
0
  size_t x = 0;
3192
#ifdef FJXL_GENERIC_SIMD
3193
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3194
0
    auto rgb = SIMDVec16::LoadG8(rgba + x);
3195
0
    StorePixels(rgb[0], luma + x);
3196
0
  }
3197
#endif
3198
0
  for (; x < oxs; x++) {
3199
0
    luma[x] = rgba[x];
3200
0
  }
3201
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG8<short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG8<short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG8<int>(unsigned char const*, unsigned long, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG8<int>(unsigned char const*, unsigned long, int*)
3202
3203
template <bool big_endian, typename pixel_t>
3204
0
void FillRowG16(const unsigned char* rgba, size_t oxs, pixel_t* luma) {
3205
0
  size_t x = 0;
3206
#ifdef FJXL_GENERIC_SIMD
3207
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3208
0
    auto rgb = SIMDVec16::LoadG16(rgba + 2 * x);
3209
0
    if (big_endian) {
3210
0
      rgb[0].SwapEndian();
3211
0
    }
3212
0
    StorePixels(rgb[0], luma + x);
3213
0
  }
3214
#endif
3215
0
  for (; x < oxs; x++) {
3216
0
    uint16_t val = LoadLE16(rgba + 2 * x);
3217
0
    if (big_endian) {
3218
0
      val = SwapEndian(val);
3219
0
    }
3220
0
    luma[x] = val;
3221
0
  }
3222
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG16<true, short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG16<false, short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG16<true, int>(unsigned char const*, unsigned long, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG16<false, int>(unsigned char const*, unsigned long, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG16<true, short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG16<false, short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG16<true, int>(unsigned char const*, unsigned long, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG16<false, int>(unsigned char const*, unsigned long, int*)
3223
3224
template <typename pixel_t>
3225
void FillRowGA8(const unsigned char* rgba, size_t oxs, pixel_t* luma,
3226
0
                pixel_t* alpha) {
3227
0
  size_t x = 0;
3228
#ifdef FJXL_GENERIC_SIMD
3229
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3230
0
    auto rgb = SIMDVec16::LoadGA8(rgba + 2 * x);
3231
0
    StorePixels(rgb[0], luma + x);
3232
0
    StorePixels(rgb[1], alpha + x);
3233
0
  }
3234
#endif
3235
0
  for (; x < oxs; x++) {
3236
0
    luma[x] = rgba[2 * x];
3237
0
    alpha[x] = rgba[2 * x + 1];
3238
0
  }
3239
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA8<short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA8<short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA8<int>(unsigned char const*, unsigned long, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA8<int>(unsigned char const*, unsigned long, int*, int*)
3240
3241
template <bool big_endian, typename pixel_t>
3242
void FillRowGA16(const unsigned char* rgba, size_t oxs, pixel_t* luma,
3243
0
                 pixel_t* alpha) {
3244
0
  size_t x = 0;
3245
#ifdef FJXL_GENERIC_SIMD
3246
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3247
0
    auto rgb = SIMDVec16::LoadGA16(rgba + 4 * x);
3248
0
    if (big_endian) {
3249
0
      rgb[0].SwapEndian();
3250
0
      rgb[1].SwapEndian();
3251
0
    }
3252
0
    StorePixels(rgb[0], luma + x);
3253
0
    StorePixels(rgb[1], alpha + x);
3254
0
  }
3255
#endif
3256
0
  for (; x < oxs; x++) {
3257
0
    uint16_t l = LoadLE16(rgba + 4 * x);
3258
0
    uint16_t a = LoadLE16(rgba + 4 * x + 2);
3259
0
    if (big_endian) {
3260
0
      l = SwapEndian(l);
3261
0
      a = SwapEndian(a);
3262
0
    }
3263
0
    luma[x] = l;
3264
0
    alpha[x] = a;
3265
0
  }
3266
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA16<true, short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA16<false, short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA16<true, int>(unsigned char const*, unsigned long, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA16<false, int>(unsigned char const*, unsigned long, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA16<true, short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA16<false, short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA16<true, int>(unsigned char const*, unsigned long, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA16<false, int>(unsigned char const*, unsigned long, int*, int*)
3267
3268
template <typename pixel_t>
3269
void StoreYCoCg(pixel_t r, pixel_t g, pixel_t b, pixel_t* y, pixel_t* co,
3270
0
                pixel_t* cg) {
3271
0
  *co = r - b;
3272
0
  pixel_t tmp = b + (*co >> 1);
3273
0
  *cg = g - tmp;
3274
0
  *y = tmp + (*cg >> 1);
3275
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::StoreYCoCg<short>(short, short, short, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::StoreYCoCg<int>(int, int, int, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::StoreYCoCg<short>(short, short, short, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::StoreYCoCg<int>(int, int, int, int*, int*, int*)
3276
3277
#ifdef FJXL_GENERIC_SIMD
3278
void StoreYCoCg(SIMDVec16 r, SIMDVec16 g, SIMDVec16 b, int16_t* y, int16_t* co,
3279
0
                int16_t* cg) {
3280
0
  SIMDVec16 co_v = r.Sub(b);
3281
0
  SIMDVec16 tmp = b.Add(co_v.SignedShiftRight<1>());
3282
0
  SIMDVec16 cg_v = g.Sub(tmp);
3283
0
  SIMDVec16 y_v = tmp.Add(cg_v.SignedShiftRight<1>());
3284
0
  y_v.Store(reinterpret_cast<uint16_t*>(y));
3285
0
  co_v.Store(reinterpret_cast<uint16_t*>(co));
3286
0
  cg_v.Store(reinterpret_cast<uint16_t*>(cg));
3287
0
}
3288
3289
void StoreYCoCg(SIMDVec16 r, SIMDVec16 g, SIMDVec16 b, int32_t* y, int32_t* co,
3290
0
                int32_t* cg) {
3291
0
  VecPair<SIMDVec32> r_up = r.Upcast();
3292
0
  VecPair<SIMDVec32> g_up = g.Upcast();
3293
0
  VecPair<SIMDVec32> b_up = b.Upcast();
3294
0
  SIMDVec32 co_lo_v = r_up.low.Sub(b_up.low);
3295
0
  SIMDVec32 tmp_lo = b_up.low.Add(co_lo_v.SignedShiftRight<1>());
3296
0
  SIMDVec32 cg_lo_v = g_up.low.Sub(tmp_lo);
3297
0
  SIMDVec32 y_lo_v = tmp_lo.Add(cg_lo_v.SignedShiftRight<1>());
3298
0
  SIMDVec32 co_hi_v = r_up.hi.Sub(b_up.hi);
3299
0
  SIMDVec32 tmp_hi = b_up.hi.Add(co_hi_v.SignedShiftRight<1>());
3300
0
  SIMDVec32 cg_hi_v = g_up.hi.Sub(tmp_hi);
3301
0
  SIMDVec32 y_hi_v = tmp_hi.Add(cg_hi_v.SignedShiftRight<1>());
3302
0
  y_lo_v.Store(reinterpret_cast<uint32_t*>(y));
3303
0
  co_lo_v.Store(reinterpret_cast<uint32_t*>(co));
3304
0
  cg_lo_v.Store(reinterpret_cast<uint32_t*>(cg));
3305
0
  y_hi_v.Store(reinterpret_cast<uint32_t*>(y) + SIMDVec32::kLanes);
3306
0
  co_hi_v.Store(reinterpret_cast<uint32_t*>(co) + SIMDVec32::kLanes);
3307
0
  cg_hi_v.Store(reinterpret_cast<uint32_t*>(cg) + SIMDVec32::kLanes);
3308
0
}
3309
#endif
3310
3311
template <typename pixel_t>
3312
void FillRowRGB8(const unsigned char* rgba, size_t oxs, pixel_t* y, pixel_t* co,
3313
0
                 pixel_t* cg) {
3314
0
  size_t x = 0;
3315
#ifdef FJXL_GENERIC_SIMD
3316
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3317
0
    auto rgb = SIMDVec16::LoadRGB8(rgba + 3 * x);
3318
0
    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
3319
0
  }
3320
#endif
3321
0
  for (; x < oxs; x++) {
3322
0
    uint16_t r = rgba[3 * x];
3323
0
    uint16_t g = rgba[3 * x + 1];
3324
0
    uint16_t b = rgba[3 * x + 2];
3325
0
    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
3326
0
  }
3327
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB8<short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB8<short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB8<int>(unsigned char const*, unsigned long, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB8<int>(unsigned char const*, unsigned long, int*, int*, int*)
3328
3329
template <bool big_endian, typename pixel_t>
3330
void FillRowRGB16(const unsigned char* rgba, size_t oxs, pixel_t* y,
3331
0
                  pixel_t* co, pixel_t* cg) {
3332
0
  size_t x = 0;
3333
#ifdef FJXL_GENERIC_SIMD
3334
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3335
0
    auto rgb = SIMDVec16::LoadRGB16(rgba + 6 * x);
3336
0
    if (big_endian) {
3337
0
      rgb[0].SwapEndian();
3338
0
      rgb[1].SwapEndian();
3339
0
      rgb[2].SwapEndian();
3340
0
    }
3341
0
    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
3342
0
  }
3343
#endif
3344
0
  for (; x < oxs; x++) {
3345
0
    uint16_t r = LoadLE16(rgba + 6 * x);
3346
0
    uint16_t g = LoadLE16(rgba + 6 * x + 2);
3347
0
    uint16_t b = LoadLE16(rgba + 6 * x + 4);
3348
0
    if (big_endian) {
3349
0
      r = SwapEndian(r);
3350
0
      g = SwapEndian(g);
3351
0
      b = SwapEndian(b);
3352
0
    }
3353
0
    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
3354
0
  }
3355
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB16<true, short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB16<false, short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB16<true, int>(unsigned char const*, unsigned long, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB16<false, int>(unsigned char const*, unsigned long, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB16<true, short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB16<false, short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB16<true, int>(unsigned char const*, unsigned long, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB16<false, int>(unsigned char const*, unsigned long, int*, int*, int*)
3356
3357
template <typename pixel_t>
3358
void FillRowRGBA8(const unsigned char* rgba, size_t oxs, pixel_t* y,
3359
0
                  pixel_t* co, pixel_t* cg, pixel_t* alpha) {
3360
0
  size_t x = 0;
3361
#ifdef FJXL_GENERIC_SIMD
3362
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3363
0
    auto rgb = SIMDVec16::LoadRGBA8(rgba + 4 * x);
3364
0
    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
3365
0
    StorePixels(rgb[3], alpha + x);
3366
0
  }
3367
#endif
3368
0
  for (; x < oxs; x++) {
3369
0
    uint16_t r = rgba[4 * x];
3370
0
    uint16_t g = rgba[4 * x + 1];
3371
0
    uint16_t b = rgba[4 * x + 2];
3372
0
    uint16_t a = rgba[4 * x + 3];
3373
0
    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
3374
0
    alpha[x] = a;
3375
0
  }
3376
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA8<short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA8<short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA8<int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA8<int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
3377
3378
template <bool big_endian, typename pixel_t>
3379
void FillRowRGBA16(const unsigned char* rgba, size_t oxs, pixel_t* y,
3380
0
                   pixel_t* co, pixel_t* cg, pixel_t* alpha) {
3381
0
  size_t x = 0;
3382
#ifdef FJXL_GENERIC_SIMD
3383
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3384
0
    auto rgb = SIMDVec16::LoadRGBA16(rgba + 8 * x);
3385
0
    if (big_endian) {
3386
0
      rgb[0].SwapEndian();
3387
0
      rgb[1].SwapEndian();
3388
0
      rgb[2].SwapEndian();
3389
0
      rgb[3].SwapEndian();
3390
0
    }
3391
0
    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
3392
0
    StorePixels(rgb[3], alpha + x);
3393
0
  }
3394
#endif
3395
0
  for (; x < oxs; x++) {
3396
0
    uint16_t r = LoadLE16(rgba + 8 * x);
3397
0
    uint16_t g = LoadLE16(rgba + 8 * x + 2);
3398
0
    uint16_t b = LoadLE16(rgba + 8 * x + 4);
3399
0
    uint16_t a = LoadLE16(rgba + 8 * x + 6);
3400
0
    if (big_endian) {
3401
0
      r = SwapEndian(r);
3402
0
      g = SwapEndian(g);
3403
0
      b = SwapEndian(b);
3404
0
      a = SwapEndian(a);
3405
0
    }
3406
0
    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
3407
0
    alpha[x] = a;
3408
0
  }
3409
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA16<true, short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA16<false, short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA16<true, int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA16<false, int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA16<true, short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA16<false, short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA16<true, int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA16<false, int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
3410
3411
template <typename Processor, typename BitDepth>
3412
void ProcessImageArea(const unsigned char* rgba, size_t x0, size_t y0,
3413
                      size_t xs, size_t yskip, size_t ys, size_t row_stride,
3414
                      BitDepth bitdepth, size_t nb_chans, bool big_endian,
3415
0
                      Processor* processors) {
3416
0
  constexpr size_t kPadding = 32;
3417
3418
0
  using pixel_t = typename BitDepth::pixel_t;
3419
3420
0
  constexpr size_t kAlign = 64;
3421
0
  constexpr size_t kAlignPixels = kAlign / sizeof(pixel_t);
3422
3423
0
  auto align = [=](pixel_t* ptr) {
3424
0
    size_t offset = reinterpret_cast<size_t>(ptr) % kAlign;
3425
0
    if (offset) {
3426
0
      ptr += offset / sizeof(pixel_t);
3427
0
    }
3428
0
    return ptr;
3429
0
  };
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>*)::{lambda(int*)#1}::operator()(int*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>*)::{lambda(int*)#1}::operator()(int*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>*)::{lambda(int*)#1}::operator()(int*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>*)::{lambda(int*)#1}::operator()(int*) const
3430
3431
0
  constexpr size_t kNumPx =
3432
0
      (256 + kPadding * 2 + kAlignPixels + kAlignPixels - 1) / kAlignPixels *
3433
0
      kAlignPixels;
3434
3435
0
  std::vector<std::array<std::array<pixel_t, kNumPx>, 2>> group_data(nb_chans);
3436
3437
0
  for (size_t y = 0; y < ys; y++) {
3438
0
    const auto rgba_row =
3439
0
        rgba + row_stride * (y0 + y) + x0 * nb_chans * BitDepth::kInputBytes;
3440
0
    pixel_t* crow[4] = {};
3441
0
    pixel_t* prow[4] = {};
3442
0
    for (size_t i = 0; i < nb_chans; i++) {
3443
0
      crow[i] = align(&group_data[i][y & 1][kPadding]);
3444
0
      prow[i] = align(&group_data[i][(y - 1) & 1][kPadding]);
3445
0
    }
3446
3447
    // Pre-fill rows with YCoCg converted pixels.
3448
0
    if (nb_chans == 1) {
3449
0
      if (BitDepth::kInputBytes == 1) {
3450
0
        FillRowG8(rgba_row, xs, crow[0]);
3451
0
      } else if (big_endian) {
3452
0
        FillRowG16</*big_endian=*/true>(rgba_row, xs, crow[0]);
3453
0
      } else {
3454
0
        FillRowG16</*big_endian=*/false>(rgba_row, xs, crow[0]);
3455
0
      }
3456
0
    } else if (nb_chans == 2) {
3457
0
      if (BitDepth::kInputBytes == 1) {
3458
0
        FillRowGA8(rgba_row, xs, crow[0], crow[1]);
3459
0
      } else if (big_endian) {
3460
0
        FillRowGA16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1]);
3461
0
      } else {
3462
0
        FillRowGA16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1]);
3463
0
      }
3464
0
    } else if (nb_chans == 3) {
3465
0
      if (BitDepth::kInputBytes == 1) {
3466
0
        FillRowRGB8(rgba_row, xs, crow[0], crow[1], crow[2]);
3467
0
      } else if (big_endian) {
3468
0
        FillRowRGB16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1],
3469
0
                                          crow[2]);
3470
0
      } else {
3471
0
        FillRowRGB16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1],
3472
0
                                           crow[2]);
3473
0
      }
3474
0
    } else {
3475
0
      if (BitDepth::kInputBytes == 1) {
3476
0
        FillRowRGBA8(rgba_row, xs, crow[0], crow[1], crow[2], crow[3]);
3477
0
      } else if (big_endian) {
3478
0
        FillRowRGBA16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1],
3479
0
                                           crow[2], crow[3]);
3480
0
      } else {
3481
0
        FillRowRGBA16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1],
3482
0
                                            crow[2], crow[3]);
3483
0
      }
3484
0
    }
3485
    // Deal with x == 0.
3486
0
    for (size_t c = 0; c < nb_chans; c++) {
3487
0
      *(crow[c] - 1) = y > 0 ? *(prow[c]) : 0;
3488
      // Fix topleft.
3489
0
      *(prow[c] - 1) = y > 0 ? *(prow[c]) : 0;
3490
0
    }
3491
0
    if (y < yskip) continue;
3492
0
    for (size_t c = 0; c < nb_chans; c++) {
3493
      // Get pointers to px/left/top/topleft data to speedup loop.
3494
0
      const pixel_t* row = crow[c];
3495
0
      const pixel_t* row_left = crow[c] - 1;
3496
0
      const pixel_t* row_top = y == 0 ? row_left : prow[c];
3497
0
      const pixel_t* row_topleft = y == 0 ? row_left : prow[c] - 1;
3498
3499
0
      processors[c].ProcessRow(row, row_left, row_top, row_topleft, xs);
3500
0
    }
3501
0
  }
3502
0
  for (size_t c = 0; c < nb_chans; c++) {
3503
0
    processors[c].Finalize();
3504
0
  }
3505
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>*)
3506
3507
template <typename BitDepth>
3508
bool WriteACSection(const unsigned char* rgba, size_t x0, size_t y0, size_t xs,
3509
                    size_t ys, size_t row_stride, bool is_single_group,
3510
                    BitDepth bitdepth, size_t nb_chans, bool big_endian,
3511
                    const PrefixCode code[4],
3512
0
                    std::array<BitWriter, 4>& output) {
3513
0
  for (size_t i = 0; i < nb_chans; i++) {
3514
0
    if (is_single_group && i == 0) continue;
3515
0
    if (!output[i].Allocate(xs * ys * bitdepth.MaxEncodedBitsPerSample() + 4)) {
3516
0
      return false;
3517
0
    }
3518
0
  }
3519
0
  if (!is_single_group) {
3520
    // Group header for modular image.
3521
    // When the image is single-group, the global modular image is the one
3522
    // that contains the pixel data, and there is no group header.
3523
0
    output[0].Write(1, 1);     // Global tree
3524
0
    output[0].Write(1, 1);     // All default wp
3525
0
    output[0].Write(2, 0b00);  // 0 transforms
3526
0
  }
3527
3528
0
  ChunkEncoder<BitDepth> encoders[4];
3529
0
  ChannelRowProcessor<ChunkEncoder<BitDepth>, BitDepth> row_encoders[4];
3530
0
  for (size_t c = 0; c < nb_chans; c++) {
3531
0
    row_encoders[c].t = &encoders[c];
3532
0
    encoders[c].output = &output[c];
3533
0
    encoders[c].code = &code[c];
3534
0
    encoders[c].PrepareForSimd();
3535
0
  }
3536
0
  ProcessImageArea<ChannelRowProcessor<ChunkEncoder<BitDepth>, BitDepth>>(
3537
0
      rgba, x0, y0, xs, 0, ys, row_stride, bitdepth, nb_chans, big_endian,
3538
0
      row_encoders);
3539
0
  return true;
3540
0
}
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX2::(anonymous namespace)::WriteACSection<AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX2::(anonymous namespace)::WriteACSection<AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX2::(anonymous namespace)::WriteACSection<AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX2::(anonymous namespace)::WriteACSection<AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:bool default_implementation::(anonymous namespace)::WriteACSection<default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:bool default_implementation::(anonymous namespace)::WriteACSection<default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:bool default_implementation::(anonymous namespace)::WriteACSection<default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:bool default_implementation::(anonymous namespace)::WriteACSection<default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
3541
3542
// Parameters of the multiplicative hash used for palette detection.
constexpr int kHashExp = 16;                       // hash values have 16 bits
constexpr uint32_t kHashSize = 1 << kHashExp;      // number of hash slots
constexpr uint32_t kHashMultiplier = 2654435761;   // multiplier of the hash
constexpr int kMaxColors = 512;

// Hashes a packed pixel value into the range 0 .. kHashSize-1.
// Any function with that range works here, as long as it maps 0 to 0.
inline uint32_t pixel_hash(uint32_t p) {
  // Keep only the top kHashExp bits of the 32-bit (wrapping) product.
  constexpr int kDroppedBits = 32 - kHashExp;
  const uint32_t product = p * kHashMultiplier;
  return product >> kDroppedBits;
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::pixel_hash(unsigned int)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::pixel_hash(unsigned int)
3552
3553
template <size_t nb_chans>
3554
void FillRowPalette(const unsigned char* inrow, size_t xs,
3555
0
                    const int16_t* lookup, int16_t* out) {
3556
0
  for (size_t x = 0; x < xs; x++) {
3557
0
    uint32_t p = 0;
3558
0
    for (size_t i = 0; i < nb_chans; ++i) {
3559
0
      p |= inrow[x * nb_chans + i] << (8 * i);
3560
0
    }
3561
0
    out[x] = lookup[pixel_hash(p)];
3562
0
  }
3563
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowPalette<1ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowPalette<2ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowPalette<3ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowPalette<4ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowPalette<1ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowPalette<2ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowPalette<3ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowPalette<4ul>(unsigned char const*, unsigned long, short const*, short*)
3564
3565
template <typename Processor>
3566
void ProcessImageAreaPalette(const unsigned char* rgba, size_t x0, size_t y0,
3567
                             size_t xs, size_t yskip, size_t ys,
3568
                             size_t row_stride, const int16_t* lookup,
3569
0
                             size_t nb_chans, Processor* processors) {
3570
0
  constexpr size_t kPadding = 32;
3571
3572
0
  std::vector<std::array<int16_t, 256 + kPadding * 2>> group_data(2);
3573
0
  Processor& row_encoder = processors[0];
3574
3575
0
  for (size_t y = 0; y < ys; y++) {
3576
    // Pre-fill rows with palette converted pixels.
3577
0
    const unsigned char* inrow = rgba + row_stride * (y0 + y) + x0 * nb_chans;
3578
0
    int16_t* outrow = &group_data[y & 1][kPadding];
3579
0
    if (nb_chans == 1) {
3580
0
      FillRowPalette<1>(inrow, xs, lookup, outrow);
3581
0
    } else if (nb_chans == 2) {
3582
0
      FillRowPalette<2>(inrow, xs, lookup, outrow);
3583
0
    } else if (nb_chans == 3) {
3584
0
      FillRowPalette<3>(inrow, xs, lookup, outrow);
3585
0
    } else if (nb_chans == 4) {
3586
0
      FillRowPalette<4>(inrow, xs, lookup, outrow);
3587
0
    }
3588
    // Deal with x == 0.
3589
0
    group_data[y & 1][kPadding - 1] =
3590
0
        y > 0 ? group_data[(y - 1) & 1][kPadding] : 0;
3591
    // Fix topleft.
3592
0
    group_data[(y - 1) & 1][kPadding - 1] =
3593
0
        y > 0 ? group_data[(y - 1) & 1][kPadding] : 0;
3594
    // Get pointers to px/left/top/topleft data to speedup loop.
3595
0
    const int16_t* row = &group_data[y & 1][kPadding];
3596
0
    const int16_t* row_left = &group_data[y & 1][kPadding - 1];
3597
0
    const int16_t* row_top =
3598
0
        y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding];
3599
0
    const int16_t* row_topleft =
3600
0
        y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding - 1];
3601
3602
0
    row_encoder.ProcessRow(row, row_left, row_top, row_topleft, xs);
3603
0
  }
3604
0
  row_encoder.Finalize();
3605
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageAreaPalette<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits> >(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, short const*, unsigned long, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageAreaPalette<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits> >(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, short const*, unsigned long, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageAreaPalette<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits> >(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, short const*, unsigned long, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageAreaPalette<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits> >(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, short const*, unsigned long, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)
3606
3607
bool WriteACSectionPalette(const unsigned char* rgba, size_t x0, size_t y0,
3608
                           size_t xs, size_t ys, size_t row_stride,
3609
                           bool is_single_group, const PrefixCode code[4],
3610
                           const int16_t* lookup, size_t nb_chans,
3611
0
                           BitWriter& output) {
3612
0
  if (!is_single_group) {
3613
0
    if (!output.Allocate(16 * xs * ys + 4)) return false;
3614
    // Group header for modular image.
3615
    // When the image is single-group, the global modular image is the one
3616
    // that contains the pixel data, and there is no group header.
3617
0
    output.Write(1, 1);     // Global tree
3618
0
    output.Write(1, 1);     // All default wp
3619
0
    output.Write(2, 0b00);  // 0 transforms
3620
0
  }
3621
3622
0
  ChunkEncoder<UpTo8Bits> encoder;
3623
0
  ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits> row_encoder;
3624
3625
0
  row_encoder.t = &encoder;
3626
0
  encoder.output = &output;
3627
0
  encoder.code = &code[is_single_group ? 1 : 0];
3628
0
  encoder.PrepareForSimd();
3629
0
  ProcessImageAreaPalette<
3630
0
      ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits>>(
3631
0
      rgba, x0, y0, xs, 0, ys, row_stride, lookup, nb_chans, &row_encoder);
3632
0
  return true;
3633
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::WriteACSectionPalette(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, (anonymous namespace)::PrefixCode const*, short const*, unsigned long, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::WriteACSectionPalette(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, (anonymous namespace)::PrefixCode const*, short const*, unsigned long, (anonymous namespace)::BitWriter&)
3634
3635
template <typename BitDepth>
3636
void CollectSamples(const unsigned char* rgba, size_t x0, size_t y0, size_t xs,
3637
                    size_t row_stride, size_t row_count,
3638
                    uint64_t raw_counts[4][kNumRawSymbols],
3639
                    uint64_t lz77_counts[4][kNumLZ77], bool is_single_group,
3640
                    bool palette, BitDepth bitdepth, size_t nb_chans,
3641
0
                    bool big_endian, const int16_t* lookup) {
3642
0
  if (palette) {
3643
0
    ChunkSampleCollector<UpTo8Bits> sample_collectors[4];
3644
0
    ChannelRowProcessor<ChunkSampleCollector<UpTo8Bits>, UpTo8Bits>
3645
0
        row_sample_collectors[4];
3646
0
    for (size_t c = 0; c < nb_chans; c++) {
3647
0
      row_sample_collectors[c].t = &sample_collectors[c];
3648
0
      sample_collectors[c].raw_counts = raw_counts[is_single_group ? 1 : 0];
3649
0
      sample_collectors[c].lz77_counts = lz77_counts[is_single_group ? 1 : 0];
3650
0
    }
3651
0
    ProcessImageAreaPalette<
3652
0
        ChannelRowProcessor<ChunkSampleCollector<UpTo8Bits>, UpTo8Bits>>(
3653
0
        rgba, x0, y0, xs, 1, 1 + row_count, row_stride, lookup, nb_chans,
3654
0
        row_sample_collectors);
3655
0
  } else {
3656
0
    ChunkSampleCollector<BitDepth> sample_collectors[4];
3657
0
    ChannelRowProcessor<ChunkSampleCollector<BitDepth>, BitDepth>
3658
0
        row_sample_collectors[4];
3659
0
    for (size_t c = 0; c < nb_chans; c++) {
3660
0
      row_sample_collectors[c].t = &sample_collectors[c];
3661
0
      sample_collectors[c].raw_counts = raw_counts[c];
3662
0
      sample_collectors[c].lz77_counts = lz77_counts[c];
3663
0
    }
3664
0
    ProcessImageArea<
3665
0
        ChannelRowProcessor<ChunkSampleCollector<BitDepth>, BitDepth>>(
3666
0
        rgba, x0, y0, xs, 1, 1 + row_count, row_stride, bitdepth, nb_chans,
3667
0
        big_endian, row_sample_collectors);
3668
0
  }
3669
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::CollectSamples<AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::CollectSamples<AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::CollectSamples<AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::CollectSamples<AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::CollectSamples<default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::CollectSamples<default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::CollectSamples<default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::CollectSamples<default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, short const*)
3670
3671
bool PrepareDCGlobalPalette(bool is_single_group, size_t width, size_t height,
3672
                            size_t nb_chans, const PrefixCode code[4],
3673
                            const std::vector<uint32_t>& palette,
3674
0
                            size_t pcolors, BitWriter* output) {
3675
0
  if (!PrepareDCGlobalCommon(is_single_group, width, height, code, output)) {
3676
0
    return false;
3677
0
  }
3678
0
  output->Write(2, 0b01);     // 1 transform
3679
0
  output->Write(2, 0b01);     // Palette
3680
0
  output->Write(5, 0b00000);  // Starting from ch 0
3681
0
  if (nb_chans == 1) {
3682
0
    output->Write(2, 0b00);  // 1-channel palette (Gray)
3683
0
  } else if (nb_chans == 3) {
3684
0
    output->Write(2, 0b01);  // 3-channel palette (RGB)
3685
0
  } else if (nb_chans == 4) {
3686
0
    output->Write(2, 0b10);  // 4-channel palette (RGBA)
3687
0
  } else {
3688
0
    output->Write(2, 0b11);
3689
0
    output->Write(13, nb_chans - 1);
3690
0
  }
3691
  // pcolors <= kMaxColors + kChunkSize - 1
3692
0
  static_assert(kMaxColors + kChunkSize < 1281,
3693
0
                "add code to signal larger palette sizes");
3694
0
  if (pcolors < 256) {
3695
0
    output->Write(2, 0b00);
3696
0
    output->Write(8, pcolors);
3697
0
  } else {
3698
0
    output->Write(2, 0b01);
3699
0
    output->Write(10, pcolors - 256);
3700
0
  }
3701
3702
0
  output->Write(2, 0b00);  // nb_deltas == 0
3703
0
  output->Write(4, 0);     // Zero predictor for delta palette
3704
  // Encode palette
3705
0
  ChunkEncoder<UpTo8Bits> encoder;
3706
0
  ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits> row_encoder;
3707
0
  row_encoder.t = &encoder;
3708
0
  encoder.output = output;
3709
0
  encoder.code = &code[0];
3710
0
  encoder.PrepareForSimd();
3711
0
  std::vector<std::array<int16_t, 32 + 1024>> p(4);
3712
0
  size_t i = 0;
3713
0
  size_t have_zero = 1;
3714
0
  for (; i < pcolors; i++) {
3715
0
    p[0][16 + i + have_zero] = palette[i] & 0xFF;
3716
0
    p[1][16 + i + have_zero] = (palette[i] >> 8) & 0xFF;
3717
0
    p[2][16 + i + have_zero] = (palette[i] >> 16) & 0xFF;
3718
0
    p[3][16 + i + have_zero] = (palette[i] >> 24) & 0xFF;
3719
0
  }
3720
0
  p[0][15] = 0;
3721
0
  row_encoder.ProcessRow(p[0].data() + 16, p[0].data() + 15, p[0].data() + 15,
3722
0
                         p[0].data() + 15, pcolors);
3723
0
  p[1][15] = p[0][16];
3724
0
  p[0][15] = p[0][16];
3725
0
  if (nb_chans > 1) {
3726
0
    row_encoder.ProcessRow(p[1].data() + 16, p[1].data() + 15, p[0].data() + 16,
3727
0
                           p[0].data() + 15, pcolors);
3728
0
  }
3729
0
  p[2][15] = p[1][16];
3730
0
  p[1][15] = p[1][16];
3731
0
  if (nb_chans > 2) {
3732
0
    row_encoder.ProcessRow(p[2].data() + 16, p[2].data() + 15, p[1].data() + 16,
3733
0
                           p[1].data() + 15, pcolors);
3734
0
  }
3735
0
  p[3][15] = p[2][16];
3736
0
  p[2][15] = p[2][16];
3737
0
  if (nb_chans > 3) {
3738
0
    row_encoder.ProcessRow(p[3].data() + 16, p[3].data() + 15, p[2].data() + 16,
3739
0
                           p[2].data() + 15, pcolors);
3740
0
  }
3741
0
  row_encoder.Finalize();
3742
3743
0
  if (!is_single_group) {
3744
0
    output->ZeroPadToByte();
3745
0
  }
3746
0
  return true;
3747
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::PrepareDCGlobalPalette(bool, unsigned long, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> > const&, unsigned long, (anonymous namespace)::BitWriter*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::PrepareDCGlobalPalette(bool, unsigned long, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> > const&, unsigned long, (anonymous namespace)::BitWriter*)
3748
3749
template <size_t nb_chans>
3750
bool detect_palette(const unsigned char* r, size_t width,
3751
0
                    std::vector<uint32_t>& palette) {
3752
0
  size_t x = 0;
3753
0
  bool collided = false;
3754
  // this is just an unrolling of the next loop
3755
0
  size_t look_ahead = 7 + ((nb_chans == 1) ? 3 : ((nb_chans < 4) ? 1 : 0));
3756
0
  for (; x + look_ahead < width; x += 8) {
3757
0
    uint32_t p[8] = {}, index[8];
3758
0
    for (int i = 0; i < 8; i++) {
3759
0
      for (int j = 0; j < 4; ++j) {
3760
0
        p[i] |= r[(x + i) * nb_chans + j] << (8 * j);
3761
0
      }
3762
0
    }
3763
0
    for (int i = 0; i < 8; i++) p[i] &= ((1llu << (8 * nb_chans)) - 1);
3764
0
    for (int i = 0; i < 8; i++) index[i] = pixel_hash(p[i]);
3765
0
    for (int i = 0; i < 8; i++) {
3766
0
      collided |= (palette[index[i]] != 0 && p[i] != palette[index[i]]);
3767
0
      palette[index[i]] = p[i];
3768
0
    }
3769
0
  }
3770
0
  for (; x < width; x++) {
3771
0
    uint32_t p = 0;
3772
0
    for (size_t i = 0; i < nb_chans; ++i) {
3773
0
      p |= r[x * nb_chans + i] << (8 * i);
3774
0
    }
3775
0
    uint32_t index = pixel_hash(p);
3776
0
    collided |= (palette[index] != 0 && p != palette[index]);
3777
0
    palette[index] = p;
3778
0
  }
3779
0
  return collided;
3780
0
}
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX2::(anonymous namespace)::detect_palette<1ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX2::(anonymous namespace)::detect_palette<2ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX2::(anonymous namespace)::detect_palette<3ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX2::(anonymous namespace)::detect_palette<4ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool default_implementation::(anonymous namespace)::detect_palette<1ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool default_implementation::(anonymous namespace)::detect_palette<2ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool default_implementation::(anonymous namespace)::detect_palette<3ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool default_implementation::(anonymous namespace)::detect_palette<4ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
3781
3782
template <typename BitDepth>
3783
JxlFastLosslessFrameState* LLPrepare(JxlChunkedFrameInputSource input,
3784
                                     size_t width, size_t height,
3785
                                     BitDepth bitdepth, size_t nb_chans,
3786
0
                                     bool big_endian, int effort, int oneshot) {
3787
0
  assert(width != 0);
3788
0
  assert(height != 0);
3789
3790
  // Count colors to try palette
3791
0
  std::vector<uint32_t> palette(kHashSize);
3792
0
  std::vector<int16_t> lookup(kHashSize);
3793
0
  lookup[0] = 0;
3794
0
  int pcolors = 0;
3795
0
  bool collided = effort < 2 || bitdepth.bitdepth != 8 || !oneshot;
3796
0
  for (size_t y0 = 0; y0 < height && !collided; y0 += 256) {
3797
0
    size_t ys = std::min<size_t>(height - y0, 256);
3798
0
    for (size_t x0 = 0; x0 < width && !collided; x0 += 256) {
3799
0
      size_t xs = std::min<size_t>(width - x0, 256);
3800
0
      size_t stride;
3801
      // TODO(szabadka): Add RAII wrapper around this.
3802
0
      const void* buffer = input.get_color_channel_data_at(input.opaque, x0, y0,
3803
0
                                                           xs, ys, &stride);
3804
0
      if (buffer == nullptr) return nullptr;
3805
0
      auto rgba = reinterpret_cast<const unsigned char*>(buffer);
3806
0
      for (size_t y = 0; y < ys && !collided; y++) {
3807
0
        const unsigned char* r = rgba + stride * y;
3808
0
        if (nb_chans == 1) collided = detect_palette<1>(r, xs, palette);
3809
0
        if (nb_chans == 2) collided = detect_palette<2>(r, xs, palette);
3810
0
        if (nb_chans == 3) collided = detect_palette<3>(r, xs, palette);
3811
0
        if (nb_chans == 4) collided = detect_palette<4>(r, xs, palette);
3812
0
      }
3813
0
      input.release_buffer(input.opaque, buffer);
3814
0
    }
3815
0
  }
3816
0
  int nb_entries = 0;
3817
0
  if (!collided) {
3818
0
    pcolors = 1;  // always have all-zero as a palette color
3819
0
    bool have_color = false;
3820
0
    uint8_t minG = 255, maxG = 0;
3821
0
    for (uint32_t k = 0; k < kHashSize; k++) {
3822
0
      if (palette[k] == 0) continue;
3823
0
      uint8_t p[4];
3824
0
      for (int i = 0; i < 4; ++i) {
3825
0
        p[i] = (palette[k] >> (8 * i)) & 0xFF;
3826
0
      }
3827
      // move entries to front so sort has less work
3828
0
      palette[nb_entries] = palette[k];
3829
0
      if (p[0] != p[1] || p[0] != p[2]) have_color = true;
3830
0
      if (p[1] < minG) minG = p[1];
3831
0
      if (p[1] > maxG) maxG = p[1];
3832
0
      nb_entries++;
3833
      // don't do palette if too many colors are needed
3834
0
      if (nb_entries + pcolors > kMaxColors) {
3835
0
        collided = true;
3836
0
        break;
3837
0
      }
3838
0
    }
3839
0
    if (!have_color) {
3840
      // don't do palette if it's just grayscale without many holes
3841
0
      if (maxG - minG < nb_entries * 1.4f) collided = true;
3842
0
    }
3843
0
  }
3844
0
  if (!collided) {
3845
0
    std::sort(
3846
0
        palette.begin(), palette.begin() + nb_entries,
3847
0
        [&nb_chans](uint32_t ap, uint32_t bp) {
3848
0
          if (ap == 0) return false;
3849
0
          if (bp == 0) return true;
3850
0
          uint8_t a[4], b[4];
3851
0
          for (int i = 0; i < 4; ++i) {
3852
0
            a[i] = (ap >> (8 * i)) & 0xFF;
3853
0
            b[i] = (bp >> (8 * i)) & 0xFF;
3854
0
          }
3855
0
          float ay, by;
3856
0
          if (nb_chans == 4) {
3857
0
            ay = (0.299f * a[0] + 0.587f * a[1] + 0.114f * a[2] + 0.01f) * a[3];
3858
0
            by = (0.299f * b[0] + 0.587f * b[1] + 0.114f * b[2] + 0.01f) * b[3];
3859
0
          } else {
3860
0
            ay = (0.299f * a[0] + 0.587f * a[1] + 0.114f * a[2] + 0.01f);
3861
0
            by = (0.299f * b[0] + 0.587f * b[1] + 0.114f * b[2] + 0.01f);
3862
0
          }
3863
0
          return ay < by;  // sort on alpha*luma
3864
0
        });
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
3865
0
    for (int k = 0; k < nb_entries; k++) {
3866
0
      if (palette[k] == 0) break;
3867
0
      lookup[pixel_hash(palette[k])] = pcolors++;
3868
0
    }
3869
0
  }
3870
3871
0
  size_t num_groups_x = (width + 255) / 256;
3872
0
  size_t num_groups_y = (height + 255) / 256;
3873
0
  size_t num_dc_groups_x = (width + 2047) / 2048;
3874
0
  size_t num_dc_groups_y = (height + 2047) / 2048;
3875
3876
0
  uint64_t raw_counts[4][kNumRawSymbols] = {};
3877
0
  uint64_t lz77_counts[4][kNumLZ77] = {};
3878
3879
0
  bool onegroup = num_groups_x == 1 && num_groups_y == 1;
3880
3881
0
  auto sample_rows = [&](size_t xg, size_t yg, size_t num_rows) {
3882
0
    size_t y0 = yg * 256;
3883
0
    size_t x0 = xg * 256;
3884
0
    size_t ys = std::min<size_t>(height - y0, 256);
3885
0
    size_t xs = std::min<size_t>(width - x0, 256);
3886
0
    size_t stride;
3887
0
    const void* buffer =
3888
0
        input.get_color_channel_data_at(input.opaque, x0, y0, xs, ys, &stride);
3889
0
    if (buffer == nullptr) {
3890
0
      return false;
3891
0
    }
3892
0
    auto rgba = reinterpret_cast<const unsigned char*>(buffer);
3893
0
    int y_begin_group =
3894
0
        std::max<ptrdiff_t>(
3895
0
            0, static_cast<ptrdiff_t>(ys) - static_cast<ptrdiff_t>(num_rows)) /
3896
0
        2;
3897
0
    int y_count =
3898
0
        std::max<int>(0, std::min<int>(num_rows, ys - y_begin_group - 1));
3899
0
    int x_max = xs / kChunkSize * kChunkSize;
3900
0
    CollectSamples(rgba, 0, y_begin_group, x_max, stride, y_count, raw_counts,
3901
0
                   lz77_counts, onegroup, !collided, bitdepth, nb_chans,
3902
0
                   big_endian, lookup.data());
3903
0
    input.release_buffer(input.opaque, buffer);
3904
0
    return true;
3905
0
  };
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
3906
3907
  // TODO(veluca): that `64` is an arbitrary constant, meant to correspond to
3908
  // the point where the number of processed rows is large enough that loading
3909
  // the entire image is cost-effective.
3910
0
  if (oneshot || effort >= 64) {
3911
0
    for (size_t g = 0; g < num_groups_y * num_groups_x; g++) {
3912
0
      size_t xg = g % num_groups_x;
3913
0
      size_t yg = g / num_groups_x;
3914
0
      size_t y0 = yg * 256;
3915
0
      size_t ys = std::min<size_t>(height - y0, 256);
3916
0
      size_t num_rows = 2 * effort * ys / 256;
3917
0
      if (!sample_rows(xg, yg, num_rows)) {
3918
0
        return nullptr;
3919
0
      }
3920
0
    }
3921
0
  } else {
3922
    // sample the middle (effort * 2 * num_groups) rows of the center group
3923
    // (possibly all of them).
3924
0
    if (!sample_rows((num_groups_x - 1) / 2, (num_groups_y - 1) / 2,
3925
0
                     2 * effort * num_groups_x * num_groups_y)) {
3926
0
      return nullptr;
3927
0
    }
3928
0
  }
3929
3930
  // TODO(veluca): can probably improve this and make it bitdepth-dependent.
3931
0
  uint64_t base_raw_counts[kNumRawSymbols] = {
3932
0
      3843, 852, 1270, 1214, 1014, 727, 481, 300, 159, 51,
3933
0
      5,    1,   1,    1,    1,    1,   1,   1,   1};
3934
3935
0
  bool doing_ycocg = nb_chans > 2 && collided;
3936
0
  bool large_palette = !collided || pcolors >= 256;
3937
0
  for (size_t i = bitdepth.NumSymbols(doing_ycocg || large_palette);
3938
0
       i < kNumRawSymbols; i++) {
3939
0
    base_raw_counts[i] = 0;
3940
0
  }
3941
3942
0
  for (size_t c = 0; c < 4; c++) {
3943
0
    for (size_t i = 0; i < kNumRawSymbols; i++) {
3944
0
      raw_counts[c][i] = (raw_counts[c][i] << 8) + base_raw_counts[i];
3945
0
    }
3946
0
  }
3947
3948
0
  if (!collided) {
3949
0
    unsigned token, nbits, bits;
3950
0
    EncodeHybridUint000(PackSigned(pcolors - 1), &token, &nbits, &bits);
3951
    // ensure all palette indices can actually be encoded
3952
0
    for (size_t i = 0; i < token + 1; i++)
3953
0
      raw_counts[0][i] = std::max<uint64_t>(raw_counts[0][i], 1);
3954
    // these tokens are only used for the palette itself so they can get a bad
3955
    // code
3956
0
    for (size_t i = token + 1; i < 10; i++) raw_counts[0][i] = 1;
3957
0
  }
3958
3959
0
  uint64_t base_lz77_counts[kNumLZ77] = {
3960
0
      29, 27, 25,  23, 21, 21, 19, 18, 21, 17, 16, 15, 15, 14,
3961
0
      13, 13, 137, 98, 61, 34, 1,  1,  1,  1,  1,  1,  1,  1,
3962
0
  };
3963
3964
0
  for (size_t c = 0; c < 4; c++) {
3965
0
    for (size_t i = 0; i < kNumLZ77; i++) {
3966
0
      lz77_counts[c][i] = (lz77_counts[c][i] << 8) + base_lz77_counts[i];
3967
0
    }
3968
0
  }
3969
3970
0
  JxlFastLosslessFrameState* frame_state = new JxlFastLosslessFrameState();
3971
0
  if (!frame_state) return nullptr;
3972
0
  for (size_t i = 0; i < 4; i++) {
3973
0
    frame_state->hcode[i] = PrefixCode(bitdepth, raw_counts[i], lz77_counts[i]);
3974
0
  }
3975
3976
0
  size_t num_dc_groups = num_dc_groups_x * num_dc_groups_y;
3977
0
  size_t num_ac_groups = num_groups_x * num_groups_y;
3978
0
  size_t num_groups = onegroup ? 1 : (2 + num_dc_groups + num_ac_groups);
3979
0
  frame_state->input = input;
3980
0
  frame_state->width = width;
3981
0
  frame_state->height = height;
3982
0
  frame_state->num_groups_x = num_groups_x;
3983
0
  frame_state->num_groups_y = num_groups_y;
3984
0
  frame_state->num_dc_groups_x = num_dc_groups_x;
3985
0
  frame_state->num_dc_groups_y = num_dc_groups_y;
3986
0
  frame_state->nb_chans = nb_chans;
3987
0
  frame_state->bitdepth = bitdepth.bitdepth;
3988
0
  frame_state->big_endian = big_endian;
3989
0
  frame_state->effort = effort;
3990
0
  frame_state->collided = collided;
3991
0
  frame_state->lookup = lookup;
3992
3993
0
  frame_state->group_data = std::vector<std::array<BitWriter, 4>>(num_groups);
3994
0
  frame_state->group_sizes.resize(num_groups);
3995
0
  if (collided) {
3996
0
    if (!PrepareDCGlobal(onegroup, width, height, nb_chans, frame_state->hcode,
3997
0
                         &frame_state->group_data[0][0])) {
3998
0
      delete frame_state;
3999
0
      return nullptr;
4000
0
    }
4001
0
  } else {
4002
0
    if (!PrepareDCGlobalPalette(onegroup, width, height, nb_chans,
4003
0
                                frame_state->hcode, palette, pcolors,
4004
0
                                &frame_state->group_data[0][0])) {
4005
0
      delete frame_state;
4006
0
      return nullptr;
4007
0
    }
4008
0
  }
4009
0
  frame_state->group_sizes[0] = SectionSize(frame_state->group_data[0]);
4010
0
  if (!onegroup) {
4011
0
    ComputeAcGroupDataOffset(frame_state->group_sizes[0], num_dc_groups,
4012
0
                             num_ac_groups, frame_state->min_dc_global_size,
4013
0
                             frame_state->ac_group_data_offset);
4014
0
  }
4015
4016
0
  return frame_state;
4017
0
}
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)
4018
4019
template <typename BitDepth>
4020
jxl::Status LLProcess(JxlFastLosslessFrameState* frame_state, bool is_last,
4021
                      BitDepth bitdepth, void* runner_opaque,
4022
                      FJxlParallelRunner runner,
4023
0
                      JxlEncoderOutputProcessorWrapper* output_processor) {
4024
0
#if !FJXL_STANDALONE
4025
0
  if (frame_state->process_done) {
4026
0
    if (!JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/0,
4027
0
                                      is_last)) {
4028
0
      return JXL_FAILURE("Allocation failed");
4029
0
    };
4030
0
    if (output_processor) {
4031
0
      JXL_RETURN_IF_ERROR(
4032
0
          JxlFastLosslessOutputFrame(frame_state, output_processor));
4033
0
    }
4034
0
    return true;
4035
0
  }
4036
0
#endif
4037
  // The maximum number of groups that we process concurrently here.
4038
  // TODO(szabadka) Use the number of threads or some outside parameter for the
4039
  // maximum memory usage instead.
4040
0
  constexpr size_t kMaxLocalGroups = 16;
4041
0
  bool onegroup = frame_state->group_sizes.size() == 1;
4042
0
  bool streaming = !onegroup && output_processor;
4043
0
  size_t total_groups = frame_state->num_groups_x * frame_state->num_groups_y;
4044
0
  size_t max_groups = streaming ? kMaxLocalGroups : total_groups;
4045
0
#if !FJXL_STANDALONE
4046
0
  size_t start_pos = 0;
4047
0
  if (streaming) {
4048
0
    start_pos = output_processor->CurrentPosition();
4049
0
    JXL_RETURN_IF_ERROR(
4050
0
        output_processor->Seek(start_pos + frame_state->ac_group_data_offset));
4051
0
  }
4052
0
#endif
4053
0
  for (size_t offset = 0; offset < total_groups; offset += max_groups) {
4054
0
    size_t num_groups = std::min(max_groups, total_groups - offset);
4055
0
    JxlFastLosslessFrameState local_frame_state;
4056
0
    if (streaming) {
4057
0
      local_frame_state.group_data =
4058
0
          std::vector<std::array<BitWriter, 4>>(num_groups);
4059
0
    }
4060
0
    std::atomic<uint32_t> has_error{0};
4061
0
    auto run_one = [&](size_t i) {
4062
0
      size_t g = offset + i;
4063
0
      size_t xg = g % frame_state->num_groups_x;
4064
0
      size_t yg = g / frame_state->num_groups_x;
4065
0
      size_t num_dc_groups =
4066
0
          frame_state->num_dc_groups_x * frame_state->num_dc_groups_y;
4067
0
      size_t group_id = onegroup ? 0 : (2 + num_dc_groups + g);
4068
0
      size_t xs = std::min<size_t>(frame_state->width - xg * 256, 256);
4069
0
      size_t ys = std::min<size_t>(frame_state->height - yg * 256, 256);
4070
0
      size_t x0 = xg * 256;
4071
0
      size_t y0 = yg * 256;
4072
0
      size_t stride;
4073
0
      JxlChunkedFrameInputSource input = frame_state->input;
4074
0
      const void* buffer = input.get_color_channel_data_at(input.opaque, x0, y0,
4075
0
                                                           xs, ys, &stride);
4076
0
      if (buffer == nullptr) {
4077
0
        has_error = 1;
4078
0
        return;
4079
0
      }
4080
0
      const unsigned char* rgba =
4081
0
          reinterpret_cast<const unsigned char*>(buffer);
4082
4083
0
      auto& gd = streaming ? local_frame_state.group_data[i]
4084
0
                           : frame_state->group_data[group_id];
4085
0
      bool ok;
4086
0
      if (frame_state->collided) {
4087
0
        ok = WriteACSection(rgba, 0, 0, xs, ys, stride, onegroup, bitdepth,
4088
0
                            frame_state->nb_chans, frame_state->big_endian,
4089
0
                            frame_state->hcode, gd);
4090
0
      } else {
4091
0
        ok = WriteACSectionPalette(
4092
0
            rgba, 0, 0, xs, ys, stride, onegroup, frame_state->hcode,
4093
0
            frame_state->lookup.data(), frame_state->nb_chans, gd[0]);
4094
0
      }
4095
0
      if (ok) {
4096
0
        frame_state->group_sizes[group_id] = SectionSize(gd);
4097
0
      } else {
4098
0
        has_error = 1;
4099
0
      }
4100
0
      input.release_buffer(input.opaque, buffer);
4101
0
    };
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
4102
0
    runner(
4103
0
        runner_opaque, &run_one,
4104
0
        +[](void* r, size_t i) {
4105
0
          (*reinterpret_cast<decltype(&run_one)>(r))(i);
4106
0
        },
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
4107
0
        num_groups);
4108
0
    if (has_error) return JXL_FAILURE("Allocation failed");
4109
0
#if !FJXL_STANDALONE
4110
0
    if (streaming) {
4111
0
      local_frame_state.nb_chans = frame_state->nb_chans;
4112
0
      local_frame_state.current_bit_writer = 1;
4113
0
      JXL_RETURN_IF_ERROR(
4114
0
          JxlFastLosslessOutputFrame(&local_frame_state, output_processor));
4115
0
    }
4116
0
#endif
4117
0
  }
4118
0
#if !FJXL_STANDALONE
4119
0
  if (streaming) {
4120
0
    size_t end_pos = output_processor->CurrentPosition();
4121
0
    JXL_RETURN_IF_ERROR(output_processor->Seek(start_pos));
4122
0
    frame_state->group_data.resize(1);
4123
0
    bool have_alpha = frame_state->nb_chans == 2 || frame_state->nb_chans == 4;
4124
0
    size_t padding = ComputeDcGlobalPadding(
4125
0
        frame_state->group_sizes, frame_state->ac_group_data_offset,
4126
0
        frame_state->min_dc_global_size, have_alpha, is_last);
4127
4128
0
    for (size_t i = 0; i < padding; ++i) {
4129
0
      frame_state->group_data[0][0].Write(8, 0);
4130
0
    }
4131
0
    frame_state->group_sizes[0] += padding;
4132
0
    if (!JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/0,
4133
0
                                      is_last)) {
4134
0
      return JXL_FAILURE("Allocation failed");
4135
0
    }
4136
0
    assert(frame_state->ac_group_data_offset ==
4137
0
           JxlFastLosslessOutputSize(frame_state));
4138
0
    JXL_RETURN_IF_ERROR(
4139
0
        JxlFastLosslessOutputHeaders(frame_state, output_processor));
4140
0
    JXL_RETURN_IF_ERROR(output_processor->Seek(end_pos));
4141
0
  } else if (output_processor) {
4142
0
    assert(onegroup);
4143
0
    if (!JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/0,
4144
0
                                      is_last)) {
4145
0
      return JXL_FAILURE("Allocation failed");
4146
0
    }
4147
0
    if (output_processor) {
4148
0
      JXL_RETURN_IF_ERROR(
4149
0
          JxlFastLosslessOutputFrame(frame_state, output_processor));
4150
0
    }
4151
0
  }
4152
0
  frame_state->process_done = true;
4153
0
#endif
4154
0
  return true;
4155
0
}
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
4156
4157
// Picks the bit-depth policy type matching `bitdepth` and forwards all
// arguments to LLPrepare, returning whatever frame state it produces.
//
// Preconditions (checked with assert only): bitdepth > 0 and
// 1 <= nb_chans <= 4.
JxlFastLosslessFrameState* JxlFastLosslessPrepareImpl(
    JxlChunkedFrameInputSource input, size_t width, size_t height,
    size_t nb_chans, size_t bitdepth, bool big_endian, int effort,
    int oneshot) {
  assert(bitdepth > 0);
  assert(nb_chans <= 4);
  assert(nb_chans != 0);

  // Each range of bit depths has its own policy type; the first matching
  // range wins.
  if (bitdepth <= 8) {
    return LLPrepare(input, width, height, UpTo8Bits(bitdepth), nb_chans,
                     big_endian, effort, oneshot);
  } else if (bitdepth <= 13) {
    return LLPrepare(input, width, height, From9To13Bits(bitdepth), nb_chans,
                     big_endian, effort, oneshot);
  } else if (bitdepth == 14) {
    return LLPrepare(input, width, height, Exactly14Bits(bitdepth), nb_chans,
                     big_endian, effort, oneshot);
  } else {
    return LLPrepare(input, width, height, MoreThan14Bits(bitdepth), nb_chans,
                     big_endian, effort, oneshot);
  }
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::JxlFastLosslessPrepareImpl(JxlChunkedFrameInputSource, unsigned long, unsigned long, unsigned long, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::JxlFastLosslessPrepareImpl(JxlChunkedFrameInputSource, unsigned long, unsigned long, unsigned long, unsigned long, bool, int, int)
4179
4180
// Runs LLProcess on `frame_state` with the bit-depth policy type that matches
// frame_state->bitdepth. Any error reported by LLProcess is propagated;
// otherwise the function reports success.
jxl::Status JxlFastLosslessProcessFrameImpl(
    JxlFastLosslessFrameState* frame_state, bool is_last, void* runner_opaque,
    FJxlParallelRunner runner,
    JxlEncoderOutputProcessorWrapper* output_processor) {
  const size_t depth = frame_state->bitdepth;

  if (depth <= 8) {
    JXL_RETURN_IF_ERROR(LLProcess(frame_state, is_last, UpTo8Bits(depth),
                                  runner_opaque, runner, output_processor));
    return true;
  }
  if (depth <= 13) {
    JXL_RETURN_IF_ERROR(LLProcess(frame_state, is_last, From9To13Bits(depth),
                                  runner_opaque, runner, output_processor));
    return true;
  }
  if (depth == 14) {
    JXL_RETURN_IF_ERROR(LLProcess(frame_state, is_last, Exactly14Bits(depth),
                                  runner_opaque, runner, output_processor));
    return true;
  }
  // Everything above 14 bits shares one policy.
  JXL_RETURN_IF_ERROR(LLProcess(frame_state, is_last, MoreThan14Bits(depth),
                                runner_opaque, runner, output_processor));
  return true;
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::JxlFastLosslessProcessFrameImpl(JxlFastLosslessFrameState*, bool, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::JxlFastLosslessProcessFrameImpl(JxlFastLosslessFrameState*, bool, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
4201
4202
}  // namespace
4203
4204
#endif  // FJXL_SELF_INCLUDE
4205
4206
#ifndef FJXL_SELF_INCLUDE
4207
4208
#define FJXL_SELF_INCLUDE
4209
4210
// If we have NEON enabled, it is the default target.
4211
#if FJXL_ENABLE_NEON
4212
4213
namespace default_implementation {
4214
#define FJXL_NEON
4215
#include "lib/jxl/enc_fast_lossless.cc"
4216
#undef FJXL_NEON
4217
}  // namespace default_implementation
4218
4219
#else                                    // FJXL_ENABLE_NEON
4220
4221
namespace default_implementation {
4222
#include "lib/jxl/enc_fast_lossless.cc"  // NOLINT
4223
}
4224
4225
#if FJXL_ENABLE_AVX2
4226
#ifdef __clang__
4227
#pragma clang attribute push(__attribute__((target("avx,avx2"))), \
4228
                             apply_to = function)
4229
// Causes spurious warnings on clang5.
4230
#pragma clang diagnostic push
4231
#pragma clang diagnostic ignored "-Wmissing-braces"
4232
#elif defined(__GNUC__)
4233
#pragma GCC push_options
4234
// Seems to cause spurious errors on GCC8.
4235
#pragma GCC diagnostic ignored "-Wpsabi"
4236
#pragma GCC target "avx,avx2"
4237
#endif
4238
4239
namespace AVX2 {
4240
#define FJXL_AVX2
4241
#include "lib/jxl/enc_fast_lossless.cc"  // NOLINT
4242
#undef FJXL_AVX2
4243
}  // namespace AVX2
4244
4245
#ifdef __clang__
4246
#pragma clang attribute pop
4247
#pragma clang diagnostic pop
4248
#elif defined(__GNUC__)
4249
#pragma GCC pop_options
4250
#endif
4251
#endif  // FJXL_ENABLE_AVX2
4252
4253
#if FJXL_ENABLE_AVX512
4254
#ifdef __clang__
4255
#pragma clang attribute push(                                                 \
4256
    __attribute__((target("avx512cd,avx512bw,avx512vl,avx512f,avx512vbmi"))), \
4257
    apply_to = function)
4258
#elif defined(__GNUC__)
4259
#pragma GCC push_options
4260
#pragma GCC target "avx512cd,avx512bw,avx512vl,avx512f,avx512vbmi"
4261
#endif
4262
4263
namespace AVX512 {
4264
#define FJXL_AVX512
4265
#include "lib/jxl/enc_fast_lossless.cc"
4266
#undef FJXL_AVX512
4267
}  // namespace AVX512
4268
4269
#ifdef __clang__
4270
#pragma clang attribute pop
4271
#elif defined(__GNUC__)
4272
#pragma GCC pop_options
4273
#endif
4274
#endif  // FJXL_ENABLE_AVX512
4275
4276
#endif
4277
4278
extern "C" {
4279
4280
#if FJXL_STANDALONE
4281
class FJxlFrameInput {
4282
 public:
4283
  FJxlFrameInput(const unsigned char* rgba, size_t row_stride, size_t nb_chans,
4284
                 size_t bitdepth)
4285
      : rgba_(rgba),
4286
        row_stride_(row_stride),
4287
        bytes_per_pixel_(bitdepth <= 8 ? nb_chans : 2 * nb_chans) {}
4288
4289
  JxlChunkedFrameInputSource GetInputSource() {
4290
    return JxlChunkedFrameInputSource{this, GetDataAt,
4291
                                      [](void*, const void*) {}};
4292
  }
4293
4294
 private:
4295
  static const void* GetDataAt(void* opaque, size_t xpos, size_t ypos,
4296
                               size_t xsize, size_t ysize, size_t* row_offset) {
4297
    FJxlFrameInput* self = static_cast<FJxlFrameInput*>(opaque);
4298
    *row_offset = self->row_stride_;
4299
    return self->rgba_ + ypos * (*row_offset) + xpos * self->bytes_per_pixel_;
4300
  }
4301
4302
  const uint8_t* rgba_;
4303
  size_t row_stride_;
4304
  size_t bytes_per_pixel_;
4305
};
4306
4307
size_t JxlFastLosslessEncode(const unsigned char* rgba, size_t width,
4308
                             size_t row_stride, size_t height, size_t nb_chans,
4309
                             size_t bitdepth, bool big_endian, int effort,
4310
                             unsigned char** output, void* runner_opaque,
4311
                             FJxlParallelRunner runner) {
4312
  FJxlFrameInput input(rgba, row_stride, nb_chans, bitdepth);
4313
  auto* frame_state = JxlFastLosslessPrepareFrame(
4314
      input.GetInputSource(), width, height, nb_chans, bitdepth, big_endian,
4315
      effort, /*oneshot=*/true);
4316
  if (!frame_state) return 0;
4317
  if (!JxlFastLosslessProcessFrame(frame_state, /*is_last=*/true, runner_opaque,
4318
                                   runner, nullptr)) {
4319
    JxlFastLosslessFreeFrameState(frame_state);
4320
    return 0;
4321
  }
4322
  if (!JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/1,
4323
                                    /*is_last=*/1)) {
4324
    JxlFastLosslessFreeFrameState(frame_state);
4325
    return 0;
4326
  }
4327
  size_t output_size = JxlFastLosslessMaxRequiredOutput(frame_state);
4328
  *output = (unsigned char*)malloc(output_size);
4329
  if (*output == NULL) {
4330
    return JXL_FAILURE("Memory allocation failed");
4331
  }
4332
  size_t written = 0;
4333
  size_t total = 0;
4334
  while ((written = JxlFastLosslessWriteOutput(frame_state, *output + total,
4335
                                               output_size - total)) != 0) {
4336
    total += written;
4337
  }
4338
  JxlFastLosslessFreeFrameState(frame_state);
4339
  return total;
4340
}
4341
#endif
4342
4343
// Public entry point for frame preparation. Forwards to the
// JxlFastLosslessPrepareImpl of the first instruction-set variant that is both
// compiled in and reported as available by HasCpuFeature: AVX512, then AVX2,
// then the default implementation.
JxlFastLosslessFrameState* JxlFastLosslessPrepareFrame(
    JxlChunkedFrameInputSource input, size_t width, size_t height,
    size_t nb_chans, size_t bitdepth, bool big_endian, int effort,
    int oneshot) {
#if FJXL_ENABLE_AVX512
  // The AVX512 variant is used only when every one of these features is
  // present.
  const bool avx512_usable = HasCpuFeature(CpuFeature::kAVX512CD) &&
                             HasCpuFeature(CpuFeature::kVBMI) &&
                             HasCpuFeature(CpuFeature::kAVX512BW) &&
                             HasCpuFeature(CpuFeature::kAVX512F) &&
                             HasCpuFeature(CpuFeature::kAVX512VL);
  if (avx512_usable) {
    return AVX512::JxlFastLosslessPrepareImpl(
        input, width, height, nb_chans, bitdepth, big_endian, effort, oneshot);
  }
#endif
#if FJXL_ENABLE_AVX2
  const bool avx2_usable = HasCpuFeature(CpuFeature::kAVX2);
  if (avx2_usable) {
    return AVX2::JxlFastLosslessPrepareImpl(
        input, width, height, nb_chans, bitdepth, big_endian, effort, oneshot);
  }
#endif

  // No specialised variant applies.
  return default_implementation::JxlFastLosslessPrepareImpl(
      input, width, height, nb_chans, bitdepth, big_endian, effort, oneshot);
}
4367
4368
bool JxlFastLosslessProcessFrame(
4369
    JxlFastLosslessFrameState* frame_state, bool is_last, void* runner_opaque,
4370
    FJxlParallelRunner runner,
4371
0
    JxlEncoderOutputProcessorWrapper* output_processor) {
4372
0
  auto trivial_runner =
4373
0
      +[](void*, void* opaque, void fun(void*, size_t), size_t count) {
4374
0
        for (size_t i = 0; i < count; i++) {
4375
0
          fun(opaque, i);
4376
0
        }
4377
0
      };
4378
4379
0
  if (runner == nullptr) {
4380
0
    runner = trivial_runner;
4381
0
  }
4382
4383
#if FJXL_ENABLE_AVX512
4384
  if (HasCpuFeature(CpuFeature::kAVX512CD) &&
4385
      HasCpuFeature(CpuFeature::kVBMI) &&
4386
      HasCpuFeature(CpuFeature::kAVX512BW) &&
4387
      HasCpuFeature(CpuFeature::kAVX512F) &&
4388
      HasCpuFeature(CpuFeature::kAVX512VL)) {
4389
    JXL_RETURN_IF_ERROR(AVX512::JxlFastLosslessProcessFrameImpl(
4390
        frame_state, is_last, runner_opaque, runner, output_processor));
4391
    return true;
4392
  }
4393
#endif
4394
0
#if FJXL_ENABLE_AVX2
4395
0
  if (HasCpuFeature(CpuFeature::kAVX2)) {
4396
0
    JXL_RETURN_IF_ERROR(AVX2::JxlFastLosslessProcessFrameImpl(
4397
0
        frame_state, is_last, runner_opaque, runner, output_processor));
4398
0
    return true;
4399
0
  }
4400
0
#endif
4401
4402
0
  JXL_RETURN_IF_ERROR(default_implementation::JxlFastLosslessProcessFrameImpl(
4403
0
      frame_state, is_last, runner_opaque, runner, output_processor));
4404
0
  return true;
4405
0
}
4406
4407
}  // extern "C"
4408
4409
#if !FJXL_STANDALONE
4410
bool JxlFastLosslessOutputFrame(
4411
    JxlFastLosslessFrameState* frame_state,
4412
0
    JxlEncoderOutputProcessorWrapper* output_processor) {
4413
0
  size_t fl_size = JxlFastLosslessOutputSize(frame_state);
4414
0
  size_t written = 0;
4415
0
  while (written < fl_size) {
4416
0
    JXL_ASSIGN_OR_RETURN(auto buffer,
4417
0
                         output_processor->GetBuffer(32, fl_size - written));
4418
0
    size_t n =
4419
0
        JxlFastLosslessWriteOutput(frame_state, buffer.data(), buffer.size());
4420
0
    if (n == 0) break;
4421
0
    JXL_RETURN_IF_ERROR(buffer.advance(n));
4422
0
    written += n;
4423
0
  };
4424
0
  return true;
4425
0
}
4426
#endif
4427
4428
#endif  // FJXL_SELF_INCLUDE