Coverage Report

Created: 2025-06-16 07:00

/src/libjxl/lib/jxl/enc_fast_lossless.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/base/status.h"
7
#ifndef FJXL_SELF_INCLUDE
8
9
#include <assert.h>
10
11
#include <algorithm>
12
#include <array>
13
#include <cstdint>
14
#include <cstdlib>
15
#include <cstring>
16
#include <limits>
17
#include <memory>
18
#include <vector>
19
20
#include "lib/jxl/enc_fast_lossless.h"
21
22
#if FJXL_STANDALONE
23
#if defined(_MSC_VER)
24
using ssize_t = intptr_t;
25
#endif
26
#else  // FJXL_STANDALONE
27
#include "lib/jxl/encode_internal.h"
28
#endif  // FJXL_STANDALONE
29
30
#if defined(__x86_64__) || defined(_M_X64)
31
#define FJXL_ARCH_IS_X86_64 1
32
#else
33
#define FJXL_ARCH_IS_X86_64 0
34
#endif
35
36
#if defined(__i386__) || defined(_M_IX86) || FJXL_ARCH_IS_X86_64
37
#define FJXL_ARCH_IS_X86 1
38
#else
39
#define FJXL_ARCH_IS_X86 0
40
#endif
41
42
#if FJXL_ARCH_IS_X86
43
#if defined(_MSC_VER)
44
#include <intrin.h>
45
#else  // _MSC_VER
46
#include <cpuid.h>
47
#endif  // _MSC_VER
48
#endif  // FJXL_ARCH_IS_X86
49
50
// Enable NEON and AVX2/AVX512 if not asked to do otherwise and the compilers
51
// support it.
52
#if defined(__aarch64__) || defined(_M_ARM64)  // ARCH
53
#include <arm_neon.h>
54
55
#if !defined(FJXL_ENABLE_NEON)
56
#define FJXL_ENABLE_NEON 1
57
#endif  // !defined(FJXL_ENABLE_NEON)
58
59
#elif FJXL_ARCH_IS_X86_64 && !defined(_MSC_VER)  // ARCH
60
#include <immintrin.h>
61
62
// manually add _mm512_cvtsi512_si32 definition if missing
63
// (e.g. with Xcode on macOS Mojave)
64
// copied from gcc 11.1.0 include/avx512fintrin.h line 14367-14373
65
#if defined(__clang__) &&                                           \
66
    ((!defined(__apple_build_version__) && __clang_major__ < 10) || \
67
     (defined(__apple_build_version__) && __apple_build_version__ < 12000032))
68
inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
69
_mm512_cvtsi512_si32(__m512i __A) {
70
  __v16si __B = (__v16si)__A;
71
  return __B[0];
72
}
73
#endif
74
75
#if !defined(FJXL_ENABLE_AVX2)
76
#define FJXL_ENABLE_AVX2 1
77
#endif  // !defined(FJXL_ENABLE_AVX2)
78
79
#if !defined(FJXL_ENABLE_AVX512)
80
// On clang-7 or earlier, and gcc-10 or earlier, AVX512 seems broken.
81
#if (defined(__clang__) &&                                             \
82
         (!defined(__apple_build_version__) && __clang_major__ > 7) || \
83
     (defined(__apple_build_version__) &&                              \
84
      __apple_build_version__ > 10010046)) ||                          \
85
    (defined(__GNUC__) && __GNUC__ > 10)
86
#define FJXL_ENABLE_AVX512 1
87
#endif
88
#endif  // !defined(FJXL_ENABLE_AVX512)
89
90
#endif  // ARCH
91
92
#ifndef FJXL_ENABLE_NEON
93
#define FJXL_ENABLE_NEON 0
94
#endif
95
96
#ifndef FJXL_ENABLE_AVX2
97
#define FJXL_ENABLE_AVX2 0
98
#endif
99
100
#ifndef FJXL_ENABLE_AVX512
101
#define FJXL_ENABLE_AVX512 0
102
#endif
103
104
namespace {
105
106
enum class CpuFeature : uint32_t {
107
  kAVX2 = 0,
108
109
  kAVX512F,
110
  kAVX512VL,
111
  kAVX512CD,
112
  kAVX512BW,
113
114
  kVBMI,
115
  kVBMI2
116
};
117
118
0
constexpr uint32_t CpuFeatureBit(CpuFeature feature) {
119
0
  return 1u << static_cast<uint32_t>(feature);
120
0
}
121
122
#if FJXL_ARCH_IS_X86
123
#if defined(_MSC_VER)
124
void Cpuid(const uint32_t level, const uint32_t count,
125
           std::array<uint32_t, 4>& abcd) {
126
  int regs[4];
127
  __cpuidex(regs, level, count);
128
  for (int i = 0; i < 4; ++i) {
129
    abcd[i] = regs[i];
130
  }
131
}
132
uint32_t ReadXCR0() { return static_cast<uint32_t>(_xgetbv(0)); }
133
#else   // _MSC_VER
134
void Cpuid(const uint32_t level, const uint32_t count,
135
0
           std::array<uint32_t, 4>& abcd) {
136
0
  uint32_t a;
137
0
  uint32_t b;
138
0
  uint32_t c;
139
0
  uint32_t d;
140
0
  __cpuid_count(level, count, a, b, c, d);
141
0
  abcd[0] = a;
142
0
  abcd[1] = b;
143
0
  abcd[2] = c;
144
0
  abcd[3] = d;
145
0
}
146
0
uint32_t ReadXCR0() {
147
0
  uint32_t xcr0;
148
0
  uint32_t xcr0_high;
149
0
  const uint32_t index = 0;
150
0
  asm volatile(".byte 0x0F, 0x01, 0xD0"
151
0
               : "=a"(xcr0), "=d"(xcr0_high)
152
0
               : "c"(index));
153
0
  return xcr0;
154
0
}
155
#endif  // _MSC_VER
156
157
0
uint32_t DetectCpuFeatures() {
158
0
  uint32_t flags = 0;  // return value
159
0
  std::array<uint32_t, 4> abcd;
160
0
  Cpuid(0, 0, abcd);
161
0
  const uint32_t max_level = abcd[0];
162
163
0
  const auto check_bit = [](uint32_t v, uint32_t idx) -> bool {
164
0
    return (v & (1U << idx)) != 0;
165
0
  };
166
167
  // Extended features
168
0
  if (max_level >= 7) {
169
0
    Cpuid(7, 0, abcd);
170
0
    flags |= check_bit(abcd[1], 5) ? CpuFeatureBit(CpuFeature::kAVX2) : 0;
171
172
0
    flags |= check_bit(abcd[1], 16) ? CpuFeatureBit(CpuFeature::kAVX512F) : 0;
173
0
    flags |= check_bit(abcd[1], 28) ? CpuFeatureBit(CpuFeature::kAVX512CD) : 0;
174
0
    flags |= check_bit(abcd[1], 30) ? CpuFeatureBit(CpuFeature::kAVX512BW) : 0;
175
0
    flags |= check_bit(abcd[1], 31) ? CpuFeatureBit(CpuFeature::kAVX512VL) : 0;
176
177
0
    flags |= check_bit(abcd[2], 1) ? CpuFeatureBit(CpuFeature::kVBMI) : 0;
178
0
    flags |= check_bit(abcd[2], 6) ? CpuFeatureBit(CpuFeature::kVBMI2) : 0;
179
0
  }
180
181
0
  Cpuid(1, 0, abcd);
182
0
  const bool os_has_xsave = check_bit(abcd[2], 27);
183
0
  if (os_has_xsave) {
184
0
    const uint32_t xcr0 = ReadXCR0();
185
0
    if (!check_bit(xcr0, 1) || !check_bit(xcr0, 2) || !check_bit(xcr0, 5) ||
186
0
        !check_bit(xcr0, 6) || !check_bit(xcr0, 7)) {
187
0
      flags = 0;  // TODO(eustas): be more selective?
188
0
    }
189
0
  }
190
191
0
  return flags;
192
0
}
193
#else   // FJXL_ARCH_IS_X86
194
uint32_t DetectCpuFeatures() { return 0; }
195
#endif  // FJXL_ARCH_IS_X86
196
197
#if defined(_MSC_VER)
198
#define FJXL_UNUSED
199
#else
200
#define FJXL_UNUSED __attribute__((unused))
201
#endif
202
203
0
FJXL_UNUSED bool HasCpuFeature(CpuFeature feature) {
204
0
  static uint32_t cpu_features = DetectCpuFeatures();
205
0
  return (cpu_features & CpuFeatureBit(feature)) != 0;
206
0
}
207
208
#if defined(_MSC_VER) && !defined(__clang__)
209
#define FJXL_INLINE __forceinline
210
FJXL_INLINE uint32_t FloorLog2(uint32_t v) {
211
  unsigned long index;
212
  _BitScanReverse(&index, v);
213
  return index;
214
}
215
FJXL_INLINE uint32_t CtzNonZero(uint64_t v) {
216
  unsigned long index;
217
  _BitScanForward(&index, v);
218
  return index;
219
}
220
#else
221
#define FJXL_INLINE inline __attribute__((always_inline))
222
0
FJXL_INLINE uint32_t FloorLog2(uint32_t v) {
223
0
  return v ? 31 - __builtin_clz(v) : 0;
224
0
}
225
0
FJXL_UNUSED FJXL_INLINE uint32_t CtzNonZero(uint64_t v) {
226
0
  return __builtin_ctzll(v);
227
0
}
228
#endif
229
230
// Compiles to a memcpy on little-endian systems.
231
0
FJXL_INLINE void StoreLE64(uint8_t* tgt, uint64_t data) {
232
#if (!defined(__BYTE_ORDER__) || (__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__))
233
  for (int i = 0; i < 8; i++) {
234
    tgt[i] = (data >> (i * 8)) & 0xFF;
235
  }
236
#else
237
0
  memcpy(tgt, &data, 8);
238
0
#endif
239
0
}
240
241
FJXL_INLINE size_t AddBits(uint32_t count, uint64_t bits, uint8_t* data_buf,
242
0
                           size_t& bits_in_buffer, uint64_t& bit_buffer) {
243
0
  bit_buffer |= bits << bits_in_buffer;
244
0
  bits_in_buffer += count;
245
0
  StoreLE64(data_buf, bit_buffer);
246
0
  size_t bytes_in_buffer = bits_in_buffer / 8;
247
0
  bits_in_buffer -= bytes_in_buffer * 8;
248
0
  bit_buffer >>= bytes_in_buffer * 8;
249
0
  return bytes_in_buffer;
250
0
}
251
252
struct BitWriter {
253
0
  void Allocate(size_t maximum_bit_size) {
254
0
    assert(data == nullptr);
255
    // Leave some padding.
256
0
    data.reset(static_cast<uint8_t*>(malloc(maximum_bit_size / 8 + 64)));
257
0
  }
258
259
0
  void Write(uint32_t count, uint64_t bits) {
260
0
    bytes_written += AddBits(count, bits, data.get() + bytes_written,
261
0
                             bits_in_buffer, buffer);
262
0
  }
263
264
0
  void ZeroPadToByte() {
265
0
    if (bits_in_buffer != 0) {
266
0
      Write(8 - bits_in_buffer, 0);
267
0
    }
268
0
  }
269
270
  FJXL_INLINE void WriteMultiple(const uint64_t* nbits, const uint64_t* bits,
271
0
                                 size_t n) {
272
    // Necessary because Write() is only guaranteed to work with <=56 bits.
273
    // Trying to SIMD-fy this code results in lower speed (and definitely less
274
    // clarity).
275
0
    {
276
0
      for (size_t i = 0; i < n; i++) {
277
0
        this->buffer |= bits[i] << this->bits_in_buffer;
278
0
        memcpy(this->data.get() + this->bytes_written, &this->buffer, 8);
279
0
        uint64_t shift = 64 - this->bits_in_buffer;
280
0
        this->bits_in_buffer += nbits[i];
281
        // This `if` seems to be faster than using ternaries.
282
0
        if (this->bits_in_buffer >= 64) {
283
0
          uint64_t next_buffer = shift >= 64 ? 0 : bits[i] >> shift;
284
0
          this->buffer = next_buffer;
285
0
          this->bits_in_buffer -= 64;
286
0
          this->bytes_written += 8;
287
0
        }
288
0
      }
289
0
      memcpy(this->data.get() + this->bytes_written, &this->buffer, 8);
290
0
      size_t bytes_in_buffer = this->bits_in_buffer / 8;
291
0
      this->bits_in_buffer -= bytes_in_buffer * 8;
292
0
      this->buffer >>= bytes_in_buffer * 8;
293
0
      this->bytes_written += bytes_in_buffer;
294
0
    }
295
0
  }
296
297
  std::unique_ptr<uint8_t[], void (*)(void*)> data = {nullptr, free};
298
  size_t bytes_written = 0;
299
  size_t bits_in_buffer = 0;
300
  uint64_t buffer = 0;
301
};
302
303
0
size_t SectionSize(const std::array<BitWriter, 4>& group_data) {
304
0
  size_t sz = 0;
305
0
  for (size_t j = 0; j < 4; j++) {
306
0
    const auto& writer = group_data[j];
307
0
    sz += writer.bytes_written * 8 + writer.bits_in_buffer;
308
0
  }
309
0
  sz = (sz + 7) / 8;
310
0
  return sz;
311
0
}
312
313
constexpr size_t kMaxFrameHeaderSize = 5;
314
315
constexpr size_t kGroupSizeOffset[4] = {
316
    static_cast<size_t>(0),
317
    static_cast<size_t>(1024),
318
    static_cast<size_t>(17408),
319
    static_cast<size_t>(4211712),
320
};
321
constexpr size_t kTOCBits[4] = {12, 16, 24, 32};
322
323
0
size_t TOCBucket(size_t group_size) {
324
0
  size_t bucket = 0;
325
0
  while (bucket < 3 && group_size >= kGroupSizeOffset[bucket + 1]) ++bucket;
326
0
  return bucket;
327
0
}
328
329
#if !FJXL_STANDALONE
330
0
size_t TOCSize(const std::vector<size_t>& group_sizes) {
331
0
  size_t toc_bits = 0;
332
0
  for (size_t group_size : group_sizes) {
333
0
    toc_bits += kTOCBits[TOCBucket(group_size)];
334
0
  }
335
0
  return (toc_bits + 7) / 8;
336
0
}
337
338
0
size_t FrameHeaderSize(bool have_alpha, bool is_last) {
339
0
  size_t nbits = 28 + (have_alpha ? 4 : 0) + (is_last ? 0 : 2);
340
0
  return (nbits + 7) / 8;
341
0
}
342
#endif
343
344
void ComputeAcGroupDataOffset(size_t dc_global_size, size_t num_dc_groups,
345
                              size_t num_ac_groups, size_t& min_dc_global_size,
346
0
                              size_t& ac_group_offset) {
347
  // Max AC group size is 768 kB, so max AC group TOC bits is 24.
348
0
  size_t ac_toc_max_bits = num_ac_groups * 24;
349
0
  size_t ac_toc_min_bits = num_ac_groups * 12;
350
0
  size_t max_padding = 1 + (ac_toc_max_bits - ac_toc_min_bits + 7) / 8;
351
0
  min_dc_global_size = dc_global_size;
352
0
  size_t dc_global_bucket = TOCBucket(min_dc_global_size);
353
0
  while (TOCBucket(min_dc_global_size + max_padding) > dc_global_bucket) {
354
0
    dc_global_bucket = TOCBucket(min_dc_global_size + max_padding);
355
0
    min_dc_global_size = kGroupSizeOffset[dc_global_bucket];
356
0
  }
357
0
  assert(TOCBucket(min_dc_global_size) == dc_global_bucket);
358
0
  assert(TOCBucket(min_dc_global_size + max_padding) == dc_global_bucket);
359
0
  size_t max_toc_bits =
360
0
      kTOCBits[dc_global_bucket] + 12 * (1 + num_dc_groups) + ac_toc_max_bits;
361
0
  size_t max_toc_size = (max_toc_bits + 7) / 8;
362
0
  ac_group_offset = kMaxFrameHeaderSize + max_toc_size + min_dc_global_size;
363
0
}
364
365
#if !FJXL_STANDALONE
366
size_t ComputeDcGlobalPadding(const std::vector<size_t>& group_sizes,
367
                              size_t ac_group_data_offset,
368
                              size_t min_dc_global_size, bool have_alpha,
369
0
                              bool is_last) {
370
0
  std::vector<size_t> new_group_sizes = group_sizes;
371
0
  new_group_sizes[0] = min_dc_global_size;
372
0
  size_t toc_size = TOCSize(new_group_sizes);
373
0
  size_t actual_offset =
374
0
      FrameHeaderSize(have_alpha, is_last) + toc_size + group_sizes[0];
375
0
  return ac_group_data_offset - actual_offset;
376
0
}
377
#endif
378
379
constexpr size_t kNumRawSymbols = 19;
380
constexpr size_t kNumLZ77 = 33;
381
constexpr size_t kLZ77CacheSize = 32;
382
383
constexpr size_t kLZ77Offset = 224;
384
constexpr size_t kLZ77MinLength = 7;
385
386
void EncodeHybridUintLZ77(uint32_t value, uint32_t* token, uint32_t* nbits,
387
0
                          uint32_t* bits) {
388
  // 400 config
389
0
  uint32_t n = FloorLog2(value);
390
0
  *token = value < 16 ? value : 16 + n - 4;
391
0
  *nbits = value < 16 ? 0 : n;
392
0
  *bits = value < 16 ? 0 : value - (1 << *nbits);
393
0
}
394
395
struct PrefixCode {
396
  uint8_t raw_nbits[kNumRawSymbols] = {};
397
  uint8_t raw_bits[kNumRawSymbols] = {};
398
399
  uint8_t lz77_nbits[kNumLZ77] = {};
400
  uint16_t lz77_bits[kNumLZ77] = {};
401
402
  uint64_t lz77_cache_bits[kLZ77CacheSize] = {};
403
  uint8_t lz77_cache_nbits[kLZ77CacheSize] = {};
404
405
  size_t numraw;
406
407
0
  static uint16_t BitReverse(size_t nbits, uint16_t bits) {
408
0
    constexpr uint16_t kNibbleLookup[16] = {
409
0
        0b0000, 0b1000, 0b0100, 0b1100, 0b0010, 0b1010, 0b0110, 0b1110,
410
0
        0b0001, 0b1001, 0b0101, 0b1101, 0b0011, 0b1011, 0b0111, 0b1111,
411
0
    };
412
0
    uint16_t rev16 = (kNibbleLookup[bits & 0xF] << 12) |
413
0
                     (kNibbleLookup[(bits >> 4) & 0xF] << 8) |
414
0
                     (kNibbleLookup[(bits >> 8) & 0xF] << 4) |
415
0
                     (kNibbleLookup[bits >> 12]);
416
0
    return rev16 >> (16 - nbits);
417
0
  }
418
419
  // Create the prefix codes given the code lengths.
420
  // Supports the code lengths being split into two halves.
421
  static void ComputeCanonicalCode(const uint8_t* first_chunk_nbits,
422
                                   uint8_t* first_chunk_bits,
423
                                   size_t first_chunk_size,
424
                                   const uint8_t* second_chunk_nbits,
425
                                   uint16_t* second_chunk_bits,
426
0
                                   size_t second_chunk_size) {
427
0
    constexpr size_t kMaxCodeLength = 15;
428
0
    uint8_t code_length_counts[kMaxCodeLength + 1] = {};
429
0
    for (size_t i = 0; i < first_chunk_size; i++) {
430
0
      code_length_counts[first_chunk_nbits[i]]++;
431
0
      assert(first_chunk_nbits[i] <= kMaxCodeLength);
432
0
      assert(first_chunk_nbits[i] <= 8);
433
0
      assert(first_chunk_nbits[i] > 0);
434
0
    }
435
0
    for (size_t i = 0; i < second_chunk_size; i++) {
436
0
      code_length_counts[second_chunk_nbits[i]]++;
437
0
      assert(second_chunk_nbits[i] <= kMaxCodeLength);
438
0
    }
439
440
0
    uint16_t next_code[kMaxCodeLength + 1] = {};
441
442
0
    uint16_t code = 0;
443
0
    for (size_t i = 1; i < kMaxCodeLength + 1; i++) {
444
0
      code = (code + code_length_counts[i - 1]) << 1;
445
0
      next_code[i] = code;
446
0
    }
447
448
0
    for (size_t i = 0; i < first_chunk_size; i++) {
449
0
      first_chunk_bits[i] =
450
0
          BitReverse(first_chunk_nbits[i], next_code[first_chunk_nbits[i]]++);
451
0
    }
452
0
    for (size_t i = 0; i < second_chunk_size; i++) {
453
0
      second_chunk_bits[i] =
454
0
          BitReverse(second_chunk_nbits[i], next_code[second_chunk_nbits[i]]++);
455
0
    }
456
0
  }
457
458
  template <typename T>
459
  static void ComputeCodeLengthsNonZeroImpl(const uint64_t* freqs, size_t n,
460
                                            size_t precision, T infty,
461
                                            const uint8_t* min_limit,
462
                                            const uint8_t* max_limit,
463
0
                                            uint8_t* nbits) {
464
0
    assert(precision < 15);
465
0
    assert(n <= kMaxNumSymbols);
466
0
    std::vector<T> dynp(((1U << precision) + 1) * (n + 1), infty);
467
0
    auto d = [&](size_t sym, size_t off) -> T& {
468
0
      return dynp[sym * ((1 << precision) + 1) + off];
469
0
    };
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::ComputeCodeLengthsNonZeroImpl<unsigned int>(unsigned long const*, unsigned long, unsigned long, unsigned int, unsigned char const*, unsigned char const*, unsigned char*)::{lambda(unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::ComputeCodeLengthsNonZeroImpl<unsigned long>(unsigned long const*, unsigned long, unsigned long, unsigned long, unsigned char const*, unsigned char const*, unsigned char*)::{lambda(unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long) const
470
0
    d(0, 0) = 0;
471
0
    for (size_t sym = 0; sym < n; sym++) {
472
0
      for (T bits = min_limit[sym]; bits <= max_limit[sym]; bits++) {
473
0
        size_t off_delta = 1U << (precision - bits);
474
0
        for (size_t off = 0; off + off_delta <= (1U << precision); off++) {
475
0
          d(sym + 1, off + off_delta) =
476
0
              std::min(d(sym, off) + static_cast<T>(freqs[sym]) * bits,
477
0
                       d(sym + 1, off + off_delta));
478
0
        }
479
0
      }
480
0
    }
481
482
0
    size_t sym = n;
483
0
    size_t off = 1U << precision;
484
485
0
    assert(d(sym, off) != infty);
486
487
0
    while (sym-- > 0) {
488
0
      assert(off > 0);
489
0
      for (size_t bits = min_limit[sym]; bits <= max_limit[sym]; bits++) {
490
0
        size_t off_delta = 1U << (precision - bits);
491
0
        if (off_delta <= off &&
492
0
            d(sym + 1, off) == d(sym, off - off_delta) + freqs[sym] * bits) {
493
0
          off -= off_delta;
494
0
          nbits[sym] = bits;
495
0
          break;
496
0
        }
497
0
      }
498
0
    }
499
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:void (anonymous namespace)::PrefixCode::ComputeCodeLengthsNonZeroImpl<unsigned int>(unsigned long const*, unsigned long, unsigned long, unsigned int, unsigned char const*, unsigned char const*, unsigned char*)
Unexecuted instantiation: enc_fast_lossless.cc:void (anonymous namespace)::PrefixCode::ComputeCodeLengthsNonZeroImpl<unsigned long>(unsigned long const*, unsigned long, unsigned long, unsigned long, unsigned char const*, unsigned char const*, unsigned char*)
500
501
  // Computes nbits[i] for i <= n, subject to min_limit[i] <= nbits[i] <=
502
  // max_limit[i] and sum 2**-nbits[i] == 1, so to minimize sum(nbits[i] *
503
  // freqs[i]).
504
  static void ComputeCodeLengthsNonZero(const uint64_t* freqs, size_t n,
505
                                        uint8_t* min_limit, uint8_t* max_limit,
506
0
                                        uint8_t* nbits) {
507
0
    size_t precision = 0;
508
0
    size_t shortest_length = 255;
509
0
    uint64_t freqsum = 0;
510
0
    for (size_t i = 0; i < n; i++) {
511
0
      assert(freqs[i] != 0);
512
0
      freqsum += freqs[i];
513
0
      if (min_limit[i] < 1) min_limit[i] = 1;
514
0
      assert(min_limit[i] <= max_limit[i]);
515
0
      precision = std::max<size_t>(max_limit[i], precision);
516
0
      shortest_length = std::min<size_t>(min_limit[i], shortest_length);
517
0
    }
518
    // If all the minimum limits are greater than 1, shift precision so that we
519
    // behave as if the shortest was 1.
520
0
    precision -= shortest_length - 1;
521
0
    uint64_t infty = freqsum * precision;
522
0
    if (infty < std::numeric_limits<uint32_t>::max() / 2) {
523
0
      ComputeCodeLengthsNonZeroImpl(freqs, n, precision,
524
0
                                    static_cast<uint32_t>(infty), min_limit,
525
0
                                    max_limit, nbits);
526
0
    } else {
527
0
      ComputeCodeLengthsNonZeroImpl(freqs, n, precision, infty, min_limit,
528
0
                                    max_limit, nbits);
529
0
    }
530
0
  }
531
532
  static constexpr size_t kMaxNumSymbols =
533
      kNumRawSymbols + 1 < kNumLZ77 ? kNumLZ77 : kNumRawSymbols + 1;
534
  static void ComputeCodeLengths(const uint64_t* freqs, size_t n,
535
                                 const uint8_t* min_limit_in,
536
0
                                 const uint8_t* max_limit_in, uint8_t* nbits) {
537
0
    assert(n <= kMaxNumSymbols);
538
0
    uint64_t compact_freqs[kMaxNumSymbols];
539
0
    uint8_t min_limit[kMaxNumSymbols];
540
0
    uint8_t max_limit[kMaxNumSymbols];
541
0
    size_t ni = 0;
542
0
    for (size_t i = 0; i < n; i++) {
543
0
      if (freqs[i]) {
544
0
        compact_freqs[ni] = freqs[i];
545
0
        min_limit[ni] = min_limit_in[i];
546
0
        max_limit[ni] = max_limit_in[i];
547
0
        ni++;
548
0
      }
549
0
    }
550
0
    for (size_t i = ni; i < kMaxNumSymbols; ++i) {
551
0
      compact_freqs[i] = 0;
552
0
      min_limit[i] = 0;
553
0
      max_limit[i] = 0;
554
0
    }
555
0
    uint8_t num_bits[kMaxNumSymbols] = {};
556
0
    ComputeCodeLengthsNonZero(compact_freqs, ni, min_limit, max_limit,
557
0
                              num_bits);
558
0
    ni = 0;
559
0
    for (size_t i = 0; i < n; i++) {
560
0
      nbits[i] = 0;
561
0
      if (freqs[i]) {
562
0
        nbits[i] = num_bits[ni++];
563
0
      }
564
0
    }
565
0
  }
566
567
  // Invalid code, used to construct arrays.
568
0
  PrefixCode() = default;
569
570
  template <typename BitDepth>
571
  PrefixCode(BitDepth /* bitdepth */, uint64_t* raw_counts,
572
0
             uint64_t* lz77_counts) {
573
    // "merge" together all the lz77 counts in a single symbol for the level 1
574
    // table (containing just the raw symbols, up to length 7).
575
0
    uint64_t level1_counts[kNumRawSymbols + 1];
576
0
    memcpy(level1_counts, raw_counts, kNumRawSymbols * sizeof(uint64_t));
577
0
    numraw = kNumRawSymbols;
578
0
    while (numraw > 0 && level1_counts[numraw - 1] == 0) numraw--;
579
580
0
    level1_counts[numraw] = 0;
581
0
    for (size_t i = 0; i < kNumLZ77; i++) {
582
0
      level1_counts[numraw] += lz77_counts[i];
583
0
    }
584
0
    uint8_t level1_nbits[kNumRawSymbols + 1] = {};
585
0
    ComputeCodeLengths(level1_counts, numraw + 1, BitDepth::kMinRawLength,
586
0
                       BitDepth::kMaxRawLength, level1_nbits);
587
588
0
    uint8_t level2_nbits[kNumLZ77] = {};
589
0
    uint8_t min_lengths[kNumLZ77] = {};
590
0
    uint8_t l = 15 - level1_nbits[numraw];
591
0
    uint8_t max_lengths[kNumLZ77];
592
0
    for (uint8_t& max_length : max_lengths) {
593
0
      max_length = l;
594
0
    }
595
0
    size_t num_lz77 = kNumLZ77;
596
0
    while (num_lz77 > 0 && lz77_counts[num_lz77 - 1] == 0) num_lz77--;
597
0
    ComputeCodeLengths(lz77_counts, num_lz77, min_lengths, max_lengths,
598
0
                       level2_nbits);
599
0
    for (size_t i = 0; i < numraw; i++) {
600
0
      raw_nbits[i] = level1_nbits[i];
601
0
    }
602
0
    for (size_t i = 0; i < num_lz77; i++) {
603
0
      lz77_nbits[i] =
604
0
          level2_nbits[i] ? level1_nbits[numraw] + level2_nbits[i] : 0;
605
0
    }
606
607
0
    ComputeCanonicalCode(raw_nbits, raw_bits, numraw, lz77_nbits, lz77_bits,
608
0
                         kNumLZ77);
609
610
    // Prepare lz77 cache
611
0
    for (size_t count = 0; count < kLZ77CacheSize; count++) {
612
0
      unsigned token, nbits, bits;
613
0
      EncodeHybridUintLZ77(count, &token, &nbits, &bits);
614
0
      lz77_cache_nbits[count] = lz77_nbits[token] + nbits + raw_nbits[0];
615
0
      lz77_cache_bits[count] =
616
0
          (((bits << lz77_nbits[token]) | lz77_bits[token]) << raw_nbits[0]) |
617
0
          raw_bits[0];
618
0
    }
619
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<AVX2::(anonymous namespace)::UpTo8Bits>(AVX2::(anonymous namespace)::UpTo8Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<AVX2::(anonymous namespace)::From9To13Bits>(AVX2::(anonymous namespace)::From9To13Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<AVX2::(anonymous namespace)::Exactly14Bits>(AVX2::(anonymous namespace)::Exactly14Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<AVX2::(anonymous namespace)::MoreThan14Bits>(AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<default_implementation::(anonymous namespace)::UpTo8Bits>(default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<default_implementation::(anonymous namespace)::From9To13Bits>(default_implementation::(anonymous namespace)::From9To13Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<default_implementation::(anonymous namespace)::Exactly14Bits>(default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<default_implementation::(anonymous namespace)::MoreThan14Bits>(default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long*, unsigned long*)
620
621
  // Max bits written: 2 + 72 + 95 + 24 + 165 = 286
622
0
  void WriteTo(BitWriter* writer) const {
623
0
    uint64_t code_length_counts[18] = {};
624
0
    code_length_counts[17] = 3 + 2 * (kNumLZ77 - 1);
625
0
    for (uint8_t raw_nbit : raw_nbits) {
626
0
      code_length_counts[raw_nbit]++;
627
0
    }
628
0
    for (uint8_t lz77_nbit : lz77_nbits) {
629
0
      code_length_counts[lz77_nbit]++;
630
0
    }
631
0
    uint8_t code_length_nbits[18] = {};
632
0
    uint8_t code_length_nbits_min[18] = {};
633
0
    uint8_t code_length_nbits_max[18] = {
634
0
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
635
0
    };
636
0
    ComputeCodeLengths(code_length_counts, 18, code_length_nbits_min,
637
0
                       code_length_nbits_max, code_length_nbits);
638
0
    writer->Write(2, 0b00);  // HSKIP = 0, i.e. don't skip code lengths.
639
640
    // As per Brotli RFC.
641
0
    uint8_t code_length_order[18] = {1, 2, 3, 4,  0,  5,  17, 6,  16,
642
0
                                     7, 8, 9, 10, 11, 12, 13, 14, 15};
643
0
    uint8_t code_length_length_nbits[] = {2, 4, 3, 2, 2, 4};
644
0
    uint8_t code_length_length_bits[] = {0, 7, 3, 2, 1, 15};
645
646
    // Encode lengths of code lengths.
647
0
    size_t num_code_lengths = 18;
648
0
    while (code_length_nbits[code_length_order[num_code_lengths - 1]] == 0) {
649
0
      num_code_lengths--;
650
0
    }
651
    // Max bits written in this loop: 18 * 4 = 72
652
0
    for (size_t i = 0; i < num_code_lengths; i++) {
653
0
      int symbol = code_length_nbits[code_length_order[i]];
654
0
      writer->Write(code_length_length_nbits[symbol],
655
0
                    code_length_length_bits[symbol]);
656
0
    }
657
658
    // Compute the canonical codes for the codes that represent the lengths of
659
    // the actual codes for data.
660
0
    uint16_t code_length_bits[18] = {};
661
0
    ComputeCanonicalCode(nullptr, nullptr, 0, code_length_nbits,
662
0
                         code_length_bits, 18);
663
    // Encode raw bit code lengths.
664
    // Max bits written in this loop: 19 * 5 = 95
665
0
    for (uint8_t raw_nbit : raw_nbits) {
666
0
      writer->Write(code_length_nbits[raw_nbit], code_length_bits[raw_nbit]);
667
0
    }
668
0
    size_t num_lz77 = kNumLZ77;
669
0
    while (lz77_nbits[num_lz77 - 1] == 0) {
670
0
      num_lz77--;
671
0
    }
672
    // Encode 0s until 224 (start of LZ77 symbols). This is in total 224-19 =
673
    // 205.
674
0
    static_assert(kLZ77Offset == 224, "kLZ77Offset should be 224");
675
0
    static_assert(kNumRawSymbols == 19, "kNumRawSymbols should be 19");
676
0
    {
677
      // Max bits in this block: 24
678
0
      writer->Write(code_length_nbits[17], code_length_bits[17]);
679
0
      writer->Write(3, 0b010);  // 5
680
0
      writer->Write(code_length_nbits[17], code_length_bits[17]);
681
0
      writer->Write(3, 0b000);  // (5-2)*8 + 3 = 27
682
0
      writer->Write(code_length_nbits[17], code_length_bits[17]);
683
0
      writer->Write(3, 0b010);  // (27-2)*8 + 5 = 205
684
0
    }
685
    // Encode LZ77 symbols, with values 224+i.
686
    // Max bits written in this loop: 33 * 5 = 165
687
0
    for (size_t i = 0; i < num_lz77; i++) {
688
0
      writer->Write(code_length_nbits[lz77_nbits[i]],
689
0
                    code_length_bits[lz77_nbits[i]]);
690
0
    }
691
0
  }
692
};
693
694
}  // namespace
695
696
extern "C" {
697
698
struct JxlFastLosslessFrameState {
699
  JxlChunkedFrameInputSource input;
700
  size_t width;
701
  size_t height;
702
  size_t num_groups_x;
703
  size_t num_groups_y;
704
  size_t num_dc_groups_x;
705
  size_t num_dc_groups_y;
706
  size_t nb_chans;
707
  size_t bitdepth;
708
  int big_endian;
709
  int effort;
710
  bool collided;
711
  PrefixCode hcode[4];
712
  std::vector<int16_t> lookup;
713
  BitWriter header;
714
  std::vector<std::array<BitWriter, 4>> group_data;
715
  std::vector<size_t> group_sizes;
716
  size_t ac_group_data_offset = 0;
717
  size_t min_dc_global_size = 0;
718
  size_t current_bit_writer = 0;
719
  size_t bit_writer_byte_pos = 0;
720
  size_t bits_in_buffer = 0;
721
  uint64_t bit_buffer = 0;
722
  bool process_done = false;
723
};
724
725
0
size_t JxlFastLosslessOutputSize(const JxlFastLosslessFrameState* frame) {
726
0
  size_t total_size_groups = 0;
727
0
  for (const auto& section : frame->group_data) {
728
0
    total_size_groups += SectionSize(section);
729
0
  }
730
0
  return frame->header.bytes_written + total_size_groups;
731
0
}
732
733
size_t JxlFastLosslessMaxRequiredOutput(
734
0
    const JxlFastLosslessFrameState* frame) {
735
0
  return JxlFastLosslessOutputSize(frame) + 32;
736
0
}
737
738
// Writes the (optional) hand-crafted JXL image header, the frame header and
// the TOC into frame->header. Every Write(n, bits) below mirrors one field
// of the JPEG XL bitstream spec; the bit patterns are load-bearing, so the
// emitted values must not be changed.
// `add_image_header` is only honored in standalone builds; `is_last` marks
// this frame as the final one in the codestream.
void JxlFastLosslessPrepareHeader(JxlFastLosslessFrameState* frame,
                                  int add_image_header, int is_last) {
  BitWriter* output = &frame->header;
  output->Allocate(1000 + frame->group_sizes.size() * 32);

  bool have_alpha = (frame->nb_chans == 2 || frame->nb_chans == 4);

#if FJXL_STANDALONE
  if (add_image_header) {
    // Signature
    output->Write(16, 0x0AFF);

    // Size header, hand-crafted.
    // Not small
    output->Write(1, 0);

    // Writes a dimension using the smallest of the four JXL size encodings
    // (9/13/18/30 bits) that fits `size`.
    auto wsz = [output](size_t size) {
      if (size - 1 < (1 << 9)) {
        output->Write(2, 0b00);
        output->Write(9, size - 1);
      } else if (size - 1 < (1 << 13)) {
        output->Write(2, 0b01);
        output->Write(13, size - 1);
      } else if (size - 1 < (1 << 18)) {
        output->Write(2, 0b10);
        output->Write(18, size - 1);
      } else {
        output->Write(2, 0b11);
        output->Write(30, size - 1);
      }
    };

    wsz(frame->height);

    // No special ratio.
    output->Write(3, 0);

    wsz(frame->width);

    // Hand-crafted ImageMetadata.
    output->Write(1, 0);  // all_default
    output->Write(1, 0);  // extra_fields
    output->Write(1, 0);  // bit_depth.floating_point_sample
    if (frame->bitdepth == 8) {
      output->Write(2, 0b00);  // bit_depth.bits_per_sample = 8
    } else if (frame->bitdepth == 10) {
      output->Write(2, 0b01);  // bit_depth.bits_per_sample = 10
    } else if (frame->bitdepth == 12) {
      output->Write(2, 0b10);  // bit_depth.bits_per_sample = 12
    } else {
      output->Write(2, 0b11);  // 1 + u(6)
      output->Write(6, frame->bitdepth - 1);
    }
    if (frame->bitdepth <= 14) {
      output->Write(1, 1);  // 16-bit-buffer sufficient
    } else {
      output->Write(1, 0);  // 16-bit-buffer NOT sufficient
    }
    if (have_alpha) {
      output->Write(2, 0b01);  // One extra channel
      if (frame->bitdepth == 8) {
        output->Write(1, 1); // ... all_default (ie. 8-bit alpha)
      } else {
        // Alpha channel metadata matching the main bitdepth.
        output->Write(1, 0); // not d_alpha
        output->Write(2, 0); // type = kAlpha
        output->Write(1, 0); // not float
        if (frame->bitdepth == 10) {
          output->Write(2, 0b01); // bit_depth.bits_per_sample = 10
        } else if (frame->bitdepth == 12) {
          output->Write(2, 0b10); // bit_depth.bits_per_sample = 12
        } else {
          output->Write(2, 0b11); // 1 + u(6)
          output->Write(6, frame->bitdepth - 1);
        }
        output->Write(2, 0); // dim_shift = 0
        output->Write(2, 0); // name_len = 0
        output->Write(1, 0); // alpha_associated = 0
      }
    } else {
      output->Write(2, 0b00);  // No extra channel
    }
    output->Write(1, 0);  // Not XYB
    if (frame->nb_chans > 2) {
      output->Write(1, 1);  // color_encoding.all_default (sRGB)
    } else {
      // Grayscale: spell out an sRGB-transfer gray color encoding.
      output->Write(1, 0);     // color_encoding.all_default false
      output->Write(1, 0);     // color_encoding.want_icc false
      output->Write(2, 1);     // grayscale
      output->Write(2, 1);     // D65
      output->Write(1, 0);     // no gamma transfer function
      output->Write(2, 0b10);  // tf: 2 + u(4)
      output->Write(4, 11);    // tf of sRGB
      output->Write(2, 1);     // relative rendering intent
    }
    output->Write(2, 0b00);  // No extensions.

    output->Write(1, 1);  // all_default transform data

    // No ICC, no preview. Frame should start at byte boundary.
    output->ZeroPadToByte();
  }
#else
  // In library builds the image header is produced by the main encoder.
  assert(!add_image_header);
#endif
  // Handcrafted frame header.
  output->Write(1, 0);     // all_default
  output->Write(2, 0b00);  // regular frame
  output->Write(1, 1);     // modular
  output->Write(2, 0b00);  // default flags
  output->Write(1, 0);     // not YCbCr
  output->Write(2, 0b00);  // no upsampling
  if (have_alpha) {
    output->Write(2, 0b00);  // no alpha upsampling
  }
  output->Write(2, 0b01);  // default group size
  output->Write(2, 0b00);  // exactly one pass
  output->Write(1, 0);     // no custom size or origin
  output->Write(2, 0b00);  // kReplace blending mode
  if (have_alpha) {
    output->Write(2, 0b00);  // kReplace blending mode for alpha channel
  }
  output->Write(1, is_last);  // is_last
  if (!is_last) {
    output->Write(2, 0b00);  // can not be saved as reference
  }
  output->Write(2, 0b00);  // a frame has no name
  output->Write(1, 0);     // loop filter is not all_default
  output->Write(1, 0);     // no gaborish
  output->Write(2, 0);     // 0 EPF iters
  output->Write(2, 0b00);  // No LF extensions
  output->Write(2, 0b00);  // No FH extensions

  output->Write(1, 0);      // No TOC permutation
  output->ZeroPadToByte();  // TOC is byte-aligned.
  assert(add_image_header || output->bytes_written <= kMaxFrameHeaderSize);
  // TOC: one size entry per section, each using one of 4 bucketed encodings.
  for (size_t group_size : frame->group_sizes) {
    size_t bucket = TOCBucket(group_size);
    output->Write(2, bucket);
    output->Write(kTOCBits[bucket] - 2, group_size - kGroupSizeOffset[bucket]);
  }
  output->ZeroPadToByte();  // Groups are byte-aligned.
}
880
881
#if !FJXL_STANDALONE
882
bool JxlFastLosslessOutputAlignedSection(
883
0
    const BitWriter& bw, JxlEncoderOutputProcessorWrapper* output_processor) {
884
0
  assert(bw.bits_in_buffer == 0);
885
0
  const uint8_t* data = bw.data.get();
886
0
  size_t remaining_len = bw.bytes_written;
887
0
  while (remaining_len > 0) {
888
0
    JXL_ASSIGN_OR_RETURN(auto buffer,
889
0
                         output_processor->GetBuffer(1, remaining_len));
890
0
    size_t n = std::min(buffer.size(), remaining_len);
891
0
    if (n == 0) break;
892
0
    memcpy(buffer.data(), data, n);
893
0
    JXL_RETURN_IF_ERROR(buffer.advance(n));
894
0
    data += n;
895
0
    remaining_len -= n;
896
0
  };
897
0
  return true;
898
0
}
899
900
// Emits the frame header followed by the first section (group_data[0][0],
// the DC-global section) to the output processor. Order matters: the header
// and its TOC must precede any group data. Returns true on success.
bool JxlFastLosslessOutputHeaders(
    JxlFastLosslessFrameState* frame_state,
    JxlEncoderOutputProcessorWrapper* output_processor) {
  JXL_RETURN_IF_ERROR(JxlFastLosslessOutputAlignedSection(frame_state->header,
                                                          output_processor));
  JXL_RETURN_IF_ERROR(JxlFastLosslessOutputAlignedSection(
      frame_state->group_data[0][0], output_processor));
  return true;
}
909
#endif
910
911
#if FJXL_ENABLE_AVX512
912
// AVX-512/VBMI2 fast path for appending `n` bytes into an output stream that
// currently holds `bit_buffer_nbits` (1..7) pending bits in `bit_buffer`.
// Processes 64 input bytes per iteration with a cross-lane funnel shift and
// returns the number of input bytes consumed (a multiple of 64, possibly 0);
// the caller handles the remainder a byte at a time.
__attribute__((target("avx512vbmi2"))) static size_t AppendBytesWithBitOffset(
    const uint8_t* data, size_t n, size_t bit_buffer_nbits,
    unsigned char* output, uint64_t& bit_buffer) {
  // Not worth the setup cost for short runs.
  if (n < 128) {
    return 0;
  }

  size_t i = 0;
  __m512i shift = _mm512_set1_epi64(64 - bit_buffer_nbits);
  // Seed the carry so the pending bits land at the very start of the output.
  __m512i carry = _mm512_set1_epi64(bit_buffer << (64 - bit_buffer_nbits));

  for (; i + 64 <= n; i += 64) {
    __m512i current = _mm512_loadu_si512(data + i);
    // Bring the previous iteration's last u64 in as lane -1, then
    // funnel-shift each u64 with its predecessor to realize the bit offset.
    __m512i previous_u64 = _mm512_alignr_epi64(current, carry, 7);
    carry = current;
    __m512i out = _mm512_shrdv_epi64(previous_u64, current, shift);
    _mm512_storeu_si512(output + i, out);
  }

  // The top bits of the last consumed byte have not been written yet; they
  // become the new pending bits (still bit_buffer_nbits of them).
  bit_buffer = data[i - 1] >> (8 - bit_buffer_nbits);

  return i;
}
935
#endif
936
937
// Resumable serializer: copies as much of the frame (header, then each
// group/channel BitWriter in order) into `output` as fits, concatenating the
// sub-byte-aligned bitstreams on the fly. State (current writer, byte
// position, pending bits) lives in `frame`, so repeated calls continue where
// the previous one stopped. Returns the number of bytes written; the frame
// is fully drained once a call returns before filling the buffer.
// Requires output_size >= 32 (room for flush slack).
size_t JxlFastLosslessWriteOutput(JxlFastLosslessFrameState* frame,
                                  unsigned char* output, size_t output_size) {
  assert(output_size >= 32);
  unsigned char* initial_output = output;
  size_t (*append_bytes_with_bit_offset)(const uint8_t*, size_t, size_t,
                                         unsigned char*, uint64_t&) = nullptr;

#if FJXL_ENABLE_AVX512
  // Runtime-dispatch the VBMI2 bulk copy when available.
  if (HasCpuFeature(CpuFeature::kVBMI2)) {
    append_bytes_with_bit_offset = AppendBytesWithBitOffset;
  }
#endif

  while (true) {
    size_t& cur = frame->current_bit_writer;
    size_t& bw_pos = frame->bit_writer_byte_pos;
    // Writers are ordered: header (index 0), then nb_chans writers per group.
    if (cur >= 1 + frame->group_data.size() * frame->nb_chans) {
      return output - initial_output;  // everything written
    }
    // Keep >= 9 spare bytes so the flush lambda below can always AddBits.
    if (output_size <= 9) {
      return output - initial_output;  // buffer (almost) full; resume later
    }
    size_t nbc = frame->nb_chans;
    const BitWriter& writer =
        cur == 0 ? frame->header
                 : frame->group_data[(cur - 1) / nbc][(cur - 1) % nbc];
    size_t full_byte_count =
        std::min(output_size - 9, writer.bytes_written - bw_pos);
    if (frame->bits_in_buffer == 0) {
      // Byte-aligned: plain copy.
      memcpy(output, writer.data.get() + bw_pos, full_byte_count);
    } else {
      // Misaligned: every output byte mixes bits from two input bytes.
      size_t i = 0;
      if (append_bytes_with_bit_offset) {
        i += append_bytes_with_bit_offset(
            writer.data.get() + bw_pos, full_byte_count, frame->bits_in_buffer,
            output, frame->bit_buffer);
      }
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
      // Copy 8 bytes at a time until we reach the border.
      for (; i + 8 < full_byte_count; i += 8) {
        uint64_t chunk;
        memcpy(&chunk, writer.data.get() + bw_pos + i, 8);
        uint64_t out = frame->bit_buffer | (chunk << frame->bits_in_buffer);
        memcpy(output + i, &out, 8);
        frame->bit_buffer = chunk >> (64 - frame->bits_in_buffer);
      }
#endif
      // Scalar tail: shift in one byte at a time.
      for (; i < full_byte_count; i++) {
        AddBits(8, writer.data.get()[bw_pos + i], output + i,
                frame->bits_in_buffer, frame->bit_buffer);
      }
    }
    output += full_byte_count;
    output_size -= full_byte_count;
    bw_pos += full_byte_count;
    if (bw_pos == writer.bytes_written) {
      // Finished this writer: flush its partial byte into the stream.
      auto write = [&](size_t num, uint64_t bits) {
        size_t n = AddBits(num, bits, output, frame->bits_in_buffer,
                           frame->bit_buffer);
        output += n;
        output_size -= n;
      };
      if (writer.bits_in_buffer) {
        write(writer.bits_in_buffer, writer.buffer);
      }
      bw_pos = 0;
      cur++;
      // Sections are byte-aligned: pad to a byte boundary at each section
      // border (i.e. after the last channel of a group, or the header).
      if ((cur - 1) % nbc == 0 && frame->bits_in_buffer != 0) {
        write(8 - frame->bits_in_buffer, 0);
      }
    }
  }
}
1010
1011
0
// Releases a frame state allocated by the encoder. Safe to call with
// nullptr (delete on null is a no-op).
void JxlFastLosslessFreeFrameState(JxlFastLosslessFrameState* frame) {
  delete frame;
}
1014
1015
}  // extern "C"
1016
1017
#endif
1018
1019
#ifdef FJXL_SELF_INCLUDE
1020
1021
namespace {
1022
1023
// A pair of SIMD vectors, used when an operation (interleave, upcast)
// produces twice as many lanes as fit in one vector.
template <typename T>
struct VecPair {
  T low;  // lower-indexed lanes
  T hi;   // higher-indexed lanes
};
1028
1029
#ifdef FJXL_GENERIC_SIMD
1030
#undef FJXL_GENERIC_SIMD
1031
#endif
1032
1033
#ifdef FJXL_AVX512
1034
#define FJXL_GENERIC_SIMD
1035
struct SIMDVec32;
1036
// AVX-512 comparison result for 16 x 32-bit lanes.
struct Mask32 {
  __mmask16 mask;
  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
  // Number of leading lanes (from lane 0) for which the mask is set.
  size_t CountPrefix() const {
    return CtzNonZero(~uint64_t{_cvtmask16_u32(mask)});
  }
};
1043
1044
// AVX-512 wrapper around 16 x 32-bit unsigned lanes, exposing the minimal
// operation set the encoder needs (token computation, residual math).
struct SIMDVec32 {
  __m512i vec;

  static constexpr size_t kLanes = 16;

  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
    return SIMDVec32{_mm512_loadu_si512((__m512i*)data)};
  }
  FJXL_INLINE void Store(uint32_t* data) {
    _mm512_storeu_si512((__m512i*)data, vec);
  }
  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
    return SIMDVec32{_mm512_set1_epi32(v)};
  }
  // Per-lane bit width (32 - lzcnt), i.e. the hybrid-uint token of a value.
  FJXL_INLINE SIMDVec32 ValToToken() const {
    return SIMDVec32{
        _mm512_sub_epi32(_mm512_set1_epi32(32), _mm512_lzcnt_epi32(vec))};
  }
  // Unsigned saturating subtract: max(v, s) - s == (v > s ? v - s : 0).
  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
    return SIMDVec32{_mm512_sub_epi32(_mm512_max_epu32(vec, to_subtract.vec),
                                      to_subtract.vec)};
  }
  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
    return SIMDVec32{_mm512_sub_epi32(vec, to_subtract.vec)};
  }
  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
    return SIMDVec32{_mm512_add_epi32(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
    return SIMDVec32{_mm512_xor_epi32(vec, oth.vec)};
  }
  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
    return Mask32{_mm512_cmpeq_epi32_mask(vec, oth.vec)};
  }
  // Signed per-lane greater-than.
  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
    return Mask32{_mm512_cmpgt_epi32_mask(vec, oth.vec)};
  }
  // 1 << lane value.
  FJXL_INLINE SIMDVec32 Pow2() const {
    return SIMDVec32{_mm512_sllv_epi32(_mm512_set1_epi32(1), vec)};
  }
  template <size_t i>
  FJXL_INLINE SIMDVec32 SignedShiftRight() const {
    return SIMDVec32{_mm512_srai_epi32(vec, i)};
  }
};
1089
1090
struct SIMDVec16;
1091
1092
// AVX-512 comparison result for 32 x 16-bit lanes.
struct Mask16 {
  __mmask32 mask;
  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
  Mask16 And(const Mask16& oth) const {
    return Mask16{_kand_mask32(mask, oth.mask)};
  }
  // Number of leading lanes (from lane 0) for which the mask is set.
  size_t CountPrefix() const {
    return CtzNonZero(~uint64_t{_cvtmask32_u32(mask)});
  }
};
1102
1103
// AVX-512 wrapper around 32 x 16-bit lanes. Besides arithmetic/token ops,
// it provides pixel-format loaders (G/GA/RGB/RGBA at 8 and 16 bits) that
// deinterleave one vector's worth of pixels into per-channel vectors.
struct SIMDVec16 {
  __m512i vec;

  static constexpr size_t kLanes = 32;

  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
    return SIMDVec16{_mm512_loadu_si512((__m512i*)data)};
  }
  FJXL_INLINE void Store(uint16_t* data) {
    _mm512_storeu_si512((__m512i*)data, vec);
  }
  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
    return SIMDVec16{_mm512_set1_epi16(v)};
  }
  // Narrows two 32-bit vectors (unsigned saturation) into one 16-bit vector,
  // restoring lane order after packus's per-128-bit-block interleave.
  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
                                         const SIMDVec32& hi) {
    auto tmp = _mm512_packus_epi32(lo.vec, hi.vec);
    alignas(64) uint64_t perm[8] = {0, 2, 4, 6, 1, 3, 5, 7};
    return SIMDVec16{
        _mm512_permutex2var_epi64(tmp, _mm512_load_si512((__m512i*)perm), tmp)};
  }

  // Per-16-bit-lane bit width, computed via 32-bit lzcnt on both halves of
  // each 32-bit pair and recombined.
  FJXL_INLINE SIMDVec16 ValToToken() const {
    auto c16 = _mm512_set1_epi32(16);
    auto c32 = _mm512_set1_epi32(32);
    auto low16bit = _mm512_set1_epi32(0x0000FFFF);
    auto lzhi =
        _mm512_sub_epi32(c16, _mm512_min_epu32(c16, _mm512_lzcnt_epi32(vec)));
    auto lzlo = _mm512_sub_epi32(
        c32, _mm512_lzcnt_epi32(_mm512_and_si512(low16bit, vec)));
    return SIMDVec16{_mm512_or_si512(lzlo, _mm512_slli_epi32(lzhi, 16))};
  }

  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
    return SIMDVec16{_mm512_subs_epu16(vec, to_subtract.vec)};
  }
  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
    return SIMDVec16{_mm512_sub_epi16(vec, to_subtract.vec)};
  }
  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
    return SIMDVec16{_mm512_add_epi16(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
    return SIMDVec16{_mm512_min_epu16(vec, oth.vec)};
  }
  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
    return Mask16{_mm512_cmpeq_epi16_mask(vec, oth.vec)};
  }
  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
    return Mask16{_mm512_cmpgt_epi16_mask(vec, oth.vec)};
  }
  // 1 << lane value.
  FJXL_INLINE SIMDVec16 Pow2() const {
    return SIMDVec16{_mm512_sllv_epi16(_mm512_set1_epi16(1), vec)};
  }
  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
    return SIMDVec16{_mm512_or_si512(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
    return SIMDVec16{_mm512_xor_si512(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
    return SIMDVec16{_mm512_and_si512(vec, oth.vec)};
  }
  // Average rounded toward negative infinity: (a + b) >> 1 (arithmetic).
  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
    return SIMDVec16{_mm512_srai_epi16(_mm512_add_epi16(vec, oth.vec), 1)};
  }
  // Sets the high byte of each lane so shuffle_epi8 lookups below only use
  // the low byte as an index (high bit set -> zero for out-of-range).
  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
    return SIMDVec16{_mm512_or_si512(vec, _mm512_set1_epi16(0xFF00))};
  }
  // 16-entry byte table lookup, broadcast across 128-bit blocks.
  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
    return SIMDVec16{_mm512_shuffle_epi8(
        _mm512_broadcast_i32x4(_mm_loadu_si128((__m128i*)table)), vec)};
  }
  // Interleaves `low` and *this lane-by-lane; the permutes undo the
  // per-128-bit-block ordering of unpacklo/hi.
  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
    auto lo = _mm512_unpacklo_epi16(low.vec, vec);
    auto hi = _mm512_unpackhi_epi16(low.vec, vec);
    alignas(64) uint64_t perm1[8] = {0, 1, 8, 9, 2, 3, 10, 11};
    alignas(64) uint64_t perm2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
    return {SIMDVec16{_mm512_permutex2var_epi64(
                lo, _mm512_load_si512((__m512i*)perm1), hi)},
            SIMDVec16{_mm512_permutex2var_epi64(
                lo, _mm512_load_si512((__m512i*)perm2), hi)}};
  }
  // Zero-extends 16-bit lanes into two 32-bit vectors, preserving order.
  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
    auto lo = _mm512_unpacklo_epi16(vec, _mm512_setzero_si512());
    auto hi = _mm512_unpackhi_epi16(vec, _mm512_setzero_si512());
    alignas(64) uint64_t perm1[8] = {0, 1, 8, 9, 2, 3, 10, 11};
    alignas(64) uint64_t perm2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
    return {SIMDVec32{_mm512_permutex2var_epi64(
                lo, _mm512_load_si512((__m512i*)perm1), hi)},
            SIMDVec32{_mm512_permutex2var_epi64(
                lo, _mm512_load_si512((__m512i*)perm2), hi)}};
  }
  template <size_t i>
  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
    return SIMDVec16{_mm512_srai_epi16(vec, i)};
  }

  // --- Pixel loaders: 32 pixels -> one 16-bit vector per channel. ---

  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
    __m256i bytes = _mm256_loadu_si256((__m256i*)data);
    return {SIMDVec16{_mm512_cvtepu8_epi16(bytes)}};
  }
  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
    return {Load((const uint16_t*)data)};
  }

  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
    __m512i bytes = _mm512_loadu_si512((__m512i*)data);
    // Even bytes are gray, odd bytes are alpha.
    __m512i gray = _mm512_and_si512(bytes, _mm512_set1_epi16(0xFF));
    __m512i alpha = _mm512_srli_epi16(bytes, 8);
    return {SIMDVec16{gray}, SIMDVec16{alpha}};
  }
  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
    __m512i bytes1 = _mm512_loadu_si512((__m512i*)data);
    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 64));
    __m512i g_mask = _mm512_set1_epi32(0xFFFF);
    // Fixes lane order after packus's per-128-bit-block behavior.
    __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
    __m512i g = _mm512_permutexvar_epi64(
        permuteidx, _mm512_packus_epi32(_mm512_and_si512(bytes1, g_mask),
                                        _mm512_and_si512(bytes2, g_mask)));
    __m512i a = _mm512_permutexvar_epi64(
        permuteidx, _mm512_packus_epi32(_mm512_srli_epi32(bytes1, 16),
                                        _mm512_srli_epi32(bytes2, 16)));
    return {SIMDVec16{g}, SIMDVec16{a}};
  }

  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
    __m512i bytes1 =
        _mm512_zextsi256_si512(_mm256_loadu_si256((__m256i*)(data + 64)));

    // 0x7A = element of upper half of second vector = 0 after lookup; still in
    // the upper half once we add 1 or 2.
    uint8_t z = 0x7A;
    // Gather every 3rd byte (R) into the low byte of each 16-bit lane;
    // G and B indices are R indices + 1 and + 2.
    __m512i ridx =
        _mm512_set_epi8(z, 93, z, 90, z, 87, z, 84, z, 81, z, 78, z, 75, z, 72,
                        z, 69, z, 66, z, 63, z, 60, z, 57, z, 54, z, 51, z, 48,
                        z, 45, z, 42, z, 39, z, 36, z, 33, z, 30, z, 27, z, 24,
                        z, 21, z, 18, z, 15, z, 12, z, 9, z, 6, z, 3, z, 0);
    __m512i gidx = _mm512_add_epi8(ridx, _mm512_set1_epi8(1));
    __m512i bidx = _mm512_add_epi8(gidx, _mm512_set1_epi8(1));
    __m512i r = _mm512_permutex2var_epi8(bytes0, ridx, bytes1);
    __m512i g = _mm512_permutex2var_epi8(bytes0, gidx, bytes1);
    __m512i b = _mm512_permutex2var_epi8(bytes0, bidx, bytes1);
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}};
  }
  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
    __m512i bytes1 = _mm512_loadu_si512((__m512i*)(data + 64));
    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 128));

    // Every 3rd 16-bit word (R) out of bytes0/bytes1; the remainder comes
    // from bytes2 via the masked permutes below.
    __m512i ridx_lo = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 63, 60, 57,
                                       54, 51, 48, 45, 42, 39, 36, 33, 30, 27,
                                       24, 21, 18, 15, 12, 9, 6, 3, 0);
    // -1 is such that when adding 1 or 2, we get the correct index for
    // green/blue.
    __m512i ridx_hi =
        _mm512_set_epi16(29, 26, 23, 20, 17, 14, 11, 8, 5, 2, -1, 0, 0, 0, 0, 0,
                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    __m512i gidx_lo = _mm512_add_epi16(ridx_lo, _mm512_set1_epi16(1));
    __m512i gidx_hi = _mm512_add_epi16(ridx_hi, _mm512_set1_epi16(1));
    __m512i bidx_lo = _mm512_add_epi16(gidx_lo, _mm512_set1_epi16(1));
    __m512i bidx_hi = _mm512_add_epi16(gidx_hi, _mm512_set1_epi16(1));

    __mmask32 rmask = _cvtu32_mask32(0b11111111110000000000000000000000);
    __mmask32 gbmask = _cvtu32_mask32(0b11111111111000000000000000000000);

    __m512i rlo = _mm512_permutex2var_epi16(bytes0, ridx_lo, bytes1);
    __m512i glo = _mm512_permutex2var_epi16(bytes0, gidx_lo, bytes1);
    __m512i blo = _mm512_permutex2var_epi16(bytes0, bidx_lo, bytes1);
    __m512i r = _mm512_mask_permutexvar_epi16(rlo, rmask, ridx_hi, bytes2);
    __m512i g = _mm512_mask_permutexvar_epi16(glo, gbmask, gidx_hi, bytes2);
    __m512i b = _mm512_mask_permutexvar_epi16(blo, gbmask, bidx_hi, bytes2);
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}};
  }

  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
    __m512i bytes1 = _mm512_loadu_si512((__m512i*)data);
    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 64));
    // Split 32-bit RGBA pixels into RG and BA word streams, then split
    // each into its two byte channels.
    __m512i rg_mask = _mm512_set1_epi32(0xFFFF);
    __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
    __m512i rg = _mm512_permutexvar_epi64(
        permuteidx, _mm512_packus_epi32(_mm512_and_si512(bytes1, rg_mask),
                                        _mm512_and_si512(bytes2, rg_mask)));
    __m512i b_a = _mm512_permutexvar_epi64(
        permuteidx, _mm512_packus_epi32(_mm512_srli_epi32(bytes1, 16),
                                        _mm512_srli_epi32(bytes2, 16)));
    __m512i r = _mm512_and_si512(rg, _mm512_set1_epi16(0xFF));
    __m512i g = _mm512_srli_epi16(rg, 8);
    __m512i b = _mm512_and_si512(b_a, _mm512_set1_epi16(0xFF));
    __m512i a = _mm512_srli_epi16(b_a, 8);
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
  }
  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
    __m512i bytes1 = _mm512_loadu_si512((__m512i*)(data + 64));
    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 128));
    __m512i bytes3 = _mm512_loadu_si512((__m512i*)(data + 192));

    // packus + permute that keeps lane order across 128-bit blocks.
    auto pack32 = [](__m512i a, __m512i b) {
      __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
      return _mm512_permutexvar_epi64(permuteidx, _mm512_packus_epi32(a, b));
    };
    auto packlow32 = [&pack32](__m512i a, __m512i b) {
      __m512i mask = _mm512_set1_epi32(0xFFFF);
      return pack32(_mm512_and_si512(a, mask), _mm512_and_si512(b, mask));
    };
    auto packhi32 = [&pack32](__m512i a, __m512i b) {
      return pack32(_mm512_srli_epi32(a, 16), _mm512_srli_epi32(b, 16));
    };

    // Two rounds of even/odd word splitting deinterleave R/G/B/A.
    __m512i rb0 = packlow32(bytes0, bytes1);
    __m512i rb1 = packlow32(bytes2, bytes3);
    __m512i ga0 = packhi32(bytes0, bytes1);
    __m512i ga1 = packhi32(bytes2, bytes3);

    __m512i r = packlow32(rb0, rb1);
    __m512i g = packlow32(ga0, ga1);
    __m512i b = packhi32(rb0, rb1);
    __m512i a = packhi32(ga0, ga1);
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
  }

  // Byte-swaps each 16-bit lane (for big-endian input samples).
  void SwapEndian() {
    auto indices = _mm512_broadcast_i32x4(
        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
    vec = _mm512_shuffle_epi8(vec, indices);
  }
};
1332
1333
// Per-lane select: lanes with the mask set take if_true, others if_false.
SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
                             const SIMDVec16& if_false) {
  return SIMDVec16{_mm512_mask_blend_epi16(mask, if_false.vec, if_true.vec)};
}
1337
1338
// Per-lane select: lanes with the mask set take if_true, others if_false.
SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
                             const SIMDVec32& if_false) {
  return SIMDVec32{_mm512_mask_blend_epi32(mask, if_false.vec, if_true.vec)};
}
1342
1343
// Final stage of the bit-packing pipeline: 8 lanes of (bit count, bits)
// pairs, ready to be stored and then drained into the output bitstream.
struct Bits64 {
  static constexpr size_t kLanes = 8;

  __m512i nbits;  // per-lane number of valid low bits in `bits`
  __m512i bits;

  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
    _mm512_storeu_si512((__m512i*)nbits_out, nbits);
    _mm512_storeu_si512((__m512i*)bits_out, bits);
  }
};
1354
1355
// 16 lanes of 32-bit (bit count, bits) pairs. Merge() pairs adjacent lanes
// into Bits64; Interleave/ClipTo/Skip manipulate lanes before merging.
struct Bits32 {
  __m512i nbits;  // per-lane count of valid low bits in `bits` (sum <= 64
                  // after merging, by construction upstream)
  __m512i bits;

  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
    return Bits32{nbits.vec, bits.vec};
  }

  // Concatenates each 32-bit pair into one 64-bit lane: the high half's bits
  // are shifted up past the low half's bits, counts are added.
  Bits64 Merge() const {
    auto nbits_hi32 = _mm512_srli_epi64(nbits, 32);
    auto nbits_lo32 = _mm512_and_si512(nbits, _mm512_set1_epi64(0xFFFFFFFF));
    auto bits_hi32 = _mm512_srli_epi64(bits, 32);
    auto bits_lo32 = _mm512_and_si512(bits, _mm512_set1_epi64(0xFFFFFFFF));

    auto nbits64 = _mm512_add_epi64(nbits_hi32, nbits_lo32);
    auto bits64 =
        _mm512_or_si512(_mm512_sllv_epi64(bits_hi32, nbits_lo32), bits_lo32);
    return Bits64{nbits64, bits64};
  }

  // Prepends `low`'s bits below this vector's bits, lane by lane.
  void Interleave(const Bits32& low) {
    bits = _mm512_or_si512(_mm512_sllv_epi32(bits, low.nbits), low.bits);
    nbits = _mm512_add_epi32(nbits, low.nbits);
  }

  // Zeroes all lanes at index >= n (keeps the first n lanes).
  void ClipTo(size_t n) {
    n = std::min<size_t>(n, 16);
    constexpr uint32_t kMask[32] = {
        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
        ~0u, ~0u, ~0u, ~0u, ~0u, 0,   0,   0,   0,   0,   0,
        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
    };
    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 16 - n));
    nbits = _mm512_and_si512(mask, nbits);
    bits = _mm512_and_si512(mask, bits);
  }
  // Zeroes the first n lanes (keeps lanes at index >= n).
  void Skip(size_t n) {
    n = std::min<size_t>(n, 16);
    constexpr uint32_t kMask[32] = {
        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
        0,   0,   0,   0,   0,   ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
    };
    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 16 - n));
    nbits = _mm512_and_si512(mask, nbits);
    bits = _mm512_and_si512(mask, bits);
  }
};
1403
1404
// 32 lanes of 16-bit (bit count, bits) pairs; first stage of the packing
// pipeline (Bits16 -> Bits32 -> Bits64). Same lane semantics as Bits32.
struct Bits16 {
  __m512i nbits;  // per-lane count of valid low bits in `bits`
  __m512i bits;

  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
    return Bits16{nbits.vec, bits.vec};
  }

  // Concatenates each 16-bit pair into one 32-bit lane.
  Bits32 Merge() const {
    auto nbits_hi16 = _mm512_srli_epi32(nbits, 16);
    auto nbits_lo16 = _mm512_and_si512(nbits, _mm512_set1_epi32(0xFFFF));
    auto bits_hi16 = _mm512_srli_epi32(bits, 16);
    auto bits_lo16 = _mm512_and_si512(bits, _mm512_set1_epi32(0xFFFF));

    auto nbits32 = _mm512_add_epi32(nbits_hi16, nbits_lo16);
    auto bits32 =
        _mm512_or_si512(_mm512_sllv_epi32(bits_hi16, nbits_lo16), bits_lo16);
    return Bits32{nbits32, bits32};
  }

  // Prepends `low`'s bits below this vector's bits, lane by lane.
  void Interleave(const Bits16& low) {
    bits = _mm512_or_si512(_mm512_sllv_epi16(bits, low.nbits), low.bits);
    nbits = _mm512_add_epi16(nbits, low.nbits);
  }

  // Zeroes all lanes at index >= n (keeps the first n lanes).
  void ClipTo(size_t n) {
    n = std::min<size_t>(n, 32);
    constexpr uint16_t kMask[64] = {
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0,      0,      0,      0,      0,      0,      0,      0,
        0,      0,      0,      0,      0,      0,      0,      0,
        0,      0,      0,      0,      0,      0,      0,      0,
        0,      0,      0,      0,      0,      0,      0,      0,
    };
    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 32 - n));
    nbits = _mm512_and_si512(mask, nbits);
    bits = _mm512_and_si512(mask, bits);
  }
  // Zeroes the first n lanes (keeps lanes at index >= n).
  void Skip(size_t n) {
    n = std::min<size_t>(n, 32);
    constexpr uint16_t kMask[64] = {
        0,      0,      0,      0,      0,      0,      0,      0,
        0,      0,      0,      0,      0,      0,      0,      0,
        0,      0,      0,      0,      0,      0,      0,      0,
        0,      0,      0,      0,      0,      0,      0,      0,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
    };
    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 32 - n));
    nbits = _mm512_and_si512(mask, nbits);
    bits = _mm512_and_si512(mask, bits);
  }
};
1462
1463
#endif
1464
1465
#ifdef FJXL_AVX2
1466
#define FJXL_GENERIC_SIMD
1467
1468
struct SIMDVec32;
1469
1470
// AVX2 comparison result for 8 x 32-bit lanes (all-ones / all-zeros lanes).
struct Mask32 {
  __m256i mask;
  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
  // Number of leading lanes (from lane 0) for which the mask is set;
  // movemask_ps extracts one bit per 32-bit lane.
  size_t CountPrefix() const {
    return CtzNonZero(~static_cast<uint64_t>(
        static_cast<uint8_t>(_mm256_movemask_ps(_mm256_castsi256_ps(mask)))));
  }
};
1478
1479
// AVX2 wrapper around 8 x 32-bit lanes; mirrors the AVX-512 SIMDVec32 API.
struct SIMDVec32 {
  __m256i vec;

  static constexpr size_t kLanes = 8;

  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
    return SIMDVec32{_mm256_loadu_si256((__m256i*)data)};
  }
  FJXL_INLINE void Store(uint32_t* data) {
    _mm256_storeu_si256((__m256i*)data, vec);
  }
  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
    return SIMDVec32{_mm256_set1_epi32(v)};
  }
  // Per-lane bit width without lzcnt: converts to float and reads the
  // exponent field (exponent - 126 == number of significant bits), clamped
  // to 0 for zero input.
  FJXL_INLINE SIMDVec32 ValToToken() const {
    auto f32 = _mm256_castps_si256(_mm256_cvtepi32_ps(vec));
    return SIMDVec32{_mm256_max_epi32(
        _mm256_setzero_si256(),
        _mm256_sub_epi32(_mm256_srli_epi32(f32, 23), _mm256_set1_epi32(126)))};
  }
  // Unsigned saturating subtract: max(v, s) - s.
  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
    return SIMDVec32{_mm256_sub_epi32(_mm256_max_epu32(vec, to_subtract.vec),
                                      to_subtract.vec)};
  }
  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
    return SIMDVec32{_mm256_sub_epi32(vec, to_subtract.vec)};
  }
  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
    return SIMDVec32{_mm256_add_epi32(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
    return SIMDVec32{_mm256_xor_si256(vec, oth.vec)};
  }
  // 1 << lane value.
  FJXL_INLINE SIMDVec32 Pow2() const {
    return SIMDVec32{_mm256_sllv_epi32(_mm256_set1_epi32(1), vec)};
  }
  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
    return Mask32{_mm256_cmpeq_epi32(vec, oth.vec)};
  }
  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
    return Mask32{_mm256_cmpgt_epi32(vec, oth.vec)};
  }
  template <size_t i>
  FJXL_INLINE SIMDVec32 SignedShiftRight() const {
    return SIMDVec32{_mm256_srai_epi32(vec, i)};
  }
};
1526
1527
struct SIMDVec16;

// 16-lane comparison mask for SIMDVec16; each 16-bit lane is either
// all-ones or all-zeros.
struct Mask16 {
  __m256i mask;
  // Per lane, selects if_true where the mask is set, if_false otherwise.
  // Defined after SIMDVec16 (see below).
  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
  // Lane-wise logical AND of two masks.
  Mask16 And(const Mask16& oth) const {
    return Mask16{_mm256_and_si256(mask, oth.mask)};
  }
  // Number of consecutive set lanes starting at lane 0. movemask yields
  // one bit per *byte*; only the low 128-bit half (32 bits -> 16 lanes'
  // worth is not all covered, the cast keeps the low 32 mask bits), and
  // each 16-bit lane contributes 2 bits, hence the division by 2.
  size_t CountPrefix() const {
    return CtzNonZero(~static_cast<uint64_t>(
               static_cast<uint32_t>(_mm256_movemask_epi8(mask)))) /
           2;
  }
};
// 16-lane vector of 16-bit unsigned integers backed by one AVX2 register.
// Also provides the pixel-loading helpers that deinterleave packed
// G/GA/RGB/RGBA pixel data (8- or 16-bit per channel) into planar vectors.
struct SIMDVec16 {
  __m256i vec;

  // Number of 16-bit lanes per vector.
  static constexpr size_t kLanes = 16;

  // Unaligned load of 16 uint16 values.
  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
    return SIMDVec16{_mm256_loadu_si256((__m256i*)data)};
  }
  // Unaligned store of 16 uint16 values.
  FJXL_INLINE void Store(uint16_t* data) {
    _mm256_storeu_si256((__m256i*)data, vec);
  }
  // Broadcasts v into every lane.
  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
    return SIMDVec16{_mm256_set1_epi16(v)};
  }
  // Narrows two 32-bit vectors into one 16-bit vector (lo first, then hi).
  // packus interleaves 128-bit halves, so the permute restores order.
  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
                                         const SIMDVec32& hi) {
    auto tmp = _mm256_packus_epi32(lo.vec, hi.vec);
    return SIMDVec16{_mm256_permute4x64_epi64(tmp, 0b11011000)};
  }

  // Per lane, computes the hybrid-uint token (bit width of the value).
  // Each 4-bit nibble of the value is looked up in a LUT that returns the
  // token the value would have if that nibble were its topmost nonzero
  // nibble; the max over the four nibble results is the answer. ORing
  // 0xFF00 into each lookup index makes pshufb zero the high byte of each
  // lane (indices with the top bit set return 0).
  FJXL_INLINE SIMDVec16 ValToToken() const {
    auto nibble0 =
        _mm256_or_si256(_mm256_and_si256(vec, _mm256_set1_epi16(0xF)),
                        _mm256_set1_epi16(0xFF00));
    auto nibble1 = _mm256_or_si256(
        _mm256_and_si256(_mm256_srli_epi16(vec, 4), _mm256_set1_epi16(0xF)),
        _mm256_set1_epi16(0xFF00));
    auto nibble2 = _mm256_or_si256(
        _mm256_and_si256(_mm256_srli_epi16(vec, 8), _mm256_set1_epi16(0xF)),
        _mm256_set1_epi16(0xFF00));
    auto nibble3 =
        _mm256_or_si256(_mm256_srli_epi16(vec, 12), _mm256_set1_epi16(0xFF00));

    // LUT i maps nibble value n (1..15) to bitwidth(n) + 4 * i.
    auto lut0 = _mm256_broadcastsi128_si256(
        _mm_setr_epi8(0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4));
    auto lut1 = _mm256_broadcastsi128_si256(
        _mm_setr_epi8(0, 5, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8));
    auto lut2 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
        0, 9, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12));
    auto lut3 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
        0, 13, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16));

    auto token0 = _mm256_shuffle_epi8(lut0, nibble0);
    auto token1 = _mm256_shuffle_epi8(lut1, nibble1);
    auto token2 = _mm256_shuffle_epi8(lut2, nibble2);
    auto token3 = _mm256_shuffle_epi8(lut3, nibble3);

    auto token = _mm256_max_epi16(_mm256_max_epi16(token0, token1),
                                  _mm256_max_epi16(token2, token3));
    return SIMDVec16{token};
  }

  // Unsigned saturating subtraction: max(vec - to_subtract, 0).
  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
    return SIMDVec16{_mm256_subs_epu16(vec, to_subtract.vec)};
  }
  // Lane-wise wrapping subtraction.
  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
    return SIMDVec16{_mm256_sub_epi16(vec, to_subtract.vec)};
  }
  // Lane-wise wrapping addition.
  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
    return SIMDVec16{_mm256_add_epi16(vec, oth.vec)};
  }
  // Lane-wise unsigned minimum.
  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
    return SIMDVec16{_mm256_min_epu16(vec, oth.vec)};
  }
  // Lane-wise equality comparison.
  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
    return Mask16{_mm256_cmpeq_epi16(vec, oth.vec)};
  }
  // Lane-wise signed greater-than comparison.
  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
    return Mask16{_mm256_cmpgt_epi16(vec, oth.vec)};
  }
  // Per lane, computes 1 << vec (vec assumed < 16). There is no 16-bit
  // variable shift in AVX2, so the low and high result bytes are each
  // produced by a pshufb LUT on the (0xFF00-masked) shift amount and then
  // recombined.
  FJXL_INLINE SIMDVec16 Pow2() const {
    auto pow2_lo_lut = _mm256_broadcastsi128_si256(
        _mm_setr_epi8(1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6,
                      1u << 7, 0, 0, 0, 0, 0, 0, 0, 0));
    auto pow2_hi_lut = _mm256_broadcastsi128_si256(
        _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1 << 0, 1 << 1, 1 << 2, 1 << 3,
                      1 << 4, 1 << 5, 1 << 6, 1u << 7));

    auto masked = _mm256_or_si256(vec, _mm256_set1_epi16(0xFF00));

    auto pow2_lo = _mm256_shuffle_epi8(pow2_lo_lut, masked);
    auto pow2_hi = _mm256_shuffle_epi8(pow2_hi_lut, masked);

    auto pow2 = _mm256_or_si256(_mm256_slli_epi16(pow2_hi, 8), pow2_lo);
    return SIMDVec16{pow2};
  }
  // Lane-wise bitwise OR.
  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
    return SIMDVec16{_mm256_or_si256(vec, oth.vec)};
  }
  // Lane-wise bitwise XOR.
  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
    return SIMDVec16{_mm256_xor_si256(vec, oth.vec)};
  }
  // Lane-wise bitwise AND.
  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
    return SIMDVec16{_mm256_and_si256(vec, oth.vec)};
  }
  // Lane-wise average rounded toward negative infinity: (a + b) >> 1 with
  // an arithmetic shift. NOTE(review): the add may wrap for large inputs;
  // presumably callers only feed values where that is intended or benign.
  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
    return SIMDVec16{_mm256_srai_epi16(_mm256_add_epi16(vec, oth.vec), 1)};
  }
  // Sets the high byte of every lane so that a subsequent pshufb-based
  // U8Lookup zeroes that byte (pshufb returns 0 for indices with the top
  // bit set).
  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
    return SIMDVec16{_mm256_or_si256(vec, _mm256_set1_epi16(0xFF00))};
  }
  // Looks up each byte of the vector in a 16-entry byte table. Lanes must
  // have been prepared with PrepareForU8Lookup first.
  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
    return SIMDVec16{_mm256_shuffle_epi8(
        _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)table)), vec)};
  }
  // Interleaves `low` and *this lane by lane: result[0] holds
  // low[0], this[0], low[1], ... in memory order. The cross-lane permute
  // fixes up AVX2's per-128-bit-half unpacking.
  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
    auto v02 = _mm256_unpacklo_epi16(low.vec, vec);
    auto v13 = _mm256_unpackhi_epi16(low.vec, vec);
    return {SIMDVec16{_mm256_permute2x128_si256(v02, v13, 0x20)},
            SIMDVec16{_mm256_permute2x128_si256(v02, v13, 0x31)}};
  }
  // Zero-extends the 16 lanes into two 32-bit vectors, preserving order
  // (first 8 lanes in .low, last 8 in .hi).
  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
    auto v02 = _mm256_unpacklo_epi16(vec, _mm256_setzero_si256());
    auto v13 = _mm256_unpackhi_epi16(vec, _mm256_setzero_si256());
    return {SIMDVec32{_mm256_permute2x128_si256(v02, v13, 0x20)},
            SIMDVec32{_mm256_permute2x128_si256(v02, v13, 0x31)}};
  }
  // Arithmetic right shift by the compile-time count i.
  template <size_t i>
  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
    return SIMDVec16{_mm256_srai_epi16(vec, i)};
  }

  // Loads 16 8-bit grayscale samples, zero-extended to 16 bits.
  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
    __m128i bytes = _mm_loadu_si128((__m128i*)data);
    return {SIMDVec16{_mm256_cvtepu8_epi16(bytes)}};
  }
  // Loads 16 16-bit grayscale samples.
  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
    return {Load((const uint16_t*)data)};
  }

  // Loads 16 interleaved 8-bit gray+alpha pixels into planar {gray, alpha}.
  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
    __m256i bytes = _mm256_loadu_si256((__m256i*)data);
    __m256i gray = _mm256_and_si256(bytes, _mm256_set1_epi16(0xFF));
    __m256i alpha = _mm256_srli_epi16(bytes, 8);
    return {SIMDVec16{gray}, SIMDVec16{alpha}};
  }
  // Loads 16 interleaved 16-bit gray+alpha pixels into planar {gray, alpha}.
  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
    __m256i bytes1 = _mm256_loadu_si256((__m256i*)data);
    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 32));
    __m256i g_mask = _mm256_set1_epi32(0xFFFF);
    // packus narrows per 128-bit half; permute4x64 restores pixel order.
    __m256i g = _mm256_permute4x64_epi64(
        _mm256_packus_epi32(_mm256_and_si256(bytes1, g_mask),
                            _mm256_and_si256(bytes2, g_mask)),
        0b11011000);
    __m256i a = _mm256_permute4x64_epi64(
        _mm256_packus_epi32(_mm256_srli_epi32(bytes1, 16),
                            _mm256_srli_epi32(bytes2, 16)),
        0b11011000);
    return {SIMDVec16{g}, SIMDVec16{a}};
  }

  // Loads 16 interleaved 8-bit RGB pixels (48 bytes) into planar {R, G, B}.
  // Each 16-byte chunk is shuffled so same-channel bytes are grouped, then
  // the three chunks are blended and rotated into full per-channel vectors.
  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
    __m128i bytes0 = _mm_loadu_si128((__m128i*)data);
    __m128i bytes1 = _mm_loadu_si128((__m128i*)(data + 16));
    __m128i bytes2 = _mm_loadu_si128((__m128i*)(data + 32));

    __m128i idx =
        _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);

    // After the shuffle each register holds 6 bytes of one channel followed
    // by 5+5 bytes of the other two (channel roles rotate between chunks).
    __m128i r6b5g5_0 = _mm_shuffle_epi8(bytes0, idx);
    __m128i g6r5b5_1 = _mm_shuffle_epi8(bytes1, idx);
    __m128i b6g5r5_2 = _mm_shuffle_epi8(bytes2, idx);

    __m128i mask010 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF,
                                    0xFF, 0, 0, 0, 0, 0);
    __m128i mask001 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF,
                                    0xFF, 0xFF, 0xFF);

    __m128i b2g2b1 = _mm_blendv_epi8(b6g5r5_2, g6r5b5_1, mask001);
    __m128i b2b0b1 = _mm_blendv_epi8(b2g2b1, r6b5g5_0, mask010);

    __m128i r0r1b1 = _mm_blendv_epi8(r6b5g5_0, g6r5b5_1, mask010);
    __m128i r0r1r2 = _mm_blendv_epi8(r0r1b1, b6g5r5_2, mask001);

    __m128i g1r1g0 = _mm_blendv_epi8(g6r5b5_1, r6b5g5_0, mask001);
    __m128i g1g2g0 = _mm_blendv_epi8(g1r1g0, b6g5r5_2, mask010);

    // Rotate the blended registers so pixels end up in source order.
    __m128i g0g1g2 = _mm_alignr_epi8(g1g2g0, g1g2g0, 11);
    __m128i b0b1b2 = _mm_alignr_epi8(b2b0b1, b2b0b1, 6);

    return {SIMDVec16{_mm256_cvtepu8_epi16(r0r1r2)},
            SIMDVec16{_mm256_cvtepu8_epi16(g0g1g2)},
            SIMDVec16{_mm256_cvtepu8_epi16(b0b1b2)}};
  }
  // Loads 16 interleaved 16-bit RGB pixels (96 bytes) into planar {R, G, B}.
  // Works like LoadRGB8 on the low and high bytes separately, then merges.
  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
    auto load_and_split_lohi = [](const unsigned char* data) {
      // LHLHLH...
      __m256i bytes = _mm256_loadu_si256((__m256i*)data);
      // L0L0L0...
      __m256i lo = _mm256_and_si256(bytes, _mm256_set1_epi16(0xFF));
      // H0H0H0...
      __m256i hi = _mm256_srli_epi16(bytes, 8);
      // LLLLLLLLHHHHHHHHLLLLLLLLHHHHHHHH
      __m256i packed = _mm256_packus_epi16(lo, hi);
      return _mm256_permute4x64_epi64(packed, 0b11011000);
    };
    __m256i bytes0 = load_and_split_lohi(data);
    __m256i bytes1 = load_and_split_lohi(data + 32);
    __m256i bytes2 = load_and_split_lohi(data + 64);

    __m256i idx = _mm256_broadcastsi128_si256(
        _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13));

    __m256i r6b5g5_0 = _mm256_shuffle_epi8(bytes0, idx);
    __m256i g6r5b5_1 = _mm256_shuffle_epi8(bytes1, idx);
    __m256i b6g5r5_2 = _mm256_shuffle_epi8(bytes2, idx);

    __m256i mask010 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
        0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0));
    __m256i mask001 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF));

    __m256i b2g2b1 = _mm256_blendv_epi8(b6g5r5_2, g6r5b5_1, mask001);
    __m256i b2b0b1 = _mm256_blendv_epi8(b2g2b1, r6b5g5_0, mask010);

    __m256i r0r1b1 = _mm256_blendv_epi8(r6b5g5_0, g6r5b5_1, mask010);
    __m256i r0r1r2 = _mm256_blendv_epi8(r0r1b1, b6g5r5_2, mask001);

    __m256i g1r1g0 = _mm256_blendv_epi8(g6r5b5_1, r6b5g5_0, mask001);
    __m256i g1g2g0 = _mm256_blendv_epi8(g1r1g0, b6g5r5_2, mask010);

    __m256i g0g1g2 = _mm256_alignr_epi8(g1g2g0, g1g2g0, 11);
    __m256i b0b1b2 = _mm256_alignr_epi8(b2b0b1, b2b0b1, 6);

    // Now r0r1r2, g0g1g2, b0b1b2 have the low bytes of the RGB pixels in their
    // lower half, and the high bytes in their upper half.

    auto combine_low_hi = [](__m256i v) {
      __m128i low = _mm256_extracti128_si256(v, 0);
      __m128i hi = _mm256_extracti128_si256(v, 1);
      __m256i low16 = _mm256_cvtepu8_epi16(low);
      __m256i hi16 = _mm256_cvtepu8_epi16(hi);
      return _mm256_or_si256(_mm256_slli_epi16(hi16, 8), low16);
    };

    return {SIMDVec16{combine_low_hi(r0r1r2)},
            SIMDVec16{combine_low_hi(g0g1g2)},
            SIMDVec16{combine_low_hi(b0b1b2)}};
  }

  // Loads 16 interleaved 8-bit RGBA pixels (64 bytes) into planar
  // {R, G, B, A}.
  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
    __m256i bytes1 = _mm256_loadu_si256((__m256i*)data);
    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 32));
    __m256i rg_mask = _mm256_set1_epi32(0xFFFF);
    // Split each 32-bit pixel into its RG and BA 16-bit halves, fixing up
    // the per-half packing with permute4x64.
    __m256i rg = _mm256_permute4x64_epi64(
        _mm256_packus_epi32(_mm256_and_si256(bytes1, rg_mask),
                            _mm256_and_si256(bytes2, rg_mask)),
        0b11011000);
    __m256i b_a = _mm256_permute4x64_epi64(
        _mm256_packus_epi32(_mm256_srli_epi32(bytes1, 16),
                            _mm256_srli_epi32(bytes2, 16)),
        0b11011000);
    __m256i r = _mm256_and_si256(rg, _mm256_set1_epi16(0xFF));
    __m256i g = _mm256_srli_epi16(rg, 8);
    __m256i b = _mm256_and_si256(b_a, _mm256_set1_epi16(0xFF));
    __m256i a = _mm256_srli_epi16(b_a, 8);
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
  }
  // Loads 16 interleaved 16-bit RGBA pixels (128 bytes) into planar
  // {R, G, B, A} via two rounds of pack-and-split.
  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
    __m256i bytes0 = _mm256_loadu_si256((__m256i*)data);
    __m256i bytes1 = _mm256_loadu_si256((__m256i*)(data + 32));
    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 64));
    __m256i bytes3 = _mm256_loadu_si256((__m256i*)(data + 96));

    auto pack32 = [](__m256i a, __m256i b) {
      return _mm256_permute4x64_epi64(_mm256_packus_epi32(a, b), 0b11011000);
    };
    auto packlow32 = [&pack32](__m256i a, __m256i b) {
      __m256i mask = _mm256_set1_epi32(0xFFFF);
      return pack32(_mm256_and_si256(a, mask), _mm256_and_si256(b, mask));
    };
    auto packhi32 = [&pack32](__m256i a, __m256i b) {
      return pack32(_mm256_srli_epi32(a, 16), _mm256_srli_epi32(b, 16));
    };

    __m256i rb0 = packlow32(bytes0, bytes1);
    __m256i rb1 = packlow32(bytes2, bytes3);
    __m256i ga0 = packhi32(bytes0, bytes1);
    __m256i ga1 = packhi32(bytes2, bytes3);

    __m256i r = packlow32(rb0, rb1);
    __m256i g = packlow32(ga0, ga1);
    __m256i b = packhi32(rb0, rb1);
    __m256i a = packhi32(ga0, ga1);
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
  }

  // Byte-swaps every 16-bit lane (big-endian <-> little-endian samples).
  void SwapEndian() {
    auto indices = _mm256_broadcastsi128_si256(
        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
    vec = _mm256_shuffle_epi8(vec, indices);
  }
};
// Per lane, returns if_true where the mask is set, if_false otherwise.
// blendv selects per byte, which is correct because mask lanes are
// uniformly all-ones or all-zeros.
SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
                             const SIMDVec16& if_false) {
  return SIMDVec16{_mm256_blendv_epi8(if_false.vec, if_true.vec, mask)};
}
// Per lane, returns if_true where the mask is set, if_false otherwise.
// blendv selects per byte, which is correct because mask lanes are
// uniformly all-ones or all-zeros.
SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
                             const SIMDVec32& if_false) {
  return SIMDVec32{_mm256_blendv_epi8(if_false.vec, if_true.vec, mask)};
}
// Four lanes of accumulated (bit count, bit pattern) pairs, 64 bits each;
// the final stage before bits are flushed to the output stream.
struct Bits64 {
  static constexpr size_t kLanes = 4;

  __m256i nbits;
  __m256i bits;

  // Stores the four bit counts and bit patterns to memory.
  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
    _mm256_storeu_si256((__m256i*)nbits_out, nbits);
    _mm256_storeu_si256((__m256i*)bits_out, bits);
  }
};
// Eight lanes of accumulated (bit count, bit pattern) pairs, 32 bits each.
struct Bits32 {
  __m256i nbits;
  __m256i bits;

  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
    return Bits32{nbits.vec, bits.vec};
  }

  // Concatenates each pair of adjacent 32-bit lanes into one 64-bit lane.
  // The even (low) lane of each pair holds the earlier-written bits, so it
  // occupies the least-significant end of the combined pattern and the odd
  // lane is shifted up by the even lane's bit count.
  Bits64 Merge() const {
    auto nbits_hi32 = _mm256_srli_epi64(nbits, 32);
    auto nbits_lo32 = _mm256_and_si256(nbits, _mm256_set1_epi64x(0xFFFFFFFF));
    auto bits_hi32 = _mm256_srli_epi64(bits, 32);
    auto bits_lo32 = _mm256_and_si256(bits, _mm256_set1_epi64x(0xFFFFFFFF));

    auto nbits64 = _mm256_add_epi64(nbits_hi32, nbits_lo32);
    auto bits64 =
        _mm256_or_si256(_mm256_sllv_epi64(bits_hi32, nbits_lo32), bits_lo32);
    return Bits64{nbits64, bits64};
  }

  // Prepends `low`'s bits (per lane) below this accumulator's bits.
  void Interleave(const Bits32& low) {
    bits = _mm256_or_si256(_mm256_sllv_epi32(bits, low.nbits), low.bits);
    nbits = _mm256_add_epi32(nbits, low.nbits);
  }

  // Zeroes all lanes except the first n (used at row/segment tails).
  void ClipTo(size_t n) {
    n = std::min<size_t>(n, 8);
    constexpr uint32_t kMask[16] = {
        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0, 0, 0, 0, 0,
    };
    // Sliding window over the constant table produces n ones then zeros.
    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 8 - n));
    nbits = _mm256_and_si256(mask, nbits);
    bits = _mm256_and_si256(mask, bits);
  }
  // Zeroes the first n lanes, keeping the rest.
  void Skip(size_t n) {
    n = std::min<size_t>(n, 8);
    constexpr uint32_t kMask[16] = {
        0, 0, 0, 0, 0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
    };
    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 8 - n));
    nbits = _mm256_and_si256(mask, nbits);
    bits = _mm256_and_si256(mask, bits);
  }
};
// Sixteen lanes of accumulated (bit count, bit pattern) pairs, 16 bits each.
struct Bits16 {
  __m256i nbits;
  __m256i bits;

  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
    return Bits16{nbits.vec, bits.vec};
  }

  // Concatenates each pair of adjacent 16-bit lanes into one 32-bit lane;
  // the even (low) lane lands in the least-significant bits.
  Bits32 Merge() const {
    auto nbits_hi16 = _mm256_srli_epi32(nbits, 16);
    auto nbits_lo16 = _mm256_and_si256(nbits, _mm256_set1_epi32(0xFFFF));
    auto bits_hi16 = _mm256_srli_epi32(bits, 16);
    auto bits_lo16 = _mm256_and_si256(bits, _mm256_set1_epi32(0xFFFF));

    auto nbits32 = _mm256_add_epi32(nbits_hi16, nbits_lo16);
    auto bits32 =
        _mm256_or_si256(_mm256_sllv_epi32(bits_hi16, nbits_lo16), bits_lo16);
    return Bits32{nbits32, bits32};
  }

  // Prepends `low`'s bits (per lane) below this accumulator's bits.
  // AVX2 has no 16-bit variable shift, so `bits << low.nbits` is done as a
  // multiply by a pshufb-looked-up power of two (low.nbits is assumed < 8,
  // the range covered by the LUT).
  void Interleave(const Bits16& low) {
    auto pow2_lo_lut = _mm256_broadcastsi128_si256(
        _mm_setr_epi8(1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6,
                      1u << 7, 0, 0, 0, 0, 0, 0, 0, 0));
    auto low_nbits_masked =
        _mm256_or_si256(low.nbits, _mm256_set1_epi16(0xFF00));

    auto bits_shifted = _mm256_mullo_epi16(
        bits, _mm256_shuffle_epi8(pow2_lo_lut, low_nbits_masked));

    nbits = _mm256_add_epi16(nbits, low.nbits);
    bits = _mm256_or_si256(bits_shifted, low.bits);
  }

  // Zeroes all lanes except the first n (used at row/segment tails).
  void ClipTo(size_t n) {
    n = std::min<size_t>(n, 16);
    constexpr uint16_t kMask[32] = {
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0,      0,      0,      0,      0,      0,      0,      0,
        0,      0,      0,      0,      0,      0,      0,      0,
    };
    // Sliding window over the constant table produces n ones then zeros.
    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 16 - n));
    nbits = _mm256_and_si256(mask, nbits);
    bits = _mm256_and_si256(mask, bits);
  }

  // Zeroes the first n lanes, keeping the rest.
  void Skip(size_t n) {
    n = std::min<size_t>(n, 16);
    constexpr uint16_t kMask[32] = {
        0,      0,      0,      0,      0,      0,      0,      0,
        0,      0,      0,      0,      0,      0,      0,      0,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
    };
    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 16 - n));
    nbits = _mm256_and_si256(mask, nbits);
    bits = _mm256_and_si256(mask, bits);
  }
};
#endif
1965
1966
#ifdef FJXL_NEON
1967
#define FJXL_GENERIC_SIMD
1968
1969
struct SIMDVec32;

// 4-lane comparison mask for the NEON SIMDVec32; each 32-bit lane is
// either all-ones or all-zeros.
struct Mask32 {
  uint32x4_t mask;
  // Per lane, selects if_true where the mask is set. Defined after
  // SIMDVec32 (see below).
  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
  // Lane-wise logical AND of two masks.
  Mask32 And(const Mask32& oth) const {
    return Mask32{vandq_u32(mask, oth.mask)};
  }
  // Number of consecutive set lanes starting at lane 0: set lanes map to
  // the sentinel 4, unset lanes to their own index, so the horizontal
  // minimum is the index of the first unset lane (or 4 if none).
  size_t CountPrefix() const {
    uint32_t val_unset[4] = {0, 1, 2, 3};
    uint32_t val_set[4] = {4, 4, 4, 4};
    uint32x4_t val = vbslq_u32(mask, vld1q_u32(val_set), vld1q_u32(val_unset));
    return vminvq_u32(val);
  }
};
// 4-lane vector of 32-bit unsigned integers backed by one NEON register.
struct SIMDVec32 {
  uint32x4_t vec;

  // Number of 32-bit lanes per vector.
  static constexpr size_t kLanes = 4;

  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
    return SIMDVec32{vld1q_u32(data)};
  }
  FJXL_INLINE void Store(uint32_t* data) { vst1q_u32(data, vec); }
  // Broadcasts v into every lane.
  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
    return SIMDVec32{vdupq_n_u32(v)};
  }
  // Per lane, computes the hybrid-uint token (bit width of the value),
  // i.e. 32 - clz(v); v == 0 yields token 0.
  FJXL_INLINE SIMDVec32 ValToToken() const {
    return SIMDVec32{vsubq_u32(vdupq_n_u32(32), vclzq_u32(vec))};
  }
  // Unsigned saturating subtraction: max(vec - to_subtract, 0).
  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
    return SIMDVec32{vqsubq_u32(vec, to_subtract.vec)};
  }
  // Lane-wise wrapping subtraction.
  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
    return SIMDVec32{vsubq_u32(vec, to_subtract.vec)};
  }
  // Lane-wise wrapping addition.
  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
    return SIMDVec32{vaddq_u32(vec, oth.vec)};
  }
  // Lane-wise bitwise XOR.
  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
    return SIMDVec32{veorq_u32(vec, oth.vec)};
  }
  // Per lane, computes 1 << vec (vec assumed < 32).
  FJXL_INLINE SIMDVec32 Pow2() const {
    return SIMDVec32{vshlq_u32(vdupq_n_u32(1), vreinterpretq_s32_u32(vec))};
  }
  // Lane-wise equality comparison.
  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
    return Mask32{vceqq_u32(vec, oth.vec)};
  }
  // Lane-wise *signed* greater-than, matching the AVX2 implementation's
  // _mm256_cmpgt_epi32 semantics.
  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
    return Mask32{
        vcgtq_s32(vreinterpretq_s32_u32(vec), vreinterpretq_s32_u32(oth.vec))};
  }
  // Arithmetic (sign-propagating) right shift by the compile-time count i.
  template <size_t i>
  FJXL_INLINE SIMDVec32 SignedShiftRight() const {
    return SIMDVec32{
        vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(vec), i))};
  }
};
struct SIMDVec16;

// 8-lane comparison mask for the NEON SIMDVec16; each 16-bit lane is
// either all-ones or all-zeros.
struct Mask16 {
  uint16x8_t mask;
  // Per lane, selects if_true where the mask is set. Defined after
  // SIMDVec16 (see below).
  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
  // Lane-wise logical AND of two masks.
  Mask16 And(const Mask16& oth) const {
    return Mask16{vandq_u16(mask, oth.mask)};
  }
  // Number of consecutive set lanes starting at lane 0: set lanes map to
  // the sentinel 8, unset lanes to their own index, so the horizontal
  // minimum is the index of the first unset lane (or 8 if none).
  size_t CountPrefix() const {
    uint16_t val_unset[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    uint16_t val_set[8] = {8, 8, 8, 8, 8, 8, 8, 8};
    uint16x8_t val = vbslq_u16(mask, vld1q_u16(val_set), vld1q_u16(val_unset));
    return vminvq_u16(val);
  }
};
// 8-lane vector of 16-bit unsigned integers backed by one NEON register.
// Also provides the pixel-loading helpers that deinterleave packed
// G/GA/RGB/RGBA pixel data via NEON's structured loads (vld2/vld3/vld4).
struct SIMDVec16 {
  uint16x8_t vec;

  // Number of 16-bit lanes per vector.
  static constexpr size_t kLanes = 8;

  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
    return SIMDVec16{vld1q_u16(data)};
  }
  FJXL_INLINE void Store(uint16_t* data) { vst1q_u16(data, vec); }
  // Broadcasts v into every lane.
  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
    return SIMDVec16{vdupq_n_u16(v)};
  }
  // Narrows two 32-bit vectors into one 16-bit vector (lo first, then hi).
  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
                                         const SIMDVec32& hi) {
    return SIMDVec16{vmovn_high_u32(vmovn_u32(lo.vec), hi.vec)};
  }

  // Per lane, computes the hybrid-uint token (bit width of the value),
  // i.e. 16 - clz(v); v == 0 yields token 0.
  FJXL_INLINE SIMDVec16 ValToToken() const {
    return SIMDVec16{vsubq_u16(vdupq_n_u16(16), vclzq_u16(vec))};
  }
  // Unsigned saturating subtraction: max(vec - to_subtract, 0).
  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
    return SIMDVec16{vqsubq_u16(vec, to_subtract.vec)};
  }
  // Lane-wise wrapping subtraction.
  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
    return SIMDVec16{vsubq_u16(vec, to_subtract.vec)};
  }
  // Lane-wise wrapping addition.
  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
    return SIMDVec16{vaddq_u16(vec, oth.vec)};
  }
  // Lane-wise unsigned minimum.
  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
    return SIMDVec16{vminq_u16(vec, oth.vec)};
  }
  // Lane-wise equality comparison.
  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
    return Mask16{vceqq_u16(vec, oth.vec)};
  }
  // Lane-wise *signed* greater-than, matching the AVX2 implementation's
  // _mm256_cmpgt_epi16 semantics.
  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
    return Mask16{
        vcgtq_s16(vreinterpretq_s16_u16(vec), vreinterpretq_s16_u16(oth.vec))};
  }
  // Per lane, computes 1 << vec (vec assumed < 16).
  FJXL_INLINE SIMDVec16 Pow2() const {
    return SIMDVec16{vshlq_u16(vdupq_n_u16(1), vreinterpretq_s16_u16(vec))};
  }
  // Lane-wise bitwise OR.
  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
    return SIMDVec16{vorrq_u16(vec, oth.vec)};
  }
  // Lane-wise bitwise XOR.
  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
    return SIMDVec16{veorq_u16(vec, oth.vec)};
  }
  // Lane-wise bitwise AND.
  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
    return SIMDVec16{vandq_u16(vec, oth.vec)};
  }
  // Lane-wise halving add: (a + b) >> 1 without intermediate overflow.
  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
    return SIMDVec16{vhaddq_u16(vec, oth.vec)};
  }
  // Sets the high byte of every lane so a subsequent U8Lookup (vqtbl1q)
  // zeroes that byte (out-of-range table indices return 0).
  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
    return SIMDVec16{vorrq_u16(vec, vdupq_n_u16(0xFF00))};
  }
  // Looks up each byte of the vector in a 16-entry byte table. Lanes must
  // have been prepared with PrepareForU8Lookup first.
  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
    uint8x16_t tbl = vld1q_u8(table);
    uint8x16_t indices = vreinterpretq_u8_u16(vec);
    return SIMDVec16{vreinterpretq_u16_u8(vqtbl1q_u8(tbl, indices))};
  }
  // Interleaves `low` and *this lane by lane, low lanes first.
  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
    return {SIMDVec16{vzip1q_u16(low.vec, vec)},
            SIMDVec16{vzip2q_u16(low.vec, vec)}};
  }
  // Zero-extends the 8 lanes into two 32-bit vectors, preserving order.
  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
    uint32x4_t lo = vmovl_u16(vget_low_u16(vec));
    uint32x4_t hi = vmovl_high_u16(vec);
    return {SIMDVec32{lo}, SIMDVec32{hi}};
  }
  // Arithmetic right shift by the compile-time count i.
  template <size_t i>
  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
    return SIMDVec16{
        vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(vec), i))};
  }

  // Loads 8 8-bit grayscale samples, zero-extended to 16 bits.
  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
    uint8x8_t v = vld1_u8(data);
    return {SIMDVec16{vmovl_u8(v)}};
  }
  // Loads 8 16-bit grayscale samples.
  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
    return {Load((const uint16_t*)data)};
  }

  // Loads 8 interleaved 8-bit gray+alpha pixels into planar {gray, alpha}.
  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
    uint8x8x2_t v = vld2_u8(data);
    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])}};
  }
  // Loads 8 interleaved 16-bit gray+alpha pixels into planar {gray, alpha}.
  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
    uint16x8x2_t v = vld2q_u16((const uint16_t*)data);
    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}};
  }

  // Loads 8 interleaved 8-bit RGB pixels into planar {R, G, B}.
  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
    uint8x8x3_t v = vld3_u8(data);
    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])},
            SIMDVec16{vmovl_u8(v.val[2])}};
  }
  // Loads 8 interleaved 16-bit RGB pixels into planar {R, G, B}.
  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
    uint16x8x3_t v = vld3q_u16((const uint16_t*)data);
    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}, SIMDVec16{v.val[2]}};
  }

  // Loads 8 interleaved 8-bit RGBA pixels into planar {R, G, B, A}.
  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
    uint8x8x4_t v = vld4_u8(data);
    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])},
            SIMDVec16{vmovl_u8(v.val[2])}, SIMDVec16{vmovl_u8(v.val[3])}};
  }
  // Loads 8 interleaved 16-bit RGBA pixels into planar {R, G, B, A}.
  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
    uint16x8x4_t v = vld4q_u16((const uint16_t*)data);
    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}, SIMDVec16{v.val[2]},
            SIMDVec16{v.val[3]}};
  }

  // Byte-swaps every 16-bit lane (big-endian <-> little-endian samples).
  void SwapEndian() {
    vec = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(vec)));
  }
};
// Per lane, returns if_true where the mask is set, if_false otherwise.
SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
                             const SIMDVec16& if_false) {
  return SIMDVec16{vbslq_u16(mask, if_true.vec, if_false.vec)};
}
// Per lane, returns if_true where the mask is set, if_false otherwise.
SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
                             const SIMDVec32& if_false) {
  return SIMDVec32{vbslq_u32(mask, if_true.vec, if_false.vec)};
}
// Two lanes of accumulated (bit count, bit pattern) pairs, 64 bits each;
// the final stage before bits are flushed to the output stream.
struct Bits64 {
  static constexpr size_t kLanes = 2;

  uint64x2_t nbits;
  uint64x2_t bits;

  // Stores the two bit counts and bit patterns to memory.
  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
    vst1q_u64(nbits_out, nbits);
    vst1q_u64(bits_out, bits);
  }
};
// Four lanes of accumulated (bit count, bit pattern) pairs, 32 bits each.
struct Bits32 {
  uint32x4_t nbits;
  uint32x4_t bits;

  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
    return Bits32{nbits.vec, bits.vec};
  }

  // Concatenates each pair of adjacent 32-bit lanes into one 64-bit lane;
  // the even (low) lane's bits land in the least-significant bits, and the
  // odd lane's bits are shifted up by the even lane's bit count.
  Bits64 Merge() const {
    // TODO(veluca): can probably be optimized.
    uint64x2_t nbits_lo32 =
        vandq_u64(vreinterpretq_u64_u32(nbits), vdupq_n_u64(0xFFFFFFFF));
    uint64x2_t bits_hi32 =
        vshlq_u64(vshrq_n_u64(vreinterpretq_u64_u32(bits), 32),
                  vreinterpretq_s64_u64(nbits_lo32));
    uint64x2_t bits_lo32 =
        vandq_u64(vreinterpretq_u64_u32(bits), vdupq_n_u64(0xFFFFFFFF));
    // vsraq adds the shifted-down high nbits to the low nbits in one op.
    uint64x2_t nbits64 =
        vsraq_n_u64(nbits_lo32, vreinterpretq_u64_u32(nbits), 32);
    uint64x2_t bits64 = vorrq_u64(bits_hi32, bits_lo32);
    return Bits64{nbits64, bits64};
  }

  // Prepends `low`'s bits (per lane) below this accumulator's bits.
  void Interleave(const Bits32& low) {
    bits =
        vorrq_u32(vshlq_u32(bits, vreinterpretq_s32_u32(low.nbits)), low.bits);
    nbits = vaddq_u32(nbits, low.nbits);
  }

  // Zeroes all lanes except the first n (used at row/segment tails).
  void ClipTo(size_t n) {
    n = std::min<size_t>(n, 4);
    constexpr uint32_t kMask[8] = {
        ~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0,
    };
    // Sliding window over the constant table produces n ones then zeros.
    uint32x4_t mask = vld1q_u32(kMask + 4 - n);
    nbits = vandq_u32(mask, nbits);
    bits = vandq_u32(mask, bits);
  }
  // Zeroes the first n lanes, keeping the rest.
  void Skip(size_t n) {
    n = std::min<size_t>(n, 4);
    constexpr uint32_t kMask[8] = {
        0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u,
    };
    uint32x4_t mask = vld1q_u32(kMask + 4 - n);
    nbits = vandq_u32(mask, nbits);
    bits = vandq_u32(mask, bits);
  }
};
// Eight lanes of accumulated (bit count, bit pattern) pairs, 16 bits each.
struct Bits16 {
  uint16x8_t nbits;
  uint16x8_t bits;

  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
    return Bits16{nbits.vec, bits.vec};
  }

  // Concatenates each pair of adjacent 16-bit lanes into one 32-bit lane;
  // the even (low) lane's bits land in the least-significant bits.
  Bits32 Merge() const {
    // TODO(veluca): can probably be optimized.
    uint32x4_t nbits_lo16 =
        vandq_u32(vreinterpretq_u32_u16(nbits), vdupq_n_u32(0xFFFF));
    uint32x4_t bits_hi16 =
        vshlq_u32(vshrq_n_u32(vreinterpretq_u32_u16(bits), 16),
                  vreinterpretq_s32_u32(nbits_lo16));
    uint32x4_t bits_lo16 =
        vandq_u32(vreinterpretq_u32_u16(bits), vdupq_n_u32(0xFFFF));
    // vsraq adds the shifted-down high nbits to the low nbits in one op.
    uint32x4_t nbits32 =
        vsraq_n_u32(nbits_lo16, vreinterpretq_u32_u16(nbits), 16);
    uint32x4_t bits32 = vorrq_u32(bits_hi16, bits_lo16);
    return Bits32{nbits32, bits32};
  }

  // Prepends `low`'s bits (per lane) below this accumulator's bits.
  void Interleave(const Bits16& low) {
    bits =
        vorrq_u16(vshlq_u16(bits, vreinterpretq_s16_u16(low.nbits)), low.bits);
    nbits = vaddq_u16(nbits, low.nbits);
  }

  // Zeroes all lanes except the first n (used at row/segment tails).
  void ClipTo(size_t n) {
    n = std::min<size_t>(n, 8);
    constexpr uint16_t kMask[16] = {
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0,      0,      0,      0,      0,      0,      0,      0,
    };
    // Sliding window over the constant table produces n ones then zeros.
    uint16x8_t mask = vld1q_u16(kMask + 8 - n);
    nbits = vandq_u16(mask, nbits);
    bits = vandq_u16(mask, bits);
  }
  // Zeroes the first n lanes, keeping the rest.
  void Skip(size_t n) {
    n = std::min<size_t>(n, 8);
    constexpr uint16_t kMask[16] = {
        0,      0,      0,      0,      0,      0,      0,      0,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
    };
    uint16x8_t mask = vld1q_u16(kMask + 8 - n);
    nbits = vandq_u16(mask, nbits);
    bits = vandq_u16(mask, bits);
  }
};
#endif
2288
2289
#ifdef FJXL_GENERIC_SIMD
// Out-of-line definitions: before C++17, ODR-used static constexpr members
// need a namespace-scope definition.
constexpr size_t SIMDVec32::kLanes;
constexpr size_t SIMDVec16::kLanes;

//  Each of these functions will process SIMDVec16::kLanes worth of values.
2295
// Hybrid-uint decomposition of one vector of 16-bit residuals:
// token from ValToToken(), nbits = token - 1 (saturating, so 0 stays 0), and
// bits = residual - 2^nbits (saturating subtract of the leading power of two).
FJXL_INLINE void TokenizeSIMD(const uint16_t* residuals, uint16_t* token_out,
                              uint16_t* nbits_out, uint16_t* bits_out) {
  SIMDVec16 res = SIMDVec16::Load(residuals);
  SIMDVec16 token = res.ValToToken();
  SIMDVec16 nbits = token.SatSubU(SIMDVec16::Val(1));
  SIMDVec16 bits = res.SatSubU(nbits.Pow2());
  token.Store(token_out);
  nbits.Store(nbits_out);
  bits.Store(bits_out);
}
2305
2306
// 32-bit variant: processes two 32-bit vectors (one full 16-bit vector's
// worth of lanes). nbits/bits stay in 32-bit lanes since they may not fit in
// 16 bits; tokens always fit and are narrowed into a single 16-bit vector.
FJXL_INLINE void TokenizeSIMD(const uint32_t* residuals, uint16_t* token_out,
                              uint32_t* nbits_out, uint32_t* bits_out) {
  static_assert(SIMDVec16::kLanes == 2 * SIMDVec32::kLanes,
                "There should be twice more 16-bit lanes than 32-bit lanes");
  SIMDVec32 res_lo = SIMDVec32::Load(residuals);
  SIMDVec32 res_hi = SIMDVec32::Load(residuals + SIMDVec32::kLanes);
  SIMDVec32 token_lo = res_lo.ValToToken();
  SIMDVec32 token_hi = res_hi.ValToToken();
  SIMDVec32 nbits_lo = token_lo.SatSubU(SIMDVec32::Val(1));
  SIMDVec32 nbits_hi = token_hi.SatSubU(SIMDVec32::Val(1));
  SIMDVec32 bits_lo = res_lo.SatSubU(nbits_lo.Pow2());
  SIMDVec32 bits_hi = res_hi.SatSubU(nbits_hi.Pow2());
  SIMDVec16 token = SIMDVec16::FromTwo32(token_lo, token_hi);
  token.Store(token_out);
  nbits_lo.Store(nbits_out);
  nbits_hi.Store(nbits_out + SIMDVec32::kLanes);
  bits_lo.Store(bits_out);
  bits_hi.Store(bits_out + SIMDVec32::kLanes);
}
2325
2326
// Looks up the prefix-code (nbits, bits) pair for each token via a byte
// shuffle into the 16-entry tables prepared by PrepareForSimd. Valid for bit
// depths up to 13, where there are at most 16 distinct raw tokens.
FJXL_INLINE void HuffmanSIMDUpTo13(const uint16_t* tokens,
                                   const uint8_t* raw_nbits_simd,
                                   const uint8_t* raw_bits_simd,
                                   uint16_t* nbits_out, uint16_t* bits_out) {
  SIMDVec16 tok = SIMDVec16::Load(tokens).PrepareForU8Lookup();
  tok.U8Lookup(raw_nbits_simd).Store(nbits_out);
  tok.U8Lookup(raw_bits_simd).Store(bits_out);
}
2334
2335
// 14-bit variant: 17 possible tokens, one too many for a 16-entry table.
// Token 16 reuses the table entry of token 15 (tokens are capped at 15), and
// its code is reconstructed by setting the highest bit.
FJXL_INLINE void HuffmanSIMD14(const uint16_t* tokens,
                               const uint8_t* raw_nbits_simd,
                               const uint8_t* raw_bits_simd,
                               uint16_t* nbits_out, uint16_t* bits_out) {
  SIMDVec16 token_cap = SIMDVec16::Val(15);
  SIMDVec16 tok = SIMDVec16::Load(tokens);
  SIMDVec16 tok_index = tok.Min(token_cap).PrepareForU8Lookup();
  SIMDVec16 huff_bits_pre = tok_index.U8Lookup(raw_bits_simd);
  // Set the highest bit when token == 16; the Huffman code is constructed in
  // such a way that the code for token 15 is the same as the code for 16,
  // except for the highest bit.
  Mask16 needs_high_bit = tok.Eq(SIMDVec16::Val(16));
  SIMDVec16 huff_bits = needs_high_bit.IfThenElse(
      huff_bits_pre.Or(SIMDVec16::Val(128)), huff_bits_pre);
  huff_bits.Store(bits_out);
  tok_index.U8Lookup(raw_nbits_simd).Store(nbits_out);
}
2352
2353
// >14-bit variant: up to 19 tokens. Token pairs (13,14), (15,16) and (17,18)
// share table entries (codes identical except the highest bit), so the token
// range is remapped into 16 table slots before lookup.
FJXL_INLINE void HuffmanSIMDAbove14(const uint16_t* tokens,
                                    const uint8_t* raw_nbits_simd,
                                    const uint8_t* raw_bits_simd,
                                    uint16_t* nbits_out, uint16_t* bits_out) {
  SIMDVec16 tok = SIMDVec16::Load(tokens);
  // We assume `tok` fits in a *signed* 16-bit integer.
  Mask16 above = tok.Gt(SIMDVec16::Val(12));
  // 13, 14 -> 13
  // 15, 16 -> 14
  // 17, 18 -> 15
  SIMDVec16 remap_tok = above.IfThenElse(tok.HAdd(SIMDVec16::Val(13)), tok);
  SIMDVec16 tok_index = remap_tok.PrepareForU8Lookup();
  SIMDVec16 huff_bits_pre = tok_index.U8Lookup(raw_bits_simd);
  // Set the highest bit when token == 14, 16, 18.
  Mask16 needs_high_bit = above.And(tok.Eq(tok.And(SIMDVec16::Val(0xFFFE))));
  SIMDVec16 huff_bits = needs_high_bit.IfThenElse(
      huff_bits_pre.Or(SIMDVec16::Val(128)), huff_bits_pre);
  huff_bits.Store(bits_out);
  tok_index.U8Lookup(raw_nbits_simd).Store(nbits_out);
}
2373
2374
// Combines Huffman codes with the raw extra bits and merges the result into
// one Bits32. Only valid when Huffman + raw bits fit in a u16 per sample
// (bit depths up to 8). Lanes at index >= n and lanes before `skip` are
// zeroed out.
FJXL_INLINE void StoreSIMDUpTo8(const uint16_t* nbits_tok,
                                const uint16_t* bits_tok,
                                const uint16_t* nbits_huff,
                                const uint16_t* bits_huff, size_t n,
                                size_t skip, Bits32* bits_out) {
  Bits16 bits =
      Bits16::FromRaw(SIMDVec16::Load(nbits_tok), SIMDVec16::Load(bits_tok));
  Bits16 huff_bits =
      Bits16::FromRaw(SIMDVec16::Load(nbits_huff), SIMDVec16::Load(bits_huff));
  // Huffman code goes below (is emitted before) the raw bits.
  bits.Interleave(huff_bits);
  bits.ClipTo(n);
  bits.Skip(skip);
  bits_out[0] = bits.Merge();
}
2388
2389
// Huffman and raw bits don't necessarily fit in a single u16 here.
// Instead, each sample expands to a pair of adjacent 16-bit lanes (Huffman
// lane + raw-bits lane, produced by Interleave), which is why clip/skip
// counts are doubled and the result occupies two Bits32 outputs.
FJXL_INLINE void StoreSIMDUpTo14(const uint16_t* nbits_tok,
                                 const uint16_t* bits_tok,
                                 const uint16_t* nbits_huff,
                                 const uint16_t* bits_huff, size_t n,
                                 size_t skip, Bits32* bits_out) {
  VecPair<SIMDVec16> bits =
      SIMDVec16::Load(bits_tok).Interleave(SIMDVec16::Load(bits_huff));
  VecPair<SIMDVec16> nbits =
      SIMDVec16::Load(nbits_tok).Interleave(SIMDVec16::Load(nbits_huff));
  Bits16 low = Bits16::FromRaw(nbits.low, bits.low);
  Bits16 hi = Bits16::FromRaw(nbits.hi, bits.hi);
  // Two lanes per sample: scale the keep/skip counts accordingly; the high
  // half covers samples starting at kLanes/2.
  low.ClipTo(2 * n);
  low.Skip(2 * skip);
  hi.ClipTo(std::max(2 * n, SIMDVec16::kLanes) - SIMDVec16::kLanes);
  hi.Skip(std::max(2 * skip, SIMDVec16::kLanes) - SIMDVec16::kLanes);

  bits_out[0] = low.Merge();
  bits_out[1] = hi.Merge();
}
2409
2410
// >14-bit variant: raw extra bits arrive in 32-bit lanes; the 16-bit Huffman
// codes are upcast to 32 bits and interleaved below them. Produces two Bits32
// outputs (low and high halves of the 16-bit-lane vector).
FJXL_INLINE void StoreSIMDAbove14(const uint32_t* nbits_tok,
                                  const uint32_t* bits_tok,
                                  const uint16_t* nbits_huff,
                                  const uint16_t* bits_huff, size_t n,
                                  size_t skip, Bits32* bits_out) {
  static_assert(SIMDVec16::kLanes == 2 * SIMDVec32::kLanes,
                "There should be twice more 16-bit lanes than 32-bit lanes");
  Bits32 bits_low =
      Bits32::FromRaw(SIMDVec32::Load(nbits_tok), SIMDVec32::Load(bits_tok));
  Bits32 bits_hi =
      Bits32::FromRaw(SIMDVec32::Load(nbits_tok + SIMDVec32::kLanes),
                      SIMDVec32::Load(bits_tok + SIMDVec32::kLanes));

  VecPair<SIMDVec32> huff_bits = SIMDVec16::Load(bits_huff).Upcast();
  VecPair<SIMDVec32> huff_nbits = SIMDVec16::Load(nbits_huff).Upcast();

  Bits32 huff_low = Bits32::FromRaw(huff_nbits.low, huff_bits.low);
  Bits32 huff_hi = Bits32::FromRaw(huff_nbits.hi, huff_bits.hi);

  // Huffman code is emitted before the raw bits; clip/skip the high half
  // relative to its starting sample index (SIMDVec32::kLanes).
  bits_low.Interleave(huff_low);
  bits_low.ClipTo(n);
  bits_low.Skip(skip);
  bits_out[0] = bits_low;
  bits_hi.Interleave(huff_hi);
  bits_hi.ClipTo(std::max(n, SIMDVec32::kLanes) - SIMDVec32::kLanes);
  bits_hi.Skip(std::max(skip, SIMDVec32::kLanes) - SIMDVec32::kLanes);
  bits_out[1] = bits_hi;
}
2438
2439
#ifdef FJXL_AVX512
2440
FJXL_INLINE void StoreToWriterAVX512(const Bits32& bits32, BitWriter& output) {
2441
  __m512i bits = bits32.bits;
2442
  __m512i nbits = bits32.nbits;
2443
2444
  // Insert the leftover bits from the bit buffer at the bottom of the vector
2445
  // and extract the top of the vector.
2446
  uint64_t trail_bits =
2447
      _mm512_cvtsi512_si32(_mm512_alignr_epi32(bits, bits, 15));
2448
  uint64_t trail_nbits =
2449
      _mm512_cvtsi512_si32(_mm512_alignr_epi32(nbits, nbits, 15));
2450
  __m512i lead_bits = _mm512_set1_epi32(output.buffer);
2451
  __m512i lead_nbits = _mm512_set1_epi32(output.bits_in_buffer);
2452
  bits = _mm512_alignr_epi32(bits, lead_bits, 15);
2453
  nbits = _mm512_alignr_epi32(nbits, lead_nbits, 15);
2454
2455
  // Merge 32 -> 64 bits.
2456
  Bits32 b{nbits, bits};
2457
  Bits64 b64 = b.Merge();
2458
  bits = b64.bits;
2459
  nbits = b64.nbits;
2460
2461
  __m512i zero = _mm512_setzero_si512();
2462
2463
  auto sh1 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 7); };
2464
  auto sh2 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 6); };
2465
  auto sh4 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 4); };
2466
2467
  // Compute first-past-end-bit-position.
2468
  __m512i end_intermediate0 = _mm512_add_epi64(nbits, sh1(nbits));
2469
  __m512i end_intermediate1 =
2470
      _mm512_add_epi64(end_intermediate0, sh2(end_intermediate0));
2471
  __m512i end = _mm512_add_epi64(end_intermediate1, sh4(end_intermediate1));
2472
2473
  uint64_t simd_nbits = _mm512_cvtsi512_si32(_mm512_alignr_epi64(end, end, 7));
2474
2475
  // Compute begin-bit-position.
2476
  __m512i begin = _mm512_sub_epi64(end, nbits);
2477
2478
  // Index of the last bit in the chunk, or the end bit if nbits==0.
2479
  __m512i last = _mm512_mask_sub_epi64(
2480
      end, _mm512_cmpneq_epi64_mask(nbits, zero), end, _mm512_set1_epi64(1));
2481
2482
  __m512i lane_offset_mask = _mm512_set1_epi64(63);
2483
2484
  // Starting position of the chunk that each lane will ultimately belong to.
2485
  __m512i chunk_start = _mm512_andnot_si512(lane_offset_mask, last);
2486
2487
  // For all lanes that contain bits belonging to two different 64-bit chunks,
2488
  // compute the number of bits that belong to the first chunk.
2489
  // total # of bits fit in a u16, so we can satsub_u16 here.
2490
  __m512i first_chunk_nbits = _mm512_subs_epu16(chunk_start, begin);
2491
2492
  // Move all the previous-chunk-bits to the previous lane.
2493
  __m512i negnbits = _mm512_sub_epi64(_mm512_set1_epi64(64), first_chunk_nbits);
2494
  __m512i first_chunk_bits =
2495
      _mm512_srlv_epi64(_mm512_sllv_epi64(bits, negnbits), negnbits);
2496
  __m512i first_chunk_bits_down =
2497
      _mm512_alignr_epi32(zero, first_chunk_bits, 2);
2498
  bits = _mm512_srlv_epi64(bits, first_chunk_nbits);
2499
  nbits = _mm512_sub_epi64(nbits, first_chunk_nbits);
2500
  bits = _mm512_or_si512(bits, _mm512_sllv_epi64(first_chunk_bits_down, nbits));
2501
  begin = _mm512_add_epi64(begin, first_chunk_nbits);
2502
2503
  // We now know that every lane should give bits to only one chunk. We can
2504
  // shift the bits and then horizontally-or-reduce them within the same chunk.
2505
  __m512i offset = _mm512_and_si512(begin, lane_offset_mask);
2506
  __m512i aligned_bits = _mm512_sllv_epi64(bits, offset);
2507
  // h-or-reduce within same chunk
2508
  __m512i red0 = _mm512_mask_or_epi64(
2509
      aligned_bits, _mm512_cmpeq_epi64_mask(sh1(chunk_start), chunk_start),
2510
      sh1(aligned_bits), aligned_bits);
2511
  __m512i red1 = _mm512_mask_or_epi64(
2512
      red0, _mm512_cmpeq_epi64_mask(sh2(chunk_start), chunk_start), sh2(red0),
2513
      red0);
2514
  __m512i reduced = _mm512_mask_or_epi64(
2515
      red1, _mm512_cmpeq_epi64_mask(sh4(chunk_start), chunk_start), sh4(red1),
2516
      red1);
2517
  // Extract the highest lane that belongs to each chunk (the lane that ends up
2518
  // with the OR-ed value of all the other lanes of that chunk).
2519
  __m512i next_chunk_start =
2520
      _mm512_alignr_epi32(_mm512_set1_epi64(~0), chunk_start, 2);
2521
  __m512i result = _mm512_maskz_compress_epi64(
2522
      _mm512_cmpneq_epi64_mask(chunk_start, next_chunk_start), reduced);
2523
2524
  _mm512_storeu_si512((__m512i*)(output.data.get() + output.bytes_written),
2525
                      result);
2526
2527
  // Update the bit writer and add the last 32-bit lane.
2528
  // Note that since trail_nbits was at most 32 to begin with, operating on
2529
  // trail_bits does not risk overflowing.
2530
  output.bytes_written += simd_nbits / 8;
2531
  // Here we are implicitly relying on the fact that simd_nbits < 512 to know
2532
  // that the byte of bitreader data we access is initialized. This is
2533
  // guaranteed because the remaining bits in the bitreader buffer are at most
2534
  // 7, so simd_nbits <= 505 always.
2535
  trail_bits = (trail_bits << (simd_nbits % 8)) +
2536
               output.data.get()[output.bytes_written];
2537
  trail_nbits += simd_nbits % 8;
2538
  StoreLE64(output.data.get() + output.bytes_written, trail_bits);
2539
  size_t trail_bytes = trail_nbits / 8;
2540
  output.bits_in_buffer = trail_nbits % 8;
2541
  output.buffer = trail_bits >> (trail_bytes * 8);
2542
  output.bytes_written += trail_bytes;
2543
}
2544
2545
#endif
2546
2547
// Flushes `n` Bits32 vectors into the bit writer. On AVX-512 this dispatches
// to the specialized cross-lane routine; otherwise each vector is merged into
// 64-bit (nbits, bits) pairs and handed to BitWriter::WriteMultiple.
template <size_t n>
FJXL_INLINE void StoreToWriter(const Bits32* bits, BitWriter& output) {
#ifdef FJXL_AVX512
  static_assert(n <= 2, "n should be less or 2 for AVX512");
  StoreToWriterAVX512(bits[0], output);
  if (n == 2) {
    StoreToWriterAVX512(bits[1], output);
  }
  return;
#endif
  static_assert(n <= 4, "n should be less or 4");
  alignas(64) uint64_t nbits64[Bits64::kLanes * n];
  alignas(64) uint64_t bits64[Bits64::kLanes * n];
  bits[0].Merge().Store(nbits64, bits64);
  // n is a compile-time constant, so the dead branches below fold away.
  if (n > 1) {
    bits[1].Merge().Store(nbits64 + Bits64::kLanes, bits64 + Bits64::kLanes);
  }
  if (n > 2) {
    bits[2].Merge().Store(nbits64 + 2 * Bits64::kLanes,
                          bits64 + 2 * Bits64::kLanes);
  }
  if (n > 3) {
    bits[3].Merge().Store(nbits64 + 3 * Bits64::kLanes,
                          bits64 + 3 * Bits64::kLanes);
  }
  output.WriteMultiple(nbits64, bits64, Bits64::kLanes * n);
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::StoreToWriter<1ul>(AVX2::(anonymous namespace)::Bits32 const*, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::StoreToWriter<2ul>(AVX2::(anonymous namespace)::Bits32 const*, (anonymous namespace)::BitWriter&)
2574
2575
namespace detail {
// Maps a SIMD vector type to the signed/unsigned integer types of its lanes.
template <typename T>
struct IntegerTypes;

template <>
struct IntegerTypes<SIMDVec16> {
  using signed_ = int16_t;
  using unsigned_ = uint16_t;
};

template <>
struct IntegerTypes<SIMDVec32> {
  using signed_ = int32_t;
  using unsigned_ = uint32_t;
};

// Inverse mapping: signed lane type -> SIMD vector type.
template <typename T>
struct SIMDType;

template <>
struct SIMDType<int16_t> {
  using type = SIMDVec16;
};

template <>
struct SIMDType<int32_t> {
  using type = SIMDVec32;
};

}  // namespace detail
2605
2606
// Convenience aliases over the detail:: trait mappings above.
template <typename T>
using signed_t = typename detail::IntegerTypes<T>::signed_;

template <typename T>
using unsigned_t = typename detail::IntegerTypes<T>::unsigned_;

template <typename T>
using simd_t = typename detail::SIMDType<T>::type;
2614
2615
// This function will process exactly one vector worth of pixels.
// Applies a gradient predictor with clamping to one vector of pixels, stores
// the zigzag-packed residuals (2*r for r >= 0, -2*r - 1 for r < 0) and
// returns the number of leading zero residuals (run detection).
template <typename T>
size_t PredictPixels(const signed_t<T>* pixels, const signed_t<T>* pixels_left,
                     const signed_t<T>* pixels_top,
                     const signed_t<T>* pixels_topleft,
                     unsigned_t<T>* residuals) {
  T px = T::Load((unsigned_t<T>*)pixels);
  T left = T::Load((unsigned_t<T>*)pixels_left);
  T top = T::Load((unsigned_t<T>*)pixels_top);
  T topleft = T::Load((unsigned_t<T>*)pixels_topleft);
  T ac = left.Sub(topleft);
  T ab = left.Sub(top);
  T bc = top.Sub(topleft);
  // Gradient prediction: left + top - topleft.
  T grad = ac.Add(top);
  T d = ab.Xor(bc);
  T zero = T::Val(0);
  // Clamp target: top when (left-top) and (top-topleft) have opposite signs,
  // left otherwise (sign bit of the XOR selects).
  T clamp = zero.Gt(d).IfThenElse(top, left);
  T s = ac.Xor(bc);
  // Use the gradient when (left-topleft) and (top-topleft) have opposite
  // signs (i.e. topleft lies between left and top, so the gradient does not
  // overshoot); otherwise use the clamped neighbor.
  T pred = zero.Gt(s).IfThenElse(grad, clamp);
  T res = px.Sub(pred);
  T res_times_2 = res.Add(res);
  // Zigzag-pack: negative residuals map to odd values, others to even.
  res = zero.Gt(res).IfThenElse(T::Val(-1).Sub(res_times_2), res_times_2);
  res.Store(residuals);
  return res.Eq(T::Val(0)).CountPrefix();
}
Unexecuted instantiation: enc_fast_lossless.cc:unsigned long AVX2::(anonymous namespace)::PredictPixels<AVX2::(anonymous namespace)::SIMDVec16>(AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::unsigned_*)
Unexecuted instantiation: enc_fast_lossless.cc:unsigned long AVX2::(anonymous namespace)::PredictPixels<AVX2::(anonymous namespace)::SIMDVec32>(AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::unsigned_*)
2641
2642
#endif
2643
2644
void EncodeHybridUint000(uint32_t value, uint32_t* token, uint32_t* nbits,
2645
0
                         uint32_t* bits) {
2646
0
  uint32_t n = FloorLog2(value);
2647
0
  *token = value ? n + 1 : 0;
2648
0
  *nbits = value ? n : 0;
2649
0
  *bits = value ? value - (1 << n) : 0;
2650
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::EncodeHybridUint000(unsigned int, unsigned int*, unsigned int*, unsigned int*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::EncodeHybridUint000(unsigned int, unsigned int*, unsigned int*, unsigned int*)
2651
2652
// Number of residuals encoded per chunk (kChunkSize = 2^kLogChunkSize),
// chosen per available SIMD width.
#ifdef FJXL_AVX512
constexpr static size_t kLogChunkSize = 5;
#elif defined(FJXL_AVX2) || defined(FJXL_NEON)
// Even if NEON only has 128-bit lanes, it is still significantly (~1.3x) faster
// to process two vectors at a time.
constexpr static size_t kLogChunkSize = 4;
#else
constexpr static size_t kLogChunkSize = 3;
#endif

constexpr static size_t kChunkSize = 1 << kLogChunkSize;
2663
2664
template <typename Residual>
2665
void GenericEncodeChunk(const Residual* residuals, size_t n, size_t skip,
2666
0
                        const PrefixCode& code, BitWriter& output) {
2667
0
  for (size_t ix = skip; ix < n; ix++) {
2668
0
    unsigned token, nbits, bits;
2669
0
    EncodeHybridUint000(residuals[ix], &token, &nbits, &bits);
2670
0
    output.Write(code.raw_nbits[token] + nbits,
2671
0
                 code.raw_bits[token] | bits << code.raw_nbits[token]);
2672
0
  }
2673
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::GenericEncodeChunk<unsigned short>(unsigned short const*, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::GenericEncodeChunk<unsigned int>(unsigned int const*, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
2674
2675
// Encoding parameters and per-chunk SIMD encoder for samples of up to 8 bits.
struct UpTo8Bits {
  size_t bitdepth;
  explicit UpTo8Bits(size_t bitdepth) : bitdepth(bitdepth) {
    assert(bitdepth <= 8);
  }
  // Here we can fit up to 9 extra bits + 7 Huffman bits in a u16; for all other
  // symbols, we could actually go up to 8 Huffman bits as we have at most 8
  // extra bits; however, the SIMD bit merging logic for AVX2 assumes that no
  // Huffman length is 8 or more, so we cap at 8 anyway. Last symbol is used for
  // LZ77 lengths and has no limitations except allowing to represent 32 symbols
  // in total.
  static constexpr uint8_t kMinRawLength[12] = {};
  static constexpr uint8_t kMaxRawLength[12] = {
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 10,
  };
  static size_t MaxEncodedBitsPerSample() { return 16; }
  static constexpr size_t kInputBytes = 1;
  using pixel_t = int16_t;
  using upixel_t = uint16_t;

  // Copies the (at most 16) per-token code lengths/bits into the 16-byte SIMD
  // lookup tables.
  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
                             size_t n, uint8_t* nbits_simd,
                             uint8_t* bits_simd) {
    assert(n <= 16);
    memcpy(nbits_simd, nbits, 16);
    memcpy(bits_simd, bits, 16);
  }

#ifdef FJXL_GENERIC_SIMD
  // Encodes one chunk of residuals: tokenize, look up Huffman codes, merge
  // and flush. Samples at index >= n and before `skip` are masked out.
  static void EncodeChunkSimd(upixel_t* residuals, size_t n, size_t skip,
                              const uint8_t* raw_nbits_simd,
                              const uint8_t* raw_bits_simd, BitWriter& output) {
    Bits32 bits32[kChunkSize / SIMDVec16::kLanes];
    alignas(64) uint16_t bits[SIMDVec16::kLanes];
    alignas(64) uint16_t nbits[SIMDVec16::kLanes];
    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
    alignas(64) uint16_t token[SIMDVec16::kLanes];
    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
      TokenizeSIMD(residuals + i, token, nbits, bits);
      HuffmanSIMDUpTo13(token, raw_nbits_simd, raw_bits_simd, nbits_huff,
                        bits_huff);
      StoreSIMDUpTo8(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
                     std::max(skip, i) - i, bits32 + i / SIMDVec16::kLanes);
    }
    StoreToWriter<kChunkSize / SIMDVec16::kLanes>(bits32, output);
  }
#endif

  size_t NumSymbols(bool doing_ycocg_or_large_palette) const {
    // values gain 1 bit for YCoCg, 1 bit for prediction.
    // Maximum symbol is 1 + effective bit depth of residuals.
    if (doing_ycocg_or_large_palette) {
      return bitdepth + 3;
    } else {
      return bitdepth + 2;
    }
  }
};
constexpr uint8_t UpTo8Bits::kMinRawLength[];
constexpr uint8_t UpTo8Bits::kMaxRawLength[];
2736
2737
// Encoding parameters and per-chunk SIMD encoder for 9- to 13-bit samples.
struct From9To13Bits {
  size_t bitdepth;
  explicit From9To13Bits(size_t bitdepth) : bitdepth(bitdepth) {
    assert(bitdepth <= 13 && bitdepth >= 9);
  }
  // Last symbol is used for LZ77 lengths and has no limitations except allowing
  // to represent 32 symbols in total.
  // We cannot fit all the bits in a u16, so do not even try and use up to 8
  // bits per raw symbol.
  // There are at most 16 raw symbols, so Huffman coding can be SIMDfied without
  // any special tricks.
  static constexpr uint8_t kMinRawLength[17] = {};
  static constexpr uint8_t kMaxRawLength[17] = {
      8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 10,
  };
  static size_t MaxEncodedBitsPerSample() { return 21; }
  static constexpr size_t kInputBytes = 2;
  using pixel_t = int16_t;
  using upixel_t = uint16_t;

  // Copies the (at most 16) per-token code lengths/bits into the 16-byte SIMD
  // lookup tables.
  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
                             size_t n, uint8_t* nbits_simd,
                             uint8_t* bits_simd) {
    assert(n <= 16);
    memcpy(nbits_simd, nbits, 16);
    memcpy(bits_simd, bits, 16);
  }

#ifdef FJXL_GENERIC_SIMD
  // Encodes one chunk of residuals. Huffman + raw bits may exceed 16 bits per
  // sample here, so each vector yields two Bits32 outputs.
  static void EncodeChunkSimd(upixel_t* residuals, size_t n, size_t skip,
                              const uint8_t* raw_nbits_simd,
                              const uint8_t* raw_bits_simd, BitWriter& output) {
    Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes];
    alignas(64) uint16_t bits[SIMDVec16::kLanes];
    alignas(64) uint16_t nbits[SIMDVec16::kLanes];
    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
    alignas(64) uint16_t token[SIMDVec16::kLanes];
    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
      TokenizeSIMD(residuals + i, token, nbits, bits);
      HuffmanSIMDUpTo13(token, raw_nbits_simd, raw_bits_simd, nbits_huff,
                        bits_huff);
      StoreSIMDUpTo14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
                      std::max(skip, i) - i,
                      bits32 + 2 * i / SIMDVec16::kLanes);
    }
    StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output);
  }
#endif

  size_t NumSymbols(bool doing_ycocg_or_large_palette) const {
    // values gain 1 bit for YCoCg, 1 bit for prediction.
    // Maximum symbol is 1 + effective bit depth of residuals.
    if (doing_ycocg_or_large_palette) {
      return bitdepth + 3;
    } else {
      return bitdepth + 2;
    }
  }
};
constexpr uint8_t From9To13Bits::kMinRawLength[];
constexpr uint8_t From9To13Bits::kMaxRawLength[];
2799
2800
0
void CheckHuffmanBitsSIMD(int bits1, int nbits1, int bits2, int nbits2) {
2801
0
  assert(nbits1 == 8);
2802
0
  assert(nbits2 == 8);
2803
0
  assert(bits2 == (bits1 | 128));
2804
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::CheckHuffmanBitsSIMD(int, int, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::CheckHuffmanBitsSIMD(int, int, int, int)
2805
2806
struct Exactly14Bits {
2807
0
  explicit Exactly14Bits(size_t bitdepth_) { assert(bitdepth_ == 14); }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::Exactly14Bits::Exactly14Bits(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::Exactly14Bits::Exactly14Bits(unsigned long)
2808
  // Force LZ77 symbols to have at least 8 bits, and raw symbols 15 and 16 to
2809
  // have exactly 8, and no other symbol to have 8 or more. This ensures that
2810
  // the representation for 15 and 16 is identical up to one bit.
2811
  static constexpr uint8_t kMinRawLength[18] = {
2812
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 7,
2813
  };
2814
  static constexpr uint8_t kMaxRawLength[18] = {
2815
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 10,
2816
  };
2817
  static constexpr size_t bitdepth = 14;
2818
0
  static size_t MaxEncodedBitsPerSample() { return 22; }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::Exactly14Bits::MaxEncodedBitsPerSample()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::Exactly14Bits::MaxEncodedBitsPerSample()
2819
  static constexpr size_t kInputBytes = 2;
2820
  using pixel_t = int16_t;
2821
  using upixel_t = uint16_t;
2822
2823
  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
2824
                             size_t n, uint8_t* nbits_simd,
2825
0
                             uint8_t* bits_simd) {
2826
0
    assert(n == 17);
2827
0
    CheckHuffmanBitsSIMD(bits[15], nbits[15], bits[16], nbits[16]);
2828
0
    memcpy(nbits_simd, nbits, 16);
2829
0
    memcpy(bits_simd, bits, 16);
2830
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::Exactly14Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::Exactly14Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
2831
2832
#ifdef FJXL_GENERIC_SIMD
2833
  static void EncodeChunkSimd(upixel_t* residuals, size_t n, size_t skip,
2834
                              const uint8_t* raw_nbits_simd,
2835
0
                              const uint8_t* raw_bits_simd, BitWriter& output) {
2836
0
    Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes];
2837
0
    alignas(64) uint16_t bits[SIMDVec16::kLanes];
2838
0
    alignas(64) uint16_t nbits[SIMDVec16::kLanes];
2839
0
    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
2840
0
    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
2841
0
    alignas(64) uint16_t token[SIMDVec16::kLanes];
2842
0
    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
2843
0
      TokenizeSIMD(residuals + i, token, nbits, bits);
2844
0
      HuffmanSIMD14(token, raw_nbits_simd, raw_bits_simd, nbits_huff,
2845
0
                    bits_huff);
2846
0
      StoreSIMDUpTo14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
2847
0
                      std::max(skip, i) - i,
2848
0
                      bits32 + 2 * i / SIMDVec16::kLanes);
2849
0
    }
2850
0
    StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output);
2851
0
  }
2852
#endif
2853
2854
0
  // Raw-token alphabet size for this bit depth; the flag argument is unused
  // here (14-bit data always uses 17 symbols).
  size_t NumSymbols(bool) const {
    constexpr size_t kNumRawSymbols = 17;
    return kNumRawSymbols;
  }
2855
};
2856
// Out-of-class definitions for the static constexpr arrays; needed for
// ODR-use prior to C++17 inline variables.
constexpr uint8_t Exactly14Bits::kMinRawLength[];
constexpr uint8_t Exactly14Bits::kMaxRawLength[];
2858
2859
// Bit-depth policy for samples of 15 or 16 bits. Residuals no longer fit in
// 16 bits, so pixels are processed as 32-bit values.
struct MoreThan14Bits {
  size_t bitdepth;
  explicit MoreThan14Bits(size_t bitdepth) : bitdepth(bitdepth) {
    assert(bitdepth > 14);
    assert(bitdepth <= 16);
  }
  // Force LZ77 symbols to have at least 8 bits, and raw symbols 13 to 18 to
  // have exactly 8, and no other symbol to have 8 or more. This ensures that
  // the representation for (13, 14), (15, 16), (17, 18) is identical up to one
  // bit.
  static constexpr uint8_t kMinRawLength[20] = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 7,
  };
  static constexpr uint8_t kMaxRawLength[20] = {
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 10,
  };
  // Upper bound on the bits emitted per sample (token + extra bits).
  static size_t MaxEncodedBitsPerSample() { return 24; }
  static constexpr size_t kInputBytes = 2;
  using pixel_t = int32_t;
  using upixel_t = uint32_t;

  // Collapses the 19-symbol Huffman table into the 16-entry SIMD layout.
  // Pairs (13,14), (15,16) and (17,18) share a representation up to one bit
  // (enforced by the kMin/kMaxRawLength constraints and verified by
  // CheckHuffmanBitsSIMD), so only one entry per pair is kept.
  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
                             size_t n, uint8_t* nbits_simd,
                             uint8_t* bits_simd) {
    assert(n == 19);
    CheckHuffmanBitsSIMD(bits[13], nbits[13], bits[14], nbits[14]);
    CheckHuffmanBitsSIMD(bits[15], nbits[15], bits[16], nbits[16]);
    CheckHuffmanBitsSIMD(bits[17], nbits[17], bits[18], nbits[18]);
    for (size_t i = 0; i < 14; i++) {
      nbits_simd[i] = nbits[i];
      bits_simd[i] = bits[i];
    }
    nbits_simd[14] = nbits[15];
    bits_simd[14] = bits[15];
    nbits_simd[15] = nbits[17];
    bits_simd[15] = bits[17];
  }

#ifdef FJXL_GENERIC_SIMD
  // SIMD chunk encoder; same structure as the lower-bit-depth cases, but with
  // 32-bit residual tokenization and the above-14-bit Huffman mapping.
  static void EncodeChunkSimd(upixel_t* residuals, size_t n, size_t skip,
                              const uint8_t* raw_nbits_simd,
                              const uint8_t* raw_bits_simd, BitWriter& output) {
    Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes];
    alignas(64) uint32_t bits[SIMDVec16::kLanes];
    alignas(64) uint32_t nbits[SIMDVec16::kLanes];
    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
    alignas(64) uint16_t token[SIMDVec16::kLanes];
    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
      TokenizeSIMD(residuals + i, token, nbits, bits);
      HuffmanSIMDAbove14(token, raw_nbits_simd, raw_bits_simd, nbits_huff,
                         bits_huff);
      // Clamp per-vector count/skip to 0 once the loop index passes them.
      StoreSIMDAbove14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
                       std::max(skip, i) - i,
                       bits32 + 2 * i / SIMDVec16::kLanes);
    }
    StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output);
  }
#endif

  // Raw-token alphabet size; the flag argument is unused for this depth.
  size_t NumSymbols(bool) const { return 19; }
};
2920
// Out-of-class definitions for the static constexpr arrays; needed for
// ODR-use prior to C++17 inline variables.
constexpr uint8_t MoreThan14Bits::kMinRawLength[];
constexpr uint8_t MoreThan14Bits::kMaxRawLength[];
2922
2923
// Writes the part of the DC-global section shared by all channel counts:
// the modular tree, the LZ77 configuration, the context map, and the four
// prefix-code histograms in `code`. Every Write() below emits literal
// bitstream fields, so the exact order and values are part of the format.
// When `is_single_group`, the writer is sized to also hold the whole image.
void PrepareDCGlobalCommon(bool is_single_group, size_t width, size_t height,
                           const PrefixCode code[4], BitWriter* output) {
  output->Allocate(100000 + (is_single_group ? width * height * 16 : 0));
  // No patches, spline or noise.
  output->Write(1, 1);  // default DC dequantization factors (?)
  output->Write(1, 1);  // use global tree / histograms
  output->Write(1, 0);  // no lz77 for the tree

  output->Write(1, 1);         // simple code for the tree's context map
  output->Write(2, 0);         // all contexts clustered together
  output->Write(1, 1);         // use prefix code for tree
  output->Write(4, 0);         // 000 hybrid uint
  output->Write(6, 0b100011);  // Alphabet size is 4 (var16)
  output->Write(2, 1);         // simple prefix code
  output->Write(2, 3);         // with 4 symbols
  output->Write(2, 0);
  output->Write(2, 1);
  output->Write(2, 2);
  output->Write(2, 3);
  output->Write(1, 0);  // First tree encoding option

  // Huffman table + extra bits for the tree.
  uint8_t symbol_bits[6] = {0b00, 0b10, 0b001, 0b101, 0b0011, 0b0111};
  uint8_t symbol_nbits[6] = {2, 2, 3, 3, 4, 4};
  // Write a tree with a leaf per channel, and gradient predictor for every
  // leaf.
  for (auto v : {1, 2, 1, 4, 1, 0, 0, 5, 0, 0, 0, 0, 5,
                 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0, 0, 0}) {
    output->Write(symbol_nbits[v], symbol_bits[v]);
  }

  output->Write(1, 1);     // Enable lz77 for the main bitstream
  output->Write(2, 0b00);  // lz77 offset 224
  static_assert(kLZ77Offset == 224, "kLZ77Offset should be 224");
  output->Write(4, 0b1010);  // lz77 min length 7
  // 400 hybrid uint config for lz77
  output->Write(4, 4);
  output->Write(3, 0);
  output->Write(3, 0);

  output->Write(1, 1);  // simple code for the context map
  output->Write(2, 3);  // 3 bits per entry
  output->Write(3, 4);  // channel 3
  output->Write(3, 3);  // channel 2
  output->Write(3, 2);  // channel 1
  output->Write(3, 1);  // channel 0
  output->Write(3, 0);  // distance histogram first

  output->Write(1, 1);  // use prefix codes
  output->Write(4, 0);  // 000 hybrid uint config for distances (only need 0)
  for (size_t i = 0; i < 4; i++) {
    output->Write(4, 0);  // 000 hybrid uint config for symbols (only <= 10)
  }

  // Distance alphabet size:
  output->Write(5, 0b00001);  // 2: just need 1 for RLE (i.e. distance 1)
  // Symbol + LZ77 alphabet size:
  for (size_t i = 0; i < 4; i++) {
    output->Write(1, 1);    // > 1
    output->Write(4, 8);    // <= 512
    output->Write(8, 256);  // == 512
  }

  // Distance histogram:
  output->Write(2, 1);  // simple prefix code
  output->Write(2, 0);  // with one symbol
  output->Write(1, 1);  // 1

  // Symbol + lz77 histogram (one per channel context):
  for (size_t i = 0; i < 4; i++) {
    code[i].WriteTo(output);
  }

  // Group header for global modular image.
  output->Write(1, 1);  // Global tree
  output->Write(1, 1);  // All default wp
}
3000
3001
void PrepareDCGlobal(bool is_single_group, size_t width, size_t height,
3002
                     size_t nb_chans, const PrefixCode code[4],
3003
0
                     BitWriter* output) {
3004
0
  PrepareDCGlobalCommon(is_single_group, width, height, code, output);
3005
0
  if (nb_chans > 2) {
3006
0
    output->Write(2, 0b01);     // 1 transform
3007
0
    output->Write(2, 0b00);     // RCT
3008
0
    output->Write(5, 0b00000);  // Starting from ch 0
3009
0
    output->Write(2, 0b00);     // YCoCg
3010
0
  } else {
3011
0
    output->Write(2, 0b00);  // no transforms
3012
0
  }
3013
0
  if (!is_single_group) {
3014
0
    output->ZeroPadToByte();
3015
0
  }
3016
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::PrepareDCGlobal(bool, unsigned long, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, (anonymous namespace)::BitWriter*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::PrepareDCGlobal(bool, unsigned long, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, (anonymous namespace)::BitWriter*)
3017
3018
// Terminal consumer of residual chunks: writes the actual entropy-coded
// bitstream for one channel using a previously built PrefixCode. BitDepth is
// one of the *Bits policy structs above.
template <typename BitDepth>
struct ChunkEncoder {
  // Builds the SIMD-friendly copy of the raw-token Huffman table from `code`.
  void PrepareForSimd() {
    BitDepth::PrepareForSimd(code->raw_nbits, code->raw_bits, code->numraw,
                             raw_nbits_simd, raw_bits_simd);
  }

  // Emits an LZ77 run of length `count` (no-op when count == 0). The count is
  // biased by kLZ77MinLength + 1 before encoding; short biased counts come
  // from the precomputed lz77_cache_* tables, longer ones are encoded as
  // hybrid-uint token + extra bits followed by raw symbol 0, packed into a
  // single Write() call.
  FJXL_INLINE static void EncodeRle(size_t count, const PrefixCode& code,
                                    BitWriter& output) {
    if (count == 0) return;
    count -= kLZ77MinLength + 1;
    if (count < kLZ77CacheSize) {
      output.Write(code.lz77_cache_nbits[count], code.lz77_cache_bits[count]);
    } else {
      unsigned token, nbits, bits;
      EncodeHybridUintLZ77(count, &token, &nbits, &bits);
      // Pack extra bits, lz77 prefix code and raw symbol 0 into one word,
      // most-significant field first.
      uint64_t wbits = bits;
      wbits = (wbits << code.lz77_nbits[token]) | code.lz77_bits[token];
      wbits = (wbits << code.raw_nbits[0]) | code.raw_bits[0];
      output.Write(code.lz77_nbits[token] + nbits + code.raw_nbits[0], wbits);
    }
  }

  // Encodes a finished run of `run` zero residuals, then the residual vector
  // itself (SIMD path when available, generic fallback otherwise).
  FJXL_INLINE void Chunk(size_t run, typename BitDepth::upixel_t* residuals,
                         size_t skip, size_t n) {
    EncodeRle(run, *code, *output);
#ifdef FJXL_GENERIC_SIMD
    BitDepth::EncodeChunkSimd(residuals, n, skip, raw_nbits_simd, raw_bits_simd,
                              *output);
#else
    GenericEncodeChunk(residuals, n, skip, *code, *output);
#endif
  }

  // Flushes the trailing run at the end of a channel.
  inline void Finalize(size_t run) { EncodeRle(run, *code, *output); }

  const PrefixCode* code;  // non-owning
  BitWriter* output;       // non-owning
  alignas(64) uint8_t raw_nbits_simd[16] = {};
  alignas(64) uint8_t raw_bits_simd[16] = {};
};
3058
3059
template <typename BitDepth>
3060
struct ChunkSampleCollector {
3061
0
  FJXL_INLINE void Rle(size_t count, uint64_t* lz77_counts_) {
3062
0
    if (count == 0) return;
3063
0
    raw_counts[0] += 1;
3064
0
    count -= kLZ77MinLength + 1;
3065
0
    unsigned token, nbits, bits;
3066
0
    EncodeHybridUintLZ77(count, &token, &nbits, &bits);
3067
0
    lz77_counts_[token]++;
3068
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>::Rle(unsigned long, unsigned long*)
3069
3070
  FJXL_INLINE void Chunk(size_t run, typename BitDepth::upixel_t* residuals,
3071
0
                         size_t skip, size_t n) {
3072
    // Run is broken. Encode the run and encode the individual vector.
3073
0
    Rle(run, lz77_counts);
3074
0
    for (size_t ix = skip; ix < n; ix++) {
3075
0
      unsigned token, nbits, bits;
3076
0
      EncodeHybridUint000(residuals[ix], &token, &nbits, &bits);
3077
0
      raw_counts[token]++;
3078
0
    }
3079
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>::Chunk(unsigned long, unsigned int*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>::Chunk(unsigned long, unsigned int*, unsigned long, unsigned long)
3080
3081
  // don't count final run since we don't know how long it really is
3082
0
  void Finalize(size_t run) {}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>::Finalize(unsigned long)
3083
3084
  uint64_t* raw_counts;
3085
  uint64_t* lz77_counts;
3086
};
3087
3088
0
// Zigzag-maps a signed value onto the unsigned range so that small magnitudes
// get small codes: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ...
constexpr uint32_t PackSigned(int32_t value) {
  return value < 0 ? ~(static_cast<uint32_t>(value) << 1)
                   : static_cast<uint32_t>(value) << 1;
}
3092
3093
template <typename T, typename BitDepth>
3094
struct ChannelRowProcessor {
3095
  using upixel_t = typename BitDepth::upixel_t;
3096
  using pixel_t = typename BitDepth::pixel_t;
3097
  T* t;
3098
  // Computes predicted residuals for one chunk of up to kChunkSize pixels and
  // forwards them to the sink `t` (encoder or sample collector), tracking runs
  // of zero residuals across chunks via the `run` member so they can be
  // RLE-coded. `n` is the number of valid pixels; row_left/row_top/row_topleft
  // point at the left/top/top-left neighbor of each pixel in `row`.
  void ProcessChunk(const pixel_t* row, const pixel_t* row_left,
                    const pixel_t* row_top, const pixel_t* row_topleft,
                    size_t n) {
    alignas(64) upixel_t residuals[kChunkSize] = {};
    // Length of the unbroken prefix of zero residuals in this chunk.
    // prefix_size only grows while it still equals required_prefix_size,
    // i.e. while every residual seen so far was zero.
    size_t prefix_size = 0;
    size_t required_prefix_size = 0;
#ifdef FJXL_GENERIC_SIMD
    constexpr size_t kNum =
        sizeof(pixel_t) == 2 ? SIMDVec16::kLanes : SIMDVec32::kLanes;
    for (size_t ix = 0; ix < kChunkSize; ix += kNum) {
      // PredictPixels fills residuals and returns the count of leading zero
      // residuals in this vector (presumably — confirm against its
      // definition).
      size_t c =
          PredictPixels<simd_t<pixel_t>>(row + ix, row_left + ix, row_top + ix,
                                         row_topleft + ix, residuals + ix);
      prefix_size =
          prefix_size == required_prefix_size ? prefix_size + c : prefix_size;
      required_prefix_size += kNum;
    }
#else
    for (size_t ix = 0; ix < kChunkSize; ix++) {
      pixel_t px = row[ix];
      pixel_t left = row_left[ix];
      pixel_t top = row_top[ix];
      pixel_t topleft = row_topleft[ix];
      pixel_t ac = left - topleft;
      pixel_t ab = left - top;
      pixel_t bc = top - topleft;
      // Gradient value left + top - topleft; the sum is done in unsigned
      // arithmetic, presumably to sidestep signed-overflow UB.
      pixel_t grad = static_cast<pixel_t>(static_cast<upixel_t>(ac) +
                                          static_cast<upixel_t>(top));
      // Sign tricks select between left, top and grad: this matches the
      // clamped-gradient style predictor (the min/max clamp expressed via
      // XOR signs).
      pixel_t d = ab ^ bc;
      pixel_t clamp = d < 0 ? top : left;
      pixel_t s = ac ^ bc;
      pixel_t pred = s < 0 ? grad : clamp;
      residuals[ix] = PackSigned(px - pred);
      prefix_size = prefix_size == required_prefix_size
                        ? prefix_size + (residuals[ix] == 0)
                        : prefix_size;
      required_prefix_size += 1;
    }
#endif
    prefix_size = std::min(n, prefix_size);
    if (prefix_size == n && (run > 0 || prefix_size > kLZ77MinLength)) {
      // Run continues, nothing to do.
      run += prefix_size;
    } else if (prefix_size + run > kLZ77MinLength) {
      // Run is broken. Encode the run and encode the individual vector.
      t->Chunk(run + prefix_size, residuals, prefix_size, n);
      run = 0;
    } else {
      // There was no run to begin with.
      t->Chunk(0, residuals, 0, n);
    }
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::ProcessChunk(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::ProcessChunk(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::ProcessChunk(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::ProcessChunk(int const*, int const*, int const*, int const*, unsigned long)
3150
3151
  void ProcessRow(const pixel_t* row, const pixel_t* row_left,
3152
                  const pixel_t* row_top, const pixel_t* row_topleft,
3153
0
                  size_t xs) {
3154
0
    for (size_t x = 0; x < xs; x += kChunkSize) {
3155
0
      ProcessChunk(row + x, row_left + x, row_top + x, row_topleft + x,
3156
0
                   std::min(kChunkSize, xs - x));
3157
0
    }
3158
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::ProcessRow(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::ProcessRow(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::ProcessRow(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::ProcessRow(int const*, int const*, int const*, int const*, unsigned long)
3159
3160
0
  // Flushes any still-open run of zero residuals to the chunk consumer.
  void Finalize() {
    const size_t pending = run;
    t->Finalize(pending);
  }
3161
  // Invariant: run == 0 or run > kLZ77MinLength.
3162
  size_t run = 0;
3163
};
3164
3165
0
// Reads a 16-bit little-endian value from `ptr` (no alignment requirement).
uint16_t LoadLE16(const unsigned char* ptr) {
  uint16_t lo = ptr[0];
  uint16_t hi = ptr[1];
  return static_cast<uint16_t>(lo | (hi << 8));
}
3168
3169
0
// Swaps the two bytes of a 16-bit value (endianness conversion).
uint16_t SwapEndian(uint16_t in) {
  const uint16_t hi_to_lo = in >> 8;
  const uint16_t lo_to_hi = static_cast<uint16_t>(in << 8);
  return hi_to_lo | lo_to_hi;
}
3170
3171
#ifdef FJXL_GENERIC_SIMD
3172
0
// Stores one vector of 16-bit pixels into a row of int16_t pixels.
// Uses reinterpret_cast instead of the original C-style cast, matching the
// named-cast style used by the StoreYCoCg SIMD overloads in this file.
void StorePixels(SIMDVec16 p, int16_t* dest) {
  p.Store(reinterpret_cast<uint16_t*>(dest));
}

// Stores one vector of 16-bit pixels into a row of int32_t pixels by
// upcasting each lane to 32 bits (low lanes first, then high lanes).
void StorePixels(SIMDVec16 p, int32_t* dest) {
  VecPair<SIMDVec32> p_up = p.Upcast();
  p_up.low.Store(reinterpret_cast<uint32_t*>(dest));
  p_up.hi.Store(reinterpret_cast<uint32_t*>(dest) + SIMDVec32::kLanes);
}
3179
#endif
3180
3181
// Copies a row of oxs 8-bit grayscale samples from `rgba` into `luma`.
template <typename pixel_t>
void FillRowG8(const unsigned char* rgba, size_t oxs, pixel_t* luma) {
  size_t i = 0;
#ifdef FJXL_GENERIC_SIMD
  // Vectorized copy of as many full lanes as fit in the row.
  for (; i + SIMDVec16::kLanes <= oxs; i += SIMDVec16::kLanes) {
    auto px = SIMDVec16::LoadG8(rgba + i);
    StorePixels(px[0], luma + i);
  }
#endif
  // Scalar tail.
  for (; i < oxs; i++) {
    luma[i] = rgba[i];
  }
}
3194
3195
template <bool big_endian, typename pixel_t>
3196
0
void FillRowG16(const unsigned char* rgba, size_t oxs, pixel_t* luma) {
3197
0
  size_t x = 0;
3198
#ifdef FJXL_GENERIC_SIMD
3199
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3200
0
    auto rgb = SIMDVec16::LoadG16(rgba + 2 * x);
3201
0
    if (big_endian) {
3202
0
      rgb[0].SwapEndian();
3203
0
    }
3204
0
    StorePixels(rgb[0], luma + x);
3205
0
  }
3206
#endif
3207
0
  for (; x < oxs; x++) {
3208
0
    uint16_t val = LoadLE16(rgba + 2 * x);
3209
0
    if (big_endian) {
3210
0
      val = SwapEndian(val);
3211
0
    }
3212
0
    luma[x] = val;
3213
0
  }
3214
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG16<true, short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG16<false, short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG16<true, int>(unsigned char const*, unsigned long, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG16<false, int>(unsigned char const*, unsigned long, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG16<true, short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG16<false, short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG16<true, int>(unsigned char const*, unsigned long, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG16<false, int>(unsigned char const*, unsigned long, int*)
3215
3216
template <typename pixel_t>
3217
void FillRowGA8(const unsigned char* rgba, size_t oxs, pixel_t* luma,
3218
0
                pixel_t* alpha) {
3219
0
  size_t x = 0;
3220
#ifdef FJXL_GENERIC_SIMD
3221
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3222
0
    auto rgb = SIMDVec16::LoadGA8(rgba + 2 * x);
3223
0
    StorePixels(rgb[0], luma + x);
3224
0
    StorePixels(rgb[1], alpha + x);
3225
0
  }
3226
#endif
3227
0
  for (; x < oxs; x++) {
3228
0
    luma[x] = rgba[2 * x];
3229
0
    alpha[x] = rgba[2 * x + 1];
3230
0
  }
3231
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA8<short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA8<short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA8<int>(unsigned char const*, unsigned long, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA8<int>(unsigned char const*, unsigned long, int*, int*)
3232
3233
template <bool big_endian, typename pixel_t>
3234
void FillRowGA16(const unsigned char* rgba, size_t oxs, pixel_t* luma,
3235
0
                 pixel_t* alpha) {
3236
0
  size_t x = 0;
3237
#ifdef FJXL_GENERIC_SIMD
3238
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3239
0
    auto rgb = SIMDVec16::LoadGA16(rgba + 4 * x);
3240
0
    if (big_endian) {
3241
0
      rgb[0].SwapEndian();
3242
0
      rgb[1].SwapEndian();
3243
0
    }
3244
0
    StorePixels(rgb[0], luma + x);
3245
0
    StorePixels(rgb[1], alpha + x);
3246
0
  }
3247
#endif
3248
0
  for (; x < oxs; x++) {
3249
0
    uint16_t l = LoadLE16(rgba + 4 * x);
3250
0
    uint16_t a = LoadLE16(rgba + 4 * x + 2);
3251
0
    if (big_endian) {
3252
0
      l = SwapEndian(l);
3253
0
      a = SwapEndian(a);
3254
0
    }
3255
0
    luma[x] = l;
3256
0
    alpha[x] = a;
3257
0
  }
3258
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA16<true, short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA16<false, short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA16<true, int>(unsigned char const*, unsigned long, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA16<false, int>(unsigned char const*, unsigned long, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA16<true, short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA16<false, short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA16<true, int>(unsigned char const*, unsigned long, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA16<false, int>(unsigned char const*, unsigned long, int*, int*)
3259
3260
// Forward reversible YCoCg-R color transform for a single pixel:
//   Co = R - B; tmp = B + (Co >> 1); Cg = G - tmp; Y = tmp + (Cg >> 1).
template <typename pixel_t>
void StoreYCoCg(pixel_t r, pixel_t g, pixel_t b, pixel_t* y, pixel_t* co,
                pixel_t* cg) {
  const pixel_t chroma_orange = r - b;
  const pixel_t mid = b + (chroma_orange >> 1);
  const pixel_t chroma_green = g - mid;
  *co = chroma_orange;
  *cg = chroma_green;
  *y = mid + (chroma_green >> 1);
}
3268
3269
#ifdef FJXL_GENERIC_SIMD
3270
void StoreYCoCg(SIMDVec16 r, SIMDVec16 g, SIMDVec16 b, int16_t* y, int16_t* co,
3271
0
                int16_t* cg) {
3272
0
  SIMDVec16 co_v = r.Sub(b);
3273
0
  SIMDVec16 tmp = b.Add(co_v.SignedShiftRight<1>());
3274
0
  SIMDVec16 cg_v = g.Sub(tmp);
3275
0
  SIMDVec16 y_v = tmp.Add(cg_v.SignedShiftRight<1>());
3276
0
  y_v.Store(reinterpret_cast<uint16_t*>(y));
3277
0
  co_v.Store(reinterpret_cast<uint16_t*>(co));
3278
0
  cg_v.Store(reinterpret_cast<uint16_t*>(cg));
3279
0
}
3280
3281
// Vector version of the YCoCg-R transform for 32-bit output lanes: each
// 16-bit input vector is upcast to a low/high pair of 32-bit vectors and
// the transform is applied to both halves independently.
void StoreYCoCg(SIMDVec16 r, SIMDVec16 g, SIMDVec16 b, int32_t* y, int32_t* co,
                int32_t* cg) {
  VecPair<SIMDVec32> r_up = r.Upcast();
  VecPair<SIMDVec32> g_up = g.Upcast();
  VecPair<SIMDVec32> b_up = b.Upcast();
  // Low lanes: Co = R - B; tmp = B + (Co >> 1); Cg = G - tmp;
  // Y = tmp + (Cg >> 1).
  SIMDVec32 co_lo_v = r_up.low.Sub(b_up.low);
  SIMDVec32 tmp_lo = b_up.low.Add(co_lo_v.SignedShiftRight<1>());
  SIMDVec32 cg_lo_v = g_up.low.Sub(tmp_lo);
  SIMDVec32 y_lo_v = tmp_lo.Add(cg_lo_v.SignedShiftRight<1>());
  // High lanes: same recurrence.
  SIMDVec32 co_hi_v = r_up.hi.Sub(b_up.hi);
  SIMDVec32 tmp_hi = b_up.hi.Add(co_hi_v.SignedShiftRight<1>());
  SIMDVec32 cg_hi_v = g_up.hi.Sub(tmp_hi);
  SIMDVec32 y_hi_v = tmp_hi.Add(cg_hi_v.SignedShiftRight<1>());
  // Low lanes land at the base of each plane, high lanes right after.
  y_lo_v.Store(reinterpret_cast<uint32_t*>(y));
  co_lo_v.Store(reinterpret_cast<uint32_t*>(co));
  cg_lo_v.Store(reinterpret_cast<uint32_t*>(cg));
  y_hi_v.Store(reinterpret_cast<uint32_t*>(y) + SIMDVec32::kLanes);
  co_hi_v.Store(reinterpret_cast<uint32_t*>(co) + SIMDVec32::kLanes);
  cg_hi_v.Store(reinterpret_cast<uint32_t*>(cg) + SIMDVec32::kLanes);
}
3301
#endif
3302
3303
template <typename pixel_t>
3304
void FillRowRGB8(const unsigned char* rgba, size_t oxs, pixel_t* y, pixel_t* co,
3305
0
                 pixel_t* cg) {
3306
0
  size_t x = 0;
3307
#ifdef FJXL_GENERIC_SIMD
3308
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3309
0
    auto rgb = SIMDVec16::LoadRGB8(rgba + 3 * x);
3310
0
    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
3311
0
  }
3312
#endif
3313
0
  for (; x < oxs; x++) {
3314
0
    uint16_t r = rgba[3 * x];
3315
0
    uint16_t g = rgba[3 * x + 1];
3316
0
    uint16_t b = rgba[3 * x + 2];
3317
0
    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
3318
0
  }
3319
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB8<short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB8<short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB8<int>(unsigned char const*, unsigned long, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB8<int>(unsigned char const*, unsigned long, int*, int*, int*)
3320
3321
template <bool big_endian, typename pixel_t>
3322
void FillRowRGB16(const unsigned char* rgba, size_t oxs, pixel_t* y,
3323
0
                  pixel_t* co, pixel_t* cg) {
3324
0
  size_t x = 0;
3325
#ifdef FJXL_GENERIC_SIMD
3326
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3327
0
    auto rgb = SIMDVec16::LoadRGB16(rgba + 6 * x);
3328
0
    if (big_endian) {
3329
0
      rgb[0].SwapEndian();
3330
0
      rgb[1].SwapEndian();
3331
0
      rgb[2].SwapEndian();
3332
0
    }
3333
0
    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
3334
0
  }
3335
#endif
3336
0
  for (; x < oxs; x++) {
3337
0
    uint16_t r = LoadLE16(rgba + 6 * x);
3338
0
    uint16_t g = LoadLE16(rgba + 6 * x + 2);
3339
0
    uint16_t b = LoadLE16(rgba + 6 * x + 4);
3340
0
    if (big_endian) {
3341
0
      r = SwapEndian(r);
3342
0
      g = SwapEndian(g);
3343
0
      b = SwapEndian(b);
3344
0
    }
3345
0
    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
3346
0
  }
3347
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB16<true, short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB16<false, short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB16<true, int>(unsigned char const*, unsigned long, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB16<false, int>(unsigned char const*, unsigned long, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB16<true, short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB16<false, short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB16<true, int>(unsigned char const*, unsigned long, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB16<false, int>(unsigned char const*, unsigned long, int*, int*, int*)
3348
3349
template <typename pixel_t>
3350
void FillRowRGBA8(const unsigned char* rgba, size_t oxs, pixel_t* y,
3351
0
                  pixel_t* co, pixel_t* cg, pixel_t* alpha) {
3352
0
  size_t x = 0;
3353
#ifdef FJXL_GENERIC_SIMD
3354
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3355
0
    auto rgb = SIMDVec16::LoadRGBA8(rgba + 4 * x);
3356
0
    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
3357
0
    StorePixels(rgb[3], alpha + x);
3358
0
  }
3359
#endif
3360
0
  for (; x < oxs; x++) {
3361
0
    uint16_t r = rgba[4 * x];
3362
0
    uint16_t g = rgba[4 * x + 1];
3363
0
    uint16_t b = rgba[4 * x + 2];
3364
0
    uint16_t a = rgba[4 * x + 3];
3365
0
    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
3366
0
    alpha[x] = a;
3367
0
  }
3368
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA8<short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA8<short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA8<int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA8<int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
3369
3370
template <bool big_endian, typename pixel_t>
3371
void FillRowRGBA16(const unsigned char* rgba, size_t oxs, pixel_t* y,
3372
0
                   pixel_t* co, pixel_t* cg, pixel_t* alpha) {
3373
0
  size_t x = 0;
3374
#ifdef FJXL_GENERIC_SIMD
3375
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3376
0
    auto rgb = SIMDVec16::LoadRGBA16(rgba + 8 * x);
3377
0
    if (big_endian) {
3378
0
      rgb[0].SwapEndian();
3379
0
      rgb[1].SwapEndian();
3380
0
      rgb[2].SwapEndian();
3381
0
      rgb[3].SwapEndian();
3382
0
    }
3383
0
    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
3384
0
    StorePixels(rgb[3], alpha + x);
3385
0
  }
3386
#endif
3387
0
  for (; x < oxs; x++) {
3388
0
    uint16_t r = LoadLE16(rgba + 8 * x);
3389
0
    uint16_t g = LoadLE16(rgba + 8 * x + 2);
3390
0
    uint16_t b = LoadLE16(rgba + 8 * x + 4);
3391
0
    uint16_t a = LoadLE16(rgba + 8 * x + 6);
3392
0
    if (big_endian) {
3393
0
      r = SwapEndian(r);
3394
0
      g = SwapEndian(g);
3395
0
      b = SwapEndian(b);
3396
0
      a = SwapEndian(a);
3397
0
    }
3398
0
    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
3399
0
    alpha[x] = a;
3400
0
  }
3401
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA16<true, short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA16<false, short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA16<true, int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA16<false, int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA16<true, short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA16<false, short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA16<true, int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA16<false, int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
3402
3403
// Runs one Processor per channel over the image rectangle
// [x0, x0+xs) x [y0, y0+ys): each input row is converted to planar
// channel data (grayscale, gray+alpha, YCoCg, or YCoCg+alpha depending
// on nb_chans), and each channel row is handed to its processor
// together with pointers to its left/top/topleft neighbors.
// Rows with y < yskip are still converted (so the previous-row buffer
// stays valid) but not passed to the processors.
template <typename Processor, typename BitDepth>
void ProcessImageArea(const unsigned char* rgba, size_t x0, size_t y0,
                      size_t xs, size_t yskip, size_t ys, size_t row_stride,
                      BitDepth bitdepth, size_t nb_chans, bool big_endian,
                      Processor* processors) {
  // Slack on each side of a row buffer so neighbor reads just left of
  // x == 0 stay inside the allocation.
  constexpr size_t kPadding = 32;

  using pixel_t = typename BitDepth::pixel_t;

  constexpr size_t kAlign = 64;
  constexpr size_t kAlignPixels = kAlign / sizeof(pixel_t);

  // Adjusts a pointer within the padded buffer when it is not
  // kAlign-byte aligned. NOTE(review): this advances by
  // offset/sizeof(pixel_t) pixels rather than the usual
  // (kAlign - offset)/sizeof(pixel_t); presumably intentional given the
  // padded buffers, but worth confirming against upstream.
  auto align = [=](pixel_t* ptr) {
    size_t offset = reinterpret_cast<uintptr_t>(ptr) % kAlign;
    if (offset) {
      ptr += offset / sizeof(pixel_t);
    }
    return ptr;
  };

  // Row capacity: 256 pixels plus padding on both sides plus alignment
  // slack, rounded up to a whole number of kAlign-byte units.
  constexpr size_t kNumPx =
      (256 + kPadding * 2 + kAlignPixels + kAlignPixels - 1) / kAlignPixels *
      kAlignPixels;

  // Two row buffers per channel (current and previous), double-buffered
  // by the parity of y.
  std::vector<std::array<std::array<pixel_t, kNumPx>, 2>> group_data(nb_chans);

  for (size_t y = 0; y < ys; y++) {
    // Start of this row's input samples within the interleaved image.
    const auto rgba_row =
        rgba + row_stride * (y0 + y) + x0 * nb_chans * BitDepth::kInputBytes;
    pixel_t* crow[4] = {};  // current row, per channel
    pixel_t* prow[4] = {};  // previous row, per channel
    for (size_t i = 0; i < nb_chans; i++) {
      crow[i] = align(&group_data[i][y & 1][kPadding]);
      prow[i] = align(&group_data[i][(y - 1) & 1][kPadding]);
    }

    // Pre-fill rows with YCoCg converted pixels.
    if (nb_chans == 1) {
      if (BitDepth::kInputBytes == 1) {
        FillRowG8(rgba_row, xs, crow[0]);
      } else if (big_endian) {
        FillRowG16</*big_endian=*/true>(rgba_row, xs, crow[0]);
      } else {
        FillRowG16</*big_endian=*/false>(rgba_row, xs, crow[0]);
      }
    } else if (nb_chans == 2) {
      if (BitDepth::kInputBytes == 1) {
        FillRowGA8(rgba_row, xs, crow[0], crow[1]);
      } else if (big_endian) {
        FillRowGA16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1]);
      } else {
        FillRowGA16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1]);
      }
    } else if (nb_chans == 3) {
      if (BitDepth::kInputBytes == 1) {
        FillRowRGB8(rgba_row, xs, crow[0], crow[1], crow[2]);
      } else if (big_endian) {
        FillRowRGB16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1],
                                          crow[2]);
      } else {
        FillRowRGB16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1],
                                           crow[2]);
      }
    } else {
      if (BitDepth::kInputBytes == 1) {
        FillRowRGBA8(rgba_row, xs, crow[0], crow[1], crow[2], crow[3]);
      } else if (big_endian) {
        FillRowRGBA16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1],
                                           crow[2], crow[3]);
      } else {
        FillRowRGBA16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1],
                                            crow[2], crow[3]);
      }
    }
    // Deal with x == 0.
    for (size_t c = 0; c < nb_chans; c++) {
      // Left neighbor of the first pixel: use the pixel above it
      // (or 0 on the first row).
      *(crow[c] - 1) = y > 0 ? *(prow[c]) : 0;
      // Fix topleft.
      *(prow[c] - 1) = y > 0 ? *(prow[c]) : 0;
    }
    if (y < yskip) continue;
    for (size_t c = 0; c < nb_chans; c++) {
      // Get pointers to px/left/top/topleft data to speedup loop.
      const pixel_t* row = crow[c];
      const pixel_t* row_left = crow[c] - 1;
      // On the first row, top/topleft fall back to the left neighbor.
      const pixel_t* row_top = y == 0 ? row_left : prow[c];
      const pixel_t* row_topleft = y == 0 ? row_left : prow[c] - 1;

      processors[c].ProcessRow(row, row_left, row_top, row_topleft, xs);
    }
  }
  // Let each per-channel processor flush any pending state.
  for (size_t c = 0; c < nb_chans; c++) {
    processors[c].Finalize();
  }
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>*)
3498
3499
template <typename BitDepth>
3500
void WriteACSection(const unsigned char* rgba, size_t x0, size_t y0, size_t xs,
3501
                    size_t ys, size_t row_stride, bool is_single_group,
3502
                    BitDepth bitdepth, size_t nb_chans, bool big_endian,
3503
                    const PrefixCode code[4],
3504
0
                    std::array<BitWriter, 4>& output) {
3505
0
  for (size_t i = 0; i < nb_chans; i++) {
3506
0
    if (is_single_group && i == 0) continue;
3507
0
    output[i].Allocate(xs * ys * bitdepth.MaxEncodedBitsPerSample() + 4);
3508
0
  }
3509
0
  if (!is_single_group) {
3510
    // Group header for modular image.
3511
    // When the image is single-group, the global modular image is the one
3512
    // that contains the pixel data, and there is no group header.
3513
0
    output[0].Write(1, 1);     // Global tree
3514
0
    output[0].Write(1, 1);     // All default wp
3515
0
    output[0].Write(2, 0b00);  // 0 transforms
3516
0
  }
3517
3518
0
  ChunkEncoder<BitDepth> encoders[4];
3519
0
  ChannelRowProcessor<ChunkEncoder<BitDepth>, BitDepth> row_encoders[4];
3520
0
  for (size_t c = 0; c < nb_chans; c++) {
3521
0
    row_encoders[c].t = &encoders[c];
3522
0
    encoders[c].output = &output[c];
3523
0
    encoders[c].code = &code[c];
3524
0
    encoders[c].PrepareForSimd();
3525
0
  }
3526
0
  ProcessImageArea<ChannelRowProcessor<ChunkEncoder<BitDepth>, BitDepth>>(
3527
0
      rgba, x0, y0, xs, 0, ys, row_stride, bitdepth, nb_chans, big_endian,
3528
0
      row_encoders);
3529
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::WriteACSection<AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::WriteACSection<AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::WriteACSection<AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::WriteACSection<AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::WriteACSection<default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::WriteACSection<default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::WriteACSection<default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::WriteACSection<default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
3530
3531
constexpr int kHashExp = 16;
constexpr uint32_t kHashSize = 1 << kHashExp;
constexpr uint32_t kHashMultiplier = 2654435761;
constexpr int kMaxColors = 512;

// Multiplicative hash of a packed pixel value.
// Can be any function returning a value in 0 .. kHashSize-1, with the one
// hard requirement (relied upon by the palette code) that 0 maps to 0.
inline uint32_t pixel_hash(uint32_t p) {
  const uint32_t scrambled = p * kHashMultiplier;  // wraps mod 2^32
  return scrambled >> (32 - kHashExp);
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::pixel_hash(unsigned int)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::pixel_hash(unsigned int)
3541
3542
template <size_t nb_chans>
3543
void FillRowPalette(const unsigned char* inrow, size_t xs,
3544
0
                    const int16_t* lookup, int16_t* out) {
3545
0
  for (size_t x = 0; x < xs; x++) {
3546
0
    uint32_t p = 0;
3547
0
    for (size_t i = 0; i < nb_chans; ++i) {
3548
0
      p |= inrow[x * nb_chans + i] << (8 * i);
3549
0
    }
3550
0
    out[x] = lookup[pixel_hash(p)];
3551
0
  }
3552
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowPalette<1ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowPalette<2ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowPalette<3ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowPalette<4ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowPalette<1ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowPalette<2ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowPalette<3ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowPalette<4ul>(unsigned char const*, unsigned long, short const*, short*)
3553
3554
// Runs `processors[0]` over an image area whose pixels have already been
// mapped to palette indices (via `lookup`). Uses two ping-pong row buffers
// (selected by `y & 1`) so the previous row stays available as the "top"
// context row.
// NOTE(review): `yskip` is accepted but never consulted in this body, unlike
// the non-palette ProcessImageArea — confirm this is intentional.
template <typename Processor>
void ProcessImageAreaPalette(const unsigned char* rgba, size_t x0, size_t y0,
                             size_t xs, size_t yskip, size_t ys,
                             size_t row_stride, const int16_t* lookup,
                             size_t nb_chans, Processor* processors) {
  // Left/right padding around each 256-wide row so the predictor can read
  // one sample to the left of x == 0 without bounds checks.
  constexpr size_t kPadding = 32;

  // Two rows: current (y & 1) and previous ((y - 1) & 1).
  std::vector<std::array<int16_t, 256 + kPadding * 2>> group_data(2);
  Processor& row_encoder = processors[0];

  for (size_t y = 0; y < ys; y++) {
    // Pre-fill rows with palette converted pixels.
    const unsigned char* inrow = rgba + row_stride * (y0 + y) + x0 * nb_chans;
    int16_t* outrow = &group_data[y & 1][kPadding];
    if (nb_chans == 1) {
      FillRowPalette<1>(inrow, xs, lookup, outrow);
    } else if (nb_chans == 2) {
      FillRowPalette<2>(inrow, xs, lookup, outrow);
    } else if (nb_chans == 3) {
      FillRowPalette<3>(inrow, xs, lookup, outrow);
    } else if (nb_chans == 4) {
      FillRowPalette<4>(inrow, xs, lookup, outrow);
    }
    // Deal with x == 0: the "left" neighbor of the first sample is the first
    // sample of the previous row (or 0 on the first row).
    group_data[y & 1][kPadding - 1] =
        y > 0 ? group_data[(y - 1) & 1][kPadding] : 0;
    // Fix topleft. Note: when y == 0, (y - 1) & 1 underflows to 1 (size_t
    // wraparound), so this writes a 0 into the *other* buffer — harmless,
    // since row_top/row_topleft alias row_left on the first row anyway.
    group_data[(y - 1) & 1][kPadding - 1] =
        y > 0 ? group_data[(y - 1) & 1][kPadding] : 0;
    // Get pointers to px/left/top/topleft data to speedup loop.
    const int16_t* row = &group_data[y & 1][kPadding];
    const int16_t* row_left = &group_data[y & 1][kPadding - 1];
    const int16_t* row_top =
        y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding];
    const int16_t* row_topleft =
        y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding - 1];

    row_encoder.ProcessRow(row, row_left, row_top, row_topleft, xs);
  }
  row_encoder.Finalize();
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageAreaPalette<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits> >(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, short const*, unsigned long, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageAreaPalette<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits> >(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, short const*, unsigned long, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageAreaPalette<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits> >(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, short const*, unsigned long, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageAreaPalette<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits> >(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, short const*, unsigned long, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)
3595
3596
void WriteACSectionPalette(const unsigned char* rgba, size_t x0, size_t y0,
3597
                           size_t xs, size_t ys, size_t row_stride,
3598
                           bool is_single_group, const PrefixCode code[4],
3599
                           const int16_t* lookup, size_t nb_chans,
3600
0
                           BitWriter& output) {
3601
0
  if (!is_single_group) {
3602
0
    output.Allocate(16 * xs * ys + 4);
3603
    // Group header for modular image.
3604
    // When the image is single-group, the global modular image is the one
3605
    // that contains the pixel data, and there is no group header.
3606
0
    output.Write(1, 1);     // Global tree
3607
0
    output.Write(1, 1);     // All default wp
3608
0
    output.Write(2, 0b00);  // 0 transforms
3609
0
  }
3610
3611
0
  ChunkEncoder<UpTo8Bits> encoder;
3612
0
  ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits> row_encoder;
3613
3614
0
  row_encoder.t = &encoder;
3615
0
  encoder.output = &output;
3616
0
  encoder.code = &code[is_single_group ? 1 : 0];
3617
0
  encoder.PrepareForSimd();
3618
0
  ProcessImageAreaPalette<
3619
0
      ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits>>(
3620
0
      rgba, x0, y0, xs, 0, ys, row_stride, lookup, nb_chans, &row_encoder);
3621
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::WriteACSectionPalette(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, (anonymous namespace)::PrefixCode const*, short const*, unsigned long, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::WriteACSectionPalette(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, (anonymous namespace)::PrefixCode const*, short const*, unsigned long, (anonymous namespace)::BitWriter&)
3622
3623
template <typename BitDepth>
3624
void CollectSamples(const unsigned char* rgba, size_t x0, size_t y0, size_t xs,
3625
                    size_t row_stride, size_t row_count,
3626
                    uint64_t raw_counts[4][kNumRawSymbols],
3627
                    uint64_t lz77_counts[4][kNumLZ77], bool is_single_group,
3628
                    bool palette, BitDepth bitdepth, size_t nb_chans,
3629
0
                    bool big_endian, const int16_t* lookup) {
3630
0
  if (palette) {
3631
0
    ChunkSampleCollector<UpTo8Bits> sample_collectors[4];
3632
0
    ChannelRowProcessor<ChunkSampleCollector<UpTo8Bits>, UpTo8Bits>
3633
0
        row_sample_collectors[4];
3634
0
    for (size_t c = 0; c < nb_chans; c++) {
3635
0
      row_sample_collectors[c].t = &sample_collectors[c];
3636
0
      sample_collectors[c].raw_counts = raw_counts[is_single_group ? 1 : 0];
3637
0
      sample_collectors[c].lz77_counts = lz77_counts[is_single_group ? 1 : 0];
3638
0
    }
3639
0
    ProcessImageAreaPalette<
3640
0
        ChannelRowProcessor<ChunkSampleCollector<UpTo8Bits>, UpTo8Bits>>(
3641
0
        rgba, x0, y0, xs, 1, 1 + row_count, row_stride, lookup, nb_chans,
3642
0
        row_sample_collectors);
3643
0
  } else {
3644
0
    ChunkSampleCollector<BitDepth> sample_collectors[4];
3645
0
    ChannelRowProcessor<ChunkSampleCollector<BitDepth>, BitDepth>
3646
0
        row_sample_collectors[4];
3647
0
    for (size_t c = 0; c < nb_chans; c++) {
3648
0
      row_sample_collectors[c].t = &sample_collectors[c];
3649
0
      sample_collectors[c].raw_counts = raw_counts[c];
3650
0
      sample_collectors[c].lz77_counts = lz77_counts[c];
3651
0
    }
3652
0
    ProcessImageArea<
3653
0
        ChannelRowProcessor<ChunkSampleCollector<BitDepth>, BitDepth>>(
3654
0
        rgba, x0, y0, xs, 1, 1 + row_count, row_stride, bitdepth, nb_chans,
3655
0
        big_endian, row_sample_collectors);
3656
0
  }
3657
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::CollectSamples<AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::CollectSamples<AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::CollectSamples<AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::CollectSamples<AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::CollectSamples<default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::CollectSamples<default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::CollectSamples<default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::CollectSamples<default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, short const*)
3658
3659
// Writes the DC-global section for a palettized image: signals the palette
// transform in the modular header, then encodes the palette entries
// themselves as four 8-bit channels (one per byte of the packed RGBA color),
// each predicted from its left neighbor / previous channel.
void PrepareDCGlobalPalette(bool is_single_group, size_t width, size_t height,
                            size_t nb_chans, const PrefixCode code[4],
                            const std::vector<uint32_t>& palette,
                            size_t pcolors, BitWriter* output) {
  PrepareDCGlobalCommon(is_single_group, width, height, code, output);
  output->Write(2, 0b01);     // 1 transform
  output->Write(2, 0b01);     // Palette
  output->Write(5, 0b00000);  // Starting from ch 0
  if (nb_chans == 1) {
    output->Write(2, 0b00);  // 1-channel palette (Gray)
  } else if (nb_chans == 3) {
    output->Write(2, 0b01);  // 3-channel palette (RGB)
  } else if (nb_chans == 4) {
    output->Write(2, 0b10);  // 4-channel palette (RGBA)
  } else {
    output->Write(2, 0b11);       // explicit channel count follows
    output->Write(13, nb_chans - 1);
  }
  // pcolors <= kMaxColors + kChunkSize - 1
  static_assert(kMaxColors + kChunkSize < 1281,
                "add code to signal larger palette sizes");
  // Palette size is signaled with a 2-bit selector + varint-style payload.
  if (pcolors < 256) {
    output->Write(2, 0b00);
    output->Write(8, pcolors);
  } else {
    output->Write(2, 0b01);
    output->Write(10, pcolors - 256);
  }

  output->Write(2, 0b00);  // nb_deltas == 0
  output->Write(4, 0);     // Zero predictor for delta palette
  // Encode palette
  ChunkEncoder<UpTo8Bits> encoder;
  ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits> row_encoder;
  row_encoder.t = &encoder;
  encoder.output = output;
  encoder.code = &code[0];
  encoder.PrepareForSimd();
  // p[c] holds byte c of every palette color, with 16 samples of left
  // padding (index 15 is the "left of first sample" context slot).
  std::vector<std::array<int16_t, 32 + 1024>> p(4);
  size_t i = 0;
  // Entries are stored starting at 16 + 1: slot 16 stays zero for the
  // implicit all-zero palette color.
  // NOTE(review): ProcessRow below encodes `pcolors` samples from offset 16,
  // which presumes `pcolors` already counts the implicit zero entry —
  // confirm against the caller that seeds pcolors = 1.
  size_t have_zero = 1;
  for (; i < pcolors; i++) {
    p[0][16 + i + have_zero] = palette[i] & 0xFF;
    p[1][16 + i + have_zero] = (palette[i] >> 8) & 0xFF;
    p[2][16 + i + have_zero] = (palette[i] >> 16) & 0xFF;
    p[3][16 + i + have_zero] = (palette[i] >> 24) & 0xFF;
  }
  // Channel 0: predicted from its own left neighbor (left == top == topleft).
  p[0][15] = 0;
  row_encoder.ProcessRow(p[0].data() + 16, p[0].data() + 15, p[0].data() + 15,
                         p[0].data() + 15, pcolors);
  // Channels 1..3: each row uses the previous channel's row as "top" context,
  // so the [15] context slots must be patched between calls — the order of
  // these assignments is load-bearing.
  p[1][15] = p[0][16];
  p[0][15] = p[0][16];
  if (nb_chans > 1) {
    row_encoder.ProcessRow(p[1].data() + 16, p[1].data() + 15, p[0].data() + 16,
                           p[0].data() + 15, pcolors);
  }
  p[2][15] = p[1][16];
  p[1][15] = p[1][16];
  if (nb_chans > 2) {
    row_encoder.ProcessRow(p[2].data() + 16, p[2].data() + 15, p[1].data() + 16,
                           p[1].data() + 15, pcolors);
  }
  p[3][15] = p[2][16];
  p[2][15] = p[2][16];
  if (nb_chans > 3) {
    row_encoder.ProcessRow(p[3].data() + 16, p[3].data() + 15, p[2].data() + 16,
                           p[2].data() + 15, pcolors);
  }
  row_encoder.Finalize();

  // Multi-group streams are byte-aligned between sections.
  if (!is_single_group) {
    output->ZeroPadToByte();
  }
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::PrepareDCGlobalPalette(bool, unsigned long, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> > const&, unsigned long, (anonymous namespace)::BitWriter*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::PrepareDCGlobalPalette(bool, unsigned long, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> > const&, unsigned long, (anonymous namespace)::BitWriter*)
3733
3734
template <size_t nb_chans>
3735
bool detect_palette(const unsigned char* r, size_t width,
3736
0
                    std::vector<uint32_t>& palette) {
3737
0
  size_t x = 0;
3738
0
  bool collided = false;
3739
  // this is just an unrolling of the next loop
3740
0
  size_t look_ahead = 7 + ((nb_chans == 1) ? 3 : ((nb_chans < 4) ? 1 : 0));
3741
0
  for (; x + look_ahead < width; x += 8) {
3742
0
    uint32_t p[8] = {}, index[8];
3743
0
    for (int i = 0; i < 8; i++) {
3744
0
      for (int j = 0; j < 4; ++j) {
3745
0
        p[i] |= r[(x + i) * nb_chans + j] << (8 * j);
3746
0
      }
3747
0
    }
3748
0
    for (int i = 0; i < 8; i++) p[i] &= ((1llu << (8 * nb_chans)) - 1);
3749
0
    for (int i = 0; i < 8; i++) index[i] = pixel_hash(p[i]);
3750
0
    for (int i = 0; i < 8; i++) {
3751
0
      collided |= (palette[index[i]] != 0 && p[i] != palette[index[i]]);
3752
0
    }
3753
0
    for (int i = 0; i < 8; i++) palette[index[i]] = p[i];
3754
0
  }
3755
0
  for (; x < width; x++) {
3756
0
    uint32_t p = 0;
3757
0
    for (size_t i = 0; i < nb_chans; ++i) {
3758
0
      p |= r[x * nb_chans + i] << (8 * i);
3759
0
    }
3760
0
    uint32_t index = pixel_hash(p);
3761
0
    collided |= (palette[index] != 0 && p != palette[index]);
3762
0
    palette[index] = p;
3763
0
  }
3764
0
  return collided;
3765
0
}
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX2::(anonymous namespace)::detect_palette<1ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX2::(anonymous namespace)::detect_palette<2ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX2::(anonymous namespace)::detect_palette<3ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX2::(anonymous namespace)::detect_palette<4ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool default_implementation::(anonymous namespace)::detect_palette<1ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool default_implementation::(anonymous namespace)::detect_palette<2ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool default_implementation::(anonymous namespace)::detect_palette<3ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool default_implementation::(anonymous namespace)::detect_palette<4ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
3766
3767
// Analysis pass of the fast-lossless encoder: scans the input image, decides
// whether a palette can be used, collects symbol statistics from (a sample of)
// the pixels, builds the per-channel prefix codes and allocates/fills a new
// JxlFastLosslessFrameState describing the frame layout. The caller owns the
// returned pointer. `oneshot` indicates the full image is available up front;
// `effort` scales how many rows are sampled for the histograms.
template <typename BitDepth>
JxlFastLosslessFrameState* LLPrepare(JxlChunkedFrameInputSource input,
                                     size_t width, size_t height,
                                     BitDepth bitdepth, size_t nb_chans,
                                     bool big_endian, int effort, int oneshot) {
  assert(width != 0);
  assert(height != 0);

  // Count colors to try palette. Palette mode is only attempted for 8-bit,
  // oneshot input at effort >= 2; otherwise we start out "collided" (i.e.
  // palette disabled).
  std::vector<uint32_t> palette(kHashSize);
  std::vector<int16_t> lookup(kHashSize);
  lookup[0] = 0;
  int pcolors = 0;
  bool collided = effort < 2 || bitdepth.bitdepth != 8 || !oneshot;
  // Walk the image in 256x256 tiles (the group size) and hash every pixel;
  // detect_palette reports a hash collision between two distinct colors,
  // which aborts palette mode.
  for (size_t y0 = 0; y0 < height && !collided; y0 += 256) {
    size_t ys = std::min<size_t>(height - y0, 256);
    for (size_t x0 = 0; x0 < width && !collided; x0 += 256) {
      size_t xs = std::min<size_t>(width - x0, 256);
      size_t stride;
      // TODO(szabadka): Add RAII wrapper around this.
      const void* buffer = input.get_color_channel_data_at(input.opaque, x0, y0,
                                                           xs, ys, &stride);
      auto rgba = reinterpret_cast<const unsigned char*>(buffer);
      for (size_t y = 0; y < ys && !collided; y++) {
        const unsigned char* r = rgba + stride * y;
        // Dispatch to the channel-count-templated scanner.
        if (nb_chans == 1) collided = detect_palette<1>(r, xs, palette);
        if (nb_chans == 2) collided = detect_palette<2>(r, xs, palette);
        if (nb_chans == 3) collided = detect_palette<3>(r, xs, palette);
        if (nb_chans == 4) collided = detect_palette<4>(r, xs, palette);
      }
      input.release_buffer(input.opaque, buffer);
    }
  }
  int nb_entries = 0;
  if (!collided) {
    pcolors = 1;  // always have all-zero as a palette color
    bool have_color = false;
    uint8_t minG = 255, maxG = 0;
    // Compact the hash table: gather used entries at the front, and track
    // whether the image is actually colored and the green-channel range.
    for (uint32_t k = 0; k < kHashSize; k++) {
      if (palette[k] == 0) continue;
      uint8_t p[4];
      for (int i = 0; i < 4; ++i) {
        p[i] = (palette[k] >> (8 * i)) & 0xFF;
      }
      // move entries to front so sort has less work
      palette[nb_entries] = palette[k];
      if (p[0] != p[1] || p[0] != p[2]) have_color = true;
      if (p[1] < minG) minG = p[1];
      if (p[1] > maxG) maxG = p[1];
      nb_entries++;
      // don't do palette if too many colors are needed
      if (nb_entries + pcolors > kMaxColors) {
        collided = true;
        break;
      }
    }
    if (!have_color) {
      // don't do palette if it's just grayscale without many holes
      if (maxG - minG < nb_entries * 1.4f) collided = true;
    }
  }
  if (!collided) {
    // Order palette entries by approximate alpha-weighted luma so that
    // neighboring indices have similar colors (smaller index deltas to code).
    std::sort(
        palette.begin(), palette.begin() + nb_entries,
        [&nb_chans](uint32_t ap, uint32_t bp) {
          // Zero (the reserved all-zero color) sorts last.
          if (ap == 0) return false;
          if (bp == 0) return true;
          uint8_t a[4], b[4];
          for (int i = 0; i < 4; ++i) {
            a[i] = (ap >> (8 * i)) & 0xFF;
            b[i] = (bp >> (8 * i)) & 0xFF;
          }
          float ay, by;
          if (nb_chans == 4) {
            ay = (0.299f * a[0] + 0.587f * a[1] + 0.114f * a[2] + 0.01f) * a[3];
            by = (0.299f * b[0] + 0.587f * b[1] + 0.114f * b[2] + 0.01f) * b[3];
          } else {
            ay = (0.299f * a[0] + 0.587f * a[1] + 0.114f * a[2] + 0.01f);
            by = (0.299f * b[0] + 0.587f * b[1] + 0.114f * b[2] + 0.01f);
          }
          return ay < by;  // sort on alpha*luma
        });
    // Build the color -> palette-index lookup table (index 0 stays reserved
    // for the all-zero color).
    for (int k = 0; k < nb_entries; k++) {
      if (palette[k] == 0) break;
      lookup[pixel_hash(palette[k])] = pcolors++;
    }
  }

  // Frame layout: 256x256 AC groups and 2048x2048 DC groups.
  size_t num_groups_x = (width + 255) / 256;
  size_t num_groups_y = (height + 255) / 256;
  size_t num_dc_groups_x = (width + 2047) / 2048;
  size_t num_dc_groups_y = (height + 2047) / 2048;

  // Per-channel symbol histograms used to build the prefix codes.
  uint64_t raw_counts[4][kNumRawSymbols] = {};
  uint64_t lz77_counts[4][kNumLZ77] = {};

  bool onegroup = num_groups_x == 1 && num_groups_y == 1;

  // Collect symbol statistics from up to `num_rows` rows taken from the
  // vertical middle of group (xg, yg).
  auto sample_rows = [&](size_t xg, size_t yg, size_t num_rows) {
    size_t y0 = yg * 256;
    size_t x0 = xg * 256;
    size_t ys = std::min<size_t>(height - y0, 256);
    size_t xs = std::min<size_t>(width - x0, 256);
    size_t stride;
    const void* buffer =
        input.get_color_channel_data_at(input.opaque, x0, y0, xs, ys, &stride);
    auto rgba = reinterpret_cast<const unsigned char*>(buffer);
    // Center the sampled band vertically within the group.
    int y_begin_group =
        std::max<ssize_t>(
            0, static_cast<ssize_t>(ys) - static_cast<ssize_t>(num_rows)) /
        2;
    int y_count = std::min<int>(num_rows, ys - y_begin_group);
    // Only sample whole chunks of kChunkSize pixels.
    int x_max = xs / kChunkSize * kChunkSize;
    CollectSamples(rgba, 0, y_begin_group, x_max, stride, y_count, raw_counts,
                   lz77_counts, onegroup, !collided, bitdepth, nb_chans,
                   big_endian, lookup.data());
    input.release_buffer(input.opaque, buffer);
  };

  // TODO(veluca): that `64` is an arbitrary constant, meant to correspond to
  // the point where the number of processed rows is large enough that loading
  // the entire image is cost-effective.
  if (oneshot || effort >= 64) {
    // Sample (a fraction of) every group.
    for (size_t g = 0; g < num_groups_y * num_groups_x; g++) {
      size_t xg = g % num_groups_x;
      size_t yg = g / num_groups_x;
      size_t y0 = yg * 256;
      size_t ys = std::min<size_t>(height - y0, 256);
      size_t num_rows = 2 * effort * ys / 256;
      sample_rows(xg, yg, num_rows);
    }
  } else {
    // sample the middle (effort * 2 * num_groups) rows of the center group
    // (possibly all of them).
    sample_rows((num_groups_x - 1) / 2, (num_groups_y - 1) / 2,
                2 * effort * num_groups_x * num_groups_y);
  }

  // Prior counts mixed into the sampled histograms so every symbol that may
  // occur gets a code.
  // TODO(veluca): can probably improve this and make it bitdepth-dependent.
  uint64_t base_raw_counts[kNumRawSymbols] = {
      3843, 852, 1270, 1214, 1014, 727, 481, 300, 159, 51,
      5,    1,   1,    1,    1,    1,   1,   1,   1};

  bool doing_ycocg = nb_chans > 2 && collided;
  bool large_palette = !collided || pcolors >= 256;
  // Zero out the prior for symbols the chosen mode can never emit.
  for (size_t i = bitdepth.NumSymbols(doing_ycocg || large_palette);
       i < kNumRawSymbols; i++) {
    base_raw_counts[i] = 0;
  }

  // Weight sampled counts (<< 8) against the base prior.
  for (size_t c = 0; c < 4; c++) {
    for (size_t i = 0; i < kNumRawSymbols; i++) {
      raw_counts[c][i] = (raw_counts[c][i] << 8) + base_raw_counts[i];
    }
  }

  if (!collided) {
    unsigned token, nbits, bits;
    EncodeHybridUint000(PackSigned(pcolors - 1), &token, &nbits, &bits);
    // ensure all palette indices can actually be encoded
    for (size_t i = 0; i < token + 1; i++)
      raw_counts[0][i] = std::max<uint64_t>(raw_counts[0][i], 1);
    // these tokens are only used for the palette itself so they can get a bad
    // code
    for (size_t i = token + 1; i < 10; i++) raw_counts[0][i] = 1;
  }

  // Prior for the LZ77 length symbols, mixed in the same way as above.
  uint64_t base_lz77_counts[kNumLZ77] = {
      29, 27, 25,  23, 21, 21, 19, 18, 21, 17, 16, 15, 15, 14,
      13, 13, 137, 98, 61, 34, 1,  1,  1,  1,  1,  1,  1,  1,
  };

  for (size_t c = 0; c < 4; c++) {
    for (size_t i = 0; i < kNumLZ77; i++) {
      lz77_counts[c][i] = (lz77_counts[c][i] << 8) + base_lz77_counts[i];
    }
  }

  // Build the frame state; ownership passes to the caller.
  JxlFastLosslessFrameState* frame_state = new JxlFastLosslessFrameState();
  for (size_t i = 0; i < 4; i++) {
    frame_state->hcode[i] = PrefixCode(bitdepth, raw_counts[i], lz77_counts[i]);
  }

  // Section count: single-group frames use one section; otherwise there are
  // two global sections plus the DC and AC groups.
  size_t num_dc_groups = num_dc_groups_x * num_dc_groups_y;
  size_t num_ac_groups = num_groups_x * num_groups_y;
  size_t num_groups = onegroup ? 1 : (2 + num_dc_groups + num_ac_groups);
  frame_state->input = input;
  frame_state->width = width;
  frame_state->height = height;
  frame_state->num_groups_x = num_groups_x;
  frame_state->num_groups_y = num_groups_y;
  frame_state->num_dc_groups_x = num_dc_groups_x;
  frame_state->num_dc_groups_y = num_dc_groups_y;
  frame_state->nb_chans = nb_chans;
  frame_state->bitdepth = bitdepth.bitdepth;
  frame_state->big_endian = big_endian;
  frame_state->effort = effort;
  frame_state->collided = collided;
  frame_state->lookup = lookup;

  frame_state->group_data = std::vector<std::array<BitWriter, 4>>(num_groups);
  frame_state->group_sizes.resize(num_groups);
  // Write the DC-global section now (with or without the palette).
  if (collided) {
    PrepareDCGlobal(onegroup, width, height, nb_chans, frame_state->hcode,
                    &frame_state->group_data[0][0]);
  } else {
    PrepareDCGlobalPalette(onegroup, width, height, nb_chans,
                           frame_state->hcode, palette, pcolors,
                           &frame_state->group_data[0][0]);
  }
  frame_state->group_sizes[0] = SectionSize(frame_state->group_data[0]);
  if (!onegroup) {
    // Reserve space so AC group data can be written before the headers are
    // finalized (used by the streaming path in LLProcess).
    ComputeAcGroupDataOffset(frame_state->group_sizes[0], num_dc_groups,
                             num_ac_groups, frame_state->min_dc_global_size,
                             frame_state->ac_group_data_offset);
  }

  return frame_state;
}
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)
3986
3987
template <typename BitDepth>
3988
jxl::Status LLProcess(JxlFastLosslessFrameState* frame_state, bool is_last,
3989
                      BitDepth bitdepth, void* runner_opaque,
3990
                      FJxlParallelRunner runner,
3991
0
                      JxlEncoderOutputProcessorWrapper* output_processor) {
3992
0
#if !FJXL_STANDALONE
3993
0
  if (frame_state->process_done) {
3994
0
    JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/0, is_last);
3995
0
    if (output_processor) {
3996
0
      JXL_RETURN_IF_ERROR(
3997
0
          JxlFastLosslessOutputFrame(frame_state, output_processor));
3998
0
    }
3999
0
    return true;
4000
0
  }
4001
0
#endif
4002
  // The maximum number of groups that we process concurrently here.
4003
  // TODO(szabadka) Use the number of threads or some outside parameter for the
4004
  // maximum memory usage instead.
4005
0
  constexpr size_t kMaxLocalGroups = 16;
4006
0
  bool onegroup = frame_state->group_sizes.size() == 1;
4007
0
  bool streaming = !onegroup && output_processor;
4008
0
  size_t total_groups = frame_state->num_groups_x * frame_state->num_groups_y;
4009
0
  size_t max_groups = streaming ? kMaxLocalGroups : total_groups;
4010
0
#if !FJXL_STANDALONE
4011
0
  size_t start_pos = 0;
4012
0
  if (streaming) {
4013
0
    start_pos = output_processor->CurrentPosition();
4014
0
    JXL_RETURN_IF_ERROR(
4015
0
        output_processor->Seek(start_pos + frame_state->ac_group_data_offset));
4016
0
  }
4017
0
#endif
4018
0
  for (size_t offset = 0; offset < total_groups; offset += max_groups) {
4019
0
    size_t num_groups = std::min(max_groups, total_groups - offset);
4020
0
    JxlFastLosslessFrameState local_frame_state;
4021
0
    if (streaming) {
4022
0
      local_frame_state.group_data =
4023
0
          std::vector<std::array<BitWriter, 4>>(num_groups);
4024
0
    }
4025
0
    auto run_one = [&](size_t i) {
4026
0
      size_t g = offset + i;
4027
0
      size_t xg = g % frame_state->num_groups_x;
4028
0
      size_t yg = g / frame_state->num_groups_x;
4029
0
      size_t num_dc_groups =
4030
0
          frame_state->num_dc_groups_x * frame_state->num_dc_groups_y;
4031
0
      size_t group_id = onegroup ? 0 : (2 + num_dc_groups + g);
4032
0
      size_t xs = std::min<size_t>(frame_state->width - xg * 256, 256);
4033
0
      size_t ys = std::min<size_t>(frame_state->height - yg * 256, 256);
4034
0
      size_t x0 = xg * 256;
4035
0
      size_t y0 = yg * 256;
4036
0
      size_t stride;
4037
0
      JxlChunkedFrameInputSource input = frame_state->input;
4038
0
      const void* buffer = input.get_color_channel_data_at(input.opaque, x0, y0,
4039
0
                                                           xs, ys, &stride);
4040
0
      const unsigned char* rgba =
4041
0
          reinterpret_cast<const unsigned char*>(buffer);
4042
4043
0
      auto& gd = streaming ? local_frame_state.group_data[i]
4044
0
                           : frame_state->group_data[group_id];
4045
0
      if (frame_state->collided) {
4046
0
        WriteACSection(rgba, 0, 0, xs, ys, stride, onegroup, bitdepth,
4047
0
                       frame_state->nb_chans, frame_state->big_endian,
4048
0
                       frame_state->hcode, gd);
4049
0
      } else {
4050
0
        WriteACSectionPalette(rgba, 0, 0, xs, ys, stride, onegroup,
4051
0
                              frame_state->hcode, frame_state->lookup.data(),
4052
0
                              frame_state->nb_chans, gd[0]);
4053
0
      }
4054
0
      frame_state->group_sizes[group_id] = SectionSize(gd);
4055
0
      input.release_buffer(input.opaque, buffer);
4056
0
    };
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
4057
0
    runner(
4058
0
        runner_opaque, &run_one,
4059
0
        +[](void* r, size_t i) {
4060
0
          (*reinterpret_cast<decltype(&run_one)>(r))(i);
4061
0
        },
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
4062
0
        num_groups);
4063
0
#if !FJXL_STANDALONE
4064
0
    if (streaming) {
4065
0
      local_frame_state.nb_chans = frame_state->nb_chans;
4066
0
      local_frame_state.current_bit_writer = 1;
4067
0
      JXL_RETURN_IF_ERROR(
4068
0
          JxlFastLosslessOutputFrame(&local_frame_state, output_processor));
4069
0
    }
4070
0
#endif
4071
0
  }
4072
0
#if !FJXL_STANDALONE
4073
0
  if (streaming) {
4074
0
    size_t end_pos = output_processor->CurrentPosition();
4075
0
    JXL_RETURN_IF_ERROR(output_processor->Seek(start_pos));
4076
0
    frame_state->group_data.resize(1);
4077
0
    bool have_alpha = frame_state->nb_chans == 2 || frame_state->nb_chans == 4;
4078
0
    size_t padding = ComputeDcGlobalPadding(
4079
0
        frame_state->group_sizes, frame_state->ac_group_data_offset,
4080
0
        frame_state->min_dc_global_size, have_alpha, is_last);
4081
4082
0
    for (size_t i = 0; i < padding; ++i) {
4083
0
      frame_state->group_data[0][0].Write(8, 0);
4084
0
    }
4085
0
    frame_state->group_sizes[0] += padding;
4086
0
    JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/0, is_last);
4087
0
    assert(frame_state->ac_group_data_offset ==
4088
0
           JxlFastLosslessOutputSize(frame_state));
4089
0
    JXL_RETURN_IF_ERROR(
4090
0
        JxlFastLosslessOutputHeaders(frame_state, output_processor));
4091
0
    JXL_RETURN_IF_ERROR(output_processor->Seek(end_pos));
4092
0
  } else if (output_processor) {
4093
0
    assert(onegroup);
4094
0
    JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/0, is_last);
4095
0
    if (output_processor) {
4096
0
      JXL_RETURN_IF_ERROR(
4097
0
          JxlFastLosslessOutputFrame(frame_state, output_processor));
4098
0
    }
4099
0
  }
4100
0
  frame_state->process_done = true;
4101
0
#endif
4102
0
  return true;
4103
0
}
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
4104
4105
JxlFastLosslessFrameState* JxlFastLosslessPrepareImpl(
4106
    JxlChunkedFrameInputSource input, size_t width, size_t height,
4107
    size_t nb_chans, size_t bitdepth, bool big_endian, int effort,
4108
0
    int oneshot) {
4109
0
  assert(bitdepth > 0);
4110
0
  assert(nb_chans <= 4);
4111
0
  assert(nb_chans != 0);
4112
0
  if (bitdepth <= 8) {
4113
0
    return LLPrepare(input, width, height, UpTo8Bits(bitdepth), nb_chans,
4114
0
                     big_endian, effort, oneshot);
4115
0
  }
4116
0
  if (bitdepth <= 13) {
4117
0
    return LLPrepare(input, width, height, From9To13Bits(bitdepth), nb_chans,
4118
0
                     big_endian, effort, oneshot);
4119
0
  }
4120
0
  if (bitdepth == 14) {
4121
0
    return LLPrepare(input, width, height, Exactly14Bits(bitdepth), nb_chans,
4122
0
                     big_endian, effort, oneshot);
4123
0
  }
4124
0
  return LLPrepare(input, width, height, MoreThan14Bits(bitdepth), nb_chans,
4125
0
                   big_endian, effort, oneshot);
4126
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::JxlFastLosslessPrepareImpl(JxlChunkedFrameInputSource, unsigned long, unsigned long, unsigned long, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::JxlFastLosslessPrepareImpl(JxlChunkedFrameInputSource, unsigned long, unsigned long, unsigned long, unsigned long, bool, int, int)
4127
4128
jxl::Status JxlFastLosslessProcessFrameImpl(
4129
    JxlFastLosslessFrameState* frame_state, bool is_last, void* runner_opaque,
4130
    FJxlParallelRunner runner,
4131
0
    JxlEncoderOutputProcessorWrapper* output_processor) {
4132
0
  const size_t bitdepth = frame_state->bitdepth;
4133
0
  if (bitdepth <= 8) {
4134
0
    JXL_RETURN_IF_ERROR(LLProcess(frame_state, is_last, UpTo8Bits(bitdepth),
4135
0
                                  runner_opaque, runner, output_processor));
4136
0
  } else if (bitdepth <= 13) {
4137
0
    JXL_RETURN_IF_ERROR(LLProcess(frame_state, is_last, From9To13Bits(bitdepth),
4138
0
                                  runner_opaque, runner, output_processor));
4139
0
  } else if (bitdepth == 14) {
4140
0
    JXL_RETURN_IF_ERROR(LLProcess(frame_state, is_last, Exactly14Bits(bitdepth),
4141
0
                                  runner_opaque, runner, output_processor));
4142
0
  } else {
4143
0
    JXL_RETURN_IF_ERROR(LLProcess(frame_state, is_last,
4144
0
                                  MoreThan14Bits(bitdepth), runner_opaque,
4145
0
                                  runner, output_processor));
4146
0
  }
4147
0
  return true;
4148
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::JxlFastLosslessProcessFrameImpl(JxlFastLosslessFrameState*, bool, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::JxlFastLosslessProcessFrameImpl(JxlFastLosslessFrameState*, bool, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
4149
4150
}  // namespace
4151
4152
#endif  // FJXL_SELF_INCLUDE
4153
4154
#ifndef FJXL_SELF_INCLUDE
4155
4156
#define FJXL_SELF_INCLUDE
4157
4158
// If we have NEON enabled, it is the default target.
4159
#if FJXL_ENABLE_NEON
4160
4161
namespace default_implementation {
4162
#define FJXL_NEON
4163
#include "lib/jxl/enc_fast_lossless.cc"
4164
#undef FJXL_NEON
4165
}  // namespace default_implementation
4166
4167
#else                                    // FJXL_ENABLE_NEON
4168
4169
namespace default_implementation {
4170
#include "lib/jxl/enc_fast_lossless.cc"  // NOLINT
4171
}
4172
4173
#if FJXL_ENABLE_AVX2
4174
#ifdef __clang__
4175
#pragma clang attribute push(__attribute__((target("avx,avx2"))), \
4176
                             apply_to = function)
4177
// Causes spurious warnings on clang5.
4178
#pragma clang diagnostic push
4179
#pragma clang diagnostic ignored "-Wmissing-braces"
4180
#elif defined(__GNUC__)
4181
#pragma GCC push_options
4182
// Seems to cause spurious errors on GCC8.
4183
#pragma GCC diagnostic ignored "-Wpsabi"
4184
#pragma GCC target "avx,avx2"
4185
#endif
4186
4187
namespace AVX2 {
4188
#define FJXL_AVX2
4189
#include "lib/jxl/enc_fast_lossless.cc"  // NOLINT
4190
#undef FJXL_AVX2
4191
}  // namespace AVX2
4192
4193
#ifdef __clang__
4194
#pragma clang attribute pop
4195
#pragma clang diagnostic pop
4196
#elif defined(__GNUC__)
4197
#pragma GCC pop_options
4198
#endif
4199
#endif  // FJXL_ENABLE_AVX2
4200
4201
#if FJXL_ENABLE_AVX512
4202
#ifdef __clang__
4203
#pragma clang attribute push(                                                 \
4204
    __attribute__((target("avx512cd,avx512bw,avx512vl,avx512f,avx512vbmi"))), \
4205
    apply_to = function)
4206
#elif defined(__GNUC__)
4207
#pragma GCC push_options
4208
#pragma GCC target "avx512cd,avx512bw,avx512vl,avx512f,avx512vbmi"
4209
#endif
4210
4211
namespace AVX512 {
4212
#define FJXL_AVX512
4213
#include "lib/jxl/enc_fast_lossless.cc"
4214
#undef FJXL_AVX512
4215
}  // namespace AVX512
4216
4217
#ifdef __clang__
4218
#pragma clang attribute pop
4219
#elif defined(__GNUC__)
4220
#pragma GCC pop_options
4221
#endif
4222
#endif  // FJXL_ENABLE_AVX512
4223
4224
#endif
4225
4226
extern "C" {
4227
4228
#if FJXL_STANDALONE
4229
class FJxlFrameInput {
4230
 public:
4231
  FJxlFrameInput(const unsigned char* rgba, size_t row_stride, size_t nb_chans,
4232
                 size_t bitdepth)
4233
      : rgba_(rgba),
4234
        row_stride_(row_stride),
4235
        bytes_per_pixel_(bitdepth <= 8 ? nb_chans : 2 * nb_chans) {}
4236
4237
  JxlChunkedFrameInputSource GetInputSource() {
4238
    return JxlChunkedFrameInputSource{this, GetDataAt,
4239
                                      [](void*, const void*) {}};
4240
  }
4241
4242
 private:
4243
  static const void* GetDataAt(void* opaque, size_t xpos, size_t ypos,
4244
                               size_t xsize, size_t ysize, size_t* row_offset) {
4245
    FJxlFrameInput* self = static_cast<FJxlFrameInput*>(opaque);
4246
    *row_offset = self->row_stride_;
4247
    return self->rgba_ + ypos * (*row_offset) + xpos * self->bytes_per_pixel_;
4248
  }
4249
4250
  const uint8_t* rgba_;
4251
  size_t row_stride_;
4252
  size_t bytes_per_pixel_;
4253
};
4254
4255
size_t JxlFastLosslessEncode(const unsigned char* rgba, size_t width,
4256
                             size_t row_stride, size_t height, size_t nb_chans,
4257
                             size_t bitdepth, bool big_endian, int effort,
4258
                             unsigned char** output, void* runner_opaque,
4259
                             FJxlParallelRunner runner) {
4260
  FJxlFrameInput input(rgba, row_stride, nb_chans, bitdepth);
4261
  auto* frame_state = JxlFastLosslessPrepareFrame(
4262
      input.GetInputSource(), width, height, nb_chans, bitdepth, big_endian,
4263
      effort, /*oneshot=*/true);
4264
  if (!JxlFastLosslessProcessFrame(frame_state, /*is_last=*/true, runner_opaque,
4265
                                   runner, nullptr)) {
4266
    return 0;
4267
  }
4268
  JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/1,
4269
                               /*is_last=*/1);
4270
  size_t output_size = JxlFastLosslessMaxRequiredOutput(frame_state);
4271
  *output = (unsigned char*)malloc(output_size);
4272
  size_t written = 0;
4273
  size_t total = 0;
4274
  while ((written = JxlFastLosslessWriteOutput(frame_state, *output + total,
4275
                                               output_size - total)) != 0) {
4276
    total += written;
4277
  }
4278
  JxlFastLosslessFreeFrameState(frame_state);
4279
  return total;
4280
}
4281
#endif
4282
4283
JxlFastLosslessFrameState* JxlFastLosslessPrepareFrame(
4284
    JxlChunkedFrameInputSource input, size_t width, size_t height,
4285
    size_t nb_chans, size_t bitdepth, bool big_endian, int effort,
4286
0
    int oneshot) {
4287
#if FJXL_ENABLE_AVX512
4288
  if (HasCpuFeature(CpuFeature::kAVX512CD) &&
4289
      HasCpuFeature(CpuFeature::kVBMI) &&
4290
      HasCpuFeature(CpuFeature::kAVX512BW) &&
4291
      HasCpuFeature(CpuFeature::kAVX512F) &&
4292
      HasCpuFeature(CpuFeature::kAVX512VL)) {
4293
    return AVX512::JxlFastLosslessPrepareImpl(
4294
        input, width, height, nb_chans, bitdepth, big_endian, effort, oneshot);
4295
  }
4296
#endif
4297
0
#if FJXL_ENABLE_AVX2
4298
0
  if (HasCpuFeature(CpuFeature::kAVX2)) {
4299
0
    return AVX2::JxlFastLosslessPrepareImpl(
4300
0
        input, width, height, nb_chans, bitdepth, big_endian, effort, oneshot);
4301
0
  }
4302
0
#endif
4303
4304
0
  return default_implementation::JxlFastLosslessPrepareImpl(
4305
0
      input, width, height, nb_chans, bitdepth, big_endian, effort, oneshot);
4306
0
}
4307
4308
bool JxlFastLosslessProcessFrame(
4309
    JxlFastLosslessFrameState* frame_state, bool is_last, void* runner_opaque,
4310
    FJxlParallelRunner runner,
4311
0
    JxlEncoderOutputProcessorWrapper* output_processor) {
4312
0
  auto trivial_runner =
4313
0
      +[](void*, void* opaque, void fun(void*, size_t), size_t count) {
4314
0
        for (size_t i = 0; i < count; i++) {
4315
0
          fun(opaque, i);
4316
0
        }
4317
0
      };
4318
4319
0
  if (runner == nullptr) {
4320
0
    runner = trivial_runner;
4321
0
  }
4322
4323
#if FJXL_ENABLE_AVX512
4324
  if (HasCpuFeature(CpuFeature::kAVX512CD) &&
4325
      HasCpuFeature(CpuFeature::kVBMI) &&
4326
      HasCpuFeature(CpuFeature::kAVX512BW) &&
4327
      HasCpuFeature(CpuFeature::kAVX512F) &&
4328
      HasCpuFeature(CpuFeature::kAVX512VL)) {
4329
    JXL_RETURN_IF_ERROR(AVX512::JxlFastLosslessProcessFrameImpl(
4330
        frame_state, is_last, runner_opaque, runner, output_processor));
4331
    return true;
4332
  }
4333
#endif
4334
0
#if FJXL_ENABLE_AVX2
4335
0
  if (HasCpuFeature(CpuFeature::kAVX2)) {
4336
0
    JXL_RETURN_IF_ERROR(AVX2::JxlFastLosslessProcessFrameImpl(
4337
0
        frame_state, is_last, runner_opaque, runner, output_processor));
4338
0
    return true;
4339
0
  }
4340
0
#endif
4341
4342
0
  JXL_RETURN_IF_ERROR(default_implementation::JxlFastLosslessProcessFrameImpl(
4343
0
      frame_state, is_last, runner_opaque, runner, output_processor));
4344
0
  return true;
4345
0
}
4346
4347
}  // extern "C"
4348
4349
#if !FJXL_STANDALONE
4350
bool JxlFastLosslessOutputFrame(
4351
    JxlFastLosslessFrameState* frame_state,
4352
0
    JxlEncoderOutputProcessorWrapper* output_processor) {
4353
0
  size_t fl_size = JxlFastLosslessOutputSize(frame_state);
4354
0
  size_t written = 0;
4355
0
  while (written < fl_size) {
4356
0
    JXL_ASSIGN_OR_RETURN(auto buffer,
4357
0
                         output_processor->GetBuffer(32, fl_size - written));
4358
0
    size_t n =
4359
0
        JxlFastLosslessWriteOutput(frame_state, buffer.data(), buffer.size());
4360
0
    if (n == 0) break;
4361
0
    JXL_RETURN_IF_ERROR(buffer.advance(n));
4362
0
    written += n;
4363
0
  };
4364
0
  return true;
4365
0
}
4366
#endif
4367
4368
#endif  // FJXL_SELF_INCLUDE