Coverage Report

Created: 2025-10-12 07:48

/src/libjxl/lib/jxl/enc_fast_lossless.cc
 Line | Count | Source
    1 |       | // Copyright (c) the JPEG XL Project Authors. All rights reserved.
    2 |       | //
    3 |       | // Use of this source code is governed by a BSD-style
    4 |       | // license that can be found in the LICENSE file.
    5 |       |
    6 |       | #include "lib/jxl/base/status.h"
    7 |       | #ifndef FJXL_SELF_INCLUDE
    8 |       |
    9 |       | #include <assert.h>
   10 |       |
   11 |       | #include <algorithm>
   12 |       | #include <array>
   13 |       | #include <cstdint>
   14 |       | #include <cstdlib>
   15 |       | #include <cstring>
   16 |       | #include <limits>
   17 |       | #include <memory>
   18 |       | #include <vector>
   19 |       |
   20 |       | #include "lib/jxl/enc_fast_lossless.h"
   21 |       |
   22 |       | #if FJXL_STANDALONE
   23 |       | #if defined(_MSC_VER)
   24 |       | using ssize_t = intptr_t;
   25 |       | #endif
   26 |       | #else  // FJXL_STANDALONE
   27 |       | #include "lib/jxl/encode_internal.h"
   28 |       | #endif  // FJXL_STANDALONE
   29 |       |
   30 |       | #if defined(__x86_64__) || defined(_M_X64)
   31 |       | #define FJXL_ARCH_IS_X86_64 1
   32 |       | #else
   33 |       | #define FJXL_ARCH_IS_X86_64 0
   34 |       | #endif
   35 |       |
   36 |       | #if defined(__i386__) || defined(_M_IX86) || FJXL_ARCH_IS_X86_64
   37 |       | #define FJXL_ARCH_IS_X86 1
   38 |       | #else
   39 |       | #define FJXL_ARCH_IS_X86 0
   40 |       | #endif
   41 |       |
   42 |       | #if FJXL_ARCH_IS_X86
   43 |       | #if defined(_MSC_VER)
   44 |       | #include <intrin.h>
   45 |       | #else  // _MSC_VER
   46 |       | #include <cpuid.h>
   47 |       | #endif  // _MSC_VER
   48 |       | #endif  // FJXL_ARCH_IS_X86
   49 |       |
   50 |       | // Enable NEON and AVX2/AVX512 if not asked to do otherwise and the compilers
   51 |       | // support it.
   52 |       | #if defined(__aarch64__) || defined(_M_ARM64)  // ARCH
   53 |       | #include <arm_neon.h>
   54 |       |
   55 |       | #if !defined(FJXL_ENABLE_NEON)
   56 |       | #define FJXL_ENABLE_NEON 1
   57 |       | #endif  // !defined(FJXL_ENABLE_NEON)
   58 |       |
   59 |       | #elif FJXL_ARCH_IS_X86_64 && !defined(_MSC_VER)  // ARCH
   60 |       | #include <immintrin.h>
   61 |       |
   62 |       | // manually add _mm512_cvtsi512_si32 definition if missing
   63 |       | // (e.g. with Xcode on macOS Mojave)
   64 |       | // copied from gcc 11.1.0 include/avx512fintrin.h line 14367-14373
   65 |       | #if defined(__clang__) &&                                           \
   66 |       |     ((!defined(__apple_build_version__) && __clang_major__ < 10) || \
   67 |       |      (defined(__apple_build_version__) && __apple_build_version__ < 12000032))
   68 |       | inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   69 |       | _mm512_cvtsi512_si32(__m512i __A) {
   70 |       |   __v16si __B = (__v16si)__A;
   71 |       |   return __B[0];
   72 |       | }
   73 |       | #endif
   74 |       |
   75 |       | #if !defined(FJXL_ENABLE_AVX2)
   76 |       | #define FJXL_ENABLE_AVX2 1
   77 |       | #endif  // !defined(FJXL_ENABLE_AVX2)
   78 |       |
   79 |       | #if !defined(FJXL_ENABLE_AVX512)
   80 |       | // On clang-7 or earlier, and gcc-10 or earlier, AVX512 seems broken.
   81 |       | #if (defined(__clang__) &&                                             \
   82 |       |          (!defined(__apple_build_version__) && __clang_major__ > 7) || \
   83 |       |      (defined(__apple_build_version__) &&                              \
   84 |       |       __apple_build_version__ > 10010046)) ||                          \
   85 |       |     (defined(__GNUC__) && __GNUC__ > 10)
   86 |       | #define FJXL_ENABLE_AVX512 1
   87 |       | #endif
   88 |       | #endif  // !defined(FJXL_ENABLE_AVX512)
   89 |       |
   90 |       | #endif  // ARCH
   91 |       |
   92 |       | #ifndef FJXL_ENABLE_NEON
   93 |       | #define FJXL_ENABLE_NEON 0
   94 |       | #endif
   95 |       |
   96 |       | #ifndef FJXL_ENABLE_AVX2
   97 |       | #define FJXL_ENABLE_AVX2 0
   98 |       | #endif
   99 |       |
  100 |       | #ifndef FJXL_ENABLE_AVX512
  101 |       | #define FJXL_ENABLE_AVX512 0
  102 |       | #endif
  103 |       |
  104 |       | namespace {
  105 |       |
  106 |       | enum class CpuFeature : uint32_t {
  107 |       |   kAVX2 = 0,
  108 |       |
  109 |       |   kAVX512F,
  110 |       |   kAVX512VL,
  111 |       |   kAVX512CD,
  112 |       |   kAVX512BW,
  113 |       |
  114 |       |   kVBMI,
  115 |       |   kVBMI2
  116 |       | };
  117 |       |
  118 |     0 | constexpr uint32_t CpuFeatureBit(CpuFeature feature) {
  119 |     0 |   return 1u << static_cast<uint32_t>(feature);
  120 |     0 | }
  121 |       |
  122 |       | #if FJXL_ARCH_IS_X86
  123 |       | #if defined(_MSC_VER)
  124 |       | void Cpuid(const uint32_t level, const uint32_t count,
  125 |       |            std::array<uint32_t, 4>& abcd) {
  126 |       |   int regs[4];
  127 |       |   __cpuidex(regs, level, count);
  128 |       |   for (int i = 0; i < 4; ++i) {
  129 |       |     abcd[i] = regs[i];
  130 |       |   }
  131 |       | }
  132 |       | uint32_t ReadXCR0() { return static_cast<uint32_t>(_xgetbv(0)); }
  133 |       | #else   // _MSC_VER
  134 |       | void Cpuid(const uint32_t level, const uint32_t count,
  135 |     0 |            std::array<uint32_t, 4>& abcd) {
  136 |     0 |   uint32_t a;
  137 |     0 |   uint32_t b;
  138 |     0 |   uint32_t c;
  139 |     0 |   uint32_t d;
  140 |     0 |   __cpuid_count(level, count, a, b, c, d);
  141 |     0 |   abcd[0] = a;
  142 |     0 |   abcd[1] = b;
  143 |     0 |   abcd[2] = c;
  144 |     0 |   abcd[3] = d;
  145 |     0 | }
  146 |     0 | uint32_t ReadXCR0() {
  147 |     0 |   uint32_t xcr0;
  148 |     0 |   uint32_t xcr0_high;
  149 |     0 |   const uint32_t index = 0;
  150 |     0 |   asm volatile(".byte 0x0F, 0x01, 0xD0"
  151 |     0 |                : "=a"(xcr0), "=d"(xcr0_high)
  152 |     0 |                : "c"(index));
  153 |     0 |   return xcr0;
  154 |     0 | }
  155 |       | #endif  // _MSC_VER
  156 |       |
  157 |     0 | uint32_t DetectCpuFeatures() {
  158 |     0 |   uint32_t flags = 0;  // return value
  159 |     0 |   std::array<uint32_t, 4> abcd;
  160 |     0 |   Cpuid(0, 0, abcd);
  161 |     0 |   const uint32_t max_level = abcd[0];
  162 |       |
  163 |     0 |   const auto check_bit = [](uint32_t v, uint32_t idx) -> bool {
  164 |     0 |     return (v & (1U << idx)) != 0;
  165 |     0 |   };
  166 |       |
  167 |       |   // Extended features
  168 |     0 |   if (max_level >= 7) {
  169 |     0 |     Cpuid(7, 0, abcd);
  170 |     0 |     flags |= check_bit(abcd[1], 5) ? CpuFeatureBit(CpuFeature::kAVX2) : 0;
  171 |       |
  172 |     0 |     flags |= check_bit(abcd[1], 16) ? CpuFeatureBit(CpuFeature::kAVX512F) : 0;
  173 |     0 |     flags |= check_bit(abcd[1], 28) ? CpuFeatureBit(CpuFeature::kAVX512CD) : 0;
  174 |     0 |     flags |= check_bit(abcd[1], 30) ? CpuFeatureBit(CpuFeature::kAVX512BW) : 0;
  175 |     0 |     flags |= check_bit(abcd[1], 31) ? CpuFeatureBit(CpuFeature::kAVX512VL) : 0;
  176 |       |
  177 |     0 |     flags |= check_bit(abcd[2], 1) ? CpuFeatureBit(CpuFeature::kVBMI) : 0;
  178 |     0 |     flags |= check_bit(abcd[2], 6) ? CpuFeatureBit(CpuFeature::kVBMI2) : 0;
  179 |     0 |   }
  180 |       |
  181 |     0 |   Cpuid(1, 0, abcd);
  182 |     0 |   const bool os_has_xsave = check_bit(abcd[2], 27);
  183 |     0 |   if (os_has_xsave) {
  184 |     0 |     const uint32_t xcr0 = ReadXCR0();
  185 |     0 |     if (!check_bit(xcr0, 1) || !check_bit(xcr0, 2)) {
  186 |     0 |       flags = 0;
  187 |     0 |     } else if (!check_bit(xcr0, 5) || !check_bit(xcr0, 6) ||
  188 |     0 |                !check_bit(xcr0, 7)) {
  189 |       |       // No AVX-512; disable everything but AVX2 if present
  190 |     0 |       flags &= CpuFeatureBit(CpuFeature::kAVX2);
  191 |     0 |     }
  192 |     0 |   }
  193 |       |
  194 |     0 |   return flags;
  195 |     0 | }
  196 |       | #else   // FJXL_ARCH_IS_X86
  197 |       | uint32_t DetectCpuFeatures() { return 0; }
  198 |       | #endif  // FJXL_ARCH_IS_X86
  199 |       |
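Illustrative note (not part of the covered source): DetectCpuFeatures() above follows the standard x86 protocol — CPUID leaf 7 reports the AVX2/AVX-512/VBMI feature bits, CPUID leaf 1 bit 27 reports OSXSAVE, and reading XCR0 confirms that the OS actually saves the corresponding register state. A minimal standalone sketch of the same kind of check, assuming GCC or Clang on x86-64 built with -mxsave (HasAvx2 is a hypothetical helper, not part of libjxl):

    #include <cpuid.h>      // __cpuid_count
    #include <immintrin.h>  // _xgetbv
    #include <cstdint>

    bool HasAvx2() {
      uint32_t a, b, c, d;
      __cpuid_count(7, 0, a, b, c, d);
      bool cpu_has_avx2 = (b >> 5) & 1;        // leaf 7, EBX bit 5
      __cpuid_count(1, 0, a, b, c, d);
      if (((c >> 27) & 1) == 0) return false;  // no OSXSAVE
      uint32_t xcr0 = static_cast<uint32_t>(_xgetbv(0));
      return cpu_has_avx2 && ((xcr0 & 0x6) == 0x6);  // XMM+YMM state saved
    }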
  200 |       | #if defined(_MSC_VER)
  201 |       | #define FJXL_UNUSED
  202 |       | #else
  203 |       | #define FJXL_UNUSED __attribute__((unused))
  204 |       | #endif
  205 |       |
  206 |     0 | FJXL_UNUSED bool HasCpuFeature(CpuFeature feature) {
  207 |     0 |   static uint32_t cpu_features = DetectCpuFeatures();
  208 |     0 |   return (cpu_features & CpuFeatureBit(feature)) != 0;
  209 |     0 | }
  210 |       |
  211 |       | #if defined(_MSC_VER) && !defined(__clang__)
  212 |       | #define FJXL_INLINE __forceinline
  213 |       | FJXL_INLINE uint32_t FloorLog2(uint32_t v) {
  214 |       |   unsigned long index;
  215 |       |   _BitScanReverse(&index, v);
  216 |       |   return index;
  217 |       | }
  218 |       | FJXL_INLINE uint32_t CtzNonZero(uint64_t v) {
  219 |       |   unsigned long index;
  220 |       |   _BitScanForward(&index, v);
  221 |       |   return index;
  222 |       | }
  223 |       | #else
  224 |       | #define FJXL_INLINE inline __attribute__((always_inline))
  225 |     0 | FJXL_INLINE uint32_t FloorLog2(uint32_t v) {
  226 |     0 |   return v ? 31 - __builtin_clz(v) : 0;
  227 |     0 | }
  228 |     0 | FJXL_UNUSED FJXL_INLINE uint32_t CtzNonZero(uint64_t v) {
  229 |     0 |   return __builtin_ctzll(v);
  230 |     0 | }
  231 |       | #endif
  232 |       |
  233 |       | // Compiles to a memcpy on little-endian systems.
  234 |     0 | FJXL_INLINE void StoreLE64(uint8_t* tgt, uint64_t data) {
  235 |       | #if (!defined(__BYTE_ORDER__) || (__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__))
  236 |       |   for (int i = 0; i < 8; i++) {
  237 |       |     tgt[i] = (data >> (i * 8)) & 0xFF;
  238 |       |   }
  239 |       | #else
  240 |     0 |   memcpy(tgt, &data, 8);
  241 |     0 | #endif
  242 |     0 | }
  243 |       |
  244 |       | FJXL_INLINE size_t AddBits(uint32_t count, uint64_t bits, uint8_t* data_buf,
  245 |     0 |                            size_t& bits_in_buffer, uint64_t& bit_buffer) {
  246 |     0 |   bit_buffer |= bits << bits_in_buffer;
  247 |     0 |   bits_in_buffer += count;
  248 |     0 |   StoreLE64(data_buf, bit_buffer);
  249 |     0 |   size_t bytes_in_buffer = bits_in_buffer / 8;
  250 |     0 |   bits_in_buffer -= bytes_in_buffer * 8;
  251 |     0 |   bit_buffer >>= bytes_in_buffer * 8;
  252 |     0 |   return bytes_in_buffer;
  253 |     0 | }
  254 |       |
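Illustrative trace (not part of the covered source): AddBits() keeps an LSB-first staging buffer and flushes whole bytes through StoreLE64(). For example, with bits_in_buffer = 5 and bit_buffer = 0b10011, the call

    AddBits(/*count=*/7, /*bits=*/0b1010101, buf, bits_in_buffer, bit_buffer);

makes bit_buffer 0b101010110011 (12 bits), stores its low byte 0b10110011 into buf[0], returns 1 (one whole byte emitted), and leaves the remaining 4 bits 0b1010 in bit_buffer with bits_in_buffer = 4.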
  255 |       | struct BitWriter {
  256 |     0 |   void Allocate(size_t maximum_bit_size) {
  257 |     0 |     assert(data == nullptr);
  258 |       |     // Leave some padding.
  259 |     0 |     data.reset(static_cast<uint8_t*>(malloc(maximum_bit_size / 8 + 64)));
  260 |     0 |   }
  261 |       |
  262 |     0 |   void Write(uint32_t count, uint64_t bits) {
  263 |     0 |     bytes_written += AddBits(count, bits, data.get() + bytes_written,
  264 |     0 |                              bits_in_buffer, buffer);
  265 |     0 |   }
  266 |       |
  267 |     0 |   void ZeroPadToByte() {
  268 |     0 |     if (bits_in_buffer != 0) {
  269 |     0 |       Write(8 - bits_in_buffer, 0);
  270 |     0 |     }
  271 |     0 |   }
  272 |       |
  273 |       |   FJXL_INLINE void WriteMultiple(const uint64_t* nbits, const uint64_t* bits,
  274 |     0 |                                  size_t n) {
  275 |       |     // Necessary because Write() is only guaranteed to work with <=56 bits.
  276 |       |     // Trying to SIMD-fy this code results in lower speed (and definitely less
  277 |       |     // clarity).
  278 |     0 |     {
  279 |     0 |       for (size_t i = 0; i < n; i++) {
  280 |     0 |         this->buffer |= bits[i] << this->bits_in_buffer;
  281 |     0 |         memcpy(this->data.get() + this->bytes_written, &this->buffer, 8);
  282 |     0 |         uint64_t shift = 64 - this->bits_in_buffer;
  283 |     0 |         this->bits_in_buffer += nbits[i];
  284 |       |         // This `if` seems to be faster than using ternaries.
  285 |     0 |         if (this->bits_in_buffer >= 64) {
  286 |     0 |           uint64_t next_buffer = shift >= 64 ? 0 : bits[i] >> shift;
  287 |     0 |           this->buffer = next_buffer;
  288 |     0 |           this->bits_in_buffer -= 64;
  289 |     0 |           this->bytes_written += 8;
  290 |     0 |         }
  291 |     0 |       }
  292 |     0 |       memcpy(this->data.get() + this->bytes_written, &this->buffer, 8);
  293 |     0 |       size_t bytes_in_buffer = this->bits_in_buffer / 8;
  294 |     0 |       this->bits_in_buffer -= bytes_in_buffer * 8;
  295 |     0 |       this->buffer >>= bytes_in_buffer * 8;
  296 |     0 |       this->bytes_written += bytes_in_buffer;
  297 |     0 |     }
  298 |     0 |   }
  299 |       |
  300 |       |   std::unique_ptr<uint8_t[], void (*)(void*)> data = {nullptr, free};
  301 |       |   size_t bytes_written = 0;
  302 |       |   size_t bits_in_buffer = 0;
  303 |       |   uint64_t buffer = 0;
  304 |       | };
  305 |       |
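Illustrative note (not part of the covered source): the "<=56 bits" guarantee mentioned inside WriteMultiple() follows from the flush in AddBits() — every call leaves at most 7 bits pending, so a following Write() of up to 56 bits tops out at 63 bits and still fits the 64-bit staging buffer. A minimal usage sketch of the BitWriter interface as declared above:

    BitWriter bw;
    bw.Allocate(/*maximum_bit_size=*/1024);
    bw.Write(2, 0b01);     // two bits, LSB first
    bw.Write(5, 0b10110);  // five more bits
    bw.ZeroPadToByte();    // pad to a byte boundary; bytes_written == 1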
  306 |     0 | size_t SectionSize(const std::array<BitWriter, 4>& group_data) {
  307 |     0 |   size_t sz = 0;
  308 |     0 |   for (size_t j = 0; j < 4; j++) {
  309 |     0 |     const auto& writer = group_data[j];
  310 |     0 |     sz += writer.bytes_written * 8 + writer.bits_in_buffer;
  311 |     0 |   }
  312 |     0 |   sz = (sz + 7) / 8;
  313 |     0 |   return sz;
  314 |     0 | }
  315 |       |
  316 |       | constexpr size_t kMaxFrameHeaderSize = 5;
  317 |       |
  318 |       | constexpr size_t kGroupSizeOffset[4] = {
  319 |       |     static_cast<size_t>(0),
  320 |       |     static_cast<size_t>(1024),
  321 |       |     static_cast<size_t>(17408),
  322 |       |     static_cast<size_t>(4211712),
  323 |       | };
  324 |       | constexpr size_t kTOCBits[4] = {12, 16, 24, 32};
  325 |       |
  326 |     0 | size_t TOCBucket(size_t group_size) {
  327 |     0 |   size_t bucket = 0;
  328 |     0 |   while (bucket < 3 && group_size >= kGroupSizeOffset[bucket + 1]) ++bucket;
  329 |     0 |   return bucket;
  330 |     0 | }
  331 |       |
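Illustrative arithmetic (not part of the covered source): TOCBucket() selects the smallest size class in kGroupSizeOffset that still contains the group. For a 20000-byte group, 20000 >= 17408 and 20000 < 4211712, so the bucket is 2 and the TOC entry costs kTOCBits[2] = 24 bits; JxlFastLosslessPrepareHeader() further down writes it as a 2-bit bucket selector plus the offset 20000 - 17408 = 2592 in the remaining 22 bits.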
  332 |       | #if !FJXL_STANDALONE
  333 |     0 | size_t TOCSize(const std::vector<size_t>& group_sizes) {
  334 |     0 |   size_t toc_bits = 0;
  335 |     0 |   for (size_t group_size : group_sizes) {
  336 |     0 |     toc_bits += kTOCBits[TOCBucket(group_size)];
  337 |     0 |   }
  338 |     0 |   return (toc_bits + 7) / 8;
  339 |     0 | }
  340 |       |
  341 |     0 | size_t FrameHeaderSize(bool have_alpha, bool is_last) {
  342 |     0 |   size_t nbits = 28 + (have_alpha ? 4 : 0) + (is_last ? 0 : 2);
  343 |     0 |   return (nbits + 7) / 8;
  344 |     0 | }
  345 |       | #endif
  346 |       |
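Illustrative arithmetic (not part of the covered source): FrameHeaderSize() peaks at have_alpha = true and is_last = false, where nbits = 28 + 4 + 2 = 34 and (34 + 7) / 8 = 5 bytes, matching kMaxFrameHeaderSize = 5 defined earlier.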
  347 |       | void ComputeAcGroupDataOffset(size_t dc_global_size, size_t num_dc_groups,
  348 |       |                               size_t num_ac_groups, size_t& min_dc_global_size,
  349 |     0 |                               size_t& ac_group_offset) {
  350 |       |   // Max AC group size is 768 kB, so max AC group TOC bits is 24.
  351 |     0 |   size_t ac_toc_max_bits = num_ac_groups * 24;
  352 |     0 |   size_t ac_toc_min_bits = num_ac_groups * 12;
  353 |     0 |   size_t max_padding = 1 + (ac_toc_max_bits - ac_toc_min_bits + 7) / 8;
  354 |     0 |   min_dc_global_size = dc_global_size;
  355 |     0 |   size_t dc_global_bucket = TOCBucket(min_dc_global_size);
  356 |     0 |   while (TOCBucket(min_dc_global_size + max_padding) > dc_global_bucket) {
  357 |     0 |     dc_global_bucket = TOCBucket(min_dc_global_size + max_padding);
  358 |     0 |     min_dc_global_size = kGroupSizeOffset[dc_global_bucket];
  359 |     0 |   }
  360 |     0 |   assert(TOCBucket(min_dc_global_size) == dc_global_bucket);
  361 |     0 |   assert(TOCBucket(min_dc_global_size + max_padding) == dc_global_bucket);
  362 |     0 |   size_t max_toc_bits =
  363 |     0 |       kTOCBits[dc_global_bucket] + 12 * (1 + num_dc_groups) + ac_toc_max_bits;
  364 |     0 |   size_t max_toc_size = (max_toc_bits + 7) / 8;
  365 |     0 |   ac_group_offset = kMaxFrameHeaderSize + max_toc_size + min_dc_global_size;
  366 |     0 | }
  367 |       |
  368 |       | #if !FJXL_STANDALONE
  369 |       | size_t ComputeDcGlobalPadding(const std::vector<size_t>& group_sizes,
  370 |       |                               size_t ac_group_data_offset,
  371 |       |                               size_t min_dc_global_size, bool have_alpha,
  372 |     0 |                               bool is_last) {
  373 |     0 |   std::vector<size_t> new_group_sizes = group_sizes;
  374 |     0 |   new_group_sizes[0] = min_dc_global_size;
  375 |     0 |   size_t toc_size = TOCSize(new_group_sizes);
  376 |     0 |   size_t actual_offset =
  377 |     0 |       FrameHeaderSize(have_alpha, is_last) + toc_size + group_sizes[0];
  378 |     0 |   return ac_group_data_offset - actual_offset;
  379 |     0 | }
  380 |       | #endif
  381 |       |
  382 |       | constexpr size_t kNumRawSymbols = 19;
  383 |       | constexpr size_t kNumLZ77 = 33;
  384 |       | constexpr size_t kLZ77CacheSize = 32;
  385 |       |
  386 |       | constexpr size_t kLZ77Offset = 224;
  387 |       | constexpr size_t kLZ77MinLength = 7;
  388 |       |
  389 |       | void EncodeHybridUintLZ77(uint32_t value, uint32_t* token, uint32_t* nbits,
  390 |     0 |                           uint32_t* bits) {
  391 |       |   // 400 config
  392 |     0 |   uint32_t n = FloorLog2(value);
  393 |     0 |   *token = value < 16 ? value : 16 + n - 4;
  394 |     0 |   *nbits = value < 16 ? 0 : n;
  395 |     0 |   *bits = value < 16 ? 0 : value - (1 << *nbits);
  396 |     0 | }
  397 |       |
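Illustrative trace (not part of the covered source): EncodeHybridUintLZ77() stores values below 16 directly as tokens, and splits larger values into an exponent token plus raw low bits. For value = 100: FloorLog2(100) = 6, so token = 16 + 6 - 4 = 18, nbits = 6, and bits = 100 - (1 << 6) = 36. For value = 5 (< 16): token = 5, nbits = 0, bits = 0.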
  398 |       | struct PrefixCode {
  399 |       |   uint8_t raw_nbits[kNumRawSymbols] = {};
  400 |       |   uint8_t raw_bits[kNumRawSymbols] = {};
  401 |       |
  402 |       |   uint8_t lz77_nbits[kNumLZ77] = {};
  403 |       |   uint16_t lz77_bits[kNumLZ77] = {};
  404 |       |
  405 |       |   uint64_t lz77_cache_bits[kLZ77CacheSize] = {};
  406 |       |   uint8_t lz77_cache_nbits[kLZ77CacheSize] = {};
  407 |       |
  408 |       |   size_t numraw;
  409 |       |
  410 |     0 |   static uint16_t BitReverse(size_t nbits, uint16_t bits) {
  411 |     0 |     constexpr uint16_t kNibbleLookup[16] = {
  412 |     0 |         0b0000, 0b1000, 0b0100, 0b1100, 0b0010, 0b1010, 0b0110, 0b1110,
  413 |     0 |         0b0001, 0b1001, 0b0101, 0b1101, 0b0011, 0b1011, 0b0111, 0b1111,
  414 |     0 |     };
  415 |     0 |     uint16_t rev16 = (kNibbleLookup[bits & 0xF] << 12) |
  416 |     0 |                      (kNibbleLookup[(bits >> 4) & 0xF] << 8) |
  417 |     0 |                      (kNibbleLookup[(bits >> 8) & 0xF] << 4) |
  418 |     0 |                      (kNibbleLookup[bits >> 12]);
  419 |     0 |     return rev16 >> (16 - nbits);
  420 |     0 |   }
  421 |       |
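Illustrative trace (not part of the covered source): BitReverse() mirrors the low nbits of a code, converting the canonical MSB-first codes built below into the LSB-first order the bitstream uses. For nbits = 3 and bits = 0b110, the nibble lookup yields rev16 = 0b0110000000000000, and rev16 >> (16 - 3) = 0b011.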
  422 |       |   // Create the prefix codes given the code lengths.
  423 |       |   // Supports the code lengths being split into two halves.
  424 |       |   static void ComputeCanonicalCode(const uint8_t* first_chunk_nbits,
  425 |       |                                    uint8_t* first_chunk_bits,
  426 |       |                                    size_t first_chunk_size,
  427 |       |                                    const uint8_t* second_chunk_nbits,
  428 |       |                                    uint16_t* second_chunk_bits,
  429 |     0 |                                    size_t second_chunk_size) {
  430 |     0 |     constexpr size_t kMaxCodeLength = 15;
  431 |     0 |     uint8_t code_length_counts[kMaxCodeLength + 1] = {};
  432 |     0 |     for (size_t i = 0; i < first_chunk_size; i++) {
  433 |     0 |       code_length_counts[first_chunk_nbits[i]]++;
  434 |     0 |       assert(first_chunk_nbits[i] <= kMaxCodeLength);
  435 |     0 |       assert(first_chunk_nbits[i] <= 8);
  436 |     0 |       assert(first_chunk_nbits[i] > 0);
  437 |     0 |     }
  438 |     0 |     for (size_t i = 0; i < second_chunk_size; i++) {
  439 |     0 |       code_length_counts[second_chunk_nbits[i]]++;
  440 |     0 |       assert(second_chunk_nbits[i] <= kMaxCodeLength);
  441 |     0 |     }
  442 |       |
  443 |     0 |     uint16_t next_code[kMaxCodeLength + 1] = {};
  444 |       |
  445 |     0 |     uint16_t code = 0;
  446 |     0 |     for (size_t i = 1; i < kMaxCodeLength + 1; i++) {
  447 |     0 |       code = (code + code_length_counts[i - 1]) << 1;
  448 |     0 |       next_code[i] = code;
  449 |     0 |     }
  450 |       |
  451 |     0 |     for (size_t i = 0; i < first_chunk_size; i++) {
  452 |     0 |       first_chunk_bits[i] =
  453 |     0 |           BitReverse(first_chunk_nbits[i], next_code[first_chunk_nbits[i]]++);
  454 |     0 |     }
  455 |     0 |     for (size_t i = 0; i < second_chunk_size; i++) {
  456 |     0 |       second_chunk_bits[i] =
  457 |     0 |           BitReverse(second_chunk_nbits[i], next_code[second_chunk_nbits[i]]++);
  458 |     0 |     }
  459 |     0 |   }
  460 |       |
  461 |       |   template <typename T>
  462 |       |   static void ComputeCodeLengthsNonZeroImpl(const uint64_t* freqs, size_t n,
  463 |       |                                             size_t precision, T infty,
  464 |       |                                             const uint8_t* min_limit,
  465 |       |                                             const uint8_t* max_limit,
  466 |     0 |                                             uint8_t* nbits) {
  467 |     0 |     assert(precision < 15);
  468 |     0 |     assert(n <= kMaxNumSymbols);
  469 |     0 |     std::vector<T> dynp(((1U << precision) + 1) * (n + 1), infty);
  470 |     0 |     auto d = [&](size_t sym, size_t off) -> T& {
  471 |     0 |       return dynp[sym * ((1 << precision) + 1) + off];
  472 |     0 |     };
      |       | Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::ComputeCodeLengthsNonZeroImpl<unsigned int>(unsigned long const*, unsigned long, unsigned long, unsigned int, unsigned char const*, unsigned char const*, unsigned char*)::{lambda(unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long) const
      |       | Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::ComputeCodeLengthsNonZeroImpl<unsigned long>(unsigned long const*, unsigned long, unsigned long, unsigned long, unsigned char const*, unsigned char const*, unsigned char*)::{lambda(unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long) const
  473 |     0 |     d(0, 0) = 0;
  474 |     0 |     for (size_t sym = 0; sym < n; sym++) {
  475 |     0 |       for (T bits = min_limit[sym]; bits <= max_limit[sym]; bits++) {
  476 |     0 |         size_t off_delta = 1U << (precision - bits);
  477 |     0 |         for (size_t off = 0; off + off_delta <= (1U << precision); off++) {
  478 |     0 |           d(sym + 1, off + off_delta) =
  479 |     0 |               std::min(d(sym, off) + static_cast<T>(freqs[sym]) * bits,
  480 |     0 |                        d(sym + 1, off + off_delta));
  481 |     0 |         }
  482 |     0 |       }
  483 |     0 |     }
  484 |       |
  485 |     0 |     size_t sym = n;
  486 |     0 |     size_t off = 1U << precision;
  487 |       |
  488 |     0 |     assert(d(sym, off) != infty);
  489 |       |
  490 |     0 |     while (sym-- > 0) {
  491 |     0 |       assert(off > 0);
  492 |     0 |       for (size_t bits = min_limit[sym]; bits <= max_limit[sym]; bits++) {
  493 |     0 |         size_t off_delta = 1U << (precision - bits);
  494 |     0 |         if (off_delta <= off &&
  495 |     0 |             d(sym + 1, off) == d(sym, off - off_delta) + freqs[sym] * bits) {
  496 |     0 |           off -= off_delta;
  497 |     0 |           nbits[sym] = bits;
  498 |     0 |           break;
  499 |     0 |         }
  500 |     0 |       }
  501 |     0 |     }
  502 |     0 |   }
      |       | Unexecuted instantiation: enc_fast_lossless.cc:void (anonymous namespace)::PrefixCode::ComputeCodeLengthsNonZeroImpl<unsigned int>(unsigned long const*, unsigned long, unsigned long, unsigned int, unsigned char const*, unsigned char const*, unsigned char*)
      |       | Unexecuted instantiation: enc_fast_lossless.cc:void (anonymous namespace)::PrefixCode::ComputeCodeLengthsNonZeroImpl<unsigned long>(unsigned long const*, unsigned long, unsigned long, unsigned long, unsigned char const*, unsigned char const*, unsigned char*)
  503 |       |
  504 |       |   // Computes nbits[i] for i <= n, subject to min_limit[i] <= nbits[i] <=
  505 |       |   // max_limit[i] and sum 2**-nbits[i] == 1, so to minimize sum(nbits[i] *
  506 |       |   // freqs[i]).
  507 |       |   static void ComputeCodeLengthsNonZero(const uint64_t* freqs, size_t n,
  508 |       |                                         uint8_t* min_limit, uint8_t* max_limit,
  509 |     0 |                                         uint8_t* nbits) {
  510 |     0 |     size_t precision = 0;
  511 |     0 |     size_t shortest_length = 255;
  512 |     0 |     uint64_t freqsum = 0;
  513 |     0 |     for (size_t i = 0; i < n; i++) {
  514 |     0 |       assert(freqs[i] != 0);
  515 |     0 |       freqsum += freqs[i];
  516 |     0 |       if (min_limit[i] < 1) min_limit[i] = 1;
  517 |     0 |       assert(min_limit[i] <= max_limit[i]);
  518 |     0 |       precision = std::max<size_t>(max_limit[i], precision);
  519 |     0 |       shortest_length = std::min<size_t>(min_limit[i], shortest_length);
  520 |     0 |     }
  521 |       |     // If all the minimum limits are greater than 1, shift precision so that we
  522 |       |     // behave as if the shortest was 1.
  523 |     0 |     precision -= shortest_length - 1;
  524 |     0 |     uint64_t infty = freqsum * precision;
  525 |     0 |     if (infty < std::numeric_limits<uint32_t>::max() / 2) {
  526 |     0 |       ComputeCodeLengthsNonZeroImpl(freqs, n, precision,
  527 |     0 |                                     static_cast<uint32_t>(infty), min_limit,
  528 |     0 |                                     max_limit, nbits);
  529 |     0 |     } else {
  530 |     0 |       ComputeCodeLengthsNonZeroImpl(freqs, n, precision, infty, min_limit,
  531 |     0 |                                     max_limit, nbits);
  532 |     0 |     }
  533 |     0 |   }
  534 |       |
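Illustrative note (not part of the covered source): the dynamic program above enforces the Kraft equality sum(2^-nbits[i]) == 1. With precision p, a symbol of code length b consumes 2^(p-b) units of a 2^p budget, and d(sym, off) is the least total weighted length for the first sym symbols using exactly off units. For freqs = {3, 1} with limits 1..2 and p = 2, only lengths {1, 1} spend exactly 2 + 2 = 4 of the 4 units, at cost 3*1 + 1*1 = 4; {1, 2} and {2, 2} leave the budget unspent and are rejected.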
  535 |       |   static constexpr size_t kMaxNumSymbols =
  536 |       |       kNumRawSymbols + 1 < kNumLZ77 ? kNumLZ77 : kNumRawSymbols + 1;
  537 |       |   static void ComputeCodeLengths(const uint64_t* freqs, size_t n,
  538 |       |                                  const uint8_t* min_limit_in,
  539 |     0 |                                  const uint8_t* max_limit_in, uint8_t* nbits) {
  540 |     0 |     assert(n <= kMaxNumSymbols);
  541 |     0 |     uint64_t compact_freqs[kMaxNumSymbols];
  542 |     0 |     uint8_t min_limit[kMaxNumSymbols];
  543 |     0 |     uint8_t max_limit[kMaxNumSymbols];
  544 |     0 |     size_t ni = 0;
  545 |     0 |     for (size_t i = 0; i < n; i++) {
  546 |     0 |       if (freqs[i]) {
  547 |     0 |         compact_freqs[ni] = freqs[i];
  548 |     0 |         min_limit[ni] = min_limit_in[i];
  549 |     0 |         max_limit[ni] = max_limit_in[i];
  550 |     0 |         ni++;
  551 |     0 |       }
  552 |     0 |     }
  553 |     0 |     for (size_t i = ni; i < kMaxNumSymbols; ++i) {
  554 |     0 |       compact_freqs[i] = 0;
  555 |     0 |       min_limit[i] = 0;
  556 |     0 |       max_limit[i] = 0;
  557 |     0 |     }
  558 |     0 |     uint8_t num_bits[kMaxNumSymbols] = {};
  559 |     0 |     ComputeCodeLengthsNonZero(compact_freqs, ni, min_limit, max_limit,
  560 |     0 |                               num_bits);
  561 |     0 |     ni = 0;
  562 |     0 |     for (size_t i = 0; i < n; i++) {
  563 |     0 |       nbits[i] = 0;
  564 |     0 |       if (freqs[i]) {
  565 |     0 |         nbits[i] = num_bits[ni++];
  566 |     0 |       }
  567 |     0 |     }
  568 |     0 |   }
  569 |       |
  570 |       |   // Invalid code, used to construct arrays.
  571 |     0 |   PrefixCode() = default;
  572 |       |
  573 |       |   template <typename BitDepth>
  574 |       |   PrefixCode(BitDepth /* bitdepth */, uint64_t* raw_counts,
  575 |     0 |              uint64_t* lz77_counts) {
  576 |       |     // "merge" together all the lz77 counts in a single symbol for the level 1
  577 |       |     // table (containing just the raw symbols, up to length 7).
  578 |     0 |     uint64_t level1_counts[kNumRawSymbols + 1];
  579 |     0 |     memcpy(level1_counts, raw_counts, kNumRawSymbols * sizeof(uint64_t));
  580 |     0 |     numraw = kNumRawSymbols;
  581 |     0 |     while (numraw > 0 && level1_counts[numraw - 1] == 0) numraw--;
  582 |       |
  583 |     0 |     level1_counts[numraw] = 0;
  584 |     0 |     for (size_t i = 0; i < kNumLZ77; i++) {
  585 |     0 |       level1_counts[numraw] += lz77_counts[i];
  586 |     0 |     }
  587 |     0 |     uint8_t level1_nbits[kNumRawSymbols + 1] = {};
  588 |     0 |     ComputeCodeLengths(level1_counts, numraw + 1, BitDepth::kMinRawLength,
  589 |     0 |                        BitDepth::kMaxRawLength, level1_nbits);
  590 |       |
  591 |     0 |     uint8_t level2_nbits[kNumLZ77] = {};
  592 |     0 |     uint8_t min_lengths[kNumLZ77] = {};
  593 |     0 |     uint8_t l = 15 - level1_nbits[numraw];
  594 |     0 |     uint8_t max_lengths[kNumLZ77];
  595 |     0 |     for (uint8_t& max_length : max_lengths) {
  596 |     0 |       max_length = l;
  597 |     0 |     }
  598 |     0 |     size_t num_lz77 = kNumLZ77;
  599 |     0 |     while (num_lz77 > 0 && lz77_counts[num_lz77 - 1] == 0) num_lz77--;
  600 |     0 |     ComputeCodeLengths(lz77_counts, num_lz77, min_lengths, max_lengths,
  601 |     0 |                        level2_nbits);
  602 |     0 |     for (size_t i = 0; i < numraw; i++) {
  603 |     0 |       raw_nbits[i] = level1_nbits[i];
  604 |     0 |     }
  605 |     0 |     for (size_t i = 0; i < num_lz77; i++) {
  606 |     0 |       lz77_nbits[i] =
  607 |     0 |           level2_nbits[i] ? level1_nbits[numraw] + level2_nbits[i] : 0;
  608 |     0 |     }
  609 |       |
  610 |     0 |     ComputeCanonicalCode(raw_nbits, raw_bits, numraw, lz77_nbits, lz77_bits,
  611 |     0 |                          kNumLZ77);
  612 |       |
  613 |       |     // Prepare lz77 cache
  614 |     0 |     for (size_t count = 0; count < kLZ77CacheSize; count++) {
  615 |     0 |       unsigned token, nbits, bits;
  616 |     0 |       EncodeHybridUintLZ77(count, &token, &nbits, &bits);
  617 |     0 |       lz77_cache_nbits[count] = lz77_nbits[token] + nbits + raw_nbits[0];
  618 |     0 |       lz77_cache_bits[count] =
  619 |     0 |           (((bits << lz77_nbits[token]) | lz77_bits[token]) << raw_nbits[0]) |
  620 |     0 |           raw_bits[0];
  621 |     0 |     }
  622 |     0 |   }
      |       | Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<AVX2::(anonymous namespace)::UpTo8Bits>(AVX2::(anonymous namespace)::UpTo8Bits, unsigned long*, unsigned long*)
      |       | Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<AVX2::(anonymous namespace)::From9To13Bits>(AVX2::(anonymous namespace)::From9To13Bits, unsigned long*, unsigned long*)
      |       | Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<AVX2::(anonymous namespace)::Exactly14Bits>(AVX2::(anonymous namespace)::Exactly14Bits, unsigned long*, unsigned long*)
      |       | Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<AVX2::(anonymous namespace)::MoreThan14Bits>(AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long*, unsigned long*)
      |       | Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<default_implementation::(anonymous namespace)::UpTo8Bits>(default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long*, unsigned long*)
      |       | Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<default_implementation::(anonymous namespace)::From9To13Bits>(default_implementation::(anonymous namespace)::From9To13Bits, unsigned long*, unsigned long*)
      |       | Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<default_implementation::(anonymous namespace)::Exactly14Bits>(default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long*, unsigned long*)
      |       | Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<default_implementation::(anonymous namespace)::MoreThan14Bits>(default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long*, unsigned long*)
  623 |       |
  624 |       |   // Max bits written: 2 + 72 + 95 + 24 + 165 = 358
  625 |     0 |   void WriteTo(BitWriter* writer) const {
  626 |     0 |     uint64_t code_length_counts[18] = {};
  627 |     0 |     code_length_counts[17] = 3 + 2 * (kNumLZ77 - 1);
  628 |     0 |     for (uint8_t raw_nbit : raw_nbits) {
  629 |     0 |       code_length_counts[raw_nbit]++;
  630 |     0 |     }
  631 |     0 |     for (uint8_t lz77_nbit : lz77_nbits) {
  632 |     0 |       code_length_counts[lz77_nbit]++;
  633 |     0 |     }
  634 |     0 |     uint8_t code_length_nbits[18] = {};
  635 |     0 |     uint8_t code_length_nbits_min[18] = {};
  636 |     0 |     uint8_t code_length_nbits_max[18] = {
  637 |     0 |         5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
  638 |     0 |     };
  639 |     0 |     ComputeCodeLengths(code_length_counts, 18, code_length_nbits_min,
  640 |     0 |                        code_length_nbits_max, code_length_nbits);
  641 |     0 |     writer->Write(2, 0b00);  // HSKIP = 0, i.e. don't skip code lengths.
  642 |       |
  643 |       |     // As per Brotli RFC.
  644 |     0 |     uint8_t code_length_order[18] = {1, 2, 3, 4,  0,  5,  17, 6,  16,
  645 |     0 |                                      7, 8, 9, 10, 11, 12, 13, 14, 15};
  646 |     0 |     uint8_t code_length_length_nbits[] = {2, 4, 3, 2, 2, 4};
  647 |     0 |     uint8_t code_length_length_bits[] = {0, 7, 3, 2, 1, 15};
  648 |       |
  649 |       |     // Encode lengths of code lengths.
  650 |     0 |     size_t num_code_lengths = 18;
  651 |     0 |     while (code_length_nbits[code_length_order[num_code_lengths - 1]] == 0) {
  652 |     0 |       num_code_lengths--;
  653 |     0 |     }
  654 |       |     // Max bits written in this loop: 18 * 4 = 72
  655 |     0 |     for (size_t i = 0; i < num_code_lengths; i++) {
  656 |     0 |       int symbol = code_length_nbits[code_length_order[i]];
  657 |     0 |       writer->Write(code_length_length_nbits[symbol],
  658 |     0 |                     code_length_length_bits[symbol]);
  659 |     0 |     }
  660 |       |
  661 |       |     // Compute the canonical codes for the codes that represent the lengths of
  662 |       |     // the actual codes for data.
  663 |     0 |     uint16_t code_length_bits[18] = {};
  664 |     0 |     ComputeCanonicalCode(nullptr, nullptr, 0, code_length_nbits,
  665 |     0 |                          code_length_bits, 18);
  666 |       |     // Encode raw bit code lengths.
  667 |       |     // Max bits written in this loop: 19 * 5 = 95
  668 |     0 |     for (uint8_t raw_nbit : raw_nbits) {
  669 |     0 |       writer->Write(code_length_nbits[raw_nbit], code_length_bits[raw_nbit]);
  670 |     0 |     }
  671 |     0 |     size_t num_lz77 = kNumLZ77;
  672 |     0 |     while (lz77_nbits[num_lz77 - 1] == 0) {
  673 |     0 |       num_lz77--;
  674 |     0 |     }
  675 |       |     // Encode 0s until 224 (start of LZ77 symbols). This is in total 224-19 =
  676 |       |     // 205.
  677 |     0 |     static_assert(kLZ77Offset == 224, "kLZ77Offset should be 224");
  678 |     0 |     static_assert(kNumRawSymbols == 19, "kNumRawSymbols should be 19");
  679 |     0 |     {
  680 |       |       // Max bits in this block: 24
  681 |     0 |       writer->Write(code_length_nbits[17], code_length_bits[17]);
  682 |     0 |       writer->Write(3, 0b010);  // 5
  683 |     0 |       writer->Write(code_length_nbits[17], code_length_bits[17]);
  684 |     0 |       writer->Write(3, 0b000);  // (5-2)*8 + 3 = 27
  685 |     0 |       writer->Write(code_length_nbits[17], code_length_bits[17]);
  686 |     0 |       writer->Write(3, 0b010);  // (27-2)*8 + 5 = 205
  687 |     0 |     }
  688 |       |     // Encode LZ77 symbols, with values 224+i.
  689 |       |     // Max bits written in this loop: 33 * 5 = 165
  690 |     0 |     for (size_t i = 0; i < num_lz77; i++) {
  691 |     0 |       writer->Write(code_length_nbits[lz77_nbits[i]],
  692 |     0 |                     code_length_bits[lz77_nbits[i]]);
  693 |     0 |     }
  694 |     0 |   }
  695 |       | };
  696 |       |
  697 |       | }  // namespace
  698 |       |
  699 |       | extern "C" {
  700 |       |
  701 |       | struct JxlFastLosslessFrameState {
  702 |       |   JxlChunkedFrameInputSource input;
  703 |       |   size_t width;
  704 |       |   size_t height;
  705 |       |   size_t num_groups_x;
  706 |       |   size_t num_groups_y;
  707 |       |   size_t num_dc_groups_x;
  708 |       |   size_t num_dc_groups_y;
  709 |       |   size_t nb_chans;
  710 |       |   size_t bitdepth;
  711 |       |   int big_endian;
  712 |       |   int effort;
  713 |       |   bool collided;
  714 |       |   PrefixCode hcode[4];
  715 |       |   std::vector<int16_t> lookup;
  716 |       |   BitWriter header;
  717 |       |   std::vector<std::array<BitWriter, 4>> group_data;
  718 |       |   std::vector<size_t> group_sizes;
  719 |       |   size_t ac_group_data_offset = 0;
  720 |       |   size_t min_dc_global_size = 0;
  721 |       |   size_t current_bit_writer = 0;
  722 |       |   size_t bit_writer_byte_pos = 0;
  723 |       |   size_t bits_in_buffer = 0;
  724 |       |   uint64_t bit_buffer = 0;
  725 |       |   bool process_done = false;
  726 |       | };
  727 |       |
  728 |     0 | size_t JxlFastLosslessOutputSize(const JxlFastLosslessFrameState* frame) {
  729 |     0 |   size_t total_size_groups = 0;
  730 |     0 |   for (const auto& section : frame->group_data) {
  731 |     0 |     total_size_groups += SectionSize(section);
  732 |     0 |   }
  733 |     0 |   return frame->header.bytes_written + total_size_groups;
  734 |     0 | }
  735 |       |
  736 |       | size_t JxlFastLosslessMaxRequiredOutput(
  737 |     0 |     const JxlFastLosslessFrameState* frame) {
  738 |     0 |   return JxlFastLosslessOutputSize(frame) + 32;
  739 |     0 | }
  740 |       |
  741 |       | void JxlFastLosslessPrepareHeader(JxlFastLosslessFrameState* frame,
  742 |     0 |                                   int add_image_header, int is_last) {
  743 |     0 |   BitWriter* output = &frame->header;
  744 |     0 |   output->Allocate(1000 + frame->group_sizes.size() * 32);
  745 |       |
  746 |     0 |   bool have_alpha = (frame->nb_chans == 2 || frame->nb_chans == 4);
  747 |       |
  748 |       | #if FJXL_STANDALONE
  749 |       |   if (add_image_header) {
  750 |       |     // Signature
  751 |       |     output->Write(16, 0x0AFF);
  752 |       |
  753 |       |     // Size header, hand-crafted.
  754 |       |     // Not small
  755 |       |     output->Write(1, 0);
  756 |       |
  757 |       |     auto wsz = [output](size_t size) {
  758 |       |       if (size - 1 < (1 << 9)) {
  759 |       |         output->Write(2, 0b00);
  760 |       |         output->Write(9, size - 1);
  761 |       |       } else if (size - 1 < (1 << 13)) {
  762 |       |         output->Write(2, 0b01);
  763 |       |         output->Write(13, size - 1);
  764 |       |       } else if (size - 1 < (1 << 18)) {
  765 |       |         output->Write(2, 0b10);
  766 |       |         output->Write(18, size - 1);
  767 |       |       } else {
  768 |       |         output->Write(2, 0b11);
  769 |       |         output->Write(30, size - 1);
  770 |       |       }
  771 |       |     };
  772 |       |
  773 |       |     wsz(frame->height);
  774 |       |
  775 |       |     // No special ratio.
  776 |       |     output->Write(3, 0);
  777 |       |
  778 |       |     wsz(frame->width);
  779 |       |
  780 |       |     // Hand-crafted ImageMetadata.
  781 |       |     output->Write(1, 0);  // all_default
  782 |       |     output->Write(1, 0);  // extra_fields
  783 |       |     output->Write(1, 0);  // bit_depth.floating_point_sample
  784 |       |     if (frame->bitdepth == 8) {
  785 |       |       output->Write(2, 0b00);  // bit_depth.bits_per_sample = 8
  786 |       |     } else if (frame->bitdepth == 10) {
  787 |       |       output->Write(2, 0b01);  // bit_depth.bits_per_sample = 10
  788 |       |     } else if (frame->bitdepth == 12) {
  789 |       |       output->Write(2, 0b10);  // bit_depth.bits_per_sample = 12
  790 |       |     } else {
  791 |       |       output->Write(2, 0b11);  // 1 + u(6)
  792 |       |       output->Write(6, frame->bitdepth - 1);
  793 |       |     }
  794 |       |     if (frame->bitdepth <= 14) {
  795 |       |       output->Write(1, 1);  // 16-bit-buffer sufficient
  796 |       |     } else {
  797 |       |       output->Write(1, 0);  // 16-bit-buffer NOT sufficient
  798 |       |     }
  799 |       |     if (have_alpha) {
  800 |       |       output->Write(2, 0b01);  // One extra channel
  801 |       |       if (frame->bitdepth == 8) {
  802 |       |         output->Write(1, 1); // ... all_default (ie. 8-bit alpha)
  803 |       |       } else {
  804 |       |         output->Write(1, 0); // not d_alpha
  805 |       |         output->Write(2, 0); // type = kAlpha
  806 |       |         output->Write(1, 0); // not float
  807 |       |         if (frame->bitdepth == 10) {
  808 |       |           output->Write(2, 0b01); // bit_depth.bits_per_sample = 10
  809 |       |         } else if (frame->bitdepth == 12) {
  810 |       |           output->Write(2, 0b10); // bit_depth.bits_per_sample = 12
  811 |       |         } else {
  812 |       |           output->Write(2, 0b11); // 1 + u(6)
  813 |       |           output->Write(6, frame->bitdepth - 1);
  814 |       |         }
  815 |       |         output->Write(2, 0); // dim_shift = 0
  816 |       |         output->Write(2, 0); // name_len = 0
  817 |       |         output->Write(1, 0); // alpha_associated = 0
  818 |       |       }
  819 |       |     } else {
  820 |       |       output->Write(2, 0b00);  // No extra channel
  821 |       |     }
  822 |       |     output->Write(1, 0);  // Not XYB
  823 |       |     if (frame->nb_chans > 2) {
  824 |       |       output->Write(1, 1);  // color_encoding.all_default (sRGB)
  825 |       |     } else {
  826 |       |       output->Write(1, 0);     // color_encoding.all_default false
  827 |       |       output->Write(1, 0);     // color_encoding.want_icc false
  828 |       |       output->Write(2, 1);     // grayscale
  829 |       |       output->Write(2, 1);     // D65
  830 |       |       output->Write(1, 0);     // no gamma transfer function
  831 |       |       output->Write(2, 0b10);  // tf: 2 + u(4)
  832 |       |       output->Write(4, 11);    // tf of sRGB
  833 |       |       output->Write(2, 1);     // relative rendering intent
  834 |       |     }
  835 |       |     output->Write(2, 0b00);  // No extensions.
  836 |       |
  837 |       |     output->Write(1, 1);  // all_default transform data
  838 |       |
  839 |       |     // No ICC, no preview. Frame should start at byte boundary.
  840 |       |     output->ZeroPadToByte();
  841 |       |   }
  842 |       | #else
  843 |     0 |   assert(!add_image_header);
  844 |     0 | #endif
  845 |       |   // Handcrafted frame header.
  846 |     0 |   output->Write(1, 0);     // all_default
  847 |     0 |   output->Write(2, 0b00);  // regular frame
  848 |     0 |   output->Write(1, 1);     // modular
  849 |     0 |   output->Write(2, 0b00);  // default flags
  850 |     0 |   output->Write(1, 0);     // not YCbCr
  851 |     0 |   output->Write(2, 0b00);  // no upsampling
  852 |     0 |   if (have_alpha) {
  853 |     0 |     output->Write(2, 0b00);  // no alpha upsampling
  854 |     0 |   }
  855 |     0 |   output->Write(2, 0b01);  // default group size
  856 |     0 |   output->Write(2, 0b00);  // exactly one pass
  857 |     0 |   output->Write(1, 0);     // no custom size or origin
  858 |     0 |   output->Write(2, 0b00);  // kReplace blending mode
  859 |     0 |   if (have_alpha) {
  860 |     0 |     output->Write(2, 0b00);  // kReplace blending mode for alpha channel
  861 |     0 |   }
  862 |     0 |   output->Write(1, is_last);  // is_last
  863 |     0 |   if (!is_last) {
  864 |     0 |     output->Write(2, 0b00);  // can not be saved as reference
  865 |     0 |   }
  866 |     0 |   output->Write(2, 0b00);  // a frame has no name
  867 |     0 |   output->Write(1, 0);     // loop filter is not all_default
  868 |     0 |   output->Write(1, 0);     // no gaborish
  869 |     0 |   output->Write(2, 0);     // 0 EPF iters
  870 |     0 |   output->Write(2, 0b00);  // No LF extensions
  871 |     0 |   output->Write(2, 0b00);  // No FH extensions
  872 |       |
  873 |     0 |   output->Write(1, 0);      // No TOC permutation
  874 |     0 |   output->ZeroPadToByte();  // TOC is byte-aligned.
  875 |     0 |   assert(add_image_header || output->bytes_written <= kMaxFrameHeaderSize);
  876 |     0 |   for (size_t group_size : frame->group_sizes) {
  877 |     0 |     size_t bucket = TOCBucket(group_size);
  878 |     0 |     output->Write(2, bucket);
  879 |     0 |     output->Write(kTOCBits[bucket] - 2, group_size - kGroupSizeOffset[bucket]);
  880 |     0 |   }
  881 |     0 |   output->ZeroPadToByte();  // Groups are byte-aligned.
  882 |     0 | }
  883 |       |
  884 |       | #if !FJXL_STANDALONE
  885 |       | bool JxlFastLosslessOutputAlignedSection(
  886 |     0 |     const BitWriter& bw, JxlEncoderOutputProcessorWrapper* output_processor) {
  887 |     0 |   assert(bw.bits_in_buffer == 0);
  888 |     0 |   const uint8_t* data = bw.data.get();
  889 |     0 |   size_t remaining_len = bw.bytes_written;
  890 |     0 |   while (remaining_len > 0) {
  891 |     0 |     JXL_ASSIGN_OR_RETURN(auto buffer,
  892 |     0 |                          output_processor->GetBuffer(1, remaining_len));
  893 |     0 |     size_t n = std::min(buffer.size(), remaining_len);
  894 |     0 |     if (n == 0) break;
  895 |     0 |     memcpy(buffer.data(), data, n);
  896 |     0 |     JXL_RETURN_IF_ERROR(buffer.advance(n));
  897 |     0 |     data += n;
  898 |     0 |     remaining_len -= n;
  899 |     0 |   };
  900 |     0 |   return true;
  901 |     0 | }
  902 |       |
  903 |       | bool JxlFastLosslessOutputHeaders(
  904 |       |     JxlFastLosslessFrameState* frame_state,
  905 |     0 |     JxlEncoderOutputProcessorWrapper* output_processor) {
  906 |     0 |   JXL_RETURN_IF_ERROR(JxlFastLosslessOutputAlignedSection(frame_state->header,
  907 |     0 |                                                           output_processor));
  908 |     0 |   JXL_RETURN_IF_ERROR(JxlFastLosslessOutputAlignedSection(
  909 |     0 |       frame_state->group_data[0][0], output_processor));
  910 |     0 |   return true;
  911 |     0 | }
  912 |       | #endif
  913 |       |
  914 |       | #if FJXL_ENABLE_AVX512
  915 |       | __attribute__((target("avx512vbmi2"))) static size_t AppendBytesWithBitOffset(
  916 |       |     const uint8_t* data, size_t n, size_t bit_buffer_nbits,
  917 |       |     unsigned char* output, uint64_t& bit_buffer) {
  918 |       |   if (n < 128) {
  919 |       |     return 0;
  920 |       |   }
  921 |       |
  922 |       |   size_t i = 0;
  923 |       |   __m512i shift = _mm512_set1_epi64(64 - bit_buffer_nbits);
  924 |       |   __m512i carry = _mm512_set1_epi64(bit_buffer << (64 - bit_buffer_nbits));
  925 |       |
  926 |       |   for (; i + 64 <= n; i += 64) {
  927 |       |     __m512i current = _mm512_loadu_si512(data + i);
  928 |       |     __m512i previous_u64 = _mm512_alignr_epi64(current, carry, 7);
  929 |       |     carry = current;
  930 |       |     __m512i out = _mm512_shrdv_epi64(previous_u64, current, shift);
  931 |       |     _mm512_storeu_si512(output + i, out);
  932 |       |   }
  933 |       |
  934 |       |   bit_buffer = data[i - 1] >> (8 - bit_buffer_nbits);
  935 |       |
  936 |       |   return i;
  937 |       | }
  938 |       | #endif
  939 |       |
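Illustrative note (not part of the covered source): AppendBytesWithBitOffset() above realigns a byte stream so that it lands bit_buffer_nbits bits past a byte boundary, using the AVX-512 VBMI2 funnel shift. Per 64-bit lane, with prev being the neighbouring lane carried in by _mm512_alignr_epi64, the _mm512_shrdv_epi64 call computes the scalar equivalent of

    out = (prev >> (64 - nbits)) | (cur << nbits);  // nbits = bit_buffer_nbits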
  940 |       | size_t JxlFastLosslessWriteOutput(JxlFastLosslessFrameState* frame,
  941 |     0 |                                   unsigned char* output, size_t output_size) {
  942 |     0 |   assert(output_size >= 32);
  943 |     0 |   unsigned char* initial_output = output;
  944 |     0 |   size_t (*append_bytes_with_bit_offset)(const uint8_t*, size_t, size_t,
  945 |     0 |                                          unsigned char*, uint64_t&) = nullptr;
  946 |       |
  947 |       | #if FJXL_ENABLE_AVX512
  948 |       |   if (HasCpuFeature(CpuFeature::kVBMI2)) {
  949 |       |     append_bytes_with_bit_offset = AppendBytesWithBitOffset;
  950 |       |   }
  951 |       | #endif
  952 |       |
  953 |     0 |   while (true) {
  954 |     0 |     size_t& cur = frame->current_bit_writer;
  955 |     0 |     size_t& bw_pos = frame->bit_writer_byte_pos;
  956 |     0 |     if (cur >= 1 + frame->group_data.size() * frame->nb_chans) {
  957 |     0 |       return output - initial_output;
  958 |     0 |     }
  959 |     0 |     if (output_size <= 9) {
  960 |     0 |       return output - initial_output;
  961 |     0 |     }
  962 |     0 |     size_t nbc = frame->nb_chans;
  963 |     0 |     const BitWriter& writer =
  964 |     0 |         cur == 0 ? frame->header
  965 |     0 |                  : frame->group_data[(cur - 1) / nbc][(cur - 1) % nbc];
  966 |     0 |     size_t full_byte_count =
  967 |     0 |         std::min(output_size - 9, writer.bytes_written - bw_pos);
  968 |     0 |     if (frame->bits_in_buffer == 0) {
  969 |     0 |       memcpy(output, writer.data.get() + bw_pos, full_byte_count);
  970 |     0 |     } else {
  971 |     0 |       size_t i = 0;
  972 |     0 |       if (append_bytes_with_bit_offset) {
  973 |     0 |         i += append_bytes_with_bit_offset(
  974 |     0 |             writer.data.get() + bw_pos, full_byte_count, frame->bits_in_buffer,
  975 |     0 |             output, frame->bit_buffer);
  976 |     0 |       }
  977 |     0 | #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
  978 |       |       // Copy 8 bytes at a time until we reach the border.
  979 |     0 |       for (; i + 8 < full_byte_count; i += 8) {
  980 |     0 |         uint64_t chunk;
  981 |     0 |         memcpy(&chunk, writer.data.get() + bw_pos + i, 8);
  982 |     0 |         uint64_t out = frame->bit_buffer | (chunk << frame->bits_in_buffer);
  983 |     0 |         memcpy(output + i, &out, 8);
  984 |     0 |         frame->bit_buffer = chunk >> (64 - frame->bits_in_buffer);
  985 |     0 |       }
  986 |     0 | #endif
  987 |     0 |       for (; i < full_byte_count; i++) {
  988 |     0 |         AddBits(8, writer.data.get()[bw_pos + i], output + i,
  989 |     0 |                 frame->bits_in_buffer, frame->bit_buffer);
  990 |     0 |       }
  991 |     0 |     }
  992 |     0 |     output += full_byte_count;
  993 |     0 |     output_size -= full_byte_count;
  994 |     0 |     bw_pos += full_byte_count;
  995 |     0 |     if (bw_pos == writer.bytes_written) {
  996 |     0 |       auto write = [&](size_t num, uint64_t bits) {
  997 |     0 |         size_t n = AddBits(num, bits, output, frame->bits_in_buffer,
  998 |     0 |                            frame->bit_buffer);
  999 |     0 |         output += n;
 1000 |     0 |         output_size -= n;
 1001 |     0 |       };
 1002 |     0 |       if (writer.bits_in_buffer) {
 1003 |     0 |         write(writer.bits_in_buffer, writer.buffer);
 1004 |     0 |       }
 1005 |     0 |       bw_pos = 0;
 1006 |     0 |       cur++;
 1007 |     0 |       if ((cur - 1) % nbc == 0 && frame->bits_in_buffer != 0) {
 1008 |     0 |         write(8 - frame->bits_in_buffer, 0);
 1009 |     0 |       }
 1010 |     0 |     }
 1011 |     0 |   }
 1012 |     0 | }
 1013 |       |
 1014 |     0 | void JxlFastLosslessFreeFrameState(JxlFastLosslessFrameState* frame) {
 1015 |     0 |   delete frame;
 1016 |     0 | }
 1017 |       |
 1018 |       | }  // extern "C"
 1019 |       |
 1020 |       | #endif
 1021 |       |
#ifdef FJXL_SELF_INCLUDE
1023
1024
namespace {
1025
1026
template <typename T>
1027
struct VecPair {
1028
  T low;
1029
  T hi;
1030
};
1031
1032
#ifdef FJXL_GENERIC_SIMD
1033
#undef FJXL_GENERIC_SIMD
1034
#endif
1035
1036
#ifdef FJXL_AVX512
1037
#define FJXL_GENERIC_SIMD
1038
struct SIMDVec32;
1039
struct Mask32 {
1040
  __mmask16 mask;
1041
  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
1042
  size_t CountPrefix() const {
1043
    return CtzNonZero(~uint64_t{_cvtmask16_u32(mask)});
1044
  }
1045
};
1046
1047
struct SIMDVec32 {
1048
  __m512i vec;
1049
1050
  static constexpr size_t kLanes = 16;
1051
1052
  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
1053
    return SIMDVec32{_mm512_loadu_si512((__m512i*)data)};
1054
  }
1055
  FJXL_INLINE void Store(uint32_t* data) {
1056
    _mm512_storeu_si512((__m512i*)data, vec);
1057
  }
1058
  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
1059
    return SIMDVec32{_mm512_set1_epi32(v)};
1060
  }
1061
  FJXL_INLINE SIMDVec32 ValToToken() const {
1062
    return SIMDVec32{
1063
        _mm512_sub_epi32(_mm512_set1_epi32(32), _mm512_lzcnt_epi32(vec))};
1064
  }
1065
  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
1066
    return SIMDVec32{_mm512_sub_epi32(_mm512_max_epu32(vec, to_subtract.vec),
1067
                                      to_subtract.vec)};
1068
  }
1069
  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
1070
    return SIMDVec32{_mm512_sub_epi32(vec, to_subtract.vec)};
1071
  }
1072
  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
1073
    return SIMDVec32{_mm512_add_epi32(vec, oth.vec)};
1074
  }
1075
  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
1076
    return SIMDVec32{_mm512_xor_epi32(vec, oth.vec)};
1077
  }
1078
  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
1079
    return Mask32{_mm512_cmpeq_epi32_mask(vec, oth.vec)};
1080
  }
1081
  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
1082
    return Mask32{_mm512_cmpgt_epi32_mask(vec, oth.vec)};
1083
  }
1084
  FJXL_INLINE SIMDVec32 Pow2() const {
1085
    return SIMDVec32{_mm512_sllv_epi32(_mm512_set1_epi32(1), vec)};
1086
  }
1087
  template <size_t i>
1088
  FJXL_INLINE SIMDVec32 SignedShiftRight() const {
1089
    return SIMDVec32{_mm512_srai_epi32(vec, i)};
1090
  }
1091
};
1092
1093
struct SIMDVec16;
1094
1095
struct Mask16 {
1096
  __mmask32 mask;
1097
  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
1098
  Mask16 And(const Mask16& oth) const {
1099
    return Mask16{_kand_mask32(mask, oth.mask)};
1100
  }
1101
  size_t CountPrefix() const {
1102
    return CtzNonZero(~uint64_t{_cvtmask32_u32(mask)});
1103
  }
1104
};
1105
1106
struct SIMDVec16 {
1107
  __m512i vec;
1108
1109
  static constexpr size_t kLanes = 32;
1110
1111
  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
1112
    return SIMDVec16{_mm512_loadu_si512((__m512i*)data)};
1113
  }
1114
  FJXL_INLINE void Store(uint16_t* data) {
1115
    _mm512_storeu_si512((__m512i*)data, vec);
1116
  }
1117
  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
1118
    return SIMDVec16{_mm512_set1_epi16(v)};
1119
  }
1120
  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
1121
                                         const SIMDVec32& hi) {
1122
    auto tmp = _mm512_packus_epi32(lo.vec, hi.vec);
1123
    alignas(64) uint64_t perm[8] = {0, 2, 4, 6, 1, 3, 5, 7};
1124
    return SIMDVec16{
1125
        _mm512_permutex2var_epi64(tmp, _mm512_load_si512((__m512i*)perm), tmp)};
1126
  }
1127
1128
  FJXL_INLINE SIMDVec16 ValToToken() const {
1129
    auto c16 = _mm512_set1_epi32(16);
1130
    auto c32 = _mm512_set1_epi32(32);
1131
    auto low16bit = _mm512_set1_epi32(0x0000FFFF);
1132
    auto lzhi =
1133
        _mm512_sub_epi32(c16, _mm512_min_epu32(c16, _mm512_lzcnt_epi32(vec)));
1134
    auto lzlo = _mm512_sub_epi32(
1135
        c32, _mm512_lzcnt_epi32(_mm512_and_si512(low16bit, vec)));
1136
    return SIMDVec16{_mm512_or_si512(lzlo, _mm512_slli_epi32(lzhi, 16))};
1137
  }
1138
1139
  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
1140
    return SIMDVec16{_mm512_subs_epu16(vec, to_subtract.vec)};
1141
  }
1142
  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
1143
    return SIMDVec16{_mm512_sub_epi16(vec, to_subtract.vec)};
1144
  }
1145
  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
1146
    return SIMDVec16{_mm512_add_epi16(vec, oth.vec)};
1147
  }
1148
  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
1149
    return SIMDVec16{_mm512_min_epu16(vec, oth.vec)};
1150
  }
1151
  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
1152
    return Mask16{_mm512_cmpeq_epi16_mask(vec, oth.vec)};
1153
  }
1154
  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
1155
    return Mask16{_mm512_cmpgt_epi16_mask(vec, oth.vec)};
1156
  }
1157
  FJXL_INLINE SIMDVec16 Pow2() const {
1158
    return SIMDVec16{_mm512_sllv_epi16(_mm512_set1_epi16(1), vec)};
1159
  }
1160
  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
1161
    return SIMDVec16{_mm512_or_si512(vec, oth.vec)};
1162
  }
1163
  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
1164
    return SIMDVec16{_mm512_xor_si512(vec, oth.vec)};
1165
  }
1166
  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
1167
    return SIMDVec16{_mm512_and_si512(vec, oth.vec)};
1168
  }
1169
  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
1170
    return SIMDVec16{_mm512_srai_epi16(_mm512_add_epi16(vec, oth.vec), 1)};
1171
  }
1172
  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
1173
    return SIMDVec16{_mm512_or_si512(vec, _mm512_set1_epi16(0xFF00))};
1174
  }
1175
  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
1176
    return SIMDVec16{_mm512_shuffle_epi8(
1177
        _mm512_broadcast_i32x4(_mm_loadu_si128((__m128i*)table)), vec)};
1178
  }
1179
  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
1180
    auto lo = _mm512_unpacklo_epi16(low.vec, vec);
1181
    auto hi = _mm512_unpackhi_epi16(low.vec, vec);
1182
    alignas(64) uint64_t perm1[8] = {0, 1, 8, 9, 2, 3, 10, 11};
1183
    alignas(64) uint64_t perm2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
1184
    return {SIMDVec16{_mm512_permutex2var_epi64(
1185
                lo, _mm512_load_si512((__m512i*)perm1), hi)},
1186
            SIMDVec16{_mm512_permutex2var_epi64(
1187
                lo, _mm512_load_si512((__m512i*)perm2), hi)}};
1188
  }
1189
  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
1190
    auto lo = _mm512_unpacklo_epi16(vec, _mm512_setzero_si512());
1191
    auto hi = _mm512_unpackhi_epi16(vec, _mm512_setzero_si512());
1192
    alignas(64) uint64_t perm1[8] = {0, 1, 8, 9, 2, 3, 10, 11};
1193
    alignas(64) uint64_t perm2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
1194
    return {SIMDVec32{_mm512_permutex2var_epi64(
1195
                lo, _mm512_load_si512((__m512i*)perm1), hi)},
1196
            SIMDVec32{_mm512_permutex2var_epi64(
1197
                lo, _mm512_load_si512((__m512i*)perm2), hi)}};
1198
  }
1199
  template <size_t i>
1200
  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
1201
    return SIMDVec16{_mm512_srai_epi16(vec, i)};
1202
  }
1203
1204
  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
1205
    __m256i bytes = _mm256_loadu_si256((__m256i*)data);
1206
    return {SIMDVec16{_mm512_cvtepu8_epi16(bytes)}};
1207
  }
1208
  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
1209
    return {Load((const uint16_t*)data)};
1210
  }
1211
1212
  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
1213
    __m512i bytes = _mm512_loadu_si512((__m512i*)data);
1214
    __m512i gray = _mm512_and_si512(bytes, _mm512_set1_epi16(0xFF));
1215
    __m512i alpha = _mm512_srli_epi16(bytes, 8);
1216
    return {SIMDVec16{gray}, SIMDVec16{alpha}};
1217
  }
1218
  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
1219
    __m512i bytes1 = _mm512_loadu_si512((__m512i*)data);
1220
    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 64));
1221
    __m512i g_mask = _mm512_set1_epi32(0xFFFF);
1222
    __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
1223
    __m512i g = _mm512_permutexvar_epi64(
1224
        permuteidx, _mm512_packus_epi32(_mm512_and_si512(bytes1, g_mask),
1225
                                        _mm512_and_si512(bytes2, g_mask)));
1226
    __m512i a = _mm512_permutexvar_epi64(
1227
        permuteidx, _mm512_packus_epi32(_mm512_srli_epi32(bytes1, 16),
1228
                                        _mm512_srli_epi32(bytes2, 16)));
1229
    return {SIMDVec16{g}, SIMDVec16{a}};
1230
  }
1231
1232
  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
1233
    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
1234
    __m512i bytes1 =
1235
        _mm512_zextsi256_si512(_mm256_loadu_si256((__m256i*)(data + 64)));
1236
1237
    // 0x7A = element of upper half of second vector = 0 after lookup; still in
1238
    // the upper half once we add 1 or 2.
1239
    uint8_t z = 0x7A;
1240
    __m512i ridx =
1241
        _mm512_set_epi8(z, 93, z, 90, z, 87, z, 84, z, 81, z, 78, z, 75, z, 72,
1242
                        z, 69, z, 66, z, 63, z, 60, z, 57, z, 54, z, 51, z, 48,
1243
                        z, 45, z, 42, z, 39, z, 36, z, 33, z, 30, z, 27, z, 24,
1244
                        z, 21, z, 18, z, 15, z, 12, z, 9, z, 6, z, 3, z, 0);
1245
    __m512i gidx = _mm512_add_epi8(ridx, _mm512_set1_epi8(1));
1246
    __m512i bidx = _mm512_add_epi8(gidx, _mm512_set1_epi8(1));
1247
    __m512i r = _mm512_permutex2var_epi8(bytes0, ridx, bytes1);
1248
    __m512i g = _mm512_permutex2var_epi8(bytes0, gidx, bytes1);
1249
    __m512i b = _mm512_permutex2var_epi8(bytes0, bidx, bytes1);
1250
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}};
1251
  }
1252
  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
1253
    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
1254
    __m512i bytes1 = _mm512_loadu_si512((__m512i*)(data + 64));
1255
    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 128));
1256
1257
    __m512i ridx_lo = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 63, 60, 57,
1258
                                       54, 51, 48, 45, 42, 39, 36, 33, 30, 27,
1259
                                       24, 21, 18, 15, 12, 9, 6, 3, 0);
1260
    // -1 is such that when adding 1 or 2, we get the correct index for
1261
    // green/blue.
1262
    __m512i ridx_hi =
1263
        _mm512_set_epi16(29, 26, 23, 20, 17, 14, 11, 8, 5, 2, -1, 0, 0, 0, 0, 0,
1264
                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
1265
    __m512i gidx_lo = _mm512_add_epi16(ridx_lo, _mm512_set1_epi16(1));
1266
    __m512i gidx_hi = _mm512_add_epi16(ridx_hi, _mm512_set1_epi16(1));
1267
    __m512i bidx_lo = _mm512_add_epi16(gidx_lo, _mm512_set1_epi16(1));
1268
    __m512i bidx_hi = _mm512_add_epi16(gidx_hi, _mm512_set1_epi16(1));
1269
1270
    __mmask32 rmask = _cvtu32_mask32(0b11111111110000000000000000000000);
1271
    __mmask32 gbmask = _cvtu32_mask32(0b11111111111000000000000000000000);
1272
1273
    __m512i rlo = _mm512_permutex2var_epi16(bytes0, ridx_lo, bytes1);
1274
    __m512i glo = _mm512_permutex2var_epi16(bytes0, gidx_lo, bytes1);
1275
    __m512i blo = _mm512_permutex2var_epi16(bytes0, bidx_lo, bytes1);
1276
    __m512i r = _mm512_mask_permutexvar_epi16(rlo, rmask, ridx_hi, bytes2);
1277
    __m512i g = _mm512_mask_permutexvar_epi16(glo, gbmask, gidx_hi, bytes2);
1278
    __m512i b = _mm512_mask_permutexvar_epi16(blo, gbmask, bidx_hi, bytes2);
1279
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}};
1280
  }
1281
1282
  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
1283
    __m512i bytes1 = _mm512_loadu_si512((__m512i*)data);
1284
    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 64));
1285
    __m512i rg_mask = _mm512_set1_epi32(0xFFFF);
1286
    __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
1287
    __m512i rg = _mm512_permutexvar_epi64(
1288
        permuteidx, _mm512_packus_epi32(_mm512_and_si512(bytes1, rg_mask),
1289
                                        _mm512_and_si512(bytes2, rg_mask)));
1290
    __m512i b_a = _mm512_permutexvar_epi64(
1291
        permuteidx, _mm512_packus_epi32(_mm512_srli_epi32(bytes1, 16),
1292
                                        _mm512_srli_epi32(bytes2, 16)));
1293
    __m512i r = _mm512_and_si512(rg, _mm512_set1_epi16(0xFF));
1294
    __m512i g = _mm512_srli_epi16(rg, 8);
1295
    __m512i b = _mm512_and_si512(b_a, _mm512_set1_epi16(0xFF));
1296
    __m512i a = _mm512_srli_epi16(b_a, 8);
1297
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
1298
  }
1299
  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
1300
    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
1301
    __m512i bytes1 = _mm512_loadu_si512((__m512i*)(data + 64));
1302
    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 128));
1303
    __m512i bytes3 = _mm512_loadu_si512((__m512i*)(data + 192));
1304
1305
    auto pack32 = [](__m512i a, __m512i b) {
1306
      __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
1307
      return _mm512_permutexvar_epi64(permuteidx, _mm512_packus_epi32(a, b));
1308
    };
1309
    auto packlow32 = [&pack32](__m512i a, __m512i b) {
1310
      __m512i mask = _mm512_set1_epi32(0xFFFF);
1311
      return pack32(_mm512_and_si512(a, mask), _mm512_and_si512(b, mask));
1312
    };
1313
    auto packhi32 = [&pack32](__m512i a, __m512i b) {
1314
      return pack32(_mm512_srli_epi32(a, 16), _mm512_srli_epi32(b, 16));
1315
    };
1316
1317
    __m512i rb0 = packlow32(bytes0, bytes1);
1318
    __m512i rb1 = packlow32(bytes2, bytes3);
1319
    __m512i ga0 = packhi32(bytes0, bytes1);
1320
    __m512i ga1 = packhi32(bytes2, bytes3);
1321
1322
    __m512i r = packlow32(rb0, rb1);
1323
    __m512i g = packlow32(ga0, ga1);
1324
    __m512i b = packhi32(rb0, rb1);
1325
    __m512i a = packhi32(ga0, ga1);
1326
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
1327
  }
1328
1329
  void SwapEndian() {
1330
    auto indices = _mm512_broadcast_i32x4(
1331
        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
1332
    vec = _mm512_shuffle_epi8(vec, indices);
1333
  }
1334
};
1335
1336
SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
1337
                             const SIMDVec16& if_false) {
1338
  return SIMDVec16{_mm512_mask_blend_epi16(mask, if_false.vec, if_true.vec)};
1339
}
1340
1341
SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
1342
                             const SIMDVec32& if_false) {
1343
  return SIMDVec32{_mm512_mask_blend_epi32(mask, if_false.vec, if_true.vec)};
1344
}
1345
1346
struct Bits64 {
1347
  static constexpr size_t kLanes = 8;
1348
1349
  __m512i nbits;
1350
  __m512i bits;
1351
1352
  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
1353
    _mm512_storeu_si512((__m512i*)nbits_out, nbits);
1354
    _mm512_storeu_si512((__m512i*)bits_out, bits);
1355
  }
1356
};
1357
1358
struct Bits32 {
1359
  __m512i nbits;
1360
  __m512i bits;
1361
1362
  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
1363
    return Bits32{nbits.vec, bits.vec};
1364
  }
1365
1366
  Bits64 Merge() const {
1367
    auto nbits_hi32 = _mm512_srli_epi64(nbits, 32);
1368
    auto nbits_lo32 = _mm512_and_si512(nbits, _mm512_set1_epi64(0xFFFFFFFF));
1369
    auto bits_hi32 = _mm512_srli_epi64(bits, 32);
1370
    auto bits_lo32 = _mm512_and_si512(bits, _mm512_set1_epi64(0xFFFFFFFF));
1371
1372
    auto nbits64 = _mm512_add_epi64(nbits_hi32, nbits_lo32);
1373
    auto bits64 =
1374
        _mm512_or_si512(_mm512_sllv_epi64(bits_hi32, nbits_lo32), bits_lo32);
1375
    return Bits64{nbits64, bits64};
1376
  }
1377
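The Merge step above concatenates each pair of adjacent variable-length bit strings into a single 64-bit slot. A scalar model of one pair, assuming the combined length fits in 64 bits:

#include <cstdint>

// Illustrative scalar model of the 32 -> 64 bit merge (not libjxl API).
void Merge32To64(uint32_t nbits_lo, uint32_t bits_lo, uint32_t nbits_hi,
                 uint32_t bits_hi, uint32_t* nbits64, uint64_t* bits64) {
  *nbits64 = nbits_lo + nbits_hi;                       // combined length
  *bits64 = (uint64_t(bits_hi) << nbits_lo) | bits_lo;  // hi lane on top
}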
1378
  void Interleave(const Bits32& low) {
1379
    bits = _mm512_or_si512(_mm512_sllv_epi32(bits, low.nbits), low.bits);
1380
    nbits = _mm512_add_epi32(nbits, low.nbits);
1381
  }
1382
1383
  void ClipTo(size_t n) {
1384
    n = std::min<size_t>(n, 16);
1385
    constexpr uint32_t kMask[32] = {
1386
        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
1387
        ~0u, ~0u, ~0u, ~0u, ~0u, 0,   0,   0,   0,   0,   0,
1388
        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
1389
    };
1390
    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 16 - n));
1391
    nbits = _mm512_and_si512(mask, nbits);
1392
    bits = _mm512_and_si512(mask, bits);
1393
  }
1394
  void Skip(size_t n) {
1395
    n = std::min<size_t>(n, 16);
1396
    constexpr uint32_t kMask[32] = {
1397
        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
1398
        0,   0,   0,   0,   0,   ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
1399
        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
1400
    };
1401
    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 16 - n));
1402
    nbits = _mm512_and_si512(mask, nbits);
1403
    bits = _mm512_and_si512(mask, bits);
1404
  }
1405
};
1406
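ClipTo and Skip both rely on the same sliding-window idea: an unaligned load into a constant array of kLanes all-ones entries followed by kLanes zeros yields a per-lane mask selecting exactly the wanted prefix (or, for Skip, suffix). A minimal scalar sketch of the prefix case, with a hypothetical helper name:

#include <cstddef>
#include <cstdint>
#include <cstring>

// FirstNMask is a hypothetical helper, not libjxl API.
void FirstNMask(size_t n, uint32_t out[16]) {
  static constexpr uint32_t kMask[32] = {
      ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
      ~0u, ~0u, 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0};
  // Reading 16 entries starting at (16 - n) yields n ones followed by zeros.
  std::memcpy(out, kMask + 16 - n, 16 * sizeof(uint32_t));
}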
1407
struct Bits16 {
1408
  __m512i nbits;
1409
  __m512i bits;
1410
1411
  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
1412
    return Bits16{nbits.vec, bits.vec};
1413
  }
1414
1415
  Bits32 Merge() const {
1416
    auto nbits_hi16 = _mm512_srli_epi32(nbits, 16);
1417
    auto nbits_lo16 = _mm512_and_si512(nbits, _mm512_set1_epi32(0xFFFF));
1418
    auto bits_hi16 = _mm512_srli_epi32(bits, 16);
1419
    auto bits_lo16 = _mm512_and_si512(bits, _mm512_set1_epi32(0xFFFF));
1420
1421
    auto nbits32 = _mm512_add_epi32(nbits_hi16, nbits_lo16);
1422
    auto bits32 =
1423
        _mm512_or_si512(_mm512_sllv_epi32(bits_hi16, nbits_lo16), bits_lo16);
1424
    return Bits32{nbits32, bits32};
1425
  }
1426
1427
  void Interleave(const Bits16& low) {
1428
    bits = _mm512_or_si512(_mm512_sllv_epi16(bits, low.nbits), low.bits);
1429
    nbits = _mm512_add_epi16(nbits, low.nbits);
1430
  }
1431
1432
  void ClipTo(size_t n) {
1433
    n = std::min<size_t>(n, 32);
1434
    constexpr uint16_t kMask[64] = {
1435
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1436
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1437
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1438
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1439
        0,      0,      0,      0,      0,      0,      0,      0,
1440
        0,      0,      0,      0,      0,      0,      0,      0,
1441
        0,      0,      0,      0,      0,      0,      0,      0,
1442
        0,      0,      0,      0,      0,      0,      0,      0,
1443
    };
1444
    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 32 - n));
1445
    nbits = _mm512_and_si512(mask, nbits);
1446
    bits = _mm512_and_si512(mask, bits);
1447
  }
1448
  void Skip(size_t n) {
1449
    n = std::min<size_t>(n, 32);
1450
    constexpr uint16_t kMask[64] = {
1451
        0,      0,      0,      0,      0,      0,      0,      0,
1452
        0,      0,      0,      0,      0,      0,      0,      0,
1453
        0,      0,      0,      0,      0,      0,      0,      0,
1454
        0,      0,      0,      0,      0,      0,      0,      0,
1455
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1456
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1457
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1458
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1459
    };
1460
    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 32 - n));
1461
    nbits = _mm512_and_si512(mask, nbits);
1462
    bits = _mm512_and_si512(mask, bits);
1463
  }
1464
};
1465
1466
#endif
1467
1468
#ifdef FJXL_AVX2
1469
#define FJXL_GENERIC_SIMD
1470
1471
struct SIMDVec32;
1472
1473
struct Mask32 {
1474
  __m256i mask;
1475
  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
1476
0
  size_t CountPrefix() const {
1477
0
    return CtzNonZero(~static_cast<uint64_t>(
1478
0
        static_cast<uint8_t>(_mm256_movemask_ps(_mm256_castsi256_ps(mask)))));
1479
0
  }
1480
};
1481
1482
struct SIMDVec32 {
1483
  __m256i vec;
1484
1485
  static constexpr size_t kLanes = 8;
1486
1487
0
  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
1488
0
    return SIMDVec32{_mm256_loadu_si256((__m256i*)data)};
1489
0
  }
1490
0
  FJXL_INLINE void Store(uint32_t* data) {
1491
0
    _mm256_storeu_si256((__m256i*)data, vec);
1492
0
  }
1493
0
  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
1494
0
    return SIMDVec32{_mm256_set1_epi32(v)};
1495
0
  }
1496
0
  FJXL_INLINE SIMDVec32 ValToToken() const {
1497
0
    auto f32 = _mm256_castps_si256(_mm256_cvtepi32_ps(vec));
1498
0
    return SIMDVec32{_mm256_max_epi32(
1499
0
        _mm256_setzero_si256(),
1500
0
        _mm256_sub_epi32(_mm256_srli_epi32(f32, 23), _mm256_set1_epi32(126)))};
1501
0
  }
1502
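ValToToken above extracts the bit width of each lane by going through the float exponent: converting v to float and reading the exponent field gives floor(log2 v) + 1, clamped to zero for v == 0. A scalar sketch; this is exact for values below 2^24, where int-to-float conversion cannot round the exponent up, and the residuals here appear to stay well below that:

#include <cstdint>
#include <cstring>

// Illustrative scalar equivalent of the float-exponent trick.
uint32_t ValToTokenScalar(uint32_t v) {
  float f = static_cast<float>(v);                 // cvtepi32_ps
  uint32_t repr;
  std::memcpy(&repr, &f, sizeof(repr));            // bit-cast, as castps_si256
  int token = static_cast<int>(repr >> 23) - 126;  // exponent - (127 - 1)
  return token < 0 ? 0 : token;                    // max with zero, as above
}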
0
  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
1503
0
    return SIMDVec32{_mm256_sub_epi32(_mm256_max_epu32(vec, to_subtract.vec),
1504
0
                                      to_subtract.vec)};
1505
0
  }
1506
0
  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
1507
0
    return SIMDVec32{_mm256_sub_epi32(vec, to_subtract.vec)};
1508
0
  }
1509
0
  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
1510
0
    return SIMDVec32{_mm256_add_epi32(vec, oth.vec)};
1511
0
  }
1512
0
  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
1513
0
    return SIMDVec32{_mm256_xor_si256(vec, oth.vec)};
1514
0
  }
1515
0
  FJXL_INLINE SIMDVec32 Pow2() const {
1516
0
    return SIMDVec32{_mm256_sllv_epi32(_mm256_set1_epi32(1), vec)};
1517
0
  }
1518
0
  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
1519
0
    return Mask32{_mm256_cmpeq_epi32(vec, oth.vec)};
1520
0
  }
1521
0
  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
1522
0
    return Mask32{_mm256_cmpgt_epi32(vec, oth.vec)};
1523
0
  }
1524
  template <size_t i>
1525
0
  FJXL_INLINE SIMDVec32 SignedShiftRight() const {
1526
0
    return SIMDVec32{_mm256_srai_epi32(vec, i)};
1527
0
  }
1528
};
1529
1530
struct SIMDVec16;
1531
1532
struct Mask16 {
1533
  __m256i mask;
1534
  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
1535
0
  Mask16 And(const Mask16& oth) const {
1536
0
    return Mask16{_mm256_and_si256(mask, oth.mask)};
1537
0
  }
1538
0
  size_t CountPrefix() const {
1539
0
    return CtzNonZero(~static_cast<uint64_t>(
1540
0
               static_cast<uint32_t>(_mm256_movemask_epi8(mask)))) /
1541
0
           2;
1542
0
  }
1543
};
1544
1545
struct SIMDVec16 {
1546
  __m256i vec;
1547
1548
  static constexpr size_t kLanes = 16;
1549
1550
0
  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
1551
0
    return SIMDVec16{_mm256_loadu_si256((__m256i*)data)};
1552
0
  }
1553
0
  FJXL_INLINE void Store(uint16_t* data) {
1554
0
    _mm256_storeu_si256((__m256i*)data, vec);
1555
0
  }
1556
0
  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
1557
0
    return SIMDVec16{_mm256_set1_epi16(v)};
1558
0
  }
1559
  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
1560
0
                                         const SIMDVec32& hi) {
1561
0
    auto tmp = _mm256_packus_epi32(lo.vec, hi.vec);
1562
0
    return SIMDVec16{_mm256_permute4x64_epi64(tmp, 0b11011000)};
1563
0
  }
1564
1565
0
  FJXL_INLINE SIMDVec16 ValToToken() const {
1566
0
    auto nibble0 =
1567
0
        _mm256_or_si256(_mm256_and_si256(vec, _mm256_set1_epi16(0xF)),
1568
0
                        _mm256_set1_epi16(0xFF00));
1569
0
    auto nibble1 = _mm256_or_si256(
1570
0
        _mm256_and_si256(_mm256_srli_epi16(vec, 4), _mm256_set1_epi16(0xF)),
1571
0
        _mm256_set1_epi16(0xFF00));
1572
0
    auto nibble2 = _mm256_or_si256(
1573
0
        _mm256_and_si256(_mm256_srli_epi16(vec, 8), _mm256_set1_epi16(0xF)),
1574
0
        _mm256_set1_epi16(0xFF00));
1575
0
    auto nibble3 =
1576
0
        _mm256_or_si256(_mm256_srli_epi16(vec, 12), _mm256_set1_epi16(0xFF00));
1577
1578
0
    auto lut0 = _mm256_broadcastsi128_si256(
1579
0
        _mm_setr_epi8(0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4));
1580
0
    auto lut1 = _mm256_broadcastsi128_si256(
1581
0
        _mm_setr_epi8(0, 5, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8));
1582
0
    auto lut2 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
1583
0
        0, 9, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12));
1584
0
    auto lut3 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
1585
0
        0, 13, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16));
1586
1587
0
    auto token0 = _mm256_shuffle_epi8(lut0, nibble0);
1588
0
    auto token1 = _mm256_shuffle_epi8(lut1, nibble1);
1589
0
    auto token2 = _mm256_shuffle_epi8(lut2, nibble2);
1590
0
    auto token3 = _mm256_shuffle_epi8(lut3, nibble3);
1591
1592
0
    auto token = _mm256_max_epi16(_mm256_max_epi16(token0, token1),
1593
0
                                  _mm256_max_epi16(token2, token3));
1594
0
    return SIMDVec16{token};
1595
0
  }
1596
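This 16-bit ValToToken has no float path; instead it splits each value into four nibbles, looks each one up in a PSHUFB table that reports the position of that nibble's highest set bit within the full 16-bit value, and takes the maximum. An equivalent scalar model:

#include <algorithm>
#include <cstdint>

// Illustrative scalar model of the nibble-LUT bit-width computation.
uint16_t ValToToken16Scalar(uint16_t v) {
  // lut[i][n] = 4 * i + bitwidth(n) for n > 0, else 0 -- the same tables as
  // the broadcast lut0..lut3 above.
  static constexpr uint8_t lut[4][16] = {
      {0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4},
      {0, 5, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8},
      {0, 9, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12},
      {0, 13, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16}};
  uint16_t token = 0;
  for (int i = 0; i < 4; i++) {
    token = std::max<uint16_t>(token, lut[i][(v >> (4 * i)) & 0xF]);
  }
  return token;
}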
1597
0
  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
1598
0
    return SIMDVec16{_mm256_subs_epu16(vec, to_subtract.vec)};
1599
0
  }
1600
0
  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
1601
0
    return SIMDVec16{_mm256_sub_epi16(vec, to_subtract.vec)};
1602
0
  }
1603
0
  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
1604
0
    return SIMDVec16{_mm256_add_epi16(vec, oth.vec)};
1605
0
  }
1606
0
  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
1607
0
    return SIMDVec16{_mm256_min_epu16(vec, oth.vec)};
1608
0
  }
1609
0
  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
1610
0
    return Mask16{_mm256_cmpeq_epi16(vec, oth.vec)};
1611
0
  }
1612
0
  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
1613
0
    return Mask16{_mm256_cmpgt_epi16(vec, oth.vec)};
1614
0
  }
1615
0
  FJXL_INLINE SIMDVec16 Pow2() const {
1616
0
    auto pow2_lo_lut = _mm256_broadcastsi128_si256(
1617
0
        _mm_setr_epi8(1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6,
1618
0
                      1u << 7, 0, 0, 0, 0, 0, 0, 0, 0));
1619
0
    auto pow2_hi_lut = _mm256_broadcastsi128_si256(
1620
0
        _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1 << 0, 1 << 1, 1 << 2, 1 << 3,
1621
0
                      1 << 4, 1 << 5, 1 << 6, 1u << 7));
1622
1623
0
    auto masked = _mm256_or_si256(vec, _mm256_set1_epi16(0xFF00));
1624
1625
0
    auto pow2_lo = _mm256_shuffle_epi8(pow2_lo_lut, masked);
1626
0
    auto pow2_hi = _mm256_shuffle_epi8(pow2_hi_lut, masked);
1627
1628
0
    auto pow2 = _mm256_or_si256(_mm256_slli_epi16(pow2_hi, 8), pow2_lo);
1629
0
    return SIMDVec16{pow2};
1630
0
  }
1631
0
  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
1632
0
    return SIMDVec16{_mm256_or_si256(vec, oth.vec)};
1633
0
  }
1634
0
  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
1635
0
    return SIMDVec16{_mm256_xor_si256(vec, oth.vec)};
1636
0
  }
1637
0
  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
1638
0
    return SIMDVec16{_mm256_and_si256(vec, oth.vec)};
1639
0
  }
1640
0
  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
1641
0
    return SIMDVec16{_mm256_srai_epi16(_mm256_add_epi16(vec, oth.vec), 1)};
1642
0
  }
1643
0
  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
1644
0
    return SIMDVec16{_mm256_or_si256(vec, _mm256_set1_epi16(0xFF00))};
1645
0
  }
1646
0
  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
1647
0
    return SIMDVec16{_mm256_shuffle_epi8(
1648
0
        _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)table)), vec)};
1649
0
  }
1650
0
  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
1651
0
    auto v02 = _mm256_unpacklo_epi16(low.vec, vec);
1652
0
    auto v13 = _mm256_unpackhi_epi16(low.vec, vec);
1653
0
    return {SIMDVec16{_mm256_permute2x128_si256(v02, v13, 0x20)},
1654
0
            SIMDVec16{_mm256_permute2x128_si256(v02, v13, 0x31)}};
1655
0
  }
1656
0
  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
1657
0
    auto v02 = _mm256_unpacklo_epi16(vec, _mm256_setzero_si256());
1658
0
    auto v13 = _mm256_unpackhi_epi16(vec, _mm256_setzero_si256());
1659
0
    return {SIMDVec32{_mm256_permute2x128_si256(v02, v13, 0x20)},
1660
0
            SIMDVec32{_mm256_permute2x128_si256(v02, v13, 0x31)}};
1661
0
  }
1662
  template <size_t i>
1663
0
  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
1664
0
    return SIMDVec16{_mm256_srai_epi16(vec, i)};
1665
0
  }
1666
1667
0
  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
1668
0
    __m128i bytes = _mm_loadu_si128((__m128i*)data);
1669
0
    return {SIMDVec16{_mm256_cvtepu8_epi16(bytes)}};
1670
0
  }
1671
0
  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
1672
0
    return {Load((const uint16_t*)data)};
1673
0
  }
1674
1675
0
  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
1676
0
    __m256i bytes = _mm256_loadu_si256((__m256i*)data);
1677
0
    __m256i gray = _mm256_and_si256(bytes, _mm256_set1_epi16(0xFF));
1678
0
    __m256i alpha = _mm256_srli_epi16(bytes, 8);
1679
0
    return {SIMDVec16{gray}, SIMDVec16{alpha}};
1680
0
  }
1681
0
  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
1682
0
    __m256i bytes1 = _mm256_loadu_si256((__m256i*)data);
1683
0
    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 32));
1684
0
    __m256i g_mask = _mm256_set1_epi32(0xFFFF);
1685
0
    __m256i g = _mm256_permute4x64_epi64(
1686
0
        _mm256_packus_epi32(_mm256_and_si256(bytes1, g_mask),
1687
0
                            _mm256_and_si256(bytes2, g_mask)),
1688
0
        0b11011000);
1689
0
    __m256i a = _mm256_permute4x64_epi64(
1690
0
        _mm256_packus_epi32(_mm256_srli_epi32(bytes1, 16),
1691
0
                            _mm256_srli_epi32(bytes2, 16)),
1692
0
        0b11011000);
1693
0
    return {SIMDVec16{g}, SIMDVec16{a}};
1694
0
  }
1695
1696
0
  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
1697
0
    __m128i bytes0 = _mm_loadu_si128((__m128i*)data);
1698
0
    __m128i bytes1 = _mm_loadu_si128((__m128i*)(data + 16));
1699
0
    __m128i bytes2 = _mm_loadu_si128((__m128i*)(data + 32));
1700
1701
0
    __m128i idx =
1702
0
        _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
1703
1704
0
    __m128i r6b5g5_0 = _mm_shuffle_epi8(bytes0, idx);
1705
0
    __m128i g6r5b5_1 = _mm_shuffle_epi8(bytes1, idx);
1706
0
    __m128i b6g5r5_2 = _mm_shuffle_epi8(bytes2, idx);
1707
1708
0
    __m128i mask010 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF,
1709
0
                                    0xFF, 0, 0, 0, 0, 0);
1710
0
    __m128i mask001 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF,
1711
0
                                    0xFF, 0xFF, 0xFF);
1712
1713
0
    __m128i b2g2b1 = _mm_blendv_epi8(b6g5r5_2, g6r5b5_1, mask001);
1714
0
    __m128i b2b0b1 = _mm_blendv_epi8(b2g2b1, r6b5g5_0, mask010);
1715
1716
0
    __m128i r0r1b1 = _mm_blendv_epi8(r6b5g5_0, g6r5b5_1, mask010);
1717
0
    __m128i r0r1r2 = _mm_blendv_epi8(r0r1b1, b6g5r5_2, mask001);
1718
1719
0
    __m128i g1r1g0 = _mm_blendv_epi8(g6r5b5_1, r6b5g5_0, mask001);
1720
0
    __m128i g1g2g0 = _mm_blendv_epi8(g1r1g0, b6g5r5_2, mask010);
1721
1722
0
    __m128i g0g1g2 = _mm_alignr_epi8(g1g2g0, g1g2g0, 11);
1723
0
    __m128i b0b1b2 = _mm_alignr_epi8(b2b0b1, b2b0b1, 6);
1724
1725
0
    return {SIMDVec16{_mm256_cvtepu8_epi16(r0r1r2)},
1726
0
            SIMDVec16{_mm256_cvtepu8_epi16(g0g1g2)},
1727
0
            SIMDVec16{_mm256_cvtepu8_epi16(b0b1b2)}};
1728
0
  }
1729
0
  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
1730
0
    auto load_and_split_lohi = [](const unsigned char* data) {
1731
      // LHLHLH...
1732
0
      __m256i bytes = _mm256_loadu_si256((__m256i*)data);
1733
      // L0L0L0...
1734
0
      __m256i lo = _mm256_and_si256(bytes, _mm256_set1_epi16(0xFF));
1735
      // H0H0H0...
1736
0
      __m256i hi = _mm256_srli_epi16(bytes, 8);
1737
      // LLLLLLLLHHHHHHHHLLLLLLLLHHHHHHHH
1738
0
      __m256i packed = _mm256_packus_epi16(lo, hi);
1739
0
      return _mm256_permute4x64_epi64(packed, 0b11011000);
1740
0
    };
1741
0
    __m256i bytes0 = load_and_split_lohi(data);
1742
0
    __m256i bytes1 = load_and_split_lohi(data + 32);
1743
0
    __m256i bytes2 = load_and_split_lohi(data + 64);
1744
1745
0
    __m256i idx = _mm256_broadcastsi128_si256(
1746
0
        _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13));
1747
1748
0
    __m256i r6b5g5_0 = _mm256_shuffle_epi8(bytes0, idx);
1749
0
    __m256i g6r5b5_1 = _mm256_shuffle_epi8(bytes1, idx);
1750
0
    __m256i b6g5r5_2 = _mm256_shuffle_epi8(bytes2, idx);
1751
1752
0
    __m256i mask010 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
1753
0
        0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0));
1754
0
    __m256i mask001 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
1755
0
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF));
1756
1757
0
    __m256i b2g2b1 = _mm256_blendv_epi8(b6g5r5_2, g6r5b5_1, mask001);
1758
0
    __m256i b2b0b1 = _mm256_blendv_epi8(b2g2b1, r6b5g5_0, mask010);
1759
1760
0
    __m256i r0r1b1 = _mm256_blendv_epi8(r6b5g5_0, g6r5b5_1, mask010);
1761
0
    __m256i r0r1r2 = _mm256_blendv_epi8(r0r1b1, b6g5r5_2, mask001);
1762
1763
0
    __m256i g1r1g0 = _mm256_blendv_epi8(g6r5b5_1, r6b5g5_0, mask001);
1764
0
    __m256i g1g2g0 = _mm256_blendv_epi8(g1r1g0, b6g5r5_2, mask010);
1765
1766
0
    __m256i g0g1g2 = _mm256_alignr_epi8(g1g2g0, g1g2g0, 11);
1767
0
    __m256i b0b1b2 = _mm256_alignr_epi8(b2b0b1, b2b0b1, 6);
1768
1769
    // Now r0r1r2, g0g1g2, b0b1b2 have the low bytes of the RGB pixels in their
1770
    // lower half, and the high bytes in their upper half.
1771
1772
0
    auto combine_low_hi = [](__m256i v) {
1773
0
      __m128i low = _mm256_extracti128_si256(v, 0);
1774
0
      __m128i hi = _mm256_extracti128_si256(v, 1);
1775
0
      __m256i low16 = _mm256_cvtepu8_epi16(low);
1776
0
      __m256i hi16 = _mm256_cvtepu8_epi16(hi);
1777
0
      return _mm256_or_si256(_mm256_slli_epi16(hi16, 8), low16);
1778
0
    };
1779
1780
0
    return {SIMDVec16{combine_low_hi(r0r1r2)},
1781
0
            SIMDVec16{combine_low_hi(g0g1g2)},
1782
0
            SIMDVec16{combine_low_hi(b0b1b2)}};
1783
0
  }
1784
1785
0
  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
1786
0
    __m256i bytes1 = _mm256_loadu_si256((__m256i*)data);
1787
0
    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 32));
1788
0
    __m256i rg_mask = _mm256_set1_epi32(0xFFFF);
1789
0
    __m256i rg = _mm256_permute4x64_epi64(
1790
0
        _mm256_packus_epi32(_mm256_and_si256(bytes1, rg_mask),
1791
0
                            _mm256_and_si256(bytes2, rg_mask)),
1792
0
        0b11011000);
1793
0
    __m256i b_a = _mm256_permute4x64_epi64(
1794
0
        _mm256_packus_epi32(_mm256_srli_epi32(bytes1, 16),
1795
0
                            _mm256_srli_epi32(bytes2, 16)),
1796
0
        0b11011000);
1797
0
    __m256i r = _mm256_and_si256(rg, _mm256_set1_epi16(0xFF));
1798
0
    __m256i g = _mm256_srli_epi16(rg, 8);
1799
0
    __m256i b = _mm256_and_si256(b_a, _mm256_set1_epi16(0xFF));
1800
0
    __m256i a = _mm256_srli_epi16(b_a, 8);
1801
0
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
1802
0
  }
1803
0
  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
1804
0
    __m256i bytes0 = _mm256_loadu_si256((__m256i*)data);
1805
0
    __m256i bytes1 = _mm256_loadu_si256((__m256i*)(data + 32));
1806
0
    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 64));
1807
0
    __m256i bytes3 = _mm256_loadu_si256((__m256i*)(data + 96));
1808
1809
0
    auto pack32 = [](__m256i a, __m256i b) {
1810
0
      return _mm256_permute4x64_epi64(_mm256_packus_epi32(a, b), 0b11011000);
1811
0
    };
1812
0
    auto packlow32 = [&pack32](__m256i a, __m256i b) {
1813
0
      __m256i mask = _mm256_set1_epi32(0xFFFF);
1814
0
      return pack32(_mm256_and_si256(a, mask), _mm256_and_si256(b, mask));
1815
0
    };
1816
0
    auto packhi32 = [&pack32](__m256i a, __m256i b) {
1817
0
      return pack32(_mm256_srli_epi32(a, 16), _mm256_srli_epi32(b, 16));
1818
0
    };
1819
1820
0
    __m256i rb0 = packlow32(bytes0, bytes1);
1821
0
    __m256i rb1 = packlow32(bytes2, bytes3);
1822
0
    __m256i ga0 = packhi32(bytes0, bytes1);
1823
0
    __m256i ga1 = packhi32(bytes2, bytes3);
1824
1825
0
    __m256i r = packlow32(rb0, rb1);
1826
0
    __m256i g = packlow32(ga0, ga1);
1827
0
    __m256i b = packhi32(rb0, rb1);
1828
0
    __m256i a = packhi32(ga0, ga1);
1829
0
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
1830
0
  }
1831
1832
0
  void SwapEndian() {
1833
0
    auto indices = _mm256_broadcastsi128_si256(
1834
0
        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
1835
0
    vec = _mm256_shuffle_epi8(vec, indices);
1836
0
  }
1837
};
1838
1839
SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
1840
0
                             const SIMDVec16& if_false) {
1841
0
  return SIMDVec16{_mm256_blendv_epi8(if_false.vec, if_true.vec, mask)};
1842
0
}
1843
1844
SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
1845
0
                             const SIMDVec32& if_false) {
1846
0
  return SIMDVec32{_mm256_blendv_epi8(if_false.vec, if_true.vec, mask)};
1847
0
}
1848
1849
struct Bits64 {
1850
  static constexpr size_t kLanes = 4;
1851
1852
  __m256i nbits;
1853
  __m256i bits;
1854
1855
0
  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
1856
0
    _mm256_storeu_si256((__m256i*)nbits_out, nbits);
1857
0
    _mm256_storeu_si256((__m256i*)bits_out, bits);
1858
0
  }
1859
};
1860
1861
struct Bits32 {
1862
  __m256i nbits;
1863
  __m256i bits;
1864
1865
0
  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
1866
0
    return Bits32{nbits.vec, bits.vec};
1867
0
  }
1868
1869
0
  Bits64 Merge() const {
1870
0
    auto nbits_hi32 = _mm256_srli_epi64(nbits, 32);
1871
0
    auto nbits_lo32 = _mm256_and_si256(nbits, _mm256_set1_epi64x(0xFFFFFFFF));
1872
0
    auto bits_hi32 = _mm256_srli_epi64(bits, 32);
1873
0
    auto bits_lo32 = _mm256_and_si256(bits, _mm256_set1_epi64x(0xFFFFFFFF));
1874
1875
0
    auto nbits64 = _mm256_add_epi64(nbits_hi32, nbits_lo32);
1876
0
    auto bits64 =
1877
0
        _mm256_or_si256(_mm256_sllv_epi64(bits_hi32, nbits_lo32), bits_lo32);
1878
0
    return Bits64{nbits64, bits64};
1879
0
  }
1880
1881
0
  void Interleave(const Bits32& low) {
1882
0
    bits = _mm256_or_si256(_mm256_sllv_epi32(bits, low.nbits), low.bits);
1883
0
    nbits = _mm256_add_epi32(nbits, low.nbits);
1884
0
  }
1885
1886
0
  void ClipTo(size_t n) {
1887
0
    n = std::min<size_t>(n, 8);
1888
0
    constexpr uint32_t kMask[16] = {
1889
0
        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0, 0, 0, 0, 0,
1890
0
    };
1891
0
    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 8 - n));
1892
0
    nbits = _mm256_and_si256(mask, nbits);
1893
0
    bits = _mm256_and_si256(mask, bits);
1894
0
  }
1895
0
  void Skip(size_t n) {
1896
0
    n = std::min<size_t>(n, 8);
1897
0
    constexpr uint32_t kMask[16] = {
1898
0
        0, 0, 0, 0, 0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
1899
0
    };
1900
0
    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 8 - n));
1901
0
    nbits = _mm256_and_si256(mask, nbits);
1902
0
    bits = _mm256_and_si256(mask, bits);
1903
0
  }
1904
};
1905
1906
struct Bits16 {
1907
  __m256i nbits;
1908
  __m256i bits;
1909
1910
0
  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
1911
0
    return Bits16{nbits.vec, bits.vec};
1912
0
  }
1913
1914
0
  Bits32 Merge() const {
1915
0
    auto nbits_hi16 = _mm256_srli_epi32(nbits, 16);
1916
0
    auto nbits_lo16 = _mm256_and_si256(nbits, _mm256_set1_epi32(0xFFFF));
1917
0
    auto bits_hi16 = _mm256_srli_epi32(bits, 16);
1918
0
    auto bits_lo16 = _mm256_and_si256(bits, _mm256_set1_epi32(0xFFFF));
1919
1920
0
    auto nbits32 = _mm256_add_epi32(nbits_hi16, nbits_lo16);
1921
0
    auto bits32 =
1922
0
        _mm256_or_si256(_mm256_sllv_epi32(bits_hi16, nbits_lo16), bits_lo16);
1923
0
    return Bits32{nbits32, bits32};
1924
0
  }
1925
1926
0
  void Interleave(const Bits16& low) {
1927
0
    auto pow2_lo_lut = _mm256_broadcastsi128_si256(
1928
0
        _mm_setr_epi8(1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6,
1929
0
                      1u << 7, 0, 0, 0, 0, 0, 0, 0, 0));
1930
0
    auto low_nbits_masked =
1931
0
        _mm256_or_si256(low.nbits, _mm256_set1_epi16(0xFF00));
1932
1933
0
    auto bits_shifted = _mm256_mullo_epi16(
1934
0
        bits, _mm256_shuffle_epi8(pow2_lo_lut, low_nbits_masked));
1935
1936
0
    nbits = _mm256_add_epi16(nbits, low.nbits);
1937
0
    bits = _mm256_or_si256(bits_shifted, low.bits);
1938
0
  }
1939
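AVX2 has no per-lane variable 16-bit shift (vpsllvw requires AVX-512BW), so this Interleave emulates `bits << low.nbits` with a multiply: the byte LUT maps nbits in 0..7 to 2^nbits, which is safe because Huffman code lengths in this encoder stay below 8 (see the note in UpTo8Bits further down). A scalar restatement:

#include <cstdint>

// Illustrative: a left shift by n (n <= 7 here) as a multiply by 1 << n.
uint16_t ShiftViaMul(uint16_t bits, unsigned n) {
  return static_cast<uint16_t>(bits * (1u << n));  // same as bits << n
}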
1940
0
  void ClipTo(size_t n) {
1941
0
    n = std::min<size_t>(n, 16);
1942
0
    constexpr uint16_t kMask[32] = {
1943
0
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1944
0
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1945
0
        0,      0,      0,      0,      0,      0,      0,      0,
1946
0
        0,      0,      0,      0,      0,      0,      0,      0,
1947
0
    };
1948
0
    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 16 - n));
1949
0
    nbits = _mm256_and_si256(mask, nbits);
1950
0
    bits = _mm256_and_si256(mask, bits);
1951
0
  }
1952
1953
0
  void Skip(size_t n) {
1954
0
    n = std::min<size_t>(n, 16);
1955
0
    constexpr uint16_t kMask[32] = {
1956
0
        0,      0,      0,      0,      0,      0,      0,      0,
1957
0
        0,      0,      0,      0,      0,      0,      0,      0,
1958
0
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1959
0
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
1960
0
    };
1961
0
    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 16 - n));
1962
0
    nbits = _mm256_and_si256(mask, nbits);
1963
0
    bits = _mm256_and_si256(mask, bits);
1964
0
  }
1965
};
1966
1967
#endif
1968
1969
#ifdef FJXL_NEON
1970
#define FJXL_GENERIC_SIMD
1971
1972
struct SIMDVec32;
1973
1974
struct Mask32 {
1975
  uint32x4_t mask;
1976
  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
1977
  Mask32 And(const Mask32& oth) const {
1978
    return Mask32{vandq_u32(mask, oth.mask)};
1979
  }
1980
  size_t CountPrefix() const {
1981
    uint32_t val_unset[4] = {0, 1, 2, 3};
1982
    uint32_t val_set[4] = {4, 4, 4, 4};
1983
    uint32x4_t val = vbslq_u32(mask, vld1q_u32(val_set), vld1q_u32(val_unset));
1984
    return vminvq_u32(val);
1985
  }
1986
};
1987
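CountPrefix on NEON has no movemask, so it assigns each set lane the out-of-range value kLanes and each unset lane its own index; the horizontal minimum is then the index of the first unset lane, i.e. the number of leading set lanes. A scalar model (illustrative):

#include <cstddef>

size_t CountPrefixScalar(const bool mask[4]) {
  size_t min = 4;  // kLanes; the value assigned to set lanes
  for (size_t i = 0; i < 4; i++) {
    size_t v = mask[i] ? 4 : i;  // vbslq_u32(mask, val_set, val_unset)
    if (v < min) min = v;        // vminvq_u32
  }
  return min;                    // index of the first unset lane
}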
1988
struct SIMDVec32 {
1989
  uint32x4_t vec;
1990
1991
  static constexpr size_t kLanes = 4;
1992
1993
  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
1994
    return SIMDVec32{vld1q_u32(data)};
1995
  }
1996
  FJXL_INLINE void Store(uint32_t* data) { vst1q_u32(data, vec); }
1997
  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
1998
    return SIMDVec32{vdupq_n_u32(v)};
1999
  }
2000
  FJXL_INLINE SIMDVec32 ValToToken() const {
2001
    return SIMDVec32{vsubq_u32(vdupq_n_u32(32), vclzq_u32(vec))};
2002
  }
2003
  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
2004
    return SIMDVec32{vqsubq_u32(vec, to_subtract.vec)};
2005
  }
2006
  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
2007
    return SIMDVec32{vsubq_u32(vec, to_subtract.vec)};
2008
  }
2009
  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
2010
    return SIMDVec32{vaddq_u32(vec, oth.vec)};
2011
  }
2012
  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
2013
    return SIMDVec32{veorq_u32(vec, oth.vec)};
2014
  }
2015
  FJXL_INLINE SIMDVec32 Pow2() const {
2016
    return SIMDVec32{vshlq_u32(vdupq_n_u32(1), vreinterpretq_s32_u32(vec))};
2017
  }
2018
  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
2019
    return Mask32{vceqq_u32(vec, oth.vec)};
2020
  }
2021
  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
2022
    return Mask32{
2023
        vcgtq_s32(vreinterpretq_s32_u32(vec), vreinterpretq_s32_u32(oth.vec))};
2024
  }
2025
  template <size_t i>
2026
  FJXL_INLINE SIMDVec32 SignedShiftRight() const {
2027
    return SIMDVec32{
2028
        vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(vec), i))};
2029
  }
2030
};
2031
2032
struct SIMDVec16;
2033
2034
struct Mask16 {
2035
  uint16x8_t mask;
2036
  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
2037
  Mask16 And(const Mask16& oth) const {
2038
    return Mask16{vandq_u16(mask, oth.mask)};
2039
  }
2040
  size_t CountPrefix() const {
2041
    uint16_t val_unset[8] = {0, 1, 2, 3, 4, 5, 6, 7};
2042
    uint16_t val_set[8] = {8, 8, 8, 8, 8, 8, 8, 8};
2043
    uint16x8_t val = vbslq_u16(mask, vld1q_u16(val_set), vld1q_u16(val_unset));
2044
    return vminvq_u16(val);
2045
  }
2046
};
2047
2048
struct SIMDVec16 {
2049
  uint16x8_t vec;
2050
2051
  static constexpr size_t kLanes = 8;
2052
2053
  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
2054
    return SIMDVec16{vld1q_u16(data)};
2055
  }
2056
  FJXL_INLINE void Store(uint16_t* data) { vst1q_u16(data, vec); }
2057
  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
2058
    return SIMDVec16{vdupq_n_u16(v)};
2059
  }
2060
  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
2061
                                         const SIMDVec32& hi) {
2062
    return SIMDVec16{vmovn_high_u32(vmovn_u32(lo.vec), hi.vec)};
2063
  }
2064
2065
  FJXL_INLINE SIMDVec16 ValToToken() const {
2066
    return SIMDVec16{vsubq_u16(vdupq_n_u16(16), vclzq_u16(vec))};
2067
  }
2068
  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
2069
    return SIMDVec16{vqsubq_u16(vec, to_subtract.vec)};
2070
  }
2071
  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
2072
    return SIMDVec16{vsubq_u16(vec, to_subtract.vec)};
2073
  }
2074
  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
2075
    return SIMDVec16{vaddq_u16(vec, oth.vec)};
2076
  }
2077
  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
2078
    return SIMDVec16{vminq_u16(vec, oth.vec)};
2079
  }
2080
  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
2081
    return Mask16{vceqq_u16(vec, oth.vec)};
2082
  }
2083
  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
2084
    return Mask16{
2085
        vcgtq_s16(vreinterpretq_s16_u16(vec), vreinterpretq_s16_u16(oth.vec))};
2086
  }
2087
  FJXL_INLINE SIMDVec16 Pow2() const {
2088
    return SIMDVec16{vshlq_u16(vdupq_n_u16(1), vreinterpretq_s16_u16(vec))};
2089
  }
2090
  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
2091
    return SIMDVec16{vorrq_u16(vec, oth.vec)};
2092
  }
2093
  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
2094
    return SIMDVec16{veorq_u16(vec, oth.vec)};
2095
  }
2096
  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
2097
    return SIMDVec16{vandq_u16(vec, oth.vec)};
2098
  }
2099
  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
2100
    return SIMDVec16{vhaddq_u16(vec, oth.vec)};
2101
  }
2102
  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
2103
    return SIMDVec16{vorrq_u16(vec, vdupq_n_u16(0xFF00))};
2104
  }
2105
  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
2106
    uint8x16_t tbl = vld1q_u8(table);
2107
    uint8x16_t indices = vreinterpretq_u8_u16(vec);
2108
    return SIMDVec16{vreinterpretq_u16_u8(vqtbl1q_u8(tbl, indices))};
2109
  }
2110
  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
2111
    return {SIMDVec16{vzip1q_u16(low.vec, vec)},
2112
            SIMDVec16{vzip2q_u16(low.vec, vec)}};
2113
  }
2114
  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
2115
    uint32x4_t lo = vmovl_u16(vget_low_u16(vec));
2116
    uint32x4_t hi = vmovl_high_u16(vec);
2117
    return {SIMDVec32{lo}, SIMDVec32{hi}};
2118
  }
2119
  template <size_t i>
2120
  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
2121
    return SIMDVec16{
2122
        vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(vec), i))};
2123
  }
2124
2125
  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
2126
    uint8x8_t v = vld1_u8(data);
2127
    return {SIMDVec16{vmovl_u8(v)}};
2128
  }
2129
  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
2130
    return {Load((const uint16_t*)data)};
2131
  }
2132
2133
  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
2134
    uint8x8x2_t v = vld2_u8(data);
2135
    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])}};
2136
  }
2137
  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
2138
    uint16x8x2_t v = vld2q_u16((const uint16_t*)data);
2139
    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}};
2140
  }
2141
2142
  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
2143
    uint8x8x3_t v = vld3_u8(data);
2144
    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])},
2145
            SIMDVec16{vmovl_u8(v.val[2])}};
2146
  }
2147
  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
2148
    uint16x8x3_t v = vld3q_u16((const uint16_t*)data);
2149
    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}, SIMDVec16{v.val[2]}};
2150
  }
2151
2152
  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
2153
    uint8x8x4_t v = vld4_u8(data);
2154
    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])},
2155
            SIMDVec16{vmovl_u8(v.val[2])}, SIMDVec16{vmovl_u8(v.val[3])}};
2156
  }
2157
  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
2158
    uint16x8x4_t v = vld4q_u16((const uint16_t*)data);
2159
    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}, SIMDVec16{v.val[2]},
2160
            SIMDVec16{v.val[3]}};
2161
  }
2162
2163
  void SwapEndian() {
2164
    vec = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(vec)));
2165
  }
2166
};
2167
2168
SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
2169
                             const SIMDVec16& if_false) {
2170
  return SIMDVec16{vbslq_u16(mask, if_true.vec, if_false.vec)};
2171
}
2172
2173
SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
2174
                             const SIMDVec32& if_false) {
2175
  return SIMDVec32{vbslq_u32(mask, if_true.vec, if_false.vec)};
2176
}
2177
2178
struct Bits64 {
2179
  static constexpr size_t kLanes = 2;
2180
2181
  uint64x2_t nbits;
2182
  uint64x2_t bits;
2183
2184
  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
2185
    vst1q_u64(nbits_out, nbits);
2186
    vst1q_u64(bits_out, bits);
2187
  }
2188
};
2189
2190
struct Bits32 {
2191
  uint32x4_t nbits;
2192
  uint32x4_t bits;
2193
2194
  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
2195
    return Bits32{nbits.vec, bits.vec};
2196
  }
2197
2198
  Bits64 Merge() const {
2199
    // TODO(veluca): can probably be optimized.
2200
    uint64x2_t nbits_lo32 =
2201
        vandq_u64(vreinterpretq_u64_u32(nbits), vdupq_n_u64(0xFFFFFFFF));
2202
    uint64x2_t bits_hi32 =
2203
        vshlq_u64(vshrq_n_u64(vreinterpretq_u64_u32(bits), 32),
2204
                  vreinterpretq_s64_u64(nbits_lo32));
2205
    uint64x2_t bits_lo32 =
2206
        vandq_u64(vreinterpretq_u64_u32(bits), vdupq_n_u64(0xFFFFFFFF));
2207
    uint64x2_t nbits64 =
2208
        vsraq_n_u64(nbits_lo32, vreinterpretq_u64_u32(nbits), 32);
2209
    uint64x2_t bits64 = vorrq_u64(bits_hi32, bits_lo32);
2210
    return Bits64{nbits64, bits64};
2211
  }
2212
2213
  void Interleave(const Bits32& low) {
2214
    bits =
2215
        vorrq_u32(vshlq_u32(bits, vreinterpretq_s32_u32(low.nbits)), low.bits);
2216
    nbits = vaddq_u32(nbits, low.nbits);
2217
  }
2218
2219
  void ClipTo(size_t n) {
2220
    n = std::min<size_t>(n, 4);
2221
    constexpr uint32_t kMask[8] = {
2222
        ~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0,
2223
    };
2224
    uint32x4_t mask = vld1q_u32(kMask + 4 - n);
2225
    nbits = vandq_u32(mask, nbits);
2226
    bits = vandq_u32(mask, bits);
2227
  }
2228
  void Skip(size_t n) {
2229
    n = std::min<size_t>(n, 4);
2230
    constexpr uint32_t kMask[8] = {
2231
        0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u,
2232
    };
2233
    uint32x4_t mask = vld1q_u32(kMask + 4 - n);
2234
    nbits = vandq_u32(mask, nbits);
2235
    bits = vandq_u32(mask, bits);
2236
  }
2237
};
2238
2239
struct Bits16 {
2240
  uint16x8_t nbits;
2241
  uint16x8_t bits;
2242
2243
  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
2244
    return Bits16{nbits.vec, bits.vec};
2245
  }
2246
2247
  Bits32 Merge() const {
2248
    // TODO(veluca): can probably be optimized.
2249
    uint32x4_t nbits_lo16 =
2250
        vandq_u32(vreinterpretq_u32_u16(nbits), vdupq_n_u32(0xFFFF));
2251
    uint32x4_t bits_hi16 =
2252
        vshlq_u32(vshrq_n_u32(vreinterpretq_u32_u16(bits), 16),
2253
                  vreinterpretq_s32_u32(nbits_lo16));
2254
    uint32x4_t bits_lo16 =
2255
        vandq_u32(vreinterpretq_u32_u16(bits), vdupq_n_u32(0xFFFF));
2256
    uint32x4_t nbits32 =
2257
        vsraq_n_u32(nbits_lo16, vreinterpretq_u32_u16(nbits), 16);
2258
    uint32x4_t bits32 = vorrq_u32(bits_hi16, bits_lo16);
2259
    return Bits32{nbits32, bits32};
2260
  }
2261
2262
  void Interleave(const Bits16& low) {
2263
    bits =
2264
        vorrq_u16(vshlq_u16(bits, vreinterpretq_s16_u16(low.nbits)), low.bits);
2265
    nbits = vaddq_u16(nbits, low.nbits);
2266
  }
2267
2268
  void ClipTo(size_t n) {
2269
    n = std::min<size_t>(n, 8);
2270
    constexpr uint16_t kMask[16] = {
2271
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
2272
        0,      0,      0,      0,      0,      0,      0,      0,
2273
    };
2274
    uint16x8_t mask = vld1q_u16(kMask + 8 - n);
2275
    nbits = vandq_u16(mask, nbits);
2276
    bits = vandq_u16(mask, bits);
2277
  }
2278
  void Skip(size_t n) {
2279
    n = std::min<size_t>(n, 8);
2280
    constexpr uint16_t kMask[16] = {
2281
        0,      0,      0,      0,      0,      0,      0,      0,
2282
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
2283
    };
2284
    uint16x8_t mask = vld1q_u16(kMask + 8 - n);
2285
    nbits = vandq_u16(mask, nbits);
2286
    bits = vandq_u16(mask, bits);
2287
  }
2288
};
2289
2290
#endif
2291
2292
#ifdef FJXL_GENERIC_SIMD
2293
constexpr size_t SIMDVec32::kLanes;
2294
constexpr size_t SIMDVec16::kLanes;
2295
2296
// Each of these functions will process SIMDVec16::kLanes worth of values.
2297
2298
FJXL_INLINE void TokenizeSIMD(const uint16_t* residuals, uint16_t* token_out,
2299
0
                              uint16_t* nbits_out, uint16_t* bits_out) {
2300
0
  SIMDVec16 res = SIMDVec16::Load(residuals);
2301
0
  SIMDVec16 token = res.ValToToken();
2302
0
  SIMDVec16 nbits = token.SatSubU(SIMDVec16::Val(1));
2303
0
  SIMDVec16 bits = res.SatSubU(nbits.Pow2());
2304
0
  token.Store(token_out);
2305
0
  nbits.Store(nbits_out);
2306
0
  bits.Store(bits_out);
2307
0
}
2308
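The vector tokenization above implements, per lane, the same split as EncodeHybridUint000 further down: token = bit width of the residual, nbits = token - 1 (saturating), and bits = the residual with its implicit leading one removed. A scalar sketch with a worked value:

#include <cstdint>

// Illustrative scalar restatement of the per-lane tokenization.
void TokenizeScalar(uint16_t r, uint16_t* token, uint16_t* nbits,
                    uint16_t* bits) {
  uint16_t t = 0;
  for (uint32_t v = r; v != 0; v >>= 1) t++;  // bit width; 0 when r == 0
  *token = t;
  uint16_t nb = t > 0 ? t - 1 : 0;            // token.SatSubU(1)
  *nbits = nb;
  uint16_t pow2 = uint16_t(1) << nb;          // nbits.Pow2()
  *bits = r >= pow2 ? r - pow2 : 0;           // res.SatSubU(pow2)
}
// e.g. r = 13 (0b1101) -> token 4, nbits 3, bits 0b101 = 5.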
2309
FJXL_INLINE void TokenizeSIMD(const uint32_t* residuals, uint16_t* token_out,
2310
0
                              uint32_t* nbits_out, uint32_t* bits_out) {
2311
0
  static_assert(SIMDVec16::kLanes == 2 * SIMDVec32::kLanes,
2312
0
                "There should be twice more 16-bit lanes than 32-bit lanes");
2313
0
  SIMDVec32 res_lo = SIMDVec32::Load(residuals);
2314
0
  SIMDVec32 res_hi = SIMDVec32::Load(residuals + SIMDVec32::kLanes);
2315
0
  SIMDVec32 token_lo = res_lo.ValToToken();
2316
0
  SIMDVec32 token_hi = res_hi.ValToToken();
2317
0
  SIMDVec32 nbits_lo = token_lo.SatSubU(SIMDVec32::Val(1));
2318
0
  SIMDVec32 nbits_hi = token_hi.SatSubU(SIMDVec32::Val(1));
2319
0
  SIMDVec32 bits_lo = res_lo.SatSubU(nbits_lo.Pow2());
2320
0
  SIMDVec32 bits_hi = res_hi.SatSubU(nbits_hi.Pow2());
2321
0
  SIMDVec16 token = SIMDVec16::FromTwo32(token_lo, token_hi);
2322
0
  token.Store(token_out);
2323
0
  nbits_lo.Store(nbits_out);
2324
0
  nbits_hi.Store(nbits_out + SIMDVec32::kLanes);
2325
0
  bits_lo.Store(bits_out);
2326
0
  bits_hi.Store(bits_out + SIMDVec32::kLanes);
2327
0
}
2328
2329
FJXL_INLINE void HuffmanSIMDUpTo13(const uint16_t* tokens,
2330
                                   const uint8_t* raw_nbits_simd,
2331
                                   const uint8_t* raw_bits_simd,
2332
0
                                   uint16_t* nbits_out, uint16_t* bits_out) {
2333
0
  SIMDVec16 tok = SIMDVec16::Load(tokens).PrepareForU8Lookup();
2334
0
  tok.U8Lookup(raw_nbits_simd).Store(nbits_out);
2335
0
  tok.U8Lookup(raw_bits_simd).Store(bits_out);
2336
0
}
2337
2338
FJXL_INLINE void HuffmanSIMD14(const uint16_t* tokens,
2339
                               const uint8_t* raw_nbits_simd,
2340
                               const uint8_t* raw_bits_simd,
2341
0
                               uint16_t* nbits_out, uint16_t* bits_out) {
2342
0
  SIMDVec16 token_cap = SIMDVec16::Val(15);
2343
0
  SIMDVec16 tok = SIMDVec16::Load(tokens);
2344
0
  SIMDVec16 tok_index = tok.Min(token_cap).PrepareForU8Lookup();
2345
0
  SIMDVec16 huff_bits_pre = tok_index.U8Lookup(raw_bits_simd);
2346
  // Set the highest bit when token == 16; the Huffman code is constructed in
2347
  // such a way that the code for token 15 is the same as the code for 16,
2348
  // except for the highest bit.
2349
0
  Mask16 needs_high_bit = tok.Eq(SIMDVec16::Val(16));
2350
0
  SIMDVec16 huff_bits = needs_high_bit.IfThenElse(
2351
0
      huff_bits_pre.Or(SIMDVec16::Val(128)), huff_bits_pre);
2352
0
  huff_bits.Store(bits_out);
2353
0
  tok_index.U8Lookup(raw_nbits_simd).Store(nbits_out);
2354
0
}
2355
2356
FJXL_INLINE void HuffmanSIMDAbove14(const uint16_t* tokens,
2357
                                    const uint8_t* raw_nbits_simd,
2358
                                    const uint8_t* raw_bits_simd,
2359
0
                                    uint16_t* nbits_out, uint16_t* bits_out) {
2360
0
  SIMDVec16 tok = SIMDVec16::Load(tokens);
2361
  // We assume `tok` fits in a *signed* 16-bit integer.
2362
0
  Mask16 above = tok.Gt(SIMDVec16::Val(12));
2363
  // 13, 14 -> 13
2364
  // 15, 16 -> 14
2365
  // 17, 18 -> 15
2366
0
  SIMDVec16 remap_tok = above.IfThenElse(tok.HAdd(SIMDVec16::Val(13)), tok);
2367
0
  SIMDVec16 tok_index = remap_tok.PrepareForU8Lookup();
2368
0
  SIMDVec16 huff_bits_pre = tok_index.U8Lookup(raw_bits_simd);
2369
  // Set the highest bit when token == 14, 16, 18.
2370
0
  Mask16 needs_high_bit = above.And(tok.Eq(tok.And(SIMDVec16::Val(0xFFFE))));
2371
0
  SIMDVec16 huff_bits = needs_high_bit.IfThenElse(
2372
0
      huff_bits_pre.Or(SIMDVec16::Val(128)), huff_bits_pre);
2373
0
  huff_bits.Store(bits_out);
2374
0
  tok_index.U8Lookup(raw_nbits_simd).Store(nbits_out);
2375
0
}
2376
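Restated in scalar form (illustrative): tokens above 12 are folded pairwise onto one table slot via the halving add, and the dropped low bit of the token is recovered as the top bit of the Huffman code:

#include <cstdint>

uint16_t RemapToken(uint16_t tok) {
  return tok > 12 ? uint16_t((tok + 13) >> 1) : tok;  // tok.HAdd(Val(13))
}
bool NeedsHighBit(uint16_t tok) {
  return tok > 12 && (tok & 1) == 0;  // tokens 14, 16, 18
}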
2377
FJXL_INLINE void StoreSIMDUpTo8(const uint16_t* nbits_tok,
2378
                                const uint16_t* bits_tok,
2379
                                const uint16_t* nbits_huff,
2380
                                const uint16_t* bits_huff, size_t n,
2381
0
                                size_t skip, Bits32* bits_out) {
2382
0
  Bits16 bits =
2383
0
      Bits16::FromRaw(SIMDVec16::Load(nbits_tok), SIMDVec16::Load(bits_tok));
2384
0
  Bits16 huff_bits =
2385
0
      Bits16::FromRaw(SIMDVec16::Load(nbits_huff), SIMDVec16::Load(bits_huff));
2386
0
  bits.Interleave(huff_bits);
2387
0
  bits.ClipTo(n);
2388
0
  bits.Skip(skip);
2389
0
  bits_out[0] = bits.Merge();
2390
0
}
2391
2392
// Huffman and raw bits don't necessarily fit in a single u16 here.
2393
FJXL_INLINE void StoreSIMDUpTo14(const uint16_t* nbits_tok,
2394
                                 const uint16_t* bits_tok,
2395
                                 const uint16_t* nbits_huff,
2396
                                 const uint16_t* bits_huff, size_t n,
2397
0
                                 size_t skip, Bits32* bits_out) {
2398
0
  VecPair<SIMDVec16> bits =
2399
0
      SIMDVec16::Load(bits_tok).Interleave(SIMDVec16::Load(bits_huff));
2400
0
  VecPair<SIMDVec16> nbits =
2401
0
      SIMDVec16::Load(nbits_tok).Interleave(SIMDVec16::Load(nbits_huff));
2402
0
  Bits16 low = Bits16::FromRaw(nbits.low, bits.low);
2403
0
  Bits16 hi = Bits16::FromRaw(nbits.hi, bits.hi);
2404
0
  low.ClipTo(2 * n);
2405
0
  low.Skip(2 * skip);
2406
0
  hi.ClipTo(std::max(2 * n, SIMDVec16::kLanes) - SIMDVec16::kLanes);
2407
0
  hi.Skip(std::max(2 * skip, SIMDVec16::kLanes) - SIMDVec16::kLanes);
2408
2409
0
  bits_out[0] = low.Merge();
2410
0
  bits_out[1] = hi.Merge();
2411
0
}
2412
2413
FJXL_INLINE void StoreSIMDAbove14(const uint32_t* nbits_tok,
2414
                                  const uint32_t* bits_tok,
2415
                                  const uint16_t* nbits_huff,
2416
                                  const uint16_t* bits_huff, size_t n,
2417
0
                                  size_t skip, Bits32* bits_out) {
2418
0
  static_assert(SIMDVec16::kLanes == 2 * SIMDVec32::kLanes,
2419
0
                "There should be twice more 16-bit lanes than 32-bit lanes");
2420
0
  Bits32 bits_low =
2421
0
      Bits32::FromRaw(SIMDVec32::Load(nbits_tok), SIMDVec32::Load(bits_tok));
2422
0
  Bits32 bits_hi =
2423
0
      Bits32::FromRaw(SIMDVec32::Load(nbits_tok + SIMDVec32::kLanes),
2424
0
                      SIMDVec32::Load(bits_tok + SIMDVec32::kLanes));
2425
2426
0
  VecPair<SIMDVec32> huff_bits = SIMDVec16::Load(bits_huff).Upcast();
2427
0
  VecPair<SIMDVec32> huff_nbits = SIMDVec16::Load(nbits_huff).Upcast();
2428
2429
0
  Bits32 huff_low = Bits32::FromRaw(huff_nbits.low, huff_bits.low);
2430
0
  Bits32 huff_hi = Bits32::FromRaw(huff_nbits.hi, huff_bits.hi);
2431
2432
0
  bits_low.Interleave(huff_low);
2433
0
  bits_low.ClipTo(n);
2434
0
  bits_low.Skip(skip);
2435
0
  bits_out[0] = bits_low;
2436
0
  bits_hi.Interleave(huff_hi);
2437
0
  bits_hi.ClipTo(std::max(n, SIMDVec32::kLanes) - SIMDVec32::kLanes);
2438
0
  bits_hi.Skip(std::max(skip, SIMDVec32::kLanes) - SIMDVec32::kLanes);
2439
0
  bits_out[1] = bits_hi;
2440
0
}
2441
2442
#ifdef FJXL_AVX512
2443
FJXL_INLINE void StoreToWriterAVX512(const Bits32& bits32, BitWriter& output) {
2444
  __m512i bits = bits32.bits;
2445
  __m512i nbits = bits32.nbits;
2446
2447
  // Insert the leftover bits from the bit buffer at the bottom of the vector
2448
  // and extract the top of the vector.
2449
  uint64_t trail_bits =
2450
      _mm512_cvtsi512_si32(_mm512_alignr_epi32(bits, bits, 15));
2451
  uint64_t trail_nbits =
2452
      _mm512_cvtsi512_si32(_mm512_alignr_epi32(nbits, nbits, 15));
2453
  __m512i lead_bits = _mm512_set1_epi32(output.buffer);
2454
  __m512i lead_nbits = _mm512_set1_epi32(output.bits_in_buffer);
2455
  bits = _mm512_alignr_epi32(bits, lead_bits, 15);
2456
  nbits = _mm512_alignr_epi32(nbits, lead_nbits, 15);
2457
2458
  // Merge 32 -> 64 bits.
2459
  Bits32 b{nbits, bits};
2460
  Bits64 b64 = b.Merge();
2461
  bits = b64.bits;
2462
  nbits = b64.nbits;
2463
2464
  __m512i zero = _mm512_setzero_si512();
2465
2466
  auto sh1 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 7); };
2467
  auto sh2 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 6); };
2468
  auto sh4 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 4); };
2469
2470
  // Compute first-past-end-bit-position.
2471
  __m512i end_intermediate0 = _mm512_add_epi64(nbits, sh1(nbits));
2472
  __m512i end_intermediate1 =
2473
      _mm512_add_epi64(end_intermediate0, sh2(end_intermediate0));
2474
  __m512i end = _mm512_add_epi64(end_intermediate1, sh4(end_intermediate1));
2475
2476
  uint64_t simd_nbits = _mm512_cvtsi512_si32(_mm512_alignr_epi64(end, end, 7));
2477
2478
  // Compute begin-bit-position.
2479
  __m512i begin = _mm512_sub_epi64(end, nbits);
2480
2481
  // Index of the last bit in the chunk, or the end bit if nbits==0.
2482
  __m512i last = _mm512_mask_sub_epi64(
2483
      end, _mm512_cmpneq_epi64_mask(nbits, zero), end, _mm512_set1_epi64(1));
2484
2485
  __m512i lane_offset_mask = _mm512_set1_epi64(63);
2486
2487
  // Starting position of the chunk that each lane will ultimately belong to.
2488
  __m512i chunk_start = _mm512_andnot_si512(lane_offset_mask, last);
2489
2490
  // For all lanes that contain bits belonging to two different 64-bit chunks,
2491
  // compute the number of bits that belong to the first chunk.
2492
  // The total number of bits fits in a u16, so we can use satsub_u16 here.
2493
  __m512i first_chunk_nbits = _mm512_subs_epu16(chunk_start, begin);
2494
2495
  // Move all the previous-chunk-bits to the previous lane.
2496
  __m512i negnbits = _mm512_sub_epi64(_mm512_set1_epi64(64), first_chunk_nbits);
2497
  __m512i first_chunk_bits =
2498
      _mm512_srlv_epi64(_mm512_sllv_epi64(bits, negnbits), negnbits);
2499
  __m512i first_chunk_bits_down =
2500
      _mm512_alignr_epi32(zero, first_chunk_bits, 2);
2501
  bits = _mm512_srlv_epi64(bits, first_chunk_nbits);
2502
  nbits = _mm512_sub_epi64(nbits, first_chunk_nbits);
2503
  bits = _mm512_or_si512(bits, _mm512_sllv_epi64(first_chunk_bits_down, nbits));
2504
  begin = _mm512_add_epi64(begin, first_chunk_nbits);
2505
2506
  // We now know that every lane should give bits to only one chunk. We can
2507
  // shift the bits and then horizontally-or-reduce them within the same chunk.
2508
  __m512i offset = _mm512_and_si512(begin, lane_offset_mask);
2509
  __m512i aligned_bits = _mm512_sllv_epi64(bits, offset);
2510
  // h-or-reduce within same chunk
2511
  __m512i red0 = _mm512_mask_or_epi64(
2512
      aligned_bits, _mm512_cmpeq_epi64_mask(sh1(chunk_start), chunk_start),
2513
      sh1(aligned_bits), aligned_bits);
2514
  __m512i red1 = _mm512_mask_or_epi64(
2515
      red0, _mm512_cmpeq_epi64_mask(sh2(chunk_start), chunk_start), sh2(red0),
2516
      red0);
2517
  __m512i reduced = _mm512_mask_or_epi64(
2518
      red1, _mm512_cmpeq_epi64_mask(sh4(chunk_start), chunk_start), sh4(red1),
2519
      red1);
2520
  // Extract the highest lane that belongs to each chunk (the lane that ends up
2521
  // with the OR-ed value of all the other lanes of that chunk).
2522
  __m512i next_chunk_start =
2523
      _mm512_alignr_epi32(_mm512_set1_epi64(~0), chunk_start, 2);
2524
  __m512i result = _mm512_maskz_compress_epi64(
2525
      _mm512_cmpneq_epi64_mask(chunk_start, next_chunk_start), reduced);
2526
2527
  _mm512_storeu_si512((__m512i*)(output.data.get() + output.bytes_written),
2528
                      result);
2529
2530
  // Update the bit writer and add the last 32-bit lane.
2531
  // Note that since trail_nbits was at most 32 to begin with, operating on
2532
  // trail_bits does not risk overflowing.
2533
  output.bytes_written += simd_nbits / 8;
2534
  // Here we are implicitly relying on the fact that simd_nbits < 512 to know
2535
  // that the byte of bit writer data we access is initialized. This is
2536
  // guaranteed because the remaining bits in the bit writer buffer are at most
2537
  // 7, so simd_nbits <= 505 always.
2538
  trail_bits = (trail_bits << (simd_nbits % 8)) +
2539
               output.data.get()[output.bytes_written];
2540
  trail_nbits += simd_nbits % 8;
2541
  StoreLE64(output.data.get() + output.bytes_written, trail_bits);
2542
  size_t trail_bytes = trail_nbits / 8;
2543
  output.bits_in_buffer = trail_nbits % 8;
2544
  output.buffer = trail_bits >> (trail_bytes * 8);
2545
  output.bytes_written += trail_bytes;
2546
}
2547
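The sh1/sh2/sh4 ladder above is a Hillis-Steele inclusive prefix sum over the eight 64-bit lanes: after three shift-and-add rounds, each lane holds the sum of all nbits up to and including itself, i.e. its first-past-end bit position. A scalar model (illustrative):

#include <cstddef>
#include <cstdint>

void PrefixSum8(uint64_t v[8]) {
  for (size_t shift = 1; shift < 8; shift *= 2) {  // sh1, sh2, sh4 rounds
    // Walk downward so each lane reads the not-yet-updated value below it.
    for (size_t i = 8; i-- > shift;) v[i] += v[i - shift];
  }
}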
2548
#endif
2549
2550
template <size_t n>
2551
0
FJXL_INLINE void StoreToWriter(const Bits32* bits, BitWriter& output) {
2552
#ifdef FJXL_AVX512
2553
  static_assert(n <= 2, "n should be at most 2 for AVX512");
2554
  StoreToWriterAVX512(bits[0], output);
2555
  if (n == 2) {
2556
    StoreToWriterAVX512(bits[1], output);
2557
  }
2558
  return;
2559
#endif
2560
0
  static_assert(n <= 4, "n should be at most 4");
2561
0
  alignas(64) uint64_t nbits64[Bits64::kLanes * n];
2562
0
  alignas(64) uint64_t bits64[Bits64::kLanes * n];
2563
0
  bits[0].Merge().Store(nbits64, bits64);
2564
0
  if (n > 1) {
2565
0
    bits[1].Merge().Store(nbits64 + Bits64::kLanes, bits64 + Bits64::kLanes);
2566
0
  }
2567
0
  if (n > 2) {
2568
0
    bits[2].Merge().Store(nbits64 + 2 * Bits64::kLanes,
2569
0
                          bits64 + 2 * Bits64::kLanes);
2570
0
  }
2571
0
  if (n > 3) {
2572
0
    bits[3].Merge().Store(nbits64 + 3 * Bits64::kLanes,
2573
0
                          bits64 + 3 * Bits64::kLanes);
2574
0
  }
2575
0
  output.WriteMultiple(nbits64, bits64, Bits64::kLanes * n);
2576
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::StoreToWriter<1ul>(AVX2::(anonymous namespace)::Bits32 const*, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::StoreToWriter<2ul>(AVX2::(anonymous namespace)::Bits32 const*, (anonymous namespace)::BitWriter&)
2577
2578
namespace detail {
2579
template <typename T>
2580
struct IntegerTypes;
2581
2582
template <>
2583
struct IntegerTypes<SIMDVec16> {
2584
  using signed_ = int16_t;
2585
  using unsigned_ = uint16_t;
2586
};
2587
2588
template <>
2589
struct IntegerTypes<SIMDVec32> {
2590
  using signed_ = int32_t;
2591
  using unsigned_ = uint32_t;
2592
};
2593
2594
template <typename T>
2595
struct SIMDType;
2596
2597
template <>
2598
struct SIMDType<int16_t> {
2599
  using type = SIMDVec16;
2600
};
2601
2602
template <>
2603
struct SIMDType<int32_t> {
2604
  using type = SIMDVec32;
2605
};
2606
2607
}  // namespace detail
2608
2609
template <typename T>
2610
using signed_t = typename detail::IntegerTypes<T>::signed_;
2611
2612
template <typename T>
2613
using unsigned_t = typename detail::IntegerTypes<T>::unsigned_;
2614
2615
template <typename T>
2616
using simd_t = typename detail::SIMDType<T>::type;
2617
2618
// This function will process exactly one vector's worth of pixels.
2619
2620
template <typename T>
2621
size_t PredictPixels(const signed_t<T>* pixels, const signed_t<T>* pixels_left,
2622
                     const signed_t<T>* pixels_top,
2623
                     const signed_t<T>* pixels_topleft,
2624
0
                     unsigned_t<T>* residuals) {
2625
0
  T px = T::Load((unsigned_t<T>*)pixels);
2626
0
  T left = T::Load((unsigned_t<T>*)pixels_left);
2627
0
  T top = T::Load((unsigned_t<T>*)pixels_top);
2628
0
  T topleft = T::Load((unsigned_t<T>*)pixels_topleft);
2629
0
  T ac = left.Sub(topleft);
2630
0
  T ab = left.Sub(top);
2631
0
  T bc = top.Sub(topleft);
2632
0
  T grad = ac.Add(top);
2633
0
  T d = ab.Xor(bc);
2634
0
  T zero = T::Val(0);
2635
0
  T clamp = zero.Gt(d).IfThenElse(top, left);
2636
0
  T s = ac.Xor(bc);
2637
0
  T pred = zero.Gt(s).IfThenElse(grad, clamp);
2638
0
  T res = px.Sub(pred);
2639
0
  T res_times_2 = res.Add(res);
2640
0
  res = zero.Gt(res).IfThenElse(T::Val(-1).Sub(res_times_2), res_times_2);
2641
0
  res.Store(residuals);
2642
0
  return res.Eq(T::Val(0)).CountPrefix();
2643
0
}
Unexecuted instantiation: enc_fast_lossless.cc:unsigned long AVX2::(anonymous namespace)::PredictPixels<AVX2::(anonymous namespace)::SIMDVec16>(AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::unsigned_*)
Unexecuted instantiation: enc_fast_lossless.cc:unsigned long AVX2::(anonymous namespace)::PredictPixels<AVX2::(anonymous namespace)::SIMDVec32>(AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::unsigned_*)
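Editor's note: the branchless SIMD sequence above is the ClampedGradient ("select") predictor plus a zigzag mapping of the signed residual to an unsigned value; the trailing Eq(0).CountPrefix() reports how many leading residuals are zero, which feeds the RLE logic in ChannelRowProcessor below. A one-lane scalar sketch of the same computation (hypothetical helper, assuming 16-bit pixels; the real scalar fallback appears later in ProcessChunk):

  uint16_t PredictOneLane(int16_t px, int16_t left, int16_t top,
                          int16_t topleft) {
    int16_t ac = left - topleft, ab = left - top, bc = top - topleft;
    int16_t grad = ac + top;                      // left + top - topleft
    int16_t clamp = (ab ^ bc) < 0 ? top : left;   // zero.Gt(d).IfThenElse(...)
    int16_t pred = (ac ^ bc) < 0 ? grad : clamp;  // zero.Gt(s).IfThenElse(...)
    int16_t res = px - pred;
    return res < 0 ? -2 * res - 1 : 2 * res;      // zigzag: -1 -> 1, 1 -> 2
  }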
2644
2645
#endif
2646
2647
void EncodeHybridUint000(uint32_t value, uint32_t* token, uint32_t* nbits,
2648
0
                         uint32_t* bits) {
2649
0
  uint32_t n = FloorLog2(value);
2650
0
  *token = value ? n + 1 : 0;
2651
0
  *nbits = value ? n : 0;
2652
0
  *bits = value ? value - (1 << n) : 0;
2653
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::EncodeHybridUint000(unsigned int, unsigned int*, unsigned int*, unsigned int*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::EncodeHybridUint000(unsigned int, unsigned int*, unsigned int*, unsigned int*)
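Editor's note: this is the (0, 0, 0) hybrid-uint configuration. Value 0 gets the reserved token 0; a nonzero value with n = FloorLog2(value) becomes token n + 1 followed by the n low bits of the value. A few worked cases, which follow directly from the expressions above:

  // value -> (token, nbits, bits)
  //    0  -> (0, 0, 0)
  //    1  -> (1, 0, 0)        n = 0
  //    5  -> (3, 2, 0b01)     n = 2, bits = 5 - 4
  //   16  -> (5, 4, 0b0000)   n = 4, bits = 16 - 16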
2654
2655
#ifdef FJXL_AVX512
2656
constexpr static size_t kLogChunkSize = 5;
2657
#elif defined(FJXL_AVX2) || defined(FJXL_NEON)
2658
// Even though NEON only has 128-bit lanes, it is still significantly (~1.3x)
// faster
2659
// to process two vectors at a time.
2660
constexpr static size_t kLogChunkSize = 4;
2661
#else
2662
constexpr static size_t kLogChunkSize = 3;
2663
#endif
2664
2665
constexpr static size_t kChunkSize = 1 << kLogChunkSize;
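Editor's note: a chunk is therefore 32 residuals under AVX-512, 16 under AVX2 or NEON, and 8 in the generic fallback; in every case it is a whole number of vectors, which lets the EncodeChunkSimd loops below advance in SIMDVec16::kLanes steps with no tail handling:

  // kLogChunkSize = 5  ->  kChunkSize = 32  (AVX-512)
  // kLogChunkSize = 4  ->  kChunkSize = 16  (AVX2 / NEON)
  // kLogChunkSize = 3  ->  kChunkSize = 8   (generic)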
2666
2667
template <typename Residual>
2668
void GenericEncodeChunk(const Residual* residuals, size_t n, size_t skip,
2669
0
                        const PrefixCode& code, BitWriter& output) {
2670
0
  for (size_t ix = skip; ix < n; ix++) {
2671
0
    unsigned token, nbits, bits;
2672
0
    EncodeHybridUint000(residuals[ix], &token, &nbits, &bits);
2673
0
    output.Write(code.raw_nbits[token] + nbits,
2674
0
                 code.raw_bits[token] | bits << code.raw_nbits[token]);
2675
0
  }
2676
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::GenericEncodeChunk<unsigned short>(unsigned short const*, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::GenericEncodeChunk<unsigned int>(unsigned int const*, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
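Editor's note: each residual costs raw_nbits[token] + nbits bits, with the Huffman code in the low bits and the extra bits above it; since BitWriter emits least-significant bits first, the Huffman code reaches the stream first. A worked packing with a hypothetical table entry (raw_nbits[3] = 4, raw_bits[3] = 0b0110):

  // residual 5 -> token 3, nbits = 2, bits = 0b01 (see EncodeHybridUint000)
  // written word: bits 0..3 = Huffman code, bits 4..5 = extra bits
  output.Write(4 + 2, 0b0110 | (0b01u << 4));  // 0b010110, 6 bits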
2677
2678
struct UpTo8Bits {
2679
  size_t bitdepth;
2680
0
  explicit UpTo8Bits(size_t bitdepth) : bitdepth(bitdepth) {
2681
0
    assert(bitdepth <= 8);
2682
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::UpTo8Bits::UpTo8Bits(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::UpTo8Bits::UpTo8Bits(unsigned long)
2683
  // Here we can fit up to 9 extra bits + 7 Huffman bits in a u16; for all other
2684
  // symbols, we could actually go up to 8 Huffman bits, as we have at most 8
2685
  // extra bits; however, the SIMD bit merging logic for AVX2 assumes that no
2686
  // Huffman length is 8 or more, so we cap raw lengths at 7 anyway. The last
2687
  // symbol is used for LZ77 lengths and has no limitation other than allowing
2688
  // the code to represent 32 symbols in total.
2689
  static constexpr uint8_t kMinRawLength[12] = {};
2690
  static constexpr uint8_t kMaxRawLength[12] = {
2691
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 10,
2692
  };
2693
0
  static size_t MaxEncodedBitsPerSample() { return 16; }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::UpTo8Bits::MaxEncodedBitsPerSample()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::UpTo8Bits::MaxEncodedBitsPerSample()
2694
  static constexpr size_t kInputBytes = 1;
2695
  using pixel_t = int16_t;
2696
  using upixel_t = uint16_t;
2697
2698
  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
2699
                             size_t n, uint8_t* nbits_simd,
2700
0
                             uint8_t* bits_simd) {
2701
0
    assert(n <= 16);
2702
0
    memcpy(nbits_simd, nbits, 16);
2703
0
    memcpy(bits_simd, bits, 16);
2704
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::UpTo8Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::UpTo8Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
2705
2706
#ifdef FJXL_GENERIC_SIMD
2707
  static void EncodeChunkSimd(upixel_t* residuals, size_t n, size_t skip,
2708
                              const uint8_t* raw_nbits_simd,
2709
0
                              const uint8_t* raw_bits_simd, BitWriter& output) {
2710
0
    Bits32 bits32[kChunkSize / SIMDVec16::kLanes];
2711
0
    alignas(64) uint16_t bits[SIMDVec16::kLanes];
2712
0
    alignas(64) uint16_t nbits[SIMDVec16::kLanes];
2713
0
    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
2714
0
    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
2715
0
    alignas(64) uint16_t token[SIMDVec16::kLanes];
2716
0
    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
2717
0
      TokenizeSIMD(residuals + i, token, nbits, bits);
2718
0
      HuffmanSIMDUpTo13(token, raw_nbits_simd, raw_bits_simd, nbits_huff,
2719
0
                        bits_huff);
2720
0
      StoreSIMDUpTo8(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
2721
0
                     std::max(skip, i) - i, bits32 + i / SIMDVec16::kLanes);
2722
0
    }
2723
0
    StoreToWriter<kChunkSize / SIMDVec16::kLanes>(bits32, output);
2724
0
  }
2725
#endif
2726
2727
0
  size_t NumSymbols(bool doing_ycocg_or_large_palette) const {
2728
    // values gain 1 bit for YCoCg, 1 bit for prediction.
2729
    // Maximum symbol is 1 + effective bit depth of residuals.
2730
0
    if (doing_ycocg_or_large_palette) {
2731
0
      return bitdepth + 3;
2732
0
    } else {
2733
0
      return bitdepth + 2;
2734
0
    }
2735
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::UpTo8Bits::NumSymbols(bool) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::UpTo8Bits::NumSymbols(bool) const
2736
};
2737
constexpr uint8_t UpTo8Bits::kMinRawLength[];
2738
constexpr uint8_t UpTo8Bits::kMaxRawLength[];
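Editor's note: the 16-bit bound in MaxEncodedBitsPerSample() matches the tables above: raw Huffman codes are capped at 7 bits and, per the comment, a sample needs at most 9 extra bits, so the worst case is exactly one u16 per sample:

  static_assert(7 + 9 == 16, "UpTo8Bits: one u16 per encoded sample");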
2739
2740
struct From9To13Bits {
2741
  size_t bitdepth;
2742
0
  explicit From9To13Bits(size_t bitdepth) : bitdepth(bitdepth) {
2743
0
    assert(bitdepth <= 13 && bitdepth >= 9);
2744
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::From9To13Bits::From9To13Bits(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::From9To13Bits::From9To13Bits(unsigned long)
2745
  // The last symbol is used for LZ77 lengths and has no limitation other than
2746
  // allowing the code to represent 32 symbols in total.
2747
  // We cannot fit all the bits in a u16, so we do not even try; we allow up to
2748
  // 8 bits per raw symbol.
2749
  // There are at most 16 raw symbols, so Huffman coding can be SIMDified
2750
  // without any special tricks.
2751
  static constexpr uint8_t kMinRawLength[17] = {};
2752
  static constexpr uint8_t kMaxRawLength[17] = {
2753
      8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 10,
2754
  };
2755
0
  static size_t MaxEncodedBitsPerSample() { return 21; }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::From9To13Bits::MaxEncodedBitsPerSample()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::From9To13Bits::MaxEncodedBitsPerSample()
2756
  static constexpr size_t kInputBytes = 2;
2757
  using pixel_t = int16_t;
2758
  using upixel_t = uint16_t;
2759
2760
  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
2761
                             size_t n, uint8_t* nbits_simd,
2762
0
                             uint8_t* bits_simd) {
2763
0
    assert(n <= 16);
2764
0
    memcpy(nbits_simd, nbits, 16);
2765
0
    memcpy(bits_simd, bits, 16);
2766
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::From9To13Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::From9To13Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
2767
2768
#ifdef FJXL_GENERIC_SIMD
2769
  static void EncodeChunkSimd(upixel_t* residuals, size_t n, size_t skip,
2770
                              const uint8_t* raw_nbits_simd,
2771
0
                              const uint8_t* raw_bits_simd, BitWriter& output) {
2772
0
    Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes];
2773
0
    alignas(64) uint16_t bits[SIMDVec16::kLanes];
2774
0
    alignas(64) uint16_t nbits[SIMDVec16::kLanes];
2775
0
    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
2776
0
    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
2777
0
    alignas(64) uint16_t token[SIMDVec16::kLanes];
2778
0
    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
2779
0
      TokenizeSIMD(residuals + i, token, nbits, bits);
2780
0
      HuffmanSIMDUpTo13(token, raw_nbits_simd, raw_bits_simd, nbits_huff,
2781
0
                        bits_huff);
2782
0
      StoreSIMDUpTo14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
2783
0
                      std::max(skip, i) - i,
2784
0
                      bits32 + 2 * i / SIMDVec16::kLanes);
2785
0
    }
2786
0
    StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output);
2787
0
  }
2788
#endif
2789
2790
0
  size_t NumSymbols(bool doing_ycocg_or_large_palette) const {
2791
    // values gain 1 bit for YCoCg, 1 bit for prediction.
2792
    // Maximum symbol is 1 + effective bit depth of residuals.
2793
0
    if (doing_ycocg_or_large_palette) {
2794
0
      return bitdepth + 3;
2795
0
    } else {
2796
0
      return bitdepth + 2;
2797
0
    }
2798
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::From9To13Bits::NumSymbols(bool) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::From9To13Bits::NumSymbols(bool) const
2799
};
2800
constexpr uint8_t From9To13Bits::kMinRawLength[];
2801
constexpr uint8_t From9To13Bits::kMaxRawLength[];
2802
2803
0
void CheckHuffmanBitsSIMD(int bits1, int nbits1, int bits2, int nbits2) {
2804
0
  assert(nbits1 == 8);
2805
0
  assert(nbits2 == 8);
2806
0
  assert(bits2 == (bits1 | 128));
2807
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::CheckHuffmanBitsSIMD(int, int, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::CheckHuffmanBitsSIMD(int, int, int, int)
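Editor's note: all this asserts is the invariant the SIMD Huffman tables rely on: both symbols have 8-bit codes differing only in the top bit (bit 7), so one 16-entry table slot can serve a pair of symbols, with the differing bit presumably reinstated from the token inside HuffmanSIMD14/HuffmanSIMDAbove14 (that recovery detail is an assumption; those routines are defined earlier in the file). For instance:

  // Hypothetical table entries satisfying the check:
  //   symbol 15: nbits = 8, bits = 0b01011010
  //   symbol 16: nbits = 8, bits = 0b11011010   ( = 0b01011010 | 128 )
  CheckHuffmanBitsSIMD(0b01011010, 8, 0b11011010, 8);  // all asserts pass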
2808
2809
struct Exactly14Bits {
2810
0
  explicit Exactly14Bits(size_t bitdepth_) { assert(bitdepth_ == 14); }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::Exactly14Bits::Exactly14Bits(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::Exactly14Bits::Exactly14Bits(unsigned long)
2811
  // Force LZ77 symbols to have at least 8 bits, and raw symbols 15 and 16 to
2812
  // have exactly 8, and no other symbol to have 8 or more. This ensures that
2813
  // the representation for 15 and 16 is identical up to one bit.
2814
  static constexpr uint8_t kMinRawLength[18] = {
2815
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 7,
2816
  };
2817
  static constexpr uint8_t kMaxRawLength[18] = {
2818
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 10,
2819
  };
2820
  static constexpr size_t bitdepth = 14;
2821
0
  static size_t MaxEncodedBitsPerSample() { return 22; }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::Exactly14Bits::MaxEncodedBitsPerSample()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::Exactly14Bits::MaxEncodedBitsPerSample()
2822
  static constexpr size_t kInputBytes = 2;
2823
  using pixel_t = int16_t;
2824
  using upixel_t = uint16_t;
2825
2826
  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
2827
                             size_t n, uint8_t* nbits_simd,
2828
0
                             uint8_t* bits_simd) {
2829
0
    assert(n == 17);
2830
0
    CheckHuffmanBitsSIMD(bits[15], nbits[15], bits[16], nbits[16]);
2831
0
    memcpy(nbits_simd, nbits, 16);
2832
0
    memcpy(bits_simd, bits, 16);
2833
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::Exactly14Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::Exactly14Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
2834
2835
#ifdef FJXL_GENERIC_SIMD
2836
  static void EncodeChunkSimd(upixel_t* residuals, size_t n, size_t skip,
2837
                              const uint8_t* raw_nbits_simd,
2838
0
                              const uint8_t* raw_bits_simd, BitWriter& output) {
2839
0
    Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes];
2840
0
    alignas(64) uint16_t bits[SIMDVec16::kLanes];
2841
0
    alignas(64) uint16_t nbits[SIMDVec16::kLanes];
2842
0
    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
2843
0
    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
2844
0
    alignas(64) uint16_t token[SIMDVec16::kLanes];
2845
0
    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
2846
0
      TokenizeSIMD(residuals + i, token, nbits, bits);
2847
0
      HuffmanSIMD14(token, raw_nbits_simd, raw_bits_simd, nbits_huff,
2848
0
                    bits_huff);
2849
0
      StoreSIMDUpTo14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
2850
0
                      std::max(skip, i) - i,
2851
0
                      bits32 + 2 * i / SIMDVec16::kLanes);
2852
0
    }
2853
0
    StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output);
2854
0
  }
2855
#endif
2856
2857
0
  size_t NumSymbols(bool) const { return 17; }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::Exactly14Bits::NumSymbols(bool) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::Exactly14Bits::NumSymbols(bool) const
2858
};
2859
constexpr uint8_t Exactly14Bits::kMinRawLength[];
2860
constexpr uint8_t Exactly14Bits::kMaxRawLength[];
2861
2862
struct MoreThan14Bits {
2863
  size_t bitdepth;
2864
0
  explicit MoreThan14Bits(size_t bitdepth) : bitdepth(bitdepth) {
2865
0
    assert(bitdepth > 14);
2866
0
    assert(bitdepth <= 16);
2867
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::MoreThan14Bits::MoreThan14Bits(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::MoreThan14Bits::MoreThan14Bits(unsigned long)
2868
  // Force LZ77 symbols to have at least 8 bits, and raw symbols 13 to 18 to
2869
  // have exactly 8, and no other symbol to have 8 or more. This ensures that
2870
  // the representation for (13, 14), (15, 16), (17, 18) is identical up to one
2871
  // bit.
2872
  static constexpr uint8_t kMinRawLength[20] = {
2873
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 7,
2874
  };
2875
  static constexpr uint8_t kMaxRawLength[20] = {
2876
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 10,
2877
  };
2878
0
  static size_t MaxEncodedBitsPerSample() { return 24; }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::MoreThan14Bits::MaxEncodedBitsPerSample()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::MoreThan14Bits::MaxEncodedBitsPerSample()
2879
  static constexpr size_t kInputBytes = 2;
2880
  using pixel_t = int32_t;
2881
  using upixel_t = uint32_t;
2882
2883
  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
2884
                             size_t n, uint8_t* nbits_simd,
2885
0
                             uint8_t* bits_simd) {
2886
0
    assert(n == 19);
2887
0
    CheckHuffmanBitsSIMD(bits[13], nbits[13], bits[14], nbits[14]);
2888
0
    CheckHuffmanBitsSIMD(bits[15], nbits[15], bits[16], nbits[16]);
2889
0
    CheckHuffmanBitsSIMD(bits[17], nbits[17], bits[18], nbits[18]);
2890
0
    for (size_t i = 0; i < 14; i++) {
2891
0
      nbits_simd[i] = nbits[i];
2892
0
      bits_simd[i] = bits[i];
2893
0
    }
2894
0
    nbits_simd[14] = nbits[15];
2895
0
    bits_simd[14] = bits[15];
2896
0
    nbits_simd[15] = nbits[17];
2897
0
    bits_simd[15] = bits[17];
2898
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::MoreThan14Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::MoreThan14Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
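Editor's note: with 19 symbols but only 16 SIMD table slots, the folding above stores one entry per checked pair: slots 0..13 are copied verbatim (slot 13 covering the (13, 14) pair), slot 14 takes symbol 15 for the (15, 16) pair, and slot 15 takes symbol 17 for the (17, 18) pair:

  // SIMD slot -> source symbol(s)
  //   0..12   -> symbols 0..12, unchanged
  //   13      -> symbols 13 and 14 (codes differ only in bit 7)
  //   14      -> symbols 15 and 16
  //   15      -> symbols 17 and 18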
2899
2900
#ifdef FJXL_GENERIC_SIMD
2901
  static void EncodeChunkSimd(upixel_t* residuals, size_t n, size_t skip,
2902
                              const uint8_t* raw_nbits_simd,
2903
0
                              const uint8_t* raw_bits_simd, BitWriter& output) {
2904
0
    Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes];
2905
0
    alignas(64) uint32_t bits[SIMDVec16::kLanes];
2906
0
    alignas(64) uint32_t nbits[SIMDVec16::kLanes];
2907
0
    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
2908
0
    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
2909
0
    alignas(64) uint16_t token[SIMDVec16::kLanes];
2910
0
    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
2911
0
      TokenizeSIMD(residuals + i, token, nbits, bits);
2912
0
      HuffmanSIMDAbove14(token, raw_nbits_simd, raw_bits_simd, nbits_huff,
2913
0
                         bits_huff);
2914
0
      StoreSIMDAbove14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
2915
0
                       std::max(skip, i) - i,
2916
0
                       bits32 + 2 * i / SIMDVec16::kLanes);
2917
0
    }
2918
0
    StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output);
2919
0
  }
2920
#endif
2921
0
  size_t NumSymbols(bool) const { return 19; }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::MoreThan14Bits::NumSymbols(bool) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::MoreThan14Bits::NumSymbols(bool) const
2922
};
2923
constexpr uint8_t MoreThan14Bits::kMinRawLength[];
2924
constexpr uint8_t MoreThan14Bits::kMaxRawLength[];
2925
2926
void PrepareDCGlobalCommon(bool is_single_group, size_t width, size_t height,
2927
0
                           const PrefixCode code[4], BitWriter* output) {
2928
0
  output->Allocate(100000 + (is_single_group ? width * height * 16 : 0));
2929
  // No patches, spline or noise.
2930
0
  output->Write(1, 1);  // default DC dequantization factors (?)
2931
0
  output->Write(1, 1);  // use global tree / histograms
2932
0
  output->Write(1, 0);  // no lz77 for the tree
2933
2934
0
  output->Write(1, 1);         // simple code for the tree's context map
2935
0
  output->Write(2, 0);         // all contexts clustered together
2936
0
  output->Write(1, 1);         // use prefix code for tree
2937
0
  output->Write(4, 0);         // 000 hybrid uint
2938
0
  output->Write(6, 0b100011);  // Alphabet size is 4 (var16)
2939
0
  output->Write(2, 1);         // simple prefix code
2940
0
  output->Write(2, 3);         // with 4 symbols
2941
0
  output->Write(2, 0);
2942
0
  output->Write(2, 1);
2943
0
  output->Write(2, 2);
2944
0
  output->Write(2, 3);
2945
0
  output->Write(1, 0);  // First tree encoding option
2946
2947
  // Huffman table + extra bits for the tree.
2948
0
  uint8_t symbol_bits[6] = {0b00, 0b10, 0b001, 0b101, 0b0011, 0b0111};
2949
0
  uint8_t symbol_nbits[6] = {2, 2, 3, 3, 4, 4};
2950
  // Write a tree with a leaf per channel, and gradient predictor for every
2951
  // leaf.
2952
0
  for (auto v : {1, 2, 1, 4, 1, 0, 0, 5, 0, 0, 0, 0, 5,
2953
0
                 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0, 0, 0}) {
2954
0
    output->Write(symbol_nbits[v], symbol_bits[v]);
2955
0
  }
2956
2957
0
  output->Write(1, 1);     // Enable lz77 for the main bitstream
2958
0
  output->Write(2, 0b00);  // lz77 offset 224
2959
0
  static_assert(kLZ77Offset == 224, "kLZ77Offset should be 224");
2960
0
  output->Write(4, 0b1010);  // lz77 min length 7
2961
  // 400 hybrid uint config for lz77
2962
0
  output->Write(4, 4);
2963
0
  output->Write(3, 0);
2964
0
  output->Write(3, 0);
2965
2966
0
  output->Write(1, 1);  // simple code for the context map
2967
0
  output->Write(2, 3);  // 3 bits per entry
2968
0
  output->Write(3, 4);  // channel 3
2969
0
  output->Write(3, 3);  // channel 2
2970
0
  output->Write(3, 2);  // channel 1
2971
0
  output->Write(3, 1);  // channel 0
2972
0
  output->Write(3, 0);  // distance histogram first
2973
2974
0
  output->Write(1, 1);  // use prefix codes
2975
0
  output->Write(4, 0);  // 000 hybrid uint config for distances (only need 0)
2976
0
  for (size_t i = 0; i < 4; i++) {
2977
0
    output->Write(4, 0);  // 000 hybrid uint config for symbols (only <= 10)
2978
0
  }
2979
2980
  // Distance alphabet size:
2981
0
  output->Write(5, 0b00001);  // 2: just need 1 for RLE (i.e. distance 1)
2982
  // Symbol + LZ77 alphabet size:
2983
0
  for (size_t i = 0; i < 4; i++) {
2984
0
    output->Write(1, 1);    // > 1
2985
0
    output->Write(4, 8);    // <= 512
2986
0
    output->Write(8, 256);  // == 512
2987
0
  }
2988
2989
  // Distance histogram:
2990
0
  output->Write(2, 1);  // simple prefix code
2991
0
  output->Write(2, 0);  // with one symbol
2992
0
  output->Write(1, 1);  // 1
2993
2994
  // Symbol + lz77 histogram:
2995
0
  for (size_t i = 0; i < 4; i++) {
2996
0
    code[i].WriteTo(output);
2997
0
  }
2998
2999
  // Group header for global modular image.
3000
0
  output->Write(1, 1);  // Global tree
3001
0
  output->Write(1, 1);  // All default wp
3002
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::PrepareDCGlobalCommon(bool, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, (anonymous namespace)::BitWriter*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::PrepareDCGlobalCommon(bool, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, (anonymous namespace)::BitWriter*)
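Editor's note: the for (auto v : ...) loop serializes the fixed modular tree (one leaf per channel, gradient predictor everywhere) with the six-entry prefix code declared just above it, whose code lengths are {2, 2, 3, 3, 4, 4}. A small sketch tallying the emitted bits, using the symbol counts from that initializer list:

  int count[6] = {17, 3, 1, 0, 1, 4};  // occurrences of v = 0..5 in the loop
  int nbits[6] = {2, 2, 3, 3, 4, 4};   // symbol_nbits above
  int total = 0;
  for (int v = 0; v < 6; v++) total += count[v] * nbits[v];
  // total == 63: the whole tree costs 63 bits.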
3003
3004
void PrepareDCGlobal(bool is_single_group, size_t width, size_t height,
3005
                     size_t nb_chans, const PrefixCode code[4],
3006
0
                     BitWriter* output) {
3007
0
  PrepareDCGlobalCommon(is_single_group, width, height, code, output);
3008
0
  if (nb_chans > 2) {
3009
0
    output->Write(2, 0b01);     // 1 transform
3010
0
    output->Write(2, 0b00);     // RCT
3011
0
    output->Write(5, 0b00000);  // Starting from ch 0
3012
0
    output->Write(2, 0b00);     // YCoCg
3013
0
  } else {
3014
0
    output->Write(2, 0b00);  // no transforms
3015
0
  }
3016
0
  if (!is_single_group) {
3017
0
    output->ZeroPadToByte();
3018
0
  }
3019
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::PrepareDCGlobal(bool, unsigned long, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, (anonymous namespace)::BitWriter*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::PrepareDCGlobal(bool, unsigned long, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, (anonymous namespace)::BitWriter*)
3020
3021
template <typename BitDepth>
3022
struct ChunkEncoder {
3023
0
  void PrepareForSimd() {
3024
0
    BitDepth::PrepareForSimd(code->raw_nbits, code->raw_bits, code->numraw,
3025
0
                             raw_nbits_simd, raw_bits_simd);
3026
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>::PrepareForSimd()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>::PrepareForSimd()
3027
  FJXL_INLINE static void EncodeRle(size_t count, const PrefixCode& code,
3028
0
                                    BitWriter& output) {
3029
0
    if (count == 0) return;
3030
0
    count -= kLZ77MinLength + 1;
3031
0
    if (count < kLZ77CacheSize) {
3032
0
      output.Write(code.lz77_cache_nbits[count], code.lz77_cache_bits[count]);
3033
0
    } else {
3034
0
      unsigned token, nbits, bits;
3035
0
      EncodeHybridUintLZ77(count, &token, &nbits, &bits);
3036
0
      uint64_t wbits = bits;
3037
0
      wbits = (wbits << code.lz77_nbits[token]) | code.lz77_bits[token];
3038
0
      wbits = (wbits << code.raw_nbits[0]) | code.raw_bits[0];
3039
0
      output.Write(code.lz77_nbits[token] + nbits + code.raw_nbits[0], wbits);
3040
0
    }
3041
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>::EncodeRle(unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
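Editor's note: a run reaches the stream as raw symbol 0 (the LZ77 trigger) immediately followed by the length token and its extra bits; the three shift-or steps build the word back to front so that, writing least-significant bits first, the raw-symbol code lands first. Runs short enough for the cache skip the packing entirely via the precomputed lz77_cache_nbits/lz77_cache_bits tables:

  // wbits, LSB first on the wire:
  //   [ raw code for symbol 0 ][ lz77 token code       ][ lz77 extra bits ]
  //     code.raw_nbits[0]        code.lz77_nbits[token]   nbits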
3042
3043
  FJXL_INLINE void Chunk(size_t run, typename BitDepth::upixel_t* residuals,
3044
0
                         size_t skip, size_t n) {
3045
0
    EncodeRle(run, *code, *output);
3046
#ifdef FJXL_GENERIC_SIMD
3047
    BitDepth::EncodeChunkSimd(residuals, n, skip, raw_nbits_simd, raw_bits_simd,
3048
                              *output);
3049
#else
3050
    GenericEncodeChunk(residuals, n, skip, *code, *output);
3051
#endif
3052
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>::Chunk(unsigned long, unsigned int*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>::Chunk(unsigned long, unsigned int*, unsigned long, unsigned long)
3053
3054
0
  inline void Finalize(size_t run) { EncodeRle(run, *code, *output); }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>::Finalize(unsigned long)
3055
3056
  const PrefixCode* code;
3057
  BitWriter* output;
3058
  alignas(64) uint8_t raw_nbits_simd[16] = {};
3059
  alignas(64) uint8_t raw_bits_simd[16] = {};
3060
};
3061
3062
template <typename BitDepth>
3063
struct ChunkSampleCollector {
3064
0
  FJXL_INLINE void Rle(size_t count, uint64_t* lz77_counts_) {
3065
0
    if (count == 0) return;
3066
0
    raw_counts[0] += 1;
3067
0
    count -= kLZ77MinLength + 1;
3068
0
    unsigned token, nbits, bits;
3069
0
    EncodeHybridUintLZ77(count, &token, &nbits, &bits);
3070
0
    lz77_counts_[token]++;
3071
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>::Rle(unsigned long, unsigned long*)
3072
3073
  FJXL_INLINE void Chunk(size_t run, typename BitDepth::upixel_t* residuals,
3074
0
                         size_t skip, size_t n) {
3075
    // Run is broken. Count the run and tally the individual residuals.
3076
0
    Rle(run, lz77_counts);
3077
0
    for (size_t ix = skip; ix < n; ix++) {
3078
0
      unsigned token, nbits, bits;
3079
0
      EncodeHybridUint000(residuals[ix], &token, &nbits, &bits);
3080
0
      raw_counts[token]++;
3081
0
    }
3082
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>::Chunk(unsigned long, unsigned int*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>::Chunk(unsigned long, unsigned int*, unsigned long, unsigned long)
3083
3084
  // don't count final run since we don't know how long it really is
3085
0
  void Finalize(size_t run) {}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>::Finalize(unsigned long)
3086
3087
  uint64_t* raw_counts;
3088
  uint64_t* lz77_counts;
3089
};
3090
3091
0
constexpr uint32_t PackSigned(int32_t value) {
3092
0
  return (static_cast<uint32_t>(value) << 1) ^
3093
0
         ((static_cast<uint32_t>(~value) >> 31) - 1);
3094
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::PackSigned(int)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::PackSigned(int)
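Editor's note: PackSigned is the usual zigzag map (non-negatives to even, negatives to odd), the scalar twin of the final IfThenElse in PredictPixels. Since it is constexpr, the mapping can be checked at compile time:

  static_assert(PackSigned(0) == 0, "zigzag");
  static_assert(PackSigned(-1) == 1, "zigzag");
  static_assert(PackSigned(1) == 2, "zigzag");
  static_assert(PackSigned(-2) == 3, "zigzag");
  static_assert(PackSigned(2) == 4, "zigzag");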
3095
3096
template <typename T, typename BitDepth>
3097
struct ChannelRowProcessor {
3098
  using upixel_t = typename BitDepth::upixel_t;
3099
  using pixel_t = typename BitDepth::pixel_t;
3100
  T* t;
3101
  void ProcessChunk(const pixel_t* row, const pixel_t* row_left,
3102
                    const pixel_t* row_top, const pixel_t* row_topleft,
3103
0
                    size_t n) {
3104
0
    alignas(64) upixel_t residuals[kChunkSize] = {};
3105
0
    size_t prefix_size = 0;
3106
0
    size_t required_prefix_size = 0;
3107
#ifdef FJXL_GENERIC_SIMD
3108
    constexpr size_t kNum =
3109
0
        sizeof(pixel_t) == 2 ? SIMDVec16::kLanes : SIMDVec32::kLanes;
3110
0
    for (size_t ix = 0; ix < kChunkSize; ix += kNum) {
3111
0
      size_t c =
3112
0
          PredictPixels<simd_t<pixel_t>>(row + ix, row_left + ix, row_top + ix,
3113
0
                                         row_topleft + ix, residuals + ix);
3114
0
      prefix_size =
3115
0
          prefix_size == required_prefix_size ? prefix_size + c : prefix_size;
3116
0
      required_prefix_size += kNum;
3117
0
    }
3118
#else
3119
0
    for (size_t ix = 0; ix < kChunkSize; ix++) {
3120
0
      pixel_t px = row[ix];
3121
0
      pixel_t left = row_left[ix];
3122
0
      pixel_t top = row_top[ix];
3123
0
      pixel_t topleft = row_topleft[ix];
3124
0
      pixel_t ac = left - topleft;
3125
0
      pixel_t ab = left - top;
3126
0
      pixel_t bc = top - topleft;
3127
0
      pixel_t grad = static_cast<pixel_t>(static_cast<upixel_t>(ac) +
3128
0
                                          static_cast<upixel_t>(top));
3129
0
      pixel_t d = ab ^ bc;
3130
0
      pixel_t clamp = d < 0 ? top : left;
3131
0
      pixel_t s = ac ^ bc;
3132
0
      pixel_t pred = s < 0 ? grad : clamp;
3133
0
      residuals[ix] = PackSigned(px - pred);
3134
0
      prefix_size = prefix_size == required_prefix_size
3135
0
                        ? prefix_size + (residuals[ix] == 0)
3136
0
                        : prefix_size;
3137
0
      required_prefix_size += 1;
3138
0
    }
3139
#endif
3140
0
    prefix_size = std::min(n, prefix_size);
3141
0
    if (prefix_size == n && (run > 0 || prefix_size > kLZ77MinLength)) {
3142
      // Run continues, nothing to do.
3143
0
      run += prefix_size;
3144
0
    } else if (prefix_size + run > kLZ77MinLength) {
3145
      // Run is broken. Encode the run and encode the individual vector.
3146
0
      t->Chunk(run + prefix_size, residuals, prefix_size, n);
3147
0
      run = 0;
3148
0
    } else {
3149
      // There was no run to begin with.
3150
0
      t->Chunk(0, residuals, 0, n);
3151
0
    }
3152
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::ProcessChunk(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::ProcessChunk(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::ProcessChunk(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::ProcessChunk(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::ProcessChunk(int const*, int const*, int const*, int const*, unsigned long)
3153
3154
  void ProcessRow(const pixel_t* row, const pixel_t* row_left,
3155
                  const pixel_t* row_top, const pixel_t* row_topleft,
3156
0
                  size_t xs) {
3157
0
    for (size_t x = 0; x < xs; x += kChunkSize) {
3158
0
      ProcessChunk(row + x, row_left + x, row_top + x, row_topleft + x,
3159
0
                   std::min(kChunkSize, xs - x));
3160
0
    }
3161
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::ProcessRow(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::ProcessRow(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::ProcessRow(int const*, int const*, int const*, int const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::ProcessRow(short const*, short const*, short const*, short const*, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::ProcessRow(int const*, int const*, int const*, int const*, unsigned long)
3162
3163
0
  void Finalize() { t->Finalize(run); }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>::Finalize()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>::Finalize()
3164
  // Invariant: run == 0 or run > kLZ77MinLength.
3165
  size_t run = 0;
3166
};
3167
3168
0
uint16_t LoadLE16(const unsigned char* ptr) {
3169
0
  return uint16_t{ptr[0]} | (uint16_t{ptr[1]} << 8);
3170
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LoadLE16(unsigned char const*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LoadLE16(unsigned char const*)
3171
3172
0
uint16_t SwapEndian(uint16_t in) { return (in >> 8) | (in << 8); }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::SwapEndian(unsigned short)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::SwapEndian(unsigned short)
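Note: LoadLE16 and SwapEndian are the scalar fallbacks used by the FillRow*16 loops below. A 16-bit sample is always loaded little-endian first, and big-endian input is handled by swapping afterwards, which is equivalent to a big-endian load. A minimal standalone check (hypothetical harness, not part of this file):

#include <cassert>
#include <cstdint>

// Copies of the two helpers above, for a self-contained check.
uint16_t LoadLE16(const unsigned char* ptr) {
  return uint16_t{ptr[0]} | (uint16_t{ptr[1]} << 8);
}
uint16_t SwapEndian(uint16_t in) { return (in >> 8) | (in << 8); }

int main() {
  const unsigned char bytes[2] = {0x34, 0x12};
  assert(LoadLE16(bytes) == 0x1234);              // little-endian read
  assert(SwapEndian(LoadLE16(bytes)) == 0x3412);  // same as a big-endian read
}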
3173
3174
#ifdef FJXL_GENERIC_SIMD
3175
0
void StorePixels(SIMDVec16 p, int16_t* dest) { p.Store((uint16_t*)dest); }
3176
3177
0
void StorePixels(SIMDVec16 p, int32_t* dest) {
3178
0
  VecPair<SIMDVec32> p_up = p.Upcast();
3179
0
  p_up.low.Store((uint32_t*)dest);
3180
0
  p_up.hi.Store((uint32_t*)dest + SIMDVec32::kLanes);
3181
0
}
3182
#endif
3183
3184
template <typename pixel_t>
3185
0
void FillRowG8(const unsigned char* rgba, size_t oxs, pixel_t* luma) {
3186
0
  size_t x = 0;
3187
#ifdef FJXL_GENERIC_SIMD
3188
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3189
0
    auto rgb = SIMDVec16::LoadG8(rgba + x);
3190
0
    StorePixels(rgb[0], luma + x);
3191
0
  }
3192
#endif
3193
0
  for (; x < oxs; x++) {
3194
0
    luma[x] = rgba[x];
3195
0
  }
3196
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG8<short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG8<short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG8<int>(unsigned char const*, unsigned long, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG8<int>(unsigned char const*, unsigned long, int*)
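Note: FillRowG8 shows the loop structure shared by every FillRow* function in this file: a SIMD main loop consuming SIMDVec16::kLanes pixels per iteration, then a scalar tail for the remaining oxs % kLanes pixels (and for the whole row when FJXL_GENERIC_SIMD is not defined). A scalar-only sketch of that split, with a stand-in lane count:

#include <cstddef>
#include <cstdint>

constexpr size_t kLanes = 8;  // assumption; the real SIMDVec16::kLanes depends on the ISA

void FillRowG8Sketch(const unsigned char* rgba, size_t oxs, int16_t* luma) {
  size_t x = 0;
  // "Vector" main loop: whole groups of kLanes pixels at a time.
  for (; x + kLanes <= oxs; x += kLanes) {
    for (size_t i = 0; i < kLanes; i++) luma[x + i] = rgba[x + i];
  }
  // Scalar tail: whatever is left at the end of the row.
  for (; x < oxs; x++) luma[x] = rgba[x];
}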
3197
3198
template <bool big_endian, typename pixel_t>
3199
0
void FillRowG16(const unsigned char* rgba, size_t oxs, pixel_t* luma) {
3200
0
  size_t x = 0;
3201
#ifdef FJXL_GENERIC_SIMD
3202
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3203
0
    auto rgb = SIMDVec16::LoadG16(rgba + 2 * x);
3204
0
    if (big_endian) {
3205
0
      rgb[0].SwapEndian();
3206
0
    }
3207
0
    StorePixels(rgb[0], luma + x);
3208
0
  }
3209
#endif
3210
0
  for (; x < oxs; x++) {
3211
0
    uint16_t val = LoadLE16(rgba + 2 * x);
3212
0
    if (big_endian) {
3213
0
      val = SwapEndian(val);
3214
0
    }
3215
0
    luma[x] = val;
3216
0
  }
3217
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG16<true, short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG16<false, short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG16<true, int>(unsigned char const*, unsigned long, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG16<false, int>(unsigned char const*, unsigned long, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG16<true, short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG16<false, short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG16<true, int>(unsigned char const*, unsigned long, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG16<false, int>(unsigned char const*, unsigned long, int*)
3218
3219
template <typename pixel_t>
3220
void FillRowGA8(const unsigned char* rgba, size_t oxs, pixel_t* luma,
3221
0
                pixel_t* alpha) {
3222
0
  size_t x = 0;
3223
#ifdef FJXL_GENERIC_SIMD
3224
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3225
0
    auto rgb = SIMDVec16::LoadGA8(rgba + 2 * x);
3226
0
    StorePixels(rgb[0], luma + x);
3227
0
    StorePixels(rgb[1], alpha + x);
3228
0
  }
3229
#endif
3230
0
  for (; x < oxs; x++) {
3231
0
    luma[x] = rgba[2 * x];
3232
0
    alpha[x] = rgba[2 * x + 1];
3233
0
  }
3234
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA8<short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA8<short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA8<int>(unsigned char const*, unsigned long, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA8<int>(unsigned char const*, unsigned long, int*, int*)
3235
3236
template <bool big_endian, typename pixel_t>
3237
void FillRowGA16(const unsigned char* rgba, size_t oxs, pixel_t* luma,
3238
0
                 pixel_t* alpha) {
3239
0
  size_t x = 0;
3240
#ifdef FJXL_GENERIC_SIMD
3241
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3242
0
    auto rgb = SIMDVec16::LoadGA16(rgba + 4 * x);
3243
0
    if (big_endian) {
3244
0
      rgb[0].SwapEndian();
3245
0
      rgb[1].SwapEndian();
3246
0
    }
3247
0
    StorePixels(rgb[0], luma + x);
3248
0
    StorePixels(rgb[1], alpha + x);
3249
0
  }
3250
#endif
3251
0
  for (; x < oxs; x++) {
3252
0
    uint16_t l = LoadLE16(rgba + 4 * x);
3253
0
    uint16_t a = LoadLE16(rgba + 4 * x + 2);
3254
0
    if (big_endian) {
3255
0
      l = SwapEndian(l);
3256
0
      a = SwapEndian(a);
3257
0
    }
3258
0
    luma[x] = l;
3259
0
    alpha[x] = a;
3260
0
  }
3261
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA16<true, short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA16<false, short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA16<true, int>(unsigned char const*, unsigned long, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA16<false, int>(unsigned char const*, unsigned long, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA16<true, short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA16<false, short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA16<true, int>(unsigned char const*, unsigned long, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA16<false, int>(unsigned char const*, unsigned long, int*, int*)
3262
3263
template <typename pixel_t>
3264
void StoreYCoCg(pixel_t r, pixel_t g, pixel_t b, pixel_t* y, pixel_t* co,
3265
0
                pixel_t* cg) {
3266
0
  *co = r - b;
3267
0
  pixel_t tmp = b + (*co >> 1);
3268
0
  *cg = g - tmp;
3269
0
  *y = tmp + (*cg >> 1);
3270
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::StoreYCoCg<short>(short, short, short, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::StoreYCoCg<int>(int, int, int, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::StoreYCoCg<short>(short, short, short, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::StoreYCoCg<int>(int, int, int, int*, int*, int*)
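Note: StoreYCoCg is the reversible YCoCg-R color transform; the shifts on the signed chroma values are arithmetic, which is what makes the transform exactly invertible in integers. A standalone round-trip check (hypothetical harness; the inverse belongs to the decoder, not to this file):

#include <cassert>

// Forward transform, mirroring StoreYCoCg above.
void ToYCoCg(int r, int g, int b, int* y, int* co, int* cg) {
  *co = r - b;
  int tmp = b + (*co >> 1);
  *cg = g - tmp;
  *y = tmp + (*cg >> 1);
}

// Exact inverse: undo the forward steps in reverse order.
void FromYCoCg(int y, int co, int cg, int* r, int* g, int* b) {
  int tmp = y - (cg >> 1);
  *g = cg + tmp;
  *b = tmp - (co >> 1);
  *r = *b + co;
}

int main() {
  for (int r = 0; r < 256; r += 5) {
    for (int g = 0; g < 256; g += 7) {
      for (int b = 0; b < 256; b += 11) {
        int y, co, cg, r2, g2, b2;
        ToYCoCg(r, g, b, &y, &co, &cg);
        FromYCoCg(y, co, cg, &r2, &g2, &b2);
        assert(r == r2 && g == g2 && b == b2);  // lossless round-trip
      }
    }
  }
}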
3271
3272
#ifdef FJXL_GENERIC_SIMD
3273
void StoreYCoCg(SIMDVec16 r, SIMDVec16 g, SIMDVec16 b, int16_t* y, int16_t* co,
3274
0
                int16_t* cg) {
3275
0
  SIMDVec16 co_v = r.Sub(b);
3276
0
  SIMDVec16 tmp = b.Add(co_v.SignedShiftRight<1>());
3277
0
  SIMDVec16 cg_v = g.Sub(tmp);
3278
0
  SIMDVec16 y_v = tmp.Add(cg_v.SignedShiftRight<1>());
3279
0
  y_v.Store(reinterpret_cast<uint16_t*>(y));
3280
0
  co_v.Store(reinterpret_cast<uint16_t*>(co));
3281
0
  cg_v.Store(reinterpret_cast<uint16_t*>(cg));
3282
0
}
3283
3284
void StoreYCoCg(SIMDVec16 r, SIMDVec16 g, SIMDVec16 b, int32_t* y, int32_t* co,
3285
0
                int32_t* cg) {
3286
0
  VecPair<SIMDVec32> r_up = r.Upcast();
3287
0
  VecPair<SIMDVec32> g_up = g.Upcast();
3288
0
  VecPair<SIMDVec32> b_up = b.Upcast();
3289
0
  SIMDVec32 co_lo_v = r_up.low.Sub(b_up.low);
3290
0
  SIMDVec32 tmp_lo = b_up.low.Add(co_lo_v.SignedShiftRight<1>());
3291
0
  SIMDVec32 cg_lo_v = g_up.low.Sub(tmp_lo);
3292
0
  SIMDVec32 y_lo_v = tmp_lo.Add(cg_lo_v.SignedShiftRight<1>());
3293
0
  SIMDVec32 co_hi_v = r_up.hi.Sub(b_up.hi);
3294
0
  SIMDVec32 tmp_hi = b_up.hi.Add(co_hi_v.SignedShiftRight<1>());
3295
0
  SIMDVec32 cg_hi_v = g_up.hi.Sub(tmp_hi);
3296
0
  SIMDVec32 y_hi_v = tmp_hi.Add(cg_hi_v.SignedShiftRight<1>());
3297
0
  y_lo_v.Store(reinterpret_cast<uint32_t*>(y));
3298
0
  co_lo_v.Store(reinterpret_cast<uint32_t*>(co));
3299
0
  cg_lo_v.Store(reinterpret_cast<uint32_t*>(cg));
3300
0
  y_hi_v.Store(reinterpret_cast<uint32_t*>(y) + SIMDVec32::kLanes);
3301
0
  co_hi_v.Store(reinterpret_cast<uint32_t*>(co) + SIMDVec32::kLanes);
3302
0
  cg_hi_v.Store(reinterpret_cast<uint32_t*>(cg) + SIMDVec32::kLanes);
3303
0
}
3304
#endif
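Note: the 32-bit SIMD overloads above handle bit depths whose pixel_t is int32_t by widening each 16-bit vector into a low and a high 32-bit half (Upcast) and storing the halves SIMDVec32::kLanes apart. A scalar sketch of that store layout, with assumed lane counts:

#include <cstddef>
#include <cstdint>

constexpr size_t kLanes16 = 8;             // assumption for illustration
constexpr size_t kLanes32 = kLanes16 / 2;  // a 32-bit vector holds half as many lanes

// Widen one 16-bit "vector" into two 32-bit halves and store them
// contiguously: low half at dest, high half at dest + kLanes32.
void UpcastStoreSketch(const int16_t* src, int32_t* dest) {
  for (size_t i = 0; i < kLanes32; i++) dest[i] = src[i];
  for (size_t i = 0; i < kLanes32; i++) dest[kLanes32 + i] = src[kLanes32 + i];
}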
3305
3306
template <typename pixel_t>
3307
void FillRowRGB8(const unsigned char* rgba, size_t oxs, pixel_t* y, pixel_t* co,
3308
0
                 pixel_t* cg) {
3309
0
  size_t x = 0;
3310
#ifdef FJXL_GENERIC_SIMD
3311
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3312
0
    auto rgb = SIMDVec16::LoadRGB8(rgba + 3 * x);
3313
0
    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
3314
0
  }
3315
#endif
3316
0
  for (; x < oxs; x++) {
3317
0
    uint16_t r = rgba[3 * x];
3318
0
    uint16_t g = rgba[3 * x + 1];
3319
0
    uint16_t b = rgba[3 * x + 2];
3320
0
    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
3321
0
  }
3322
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB8<short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB8<short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB8<int>(unsigned char const*, unsigned long, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB8<int>(unsigned char const*, unsigned long, int*, int*, int*)
3323
3324
template <bool big_endian, typename pixel_t>
3325
void FillRowRGB16(const unsigned char* rgba, size_t oxs, pixel_t* y,
3326
0
                  pixel_t* co, pixel_t* cg) {
3327
0
  size_t x = 0;
3328
#ifdef FJXL_GENERIC_SIMD
3329
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3330
0
    auto rgb = SIMDVec16::LoadRGB16(rgba + 6 * x);
3331
0
    if (big_endian) {
3332
0
      rgb[0].SwapEndian();
3333
0
      rgb[1].SwapEndian();
3334
0
      rgb[2].SwapEndian();
3335
0
    }
3336
0
    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
3337
0
  }
3338
#endif
3339
0
  for (; x < oxs; x++) {
3340
0
    uint16_t r = LoadLE16(rgba + 6 * x);
3341
0
    uint16_t g = LoadLE16(rgba + 6 * x + 2);
3342
0
    uint16_t b = LoadLE16(rgba + 6 * x + 4);
3343
0
    if (big_endian) {
3344
0
      r = SwapEndian(r);
3345
0
      g = SwapEndian(g);
3346
0
      b = SwapEndian(b);
3347
0
    }
3348
0
    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
3349
0
  }
3350
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB16<true, short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB16<false, short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB16<true, int>(unsigned char const*, unsigned long, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB16<false, int>(unsigned char const*, unsigned long, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB16<true, short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB16<false, short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB16<true, int>(unsigned char const*, unsigned long, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB16<false, int>(unsigned char const*, unsigned long, int*, int*, int*)
3351
3352
template <typename pixel_t>
3353
void FillRowRGBA8(const unsigned char* rgba, size_t oxs, pixel_t* y,
3354
0
                  pixel_t* co, pixel_t* cg, pixel_t* alpha) {
3355
0
  size_t x = 0;
3356
#ifdef FJXL_GENERIC_SIMD
3357
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3358
0
    auto rgb = SIMDVec16::LoadRGBA8(rgba + 4 * x);
3359
0
    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
3360
0
    StorePixels(rgb[3], alpha + x);
3361
0
  }
3362
#endif
3363
0
  for (; x < oxs; x++) {
3364
0
    uint16_t r = rgba[4 * x];
3365
0
    uint16_t g = rgba[4 * x + 1];
3366
0
    uint16_t b = rgba[4 * x + 2];
3367
0
    uint16_t a = rgba[4 * x + 3];
3368
0
    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
3369
0
    alpha[x] = a;
3370
0
  }
3371
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA8<short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA8<short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA8<int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA8<int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
3372
3373
template <bool big_endian, typename pixel_t>
3374
void FillRowRGBA16(const unsigned char* rgba, size_t oxs, pixel_t* y,
3375
0
                   pixel_t* co, pixel_t* cg, pixel_t* alpha) {
3376
0
  size_t x = 0;
3377
#ifdef FJXL_GENERIC_SIMD
3378
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3379
0
    auto rgb = SIMDVec16::LoadRGBA16(rgba + 8 * x);
3380
0
    if (big_endian) {
3381
0
      rgb[0].SwapEndian();
3382
0
      rgb[1].SwapEndian();
3383
0
      rgb[2].SwapEndian();
3384
0
      rgb[3].SwapEndian();
3385
0
    }
3386
0
    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
3387
0
    StorePixels(rgb[3], alpha + x);
3388
0
  }
3389
#endif
3390
0
  for (; x < oxs; x++) {
3391
0
    uint16_t r = LoadLE16(rgba + 8 * x);
3392
0
    uint16_t g = LoadLE16(rgba + 8 * x + 2);
3393
0
    uint16_t b = LoadLE16(rgba + 8 * x + 4);
3394
0
    uint16_t a = LoadLE16(rgba + 8 * x + 6);
3395
0
    if (big_endian) {
3396
0
      r = SwapEndian(r);
3397
0
      g = SwapEndian(g);
3398
0
      b = SwapEndian(b);
3399
0
      a = SwapEndian(a);
3400
0
    }
3401
0
    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
3402
0
    alpha[x] = a;
3403
0
  }
3404
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA16<true, short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA16<false, short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA16<true, int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA16<false, int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA16<true, short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA16<false, short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA16<true, int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA16<false, int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
3405
3406
template <typename Processor, typename BitDepth>
3407
void ProcessImageArea(const unsigned char* rgba, size_t x0, size_t y0,
3408
                      size_t xs, size_t yskip, size_t ys, size_t row_stride,
3409
                      BitDepth bitdepth, size_t nb_chans, bool big_endian,
3410
0
                      Processor* processors) {
3411
0
  constexpr size_t kPadding = 32;
3412
3413
0
  using pixel_t = typename BitDepth::pixel_t;
3414
3415
0
  constexpr size_t kAlign = 64;
3416
0
  constexpr size_t kAlignPixels = kAlign / sizeof(pixel_t);
3417
3418
0
  auto align = [=](pixel_t* ptr) {
3419
0
    size_t offset = reinterpret_cast<uintptr_t>(ptr) % kAlign;
3420
0
    if (offset) {
3421
0
      ptr += offset / sizeof(pixel_t);
3422
0
    }
3423
0
    return ptr;
3424
0
  };
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>*)::{lambda(int*)#1}::operator()(int*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>*)::{lambda(int*)#1}::operator()(int*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>*)::{lambda(int*)#1}::operator()(int*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>*)::{lambda(int*)#1}::operator()(int*) const
3425
3426
0
  constexpr size_t kNumPx =
3427
0
      (256 + kPadding * 2 + kAlignPixels + kAlignPixels - 1) / kAlignPixels *
3428
0
      kAlignPixels;
3429
3430
0
  std::vector<std::array<std::array<pixel_t, kNumPx>, 2>> group_data(nb_chans);
3431
3432
0
  for (size_t y = 0; y < ys; y++) {
3433
0
    const auto rgba_row =
3434
0
        rgba + row_stride * (y0 + y) + x0 * nb_chans * BitDepth::kInputBytes;
3435
0
    pixel_t* crow[4] = {};
3436
0
    pixel_t* prow[4] = {};
3437
0
    for (size_t i = 0; i < nb_chans; i++) {
3438
0
      crow[i] = align(&group_data[i][y & 1][kPadding]);
3439
0
      prow[i] = align(&group_data[i][(y - 1) & 1][kPadding]);
3440
0
    }
3441
3442
    // Pre-fill rows with YCoCg-converted pixels.
3443
0
    if (nb_chans == 1) {
3444
0
      if (BitDepth::kInputBytes == 1) {
3445
0
        FillRowG8(rgba_row, xs, crow[0]);
3446
0
      } else if (big_endian) {
3447
0
        FillRowG16</*big_endian=*/true>(rgba_row, xs, crow[0]);
3448
0
      } else {
3449
0
        FillRowG16</*big_endian=*/false>(rgba_row, xs, crow[0]);
3450
0
      }
3451
0
    } else if (nb_chans == 2) {
3452
0
      if (BitDepth::kInputBytes == 1) {
3453
0
        FillRowGA8(rgba_row, xs, crow[0], crow[1]);
3454
0
      } else if (big_endian) {
3455
0
        FillRowGA16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1]);
3456
0
      } else {
3457
0
        FillRowGA16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1]);
3458
0
      }
3459
0
    } else if (nb_chans == 3) {
3460
0
      if (BitDepth::kInputBytes == 1) {
3461
0
        FillRowRGB8(rgba_row, xs, crow[0], crow[1], crow[2]);
3462
0
      } else if (big_endian) {
3463
0
        FillRowRGB16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1],
3464
0
                                          crow[2]);
3465
0
      } else {
3466
0
        FillRowRGB16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1],
3467
0
                                           crow[2]);
3468
0
      }
3469
0
    } else {
3470
0
      if (BitDepth::kInputBytes == 1) {
3471
0
        FillRowRGBA8(rgba_row, xs, crow[0], crow[1], crow[2], crow[3]);
3472
0
      } else if (big_endian) {
3473
0
        FillRowRGBA16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1],
3474
0
                                           crow[2], crow[3]);
3475
0
      } else {
3476
0
        FillRowRGBA16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1],
3477
0
                                            crow[2], crow[3]);
3478
0
      }
3479
0
    }
3480
    // Deal with x == 0.
3481
0
    for (size_t c = 0; c < nb_chans; c++) {
3482
0
      *(crow[c] - 1) = y > 0 ? *(prow[c]) : 0;
3483
      // Fix topleft.
3484
0
      *(prow[c] - 1) = y > 0 ? *(prow[c]) : 0;
3485
0
    }
3486
0
    if (y < yskip) continue;
3487
0
    for (size_t c = 0; c < nb_chans; c++) {
3488
      // Get pointers to px/left/top/topleft data to speed up the loop.
3489
0
      const pixel_t* row = crow[c];
3490
0
      const pixel_t* row_left = crow[c] - 1;
3491
0
      const pixel_t* row_top = y == 0 ? row_left : prow[c];
3492
0
      const pixel_t* row_topleft = y == 0 ? row_left : prow[c] - 1;
3493
3494
0
      processors[c].ProcessRow(row, row_left, row_top, row_topleft, xs);
3495
0
    }
3496
0
  }
3497
0
  for (size_t c = 0; c < nb_chans; c++) {
3498
0
    processors[c].Finalize();
3499
0
  }
3500
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>*)
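Note: ProcessImageArea never materializes the whole group. It keeps two padded scratch rows per channel and flips between them with y & 1, so the previous row is always at (y - 1) & 1; the kPadding pixels in front make the left/topleft reads at x == 0 legal, and the two writes after the FillRow* call seed those border pixels. A minimal single-channel sketch of the scheme (assumes row width <= 256, the group size; no SIMD, alignment, or yskip handling):

#include <array>
#include <cstddef>
#include <cstdint>
#include <vector>

constexpr size_t kPadding = 32;

template <typename Visit>
void TwoRowSketch(const int16_t* img, size_t W, size_t H, Visit visit) {
  std::vector<std::array<int16_t, 256 + kPadding * 2>> rows(2);
  for (size_t y = 0; y < H; y++) {
    int16_t* cur = &rows[y & 1][kPadding];
    int16_t* prev = &rows[(y - 1) & 1][kPadding];  // (0 - 1) & 1 == 1, as above
    for (size_t x = 0; x < W; x++) cur[x] = img[y * W + x];
    cur[-1] = y > 0 ? prev[0] : 0;   // left neighbor of x == 0
    prev[-1] = y > 0 ? prev[0] : 0;  // topleft neighbor of x == 0
    const int16_t* row_left = cur - 1;
    const int16_t* row_top = (y == 0) ? row_left : prev;        // first row: top == left
    const int16_t* row_topleft = (y == 0) ? row_left : prev - 1;
    for (size_t x = 0; x < W; x++) {
      visit(cur[x], row_left[x], row_top[x], row_topleft[x]);
    }
  }
}

int main() {
  const int16_t img[4] = {1, 2, 3, 4};  // a 2 x 2 image
  TwoRowSketch(img, 2, 2, [](int16_t px, int16_t l, int16_t t, int16_t tl) {
    (void)px; (void)l; (void)t; (void)tl;  // a predictor would combine these
  });
}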
3501
3502
template <typename BitDepth>
3503
void WriteACSection(const unsigned char* rgba, size_t x0, size_t y0, size_t xs,
3504
                    size_t ys, size_t row_stride, bool is_single_group,
3505
                    BitDepth bitdepth, size_t nb_chans, bool big_endian,
3506
                    const PrefixCode code[4],
3507
0
                    std::array<BitWriter, 4>& output) {
3508
0
  for (size_t i = 0; i < nb_chans; i++) {
3509
0
    if (is_single_group && i == 0) continue;
3510
0
    output[i].Allocate(xs * ys * bitdepth.MaxEncodedBitsPerSample() + 4);
3511
0
  }
3512
0
  if (!is_single_group) {
3513
    // Group header for modular image.
3514
    // When the image is single-group, the global modular image is the one
3515
    // that contains the pixel data, and there is no group header.
3516
0
    output[0].Write(1, 1);     // Global tree
3517
0
    output[0].Write(1, 1);     // All default wp
3518
0
    output[0].Write(2, 0b00);  // 0 transforms
3519
0
  }
3520
3521
0
  ChunkEncoder<BitDepth> encoders[4];
3522
0
  ChannelRowProcessor<ChunkEncoder<BitDepth>, BitDepth> row_encoders[4];
3523
0
  for (size_t c = 0; c < nb_chans; c++) {
3524
0
    row_encoders[c].t = &encoders[c];
3525
0
    encoders[c].output = &output[c];
3526
0
    encoders[c].code = &code[c];
3527
0
    encoders[c].PrepareForSimd();
3528
0
  }
3529
0
  ProcessImageArea<ChannelRowProcessor<ChunkEncoder<BitDepth>, BitDepth>>(
3530
0
      rgba, x0, y0, xs, 0, ys, row_stride, bitdepth, nb_chans, big_endian,
3531
0
      row_encoders);
3532
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::WriteACSection<AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::WriteACSection<AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::WriteACSection<AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::WriteACSection<AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::WriteACSection<default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::WriteACSection<default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::WriteACSection<default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::WriteACSection<default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
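Note: the three Write calls above emit the 4-bit modular group header: a use-global-tree flag, an all-default weighted-predictor flag, and a 2-bit transform count of zero. JPEG XL bitstreams pack bits LSB-first; a minimal bit writer in that spirit (hypothetical sketch; the real BitWriter also handles allocation and final padding):

#include <cstddef>
#include <cstdint>
#include <vector>

// Accumulates bits least-significant-first; sufficient for writes of up to
// 56 bits at a time.
struct BitWriterSketch {
  std::vector<uint8_t> bytes;
  uint64_t buffer = 0;
  size_t bits_in_buffer = 0;

  void Write(size_t nbits, uint64_t bits) {
    buffer |= bits << bits_in_buffer;
    bits_in_buffer += nbits;
    while (bits_in_buffer >= 8) {
      bytes.push_back(buffer & 0xFF);
      buffer >>= 8;
      bits_in_buffer -= 8;
    }
  }
};

int main() {
  BitWriterSketch w;
  w.Write(1, 1);     // use global tree
  w.Write(1, 1);     // all-default weighted predictor
  w.Write(2, 0b00);  // zero transforms
  // The header occupies 4 bits, 0b0011: read LSB-first as 1, 1, 00.
}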
3533
3534
constexpr int kHashExp = 16;
3535
constexpr uint32_t kHashSize = 1 << kHashExp;
3536
constexpr uint32_t kHashMultiplier = 2654435761;
3537
constexpr int kMaxColors = 512;
3538
3539
// Can be any function that returns a value in 0 .. kHashSize-1;
3540
// it has to map 0 to 0.
3541
0
inline uint32_t pixel_hash(uint32_t p) {
3542
0
  return (p * kHashMultiplier) >> (32 - kHashExp);
3543
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::pixel_hash(unsigned int)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::pixel_hash(unsigned int)
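Note: pixel_hash is a multiplicative (Fibonacci-style) hash: 2654435761 is the well-known 32-bit golden-ratio multiplier, and keeping the top kHashExp bits of the wrapping product spreads nearby pixel values across the table. Since 0 * kHashMultiplier == 0, the required hash(0) == 0 property holds automatically. A standalone check (hypothetical harness):

#include <cassert>
#include <cstdint>

constexpr int kHashExp = 16;
constexpr uint32_t kHashSize = 1 << kHashExp;
constexpr uint32_t kHashMultiplier = 2654435761;

inline uint32_t pixel_hash(uint32_t p) {
  return (p * kHashMultiplier) >> (32 - kHashExp);  // unsigned wrap, top bits kept
}

int main() {
  assert(pixel_hash(0) == 0);           // required: 0 must map to 0
  for (uint32_t p = 1; p < 100000; p++) {
    assert(pixel_hash(p) < kHashSize);  // always in 0 .. kHashSize-1
  }
}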
3544
3545
template <size_t nb_chans>
3546
void FillRowPalette(const unsigned char* inrow, size_t xs,
3547
0
                    const int16_t* lookup, int16_t* out) {
3548
0
  for (size_t x = 0; x < xs; x++) {
3549
0
    uint32_t p = 0;
3550
0
    for (size_t i = 0; i < nb_chans; ++i) {
3551
0
      p |= inrow[x * nb_chans + i] << (8 * i);
3552
0
    }
3553
0
    out[x] = lookup[pixel_hash(p)];
3554
0
  }
3555
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowPalette<1ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowPalette<2ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowPalette<3ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowPalette<4ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowPalette<1ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowPalette<2ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowPalette<3ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowPalette<4ul>(unsigned char const*, unsigned long, short const*, short*)
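Note: FillRowPalette packs up to four channel bytes of a pixel into one uint32 (channel i in byte i) and replaces the pixel with a palette index read from a hash-indexed lookup table, which must already map every color in the group. A sketch of building such a table for 3-channel input (hypothetical helper; the real construction, including the kMaxColors limit, lives elsewhere in this file):

#include <cstddef>
#include <cstdint>
#include <vector>

constexpr int kHashExp = 16;
constexpr uint32_t kHashSize = 1 << kHashExp;
constexpr uint32_t kHashMultiplier = 2654435761;

inline uint32_t pixel_hash(uint32_t p) {
  return (p * kHashMultiplier) >> (32 - kHashExp);
}

// Assign each distinct packed pixel value a palette index, keyed by its hash.
// Returns the number of colors, or -1 if two colors collide in the table
// (in which case palettization cannot be used).
int BuildLookupSketch(const unsigned char* rgb, size_t n_pixels,
                      std::vector<int16_t>* lookup) {
  lookup->assign(kHashSize, -1);
  std::vector<uint32_t> owner(kHashSize, 0);  // packed value owning each slot
  int next_index = 0;
  for (size_t x = 0; x < n_pixels; x++) {
    uint32_t p = 0;
    for (size_t i = 0; i < 3; i++) p |= uint32_t{rgb[x * 3 + i]} << (8 * i);
    uint32_t h = pixel_hash(p);
    if ((*lookup)[h] < 0) {
      (*lookup)[h] = next_index++;  // first time this color is seen
      owner[h] = p;
    } else if (owner[h] != p) {
      return -1;                    // hash collision between distinct colors
    }
  }
  return next_index;
}

int main() {
  const unsigned char row[6] = {255, 0, 0, 0, 0, 255};  // red, blue
  std::vector<int16_t> lookup;
  return BuildLookupSketch(row, 2, &lookup) == 2 ? 0 : 1;
}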
3556
3557
template <typename Processor>
3558
void ProcessImageAreaPalette(const unsigned char* rgba, size_t x0, size_t y0,
3559
                             size_t xs, size_t yskip, size_t ys,
3560
                             size_t row_stride, const int16_t* lookup,
3561
0
                             size_t nb_chans, Processor* processors) {
3562
0
  constexpr size_t kPadding = 32;
3563
3564
0
  std::vector<std::array<int16_t, 256 + kPadding * 2>> group_data(2);
3565
0
  Processor& row_encoder = processors[0];
3566
3567
0
  for (size_t y = 0; y < ys; y++) {
3568
    // Pre-fill rows with palette-converted pixels.
3569
0
    const unsigned char* inrow = rgba + row_stride * (y0 + y) + x0 * nb_chans;
3570
0
    int16_t* outrow = &group_data[y & 1][kPadding];
3571
0
    if (nb_chans == 1) {
3572
0
      FillRowPalette<1>(inrow, xs, lookup, outrow);
3573
0
    } else if (nb_chans == 2) {
3574
0
      FillRowPalette<2>(inrow, xs, lookup, outrow);
3575
0
    } else if (nb_chans == 3) {
3576
0
      FillRowPalette<3>(inrow, xs, lookup, outrow);
3577
0
    } else if (nb_chans == 4) {
3578
0
      FillRowPalette<4>(inrow, xs, lookup, outrow);
3579
0
    }
3580
    // Deal with x == 0.
3581
0
    group_data[y & 1][kPadding - 1] =
3582
0
        y > 0 ? group_data[(y - 1) & 1][kPadding] : 0;
3583
    // Fix topleft.
3584
0
    group_data[(y - 1) & 1][kPadding - 1] =
3585
0
        y > 0 ? group_data[(y - 1) & 1][kPadding] : 0;
3586
    // Get pointers to px/left/top/topleft data to speed up the loop.
3587
0
    const int16_t* row = &group_data[y & 1][kPadding];
3588
0
    const int16_t* row_left = &group_data[y & 1][kPadding - 1];
3589
0
    const int16_t* row_top =
3590
0
        y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding];
3591
0
    const int16_t* row_topleft =
3592
0
        y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding - 1];
3593
3594
0
    row_encoder.ProcessRow(row, row_left, row_top, row_topleft, xs);
3595
0
  }
3596
0
  row_encoder.Finalize();
3597
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageAreaPalette<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits> >(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, short const*, unsigned long, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageAreaPalette<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits> >(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, short const*, unsigned long, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageAreaPalette<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits> >(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, short const*, unsigned long, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageAreaPalette<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits> >(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, short const*, unsigned long, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)
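ProcessImageAreaPalette keeps only two padded rows alive at a time: row y lives in group_data[y & 1] and row y-1 in the other slot, while the kPadding headroom lets the x == 0 column read a synthetic "left" neighbour without a branch in the inner loop. A toy version of that ring buffer under the same indexing, with an invented average predictor standing in for the real Processor:

#include <algorithm>
#include <array>
#include <cstdint>
#include <cstdio>
#include <vector>

constexpr size_t kPadding = 32;

int main() {
  constexpr size_t xs = 4, ys = 3;
  int16_t img[ys][xs] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};
  // Two padded rows; row y occupies slot y & 1, row y-1 the other slot.
  std::vector<std::array<int16_t, 256 + kPadding * 2>> rows(2);
  for (size_t y = 0; y < ys; y++) {
    int16_t* out = &rows[y & 1][kPadding];
    std::copy(img[y], img[y] + xs, out);
    // The "left" of x == 0 is the previous row's first sample (or 0).
    rows[y & 1][kPadding - 1] = y > 0 ? rows[(y - 1) & 1][kPadding] : 0;
    const int16_t* left = &rows[y & 1][kPadding - 1];
    const int16_t* top = y == 0 ? left : &rows[(y - 1) & 1][kPadding];
    for (size_t x = 0; x < xs; x++) {
      // Toy predictor: average of left and top neighbours.
      std::printf("%3d ", out[x] - (left[x] + top[x]) / 2);
    }
    std::printf("\n");
  }
}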
3598
3599
void WriteACSectionPalette(const unsigned char* rgba, size_t x0, size_t y0,
3600
                           size_t xs, size_t ys, size_t row_stride,
3601
                           bool is_single_group, const PrefixCode code[4],
3602
                           const int16_t* lookup, size_t nb_chans,
3603
0
                           BitWriter& output) {
3604
0
  if (!is_single_group) {
3605
0
    output.Allocate(16 * xs * ys + 4);
3606
    // Group header for modular image.
3607
    // When the image is single-group, the global modular image is the one
3608
    // that contains the pixel data, and there is no group header.
3609
0
    output.Write(1, 1);     // Global tree
3610
0
    output.Write(1, 1);     // All default wp
3611
0
    output.Write(2, 0b00);  // 0 transforms
3612
0
  }
3613
3614
0
  ChunkEncoder<UpTo8Bits> encoder;
3615
0
  ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits> row_encoder;
3616
3617
0
  row_encoder.t = &encoder;
3618
0
  encoder.output = &output;
3619
0
  encoder.code = &code[is_single_group ? 1 : 0];
3620
0
  encoder.PrepareForSimd();
3621
0
  ProcessImageAreaPalette<
3622
0
      ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits>>(
3623
0
      rgba, x0, y0, xs, 0, ys, row_stride, lookup, nb_chans, &row_encoder);
3624
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::WriteACSectionPalette(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, (anonymous namespace)::PrefixCode const*, short const*, unsigned long, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::WriteACSectionPalette(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, (anonymous namespace)::PrefixCode const*, short const*, unsigned long, (anonymous namespace)::BitWriter&)
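WriteACSectionPalette drives the same ChunkEncoder as the non-palette path, only fed with the palette-index rows from above; the Write(n, v) calls at the top emit small header fields directly into the stream. A minimal LSB-first bit writer in the same spirit; the real BitWriter also supports Allocate and byte-exact accounting, so treat this as a sketch rather than the library's API:

#include <cstdint>
#include <cstdio>
#include <vector>

// LSB-first bit writer: Write(n, v) appends the low n bits of v to the
// stream. Valid for n <= 56 per call, which header fields satisfy.
class TinyBitWriter {
 public:
  void Write(uint32_t nbits, uint64_t bits) {
    buffer_ |= bits << bits_in_buffer_;
    bits_in_buffer_ += nbits;
    while (bits_in_buffer_ >= 8) {
      bytes_.push_back(uint8_t(buffer_ & 0xFF));
      buffer_ >>= 8;
      bits_in_buffer_ -= 8;
    }
  }
  void ZeroPadToByte() {
    if (bits_in_buffer_ > 0) Write(8 - bits_in_buffer_, 0);
  }
  const std::vector<uint8_t>& bytes() const { return bytes_; }

 private:
  uint64_t buffer_ = 0;
  uint32_t bits_in_buffer_ = 0;
  std::vector<uint8_t> bytes_;
};

int main() {
  TinyBitWriter w;
  w.Write(1, 1);     // e.g. the "global tree" flag in the header above
  w.Write(1, 1);     // "all default wp"
  w.Write(2, 0b00);  // "0 transforms"
  w.ZeroPadToByte();
  std::printf("first byte: 0x%02x\n", w.bytes()[0]);  // prints 0x03
}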
3625
3626
template <typename BitDepth>
3627
void CollectSamples(const unsigned char* rgba, size_t x0, size_t y0, size_t xs,
3628
                    size_t row_stride, size_t row_count,
3629
                    uint64_t raw_counts[4][kNumRawSymbols],
3630
                    uint64_t lz77_counts[4][kNumLZ77], bool is_single_group,
3631
                    bool palette, BitDepth bitdepth, size_t nb_chans,
3632
0
                    bool big_endian, const int16_t* lookup) {
3633
0
  if (palette) {
3634
0
    ChunkSampleCollector<UpTo8Bits> sample_collectors[4];
3635
0
    ChannelRowProcessor<ChunkSampleCollector<UpTo8Bits>, UpTo8Bits>
3636
0
        row_sample_collectors[4];
3637
0
    for (size_t c = 0; c < nb_chans; c++) {
3638
0
      row_sample_collectors[c].t = &sample_collectors[c];
3639
0
      sample_collectors[c].raw_counts = raw_counts[is_single_group ? 1 : 0];
3640
0
      sample_collectors[c].lz77_counts = lz77_counts[is_single_group ? 1 : 0];
3641
0
    }
3642
0
    ProcessImageAreaPalette<
3643
0
        ChannelRowProcessor<ChunkSampleCollector<UpTo8Bits>, UpTo8Bits>>(
3644
0
        rgba, x0, y0, xs, 1, 1 + row_count, row_stride, lookup, nb_chans,
3645
0
        row_sample_collectors);
3646
0
  } else {
3647
0
    ChunkSampleCollector<BitDepth> sample_collectors[4];
3648
0
    ChannelRowProcessor<ChunkSampleCollector<BitDepth>, BitDepth>
3649
0
        row_sample_collectors[4];
3650
0
    for (size_t c = 0; c < nb_chans; c++) {
3651
0
      row_sample_collectors[c].t = &sample_collectors[c];
3652
0
      sample_collectors[c].raw_counts = raw_counts[c];
3653
0
      sample_collectors[c].lz77_counts = lz77_counts[c];
3654
0
    }
3655
0
    ProcessImageArea<
3656
0
        ChannelRowProcessor<ChunkSampleCollector<BitDepth>, BitDepth>>(
3657
0
        rgba, x0, y0, xs, 1, 1 + row_count, row_stride, bitdepth, nb_chans,
3658
0
        big_endian, row_sample_collectors);
3659
0
  }
3660
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::CollectSamples<AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::CollectSamples<AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::CollectSamples<AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::CollectSamples<AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::CollectSamples<default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::CollectSamples<default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::CollectSamples<default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::CollectSamples<default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, short const*)
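CollectSamples reuses the row-processing machinery with a different Processor policy: a ChunkSampleCollector that only histograms symbols (to size the prefix codes later) instead of writing bits. A stripped-down illustration of that policy-template design; all type and function names here are invented for the sketch:

#include <cstddef>
#include <cstdint>
#include <cstdio>

// The driver is templated on a Processor policy, so one traversal
// serves both a counting pass and an encoding pass, mirroring
// ChunkSampleCollector vs. ChunkEncoder above.
struct CountSymbols {
  uint64_t counts[256] = {};
  void Chunk(const uint8_t* data, size_t n) {
    for (size_t i = 0; i < n; i++) counts[data[i]]++;
  }
};

struct PrintSymbols {
  void Chunk(const uint8_t* data, size_t n) {
    for (size_t i = 0; i < n; i++) std::printf("%02x ", data[i]);
  }
};

template <typename Processor>
void ProcessRow(const uint8_t* row, size_t n, Processor* p) {
  p->Chunk(row, n);  // the real driver computes residuals first
}

int main() {
  const uint8_t row[] = {1, 1, 2, 3};
  CountSymbols counter;
  ProcessRow(row, sizeof row, &counter);  // pass 1: histogram only
  PrintSymbols printer;
  ProcessRow(row, sizeof row, &printer);  // pass 2: emit symbols
  std::printf("\ncount[1] = %llu\n",
              static_cast<unsigned long long>(counter.counts[1]));
}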
3661
3662
void PrepareDCGlobalPalette(bool is_single_group, size_t width, size_t height,
3663
                            size_t nb_chans, const PrefixCode code[4],
3664
                            const std::vector<uint32_t>& palette,
3665
0
                            size_t pcolors, BitWriter* output) {
3666
0
  PrepareDCGlobalCommon(is_single_group, width, height, code, output);
3667
0
  output->Write(2, 0b01);     // 1 transform
3668
0
  output->Write(2, 0b01);     // Palette
3669
0
  output->Write(5, 0b00000);  // Starting from ch 0
3670
0
  if (nb_chans == 1) {
3671
0
    output->Write(2, 0b00);  // 1-channel palette (Gray)
3672
0
  } else if (nb_chans == 3) {
3673
0
    output->Write(2, 0b01);  // 3-channel palette (RGB)
3674
0
  } else if (nb_chans == 4) {
3675
0
    output->Write(2, 0b10);  // 4-channel palette (RGBA)
3676
0
  } else {
3677
0
    output->Write(2, 0b11);
3678
0
    output->Write(13, nb_chans - 1);
3679
0
  }
3680
  // pcolors <= kMaxColors + kChunkSize - 1
3681
0
  static_assert(kMaxColors + kChunkSize < 1281,
3682
0
                "add code to signal larger palette sizes");
3683
0
  if (pcolors < 256) {
3684
0
    output->Write(2, 0b00);
3685
0
    output->Write(8, pcolors);
3686
0
  } else {
3687
0
    output->Write(2, 0b01);
3688
0
    output->Write(10, pcolors - 256);
3689
0
  }
3690
3691
0
  output->Write(2, 0b00);  // nb_deltas == 0
3692
0
  output->Write(4, 0);     // Zero predictor for delta palette
3693
  // Encode palette
3694
0
  ChunkEncoder<UpTo8Bits> encoder;
3695
0
  ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits> row_encoder;
3696
0
  row_encoder.t = &encoder;
3697
0
  encoder.output = output;
3698
0
  encoder.code = &code[0];
3699
0
  encoder.PrepareForSimd();
3700
0
  std::vector<std::array<int16_t, 32 + 1024>> p(4);
3701
0
  size_t i = 0;
3702
0
  size_t have_zero = 1;
3703
0
  for (; i < pcolors; i++) {
3704
0
    p[0][16 + i + have_zero] = palette[i] & 0xFF;
3705
0
    p[1][16 + i + have_zero] = (palette[i] >> 8) & 0xFF;
3706
0
    p[2][16 + i + have_zero] = (palette[i] >> 16) & 0xFF;
3707
0
    p[3][16 + i + have_zero] = (palette[i] >> 24) & 0xFF;
3708
0
  }
3709
0
  p[0][15] = 0;
3710
0
  row_encoder.ProcessRow(p[0].data() + 16, p[0].data() + 15, p[0].data() + 15,
3711
0
                         p[0].data() + 15, pcolors);
3712
0
  p[1][15] = p[0][16];
3713
0
  p[0][15] = p[0][16];
3714
0
  if (nb_chans > 1) {
3715
0
    row_encoder.ProcessRow(p[1].data() + 16, p[1].data() + 15, p[0].data() + 16,
3716
0
                           p[0].data() + 15, pcolors);
3717
0
  }
3718
0
  p[2][15] = p[1][16];
3719
0
  p[1][15] = p[1][16];
3720
0
  if (nb_chans > 2) {
3721
0
    row_encoder.ProcessRow(p[2].data() + 16, p[2].data() + 15, p[1].data() + 16,
3722
0
                           p[1].data() + 15, pcolors);
3723
0
  }
3724
0
  p[3][15] = p[2][16];
3725
0
  p[2][15] = p[2][16];
3726
0
  if (nb_chans > 3) {
3727
0
    row_encoder.ProcessRow(p[3].data() + 16, p[3].data() + 15, p[2].data() + 16,
3728
0
                           p[2].data() + 15, pcolors);
3729
0
  }
3730
0
  row_encoder.Finalize();
3731
3732
0
  if (!is_single_group) {
3733
0
    output->ZeroPadToByte();
3734
0
  }
3735
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::PrepareDCGlobalPalette(bool, unsigned long, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> > const&, unsigned long, (anonymous namespace)::BitWriter*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::PrepareDCGlobalPalette(bool, unsigned long, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> > const&, unsigned long, (anonymous namespace)::BitWriter*)
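The palette size written by PrepareDCGlobalPalette is a 2-bit selector followed by a payload whose width depends on the range: small palettes cost 10 bits total, and sizes of 256 and up are stored as 10-bit offsets, covering 256 .. 1279. The static_assert in the source guarantees pcolors stays in that range. A standalone sketch of just the field layout (the pair-list representation is illustrative, not the library's API):

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Returns the (nbits, value) fields that encode a palette size.
std::vector<std::pair<int, uint32_t>> PaletteSizeFields(uint32_t pcolors) {
  if (pcolors < 256) return {{2, 0b00}, {8, pcolors}};
  return {{2, 0b01}, {10, pcolors - 256}};  // covers 256 .. 1279
}

int main() {
  for (const auto& f : PaletteSizeFields(300)) {
    std::printf("write %d bits: %u\n", f.first, f.second);
  }
}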
3736
3737
template <size_t nb_chans>
3738
bool detect_palette(const unsigned char* r, size_t width,
3739
0
                    std::vector<uint32_t>& palette) {
3740
0
  size_t x = 0;
3741
0
  bool collided = false;
3742
  // This is just an unrolled version of the loop below.
3743
0
  size_t look_ahead = 7 + ((nb_chans == 1) ? 3 : ((nb_chans < 4) ? 1 : 0));
3744
0
  for (; x + look_ahead < width; x += 8) {
3745
0
    uint32_t p[8] = {}, index[8];
3746
0
    for (int i = 0; i < 8; i++) {
3747
0
      for (int j = 0; j < 4; ++j) {
3748
0
        p[i] |= r[(x + i) * nb_chans + j] << (8 * j);
3749
0
      }
3750
0
    }
3751
0
    for (int i = 0; i < 8; i++) p[i] &= ((1llu << (8 * nb_chans)) - 1);
3752
0
    for (int i = 0; i < 8; i++) index[i] = pixel_hash(p[i]);
3753
0
    for (int i = 0; i < 8; i++) {
3754
0
      collided |= (palette[index[i]] != 0 && p[i] != palette[index[i]]);
3755
0
      palette[index[i]] = p[i];
3756
0
    }
3757
0
  }
3758
0
  for (; x < width; x++) {
3759
0
    uint32_t p = 0;
3760
0
    for (size_t i = 0; i < nb_chans; ++i) {
3761
0
      p |= r[x * nb_chans + i] << (8 * i);
3762
0
    }
3763
0
    uint32_t index = pixel_hash(p);
3764
0
    collided |= (palette[index] != 0 && p != palette[index]);
3765
0
    palette[index] = p;
3766
0
  }
3767
0
  return collided;
3768
0
}
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX2::(anonymous namespace)::detect_palette<1ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX2::(anonymous namespace)::detect_palette<2ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX2::(anonymous namespace)::detect_palette<3ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool AVX2::(anonymous namespace)::detect_palette<4ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool default_implementation::(anonymous namespace)::detect_palette<1ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool default_implementation::(anonymous namespace)::detect_palette<2ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool default_implementation::(anonymous namespace)::detect_palette<3ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
Unexecuted instantiation: enc_fast_lossless.cc:bool default_implementation::(anonymous namespace)::detect_palette<4ul>(unsigned char const*, unsigned long, std::__1::vector<unsigned int, std::__1::allocator<unsigned int> >&)
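detect_palette is a lossy hash-set membership test: each packed pixel is written to its hash slot, and a slot that already holds a different colour flags a collision, after which the encoder falls back to the non-palette path. The scalar tail of the function reduces to the sketch below (the unrolled main loop is a speed optimization over the same logic):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

constexpr int kHashExp = 16;
constexpr uint32_t kHashSize = 1 << kHashExp;

uint32_t pixel_hash(uint32_t p) {
  return (p * 2654435761u) >> (32 - kHashExp);
}

// Slot value 0 doubles as "empty", which is safe because pixel_hash
// maps the all-zero colour to slot 0.
bool DetectPalette(const uint32_t* px, size_t n,
                   std::vector<uint32_t>& palette) {
  bool collided = false;
  for (size_t i = 0; i < n; i++) {
    uint32_t index = pixel_hash(px[i]);
    collided |= (palette[index] != 0 && px[i] != palette[index]);
    palette[index] = px[i];
  }
  return collided;
}

int main() {
  std::vector<uint32_t> palette(kHashSize, 0);
  const uint32_t pixels[] = {0xFF0000FFu, 0xFF00FF00u, 0xFF0000FFu};
  std::printf("collided: %d\n", int(DetectPalette(pixels, 3, palette)));
}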
3769
3770
template <typename BitDepth>
3771
JxlFastLosslessFrameState* LLPrepare(JxlChunkedFrameInputSource input,
3772
                                     size_t width, size_t height,
3773
                                     BitDepth bitdepth, size_t nb_chans,
3774
0
                                     bool big_endian, int effort, int oneshot) {
3775
0
  assert(width != 0);
3776
0
  assert(height != 0);
3777
3778
  // Count colors to decide whether to attempt palette encoding
3779
0
  std::vector<uint32_t> palette(kHashSize);
3780
0
  std::vector<int16_t> lookup(kHashSize);
3781
0
  lookup[0] = 0;
3782
0
  int pcolors = 0;
3783
0
  bool collided = effort < 2 || bitdepth.bitdepth != 8 || !oneshot;
3784
0
  for (size_t y0 = 0; y0 < height && !collided; y0 += 256) {
3785
0
    size_t ys = std::min<size_t>(height - y0, 256);
3786
0
    for (size_t x0 = 0; x0 < width && !collided; x0 += 256) {
3787
0
      size_t xs = std::min<size_t>(width - x0, 256);
3788
0
      size_t stride;
3789
      // TODO(szabadka): Add RAII wrapper around this.
3790
0
      const void* buffer = input.get_color_channel_data_at(input.opaque, x0, y0,
3791
0
                                                           xs, ys, &stride);
3792
0
      auto rgba = reinterpret_cast<const unsigned char*>(buffer);
3793
0
      for (size_t y = 0; y < ys && !collided; y++) {
3794
0
        const unsigned char* r = rgba + stride * y;
3795
0
        if (nb_chans == 1) collided = detect_palette<1>(r, xs, palette);
3796
0
        if (nb_chans == 2) collided = detect_palette<2>(r, xs, palette);
3797
0
        if (nb_chans == 3) collided = detect_palette<3>(r, xs, palette);
3798
0
        if (nb_chans == 4) collided = detect_palette<4>(r, xs, palette);
3799
0
      }
3800
0
      input.release_buffer(input.opaque, buffer);
3801
0
    }
3802
0
  }
3803
0
  int nb_entries = 0;
3804
0
  if (!collided) {
3805
0
    pcolors = 1;  // always have all-zero as a palette color
3806
0
    bool have_color = false;
3807
0
    uint8_t minG = 255, maxG = 0;
3808
0
    for (uint32_t k = 0; k < kHashSize; k++) {
3809
0
      if (palette[k] == 0) continue;
3810
0
      uint8_t p[4];
3811
0
      for (int i = 0; i < 4; ++i) {
3812
0
        p[i] = (palette[k] >> (8 * i)) & 0xFF;
3813
0
      }
3814
      // move entries to front so sort has less work
3815
0
      palette[nb_entries] = palette[k];
3816
0
      if (p[0] != p[1] || p[0] != p[2]) have_color = true;
3817
0
      if (p[1] < minG) minG = p[1];
3818
0
      if (p[1] > maxG) maxG = p[1];
3819
0
      nb_entries++;
3820
      // don't do palette if too many colors are needed
3821
0
      if (nb_entries + pcolors > kMaxColors) {
3822
0
        collided = true;
3823
0
        break;
3824
0
      }
3825
0
    }
3826
0
    if (!have_color) {
3827
      // don't do palette if it's just grayscale without many holes
3828
0
      if (maxG - minG < nb_entries * 1.4f) collided = true;
3829
0
    }
3830
0
  }
3831
0
  if (!collided) {
3832
0
    std::sort(
3833
0
        palette.begin(), palette.begin() + nb_entries,
3834
0
        [&nb_chans](uint32_t ap, uint32_t bp) {
3835
0
          if (ap == 0) return false;
3836
0
          if (bp == 0) return true;
3837
0
          uint8_t a[4], b[4];
3838
0
          for (int i = 0; i < 4; ++i) {
3839
0
            a[i] = (ap >> (8 * i)) & 0xFF;
3840
0
            b[i] = (bp >> (8 * i)) & 0xFF;
3841
0
          }
3842
0
          float ay, by;
3843
0
          if (nb_chans == 4) {
3844
0
            ay = (0.299f * a[0] + 0.587f * a[1] + 0.114f * a[2] + 0.01f) * a[3];
3845
0
            by = (0.299f * b[0] + 0.587f * b[1] + 0.114f * b[2] + 0.01f) * b[3];
3846
0
          } else {
3847
0
            ay = (0.299f * a[0] + 0.587f * a[1] + 0.114f * a[2] + 0.01f);
3848
0
            by = (0.299f * b[0] + 0.587f * b[1] + 0.114f * b[2] + 0.01f);
3849
0
          }
3850
0
          return ay < by;  // sort on alpha*luma
3851
0
        });
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
3852
0
    for (int k = 0; k < nb_entries; k++) {
3853
0
      if (palette[k] == 0) break;
3854
0
      lookup[pixel_hash(palette[k])] = pcolors++;
3855
0
    }
3856
0
  }
3857
3858
0
  size_t num_groups_x = (width + 255) / 256;
3859
0
  size_t num_groups_y = (height + 255) / 256;
3860
0
  size_t num_dc_groups_x = (width + 2047) / 2048;
3861
0
  size_t num_dc_groups_y = (height + 2047) / 2048;
3862
3863
0
  uint64_t raw_counts[4][kNumRawSymbols] = {};
3864
0
  uint64_t lz77_counts[4][kNumLZ77] = {};
3865
3866
0
  bool onegroup = num_groups_x == 1 && num_groups_y == 1;
3867
3868
0
  auto sample_rows = [&](size_t xg, size_t yg, size_t num_rows) {
3869
0
    size_t y0 = yg * 256;
3870
0
    size_t x0 = xg * 256;
3871
0
    size_t ys = std::min<size_t>(height - y0, 256);
3872
0
    size_t xs = std::min<size_t>(width - x0, 256);
3873
0
    size_t stride;
3874
0
    const void* buffer =
3875
0
        input.get_color_channel_data_at(input.opaque, x0, y0, xs, ys, &stride);
3876
0
    auto rgba = reinterpret_cast<const unsigned char*>(buffer);
3877
0
    int y_begin_group =
3878
0
        std::max<ssize_t>(
3879
0
            0, static_cast<ssize_t>(ys) - static_cast<ssize_t>(num_rows)) /
3880
0
        2;
3881
0
    int y_count = std::min<int>(num_rows, ys - y_begin_group);
3882
0
    int x_max = xs / kChunkSize * kChunkSize;
3883
0
    CollectSamples(rgba, 0, y_begin_group, x_max, stride, y_count, raw_counts,
3884
0
                   lz77_counts, onegroup, !collided, bitdepth, nb_chans,
3885
0
                   big_endian, lookup.data());
3886
0
    input.release_buffer(input.opaque, buffer);
3887
0
  };
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
3888
3889
  // TODO(veluca): that `64` is an arbitrary constant, meant to correspond to
3890
  // the point where the number of processed rows is large enough that loading
3891
  // the entire image is cost-effective.
3892
0
  if (oneshot || effort >= 64) {
3893
0
    for (size_t g = 0; g < num_groups_y * num_groups_x; g++) {
3894
0
      size_t xg = g % num_groups_x;
3895
0
      size_t yg = g / num_groups_x;
3896
0
      size_t y0 = yg * 256;
3897
0
      size_t ys = std::min<size_t>(height - y0, 256);
3898
0
      size_t num_rows = 2 * effort * ys / 256;
3899
0
      sample_rows(xg, yg, num_rows);
3900
0
    }
3901
0
  } else {
3902
    // sample the middle (effort * 2 * num_groups) rows of the center group
3903
    // (possibly all of them).
3904
0
    sample_rows((num_groups_x - 1) / 2, (num_groups_y - 1) / 2,
3905
0
                2 * effort * num_groups_x * num_groups_y);
3906
0
  }
3907
3908
  // TODO(veluca): can probably improve this and make it bitdepth-dependent.
3909
0
  uint64_t base_raw_counts[kNumRawSymbols] = {
3910
0
      3843, 852, 1270, 1214, 1014, 727, 481, 300, 159, 51,
3911
0
      5,    1,   1,    1,    1,    1,   1,   1,   1};
3912
3913
0
  bool doing_ycocg = nb_chans > 2 && collided;
3914
0
  bool large_palette = !collided || pcolors >= 256;
3915
0
  for (size_t i = bitdepth.NumSymbols(doing_ycocg || large_palette);
3916
0
       i < kNumRawSymbols; i++) {
3917
0
    base_raw_counts[i] = 0;
3918
0
  }
3919
3920
0
  for (size_t c = 0; c < 4; c++) {
3921
0
    for (size_t i = 0; i < kNumRawSymbols; i++) {
3922
0
      raw_counts[c][i] = (raw_counts[c][i] << 8) + base_raw_counts[i];
3923
0
    }
3924
0
  }
3925
3926
0
  if (!collided) {
3927
0
    unsigned token, nbits, bits;
3928
0
    EncodeHybridUint000(PackSigned(pcolors - 1), &token, &nbits, &bits);
3929
    // ensure all palette indices can actually be encoded
3930
0
    for (size_t i = 0; i < token + 1; i++)
3931
0
      raw_counts[0][i] = std::max<uint64_t>(raw_counts[0][i], 1);
3932
    // these tokens are only used for the palette itself so they can get a bad
3933
    // code
3934
0
    for (size_t i = token + 1; i < 10; i++) raw_counts[0][i] = 1;
3935
0
  }
3936
3937
0
  uint64_t base_lz77_counts[kNumLZ77] = {
3938
0
      29, 27, 25,  23, 21, 21, 19, 18, 21, 17, 16, 15, 15, 14,
3939
0
      13, 13, 137, 98, 61, 34, 1,  1,  1,  1,  1,  1,  1,  1,
3940
0
  };
3941
3942
0
  for (size_t c = 0; c < 4; c++) {
3943
0
    for (size_t i = 0; i < kNumLZ77; i++) {
3944
0
      lz77_counts[c][i] = (lz77_counts[c][i] << 8) + base_lz77_counts[i];
3945
0
    }
3946
0
  }
3947
3948
0
  JxlFastLosslessFrameState* frame_state = new JxlFastLosslessFrameState();
3949
0
  for (size_t i = 0; i < 4; i++) {
3950
0
    frame_state->hcode[i] = PrefixCode(bitdepth, raw_counts[i], lz77_counts[i]);
3951
0
  }
3952
3953
0
  size_t num_dc_groups = num_dc_groups_x * num_dc_groups_y;
3954
0
  size_t num_ac_groups = num_groups_x * num_groups_y;
3955
0
  size_t num_groups = onegroup ? 1 : (2 + num_dc_groups + num_ac_groups);
3956
0
  frame_state->input = input;
3957
0
  frame_state->width = width;
3958
0
  frame_state->height = height;
3959
0
  frame_state->num_groups_x = num_groups_x;
3960
0
  frame_state->num_groups_y = num_groups_y;
3961
0
  frame_state->num_dc_groups_x = num_dc_groups_x;
3962
0
  frame_state->num_dc_groups_y = num_dc_groups_y;
3963
0
  frame_state->nb_chans = nb_chans;
3964
0
  frame_state->bitdepth = bitdepth.bitdepth;
3965
0
  frame_state->big_endian = big_endian;
3966
0
  frame_state->effort = effort;
3967
0
  frame_state->collided = collided;
3968
0
  frame_state->lookup = lookup;
3969
3970
0
  frame_state->group_data = std::vector<std::array<BitWriter, 4>>(num_groups);
3971
0
  frame_state->group_sizes.resize(num_groups);
3972
0
  if (collided) {
3973
0
    PrepareDCGlobal(onegroup, width, height, nb_chans, frame_state->hcode,
3974
0
                    &frame_state->group_data[0][0]);
3975
0
  } else {
3976
0
    PrepareDCGlobalPalette(onegroup, width, height, nb_chans,
3977
0
                           frame_state->hcode, palette, pcolors,
3978
0
                           &frame_state->group_data[0][0]);
3979
0
  }
3980
0
  frame_state->group_sizes[0] = SectionSize(frame_state->group_data[0]);
3981
0
  if (!onegroup) {
3982
0
    ComputeAcGroupDataOffset(frame_state->group_sizes[0], num_dc_groups,
3983
0
                             num_ac_groups, frame_state->min_dc_global_size,
3984
0
                             frame_state->ac_group_data_offset);
3985
0
  }
3986
3987
0
  return frame_state;
3988
0
}
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)
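One detail of LLPrepare worth isolating is the palette ordering: entries are sorted by alpha-weighted Rec. 601 luma so that neighbouring palette indices correspond to similar colours, which makes the index plane more predictable. A standalone version of that comparator's key; the real lambda above additionally pins zero entries to the end, which this sketch omits:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Rec. 601 luma, weighted by alpha for 4-channel images; the small
// +0.01f bias keeps pure black entries ordered by their alpha.
float SortKey(uint32_t p, size_t nb_chans) {
  uint8_t c[4];
  for (int i = 0; i < 4; ++i) c[i] = (p >> (8 * i)) & 0xFF;
  float y = 0.299f * c[0] + 0.587f * c[1] + 0.114f * c[2] + 0.01f;
  return nb_chans == 4 ? y * c[3] : y;
}

int main() {
  std::vector<uint32_t> palette = {0xFFFFFFFFu, 0xFF000000u, 0xFF808080u};
  std::sort(palette.begin(), palette.end(), [](uint32_t a, uint32_t b) {
    return SortKey(a, 4) < SortKey(b, 4);
  });
  for (uint32_t p : palette) std::printf("%08x\n", p);  // dark to light
}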
3989
3990
template <typename BitDepth>
3991
jxl::Status LLProcess(JxlFastLosslessFrameState* frame_state, bool is_last,
3992
                      BitDepth bitdepth, void* runner_opaque,
3993
                      FJxlParallelRunner runner,
3994
0
                      JxlEncoderOutputProcessorWrapper* output_processor) {
3995
0
#if !FJXL_STANDALONE
3996
0
  if (frame_state->process_done) {
3997
0
    JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/0, is_last);
3998
0
    if (output_processor) {
3999
0
      JXL_RETURN_IF_ERROR(
4000
0
          JxlFastLosslessOutputFrame(frame_state, output_processor));
4001
0
    }
4002
0
    return true;
4003
0
  }
4004
0
#endif
4005
  // The maximum number of groups that we process concurrently here.
4006
  // TODO(szabadka) Use the number of threads or some external parameter
4007
  // to bound the maximum memory usage instead.
4008
0
  constexpr size_t kMaxLocalGroups = 16;
4009
0
  bool onegroup = frame_state->group_sizes.size() == 1;
4010
0
  bool streaming = !onegroup && output_processor;
4011
0
  size_t total_groups = frame_state->num_groups_x * frame_state->num_groups_y;
4012
0
  size_t max_groups = streaming ? kMaxLocalGroups : total_groups;
4013
0
#if !FJXL_STANDALONE
4014
0
  size_t start_pos = 0;
4015
0
  if (streaming) {
4016
0
    start_pos = output_processor->CurrentPosition();
4017
0
    JXL_RETURN_IF_ERROR(
4018
0
        output_processor->Seek(start_pos + frame_state->ac_group_data_offset));
4019
0
  }
4020
0
#endif
4021
0
  for (size_t offset = 0; offset < total_groups; offset += max_groups) {
4022
0
    size_t num_groups = std::min(max_groups, total_groups - offset);
4023
0
    JxlFastLosslessFrameState local_frame_state;
4024
0
    if (streaming) {
4025
0
      local_frame_state.group_data =
4026
0
          std::vector<std::array<BitWriter, 4>>(num_groups);
4027
0
    }
4028
0
    auto run_one = [&](size_t i) {
4029
0
      size_t g = offset + i;
4030
0
      size_t xg = g % frame_state->num_groups_x;
4031
0
      size_t yg = g / frame_state->num_groups_x;
4032
0
      size_t num_dc_groups =
4033
0
          frame_state->num_dc_groups_x * frame_state->num_dc_groups_y;
4034
0
      size_t group_id = onegroup ? 0 : (2 + num_dc_groups + g);
4035
0
      size_t xs = std::min<size_t>(frame_state->width - xg * 256, 256);
4036
0
      size_t ys = std::min<size_t>(frame_state->height - yg * 256, 256);
4037
0
      size_t x0 = xg * 256;
4038
0
      size_t y0 = yg * 256;
4039
0
      size_t stride;
4040
0
      JxlChunkedFrameInputSource input = frame_state->input;
4041
0
      const void* buffer = input.get_color_channel_data_at(input.opaque, x0, y0,
4042
0
                                                           xs, ys, &stride);
4043
0
      const unsigned char* rgba =
4044
0
          reinterpret_cast<const unsigned char*>(buffer);
4045
4046
0
      auto& gd = streaming ? local_frame_state.group_data[i]
4047
0
                           : frame_state->group_data[group_id];
4048
0
      if (frame_state->collided) {
4049
0
        WriteACSection(rgba, 0, 0, xs, ys, stride, onegroup, bitdepth,
4050
0
                       frame_state->nb_chans, frame_state->big_endian,
4051
0
                       frame_state->hcode, gd);
4052
0
      } else {
4053
0
        WriteACSectionPalette(rgba, 0, 0, xs, ys, stride, onegroup,
4054
0
                              frame_state->hcode, frame_state->lookup.data(),
4055
0
                              frame_state->nb_chans, gd[0]);
4056
0
      }
4057
0
      frame_state->group_sizes[group_id] = SectionSize(gd);
4058
0
      input.release_buffer(input.opaque, buffer);
4059
0
    };
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
4060
0
    runner(
4061
0
        runner_opaque, &run_one,
4062
0
        +[](void* r, size_t i) {
4063
0
          (*reinterpret_cast<decltype(&run_one)>(r))(i);
4064
0
        },
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
4065
0
        num_groups);
4066
0
#if !FJXL_STANDALONE
4067
0
    if (streaming) {
4068
0
      local_frame_state.nb_chans = frame_state->nb_chans;
4069
0
      local_frame_state.current_bit_writer = 1;
4070
0
      JXL_RETURN_IF_ERROR(
4071
0
          JxlFastLosslessOutputFrame(&local_frame_state, output_processor));
4072
0
    }
4073
0
#endif
4074
0
  }
4075
0
#if !FJXL_STANDALONE
4076
0
  if (streaming) {
4077
0
    size_t end_pos = output_processor->CurrentPosition();
4078
0
    JXL_RETURN_IF_ERROR(output_processor->Seek(start_pos));
4079
0
    frame_state->group_data.resize(1);
4080
0
    bool have_alpha = frame_state->nb_chans == 2 || frame_state->nb_chans == 4;
4081
0
    size_t padding = ComputeDcGlobalPadding(
4082
0
        frame_state->group_sizes, frame_state->ac_group_data_offset,
4083
0
        frame_state->min_dc_global_size, have_alpha, is_last);
4084
4085
0
    for (size_t i = 0; i < padding; ++i) {
4086
0
      frame_state->group_data[0][0].Write(8, 0);
4087
0
    }
4088
0
    frame_state->group_sizes[0] += padding;
4089
0
    JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/0, is_last);
4090
0
    assert(frame_state->ac_group_data_offset ==
4091
0
           JxlFastLosslessOutputSize(frame_state));
4092
0
    JXL_RETURN_IF_ERROR(
4093
0
        JxlFastLosslessOutputHeaders(frame_state, output_processor));
4094
0
    JXL_RETURN_IF_ERROR(output_processor->Seek(end_pos));
4095
0
  } else if (output_processor) {
4096
0
    assert(onegroup);
4097
0
    JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/0, is_last);
4098
0
    if (output_processor) {
4099
0
      JXL_RETURN_IF_ERROR(
4100
0
          JxlFastLosslessOutputFrame(frame_state, output_processor));
4101
0
    }
4102
0
  }
4103
0
  frame_state->process_done = true;
4104
0
#endif
4105
0
  return true;
4106
0
}
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
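The streaming branch of LLProcess writes the AC group data first, at the precomputed ac_group_data_offset, then seeks back to emit the headers and the padded DC global section whose exact sizes are only known afterwards. The same reserve/write/patch pattern on a plain memory buffer, as a sketch (buffer sizes and field format invented for illustration):

#include <cstddef>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
  std::vector<unsigned char> out(64, 0);
  const size_t header_size = 8;  // reserved up front, like the offset above
  size_t pos = header_size;
  const char body[] = "group-data";
  std::memcpy(&out[pos], body, sizeof(body) - 1);
  pos += sizeof(body) - 1;
  // "Seek" back: overwrite the reserved region now that sizes are known.
  std::snprintf(reinterpret_cast<char*>(out.data()), header_size,
                "len=%03zu", pos - header_size);
  std::printf("header='%s' body='%.*s'\n", out.data(),
              int(pos - header_size), &out[header_size]);
}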
4107
4108
JxlFastLosslessFrameState* JxlFastLosslessPrepareImpl(
4109
    JxlChunkedFrameInputSource input, size_t width, size_t height,
4110
    size_t nb_chans, size_t bitdepth, bool big_endian, int effort,
4111
0
    int oneshot) {
4112
0
  assert(bitdepth > 0);
4113
0
  assert(nb_chans <= 4);
4114
0
  assert(nb_chans != 0);
4115
0
  if (bitdepth <= 8) {
4116
0
    return LLPrepare(input, width, height, UpTo8Bits(bitdepth), nb_chans,
4117
0
                     big_endian, effort, oneshot);
4118
0
  }
4119
0
  if (bitdepth <= 13) {
4120
0
    return LLPrepare(input, width, height, From9To13Bits(bitdepth), nb_chans,
4121
0
                     big_endian, effort, oneshot);
4122
0
  }
4123
0
  if (bitdepth == 14) {
4124
0
    return LLPrepare(input, width, height, Exactly14Bits(bitdepth), nb_chans,
4125
0
                     big_endian, effort, oneshot);
4126
0
  }
4127
0
  return LLPrepare(input, width, height, MoreThan14Bits(bitdepth), nb_chans,
4128
0
                   big_endian, effort, oneshot);
4129
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::JxlFastLosslessPrepareImpl(JxlChunkedFrameInputSource, unsigned long, unsigned long, unsigned long, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::JxlFastLosslessPrepareImpl(JxlChunkedFrameInputSource, unsigned long, unsigned long, unsigned long, unsigned long, bool, int, int)
4130
4131
jxl::Status JxlFastLosslessProcessFrameImpl(
4132
    JxlFastLosslessFrameState* frame_state, bool is_last, void* runner_opaque,
4133
    FJxlParallelRunner runner,
4134
0
    JxlEncoderOutputProcessorWrapper* output_processor) {
4135
0
  const size_t bitdepth = frame_state->bitdepth;
4136
0
  if (bitdepth <= 8) {
4137
0
    JXL_RETURN_IF_ERROR(LLProcess(frame_state, is_last, UpTo8Bits(bitdepth),
4138
0
                                  runner_opaque, runner, output_processor));
4139
0
  } else if (bitdepth <= 13) {
4140
0
    JXL_RETURN_IF_ERROR(LLProcess(frame_state, is_last, From9To13Bits(bitdepth),
4141
0
                                  runner_opaque, runner, output_processor));
4142
0
  } else if (bitdepth == 14) {
4143
0
    JXL_RETURN_IF_ERROR(LLProcess(frame_state, is_last, Exactly14Bits(bitdepth),
4144
0
                                  runner_opaque, runner, output_processor));
4145
0
  } else {
4146
0
    JXL_RETURN_IF_ERROR(LLProcess(frame_state, is_last,
4147
0
                                  MoreThan14Bits(bitdepth), runner_opaque,
4148
0
                                  runner, output_processor));
4149
0
  }
4150
0
  return true;
4151
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::JxlFastLosslessProcessFrameImpl(JxlFastLosslessFrameState*, bool, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::JxlFastLosslessProcessFrameImpl(JxlFastLosslessFrameState*, bool, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
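Both dispatchers above convert a runtime bit depth into a compile-time template parameter exactly once, at the API boundary, so the hot loops are stamped out specialised per depth class. The minimal shape of that idiom, with only two of the four depth classes shown and the struct contents invented for the sketch:

#include <cstddef>
#include <cstdio>

struct UpTo8Bits { static constexpr const char* name = "<= 8 bits"; };
struct From9To13Bits { static constexpr const char* name = "9..13 bits"; };

// Everything downstream of Dispatch is specialised per depth class.
template <typename BitDepth>
void Process() { std::printf("running the %s pipeline\n", BitDepth::name); }

void Dispatch(size_t bitdepth) {
  if (bitdepth <= 8) return Process<UpTo8Bits>();
  return Process<From9To13Bits>();
}

int main() {
  Dispatch(8);
  Dispatch(12);
}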
4152
4153
}  // namespace
4154
4155
#endif  // FJXL_SELF_INCLUDE
4156
4157
#ifndef FJXL_SELF_INCLUDE
4158
4159
#define FJXL_SELF_INCLUDE
4160
4161
// If we have NEON enabled, it is the default target.
4162
#if FJXL_ENABLE_NEON
4163
4164
namespace default_implementation {
4165
#define FJXL_NEON
4166
#include "lib/jxl/enc_fast_lossless.cc"
4167
#undef FJXL_NEON
4168
}  // namespace default_implementation
4169
4170
#else                                    // FJXL_ENABLE_NEON
4171
4172
namespace default_implementation {
4173
#include "lib/jxl/enc_fast_lossless.cc"  // NOLINT
4174
}
4175
4176
#if FJXL_ENABLE_AVX2
4177
#ifdef __clang__
4178
#pragma clang attribute push(__attribute__((target("avx,avx2"))), \
4179
                             apply_to = function)
4180
// Causes spurious warnings on clang5.
4181
#pragma clang diagnostic push
4182
#pragma clang diagnostic ignored "-Wmissing-braces"
4183
#elif defined(__GNUC__)
4184
#pragma GCC push_options
4185
// Seems to cause spurious errors on GCC8.
4186
#pragma GCC diagnostic ignored "-Wpsabi"
4187
#pragma GCC target "avx,avx2"
4188
#endif
4189
4190
namespace AVX2 {
4191
#define FJXL_AVX2
4192
#include "lib/jxl/enc_fast_lossless.cc"  // NOLINT
4193
#undef FJXL_AVX2
4194
}  // namespace AVX2
4195
4196
#ifdef __clang__
4197
#pragma clang attribute pop
4198
#pragma clang diagnostic pop
4199
#elif defined(__GNUC__)
4200
#pragma GCC pop_options
4201
#endif
4202
#endif  // FJXL_ENABLE_AVX2

#if FJXL_ENABLE_AVX512
#ifdef __clang__
#pragma clang attribute push(                                                 \
    __attribute__((target("avx512cd,avx512bw,avx512vl,avx512f,avx512vbmi"))), \
    apply_to = function)
#elif defined(__GNUC__)
#pragma GCC push_options
#pragma GCC target "avx512cd,avx512bw,avx512vl,avx512f,avx512vbmi"
#endif

namespace AVX512 {
#define FJXL_AVX512
#include "lib/jxl/enc_fast_lossless.cc"
#undef FJXL_AVX512
}  // namespace AVX512

#ifdef __clang__
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif
#endif  // FJXL_ENABLE_AVX512

#endif  // FJXL_ENABLE_NEON
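This driver section implements multi-versioning by self-inclusion: it defines FJXL_SELF_INCLUDE and re-includes the file once per SIMD target, so the payload guarded by the earlier #endif  // FJXL_SELF_INCLUDE is compiled separately inside the default_implementation, AVX2, and AVX512 namespaces. A miniature of the same trick, assuming a hypothetical file self_include_demo.cc (all names illustrative):

// self_include_demo.cc
#ifdef DEMO_SELF_INCLUDE
// Payload: recompiled once per including namespace.
inline const char* TargetName() {
#ifdef DEMO_AVX2
  return "avx2";
#else
  return "scalar";
#endif
}
#else  // DEMO_SELF_INCLUDE
#define DEMO_SELF_INCLUDE

namespace scalar {
#include "self_include_demo.cc"
}  // namespace scalar

namespace avx2 {
#define DEMO_AVX2
#include "self_include_demo.cc"
#undef DEMO_AVX2
}  // namespace avx2

#endif  // DEMO_SELF_INCLUDE

After compilation, scalar::TargetName() and avx2::TargetName() are distinct functions built from the same source text, just as the per-target JxlFastLosslessPrepareImpl variants are here.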

extern "C" {

#if FJXL_STANDALONE
class FJxlFrameInput {
 public:
  FJxlFrameInput(const unsigned char* rgba, size_t row_stride, size_t nb_chans,
                 size_t bitdepth)
      : rgba_(rgba),
        row_stride_(row_stride),
        bytes_per_pixel_(bitdepth <= 8 ? nb_chans : 2 * nb_chans) {}

  JxlChunkedFrameInputSource GetInputSource() {
    return JxlChunkedFrameInputSource{this, GetDataAt,
                                      [](void*, const void*) {}};
  }

 private:
  static const void* GetDataAt(void* opaque, size_t xpos, size_t ypos,
                               size_t xsize, size_t ysize, size_t* row_offset) {
    FJxlFrameInput* self = static_cast<FJxlFrameInput*>(opaque);
    *row_offset = self->row_stride_;
    return self->rgba_ + ypos * (*row_offset) + xpos * self->bytes_per_pixel_;
  }

  const uint8_t* rgba_;
  size_t row_stride_;
  size_t bytes_per_pixel_;
};
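GetDataAt hands the encoder a raw pointer into the caller's pixel buffer using plain row-major addressing; a self-contained restatement of that arithmetic (SampleAddress is an illustrative name, not part of the API):

#include <cstddef>
#include <cstdint>

// The sample block starting at (xpos, ypos) lives at
// base + ypos * row_stride + xpos * bytes_per_pixel, as in GetDataAt above.
inline const uint8_t* SampleAddress(const uint8_t* base, size_t row_stride,
                                    size_t bytes_per_pixel, size_t xpos,
                                    size_t ypos) {
  return base + ypos * row_stride + xpos * bytes_per_pixel;
}

For 16-bit RGBA input, bytes_per_pixel_ works out to 2 * 4 = 8, matching the bitdepth <= 8 ? nb_chans : 2 * nb_chans initializer above.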

size_t JxlFastLosslessEncode(const unsigned char* rgba, size_t width,
                             size_t row_stride, size_t height, size_t nb_chans,
                             size_t bitdepth, bool big_endian, int effort,
                             unsigned char** output, void* runner_opaque,
                             FJxlParallelRunner runner) {
  FJxlFrameInput input(rgba, row_stride, nb_chans, bitdepth);
  auto* frame_state = JxlFastLosslessPrepareFrame(
      input.GetInputSource(), width, height, nb_chans, bitdepth, big_endian,
      effort, /*oneshot=*/true);
  if (!JxlFastLosslessProcessFrame(frame_state, /*is_last=*/true, runner_opaque,
                                   runner, nullptr)) {
    return 0;
  }
  JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/1,
                               /*is_last=*/1);
  size_t output_size = JxlFastLosslessMaxRequiredOutput(frame_state);
  *output = (unsigned char*)malloc(output_size);
  size_t written = 0;
  size_t total = 0;
  while ((written = JxlFastLosslessWriteOutput(frame_state, *output + total,
                                               output_size - total)) != 0) {
    total += written;
  }
  JxlFastLosslessFreeFrameState(frame_state);
  return total;
}
#endif  // FJXL_STANDALONE
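A minimal caller sketch for the standalone entry point above, assuming an FJXL_STANDALONE build; the pixel data is synthetic, and the effort and runner choices are arbitrary:

#include <cstdio>
#include <cstdlib>
#include <vector>

int main() {
  const size_t width = 64, height = 64, nb_chans = 3, bitdepth = 8;
  std::vector<unsigned char> rgba(width * height * nb_chans, 128);  // flat gray
  unsigned char* output = nullptr;
  size_t size = JxlFastLosslessEncode(
      rgba.data(), width, /*row_stride=*/width * nb_chans, height, nb_chans,
      bitdepth, /*big_endian=*/false, /*effort=*/2, &output,
      /*runner_opaque=*/nullptr, /*runner=*/nullptr);  // nullptr => serial
  if (size == 0) return 1;               // encoding failed
  std::fwrite(output, 1, size, stdout);  // raw JPEG XL stream
  free(output);                          // buffer was malloc'd by the encoder
  return 0;
}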

JxlFastLosslessFrameState* JxlFastLosslessPrepareFrame(
    JxlChunkedFrameInputSource input, size_t width, size_t height,
    size_t nb_chans, size_t bitdepth, bool big_endian, int effort,
    int oneshot) {
#if FJXL_ENABLE_AVX512
  if (HasCpuFeature(CpuFeature::kAVX512CD) &&
      HasCpuFeature(CpuFeature::kVBMI) &&
      HasCpuFeature(CpuFeature::kAVX512BW) &&
      HasCpuFeature(CpuFeature::kAVX512F) &&
      HasCpuFeature(CpuFeature::kAVX512VL)) {
    return AVX512::JxlFastLosslessPrepareImpl(
        input, width, height, nb_chans, bitdepth, big_endian, effort, oneshot);
  }
#endif
#if FJXL_ENABLE_AVX2
  if (HasCpuFeature(CpuFeature::kAVX2)) {
    return AVX2::JxlFastLosslessPrepareImpl(
        input, width, height, nb_chans, bitdepth, big_endian, effort, oneshot);
  }
#endif

  return default_implementation::JxlFastLosslessPrepareImpl(
      input, width, height, nb_chans, bitdepth, big_endian, effort, oneshot);
}
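HasCpuFeature is defined earlier in this file (the CpuFeature enum appears near the top). For orientation, a minimal sketch of the usual CPUID-based probe on x86, using GCC/Clang's <cpuid.h>; this mirrors the idea but is not the file's implementation, which also caches the detected feature bits:

#include <cpuid.h>

// AVX2 is reported in CPUID leaf 7, subleaf 0, EBX bit 5 (illustrative probe;
// a production check should also verify OS state-saving support via XGETBV).
inline bool SketchHasAvx2() {
  unsigned eax, ebx, ecx, edx;
  if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) return false;
  return (ebx & (1u << 5)) != 0;
}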

bool JxlFastLosslessProcessFrame(
    JxlFastLosslessFrameState* frame_state, bool is_last, void* runner_opaque,
    FJxlParallelRunner runner,
    JxlEncoderOutputProcessorWrapper* output_processor) {
  auto trivial_runner =
      +[](void*, void* opaque, void fun(void*, size_t), size_t count) {
        for (size_t i = 0; i < count; i++) {
          fun(opaque, i);
        }
      };

  if (runner == nullptr) {
    runner = trivial_runner;
  }

#if FJXL_ENABLE_AVX512
  if (HasCpuFeature(CpuFeature::kAVX512CD) &&
      HasCpuFeature(CpuFeature::kVBMI) &&
      HasCpuFeature(CpuFeature::kAVX512BW) &&
      HasCpuFeature(CpuFeature::kAVX512F) &&
      HasCpuFeature(CpuFeature::kAVX512VL)) {
    JXL_RETURN_IF_ERROR(AVX512::JxlFastLosslessProcessFrameImpl(
        frame_state, is_last, runner_opaque, runner, output_processor));
    return true;
  }
#endif
#if FJXL_ENABLE_AVX2
  if (HasCpuFeature(CpuFeature::kAVX2)) {
    JXL_RETURN_IF_ERROR(AVX2::JxlFastLosslessProcessFrameImpl(
        frame_state, is_last, runner_opaque, runner, output_processor));
    return true;
  }
#endif

  JXL_RETURN_IF_ERROR(default_implementation::JxlFastLosslessProcessFrameImpl(
      frame_state, is_last, runner_opaque, runner, output_processor));
  return true;
}

}  // extern "C"
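FJxlParallelRunner has the same shape as trivial_runner above: it receives an opaque runner state, the work's opaque argument, a per-index callback, and an item count. A hedged sketch of a caller-supplied parallel runner built on std::thread (illustrative only, not the library's runner; a production runner would reuse a thread pool):

#include <algorithm>
#include <cstddef>
#include <thread>
#include <vector>

void ThreadRunner(void* /*runner_opaque*/, void* opaque,
                  void fun(void*, size_t), size_t count) {
  size_t hw = std::thread::hardware_concurrency();
  if (hw == 0) hw = 1;  // hardware_concurrency may be unknown
  const size_t n_threads = std::min(count, hw);
  std::vector<std::thread> threads;
  for (size_t t = 0; t < n_threads; t++) {
    // Strided partition: thread t handles indices t, t + n_threads, ...
    threads.emplace_back([=] {
      for (size_t i = t; i < count; i += n_threads) fun(opaque, i);
    });
  }
  for (auto& th : threads) th.join();
}

Passing ThreadRunner as the runner argument (runner_opaque is unused here) parallelizes the per-group work that trivial_runner would otherwise run serially.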

#if !FJXL_STANDALONE
bool JxlFastLosslessOutputFrame(
    JxlFastLosslessFrameState* frame_state,
    JxlEncoderOutputProcessorWrapper* output_processor) {
  size_t fl_size = JxlFastLosslessOutputSize(frame_state);
  size_t written = 0;
  while (written < fl_size) {
    JXL_ASSIGN_OR_RETURN(auto buffer,
                         output_processor->GetBuffer(32, fl_size - written));
    size_t n =
        JxlFastLosslessWriteOutput(frame_state, buffer.data(), buffer.size());
    if (n == 0) break;
    JXL_RETURN_IF_ERROR(buffer.advance(n));
    written += n;
  }
  return true;
}
#endif  // !FJXL_STANDALONE

#endif  // FJXL_SELF_INCLUDE
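
The two write paths above share one contract: call JxlFastLosslessWriteOutput repeatedly until it returns 0, and the frame state tracks its own position in the encoded stream between calls. A streaming sketch that drains a frame into a small reusable buffer instead of allocating JxlFastLosslessMaxRequiredOutput up front (WriteFrameToFile is an illustrative name; fwrite error handling is elided, and lib/jxl/enc_fast_lossless.h is assumed to be included):

#include <cstdio>

void WriteFrameToFile(JxlFastLosslessFrameState* frame_state, std::FILE* f) {
  unsigned char buf[4096];
  size_t n;
  // Each call resumes where the previous one stopped; 0 means done.
  while ((n = JxlFastLosslessWriteOutput(frame_state, buf, sizeof buf)) != 0) {
    std::fwrite(buf, 1, n, f);
  }
}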