Coverage Report

Created: 2025-11-14 07:32

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libjxl/lib/jxl/enc_fast_lossless.cc
Line
Count
Source
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/base/status.h"
7
#ifndef FJXL_SELF_INCLUDE
8
9
#include <assert.h>
10
11
#include <algorithm>
12
#include <array>
13
#include <cstdint>
14
#include <cstdlib>
15
#include <cstring>
16
#include <limits>
17
#include <memory>
18
#include <vector>
19
20
#include "lib/jxl/enc_fast_lossless.h"
21
22
#if !FJXL_STANDALONE
23
#include "lib/jxl/encode_internal.h"
24
#endif  // FJXL_STANDALONE
25
26
#if defined(__x86_64__) || defined(_M_X64)
27
#define FJXL_ARCH_IS_X86_64 1
28
#else
29
#define FJXL_ARCH_IS_X86_64 0
30
#endif
31
32
#if defined(__i386__) || defined(_M_IX86) || FJXL_ARCH_IS_X86_64
33
#define FJXL_ARCH_IS_X86 1
34
#else
35
#define FJXL_ARCH_IS_X86 0
36
#endif
37
38
#if FJXL_ARCH_IS_X86
39
#if defined(_MSC_VER)
40
#include <intrin.h>
41
#else  // _MSC_VER
42
#include <cpuid.h>
43
#endif  // _MSC_VER
44
#endif  // FJXL_ARCH_IS_X86
45
46
// Enable NEON and AVX2/AVX512 if not asked to do otherwise and the compilers
47
// support it.
48
#if defined(__aarch64__) || defined(_M_ARM64)  // ARCH
49
#include <arm_neon.h>
50
51
#if !defined(FJXL_ENABLE_NEON)
52
#define FJXL_ENABLE_NEON 1
53
#endif  // !defined(FJXL_ENABLE_NEON)
54
55
#elif FJXL_ARCH_IS_X86_64 && !defined(_MSC_VER)  // ARCH
56
#include <immintrin.h>
57
58
// manually add _mm512_cvtsi512_si32 definition if missing
59
// (e.g. with Xcode on macOS Mojave)
60
// copied from gcc 11.1.0 include/avx512fintrin.h line 14367-14373
61
#if defined(__clang__) &&                                           \
62
    ((!defined(__apple_build_version__) && __clang_major__ < 10) || \
63
     (defined(__apple_build_version__) && __apple_build_version__ < 12000032))
64
inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
65
_mm512_cvtsi512_si32(__m512i __A) {
66
  __v16si __B = (__v16si)__A;
67
  return __B[0];
68
}
69
#endif
70
71
#if !defined(FJXL_ENABLE_AVX2)
72
#define FJXL_ENABLE_AVX2 1
73
#endif  // !defined(FJXL_ENABLE_AVX2)
74
75
#if !defined(FJXL_ENABLE_AVX512)
76
// On clang-7 or earlier, and gcc-10 or earlier, AVX512 seems broken.
77
#if (defined(__clang__) &&                                             \
78
         (!defined(__apple_build_version__) && __clang_major__ > 7) || \
79
     (defined(__apple_build_version__) &&                              \
80
      __apple_build_version__ > 10010046)) ||                          \
81
    (defined(__GNUC__) && __GNUC__ > 10)
82
#define FJXL_ENABLE_AVX512 1
83
#endif
84
#endif  // !defined(FJXL_ENABLE_AVX512)
85
86
#endif  // ARCH
87
88
#ifndef FJXL_ENABLE_NEON
89
#define FJXL_ENABLE_NEON 0
90
#endif
91
92
#ifndef FJXL_ENABLE_AVX2
93
#define FJXL_ENABLE_AVX2 0
94
#endif
95
96
#ifndef FJXL_ENABLE_AVX512
97
#define FJXL_ENABLE_AVX512 0
98
#endif
99
100
namespace {
101
102
enum class CpuFeature : uint32_t {
103
  kAVX2 = 0,
104
105
  kAVX512F,
106
  kAVX512VL,
107
  kAVX512CD,
108
  kAVX512BW,
109
110
  kVBMI,
111
  kVBMI2
112
};
113
114
0
constexpr uint32_t CpuFeatureBit(CpuFeature feature) {
115
0
  return 1u << static_cast<uint32_t>(feature);
116
0
}
117
118
#if FJXL_ARCH_IS_X86
119
#if defined(_MSC_VER)
120
void Cpuid(const uint32_t level, const uint32_t count,
121
           std::array<uint32_t, 4>& abcd) {
122
  int regs[4];
123
  __cpuidex(regs, level, count);
124
  for (int i = 0; i < 4; ++i) {
125
    abcd[i] = regs[i];
126
  }
127
}
128
uint32_t ReadXCR0() { return static_cast<uint32_t>(_xgetbv(0)); }
129
#else   // _MSC_VER
130
void Cpuid(const uint32_t level, const uint32_t count,
131
0
           std::array<uint32_t, 4>& abcd) {
132
0
  uint32_t a;
133
0
  uint32_t b;
134
0
  uint32_t c;
135
0
  uint32_t d;
136
0
  __cpuid_count(level, count, a, b, c, d);
137
0
  abcd[0] = a;
138
0
  abcd[1] = b;
139
0
  abcd[2] = c;
140
0
  abcd[3] = d;
141
0
}
142
0
uint32_t ReadXCR0() {
143
0
  uint32_t xcr0;
144
0
  uint32_t xcr0_high;
145
0
  const uint32_t index = 0;
146
0
  asm volatile(".byte 0x0F, 0x01, 0xD0"
147
0
               : "=a"(xcr0), "=d"(xcr0_high)
148
0
               : "c"(index));
149
0
  return xcr0;
150
0
}
151
#endif  // _MSC_VER
152
153
0
uint32_t DetectCpuFeatures() {
154
0
  uint32_t flags = 0;  // return value
155
0
  std::array<uint32_t, 4> abcd;
156
0
  Cpuid(0, 0, abcd);
157
0
  const uint32_t max_level = abcd[0];
158
159
0
  const auto check_bit = [](uint32_t v, uint32_t idx) -> bool {
160
0
    return (v & (1U << idx)) != 0;
161
0
  };
162
163
  // Extended features
164
0
  if (max_level >= 7) {
165
0
    Cpuid(7, 0, abcd);
166
0
    flags |= check_bit(abcd[1], 5) ? CpuFeatureBit(CpuFeature::kAVX2) : 0;
167
168
0
    flags |= check_bit(abcd[1], 16) ? CpuFeatureBit(CpuFeature::kAVX512F) : 0;
169
0
    flags |= check_bit(abcd[1], 28) ? CpuFeatureBit(CpuFeature::kAVX512CD) : 0;
170
0
    flags |= check_bit(abcd[1], 30) ? CpuFeatureBit(CpuFeature::kAVX512BW) : 0;
171
0
    flags |= check_bit(abcd[1], 31) ? CpuFeatureBit(CpuFeature::kAVX512VL) : 0;
172
173
0
    flags |= check_bit(abcd[2], 1) ? CpuFeatureBit(CpuFeature::kVBMI) : 0;
174
0
    flags |= check_bit(abcd[2], 6) ? CpuFeatureBit(CpuFeature::kVBMI2) : 0;
175
0
  }
176
177
0
  Cpuid(1, 0, abcd);
178
0
  const bool os_has_xsave = check_bit(abcd[2], 27);
179
0
  if (os_has_xsave) {
180
0
    const uint32_t xcr0 = ReadXCR0();
181
0
    if (!check_bit(xcr0, 1) || !check_bit(xcr0, 2)) {
182
0
      flags = 0;
183
0
    } else if (!check_bit(xcr0, 5) || !check_bit(xcr0, 6) ||
184
0
               !check_bit(xcr0, 7)) {
185
      // No AVX-512; disable everything but AVX2 if present
186
0
      flags &= CpuFeatureBit(CpuFeature::kAVX2);
187
0
    }
188
0
  }
189
190
0
  return flags;
191
0
}
192
#else   // FJXL_ARCH_IS_X86
193
uint32_t DetectCpuFeatures() { return 0; }
194
#endif  // FJXL_ARCH_IS_X86
195
196
#if defined(_MSC_VER)
197
#define FJXL_UNUSED
198
#else
199
#define FJXL_UNUSED __attribute__((unused))
200
#endif
201
202
0
FJXL_UNUSED bool HasCpuFeature(CpuFeature feature) {
203
0
  static uint32_t cpu_features = DetectCpuFeatures();
204
0
  return (cpu_features & CpuFeatureBit(feature)) != 0;
205
0
}
206
207
#if defined(_MSC_VER) && !defined(__clang__)
208
#define FJXL_INLINE __forceinline
209
FJXL_INLINE uint32_t FloorLog2(uint32_t v) {
210
  unsigned long index;
211
  _BitScanReverse(&index, v);
212
  return index;
213
}
214
FJXL_INLINE uint32_t CtzNonZero(uint64_t v) {
215
  unsigned long index;
216
  _BitScanForward(&index, v);
217
  return index;
218
}
219
#else
220
#define FJXL_INLINE inline __attribute__((always_inline))
221
0
FJXL_INLINE uint32_t FloorLog2(uint32_t v) {
222
0
  return v ? 31 - __builtin_clz(v) : 0;
223
0
}
224
0
FJXL_UNUSED FJXL_INLINE uint32_t CtzNonZero(uint64_t v) {
225
0
  return __builtin_ctzll(v);
226
0
}
227
#endif
228
229
// Compiles to a memcpy on little-endian systems.
230
0
FJXL_INLINE void StoreLE64(uint8_t* tgt, uint64_t data) {
231
#if (!defined(__BYTE_ORDER__) || (__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__))
232
  for (int i = 0; i < 8; i++) {
233
    tgt[i] = (data >> (i * 8)) & 0xFF;
234
  }
235
#else
236
0
  memcpy(tgt, &data, 8);
237
0
#endif
238
0
}
239
240
FJXL_INLINE size_t AddBits(uint32_t count, uint64_t bits, uint8_t* data_buf,
241
0
                           size_t& bits_in_buffer, uint64_t& bit_buffer) {
242
0
  bit_buffer |= bits << bits_in_buffer;
243
0
  bits_in_buffer += count;
244
0
  StoreLE64(data_buf, bit_buffer);
245
0
  size_t bytes_in_buffer = bits_in_buffer / 8;
246
0
  bits_in_buffer -= bytes_in_buffer * 8;
247
0
  bit_buffer >>= bytes_in_buffer * 8;
248
0
  return bytes_in_buffer;
249
0
}
250
251
struct BitWriter {
252
0
  void Allocate(size_t maximum_bit_size) {
253
0
    assert(data == nullptr);
254
    // Leave some padding.
255
0
    data.reset(static_cast<uint8_t*>(malloc(maximum_bit_size / 8 + 64)));
256
0
  }
257
258
0
  void Write(uint32_t count, uint64_t bits) {
259
0
    bytes_written += AddBits(count, bits, data.get() + bytes_written,
260
0
                             bits_in_buffer, buffer);
261
0
  }
262
263
0
  void ZeroPadToByte() {
264
0
    if (bits_in_buffer != 0) {
265
0
      Write(8 - bits_in_buffer, 0);
266
0
    }
267
0
  }
268
269
  FJXL_INLINE void WriteMultiple(const uint64_t* nbits, const uint64_t* bits,
270
0
                                 size_t n) {
271
    // Necessary because Write() is only guaranteed to work with <=56 bits.
272
    // Trying to SIMD-fy this code results in lower speed (and definitely less
273
    // clarity).
274
0
    {
275
0
      for (size_t i = 0; i < n; i++) {
276
0
        this->buffer |= bits[i] << this->bits_in_buffer;
277
0
        memcpy(this->data.get() + this->bytes_written, &this->buffer, 8);
278
0
        uint64_t shift = 64 - this->bits_in_buffer;
279
0
        this->bits_in_buffer += nbits[i];
280
        // This `if` seems to be faster than using ternaries.
281
0
        if (this->bits_in_buffer >= 64) {
282
0
          uint64_t next_buffer = shift >= 64 ? 0 : bits[i] >> shift;
283
0
          this->buffer = next_buffer;
284
0
          this->bits_in_buffer -= 64;
285
0
          this->bytes_written += 8;
286
0
        }
287
0
      }
288
0
      memcpy(this->data.get() + this->bytes_written, &this->buffer, 8);
289
0
      size_t bytes_in_buffer = this->bits_in_buffer / 8;
290
0
      this->bits_in_buffer -= bytes_in_buffer * 8;
291
0
      this->buffer >>= bytes_in_buffer * 8;
292
0
      this->bytes_written += bytes_in_buffer;
293
0
    }
294
0
  }
295
296
  std::unique_ptr<uint8_t[], void (*)(void*)> data = {nullptr, free};
297
  size_t bytes_written = 0;
298
  size_t bits_in_buffer = 0;
299
  uint64_t buffer = 0;
300
};
301
302
0
size_t SectionSize(const std::array<BitWriter, 4>& group_data) {
303
0
  size_t sz = 0;
304
0
  for (size_t j = 0; j < 4; j++) {
305
0
    const auto& writer = group_data[j];
306
0
    sz += writer.bytes_written * 8 + writer.bits_in_buffer;
307
0
  }
308
0
  sz = (sz + 7) / 8;
309
0
  return sz;
310
0
}
311
312
constexpr size_t kMaxFrameHeaderSize = 5;
313
314
constexpr size_t kGroupSizeOffset[4] = {
315
    static_cast<size_t>(0),
316
    static_cast<size_t>(1024),
317
    static_cast<size_t>(17408),
318
    static_cast<size_t>(4211712),
319
};
320
constexpr size_t kTOCBits[4] = {12, 16, 24, 32};
321
322
0
size_t TOCBucket(size_t group_size) {
323
0
  size_t bucket = 0;
324
0
  while (bucket < 3 && group_size >= kGroupSizeOffset[bucket + 1]) ++bucket;
325
0
  return bucket;
326
0
}
327
328
#if !FJXL_STANDALONE
329
0
size_t TOCSize(const std::vector<size_t>& group_sizes) {
330
0
  size_t toc_bits = 0;
331
0
  for (size_t group_size : group_sizes) {
332
0
    toc_bits += kTOCBits[TOCBucket(group_size)];
333
0
  }
334
0
  return (toc_bits + 7) / 8;
335
0
}
336
337
0
size_t FrameHeaderSize(bool have_alpha, bool is_last) {
338
0
  size_t nbits = 28 + (have_alpha ? 4 : 0) + (is_last ? 0 : 2);
339
0
  return (nbits + 7) / 8;
340
0
}
341
#endif
342
343
void ComputeAcGroupDataOffset(size_t dc_global_size, size_t num_dc_groups,
344
                              size_t num_ac_groups, size_t& min_dc_global_size,
345
0
                              size_t& ac_group_offset) {
346
  // Max AC group size is 768 kB, so max AC group TOC bits is 24.
347
0
  size_t ac_toc_max_bits = num_ac_groups * 24;
348
0
  size_t ac_toc_min_bits = num_ac_groups * 12;
349
0
  size_t max_padding = 1 + (ac_toc_max_bits - ac_toc_min_bits + 7) / 8;
350
0
  min_dc_global_size = dc_global_size;
351
0
  size_t dc_global_bucket = TOCBucket(min_dc_global_size);
352
0
  while (TOCBucket(min_dc_global_size + max_padding) > dc_global_bucket) {
353
0
    dc_global_bucket = TOCBucket(min_dc_global_size + max_padding);
354
0
    min_dc_global_size = kGroupSizeOffset[dc_global_bucket];
355
0
  }
356
0
  assert(TOCBucket(min_dc_global_size) == dc_global_bucket);
357
0
  assert(TOCBucket(min_dc_global_size + max_padding) == dc_global_bucket);
358
0
  size_t max_toc_bits =
359
0
      kTOCBits[dc_global_bucket] + 12 * (1 + num_dc_groups) + ac_toc_max_bits;
360
0
  size_t max_toc_size = (max_toc_bits + 7) / 8;
361
0
  ac_group_offset = kMaxFrameHeaderSize + max_toc_size + min_dc_global_size;
362
0
}
363
364
#if !FJXL_STANDALONE
365
size_t ComputeDcGlobalPadding(const std::vector<size_t>& group_sizes,
366
                              size_t ac_group_data_offset,
367
                              size_t min_dc_global_size, bool have_alpha,
368
0
                              bool is_last) {
369
0
  std::vector<size_t> new_group_sizes = group_sizes;
370
0
  new_group_sizes[0] = min_dc_global_size;
371
0
  size_t toc_size = TOCSize(new_group_sizes);
372
0
  size_t actual_offset =
373
0
      FrameHeaderSize(have_alpha, is_last) + toc_size + group_sizes[0];
374
0
  return ac_group_data_offset - actual_offset;
375
0
}
376
#endif
377
378
constexpr size_t kNumRawSymbols = 19;
379
constexpr size_t kNumLZ77 = 33;
380
constexpr size_t kLZ77CacheSize = 32;
381
382
constexpr size_t kLZ77Offset = 224;
383
constexpr size_t kLZ77MinLength = 7;
384
385
void EncodeHybridUintLZ77(uint32_t value, uint32_t* token, uint32_t* nbits,
386
0
                          uint32_t* bits) {
387
  // 400 config
388
0
  uint32_t n = FloorLog2(value);
389
0
  *token = value < 16 ? value : 16 + n - 4;
390
0
  *nbits = value < 16 ? 0 : n;
391
0
  *bits = value < 16 ? 0 : value - (1 << *nbits);
392
0
}
393
394
struct PrefixCode {
395
  uint8_t raw_nbits[kNumRawSymbols] = {};
396
  uint8_t raw_bits[kNumRawSymbols] = {};
397
398
  uint8_t lz77_nbits[kNumLZ77] = {};
399
  uint16_t lz77_bits[kNumLZ77] = {};
400
401
  uint64_t lz77_cache_bits[kLZ77CacheSize] = {};
402
  uint8_t lz77_cache_nbits[kLZ77CacheSize] = {};
403
404
  size_t numraw;
405
406
0
  static uint16_t BitReverse(size_t nbits, uint16_t bits) {
407
0
    constexpr uint16_t kNibbleLookup[16] = {
408
0
        0b0000, 0b1000, 0b0100, 0b1100, 0b0010, 0b1010, 0b0110, 0b1110,
409
0
        0b0001, 0b1001, 0b0101, 0b1101, 0b0011, 0b1011, 0b0111, 0b1111,
410
0
    };
411
0
    uint16_t rev16 = (kNibbleLookup[bits & 0xF] << 12) |
412
0
                     (kNibbleLookup[(bits >> 4) & 0xF] << 8) |
413
0
                     (kNibbleLookup[(bits >> 8) & 0xF] << 4) |
414
0
                     (kNibbleLookup[bits >> 12]);
415
0
    return rev16 >> (16 - nbits);
416
0
  }
417
418
  // Create the prefix codes given the code lengths.
419
  // Supports the code lengths being split into two halves.
420
  static void ComputeCanonicalCode(const uint8_t* first_chunk_nbits,
421
                                   uint8_t* first_chunk_bits,
422
                                   size_t first_chunk_size,
423
                                   const uint8_t* second_chunk_nbits,
424
                                   uint16_t* second_chunk_bits,
425
0
                                   size_t second_chunk_size) {
426
0
    constexpr size_t kMaxCodeLength = 15;
427
0
    uint8_t code_length_counts[kMaxCodeLength + 1] = {};
428
0
    for (size_t i = 0; i < first_chunk_size; i++) {
429
0
      code_length_counts[first_chunk_nbits[i]]++;
430
0
      assert(first_chunk_nbits[i] <= kMaxCodeLength);
431
0
      assert(first_chunk_nbits[i] <= 8);
432
0
      assert(first_chunk_nbits[i] > 0);
433
0
    }
434
0
    for (size_t i = 0; i < second_chunk_size; i++) {
435
0
      code_length_counts[second_chunk_nbits[i]]++;
436
0
      assert(second_chunk_nbits[i] <= kMaxCodeLength);
437
0
    }
438
439
0
    uint16_t next_code[kMaxCodeLength + 1] = {};
440
441
0
    uint16_t code = 0;
442
0
    for (size_t i = 1; i < kMaxCodeLength + 1; i++) {
443
0
      code = (code + code_length_counts[i - 1]) << 1;
444
0
      next_code[i] = code;
445
0
    }
446
447
0
    for (size_t i = 0; i < first_chunk_size; i++) {
448
0
      first_chunk_bits[i] =
449
0
          BitReverse(first_chunk_nbits[i], next_code[first_chunk_nbits[i]]++);
450
0
    }
451
0
    for (size_t i = 0; i < second_chunk_size; i++) {
452
0
      second_chunk_bits[i] =
453
0
          BitReverse(second_chunk_nbits[i], next_code[second_chunk_nbits[i]]++);
454
0
    }
455
0
  }
456
457
  template <typename T>
458
  static void ComputeCodeLengthsNonZeroImpl(const uint64_t* freqs, size_t n,
459
                                            size_t precision, T infty,
460
                                            const uint8_t* min_limit,
461
                                            const uint8_t* max_limit,
462
0
                                            uint8_t* nbits) {
463
0
    assert(precision < 15);
464
0
    assert(n <= kMaxNumSymbols);
465
0
    std::vector<T> dynp(((1U << precision) + 1) * (n + 1), infty);
466
0
    auto d = [&](size_t sym, size_t off) -> T& {
467
0
      return dynp[sym * ((1 << precision) + 1) + off];
468
0
    };
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::ComputeCodeLengthsNonZeroImpl<unsigned int>(unsigned long const*, unsigned long, unsigned long, unsigned int, unsigned char const*, unsigned char const*, unsigned char*)::{lambda(unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::ComputeCodeLengthsNonZeroImpl<unsigned long>(unsigned long const*, unsigned long, unsigned long, unsigned long, unsigned char const*, unsigned char const*, unsigned char*)::{lambda(unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long) const
469
0
    d(0, 0) = 0;
470
0
    for (size_t sym = 0; sym < n; sym++) {
471
0
      for (T bits = min_limit[sym]; bits <= max_limit[sym]; bits++) {
472
0
        size_t off_delta = 1U << (precision - bits);
473
0
        for (size_t off = 0; off + off_delta <= (1U << precision); off++) {
474
0
          d(sym + 1, off + off_delta) =
475
0
              std::min(d(sym, off) + static_cast<T>(freqs[sym]) * bits,
476
0
                       d(sym + 1, off + off_delta));
477
0
        }
478
0
      }
479
0
    }
480
481
0
    size_t sym = n;
482
0
    size_t off = 1U << precision;
483
484
0
    assert(d(sym, off) != infty);
485
486
0
    while (sym-- > 0) {
487
0
      assert(off > 0);
488
0
      for (size_t bits = min_limit[sym]; bits <= max_limit[sym]; bits++) {
489
0
        size_t off_delta = 1U << (precision - bits);
490
0
        if (off_delta <= off &&
491
0
            d(sym + 1, off) == d(sym, off - off_delta) + freqs[sym] * bits) {
492
0
          off -= off_delta;
493
0
          nbits[sym] = bits;
494
0
          break;
495
0
        }
496
0
      }
497
0
    }
498
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:void (anonymous namespace)::PrefixCode::ComputeCodeLengthsNonZeroImpl<unsigned int>(unsigned long const*, unsigned long, unsigned long, unsigned int, unsigned char const*, unsigned char const*, unsigned char*)
Unexecuted instantiation: enc_fast_lossless.cc:void (anonymous namespace)::PrefixCode::ComputeCodeLengthsNonZeroImpl<unsigned long>(unsigned long const*, unsigned long, unsigned long, unsigned long, unsigned char const*, unsigned char const*, unsigned char*)
499
500
  // Computes nbits[i] for i <= n, subject to min_limit[i] <= nbits[i] <=
501
  // max_limit[i] and sum 2**-nbits[i] == 1, so to minimize sum(nbits[i] *
502
  // freqs[i]).
503
  static void ComputeCodeLengthsNonZero(const uint64_t* freqs, size_t n,
504
                                        uint8_t* min_limit, uint8_t* max_limit,
505
0
                                        uint8_t* nbits) {
506
0
    size_t precision = 0;
507
0
    size_t shortest_length = 255;
508
0
    uint64_t freqsum = 0;
509
0
    for (size_t i = 0; i < n; i++) {
510
0
      assert(freqs[i] != 0);
511
0
      freqsum += freqs[i];
512
0
      if (min_limit[i] < 1) min_limit[i] = 1;
513
0
      assert(min_limit[i] <= max_limit[i]);
514
0
      precision = std::max<size_t>(max_limit[i], precision);
515
0
      shortest_length = std::min<size_t>(min_limit[i], shortest_length);
516
0
    }
517
    // If all the minimum limits are greater than 1, shift precision so that we
518
    // behave as if the shortest was 1.
519
0
    precision -= shortest_length - 1;
520
0
    uint64_t infty = freqsum * precision;
521
0
    if (infty < std::numeric_limits<uint32_t>::max() / 2) {
522
0
      ComputeCodeLengthsNonZeroImpl(freqs, n, precision,
523
0
                                    static_cast<uint32_t>(infty), min_limit,
524
0
                                    max_limit, nbits);
525
0
    } else {
526
0
      ComputeCodeLengthsNonZeroImpl(freqs, n, precision, infty, min_limit,
527
0
                                    max_limit, nbits);
528
0
    }
529
0
  }
530
531
  static constexpr size_t kMaxNumSymbols =
532
      kNumRawSymbols + 1 < kNumLZ77 ? kNumLZ77 : kNumRawSymbols + 1;
533
  static void ComputeCodeLengths(const uint64_t* freqs, size_t n,
534
                                 const uint8_t* min_limit_in,
535
0
                                 const uint8_t* max_limit_in, uint8_t* nbits) {
536
0
    assert(n <= kMaxNumSymbols);
537
0
    uint64_t compact_freqs[kMaxNumSymbols];
538
0
    uint8_t min_limit[kMaxNumSymbols];
539
0
    uint8_t max_limit[kMaxNumSymbols];
540
0
    size_t ni = 0;
541
0
    for (size_t i = 0; i < n; i++) {
542
0
      if (freqs[i]) {
543
0
        compact_freqs[ni] = freqs[i];
544
0
        min_limit[ni] = min_limit_in[i];
545
0
        max_limit[ni] = max_limit_in[i];
546
0
        ni++;
547
0
      }
548
0
    }
549
0
    for (size_t i = ni; i < kMaxNumSymbols; ++i) {
550
0
      compact_freqs[i] = 0;
551
0
      min_limit[i] = 0;
552
0
      max_limit[i] = 0;
553
0
    }
554
0
    uint8_t num_bits[kMaxNumSymbols] = {};
555
0
    ComputeCodeLengthsNonZero(compact_freqs, ni, min_limit, max_limit,
556
0
                              num_bits);
557
0
    ni = 0;
558
0
    for (size_t i = 0; i < n; i++) {
559
0
      nbits[i] = 0;
560
0
      if (freqs[i]) {
561
0
        nbits[i] = num_bits[ni++];
562
0
      }
563
0
    }
564
0
  }
565
566
  // Invalid code, used to construct arrays.
567
0
  PrefixCode() = default;
568
569
  template <typename BitDepth>
570
  PrefixCode(BitDepth /* bitdepth */, uint64_t* raw_counts,
571
0
             uint64_t* lz77_counts) {
572
    // "merge" together all the lz77 counts in a single symbol for the level 1
573
    // table (containing just the raw symbols, up to length 7).
574
0
    uint64_t level1_counts[kNumRawSymbols + 1];
575
0
    memcpy(level1_counts, raw_counts, kNumRawSymbols * sizeof(uint64_t));
576
0
    numraw = kNumRawSymbols;
577
0
    while (numraw > 0 && level1_counts[numraw - 1] == 0) numraw--;
578
579
0
    level1_counts[numraw] = 0;
580
0
    for (size_t i = 0; i < kNumLZ77; i++) {
581
0
      level1_counts[numraw] += lz77_counts[i];
582
0
    }
583
0
    uint8_t level1_nbits[kNumRawSymbols + 1] = {};
584
0
    ComputeCodeLengths(level1_counts, numraw + 1, BitDepth::kMinRawLength,
585
0
                       BitDepth::kMaxRawLength, level1_nbits);
586
587
0
    uint8_t level2_nbits[kNumLZ77] = {};
588
0
    uint8_t min_lengths[kNumLZ77] = {};
589
0
    uint8_t l = 15 - level1_nbits[numraw];
590
0
    uint8_t max_lengths[kNumLZ77];
591
0
    for (uint8_t& max_length : max_lengths) {
592
0
      max_length = l;
593
0
    }
594
0
    size_t num_lz77 = kNumLZ77;
595
0
    while (num_lz77 > 0 && lz77_counts[num_lz77 - 1] == 0) num_lz77--;
596
0
    ComputeCodeLengths(lz77_counts, num_lz77, min_lengths, max_lengths,
597
0
                       level2_nbits);
598
0
    for (size_t i = 0; i < numraw; i++) {
599
0
      raw_nbits[i] = level1_nbits[i];
600
0
    }
601
0
    for (size_t i = 0; i < num_lz77; i++) {
602
0
      lz77_nbits[i] =
603
0
          level2_nbits[i] ? level1_nbits[numraw] + level2_nbits[i] : 0;
604
0
    }
605
606
0
    ComputeCanonicalCode(raw_nbits, raw_bits, numraw, lz77_nbits, lz77_bits,
607
0
                         kNumLZ77);
608
609
    // Prepare lz77 cache
610
0
    for (size_t count = 0; count < kLZ77CacheSize; count++) {
611
0
      unsigned token, nbits, bits;
612
0
      EncodeHybridUintLZ77(count, &token, &nbits, &bits);
613
0
      lz77_cache_nbits[count] = lz77_nbits[token] + nbits + raw_nbits[0];
614
0
      lz77_cache_bits[count] =
615
0
          (((bits << lz77_nbits[token]) | lz77_bits[token]) << raw_nbits[0]) |
616
0
          raw_bits[0];
617
0
    }
618
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<AVX2::(anonymous namespace)::UpTo8Bits>(AVX2::(anonymous namespace)::UpTo8Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<AVX2::(anonymous namespace)::From9To13Bits>(AVX2::(anonymous namespace)::From9To13Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<AVX2::(anonymous namespace)::Exactly14Bits>(AVX2::(anonymous namespace)::Exactly14Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<AVX2::(anonymous namespace)::MoreThan14Bits>(AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<default_implementation::(anonymous namespace)::UpTo8Bits>(default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<default_implementation::(anonymous namespace)::From9To13Bits>(default_implementation::(anonymous namespace)::From9To13Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<default_implementation::(anonymous namespace)::Exactly14Bits>(default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long*, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:(anonymous namespace)::PrefixCode::PrefixCode<default_implementation::(anonymous namespace)::MoreThan14Bits>(default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long*, unsigned long*)
619
620
  // Max bits written: 2 + 72 + 95 + 24 + 165 = 286
621
0
  void WriteTo(BitWriter* writer) const {
622
0
    uint64_t code_length_counts[18] = {};
623
0
    code_length_counts[17] = 3 + 2 * (kNumLZ77 - 1);
624
0
    for (uint8_t raw_nbit : raw_nbits) {
625
0
      code_length_counts[raw_nbit]++;
626
0
    }
627
0
    for (uint8_t lz77_nbit : lz77_nbits) {
628
0
      code_length_counts[lz77_nbit]++;
629
0
    }
630
0
    uint8_t code_length_nbits[18] = {};
631
0
    uint8_t code_length_nbits_min[18] = {};
632
0
    uint8_t code_length_nbits_max[18] = {
633
0
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
634
0
    };
635
0
    ComputeCodeLengths(code_length_counts, 18, code_length_nbits_min,
636
0
                       code_length_nbits_max, code_length_nbits);
637
0
    writer->Write(2, 0b00);  // HSKIP = 0, i.e. don't skip code lengths.
638
639
    // As per Brotli RFC.
640
0
    uint8_t code_length_order[18] = {1, 2, 3, 4,  0,  5,  17, 6,  16,
641
0
                                     7, 8, 9, 10, 11, 12, 13, 14, 15};
642
0
    uint8_t code_length_length_nbits[] = {2, 4, 3, 2, 2, 4};
643
0
    uint8_t code_length_length_bits[] = {0, 7, 3, 2, 1, 15};
644
645
    // Encode lengths of code lengths.
646
0
    size_t num_code_lengths = 18;
647
0
    while (code_length_nbits[code_length_order[num_code_lengths - 1]] == 0) {
648
0
      num_code_lengths--;
649
0
    }
650
    // Max bits written in this loop: 18 * 4 = 72
651
0
    for (size_t i = 0; i < num_code_lengths; i++) {
652
0
      int symbol = code_length_nbits[code_length_order[i]];
653
0
      writer->Write(code_length_length_nbits[symbol],
654
0
                    code_length_length_bits[symbol]);
655
0
    }
656
657
    // Compute the canonical codes for the codes that represent the lengths of
658
    // the actual codes for data.
659
0
    uint16_t code_length_bits[18] = {};
660
0
    ComputeCanonicalCode(nullptr, nullptr, 0, code_length_nbits,
661
0
                         code_length_bits, 18);
662
    // Encode raw bit code lengths.
663
    // Max bits written in this loop: 19 * 5 = 95
664
0
    for (uint8_t raw_nbit : raw_nbits) {
665
0
      writer->Write(code_length_nbits[raw_nbit], code_length_bits[raw_nbit]);
666
0
    }
667
0
    size_t num_lz77 = kNumLZ77;
668
0
    while (lz77_nbits[num_lz77 - 1] == 0) {
669
0
      num_lz77--;
670
0
    }
671
    // Encode 0s until 224 (start of LZ77 symbols). This is in total 224-19 =
672
    // 205.
673
0
    static_assert(kLZ77Offset == 224, "kLZ77Offset should be 224");
674
0
    static_assert(kNumRawSymbols == 19, "kNumRawSymbols should be 19");
675
0
    {
676
      // Max bits in this block: 24
677
0
      writer->Write(code_length_nbits[17], code_length_bits[17]);
678
0
      writer->Write(3, 0b010);  // 5
679
0
      writer->Write(code_length_nbits[17], code_length_bits[17]);
680
0
      writer->Write(3, 0b000);  // (5-2)*8 + 3 = 27
681
0
      writer->Write(code_length_nbits[17], code_length_bits[17]);
682
0
      writer->Write(3, 0b010);  // (27-2)*8 + 5 = 205
683
0
    }
684
    // Encode LZ77 symbols, with values 224+i.
685
    // Max bits written in this loop: 33 * 5 = 165
686
0
    for (size_t i = 0; i < num_lz77; i++) {
687
0
      writer->Write(code_length_nbits[lz77_nbits[i]],
688
0
                    code_length_bits[lz77_nbits[i]]);
689
0
    }
690
0
  }
691
};
692
693
}  // namespace
694
695
extern "C" {
696
697
struct JxlFastLosslessFrameState {
698
  JxlChunkedFrameInputSource input;
699
  size_t width;
700
  size_t height;
701
  size_t num_groups_x;
702
  size_t num_groups_y;
703
  size_t num_dc_groups_x;
704
  size_t num_dc_groups_y;
705
  size_t nb_chans;
706
  size_t bitdepth;
707
  int big_endian;
708
  int effort;
709
  bool collided;
710
  PrefixCode hcode[4];
711
  std::vector<int16_t> lookup;
712
  BitWriter header;
713
  std::vector<std::array<BitWriter, 4>> group_data;
714
  std::vector<size_t> group_sizes;
715
  size_t ac_group_data_offset = 0;
716
  size_t min_dc_global_size = 0;
717
  size_t current_bit_writer = 0;
718
  size_t bit_writer_byte_pos = 0;
719
  size_t bits_in_buffer = 0;
720
  uint64_t bit_buffer = 0;
721
  bool process_done = false;
722
};
723
724
0
size_t JxlFastLosslessOutputSize(const JxlFastLosslessFrameState* frame) {
725
0
  size_t total_size_groups = 0;
726
0
  for (const auto& section : frame->group_data) {
727
0
    total_size_groups += SectionSize(section);
728
0
  }
729
0
  return frame->header.bytes_written + total_size_groups;
730
0
}
731
732
size_t JxlFastLosslessMaxRequiredOutput(
733
0
    const JxlFastLosslessFrameState* frame) {
734
0
  return JxlFastLosslessOutputSize(frame) + 32;
735
0
}
736
737
// Serializes the (optionally) image header, the frame header and the TOC
// into frame->header as a hand-crafted JPEG XL bitstream. Must be called
// after all group sections are encoded, since the TOC needs their sizes.
// `add_image_header` is only supported in the standalone build; `is_last`
// marks this frame as the final one in the codestream.
void JxlFastLosslessPrepareHeader(JxlFastLosslessFrameState* frame,
                                  int add_image_header, int is_last) {
  BitWriter* output = &frame->header;
  // Rough upper bound: fixed headers plus a TOC entry per section.
  output->Allocate(1000 + frame->group_sizes.size() * 32);

  bool have_alpha = (frame->nb_chans == 2 || frame->nb_chans == 4);

#if FJXL_STANDALONE
  if (add_image_header) {
    // Signature
    output->Write(16, 0x0AFF);

    // Size header, hand-crafted.
    // Not small
    output->Write(1, 0);

    // Writes one image dimension using the smallest of the four
    // variable-length size encodings that fits (9/13/18/30 bits).
    auto wsz = [output](size_t size) {
      if (size - 1 < (1 << 9)) {
        output->Write(2, 0b00);
        output->Write(9, size - 1);
      } else if (size - 1 < (1 << 13)) {
        output->Write(2, 0b01);
        output->Write(13, size - 1);
      } else if (size - 1 < (1 << 18)) {
        output->Write(2, 0b10);
        output->Write(18, size - 1);
      } else {
        output->Write(2, 0b11);
        output->Write(30, size - 1);
      }
    };

    wsz(frame->height);

    // No special ratio.
    output->Write(3, 0);

    wsz(frame->width);

    // Hand-crafted ImageMetadata.
    output->Write(1, 0);  // all_default
    output->Write(1, 0);  // extra_fields
    output->Write(1, 0);  // bit_depth.floating_point_sample
    if (frame->bitdepth == 8) {
      output->Write(2, 0b00);  // bit_depth.bits_per_sample = 8
    } else if (frame->bitdepth == 10) {
      output->Write(2, 0b01);  // bit_depth.bits_per_sample = 10
    } else if (frame->bitdepth == 12) {
      output->Write(2, 0b10);  // bit_depth.bits_per_sample = 12
    } else {
      output->Write(2, 0b11);  // 1 + u(6)
      output->Write(6, frame->bitdepth - 1);
    }
    if (frame->bitdepth <= 14) {
      output->Write(1, 1);  // 16-bit-buffer sufficient
    } else {
      output->Write(1, 0);  // 16-bit-buffer NOT sufficient
    }
    if (have_alpha) {
      output->Write(2, 0b01);  // One extra channel
      if (frame->bitdepth == 8) {
        output->Write(1, 1); // ... all_default (ie. 8-bit alpha)
      } else {
        // Spell out the extra-channel info for non-8-bit alpha.
        output->Write(1, 0); // not d_alpha
        output->Write(2, 0); // type = kAlpha
        output->Write(1, 0); // not float
        if (frame->bitdepth == 10) {
          output->Write(2, 0b01); // bit_depth.bits_per_sample = 10
        } else if (frame->bitdepth == 12) {
          output->Write(2, 0b10); // bit_depth.bits_per_sample = 12
        } else {
          output->Write(2, 0b11); // 1 + u(6)
          output->Write(6, frame->bitdepth - 1);
        }
        output->Write(2, 0); // dim_shift = 0
        output->Write(2, 0); // name_len = 0
        output->Write(1, 0); // alpha_associated = 0
      }
    } else {
      output->Write(2, 0b00);  // No extra channel
    }
    output->Write(1, 0);  // Not XYB
    if (frame->nb_chans > 2) {
      output->Write(1, 1);  // color_encoding.all_default (sRGB)
    } else {
      // Grayscale: write an explicit sRGB-like color encoding.
      output->Write(1, 0);     // color_encoding.all_default false
      output->Write(1, 0);     // color_encoding.want_icc false
      output->Write(2, 1);     // grayscale
      output->Write(2, 1);     // D65
      output->Write(1, 0);     // no gamma transfer function
      output->Write(2, 0b10);  // tf: 2 + u(4)
      output->Write(4, 11);    // tf of sRGB
      output->Write(2, 1);     // relative rendering intent
    }
    output->Write(2, 0b00);  // No extensions.

    output->Write(1, 1);  // all_default transform data

    // No ICC, no preview. Frame should start at byte boundary.
    output->ZeroPadToByte();
  }
#else
  assert(!add_image_header);
#endif
  // Handcrafted frame header.
  output->Write(1, 0);     // all_default
  output->Write(2, 0b00);  // regular frame
  output->Write(1, 1);     // modular
  output->Write(2, 0b00);  // default flags
  output->Write(1, 0);     // not YCbCr
  output->Write(2, 0b00);  // no upsampling
  if (have_alpha) {
    output->Write(2, 0b00);  // no alpha upsampling
  }
  output->Write(2, 0b01);  // default group size
  output->Write(2, 0b00);  // exactly one pass
  output->Write(1, 0);     // no custom size or origin
  output->Write(2, 0b00);  // kReplace blending mode
  if (have_alpha) {
    output->Write(2, 0b00);  // kReplace blending mode for alpha channel
  }
  output->Write(1, is_last);  // is_last
  if (!is_last) {
    output->Write(2, 0b00);  // can not be saved as reference
  }
  output->Write(2, 0b00);  // a frame has no name
  output->Write(1, 0);     // loop filter is not all_default
  output->Write(1, 0);     // no gaborish
  output->Write(2, 0);     // 0 EPF iters
  output->Write(2, 0b00);  // No LF extensions
  output->Write(2, 0b00);  // No FH extensions

  output->Write(1, 0);      // No TOC permutation
  output->ZeroPadToByte();  // TOC is byte-aligned.
  assert(add_image_header || output->bytes_written <= kMaxFrameHeaderSize);
  // TOC: one variable-length size entry per group section.
  for (size_t group_size : frame->group_sizes) {
    size_t bucket = TOCBucket(group_size);
    output->Write(2, bucket);
    output->Write(kTOCBits[bucket] - 2, group_size - kGroupSizeOffset[bucket]);
  }
  output->ZeroPadToByte();  // Groups are byte-aligned.
}
879
880
#if !FJXL_STANDALONE
881
bool JxlFastLosslessOutputAlignedSection(
882
0
    const BitWriter& bw, JxlEncoderOutputProcessorWrapper* output_processor) {
883
0
  assert(bw.bits_in_buffer == 0);
884
0
  const uint8_t* data = bw.data.get();
885
0
  size_t remaining_len = bw.bytes_written;
886
0
  while (remaining_len > 0) {
887
0
    JXL_ASSIGN_OR_RETURN(auto buffer,
888
0
                         output_processor->GetBuffer(1, remaining_len));
889
0
    size_t n = std::min(buffer.size(), remaining_len);
890
0
    if (n == 0) break;
891
0
    memcpy(buffer.data(), data, n);
892
0
    JXL_RETURN_IF_ERROR(buffer.advance(n));
893
0
    data += n;
894
0
    remaining_len -= n;
895
0
  };
896
0
  return true;
897
0
}
898
899
// Writes the frame header followed by the first group section
// (group_data[0][0], the DC-global section) to the output processor.
// Returns false on output-processor failure.
bool JxlFastLosslessOutputHeaders(
    JxlFastLosslessFrameState* frame_state,
    JxlEncoderOutputProcessorWrapper* output_processor) {
  JXL_RETURN_IF_ERROR(JxlFastLosslessOutputAlignedSection(frame_state->header,
                                                          output_processor));
  JXL_RETURN_IF_ERROR(JxlFastLosslessOutputAlignedSection(
      frame_state->group_data[0][0], output_processor));
  return true;
}
908
#endif
909
910
#if FJXL_ENABLE_AVX512
911
// AVX-512 VBMI2 fast path for appending bytes into a bit stream that is
// currently mis-aligned by `bit_buffer_nbits` bits (held in `bit_buffer`).
// Processes 64 input bytes per iteration using a funnel shift
// (_mm512_shrdv_epi64) so each 64-bit lane is prefixed by the bits that
// precede it in the stream. Returns the number of bytes consumed (a
// multiple of 64; 0 if n < 128, in which case the caller's scalar loop
// handles everything). On return, `bit_buffer` holds the still-unwritten
// top bits of the last consumed byte.
// NOTE(review): the shifts by (64 - bit_buffer_nbits) and
// (8 - bit_buffer_nbits) assume 1 <= bit_buffer_nbits <= 7 - TODO confirm
// callers never pass 0 (the caller in JxlFastLosslessWriteOutput only takes
// this path when bits_in_buffer != 0).
__attribute__((target("avx512vbmi2"))) static size_t AppendBytesWithBitOffset(
    const uint8_t* data, size_t n, size_t bit_buffer_nbits,
    unsigned char* output, uint64_t& bit_buffer) {
  if (n < 128) {
    // Too small to be worth the SIMD setup.
    return 0;
  }

  size_t i = 0;
  __m512i shift = _mm512_set1_epi64(64 - bit_buffer_nbits);
  // Pending bits pre-positioned at the top of a 64-bit lane, so the funnel
  // shift below pulls them in front of the first data bits.
  __m512i carry = _mm512_set1_epi64(bit_buffer << (64 - bit_buffer_nbits));

  for (; i + 64 <= n; i += 64) {
    __m512i current = _mm512_loadu_si512(data + i);
    // Each lane gets the previous lane's value (lane 0 gets the carry).
    __m512i previous_u64 = _mm512_alignr_epi64(current, carry, 7);
    carry = current;
    __m512i out = _mm512_shrdv_epi64(previous_u64, current, shift);
    _mm512_storeu_si512(output + i, out);
  }

  // The top bit_buffer_nbits bits of the last consumed byte were not
  // emitted; hand them back through the caller's bit buffer.
  bit_buffer = data[i - 1] >> (8 - bit_buffer_nbits);

  return i;
}
934
#endif
935
936
// Incrementally drains the encoded frame (header, then each per-group,
// per-channel BitWriter in order) into `output`, concatenating the
// sub-byte-aligned sections at the bit level. Resumable: progress is kept
// in frame->current_bit_writer / bit_writer_byte_pos / bits_in_buffer /
// bit_buffer, so the caller may invoke it repeatedly with fresh buffers.
// Returns the number of bytes written into `output` (0 once the whole
// frame has been emitted). Requires output_size >= 32.
size_t JxlFastLosslessWriteOutput(JxlFastLosslessFrameState* frame,
                                  unsigned char* output, size_t output_size) {
  assert(output_size >= 32);
  unsigned char* initial_output = output;
  // Optional SIMD fast path for bit-offset byte copies (selected below).
  size_t (*append_bytes_with_bit_offset)(const uint8_t*, size_t, size_t,
                                         unsigned char*, uint64_t&) = nullptr;

#if FJXL_ENABLE_AVX512
  if (HasCpuFeature(CpuFeature::kVBMI2)) {
    append_bytes_with_bit_offset = AppendBytesWithBitOffset;
  }
#endif

  while (true) {
    size_t& cur = frame->current_bit_writer;
    size_t& bw_pos = frame->bit_writer_byte_pos;
    // Writer 0 is the header; writers 1.. are group sections, nb_chans per
    // group. Done once all of them have been consumed.
    if (cur >= 1 + frame->group_data.size() * frame->nb_chans) {
      return output - initial_output;
    }
    // Keep >= 9 spare bytes so the tail-flush writes below cannot overrun.
    if (output_size <= 9) {
      return output - initial_output;
    }
    size_t nbc = frame->nb_chans;
    const BitWriter& writer =
        cur == 0 ? frame->header
                 : frame->group_data[(cur - 1) / nbc][(cur - 1) % nbc];
    size_t full_byte_count =
        std::min(output_size - 9, writer.bytes_written - bw_pos);
    if (frame->bits_in_buffer == 0) {
      // Output is currently byte-aligned: plain copy.
      memcpy(output, writer.data.get() + bw_pos, full_byte_count);
    } else {
      // Output is bit-shifted: merge each source byte with the pending bits.
      size_t i = 0;
      if (append_bytes_with_bit_offset) {
        i += append_bytes_with_bit_offset(
            writer.data.get() + bw_pos, full_byte_count, frame->bits_in_buffer,
            output, frame->bit_buffer);
      }
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
      // Copy 8 bytes at a time until we reach the border.
      for (; i + 8 < full_byte_count; i += 8) {
        uint64_t chunk;
        memcpy(&chunk, writer.data.get() + bw_pos + i, 8);
        uint64_t out = frame->bit_buffer | (chunk << frame->bits_in_buffer);
        memcpy(output + i, &out, 8);
        frame->bit_buffer = chunk >> (64 - frame->bits_in_buffer);
      }
#endif
      // Scalar tail: shift in one byte at a time.
      for (; i < full_byte_count; i++) {
        AddBits(8, writer.data.get()[bw_pos + i], output + i,
                frame->bits_in_buffer, frame->bit_buffer);
      }
    }
    output += full_byte_count;
    output_size -= full_byte_count;
    bw_pos += full_byte_count;
    if (bw_pos == writer.bytes_written) {
      // Finished this writer's whole bytes; flush its partial last byte and
      // advance to the next writer.
      auto write = [&](size_t num, uint64_t bits) {
        size_t n = AddBits(num, bits, output, frame->bits_in_buffer,
                           frame->bit_buffer);
        output += n;
        output_size -= n;
      };
      if (writer.bits_in_buffer) {
        write(writer.bits_in_buffer, writer.buffer);
      }
      bw_pos = 0;
      cur++;
      // At a group boundary, pad with zero bits to the next byte boundary
      // (sections in the codestream are byte-aligned).
      if ((cur - 1) % nbc == 0 && frame->bits_in_buffer != 0) {
        write(8 - frame->bits_in_buffer, 0);
      }
    }
  }
}
1009
1010
0
// Releases a frame state previously allocated by the encoder. Safe to call
// with nullptr (delete of null is a no-op).
void JxlFastLosslessFreeFrameState(JxlFastLosslessFrameState* frame) {
  delete frame;
}
1013
1014
}  // extern "C"
1015
1016
#endif
1017
1018
#ifdef FJXL_SELF_INCLUDE
1019
1020
namespace {
1021
1022
// A pair of SIMD vectors; used as the return type of operations that widen
// or interleave one vector into two (e.g. Upcast / Interleave below).
template <typename T>
struct VecPair {
  T low;
  T hi;
};
1027
1028
#ifdef FJXL_GENERIC_SIMD
1029
#undef FJXL_GENERIC_SIMD
1030
#endif
1031
1032
#ifdef FJXL_AVX512
1033
#define FJXL_GENERIC_SIMD
1034
struct SIMDVec32;

// Per-lane mask for the 16 32-bit lanes of an AVX-512 vector.
struct Mask32 {
  __mmask16 mask;
  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
  // Number of consecutive set lanes starting at lane 0.
  size_t CountPrefix() const {
    return CtzNonZero(~uint64_t{_cvtmask16_u32(mask)});
  }
};
1042
1043
// 16 x 32-bit lanes in one AVX-512 register, with the operations the
// encoder needs (loads/stores, saturating arithmetic, token computation).
struct SIMDVec32 {
  __m512i vec;

  static constexpr size_t kLanes = 16;

  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
    return SIMDVec32{_mm512_loadu_si512((__m512i*)data)};
  }
  FJXL_INLINE void Store(uint32_t* data) {
    _mm512_storeu_si512((__m512i*)data, vec);
  }
  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
    return SIMDVec32{_mm512_set1_epi32(v)};
  }
  // Per lane: number of significant bits of the value, i.e. 32 - lzcnt.
  FJXL_INLINE SIMDVec32 ValToToken() const {
    return SIMDVec32{
        _mm512_sub_epi32(_mm512_set1_epi32(32), _mm512_lzcnt_epi32(vec))};
  }
  // Unsigned saturating subtraction: max(vec, x) - x == max(vec - x, 0).
  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
    return SIMDVec32{_mm512_sub_epi32(_mm512_max_epu32(vec, to_subtract.vec),
                                      to_subtract.vec)};
  }
  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
    return SIMDVec32{_mm512_sub_epi32(vec, to_subtract.vec)};
  }
  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
    return SIMDVec32{_mm512_add_epi32(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
    return SIMDVec32{_mm512_xor_epi32(vec, oth.vec)};
  }
  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
    return Mask32{_mm512_cmpeq_epi32_mask(vec, oth.vec)};
  }
  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
    return Mask32{_mm512_cmpgt_epi32_mask(vec, oth.vec)};
  }
  // Per lane: 1 << vec.
  FJXL_INLINE SIMDVec32 Pow2() const {
    return SIMDVec32{_mm512_sllv_epi32(_mm512_set1_epi32(1), vec)};
  }
  // Arithmetic (sign-extending) right shift by the compile-time amount i.
  template <size_t i>
  FJXL_INLINE SIMDVec32 SignedShiftRight() const {
    return SIMDVec32{_mm512_srai_epi32(vec, i)};
  }
};
1088
1089
struct SIMDVec16;

// Per-lane mask for the 32 16-bit lanes of an AVX-512 vector.
struct Mask16 {
  __mmask32 mask;
  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
  Mask16 And(const Mask16& oth) const {
    return Mask16{_kand_mask32(mask, oth.mask)};
  }
  // Number of consecutive set lanes starting at lane 0.
  size_t CountPrefix() const {
    return CtzNonZero(~uint64_t{_cvtmask32_u32(mask)});
  }
};
1101
1102
// 32 x 16-bit lanes in one AVX-512 register. Besides arithmetic, provides
// the pixel-loading helpers (LoadG8 ... LoadRGBA16) that deinterleave raw
// gray/gray+alpha/RGB/RGBA input bytes into one vector per channel.
struct SIMDVec16 {
  __m512i vec;

  static constexpr size_t kLanes = 32;

  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
    return SIMDVec16{_mm512_loadu_si512((__m512i*)data)};
  }
  FJXL_INLINE void Store(uint16_t* data) {
    _mm512_storeu_si512((__m512i*)data, vec);
  }
  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
    return SIMDVec16{_mm512_set1_epi16(v)};
  }
  // Packs two 32-bit vectors into one 16-bit vector (unsigned saturation),
  // with a 64-bit permute to undo packus's within-128-bit-lane interleave.
  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
                                         const SIMDVec32& hi) {
    auto tmp = _mm512_packus_epi32(lo.vec, hi.vec);
    alignas(64) uint64_t perm[8] = {0, 2, 4, 6, 1, 3, 5, 7};
    return SIMDVec16{
        _mm512_permutex2var_epi64(tmp, _mm512_load_si512((__m512i*)perm), tmp)};
  }

  // Per 16-bit lane: number of significant bits. Computed on 32-bit lanes
  // (lzcnt is only available at 32-bit granularity), handling the high and
  // low 16-bit halves separately and recombining.
  FJXL_INLINE SIMDVec16 ValToToken() const {
    auto c16 = _mm512_set1_epi32(16);
    auto c32 = _mm512_set1_epi32(32);
    auto low16bit = _mm512_set1_epi32(0x0000FFFF);
    auto lzhi =
        _mm512_sub_epi32(c16, _mm512_min_epu32(c16, _mm512_lzcnt_epi32(vec)));
    auto lzlo = _mm512_sub_epi32(
        c32, _mm512_lzcnt_epi32(_mm512_and_si512(low16bit, vec)));
    return SIMDVec16{_mm512_or_si512(lzlo, _mm512_slli_epi32(lzhi, 16))};
  }

  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
    return SIMDVec16{_mm512_subs_epu16(vec, to_subtract.vec)};
  }
  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
    return SIMDVec16{_mm512_sub_epi16(vec, to_subtract.vec)};
  }
  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
    return SIMDVec16{_mm512_add_epi16(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
    return SIMDVec16{_mm512_min_epu16(vec, oth.vec)};
  }
  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
    return Mask16{_mm512_cmpeq_epi16_mask(vec, oth.vec)};
  }
  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
    return Mask16{_mm512_cmpgt_epi16_mask(vec, oth.vec)};
  }
  // Per lane: 1 << vec.
  FJXL_INLINE SIMDVec16 Pow2() const {
    return SIMDVec16{_mm512_sllv_epi16(_mm512_set1_epi16(1), vec)};
  }
  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
    return SIMDVec16{_mm512_or_si512(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
    return SIMDVec16{_mm512_xor_si512(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
    return SIMDVec16{_mm512_and_si512(vec, oth.vec)};
  }
  // (vec + oth) >> 1, with the shift done as a signed (arithmetic) shift.
  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
    return SIMDVec16{_mm512_srai_epi16(_mm512_add_epi16(vec, oth.vec), 1)};
  }
  // Sets the high byte of each lane to 0xFF so that U8Lookup's pshufb sees
  // an out-of-range index there and produces 0 for it.
  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
    return SIMDVec16{_mm512_or_si512(vec, _mm512_set1_epi16(0xFF00))};
  }
  // 16-entry byte-table lookup on each byte (table broadcast to all lanes).
  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
    return SIMDVec16{_mm512_shuffle_epi8(
        _mm512_broadcast_i32x4(_mm_loadu_si128((__m128i*)table)), vec)};
  }
  // Interleaves `low` and *this lane-by-lane into two output vectors, with
  // 64-bit permutes fixing up unpacklo/hi's 128-bit-lane ordering.
  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
    auto lo = _mm512_unpacklo_epi16(low.vec, vec);
    auto hi = _mm512_unpackhi_epi16(low.vec, vec);
    alignas(64) uint64_t perm1[8] = {0, 1, 8, 9, 2, 3, 10, 11};
    alignas(64) uint64_t perm2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
    return {SIMDVec16{_mm512_permutex2var_epi64(
                lo, _mm512_load_si512((__m512i*)perm1), hi)},
            SIMDVec16{_mm512_permutex2var_epi64(
                lo, _mm512_load_si512((__m512i*)perm2), hi)}};
  }
  // Zero-extends the 32 16-bit lanes into two vectors of 16 32-bit lanes.
  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
    auto lo = _mm512_unpacklo_epi16(vec, _mm512_setzero_si512());
    auto hi = _mm512_unpackhi_epi16(vec, _mm512_setzero_si512());
    alignas(64) uint64_t perm1[8] = {0, 1, 8, 9, 2, 3, 10, 11};
    alignas(64) uint64_t perm2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
    return {SIMDVec32{_mm512_permutex2var_epi64(
                lo, _mm512_load_si512((__m512i*)perm1), hi)},
            SIMDVec32{_mm512_permutex2var_epi64(
                lo, _mm512_load_si512((__m512i*)perm2), hi)}};
  }
  template <size_t i>
  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
    return SIMDVec16{_mm512_srai_epi16(vec, i)};
  }

  // Loads 32 8-bit gray pixels, widening to 16-bit lanes.
  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
    __m256i bytes = _mm256_loadu_si256((__m256i*)data);
    return {SIMDVec16{_mm512_cvtepu8_epi16(bytes)}};
  }
  // Loads 32 16-bit gray pixels.
  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
    return {Load((const uint16_t*)data)};
  }

  // Loads 32 interleaved 8-bit gray+alpha pixels into {gray, alpha}.
  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
    __m512i bytes = _mm512_loadu_si512((__m512i*)data);
    __m512i gray = _mm512_and_si512(bytes, _mm512_set1_epi16(0xFF));
    __m512i alpha = _mm512_srli_epi16(bytes, 8);
    return {SIMDVec16{gray}, SIMDVec16{alpha}};
  }
  // Loads 32 interleaved 16-bit gray+alpha pixels into {gray, alpha}.
  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
    __m512i bytes1 = _mm512_loadu_si512((__m512i*)data);
    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 64));
    __m512i g_mask = _mm512_set1_epi32(0xFFFF);
    __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
    __m512i g = _mm512_permutexvar_epi64(
        permuteidx, _mm512_packus_epi32(_mm512_and_si512(bytes1, g_mask),
                                        _mm512_and_si512(bytes2, g_mask)));
    __m512i a = _mm512_permutexvar_epi64(
        permuteidx, _mm512_packus_epi32(_mm512_srli_epi32(bytes1, 16),
                                        _mm512_srli_epi32(bytes2, 16)));
    return {SIMDVec16{g}, SIMDVec16{a}};
  }

  // Loads 32 interleaved 8-bit RGB pixels (96 bytes) into {r, g, b} using
  // VBMI byte permutes that also zero-extend to 16-bit lanes.
  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
    __m512i bytes1 =
        _mm512_zextsi256_si512(_mm256_loadu_si256((__m256i*)(data + 64)));

    // 0x7A = element of upper half of second vector = 0 after lookup; still in
    // the upper half once we add 1 or 2.
    uint8_t z = 0x7A;
    __m512i ridx =
        _mm512_set_epi8(z, 93, z, 90, z, 87, z, 84, z, 81, z, 78, z, 75, z, 72,
                        z, 69, z, 66, z, 63, z, 60, z, 57, z, 54, z, 51, z, 48,
                        z, 45, z, 42, z, 39, z, 36, z, 33, z, 30, z, 27, z, 24,
                        z, 21, z, 18, z, 15, z, 12, z, 9, z, 6, z, 3, z, 0);
    __m512i gidx = _mm512_add_epi8(ridx, _mm512_set1_epi8(1));
    __m512i bidx = _mm512_add_epi8(gidx, _mm512_set1_epi8(1));
    __m512i r = _mm512_permutex2var_epi8(bytes0, ridx, bytes1);
    __m512i g = _mm512_permutex2var_epi8(bytes0, gidx, bytes1);
    __m512i b = _mm512_permutex2var_epi8(bytes0, bidx, bytes1);
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}};
  }
  // Loads 32 interleaved 16-bit RGB pixels (192 bytes) into {r, g, b}.
  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
    __m512i bytes1 = _mm512_loadu_si512((__m512i*)(data + 64));
    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 128));

    __m512i ridx_lo = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 63, 60, 57,
                                       54, 51, 48, 45, 42, 39, 36, 33, 30, 27,
                                       24, 21, 18, 15, 12, 9, 6, 3, 0);
    // -1 is such that when adding 1 or 2, we get the correct index for
    // green/blue.
    __m512i ridx_hi =
        _mm512_set_epi16(29, 26, 23, 20, 17, 14, 11, 8, 5, 2, -1, 0, 0, 0, 0, 0,
                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    __m512i gidx_lo = _mm512_add_epi16(ridx_lo, _mm512_set1_epi16(1));
    __m512i gidx_hi = _mm512_add_epi16(ridx_hi, _mm512_set1_epi16(1));
    __m512i bidx_lo = _mm512_add_epi16(gidx_lo, _mm512_set1_epi16(1));
    __m512i bidx_hi = _mm512_add_epi16(gidx_hi, _mm512_set1_epi16(1));

    __mmask32 rmask = _cvtu32_mask32(0b11111111110000000000000000000000);
    __mmask32 gbmask = _cvtu32_mask32(0b11111111111000000000000000000000);

    __m512i rlo = _mm512_permutex2var_epi16(bytes0, ridx_lo, bytes1);
    __m512i glo = _mm512_permutex2var_epi16(bytes0, gidx_lo, bytes1);
    __m512i blo = _mm512_permutex2var_epi16(bytes0, bidx_lo, bytes1);
    __m512i r = _mm512_mask_permutexvar_epi16(rlo, rmask, ridx_hi, bytes2);
    __m512i g = _mm512_mask_permutexvar_epi16(glo, gbmask, gidx_hi, bytes2);
    __m512i b = _mm512_mask_permutexvar_epi16(blo, gbmask, bidx_hi, bytes2);
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}};
  }

  // Loads 32 interleaved 8-bit RGBA pixels (128 bytes) into {r, g, b, a}.
  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
    __m512i bytes1 = _mm512_loadu_si512((__m512i*)data);
    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 64));
    __m512i rg_mask = _mm512_set1_epi32(0xFFFF);
    __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
    __m512i rg = _mm512_permutexvar_epi64(
        permuteidx, _mm512_packus_epi32(_mm512_and_si512(bytes1, rg_mask),
                                        _mm512_and_si512(bytes2, rg_mask)));
    __m512i b_a = _mm512_permutexvar_epi64(
        permuteidx, _mm512_packus_epi32(_mm512_srli_epi32(bytes1, 16),
                                        _mm512_srli_epi32(bytes2, 16)));
    __m512i r = _mm512_and_si512(rg, _mm512_set1_epi16(0xFF));
    __m512i g = _mm512_srli_epi16(rg, 8);
    __m512i b = _mm512_and_si512(b_a, _mm512_set1_epi16(0xFF));
    __m512i a = _mm512_srli_epi16(b_a, 8);
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
  }
  // Loads 32 interleaved 16-bit RGBA pixels (256 bytes) into {r, g, b, a}
  // via two rounds of pack-and-permute deinterleaving.
  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
    __m512i bytes1 = _mm512_loadu_si512((__m512i*)(data + 64));
    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 128));
    __m512i bytes3 = _mm512_loadu_si512((__m512i*)(data + 192));

    auto pack32 = [](__m512i a, __m512i b) {
      __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
      return _mm512_permutexvar_epi64(permuteidx, _mm512_packus_epi32(a, b));
    };
    auto packlow32 = [&pack32](__m512i a, __m512i b) {
      __m512i mask = _mm512_set1_epi32(0xFFFF);
      return pack32(_mm512_and_si512(a, mask), _mm512_and_si512(b, mask));
    };
    auto packhi32 = [&pack32](__m512i a, __m512i b) {
      return pack32(_mm512_srli_epi32(a, 16), _mm512_srli_epi32(b, 16));
    };

    __m512i rb0 = packlow32(bytes0, bytes1);
    __m512i rb1 = packlow32(bytes2, bytes3);
    __m512i ga0 = packhi32(bytes0, bytes1);
    __m512i ga1 = packhi32(bytes2, bytes3);

    __m512i r = packlow32(rb0, rb1);
    __m512i g = packlow32(ga0, ga1);
    __m512i b = packhi32(rb0, rb1);
    __m512i a = packhi32(ga0, ga1);
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
  }

  // Byte-swaps each 16-bit lane (big-endian <-> little-endian samples).
  void SwapEndian() {
    auto indices = _mm512_broadcast_i32x4(
        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
    vec = _mm512_shuffle_epi8(vec, indices);
  }
};
1331
1332
// Per lane: mask ? if_true : if_false.
SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
                             const SIMDVec16& if_false) {
  return SIMDVec16{_mm512_mask_blend_epi16(mask, if_false.vec, if_true.vec)};
}
1336
1337
// Per lane: mask ? if_true : if_false.
SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
                             const SIMDVec32& if_false) {
  return SIMDVec32{_mm512_mask_blend_epi32(mask, if_false.vec, if_true.vec)};
}
1341
1342
// Eight (nbits, bits) pairs of fully-merged 64-bit bit sequences, ready to
// be stored out for the scalar bit-writer to append.
struct Bits64 {
  static constexpr size_t kLanes = 8;

  __m512i nbits;  // Per lane: number of valid low bits in `bits`.
  __m512i bits;

  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
    _mm512_storeu_si512((__m512i*)nbits_out, nbits);
    _mm512_storeu_si512((__m512i*)bits_out, bits);
  }
};
1353
1354
// Sixteen 32-bit (nbits, bits) pairs of variable-length bit sequences.
struct Bits32 {
  __m512i nbits;  // Per lane: number of valid low bits in `bits`.
  __m512i bits;

  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
    return Bits32{nbits.vec, bits.vec};
  }

  // Concatenates each pair of adjacent 32-bit lanes into one 64-bit lane:
  // the high lane's bits are shifted up past the low lane's nbits.
  Bits64 Merge() const {
    auto nbits_hi32 = _mm512_srli_epi64(nbits, 32);
    auto nbits_lo32 = _mm512_and_si512(nbits, _mm512_set1_epi64(0xFFFFFFFF));
    auto bits_hi32 = _mm512_srli_epi64(bits, 32);
    auto bits_lo32 = _mm512_and_si512(bits, _mm512_set1_epi64(0xFFFFFFFF));

    auto nbits64 = _mm512_add_epi64(nbits_hi32, nbits_lo32);
    auto bits64 =
        _mm512_or_si512(_mm512_sllv_epi64(bits_hi32, nbits_lo32), bits_lo32);
    return Bits64{nbits64, bits64};
  }

  // Prepends `low`'s bit sequence to this one, lane by lane.
  void Interleave(const Bits32& low) {
    bits = _mm512_or_si512(_mm512_sllv_epi32(bits, low.nbits), low.bits);
    nbits = _mm512_add_epi32(nbits, low.nbits);
  }

  // Zeroes out all lanes at index >= n (keep only the first n lanes).
  void ClipTo(size_t n) {
    n = std::min<size_t>(n, 16);
    constexpr uint32_t kMask[32] = {
        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
        ~0u, ~0u, ~0u, ~0u, ~0u, 0,   0,   0,   0,   0,   0,
        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
    };
    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 16 - n));
    nbits = _mm512_and_si512(mask, nbits);
    bits = _mm512_and_si512(mask, bits);
  }
  // Zeroes out the first n lanes (keep only lanes at index >= n).
  void Skip(size_t n) {
    n = std::min<size_t>(n, 16);
    constexpr uint32_t kMask[32] = {
        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
        0,   0,   0,   0,   0,   ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
    };
    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 16 - n));
    nbits = _mm512_and_si512(mask, nbits);
    bits = _mm512_and_si512(mask, bits);
  }
};
1402
1403
// Thirty-two 16-bit (nbits, bits) pairs of variable-length bit sequences.
struct Bits16 {
  __m512i nbits;  // Per lane: number of valid low bits in `bits`.
  __m512i bits;

  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
    return Bits16{nbits.vec, bits.vec};
  }

  // Concatenates each pair of adjacent 16-bit lanes into one 32-bit lane:
  // the high lane's bits are shifted up past the low lane's nbits.
  Bits32 Merge() const {
    auto nbits_hi16 = _mm512_srli_epi32(nbits, 16);
    auto nbits_lo16 = _mm512_and_si512(nbits, _mm512_set1_epi32(0xFFFF));
    auto bits_hi16 = _mm512_srli_epi32(bits, 16);
    auto bits_lo16 = _mm512_and_si512(bits, _mm512_set1_epi32(0xFFFF));

    auto nbits32 = _mm512_add_epi32(nbits_hi16, nbits_lo16);
    auto bits32 =
        _mm512_or_si512(_mm512_sllv_epi32(bits_hi16, nbits_lo16), bits_lo16);
    return Bits32{nbits32, bits32};
  }

  // Prepends `low`'s bit sequence to this one, lane by lane.
  void Interleave(const Bits16& low) {
    bits = _mm512_or_si512(_mm512_sllv_epi16(bits, low.nbits), low.bits);
    nbits = _mm512_add_epi16(nbits, low.nbits);
  }

  // Zeroes out all lanes at index >= n (keep only the first n lanes).
  void ClipTo(size_t n) {
    n = std::min<size_t>(n, 32);
    constexpr uint16_t kMask[64] = {
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0,      0,      0,      0,      0,      0,      0,      0,
        0,      0,      0,      0,      0,      0,      0,      0,
        0,      0,      0,      0,      0,      0,      0,      0,
        0,      0,      0,      0,      0,      0,      0,      0,
    };
    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 32 - n));
    nbits = _mm512_and_si512(mask, nbits);
    bits = _mm512_and_si512(mask, bits);
  }
  // Zeroes out the first n lanes (keep only lanes at index >= n).
  void Skip(size_t n) {
    n = std::min<size_t>(n, 32);
    constexpr uint16_t kMask[64] = {
        0,      0,      0,      0,      0,      0,      0,      0,
        0,      0,      0,      0,      0,      0,      0,      0,
        0,      0,      0,      0,      0,      0,      0,      0,
        0,      0,      0,      0,      0,      0,      0,      0,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
    };
    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 32 - n));
    nbits = _mm512_and_si512(mask, nbits);
    bits = _mm512_and_si512(mask, bits);
  }
};
1461
1462
#endif
1463
1464
#ifdef FJXL_AVX2
1465
#define FJXL_GENERIC_SIMD
1466
1467
struct SIMDVec32;

// Per-lane mask for the 8 32-bit lanes of an AVX2 vector, stored as an
// all-ones/all-zeros vector (AVX2 has no dedicated mask registers).
struct Mask32 {
  __m256i mask;
  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
  // Number of consecutive set lanes starting at lane 0 (movemask takes the
  // top bit of each lane; ~ makes the first clear lane findable with ctz).
  size_t CountPrefix() const {
    return CtzNonZero(~static_cast<uint64_t>(
        static_cast<uint8_t>(_mm256_movemask_ps(_mm256_castsi256_ps(mask)))));
  }
};
1477
1478
// 8 x 32-bit lanes in one AVX2 register; same interface as the AVX-512
// SIMDVec32 above, built from AVX2 intrinsics.
struct SIMDVec32 {
  __m256i vec;

  static constexpr size_t kLanes = 8;

  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
    return SIMDVec32{_mm256_loadu_si256((__m256i*)data)};
  }
  FJXL_INLINE void Store(uint32_t* data) {
    _mm256_storeu_si256((__m256i*)data, vec);
  }
  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
    return SIMDVec32{_mm256_set1_epi32(v)};
  }
  // Per lane: number of significant bits, computed without lzcnt by
  // converting to float and reading the biased exponent (bits 23..30);
  // exponent - 126 is the bit width, clamped to 0 for input 0.
  FJXL_INLINE SIMDVec32 ValToToken() const {
    auto f32 = _mm256_castps_si256(_mm256_cvtepi32_ps(vec));
    return SIMDVec32{_mm256_max_epi32(
        _mm256_setzero_si256(),
        _mm256_sub_epi32(_mm256_srli_epi32(f32, 23), _mm256_set1_epi32(126)))};
  }
  // Unsigned saturating subtraction: max(vec, x) - x == max(vec - x, 0).
  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
    return SIMDVec32{_mm256_sub_epi32(_mm256_max_epu32(vec, to_subtract.vec),
                                      to_subtract.vec)};
  }
  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
    return SIMDVec32{_mm256_sub_epi32(vec, to_subtract.vec)};
  }
  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
    return SIMDVec32{_mm256_add_epi32(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
    return SIMDVec32{_mm256_xor_si256(vec, oth.vec)};
  }
  // Per lane: 1 << vec.
  FJXL_INLINE SIMDVec32 Pow2() const {
    return SIMDVec32{_mm256_sllv_epi32(_mm256_set1_epi32(1), vec)};
  }
  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
    return Mask32{_mm256_cmpeq_epi32(vec, oth.vec)};
  }
  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
    return Mask32{_mm256_cmpgt_epi32(vec, oth.vec)};
  }
  // Arithmetic (sign-extending) right shift by the compile-time amount i.
  template <size_t i>
  FJXL_INLINE SIMDVec32 SignedShiftRight() const {
    return SIMDVec32{_mm256_srai_epi32(vec, i)};
  }
};
1525
1526
struct SIMDVec16;
1527
1528
// AVX2 comparison result for 16 lanes of 16-bit values: each lane is
// all-ones (true) or all-zeros (false).
struct Mask16 {
  __m256i mask;
  // Per-lane select: if_true where the mask is set, if_false elsewhere.
  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
  Mask16 And(const Mask16& oth) const {
    return Mask16{_mm256_and_si256(mask, oth.mask)};
  }
  // Number of leading lanes (from lane 0) whose mask is set. movemask
  // yields one bit per byte (32 bits total); ctz of the inverted mask
  // finds the first unset byte, and dividing by 2 converts bytes to
  // 16-bit lanes.
  size_t CountPrefix() const {
    return CtzNonZero(~static_cast<uint64_t>(
               static_cast<uint32_t>(_mm256_movemask_epi8(mask)))) /
           2;
  }
};
1540
1541
// AVX2 wrapper around 16 lanes of 16-bit unsigned integers, plus helpers to
// load pixel data (gray/gray+alpha/RGB/RGBA, 8- or 16-bit) into planar form.
struct SIMDVec16 {
  __m256i vec;

  static constexpr size_t kLanes = 16;

  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
    return SIMDVec16{_mm256_loadu_si256((__m256i*)data)};
  }
  FJXL_INLINE void Store(uint16_t* data) {
    _mm256_storeu_si256((__m256i*)data, vec);
  }
  // Broadcasts v to all lanes.
  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
    return SIMDVec16{_mm256_set1_epi16(v)};
  }
  // Narrows two 32-bit vectors into one 16-bit vector (lo lanes first).
  // packus operates per 128-bit half, so a cross-lane permute (0b11011000)
  // is needed to restore the natural lane order.
  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
                                         const SIMDVec32& hi) {
    auto tmp = _mm256_packus_epi32(lo.vec, hi.vec);
    return SIMDVec16{_mm256_permute4x64_epi64(tmp, 0b11011000)};
  }

  // Per-lane bit width, computed nibble-by-nibble with pshufb lookup tables.
  // Each nibble is OR-ed with 0xFF00 so that pshufb indices stay in range;
  // lutN maps a nibble's value to the bit width it would imply at nibble
  // position N, and the final width is the max over the four positions.
  FJXL_INLINE SIMDVec16 ValToToken() const {
    auto nibble0 =
        _mm256_or_si256(_mm256_and_si256(vec, _mm256_set1_epi16(0xF)),
                        _mm256_set1_epi16(0xFF00));
    auto nibble1 = _mm256_or_si256(
        _mm256_and_si256(_mm256_srli_epi16(vec, 4), _mm256_set1_epi16(0xF)),
        _mm256_set1_epi16(0xFF00));
    auto nibble2 = _mm256_or_si256(
        _mm256_and_si256(_mm256_srli_epi16(vec, 8), _mm256_set1_epi16(0xF)),
        _mm256_set1_epi16(0xFF00));
    auto nibble3 =
        _mm256_or_si256(_mm256_srli_epi16(vec, 12), _mm256_set1_epi16(0xFF00));

    auto lut0 = _mm256_broadcastsi128_si256(
        _mm_setr_epi8(0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4));
    auto lut1 = _mm256_broadcastsi128_si256(
        _mm_setr_epi8(0, 5, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8));
    auto lut2 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
        0, 9, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12));
    auto lut3 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
        0, 13, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16));

    auto token0 = _mm256_shuffle_epi8(lut0, nibble0);
    auto token1 = _mm256_shuffle_epi8(lut1, nibble1);
    auto token2 = _mm256_shuffle_epi8(lut2, nibble2);
    auto token3 = _mm256_shuffle_epi8(lut3, nibble3);

    auto token = _mm256_max_epi16(_mm256_max_epi16(token0, token1),
                                  _mm256_max_epi16(token2, token3));
    return SIMDVec16{token};
  }

  // Unsigned saturating subtraction (hardware-supported for 16-bit lanes).
  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
    return SIMDVec16{_mm256_subs_epu16(vec, to_subtract.vec)};
  }
  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
    return SIMDVec16{_mm256_sub_epi16(vec, to_subtract.vec)};
  }
  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
    return SIMDVec16{_mm256_add_epi16(vec, oth.vec)};
  }
  // Unsigned per-lane minimum.
  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
    return SIMDVec16{_mm256_min_epu16(vec, oth.vec)};
  }
  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
    return Mask16{_mm256_cmpeq_epi16(vec, oth.vec)};
  }
  // Signed per-lane greater-than comparison.
  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
    return Mask16{_mm256_cmpgt_epi16(vec, oth.vec)};
  }
  // Per-lane 1 << value for values in [0, 15]. AVX2 lacks a per-lane 16-bit
  // variable shift, so the low and high result bytes are produced with two
  // pshufb lookup tables; OR-ing the input with 0xFF00 zeroes the lookup for
  // the (unused) high byte of each lane.
  FJXL_INLINE SIMDVec16 Pow2() const {
    auto pow2_lo_lut = _mm256_broadcastsi128_si256(
        _mm_setr_epi8(1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6,
                      1u << 7, 0, 0, 0, 0, 0, 0, 0, 0));
    auto pow2_hi_lut = _mm256_broadcastsi128_si256(
        _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1 << 0, 1 << 1, 1 << 2, 1 << 3,
                      1 << 4, 1 << 5, 1 << 6, 1u << 7));

    auto masked = _mm256_or_si256(vec, _mm256_set1_epi16(0xFF00));

    auto pow2_lo = _mm256_shuffle_epi8(pow2_lo_lut, masked);
    auto pow2_hi = _mm256_shuffle_epi8(pow2_hi_lut, masked);

    auto pow2 = _mm256_or_si256(_mm256_slli_epi16(pow2_hi, 8), pow2_lo);
    return SIMDVec16{pow2};
  }
  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
    return SIMDVec16{_mm256_or_si256(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
    return SIMDVec16{_mm256_xor_si256(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
    return SIMDVec16{_mm256_and_si256(vec, oth.vec)};
  }
  // (this + oth) >> 1 with an arithmetic shift, i.e. a halving add.
  // NOTE(review): assumes the 16-bit sum does not overflow — callers
  // presumably keep values within 15 bits; confirm at call sites.
  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
    return SIMDVec16{_mm256_srai_epi16(_mm256_add_epi16(vec, oth.vec), 1)};
  }
  // Sets the high byte of every lane so that a subsequent U8Lookup (pshufb)
  // only ever indexes with the low byte (pshufb zeroes lanes whose index
  // has the top bit set).
  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
    return SIMDVec16{_mm256_or_si256(vec, _mm256_set1_epi16(0xFF00))};
  }
  // 16-entry byte-table lookup; inputs must have gone through
  // PrepareForU8Lookup (or otherwise have safe pshufb indices).
  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
    return SIMDVec16{_mm256_shuffle_epi8(
        _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)table)), vec)};
  }
  // Interleaves `low` and `this` lane-by-lane (low first), returning the
  // lower and upper halves of the interleaved sequence in natural order
  // (the permutes undo unpack's per-128-bit-half behavior).
  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
    auto v02 = _mm256_unpacklo_epi16(low.vec, vec);
    auto v13 = _mm256_unpackhi_epi16(low.vec, vec);
    return {SIMDVec16{_mm256_permute2x128_si256(v02, v13, 0x20)},
            SIMDVec16{_mm256_permute2x128_si256(v02, v13, 0x31)}};
  }
  // Zero-extends the 16 lanes into two 32-bit vectors (low lanes, high
  // lanes), again fixing up the unpack lane order with permutes.
  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
    auto v02 = _mm256_unpacklo_epi16(vec, _mm256_setzero_si256());
    auto v13 = _mm256_unpackhi_epi16(vec, _mm256_setzero_si256());
    return {SIMDVec32{_mm256_permute2x128_si256(v02, v13, 0x20)},
            SIMDVec32{_mm256_permute2x128_si256(v02, v13, 0x31)}};
  }
  // Arithmetic (sign-extending) right shift by a compile-time amount.
  template <size_t i>
  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
    return SIMDVec16{_mm256_srai_epi16(vec, i)};
  }

  // 16 gray pixels, 8-bit: zero-extend bytes to 16-bit lanes.
  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
    __m128i bytes = _mm_loadu_si128((__m128i*)data);
    return {SIMDVec16{_mm256_cvtepu8_epi16(bytes)}};
  }
  // 16 gray pixels, 16-bit: direct load.
  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
    return {Load((const uint16_t*)data)};
  }

  // 16 gray+alpha pixels, 8-bit: de-interleave by masking/shifting the
  // even (gray) and odd (alpha) bytes of each 16-bit pair.
  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
    __m256i bytes = _mm256_loadu_si256((__m256i*)data);
    __m256i gray = _mm256_and_si256(bytes, _mm256_set1_epi16(0xFF));
    __m256i alpha = _mm256_srli_epi16(bytes, 8);
    return {SIMDVec16{gray}, SIMDVec16{alpha}};
  }
  // 16 gray+alpha pixels, 16-bit: split each 32-bit (G, A) pair into planar
  // G and A vectors; the permute fixes packus's per-half lane order.
  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
    __m256i bytes1 = _mm256_loadu_si256((__m256i*)data);
    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 32));
    __m256i g_mask = _mm256_set1_epi32(0xFFFF);
    __m256i g = _mm256_permute4x64_epi64(
        _mm256_packus_epi32(_mm256_and_si256(bytes1, g_mask),
                            _mm256_and_si256(bytes2, g_mask)),
        0b11011000);
    __m256i a = _mm256_permute4x64_epi64(
        _mm256_packus_epi32(_mm256_srli_epi32(bytes1, 16),
                            _mm256_srli_epi32(bytes2, 16)),
        0b11011000);
    return {SIMDVec16{g}, SIMDVec16{a}};
  }

  // 16 RGB pixels, 8-bit: de-interleave 48 bytes of packed RGB into planar
  // R, G, B. Each 16-byte chunk is shuffled into (6, 5, 5) runs of same
  // channel; blends then gather each channel's three runs into one
  // register, and byte rotations (alignr) put the runs in pixel order.
  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
    __m128i bytes0 = _mm_loadu_si128((__m128i*)data);
    __m128i bytes1 = _mm_loadu_si128((__m128i*)(data + 16));
    __m128i bytes2 = _mm_loadu_si128((__m128i*)(data + 32));

    __m128i idx =
        _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);

    __m128i r6b5g5_0 = _mm_shuffle_epi8(bytes0, idx);
    __m128i g6r5b5_1 = _mm_shuffle_epi8(bytes1, idx);
    __m128i b6g5r5_2 = _mm_shuffle_epi8(bytes2, idx);

    __m128i mask010 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF,
                                    0xFF, 0, 0, 0, 0, 0);
    __m128i mask001 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF,
                                    0xFF, 0xFF, 0xFF);

    __m128i b2g2b1 = _mm_blendv_epi8(b6g5r5_2, g6r5b5_1, mask001);
    __m128i b2b0b1 = _mm_blendv_epi8(b2g2b1, r6b5g5_0, mask010);

    __m128i r0r1b1 = _mm_blendv_epi8(r6b5g5_0, g6r5b5_1, mask010);
    __m128i r0r1r2 = _mm_blendv_epi8(r0r1b1, b6g5r5_2, mask001);

    __m128i g1r1g0 = _mm_blendv_epi8(g6r5b5_1, r6b5g5_0, mask001);
    __m128i g1g2g0 = _mm_blendv_epi8(g1r1g0, b6g5r5_2, mask010);

    __m128i g0g1g2 = _mm_alignr_epi8(g1g2g0, g1g2g0, 11);
    __m128i b0b1b2 = _mm_alignr_epi8(b2b0b1, b2b0b1, 6);

    return {SIMDVec16{_mm256_cvtepu8_epi16(r0r1r2)},
            SIMDVec16{_mm256_cvtepu8_epi16(g0g1g2)},
            SIMDVec16{_mm256_cvtepu8_epi16(b0b1b2)}};
  }
  // 16 RGB pixels, 16-bit little-endian: split low/high bytes first, run
  // the same per-byte de-interleave as LoadRGB8 on both halves at once,
  // then recombine low and high bytes into 16-bit lanes.
  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
    auto load_and_split_lohi = [](const unsigned char* data) {
      // LHLHLH...
      __m256i bytes = _mm256_loadu_si256((__m256i*)data);
      // L0L0L0...
      __m256i lo = _mm256_and_si256(bytes, _mm256_set1_epi16(0xFF));
      // H0H0H0...
      __m256i hi = _mm256_srli_epi16(bytes, 8);
      // LLLLLLLLHHHHHHHHLLLLLLLLHHHHHHHH
      __m256i packed = _mm256_packus_epi16(lo, hi);
      return _mm256_permute4x64_epi64(packed, 0b11011000);
    };
    __m256i bytes0 = load_and_split_lohi(data);
    __m256i bytes1 = load_and_split_lohi(data + 32);
    __m256i bytes2 = load_and_split_lohi(data + 64);

    __m256i idx = _mm256_broadcastsi128_si256(
        _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13));

    __m256i r6b5g5_0 = _mm256_shuffle_epi8(bytes0, idx);
    __m256i g6r5b5_1 = _mm256_shuffle_epi8(bytes1, idx);
    __m256i b6g5r5_2 = _mm256_shuffle_epi8(bytes2, idx);

    __m256i mask010 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
        0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0));
    __m256i mask001 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF));

    __m256i b2g2b1 = _mm256_blendv_epi8(b6g5r5_2, g6r5b5_1, mask001);
    __m256i b2b0b1 = _mm256_blendv_epi8(b2g2b1, r6b5g5_0, mask010);

    __m256i r0r1b1 = _mm256_blendv_epi8(r6b5g5_0, g6r5b5_1, mask010);
    __m256i r0r1r2 = _mm256_blendv_epi8(r0r1b1, b6g5r5_2, mask001);

    __m256i g1r1g0 = _mm256_blendv_epi8(g6r5b5_1, r6b5g5_0, mask001);
    __m256i g1g2g0 = _mm256_blendv_epi8(g1r1g0, b6g5r5_2, mask010);

    __m256i g0g1g2 = _mm256_alignr_epi8(g1g2g0, g1g2g0, 11);
    __m256i b0b1b2 = _mm256_alignr_epi8(b2b0b1, b2b0b1, 6);

    // Now r0r1r2, g0g1g2, b0b1b2 have the low bytes of the RGB pixels in their
    // lower half, and the high bytes in their upper half.

    auto combine_low_hi = [](__m256i v) {
      __m128i low = _mm256_extracti128_si256(v, 0);
      __m128i hi = _mm256_extracti128_si256(v, 1);
      __m256i low16 = _mm256_cvtepu8_epi16(low);
      __m256i hi16 = _mm256_cvtepu8_epi16(hi);
      return _mm256_or_si256(_mm256_slli_epi16(hi16, 8), low16);
    };

    return {SIMDVec16{combine_low_hi(r0r1r2)},
            SIMDVec16{combine_low_hi(g0g1g2)},
            SIMDVec16{combine_low_hi(b0b1b2)}};
  }

  // 16 RGBA pixels, 8-bit: pack (R,G) and (B,A) 16-bit pairs out of the
  // 32-bit pixels, then split each pair into its two byte channels.
  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
    __m256i bytes1 = _mm256_loadu_si256((__m256i*)data);
    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 32));
    __m256i rg_mask = _mm256_set1_epi32(0xFFFF);
    __m256i rg = _mm256_permute4x64_epi64(
        _mm256_packus_epi32(_mm256_and_si256(bytes1, rg_mask),
                            _mm256_and_si256(bytes2, rg_mask)),
        0b11011000);
    __m256i b_a = _mm256_permute4x64_epi64(
        _mm256_packus_epi32(_mm256_srli_epi32(bytes1, 16),
                            _mm256_srli_epi32(bytes2, 16)),
        0b11011000);
    __m256i r = _mm256_and_si256(rg, _mm256_set1_epi16(0xFF));
    __m256i g = _mm256_srli_epi16(rg, 8);
    __m256i b = _mm256_and_si256(b_a, _mm256_set1_epi16(0xFF));
    __m256i a = _mm256_srli_epi16(b_a, 8);
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
  }
  // 16 RGBA pixels, 16-bit: two rounds of pack-low/pack-high halve the
  // channel interleave each time — (RGBA) -> (RB)/(GA) -> planar R,G,B,A.
  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
    __m256i bytes0 = _mm256_loadu_si256((__m256i*)data);
    __m256i bytes1 = _mm256_loadu_si256((__m256i*)(data + 32));
    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 64));
    __m256i bytes3 = _mm256_loadu_si256((__m256i*)(data + 96));

    auto pack32 = [](__m256i a, __m256i b) {
      return _mm256_permute4x64_epi64(_mm256_packus_epi32(a, b), 0b11011000);
    };
    auto packlow32 = [&pack32](__m256i a, __m256i b) {
      __m256i mask = _mm256_set1_epi32(0xFFFF);
      return pack32(_mm256_and_si256(a, mask), _mm256_and_si256(b, mask));
    };
    auto packhi32 = [&pack32](__m256i a, __m256i b) {
      return pack32(_mm256_srli_epi32(a, 16), _mm256_srli_epi32(b, 16));
    };

    __m256i rb0 = packlow32(bytes0, bytes1);
    __m256i rb1 = packlow32(bytes2, bytes3);
    __m256i ga0 = packhi32(bytes0, bytes1);
    __m256i ga1 = packhi32(bytes2, bytes3);

    __m256i r = packlow32(rb0, rb1);
    __m256i g = packlow32(ga0, ga1);
    __m256i b = packhi32(rb0, rb1);
    __m256i a = packhi32(ga0, ga1);
    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
  }

  // Byte-swaps each 16-bit lane (big-endian <-> little-endian).
  void SwapEndian() {
    auto indices = _mm256_broadcastsi128_si256(
        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
    vec = _mm256_shuffle_epi8(vec, indices);
  }
};
1834
1835
// Per-lane select: lanes with the mask set take if_true, others if_false.
SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
                             const SIMDVec16& if_false) {
  // blendv picks from the second operand where the mask byte's top bit is
  // set; comparison masks are all-ones per lane, so this is lane-exact.
  const __m256i selected =
      _mm256_blendv_epi8(if_false.vec, if_true.vec, mask);
  return SIMDVec16{selected};
}
1839
1840
// Per-lane select: lanes with the mask set take if_true, others if_false.
SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
                             const SIMDVec32& if_false) {
  // Comparison masks are all-ones per 32-bit lane, so a byte-granular
  // blendv selects whole lanes correctly.
  const __m256i selected =
      _mm256_blendv_epi8(if_false.vec, if_true.vec, mask);
  return SIMDVec32{selected};
}
1844
1845
// Four (nbits, bits) pairs of up to 64 bits each, ready to be appended to a
// bit stream.
struct Bits64 {
  static constexpr size_t kLanes = 4;

  __m256i nbits;
  __m256i bits;

  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
    _mm256_storeu_si256((__m256i*)nbits_out, nbits);
    _mm256_storeu_si256((__m256i*)bits_out, bits);
  }
};
1856
1857
// Eight (nbits, bits) pairs of up to 32 bits each.
struct Bits32 {
  __m256i nbits;
  __m256i bits;

  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
    return Bits32{nbits.vec, bits.vec};
  }

  // Concatenates each pair of adjacent 32-bit lanes into one 64-bit lane:
  // the high lane's bits are shifted left by the low lane's bit count and
  // OR-ed in, and the bit counts are summed.
  Bits64 Merge() const {
    auto nbits_hi32 = _mm256_srli_epi64(nbits, 32);
    auto nbits_lo32 = _mm256_and_si256(nbits, _mm256_set1_epi64x(0xFFFFFFFF));
    auto bits_hi32 = _mm256_srli_epi64(bits, 32);
    auto bits_lo32 = _mm256_and_si256(bits, _mm256_set1_epi64x(0xFFFFFFFF));

    auto nbits64 = _mm256_add_epi64(nbits_hi32, nbits_lo32);
    auto bits64 =
        _mm256_or_si256(_mm256_sllv_epi64(bits_hi32, nbits_lo32), bits_lo32);
    return Bits64{nbits64, bits64};
  }

  // Prepends `low` to this value lane-wise: this lane's bits are shifted
  // past low's bits and the counts are added.
  void Interleave(const Bits32& low) {
    bits = _mm256_or_si256(_mm256_sllv_epi32(bits, low.nbits), low.bits);
    nbits = _mm256_add_epi32(nbits, low.nbits);
  }

  // Zeroes out all but the first n lanes (used at row/segment boundaries).
  void ClipTo(size_t n) {
    n = std::min<size_t>(n, 8);
    constexpr uint32_t kMask[16] = {
        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0, 0, 0, 0, 0,
    };
    // Sliding window over the half-ones/half-zeros table selects exactly n
    // leading all-ones lanes.
    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 8 - n));
    nbits = _mm256_and_si256(mask, nbits);
    bits = _mm256_and_si256(mask, bits);
  }
  // Zeroes out the first n lanes, keeping the rest.
  void Skip(size_t n) {
    n = std::min<size_t>(n, 8);
    constexpr uint32_t kMask[16] = {
        0, 0, 0, 0, 0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
    };
    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 8 - n));
    nbits = _mm256_and_si256(mask, nbits);
    bits = _mm256_and_si256(mask, bits);
  }
};
1901
1902
// Sixteen (nbits, bits) pairs of up to 16 bits each.
struct Bits16 {
  __m256i nbits;
  __m256i bits;

  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
    return Bits16{nbits.vec, bits.vec};
  }

  // Concatenates each pair of adjacent 16-bit lanes into one 32-bit lane
  // (high lane shifted past the low lane's bit count; counts summed).
  Bits32 Merge() const {
    auto nbits_hi16 = _mm256_srli_epi32(nbits, 16);
    auto nbits_lo16 = _mm256_and_si256(nbits, _mm256_set1_epi32(0xFFFF));
    auto bits_hi16 = _mm256_srli_epi32(bits, 16);
    auto bits_lo16 = _mm256_and_si256(bits, _mm256_set1_epi32(0xFFFF));

    auto nbits32 = _mm256_add_epi32(nbits_hi16, nbits_lo16);
    auto bits32 =
        _mm256_or_si256(_mm256_sllv_epi32(bits_hi16, nbits_lo16), bits_lo16);
    return Bits32{nbits32, bits32};
  }

  // Prepends `low` lane-wise. AVX2 has no per-lane variable 16-bit shift,
  // so "bits << low.nbits" is done as a multiply by 2^low.nbits obtained
  // from a pshufb power-of-two lookup table (low.nbits <= 8 here, since a
  // lane's shifted bits must still fit in 16 bits).
  void Interleave(const Bits16& low) {
    auto pow2_lo_lut = _mm256_broadcastsi128_si256(
        _mm_setr_epi8(1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6,
                      1u << 7, 0, 0, 0, 0, 0, 0, 0, 0));
    auto low_nbits_masked =
        _mm256_or_si256(low.nbits, _mm256_set1_epi16(0xFF00));

    auto bits_shifted = _mm256_mullo_epi16(
        bits, _mm256_shuffle_epi8(pow2_lo_lut, low_nbits_masked));

    nbits = _mm256_add_epi16(nbits, low.nbits);
    bits = _mm256_or_si256(bits_shifted, low.bits);
  }

  // Zeroes out all but the first n lanes.
  void ClipTo(size_t n) {
    n = std::min<size_t>(n, 16);
    constexpr uint16_t kMask[32] = {
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0,      0,      0,      0,      0,      0,      0,      0,
        0,      0,      0,      0,      0,      0,      0,      0,
    };
    // Sliding window over the half-ones/half-zeros table selects exactly n
    // leading all-ones lanes.
    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 16 - n));
    nbits = _mm256_and_si256(mask, nbits);
    bits = _mm256_and_si256(mask, bits);
  }

  // Zeroes out the first n lanes, keeping the rest.
  void Skip(size_t n) {
    n = std::min<size_t>(n, 16);
    constexpr uint16_t kMask[32] = {
        0,      0,      0,      0,      0,      0,      0,      0,
        0,      0,      0,      0,      0,      0,      0,      0,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
    };
    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 16 - n));
    nbits = _mm256_and_si256(mask, nbits);
    bits = _mm256_and_si256(mask, bits);
  }
};
1962
1963
#endif
1964
1965
#ifdef FJXL_NEON
1966
#define FJXL_GENERIC_SIMD
1967
1968
struct SIMDVec32;
1969
1970
// NEON comparison result for 4 lanes of 32-bit values (all-ones per true
// lane, as produced by vceqq/vcgtq).
struct Mask32 {
  uint32x4_t mask;
  // Per-lane select: if_true where the mask is set, if_false elsewhere.
  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
  Mask32 And(const Mask32& oth) const {
    return Mask32{vandq_u32(mask, oth.mask)};
  }
  // Number of leading lanes (from lane 0) with the mask set: set lanes map
  // to the sentinel 4, unset lanes to their own index; the horizontal
  // minimum is then the index of the first unset lane (or 4 if none).
  size_t CountPrefix() const {
    uint32_t val_unset[4] = {0, 1, 2, 3};
    uint32_t val_set[4] = {4, 4, 4, 4};
    uint32x4_t val = vbslq_u32(mask, vld1q_u32(val_set), vld1q_u32(val_unset));
    return vminvq_u32(val);
  }
};
1983
1984
// NEON wrapper around 4 lanes of 32-bit unsigned integers.
struct SIMDVec32 {
  uint32x4_t vec;

  static constexpr size_t kLanes = 4;

  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
    return SIMDVec32{vld1q_u32(data)};
  }
  FJXL_INLINE void Store(uint32_t* data) { vst1q_u32(data, vec); }
  // Broadcasts v to all lanes.
  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
    return SIMDVec32{vdupq_n_u32(v)};
  }
  // Per-lane bit width: 32 - count-leading-zeros (exact, unlike the AVX2
  // float-exponent variant).
  FJXL_INLINE SIMDVec32 ValToToken() const {
    return SIMDVec32{vsubq_u32(vdupq_n_u32(32), vclzq_u32(vec))};
  }
  // Unsigned saturating subtraction (native on NEON).
  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
    return SIMDVec32{vqsubq_u32(vec, to_subtract.vec)};
  }
  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
    return SIMDVec32{vsubq_u32(vec, to_subtract.vec)};
  }
  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
    return SIMDVec32{vaddq_u32(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
    return SIMDVec32{veorq_u32(vec, oth.vec)};
  }
  // Per-lane 1 << value (vshlq takes a signed per-lane shift count).
  FJXL_INLINE SIMDVec32 Pow2() const {
    return SIMDVec32{vshlq_u32(vdupq_n_u32(1), vreinterpretq_s32_u32(vec))};
  }
  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
    return Mask32{vceqq_u32(vec, oth.vec)};
  }
  // Signed per-lane greater-than comparison (matches the AVX2 path).
  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
    return Mask32{
        vcgtq_s32(vreinterpretq_s32_u32(vec), vreinterpretq_s32_u32(oth.vec))};
  }
  // Arithmetic (sign-extending) right shift by a compile-time amount.
  template <size_t i>
  FJXL_INLINE SIMDVec32 SignedShiftRight() const {
    return SIMDVec32{
        vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(vec), i))};
  }
};
2027
2028
struct SIMDVec16;
2029
2030
// NEON comparison result for 8 lanes of 16-bit values.
struct Mask16 {
  uint16x8_t mask;
  // Per-lane select: if_true where the mask is set, if_false elsewhere.
  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
  Mask16 And(const Mask16& oth) const {
    return Mask16{vandq_u16(mask, oth.mask)};
  }
  // Number of leading lanes with the mask set: set lanes map to the
  // sentinel 8, unset lanes to their own index; the horizontal minimum is
  // the index of the first unset lane (or 8 if none).
  size_t CountPrefix() const {
    uint16_t val_unset[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    uint16_t val_set[8] = {8, 8, 8, 8, 8, 8, 8, 8};
    uint16x8_t val = vbslq_u16(mask, vld1q_u16(val_set), vld1q_u16(val_unset));
    return vminvq_u16(val);
  }
};
2043
2044
// NEON wrapper around 8 lanes of 16-bit unsigned integers, plus pixel
// loaders. Channel de-interleaving is native here via vld2/vld3/vld4, so
// these are far simpler than the AVX2 equivalents.
struct SIMDVec16 {
  uint16x8_t vec;

  static constexpr size_t kLanes = 8;

  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
    return SIMDVec16{vld1q_u16(data)};
  }
  FJXL_INLINE void Store(uint16_t* data) { vst1q_u16(data, vec); }
  // Broadcasts v to all lanes.
  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
    return SIMDVec16{vdupq_n_u16(v)};
  }
  // Narrows two 32-bit vectors into one 16-bit vector (lo lanes first).
  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
                                         const SIMDVec32& hi) {
    return SIMDVec16{vmovn_high_u32(vmovn_u32(lo.vec), hi.vec)};
  }

  // Per-lane bit width: 16 - count-leading-zeros.
  FJXL_INLINE SIMDVec16 ValToToken() const {
    return SIMDVec16{vsubq_u16(vdupq_n_u16(16), vclzq_u16(vec))};
  }
  // Unsigned saturating subtraction (native on NEON).
  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
    return SIMDVec16{vqsubq_u16(vec, to_subtract.vec)};
  }
  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
    return SIMDVec16{vsubq_u16(vec, to_subtract.vec)};
  }
  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
    return SIMDVec16{vaddq_u16(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
    return SIMDVec16{vminq_u16(vec, oth.vec)};
  }
  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
    return Mask16{vceqq_u16(vec, oth.vec)};
  }
  // Signed per-lane greater-than comparison (matches the AVX2 path).
  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
    return Mask16{
        vcgtq_s16(vreinterpretq_s16_u16(vec), vreinterpretq_s16_u16(oth.vec))};
  }
  // Per-lane 1 << value.
  FJXL_INLINE SIMDVec16 Pow2() const {
    return SIMDVec16{vshlq_u16(vdupq_n_u16(1), vreinterpretq_s16_u16(vec))};
  }
  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
    return SIMDVec16{vorrq_u16(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
    return SIMDVec16{veorq_u16(vec, oth.vec)};
  }
  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
    return SIMDVec16{vandq_u16(vec, oth.vec)};
  }
  // Halving add: (this + oth) >> 1 without intermediate overflow.
  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
    return SIMDVec16{vhaddq_u16(vec, oth.vec)};
  }
  // Sets the high byte of every lane so that U8Lookup's vqtbl1q (which
  // zeroes out-of-range indices) only matches on the low byte.
  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
    return SIMDVec16{vorrq_u16(vec, vdupq_n_u16(0xFF00))};
  }
  // 16-entry byte-table lookup; inputs must have gone through
  // PrepareForU8Lookup (or otherwise have safe table indices).
  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
    uint8x16_t tbl = vld1q_u8(table);
    uint8x16_t indices = vreinterpretq_u8_u16(vec);
    return SIMDVec16{vreinterpretq_u16_u8(vqtbl1q_u8(tbl, indices))};
  }
  // Interleaves `low` and `this` lane-by-lane (low first); returns the
  // lower and upper halves of the interleaved sequence.
  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
    return {SIMDVec16{vzip1q_u16(low.vec, vec)},
            SIMDVec16{vzip2q_u16(low.vec, vec)}};
  }
  // Zero-extends the 8 lanes into two 32-bit vectors (low lanes, high
  // lanes).
  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
    uint32x4_t lo = vmovl_u16(vget_low_u16(vec));
    uint32x4_t hi = vmovl_high_u16(vec);
    return {SIMDVec32{lo}, SIMDVec32{hi}};
  }
  // Arithmetic (sign-extending) right shift by a compile-time amount.
  template <size_t i>
  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
    return SIMDVec16{
        vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(vec), i))};
  }

  // 8 gray pixels, 8-bit: widen bytes to 16-bit lanes.
  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
    uint8x8_t v = vld1_u8(data);
    return {SIMDVec16{vmovl_u8(v)}};
  }
  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
    return {Load((const uint16_t*)data)};
  }

  // 8 gray+alpha pixels, 8-bit: vld2 de-interleaves the two channels.
  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
    uint8x8x2_t v = vld2_u8(data);
    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])}};
  }
  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
    uint16x8x2_t v = vld2q_u16((const uint16_t*)data);
    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}};
  }

  // 8 RGB pixels, 8-bit: vld3 de-interleaves the three channels.
  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
    uint8x8x3_t v = vld3_u8(data);
    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])},
            SIMDVec16{vmovl_u8(v.val[2])}};
  }
  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
    uint16x8x3_t v = vld3q_u16((const uint16_t*)data);
    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}, SIMDVec16{v.val[2]}};
  }

  // 8 RGBA pixels, 8-bit: vld4 de-interleaves the four channels.
  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
    uint8x8x4_t v = vld4_u8(data);
    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])},
            SIMDVec16{vmovl_u8(v.val[2])}, SIMDVec16{vmovl_u8(v.val[3])}};
  }
  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
    uint16x8x4_t v = vld4q_u16((const uint16_t*)data);
    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}, SIMDVec16{v.val[2]},
            SIMDVec16{v.val[3]}};
  }

  // Byte-swaps each 16-bit lane (big-endian <-> little-endian).
  void SwapEndian() {
    vec = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(vec)));
  }
};
2163
2164
// Per-lane select: lanes with the mask set take if_true, others if_false.
SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
                             const SIMDVec16& if_false) {
  // vbsl picks if_true bits where mask bits are 1, if_false bits elsewhere.
  const uint16x8_t selected = vbslq_u16(mask, if_true.vec, if_false.vec);
  return SIMDVec16{selected};
}
2168
2169
// Per-lane select: lanes with the mask set take if_true, others if_false.
SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
                             const SIMDVec32& if_false) {
  // vbsl picks if_true bits where mask bits are 1, if_false bits elsewhere.
  const uint32x4_t selected = vbslq_u32(mask, if_true.vec, if_false.vec);
  return SIMDVec32{selected};
}
2173
2174
// Two (nbits, bits) pairs of up to 64 bits each, ready to be appended to a
// bit stream.
struct Bits64 {
  static constexpr size_t kLanes = 2;

  uint64x2_t nbits;
  uint64x2_t bits;

  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
    vst1q_u64(nbits_out, nbits);
    vst1q_u64(bits_out, bits);
  }
};
2185
2186
// Four (nbits, bits) pairs of up to 32 bits each.
struct Bits32 {
  uint32x4_t nbits;
  uint32x4_t bits;

  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
    return Bits32{nbits.vec, bits.vec};
  }

  // Concatenates each pair of adjacent 32-bit lanes into one 64-bit lane:
  // high lane's bits shifted left by the low lane's bit count, counts
  // summed (vsraq adds the shifted-down high counts to the low counts).
  Bits64 Merge() const {
    // TODO(veluca): can probably be optimized.
    uint64x2_t nbits_lo32 =
        vandq_u64(vreinterpretq_u64_u32(nbits), vdupq_n_u64(0xFFFFFFFF));
    uint64x2_t bits_hi32 =
        vshlq_u64(vshrq_n_u64(vreinterpretq_u64_u32(bits), 32),
                  vreinterpretq_s64_u64(nbits_lo32));
    uint64x2_t bits_lo32 =
        vandq_u64(vreinterpretq_u64_u32(bits), vdupq_n_u64(0xFFFFFFFF));
    uint64x2_t nbits64 =
        vsraq_n_u64(nbits_lo32, vreinterpretq_u64_u32(nbits), 32);
    uint64x2_t bits64 = vorrq_u64(bits_hi32, bits_lo32);
    return Bits64{nbits64, bits64};
  }

  // Prepends `low` lane-wise: this lane's bits are shifted past low's bits
  // and the counts are added.
  void Interleave(const Bits32& low) {
    bits =
        vorrq_u32(vshlq_u32(bits, vreinterpretq_s32_u32(low.nbits)), low.bits);
    nbits = vaddq_u32(nbits, low.nbits);
  }

  // Zeroes out all but the first n lanes (used at row/segment boundaries).
  void ClipTo(size_t n) {
    n = std::min<size_t>(n, 4);
    constexpr uint32_t kMask[8] = {
        ~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0,
    };
    // Sliding window over the half-ones/half-zeros table selects exactly n
    // leading all-ones lanes.
    uint32x4_t mask = vld1q_u32(kMask + 4 - n);
    nbits = vandq_u32(mask, nbits);
    bits = vandq_u32(mask, bits);
  }
  // Zeroes out the first n lanes, keeping the rest.
  void Skip(size_t n) {
    n = std::min<size_t>(n, 4);
    constexpr uint32_t kMask[8] = {
        0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u,
    };
    uint32x4_t mask = vld1q_u32(kMask + 4 - n);
    nbits = vandq_u32(mask, nbits);
    bits = vandq_u32(mask, bits);
  }
};
2234
2235
// Eight (nbits, bits) pairs of up to 16 bits each.
struct Bits16 {
  uint16x8_t nbits;
  uint16x8_t bits;

  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
    return Bits16{nbits.vec, bits.vec};
  }

  // Concatenates each pair of adjacent 16-bit lanes into one 32-bit lane
  // (high lane shifted past the low lane's bit count; counts summed via
  // vsraq's shift-and-accumulate).
  Bits32 Merge() const {
    // TODO(veluca): can probably be optimized.
    uint32x4_t nbits_lo16 =
        vandq_u32(vreinterpretq_u32_u16(nbits), vdupq_n_u32(0xFFFF));
    uint32x4_t bits_hi16 =
        vshlq_u32(vshrq_n_u32(vreinterpretq_u32_u16(bits), 16),
                  vreinterpretq_s32_u32(nbits_lo16));
    uint32x4_t bits_lo16 =
        vandq_u32(vreinterpretq_u32_u16(bits), vdupq_n_u32(0xFFFF));
    uint32x4_t nbits32 =
        vsraq_n_u32(nbits_lo16, vreinterpretq_u32_u16(nbits), 16);
    uint32x4_t bits32 = vorrq_u32(bits_hi16, bits_lo16);
    return Bits32{nbits32, bits32};
  }

  // Prepends `low` lane-wise (native per-lane variable shift on NEON).
  void Interleave(const Bits16& low) {
    bits =
        vorrq_u16(vshlq_u16(bits, vreinterpretq_s16_u16(low.nbits)), low.bits);
    nbits = vaddq_u16(nbits, low.nbits);
  }

  // Zeroes out all but the first n lanes.
  void ClipTo(size_t n) {
    n = std::min<size_t>(n, 8);
    constexpr uint16_t kMask[16] = {
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0,      0,      0,      0,      0,      0,      0,      0,
    };
    // Sliding window over the half-ones/half-zeros table selects exactly n
    // leading all-ones lanes.
    uint16x8_t mask = vld1q_u16(kMask + 8 - n);
    nbits = vandq_u16(mask, nbits);
    bits = vandq_u16(mask, bits);
  }
  // Zeroes out the first n lanes, keeping the rest.
  void Skip(size_t n) {
    n = std::min<size_t>(n, 8);
    constexpr uint16_t kMask[16] = {
        0,      0,      0,      0,      0,      0,      0,      0,
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
    };
    uint16x8_t mask = vld1q_u16(kMask + 8 - n);
    nbits = vandq_u16(mask, nbits);
    bits = vandq_u16(mask, bits);
  }
};
2285
2286
#endif
2287
2288
#ifdef FJXL_GENERIC_SIMD
2289
constexpr size_t SIMDVec32::kLanes;
2290
constexpr size_t SIMDVec16::kLanes;
2291
2292
//  Each of these functions will process SIMDVec16::kLanes worth of values.
2293
2294
FJXL_INLINE void TokenizeSIMD(const uint16_t* residuals, uint16_t* token_out,
2295
0
                              uint16_t* nbits_out, uint16_t* bits_out) {
2296
0
  SIMDVec16 res = SIMDVec16::Load(residuals);
2297
0
  SIMDVec16 token = res.ValToToken();
2298
0
  SIMDVec16 nbits = token.SatSubU(SIMDVec16::Val(1));
2299
0
  SIMDVec16 bits = res.SatSubU(nbits.Pow2());
2300
0
  token.Store(token_out);
2301
0
  nbits.Store(nbits_out);
2302
0
  bits.Store(bits_out);
2303
0
}
2304
2305
FJXL_INLINE void TokenizeSIMD(const uint32_t* residuals, uint16_t* token_out,
2306
0
                              uint32_t* nbits_out, uint32_t* bits_out) {
2307
0
  static_assert(SIMDVec16::kLanes == 2 * SIMDVec32::kLanes,
2308
0
                "There should be twice more 16-bit lanes than 32-bit lanes");
2309
0
  SIMDVec32 res_lo = SIMDVec32::Load(residuals);
2310
0
  SIMDVec32 res_hi = SIMDVec32::Load(residuals + SIMDVec32::kLanes);
2311
0
  SIMDVec32 token_lo = res_lo.ValToToken();
2312
0
  SIMDVec32 token_hi = res_hi.ValToToken();
2313
0
  SIMDVec32 nbits_lo = token_lo.SatSubU(SIMDVec32::Val(1));
2314
0
  SIMDVec32 nbits_hi = token_hi.SatSubU(SIMDVec32::Val(1));
2315
0
  SIMDVec32 bits_lo = res_lo.SatSubU(nbits_lo.Pow2());
2316
0
  SIMDVec32 bits_hi = res_hi.SatSubU(nbits_hi.Pow2());
2317
0
  SIMDVec16 token = SIMDVec16::FromTwo32(token_lo, token_hi);
2318
0
  token.Store(token_out);
2319
0
  nbits_lo.Store(nbits_out);
2320
0
  nbits_hi.Store(nbits_out + SIMDVec32::kLanes);
2321
0
  bits_lo.Store(bits_out);
2322
0
  bits_hi.Store(bits_out + SIMDVec32::kLanes);
2323
0
}
2324
2325
FJXL_INLINE void HuffmanSIMDUpTo13(const uint16_t* tokens,
2326
                                   const uint8_t* raw_nbits_simd,
2327
                                   const uint8_t* raw_bits_simd,
2328
0
                                   uint16_t* nbits_out, uint16_t* bits_out) {
2329
0
  SIMDVec16 tok = SIMDVec16::Load(tokens).PrepareForU8Lookup();
2330
0
  tok.U8Lookup(raw_nbits_simd).Store(nbits_out);
2331
0
  tok.U8Lookup(raw_bits_simd).Store(bits_out);
2332
0
}
2333
2334
FJXL_INLINE void HuffmanSIMD14(const uint16_t* tokens,
2335
                               const uint8_t* raw_nbits_simd,
2336
                               const uint8_t* raw_bits_simd,
2337
0
                               uint16_t* nbits_out, uint16_t* bits_out) {
2338
0
  SIMDVec16 token_cap = SIMDVec16::Val(15);
2339
0
  SIMDVec16 tok = SIMDVec16::Load(tokens);
2340
0
  SIMDVec16 tok_index = tok.Min(token_cap).PrepareForU8Lookup();
2341
0
  SIMDVec16 huff_bits_pre = tok_index.U8Lookup(raw_bits_simd);
2342
  // Set the highest bit when token == 16; the Huffman code is constructed in
2343
  // such a way that the code for token 15 is the same as the code for 16,
2344
  // except for the highest bit.
2345
0
  Mask16 needs_high_bit = tok.Eq(SIMDVec16::Val(16));
2346
0
  SIMDVec16 huff_bits = needs_high_bit.IfThenElse(
2347
0
      huff_bits_pre.Or(SIMDVec16::Val(128)), huff_bits_pre);
2348
0
  huff_bits.Store(bits_out);
2349
0
  tok_index.U8Lookup(raw_nbits_simd).Store(nbits_out);
2350
0
}
2351
2352
FJXL_INLINE void HuffmanSIMDAbove14(const uint16_t* tokens,
2353
                                    const uint8_t* raw_nbits_simd,
2354
                                    const uint8_t* raw_bits_simd,
2355
0
                                    uint16_t* nbits_out, uint16_t* bits_out) {
2356
0
  SIMDVec16 tok = SIMDVec16::Load(tokens);
2357
  // We assume `tok` fits in a *signed* 16-bit integer.
2358
0
  Mask16 above = tok.Gt(SIMDVec16::Val(12));
2359
  // 13, 14 -> 13
2360
  // 15, 16 -> 14
2361
  // 17, 18 -> 15
2362
0
  SIMDVec16 remap_tok = above.IfThenElse(tok.HAdd(SIMDVec16::Val(13)), tok);
2363
0
  SIMDVec16 tok_index = remap_tok.PrepareForU8Lookup();
2364
0
  SIMDVec16 huff_bits_pre = tok_index.U8Lookup(raw_bits_simd);
2365
  // Set the highest bit when token == 14, 16, 18.
2366
0
  Mask16 needs_high_bit = above.And(tok.Eq(tok.And(SIMDVec16::Val(0xFFFE))));
2367
0
  SIMDVec16 huff_bits = needs_high_bit.IfThenElse(
2368
0
      huff_bits_pre.Or(SIMDVec16::Val(128)), huff_bits_pre);
2369
0
  huff_bits.Store(bits_out);
2370
0
  tok_index.U8Lookup(raw_nbits_simd).Store(nbits_out);
2371
0
}
2372
2373
FJXL_INLINE void StoreSIMDUpTo8(const uint16_t* nbits_tok,
2374
                                const uint16_t* bits_tok,
2375
                                const uint16_t* nbits_huff,
2376
                                const uint16_t* bits_huff, size_t n,
2377
0
                                size_t skip, Bits32* bits_out) {
2378
0
  Bits16 bits =
2379
0
      Bits16::FromRaw(SIMDVec16::Load(nbits_tok), SIMDVec16::Load(bits_tok));
2380
0
  Bits16 huff_bits =
2381
0
      Bits16::FromRaw(SIMDVec16::Load(nbits_huff), SIMDVec16::Load(bits_huff));
2382
0
  bits.Interleave(huff_bits);
2383
0
  bits.ClipTo(n);
2384
0
  bits.Skip(skip);
2385
0
  bits_out[0] = bits.Merge();
2386
0
}
2387
2388
// Huffman and raw bits don't necessarily fit in a single u16 here.
2389
FJXL_INLINE void StoreSIMDUpTo14(const uint16_t* nbits_tok,
2390
                                 const uint16_t* bits_tok,
2391
                                 const uint16_t* nbits_huff,
2392
                                 const uint16_t* bits_huff, size_t n,
2393
0
                                 size_t skip, Bits32* bits_out) {
2394
0
  VecPair<SIMDVec16> bits =
2395
0
      SIMDVec16::Load(bits_tok).Interleave(SIMDVec16::Load(bits_huff));
2396
0
  VecPair<SIMDVec16> nbits =
2397
0
      SIMDVec16::Load(nbits_tok).Interleave(SIMDVec16::Load(nbits_huff));
2398
0
  Bits16 low = Bits16::FromRaw(nbits.low, bits.low);
2399
0
  Bits16 hi = Bits16::FromRaw(nbits.hi, bits.hi);
2400
0
  low.ClipTo(2 * n);
2401
0
  low.Skip(2 * skip);
2402
0
  hi.ClipTo(std::max(2 * n, SIMDVec16::kLanes) - SIMDVec16::kLanes);
2403
0
  hi.Skip(std::max(2 * skip, SIMDVec16::kLanes) - SIMDVec16::kLanes);
2404
2405
0
  bits_out[0] = low.Merge();
2406
0
  bits_out[1] = hi.Merge();
2407
0
}
2408
2409
FJXL_INLINE void StoreSIMDAbove14(const uint32_t* nbits_tok,
2410
                                  const uint32_t* bits_tok,
2411
                                  const uint16_t* nbits_huff,
2412
                                  const uint16_t* bits_huff, size_t n,
2413
0
                                  size_t skip, Bits32* bits_out) {
2414
0
  static_assert(SIMDVec16::kLanes == 2 * SIMDVec32::kLanes,
2415
0
                "There should be twice more 16-bit lanes than 32-bit lanes");
2416
0
  Bits32 bits_low =
2417
0
      Bits32::FromRaw(SIMDVec32::Load(nbits_tok), SIMDVec32::Load(bits_tok));
2418
0
  Bits32 bits_hi =
2419
0
      Bits32::FromRaw(SIMDVec32::Load(nbits_tok + SIMDVec32::kLanes),
2420
0
                      SIMDVec32::Load(bits_tok + SIMDVec32::kLanes));
2421
2422
0
  VecPair<SIMDVec32> huff_bits = SIMDVec16::Load(bits_huff).Upcast();
2423
0
  VecPair<SIMDVec32> huff_nbits = SIMDVec16::Load(nbits_huff).Upcast();
2424
2425
0
  Bits32 huff_low = Bits32::FromRaw(huff_nbits.low, huff_bits.low);
2426
0
  Bits32 huff_hi = Bits32::FromRaw(huff_nbits.hi, huff_bits.hi);
2427
2428
0
  bits_low.Interleave(huff_low);
2429
0
  bits_low.ClipTo(n);
2430
0
  bits_low.Skip(skip);
2431
0
  bits_out[0] = bits_low;
2432
0
  bits_hi.Interleave(huff_hi);
2433
0
  bits_hi.ClipTo(std::max(n, SIMDVec32::kLanes) - SIMDVec32::kLanes);
2434
0
  bits_hi.Skip(std::max(skip, SIMDVec32::kLanes) - SIMDVec32::kLanes);
2435
0
  bits_out[1] = bits_hi;
2436
0
}
2437
2438
#ifdef FJXL_AVX512
2439
FJXL_INLINE void StoreToWriterAVX512(const Bits32& bits32, BitWriter& output) {
2440
  __m512i bits = bits32.bits;
2441
  __m512i nbits = bits32.nbits;
2442
2443
  // Insert the leftover bits from the bit buffer at the bottom of the vector
2444
  // and extract the top of the vector.
2445
  uint64_t trail_bits =
2446
      _mm512_cvtsi512_si32(_mm512_alignr_epi32(bits, bits, 15));
2447
  uint64_t trail_nbits =
2448
      _mm512_cvtsi512_si32(_mm512_alignr_epi32(nbits, nbits, 15));
2449
  __m512i lead_bits = _mm512_set1_epi32(output.buffer);
2450
  __m512i lead_nbits = _mm512_set1_epi32(output.bits_in_buffer);
2451
  bits = _mm512_alignr_epi32(bits, lead_bits, 15);
2452
  nbits = _mm512_alignr_epi32(nbits, lead_nbits, 15);
2453
2454
  // Merge 32 -> 64 bits.
2455
  Bits32 b{nbits, bits};
2456
  Bits64 b64 = b.Merge();
2457
  bits = b64.bits;
2458
  nbits = b64.nbits;
2459
2460
  __m512i zero = _mm512_setzero_si512();
2461
2462
  auto sh1 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 7); };
2463
  auto sh2 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 6); };
2464
  auto sh4 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 4); };
2465
2466
  // Compute first-past-end-bit-position.
2467
  __m512i end_intermediate0 = _mm512_add_epi64(nbits, sh1(nbits));
2468
  __m512i end_intermediate1 =
2469
      _mm512_add_epi64(end_intermediate0, sh2(end_intermediate0));
2470
  __m512i end = _mm512_add_epi64(end_intermediate1, sh4(end_intermediate1));
2471
2472
  uint64_t simd_nbits = _mm512_cvtsi512_si32(_mm512_alignr_epi64(end, end, 7));
2473
2474
  // Compute begin-bit-position.
2475
  __m512i begin = _mm512_sub_epi64(end, nbits);
2476
2477
  // Index of the last bit in the chunk, or the end bit if nbits==0.
2478
  __m512i last = _mm512_mask_sub_epi64(
2479
      end, _mm512_cmpneq_epi64_mask(nbits, zero), end, _mm512_set1_epi64(1));
2480
2481
  __m512i lane_offset_mask = _mm512_set1_epi64(63);
2482
2483
  // Starting position of the chunk that each lane will ultimately belong to.
2484
  __m512i chunk_start = _mm512_andnot_si512(lane_offset_mask, last);
2485
2486
  // For all lanes that contain bits belonging to two different 64-bit chunks,
2487
  // compute the number of bits that belong to the first chunk.
2488
  // total # of bits fit in a u16, so we can satsub_u16 here.
2489
  __m512i first_chunk_nbits = _mm512_subs_epu16(chunk_start, begin);
2490
2491
  // Move all the previous-chunk-bits to the previous lane.
2492
  __m512i negnbits = _mm512_sub_epi64(_mm512_set1_epi64(64), first_chunk_nbits);
2493
  __m512i first_chunk_bits =
2494
      _mm512_srlv_epi64(_mm512_sllv_epi64(bits, negnbits), negnbits);
2495
  __m512i first_chunk_bits_down =
2496
      _mm512_alignr_epi32(zero, first_chunk_bits, 2);
2497
  bits = _mm512_srlv_epi64(bits, first_chunk_nbits);
2498
  nbits = _mm512_sub_epi64(nbits, first_chunk_nbits);
2499
  bits = _mm512_or_si512(bits, _mm512_sllv_epi64(first_chunk_bits_down, nbits));
2500
  begin = _mm512_add_epi64(begin, first_chunk_nbits);
2501
2502
  // We now know that every lane should give bits to only one chunk. We can
2503
  // shift the bits and then horizontally-or-reduce them within the same chunk.
2504
  __m512i offset = _mm512_and_si512(begin, lane_offset_mask);
2505
  __m512i aligned_bits = _mm512_sllv_epi64(bits, offset);
2506
  // h-or-reduce within same chunk
2507
  __m512i red0 = _mm512_mask_or_epi64(
2508
      aligned_bits, _mm512_cmpeq_epi64_mask(sh1(chunk_start), chunk_start),
2509
      sh1(aligned_bits), aligned_bits);
2510
  __m512i red1 = _mm512_mask_or_epi64(
2511
      red0, _mm512_cmpeq_epi64_mask(sh2(chunk_start), chunk_start), sh2(red0),
2512
      red0);
2513
  __m512i reduced = _mm512_mask_or_epi64(
2514
      red1, _mm512_cmpeq_epi64_mask(sh4(chunk_start), chunk_start), sh4(red1),
2515
      red1);
2516
  // Extract the highest lane that belongs to each chunk (the lane that ends up
2517
  // with the OR-ed value of all the other lanes of that chunk).
2518
  __m512i next_chunk_start =
2519
      _mm512_alignr_epi32(_mm512_set1_epi64(~0), chunk_start, 2);
2520
  __m512i result = _mm512_maskz_compress_epi64(
2521
      _mm512_cmpneq_epi64_mask(chunk_start, next_chunk_start), reduced);
2522
2523
  _mm512_storeu_si512((__m512i*)(output.data.get() + output.bytes_written),
2524
                      result);
2525
2526
  // Update the bit writer and add the last 32-bit lane.
2527
  // Note that since trail_nbits was at most 32 to begin with, operating on
2528
  // trail_bits does not risk overflowing.
2529
  output.bytes_written += simd_nbits / 8;
2530
  // Here we are implicitly relying on the fact that simd_nbits < 512 to know
2531
  // that the byte of bitreader data we access is initialized. This is
2532
  // guaranteed because the remaining bits in the bitreader buffer are at most
2533
  // 7, so simd_nbits <= 505 always.
2534
  trail_bits = (trail_bits << (simd_nbits % 8)) +
2535
               output.data.get()[output.bytes_written];
2536
  trail_nbits += simd_nbits % 8;
2537
  StoreLE64(output.data.get() + output.bytes_written, trail_bits);
2538
  size_t trail_bytes = trail_nbits / 8;
2539
  output.bits_in_buffer = trail_nbits % 8;
2540
  output.buffer = trail_bits >> (trail_bytes * 8);
2541
  output.bytes_written += trail_bytes;
2542
}
2543
2544
#endif
2545
2546
template <size_t n>
2547
0
FJXL_INLINE void StoreToWriter(const Bits32* bits, BitWriter& output) {
2548
#ifdef FJXL_AVX512
2549
  static_assert(n <= 2, "n should be less or 2 for AVX512");
2550
  StoreToWriterAVX512(bits[0], output);
2551
  if (n == 2) {
2552
    StoreToWriterAVX512(bits[1], output);
2553
  }
2554
  return;
2555
#endif
2556
0
  static_assert(n <= 4, "n should be less or 4");
2557
0
  alignas(64) uint64_t nbits64[Bits64::kLanes * n];
2558
0
  alignas(64) uint64_t bits64[Bits64::kLanes * n];
2559
0
  bits[0].Merge().Store(nbits64, bits64);
2560
0
  if (n > 1) {
2561
0
    bits[1].Merge().Store(nbits64 + Bits64::kLanes, bits64 + Bits64::kLanes);
2562
0
  }
2563
0
  if (n > 2) {
2564
0
    bits[2].Merge().Store(nbits64 + 2 * Bits64::kLanes,
2565
0
                          bits64 + 2 * Bits64::kLanes);
2566
0
  }
2567
0
  if (n > 3) {
2568
0
    bits[3].Merge().Store(nbits64 + 3 * Bits64::kLanes,
2569
0
                          bits64 + 3 * Bits64::kLanes);
2570
0
  }
2571
0
  output.WriteMultiple(nbits64, bits64, Bits64::kLanes * n);
2572
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::StoreToWriter<1ul>(AVX2::(anonymous namespace)::Bits32 const*, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::StoreToWriter<2ul>(AVX2::(anonymous namespace)::Bits32 const*, (anonymous namespace)::BitWriter&)
2573
2574
namespace detail {
2575
template <typename T>
2576
struct IntegerTypes;
2577
2578
template <>
2579
struct IntegerTypes<SIMDVec16> {
2580
  using signed_ = int16_t;
2581
  using unsigned_ = uint16_t;
2582
};
2583
2584
template <>
2585
struct IntegerTypes<SIMDVec32> {
2586
  using signed_ = int32_t;
2587
  using unsigned_ = uint32_t;
2588
};
2589
2590
template <typename T>
2591
struct SIMDType;
2592
2593
template <>
2594
struct SIMDType<int16_t> {
2595
  using type = SIMDVec16;
2596
};
2597
2598
template <>
2599
struct SIMDType<int32_t> {
2600
  using type = SIMDVec32;
2601
};
2602
2603
}  // namespace detail
2604
2605
template <typename T>
2606
using signed_t = typename detail::IntegerTypes<T>::signed_;
2607
2608
template <typename T>
2609
using unsigned_t = typename detail::IntegerTypes<T>::unsigned_;
2610
2611
template <typename T>
2612
using simd_t = typename detail::SIMDType<T>::type;
2613
2614
// This function will process exactly one vector worth of pixels.
2615
2616
template <typename T>
2617
size_t PredictPixels(const signed_t<T>* pixels, const signed_t<T>* pixels_left,
2618
                     const signed_t<T>* pixels_top,
2619
                     const signed_t<T>* pixels_topleft,
2620
0
                     unsigned_t<T>* residuals) {
2621
0
  T px = T::Load((unsigned_t<T>*)pixels);
2622
0
  T left = T::Load((unsigned_t<T>*)pixels_left);
2623
0
  T top = T::Load((unsigned_t<T>*)pixels_top);
2624
0
  T topleft = T::Load((unsigned_t<T>*)pixels_topleft);
2625
0
  T ac = left.Sub(topleft);
2626
0
  T ab = left.Sub(top);
2627
0
  T bc = top.Sub(topleft);
2628
0
  T grad = ac.Add(top);
2629
0
  T d = ab.Xor(bc);
2630
0
  T zero = T::Val(0);
2631
0
  T clamp = zero.Gt(d).IfThenElse(top, left);
2632
0
  T s = ac.Xor(bc);
2633
0
  T pred = zero.Gt(s).IfThenElse(grad, clamp);
2634
0
  T res = px.Sub(pred);
2635
0
  T res_times_2 = res.Add(res);
2636
0
  res = zero.Gt(res).IfThenElse(T::Val(-1).Sub(res_times_2), res_times_2);
2637
0
  res.Store(residuals);
2638
0
  return res.Eq(T::Val(0)).CountPrefix();
2639
0
}
Unexecuted instantiation: enc_fast_lossless.cc:unsigned long AVX2::(anonymous namespace)::PredictPixels<AVX2::(anonymous namespace)::SIMDVec16>(AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec16>::unsigned_*)
Unexecuted instantiation: enc_fast_lossless.cc:unsigned long AVX2::(anonymous namespace)::PredictPixels<AVX2::(anonymous namespace)::SIMDVec32>(AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::signed_ const*, AVX2::(anonymous namespace)::detail::IntegerTypes<AVX2::(anonymous namespace)::SIMDVec32>::unsigned_*)
2640
2641
#endif
2642
2643
void EncodeHybridUint000(uint32_t value, uint32_t* token, uint32_t* nbits,
2644
0
                         uint32_t* bits) {
2645
0
  uint32_t n = FloorLog2(value);
2646
0
  *token = value ? n + 1 : 0;
2647
0
  *nbits = value ? n : 0;
2648
0
  *bits = value ? value - (1 << n) : 0;
2649
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::EncodeHybridUint000(unsigned int, unsigned int*, unsigned int*, unsigned int*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::EncodeHybridUint000(unsigned int, unsigned int*, unsigned int*, unsigned int*)
2650
2651
#ifdef FJXL_AVX512
2652
constexpr static size_t kLogChunkSize = 5;
2653
#elif defined(FJXL_AVX2) || defined(FJXL_NEON)
2654
// Even if NEON only has 128-bit lanes, it is still significantly (~1.3x) faster
2655
// to process two vectors at a time.
2656
constexpr static size_t kLogChunkSize = 4;
2657
#else
2658
constexpr static size_t kLogChunkSize = 3;
2659
#endif
2660
2661
constexpr static size_t kChunkSize = 1 << kLogChunkSize;
2662
2663
template <typename Residual>
2664
void GenericEncodeChunk(const Residual* residuals, size_t n, size_t skip,
2665
0
                        const PrefixCode& code, BitWriter& output) {
2666
0
  for (size_t ix = skip; ix < n; ix++) {
2667
0
    unsigned token, nbits, bits;
2668
0
    EncodeHybridUint000(residuals[ix], &token, &nbits, &bits);
2669
0
    output.Write(code.raw_nbits[token] + nbits,
2670
0
                 code.raw_bits[token] | bits << code.raw_nbits[token]);
2671
0
  }
2672
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::GenericEncodeChunk<unsigned short>(unsigned short const*, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::GenericEncodeChunk<unsigned int>(unsigned int const*, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const&, (anonymous namespace)::BitWriter&)
2673
2674
struct UpTo8Bits {
2675
  size_t bitdepth;
2676
0
  explicit UpTo8Bits(size_t bitdepth) : bitdepth(bitdepth) {
2677
0
    assert(bitdepth <= 8);
2678
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::UpTo8Bits::UpTo8Bits(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::UpTo8Bits::UpTo8Bits(unsigned long)
2679
  // Here we can fit up to 9 extra bits + 7 Huffman bits in a u16; for all other
2680
  // symbols, we could actually go up to 8 Huffman bits as we have at most 8
2681
  // extra bits; however, the SIMD bit merging logic for AVX2 assumes that no
2682
  // Huffman length is 8 or more, so we cap at 8 anyway. Last symbol is used for
2683
  // LZ77 lengths and has no limitations except allowing to represent 32 symbols
2684
  // in total.
2685
  static constexpr uint8_t kMinRawLength[12] = {};
2686
  static constexpr uint8_t kMaxRawLength[12] = {
2687
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 10,
2688
  };
2689
0
  static size_t MaxEncodedBitsPerSample() { return 16; }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::UpTo8Bits::MaxEncodedBitsPerSample()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::UpTo8Bits::MaxEncodedBitsPerSample()
2690
  static constexpr size_t kInputBytes = 1;
2691
  using pixel_t = int16_t;
2692
  using upixel_t = uint16_t;
2693
2694
  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
2695
                             size_t n, uint8_t* nbits_simd,
2696
0
                             uint8_t* bits_simd) {
2697
0
    assert(n <= 16);
2698
0
    memcpy(nbits_simd, nbits, 16);
2699
0
    memcpy(bits_simd, bits, 16);
2700
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::UpTo8Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::UpTo8Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
2701
2702
#ifdef FJXL_GENERIC_SIMD
2703
  static void EncodeChunkSimd(upixel_t* residuals, size_t n, size_t skip,
2704
                              const uint8_t* raw_nbits_simd,
2705
0
                              const uint8_t* raw_bits_simd, BitWriter& output) {
2706
0
    Bits32 bits32[kChunkSize / SIMDVec16::kLanes];
2707
0
    alignas(64) uint16_t bits[SIMDVec16::kLanes];
2708
0
    alignas(64) uint16_t nbits[SIMDVec16::kLanes];
2709
0
    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
2710
0
    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
2711
0
    alignas(64) uint16_t token[SIMDVec16::kLanes];
2712
0
    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
2713
0
      TokenizeSIMD(residuals + i, token, nbits, bits);
2714
0
      HuffmanSIMDUpTo13(token, raw_nbits_simd, raw_bits_simd, nbits_huff,
2715
0
                        bits_huff);
2716
0
      StoreSIMDUpTo8(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
2717
0
                     std::max(skip, i) - i, bits32 + i / SIMDVec16::kLanes);
2718
0
    }
2719
0
    StoreToWriter<kChunkSize / SIMDVec16::kLanes>(bits32, output);
2720
0
  }
2721
#endif
2722
2723
0
  size_t NumSymbols(bool doing_ycocg_or_large_palette) const {
2724
    // values gain 1 bit for YCoCg, 1 bit for prediction.
2725
    // Maximum symbol is 1 + effective bit depth of residuals.
2726
0
    if (doing_ycocg_or_large_palette) {
2727
0
      return bitdepth + 3;
2728
0
    } else {
2729
0
      return bitdepth + 2;
2730
0
    }
2731
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::UpTo8Bits::NumSymbols(bool) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::UpTo8Bits::NumSymbols(bool) const
2732
};
2733
constexpr uint8_t UpTo8Bits::kMinRawLength[];
2734
constexpr uint8_t UpTo8Bits::kMaxRawLength[];
2735
2736
struct From9To13Bits {
2737
  size_t bitdepth;
2738
0
  explicit From9To13Bits(size_t bitdepth) : bitdepth(bitdepth) {
2739
0
    assert(bitdepth <= 13 && bitdepth >= 9);
2740
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::From9To13Bits::From9To13Bits(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::From9To13Bits::From9To13Bits(unsigned long)
2741
  // Last symbol is used for LZ77 lengths and has no limitations except allowing
2742
  // to represent 32 symbols in total.
2743
  // We cannot fit all the bits in a u16, so do not even try and use up to 8
2744
  // bits per raw symbol.
2745
  // There are at most 16 raw symbols, so Huffman coding can be SIMDfied without
2746
  // any special tricks.
2747
  static constexpr uint8_t kMinRawLength[17] = {};
2748
  static constexpr uint8_t kMaxRawLength[17] = {
2749
      8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 10,
2750
  };
2751
0
  static size_t MaxEncodedBitsPerSample() { return 21; }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::From9To13Bits::MaxEncodedBitsPerSample()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::From9To13Bits::MaxEncodedBitsPerSample()
2752
  static constexpr size_t kInputBytes = 2;
2753
  using pixel_t = int16_t;
2754
  using upixel_t = uint16_t;
2755
2756
  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
2757
                             size_t n, uint8_t* nbits_simd,
2758
0
                             uint8_t* bits_simd) {
2759
0
    assert(n <= 16);
2760
0
    memcpy(nbits_simd, nbits, 16);
2761
0
    memcpy(bits_simd, bits, 16);
2762
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::From9To13Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::From9To13Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
2763
2764
#ifdef FJXL_GENERIC_SIMD
2765
  static void EncodeChunkSimd(upixel_t* residuals, size_t n, size_t skip,
2766
                              const uint8_t* raw_nbits_simd,
2767
0
                              const uint8_t* raw_bits_simd, BitWriter& output) {
2768
0
    Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes];
2769
0
    alignas(64) uint16_t bits[SIMDVec16::kLanes];
2770
0
    alignas(64) uint16_t nbits[SIMDVec16::kLanes];
2771
0
    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
2772
0
    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
2773
0
    alignas(64) uint16_t token[SIMDVec16::kLanes];
2774
0
    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
2775
0
      TokenizeSIMD(residuals + i, token, nbits, bits);
2776
0
      HuffmanSIMDUpTo13(token, raw_nbits_simd, raw_bits_simd, nbits_huff,
2777
0
                        bits_huff);
2778
0
      StoreSIMDUpTo14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
2779
0
                      std::max(skip, i) - i,
2780
0
                      bits32 + 2 * i / SIMDVec16::kLanes);
2781
0
    }
2782
0
    StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output);
2783
0
  }
2784
#endif
2785
2786
0
  size_t NumSymbols(bool doing_ycocg_or_large_palette) const {
2787
    // values gain 1 bit for YCoCg, 1 bit for prediction.
2788
    // Maximum symbol is 1 + effective bit depth of residuals.
2789
0
    if (doing_ycocg_or_large_palette) {
2790
0
      return bitdepth + 3;
2791
0
    } else {
2792
0
      return bitdepth + 2;
2793
0
    }
2794
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::From9To13Bits::NumSymbols(bool) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::From9To13Bits::NumSymbols(bool) const
2795
};
2796
constexpr uint8_t From9To13Bits::kMinRawLength[];
2797
constexpr uint8_t From9To13Bits::kMaxRawLength[];
2798
2799
0
void CheckHuffmanBitsSIMD(int bits1, int nbits1, int bits2, int nbits2) {
2800
0
  assert(nbits1 == 8);
2801
0
  assert(nbits2 == 8);
2802
0
  assert(bits2 == (bits1 | 128));
2803
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::CheckHuffmanBitsSIMD(int, int, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::CheckHuffmanBitsSIMD(int, int, int, int)
2804
2805
struct Exactly14Bits {
2806
0
  explicit Exactly14Bits(size_t bitdepth_) { assert(bitdepth_ == 14); }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::Exactly14Bits::Exactly14Bits(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::Exactly14Bits::Exactly14Bits(unsigned long)
2807
  // Force LZ77 symbols to have at least 8 bits, and raw symbols 15 and 16 to
2808
  // have exactly 8, and no other symbol to have 8 or more. This ensures that
2809
  // the representation for 15 and 16 is identical up to one bit.
2810
  static constexpr uint8_t kMinRawLength[18] = {
2811
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 7,
2812
  };
2813
  static constexpr uint8_t kMaxRawLength[18] = {
2814
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 10,
2815
  };
2816
  static constexpr size_t bitdepth = 14;
2817
0
  static size_t MaxEncodedBitsPerSample() { return 22; }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::Exactly14Bits::MaxEncodedBitsPerSample()
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::Exactly14Bits::MaxEncodedBitsPerSample()
2818
  static constexpr size_t kInputBytes = 2;
2819
  using pixel_t = int16_t;
2820
  using upixel_t = uint16_t;
2821
2822
  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
2823
                             size_t n, uint8_t* nbits_simd,
2824
0
                             uint8_t* bits_simd) {
2825
0
    assert(n == 17);
2826
0
    CheckHuffmanBitsSIMD(bits[15], nbits[15], bits[16], nbits[16]);
2827
0
    memcpy(nbits_simd, nbits, 16);
2828
0
    memcpy(bits_simd, bits, 16);
2829
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::Exactly14Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::Exactly14Bits::PrepareForSimd(unsigned char const*, unsigned char const*, unsigned long, unsigned char*, unsigned char*)
2830
2831
#ifdef FJXL_GENERIC_SIMD
2832
  static void EncodeChunkSimd(upixel_t* residuals, size_t n, size_t skip,
2833
                              const uint8_t* raw_nbits_simd,
2834
0
                              const uint8_t* raw_bits_simd, BitWriter& output) {
2835
0
    Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes];
2836
0
    alignas(64) uint16_t bits[SIMDVec16::kLanes];
2837
0
    alignas(64) uint16_t nbits[SIMDVec16::kLanes];
2838
0
    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
2839
0
    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
2840
0
    alignas(64) uint16_t token[SIMDVec16::kLanes];
2841
0
    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
2842
0
      TokenizeSIMD(residuals + i, token, nbits, bits);
2843
0
      HuffmanSIMD14(token, raw_nbits_simd, raw_bits_simd, nbits_huff,
2844
0
                    bits_huff);
2845
0
      StoreSIMDUpTo14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
2846
0
                      std::max(skip, i) - i,
2847
0
                      bits32 + 2 * i / SIMDVec16::kLanes);
2848
0
    }
2849
0
    StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output);
2850
0
  }
2851
#endif
2852
2853
0
  // 17 symbols (matches the n == 17 expected by PrepareForSimd), regardless
  // of the flag.
  size_t NumSymbols(bool) const {
    constexpr size_t kSymbolCount = 17;
    return kSymbolCount;
  }
2854
};
2855
// Out-of-line definitions for the static constexpr arrays: required for
// ODR-uses before C++17 (redundant, but harmless, in C++17 and later).
constexpr uint8_t Exactly14Bits::kMinRawLength[];
constexpr uint8_t Exactly14Bits::kMaxRawLength[];
2857
2858
// Bit-depth policy for samples wider than 14 bits (i.e. 15 or 16 bits).
// Residuals use 32-bit pixel types, and the SIMD Huffman tables fold symbol
// pairs that share a representation (see PrepareForSimd).
struct MoreThan14Bits {
  size_t bitdepth;
  explicit MoreThan14Bits(size_t bitdepth) : bitdepth(bitdepth) {
    assert(bitdepth > 14);
    assert(bitdepth <= 16);
  }
  // Force LZ77 symbols to have at least 8 bits, and raw symbols 13 to 18 to
  // have exactly 8, and no other symbol to have 8 or more. This ensures that
  // the representation for (13, 14), (15, 16), (17, 18) is identical up to one
  // bit.
  static constexpr uint8_t kMinRawLength[20] = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 7,
  };
  static constexpr uint8_t kMaxRawLength[20] = {
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 10,
  };
  static size_t MaxEncodedBitsPerSample() { return 24; }
  static constexpr size_t kInputBytes = 2;
  using pixel_t = int32_t;
  using upixel_t = uint32_t;

  // Repacks the 19 raw-symbol table entries into 16 SIMD slots. Thanks to
  // the length constraints above, the pairs (13,14), (15,16) and (17,18)
  // have codes that are identical up to one bit (verified by
  // CheckHuffmanBitsSIMD), so each pair shares a slot.
  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
                             size_t n, uint8_t* nbits_simd,
                             uint8_t* bits_simd) {
    assert(n == 19);
    CheckHuffmanBitsSIMD(bits[13], nbits[13], bits[14], nbits[14]);
    CheckHuffmanBitsSIMD(bits[15], nbits[15], bits[16], nbits[16]);
    CheckHuffmanBitsSIMD(bits[17], nbits[17], bits[18], nbits[18]);
    for (size_t i = 0; i < 14; i++) {
      nbits_simd[i] = nbits[i];
      bits_simd[i] = bits[i];
    }
    // Slot 14 represents the (15,16) pair and slot 15 the (17,18) pair;
    // slot 13 already covers (13,14).
    nbits_simd[14] = nbits[15];
    bits_simd[14] = bits[15];
    nbits_simd[15] = nbits[17];
    bits_simd[15] = bits[17];
  }

#ifdef FJXL_GENERIC_SIMD
  // SIMD encoding of one chunk of residuals; `n` valid values, the first
  // `skip` of which are already covered by an RLE run. Same structure as the
  // Exactly14Bits variant, but with 32-bit token bits.
  static void EncodeChunkSimd(upixel_t* residuals, size_t n, size_t skip,
                              const uint8_t* raw_nbits_simd,
                              const uint8_t* raw_bits_simd, BitWriter& output) {
    Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes];
    alignas(64) uint32_t bits[SIMDVec16::kLanes];
    alignas(64) uint32_t nbits[SIMDVec16::kLanes];
    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
    alignas(64) uint16_t token[SIMDVec16::kLanes];
    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
      TokenizeSIMD(residuals + i, token, nbits, bits);
      HuffmanSIMDAbove14(token, raw_nbits_simd, raw_bits_simd, nbits_huff,
                         bits_huff);
      // std::max(x, i) - i clamps to 0 for vectors past the end of the data.
      StoreSIMDAbove14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
                       std::max(skip, i) - i,
                       bits32 + 2 * i / SIMDVec16::kLanes);
    }
    StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output);
  }
#endif

  // 19 symbols, matching the n == 19 expected by PrepareForSimd.
  size_t NumSymbols(bool) const { return 19; }
};
2919
// Out-of-line definitions for the static constexpr arrays: required for
// ODR-uses before C++17 (redundant, but harmless, in C++17 and later).
constexpr uint8_t MoreThan14Bits::kMinRawLength[];
constexpr uint8_t MoreThan14Bits::kMaxRawLength[];
2921
2922
// Writes the DC-global section shared by all fast-lossless frames: the
// global modular tree (one leaf per channel, gradient predictor for each),
// the LZ77 configuration, the context map, and the four prefix-code
// histograms in `code`. The exact sequence of Write() calls mirrors the
// JPEG XL codestream syntax and must not be reordered.
// When `is_single_group` is true, the writer is pre-sized to hold the
// whole image in this one section.
void PrepareDCGlobalCommon(bool is_single_group, size_t width, size_t height,
                           const PrefixCode code[4], BitWriter* output) {
  output->Allocate(100000 + (is_single_group ? width * height * 16 : 0));
  // No patches, spline or noise.
  output->Write(1, 1);  // default DC dequantization factors (?)
  output->Write(1, 1);  // use global tree / histograms
  output->Write(1, 0);  // no lz77 for the tree

  output->Write(1, 1);         // simple code for the tree's context map
  output->Write(2, 0);         // all contexts clustered together
  output->Write(1, 1);         // use prefix code for tree
  output->Write(4, 0);         // 000 hybrid uint
  output->Write(6, 0b100011);  // Alphabet size is 4 (var16)
  output->Write(2, 1);         // simple prefix code
  output->Write(2, 3);         // with 4 symbols
  output->Write(2, 0);
  output->Write(2, 1);
  output->Write(2, 2);
  output->Write(2, 3);
  output->Write(1, 0);  // First tree encoding option

  // Huffman table + extra bits for the tree.
  uint8_t symbol_bits[6] = {0b00, 0b10, 0b001, 0b101, 0b0011, 0b0111};
  uint8_t symbol_nbits[6] = {2, 2, 3, 3, 4, 4};
  // Write a tree with a leaf per channel, and gradient predictor for every
  // leaf.
  for (auto v : {1, 2, 1, 4, 1, 0, 0, 5, 0, 0, 0, 0, 5,
                 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0, 0, 0}) {
    output->Write(symbol_nbits[v], symbol_bits[v]);
  }

  output->Write(1, 1);     // Enable lz77 for the main bitstream
  output->Write(2, 0b00);  // lz77 offset 224
  static_assert(kLZ77Offset == 224, "kLZ77Offset should be 224");
  output->Write(4, 0b1010);  // lz77 min length 7
  // 400 hybrid uint config for lz77
  output->Write(4, 4);
  output->Write(3, 0);
  output->Write(3, 0);

  output->Write(1, 1);  // simple code for the context map
  output->Write(2, 3);  // 3 bits per entry
  output->Write(3, 4);  // channel 3
  output->Write(3, 3);  // channel 2
  output->Write(3, 2);  // channel 1
  output->Write(3, 1);  // channel 0
  output->Write(3, 0);  // distance histogram first

  output->Write(1, 1);  // use prefix codes
  output->Write(4, 0);  // 000 hybrid uint config for distances (only need 0)
  for (size_t i = 0; i < 4; i++) {
    output->Write(4, 0);  // 000 hybrid uint config for symbols (only <= 10)
  }

  // Distance alphabet size:
  output->Write(5, 0b00001);  // 2: just need 1 for RLE (i.e. distance 1)
  // Symbol + LZ77 alphabet size:
  for (size_t i = 0; i < 4; i++) {
    output->Write(1, 1);    // > 1
    output->Write(4, 8);    // <= 512
    output->Write(8, 256);  // == 512
  }

  // Distance histogram:
  output->Write(2, 1);  // simple prefix code
  output->Write(2, 0);  // with one symbol
  output->Write(1, 1);  // 1

  // Symbol + lz77 histogram:
  for (size_t i = 0; i < 4; i++) {
    code[i].WriteTo(output);
  }

  // Group header for global modular image.
  output->Write(1, 1);  // Global tree
  output->Write(1, 1);  // All default wp
}
2999
3000
void PrepareDCGlobal(bool is_single_group, size_t width, size_t height,
3001
                     size_t nb_chans, const PrefixCode code[4],
3002
0
                     BitWriter* output) {
3003
0
  PrepareDCGlobalCommon(is_single_group, width, height, code, output);
3004
0
  if (nb_chans > 2) {
3005
0
    output->Write(2, 0b01);     // 1 transform
3006
0
    output->Write(2, 0b00);     // RCT
3007
0
    output->Write(5, 0b00000);  // Starting from ch 0
3008
0
    output->Write(2, 0b00);     // YCoCg
3009
0
  } else {
3010
0
    output->Write(2, 0b00);  // no transforms
3011
0
  }
3012
0
  if (!is_single_group) {
3013
0
    output->ZeroPadToByte();
3014
0
  }
3015
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::PrepareDCGlobal(bool, unsigned long, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, (anonymous namespace)::BitWriter*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::PrepareDCGlobal(bool, unsigned long, unsigned long, unsigned long, (anonymous namespace)::PrefixCode const*, (anonymous namespace)::BitWriter*)
3016
3017
// Emits tokenized residual chunks for one channel into `output`, using the
// prefix codes in `code`. BitDepth selects the per-bit-depth tokenizer and
// SIMD table layout.
template <typename BitDepth>
struct ChunkEncoder {
  // Copies the raw-symbol tables into the SIMD-friendly layout; must run
  // before Chunk() when the SIMD path is compiled in.
  void PrepareForSimd() {
    BitDepth::PrepareForSimd(code->raw_nbits, code->raw_bits, code->numraw,
                             raw_nbits_simd, raw_bits_simd);
  }
  // Emits an LZ77 run of length `count` (no-op when count == 0). The run is
  // encoded as raw symbol 0 followed by an LZ77 length token.
  FJXL_INLINE static void EncodeRle(size_t count, const PrefixCode& code,
                                    BitWriter& output) {
    if (count == 0) return;
    // Lengths are biased by the minimum run length before tokenization.
    count -= kLZ77MinLength + 1;
    if (count < kLZ77CacheSize) {
      // Fast path: pre-assembled bit pattern for short runs.
      output.Write(code.lz77_cache_nbits[count], code.lz77_cache_bits[count]);
    } else {
      unsigned token, nbits, bits;
      EncodeHybridUintLZ77(count, &token, &nbits, &bits);
      uint64_t wbits = bits;
      // Pack (extra bits | lz77 token | raw symbol 0) with the raw symbol in
      // the low bits, so a single Write() suffices (the BitWriter presumably
      // emits low-order bits first — TODO confirm against BitWriter::Write).
      wbits = (wbits << code.lz77_nbits[token]) | code.lz77_bits[token];
      wbits = (wbits << code.raw_nbits[0]) | code.raw_bits[0];
      output.Write(code.lz77_nbits[token] + nbits + code.raw_nbits[0], wbits);
    }
  }

  // Encodes one chunk: first the pending run of length `run`, then the `n`
  // residuals, of which the first `skip` are already covered by the run.
  FJXL_INLINE void Chunk(size_t run, typename BitDepth::upixel_t* residuals,
                         size_t skip, size_t n) {
    EncodeRle(run, *code, *output);
#ifdef FJXL_GENERIC_SIMD
    BitDepth::EncodeChunkSimd(residuals, n, skip, raw_nbits_simd, raw_bits_simd,
                              *output);
#else
    GenericEncodeChunk(residuals, n, skip, *code, *output);
#endif
  }

  // Flushes the run left pending at the end of the channel.
  inline void Finalize(size_t run) { EncodeRle(run, *code, *output); }

  const PrefixCode* code;
  BitWriter* output;
  // Raw-symbol Huffman tables repacked for SIMD lookup (see PrepareForSimd).
  alignas(64) uint8_t raw_nbits_simd[16] = {};
  alignas(64) uint8_t raw_bits_simd[16] = {};
};
3057
3058
template <typename BitDepth>
3059
struct ChunkSampleCollector {
3060
0
  FJXL_INLINE void Rle(size_t count, uint64_t* lz77_counts_) {
3061
0
    if (count == 0) return;
3062
0
    raw_counts[0] += 1;
3063
0
    count -= kLZ77MinLength + 1;
3064
0
    unsigned token, nbits, bits;
3065
0
    EncodeHybridUintLZ77(count, &token, &nbits, &bits);
3066
0
    lz77_counts_[token]++;
3067
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>::Rle(unsigned long, unsigned long*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>::Rle(unsigned long, unsigned long*)
3068
3069
  FJXL_INLINE void Chunk(size_t run, typename BitDepth::upixel_t* residuals,
3070
0
                         size_t skip, size_t n) {
3071
    // Run is broken. Encode the run and encode the individual vector.
3072
0
    Rle(run, lz77_counts);
3073
0
    for (size_t ix = skip; ix < n; ix++) {
3074
0
      unsigned token, nbits, bits;
3075
0
      EncodeHybridUint000(residuals[ix], &token, &nbits, &bits);
3076
0
      raw_counts[token]++;
3077
0
    }
3078
0
  }
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>::Chunk(unsigned long, unsigned int*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>::Chunk(unsigned long, unsigned short*, unsigned long, unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>::Chunk(unsigned long, unsigned int*, unsigned long, unsigned long)
3079
3080
  // don't count final run since we don't know how long it really is
3081
0
  void Finalize(size_t run) {}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>::Finalize(unsigned long)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>::Finalize(unsigned long)
3082
3083
  uint64_t* raw_counts;
3084
  uint64_t* lz77_counts;
3085
};
3086
3087
0
// Zigzag-maps a signed value onto the unsigned range so small magnitudes
// stay small: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, ...
constexpr uint32_t PackSigned(int32_t value) {
  // (0u - sign) is all-ones for negative inputs and zero otherwise, turning
  // the xor into a conditional bitwise complement of the shifted value.
  return (static_cast<uint32_t>(value) << 1) ^
         (0u - (static_cast<uint32_t>(value) >> 31));
}
3091
3092
template <typename T, typename BitDepth>
3093
struct ChannelRowProcessor {
  using upixel_t = typename BitDepth::upixel_t;
  using pixel_t = typename BitDepth::pixel_t;
  // Downstream consumer of residual chunks (a chunk encoder or a sample
  // collector, judging by the instantiations); not owned.
  T* t;
  // Predicts up to kChunkSize pixels, packs the signed prediction residuals,
  // and forwards them to `t`. Maximal prefixes of zero residuals are folded
  // into a running zero-run (`run`) so they can be emitted as an LZ77-style
  // match. `n` is the number of valid pixels in this chunk; residual slots
  // past `n` are still computed but clipped out via std::min below.
  void ProcessChunk(const pixel_t* row, const pixel_t* row_left,
                    const pixel_t* row_top, const pixel_t* row_topleft,
                    size_t n) {
    alignas(64) upixel_t residuals[kChunkSize] = {};
    // Length of the chunk's leading run of zero residuals so far.
    size_t prefix_size = 0;
    // Number of residuals examined so far; prefix_size only grows while it
    // still equals this (i.e. while the zero-prefix is unbroken).
    size_t required_prefix_size = 0;
#ifdef FJXL_GENERIC_SIMD
    constexpr size_t kNum =
        sizeof(pixel_t) == 2 ? SIMDVec16::kLanes : SIMDVec32::kLanes;
    for (size_t ix = 0; ix < kChunkSize; ix += kNum) {
      // `c` is the count PredictPixels reports for this vector (presumably
      // its leading zero residuals — semantics defined where PredictPixels
      // is declared); it extends the prefix only while still unbroken.
      size_t c =
          PredictPixels<simd_t<pixel_t>>(row + ix, row_left + ix, row_top + ix,
                                         row_topleft + ix, residuals + ix);
      prefix_size =
          prefix_size == required_prefix_size ? prefix_size + c : prefix_size;
      required_prefix_size += kNum;
    }
#else
    // Scalar fallback: gradient predictor with clamping, selected per pixel.
    for (size_t ix = 0; ix < kChunkSize; ix++) {
      pixel_t px = row[ix];
      pixel_t left = row_left[ix];
      pixel_t top = row_top[ix];
      pixel_t topleft = row_topleft[ix];
      pixel_t ac = left - topleft;
      pixel_t ab = left - top;
      pixel_t bc = top - topleft;
      // grad = left + top - topleft, with the addition done in the unsigned
      // type so wrap-around is well defined.
      pixel_t grad = static_cast<pixel_t>(static_cast<upixel_t>(ac) +
                                          static_cast<upixel_t>(top));
      pixel_t d = ab ^ bc;
      // Sign tricks: pick top/left when the neighbor differences disagree in
      // sign, otherwise fall back to the gradient prediction.
      pixel_t clamp = d < 0 ? top : left;
      pixel_t s = ac ^ bc;
      pixel_t pred = s < 0 ? grad : clamp;
      residuals[ix] = PackSigned(px - pred);
      prefix_size = prefix_size == required_prefix_size
                        ? prefix_size + (residuals[ix] == 0)
                        : prefix_size;
      required_prefix_size += 1;
    }
#endif
    // Only the first `n` residuals are real pixels.
    prefix_size = std::min(n, prefix_size);
    if (prefix_size == n && (run > 0 || prefix_size > kLZ77MinLength)) {
      // Run continues, nothing to do.
      run += prefix_size;
    } else if (prefix_size + run > kLZ77MinLength) {
      // Run is broken. Encode the run and encode the individual vector.
      t->Chunk(run + prefix_size, residuals, prefix_size, n);
      run = 0;
    } else {
      // There was no run to begin with.
      t->Chunk(0, residuals, 0, n);
    }
  }

  // Feeds a whole row of `xs` pixels to ProcessChunk in kChunkSize slices.
  void ProcessRow(const pixel_t* row, const pixel_t* row_left,
                  const pixel_t* row_top, const pixel_t* row_topleft,
                  size_t xs) {
    for (size_t x = 0; x < xs; x += kChunkSize) {
      ProcessChunk(row + x, row_left + x, row_top + x, row_topleft + x,
                   std::min(kChunkSize, xs - x));
    }
  }

  // Flushes any still-open zero-run to the consumer.
  void Finalize() { t->Finalize(run); }
  // Invariant: run == 0 or run > kLZ77MinLength.
  size_t run = 0;
};
3163
3164
0
// Reads a 16-bit little-endian value from an unaligned byte pointer.
uint16_t LoadLE16(const unsigned char* ptr) {
  const uint16_t lo = ptr[0];
  const uint16_t hi = ptr[1];
  return static_cast<uint16_t>(lo | (hi << 8));
}
3167
3168
0
// Swaps the two bytes of a 16-bit value (LE <-> BE).
uint16_t SwapEndian(uint16_t in) {
  return static_cast<uint16_t>((in << 8) | (in >> 8));
}
3169
3170
#ifdef FJXL_GENERIC_SIMD
// Stores one vector of 16-bit pixels into a signed 16-bit destination row.
// reinterpret_cast (not a C-style cast) for consistency with the SIMD
// StoreYCoCg overloads below.
void StorePixels(SIMDVec16 p, int16_t* dest) {
  p.Store(reinterpret_cast<uint16_t*>(dest));
}

// Stores one vector of 16-bit pixels into a 32-bit destination row by
// upcasting to two 32-bit half-vectors (low lanes first, then high).
void StorePixels(SIMDVec16 p, int32_t* dest) {
  VecPair<SIMDVec32> p_up = p.Upcast();
  p_up.low.Store(reinterpret_cast<uint32_t*>(dest));
  p_up.hi.Store(reinterpret_cast<uint32_t*>(dest) + SIMDVec32::kLanes);
}
#endif
3179
3180
// Expands an 8-bit grayscale row of `oxs` samples into `luma`.
template <typename pixel_t>
void FillRowG8(const unsigned char* rgba, size_t oxs, pixel_t* luma) {
  size_t i = 0;
#ifdef FJXL_GENERIC_SIMD
  // Vectorized bulk of the row.
  for (; i + SIMDVec16::kLanes <= oxs; i += SIMDVec16::kLanes) {
    auto px = SIMDVec16::LoadG8(rgba + i);
    StorePixels(px[0], luma + i);
  }
#endif
  // Scalar tail (the whole row when SIMD is unavailable).
  while (i < oxs) {
    luma[i] = rgba[i];
    ++i;
  }
}
3193
3194
template <bool big_endian, typename pixel_t>
3195
0
void FillRowG16(const unsigned char* rgba, size_t oxs, pixel_t* luma) {
3196
0
  size_t x = 0;
3197
#ifdef FJXL_GENERIC_SIMD
3198
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3199
0
    auto rgb = SIMDVec16::LoadG16(rgba + 2 * x);
3200
0
    if (big_endian) {
3201
0
      rgb[0].SwapEndian();
3202
0
    }
3203
0
    StorePixels(rgb[0], luma + x);
3204
0
  }
3205
#endif
3206
0
  for (; x < oxs; x++) {
3207
0
    uint16_t val = LoadLE16(rgba + 2 * x);
3208
0
    if (big_endian) {
3209
0
      val = SwapEndian(val);
3210
0
    }
3211
0
    luma[x] = val;
3212
0
  }
3213
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG16<true, short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG16<false, short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG16<true, int>(unsigned char const*, unsigned long, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowG16<false, int>(unsigned char const*, unsigned long, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG16<true, short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG16<false, short>(unsigned char const*, unsigned long, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG16<true, int>(unsigned char const*, unsigned long, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowG16<false, int>(unsigned char const*, unsigned long, int*)
3214
3215
// Deinterleaves an 8-bit gray+alpha row into separate `luma` and `alpha`
// planes.
template <typename pixel_t>
void FillRowGA8(const unsigned char* rgba, size_t oxs, pixel_t* luma,
                pixel_t* alpha) {
  size_t i = 0;
#ifdef FJXL_GENERIC_SIMD
  for (; i + SIMDVec16::kLanes <= oxs; i += SIMDVec16::kLanes) {
    auto px = SIMDVec16::LoadGA8(rgba + 2 * i);
    StorePixels(px[0], luma + i);
    StorePixels(px[1], alpha + i);
  }
#endif
  for (; i < oxs; ++i) {
    const unsigned char* p = rgba + 2 * i;
    luma[i] = p[0];
    alpha[i] = p[1];
  }
}
3231
3232
template <bool big_endian, typename pixel_t>
3233
void FillRowGA16(const unsigned char* rgba, size_t oxs, pixel_t* luma,
3234
0
                 pixel_t* alpha) {
3235
0
  size_t x = 0;
3236
#ifdef FJXL_GENERIC_SIMD
3237
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3238
0
    auto rgb = SIMDVec16::LoadGA16(rgba + 4 * x);
3239
0
    if (big_endian) {
3240
0
      rgb[0].SwapEndian();
3241
0
      rgb[1].SwapEndian();
3242
0
    }
3243
0
    StorePixels(rgb[0], luma + x);
3244
0
    StorePixels(rgb[1], alpha + x);
3245
0
  }
3246
#endif
3247
0
  for (; x < oxs; x++) {
3248
0
    uint16_t l = LoadLE16(rgba + 4 * x);
3249
0
    uint16_t a = LoadLE16(rgba + 4 * x + 2);
3250
0
    if (big_endian) {
3251
0
      l = SwapEndian(l);
3252
0
      a = SwapEndian(a);
3253
0
    }
3254
0
    luma[x] = l;
3255
0
    alpha[x] = a;
3256
0
  }
3257
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA16<true, short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA16<false, short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA16<true, int>(unsigned char const*, unsigned long, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowGA16<false, int>(unsigned char const*, unsigned long, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA16<true, short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA16<false, short>(unsigned char const*, unsigned long, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA16<true, int>(unsigned char const*, unsigned long, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowGA16<false, int>(unsigned char const*, unsigned long, int*, int*)
3258
3259
// Applies the reversible YCoCg-R forward transform to one RGB sample and
// writes the three results through the given pointers.
template <typename pixel_t>
void StoreYCoCg(pixel_t r, pixel_t g, pixel_t b, pixel_t* y, pixel_t* co,
                pixel_t* cg) {
  const pixel_t co_val = r - b;
  const pixel_t half_sum = b + (co_val >> 1);
  const pixel_t cg_val = g - half_sum;
  *co = co_val;
  *cg = cg_val;
  *y = half_sum + (cg_val >> 1);
}
3267
3268
#ifdef FJXL_GENERIC_SIMD
3269
void StoreYCoCg(SIMDVec16 r, SIMDVec16 g, SIMDVec16 b, int16_t* y, int16_t* co,
3270
0
                int16_t* cg) {
3271
0
  SIMDVec16 co_v = r.Sub(b);
3272
0
  SIMDVec16 tmp = b.Add(co_v.SignedShiftRight<1>());
3273
0
  SIMDVec16 cg_v = g.Sub(tmp);
3274
0
  SIMDVec16 y_v = tmp.Add(cg_v.SignedShiftRight<1>());
3275
0
  y_v.Store(reinterpret_cast<uint16_t*>(y));
3276
0
  co_v.Store(reinterpret_cast<uint16_t*>(co));
3277
0
  cg_v.Store(reinterpret_cast<uint16_t*>(cg));
3278
0
}
3279
3280
void StoreYCoCg(SIMDVec16 r, SIMDVec16 g, SIMDVec16 b, int32_t* y, int32_t* co,
3281
0
                int32_t* cg) {
3282
0
  VecPair<SIMDVec32> r_up = r.Upcast();
3283
0
  VecPair<SIMDVec32> g_up = g.Upcast();
3284
0
  VecPair<SIMDVec32> b_up = b.Upcast();
3285
0
  SIMDVec32 co_lo_v = r_up.low.Sub(b_up.low);
3286
0
  SIMDVec32 tmp_lo = b_up.low.Add(co_lo_v.SignedShiftRight<1>());
3287
0
  SIMDVec32 cg_lo_v = g_up.low.Sub(tmp_lo);
3288
0
  SIMDVec32 y_lo_v = tmp_lo.Add(cg_lo_v.SignedShiftRight<1>());
3289
0
  SIMDVec32 co_hi_v = r_up.hi.Sub(b_up.hi);
3290
0
  SIMDVec32 tmp_hi = b_up.hi.Add(co_hi_v.SignedShiftRight<1>());
3291
0
  SIMDVec32 cg_hi_v = g_up.hi.Sub(tmp_hi);
3292
0
  SIMDVec32 y_hi_v = tmp_hi.Add(cg_hi_v.SignedShiftRight<1>());
3293
0
  y_lo_v.Store(reinterpret_cast<uint32_t*>(y));
3294
0
  co_lo_v.Store(reinterpret_cast<uint32_t*>(co));
3295
0
  cg_lo_v.Store(reinterpret_cast<uint32_t*>(cg));
3296
0
  y_hi_v.Store(reinterpret_cast<uint32_t*>(y) + SIMDVec32::kLanes);
3297
0
  co_hi_v.Store(reinterpret_cast<uint32_t*>(co) + SIMDVec32::kLanes);
3298
0
  cg_hi_v.Store(reinterpret_cast<uint32_t*>(cg) + SIMDVec32::kLanes);
3299
0
}
3300
#endif
3301
3302
template <typename pixel_t>
3303
void FillRowRGB8(const unsigned char* rgba, size_t oxs, pixel_t* y, pixel_t* co,
3304
0
                 pixel_t* cg) {
3305
0
  size_t x = 0;
3306
#ifdef FJXL_GENERIC_SIMD
3307
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3308
0
    auto rgb = SIMDVec16::LoadRGB8(rgba + 3 * x);
3309
0
    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
3310
0
  }
3311
#endif
3312
0
  for (; x < oxs; x++) {
3313
0
    uint16_t r = rgba[3 * x];
3314
0
    uint16_t g = rgba[3 * x + 1];
3315
0
    uint16_t b = rgba[3 * x + 2];
3316
0
    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
3317
0
  }
3318
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB8<short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB8<short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB8<int>(unsigned char const*, unsigned long, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB8<int>(unsigned char const*, unsigned long, int*, int*, int*)
3319
3320
template <bool big_endian, typename pixel_t>
3321
void FillRowRGB16(const unsigned char* rgba, size_t oxs, pixel_t* y,
3322
0
                  pixel_t* co, pixel_t* cg) {
3323
0
  size_t x = 0;
3324
#ifdef FJXL_GENERIC_SIMD
3325
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3326
0
    auto rgb = SIMDVec16::LoadRGB16(rgba + 6 * x);
3327
0
    if (big_endian) {
3328
0
      rgb[0].SwapEndian();
3329
0
      rgb[1].SwapEndian();
3330
0
      rgb[2].SwapEndian();
3331
0
    }
3332
0
    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
3333
0
  }
3334
#endif
3335
0
  for (; x < oxs; x++) {
3336
0
    uint16_t r = LoadLE16(rgba + 6 * x);
3337
0
    uint16_t g = LoadLE16(rgba + 6 * x + 2);
3338
0
    uint16_t b = LoadLE16(rgba + 6 * x + 4);
3339
0
    if (big_endian) {
3340
0
      r = SwapEndian(r);
3341
0
      g = SwapEndian(g);
3342
0
      b = SwapEndian(b);
3343
0
    }
3344
0
    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
3345
0
  }
3346
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB16<true, short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB16<false, short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB16<true, int>(unsigned char const*, unsigned long, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGB16<false, int>(unsigned char const*, unsigned long, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB16<true, short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB16<false, short>(unsigned char const*, unsigned long, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB16<true, int>(unsigned char const*, unsigned long, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGB16<false, int>(unsigned char const*, unsigned long, int*, int*, int*)
3347
3348
template <typename pixel_t>
3349
void FillRowRGBA8(const unsigned char* rgba, size_t oxs, pixel_t* y,
3350
0
                  pixel_t* co, pixel_t* cg, pixel_t* alpha) {
3351
0
  size_t x = 0;
3352
#ifdef FJXL_GENERIC_SIMD
3353
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3354
0
    auto rgb = SIMDVec16::LoadRGBA8(rgba + 4 * x);
3355
0
    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
3356
0
    StorePixels(rgb[3], alpha + x);
3357
0
  }
3358
#endif
3359
0
  for (; x < oxs; x++) {
3360
0
    uint16_t r = rgba[4 * x];
3361
0
    uint16_t g = rgba[4 * x + 1];
3362
0
    uint16_t b = rgba[4 * x + 2];
3363
0
    uint16_t a = rgba[4 * x + 3];
3364
0
    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
3365
0
    alpha[x] = a;
3366
0
  }
3367
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA8<short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA8<short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA8<int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA8<int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
3368
3369
template <bool big_endian, typename pixel_t>
3370
void FillRowRGBA16(const unsigned char* rgba, size_t oxs, pixel_t* y,
3371
0
                   pixel_t* co, pixel_t* cg, pixel_t* alpha) {
3372
0
  size_t x = 0;
3373
#ifdef FJXL_GENERIC_SIMD
3374
0
  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
3375
0
    auto rgb = SIMDVec16::LoadRGBA16(rgba + 8 * x);
3376
0
    if (big_endian) {
3377
0
      rgb[0].SwapEndian();
3378
0
      rgb[1].SwapEndian();
3379
0
      rgb[2].SwapEndian();
3380
0
      rgb[3].SwapEndian();
3381
0
    }
3382
0
    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
3383
0
    StorePixels(rgb[3], alpha + x);
3384
0
  }
3385
#endif
3386
0
  for (; x < oxs; x++) {
3387
0
    uint16_t r = LoadLE16(rgba + 8 * x);
3388
0
    uint16_t g = LoadLE16(rgba + 8 * x + 2);
3389
0
    uint16_t b = LoadLE16(rgba + 8 * x + 4);
3390
0
    uint16_t a = LoadLE16(rgba + 8 * x + 6);
3391
0
    if (big_endian) {
3392
0
      r = SwapEndian(r);
3393
0
      g = SwapEndian(g);
3394
0
      b = SwapEndian(b);
3395
0
      a = SwapEndian(a);
3396
0
    }
3397
0
    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
3398
0
    alpha[x] = a;
3399
0
  }
3400
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA16<true, short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA16<false, short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA16<true, int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowRGBA16<false, int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA16<true, short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA16<false, short>(unsigned char const*, unsigned long, short*, short*, short*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA16<true, int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowRGBA16<false, int>(unsigned char const*, unsigned long, int*, int*, int*, int*)
3401
3402
template <typename Processor, typename BitDepth>
3403
void ProcessImageArea(const unsigned char* rgba, size_t x0, size_t y0,
3404
                      size_t xs, size_t yskip, size_t ys, size_t row_stride,
3405
                      BitDepth bitdepth, size_t nb_chans, bool big_endian,
3406
0
                      Processor* processors) {
3407
0
  constexpr size_t kPadding = 32;
3408
3409
0
  using pixel_t = typename BitDepth::pixel_t;
3410
3411
0
  constexpr size_t kAlign = 64;
3412
0
  constexpr size_t kAlignPixels = kAlign / sizeof(pixel_t);
3413
3414
0
  auto align = [=](pixel_t* ptr) {
3415
0
    size_t offset = reinterpret_cast<size_t>(ptr) % kAlign;
3416
0
    if (offset) {
3417
0
      ptr += offset / sizeof(pixel_t);
3418
0
    }
3419
0
    return ptr;
3420
0
  };
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>*)::{lambda(int*)#1}::operator()(int*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>*)::{lambda(int*)#1}::operator()(int*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>*)::{lambda(int*)#1}::operator()(int*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>*)::{lambda(short*)#1}::operator()(short*) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>*)::{lambda(int*)#1}::operator()(int*) const
3421
3422
0
  constexpr size_t kNumPx =
3423
0
      (256 + kPadding * 2 + kAlignPixels + kAlignPixels - 1) / kAlignPixels *
3424
0
      kAlignPixels;
3425
3426
0
  std::vector<std::array<std::array<pixel_t, kNumPx>, 2>> group_data(nb_chans);
3427
3428
0
  for (size_t y = 0; y < ys; y++) {
3429
0
    const auto rgba_row =
3430
0
        rgba + row_stride * (y0 + y) + x0 * nb_chans * BitDepth::kInputBytes;
3431
0
    pixel_t* crow[4] = {};
3432
0
    pixel_t* prow[4] = {};
3433
0
    for (size_t i = 0; i < nb_chans; i++) {
3434
0
      crow[i] = align(&group_data[i][y & 1][kPadding]);
3435
0
      prow[i] = align(&group_data[i][(y - 1) & 1][kPadding]);
3436
0
    }
3437
3438
    // Pre-fill rows with YCoCg converted pixels.
3439
0
    if (nb_chans == 1) {
3440
0
      if (BitDepth::kInputBytes == 1) {
3441
0
        FillRowG8(rgba_row, xs, crow[0]);
3442
0
      } else if (big_endian) {
3443
0
        FillRowG16</*big_endian=*/true>(rgba_row, xs, crow[0]);
3444
0
      } else {
3445
0
        FillRowG16</*big_endian=*/false>(rgba_row, xs, crow[0]);
3446
0
      }
3447
0
    } else if (nb_chans == 2) {
3448
0
      if (BitDepth::kInputBytes == 1) {
3449
0
        FillRowGA8(rgba_row, xs, crow[0], crow[1]);
3450
0
      } else if (big_endian) {
3451
0
        FillRowGA16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1]);
3452
0
      } else {
3453
0
        FillRowGA16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1]);
3454
0
      }
3455
0
    } else if (nb_chans == 3) {
3456
0
      if (BitDepth::kInputBytes == 1) {
3457
0
        FillRowRGB8(rgba_row, xs, crow[0], crow[1], crow[2]);
3458
0
      } else if (big_endian) {
3459
0
        FillRowRGB16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1],
3460
0
                                          crow[2]);
3461
0
      } else {
3462
0
        FillRowRGB16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1],
3463
0
                                           crow[2]);
3464
0
      }
3465
0
    } else {
3466
0
      if (BitDepth::kInputBytes == 1) {
3467
0
        FillRowRGBA8(rgba_row, xs, crow[0], crow[1], crow[2], crow[3]);
3468
0
      } else if (big_endian) {
3469
0
        FillRowRGBA16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1],
3470
0
                                           crow[2], crow[3]);
3471
0
      } else {
3472
0
        FillRowRGBA16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1],
3473
0
                                            crow[2], crow[3]);
3474
0
      }
3475
0
    }
3476
    // Deal with x == 0.
3477
0
    for (size_t c = 0; c < nb_chans; c++) {
3478
0
      *(crow[c] - 1) = y > 0 ? *(prow[c]) : 0;
3479
      // Fix topleft.
3480
0
      *(prow[c] - 1) = y > 0 ? *(prow[c]) : 0;
3481
0
    }
3482
0
    if (y < yskip) continue;
3483
0
    for (size_t c = 0; c < nb_chans; c++) {
3484
      // Get pointers to px/left/top/topleft data to speedup loop.
3485
0
      const pixel_t* row = crow[c];
3486
0
      const pixel_t* row_left = crow[c] - 1;
3487
0
      const pixel_t* row_top = y == 0 ? row_left : prow[c];
3488
0
      const pixel_t* row_topleft = y == 0 ? row_left : prow[c] - 1;
3489
3490
0
      processors[c].ProcessRow(row, row_left, row_top, row_topleft, xs);
3491
0
    }
3492
0
  }
3493
0
  for (size_t c = 0; c < nb_chans; c++) {
3494
0
    processors[c].Finalize();
3495
0
  }
3496
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkSampleCollector<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkSampleCollector<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::UpTo8Bits>, AVX2::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::From9To13Bits>, AVX2::(anonymous namespace)::From9To13Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::Exactly14Bits>, AVX2::(anonymous namespace)::Exactly14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::ProcessImageArea<AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, AVX2::(anonymous namespace)::ChannelRowProcessor<AVX2::(anonymous namespace)::ChunkEncoder<AVX2::(anonymous namespace)::MoreThan14Bits>, AVX2::(anonymous namespace)::MoreThan14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::UpTo8Bits>, default_implementation::(anonymous namespace)::UpTo8Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::From9To13Bits>, default_implementation::(anonymous namespace)::From9To13Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::Exactly14Bits>, default_implementation::(anonymous namespace)::Exactly14Bits>*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::ProcessImageArea<default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, default_implementation::(anonymous namespace)::ChannelRowProcessor<default_implementation::(anonymous namespace)::ChunkEncoder<default_implementation::(anonymous namespace)::MoreThan14Bits>, default_implementation::(anonymous namespace)::MoreThan14Bits>*)
3497
3498
template <typename BitDepth>
3499
void WriteACSection(const unsigned char* rgba, size_t x0, size_t y0, size_t xs,
3500
                    size_t ys, size_t row_stride, bool is_single_group,
3501
                    BitDepth bitdepth, size_t nb_chans, bool big_endian,
3502
                    const PrefixCode code[4],
3503
0
                    std::array<BitWriter, 4>& output) {
3504
0
  for (size_t i = 0; i < nb_chans; i++) {
3505
0
    if (is_single_group && i == 0) continue;
3506
0
    output[i].Allocate(xs * ys * bitdepth.MaxEncodedBitsPerSample() + 4);
3507
0
  }
3508
0
  if (!is_single_group) {
3509
    // Group header for modular image.
3510
    // When the image is single-group, the global modular image is the one
3511
    // that contains the pixel data, and there is no group header.
3512
0
    output[0].Write(1, 1);     // Global tree
3513
0
    output[0].Write(1, 1);     // All default wp
3514
0
    output[0].Write(2, 0b00);  // 0 transforms
3515
0
  }
3516
3517
0
  ChunkEncoder<BitDepth> encoders[4];
3518
0
  ChannelRowProcessor<ChunkEncoder<BitDepth>, BitDepth> row_encoders[4];
3519
0
  for (size_t c = 0; c < nb_chans; c++) {
3520
0
    row_encoders[c].t = &encoders[c];
3521
0
    encoders[c].output = &output[c];
3522
0
    encoders[c].code = &code[c];
3523
0
    encoders[c].PrepareForSimd();
3524
0
  }
3525
0
  ProcessImageArea<ChannelRowProcessor<ChunkEncoder<BitDepth>, BitDepth>>(
3526
0
      rgba, x0, y0, xs, 0, ys, row_stride, bitdepth, nb_chans, big_endian,
3527
0
      row_encoders);
3528
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::WriteACSection<AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::WriteACSection<AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::WriteACSection<AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::WriteACSection<AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::WriteACSection<default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::WriteACSection<default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::WriteACSection<default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::WriteACSection<default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, (anonymous namespace)::PrefixCode const*, std::__1::array<(anonymous namespace)::BitWriter, 4ul>&)
3529
3530
// Parameters of the open-addressing-free hash table used for palette
// detection: kHashSize buckets, addressed by a multiplicative hash.
constexpr int kHashExp = 16;
constexpr uint32_t kHashSize = 1 << kHashExp;
constexpr uint32_t kHashMultiplier = 2654435761;
// Maximum number of distinct colors for which palettization is attempted.
constexpr int kMaxColors = 512;

// Multiplicative (Knuth-style) hash of a packed pixel value.
// Can be any function that returns a value in 0 .. kHashSize-1, but it
// has to map 0 to 0 — which this one does, since 0 * anything == 0.
inline uint32_t pixel_hash(uint32_t p) {
  const uint32_t scrambled = p * kHashMultiplier;  // wraps mod 2^32
  return scrambled >> (32 - kHashExp);
}
3540
3541
template <size_t nb_chans>
3542
void FillRowPalette(const unsigned char* inrow, size_t xs,
3543
0
                    const int16_t* lookup, int16_t* out) {
3544
0
  for (size_t x = 0; x < xs; x++) {
3545
0
    uint32_t p = 0;
3546
0
    for (size_t i = 0; i < nb_chans; ++i) {
3547
0
      p |= inrow[x * nb_chans + i] << (8 * i);
3548
0
    }
3549
0
    out[x] = lookup[pixel_hash(p)];
3550
0
  }
3551
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowPalette<1ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowPalette<2ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowPalette<3ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::FillRowPalette<4ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowPalette<1ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowPalette<2ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowPalette<3ul>(unsigned char const*, unsigned long, short const*, short*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::FillRowPalette<4ul>(unsigned char const*, unsigned long, short const*, short*)
3552
3553
// Runs `processors[0]` over a palettized image area: each source row is
// first converted to palette indices, then fed to the row processor with
// pointers to its left/top/topleft neighbors for prediction.
// Only two index rows are kept alive at a time (a ring buffer indexed by
// y & 1); kPadding cells on each side hold the out-of-bounds neighbors.
// NOTE(review): `yskip` is accepted (signature parity with
// ProcessImageArea) but is not consulted in this body — confirm intended.
template <typename Processor>
void ProcessImageAreaPalette(const unsigned char* rgba, size_t x0, size_t y0,
                             size_t xs, size_t yskip, size_t ys,
                             size_t row_stride, const int16_t* lookup,
                             size_t nb_chans, Processor* processors) {
  constexpr size_t kPadding = 32;

  // Two rows of up to 256 indices each, padded on both sides.
  std::vector<std::array<int16_t, 256 + kPadding * 2>> group_data(2);
  // Palette data is a single channel, so only the first processor is used.
  Processor& row_encoder = processors[0];

  for (size_t y = 0; y < ys; y++) {
    // Pre-fill rows with palette converted pixels.
    const unsigned char* inrow = rgba + row_stride * (y0 + y) + x0 * nb_chans;
    int16_t* outrow = &group_data[y & 1][kPadding];
    if (nb_chans == 1) {
      FillRowPalette<1>(inrow, xs, lookup, outrow);
    } else if (nb_chans == 2) {
      FillRowPalette<2>(inrow, xs, lookup, outrow);
    } else if (nb_chans == 3) {
      FillRowPalette<3>(inrow, xs, lookup, outrow);
    } else if (nb_chans == 4) {
      FillRowPalette<4>(inrow, xs, lookup, outrow);
    }
    // Deal with x == 0: the "left" neighbor of the first sample is the
    // first sample of the previous row (0 on the first row).
    group_data[y & 1][kPadding - 1] =
        y > 0 ? group_data[(y - 1) & 1][kPadding] : 0;
    // Fix topleft: same value, stored in the previous row's padding slot.
    group_data[(y - 1) & 1][kPadding - 1] =
        y > 0 ? group_data[(y - 1) & 1][kPadding] : 0;
    // Get pointers to px/left/top/topleft data to speedup loop.
    const int16_t* row = &group_data[y & 1][kPadding];
    const int16_t* row_left = &group_data[y & 1][kPadding - 1];
    // On the first row there is no row above; reuse the left row so the
    // predictor sees consistent values.
    const int16_t* row_top =
        y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding];
    const int16_t* row_topleft =
        y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding - 1];

    row_encoder.ProcessRow(row, row_left, row_top, row_topleft, xs);
  }
  row_encoder.Finalize();
}
3594
3595
void WriteACSectionPalette(const unsigned char* rgba, size_t x0, size_t y0,
3596
                           size_t xs, size_t ys, size_t row_stride,
3597
                           bool is_single_group, const PrefixCode code[4],
3598
                           const int16_t* lookup, size_t nb_chans,
3599
0
                           BitWriter& output) {
3600
0
  if (!is_single_group) {
3601
0
    output.Allocate(16 * xs * ys + 4);
3602
    // Group header for modular image.
3603
    // When the image is single-group, the global modular image is the one
3604
    // that contains the pixel data, and there is no group header.
3605
0
    output.Write(1, 1);     // Global tree
3606
0
    output.Write(1, 1);     // All default wp
3607
0
    output.Write(2, 0b00);  // 0 transforms
3608
0
  }
3609
3610
0
  ChunkEncoder<UpTo8Bits> encoder;
3611
0
  ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits> row_encoder;
3612
3613
0
  row_encoder.t = &encoder;
3614
0
  encoder.output = &output;
3615
0
  encoder.code = &code[is_single_group ? 1 : 0];
3616
0
  encoder.PrepareForSimd();
3617
0
  ProcessImageAreaPalette<
3618
0
      ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits>>(
3619
0
      rgba, x0, y0, xs, 0, ys, row_stride, lookup, nb_chans, &row_encoder);
3620
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::WriteACSectionPalette(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, (anonymous namespace)::PrefixCode const*, short const*, unsigned long, (anonymous namespace)::BitWriter&)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::WriteACSectionPalette(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, bool, (anonymous namespace)::PrefixCode const*, short const*, unsigned long, (anonymous namespace)::BitWriter&)
3621
3622
template <typename BitDepth>
3623
void CollectSamples(const unsigned char* rgba, size_t x0, size_t y0, size_t xs,
3624
                    size_t row_stride, size_t row_count,
3625
                    uint64_t raw_counts[4][kNumRawSymbols],
3626
                    uint64_t lz77_counts[4][kNumLZ77], bool is_single_group,
3627
                    bool palette, BitDepth bitdepth, size_t nb_chans,
3628
0
                    bool big_endian, const int16_t* lookup) {
3629
0
  if (palette) {
3630
0
    ChunkSampleCollector<UpTo8Bits> sample_collectors[4];
3631
0
    ChannelRowProcessor<ChunkSampleCollector<UpTo8Bits>, UpTo8Bits>
3632
0
        row_sample_collectors[4];
3633
0
    for (size_t c = 0; c < nb_chans; c++) {
3634
0
      row_sample_collectors[c].t = &sample_collectors[c];
3635
0
      sample_collectors[c].raw_counts = raw_counts[is_single_group ? 1 : 0];
3636
0
      sample_collectors[c].lz77_counts = lz77_counts[is_single_group ? 1 : 0];
3637
0
    }
3638
0
    ProcessImageAreaPalette<
3639
0
        ChannelRowProcessor<ChunkSampleCollector<UpTo8Bits>, UpTo8Bits>>(
3640
0
        rgba, x0, y0, xs, 1, 1 + row_count, row_stride, lookup, nb_chans,
3641
0
        row_sample_collectors);
3642
0
  } else {
3643
0
    ChunkSampleCollector<BitDepth> sample_collectors[4];
3644
0
    ChannelRowProcessor<ChunkSampleCollector<BitDepth>, BitDepth>
3645
0
        row_sample_collectors[4];
3646
0
    for (size_t c = 0; c < nb_chans; c++) {
3647
0
      row_sample_collectors[c].t = &sample_collectors[c];
3648
0
      sample_collectors[c].raw_counts = raw_counts[c];
3649
0
      sample_collectors[c].lz77_counts = lz77_counts[c];
3650
0
    }
3651
0
    ProcessImageArea<
3652
0
        ChannelRowProcessor<ChunkSampleCollector<BitDepth>, BitDepth>>(
3653
0
        rgba, x0, y0, xs, 1, 1 + row_count, row_stride, bitdepth, nb_chans,
3654
0
        big_endian, row_sample_collectors);
3655
0
  }
3656
0
}
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::CollectSamples<AVX2::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::CollectSamples<AVX2::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::CollectSamples<AVX2::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void AVX2::(anonymous namespace)::CollectSamples<AVX2::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::CollectSamples<default_implementation::(anonymous namespace)::UpTo8Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::CollectSamples<default_implementation::(anonymous namespace)::From9To13Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::CollectSamples<default_implementation::(anonymous namespace)::Exactly14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, short const*)
Unexecuted instantiation: enc_fast_lossless.cc:void default_implementation::(anonymous namespace)::CollectSamples<default_implementation::(anonymous namespace)::MoreThan14Bits>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long (*) [19], unsigned long (*) [33], bool, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, short const*)
3657
3658
// Writes the DC-global section for a palettized image: the common DC-global
// prologue, the signaling of one Palette transform (channel count, number of
// palette entries, no deltas), and the entropy-coded palette entries
// themselves, one row per channel, predicted from the previous channel row.
void PrepareDCGlobalPalette(bool is_single_group, size_t width, size_t height,
                            size_t nb_chans, const PrefixCode code[4],
                            const std::vector<uint32_t>& palette,
                            size_t pcolors, BitWriter* output) {
  PrepareDCGlobalCommon(is_single_group, width, height, code, output);
  output->Write(2, 0b01);     // 1 transform
  output->Write(2, 0b01);     // Palette
  output->Write(5, 0b00000);  // Starting from ch 0
  // Number of channels covered by the palette.
  if (nb_chans == 1) {
    output->Write(2, 0b00);  // 1-channel palette (Gray)
  } else if (nb_chans == 3) {
    output->Write(2, 0b01);  // 3-channel palette (RGB)
  } else if (nb_chans == 4) {
    output->Write(2, 0b10);  // 4-channel palette (RGBA)
  } else {
    output->Write(2, 0b11);  // Explicit channel count follows.
    output->Write(13, nb_chans - 1);
  }
  // pcolors <= kMaxColors + kChunkSize - 1
  static_assert(kMaxColors + kChunkSize < 1281,
                "add code to signal larger palette sizes");
  // Palette size, in one of two ranges (the static_assert above guarantees
  // pcolors fits the 256..1279 encoding).
  if (pcolors < 256) {
    output->Write(2, 0b00);
    output->Write(8, pcolors);
  } else {
    output->Write(2, 0b01);
    output->Write(10, pcolors - 256);
  }

  output->Write(2, 0b00);  // nb_deltas == 0
  output->Write(4, 0);     // Zero predictor for delta palette
  // Encode palette
  ChunkEncoder<UpTo8Bits> encoder;
  ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits> row_encoder;
  row_encoder.t = &encoder;
  encoder.output = output;
  encoder.code = &code[0];
  encoder.PrepareForSimd();
  // p[c] holds the palette row for channel c: 16 cells of left padding for
  // the predictor, then up to 1024 entry bytes. Entries are written at
  // offset 16 + i + have_zero, leaving index 16 zero-initialized —
  // presumably the implicit all-zero palette color; confirm against the
  // palette construction in LLPrepare.
  std::vector<std::array<int16_t, 32 + 1024>> p(4);
  size_t i = 0;
  size_t have_zero = 1;
  for (; i < pcolors; i++) {
    // Split each packed 32-bit color into its four channel bytes.
    p[0][16 + i + have_zero] = palette[i] & 0xFF;
    p[1][16 + i + have_zero] = (palette[i] >> 8) & 0xFF;
    p[2][16 + i + have_zero] = (palette[i] >> 16) & 0xFF;
    p[3][16 + i + have_zero] = (palette[i] >> 24) & 0xFF;
  }
  // Row 0: no row above, so left/top/topleft all point at the padding cell.
  p[0][15] = 0;
  row_encoder.ProcessRow(p[0].data() + 16, p[0].data() + 15, p[0].data() + 15,
                         p[0].data() + 15, pcolors);
  // Each subsequent channel row is predicted from the previous channel row
  // (its "top"); the padding cell is patched so the first sample's left and
  // topleft neighbors equal the previous row's first sample.
  p[1][15] = p[0][16];
  p[0][15] = p[0][16];
  if (nb_chans > 1) {
    row_encoder.ProcessRow(p[1].data() + 16, p[1].data() + 15, p[0].data() + 16,
                           p[0].data() + 15, pcolors);
  }
  p[2][15] = p[1][16];
  p[1][15] = p[1][16];
  if (nb_chans > 2) {
    row_encoder.ProcessRow(p[2].data() + 16, p[2].data() + 15, p[1].data() + 16,
                           p[1].data() + 15, pcolors);
  }
  p[3][15] = p[2][16];
  p[2][15] = p[2][16];
  if (nb_chans > 3) {
    row_encoder.ProcessRow(p[3].data() + 16, p[3].data() + 15, p[2].data() + 16,
                           p[2].data() + 15, pcolors);
  }
  row_encoder.Finalize();

  // Sections that are not part of the single global stream must end on a
  // byte boundary.
  if (!is_single_group) {
    output->ZeroPadToByte();
  }
}
3732
3733
// Scans one row of `width` pixels (nb_chans bytes each), inserting each
// packed pixel value into the hash table `palette` (size kHashSize,
// indexed by pixel_hash). Returns true ("collided") if two distinct colors
// hashed to the same bucket — in which case palettization is abandoned by
// the caller. A bucket value of 0 means "empty", which is why pixel_hash
// must map 0 to 0: the all-zero color reuses bucket 0 harmlessly.
template <size_t nb_chans>
bool detect_palette(const unsigned char* r, size_t width,
                    std::vector<uint32_t>& palette) {
  size_t x = 0;
  bool collided = false;
  // this is just an unrolling of the next loop
  // The 8-pixel fast path always reads 4 bytes per pixel and masks off the
  // excess afterwards, so for nb_chans < 4 it reads up to 3 bytes past the
  // 8th pixel; look_ahead is enlarged accordingly so that every byte read
  // still lies within the row's width * nb_chans bytes.
  size_t look_ahead = 7 + ((nb_chans == 1) ? 3 : ((nb_chans < 4) ? 1 : 0));
  for (; x + look_ahead < width; x += 8) {
    uint32_t p[8] = {}, index[8];
    for (int i = 0; i < 8; i++) {
      // Unconditionally gather 4 bytes per pixel (over-read handled above).
      for (int j = 0; j < 4; ++j) {
        p[i] |= r[(x + i) * nb_chans + j] << (8 * j);
      }
    }
    // Mask to the real pixel width; 1llu because 8 * nb_chans can be 32,
    // which would overflow a 32-bit shift.
    for (int i = 0; i < 8; i++) p[i] &= ((1llu << (8 * nb_chans)) - 1);
    for (int i = 0; i < 8; i++) index[i] = pixel_hash(p[i]);
    for (int i = 0; i < 8; i++) {
      // Collision: bucket already holds a different (nonzero) color.
      collided |= (palette[index[i]] != 0 && p[i] != palette[index[i]]);
      palette[index[i]] = p[i];
    }
  }
  // Scalar tail: pack exactly nb_chans bytes per pixel, no over-read.
  for (; x < width; x++) {
    uint32_t p = 0;
    for (size_t i = 0; i < nb_chans; ++i) {
      p |= r[x * nb_chans + i] << (8 * i);
    }
    uint32_t index = pixel_hash(p);
    collided |= (palette[index] != 0 && p != palette[index]);
    palette[index] = p;
  }
  return collided;
}
3765
3766
template <typename BitDepth>
3767
JxlFastLosslessFrameState* LLPrepare(JxlChunkedFrameInputSource input,
3768
                                     size_t width, size_t height,
3769
                                     BitDepth bitdepth, size_t nb_chans,
3770
0
                                     bool big_endian, int effort, int oneshot) {
3771
0
  assert(width != 0);
3772
0
  assert(height != 0);
3773
3774
  // Count colors to try palette
3775
0
  std::vector<uint32_t> palette(kHashSize);
3776
0
  std::vector<int16_t> lookup(kHashSize);
3777
0
  lookup[0] = 0;
3778
0
  int pcolors = 0;
3779
0
  bool collided = effort < 2 || bitdepth.bitdepth != 8 || !oneshot;
3780
0
  for (size_t y0 = 0; y0 < height && !collided; y0 += 256) {
3781
0
    size_t ys = std::min<size_t>(height - y0, 256);
3782
0
    for (size_t x0 = 0; x0 < width && !collided; x0 += 256) {
3783
0
      size_t xs = std::min<size_t>(width - x0, 256);
3784
0
      size_t stride;
3785
      // TODO(szabadka): Add RAII wrapper around this.
3786
0
      const void* buffer = input.get_color_channel_data_at(input.opaque, x0, y0,
3787
0
                                                           xs, ys, &stride);
3788
0
      auto rgba = reinterpret_cast<const unsigned char*>(buffer);
3789
0
      for (size_t y = 0; y < ys && !collided; y++) {
3790
0
        const unsigned char* r = rgba + stride * y;
3791
0
        if (nb_chans == 1) collided = detect_palette<1>(r, xs, palette);
3792
0
        if (nb_chans == 2) collided = detect_palette<2>(r, xs, palette);
3793
0
        if (nb_chans == 3) collided = detect_palette<3>(r, xs, palette);
3794
0
        if (nb_chans == 4) collided = detect_palette<4>(r, xs, palette);
3795
0
      }
3796
0
      input.release_buffer(input.opaque, buffer);
3797
0
    }
3798
0
  }
3799
0
  int nb_entries = 0;
3800
0
  if (!collided) {
3801
0
    pcolors = 1;  // always have all-zero as a palette color
3802
0
    bool have_color = false;
3803
0
    uint8_t minG = 255, maxG = 0;
3804
0
    for (uint32_t k = 0; k < kHashSize; k++) {
3805
0
      if (palette[k] == 0) continue;
3806
0
      uint8_t p[4];
3807
0
      for (int i = 0; i < 4; ++i) {
3808
0
        p[i] = (palette[k] >> (8 * i)) & 0xFF;
3809
0
      }
3810
      // move entries to front so sort has less work
3811
0
      palette[nb_entries] = palette[k];
3812
0
      if (p[0] != p[1] || p[0] != p[2]) have_color = true;
3813
0
      if (p[1] < minG) minG = p[1];
3814
0
      if (p[1] > maxG) maxG = p[1];
3815
0
      nb_entries++;
3816
      // don't do palette if too many colors are needed
3817
0
      if (nb_entries + pcolors > kMaxColors) {
3818
0
        collided = true;
3819
0
        break;
3820
0
      }
3821
0
    }
3822
0
    if (!have_color) {
3823
      // don't do palette if it's just grayscale without many holes
3824
0
      if (maxG - minG < nb_entries * 1.4f) collided = true;
3825
0
    }
3826
0
  }
3827
0
  if (!collided) {
3828
0
    std::sort(
3829
0
        palette.begin(), palette.begin() + nb_entries,
3830
0
        [&nb_chans](uint32_t ap, uint32_t bp) {
3831
0
          if (ap == 0) return false;
3832
0
          if (bp == 0) return true;
3833
0
          uint8_t a[4], b[4];
3834
0
          for (int i = 0; i < 4; ++i) {
3835
0
            a[i] = (ap >> (8 * i)) & 0xFF;
3836
0
            b[i] = (bp >> (8 * i)) & 0xFF;
3837
0
          }
3838
0
          float ay, by;
3839
0
          if (nb_chans == 4) {
3840
0
            ay = (0.299f * a[0] + 0.587f * a[1] + 0.114f * a[2] + 0.01f) * a[3];
3841
0
            by = (0.299f * b[0] + 0.587f * b[1] + 0.114f * b[2] + 0.01f) * b[3];
3842
0
          } else {
3843
0
            ay = (0.299f * a[0] + 0.587f * a[1] + 0.114f * a[2] + 0.01f);
3844
0
            by = (0.299f * b[0] + 0.587f * b[1] + 0.114f * b[2] + 0.01f);
3845
0
          }
3846
0
          return ay < by;  // sort on alpha*luma
3847
0
        });
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)::{lambda(unsigned int, unsigned int)#1}::operator()(unsigned int, unsigned int) const
3848
0
    for (int k = 0; k < nb_entries; k++) {
3849
0
      if (palette[k] == 0) break;
3850
0
      lookup[pixel_hash(palette[k])] = pcolors++;
3851
0
    }
3852
0
  }
3853
3854
0
  size_t num_groups_x = (width + 255) / 256;
3855
0
  size_t num_groups_y = (height + 255) / 256;
3856
0
  size_t num_dc_groups_x = (width + 2047) / 2048;
3857
0
  size_t num_dc_groups_y = (height + 2047) / 2048;
3858
3859
0
  uint64_t raw_counts[4][kNumRawSymbols] = {};
3860
0
  uint64_t lz77_counts[4][kNumLZ77] = {};
3861
3862
0
  bool onegroup = num_groups_x == 1 && num_groups_y == 1;
3863
3864
0
  auto sample_rows = [&](size_t xg, size_t yg, size_t num_rows) {
3865
0
    size_t y0 = yg * 256;
3866
0
    size_t x0 = xg * 256;
3867
0
    size_t ys = std::min<size_t>(height - y0, 256);
3868
0
    size_t xs = std::min<size_t>(width - x0, 256);
3869
0
    size_t stride;
3870
0
    const void* buffer =
3871
0
        input.get_color_channel_data_at(input.opaque, x0, y0, xs, ys, &stride);
3872
0
    auto rgba = reinterpret_cast<const unsigned char*>(buffer);
3873
0
    int y_begin_group =
3874
0
        std::max<ptrdiff_t>(
3875
0
            0, static_cast<ptrdiff_t>(ys) - static_cast<ptrdiff_t>(num_rows)) /
3876
0
        2;
3877
0
    int y_count = std::min<int>(num_rows, ys - y_begin_group);
3878
0
    int x_max = xs / kChunkSize * kChunkSize;
3879
0
    CollectSamples(rgba, 0, y_begin_group, x_max, stride, y_count, raw_counts,
3880
0
                   lz77_counts, onegroup, !collided, bitdepth, nb_chans,
3881
0
                   big_endian, lookup.data());
3882
0
    input.release_buffer(input.opaque, buffer);
3883
0
  };
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)::{lambda(unsigned long, unsigned long, unsigned long)#1}::operator()(unsigned long, unsigned long, unsigned long) const
3884
3885
  // TODO(veluca): that `64` is an arbitrary constant, meant to correspond to
3886
  // the point where the number of processed rows is large enough that loading
3887
  // the entire image is cost-effective.
3888
0
  if (oneshot || effort >= 64) {
3889
0
    for (size_t g = 0; g < num_groups_y * num_groups_x; g++) {
3890
0
      size_t xg = g % num_groups_x;
3891
0
      size_t yg = g / num_groups_x;
3892
0
      size_t y0 = yg * 256;
3893
0
      size_t ys = std::min<size_t>(height - y0, 256);
3894
0
      size_t num_rows = 2 * effort * ys / 256;
3895
0
      sample_rows(xg, yg, num_rows);
3896
0
    }
3897
0
  } else {
3898
    // sample the middle (effort * 2 * num_groups) rows of the center group
3899
    // (possibly all of them).
3900
0
    sample_rows((num_groups_x - 1) / 2, (num_groups_y - 1) / 2,
3901
0
                2 * effort * num_groups_x * num_groups_y);
3902
0
  }
3903
3904
  // TODO(veluca): can probably improve this and make it bitdepth-dependent.
3905
0
  uint64_t base_raw_counts[kNumRawSymbols] = {
3906
0
      3843, 852, 1270, 1214, 1014, 727, 481, 300, 159, 51,
3907
0
      5,    1,   1,    1,    1,    1,   1,   1,   1};
3908
3909
0
  bool doing_ycocg = nb_chans > 2 && collided;
3910
0
  bool large_palette = !collided || pcolors >= 256;
3911
0
  for (size_t i = bitdepth.NumSymbols(doing_ycocg || large_palette);
3912
0
       i < kNumRawSymbols; i++) {
3913
0
    base_raw_counts[i] = 0;
3914
0
  }
3915
3916
0
  for (size_t c = 0; c < 4; c++) {
3917
0
    for (size_t i = 0; i < kNumRawSymbols; i++) {
3918
0
      raw_counts[c][i] = (raw_counts[c][i] << 8) + base_raw_counts[i];
3919
0
    }
3920
0
  }
3921
3922
0
  if (!collided) {
3923
0
    unsigned token, nbits, bits;
3924
0
    EncodeHybridUint000(PackSigned(pcolors - 1), &token, &nbits, &bits);
3925
    // ensure all palette indices can actually be encoded
3926
0
    for (size_t i = 0; i < token + 1; i++)
3927
0
      raw_counts[0][i] = std::max<uint64_t>(raw_counts[0][i], 1);
3928
    // these tokens are only used for the palette itself so they can get a bad
3929
    // code
3930
0
    for (size_t i = token + 1; i < 10; i++) raw_counts[0][i] = 1;
3931
0
  }
3932
3933
0
  uint64_t base_lz77_counts[kNumLZ77] = {
3934
0
      29, 27, 25,  23, 21, 21, 19, 18, 21, 17, 16, 15, 15, 14,
3935
0
      13, 13, 137, 98, 61, 34, 1,  1,  1,  1,  1,  1,  1,  1,
3936
0
  };
3937
3938
0
  for (size_t c = 0; c < 4; c++) {
3939
0
    for (size_t i = 0; i < kNumLZ77; i++) {
3940
0
      lz77_counts[c][i] = (lz77_counts[c][i] << 8) + base_lz77_counts[i];
3941
0
    }
3942
0
  }
3943
3944
0
  JxlFastLosslessFrameState* frame_state = new JxlFastLosslessFrameState();
3945
0
  for (size_t i = 0; i < 4; i++) {
3946
0
    frame_state->hcode[i] = PrefixCode(bitdepth, raw_counts[i], lz77_counts[i]);
3947
0
  }
3948
3949
0
  size_t num_dc_groups = num_dc_groups_x * num_dc_groups_y;
3950
0
  size_t num_ac_groups = num_groups_x * num_groups_y;
3951
0
  size_t num_groups = onegroup ? 1 : (2 + num_dc_groups + num_ac_groups);
3952
0
  frame_state->input = input;
3953
0
  frame_state->width = width;
3954
0
  frame_state->height = height;
3955
0
  frame_state->num_groups_x = num_groups_x;
3956
0
  frame_state->num_groups_y = num_groups_y;
3957
0
  frame_state->num_dc_groups_x = num_dc_groups_x;
3958
0
  frame_state->num_dc_groups_y = num_dc_groups_y;
3959
0
  frame_state->nb_chans = nb_chans;
3960
0
  frame_state->bitdepth = bitdepth.bitdepth;
3961
0
  frame_state->big_endian = big_endian;
3962
0
  frame_state->effort = effort;
3963
0
  frame_state->collided = collided;
3964
0
  frame_state->lookup = lookup;
3965
3966
0
  frame_state->group_data = std::vector<std::array<BitWriter, 4>>(num_groups);
3967
0
  frame_state->group_sizes.resize(num_groups);
3968
0
  if (collided) {
3969
0
    PrepareDCGlobal(onegroup, width, height, nb_chans, frame_state->hcode,
3970
0
                    &frame_state->group_data[0][0]);
3971
0
  } else {
3972
0
    PrepareDCGlobalPalette(onegroup, width, height, nb_chans,
3973
0
                           frame_state->hcode, palette, pcolors,
3974
0
                           &frame_state->group_data[0][0]);
3975
0
  }
3976
0
  frame_state->group_sizes[0] = SectionSize(frame_state->group_data[0]);
3977
0
  if (!onegroup) {
3978
0
    ComputeAcGroupDataOffset(frame_state->group_sizes[0], num_dc_groups,
3979
0
                             num_ac_groups, frame_state->min_dc_global_size,
3980
0
                             frame_state->ac_group_data_offset);
3981
0
  }
3982
3983
0
  return frame_state;
3984
0
}
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* AVX2::(anonymous namespace)::LLPrepare<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, AVX2::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::UpTo8Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::From9To13Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::From9To13Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::Exactly14Bits, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:JxlFastLosslessFrameState* default_implementation::(anonymous namespace)::LLPrepare<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlChunkedFrameInputSource, unsigned long, unsigned long, default_implementation::(anonymous namespace)::MoreThan14Bits, unsigned long, bool, int, int)
3985
3986
template <typename BitDepth>
3987
jxl::Status LLProcess(JxlFastLosslessFrameState* frame_state, bool is_last,
3988
                      BitDepth bitdepth, void* runner_opaque,
3989
                      FJxlParallelRunner runner,
3990
0
                      JxlEncoderOutputProcessorWrapper* output_processor) {
3991
0
#if !FJXL_STANDALONE
3992
0
  if (frame_state->process_done) {
3993
0
    JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/0, is_last);
3994
0
    if (output_processor) {
3995
0
      JXL_RETURN_IF_ERROR(
3996
0
          JxlFastLosslessOutputFrame(frame_state, output_processor));
3997
0
    }
3998
0
    return true;
3999
0
  }
4000
0
#endif
4001
  // The maximum number of groups that we process concurrently here.
4002
  // TODO(szabadka) Use the number of threads or some outside parameter for the
4003
  // maximum memory usage instead.
4004
0
  constexpr size_t kMaxLocalGroups = 16;
4005
0
  bool onegroup = frame_state->group_sizes.size() == 1;
4006
0
  bool streaming = !onegroup && output_processor;
4007
0
  size_t total_groups = frame_state->num_groups_x * frame_state->num_groups_y;
4008
0
  size_t max_groups = streaming ? kMaxLocalGroups : total_groups;
4009
0
#if !FJXL_STANDALONE
4010
0
  size_t start_pos = 0;
4011
0
  if (streaming) {
4012
0
    start_pos = output_processor->CurrentPosition();
4013
0
    JXL_RETURN_IF_ERROR(
4014
0
        output_processor->Seek(start_pos + frame_state->ac_group_data_offset));
4015
0
  }
4016
0
#endif
4017
0
  for (size_t offset = 0; offset < total_groups; offset += max_groups) {
4018
0
    size_t num_groups = std::min(max_groups, total_groups - offset);
4019
0
    JxlFastLosslessFrameState local_frame_state;
4020
0
    if (streaming) {
4021
0
      local_frame_state.group_data =
4022
0
          std::vector<std::array<BitWriter, 4>>(num_groups);
4023
0
    }
4024
0
    auto run_one = [&](size_t i) {
4025
0
      size_t g = offset + i;
4026
0
      size_t xg = g % frame_state->num_groups_x;
4027
0
      size_t yg = g / frame_state->num_groups_x;
4028
0
      size_t num_dc_groups =
4029
0
          frame_state->num_dc_groups_x * frame_state->num_dc_groups_y;
4030
0
      size_t group_id = onegroup ? 0 : (2 + num_dc_groups + g);
4031
0
      size_t xs = std::min<size_t>(frame_state->width - xg * 256, 256);
4032
0
      size_t ys = std::min<size_t>(frame_state->height - yg * 256, 256);
4033
0
      size_t x0 = xg * 256;
4034
0
      size_t y0 = yg * 256;
4035
0
      size_t stride;
4036
0
      JxlChunkedFrameInputSource input = frame_state->input;
4037
0
      const void* buffer = input.get_color_channel_data_at(input.opaque, x0, y0,
4038
0
                                                           xs, ys, &stride);
4039
0
      const unsigned char* rgba =
4040
0
          reinterpret_cast<const unsigned char*>(buffer);
4041
4042
0
      auto& gd = streaming ? local_frame_state.group_data[i]
4043
0
                           : frame_state->group_data[group_id];
4044
0
      if (frame_state->collided) {
4045
0
        WriteACSection(rgba, 0, 0, xs, ys, stride, onegroup, bitdepth,
4046
0
                       frame_state->nb_chans, frame_state->big_endian,
4047
0
                       frame_state->hcode, gd);
4048
0
      } else {
4049
0
        WriteACSectionPalette(rgba, 0, 0, xs, ys, stride, onegroup,
4050
0
                              frame_state->hcode, frame_state->lookup.data(),
4051
0
                              frame_state->nb_chans, gd[0]);
4052
0
      }
4053
0
      frame_state->group_sizes[group_id] = SectionSize(gd);
4054
0
      input.release_buffer(input.opaque, buffer);
4055
0
    };
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(unsigned long)#1}::operator()(unsigned long) const
4056
0
    runner(
4057
0
        runner_opaque, &run_one,
4058
0
        +[](void* r, size_t i) {
4059
0
          (*reinterpret_cast<decltype(&run_one)>(r))(i);
4060
0
        },
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)::{lambda(void*, unsigned long)#1}::operator()(void*, unsigned long) const
4061
0
        num_groups);
4062
0
#if !FJXL_STANDALONE
4063
0
    if (streaming) {
4064
0
      local_frame_state.nb_chans = frame_state->nb_chans;
4065
0
      local_frame_state.current_bit_writer = 1;
4066
0
      JXL_RETURN_IF_ERROR(
4067
0
          JxlFastLosslessOutputFrame(&local_frame_state, output_processor));
4068
0
    }
4069
0
#endif
4070
0
  }
4071
0
#if !FJXL_STANDALONE
4072
0
  if (streaming) {
4073
0
    size_t end_pos = output_processor->CurrentPosition();
4074
0
    JXL_RETURN_IF_ERROR(output_processor->Seek(start_pos));
4075
0
    frame_state->group_data.resize(1);
4076
0
    bool have_alpha = frame_state->nb_chans == 2 || frame_state->nb_chans == 4;
4077
0
    size_t padding = ComputeDcGlobalPadding(
4078
0
        frame_state->group_sizes, frame_state->ac_group_data_offset,
4079
0
        frame_state->min_dc_global_size, have_alpha, is_last);
4080
4081
0
    for (size_t i = 0; i < padding; ++i) {
4082
0
      frame_state->group_data[0][0].Write(8, 0);
4083
0
    }
4084
0
    frame_state->group_sizes[0] += padding;
4085
0
    JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/0, is_last);
4086
0
    assert(frame_state->ac_group_data_offset ==
4087
0
           JxlFastLosslessOutputSize(frame_state));
4088
0
    JXL_RETURN_IF_ERROR(
4089
0
        JxlFastLosslessOutputHeaders(frame_state, output_processor));
4090
0
    JXL_RETURN_IF_ERROR(output_processor->Seek(end_pos));
4091
0
  } else if (output_processor) {
4092
0
    assert(onegroup);
4093
0
    JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/0, is_last);
4094
0
    if (output_processor) {
4095
0
      JXL_RETURN_IF_ERROR(
4096
0
          JxlFastLosslessOutputFrame(frame_state, output_processor));
4097
0
    }
4098
0
  }
4099
0
  frame_state->process_done = true;
4100
0
#endif
4101
0
  return true;
4102
0
}
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status AVX2::(anonymous namespace)::LLProcess<AVX2::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, AVX2::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::UpTo8Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::UpTo8Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::From9To13Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::From9To13Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::Exactly14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::Exactly14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:jxl::Status default_implementation::(anonymous namespace)::LLProcess<default_implementation::(anonymous namespace)::MoreThan14Bits>(JxlFastLosslessFrameState*, bool, default_implementation::(anonymous namespace)::MoreThan14Bits, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
4103
4104
JxlFastLosslessFrameState* JxlFastLosslessPrepareImpl(
4105
    JxlChunkedFrameInputSource input, size_t width, size_t height,
4106
    size_t nb_chans, size_t bitdepth, bool big_endian, int effort,
4107
0
    int oneshot) {
4108
0
  assert(bitdepth > 0);
4109
0
  assert(nb_chans <= 4);
4110
0
  assert(nb_chans != 0);
4111
0
  if (bitdepth <= 8) {
4112
0
    return LLPrepare(input, width, height, UpTo8Bits(bitdepth), nb_chans,
4113
0
                     big_endian, effort, oneshot);
4114
0
  }
4115
0
  if (bitdepth <= 13) {
4116
0
    return LLPrepare(input, width, height, From9To13Bits(bitdepth), nb_chans,
4117
0
                     big_endian, effort, oneshot);
4118
0
  }
4119
0
  if (bitdepth == 14) {
4120
0
    return LLPrepare(input, width, height, Exactly14Bits(bitdepth), nb_chans,
4121
0
                     big_endian, effort, oneshot);
4122
0
  }
4123
0
  return LLPrepare(input, width, height, MoreThan14Bits(bitdepth), nb_chans,
4124
0
                   big_endian, effort, oneshot);
4125
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::JxlFastLosslessPrepareImpl(JxlChunkedFrameInputSource, unsigned long, unsigned long, unsigned long, unsigned long, bool, int, int)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::JxlFastLosslessPrepareImpl(JxlChunkedFrameInputSource, unsigned long, unsigned long, unsigned long, unsigned long, bool, int, int)
4126
4127
jxl::Status JxlFastLosslessProcessFrameImpl(
4128
    JxlFastLosslessFrameState* frame_state, bool is_last, void* runner_opaque,
4129
    FJxlParallelRunner runner,
4130
0
    JxlEncoderOutputProcessorWrapper* output_processor) {
4131
0
  const size_t bitdepth = frame_state->bitdepth;
4132
0
  if (bitdepth <= 8) {
4133
0
    JXL_RETURN_IF_ERROR(LLProcess(frame_state, is_last, UpTo8Bits(bitdepth),
4134
0
                                  runner_opaque, runner, output_processor));
4135
0
  } else if (bitdepth <= 13) {
4136
0
    JXL_RETURN_IF_ERROR(LLProcess(frame_state, is_last, From9To13Bits(bitdepth),
4137
0
                                  runner_opaque, runner, output_processor));
4138
0
  } else if (bitdepth == 14) {
4139
0
    JXL_RETURN_IF_ERROR(LLProcess(frame_state, is_last, Exactly14Bits(bitdepth),
4140
0
                                  runner_opaque, runner, output_processor));
4141
0
  } else {
4142
0
    JXL_RETURN_IF_ERROR(LLProcess(frame_state, is_last,
4143
0
                                  MoreThan14Bits(bitdepth), runner_opaque,
4144
0
                                  runner, output_processor));
4145
0
  }
4146
0
  return true;
4147
0
}
Unexecuted instantiation: enc_fast_lossless.cc:AVX2::(anonymous namespace)::JxlFastLosslessProcessFrameImpl(JxlFastLosslessFrameState*, bool, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
Unexecuted instantiation: enc_fast_lossless.cc:default_implementation::(anonymous namespace)::JxlFastLosslessProcessFrameImpl(JxlFastLosslessFrameState*, bool, void*, void (*)(void*, void*, void (*)(void*, unsigned long), unsigned long), JxlEncoderOutputProcessorWrapper*)
4148
4149
}  // namespace
4150
4151
#endif  // FJXL_SELF_INCLUDE
4152
4153
#ifndef FJXL_SELF_INCLUDE
4154
4155
#define FJXL_SELF_INCLUDE
4156
4157
// If we have NEON enabled, it is the default target.
4158
#if FJXL_ENABLE_NEON
4159
4160
namespace default_implementation {
4161
#define FJXL_NEON
4162
#include "lib/jxl/enc_fast_lossless.cc"
4163
#undef FJXL_NEON
4164
}  // namespace default_implementation
4165
4166
#else                                    // FJXL_ENABLE_NEON
4167
4168
namespace default_implementation {
4169
#include "lib/jxl/enc_fast_lossless.cc"  // NOLINT
4170
}
4171
4172
#if FJXL_ENABLE_AVX2
4173
#ifdef __clang__
4174
#pragma clang attribute push(__attribute__((target("avx,avx2"))), \
4175
                             apply_to = function)
4176
// Causes spurious warnings on clang5.
4177
#pragma clang diagnostic push
4178
#pragma clang diagnostic ignored "-Wmissing-braces"
4179
#elif defined(__GNUC__)
4180
#pragma GCC push_options
4181
// Seems to cause spurious errors on GCC8.
4182
#pragma GCC diagnostic ignored "-Wpsabi"
4183
#pragma GCC target "avx,avx2"
4184
#endif
4185
4186
namespace AVX2 {
4187
#define FJXL_AVX2
4188
#include "lib/jxl/enc_fast_lossless.cc"  // NOLINT
4189
#undef FJXL_AVX2
4190
}  // namespace AVX2
4191
4192
#ifdef __clang__
4193
#pragma clang attribute pop
4194
#pragma clang diagnostic pop
4195
#elif defined(__GNUC__)
4196
#pragma GCC pop_options
4197
#endif
4198
#endif  // FJXL_ENABLE_AVX2
4199
4200
#if FJXL_ENABLE_AVX512
4201
#ifdef __clang__
4202
#pragma clang attribute push(                                                 \
4203
    __attribute__((target("avx512cd,avx512bw,avx512vl,avx512f,avx512vbmi"))), \
4204
    apply_to = function)
4205
#elif defined(__GNUC__)
4206
#pragma GCC push_options
4207
#pragma GCC target "avx512cd,avx512bw,avx512vl,avx512f,avx512vbmi"
4208
#endif
4209
4210
namespace AVX512 {
4211
#define FJXL_AVX512
4212
#include "lib/jxl/enc_fast_lossless.cc"
4213
#undef FJXL_AVX512
4214
}  // namespace AVX512
4215
4216
#ifdef __clang__
4217
#pragma clang attribute pop
4218
#elif defined(__GNUC__)
4219
#pragma GCC pop_options
4220
#endif
4221
#endif  // FJXL_ENABLE_AVX512
4222
4223
#endif
4224
4225
extern "C" {
4226
4227
#if FJXL_STANDALONE
4228
class FJxlFrameInput {
4229
 public:
4230
  FJxlFrameInput(const unsigned char* rgba, size_t row_stride, size_t nb_chans,
4231
                 size_t bitdepth)
4232
      : rgba_(rgba),
4233
        row_stride_(row_stride),
4234
        bytes_per_pixel_(bitdepth <= 8 ? nb_chans : 2 * nb_chans) {}
4235
4236
  JxlChunkedFrameInputSource GetInputSource() {
4237
    return JxlChunkedFrameInputSource{this, GetDataAt,
4238
                                      [](void*, const void*) {}};
4239
  }
4240
4241
 private:
4242
  static const void* GetDataAt(void* opaque, size_t xpos, size_t ypos,
4243
                               size_t xsize, size_t ysize, size_t* row_offset) {
4244
    FJxlFrameInput* self = static_cast<FJxlFrameInput*>(opaque);
4245
    *row_offset = self->row_stride_;
4246
    return self->rgba_ + ypos * (*row_offset) + xpos * self->bytes_per_pixel_;
4247
  }
4248
4249
  const uint8_t* rgba_;
4250
  size_t row_stride_;
4251
  size_t bytes_per_pixel_;
4252
};
4253
4254
size_t JxlFastLosslessEncode(const unsigned char* rgba, size_t width,
4255
                             size_t row_stride, size_t height, size_t nb_chans,
4256
                             size_t bitdepth, bool big_endian, int effort,
4257
                             unsigned char** output, void* runner_opaque,
4258
                             FJxlParallelRunner runner) {
4259
  FJxlFrameInput input(rgba, row_stride, nb_chans, bitdepth);
4260
  auto* frame_state = JxlFastLosslessPrepareFrame(
4261
      input.GetInputSource(), width, height, nb_chans, bitdepth, big_endian,
4262
      effort, /*oneshot=*/true);
4263
  if (!JxlFastLosslessProcessFrame(frame_state, /*is_last=*/true, runner_opaque,
4264
                                   runner, nullptr)) {
4265
    return 0;
4266
  }
4267
  JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/1,
4268
                               /*is_last=*/1);
4269
  size_t output_size = JxlFastLosslessMaxRequiredOutput(frame_state);
4270
  *output = (unsigned char*)malloc(output_size);
4271
  size_t written = 0;
4272
  size_t total = 0;
4273
  while ((written = JxlFastLosslessWriteOutput(frame_state, *output + total,
4274
                                               output_size - total)) != 0) {
4275
    total += written;
4276
  }
4277
  JxlFastLosslessFreeFrameState(frame_state);
4278
  return total;
4279
}
4280
#endif
4281
4282
JxlFastLosslessFrameState* JxlFastLosslessPrepareFrame(
4283
    JxlChunkedFrameInputSource input, size_t width, size_t height,
4284
    size_t nb_chans, size_t bitdepth, bool big_endian, int effort,
4285
0
    int oneshot) {
4286
#if FJXL_ENABLE_AVX512
4287
  if (HasCpuFeature(CpuFeature::kAVX512CD) &&
4288
      HasCpuFeature(CpuFeature::kVBMI) &&
4289
      HasCpuFeature(CpuFeature::kAVX512BW) &&
4290
      HasCpuFeature(CpuFeature::kAVX512F) &&
4291
      HasCpuFeature(CpuFeature::kAVX512VL)) {
4292
    return AVX512::JxlFastLosslessPrepareImpl(
4293
        input, width, height, nb_chans, bitdepth, big_endian, effort, oneshot);
4294
  }
4295
#endif
4296
0
#if FJXL_ENABLE_AVX2
4297
0
  if (HasCpuFeature(CpuFeature::kAVX2)) {
4298
0
    return AVX2::JxlFastLosslessPrepareImpl(
4299
0
        input, width, height, nb_chans, bitdepth, big_endian, effort, oneshot);
4300
0
  }
4301
0
#endif
4302
4303
0
  return default_implementation::JxlFastLosslessPrepareImpl(
4304
0
      input, width, height, nb_chans, bitdepth, big_endian, effort, oneshot);
4305
0
}
4306
4307
bool JxlFastLosslessProcessFrame(
4308
    JxlFastLosslessFrameState* frame_state, bool is_last, void* runner_opaque,
4309
    FJxlParallelRunner runner,
4310
0
    JxlEncoderOutputProcessorWrapper* output_processor) {
4311
0
  auto trivial_runner =
4312
0
      +[](void*, void* opaque, void fun(void*, size_t), size_t count) {
4313
0
        for (size_t i = 0; i < count; i++) {
4314
0
          fun(opaque, i);
4315
0
        }
4316
0
      };
4317
4318
0
  if (runner == nullptr) {
4319
0
    runner = trivial_runner;
4320
0
  }
4321
4322
#if FJXL_ENABLE_AVX512
4323
  if (HasCpuFeature(CpuFeature::kAVX512CD) &&
4324
      HasCpuFeature(CpuFeature::kVBMI) &&
4325
      HasCpuFeature(CpuFeature::kAVX512BW) &&
4326
      HasCpuFeature(CpuFeature::kAVX512F) &&
4327
      HasCpuFeature(CpuFeature::kAVX512VL)) {
4328
    JXL_RETURN_IF_ERROR(AVX512::JxlFastLosslessProcessFrameImpl(
4329
        frame_state, is_last, runner_opaque, runner, output_processor));
4330
    return true;
4331
  }
4332
#endif
4333
0
#if FJXL_ENABLE_AVX2
4334
0
  if (HasCpuFeature(CpuFeature::kAVX2)) {
4335
0
    JXL_RETURN_IF_ERROR(AVX2::JxlFastLosslessProcessFrameImpl(
4336
0
        frame_state, is_last, runner_opaque, runner, output_processor));
4337
0
    return true;
4338
0
  }
4339
0
#endif
4340
4341
0
  JXL_RETURN_IF_ERROR(default_implementation::JxlFastLosslessProcessFrameImpl(
4342
0
      frame_state, is_last, runner_opaque, runner, output_processor));
4343
0
  return true;
4344
0
}
4345
4346
}  // extern "C"
4347
4348
#if !FJXL_STANDALONE
4349
bool JxlFastLosslessOutputFrame(
4350
    JxlFastLosslessFrameState* frame_state,
4351
0
    JxlEncoderOutputProcessorWrapper* output_processor) {
4352
0
  size_t fl_size = JxlFastLosslessOutputSize(frame_state);
4353
0
  size_t written = 0;
4354
0
  while (written < fl_size) {
4355
0
    JXL_ASSIGN_OR_RETURN(auto buffer,
4356
0
                         output_processor->GetBuffer(32, fl_size - written));
4357
0
    size_t n =
4358
0
        JxlFastLosslessWriteOutput(frame_state, buffer.data(), buffer.size());
4359
0
    if (n == 0) break;
4360
0
    JXL_RETURN_IF_ERROR(buffer.advance(n));
4361
0
    written += n;
4362
0
  };
4363
0
  return true;
4364
0
}
4365
#endif
4366
4367
#endif  // FJXL_SELF_INCLUDE