Coverage Report

Created: 2025-07-23 07:47

/src/libjxl/lib/jpegli/idct.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jpegli/idct.h"
7
8
#include <algorithm>
9
#include <cmath>
10
#include <cstddef>
11
#include <cstdint>
12
13
#include "lib/jpegli/common.h"
14
#include "lib/jpegli/decode_internal.h"
15
#include "lib/jxl/base/compiler_specific.h"
16
#include "lib/jxl/base/status.h"
17
18
#undef HWY_TARGET_INCLUDE
19
#define HWY_TARGET_INCLUDE "lib/jpegli/idct.cc"
20
#include <hwy/foreach_target.h>
21
#include <hwy/highway.h>
22
23
#include "lib/jpegli/transpose-inl.h"
24
25
HWY_BEFORE_NAMESPACE();
26
namespace jpegli {
27
namespace HWY_NAMESPACE {
28
29
// These templates are not found via ADL.
30
using hwy::HWY_NAMESPACE::Abs;
31
using hwy::HWY_NAMESPACE::Add;
32
using hwy::HWY_NAMESPACE::Gt;
33
using hwy::HWY_NAMESPACE::IfThenElseZero;
34
using hwy::HWY_NAMESPACE::Mul;
35
using hwy::HWY_NAMESPACE::MulAdd;
36
using hwy::HWY_NAMESPACE::NegMulAdd;
37
using hwy::HWY_NAMESPACE::Rebind;
38
using hwy::HWY_NAMESPACE::Sub;
39
using hwy::HWY_NAMESPACE::Vec;
40
using hwy::HWY_NAMESPACE::Xor;
41
42
using D = HWY_FULL(float);
43
using DI = HWY_FULL(int32_t);
44
constexpr D d;
45
constexpr DI di;
46
47
using D8 = HWY_CAPPED(float, 8);
48
constexpr D8 d8;
49
50
void DequantBlock(const int16_t* JXL_RESTRICT qblock,
51
                  const float* JXL_RESTRICT dequant,
52
31.8M
                  const float* JXL_RESTRICT biases, float* JXL_RESTRICT block) {
53
416M
  for (size_t k = 0; k < 64; k += Lanes(d)) {
54
384M
    const auto mul = Load(d, dequant + k);
55
384M
    const auto bias = Load(d, biases + k);
56
384M
    const Rebind<int16_t, DI> di16;
57
384M
    const Vec<DI> quant_i = PromoteTo(di, Load(di16, qblock + k));
58
384M
    const Rebind<float, DI> df;
59
384M
    const auto quant = ConvertTo(df, quant_i);
60
384M
    const auto abs_quant = Abs(quant);
61
384M
    const auto not_0 = Gt(abs_quant, Zero(df));
62
384M
    const auto sign_quant = Xor(quant, abs_quant);
63
384M
    const auto biased_quant = Sub(quant, Xor(bias, sign_quant));
64
384M
    const auto deq = IfThenElseZero(not_0, Mul(biased_quant, mul));
65
384M
    Store(deq, d, block + k);
66
384M
  }
67
31.8M
}
jpegli::N_SSE4::DequantBlock(short const*, float const*, float const*, float*)
Line
Count
Source
52
7.95M
                  const float* JXL_RESTRICT biases, float* JXL_RESTRICT block) {
53
135M
  for (size_t k = 0; k < 64; k += Lanes(d)) {
54
127M
    const auto mul = Load(d, dequant + k);
55
127M
    const auto bias = Load(d, biases + k);
56
127M
    const Rebind<int16_t, DI> di16;
57
127M
    const Vec<DI> quant_i = PromoteTo(di, Load(di16, qblock + k));
58
127M
    const Rebind<float, DI> df;
59
127M
    const auto quant = ConvertTo(df, quant_i);
60
127M
    const auto abs_quant = Abs(quant);
61
127M
    const auto not_0 = Gt(abs_quant, Zero(df));
62
127M
    const auto sign_quant = Xor(quant, abs_quant);
63
127M
    const auto biased_quant = Sub(quant, Xor(bias, sign_quant));
64
127M
    const auto deq = IfThenElseZero(not_0, Mul(biased_quant, mul));
65
127M
    Store(deq, d, block + k);
66
127M
  }
67
7.95M
}
jpegli::N_AVX2::DequantBlock(short const*, float const*, float const*, float*)
Line
Count
Source
52
15.6M
                  const float* JXL_RESTRICT biases, float* JXL_RESTRICT block) {
53
140M
  for (size_t k = 0; k < 64; k += Lanes(d)) {
54
125M
    const auto mul = Load(d, dequant + k);
55
125M
    const auto bias = Load(d, biases + k);
56
125M
    const Rebind<int16_t, DI> di16;
57
125M
    const Vec<DI> quant_i = PromoteTo(di, Load(di16, qblock + k));
58
125M
    const Rebind<float, DI> df;
59
125M
    const auto quant = ConvertTo(df, quant_i);
60
125M
    const auto abs_quant = Abs(quant);
61
125M
    const auto not_0 = Gt(abs_quant, Zero(df));
62
125M
    const auto sign_quant = Xor(quant, abs_quant);
63
125M
    const auto biased_quant = Sub(quant, Xor(bias, sign_quant));
64
125M
    const auto deq = IfThenElseZero(not_0, Mul(biased_quant, mul));
65
125M
    Store(deq, d, block + k);
66
125M
  }
67
15.6M
}
jpegli::N_SSE2::DequantBlock(short const*, float const*, float const*, float*)
Line
Count
Source
52
8.25M
                  const float* JXL_RESTRICT biases, float* JXL_RESTRICT block) {
53
140M
  for (size_t k = 0; k < 64; k += Lanes(d)) {
54
132M
    const auto mul = Load(d, dequant + k);
55
132M
    const auto bias = Load(d, biases + k);
56
132M
    const Rebind<int16_t, DI> di16;
57
132M
    const Vec<DI> quant_i = PromoteTo(di, Load(di16, qblock + k));
58
132M
    const Rebind<float, DI> df;
59
132M
    const auto quant = ConvertTo(df, quant_i);
60
132M
    const auto abs_quant = Abs(quant);
61
132M
    const auto not_0 = Gt(abs_quant, Zero(df));
62
132M
    const auto sign_quant = Xor(quant, abs_quant);
63
132M
    const auto biased_quant = Sub(quant, Xor(bias, sign_quant));
64
132M
    const auto deq = IfThenElseZero(not_0, Mul(biased_quant, mul));
65
132M
    Store(deq, d, block + k);
66
132M
  }
67
8.25M
}
68
69
template <size_t N>
70
void ForwardEvenOdd(const float* JXL_RESTRICT a_in, size_t a_in_stride,
71
288M
                    float* JXL_RESTRICT a_out) {
72
1.05G
  for (size_t i = 0; i < N / 2; i++) {
73
768M
    auto in1 = LoadU(d8, a_in + 2 * i * a_in_stride);
74
768M
    Store(in1, d8, a_out + i * 8);
75
768M
  }
76
1.05G
  for (size_t i = N / 2; i < N; i++) {
77
768M
    auto in1 = LoadU(d8, a_in + (2 * (i - N / 2) + 1) * a_in_stride);
78
768M
    Store(in1, d8, a_out + i * 8);
79
768M
  }
80
288M
}
void jpegli::N_SSE4::ForwardEvenOdd<8ul>(float const*, unsigned long, float*)
Line
Count
Source
71
31.8M
                    float* JXL_RESTRICT a_out) {
72
159M
  for (size_t i = 0; i < N / 2; i++) {
73
127M
    auto in1 = LoadU(d8, a_in + 2 * i * a_in_stride);
74
127M
    Store(in1, d8, a_out + i * 8);
75
127M
  }
76
159M
  for (size_t i = N / 2; i < N; i++) {
77
127M
    auto in1 = LoadU(d8, a_in + (2 * (i - N / 2) + 1) * a_in_stride);
78
127M
    Store(in1, d8, a_out + i * 8);
79
127M
  }
80
31.8M
}
void jpegli::N_SSE4::ForwardEvenOdd<4ul>(float const*, unsigned long, float*)
Line
Count
Source
71
63.6M
                    float* JXL_RESTRICT a_out) {
72
190M
  for (size_t i = 0; i < N / 2; i++) {
73
127M
    auto in1 = LoadU(d8, a_in + 2 * i * a_in_stride);
74
127M
    Store(in1, d8, a_out + i * 8);
75
127M
  }
76
190M
  for (size_t i = N / 2; i < N; i++) {
77
127M
    auto in1 = LoadU(d8, a_in + (2 * (i - N / 2) + 1) * a_in_stride);
78
127M
    Store(in1, d8, a_out + i * 8);
79
127M
  }
80
63.6M
}
void jpegli::N_AVX2::ForwardEvenOdd<8ul>(float const*, unsigned long, float*)
Line
Count
Source
71
31.2M
                    float* JXL_RESTRICT a_out) {
72
156M
  for (size_t i = 0; i < N / 2; i++) {
73
125M
    auto in1 = LoadU(d8, a_in + 2 * i * a_in_stride);
74
125M
    Store(in1, d8, a_out + i * 8);
75
125M
  }
76
156M
  for (size_t i = N / 2; i < N; i++) {
77
125M
    auto in1 = LoadU(d8, a_in + (2 * (i - N / 2) + 1) * a_in_stride);
78
125M
    Store(in1, d8, a_out + i * 8);
79
125M
  }
80
31.2M
}
void jpegli::N_AVX2::ForwardEvenOdd<4ul>(float const*, unsigned long, float*)
Line
Count
Source
71
62.5M
                    float* JXL_RESTRICT a_out) {
72
187M
  for (size_t i = 0; i < N / 2; i++) {
73
125M
    auto in1 = LoadU(d8, a_in + 2 * i * a_in_stride);
74
125M
    Store(in1, d8, a_out + i * 8);
75
125M
  }
76
187M
  for (size_t i = N / 2; i < N; i++) {
77
125M
    auto in1 = LoadU(d8, a_in + (2 * (i - N / 2) + 1) * a_in_stride);
78
125M
    Store(in1, d8, a_out + i * 8);
79
125M
  }
80
62.5M
}
void jpegli::N_SSE2::ForwardEvenOdd<8ul>(float const*, unsigned long, float*)
Line
Count
Source
71
33.0M
                    float* JXL_RESTRICT a_out) {
72
165M
  for (size_t i = 0; i < N / 2; i++) {
73
132M
    auto in1 = LoadU(d8, a_in + 2 * i * a_in_stride);
74
132M
    Store(in1, d8, a_out + i * 8);
75
132M
  }
76
165M
  for (size_t i = N / 2; i < N; i++) {
77
132M
    auto in1 = LoadU(d8, a_in + (2 * (i - N / 2) + 1) * a_in_stride);
78
132M
    Store(in1, d8, a_out + i * 8);
79
132M
  }
80
33.0M
}
void jpegli::N_SSE2::ForwardEvenOdd<4ul>(float const*, unsigned long, float*)
Line
Count
Source
71
66.0M
                    float* JXL_RESTRICT a_out) {
72
198M
  for (size_t i = 0; i < N / 2; i++) {
73
132M
    auto in1 = LoadU(d8, a_in + 2 * i * a_in_stride);
74
132M
    Store(in1, d8, a_out + i * 8);
75
132M
  }
76
198M
  for (size_t i = N / 2; i < N; i++) {
77
132M
    auto in1 = LoadU(d8, a_in + (2 * (i - N / 2) + 1) * a_in_stride);
78
132M
    Store(in1, d8, a_out + i * 8);
79
132M
  }
80
66.0M
}
81
82
template <size_t N>
83
288M
void BTranspose(float* JXL_RESTRICT coeff) {
84
768M
  for (size_t i = N - 1; i > 0; i--) {
85
480M
    auto in1 = Load(d8, coeff + i * 8);
86
480M
    auto in2 = Load(d8, coeff + (i - 1) * 8);
87
480M
    Store(Add(in1, in2), d8, coeff + i * 8);
88
480M
  }
89
288M
  constexpr float kSqrt2 = 1.41421356237f;
90
288M
  auto sqrt2 = Set(d8, kSqrt2);
91
288M
  auto in1 = Load(d8, coeff);
92
288M
  Store(Mul(in1, sqrt2), d8, coeff);
93
288M
}
void jpegli::N_SSE4::BTranspose<2ul>(float*)
Line
Count
Source
83
63.6M
void BTranspose(float* JXL_RESTRICT coeff) {
84
127M
  for (size_t i = N - 1; i > 0; i--) {
85
63.6M
    auto in1 = Load(d8, coeff + i * 8);
86
63.6M
    auto in2 = Load(d8, coeff + (i - 1) * 8);
87
63.6M
    Store(Add(in1, in2), d8, coeff + i * 8);
88
63.6M
  }
89
63.6M
  constexpr float kSqrt2 = 1.41421356237f;
90
63.6M
  auto sqrt2 = Set(d8, kSqrt2);
91
63.6M
  auto in1 = Load(d8, coeff);
92
63.6M
  Store(Mul(in1, sqrt2), d8, coeff);
93
63.6M
}
void jpegli::N_SSE4::BTranspose<4ul>(float*)
Line
Count
Source
83
31.8M
void BTranspose(float* JXL_RESTRICT coeff) {
84
127M
  for (size_t i = N - 1; i > 0; i--) {
85
95.4M
    auto in1 = Load(d8, coeff + i * 8);
86
95.4M
    auto in2 = Load(d8, coeff + (i - 1) * 8);
87
95.4M
    Store(Add(in1, in2), d8, coeff + i * 8);
88
95.4M
  }
89
31.8M
  constexpr float kSqrt2 = 1.41421356237f;
90
31.8M
  auto sqrt2 = Set(d8, kSqrt2);
91
31.8M
  auto in1 = Load(d8, coeff);
92
31.8M
  Store(Mul(in1, sqrt2), d8, coeff);
93
31.8M
}
void jpegli::N_AVX2::BTranspose<2ul>(float*)
Line
Count
Source
83
62.5M
void BTranspose(float* JXL_RESTRICT coeff) {
84
125M
  for (size_t i = N - 1; i > 0; i--) {
85
62.5M
    auto in1 = Load(d8, coeff + i * 8);
86
62.5M
    auto in2 = Load(d8, coeff + (i - 1) * 8);
87
62.5M
    Store(Add(in1, in2), d8, coeff + i * 8);
88
62.5M
  }
89
62.5M
  constexpr float kSqrt2 = 1.41421356237f;
90
62.5M
  auto sqrt2 = Set(d8, kSqrt2);
91
62.5M
  auto in1 = Load(d8, coeff);
92
62.5M
  Store(Mul(in1, sqrt2), d8, coeff);
93
62.5M
}
void jpegli::N_AVX2::BTranspose<4ul>(float*)
Line
Count
Source
83
31.2M
void BTranspose(float* JXL_RESTRICT coeff) {
84
125M
  for (size_t i = N - 1; i > 0; i--) {
85
93.7M
    auto in1 = Load(d8, coeff + i * 8);
86
93.7M
    auto in2 = Load(d8, coeff + (i - 1) * 8);
87
93.7M
    Store(Add(in1, in2), d8, coeff + i * 8);
88
93.7M
  }
89
31.2M
  constexpr float kSqrt2 = 1.41421356237f;
90
31.2M
  auto sqrt2 = Set(d8, kSqrt2);
91
31.2M
  auto in1 = Load(d8, coeff);
92
31.2M
  Store(Mul(in1, sqrt2), d8, coeff);
93
31.2M
}
void jpegli::N_SSE2::BTranspose<2ul>(float*)
Line
Count
Source
83
66.0M
void BTranspose(float* JXL_RESTRICT coeff) {
84
132M
  for (size_t i = N - 1; i > 0; i--) {
85
66.0M
    auto in1 = Load(d8, coeff + i * 8);
86
66.0M
    auto in2 = Load(d8, coeff + (i - 1) * 8);
87
66.0M
    Store(Add(in1, in2), d8, coeff + i * 8);
88
66.0M
  }
89
66.0M
  constexpr float kSqrt2 = 1.41421356237f;
90
66.0M
  auto sqrt2 = Set(d8, kSqrt2);
91
66.0M
  auto in1 = Load(d8, coeff);
92
66.0M
  Store(Mul(in1, sqrt2), d8, coeff);
93
66.0M
}
void jpegli::N_SSE2::BTranspose<4ul>(float*)
Line
Count
Source
83
33.0M
void BTranspose(float* JXL_RESTRICT coeff) {
84
132M
  for (size_t i = N - 1; i > 0; i--) {
85
99.1M
    auto in1 = Load(d8, coeff + i * 8);
86
99.1M
    auto in2 = Load(d8, coeff + (i - 1) * 8);
87
99.1M
    Store(Add(in1, in2), d8, coeff + i * 8);
88
99.1M
  }
89
33.0M
  constexpr float kSqrt2 = 1.41421356237f;
90
33.0M
  auto sqrt2 = Set(d8, kSqrt2);
91
33.0M
  auto in1 = Load(d8, coeff);
92
33.0M
  Store(Mul(in1, sqrt2), d8, coeff);
93
33.0M
}
94
95
// Constants for DCT implementation. Generated by the following snippet:
96
// for i in range(N // 2):
97
//    print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ")
98
template <size_t N>
99
struct WcMultipliers;
100
101
template <>
102
struct WcMultipliers<4> {
103
  static constexpr float kMultipliers[] = {
104
      0.541196100146197,
105
      1.3065629648763764,
106
  };
107
};
108
109
template <>
110
struct WcMultipliers<8> {
111
  static constexpr float kMultipliers[] = {
112
      0.5097955791041592,
113
      0.6013448869350453,
114
      0.8999762231364156,
115
      2.5629154477415055,
116
  };
117
};
118
119
#if JXL_CXX_LANG < JXL_CXX_17
120
constexpr float WcMultipliers<4>::kMultipliers[];
121
constexpr float WcMultipliers<8>::kMultipliers[];
122
#endif
123
124
template <size_t N>
125
void MultiplyAndAdd(const float* JXL_RESTRICT coeff, float* JXL_RESTRICT out,
126
288M
                    size_t out_stride) {
127
1.05G
  for (size_t i = 0; i < N / 2; i++) {
128
768M
    auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]);
129
768M
    auto in1 = Load(d8, coeff + i * 8);
130
768M
    auto in2 = Load(d8, coeff + (N / 2 + i) * 8);
131
768M
    auto out1 = MulAdd(mul, in2, in1);
132
768M
    auto out2 = NegMulAdd(mul, in2, in1);
133
768M
    StoreU(out1, d8, out + i * out_stride);
134
768M
    StoreU(out2, d8, out + (N - i - 1) * out_stride);
135
768M
  }
136
288M
}
void jpegli::N_SSE4::MultiplyAndAdd<4ul>(float const*, float*, unsigned long)
Line
Count
Source
126
63.6M
                    size_t out_stride) {
127
190M
  for (size_t i = 0; i < N / 2; i++) {
128
127M
    auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]);
129
127M
    auto in1 = Load(d8, coeff + i * 8);
130
127M
    auto in2 = Load(d8, coeff + (N / 2 + i) * 8);
131
127M
    auto out1 = MulAdd(mul, in2, in1);
132
127M
    auto out2 = NegMulAdd(mul, in2, in1);
133
127M
    StoreU(out1, d8, out + i * out_stride);
134
127M
    StoreU(out2, d8, out + (N - i - 1) * out_stride);
135
127M
  }
136
63.6M
}
void jpegli::N_SSE4::MultiplyAndAdd<8ul>(float const*, float*, unsigned long)
Line
Count
Source
126
31.8M
                    size_t out_stride) {
127
159M
  for (size_t i = 0; i < N / 2; i++) {
128
127M
    auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]);
129
127M
    auto in1 = Load(d8, coeff + i * 8);
130
127M
    auto in2 = Load(d8, coeff + (N / 2 + i) * 8);
131
127M
    auto out1 = MulAdd(mul, in2, in1);
132
127M
    auto out2 = NegMulAdd(mul, in2, in1);
133
127M
    StoreU(out1, d8, out + i * out_stride);
134
127M
    StoreU(out2, d8, out + (N - i - 1) * out_stride);
135
127M
  }
136
31.8M
}
void jpegli::N_AVX2::MultiplyAndAdd<4ul>(float const*, float*, unsigned long)
Line
Count
Source
126
62.5M
                    size_t out_stride) {
127
187M
  for (size_t i = 0; i < N / 2; i++) {
128
125M
    auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]);
129
125M
    auto in1 = Load(d8, coeff + i * 8);
130
125M
    auto in2 = Load(d8, coeff + (N / 2 + i) * 8);
131
125M
    auto out1 = MulAdd(mul, in2, in1);
132
125M
    auto out2 = NegMulAdd(mul, in2, in1);
133
125M
    StoreU(out1, d8, out + i * out_stride);
134
125M
    StoreU(out2, d8, out + (N - i - 1) * out_stride);
135
125M
  }
136
62.5M
}
void jpegli::N_AVX2::MultiplyAndAdd<8ul>(float const*, float*, unsigned long)
Line
Count
Source
126
31.2M
                    size_t out_stride) {
127
156M
  for (size_t i = 0; i < N / 2; i++) {
128
125M
    auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]);
129
125M
    auto in1 = Load(d8, coeff + i * 8);
130
125M
    auto in2 = Load(d8, coeff + (N / 2 + i) * 8);
131
125M
    auto out1 = MulAdd(mul, in2, in1);
132
125M
    auto out2 = NegMulAdd(mul, in2, in1);
133
125M
    StoreU(out1, d8, out + i * out_stride);
134
125M
    StoreU(out2, d8, out + (N - i - 1) * out_stride);
135
125M
  }
136
31.2M
}
void jpegli::N_SSE2::MultiplyAndAdd<4ul>(float const*, float*, unsigned long)
Line
Count
Source
126
66.0M
                    size_t out_stride) {
127
198M
  for (size_t i = 0; i < N / 2; i++) {
128
132M
    auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]);
129
132M
    auto in1 = Load(d8, coeff + i * 8);
130
132M
    auto in2 = Load(d8, coeff + (N / 2 + i) * 8);
131
132M
    auto out1 = MulAdd(mul, in2, in1);
132
132M
    auto out2 = NegMulAdd(mul, in2, in1);
133
132M
    StoreU(out1, d8, out + i * out_stride);
134
132M
    StoreU(out2, d8, out + (N - i - 1) * out_stride);
135
132M
  }
136
66.0M
}
void jpegli::N_SSE2::MultiplyAndAdd<8ul>(float const*, float*, unsigned long)
Line
Count
Source
126
33.0M
                    size_t out_stride) {
127
165M
  for (size_t i = 0; i < N / 2; i++) {
128
132M
    auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]);
129
132M
    auto in1 = Load(d8, coeff + i * 8);
130
132M
    auto in2 = Load(d8, coeff + (N / 2 + i) * 8);
131
132M
    auto out1 = MulAdd(mul, in2, in1);
132
132M
    auto out2 = NegMulAdd(mul, in2, in1);
133
132M
    StoreU(out1, d8, out + i * out_stride);
134
132M
    StoreU(out2, d8, out + (N - i - 1) * out_stride);
135
132M
  }
136
33.0M
}
137
138
template <size_t N>
139
struct IDCT1DImpl;
140
141
template <>
142
struct IDCT1DImpl<1> {
143
  JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
144
0
                             size_t to_stride) {
145
0
    StoreU(LoadU(d8, from), d8, to);
146
0
  }
Unexecuted instantiation: jpegli::N_SSE4::IDCT1DImpl<1ul>::operator()(float const*, unsigned long, float*, unsigned long)
Unexecuted instantiation: jpegli::N_AVX2::IDCT1DImpl<1ul>::operator()(float const*, unsigned long, float*, unsigned long)
Unexecuted instantiation: jpegli::N_SSE2::IDCT1DImpl<1ul>::operator()(float const*, unsigned long, float*, unsigned long)
147
};
148
149
template <>
150
struct IDCT1DImpl<2> {
151
  JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
152
384M
                             size_t to_stride) {
153
384M
    JXL_DASSERT(from_stride >= 8);
154
384M
    JXL_DASSERT(to_stride >= 8);
155
384M
    auto in1 = LoadU(d8, from);
156
384M
    auto in2 = LoadU(d8, from + from_stride);
157
384M
    StoreU(Add(in1, in2), d8, to);
158
384M
    StoreU(Sub(in1, in2), d8, to + to_stride);
159
384M
  }
jpegli::N_SSE4::IDCT1DImpl<2ul>::operator()(float const*, unsigned long, float*, unsigned long)
Line
Count
Source
152
127M
                             size_t to_stride) {
153
127M
    JXL_DASSERT(from_stride >= 8);
154
127M
    JXL_DASSERT(to_stride >= 8);
155
127M
    auto in1 = LoadU(d8, from);
156
127M
    auto in2 = LoadU(d8, from + from_stride);
157
127M
    StoreU(Add(in1, in2), d8, to);
158
127M
    StoreU(Sub(in1, in2), d8, to + to_stride);
159
127M
  }
jpegli::N_AVX2::IDCT1DImpl<2ul>::operator()(float const*, unsigned long, float*, unsigned long)
Line
Count
Source
152
125M
                             size_t to_stride) {
153
125M
    JXL_DASSERT(from_stride >= 8);
154
125M
    JXL_DASSERT(to_stride >= 8);
155
125M
    auto in1 = LoadU(d8, from);
156
125M
    auto in2 = LoadU(d8, from + from_stride);
157
125M
    StoreU(Add(in1, in2), d8, to);
158
125M
    StoreU(Sub(in1, in2), d8, to + to_stride);
159
125M
  }
jpegli::N_SSE2::IDCT1DImpl<2ul>::operator()(float const*, unsigned long, float*, unsigned long)
Line
Count
Source
152
132M
                             size_t to_stride) {
153
132M
    JXL_DASSERT(from_stride >= 8);
154
132M
    JXL_DASSERT(to_stride >= 8);
155
132M
    auto in1 = LoadU(d8, from);
156
132M
    auto in2 = LoadU(d8, from + from_stride);
157
132M
    StoreU(Add(in1, in2), d8, to);
158
132M
    StoreU(Sub(in1, in2), d8, to + to_stride);
159
132M
  }
160
};
161
162
template <size_t N>
163
struct IDCT1DImpl {
164
  void operator()(const float* from, size_t from_stride, float* to,
165
288M
                  size_t to_stride) {
166
288M
    JXL_DASSERT(from_stride >= 8);
167
288M
    JXL_DASSERT(to_stride >= 8);
168
288M
    HWY_ALIGN float tmp[64];
169
288M
    ForwardEvenOdd<N>(from, from_stride, tmp);
170
288M
    IDCT1DImpl<N / 2>()(tmp, 8, tmp, 8);
171
288M
    BTranspose<N / 2>(tmp + N * 4);
172
288M
    IDCT1DImpl<N / 2>()(tmp + N * 4, 8, tmp + N * 4, 8);
173
288M
    MultiplyAndAdd<N>(tmp, to, to_stride);
174
288M
  }
jpegli::N_SSE4::IDCT1DImpl<8ul>::operator()(float const*, unsigned long, float*, unsigned long)
Line
Count
Source
165
31.8M
                  size_t to_stride) {
166
31.8M
    JXL_DASSERT(from_stride >= 8);
167
31.8M
    JXL_DASSERT(to_stride >= 8);
168
31.8M
    HWY_ALIGN float tmp[64];
169
31.8M
    ForwardEvenOdd<N>(from, from_stride, tmp);
170
31.8M
    IDCT1DImpl<N / 2>()(tmp, 8, tmp, 8);
171
31.8M
    BTranspose<N / 2>(tmp + N * 4);
172
31.8M
    IDCT1DImpl<N / 2>()(tmp + N * 4, 8, tmp + N * 4, 8);
173
31.8M
    MultiplyAndAdd<N>(tmp, to, to_stride);
174
31.8M
  }
jpegli::N_SSE4::IDCT1DImpl<4ul>::operator()(float const*, unsigned long, float*, unsigned long)
Line
Count
Source
165
63.6M
                  size_t to_stride) {
166
63.6M
    JXL_DASSERT(from_stride >= 8);
167
63.6M
    JXL_DASSERT(to_stride >= 8);
168
63.6M
    HWY_ALIGN float tmp[64];
169
63.6M
    ForwardEvenOdd<N>(from, from_stride, tmp);
170
63.6M
    IDCT1DImpl<N / 2>()(tmp, 8, tmp, 8);
171
63.6M
    BTranspose<N / 2>(tmp + N * 4);
172
63.6M
    IDCT1DImpl<N / 2>()(tmp + N * 4, 8, tmp + N * 4, 8);
173
63.6M
    MultiplyAndAdd<N>(tmp, to, to_stride);
174
63.6M
  }
jpegli::N_AVX2::IDCT1DImpl<8ul>::operator()(float const*, unsigned long, float*, unsigned long)
Line
Count
Source
165
31.2M
                  size_t to_stride) {
166
31.2M
    JXL_DASSERT(from_stride >= 8);
167
31.2M
    JXL_DASSERT(to_stride >= 8);
168
31.2M
    HWY_ALIGN float tmp[64];
169
31.2M
    ForwardEvenOdd<N>(from, from_stride, tmp);
170
31.2M
    IDCT1DImpl<N / 2>()(tmp, 8, tmp, 8);
171
31.2M
    BTranspose<N / 2>(tmp + N * 4);
172
31.2M
    IDCT1DImpl<N / 2>()(tmp + N * 4, 8, tmp + N * 4, 8);
173
31.2M
    MultiplyAndAdd<N>(tmp, to, to_stride);
174
31.2M
  }
jpegli::N_AVX2::IDCT1DImpl<4ul>::operator()(float const*, unsigned long, float*, unsigned long)
Line
Count
Source
165
62.5M
                  size_t to_stride) {
166
62.5M
    JXL_DASSERT(from_stride >= 8);
167
62.5M
    JXL_DASSERT(to_stride >= 8);
168
62.5M
    HWY_ALIGN float tmp[64];
169
62.5M
    ForwardEvenOdd<N>(from, from_stride, tmp);
170
62.5M
    IDCT1DImpl<N / 2>()(tmp, 8, tmp, 8);
171
62.5M
    BTranspose<N / 2>(tmp + N * 4);
172
62.5M
    IDCT1DImpl<N / 2>()(tmp + N * 4, 8, tmp + N * 4, 8);
173
62.5M
    MultiplyAndAdd<N>(tmp, to, to_stride);
174
62.5M
  }
jpegli::N_SSE2::IDCT1DImpl<8ul>::operator()(float const*, unsigned long, float*, unsigned long)
Line
Count
Source
165
33.0M
                  size_t to_stride) {
166
33.0M
    JXL_DASSERT(from_stride >= 8);
167
33.0M
    JXL_DASSERT(to_stride >= 8);
168
33.0M
    HWY_ALIGN float tmp[64];
169
33.0M
    ForwardEvenOdd<N>(from, from_stride, tmp);
170
33.0M
    IDCT1DImpl<N / 2>()(tmp, 8, tmp, 8);
171
33.0M
    BTranspose<N / 2>(tmp + N * 4);
172
33.0M
    IDCT1DImpl<N / 2>()(tmp + N * 4, 8, tmp + N * 4, 8);
173
33.0M
    MultiplyAndAdd<N>(tmp, to, to_stride);
174
33.0M
  }
jpegli::N_SSE2::IDCT1DImpl<4ul>::operator()(float const*, unsigned long, float*, unsigned long)
Line
Count
Source
165
66.0M
                  size_t to_stride) {
166
66.0M
    JXL_DASSERT(from_stride >= 8);
167
66.0M
    JXL_DASSERT(to_stride >= 8);
168
66.0M
    HWY_ALIGN float tmp[64];
169
66.0M
    ForwardEvenOdd<N>(from, from_stride, tmp);
170
66.0M
    IDCT1DImpl<N / 2>()(tmp, 8, tmp, 8);
171
66.0M
    BTranspose<N / 2>(tmp + N * 4);
172
66.0M
    IDCT1DImpl<N / 2>()(tmp + N * 4, 8, tmp + N * 4, 8);
173
66.0M
    MultiplyAndAdd<N>(tmp, to, to_stride);
174
66.0M
  }
175
};
176
177
template <size_t N>
178
void IDCT1D(float* JXL_RESTRICT from, float* JXL_RESTRICT output,
179
63.6M
            size_t output_stride) {
180
159M
  for (size_t i = 0; i < 8; i += Lanes(d8)) {
181
96.1M
    IDCT1DImpl<N>()(from + i, 8, output + i, output_stride);
182
96.1M
  }
183
63.6M
}
void jpegli::N_SSE4::IDCT1D<8ul>(float*, float*, unsigned long)
Line
Count
Source
179
15.9M
            size_t output_stride) {
180
47.7M
  for (size_t i = 0; i < 8; i += Lanes(d8)) {
181
31.8M
    IDCT1DImpl<N>()(from + i, 8, output + i, output_stride);
182
31.8M
  }
183
15.9M
}
void jpegli::N_AVX2::IDCT1D<8ul>(float*, float*, unsigned long)
Line
Count
Source
179
31.2M
            size_t output_stride) {
180
62.5M
  for (size_t i = 0; i < 8; i += Lanes(d8)) {
181
31.2M
    IDCT1DImpl<N>()(from + i, 8, output + i, output_stride);
182
31.2M
  }
183
31.2M
}
void jpegli::N_SSE2::IDCT1D<8ul>(float*, float*, unsigned long)
Line
Count
Source
179
16.5M
            size_t output_stride) {
180
49.5M
  for (size_t i = 0; i < 8; i += Lanes(d8)) {
181
33.0M
    IDCT1DImpl<N>()(from + i, 8, output + i, output_stride);
182
33.0M
  }
183
16.5M
}
184
185
void ComputeScaledIDCT(float* JXL_RESTRICT block0, float* JXL_RESTRICT block1,
186
31.8M
                       float* JXL_RESTRICT output, size_t output_stride) {
187
31.8M
  Transpose8x8Block(block0, block1);
188
31.8M
  IDCT1D<8>(block1, block0, 8);
189
31.8M
  Transpose8x8Block(block0, block1);
190
31.8M
  IDCT1D<8>(block1, output, output_stride);
191
31.8M
}
jpegli::N_SSE4::ComputeScaledIDCT(float*, float*, float*, unsigned long)
Line
Count
Source
186
7.95M
                       float* JXL_RESTRICT output, size_t output_stride) {
187
7.95M
  Transpose8x8Block(block0, block1);
188
7.95M
  IDCT1D<8>(block1, block0, 8);
189
7.95M
  Transpose8x8Block(block0, block1);
190
7.95M
  IDCT1D<8>(block1, output, output_stride);
191
7.95M
}
jpegli::N_AVX2::ComputeScaledIDCT(float*, float*, float*, unsigned long)
Line
Count
Source
186
15.6M
                       float* JXL_RESTRICT output, size_t output_stride) {
187
15.6M
  Transpose8x8Block(block0, block1);
188
15.6M
  IDCT1D<8>(block1, block0, 8);
189
15.6M
  Transpose8x8Block(block0, block1);
190
15.6M
  IDCT1D<8>(block1, output, output_stride);
191
15.6M
}
jpegli::N_SSE2::ComputeScaledIDCT(float*, float*, float*, unsigned long)
Line
Count
Source
186
8.25M
                       float* JXL_RESTRICT output, size_t output_stride) {
187
8.25M
  Transpose8x8Block(block0, block1);
188
8.25M
  IDCT1D<8>(block1, block0, 8);
189
8.25M
  Transpose8x8Block(block0, block1);
190
8.25M
  IDCT1D<8>(block1, output, output_stride);
191
8.25M
}
192
193
void InverseTransformBlock8x8(const int16_t* JXL_RESTRICT qblock,
194
                              const float* JXL_RESTRICT dequant,
195
                              const float* JXL_RESTRICT biases,
196
                              float* JXL_RESTRICT scratch_space,
197
                              float* JXL_RESTRICT output, size_t output_stride,
198
31.8M
                              size_t dctsize) {
199
31.8M
  float* JXL_RESTRICT block0 = scratch_space;
200
31.8M
  float* JXL_RESTRICT block1 = scratch_space + DCTSIZE2;
201
31.8M
  DequantBlock(qblock, dequant, biases, block0);
202
31.8M
  ComputeScaledIDCT(block0, block1, output, output_stride);
203
31.8M
}
jpegli::N_SSE4::InverseTransformBlock8x8(short const*, float const*, float const*, float*, float*, unsigned long, unsigned long)
Line
Count
Source
198
7.95M
                              size_t dctsize) {
199
7.95M
  float* JXL_RESTRICT block0 = scratch_space;
200
7.95M
  float* JXL_RESTRICT block1 = scratch_space + DCTSIZE2;
201
7.95M
  DequantBlock(qblock, dequant, biases, block0);
202
7.95M
  ComputeScaledIDCT(block0, block1, output, output_stride);
203
7.95M
}
jpegli::N_AVX2::InverseTransformBlock8x8(short const*, float const*, float const*, float*, float*, unsigned long, unsigned long)
Line
Count
Source
198
15.6M
                              size_t dctsize) {
199
15.6M
  float* JXL_RESTRICT block0 = scratch_space;
200
15.6M
  float* JXL_RESTRICT block1 = scratch_space + DCTSIZE2;
201
15.6M
  DequantBlock(qblock, dequant, biases, block0);
202
15.6M
  ComputeScaledIDCT(block0, block1, output, output_stride);
203
15.6M
}
jpegli::N_SSE2::InverseTransformBlock8x8(short const*, float const*, float const*, float*, float*, unsigned long, unsigned long)
Line
Count
Source
198
8.25M
                              size_t dctsize) {
199
8.25M
  float* JXL_RESTRICT block0 = scratch_space;
200
8.25M
  float* JXL_RESTRICT block1 = scratch_space + DCTSIZE2;
201
8.25M
  DequantBlock(qblock, dequant, biases, block0);
202
8.25M
  ComputeScaledIDCT(block0, block1, output, output_stride);
203
8.25M
}
204
205
// Computes the N-point IDCT of in[], and stores the result in out[]. The in[]
206
// array is at most 8 values long, values in[8:N-1] are assumed to be 0.
//
// Parameters:
//   in  - input coefficients; at most the first min(N, 8) entries are read.
//   out - receives exactly N output samples.
//   N   - transform size. Handled values are 3, 5, 6, 7 and 9 through 16; any
//         other value falls into the default branch, which stores nothing.
//
// Each case uses a table kC<N>[] whose entries agree numerically with
// sqrt(2) * cos(k * pi / (2 * N)). in[0] always enters with weight 1, and
// terms whose cosine factor is zero are simply left out of the sums.
207
0
void Compute1dIDCT(const float* in, float* out, size_t N) {
208
0
  switch (N) {
209
0
    case 3: {
210
0
      static constexpr float kC3[3] = {
211
0
          1.414213562373,
212
0
          1.224744871392,
213
0
          0.707106781187,
214
0
      };
215
0
      // Pattern shared by every case below: "even" sums use in[0], in[2], ...
      // and "odd" sums use in[1], in[3], ...; output n is even_n + odd_n and
      // its mirror output N-1-n is even_n - odd_n.
      float even0 = in[0] + kC3[2] * in[2];
216
0
      float even1 = in[0] - kC3[0] * in[2];
217
0
      float odd0 = kC3[1] * in[1];
218
0
      out[0] = even0 + odd0;
219
0
      out[2] = even0 - odd0;
220
0
      // For odd N the centre output is its own mirror, so it has no odd part.
      out[1] = even1;
221
0
      break;
222
0
    }
223
0
    case 5: {
224
0
      static constexpr float kC5[5] = {
225
0
          1.414213562373, 1.344997023928, 1.144122805635,
226
0
          0.831253875555, 0.437016024449,
227
0
      };
228
0
      float even0 = in[0] + kC5[2] * in[2] + kC5[4] * in[4];
229
0
      float even1 = in[0] - kC5[4] * in[2] - kC5[2] * in[4];
230
0
      float even2 = in[0] - kC5[0] * in[2] + kC5[0] * in[4];
231
0
      float odd0 = kC5[1] * in[1] + kC5[3] * in[3];
232
0
      float odd1 = kC5[3] * in[1] - kC5[1] * in[3];
233
0
      out[0] = even0 + odd0;
234
0
      out[4] = even0 - odd0;
235
0
      out[1] = even1 + odd1;
236
0
      out[3] = even1 - odd1;
237
0
      out[2] = even2;
238
0
      break;
239
0
    }
240
0
    case 6: {
241
0
      static constexpr float kC6[6] = {
242
0
          1.414213562373, 1.366025403784, 1.224744871392,
243
0
          1.000000000000, 0.707106781187, 0.366025403784,
244
0
      };
245
0
      float even0 = in[0] + kC6[2] * in[2] + kC6[4] * in[4];
246
0
      float even1 = in[0] - kC6[0] * in[4];
247
0
      float even2 = in[0] - kC6[2] * in[2] + kC6[4] * in[4];
248
0
      float odd0 = kC6[1] * in[1] + kC6[3] * in[3] + kC6[5] * in[5];
249
0
      float odd1 = kC6[3] * in[1] - kC6[3] * in[3] - kC6[3] * in[5];
250
0
      float odd2 = kC6[5] * in[1] - kC6[3] * in[3] + kC6[1] * in[5];
251
0
      out[0] = even0 + odd0;
252
0
      out[5] = even0 - odd0;
253
0
      out[1] = even1 + odd1;
254
0
      out[4] = even1 - odd1;
255
0
      out[2] = even2 + odd2;
256
0
      out[3] = even2 - odd2;
257
0
      break;
258
0
    }
259
0
    case 7: {
260
0
      static constexpr float kC7[7] = {
261
0
          1.414213562373, 1.378756275744, 1.274162392264, 1.105676685997,
262
0
          0.881747733790, 0.613604268353, 0.314692122713,
263
0
      };
264
0
      float even0 = in[0] + kC7[2] * in[2] + kC7[4] * in[4] + kC7[6] * in[6];
265
0
      float even1 = in[0] + kC7[6] * in[2] - kC7[2] * in[4] - kC7[4] * in[6];
266
0
      float even2 = in[0] - kC7[4] * in[2] - kC7[6] * in[4] + kC7[2] * in[6];
267
0
      float even3 = in[0] - kC7[0] * in[2] + kC7[0] * in[4] - kC7[0] * in[6];
268
0
      float odd0 = kC7[1] * in[1] + kC7[3] * in[3] + kC7[5] * in[5];
269
0
      float odd1 = kC7[3] * in[1] - kC7[5] * in[3] - kC7[1] * in[5];
270
0
      float odd2 = kC7[5] * in[1] - kC7[1] * in[3] + kC7[3] * in[5];
271
0
      out[0] = even0 + odd0;
272
0
      out[6] = even0 - odd0;
273
0
      out[1] = even1 + odd1;
274
0
      out[5] = even1 - odd1;
275
0
      out[2] = even2 + odd2;
276
0
      out[4] = even2 - odd2;
277
0
      out[3] = even3;
278
0
      break;
279
0
    }
280
0
    case 9: {
281
0
      static constexpr float kC9[9] = {
282
0
          1.414213562373, 1.392728480640, 1.328926048777,
283
0
          1.224744871392, 1.083350440839, 0.909038955344,
284
0
          0.707106781187, 0.483689525296, 0.245575607938,
285
0
      };
286
0
      float even0 = in[0] + kC9[2] * in[2] + kC9[4] * in[4] + kC9[6] * in[6];
287
0
      float even1 = in[0] + kC9[6] * in[2] - kC9[6] * in[4] - kC9[0] * in[6];
288
0
      float even2 = in[0] - kC9[8] * in[2] - kC9[2] * in[4] + kC9[6] * in[6];
289
0
      float even3 = in[0] - kC9[4] * in[2] + kC9[8] * in[4] + kC9[6] * in[6];
290
0
      float even4 = in[0] - kC9[0] * in[2] + kC9[0] * in[4] - kC9[0] * in[6];
291
0
      float odd0 =
292
0
          kC9[1] * in[1] + kC9[3] * in[3] + kC9[5] * in[5] + kC9[7] * in[7];
293
0
      float odd1 = kC9[3] * in[1] - kC9[3] * in[5] - kC9[3] * in[7];
294
0
      float odd2 =
295
0
          kC9[5] * in[1] - kC9[3] * in[3] - kC9[7] * in[5] + kC9[1] * in[7];
296
0
      float odd3 =
297
0
          kC9[7] * in[1] - kC9[3] * in[3] + kC9[1] * in[5] - kC9[5] * in[7];
298
0
      out[0] = even0 + odd0;
299
0
      out[8] = even0 - odd0;
300
0
      out[1] = even1 + odd1;
301
0
      out[7] = even1 - odd1;
302
0
      out[2] = even2 + odd2;
303
0
      out[6] = even2 - odd2;
304
0
      out[3] = even3 + odd3;
305
0
      out[5] = even3 - odd3;
306
0
      out[4] = even4;
307
0
      break;
308
0
    }
309
0
    case 10: {
310
0
      static constexpr float kC10[10] = {
311
0
          1.414213562373, 1.396802246667, 1.344997023928, 1.260073510670,
312
0
          1.144122805635, 1.000000000000, 0.831253875555, 0.642039521920,
313
0
          0.437016024449, 0.221231742082,
314
0
      };
315
0
      float even0 = in[0] + kC10[2] * in[2] + kC10[4] * in[4] + kC10[6] * in[6];
316
0
      float even1 = in[0] + kC10[6] * in[2] - kC10[8] * in[4] - kC10[2] * in[6];
317
0
      float even2 = in[0] - kC10[0] * in[4];
318
0
      float even3 = in[0] - kC10[6] * in[2] - kC10[8] * in[4] + kC10[2] * in[6];
319
0
      float even4 = in[0] - kC10[2] * in[2] + kC10[4] * in[4] - kC10[6] * in[6];
320
0
      float odd0 =
321
0
          kC10[1] * in[1] + kC10[3] * in[3] + kC10[5] * in[5] + kC10[7] * in[7];
322
0
      float odd1 =
323
0
          kC10[3] * in[1] + kC10[9] * in[3] - kC10[5] * in[5] - kC10[1] * in[7];
324
0
      float odd2 =
325
0
          kC10[5] * in[1] - kC10[5] * in[3] - kC10[5] * in[5] + kC10[5] * in[7];
326
0
      float odd3 =
327
0
          kC10[7] * in[1] - kC10[1] * in[3] + kC10[5] * in[5] + kC10[9] * in[7];
328
0
      float odd4 =
329
0
          kC10[9] * in[1] - kC10[7] * in[3] + kC10[5] * in[5] - kC10[3] * in[7];
330
0
      out[0] = even0 + odd0;
331
0
      out[9] = even0 - odd0;
332
0
      out[1] = even1 + odd1;
333
0
      out[8] = even1 - odd1;
334
0
      out[2] = even2 + odd2;
335
0
      out[7] = even2 - odd2;
336
0
      out[3] = even3 + odd3;
337
0
      out[6] = even3 - odd3;
338
0
      out[4] = even4 + odd4;
339
0
      out[5] = even4 - odd4;
340
0
      break;
341
0
    }
342
0
    case 11: {
343
0
      static constexpr float kC11[11] = {
344
0
          1.414213562373, 1.399818907436, 1.356927976287, 1.286413904599,
345
0
          1.189712155524, 1.068791297809, 0.926112931411, 0.764581576418,
346
0
          0.587485545401, 0.398430002847, 0.201263574413,
347
0
      };
348
0
      float even0 = in[0] + kC11[2] * in[2] + kC11[4] * in[4] + kC11[6] * in[6];
349
0
      float even1 =
350
0
          in[0] + kC11[6] * in[2] - kC11[10] * in[4] - kC11[4] * in[6];
351
0
      float even2 =
352
0
          in[0] + kC11[10] * in[2] - kC11[2] * in[4] - kC11[8] * in[6];
353
0
      float even3 = in[0] - kC11[8] * in[2] - kC11[6] * in[4] + kC11[2] * in[6];
354
0
      float even4 =
355
0
          in[0] - kC11[4] * in[2] + kC11[8] * in[4] + kC11[10] * in[6];
356
0
      float even5 = in[0] - kC11[0] * in[2] + kC11[0] * in[4] - kC11[0] * in[6];
357
0
      float odd0 =
358
0
          kC11[1] * in[1] + kC11[3] * in[3] + kC11[5] * in[5] + kC11[7] * in[7];
359
0
      float odd1 =
360
0
          kC11[3] * in[1] + kC11[9] * in[3] - kC11[7] * in[5] - kC11[1] * in[7];
361
0
      float odd2 =
362
0
          kC11[5] * in[1] - kC11[7] * in[3] - kC11[3] * in[5] + kC11[9] * in[7];
363
0
      float odd3 =
364
0
          kC11[7] * in[1] - kC11[1] * in[3] + kC11[9] * in[5] + kC11[5] * in[7];
365
0
      float odd4 =
366
0
          kC11[9] * in[1] - kC11[5] * in[3] + kC11[1] * in[5] - kC11[3] * in[7];
367
0
      out[0] = even0 + odd0;
368
0
      out[10] = even0 - odd0;
369
0
      out[1] = even1 + odd1;
370
0
      out[9] = even1 - odd1;
371
0
      out[2] = even2 + odd2;
372
0
      out[8] = even2 - odd2;
373
0
      out[3] = even3 + odd3;
374
0
      out[7] = even3 - odd3;
375
0
      out[4] = even4 + odd4;
376
0
      out[6] = even4 - odd4;
377
0
      out[5] = even5;
378
0
      break;
379
0
    }
380
0
    case 12: {
381
0
      static constexpr float kC12[12] = {
382
0
          1.414213562373, 1.402114769300, 1.366025403784, 1.306562964876,
383
0
          1.224744871392, 1.121971053594, 1.000000000000, 0.860918669154,
384
0
          0.707106781187, 0.541196100146, 0.366025403784, 0.184591911283,
385
0
      };
386
0
      float even0 = in[0] + kC12[2] * in[2] + kC12[4] * in[4] + kC12[6] * in[6];
387
0
      float even1 = in[0] + kC12[6] * in[2] - kC12[6] * in[6];
388
0
      float even2 =
389
0
          in[0] + kC12[10] * in[2] - kC12[4] * in[4] - kC12[6] * in[6];
390
0
      float even3 =
391
0
          in[0] - kC12[10] * in[2] - kC12[4] * in[4] + kC12[6] * in[6];
392
0
      float even4 = in[0] - kC12[6] * in[2] + kC12[6] * in[6];
393
0
      float even5 = in[0] - kC12[2] * in[2] + kC12[4] * in[4] - kC12[6] * in[6];
394
0
      float odd0 =
395
0
          kC12[1] * in[1] + kC12[3] * in[3] + kC12[5] * in[5] + kC12[7] * in[7];
396
0
      float odd1 =
397
0
          kC12[3] * in[1] + kC12[9] * in[3] - kC12[9] * in[5] - kC12[3] * in[7];
398
0
      float odd2 = kC12[5] * in[1] - kC12[9] * in[3] - kC12[1] * in[5] -
399
0
                   kC12[11] * in[7];
400
0
      float odd3 = kC12[7] * in[1] - kC12[3] * in[3] - kC12[11] * in[5] +
401
0
                   kC12[1] * in[7];
402
0
      float odd4 =
403
0
          kC12[9] * in[1] - kC12[3] * in[3] + kC12[3] * in[5] - kC12[9] * in[7];
404
0
      float odd5 = kC12[11] * in[1] - kC12[9] * in[3] + kC12[7] * in[5] -
405
0
                   kC12[5] * in[7];
406
0
      out[0] = even0 + odd0;
407
0
      out[11] = even0 - odd0;
408
0
      out[1] = even1 + odd1;
409
0
      out[10] = even1 - odd1;
410
0
      out[2] = even2 + odd2;
411
0
      out[9] = even2 - odd2;
412
0
      out[3] = even3 + odd3;
413
0
      out[8] = even3 - odd3;
414
0
      out[4] = even4 + odd4;
415
0
      out[7] = even4 - odd4;
416
0
      out[5] = even5 + odd5;
417
0
      out[6] = even5 - odd5;
418
0
      break;
419
0
    }
420
0
    case 13: {
421
0
      static constexpr float kC13[13] = {
422
0
          1.414213562373, 1.403902353238, 1.373119086479, 1.322312651445,
423
0
          1.252223920364, 1.163874944761, 1.058554051646, 0.937797056801,
424
0
          0.803364869133, 0.657217812653, 0.501487040539, 0.338443458124,
425
0
          0.170464607981,
426
0
      };
427
0
      float even0 = in[0] + kC13[2] * in[2] + kC13[4] * in[4] + kC13[6] * in[6];
428
0
      float even1 =
429
0
          in[0] + kC13[6] * in[2] + kC13[12] * in[4] - kC13[8] * in[6];
430
0
      float even2 =
431
0
          in[0] + kC13[10] * in[2] - kC13[6] * in[4] - kC13[4] * in[6];
432
0
      float even3 =
433
0
          in[0] - kC13[12] * in[2] - kC13[2] * in[4] + kC13[10] * in[6];
434
0
      float even4 =
435
0
          in[0] - kC13[8] * in[2] - kC13[10] * in[4] + kC13[2] * in[6];
436
0
      float even5 =
437
0
          in[0] - kC13[4] * in[2] + kC13[8] * in[4] - kC13[12] * in[6];
438
0
      float even6 = in[0] - kC13[0] * in[2] + kC13[0] * in[4] - kC13[0] * in[6];
439
0
      float odd0 =
440
0
          kC13[1] * in[1] + kC13[3] * in[3] + kC13[5] * in[5] + kC13[7] * in[7];
441
0
      float odd1 = kC13[3] * in[1] + kC13[9] * in[3] - kC13[11] * in[5] -
442
0
                   kC13[5] * in[7];
443
0
      float odd2 = kC13[5] * in[1] - kC13[11] * in[3] - kC13[1] * in[5] -
444
0
                   kC13[9] * in[7];
445
0
      float odd3 =
446
0
          kC13[7] * in[1] - kC13[5] * in[3] - kC13[9] * in[5] + kC13[3] * in[7];
447
0
      float odd4 = kC13[9] * in[1] - kC13[1] * in[3] + kC13[7] * in[5] +
448
0
                   kC13[11] * in[7];
449
0
      float odd5 = kC13[11] * in[1] - kC13[7] * in[3] + kC13[3] * in[5] -
450
0
                   kC13[1] * in[7];
451
0
      out[0] = even0 + odd0;
452
0
      out[12] = even0 - odd0;
453
0
      out[1] = even1 + odd1;
454
0
      out[11] = even1 - odd1;
455
0
      out[2] = even2 + odd2;
456
0
      out[10] = even2 - odd2;
457
0
      out[3] = even3 + odd3;
458
0
      out[9] = even3 - odd3;
459
0
      out[4] = even4 + odd4;
460
0
      out[8] = even4 - odd4;
461
0
      out[5] = even5 + odd5;
462
0
      out[7] = even5 - odd5;
463
0
      out[6] = even6;
464
0
      break;
465
0
    }
466
0
    case 14: {
467
0
      static constexpr float kC14[14] = {
468
0
          1.414213562373, 1.405321284327, 1.378756275744, 1.334852607020,
469
0
          1.274162392264, 1.197448846138, 1.105676685997, 1.000000000000,
470
0
          0.881747733790, 0.752406978226, 0.613604268353, 0.467085128785,
471
0
          0.314692122713, 0.158341680609,
472
0
      };
473
0
      float even0 = in[0] + kC14[2] * in[2] + kC14[4] * in[4] + kC14[6] * in[6];
474
0
      float even1 =
475
0
          in[0] + kC14[6] * in[2] + kC14[12] * in[4] - kC14[10] * in[6];
476
0
      float even2 =
477
0
          in[0] + kC14[10] * in[2] - kC14[8] * in[4] - kC14[2] * in[6];
478
0
      float even3 = in[0] - kC14[0] * in[4];
479
0
      float even4 =
480
0
          in[0] - kC14[10] * in[2] - kC14[8] * in[4] + kC14[2] * in[6];
481
0
      float even5 =
482
0
          in[0] - kC14[6] * in[2] + kC14[12] * in[4] + kC14[10] * in[6];
483
0
      float even6 = in[0] - kC14[2] * in[2] + kC14[4] * in[4] - kC14[6] * in[6];
484
0
      float odd0 =
485
0
          kC14[1] * in[1] + kC14[3] * in[3] + kC14[5] * in[5] + kC14[7] * in[7];
486
0
      float odd1 = kC14[3] * in[1] + kC14[9] * in[3] - kC14[13] * in[5] -
487
0
                   kC14[7] * in[7];
488
0
      float odd2 = kC14[5] * in[1] - kC14[13] * in[3] - kC14[3] * in[5] -
489
0
                   kC14[7] * in[7];
490
0
      float odd3 =
491
0
          kC14[7] * in[1] - kC14[7] * in[3] - kC14[7] * in[5] + kC14[7] * in[7];
492
0
      float odd4 = kC14[9] * in[1] - kC14[1] * in[3] + kC14[11] * in[5] +
493
0
                   kC14[7] * in[7];
494
0
      float odd5 = kC14[11] * in[1] - kC14[5] * in[3] + kC14[1] * in[5] -
495
0
                   kC14[7] * in[7];
496
0
      float odd6 = kC14[13] * in[1] - kC14[11] * in[3] + kC14[9] * in[5] -
497
0
                   kC14[7] * in[7];
498
0
      out[0] = even0 + odd0;
499
0
      out[13] = even0 - odd0;
500
0
      out[1] = even1 + odd1;
501
0
      out[12] = even1 - odd1;
502
0
      out[2] = even2 + odd2;
503
0
      out[11] = even2 - odd2;
504
0
      out[3] = even3 + odd3;
505
0
      out[10] = even3 - odd3;
506
0
      out[4] = even4 + odd4;
507
0
      out[9] = even4 - odd4;
508
0
      out[5] = even5 + odd5;
509
0
      out[8] = even5 - odd5;
510
0
      out[6] = even6 + odd6;
511
0
      out[7] = even6 - odd6;
512
0
      break;
513
0
    }
514
0
    case 15: {
515
0
      static constexpr float kC15[15] = {
516
0
          1.414213562373, 1.406466352507, 1.383309602960, 1.344997023928,
517
0
          1.291948376043, 1.224744871392, 1.144122805635, 1.050965490998,
518
0
          0.946293578512, 0.831253875555, 0.707106781187, 0.575212476952,
519
0
          0.437016024449, 0.294031532930, 0.147825570407,
520
0
      };
521
0
      float even0 = in[0] + kC15[2] * in[2] + kC15[4] * in[4] + kC15[6] * in[6];
522
0
      float even1 =
523
0
          in[0] + kC15[6] * in[2] + kC15[12] * in[4] - kC15[12] * in[6];
524
0
      float even2 =
525
0
          in[0] + kC15[10] * in[2] - kC15[10] * in[4] - kC15[0] * in[6];
526
0
      float even3 =
527
0
          in[0] + kC15[14] * in[2] - kC15[2] * in[4] - kC15[12] * in[6];
528
0
      float even4 =
529
0
          in[0] - kC15[12] * in[2] - kC15[6] * in[4] + kC15[6] * in[6];
530
0
      float even5 =
531
0
          in[0] - kC15[8] * in[2] - kC15[14] * in[4] + kC15[6] * in[6];
532
0
      float even6 =
533
0
          in[0] - kC15[4] * in[2] + kC15[8] * in[4] - kC15[12] * in[6];
534
0
      float even7 = in[0] - kC15[0] * in[2] + kC15[0] * in[4] - kC15[0] * in[6];
535
0
      float odd0 =
536
0
          kC15[1] * in[1] + kC15[3] * in[3] + kC15[5] * in[5] + kC15[7] * in[7];
537
0
      float odd1 = kC15[3] * in[1] + kC15[9] * in[3] - kC15[9] * in[7];
538
0
      float odd2 = kC15[5] * in[1] - kC15[5] * in[5] - kC15[5] * in[7];
539
0
      float odd3 = kC15[7] * in[1] - kC15[9] * in[3] - kC15[5] * in[5] +
540
0
                   kC15[11] * in[7];
541
0
      float odd4 = kC15[9] * in[1] - kC15[3] * in[3] + kC15[3] * in[7];
542
0
      float odd5 = kC15[11] * in[1] - kC15[3] * in[3] + kC15[5] * in[5] -
543
0
                   kC15[13] * in[7];
544
0
      float odd6 = kC15[13] * in[1] - kC15[9] * in[3] + kC15[5] * in[5] -
545
0
                   kC15[1] * in[7];
546
0
      out[0] = even0 + odd0;
547
0
      out[14] = even0 - odd0;
548
0
      out[1] = even1 + odd1;
549
0
      out[13] = even1 - odd1;
550
0
      out[2] = even2 + odd2;
551
0
      out[12] = even2 - odd2;
552
0
      out[3] = even3 + odd3;
553
0
      out[11] = even3 - odd3;
554
0
      out[4] = even4 + odd4;
555
0
      out[10] = even4 - odd4;
556
0
      out[5] = even5 + odd5;
557
0
      out[9] = even5 - odd5;
558
0
      out[6] = even6 + odd6;
559
0
      out[8] = even6 - odd6;
560
0
      out[7] = even7;
561
0
      break;
562
0
    }
563
0
    case 16: {
564
0
      static constexpr float kC16[16] = {
565
0
          1.414213562373, 1.407403737526, 1.387039845322, 1.353318001174,
566
0
          1.306562964876, 1.247225012987, 1.175875602419, 1.093201867002,
567
0
          1.000000000000, 0.897167586343, 0.785694958387, 0.666655658478,
568
0
          0.541196100146, 0.410524527522, 0.275899379283, 0.138617169199,
569
0
      };
570
0
      float even0 = in[0] + kC16[2] * in[2] + kC16[4] * in[4] + kC16[6] * in[6];
571
0
      float even1 =
572
0
          in[0] + kC16[6] * in[2] + kC16[12] * in[4] - kC16[14] * in[6];
573
0
      float even2 =
574
0
          in[0] + kC16[10] * in[2] - kC16[12] * in[4] - kC16[2] * in[6];
575
0
      float even3 =
576
0
          in[0] + kC16[14] * in[2] - kC16[4] * in[4] - kC16[10] * in[6];
577
0
      float even4 =
578
0
          in[0] - kC16[14] * in[2] - kC16[4] * in[4] + kC16[10] * in[6];
579
0
      float even5 =
580
0
          in[0] - kC16[10] * in[2] - kC16[12] * in[4] + kC16[2] * in[6];
581
0
      float even6 =
582
0
          in[0] - kC16[6] * in[2] + kC16[12] * in[4] + kC16[14] * in[6];
583
0
      float even7 = in[0] - kC16[2] * in[2] + kC16[4] * in[4] - kC16[6] * in[6];
584
0
      float odd0 = (kC16[1] * in[1] + kC16[3] * in[3] + kC16[5] * in[5] +
585
0
                    kC16[7] * in[7]);
586
0
      float odd1 = (kC16[3] * in[1] + kC16[9] * in[3] + kC16[15] * in[5] -
587
0
                    kC16[11] * in[7]);
588
0
      float odd2 = (kC16[5] * in[1] + kC16[15] * in[3] - kC16[7] * in[5] -
589
0
                    kC16[3] * in[7]);
590
0
      float odd3 = (kC16[7] * in[1] - kC16[11] * in[3] - kC16[3] * in[5] +
591
0
                    kC16[15] * in[7]);
592
0
      float odd4 = (kC16[9] * in[1] - kC16[5] * in[3] - kC16[13] * in[5] +
593
0
                    kC16[1] * in[7]);
594
0
      float odd5 = (kC16[11] * in[1] - kC16[1] * in[3] + kC16[9] * in[5] +
595
0
                    kC16[13] * in[7]);
596
0
      float odd6 = (kC16[13] * in[1] - kC16[7] * in[3] + kC16[1] * in[5] -
597
0
                    kC16[5] * in[7]);
598
0
      float odd7 = (kC16[15] * in[1] - kC16[13] * in[3] + kC16[11] * in[5] -
599
0
                    kC16[9] * in[7]);
600
0
      out[0] = even0 + odd0;
601
0
      out[15] = even0 - odd0;
602
0
      out[1] = even1 + odd1;
603
0
      out[14] = even1 - odd1;
604
0
      out[2] = even2 + odd2;
605
0
      out[13] = even2 - odd2;
606
0
      out[3] = even3 + odd3;
607
0
      out[12] = even3 - odd3;
608
0
      out[4] = even4 + odd4;
609
0
      out[11] = even4 - odd4;
610
0
      out[5] = even5 + odd5;
611
0
      out[10] = even5 - odd5;
612
0
      out[6] = even6 + odd6;
613
0
      out[9] = even6 - odd6;
614
0
      out[7] = even7 + odd7;
615
0
      out[8] = even7 - odd7;
616
0
      break;
617
0
    }
618
0
    default:
619
0
      // No table exists for this N (this includes 1, 2, 4 and 8); nothing is
      // stored to out[] on this path.
      JXL_DEBUG_ABORT("Unreachable");
620
0
      break;
621
0
  }
622
0
}
Unexecuted instantiation: jpegli::N_SSE4::Compute1dIDCT(float const*, float*, unsigned long)
Unexecuted instantiation: jpegli::N_AVX2::Compute1dIDCT(float const*, float*, unsigned long)
Unexecuted instantiation: jpegli::N_SSE2::Compute1dIDCT(float const*, float*, unsigned long)
623
624
void InverseTransformBlockGeneric(const int16_t* JXL_RESTRICT qblock,
625
                                  const float* JXL_RESTRICT dequant,
626
                                  const float* JXL_RESTRICT biases,
627
                                  float* JXL_RESTRICT scratch_space,
628
                                  float* JXL_RESTRICT output,
629
0
                                  size_t output_stride, size_t dctsize) {
630
0
  float* JXL_RESTRICT block0 = scratch_space;
631
0
  float* JXL_RESTRICT block1 = scratch_space + DCTSIZE2;
632
0
  DequantBlock(qblock, dequant, biases, block0);
633
0
  if (dctsize == 1) {
634
0
    *output = *block0;
635
0
  } else if (dctsize == 2 || dctsize == 4) {
636
0
    float* JXL_RESTRICT block2 = scratch_space + 2 * DCTSIZE2;
637
0
    ComputeScaledIDCT(block0, block1, block2, 8);
638
0
    if (dctsize == 4) {
639
0
      for (size_t iy = 0; iy < 4; ++iy) {
640
0
        for (size_t ix = 0; ix < 4; ++ix) {
641
0
          float* block = &block2[16 * iy + 2 * ix];
642
0
          output[iy * output_stride + ix] =
643
0
              0.25f * (block[0] + block[1] + block[8] + block[9]);
644
0
        }
645
0
      }
646
0
    } else {
647
0
      for (size_t iy = 0; iy < 2; ++iy) {
648
0
        for (size_t ix = 0; ix < 2; ++ix) {
649
0
          float* block = &block2[32 * iy + 4 * ix];
650
0
          output[iy * output_stride + ix] =
651
0
              0.0625f *
652
0
              (block[0] + block[1] + block[2] + block[3] + block[8] + block[9] +
653
0
               block[10] + block[11] + block[16] + block[17] + block[18] +
654
0
               block[19] + block[24] + block[25] + block[26] + block[27]);
655
0
        }
656
0
      }
657
0
    }
658
0
  } else {
659
0
    float dctin[DCTSIZE];
660
0
    float dctout[DCTSIZE * 2];
661
0
    size_t insize = std::min<size_t>(dctsize, DCTSIZE);
662
0
    for (size_t ix = 0; ix < insize; ++ix) {
663
0
      for (size_t iy = 0; iy < insize; ++iy) {
664
0
        dctin[iy] = block0[iy * DCTSIZE + ix];
665
0
      }
666
0
      Compute1dIDCT(dctin, dctout, dctsize);
667
0
      for (size_t iy = 0; iy < dctsize; ++iy) {
668
0
        block1[iy * dctsize + ix] = dctout[iy];
669
0
      }
670
0
    }
671
0
    for (size_t iy = 0; iy < dctsize; ++iy) {
672
0
      Compute1dIDCT(block1 + iy * dctsize, output + iy * output_stride,
673
0
                    dctsize);
674
0
    }
675
0
  }
676
0
}
Unexecuted instantiation: jpegli::N_SSE4::InverseTransformBlockGeneric(short const*, float const*, float const*, float*, float*, unsigned long, unsigned long)
Unexecuted instantiation: jpegli::N_AVX2::InverseTransformBlockGeneric(short const*, float const*, float const*, float*, float*, unsigned long, unsigned long)
Unexecuted instantiation: jpegli::N_SSE2::InverseTransformBlockGeneric(short const*, float const*, float const*, float*, float*, unsigned long, unsigned long)
677
678
// NOLINTNEXTLINE(google-readability-namespace-comments)
679
}  // namespace HWY_NAMESPACE
680
}  // namespace jpegli
681
HWY_AFTER_NAMESPACE();
682
683
#if HWY_ONCE
684
namespace jpegli {
685
686
HWY_EXPORT(InverseTransformBlock8x8);
687
HWY_EXPORT(InverseTransformBlockGeneric);
688
689
2.68k
// Selects the inverse-transform function for every component of cinfo.
//
// For each component c, master->scaled_dct_size[c] must lie in [1, 16];
// otherwise a failure status is returned immediately. A size equal to DCTSIZE
// selects InverseTransformBlock8x8, every other size selects
// InverseTransformBlockGeneric. The chosen function is stored in
// master->inverse_transform[c]. Returns true once all components are set.
jxl::Status ChooseInverseTransform(j_decompress_ptr cinfo) {
  jpeg_decomp_master* master = cinfo->master;
  for (int comp = 0; comp < cinfo->num_components; ++comp) {
    const int dct_size = master->scaled_dct_size[comp];
    const bool in_range = dct_size >= 1 && dct_size <= 16;
    if (!in_range) {
      return JXL_FAILURE("Compute1dIDCT does not support N=%d", dct_size);
    }
    if (dct_size != DCTSIZE) {
      master->inverse_transform[comp] =
          HWY_DYNAMIC_DISPATCH(InverseTransformBlockGeneric);
    } else {
      master->inverse_transform[comp] =
          HWY_DYNAMIC_DISPATCH(InverseTransformBlock8x8);
    }
  }
  return true;
}
705
706
}  // namespace jpegli
707
#endif  // HWY_ONCE