/src/libjxl/lib/jpegli/idct.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jpegli/idct.h" |
7 | | |
8 | | #include <algorithm> |
9 | | #include <cmath> |
10 | | #include <cstddef> |
11 | | #include <cstdint> |
12 | | |
13 | | #include "lib/jpegli/common.h" |
14 | | #include "lib/jpegli/decode_internal.h" |
15 | | #include "lib/jxl/base/compiler_specific.h" |
16 | | #include "lib/jxl/base/status.h" |
17 | | |
18 | | #undef HWY_TARGET_INCLUDE |
19 | | #define HWY_TARGET_INCLUDE "lib/jpegli/idct.cc" |
20 | | #include <hwy/foreach_target.h> |
21 | | #include <hwy/highway.h> |
22 | | |
23 | | #include "lib/jpegli/transpose-inl.h" |
24 | | |
25 | | HWY_BEFORE_NAMESPACE(); |
26 | | namespace jpegli { |
27 | | namespace HWY_NAMESPACE { |
28 | | |
29 | | // These templates are not found via ADL. |
30 | | using hwy::HWY_NAMESPACE::Abs; |
31 | | using hwy::HWY_NAMESPACE::Add; |
32 | | using hwy::HWY_NAMESPACE::Gt; |
33 | | using hwy::HWY_NAMESPACE::IfThenElseZero; |
34 | | using hwy::HWY_NAMESPACE::Mul; |
35 | | using hwy::HWY_NAMESPACE::MulAdd; |
36 | | using hwy::HWY_NAMESPACE::NegMulAdd; |
37 | | using hwy::HWY_NAMESPACE::Rebind; |
38 | | using hwy::HWY_NAMESPACE::Sub; |
39 | | using hwy::HWY_NAMESPACE::Vec; |
40 | | using hwy::HWY_NAMESPACE::Xor; |
41 | | |
42 | | using D = HWY_FULL(float); |
43 | | using DI = HWY_FULL(int32_t); |
44 | | constexpr D d; |
45 | | constexpr DI di; |
46 | | |
47 | | using D8 = HWY_CAPPED(float, 8); |
48 | | constexpr D8 d8; |
49 | | |
50 | | void DequantBlock(const int16_t* JXL_RESTRICT qblock, |
51 | | const float* JXL_RESTRICT dequant, |
52 | 31.8M | const float* JXL_RESTRICT biases, float* JXL_RESTRICT block) { |
53 | 416M | for (size_t k = 0; k < 64; k += Lanes(d)) { |
54 | 384M | const auto mul = Load(d, dequant + k); |
55 | 384M | const auto bias = Load(d, biases + k); |
56 | 384M | const Rebind<int16_t, DI> di16; |
57 | 384M | const Vec<DI> quant_i = PromoteTo(di, Load(di16, qblock + k)); |
58 | 384M | const Rebind<float, DI> df; |
59 | 384M | const auto quant = ConvertTo(df, quant_i); |
60 | 384M | const auto abs_quant = Abs(quant); |
61 | 384M | const auto not_0 = Gt(abs_quant, Zero(df)); |
62 | 384M | const auto sign_quant = Xor(quant, abs_quant); |
63 | 384M | const auto biased_quant = Sub(quant, Xor(bias, sign_quant)); |
64 | 384M | const auto deq = IfThenElseZero(not_0, Mul(biased_quant, mul)); |
65 | 384M | Store(deq, d, block + k); |
66 | 384M | } |
67 | 31.8M | } jpegli::N_SSE4::DequantBlock(short const*, float const*, float const*, float*) Line | Count | Source | 52 | 7.95M | const float* JXL_RESTRICT biases, float* JXL_RESTRICT block) { | 53 | 135M | for (size_t k = 0; k < 64; k += Lanes(d)) { | 54 | 127M | const auto mul = Load(d, dequant + k); | 55 | 127M | const auto bias = Load(d, biases + k); | 56 | 127M | const Rebind<int16_t, DI> di16; | 57 | 127M | const Vec<DI> quant_i = PromoteTo(di, Load(di16, qblock + k)); | 58 | 127M | const Rebind<float, DI> df; | 59 | 127M | const auto quant = ConvertTo(df, quant_i); | 60 | 127M | const auto abs_quant = Abs(quant); | 61 | 127M | const auto not_0 = Gt(abs_quant, Zero(df)); | 62 | 127M | const auto sign_quant = Xor(quant, abs_quant); | 63 | 127M | const auto biased_quant = Sub(quant, Xor(bias, sign_quant)); | 64 | 127M | const auto deq = IfThenElseZero(not_0, Mul(biased_quant, mul)); | 65 | 127M | Store(deq, d, block + k); | 66 | 127M | } | 67 | 7.95M | } |
jpegli::N_AVX2::DequantBlock(short const*, float const*, float const*, float*) Line | Count | Source | 52 | 15.6M | const float* JXL_RESTRICT biases, float* JXL_RESTRICT block) { | 53 | 140M | for (size_t k = 0; k < 64; k += Lanes(d)) { | 54 | 125M | const auto mul = Load(d, dequant + k); | 55 | 125M | const auto bias = Load(d, biases + k); | 56 | 125M | const Rebind<int16_t, DI> di16; | 57 | 125M | const Vec<DI> quant_i = PromoteTo(di, Load(di16, qblock + k)); | 58 | 125M | const Rebind<float, DI> df; | 59 | 125M | const auto quant = ConvertTo(df, quant_i); | 60 | 125M | const auto abs_quant = Abs(quant); | 61 | 125M | const auto not_0 = Gt(abs_quant, Zero(df)); | 62 | 125M | const auto sign_quant = Xor(quant, abs_quant); | 63 | 125M | const auto biased_quant = Sub(quant, Xor(bias, sign_quant)); | 64 | 125M | const auto deq = IfThenElseZero(not_0, Mul(biased_quant, mul)); | 65 | 125M | Store(deq, d, block + k); | 66 | 125M | } | 67 | 15.6M | } |
jpegli::N_SSE2::DequantBlock(short const*, float const*, float const*, float*) Line | Count | Source | 52 | 8.25M | const float* JXL_RESTRICT biases, float* JXL_RESTRICT block) { | 53 | 140M | for (size_t k = 0; k < 64; k += Lanes(d)) { | 54 | 132M | const auto mul = Load(d, dequant + k); | 55 | 132M | const auto bias = Load(d, biases + k); | 56 | 132M | const Rebind<int16_t, DI> di16; | 57 | 132M | const Vec<DI> quant_i = PromoteTo(di, Load(di16, qblock + k)); | 58 | 132M | const Rebind<float, DI> df; | 59 | 132M | const auto quant = ConvertTo(df, quant_i); | 60 | 132M | const auto abs_quant = Abs(quant); | 61 | 132M | const auto not_0 = Gt(abs_quant, Zero(df)); | 62 | 132M | const auto sign_quant = Xor(quant, abs_quant); | 63 | 132M | const auto biased_quant = Sub(quant, Xor(bias, sign_quant)); | 64 | 132M | const auto deq = IfThenElseZero(not_0, Mul(biased_quant, mul)); | 65 | 132M | Store(deq, d, block + k); | 66 | 132M | } | 67 | 8.25M | } |
|
68 | | |
69 | | template <size_t N> |
70 | | void ForwardEvenOdd(const float* JXL_RESTRICT a_in, size_t a_in_stride, |
71 | 288M | float* JXL_RESTRICT a_out) { |
72 | 1.05G | for (size_t i = 0; i < N / 2; i++) { |
73 | 768M | auto in1 = LoadU(d8, a_in + 2 * i * a_in_stride); |
74 | 768M | Store(in1, d8, a_out + i * 8); |
75 | 768M | } |
76 | 1.05G | for (size_t i = N / 2; i < N; i++) { |
77 | 768M | auto in1 = LoadU(d8, a_in + (2 * (i - N / 2) + 1) * a_in_stride); |
78 | 768M | Store(in1, d8, a_out + i * 8); |
79 | 768M | } |
80 | 288M | } void jpegli::N_SSE4::ForwardEvenOdd<8ul>(float const*, unsigned long, float*) Line | Count | Source | 71 | 31.8M | float* JXL_RESTRICT a_out) { | 72 | 159M | for (size_t i = 0; i < N / 2; i++) { | 73 | 127M | auto in1 = LoadU(d8, a_in + 2 * i * a_in_stride); | 74 | 127M | Store(in1, d8, a_out + i * 8); | 75 | 127M | } | 76 | 159M | for (size_t i = N / 2; i < N; i++) { | 77 | 127M | auto in1 = LoadU(d8, a_in + (2 * (i - N / 2) + 1) * a_in_stride); | 78 | 127M | Store(in1, d8, a_out + i * 8); | 79 | 127M | } | 80 | 31.8M | } |
void jpegli::N_SSE4::ForwardEvenOdd<4ul>(float const*, unsigned long, float*) Line | Count | Source | 71 | 63.6M | float* JXL_RESTRICT a_out) { | 72 | 190M | for (size_t i = 0; i < N / 2; i++) { | 73 | 127M | auto in1 = LoadU(d8, a_in + 2 * i * a_in_stride); | 74 | 127M | Store(in1, d8, a_out + i * 8); | 75 | 127M | } | 76 | 190M | for (size_t i = N / 2; i < N; i++) { | 77 | 127M | auto in1 = LoadU(d8, a_in + (2 * (i - N / 2) + 1) * a_in_stride); | 78 | 127M | Store(in1, d8, a_out + i * 8); | 79 | 127M | } | 80 | 63.6M | } |
void jpegli::N_AVX2::ForwardEvenOdd<8ul>(float const*, unsigned long, float*) Line | Count | Source | 71 | 31.2M | float* JXL_RESTRICT a_out) { | 72 | 156M | for (size_t i = 0; i < N / 2; i++) { | 73 | 125M | auto in1 = LoadU(d8, a_in + 2 * i * a_in_stride); | 74 | 125M | Store(in1, d8, a_out + i * 8); | 75 | 125M | } | 76 | 156M | for (size_t i = N / 2; i < N; i++) { | 77 | 125M | auto in1 = LoadU(d8, a_in + (2 * (i - N / 2) + 1) * a_in_stride); | 78 | 125M | Store(in1, d8, a_out + i * 8); | 79 | 125M | } | 80 | 31.2M | } |
void jpegli::N_AVX2::ForwardEvenOdd<4ul>(float const*, unsigned long, float*) Line | Count | Source | 71 | 62.5M | float* JXL_RESTRICT a_out) { | 72 | 187M | for (size_t i = 0; i < N / 2; i++) { | 73 | 125M | auto in1 = LoadU(d8, a_in + 2 * i * a_in_stride); | 74 | 125M | Store(in1, d8, a_out + i * 8); | 75 | 125M | } | 76 | 187M | for (size_t i = N / 2; i < N; i++) { | 77 | 125M | auto in1 = LoadU(d8, a_in + (2 * (i - N / 2) + 1) * a_in_stride); | 78 | 125M | Store(in1, d8, a_out + i * 8); | 79 | 125M | } | 80 | 62.5M | } |
void jpegli::N_SSE2::ForwardEvenOdd<8ul>(float const*, unsigned long, float*) Line | Count | Source | 71 | 33.0M | float* JXL_RESTRICT a_out) { | 72 | 165M | for (size_t i = 0; i < N / 2; i++) { | 73 | 132M | auto in1 = LoadU(d8, a_in + 2 * i * a_in_stride); | 74 | 132M | Store(in1, d8, a_out + i * 8); | 75 | 132M | } | 76 | 165M | for (size_t i = N / 2; i < N; i++) { | 77 | 132M | auto in1 = LoadU(d8, a_in + (2 * (i - N / 2) + 1) * a_in_stride); | 78 | 132M | Store(in1, d8, a_out + i * 8); | 79 | 132M | } | 80 | 33.0M | } |
void jpegli::N_SSE2::ForwardEvenOdd<4ul>(float const*, unsigned long, float*) Line | Count | Source | 71 | 66.0M | float* JXL_RESTRICT a_out) { | 72 | 198M | for (size_t i = 0; i < N / 2; i++) { | 73 | 132M | auto in1 = LoadU(d8, a_in + 2 * i * a_in_stride); | 74 | 132M | Store(in1, d8, a_out + i * 8); | 75 | 132M | } | 76 | 198M | for (size_t i = N / 2; i < N; i++) { | 77 | 132M | auto in1 = LoadU(d8, a_in + (2 * (i - N / 2) + 1) * a_in_stride); | 78 | 132M | Store(in1, d8, a_out + i * 8); | 79 | 132M | } | 80 | 66.0M | } |
|
81 | | |
82 | | template <size_t N> |
83 | 288M | void BTranspose(float* JXL_RESTRICT coeff) { |
84 | 768M | for (size_t i = N - 1; i > 0; i--) { |
85 | 480M | auto in1 = Load(d8, coeff + i * 8); |
86 | 480M | auto in2 = Load(d8, coeff + (i - 1) * 8); |
87 | 480M | Store(Add(in1, in2), d8, coeff + i * 8); |
88 | 480M | } |
89 | 288M | constexpr float kSqrt2 = 1.41421356237f; |
90 | 288M | auto sqrt2 = Set(d8, kSqrt2); |
91 | 288M | auto in1 = Load(d8, coeff); |
92 | 288M | Store(Mul(in1, sqrt2), d8, coeff); |
93 | 288M | } void jpegli::N_SSE4::BTranspose<2ul>(float*) Line | Count | Source | 83 | 63.6M | void BTranspose(float* JXL_RESTRICT coeff) { | 84 | 127M | for (size_t i = N - 1; i > 0; i--) { | 85 | 63.6M | auto in1 = Load(d8, coeff + i * 8); | 86 | 63.6M | auto in2 = Load(d8, coeff + (i - 1) * 8); | 87 | 63.6M | Store(Add(in1, in2), d8, coeff + i * 8); | 88 | 63.6M | } | 89 | 63.6M | constexpr float kSqrt2 = 1.41421356237f; | 90 | 63.6M | auto sqrt2 = Set(d8, kSqrt2); | 91 | 63.6M | auto in1 = Load(d8, coeff); | 92 | 63.6M | Store(Mul(in1, sqrt2), d8, coeff); | 93 | 63.6M | } |
void jpegli::N_SSE4::BTranspose<4ul>(float*) Line | Count | Source | 83 | 31.8M | void BTranspose(float* JXL_RESTRICT coeff) { | 84 | 127M | for (size_t i = N - 1; i > 0; i--) { | 85 | 95.4M | auto in1 = Load(d8, coeff + i * 8); | 86 | 95.4M | auto in2 = Load(d8, coeff + (i - 1) * 8); | 87 | 95.4M | Store(Add(in1, in2), d8, coeff + i * 8); | 88 | 95.4M | } | 89 | 31.8M | constexpr float kSqrt2 = 1.41421356237f; | 90 | 31.8M | auto sqrt2 = Set(d8, kSqrt2); | 91 | 31.8M | auto in1 = Load(d8, coeff); | 92 | 31.8M | Store(Mul(in1, sqrt2), d8, coeff); | 93 | 31.8M | } |
void jpegli::N_AVX2::BTranspose<2ul>(float*) Line | Count | Source | 83 | 62.5M | void BTranspose(float* JXL_RESTRICT coeff) { | 84 | 125M | for (size_t i = N - 1; i > 0; i--) { | 85 | 62.5M | auto in1 = Load(d8, coeff + i * 8); | 86 | 62.5M | auto in2 = Load(d8, coeff + (i - 1) * 8); | 87 | 62.5M | Store(Add(in1, in2), d8, coeff + i * 8); | 88 | 62.5M | } | 89 | 62.5M | constexpr float kSqrt2 = 1.41421356237f; | 90 | 62.5M | auto sqrt2 = Set(d8, kSqrt2); | 91 | 62.5M | auto in1 = Load(d8, coeff); | 92 | 62.5M | Store(Mul(in1, sqrt2), d8, coeff); | 93 | 62.5M | } |
void jpegli::N_AVX2::BTranspose<4ul>(float*) Line | Count | Source | 83 | 31.2M | void BTranspose(float* JXL_RESTRICT coeff) { | 84 | 125M | for (size_t i = N - 1; i > 0; i--) { | 85 | 93.7M | auto in1 = Load(d8, coeff + i * 8); | 86 | 93.7M | auto in2 = Load(d8, coeff + (i - 1) * 8); | 87 | 93.7M | Store(Add(in1, in2), d8, coeff + i * 8); | 88 | 93.7M | } | 89 | 31.2M | constexpr float kSqrt2 = 1.41421356237f; | 90 | 31.2M | auto sqrt2 = Set(d8, kSqrt2); | 91 | 31.2M | auto in1 = Load(d8, coeff); | 92 | 31.2M | Store(Mul(in1, sqrt2), d8, coeff); | 93 | 31.2M | } |
void jpegli::N_SSE2::BTranspose<2ul>(float*) Line | Count | Source | 83 | 66.0M | void BTranspose(float* JXL_RESTRICT coeff) { | 84 | 132M | for (size_t i = N - 1; i > 0; i--) { | 85 | 66.0M | auto in1 = Load(d8, coeff + i * 8); | 86 | 66.0M | auto in2 = Load(d8, coeff + (i - 1) * 8); | 87 | 66.0M | Store(Add(in1, in2), d8, coeff + i * 8); | 88 | 66.0M | } | 89 | 66.0M | constexpr float kSqrt2 = 1.41421356237f; | 90 | 66.0M | auto sqrt2 = Set(d8, kSqrt2); | 91 | 66.0M | auto in1 = Load(d8, coeff); | 92 | 66.0M | Store(Mul(in1, sqrt2), d8, coeff); | 93 | 66.0M | } |
void jpegli::N_SSE2::BTranspose<4ul>(float*) Line | Count | Source | 83 | 33.0M | void BTranspose(float* JXL_RESTRICT coeff) { | 84 | 132M | for (size_t i = N - 1; i > 0; i--) { | 85 | 99.1M | auto in1 = Load(d8, coeff + i * 8); | 86 | 99.1M | auto in2 = Load(d8, coeff + (i - 1) * 8); | 87 | 99.1M | Store(Add(in1, in2), d8, coeff + i * 8); | 88 | 99.1M | } | 89 | 33.0M | constexpr float kSqrt2 = 1.41421356237f; | 90 | 33.0M | auto sqrt2 = Set(d8, kSqrt2); | 91 | 33.0M | auto in1 = Load(d8, coeff); | 92 | 33.0M | Store(Mul(in1, sqrt2), d8, coeff); | 93 | 33.0M | } |
|
94 | | |
95 | | // Constants for DCT implementation. Generated by the following snippet: |
96 | | // for i in range(N // 2): |
97 | | // print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ") |
98 | | template <size_t N> |
99 | | struct WcMultipliers; |
100 | | |
101 | | template <> |
102 | | struct WcMultipliers<4> { |
103 | | static constexpr float kMultipliers[] = { |
104 | | 0.541196100146197, |
105 | | 1.3065629648763764, |
106 | | }; |
107 | | }; |
108 | | |
109 | | template <> |
110 | | struct WcMultipliers<8> { |
111 | | static constexpr float kMultipliers[] = { |
112 | | 0.5097955791041592, |
113 | | 0.6013448869350453, |
114 | | 0.8999762231364156, |
115 | | 2.5629154477415055, |
116 | | }; |
117 | | }; |
118 | | |
119 | | #if JXL_CXX_LANG < JXL_CXX_17 |
120 | | constexpr float WcMultipliers<4>::kMultipliers[]; |
121 | | constexpr float WcMultipliers<8>::kMultipliers[]; |
122 | | #endif |
123 | | |
124 | | template <size_t N> |
125 | | void MultiplyAndAdd(const float* JXL_RESTRICT coeff, float* JXL_RESTRICT out, |
126 | 288M | size_t out_stride) { |
127 | 1.05G | for (size_t i = 0; i < N / 2; i++) { |
128 | 768M | auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]); |
129 | 768M | auto in1 = Load(d8, coeff + i * 8); |
130 | 768M | auto in2 = Load(d8, coeff + (N / 2 + i) * 8); |
131 | 768M | auto out1 = MulAdd(mul, in2, in1); |
132 | 768M | auto out2 = NegMulAdd(mul, in2, in1); |
133 | 768M | StoreU(out1, d8, out + i * out_stride); |
134 | 768M | StoreU(out2, d8, out + (N - i - 1) * out_stride); |
135 | 768M | } |
136 | 288M | } void jpegli::N_SSE4::MultiplyAndAdd<4ul>(float const*, float*, unsigned long) Line | Count | Source | 126 | 63.6M | size_t out_stride) { | 127 | 190M | for (size_t i = 0; i < N / 2; i++) { | 128 | 127M | auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]); | 129 | 127M | auto in1 = Load(d8, coeff + i * 8); | 130 | 127M | auto in2 = Load(d8, coeff + (N / 2 + i) * 8); | 131 | 127M | auto out1 = MulAdd(mul, in2, in1); | 132 | 127M | auto out2 = NegMulAdd(mul, in2, in1); | 133 | 127M | StoreU(out1, d8, out + i * out_stride); | 134 | 127M | StoreU(out2, d8, out + (N - i - 1) * out_stride); | 135 | 127M | } | 136 | 63.6M | } |
void jpegli::N_SSE4::MultiplyAndAdd<8ul>(float const*, float*, unsigned long) Line | Count | Source | 126 | 31.8M | size_t out_stride) { | 127 | 159M | for (size_t i = 0; i < N / 2; i++) { | 128 | 127M | auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]); | 129 | 127M | auto in1 = Load(d8, coeff + i * 8); | 130 | 127M | auto in2 = Load(d8, coeff + (N / 2 + i) * 8); | 131 | 127M | auto out1 = MulAdd(mul, in2, in1); | 132 | 127M | auto out2 = NegMulAdd(mul, in2, in1); | 133 | 127M | StoreU(out1, d8, out + i * out_stride); | 134 | 127M | StoreU(out2, d8, out + (N - i - 1) * out_stride); | 135 | 127M | } | 136 | 31.8M | } |
void jpegli::N_AVX2::MultiplyAndAdd<4ul>(float const*, float*, unsigned long) Line | Count | Source | 126 | 62.5M | size_t out_stride) { | 127 | 187M | for (size_t i = 0; i < N / 2; i++) { | 128 | 125M | auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]); | 129 | 125M | auto in1 = Load(d8, coeff + i * 8); | 130 | 125M | auto in2 = Load(d8, coeff + (N / 2 + i) * 8); | 131 | 125M | auto out1 = MulAdd(mul, in2, in1); | 132 | 125M | auto out2 = NegMulAdd(mul, in2, in1); | 133 | 125M | StoreU(out1, d8, out + i * out_stride); | 134 | 125M | StoreU(out2, d8, out + (N - i - 1) * out_stride); | 135 | 125M | } | 136 | 62.5M | } |
void jpegli::N_AVX2::MultiplyAndAdd<8ul>(float const*, float*, unsigned long) Line | Count | Source | 126 | 31.2M | size_t out_stride) { | 127 | 156M | for (size_t i = 0; i < N / 2; i++) { | 128 | 125M | auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]); | 129 | 125M | auto in1 = Load(d8, coeff + i * 8); | 130 | 125M | auto in2 = Load(d8, coeff + (N / 2 + i) * 8); | 131 | 125M | auto out1 = MulAdd(mul, in2, in1); | 132 | 125M | auto out2 = NegMulAdd(mul, in2, in1); | 133 | 125M | StoreU(out1, d8, out + i * out_stride); | 134 | 125M | StoreU(out2, d8, out + (N - i - 1) * out_stride); | 135 | 125M | } | 136 | 31.2M | } |
void jpegli::N_SSE2::MultiplyAndAdd<4ul>(float const*, float*, unsigned long) Line | Count | Source | 126 | 66.0M | size_t out_stride) { | 127 | 198M | for (size_t i = 0; i < N / 2; i++) { | 128 | 132M | auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]); | 129 | 132M | auto in1 = Load(d8, coeff + i * 8); | 130 | 132M | auto in2 = Load(d8, coeff + (N / 2 + i) * 8); | 131 | 132M | auto out1 = MulAdd(mul, in2, in1); | 132 | 132M | auto out2 = NegMulAdd(mul, in2, in1); | 133 | 132M | StoreU(out1, d8, out + i * out_stride); | 134 | 132M | StoreU(out2, d8, out + (N - i - 1) * out_stride); | 135 | 132M | } | 136 | 66.0M | } |
void jpegli::N_SSE2::MultiplyAndAdd<8ul>(float const*, float*, unsigned long) Line | Count | Source | 126 | 33.0M | size_t out_stride) { | 127 | 165M | for (size_t i = 0; i < N / 2; i++) { | 128 | 132M | auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]); | 129 | 132M | auto in1 = Load(d8, coeff + i * 8); | 130 | 132M | auto in2 = Load(d8, coeff + (N / 2 + i) * 8); | 131 | 132M | auto out1 = MulAdd(mul, in2, in1); | 132 | 132M | auto out2 = NegMulAdd(mul, in2, in1); | 133 | 132M | StoreU(out1, d8, out + i * out_stride); | 134 | 132M | StoreU(out2, d8, out + (N - i - 1) * out_stride); | 135 | 132M | } | 136 | 33.0M | } |
|
137 | | |
138 | | template <size_t N> |
139 | | struct IDCT1DImpl; |
140 | | |
141 | | template <> |
142 | | struct IDCT1DImpl<1> { |
143 | | JXL_INLINE void operator()(const float* from, size_t from_stride, float* to, |
144 | 0 | size_t to_stride) { |
145 | 0 | StoreU(LoadU(d8, from), d8, to); |
146 | 0 | } Unexecuted instantiation: jpegli::N_SSE4::IDCT1DImpl<1ul>::operator()(float const*, unsigned long, float*, unsigned long) Unexecuted instantiation: jpegli::N_AVX2::IDCT1DImpl<1ul>::operator()(float const*, unsigned long, float*, unsigned long) Unexecuted instantiation: jpegli::N_SSE2::IDCT1DImpl<1ul>::operator()(float const*, unsigned long, float*, unsigned long) |
147 | | }; |
148 | | |
149 | | template <> |
150 | | struct IDCT1DImpl<2> { |
151 | | JXL_INLINE void operator()(const float* from, size_t from_stride, float* to, |
152 | 384M | size_t to_stride) { |
153 | 384M | JXL_DASSERT(from_stride >= 8); |
154 | 384M | JXL_DASSERT(to_stride >= 8); |
155 | 384M | auto in1 = LoadU(d8, from); |
156 | 384M | auto in2 = LoadU(d8, from + from_stride); |
157 | 384M | StoreU(Add(in1, in2), d8, to); |
158 | 384M | StoreU(Sub(in1, in2), d8, to + to_stride); |
159 | 384M | } jpegli::N_SSE4::IDCT1DImpl<2ul>::operator()(float const*, unsigned long, float*, unsigned long) Line | Count | Source | 152 | 127M | size_t to_stride) { | 153 | 127M | JXL_DASSERT(from_stride >= 8); | 154 | 127M | JXL_DASSERT(to_stride >= 8); | 155 | 127M | auto in1 = LoadU(d8, from); | 156 | 127M | auto in2 = LoadU(d8, from + from_stride); | 157 | 127M | StoreU(Add(in1, in2), d8, to); | 158 | 127M | StoreU(Sub(in1, in2), d8, to + to_stride); | 159 | 127M | } |
jpegli::N_AVX2::IDCT1DImpl<2ul>::operator()(float const*, unsigned long, float*, unsigned long) Line | Count | Source | 152 | 125M | size_t to_stride) { | 153 | 125M | JXL_DASSERT(from_stride >= 8); | 154 | 125M | JXL_DASSERT(to_stride >= 8); | 155 | 125M | auto in1 = LoadU(d8, from); | 156 | 125M | auto in2 = LoadU(d8, from + from_stride); | 157 | 125M | StoreU(Add(in1, in2), d8, to); | 158 | 125M | StoreU(Sub(in1, in2), d8, to + to_stride); | 159 | 125M | } |
jpegli::N_SSE2::IDCT1DImpl<2ul>::operator()(float const*, unsigned long, float*, unsigned long) Line | Count | Source | 152 | 132M | size_t to_stride) { | 153 | 132M | JXL_DASSERT(from_stride >= 8); | 154 | 132M | JXL_DASSERT(to_stride >= 8); | 155 | 132M | auto in1 = LoadU(d8, from); | 156 | 132M | auto in2 = LoadU(d8, from + from_stride); | 157 | 132M | StoreU(Add(in1, in2), d8, to); | 158 | 132M | StoreU(Sub(in1, in2), d8, to + to_stride); | 159 | 132M | } |
|
160 | | }; |
161 | | |
162 | | template <size_t N> |
163 | | struct IDCT1DImpl { |
164 | | void operator()(const float* from, size_t from_stride, float* to, |
165 | 288M | size_t to_stride) { |
166 | 288M | JXL_DASSERT(from_stride >= 8); |
167 | 288M | JXL_DASSERT(to_stride >= 8); |
168 | 288M | HWY_ALIGN float tmp[64]; |
169 | 288M | ForwardEvenOdd<N>(from, from_stride, tmp); |
170 | 288M | IDCT1DImpl<N / 2>()(tmp, 8, tmp, 8); |
171 | 288M | BTranspose<N / 2>(tmp + N * 4); |
172 | 288M | IDCT1DImpl<N / 2>()(tmp + N * 4, 8, tmp + N * 4, 8); |
173 | 288M | MultiplyAndAdd<N>(tmp, to, to_stride); |
174 | 288M | } jpegli::N_SSE4::IDCT1DImpl<8ul>::operator()(float const*, unsigned long, float*, unsigned long) Line | Count | Source | 165 | 31.8M | size_t to_stride) { | 166 | 31.8M | JXL_DASSERT(from_stride >= 8); | 167 | 31.8M | JXL_DASSERT(to_stride >= 8); | 168 | 31.8M | HWY_ALIGN float tmp[64]; | 169 | 31.8M | ForwardEvenOdd<N>(from, from_stride, tmp); | 170 | 31.8M | IDCT1DImpl<N / 2>()(tmp, 8, tmp, 8); | 171 | 31.8M | BTranspose<N / 2>(tmp + N * 4); | 172 | 31.8M | IDCT1DImpl<N / 2>()(tmp + N * 4, 8, tmp + N * 4, 8); | 173 | 31.8M | MultiplyAndAdd<N>(tmp, to, to_stride); | 174 | 31.8M | } |
jpegli::N_SSE4::IDCT1DImpl<4ul>::operator()(float const*, unsigned long, float*, unsigned long) Line | Count | Source | 165 | 63.6M | size_t to_stride) { | 166 | 63.6M | JXL_DASSERT(from_stride >= 8); | 167 | 63.6M | JXL_DASSERT(to_stride >= 8); | 168 | 63.6M | HWY_ALIGN float tmp[64]; | 169 | 63.6M | ForwardEvenOdd<N>(from, from_stride, tmp); | 170 | 63.6M | IDCT1DImpl<N / 2>()(tmp, 8, tmp, 8); | 171 | 63.6M | BTranspose<N / 2>(tmp + N * 4); | 172 | 63.6M | IDCT1DImpl<N / 2>()(tmp + N * 4, 8, tmp + N * 4, 8); | 173 | 63.6M | MultiplyAndAdd<N>(tmp, to, to_stride); | 174 | 63.6M | } |
jpegli::N_AVX2::IDCT1DImpl<8ul>::operator()(float const*, unsigned long, float*, unsigned long) Line | Count | Source | 165 | 31.2M | size_t to_stride) { | 166 | 31.2M | JXL_DASSERT(from_stride >= 8); | 167 | 31.2M | JXL_DASSERT(to_stride >= 8); | 168 | 31.2M | HWY_ALIGN float tmp[64]; | 169 | 31.2M | ForwardEvenOdd<N>(from, from_stride, tmp); | 170 | 31.2M | IDCT1DImpl<N / 2>()(tmp, 8, tmp, 8); | 171 | 31.2M | BTranspose<N / 2>(tmp + N * 4); | 172 | 31.2M | IDCT1DImpl<N / 2>()(tmp + N * 4, 8, tmp + N * 4, 8); | 173 | 31.2M | MultiplyAndAdd<N>(tmp, to, to_stride); | 174 | 31.2M | } |
jpegli::N_AVX2::IDCT1DImpl<4ul>::operator()(float const*, unsigned long, float*, unsigned long) Line | Count | Source | 165 | 62.5M | size_t to_stride) { | 166 | 62.5M | JXL_DASSERT(from_stride >= 8); | 167 | 62.5M | JXL_DASSERT(to_stride >= 8); | 168 | 62.5M | HWY_ALIGN float tmp[64]; | 169 | 62.5M | ForwardEvenOdd<N>(from, from_stride, tmp); | 170 | 62.5M | IDCT1DImpl<N / 2>()(tmp, 8, tmp, 8); | 171 | 62.5M | BTranspose<N / 2>(tmp + N * 4); | 172 | 62.5M | IDCT1DImpl<N / 2>()(tmp + N * 4, 8, tmp + N * 4, 8); | 173 | 62.5M | MultiplyAndAdd<N>(tmp, to, to_stride); | 174 | 62.5M | } |
jpegli::N_SSE2::IDCT1DImpl<8ul>::operator()(float const*, unsigned long, float*, unsigned long) Line | Count | Source | 165 | 33.0M | size_t to_stride) { | 166 | 33.0M | JXL_DASSERT(from_stride >= 8); | 167 | 33.0M | JXL_DASSERT(to_stride >= 8); | 168 | 33.0M | HWY_ALIGN float tmp[64]; | 169 | 33.0M | ForwardEvenOdd<N>(from, from_stride, tmp); | 170 | 33.0M | IDCT1DImpl<N / 2>()(tmp, 8, tmp, 8); | 171 | 33.0M | BTranspose<N / 2>(tmp + N * 4); | 172 | 33.0M | IDCT1DImpl<N / 2>()(tmp + N * 4, 8, tmp + N * 4, 8); | 173 | 33.0M | MultiplyAndAdd<N>(tmp, to, to_stride); | 174 | 33.0M | } |
jpegli::N_SSE2::IDCT1DImpl<4ul>::operator()(float const*, unsigned long, float*, unsigned long) Line | Count | Source | 165 | 66.0M | size_t to_stride) { | 166 | 66.0M | JXL_DASSERT(from_stride >= 8); | 167 | 66.0M | JXL_DASSERT(to_stride >= 8); | 168 | 66.0M | HWY_ALIGN float tmp[64]; | 169 | 66.0M | ForwardEvenOdd<N>(from, from_stride, tmp); | 170 | 66.0M | IDCT1DImpl<N / 2>()(tmp, 8, tmp, 8); | 171 | 66.0M | BTranspose<N / 2>(tmp + N * 4); | 172 | 66.0M | IDCT1DImpl<N / 2>()(tmp + N * 4, 8, tmp + N * 4, 8); | 173 | 66.0M | MultiplyAndAdd<N>(tmp, to, to_stride); | 174 | 66.0M | } |
|
175 | | }; |
176 | | |
177 | | template <size_t N> |
178 | | void IDCT1D(float* JXL_RESTRICT from, float* JXL_RESTRICT output, |
179 | 63.6M | size_t output_stride) { |
180 | 159M | for (size_t i = 0; i < 8; i += Lanes(d8)) { |
181 | 96.1M | IDCT1DImpl<N>()(from + i, 8, output + i, output_stride); |
182 | 96.1M | } |
183 | 63.6M | } void jpegli::N_SSE4::IDCT1D<8ul>(float*, float*, unsigned long) Line | Count | Source | 179 | 15.9M | size_t output_stride) { | 180 | 47.7M | for (size_t i = 0; i < 8; i += Lanes(d8)) { | 181 | 31.8M | IDCT1DImpl<N>()(from + i, 8, output + i, output_stride); | 182 | 31.8M | } | 183 | 15.9M | } |
void jpegli::N_AVX2::IDCT1D<8ul>(float*, float*, unsigned long) Line | Count | Source | 179 | 31.2M | size_t output_stride) { | 180 | 62.5M | for (size_t i = 0; i < 8; i += Lanes(d8)) { | 181 | 31.2M | IDCT1DImpl<N>()(from + i, 8, output + i, output_stride); | 182 | 31.2M | } | 183 | 31.2M | } |
void jpegli::N_SSE2::IDCT1D<8ul>(float*, float*, unsigned long) Line | Count | Source | 179 | 16.5M | size_t output_stride) { | 180 | 49.5M | for (size_t i = 0; i < 8; i += Lanes(d8)) { | 181 | 33.0M | IDCT1DImpl<N>()(from + i, 8, output + i, output_stride); | 182 | 33.0M | } | 183 | 16.5M | } |
|
184 | | |
// 8x8 inverse DCT built from two IDCT1D<8> passes, each preceded by a call to
// Transpose8x8Block(block0, block1).
//   block0: input coefficients; overwritten with the result of the first pass.
//   block1: scratch block; receives both Transpose8x8Block results.
//   output: destination of the second pass, written with output_stride.
// NOTE(review): Transpose8x8Block comes from transpose-inl.h; presumably it
// writes the transpose of its first argument into its second.
185 | | void ComputeScaledIDCT(float* JXL_RESTRICT block0, float* JXL_RESTRICT block1, |
186 | 31.8M | float* JXL_RESTRICT output, size_t output_stride) { |
187 | 31.8M | Transpose8x8Block(block0, block1); |
188 | 31.8M | IDCT1D<8>(block1, block0, 8); |
189 | 31.8M | Transpose8x8Block(block0, block1); |
190 | 31.8M | IDCT1D<8>(block1, output, output_stride); |
191 | 31.8M | } jpegli::N_SSE4::ComputeScaledIDCT(float*, float*, float*, unsigned long) Line | Count | Source | 186 | 7.95M | float* JXL_RESTRICT output, size_t output_stride) { | 187 | 7.95M | Transpose8x8Block(block0, block1); | 188 | 7.95M | IDCT1D<8>(block1, block0, 8); | 189 | 7.95M | Transpose8x8Block(block0, block1); | 190 | 7.95M | IDCT1D<8>(block1, output, output_stride); | 191 | 7.95M | } |
jpegli::N_AVX2::ComputeScaledIDCT(float*, float*, float*, unsigned long) Line | Count | Source | 186 | 15.6M | float* JXL_RESTRICT output, size_t output_stride) { | 187 | 15.6M | Transpose8x8Block(block0, block1); | 188 | 15.6M | IDCT1D<8>(block1, block0, 8); | 189 | 15.6M | Transpose8x8Block(block0, block1); | 190 | 15.6M | IDCT1D<8>(block1, output, output_stride); | 191 | 15.6M | } |
jpegli::N_SSE2::ComputeScaledIDCT(float*, float*, float*, unsigned long) Line | Count | Source | 186 | 8.25M | float* JXL_RESTRICT output, size_t output_stride) { | 187 | 8.25M | Transpose8x8Block(block0, block1); | 188 | 8.25M | IDCT1D<8>(block1, block0, 8); | 189 | 8.25M | Transpose8x8Block(block0, block1); | 190 | 8.25M | IDCT1D<8>(block1, output, output_stride); | 191 | 8.25M | } |
|
192 | | |
// Inverse transform used when a component's scaled DCT size equals DCTSIZE:
// dequantizes qblock with DequantBlock and runs ComputeScaledIDCT into output.
//   qblock, dequant, biases: forwarded to DequantBlock.
//   scratch_space: used as two consecutive DCTSIZE2-float blocks (block0 and
//                  block1).
//   output, output_stride: forwarded to ComputeScaledIDCT.
//   dctsize: not read here; presumably kept so the signature matches
//            InverseTransformBlockGeneric, since both functions are stored in
//            the same m->inverse_transform[c] slot.
193 | | void InverseTransformBlock8x8(const int16_t* JXL_RESTRICT qblock, |
194 | | const float* JXL_RESTRICT dequant, |
195 | | const float* JXL_RESTRICT biases, |
196 | | float* JXL_RESTRICT scratch_space, |
197 | | float* JXL_RESTRICT output, size_t output_stride, |
198 | 31.8M | size_t dctsize) { |
199 | 31.8M | float* JXL_RESTRICT block0 = scratch_space; |
200 | 31.8M | float* JXL_RESTRICT block1 = scratch_space + DCTSIZE2; |
201 | 31.8M | DequantBlock(qblock, dequant, biases, block0); |
202 | 31.8M | ComputeScaledIDCT(block0, block1, output, output_stride); |
203 | 31.8M | } jpegli::N_SSE4::InverseTransformBlock8x8(short const*, float const*, float const*, float*, float*, unsigned long, unsigned long) Line | Count | Source | 198 | 7.95M | size_t dctsize) { | 199 | 7.95M | float* JXL_RESTRICT block0 = scratch_space; | 200 | 7.95M | float* JXL_RESTRICT block1 = scratch_space + DCTSIZE2; | 201 | 7.95M | DequantBlock(qblock, dequant, biases, block0); | 202 | 7.95M | ComputeScaledIDCT(block0, block1, output, output_stride); | 203 | 7.95M | } |
jpegli::N_AVX2::InverseTransformBlock8x8(short const*, float const*, float const*, float*, float*, unsigned long, unsigned long) Line | Count | Source | 198 | 15.6M | size_t dctsize) { | 199 | 15.6M | float* JXL_RESTRICT block0 = scratch_space; | 200 | 15.6M | float* JXL_RESTRICT block1 = scratch_space + DCTSIZE2; | 201 | 15.6M | DequantBlock(qblock, dequant, biases, block0); | 202 | 15.6M | ComputeScaledIDCT(block0, block1, output, output_stride); | 203 | 15.6M | } |
jpegli::N_SSE2::InverseTransformBlock8x8(short const*, float const*, float const*, float*, float*, unsigned long, unsigned long) Line | Count | Source | 198 | 8.25M | size_t dctsize) { | 199 | 8.25M | float* JXL_RESTRICT block0 = scratch_space; | 200 | 8.25M | float* JXL_RESTRICT block1 = scratch_space + DCTSIZE2; | 201 | 8.25M | DequantBlock(qblock, dequant, biases, block0); | 202 | 8.25M | ComputeScaledIDCT(block0, block1, output, output_stride); | 203 | 8.25M | } |
|
204 | | |
205 | | // Computes the N-point IDCT of in[], and stores the result in out[]. The in[] |
206 | | // array is at most 8 values long, values in[8:N-1] are assumed to be 0. |
// Supported sizes are N = 3, 5, 6, 7 and 9..16. Any other N reaches the
// default branch, which only invokes JXL_DEBUG_ABORT and writes nothing to
// out[].
//
// Each kC<N>[j] table matches sqrt(2) * cos(j * pi / (2 * N)) to the printed
// precision. For every N the even-indexed inputs (in[0], in[2], ...) are
// combined into even<i> and the odd-indexed inputs into odd<i>; the outputs
// are then formed as mirrored pairs
//   out[i]         = even<i> + odd<i>
//   out[N - 1 - i] = even<i> - odd<i>
// and for odd N the middle output consists of the even part only.
// No input beyond in[7] is read (in[N - 1] when N < 8) and exactly N outputs
// are written. All even/odd terms are computed before the first store to
// out[].
207 | 0 | void Compute1dIDCT(const float* in, float* out, size_t N) { |
208 | 0 | switch (N) { |
209 | 0 | case 3: { |
210 | 0 | static constexpr float kC3[3] = { |
211 | 0 | 1.414213562373, |
212 | 0 | 1.224744871392, |
213 | 0 | 0.707106781187, |
214 | 0 | }; |
215 | 0 | float even0 = in[0] + kC3[2] * in[2]; |
216 | 0 | float even1 = in[0] - kC3[0] * in[2]; |
217 | 0 | float odd0 = kC3[1] * in[1]; |
218 | 0 | out[0] = even0 + odd0; |
219 | 0 | out[2] = even0 - odd0; |
220 | 0 | out[1] = even1; |
221 | 0 | break; |
222 | 0 | } |
223 | 0 | case 5: { |
224 | 0 | static constexpr float kC5[5] = { |
225 | 0 | 1.414213562373, 1.344997023928, 1.144122805635, |
226 | 0 | 0.831253875555, 0.437016024449, |
227 | 0 | }; |
228 | 0 | float even0 = in[0] + kC5[2] * in[2] + kC5[4] * in[4]; |
229 | 0 | float even1 = in[0] - kC5[4] * in[2] - kC5[2] * in[4]; |
230 | 0 | float even2 = in[0] - kC5[0] * in[2] + kC5[0] * in[4]; |
231 | 0 | float odd0 = kC5[1] * in[1] + kC5[3] * in[3]; |
232 | 0 | float odd1 = kC5[3] * in[1] - kC5[1] * in[3]; |
233 | 0 | out[0] = even0 + odd0; |
234 | 0 | out[4] = even0 - odd0; |
235 | 0 | out[1] = even1 + odd1; |
236 | 0 | out[3] = even1 - odd1; |
237 | 0 | out[2] = even2; |
238 | 0 | break; |
239 | 0 | } |
240 | 0 | case 6: { |
241 | 0 | static constexpr float kC6[6] = { |
242 | 0 | 1.414213562373, 1.366025403784, 1.224744871392, |
243 | 0 | 1.000000000000, 0.707106781187, 0.366025403784, |
244 | 0 | }; |
245 | 0 | float even0 = in[0] + kC6[2] * in[2] + kC6[4] * in[4]; |
246 | 0 | float even1 = in[0] - kC6[0] * in[4]; |
247 | 0 | float even2 = in[0] - kC6[2] * in[2] + kC6[4] * in[4]; |
248 | 0 | float odd0 = kC6[1] * in[1] + kC6[3] * in[3] + kC6[5] * in[5]; |
249 | 0 | float odd1 = kC6[3] * in[1] - kC6[3] * in[3] - kC6[3] * in[5]; |
250 | 0 | float odd2 = kC6[5] * in[1] - kC6[3] * in[3] + kC6[1] * in[5]; |
251 | 0 | out[0] = even0 + odd0; |
252 | 0 | out[5] = even0 - odd0; |
253 | 0 | out[1] = even1 + odd1; |
254 | 0 | out[4] = even1 - odd1; |
255 | 0 | out[2] = even2 + odd2; |
256 | 0 | out[3] = even2 - odd2; |
257 | 0 | break; |
258 | 0 | } |
259 | 0 | case 7: { |
260 | 0 | static constexpr float kC7[7] = { |
261 | 0 | 1.414213562373, 1.378756275744, 1.274162392264, 1.105676685997, |
262 | 0 | 0.881747733790, 0.613604268353, 0.314692122713, |
263 | 0 | }; |
264 | 0 | float even0 = in[0] + kC7[2] * in[2] + kC7[4] * in[4] + kC7[6] * in[6]; |
265 | 0 | float even1 = in[0] + kC7[6] * in[2] - kC7[2] * in[4] - kC7[4] * in[6]; |
266 | 0 | float even2 = in[0] - kC7[4] * in[2] - kC7[6] * in[4] + kC7[2] * in[6]; |
267 | 0 | float even3 = in[0] - kC7[0] * in[2] + kC7[0] * in[4] - kC7[0] * in[6]; |
268 | 0 | float odd0 = kC7[1] * in[1] + kC7[3] * in[3] + kC7[5] * in[5]; |
269 | 0 | float odd1 = kC7[3] * in[1] - kC7[5] * in[3] - kC7[1] * in[5]; |
270 | 0 | float odd2 = kC7[5] * in[1] - kC7[1] * in[3] + kC7[3] * in[5]; |
271 | 0 | out[0] = even0 + odd0; |
272 | 0 | out[6] = even0 - odd0; |
273 | 0 | out[1] = even1 + odd1; |
274 | 0 | out[5] = even1 - odd1; |
275 | 0 | out[2] = even2 + odd2; |
276 | 0 | out[4] = even2 - odd2; |
277 | 0 | out[3] = even3; |
278 | 0 | break; |
279 | 0 | } |
280 | 0 | case 9: { |
281 | 0 | static constexpr float kC9[9] = { |
282 | 0 | 1.414213562373, 1.392728480640, 1.328926048777, |
283 | 0 | 1.224744871392, 1.083350440839, 0.909038955344, |
284 | 0 | 0.707106781187, 0.483689525296, 0.245575607938, |
285 | 0 | }; |
286 | 0 | float even0 = in[0] + kC9[2] * in[2] + kC9[4] * in[4] + kC9[6] * in[6]; |
287 | 0 | float even1 = in[0] + kC9[6] * in[2] - kC9[6] * in[4] - kC9[0] * in[6]; |
288 | 0 | float even2 = in[0] - kC9[8] * in[2] - kC9[2] * in[4] + kC9[6] * in[6]; |
289 | 0 | float even3 = in[0] - kC9[4] * in[2] + kC9[8] * in[4] + kC9[6] * in[6]; |
290 | 0 | float even4 = in[0] - kC9[0] * in[2] + kC9[0] * in[4] - kC9[0] * in[6]; |
291 | 0 | float odd0 = |
292 | 0 | kC9[1] * in[1] + kC9[3] * in[3] + kC9[5] * in[5] + kC9[7] * in[7]; |
293 | 0 | float odd1 = kC9[3] * in[1] - kC9[3] * in[5] - kC9[3] * in[7]; |
294 | 0 | float odd2 = |
295 | 0 | kC9[5] * in[1] - kC9[3] * in[3] - kC9[7] * in[5] + kC9[1] * in[7]; |
296 | 0 | float odd3 = |
297 | 0 | kC9[7] * in[1] - kC9[3] * in[3] + kC9[1] * in[5] - kC9[5] * in[7]; |
298 | 0 | out[0] = even0 + odd0; |
299 | 0 | out[8] = even0 - odd0; |
300 | 0 | out[1] = even1 + odd1; |
301 | 0 | out[7] = even1 - odd1; |
302 | 0 | out[2] = even2 + odd2; |
303 | 0 | out[6] = even2 - odd2; |
304 | 0 | out[3] = even3 + odd3; |
305 | 0 | out[5] = even3 - odd3; |
306 | 0 | out[4] = even4; |
307 | 0 | break; |
308 | 0 | } |
309 | 0 | case 10: { |
310 | 0 | static constexpr float kC10[10] = { |
311 | 0 | 1.414213562373, 1.396802246667, 1.344997023928, 1.260073510670, |
312 | 0 | 1.144122805635, 1.000000000000, 0.831253875555, 0.642039521920, |
313 | 0 | 0.437016024449, 0.221231742082, |
314 | 0 | }; |
315 | 0 | float even0 = in[0] + kC10[2] * in[2] + kC10[4] * in[4] + kC10[6] * in[6]; |
316 | 0 | float even1 = in[0] + kC10[6] * in[2] - kC10[8] * in[4] - kC10[2] * in[6]; |
317 | 0 | float even2 = in[0] - kC10[0] * in[4]; |
318 | 0 | float even3 = in[0] - kC10[6] * in[2] - kC10[8] * in[4] + kC10[2] * in[6]; |
319 | 0 | float even4 = in[0] - kC10[2] * in[2] + kC10[4] * in[4] - kC10[6] * in[6]; |
320 | 0 | float odd0 = |
321 | 0 | kC10[1] * in[1] + kC10[3] * in[3] + kC10[5] * in[5] + kC10[7] * in[7]; |
322 | 0 | float odd1 = |
323 | 0 | kC10[3] * in[1] + kC10[9] * in[3] - kC10[5] * in[5] - kC10[1] * in[7]; |
324 | 0 | float odd2 = |
325 | 0 | kC10[5] * in[1] - kC10[5] * in[3] - kC10[5] * in[5] + kC10[5] * in[7]; |
326 | 0 | float odd3 = |
327 | 0 | kC10[7] * in[1] - kC10[1] * in[3] + kC10[5] * in[5] + kC10[9] * in[7]; |
328 | 0 | float odd4 = |
329 | 0 | kC10[9] * in[1] - kC10[7] * in[3] + kC10[5] * in[5] - kC10[3] * in[7]; |
330 | 0 | out[0] = even0 + odd0; |
331 | 0 | out[9] = even0 - odd0; |
332 | 0 | out[1] = even1 + odd1; |
333 | 0 | out[8] = even1 - odd1; |
334 | 0 | out[2] = even2 + odd2; |
335 | 0 | out[7] = even2 - odd2; |
336 | 0 | out[3] = even3 + odd3; |
337 | 0 | out[6] = even3 - odd3; |
338 | 0 | out[4] = even4 + odd4; |
339 | 0 | out[5] = even4 - odd4; |
340 | 0 | break; |
341 | 0 | } |
342 | 0 | case 11: { |
343 | 0 | static constexpr float kC11[11] = { |
344 | 0 | 1.414213562373, 1.399818907436, 1.356927976287, 1.286413904599, |
345 | 0 | 1.189712155524, 1.068791297809, 0.926112931411, 0.764581576418, |
346 | 0 | 0.587485545401, 0.398430002847, 0.201263574413, |
347 | 0 | }; |
348 | 0 | float even0 = in[0] + kC11[2] * in[2] + kC11[4] * in[4] + kC11[6] * in[6]; |
349 | 0 | float even1 = |
350 | 0 | in[0] + kC11[6] * in[2] - kC11[10] * in[4] - kC11[4] * in[6]; |
351 | 0 | float even2 = |
352 | 0 | in[0] + kC11[10] * in[2] - kC11[2] * in[4] - kC11[8] * in[6]; |
353 | 0 | float even3 = in[0] - kC11[8] * in[2] - kC11[6] * in[4] + kC11[2] * in[6]; |
354 | 0 | float even4 = |
355 | 0 | in[0] - kC11[4] * in[2] + kC11[8] * in[4] + kC11[10] * in[6]; |
356 | 0 | float even5 = in[0] - kC11[0] * in[2] + kC11[0] * in[4] - kC11[0] * in[6]; |
357 | 0 | float odd0 = |
358 | 0 | kC11[1] * in[1] + kC11[3] * in[3] + kC11[5] * in[5] + kC11[7] * in[7]; |
359 | 0 | float odd1 = |
360 | 0 | kC11[3] * in[1] + kC11[9] * in[3] - kC11[7] * in[5] - kC11[1] * in[7]; |
361 | 0 | float odd2 = |
362 | 0 | kC11[5] * in[1] - kC11[7] * in[3] - kC11[3] * in[5] + kC11[9] * in[7]; |
363 | 0 | float odd3 = |
364 | 0 | kC11[7] * in[1] - kC11[1] * in[3] + kC11[9] * in[5] + kC11[5] * in[7]; |
365 | 0 | float odd4 = |
366 | 0 | kC11[9] * in[1] - kC11[5] * in[3] + kC11[1] * in[5] - kC11[3] * in[7]; |
367 | 0 | out[0] = even0 + odd0; |
368 | 0 | out[10] = even0 - odd0; |
369 | 0 | out[1] = even1 + odd1; |
370 | 0 | out[9] = even1 - odd1; |
371 | 0 | out[2] = even2 + odd2; |
372 | 0 | out[8] = even2 - odd2; |
373 | 0 | out[3] = even3 + odd3; |
374 | 0 | out[7] = even3 - odd3; |
375 | 0 | out[4] = even4 + odd4; |
376 | 0 | out[6] = even4 - odd4; |
377 | 0 | out[5] = even5; |
378 | 0 | break; |
379 | 0 | } |
380 | 0 | case 12: { |
381 | 0 | static constexpr float kC12[12] = { |
382 | 0 | 1.414213562373, 1.402114769300, 1.366025403784, 1.306562964876, |
383 | 0 | 1.224744871392, 1.121971053594, 1.000000000000, 0.860918669154, |
384 | 0 | 0.707106781187, 0.541196100146, 0.366025403784, 0.184591911283, |
385 | 0 | }; |
386 | 0 | float even0 = in[0] + kC12[2] * in[2] + kC12[4] * in[4] + kC12[6] * in[6]; |
387 | 0 | float even1 = in[0] + kC12[6] * in[2] - kC12[6] * in[6]; |
388 | 0 | float even2 = |
389 | 0 | in[0] + kC12[10] * in[2] - kC12[4] * in[4] - kC12[6] * in[6]; |
390 | 0 | float even3 = |
391 | 0 | in[0] - kC12[10] * in[2] - kC12[4] * in[4] + kC12[6] * in[6]; |
392 | 0 | float even4 = in[0] - kC12[6] * in[2] + kC12[6] * in[6]; |
393 | 0 | float even5 = in[0] - kC12[2] * in[2] + kC12[4] * in[4] - kC12[6] * in[6]; |
394 | 0 | float odd0 = |
395 | 0 | kC12[1] * in[1] + kC12[3] * in[3] + kC12[5] * in[5] + kC12[7] * in[7]; |
396 | 0 | float odd1 = |
397 | 0 | kC12[3] * in[1] + kC12[9] * in[3] - kC12[9] * in[5] - kC12[3] * in[7]; |
398 | 0 | float odd2 = kC12[5] * in[1] - kC12[9] * in[3] - kC12[1] * in[5] - |
399 | 0 | kC12[11] * in[7]; |
400 | 0 | float odd3 = kC12[7] * in[1] - kC12[3] * in[3] - kC12[11] * in[5] + |
401 | 0 | kC12[1] * in[7]; |
402 | 0 | float odd4 = |
403 | 0 | kC12[9] * in[1] - kC12[3] * in[3] + kC12[3] * in[5] - kC12[9] * in[7]; |
404 | 0 | float odd5 = kC12[11] * in[1] - kC12[9] * in[3] + kC12[7] * in[5] - |
405 | 0 | kC12[5] * in[7]; |
406 | 0 | out[0] = even0 + odd0; |
407 | 0 | out[11] = even0 - odd0; |
408 | 0 | out[1] = even1 + odd1; |
409 | 0 | out[10] = even1 - odd1; |
410 | 0 | out[2] = even2 + odd2; |
411 | 0 | out[9] = even2 - odd2; |
412 | 0 | out[3] = even3 + odd3; |
413 | 0 | out[8] = even3 - odd3; |
414 | 0 | out[4] = even4 + odd4; |
415 | 0 | out[7] = even4 - odd4; |
416 | 0 | out[5] = even5 + odd5; |
417 | 0 | out[6] = even5 - odd5; |
418 | 0 | break; |
419 | 0 | } |
420 | 0 | case 13: { |
421 | 0 | static constexpr float kC13[13] = { |
422 | 0 | 1.414213562373, 1.403902353238, 1.373119086479, 1.322312651445, |
423 | 0 | 1.252223920364, 1.163874944761, 1.058554051646, 0.937797056801, |
424 | 0 | 0.803364869133, 0.657217812653, 0.501487040539, 0.338443458124, |
425 | 0 | 0.170464607981, |
426 | 0 | }; |
427 | 0 | float even0 = in[0] + kC13[2] * in[2] + kC13[4] * in[4] + kC13[6] * in[6]; |
428 | 0 | float even1 = |
429 | 0 | in[0] + kC13[6] * in[2] + kC13[12] * in[4] - kC13[8] * in[6]; |
430 | 0 | float even2 = |
431 | 0 | in[0] + kC13[10] * in[2] - kC13[6] * in[4] - kC13[4] * in[6]; |
432 | 0 | float even3 = |
433 | 0 | in[0] - kC13[12] * in[2] - kC13[2] * in[4] + kC13[10] * in[6]; |
434 | 0 | float even4 = |
435 | 0 | in[0] - kC13[8] * in[2] - kC13[10] * in[4] + kC13[2] * in[6]; |
436 | 0 | float even5 = |
437 | 0 | in[0] - kC13[4] * in[2] + kC13[8] * in[4] - kC13[12] * in[6]; |
438 | 0 | float even6 = in[0] - kC13[0] * in[2] + kC13[0] * in[4] - kC13[0] * in[6]; |
439 | 0 | float odd0 = |
440 | 0 | kC13[1] * in[1] + kC13[3] * in[3] + kC13[5] * in[5] + kC13[7] * in[7]; |
441 | 0 | float odd1 = kC13[3] * in[1] + kC13[9] * in[3] - kC13[11] * in[5] - |
442 | 0 | kC13[5] * in[7]; |
443 | 0 | float odd2 = kC13[5] * in[1] - kC13[11] * in[3] - kC13[1] * in[5] - |
444 | 0 | kC13[9] * in[7]; |
445 | 0 | float odd3 = |
446 | 0 | kC13[7] * in[1] - kC13[5] * in[3] - kC13[9] * in[5] + kC13[3] * in[7]; |
447 | 0 | float odd4 = kC13[9] * in[1] - kC13[1] * in[3] + kC13[7] * in[5] + |
448 | 0 | kC13[11] * in[7]; |
449 | 0 | float odd5 = kC13[11] * in[1] - kC13[7] * in[3] + kC13[3] * in[5] - |
450 | 0 | kC13[1] * in[7]; |
451 | 0 | out[0] = even0 + odd0; |
452 | 0 | out[12] = even0 - odd0; |
453 | 0 | out[1] = even1 + odd1; |
454 | 0 | out[11] = even1 - odd1; |
455 | 0 | out[2] = even2 + odd2; |
456 | 0 | out[10] = even2 - odd2; |
457 | 0 | out[3] = even3 + odd3; |
458 | 0 | out[9] = even3 - odd3; |
459 | 0 | out[4] = even4 + odd4; |
460 | 0 | out[8] = even4 - odd4; |
461 | 0 | out[5] = even5 + odd5; |
462 | 0 | out[7] = even5 - odd5; |
463 | 0 | out[6] = even6; |
464 | 0 | break; |
465 | 0 | } |
466 | 0 | case 14: { |
467 | 0 | static constexpr float kC14[14] = { |
468 | 0 | 1.414213562373, 1.405321284327, 1.378756275744, 1.334852607020, |
469 | 0 | 1.274162392264, 1.197448846138, 1.105676685997, 1.000000000000, |
470 | 0 | 0.881747733790, 0.752406978226, 0.613604268353, 0.467085128785, |
471 | 0 | 0.314692122713, 0.158341680609, |
472 | 0 | }; |
473 | 0 | float even0 = in[0] + kC14[2] * in[2] + kC14[4] * in[4] + kC14[6] * in[6]; |
474 | 0 | float even1 = |
475 | 0 | in[0] + kC14[6] * in[2] + kC14[12] * in[4] - kC14[10] * in[6]; |
476 | 0 | float even2 = |
477 | 0 | in[0] + kC14[10] * in[2] - kC14[8] * in[4] - kC14[2] * in[6]; |
478 | 0 | float even3 = in[0] - kC14[0] * in[4]; |
479 | 0 | float even4 = |
480 | 0 | in[0] - kC14[10] * in[2] - kC14[8] * in[4] + kC14[2] * in[6]; |
481 | 0 | float even5 = |
482 | 0 | in[0] - kC14[6] * in[2] + kC14[12] * in[4] + kC14[10] * in[6]; |
483 | 0 | float even6 = in[0] - kC14[2] * in[2] + kC14[4] * in[4] - kC14[6] * in[6]; |
484 | 0 | float odd0 = |
485 | 0 | kC14[1] * in[1] + kC14[3] * in[3] + kC14[5] * in[5] + kC14[7] * in[7]; |
486 | 0 | float odd1 = kC14[3] * in[1] + kC14[9] * in[3] - kC14[13] * in[5] - |
487 | 0 | kC14[7] * in[7]; |
488 | 0 | float odd2 = kC14[5] * in[1] - kC14[13] * in[3] - kC14[3] * in[5] - |
489 | 0 | kC14[7] * in[7]; |
490 | 0 | float odd3 = |
491 | 0 | kC14[7] * in[1] - kC14[7] * in[3] - kC14[7] * in[5] + kC14[7] * in[7]; |
492 | 0 | float odd4 = kC14[9] * in[1] - kC14[1] * in[3] + kC14[11] * in[5] + |
493 | 0 | kC14[7] * in[7]; |
494 | 0 | float odd5 = kC14[11] * in[1] - kC14[5] * in[3] + kC14[1] * in[5] - |
495 | 0 | kC14[7] * in[7]; |
496 | 0 | float odd6 = kC14[13] * in[1] - kC14[11] * in[3] + kC14[9] * in[5] - |
497 | 0 | kC14[7] * in[7]; |
498 | 0 | out[0] = even0 + odd0; |
499 | 0 | out[13] = even0 - odd0; |
500 | 0 | out[1] = even1 + odd1; |
501 | 0 | out[12] = even1 - odd1; |
502 | 0 | out[2] = even2 + odd2; |
503 | 0 | out[11] = even2 - odd2; |
504 | 0 | out[3] = even3 + odd3; |
505 | 0 | out[10] = even3 - odd3; |
506 | 0 | out[4] = even4 + odd4; |
507 | 0 | out[9] = even4 - odd4; |
508 | 0 | out[5] = even5 + odd5; |
509 | 0 | out[8] = even5 - odd5; |
510 | 0 | out[6] = even6 + odd6; |
511 | 0 | out[7] = even6 - odd6; |
512 | 0 | break; |
513 | 0 | } |
514 | 0 | case 15: { |
515 | 0 | static constexpr float kC15[15] = { |
516 | 0 | 1.414213562373, 1.406466352507, 1.383309602960, 1.344997023928, |
517 | 0 | 1.291948376043, 1.224744871392, 1.144122805635, 1.050965490998, |
518 | 0 | 0.946293578512, 0.831253875555, 0.707106781187, 0.575212476952, |
519 | 0 | 0.437016024449, 0.294031532930, 0.147825570407, |
520 | 0 | }; |
521 | 0 | float even0 = in[0] + kC15[2] * in[2] + kC15[4] * in[4] + kC15[6] * in[6]; |
522 | 0 | float even1 = |
523 | 0 | in[0] + kC15[6] * in[2] + kC15[12] * in[4] - kC15[12] * in[6]; |
524 | 0 | float even2 = |
525 | 0 | in[0] + kC15[10] * in[2] - kC15[10] * in[4] - kC15[0] * in[6]; |
526 | 0 | float even3 = |
527 | 0 | in[0] + kC15[14] * in[2] - kC15[2] * in[4] - kC15[12] * in[6]; |
528 | 0 | float even4 = |
529 | 0 | in[0] - kC15[12] * in[2] - kC15[6] * in[4] + kC15[6] * in[6]; |
530 | 0 | float even5 = |
531 | 0 | in[0] - kC15[8] * in[2] - kC15[14] * in[4] + kC15[6] * in[6]; |
532 | 0 | float even6 = |
533 | 0 | in[0] - kC15[4] * in[2] + kC15[8] * in[4] - kC15[12] * in[6]; |
534 | 0 | float even7 = in[0] - kC15[0] * in[2] + kC15[0] * in[4] - kC15[0] * in[6]; |
535 | 0 | float odd0 = |
536 | 0 | kC15[1] * in[1] + kC15[3] * in[3] + kC15[5] * in[5] + kC15[7] * in[7]; |
537 | 0 | float odd1 = kC15[3] * in[1] + kC15[9] * in[3] - kC15[9] * in[7]; |
538 | 0 | float odd2 = kC15[5] * in[1] - kC15[5] * in[5] - kC15[5] * in[7]; |
539 | 0 | float odd3 = kC15[7] * in[1] - kC15[9] * in[3] - kC15[5] * in[5] + |
540 | 0 | kC15[11] * in[7]; |
541 | 0 | float odd4 = kC15[9] * in[1] - kC15[3] * in[3] + kC15[3] * in[7]; |
542 | 0 | float odd5 = kC15[11] * in[1] - kC15[3] * in[3] + kC15[5] * in[5] - |
543 | 0 | kC15[13] * in[7]; |
544 | 0 | float odd6 = kC15[13] * in[1] - kC15[9] * in[3] + kC15[5] * in[5] - |
545 | 0 | kC15[1] * in[7]; |
546 | 0 | out[0] = even0 + odd0; |
547 | 0 | out[14] = even0 - odd0; |
548 | 0 | out[1] = even1 + odd1; |
549 | 0 | out[13] = even1 - odd1; |
550 | 0 | out[2] = even2 + odd2; |
551 | 0 | out[12] = even2 - odd2; |
552 | 0 | out[3] = even3 + odd3; |
553 | 0 | out[11] = even3 - odd3; |
554 | 0 | out[4] = even4 + odd4; |
555 | 0 | out[10] = even4 - odd4; |
556 | 0 | out[5] = even5 + odd5; |
557 | 0 | out[9] = even5 - odd5; |
558 | 0 | out[6] = even6 + odd6; |
559 | 0 | out[8] = even6 - odd6; |
560 | 0 | out[7] = even7; |
561 | 0 | break; |
562 | 0 | } |
563 | 0 | case 16: { |
564 | 0 | static constexpr float kC16[16] = { |
565 | 0 | 1.414213562373, 1.407403737526, 1.387039845322, 1.353318001174, |
566 | 0 | 1.306562964876, 1.247225012987, 1.175875602419, 1.093201867002, |
567 | 0 | 1.000000000000, 0.897167586343, 0.785694958387, 0.666655658478, |
568 | 0 | 0.541196100146, 0.410524527522, 0.275899379283, 0.138617169199, |
569 | 0 | }; |
570 | 0 | float even0 = in[0] + kC16[2] * in[2] + kC16[4] * in[4] + kC16[6] * in[6]; |
571 | 0 | float even1 = |
572 | 0 | in[0] + kC16[6] * in[2] + kC16[12] * in[4] - kC16[14] * in[6]; |
573 | 0 | float even2 = |
574 | 0 | in[0] + kC16[10] * in[2] - kC16[12] * in[4] - kC16[2] * in[6]; |
575 | 0 | float even3 = |
576 | 0 | in[0] + kC16[14] * in[2] - kC16[4] * in[4] - kC16[10] * in[6]; |
577 | 0 | float even4 = |
578 | 0 | in[0] - kC16[14] * in[2] - kC16[4] * in[4] + kC16[10] * in[6]; |
579 | 0 | float even5 = |
580 | 0 | in[0] - kC16[10] * in[2] - kC16[12] * in[4] + kC16[2] * in[6]; |
581 | 0 | float even6 = |
582 | 0 | in[0] - kC16[6] * in[2] + kC16[12] * in[4] + kC16[14] * in[6]; |
583 | 0 | float even7 = in[0] - kC16[2] * in[2] + kC16[4] * in[4] - kC16[6] * in[6]; |
584 | 0 | float odd0 = (kC16[1] * in[1] + kC16[3] * in[3] + kC16[5] * in[5] + |
585 | 0 | kC16[7] * in[7]); |
586 | 0 | float odd1 = (kC16[3] * in[1] + kC16[9] * in[3] + kC16[15] * in[5] - |
587 | 0 | kC16[11] * in[7]); |
588 | 0 | float odd2 = (kC16[5] * in[1] + kC16[15] * in[3] - kC16[7] * in[5] - |
589 | 0 | kC16[3] * in[7]); |
590 | 0 | float odd3 = (kC16[7] * in[1] - kC16[11] * in[3] - kC16[3] * in[5] + |
591 | 0 | kC16[15] * in[7]); |
592 | 0 | float odd4 = (kC16[9] * in[1] - kC16[5] * in[3] - kC16[13] * in[5] + |
593 | 0 | kC16[1] * in[7]); |
594 | 0 | float odd5 = (kC16[11] * in[1] - kC16[1] * in[3] + kC16[9] * in[5] + |
595 | 0 | kC16[13] * in[7]); |
596 | 0 | float odd6 = (kC16[13] * in[1] - kC16[7] * in[3] + kC16[1] * in[5] - |
597 | 0 | kC16[5] * in[7]); |
598 | 0 | float odd7 = (kC16[15] * in[1] - kC16[13] * in[3] + kC16[11] * in[5] - |
599 | 0 | kC16[9] * in[7]); |
600 | 0 | out[0] = even0 + odd0; |
601 | 0 | out[15] = even0 - odd0; |
602 | 0 | out[1] = even1 + odd1; |
603 | 0 | out[14] = even1 - odd1; |
604 | 0 | out[2] = even2 + odd2; |
605 | 0 | out[13] = even2 - odd2; |
606 | 0 | out[3] = even3 + odd3; |
607 | 0 | out[12] = even3 - odd3; |
608 | 0 | out[4] = even4 + odd4; |
609 | 0 | out[11] = even4 - odd4; |
610 | 0 | out[5] = even5 + odd5; |
611 | 0 | out[10] = even5 - odd5; |
612 | 0 | out[6] = even6 + odd6; |
613 | 0 | out[9] = even6 - odd6; |
614 | 0 | out[7] = even7 + odd7; |
615 | 0 | out[8] = even7 - odd7; |
616 | 0 | break; |
617 | 0 | } |
// N = 1, 2, 4 and 8 have no case above; within SOURCE the only caller
// (InverseTransformBlockGeneric) handles 1, 2 and 4 itself, and
// ChooseInverseTransform routes 8 to InverseTransformBlock8x8.
618 | 0 | default: |
619 | 0 | JXL_DEBUG_ABORT("Unreachable"); |
620 | 0 | break; |
621 | 0 | } |
622 | 0 | } Unexecuted instantiation: jpegli::N_SSE4::Compute1dIDCT(float const*, float*, unsigned long) Unexecuted instantiation: jpegli::N_AVX2::Compute1dIDCT(float const*, float*, unsigned long) Unexecuted instantiation: jpegli::N_SSE2::Compute1dIDCT(float const*, float*, unsigned long) |
623 | | |
// Inverse transform used when a component's scaled DCT size is not DCTSIZE.
// Dequantizes qblock with DequantBlock and writes a dctsize x dctsize block to
// output, with consecutive rows output_stride floats apart.
//   dctsize == 1:      the single output is the first dequantized coefficient.
//   dctsize == 2 or 4: a full 8x8 IDCT is computed into a scratch block and
//                      box-averaged down (4x4 or 2x2 cells respectively).
//   otherwise:         separable IDCT via Compute1dIDCT, columns then rows.
// scratch_space is carved into consecutive DCTSIZE2-float regions: block0,
// block1 and, for sizes 2 and 4, block2. In the separable branch block1 is
// indexed up to (dctsize - 1) * dctsize + 7, which exceeds DCTSIZE2 for
// dctsize > 8.
// NOTE(review): the scratch allocation is outside this file; confirm that it
// is large enough for dctsize == 16.
624 | | void InverseTransformBlockGeneric(const int16_t* JXL_RESTRICT qblock, |
625 | | const float* JXL_RESTRICT dequant, |
626 | | const float* JXL_RESTRICT biases, |
627 | | float* JXL_RESTRICT scratch_space, |
628 | | float* JXL_RESTRICT output, |
629 | 0 | size_t output_stride, size_t dctsize) { |
630 | 0 | float* JXL_RESTRICT block0 = scratch_space; |
631 | 0 | float* JXL_RESTRICT block1 = scratch_space + DCTSIZE2; |
632 | 0 | DequantBlock(qblock, dequant, biases, block0); |
633 | 0 | if (dctsize == 1) { |
634 | 0 | *output = *block0; |
635 | 0 | } else if (dctsize == 2 || dctsize == 4) { |
// Full-size 8x8 result goes to block2 with a row stride of 8.
636 | 0 | float* JXL_RESTRICT block2 = scratch_space + 2 * DCTSIZE2; |
637 | 0 | ComputeScaledIDCT(block0, block1, block2, 8); |
638 | 0 | if (dctsize == 4) { |
// Each output sample is the mean of a 2x2 cell of the 8x8 result.
639 | 0 | for (size_t iy = 0; iy < 4; ++iy) { |
640 | 0 | for (size_t ix = 0; ix < 4; ++ix) { |
641 | 0 | float* block = &block2[16 * iy + 2 * ix]; |
642 | 0 | output[iy * output_stride + ix] = |
643 | 0 | 0.25f * (block[0] + block[1] + block[8] + block[9]); |
644 | 0 | } |
645 | 0 | } |
646 | 0 | } else { |
// dctsize == 2: each output sample is the mean of a 4x4 cell.
647 | 0 | for (size_t iy = 0; iy < 2; ++iy) { |
648 | 0 | for (size_t ix = 0; ix < 2; ++ix) { |
649 | 0 | float* block = &block2[32 * iy + 4 * ix]; |
650 | 0 | output[iy * output_stride + ix] = |
651 | 0 | 0.0625f * |
652 | 0 | (block[0] + block[1] + block[2] + block[3] + block[8] + block[9] + |
653 | 0 | block[10] + block[11] + block[16] + block[17] + block[18] + |
654 | 0 | block[19] + block[24] + block[25] + block[26] + block[27]); |
655 | 0 | } |
656 | 0 | } |
657 | 0 | } |
658 | 0 | } else { |
// Column pass: gather up to insize coefficients of column ix from block0,
// transform them to dctsize samples and store them in block1 using a row
// stride of dctsize. For dctsize < DCTSIZE the tail of dctin is left
// unset; Compute1dIDCT does not read past in[N - 1] for those sizes.
659 | 0 | float dctin[DCTSIZE]; |
660 | 0 | float dctout[DCTSIZE * 2]; |
661 | 0 | size_t insize = std::min<size_t>(dctsize, DCTSIZE); |
662 | 0 | for (size_t ix = 0; ix < insize; ++ix) { |
663 | 0 | for (size_t iy = 0; iy < insize; ++iy) { |
664 | 0 | dctin[iy] = block0[iy * DCTSIZE + ix]; |
665 | 0 | } |
666 | 0 | Compute1dIDCT(dctin, dctout, dctsize); |
667 | 0 | for (size_t iy = 0; iy < dctsize; ++iy) { |
668 | 0 | block1[iy * dctsize + ix] = dctout[iy]; |
669 | 0 | } |
670 | 0 | } |
// Row pass: transform each of the dctsize rows of block1 straight into
// the corresponding output row.
671 | 0 | for (size_t iy = 0; iy < dctsize; ++iy) { |
672 | 0 | Compute1dIDCT(block1 + iy * dctsize, output + iy * output_stride, |
673 | 0 | dctsize); |
674 | 0 | } |
675 | 0 | } |
676 | 0 | } Unexecuted instantiation: jpegli::N_SSE4::InverseTransformBlockGeneric(short const*, float const*, float const*, float*, float*, unsigned long, unsigned long) Unexecuted instantiation: jpegli::N_AVX2::InverseTransformBlockGeneric(short const*, float const*, float const*, float*, float*, unsigned long, unsigned long) Unexecuted instantiation: jpegli::N_SSE2::InverseTransformBlockGeneric(short const*, float const*, float const*, float*, float*, unsigned long, unsigned long) |
677 | | |
678 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
679 | | } // namespace HWY_NAMESPACE |
680 | | } // namespace jpegli |
681 | | HWY_AFTER_NAMESPACE(); |
682 | | |
683 | | #if HWY_ONCE |
684 | | namespace jpegli { |
685 | | |
// Register the per-target implementations of both inverse transforms so that
// HWY_DYNAMIC_DISPATCH(...) can refer to them by name in this namespace.
686 | | HWY_EXPORT(InverseTransformBlock8x8); |
687 | | HWY_EXPORT(InverseTransformBlockGeneric); |
688 | | |
// Selects the inverse-transform function for every component of cinfo.
// For component c it reads m->scaled_dct_size[c] and stores into
// m->inverse_transform[c]:
//   - InverseTransformBlock8x8 when the size equals DCTSIZE,
//   - InverseTransformBlockGeneric for every other size in [1, 16].
// Returns a JXL_FAILURE status as soon as a size outside [1, 16] is found
// (components before the offending one have already been assigned by then),
// and true otherwise.
689 | 2.68k | jxl::Status ChooseInverseTransform(j_decompress_ptr cinfo) { |
690 | 2.68k | jpeg_decomp_master* m = cinfo->master; |
691 | 7.84k | for (int c = 0; c < cinfo->num_components; ++c) { |
692 | 5.15k | int dct_size = m->scaled_dct_size[c]; |
693 | 5.15k | if (dct_size < 1 || dct_size > 16) { |
694 | 0 | return JXL_FAILURE("Compute1dIDCT does not support N=%d", dct_size); |
695 | 0 | } |
696 | 5.15k | if (dct_size == DCTSIZE) { |
697 | 5.15k | m->inverse_transform[c] = HWY_DYNAMIC_DISPATCH(InverseTransformBlock8x8); |
698 | 5.15k | } else { |
699 | 0 | m->inverse_transform[c] = |
700 | 0 | HWY_DYNAMIC_DISPATCH(InverseTransformBlockGeneric); |
701 | 0 | } |
702 | 5.15k | } |
703 | 2.68k | return true; |
704 | 2.68k | } |
705 | | |
706 | | } // namespace jpegli |
707 | | #endif // HWY_ONCE |