/src/libjxl/lib/jpegli/transpose-inl.h
Line | Count | Source |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #if defined(LIB_JPEGLI_TRANSPOSE_INL_H_) == defined(HWY_TARGET_TOGGLE) |
7 | | #ifdef LIB_JPEGLI_TRANSPOSE_INL_H_ |
8 | | #undef LIB_JPEGLI_TRANSPOSE_INL_H_ |
9 | | #else |
10 | | #define LIB_JPEGLI_TRANSPOSE_INL_H_ |
11 | | #endif |
12 | | |
13 | | #include "lib/jxl/base/compiler_specific.h" |
14 | | |
15 | | HWY_BEFORE_NAMESPACE(); |
16 | | namespace jpegli { |
17 | | namespace HWY_NAMESPACE { |
18 | | namespace { |
19 | | |
20 | | #if HWY_CAP_GE256 |
21 | | static JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from, |
22 | 31.1M | float* JXL_RESTRICT to) { |
23 | 31.1M | const HWY_CAPPED(float, 8) d; |
24 | 31.1M | auto i0 = Load(d, from); |
25 | 31.1M | auto i1 = Load(d, from + 1 * 8); |
26 | 31.1M | auto i2 = Load(d, from + 2 * 8); |
27 | 31.1M | auto i3 = Load(d, from + 3 * 8); |
28 | 31.1M | auto i4 = Load(d, from + 4 * 8); |
29 | 31.1M | auto i5 = Load(d, from + 5 * 8); |
30 | 31.1M | auto i6 = Load(d, from + 6 * 8); |
31 | 31.1M | auto i7 = Load(d, from + 7 * 8); |
32 | | |
33 | 31.1M | const auto q0 = InterleaveLower(d, i0, i2); |
34 | 31.1M | const auto q1 = InterleaveLower(d, i1, i3); |
35 | 31.1M | const auto q2 = InterleaveUpper(d, i0, i2); |
36 | 31.1M | const auto q3 = InterleaveUpper(d, i1, i3); |
37 | 31.1M | const auto q4 = InterleaveLower(d, i4, i6); |
38 | 31.1M | const auto q5 = InterleaveLower(d, i5, i7); |
39 | 31.1M | const auto q6 = InterleaveUpper(d, i4, i6); |
40 | 31.1M | const auto q7 = InterleaveUpper(d, i5, i7); |
41 | | |
42 | 31.1M | const auto r0 = InterleaveLower(d, q0, q1); |
43 | 31.1M | const auto r1 = InterleaveUpper(d, q0, q1); |
44 | 31.1M | const auto r2 = InterleaveLower(d, q2, q3); |
45 | 31.1M | const auto r3 = InterleaveUpper(d, q2, q3); |
46 | 31.1M | const auto r4 = InterleaveLower(d, q4, q5); |
47 | 31.1M | const auto r5 = InterleaveUpper(d, q4, q5); |
48 | 31.1M | const auto r6 = InterleaveLower(d, q6, q7); |
49 | 31.1M | const auto r7 = InterleaveUpper(d, q6, q7); |
50 | | |
51 | 31.1M | i0 = ConcatLowerLower(d, r4, r0); |
52 | 31.1M | i1 = ConcatLowerLower(d, r5, r1); |
53 | 31.1M | i2 = ConcatLowerLower(d, r6, r2); |
54 | 31.1M | i3 = ConcatLowerLower(d, r7, r3); |
55 | 31.1M | i4 = ConcatUpperUpper(d, r4, r0); |
56 | 31.1M | i5 = ConcatUpperUpper(d, r5, r1); |
57 | 31.1M | i6 = ConcatUpperUpper(d, r6, r2); |
58 | 31.1M | i7 = ConcatUpperUpper(d, r7, r3); |
59 | | |
60 | 31.1M | Store(i0, d, to); |
61 | 31.1M | Store(i1, d, to + 1 * 8); |
62 | 31.1M | Store(i2, d, to + 2 * 8); |
63 | 31.1M | Store(i3, d, to + 3 * 8); |
64 | 31.1M | Store(i4, d, to + 4 * 8); |
65 | 31.1M | Store(i5, d, to + 5 * 8); |
66 | 31.1M | Store(i6, d, to + 6 * 8); |
67 | 31.1M | Store(i7, d, to + 7 * 8); |
68 | 31.1M | } idct.cc:jpegli::N_AVX2::(anonymous namespace)::Transpose8x8Block(float const*, float*) Line | Count | Source | 22 | 31.1M | float* JXL_RESTRICT to) { | 23 | 31.1M | const HWY_CAPPED(float, 8) d; | 24 | 31.1M | auto i0 = Load(d, from); | 25 | 31.1M | auto i1 = Load(d, from + 1 * 8); | 26 | 31.1M | auto i2 = Load(d, from + 2 * 8); | 27 | 31.1M | auto i3 = Load(d, from + 3 * 8); | 28 | 31.1M | auto i4 = Load(d, from + 4 * 8); | 29 | 31.1M | auto i5 = Load(d, from + 5 * 8); | 30 | 31.1M | auto i6 = Load(d, from + 6 * 8); | 31 | 31.1M | auto i7 = Load(d, from + 7 * 8); | 32 | | | 33 | 31.1M | const auto q0 = InterleaveLower(d, i0, i2); | 34 | 31.1M | const auto q1 = InterleaveLower(d, i1, i3); | 35 | 31.1M | const auto q2 = InterleaveUpper(d, i0, i2); | 36 | 31.1M | const auto q3 = InterleaveUpper(d, i1, i3); | 37 | 31.1M | const auto q4 = InterleaveLower(d, i4, i6); | 38 | 31.1M | const auto q5 = InterleaveLower(d, i5, i7); | 39 | 31.1M | const auto q6 = InterleaveUpper(d, i4, i6); | 40 | 31.1M | const auto q7 = InterleaveUpper(d, i5, i7); | 41 | | | 42 | 31.1M | const auto r0 = InterleaveLower(d, q0, q1); | 43 | 31.1M | const auto r1 = InterleaveUpper(d, q0, q1); | 44 | 31.1M | const auto r2 = InterleaveLower(d, q2, q3); | 45 | 31.1M | const auto r3 = InterleaveUpper(d, q2, q3); | 46 | 31.1M | const auto r4 = InterleaveLower(d, q4, q5); | 47 | 31.1M | const auto r5 = InterleaveUpper(d, q4, q5); | 48 | 31.1M | const auto r6 = InterleaveLower(d, q6, q7); | 49 | 31.1M | const auto r7 = InterleaveUpper(d, q6, q7); | 50 | | | 51 | 31.1M | i0 = ConcatLowerLower(d, r4, r0); | 52 | 31.1M | i1 = ConcatLowerLower(d, r5, r1); | 53 | 31.1M | i2 = ConcatLowerLower(d, r6, r2); | 54 | 31.1M | i3 = ConcatLowerLower(d, r7, r3); | 55 | 31.1M | i4 = ConcatUpperUpper(d, r4, r0); | 56 | 31.1M | i5 = ConcatUpperUpper(d, r5, r1); | 57 | 31.1M | i6 = ConcatUpperUpper(d, r6, r2); | 58 | 31.1M | i7 = ConcatUpperUpper(d, r7, r3); | 59 | | | 60 | 31.1M | Store(i0, d, to); | 61 | 31.1M | Store(i1, d, to + 1 * 8); | 62 | 31.1M | Store(i2, d, to + 2 * 8); | 63 | 31.1M | Store(i3, d, to + 3 * 8); | 64 | 31.1M | Store(i4, d, to + 4 * 8); | 65 | 31.1M | Store(i5, d, to + 5 * 8); | 66 | 31.1M | Store(i6, d, to + 6 * 8); | 67 | 31.1M | Store(i7, d, to + 7 * 8); | 68 | 31.1M | } |
Unexecuted instantiation: idct.cc:jpegli::N_AVX3::(anonymous namespace)::Transpose8x8Block(float const*, float*) Unexecuted instantiation: idct.cc:jpegli::N_AVX3_ZEN4::(anonymous namespace)::Transpose8x8Block(float const*, float*) Unexecuted instantiation: idct.cc:jpegli::N_AVX3_SPR::(anonymous namespace)::Transpose8x8Block(float const*, float*) |
69 | | #elif HWY_TARGET != HWY_SCALAR |
70 | | static JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from, |
71 | 28.7M | float* JXL_RESTRICT to) { |
72 | 28.7M | const HWY_CAPPED(float, 4) d; |
73 | 86.2M | for (size_t n = 0; n < 8; n += 4) { |
74 | 172M | for (size_t m = 0; m < 8; m += 4) { |
75 | 115M | auto p0 = Load(d, from + n * 8 + m); |
76 | 115M | auto p1 = Load(d, from + (n + 1) * 8 + m); |
77 | 115M | auto p2 = Load(d, from + (n + 2) * 8 + m); |
78 | 115M | auto p3 = Load(d, from + (n + 3) * 8 + m); |
79 | 115M | const auto q0 = InterleaveLower(d, p0, p2); |
80 | 115M | const auto q1 = InterleaveLower(d, p1, p3); |
81 | 115M | const auto q2 = InterleaveUpper(d, p0, p2); |
82 | 115M | const auto q3 = InterleaveUpper(d, p1, p3); |
83 | | |
84 | 115M | const auto r0 = InterleaveLower(d, q0, q1); |
85 | 115M | const auto r1 = InterleaveUpper(d, q0, q1); |
86 | 115M | const auto r2 = InterleaveLower(d, q2, q3); |
87 | 115M | const auto r3 = InterleaveUpper(d, q2, q3); |
88 | 115M | Store(r0, d, to + m * 8 + n); |
89 | 115M | Store(r1, d, to + (1 + m) * 8 + n); |
90 | 115M | Store(r2, d, to + (2 + m) * 8 + n); |
91 | 115M | Store(r3, d, to + (3 + m) * 8 + n); |
92 | 115M | } |
93 | 57.5M | } |
94 | 28.7M | } idct.cc:jpegli::N_SSE4::(anonymous namespace)::Transpose8x8Block(float const*, float*) Line | Count | Source | 71 | 14.6M | float* JXL_RESTRICT to) { | 72 | 14.6M | const HWY_CAPPED(float, 4) d; | 73 | 43.9M | for (size_t n = 0; n < 8; n += 4) { | 74 | 87.9M | for (size_t m = 0; m < 8; m += 4) { | 75 | 58.6M | auto p0 = Load(d, from + n * 8 + m); | 76 | 58.6M | auto p1 = Load(d, from + (n + 1) * 8 + m); | 77 | 58.6M | auto p2 = Load(d, from + (n + 2) * 8 + m); | 78 | 58.6M | auto p3 = Load(d, from + (n + 3) * 8 + m); | 79 | 58.6M | const auto q0 = InterleaveLower(d, p0, p2); | 80 | 58.6M | const auto q1 = InterleaveLower(d, p1, p3); | 81 | 58.6M | const auto q2 = InterleaveUpper(d, p0, p2); | 82 | 58.6M | const auto q3 = InterleaveUpper(d, p1, p3); | 83 | | | 84 | 58.6M | const auto r0 = InterleaveLower(d, q0, q1); | 85 | 58.6M | const auto r1 = InterleaveUpper(d, q0, q1); | 86 | 58.6M | const auto r2 = InterleaveLower(d, q2, q3); | 87 | 58.6M | const auto r3 = InterleaveUpper(d, q2, q3); | 88 | 58.6M | Store(r0, d, to + m * 8 + n); | 89 | 58.6M | Store(r1, d, to + (1 + m) * 8 + n); | 90 | 58.6M | Store(r2, d, to + (2 + m) * 8 + n); | 91 | 58.6M | Store(r3, d, to + (3 + m) * 8 + n); | 92 | 58.6M | } | 93 | 29.3M | } | 94 | 14.6M | } |
idct.cc:jpegli::N_SSE2::(anonymous namespace)::Transpose8x8Block(float const*, float*) Line | Count | Source | 71 | 14.1M | float* JXL_RESTRICT to) { | 72 | 14.1M | const HWY_CAPPED(float, 4) d; | 73 | 42.3M | for (size_t n = 0; n < 8; n += 4) { | 74 | 84.6M | for (size_t m = 0; m < 8; m += 4) { | 75 | 56.4M | auto p0 = Load(d, from + n * 8 + m); | 76 | 56.4M | auto p1 = Load(d, from + (n + 1) * 8 + m); | 77 | 56.4M | auto p2 = Load(d, from + (n + 2) * 8 + m); | 78 | 56.4M | auto p3 = Load(d, from + (n + 3) * 8 + m); | 79 | 56.4M | const auto q0 = InterleaveLower(d, p0, p2); | 80 | 56.4M | const auto q1 = InterleaveLower(d, p1, p3); | 81 | 56.4M | const auto q2 = InterleaveUpper(d, p0, p2); | 82 | 56.4M | const auto q3 = InterleaveUpper(d, p1, p3); | 83 | | | 84 | 56.4M | const auto r0 = InterleaveLower(d, q0, q1); | 85 | 56.4M | const auto r1 = InterleaveUpper(d, q0, q1); | 86 | 56.4M | const auto r2 = InterleaveLower(d, q2, q3); | 87 | 56.4M | const auto r3 = InterleaveUpper(d, q2, q3); | 88 | 56.4M | Store(r0, d, to + m * 8 + n); | 89 | 56.4M | Store(r1, d, to + (1 + m) * 8 + n); | 90 | 56.4M | Store(r2, d, to + (2 + m) * 8 + n); | 91 | 56.4M | Store(r3, d, to + (3 + m) * 8 + n); | 92 | 56.4M | } | 93 | 28.2M | } | 94 | 14.1M | } |
|
95 | | #else |
96 | | static JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from, |
97 | | float* JXL_RESTRICT to) { |
98 | | for (size_t n = 0; n < 8; ++n) { |
99 | | for (size_t m = 0; m < 8; ++m) { |
100 | | to[8 * n + m] = from[8 * m + n]; |
101 | | } |
102 | | } |
103 | | } |
104 | | #endif |
105 | | |
106 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
107 | | } // namespace |
108 | | } // namespace HWY_NAMESPACE |
109 | | } // namespace jpegli |
110 | | HWY_AFTER_NAMESPACE(); |
111 | | #endif // LIB_JPEGLI_TRANSPOSE_INL_H_ |