/src/libjxl/lib/jxl/modular/transform/rct.cc
Line | Count | Source |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/modular/transform/rct.h" |
7 | | |
8 | | #include <cstddef> |
9 | | #include <cstdint> |
10 | | #include <utility> |
11 | | |
12 | | #include "lib/jxl/base/data_parallel.h" |
13 | | #include "lib/jxl/base/status.h" |
14 | | #include "lib/jxl/modular/modular_image.h" |
15 | | #include "lib/jxl/modular/transform/transform.h" |
16 | | #undef HWY_TARGET_INCLUDE |
17 | | #define HWY_TARGET_INCLUDE "lib/jxl/modular/transform/rct.cc" |
18 | | #include <hwy/foreach_target.h> |
19 | | #include <hwy/highway.h> |
20 | | HWY_BEFORE_NAMESPACE(); |
21 | | namespace jxl { |
22 | | namespace HWY_NAMESPACE { |
23 | | |
24 | | // These templates are not found via ADL. |
25 | | using hwy::HWY_NAMESPACE::Add; |
26 | | using hwy::HWY_NAMESPACE::ShiftRight; |
27 | | using hwy::HWY_NAMESPACE::Sub; |
28 | | |
29 | | template <int transform_type> |
30 | | void InvRCTRow(const pixel_type* in0, const pixel_type* in1, |
31 | | const pixel_type* in2, pixel_type* out0, pixel_type* out1, |
32 | 68.7k | pixel_type* out2, size_t w) { |
33 | 68.7k | static_assert(transform_type >= 0 && transform_type < 7, |
34 | 68.7k | "Invalid transform type"); |
35 | 68.7k | int second = transform_type >> 1; |
36 | 68.7k | int third = transform_type & 1; |
37 | | |
38 | 68.7k | size_t x = 0; |
39 | 68.7k | const HWY_FULL(pixel_type) d; |
40 | 68.7k | const size_t N = Lanes(d); |
41 | 935k | for (; x + N - 1 < w; x += N) { |
42 | 866k | if (transform_type == 6) { |
43 | 554k | auto Y = Load(d, in0 + x); |
44 | 554k | auto Co = Load(d, in1 + x); |
45 | 554k | auto Cg = Load(d, in2 + x); |
46 | 554k | Y = Sub(Y, ShiftRight<1>(Cg)); |
47 | 554k | auto G = Add(Cg, Y); |
48 | 554k | Y = Sub(Y, ShiftRight<1>(Co)); |
49 | 554k | auto R = Add(Y, Co); |
50 | 554k | Store(R, d, out0 + x); |
51 | 554k | Store(G, d, out1 + x); |
52 | 554k | Store(Y, d, out2 + x); |
53 | 554k | } else { |
54 | 311k | auto First = Load(d, in0 + x); |
55 | 311k | auto Second = Load(d, in1 + x); |
56 | 311k | auto Third = Load(d, in2 + x); |
57 | 311k | if (third) Third = Add(Third, First); |
58 | 311k | if (second == 1) { |
59 | 266k | Second = Add(Second, First); |
60 | 266k | } else if (second == 2) { |
61 | 22.2k | Second = Add(Second, ShiftRight<1>(Add(First, Third))); |
62 | 22.2k | } |
63 | 311k | Store(First, d, out0 + x); |
64 | 311k | Store(Second, d, out1 + x); |
65 | 311k | Store(Third, d, out2 + x); |
66 | 311k | } |
67 | 866k | } |
68 | 128k | for (; x < w; x++) { |
69 | 60.1k | if (transform_type == 6) { |
70 | 29.8k | pixel_type Y = in0[x]; |
71 | 29.8k | pixel_type Co = in1[x]; |
72 | 29.8k | pixel_type Cg = in2[x]; |
73 | 29.8k | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); |
74 | 29.8k | pixel_type G = PixelAdd(Cg, tmp); |
75 | 29.8k | pixel_type B = PixelAdd(tmp, -(Co >> 1)); |
76 | 29.8k | pixel_type R = PixelAdd(B, Co); |
77 | 29.8k | out0[x] = R; |
78 | 29.8k | out1[x] = G; |
79 | 29.8k | out2[x] = B; |
80 | 30.2k | } else { |
81 | 30.2k | pixel_type First = in0[x]; |
82 | 30.2k | pixel_type Second = in1[x]; |
83 | 30.2k | pixel_type Third = in2[x]; |
84 | 30.2k | if (third) Third = PixelAdd(Third, First); |
85 | 30.2k | if (second == 1) { |
86 | 19.9k | Second = PixelAdd(Second, First); |
87 | 19.9k | } else if (second == 2) { |
88 | 7.28k | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); |
89 | 7.28k | } |
90 | 30.2k | out0[x] = First; |
91 | 30.2k | out1[x] = Second; |
92 | 30.2k | out2[x] = Third; |
93 | 30.2k | } |
94 | 60.1k | } |
95 | 68.7k | } Unexecuted instantiation: void jxl::N_SSE4::InvRCTRow<0>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_SSE4::InvRCTRow<1>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_SSE4::InvRCTRow<2>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_SSE4::InvRCTRow<3>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_SSE4::InvRCTRow<4>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_SSE4::InvRCTRow<5>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_SSE4::InvRCTRow<6>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_AVX2::InvRCTRow<0>(int const*, int const*, int const*, int*, int*, int*, unsigned long) void jxl::N_AVX2::InvRCTRow<1>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 2.33k | pixel_type* out2, size_t w) { | 33 | 2.33k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 2.33k | "Invalid transform type"); | 35 | 2.33k | int second = transform_type >> 1; | 36 | 2.33k | int third = transform_type & 1; | 37 | | | 38 | 2.33k | size_t x = 0; | 39 | 2.33k | const HWY_FULL(pixel_type) d; | 40 | 2.33k | const size_t N = Lanes(d); | 41 | 25.1k | for (; x + N - 1 < w; x += N) { | 42 | 22.7k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 22.7k | } else { | 54 | 22.7k | auto First = Load(d, in0 + x); | 55 | 22.7k | auto Second = Load(d, in1 + x); | 56 | 22.7k | auto Third = Load(d, in2 + x); | 57 | 22.7k | if (third) Third = Add(Third, First); | 58 | 22.7k | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 22.7k | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 22.7k | Store(First, d, out0 + x); | 64 | 22.7k | Store(Second, d, out1 + x); | 65 | 22.7k | Store(Third, d, out2 + x); | 66 | 22.7k | } | 67 | 22.7k | } | 68 | 5.31k | for (; x < w; x++) { | 69 | 2.97k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 2.97k | } else { | 81 | 2.97k | pixel_type First = in0[x]; | 82 | 2.97k | pixel_type Second = in1[x]; | 83 | 2.97k | pixel_type Third = in2[x]; | 84 | 2.97k | if (third) Third = PixelAdd(Third, First); | 85 | 2.97k | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 2.97k | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 2.97k | out0[x] = First; | 91 | 2.97k | out1[x] = Second; | 92 | 2.97k | out2[x] = Third; | 93 | 2.97k | } | 94 | 2.97k | } | 95 | 2.33k | } |
void jxl::N_AVX2::InvRCTRow<2>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 20.2k | pixel_type* out2, size_t w) { | 33 | 20.2k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 20.2k | "Invalid transform type"); | 35 | 20.2k | int second = transform_type >> 1; | 36 | 20.2k | int third = transform_type & 1; | 37 | | | 38 | 20.2k | size_t x = 0; | 39 | 20.2k | const HWY_FULL(pixel_type) d; | 40 | 20.2k | const size_t N = Lanes(d); | 41 | 215k | for (; x + N - 1 < w; x += N) { | 42 | 195k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 195k | } else { | 54 | 195k | auto First = Load(d, in0 + x); | 55 | 195k | auto Second = Load(d, in1 + x); | 56 | 195k | auto Third = Load(d, in2 + x); | 57 | 195k | if (third) Third = Add(Third, First); | 58 | 195k | if (second == 1) { | 59 | 195k | Second = Add(Second, First); | 60 | 195k | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 195k | Store(First, d, out0 + x); | 64 | 195k | Store(Second, d, out1 + x); | 65 | 195k | Store(Third, d, out2 + x); | 66 | 195k | } | 67 | 195k | } | 68 | 34.8k | for (; x < w; x++) { | 69 | 14.5k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 14.5k | } else { | 81 | 14.5k | pixel_type First = in0[x]; | 82 | 14.5k | pixel_type Second = in1[x]; | 83 | 14.5k | pixel_type Third = in2[x]; | 84 | 14.5k | if (third) Third = PixelAdd(Third, First); | 85 | 14.5k | if (second == 1) { | 86 | 14.5k | Second = PixelAdd(Second, First); | 87 | 14.5k | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 14.5k | out0[x] = First; | 91 | 14.5k | out1[x] = Second; | 92 | 14.5k | out2[x] = Third; | 93 | 14.5k | } | 94 | 14.5k | } | 95 | 20.2k | } |
void jxl::N_AVX2::InvRCTRow<3>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 5.48k | pixel_type* out2, size_t w) { | 33 | 5.48k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 5.48k | "Invalid transform type"); | 35 | 5.48k | int second = transform_type >> 1; | 36 | 5.48k | int third = transform_type & 1; | 37 | | | 38 | 5.48k | size_t x = 0; | 39 | 5.48k | const HWY_FULL(pixel_type) d; | 40 | 5.48k | const size_t N = Lanes(d); | 41 | 77.2k | for (; x + N - 1 < w; x += N) { | 42 | 71.7k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 71.7k | } else { | 54 | 71.7k | auto First = Load(d, in0 + x); | 55 | 71.7k | auto Second = Load(d, in1 + x); | 56 | 71.7k | auto Third = Load(d, in2 + x); | 57 | 71.7k | if (third) Third = Add(Third, First); | 58 | 71.7k | if (second == 1) { | 59 | 71.7k | Second = Add(Second, First); | 60 | 71.7k | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 71.7k | Store(First, d, out0 + x); | 64 | 71.7k | Store(Second, d, out1 + x); | 65 | 71.7k | Store(Third, d, out2 + x); | 66 | 71.7k | } | 67 | 71.7k | } | 68 | 10.9k | for (; x < w; x++) { | 69 | 5.41k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 5.41k | } else { | 81 | 5.41k | pixel_type First = in0[x]; | 82 | 5.41k | pixel_type Second = in1[x]; | 83 | 5.41k | pixel_type Third = in2[x]; | 84 | 5.41k | if (third) Third = PixelAdd(Third, First); | 85 | 5.41k | if (second == 1) { | 86 | 5.41k | Second = PixelAdd(Second, First); | 87 | 5.41k | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 5.41k | out0[x] = First; | 91 | 5.41k | out1[x] = Second; | 92 | 5.41k | out2[x] = Third; | 93 | 5.41k | } | 94 | 5.41k | } | 95 | 5.48k | } |
void jxl::N_AVX2::InvRCTRow<4>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 2.98k | pixel_type* out2, size_t w) { | 33 | 2.98k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 2.98k | "Invalid transform type"); | 35 | 2.98k | int second = transform_type >> 1; | 36 | 2.98k | int third = transform_type & 1; | 37 | | | 38 | 2.98k | size_t x = 0; | 39 | 2.98k | const HWY_FULL(pixel_type) d; | 40 | 2.98k | const size_t N = Lanes(d); | 41 | 17.6k | for (; x + N - 1 < w; x += N) { | 42 | 14.6k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 14.6k | } else { | 54 | 14.6k | auto First = Load(d, in0 + x); | 55 | 14.6k | auto Second = Load(d, in1 + x); | 56 | 14.6k | auto Third = Load(d, in2 + x); | 57 | 14.6k | if (third) Third = Add(Third, First); | 58 | 14.6k | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 14.6k | } else if (second == 2) { | 61 | 14.6k | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 14.6k | } | 63 | 14.6k | Store(First, d, out0 + x); | 64 | 14.6k | Store(Second, d, out1 + x); | 65 | 14.6k | Store(Third, d, out2 + x); | 66 | 14.6k | } | 67 | 14.6k | } | 68 | 9.03k | for (; x < w; x++) { | 69 | 6.04k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 6.04k | } else { | 81 | 6.04k | pixel_type First = in0[x]; | 82 | 6.04k | pixel_type Second = in1[x]; | 83 | 6.04k | pixel_type Third = in2[x]; | 84 | 6.04k | if (third) Third = PixelAdd(Third, First); | 85 | 6.04k | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 6.04k | } else if (second == 2) { | 88 | 6.04k | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 6.04k | } | 90 | 6.04k | out0[x] = First; | 91 | 6.04k | out1[x] = Second; | 92 | 6.04k | out2[x] = Third; | 93 | 6.04k | } | 94 | 6.04k | } | 95 | 2.98k | } |
void jxl::N_AVX2::InvRCTRow<5>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 2.93k | pixel_type* out2, size_t w) { | 33 | 2.93k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 2.93k | "Invalid transform type"); | 35 | 2.93k | int second = transform_type >> 1; | 36 | 2.93k | int third = transform_type & 1; | 37 | | | 38 | 2.93k | size_t x = 0; | 39 | 2.93k | const HWY_FULL(pixel_type) d; | 40 | 2.93k | const size_t N = Lanes(d); | 41 | 10.5k | for (; x + N - 1 < w; x += N) { | 42 | 7.58k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 7.58k | } else { | 54 | 7.58k | auto First = Load(d, in0 + x); | 55 | 7.58k | auto Second = Load(d, in1 + x); | 56 | 7.58k | auto Third = Load(d, in2 + x); | 57 | 7.58k | if (third) Third = Add(Third, First); | 58 | 7.58k | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 7.58k | } else if (second == 2) { | 61 | 7.58k | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 7.58k | } | 63 | 7.58k | Store(First, d, out0 + x); | 64 | 7.58k | Store(Second, d, out1 + x); | 65 | 7.58k | Store(Third, d, out2 + x); | 66 | 7.58k | } | 67 | 7.58k | } | 68 | 4.17k | for (; x < w; x++) { | 69 | 1.24k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 1.24k | } else { | 81 | 1.24k | pixel_type First = in0[x]; | 82 | 1.24k | pixel_type Second = in1[x]; | 83 | 1.24k | pixel_type Third = in2[x]; | 84 | 1.24k | if (third) Third = PixelAdd(Third, First); | 85 | 1.24k | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 1.24k | } else if (second == 2) { | 88 | 1.24k | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 1.24k | } | 90 | 1.24k | out0[x] = First; | 91 | 1.24k | out1[x] = Second; | 92 | 1.24k | out2[x] = Third; | 93 | 1.24k | } | 94 | 1.24k | } | 95 | 2.93k | } |
void jxl::N_AVX2::InvRCTRow<6>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 34.7k | pixel_type* out2, size_t w) { | 33 | 34.7k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 34.7k | "Invalid transform type"); | 35 | 34.7k | int second = transform_type >> 1; | 36 | 34.7k | int third = transform_type & 1; | 37 | | | 38 | 34.7k | size_t x = 0; | 39 | 34.7k | const HWY_FULL(pixel_type) d; | 40 | 34.7k | const size_t N = Lanes(d); | 41 | 589k | for (; x + N - 1 < w; x += N) { | 42 | 554k | if (transform_type == 6) { | 43 | 554k | auto Y = Load(d, in0 + x); | 44 | 554k | auto Co = Load(d, in1 + x); | 45 | 554k | auto Cg = Load(d, in2 + x); | 46 | 554k | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 554k | auto G = Add(Cg, Y); | 48 | 554k | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 554k | auto R = Add(Y, Co); | 50 | 554k | Store(R, d, out0 + x); | 51 | 554k | Store(G, d, out1 + x); | 52 | 554k | Store(Y, d, out2 + x); | 53 | 554k | } else { | 54 | 0 | auto First = Load(d, in0 + x); | 55 | 0 | auto Second = Load(d, in1 + x); | 56 | 0 | auto Third = Load(d, in2 + x); | 57 | 0 | if (third) Third = Add(Third, First); | 58 | 0 | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 0 | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 0 | Store(First, d, out0 + x); | 64 | 0 | Store(Second, d, out1 + x); | 65 | 0 | Store(Third, d, out2 + x); | 66 | 0 | } | 67 | 554k | } | 68 | 64.6k | for (; x < w; x++) { | 69 | 29.8k | if (transform_type == 6) { | 70 | 29.8k | pixel_type Y = in0[x]; | 71 | 29.8k | pixel_type Co = in1[x]; | 72 | 29.8k | pixel_type Cg = in2[x]; | 73 | 29.8k | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 29.8k | pixel_type G = PixelAdd(Cg, tmp); | 75 | 29.8k | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 29.8k | pixel_type R = PixelAdd(B, Co); | 77 | 29.8k | out0[x] = R; | 78 | 29.8k | out1[x] = G; | 79 | 29.8k | out2[x] = B; | 80 | 29.8k | } else { | 81 | 0 | pixel_type First = in0[x]; | 82 | 0 | pixel_type Second = in1[x]; | 83 | 0 | pixel_type Third = in2[x]; | 84 | 0 | if (third) Third = PixelAdd(Third, First); | 85 | 0 | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 0 | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 0 | out0[x] = First; | 91 | 0 | out1[x] = Second; | 92 | 0 | out2[x] = Third; | 93 | 0 | } | 94 | 29.8k | } | 95 | 34.7k | } |
Unexecuted instantiation: void jxl::N_SSE2::InvRCTRow<0>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_SSE2::InvRCTRow<1>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_SSE2::InvRCTRow<2>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_SSE2::InvRCTRow<3>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_SSE2::InvRCTRow<4>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_SSE2::InvRCTRow<5>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_SSE2::InvRCTRow<6>(int const*, int const*, int const*, int*, int*, int*, unsigned long) |
96 | | |
97 | 2.97k | Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) { |
98 | 2.97k | JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, begin_c + 2)); |
99 | 2.97k | size_t m = begin_c; |
100 | 2.97k | Channel& c0 = input.channel[m + 0]; |
101 | 2.97k | size_t w = c0.w; |
102 | 2.97k | size_t h = c0.h; |
103 | 2.97k | if (rct_type == 0) { // noop |
104 | 1.15k | return true; |
105 | 1.15k | } |
106 | | // Permutation: 0=RGB, 1=GBR, 2=BRG, 3=RBG, 4=GRB, 5=BGR |
107 | 1.81k | int permutation = rct_type / 7; |
108 | 1.81k | JXL_ENSURE(permutation < 6); |
109 | | // 0-5 values have the low bit corresponding to Third and the high bits |
110 | | // corresponding to Second. 6 corresponds to YCoCg. |
111 | | // |
112 | | // Second: 0=nop, 1=SubtractFirst, 2=SubtractAvgFirstThird |
113 | | // |
114 | | // Third: 0=nop, 1=SubtractFirst |
115 | 1.81k | int custom = rct_type % 7; |
116 | | // Special case: permute-only. Swap channels around. |
117 | 1.81k | if (custom == 0) { |
118 | 26 | Channel ch0 = std::move(input.channel[m]); |
119 | 26 | Channel ch1 = std::move(input.channel[m + 1]); |
120 | 26 | Channel ch2 = std::move(input.channel[m + 2]); |
121 | 26 | input.channel[m + (permutation % 3)] = std::move(ch0); |
122 | 26 | input.channel[m + ((permutation + 1 + permutation / 3) % 3)] = |
123 | 26 | std::move(ch1); |
124 | 26 | input.channel[m + ((permutation + 2 - permutation / 3) % 3)] = |
125 | 26 | std::move(ch2); |
126 | 26 | return true; |
127 | 26 | } |
128 | 1.78k | constexpr decltype(&InvRCTRow<0>) inv_rct_row[] = { |
129 | 1.78k | InvRCTRow<0>, InvRCTRow<1>, InvRCTRow<2>, InvRCTRow<3>, |
130 | 1.78k | InvRCTRow<4>, InvRCTRow<5>, InvRCTRow<6>}; |
131 | 1.78k | const auto process_row = [&](const uint32_t task, |
132 | 68.7k | size_t /* thread */) -> Status { |
133 | 68.7k | const size_t y = task; |
134 | 68.7k | const pixel_type* in0 = input.channel[m].Row(y); |
135 | 68.7k | const pixel_type* in1 = input.channel[m + 1].Row(y); |
136 | 68.7k | const pixel_type* in2 = input.channel[m + 2].Row(y); |
137 | 68.7k | pixel_type* out0 = input.channel[m + (permutation % 3)].Row(y); |
138 | 68.7k | pixel_type* out1 = |
139 | 68.7k | input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y); |
140 | 68.7k | pixel_type* out2 = |
141 | 68.7k | input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y); |
142 | 68.7k | inv_rct_row[custom](in0, in1, in2, out0, out1, out2, w); |
143 | 68.7k | return true; |
144 | 68.7k | }; Unexecuted instantiation: rct.cc:jxl::N_SSE4::InvRCT(jxl::Image&, unsigned long, unsigned long, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const rct.cc:jxl::N_AVX2::InvRCT(jxl::Image&, unsigned long, unsigned long, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const Line | Count | Source | 132 | 68.7k | size_t /* thread */) -> Status { | 133 | 68.7k | const size_t y = task; | 134 | 68.7k | const pixel_type* in0 = input.channel[m].Row(y); | 135 | 68.7k | const pixel_type* in1 = input.channel[m + 1].Row(y); | 136 | 68.7k | const pixel_type* in2 = input.channel[m + 2].Row(y); | 137 | 68.7k | pixel_type* out0 = input.channel[m + (permutation % 3)].Row(y); | 138 | 68.7k | pixel_type* out1 = | 139 | 68.7k | input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y); | 140 | 68.7k | pixel_type* out2 = | 141 | 68.7k | input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y); | 142 | 68.7k | inv_rct_row[custom](in0, in1, in2, out0, out1, out2, w); | 143 | 68.7k | return true; | 144 | 68.7k | }; |
Unexecuted instantiation: rct.cc:jxl::N_SSE2::InvRCT(jxl::Image&, unsigned long, unsigned long, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const |
145 | 1.78k | JXL_RETURN_IF_ERROR( |
146 | 1.78k | RunOnPool(pool, 0, h, ThreadPool::NoInit, process_row, "InvRCT")); |
147 | 1.78k | return true; |
148 | 1.78k | } Unexecuted instantiation: jxl::N_SSE4::InvRCT(jxl::Image&, unsigned long, unsigned long, jxl::ThreadPool*) jxl::N_AVX2::InvRCT(jxl::Image&, unsigned long, unsigned long, jxl::ThreadPool*) Line | Count | Source | 97 | 2.97k | Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) { | 98 | 2.97k | JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, begin_c + 2)); | 99 | 2.97k | size_t m = begin_c; | 100 | 2.97k | Channel& c0 = input.channel[m + 0]; | 101 | 2.97k | size_t w = c0.w; | 102 | 2.97k | size_t h = c0.h; | 103 | 2.97k | if (rct_type == 0) { // noop | 104 | 1.15k | return true; | 105 | 1.15k | } | 106 | | // Permutation: 0=RGB, 1=GBR, 2=BRG, 3=RBG, 4=GRB, 5=BGR | 107 | 1.81k | int permutation = rct_type / 7; | 108 | 1.81k | JXL_ENSURE(permutation < 6); | 109 | | // 0-5 values have the low bit corresponding to Third and the high bits | 110 | | // corresponding to Second. 6 corresponds to YCoCg. | 111 | | // | 112 | | // Second: 0=nop, 1=SubtractFirst, 2=SubtractAvgFirstThird | 113 | | // | 114 | | // Third: 0=nop, 1=SubtractFirst | 115 | 1.81k | int custom = rct_type % 7; | 116 | | // Special case: permute-only. Swap channels around. | 117 | 1.81k | if (custom == 0) { | 118 | 26 | Channel ch0 = std::move(input.channel[m]); | 119 | 26 | Channel ch1 = std::move(input.channel[m + 1]); | 120 | 26 | Channel ch2 = std::move(input.channel[m + 2]); | 121 | 26 | input.channel[m + (permutation % 3)] = std::move(ch0); | 122 | 26 | input.channel[m + ((permutation + 1 + permutation / 3) % 3)] = | 123 | 26 | std::move(ch1); | 124 | 26 | input.channel[m + ((permutation + 2 - permutation / 3) % 3)] = | 125 | 26 | std::move(ch2); | 126 | 26 | return true; | 127 | 26 | } | 128 | 1.78k | constexpr decltype(&InvRCTRow<0>) inv_rct_row[] = { | 129 | 1.78k | InvRCTRow<0>, InvRCTRow<1>, InvRCTRow<2>, InvRCTRow<3>, | 130 | 1.78k | InvRCTRow<4>, InvRCTRow<5>, InvRCTRow<6>}; | 131 | 1.78k | const auto process_row = [&](const uint32_t task, | 132 | 1.78k | size_t /* thread */) -> Status { | 133 | 1.78k | const size_t y = task; | 134 | 1.78k | const pixel_type* in0 = input.channel[m].Row(y); | 135 | 1.78k | const pixel_type* in1 = input.channel[m + 1].Row(y); | 136 | 1.78k | const pixel_type* in2 = input.channel[m + 2].Row(y); | 137 | 1.78k | pixel_type* out0 = input.channel[m + (permutation % 3)].Row(y); | 138 | 1.78k | pixel_type* out1 = | 139 | 1.78k | input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y); | 140 | 1.78k | pixel_type* out2 = | 141 | 1.78k | input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y); | 142 | 1.78k | inv_rct_row[custom](in0, in1, in2, out0, out1, out2, w); | 143 | 1.78k | return true; | 144 | 1.78k | }; | 145 | 1.78k | JXL_RETURN_IF_ERROR( | 146 | 1.78k | RunOnPool(pool, 0, h, ThreadPool::NoInit, process_row, "InvRCT")); | 147 | 1.78k | return true; | 148 | 1.78k | } |
Unexecuted instantiation: jxl::N_SSE2::InvRCT(jxl::Image&, unsigned long, unsigned long, jxl::ThreadPool*) |
149 | | |
150 | | } // namespace HWY_NAMESPACE |
151 | | } // namespace jxl |
152 | | HWY_AFTER_NAMESPACE(); |
153 | | |
154 | | #if HWY_ONCE |
155 | | namespace jxl { |
156 | | |
157 | | HWY_EXPORT(InvRCT); |
158 | 2.97k | Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) { |
159 | 2.97k | return HWY_DYNAMIC_DISPATCH(InvRCT)(input, begin_c, rct_type, pool); |
160 | 2.97k | } |
161 | | |
162 | | } // namespace jxl |
163 | | #endif |