/src/libjxl/lib/jxl/modular/transform/rct.cc
Line | Count | Source |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/modular/transform/rct.h" |
7 | | |
8 | | #include <cstddef> |
9 | | #include <cstdint> |
10 | | #include <utility> |
11 | | |
12 | | #include "lib/jxl/base/data_parallel.h" |
13 | | #include "lib/jxl/base/status.h" |
14 | | #include "lib/jxl/modular/modular_image.h" |
15 | | #include "lib/jxl/modular/transform/transform.h" |
16 | | #undef HWY_TARGET_INCLUDE |
17 | | #define HWY_TARGET_INCLUDE "lib/jxl/modular/transform/rct.cc" |
18 | | #include <hwy/foreach_target.h> |
19 | | #include <hwy/highway.h> |
20 | | HWY_BEFORE_NAMESPACE(); |
21 | | namespace jxl { |
22 | | namespace HWY_NAMESPACE { |
23 | | |
24 | | // These templates are not found via ADL. |
25 | | using hwy::HWY_NAMESPACE::Add; |
26 | | using hwy::HWY_NAMESPACE::ShiftRight; |
27 | | using hwy::HWY_NAMESPACE::Sub; |
28 | | |
29 | | template <int transform_type> |
30 | | void InvRCTRow(const pixel_type* in0, const pixel_type* in1, |
31 | | const pixel_type* in2, pixel_type* out0, pixel_type* out1, |
32 | 46.7k | pixel_type* out2, size_t w) { |
33 | 46.7k | static_assert(transform_type >= 0 && transform_type < 7, |
34 | 46.7k | "Invalid transform type"); |
35 | 46.7k | int second = transform_type >> 1; |
36 | 46.7k | int third = transform_type & 1; |
37 | | |
38 | 46.7k | size_t x = 0; |
39 | 46.7k | const HWY_FULL(pixel_type) d; |
40 | 46.7k | const size_t N = Lanes(d); |
41 | 485k | for (; x + N - 1 < w; x += N) { |
42 | 439k | if (transform_type == 6) { |
43 | 125k | auto Y = Load(d, in0 + x); |
44 | 125k | auto Co = Load(d, in1 + x); |
45 | 125k | auto Cg = Load(d, in2 + x); |
46 | 125k | Y = Sub(Y, ShiftRight<1>(Cg)); |
47 | 125k | auto G = Add(Cg, Y); |
48 | 125k | Y = Sub(Y, ShiftRight<1>(Co)); |
49 | 125k | auto R = Add(Y, Co); |
50 | 125k | Store(R, d, out0 + x); |
51 | 125k | Store(G, d, out1 + x); |
52 | 125k | Store(Y, d, out2 + x); |
53 | 313k | } else { |
54 | 313k | auto First = Load(d, in0 + x); |
55 | 313k | auto Second = Load(d, in1 + x); |
56 | 313k | auto Third = Load(d, in2 + x); |
57 | 313k | if (third) Third = Add(Third, First); |
58 | 313k | if (second == 1) { |
59 | 145k | Second = Add(Second, First); |
60 | 168k | } else if (second == 2) { |
61 | 48.6k | Second = Add(Second, ShiftRight<1>(Add(First, Third))); |
62 | 48.6k | } |
63 | 313k | Store(First, d, out0 + x); |
64 | 313k | Store(Second, d, out1 + x); |
65 | 313k | Store(Third, d, out2 + x); |
66 | 313k | } |
67 | 439k | } |
68 | 104k | for (; x < w; x++) { |
69 | 57.7k | if (transform_type == 6) { |
70 | 9.98k | pixel_type Y = in0[x]; |
71 | 9.98k | pixel_type Co = in1[x]; |
72 | 9.98k | pixel_type Cg = in2[x]; |
73 | 9.98k | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); |
74 | 9.98k | pixel_type G = PixelAdd(Cg, tmp); |
75 | 9.98k | pixel_type B = PixelAdd(tmp, -(Co >> 1)); |
76 | 9.98k | pixel_type R = PixelAdd(B, Co); |
77 | 9.98k | out0[x] = R; |
78 | 9.98k | out1[x] = G; |
79 | 9.98k | out2[x] = B; |
80 | 47.7k | } else { |
81 | 47.7k | pixel_type First = in0[x]; |
82 | 47.7k | pixel_type Second = in1[x]; |
83 | 47.7k | pixel_type Third = in2[x]; |
84 | 47.7k | if (third) Third = PixelAdd(Third, First); |
85 | 47.7k | if (second == 1) { |
86 | 29.3k | Second = PixelAdd(Second, First); |
87 | 29.3k | } else if (second == 2) { |
88 | 8.11k | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); |
89 | 8.11k | } |
90 | 47.7k | out0[x] = First; |
91 | 47.7k | out1[x] = Second; |
92 | 47.7k | out2[x] = Third; |
93 | 47.7k | } |
94 | 57.7k | } |
95 | 46.7k | } Unexecuted instantiation: void jxl::N_SSE4::InvRCTRow<0>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_SSE4::InvRCTRow<1>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_SSE4::InvRCTRow<2>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_SSE4::InvRCTRow<3>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_SSE4::InvRCTRow<4>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_SSE4::InvRCTRow<5>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_SSE4::InvRCTRow<6>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_AVX2::InvRCTRow<0>(int const*, int const*, int const*, int*, int*, int*, unsigned long) void jxl::N_AVX2::InvRCTRow<1>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 3.18k | pixel_type* out2, size_t w) { | 33 | 3.18k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 3.18k | "Invalid transform type"); | 35 | 3.18k | int second = transform_type >> 1; | 36 | 3.18k | int third = transform_type & 1; | 37 | | | 38 | 3.18k | size_t x = 0; | 39 | 3.18k | const HWY_FULL(pixel_type) d; | 40 | 3.18k | const size_t N = Lanes(d); | 41 | 122k | for (; x + N - 1 < w; x += N) { | 42 | 119k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 119k | } else { | 54 | 119k | auto First = Load(d, in0 + x); | 55 | 119k | auto Second = Load(d, in1 + x); | 56 | 119k | auto Third = Load(d, in2 + x); | 57 | 119k | if (third) Third = Add(Third, First); | 58 | 119k | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 119k | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 119k | Store(First, d, out0 + x); | 64 | 119k | Store(Second, d, out1 + x); | 65 | 119k | Store(Third, d, out2 + x); | 66 | 119k | } | 67 | 119k | } | 68 | 13.4k | for (; x < w; x++) { | 69 | 10.2k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 10.2k | } else { | 81 | 10.2k | pixel_type First = in0[x]; | 82 | 10.2k | pixel_type Second = in1[x]; | 83 | 10.2k | pixel_type Third = in2[x]; | 84 | 10.2k | if (third) Third = PixelAdd(Third, First); | 85 | 10.2k | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 10.2k | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 10.2k | out0[x] = First; | 91 | 10.2k | out1[x] = Second; | 92 | 10.2k | out2[x] = Third; | 93 | 10.2k | } | 94 | 10.2k | } | 95 | 3.18k | } |
void jxl::N_AVX2::InvRCTRow<2>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 5.91k | pixel_type* out2, size_t w) { | 33 | 5.91k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 5.91k | "Invalid transform type"); | 35 | 5.91k | int second = transform_type >> 1; | 36 | 5.91k | int third = transform_type & 1; | 37 | | | 38 | 5.91k | size_t x = 0; | 39 | 5.91k | const HWY_FULL(pixel_type) d; | 40 | 5.91k | const size_t N = Lanes(d); | 41 | 41.5k | for (; x + N - 1 < w; x += N) { | 42 | 35.6k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 35.6k | } else { | 54 | 35.6k | auto First = Load(d, in0 + x); | 55 | 35.6k | auto Second = Load(d, in1 + x); | 56 | 35.6k | auto Third = Load(d, in2 + x); | 57 | 35.6k | if (third) Third = Add(Third, First); | 58 | 35.6k | if (second == 1) { | 59 | 35.6k | Second = Add(Second, First); | 60 | 35.6k | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 35.6k | Store(First, d, out0 + x); | 64 | 35.6k | Store(Second, d, out1 + x); | 65 | 35.6k | Store(Third, d, out2 + x); | 66 | 35.6k | } | 67 | 35.6k | } | 68 | 27.9k | for (; x < w; x++) { | 69 | 22.0k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 22.0k | } else { | 81 | 22.0k | pixel_type First = in0[x]; | 82 | 22.0k | pixel_type Second = in1[x]; | 83 | 22.0k | pixel_type Third = in2[x]; | 84 | 22.0k | if (third) Third = PixelAdd(Third, First); | 85 | 22.0k | if (second == 1) { | 86 | 22.0k | Second = PixelAdd(Second, First); | 87 | 22.0k | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 22.0k | out0[x] = First; | 91 | 22.0k | out1[x] = Second; | 92 | 22.0k | out2[x] = Third; | 93 | 22.0k | } | 94 | 22.0k | } | 95 | 5.91k | } |
void jxl::N_AVX2::InvRCTRow<3>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 7.43k | pixel_type* out2, size_t w) { | 33 | 7.43k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 7.43k | "Invalid transform type"); | 35 | 7.43k | int second = transform_type >> 1; | 36 | 7.43k | int third = transform_type & 1; | 37 | | | 38 | 7.43k | size_t x = 0; | 39 | 7.43k | const HWY_FULL(pixel_type) d; | 40 | 7.43k | const size_t N = Lanes(d); | 41 | 117k | for (; x + N - 1 < w; x += N) { | 42 | 109k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 109k | } else { | 54 | 109k | auto First = Load(d, in0 + x); | 55 | 109k | auto Second = Load(d, in1 + x); | 56 | 109k | auto Third = Load(d, in2 + x); | 57 | 109k | if (third) Third = Add(Third, First); | 58 | 109k | if (second == 1) { | 59 | 109k | Second = Add(Second, First); | 60 | 109k | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 109k | Store(First, d, out0 + x); | 64 | 109k | Store(Second, d, out1 + x); | 65 | 109k | Store(Third, d, out2 + x); | 66 | 109k | } | 67 | 109k | } | 68 | 14.7k | for (; x < w; x++) { | 69 | 7.32k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 7.32k | } else { | 81 | 7.32k | pixel_type First = in0[x]; | 82 | 7.32k | pixel_type Second = in1[x]; | 83 | 7.32k | pixel_type Third = in2[x]; | 84 | 7.32k | if (third) Third = PixelAdd(Third, First); | 85 | 7.32k | if (second == 1) { | 86 | 7.32k | Second = PixelAdd(Second, First); | 87 | 7.32k | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 7.32k | out0[x] = First; | 91 | 7.32k | out1[x] = Second; | 92 | 7.32k | out2[x] = Third; | 93 | 7.32k | } | 94 | 7.32k | } | 95 | 7.43k | } |
void jxl::N_AVX2::InvRCTRow<4>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 2.73k | pixel_type* out2, size_t w) { | 33 | 2.73k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 2.73k | "Invalid transform type"); | 35 | 2.73k | int second = transform_type >> 1; | 36 | 2.73k | int third = transform_type & 1; | 37 | | | 38 | 2.73k | size_t x = 0; | 39 | 2.73k | const HWY_FULL(pixel_type) d; | 40 | 2.73k | const size_t N = Lanes(d); | 41 | 29.0k | for (; x + N - 1 < w; x += N) { | 42 | 26.2k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 26.2k | } else { | 54 | 26.2k | auto First = Load(d, in0 + x); | 55 | 26.2k | auto Second = Load(d, in1 + x); | 56 | 26.2k | auto Third = Load(d, in2 + x); | 57 | 26.2k | if (third) Third = Add(Third, First); | 58 | 26.2k | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 26.2k | } else if (second == 2) { | 61 | 26.2k | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 26.2k | } | 63 | 26.2k | Store(First, d, out0 + x); | 64 | 26.2k | Store(Second, d, out1 + x); | 65 | 26.2k | Store(Third, d, out2 + x); | 66 | 26.2k | } | 67 | 26.2k | } | 68 | 6.39k | for (; x < w; x++) { | 69 | 3.65k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 3.65k | } else { | 81 | 3.65k | pixel_type First = in0[x]; | 82 | 3.65k | pixel_type Second = in1[x]; | 83 | 3.65k | pixel_type Third = in2[x]; | 84 | 3.65k | if (third) Third = PixelAdd(Third, First); | 85 | 3.65k | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 3.65k | } else if (second == 2) { | 88 | 3.65k | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 3.65k | } | 90 | 3.65k | out0[x] = First; | 91 | 3.65k | out1[x] = Second; | 92 | 3.65k | out2[x] = Third; | 93 | 3.65k | } | 94 | 3.65k | } | 95 | 2.73k | } |
void jxl::N_AVX2::InvRCTRow<5>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 16.3k | pixel_type* out2, size_t w) { | 33 | 16.3k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 16.3k | "Invalid transform type"); | 35 | 16.3k | int second = transform_type >> 1; | 36 | 16.3k | int third = transform_type & 1; | 37 | | | 38 | 16.3k | size_t x = 0; | 39 | 16.3k | const HWY_FULL(pixel_type) d; | 40 | 16.3k | const size_t N = Lanes(d); | 41 | 38.7k | for (; x + N - 1 < w; x += N) { | 42 | 22.3k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 22.3k | } else { | 54 | 22.3k | auto First = Load(d, in0 + x); | 55 | 22.3k | auto Second = Load(d, in1 + x); | 56 | 22.3k | auto Third = Load(d, in2 + x); | 57 | 22.3k | if (third) Third = Add(Third, First); | 58 | 22.3k | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 22.3k | } else if (second == 2) { | 61 | 22.3k | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 22.3k | } | 63 | 22.3k | Store(First, d, out0 + x); | 64 | 22.3k | Store(Second, d, out1 + x); | 65 | 22.3k | Store(Third, d, out2 + x); | 66 | 22.3k | } | 67 | 22.3k | } | 68 | 20.7k | for (; x < w; x++) { | 69 | 4.45k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 4.45k | } else { | 81 | 4.45k | pixel_type First = in0[x]; | 82 | 4.45k | pixel_type Second = in1[x]; | 83 | 4.45k | pixel_type Third = in2[x]; | 84 | 4.45k | if (third) Third = PixelAdd(Third, First); | 85 | 4.45k | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 4.45k | } else if (second == 2) { | 88 | 4.45k | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 4.45k | } | 90 | 4.45k | out0[x] = First; | 91 | 4.45k | out1[x] = Second; | 92 | 4.45k | out2[x] = Third; | 93 | 4.45k | } | 94 | 4.45k | } | 95 | 16.3k | } |
void jxl::N_AVX2::InvRCTRow<6>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 11.1k | pixel_type* out2, size_t w) { | 33 | 11.1k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 11.1k | "Invalid transform type"); | 35 | 11.1k | int second = transform_type >> 1; | 36 | 11.1k | int third = transform_type & 1; | 37 | | | 38 | 11.1k | size_t x = 0; | 39 | 11.1k | const HWY_FULL(pixel_type) d; | 40 | 11.1k | const size_t N = Lanes(d); | 41 | 136k | for (; x + N - 1 < w; x += N) { | 42 | 125k | if (transform_type == 6) { | 43 | 125k | auto Y = Load(d, in0 + x); | 44 | 125k | auto Co = Load(d, in1 + x); | 45 | 125k | auto Cg = Load(d, in2 + x); | 46 | 125k | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 125k | auto G = Add(Cg, Y); | 48 | 125k | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 125k | auto R = Add(Y, Co); | 50 | 125k | Store(R, d, out0 + x); | 51 | 125k | Store(G, d, out1 + x); | 52 | 125k | Store(Y, d, out2 + x); | 53 | 125k | } else { | 54 | 0 | auto First = Load(d, in0 + x); | 55 | 0 | auto Second = Load(d, in1 + x); | 56 | 0 | auto Third = Load(d, in2 + x); | 57 | 0 | if (third) Third = Add(Third, First); | 58 | 0 | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 0 | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 0 | Store(First, d, out0 + x); | 64 | 0 | Store(Second, d, out1 + x); | 65 | 0 | Store(Third, d, out2 + x); | 66 | 0 | } | 67 | 125k | } | 68 | 21.1k | for (; x < w; x++) { | 69 | 9.98k | if (transform_type == 6) { | 70 | 9.98k | pixel_type Y = in0[x]; | 71 | 9.98k | pixel_type Co = in1[x]; | 72 | 9.98k | pixel_type Cg = in2[x]; | 73 | 9.98k | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 9.98k | pixel_type G = PixelAdd(Cg, tmp); | 75 | 9.98k | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 9.98k | pixel_type R = PixelAdd(B, Co); | 77 | 9.98k | out0[x] = R; | 78 | 9.98k | out1[x] = G; | 79 | 9.98k | out2[x] = B; | 80 | 9.98k | } else { | 81 | 0 | pixel_type First = in0[x]; | 82 | 0 | pixel_type Second = in1[x]; | 83 | 0 | pixel_type Third = in2[x]; | 84 | 0 | if (third) Third = PixelAdd(Third, First); | 85 | 0 | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 0 | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 0 | out0[x] = First; | 91 | 0 | out1[x] = Second; | 92 | 0 | out2[x] = Third; | 93 | 0 | } | 94 | 9.98k | } | 95 | 11.1k | } |
Unexecuted instantiation: void jxl::N_SSE2::InvRCTRow<0>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_SSE2::InvRCTRow<1>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_SSE2::InvRCTRow<2>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_SSE2::InvRCTRow<3>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_SSE2::InvRCTRow<4>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_SSE2::InvRCTRow<5>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Unexecuted instantiation: void jxl::N_SSE2::InvRCTRow<6>(int const*, int const*, int const*, int*, int*, int*, unsigned long) |
96 | | |
97 | 1.78k | Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) { |
98 | 1.78k | JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, begin_c + 2)); |
99 | 1.78k | size_t m = begin_c; |
100 | 1.78k | Channel& c0 = input.channel[m + 0]; |
101 | 1.78k | size_t w = c0.w; |
102 | 1.78k | size_t h = c0.h; |
103 | 1.78k | if (rct_type == 0) { // noop |
104 | 249 | return true; |
105 | 249 | } |
106 | | // Permutation: 0=RGB, 1=GBR, 2=BRG, 3=RBG, 4=GRB, 5=BGR |
107 | 1.53k | int permutation = rct_type / 7; |
108 | 1.53k | JXL_ENSURE(permutation < 6); |
109 | | // 0-5 values have the low bit corresponding to Third and the high bits |
110 | | // corresponding to Second. 6 corresponds to YCoCg. |
111 | | // |
112 | | // Second: 0=nop, 1=SubtractFirst, 2=SubtractAvgFirstThird |
113 | | // |
114 | | // Third: 0=nop, 1=SubtractFirst |
115 | 1.53k | int custom = rct_type % 7; |
116 | | // Special case: permute-only. Swap channels around. |
117 | 1.53k | if (custom == 0) { |
118 | 4 | Channel ch0 = std::move(input.channel[m]); |
119 | 4 | Channel ch1 = std::move(input.channel[m + 1]); |
120 | 4 | Channel ch2 = std::move(input.channel[m + 2]); |
121 | 4 | input.channel[m + (permutation % 3)] = std::move(ch0); |
122 | 4 | input.channel[m + ((permutation + 1 + permutation / 3) % 3)] = |
123 | 4 | std::move(ch1); |
124 | 4 | input.channel[m + ((permutation + 2 - permutation / 3) % 3)] = |
125 | 4 | std::move(ch2); |
126 | 4 | return true; |
127 | 4 | } |
128 | 1.53k | constexpr decltype(&InvRCTRow<0>) inv_rct_row[] = { |
129 | 1.53k | InvRCTRow<0>, InvRCTRow<1>, InvRCTRow<2>, InvRCTRow<3>, |
130 | 1.53k | InvRCTRow<4>, InvRCTRow<5>, InvRCTRow<6>}; |
131 | 1.53k | const auto process_row = [&](const uint32_t task, |
132 | 46.7k | size_t /* thread */) -> Status { |
133 | 46.7k | const size_t y = task; |
134 | 46.7k | const pixel_type* in0 = input.channel[m].Row(y); |
135 | 46.7k | const pixel_type* in1 = input.channel[m + 1].Row(y); |
136 | 46.7k | const pixel_type* in2 = input.channel[m + 2].Row(y); |
137 | 46.7k | pixel_type* out0 = input.channel[m + (permutation % 3)].Row(y); |
138 | 46.7k | pixel_type* out1 = |
139 | 46.7k | input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y); |
140 | 46.7k | pixel_type* out2 = |
141 | 46.7k | input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y); |
142 | 46.7k | inv_rct_row[custom](in0, in1, in2, out0, out1, out2, w); |
143 | 46.7k | return true; |
144 | 46.7k | }; Unexecuted instantiation: rct.cc:jxl::N_SSE4::InvRCT(jxl::Image&, unsigned long, unsigned long, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const rct.cc:jxl::N_AVX2::InvRCT(jxl::Image&, unsigned long, unsigned long, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const Line | Count | Source | 132 | 46.7k | size_t /* thread */) -> Status { | 133 | 46.7k | const size_t y = task; | 134 | 46.7k | const pixel_type* in0 = input.channel[m].Row(y); | 135 | 46.7k | const pixel_type* in1 = input.channel[m + 1].Row(y); | 136 | 46.7k | const pixel_type* in2 = input.channel[m + 2].Row(y); | 137 | 46.7k | pixel_type* out0 = input.channel[m + (permutation % 3)].Row(y); | 138 | 46.7k | pixel_type* out1 = | 139 | 46.7k | input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y); | 140 | 46.7k | pixel_type* out2 = | 141 | 46.7k | input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y); | 142 | 46.7k | inv_rct_row[custom](in0, in1, in2, out0, out1, out2, w); | 143 | 46.7k | return true; | 144 | 46.7k | }; |
Unexecuted instantiation: rct.cc:jxl::N_SSE2::InvRCT(jxl::Image&, unsigned long, unsigned long, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const |
145 | 1.53k | JXL_RETURN_IF_ERROR( |
146 | 1.53k | RunOnPool(pool, 0, h, ThreadPool::NoInit, process_row, "InvRCT")); |
147 | 1.53k | return true; |
148 | 1.53k | } Unexecuted instantiation: jxl::N_SSE4::InvRCT(jxl::Image&, unsigned long, unsigned long, jxl::ThreadPool*) jxl::N_AVX2::InvRCT(jxl::Image&, unsigned long, unsigned long, jxl::ThreadPool*) Line | Count | Source | 97 | 1.78k | Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) { | 98 | 1.78k | JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, begin_c + 2)); | 99 | 1.78k | size_t m = begin_c; | 100 | 1.78k | Channel& c0 = input.channel[m + 0]; | 101 | 1.78k | size_t w = c0.w; | 102 | 1.78k | size_t h = c0.h; | 103 | 1.78k | if (rct_type == 0) { // noop | 104 | 249 | return true; | 105 | 249 | } | 106 | | // Permutation: 0=RGB, 1=GBR, 2=BRG, 3=RBG, 4=GRB, 5=BGR | 107 | 1.53k | int permutation = rct_type / 7; | 108 | 1.53k | JXL_ENSURE(permutation < 6); | 109 | | // 0-5 values have the low bit corresponding to Third and the high bits | 110 | | // corresponding to Second. 6 corresponds to YCoCg. | 111 | | // | 112 | | // Second: 0=nop, 1=SubtractFirst, 2=SubtractAvgFirstThird | 113 | | // | 114 | | // Third: 0=nop, 1=SubtractFirst | 115 | 1.53k | int custom = rct_type % 7; | 116 | | // Special case: permute-only. Swap channels around. | 117 | 1.53k | if (custom == 0) { | 118 | 4 | Channel ch0 = std::move(input.channel[m]); | 119 | 4 | Channel ch1 = std::move(input.channel[m + 1]); | 120 | 4 | Channel ch2 = std::move(input.channel[m + 2]); | 121 | 4 | input.channel[m + (permutation % 3)] = std::move(ch0); | 122 | 4 | input.channel[m + ((permutation + 1 + permutation / 3) % 3)] = | 123 | 4 | std::move(ch1); | 124 | 4 | input.channel[m + ((permutation + 2 - permutation / 3) % 3)] = | 125 | 4 | std::move(ch2); | 126 | 4 | return true; | 127 | 4 | } | 128 | 1.53k | constexpr decltype(&InvRCTRow<0>) inv_rct_row[] = { | 129 | 1.53k | InvRCTRow<0>, InvRCTRow<1>, InvRCTRow<2>, InvRCTRow<3>, | 130 | 1.53k | InvRCTRow<4>, InvRCTRow<5>, InvRCTRow<6>}; | 131 | 1.53k | const auto process_row = [&](const uint32_t task, | 132 | 1.53k | size_t /* thread */) -> Status { | 133 | 1.53k | const size_t y = task; | 134 | 1.53k | const pixel_type* in0 = input.channel[m].Row(y); | 135 | 1.53k | const pixel_type* in1 = input.channel[m + 1].Row(y); | 136 | 1.53k | const pixel_type* in2 = input.channel[m + 2].Row(y); | 137 | 1.53k | pixel_type* out0 = input.channel[m + (permutation % 3)].Row(y); | 138 | 1.53k | pixel_type* out1 = | 139 | 1.53k | input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y); | 140 | 1.53k | pixel_type* out2 = | 141 | 1.53k | input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y); | 142 | 1.53k | inv_rct_row[custom](in0, in1, in2, out0, out1, out2, w); | 143 | 1.53k | return true; | 144 | 1.53k | }; | 145 | 1.53k | JXL_RETURN_IF_ERROR( | 146 | 1.53k | RunOnPool(pool, 0, h, ThreadPool::NoInit, process_row, "InvRCT")); | 147 | 1.53k | return true; | 148 | 1.53k | } |
Unexecuted instantiation: jxl::N_SSE2::InvRCT(jxl::Image&, unsigned long, unsigned long, jxl::ThreadPool*) |
149 | | |
150 | | } // namespace HWY_NAMESPACE |
151 | | } // namespace jxl |
152 | | HWY_AFTER_NAMESPACE(); |
153 | | |
154 | | #if HWY_ONCE |
155 | | namespace jxl { |
156 | | |
157 | | HWY_EXPORT(InvRCT); |
158 | 1.78k | Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) { |
159 | 1.78k | return HWY_DYNAMIC_DISPATCH(InvRCT)(input, begin_c, rct_type, pool); |
160 | 1.78k | } |
161 | | |
162 | | } // namespace jxl |
163 | | #endif |