95 | 1.83M | } Unexecuted instantiation: void jxl::N_SSE4::InvRCTRow<0>(int const*, int const*, int const*, int*, int*, int*, unsigned long) void jxl::N_SSE4::InvRCTRow<1>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 3.51k | pixel_type* out2, size_t w) { | 33 | 3.51k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 3.51k | "Invalid transform type"); | 35 | 3.51k | int second = transform_type >> 1; | 36 | 3.51k | int third = transform_type & 1; | 37 | | | 38 | 3.51k | size_t x = 0; | 39 | 3.51k | const HWY_FULL(pixel_type) d; | 40 | 3.51k | const size_t N = Lanes(d); | 41 | 228k | for (; x + N - 1 < w; x += N) { | 42 | 225k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 225k | } else { | 54 | 225k | auto First = Load(d, in0 + x); | 55 | 225k | auto Second = Load(d, in1 + x); | 56 | 225k | auto Third = Load(d, in2 + x); | 57 | 225k | if (third) Third = Add(Third, First); | 58 | 225k | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 225k | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 225k | Store(First, d, out0 + x); | 64 | 225k | Store(Second, d, out1 + x); | 65 | 225k | Store(Third, d, out2 + x); | 66 | 225k | } | 67 | 225k | } | 68 | 8.12k | for (; x < w; x++) { | 69 | 4.60k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 4.60k | } else { | 81 | 4.60k | pixel_type First = in0[x]; | 82 | 4.60k | pixel_type Second = in1[x]; | 83 | 4.60k | pixel_type Third = in2[x]; | 84 | 4.60k | if (third) Third = PixelAdd(Third, First); | 85 | 4.60k | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 4.60k | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 4.60k | out0[x] = First; | 91 | 4.60k | out1[x] = Second; | 92 | 4.60k | out2[x] = Third; | 93 | 4.60k | } | 94 | 4.60k | } | 95 | 3.51k | } |
void jxl::N_SSE4::InvRCTRow<2>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 3.88k | pixel_type* out2, size_t w) { | 33 | 3.88k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 3.88k | "Invalid transform type"); | 35 | 3.88k | int second = transform_type >> 1; | 36 | 3.88k | int third = transform_type & 1; | 37 | | | 38 | 3.88k | size_t x = 0; | 39 | 3.88k | const HWY_FULL(pixel_type) d; | 40 | 3.88k | const size_t N = Lanes(d); | 41 | 85.9k | for (; x + N - 1 < w; x += N) { | 42 | 82.0k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 82.0k | } else { | 54 | 82.0k | auto First = Load(d, in0 + x); | 55 | 82.0k | auto Second = Load(d, in1 + x); | 56 | 82.0k | auto Third = Load(d, in2 + x); | 57 | 82.0k | if (third) Third = Add(Third, First); | 58 | 82.0k | if (second == 1) { | 59 | 82.0k | Second = Add(Second, First); | 60 | 82.0k | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 82.0k | Store(First, d, out0 + x); | 64 | 82.0k | Store(Second, d, out1 + x); | 65 | 82.0k | Store(Third, d, out2 + x); | 66 | 82.0k | } | 67 | 82.0k | } | 68 | 6.00k | for (; x < w; x++) { | 69 | 2.11k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 2.11k | } else { | 81 | 2.11k | pixel_type First = in0[x]; | 82 | 2.11k | pixel_type Second = in1[x]; | 83 | 2.11k | pixel_type Third = in2[x]; | 84 | 2.11k | if (third) Third = PixelAdd(Third, First); | 85 | 2.11k | if (second == 1) { | 86 | 2.11k | Second = PixelAdd(Second, First); | 87 | 2.11k | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 2.11k | out0[x] = First; | 91 | 2.11k | out1[x] = Second; | 92 | 2.11k | out2[x] = Third; | 93 | 2.11k | } | 94 | 2.11k | } | 95 | 3.88k | } |
void jxl::N_SSE4::InvRCTRow<3>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 3.87k | pixel_type* out2, size_t w) { | 33 | 3.87k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 3.87k | "Invalid transform type"); | 35 | 3.87k | int second = transform_type >> 1; | 36 | 3.87k | int third = transform_type & 1; | 37 | | | 38 | 3.87k | size_t x = 0; | 39 | 3.87k | const HWY_FULL(pixel_type) d; | 40 | 3.87k | const size_t N = Lanes(d); | 41 | 112k | for (; x + N - 1 < w; x += N) { | 42 | 109k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 109k | } else { | 54 | 109k | auto First = Load(d, in0 + x); | 55 | 109k | auto Second = Load(d, in1 + x); | 56 | 109k | auto Third = Load(d, in2 + x); | 57 | 109k | if (third) Third = Add(Third, First); | 58 | 109k | if (second == 1) { | 59 | 109k | Second = Add(Second, First); | 60 | 18.4E | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 109k | Store(First, d, out0 + x); | 64 | 109k | Store(Second, d, out1 + x); | 65 | 109k | Store(Third, d, out2 + x); | 66 | 109k | } | 67 | 109k | } | 68 | 5.76k | for (; x < w; x++) { | 69 | 1.88k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 1.88k | } else { | 81 | 1.88k | pixel_type First = in0[x]; | 82 | 1.88k | pixel_type Second = in1[x]; | 83 | 1.88k | pixel_type Third = in2[x]; | 84 | 1.88k | if (third) Third = PixelAdd(Third, First); | 85 | 1.88k | if (second == 1) { | 86 | 1.88k | Second = PixelAdd(Second, First); | 87 | 1.88k | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 1.88k | out0[x] = First; | 91 | 1.88k | out1[x] = Second; | 92 | 1.88k | out2[x] = Third; | 93 | 1.88k | } | 94 | 1.88k | } | 95 | 3.87k | } |
void jxl::N_SSE4::InvRCTRow<4>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 61.8k | pixel_type* out2, size_t w) { | 33 | 61.8k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 61.8k | "Invalid transform type"); | 35 | 61.8k | int second = transform_type >> 1; | 36 | 61.8k | int third = transform_type & 1; | 37 | | | 38 | 61.8k | size_t x = 0; | 39 | 61.8k | const HWY_FULL(pixel_type) d; | 40 | 61.8k | const size_t N = Lanes(d); | 41 | 3.33M | for (; x + N - 1 < w; x += N) { | 42 | 3.27M | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 3.27M | } else { | 54 | 3.27M | auto First = Load(d, in0 + x); | 55 | 3.27M | auto Second = Load(d, in1 + x); | 56 | 3.27M | auto Third = Load(d, in2 + x); | 57 | 3.27M | if (third) Third = Add(Third, First); | 58 | 3.27M | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 3.27M | } else if (second == 2) { | 61 | 3.27M | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 3.27M | } | 63 | 3.27M | Store(First, d, out0 + x); | 64 | 3.27M | Store(Second, d, out1 + x); | 65 | 3.27M | Store(Third, d, out2 + x); | 66 | 3.27M | } | 67 | 3.27M | } | 68 | 68.6k | for (; x < w; x++) { | 69 | 6.83k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 6.83k | } else { | 81 | 6.83k | pixel_type First = in0[x]; | 82 | 6.83k | pixel_type Second = in1[x]; | 83 | 6.83k | pixel_type Third = in2[x]; | 84 | 6.83k | if (third) Third = PixelAdd(Third, First); | 85 | 6.83k | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 6.83k | } else if (second == 2) { | 88 | 6.83k | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 6.83k | } | 90 | 6.83k | out0[x] = First; | 91 | 6.83k | out1[x] = Second; | 92 | 6.83k | out2[x] = Third; | 93 | 6.83k | } | 94 | 6.83k | } | 95 | 61.8k | } |
void jxl::N_SSE4::InvRCTRow<5>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 75.3k | pixel_type* out2, size_t w) { | 33 | 75.3k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 75.3k | "Invalid transform type"); | 35 | 75.3k | int second = transform_type >> 1; | 36 | 75.3k | int third = transform_type & 1; | 37 | | | 38 | 75.3k | size_t x = 0; | 39 | 75.3k | const HWY_FULL(pixel_type) d; | 40 | 75.3k | const size_t N = Lanes(d); | 41 | 3.96M | for (; x + N - 1 < w; x += N) { | 42 | 3.89M | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 3.89M | } else { | 54 | 3.89M | auto First = Load(d, in0 + x); | 55 | 3.89M | auto Second = Load(d, in1 + x); | 56 | 3.89M | auto Third = Load(d, in2 + x); | 57 | 3.89M | if (third) Third = Add(Third, First); | 58 | 3.89M | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 3.89M | } else if (second == 2) { | 61 | 3.89M | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 3.89M | } | 63 | 3.89M | Store(First, d, out0 + x); | 64 | 3.89M | Store(Second, d, out1 + x); | 65 | 3.89M | Store(Third, d, out2 + x); | 66 | 3.89M | } | 67 | 3.89M | } | 68 | 79.1k | for (; x < w; x++) { | 69 | 3.81k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 3.81k | } else { | 81 | 3.81k | pixel_type First = in0[x]; | 82 | 3.81k | pixel_type Second = in1[x]; | 83 | 3.81k | pixel_type Third = in2[x]; | 84 | 3.81k | if (third) Third = PixelAdd(Third, First); | 85 | 3.81k | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 3.81k | } else if (second == 2) { | 88 | 3.81k | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 3.81k | } | 90 | 3.81k | out0[x] = First; | 91 | 3.81k | out1[x] = Second; | 92 | 3.81k | out2[x] = Third; | 93 | 3.81k | } | 94 | 3.81k | } | 95 | 75.3k | } |
void jxl::N_SSE4::InvRCTRow<6>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 82.9k | pixel_type* out2, size_t w) { | 33 | 82.9k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 82.9k | "Invalid transform type"); | 35 | 82.9k | int second = transform_type >> 1; | 36 | 82.9k | int third = transform_type & 1; | 37 | | | 38 | 82.9k | size_t x = 0; | 39 | 82.9k | const HWY_FULL(pixel_type) d; | 40 | 82.9k | const size_t N = Lanes(d); | 41 | 3.72M | for (; x + N - 1 < w; x += N) { | 42 | 3.64M | if (transform_type == 6) { | 43 | 3.64M | auto Y = Load(d, in0 + x); | 44 | 3.64M | auto Co = Load(d, in1 + x); | 45 | 3.64M | auto Cg = Load(d, in2 + x); | 46 | 3.64M | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 3.64M | auto G = Add(Cg, Y); | 48 | 3.64M | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 3.64M | auto R = Add(Y, Co); | 50 | 3.64M | Store(R, d, out0 + x); | 51 | 3.64M | Store(G, d, out1 + x); | 52 | 3.64M | Store(Y, d, out2 + x); | 53 | 3.64M | } else { | 54 | 126 | auto First = Load(d, in0 + x); | 55 | 126 | auto Second = Load(d, in1 + x); | 56 | 126 | auto Third = Load(d, in2 + x); | 57 | 126 | if (third) Third = Add(Third, First); | 58 | 126 | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 126 | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 126 | Store(First, d, out0 + x); | 64 | 126 | Store(Second, d, out1 + x); | 65 | 126 | Store(Third, d, out2 + x); | 66 | 126 | } | 67 | 3.64M | } | 68 | 113k | for (; x < w; x++) { | 69 | 30.9k | if (transform_type == 6) { | 70 | 30.9k | pixel_type Y = in0[x]; | 71 | 30.9k | pixel_type Co = in1[x]; | 72 | 30.9k | pixel_type Cg = in2[x]; | 73 | 30.9k | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 30.9k | pixel_type G = PixelAdd(Cg, tmp); | 75 | 30.9k | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 30.9k | pixel_type R = PixelAdd(B, Co); | 77 | 30.9k | out0[x] = R; | 78 | 30.9k | out1[x] = G; | 79 | 30.9k | out2[x] = B; | 80 | 18.4E | } else { | 81 | 18.4E | pixel_type First = in0[x]; | 82 | 18.4E | pixel_type Second = in1[x]; | 83 | 18.4E | pixel_type Third = in2[x]; | 84 | 18.4E | if (third) Third = PixelAdd(Third, First); | 85 | 18.4E | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 18.4E | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 18.4E | out0[x] = First; | 91 | 18.4E | out1[x] = Second; | 92 | 18.4E | out2[x] = Third; | 93 | 18.4E | } | 94 | 30.9k | } | 95 | 82.9k | } |
Unexecuted instantiation: void jxl::N_AVX2::InvRCTRow<0>(int const*, int const*, int const*, int*, int*, int*, unsigned long) void jxl::N_AVX2::InvRCTRow<1>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 199k | pixel_type* out2, size_t w) { | 33 | 199k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 199k | "Invalid transform type"); | 35 | 199k | int second = transform_type >> 1; | 36 | 199k | int third = transform_type & 1; | 37 | | | 38 | 199k | size_t x = 0; | 39 | 199k | const HWY_FULL(pixel_type) d; | 40 | 199k | const size_t N = Lanes(d); | 41 | 2.33M | for (; x + N - 1 < w; x += N) { | 42 | 2.13M | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 2.13M | } else { | 54 | 2.13M | auto First = Load(d, in0 + x); | 55 | 2.13M | auto Second = Load(d, in1 + x); | 56 | 2.13M | auto Third = Load(d, in2 + x); | 57 | 2.13M | if (third) Third = Add(Third, First); | 58 | 2.13M | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 2.13M | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 2.13M | Store(First, d, out0 + x); | 64 | 2.13M | Store(Second, d, out1 + x); | 65 | 2.13M | Store(Third, d, out2 + x); | 66 | 2.13M | } | 67 | 2.13M | } | 68 | 906k | for (; x < w; x++) { | 69 | 706k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 706k | } else { | 81 | 706k | pixel_type First = in0[x]; | 82 | 706k | pixel_type Second = in1[x]; | 83 | 706k | pixel_type Third = in2[x]; | 84 | 706k | if (third) Third = PixelAdd(Third, First); | 85 | 706k | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 706k | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 706k | out0[x] = First; | 91 | 706k | out1[x] = Second; | 92 | 706k | out2[x] = Third; | 93 | 706k | } | 94 | 706k | } | 95 | 199k | } |
void jxl::N_AVX2::InvRCTRow<2>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 193k | pixel_type* out2, size_t w) { | 33 | 193k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 193k | "Invalid transform type"); | 35 | 193k | int second = transform_type >> 1; | 36 | 193k | int third = transform_type & 1; | 37 | | | 38 | 193k | size_t x = 0; | 39 | 193k | const HWY_FULL(pixel_type) d; | 40 | 193k | const size_t N = Lanes(d); | 41 | 1.39M | for (; x + N - 1 < w; x += N) { | 42 | 1.20M | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 1.20M | } else { | 54 | 1.20M | auto First = Load(d, in0 + x); | 55 | 1.20M | auto Second = Load(d, in1 + x); | 56 | 1.20M | auto Third = Load(d, in2 + x); | 57 | 1.20M | if (third) Third = Add(Third, First); | 58 | 1.20M | if (second == 1) { | 59 | 1.20M | Second = Add(Second, First); | 60 | 18.4E | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 1.20M | Store(First, d, out0 + x); | 64 | 1.20M | Store(Second, d, out1 + x); | 65 | 1.20M | Store(Third, d, out2 + x); | 66 | 1.20M | } | 67 | 1.20M | } | 68 | 541k | for (; x < w; x++) { | 69 | 347k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 347k | } else { | 81 | 347k | pixel_type First = in0[x]; | 82 | 347k | pixel_type Second = in1[x]; | 83 | 347k | pixel_type Third = in2[x]; | 84 | 347k | if (third) Third = PixelAdd(Third, First); | 85 | 347k | if (second == 1) { | 86 | 347k | Second = PixelAdd(Second, First); | 87 | 347k | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 347k | out0[x] = First; | 91 | 347k | out1[x] = Second; | 92 | 347k | out2[x] = Third; | 93 | 347k | } | 94 | 347k | } | 95 | 193k | } |
void jxl::N_AVX2::InvRCTRow<3>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 96.0k | pixel_type* out2, size_t w) { | 33 | 96.0k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 96.0k | "Invalid transform type"); | 35 | 96.0k | int second = transform_type >> 1; | 36 | 96.0k | int third = transform_type & 1; | 37 | | | 38 | 96.0k | size_t x = 0; | 39 | 96.0k | const HWY_FULL(pixel_type) d; | 40 | 96.0k | const size_t N = Lanes(d); | 41 | 1.20M | for (; x + N - 1 < w; x += N) { | 42 | 1.11M | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 1.11M | } else { | 54 | 1.11M | auto First = Load(d, in0 + x); | 55 | 1.11M | auto Second = Load(d, in1 + x); | 56 | 1.11M | auto Third = Load(d, in2 + x); | 57 | 1.11M | if (third) Third = Add(Third, First); | 58 | 1.11M | if (second == 1) { | 59 | 1.11M | Second = Add(Second, First); | 60 | 18.4E | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 1.11M | Store(First, d, out0 + x); | 64 | 1.11M | Store(Second, d, out1 + x); | 65 | 1.11M | Store(Third, d, out2 + x); | 66 | 1.11M | } | 67 | 1.11M | } | 68 | 217k | for (; x < w; x++) { | 69 | 121k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 121k | } else { | 81 | 121k | pixel_type First = in0[x]; | 82 | 121k | pixel_type Second = in1[x]; | 83 | 121k | pixel_type Third = in2[x]; | 84 | 121k | if (third) Third = PixelAdd(Third, First); | 85 | 121k | if (second == 1) { | 86 | 121k | Second = PixelAdd(Second, First); | 87 | 121k | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 121k | out0[x] = First; | 91 | 121k | out1[x] = Second; | 92 | 121k | out2[x] = Third; | 93 | 121k | } | 94 | 121k | } | 95 | 96.0k | } |
void jxl::N_AVX2::InvRCTRow<4>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 86.9k | pixel_type* out2, size_t w) { | 33 | 86.9k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 86.9k | "Invalid transform type"); | 35 | 86.9k | int second = transform_type >> 1; | 36 | 86.9k | int third = transform_type & 1; | 37 | | | 38 | 86.9k | size_t x = 0; | 39 | 86.9k | const HWY_FULL(pixel_type) d; | 40 | 86.9k | const size_t N = Lanes(d); | 41 | 1.88M | for (; x + N - 1 < w; x += N) { | 42 | 1.79M | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 1.79M | } else { | 54 | 1.79M | auto First = Load(d, in0 + x); | 55 | 1.79M | auto Second = Load(d, in1 + x); | 56 | 1.79M | auto Third = Load(d, in2 + x); | 57 | 1.79M | if (third) Third = Add(Third, First); | 58 | 1.79M | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 1.79M | } else if (second == 2) { | 61 | 1.79M | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 1.79M | } | 63 | 1.79M | Store(First, d, out0 + x); | 64 | 1.79M | Store(Second, d, out1 + x); | 65 | 1.79M | Store(Third, d, out2 + x); | 66 | 1.79M | } | 67 | 1.79M | } | 68 | 181k | for (; x < w; x++) { | 69 | 94.6k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 94.6k | } else { | 81 | 94.6k | pixel_type First = in0[x]; | 82 | 94.6k | pixel_type Second = in1[x]; | 83 | 94.6k | pixel_type Third = in2[x]; | 84 | 94.6k | if (third) Third = PixelAdd(Third, First); | 85 | 94.6k | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 94.6k | } else if (second == 2) { | 88 | 94.6k | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 94.6k | } | 90 | 94.6k | out0[x] = First; | 91 | 94.6k | out1[x] = Second; | 92 | 94.6k | out2[x] = Third; | 93 | 94.6k | } | 94 | 94.6k | } | 95 | 86.9k | } |
void jxl::N_AVX2::InvRCTRow<5>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 178k | pixel_type* out2, size_t w) { | 33 | 178k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 178k | "Invalid transform type"); | 35 | 178k | int second = transform_type >> 1; | 36 | 178k | int third = transform_type & 1; | 37 | | | 38 | 178k | size_t x = 0; | 39 | 178k | const HWY_FULL(pixel_type) d; | 40 | 178k | const size_t N = Lanes(d); | 41 | 4.79M | for (; x + N - 1 < w; x += N) { | 42 | 4.61M | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 4.61M | } else { | 54 | 4.61M | auto First = Load(d, in0 + x); | 55 | 4.61M | auto Second = Load(d, in1 + x); | 56 | 4.61M | auto Third = Load(d, in2 + x); | 57 | 4.66M | if (third) Third = Add(Third, First); | 58 | 4.61M | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 4.65M | } else if (second == 2) { | 61 | 4.65M | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 4.65M | } | 63 | 4.61M | Store(First, d, out0 + x); | 64 | 4.61M | Store(Second, d, out1 + x); | 65 | 4.61M | Store(Third, d, out2 + x); | 66 | 4.61M | } | 67 | 4.61M | } | 68 | 252k | for (; x < w; x++) { | 69 | 74.6k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 74.6k | } else { | 81 | 74.6k | pixel_type First = in0[x]; | 82 | 74.6k | pixel_type Second = in1[x]; | 83 | 74.6k | pixel_type Third = in2[x]; | 84 | 74.6k | if (third) Third = PixelAdd(Third, First); | 85 | 74.6k | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 74.6k | } else if (second == 2) { | 88 | 74.6k | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 74.6k | } | 90 | 74.6k | out0[x] = First; | 91 | 74.6k | out1[x] = Second; | 92 | 74.6k | out2[x] = Third; | 93 | 74.6k | } | 94 | 74.6k | } | 95 | 178k | } |
void jxl::N_AVX2::InvRCTRow<6>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 563k | pixel_type* out2, size_t w) { | 33 | 563k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 563k | "Invalid transform type"); | 35 | 563k | int second = transform_type >> 1; | 36 | 563k | int third = transform_type & 1; | 37 | | | 38 | 563k | size_t x = 0; | 39 | 563k | const HWY_FULL(pixel_type) d; | 40 | 563k | const size_t N = Lanes(d); | 41 | 9.81M | for (; x + N - 1 < w; x += N) { | 42 | 9.24M | if (transform_type == 6) { | 43 | 9.24M | auto Y = Load(d, in0 + x); | 44 | 9.24M | auto Co = Load(d, in1 + x); | 45 | 9.24M | auto Cg = Load(d, in2 + x); | 46 | 9.24M | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 9.24M | auto G = Add(Cg, Y); | 48 | 9.24M | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 9.24M | auto R = Add(Y, Co); | 50 | 9.24M | Store(R, d, out0 + x); | 51 | 9.24M | Store(G, d, out1 + x); | 52 | 9.24M | Store(Y, d, out2 + x); | 53 | 9.24M | } else { | 54 | 57 | auto First = Load(d, in0 + x); | 55 | 57 | auto Second = Load(d, in1 + x); | 56 | 57 | auto Third = Load(d, in2 + x); | 57 | 57 | if (third) Third = Add(Third, First); | 58 | 57 | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 57 | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 57 | Store(First, d, out0 + x); | 64 | 57 | Store(Second, d, out1 + x); | 65 | 57 | Store(Third, d, out2 + x); | 66 | 57 | } | 67 | 9.24M | } | 68 | 1.98M | for (; x < w; x++) { | 69 | 1.41M | if (transform_type == 6) { | 70 | 1.41M | pixel_type Y = in0[x]; | 71 | 1.41M | pixel_type Co = in1[x]; | 72 | 1.41M | pixel_type Cg = in2[x]; | 73 | 1.41M | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 1.41M | pixel_type G = PixelAdd(Cg, tmp); | 75 | 1.41M | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 1.41M | pixel_type R = PixelAdd(B, Co); | 77 | 1.41M | out0[x] = R; | 78 | 1.41M | out1[x] = G; | 79 | 1.41M | out2[x] = B; | 80 | 18.4E | } else { | 81 | 18.4E | pixel_type First = in0[x]; | 82 | 18.4E | pixel_type Second = in1[x]; | 83 | 18.4E | pixel_type Third = in2[x]; | 84 | 18.4E | if (third) Third = PixelAdd(Third, First); | 85 | 18.4E | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 18.4E | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 18.4E | out0[x] = First; | 91 | 18.4E | out1[x] = Second; | 92 | 18.4E | out2[x] = Third; | 93 | 18.4E | } | 94 | 1.41M | } | 95 | 563k | } |
Unexecuted instantiation: void jxl::N_SSE2::InvRCTRow<0>(int const*, int const*, int const*, int*, int*, int*, unsigned long) void jxl::N_SSE2::InvRCTRow<1>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 3.28k | pixel_type* out2, size_t w) { | 33 | 3.28k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 3.28k | "Invalid transform type"); | 35 | 3.28k | int second = transform_type >> 1; | 36 | 3.28k | int third = transform_type & 1; | 37 | | | 38 | 3.28k | size_t x = 0; | 39 | 3.28k | const HWY_FULL(pixel_type) d; | 40 | 3.28k | const size_t N = Lanes(d); | 41 | 195k | for (; x + N - 1 < w; x += N) { | 42 | 192k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 192k | } else { | 54 | 192k | auto First = Load(d, in0 + x); | 55 | 192k | auto Second = Load(d, in1 + x); | 56 | 192k | auto Third = Load(d, in2 + x); | 57 | 192k | if (third) Third = Add(Third, First); | 58 | 192k | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 192k | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 192k | Store(First, d, out0 + x); | 64 | 192k | Store(Second, d, out1 + x); | 65 | 192k | Store(Third, d, out2 + x); | 66 | 192k | } | 67 | 192k | } | 68 | 9.07k | for (; x < w; x++) { | 69 | 5.78k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 5.78k | } else { | 81 | 5.78k | pixel_type First = in0[x]; | 82 | 5.78k | pixel_type Second = in1[x]; | 83 | 5.78k | pixel_type Third = in2[x]; | 84 | 5.78k | if (third) Third = PixelAdd(Third, First); | 85 | 5.78k | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 5.78k | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 5.78k | out0[x] = First; | 91 | 5.78k | out1[x] = Second; | 92 | 5.78k | out2[x] = Third; | 93 | 5.78k | } | 94 | 5.78k | } | 95 | 3.28k | } |
void jxl::N_SSE2::InvRCTRow<2>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 3.12k | pixel_type* out2, size_t w) { | 33 | 3.12k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 3.12k | "Invalid transform type"); | 35 | 3.12k | int second = transform_type >> 1; | 36 | 3.12k | int third = transform_type & 1; | 37 | | | 38 | 3.12k | size_t x = 0; | 39 | 3.12k | const HWY_FULL(pixel_type) d; | 40 | 3.12k | const size_t N = Lanes(d); | 41 | 50.2k | for (; x + N - 1 < w; x += N) { | 42 | 47.1k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 47.1k | } else { | 54 | 47.1k | auto First = Load(d, in0 + x); | 55 | 47.1k | auto Second = Load(d, in1 + x); | 56 | 47.1k | auto Third = Load(d, in2 + x); | 57 | 47.1k | if (third) Third = Add(Third, First); | 58 | 47.2k | if (second == 1) { | 59 | 47.2k | Second = Add(Second, First); | 60 | 18.4E | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 47.1k | Store(First, d, out0 + x); | 64 | 47.1k | Store(Second, d, out1 + x); | 65 | 47.1k | Store(Third, d, out2 + x); | 66 | 47.1k | } | 67 | 47.1k | } | 68 | 8.56k | for (; x < w; x++) { | 69 | 5.44k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 5.44k | } else { | 81 | 5.44k | pixel_type First = in0[x]; | 82 | 5.44k | pixel_type Second = in1[x]; | 83 | 5.44k | pixel_type Third = in2[x]; | 84 | 5.44k | if (third) Third = PixelAdd(Third, First); | 85 | 5.44k | if (second == 1) { | 86 | 5.44k | Second = PixelAdd(Second, First); | 87 | 5.44k | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 5.44k | out0[x] = First; | 91 | 5.44k | out1[x] = Second; | 92 | 5.44k | out2[x] = Third; | 93 | 5.44k | } | 94 | 5.44k | } | 95 | 3.12k | } |
void jxl::N_SSE2::InvRCTRow<3>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 2.88k | pixel_type* out2, size_t w) { | 33 | 2.88k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 2.88k | "Invalid transform type"); | 35 | 2.88k | int second = transform_type >> 1; | 36 | 2.88k | int third = transform_type & 1; | 37 | | | 38 | 2.88k | size_t x = 0; | 39 | 2.88k | const HWY_FULL(pixel_type) d; | 40 | 2.88k | const size_t N = Lanes(d); | 41 | 78.6k | for (; x + N - 1 < w; x += N) { | 42 | 75.8k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 75.8k | } else { | 54 | 75.8k | auto First = Load(d, in0 + x); | 55 | 75.8k | auto Second = Load(d, in1 + x); | 56 | 75.8k | auto Third = Load(d, in2 + x); | 57 | 75.8k | if (third) Third = Add(Third, First); | 58 | 75.8k | if (second == 1) { | 59 | 75.8k | Second = Add(Second, First); | 60 | 18.4E | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 75.8k | Store(First, d, out0 + x); | 64 | 75.8k | Store(Second, d, out1 + x); | 65 | 75.8k | Store(Third, d, out2 + x); | 66 | 75.8k | } | 67 | 75.8k | } | 68 | 5.56k | for (; x < w; x++) { | 69 | 2.68k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 2.68k | } else { | 81 | 2.68k | pixel_type First = in0[x]; | 82 | 2.68k | pixel_type Second = in1[x]; | 83 | 2.68k | pixel_type Third = in2[x]; | 84 | 2.68k | if (third) Third = PixelAdd(Third, First); | 85 | 2.68k | if (second == 1) { | 86 | 2.68k | Second = PixelAdd(Second, First); | 87 | 2.68k | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 2.68k | out0[x] = First; | 91 | 2.68k | out1[x] = Second; | 92 | 2.68k | out2[x] = Third; | 93 | 2.68k | } | 94 | 2.68k | } | 95 | 2.88k | } |
void jxl::N_SSE2::InvRCTRow<4>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 98.1k | pixel_type* out2, size_t w) { | 33 | 98.1k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 98.1k | "Invalid transform type"); | 35 | 98.1k | int second = transform_type >> 1; | 36 | 98.1k | int third = transform_type & 1; | 37 | | | 38 | 98.1k | size_t x = 0; | 39 | 98.1k | const HWY_FULL(pixel_type) d; | 40 | 98.1k | const size_t N = Lanes(d); | 41 | 5.35M | for (; x + N - 1 < w; x += N) { | 42 | 5.25M | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 5.25M | } else { | 54 | 5.25M | auto First = Load(d, in0 + x); | 55 | 5.25M | auto Second = Load(d, in1 + x); | 56 | 5.25M | auto Third = Load(d, in2 + x); | 57 | 5.25M | if (third) Third = Add(Third, First); | 58 | 5.25M | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 5.25M | } else if (second == 2) { | 61 | 5.25M | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 5.25M | } | 63 | 5.25M | Store(First, d, out0 + x); | 64 | 5.25M | Store(Second, d, out1 + x); | 65 | 5.25M | Store(Third, d, out2 + x); | 66 | 5.25M | } | 67 | 5.25M | } | 68 | 103k | for (; x < w; x++) { | 69 | 4.89k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 4.89k | } else { | 81 | 4.89k | pixel_type First = in0[x]; | 82 | 4.89k | pixel_type Second = in1[x]; | 83 | 4.89k | pixel_type Third = in2[x]; | 84 | 4.89k | if (third) Third = PixelAdd(Third, First); | 85 | 4.89k | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 4.89k | } else if (second == 2) { | 88 | 4.89k | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 4.89k | } | 90 | 4.89k | out0[x] = First; | 91 | 4.89k | out1[x] = Second; | 92 | 4.89k | out2[x] = Third; | 93 | 4.89k | } | 94 | 4.89k | } | 95 | 98.1k | } |
void jxl::N_SSE2::InvRCTRow<5>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 68.1k | pixel_type* out2, size_t w) { | 33 | 68.1k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 68.1k | "Invalid transform type"); | 35 | 68.1k | int second = transform_type >> 1; | 36 | 68.1k | int third = transform_type & 1; | 37 | | | 38 | 68.1k | size_t x = 0; | 39 | 68.1k | const HWY_FULL(pixel_type) d; | 40 | 68.1k | const size_t N = Lanes(d); | 41 | 3.36M | for (; x + N - 1 < w; x += N) { | 42 | 3.29M | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 3.29M | } else { | 54 | 3.29M | auto First = Load(d, in0 + x); | 55 | 3.29M | auto Second = Load(d, in1 + x); | 56 | 3.29M | auto Third = Load(d, in2 + x); | 57 | 3.31M | if (third) Third = Add(Third, First); | 58 | 3.29M | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 3.31M | } else if (second == 2) { | 61 | 3.31M | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 3.31M | } | 63 | 3.29M | Store(First, d, out0 + x); | 64 | 3.29M | Store(Second, d, out1 + x); | 65 | 3.29M | Store(Third, d, out2 + x); | 66 | 3.29M | } | 67 | 3.29M | } | 68 | 71.7k | for (; x < w; x++) { | 69 | 3.63k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 3.63k | } else { | 81 | 3.63k | pixel_type First = in0[x]; | 82 | 3.63k | pixel_type Second = in1[x]; | 83 | 3.63k | pixel_type Third = in2[x]; | 84 | 3.63k | if (third) Third = PixelAdd(Third, First); | 85 | 3.63k | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 3.63k | } else if (second == 2) { | 88 | 3.63k | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 3.63k | } | 90 | 3.63k | out0[x] = First; | 91 | 3.63k | out1[x] = Second; | 92 | 3.63k | out2[x] = Third; | 93 | 3.63k | } | 94 | 3.63k | } | 95 | 68.1k | } |
void jxl::N_SSE2::InvRCTRow<6>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 110k | pixel_type* out2, size_t w) { | 33 | 110k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 110k | "Invalid transform type"); | 35 | 110k | int second = transform_type >> 1; | 36 | 110k | int third = transform_type & 1; | 37 | | | 38 | 110k | size_t x = 0; | 39 | 110k | const HWY_FULL(pixel_type) d; | 40 | 110k | const size_t N = Lanes(d); | 41 | 5.84M | for (; x + N - 1 < w; x += N) { | 42 | 5.73M | if (transform_type == 6) { | 43 | 5.73M | auto Y = Load(d, in0 + x); | 44 | 5.73M | auto Co = Load(d, in1 + x); | 45 | 5.73M | auto Cg = Load(d, in2 + x); | 46 | 5.73M | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 5.73M | auto G = Add(Cg, Y); | 48 | 5.73M | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 5.73M | auto R = Add(Y, Co); | 50 | 5.73M | Store(R, d, out0 + x); | 51 | 5.73M | Store(G, d, out1 + x); | 52 | 5.73M | Store(Y, d, out2 + x); | 53 | 5.73M | } else { | 54 | 3 | auto First = Load(d, in0 + x); | 55 | 3 | auto Second = Load(d, in1 + x); | 56 | 3 | auto Third = Load(d, in2 + x); | 57 | 3 | if (third) Third = Add(Third, First); | 58 | 3 | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 3 | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 3 | Store(First, d, out0 + x); | 64 | 3 | Store(Second, d, out1 + x); | 65 | 3 | Store(Third, d, out2 + x); | 66 | 3 | } | 67 | 5.73M | } | 68 | 124k | for (; x < w; x++) { | 69 | 14.1k | if (transform_type == 6) { | 70 | 14.1k | pixel_type Y = in0[x]; | 71 | 14.1k | pixel_type Co = in1[x]; | 72 | 14.1k | pixel_type Cg = in2[x]; | 73 | 14.1k | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 14.1k | pixel_type G = PixelAdd(Cg, tmp); | 75 | 14.1k | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 14.1k | pixel_type R = PixelAdd(B, Co); | 77 | 14.1k | out0[x] = R; | 78 | 14.1k | out1[x] = G; | 79 | 14.1k | out2[x] = B; | 80 | 14.1k | } else { | 81 | 0 | pixel_type First = in0[x]; | 82 | 0 | pixel_type Second = in1[x]; | 83 | 0 | pixel_type Third = in2[x]; | 84 | 0 | if (third) Third = PixelAdd(Third, First); | 85 | 0 | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 0 | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 0 | out0[x] = First; | 91 | 0 | out1[x] = Second; | 92 | 0 | out2[x] = Third; | 93 | 0 | } | 94 | 14.1k | } | 95 | 110k | } |
|