95 | 7.22M | } Unexecuted instantiation: void jxl::N_SSE4::InvRCTRow<0>(int const*, int const*, int const*, int*, int*, int*, unsigned long) void jxl::N_SSE4::InvRCTRow<1>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 3.14k | pixel_type* out2, size_t w) { | 33 | 3.14k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 3.14k | "Invalid transform type"); | 35 | 3.14k | int second = transform_type >> 1; | 36 | 3.14k | int third = transform_type & 1; | 37 | | | 38 | 3.14k | size_t x = 0; | 39 | 3.14k | const HWY_FULL(pixel_type) d; | 40 | 3.14k | const size_t N = Lanes(d); | 41 | 60.4k | for (; x + N - 1 < w; x += N) { | 42 | 57.2k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 57.2k | } else { | 54 | 57.2k | auto First = Load(d, in0 + x); | 55 | 57.2k | auto Second = Load(d, in1 + x); | 56 | 57.2k | auto Third = Load(d, in2 + x); | 57 | 57.3k | if (third) Third = Add(Third, First); | 58 | 57.2k | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 57.2k | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 57.2k | Store(First, d, out0 + x); | 64 | 57.2k | Store(Second, d, out1 + x); | 65 | 57.2k | Store(Third, d, out2 + x); | 66 | 57.2k | } | 67 | 57.2k | } | 68 | 7.96k | for (; x < w; x++) { | 69 | 4.81k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 4.81k | } else { | 81 | 4.81k | pixel_type First = in0[x]; | 82 | 4.81k | pixel_type Second = in1[x]; | 83 | 4.81k | pixel_type Third = in2[x]; | 84 | 4.81k | if (third) Third = PixelAdd(Third, First); | 85 | 4.81k | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 4.81k | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 4.81k | out0[x] = First; | 91 | 4.81k | out1[x] = Second; | 92 | 4.81k | out2[x] = Third; | 93 | 4.81k | } | 94 | 4.81k | } | 95 | 3.14k | } |
void jxl::N_SSE4::InvRCTRow<2>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 3.26k | pixel_type* out2, size_t w) { | 33 | 3.26k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 3.26k | "Invalid transform type"); | 35 | 3.26k | int second = transform_type >> 1; | 36 | 3.26k | int third = transform_type & 1; | 37 | | | 38 | 3.26k | size_t x = 0; | 39 | 3.26k | const HWY_FULL(pixel_type) d; | 40 | 3.26k | const size_t N = Lanes(d); | 41 | 103k | for (; x + N - 1 < w; x += N) { | 42 | 99.8k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 99.8k | } else { | 54 | 99.8k | auto First = Load(d, in0 + x); | 55 | 99.8k | auto Second = Load(d, in1 + x); | 56 | 99.8k | auto Third = Load(d, in2 + x); | 57 | 99.8k | if (third) Third = Add(Third, First); | 58 | 99.8k | if (second == 1) { | 59 | 99.8k | Second = Add(Second, First); | 60 | 99.8k | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 99.8k | Store(First, d, out0 + x); | 64 | 99.8k | Store(Second, d, out1 + x); | 65 | 99.8k | Store(Third, d, out2 + x); | 66 | 99.8k | } | 67 | 99.8k | } | 68 | 5.45k | for (; x < w; x++) { | 69 | 2.19k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 2.19k | } else { | 81 | 2.19k | pixel_type First = in0[x]; | 82 | 2.19k | pixel_type Second = in1[x]; | 83 | 2.19k | pixel_type Third = in2[x]; | 84 | 2.19k | if (third) Third = PixelAdd(Third, First); | 85 | 2.19k | if (second == 1) { | 86 | 2.19k | Second = PixelAdd(Second, First); | 87 | 2.19k | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 2.19k | out0[x] = First; | 91 | 2.19k | out1[x] = Second; | 92 | 2.19k | out2[x] = Third; | 93 | 2.19k | } | 94 | 2.19k | } | 95 | 3.26k | } |
void jxl::N_SSE4::InvRCTRow<3>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 3.60k | pixel_type* out2, size_t w) { | 33 | 3.60k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 3.60k | "Invalid transform type"); | 35 | 3.60k | int second = transform_type >> 1; | 36 | 3.60k | int third = transform_type & 1; | 37 | | | 38 | 3.60k | size_t x = 0; | 39 | 3.60k | const HWY_FULL(pixel_type) d; | 40 | 3.60k | const size_t N = Lanes(d); | 41 | 103k | for (; x + N - 1 < w; x += N) { | 42 | 99.7k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 99.7k | } else { | 54 | 99.7k | auto First = Load(d, in0 + x); | 55 | 99.7k | auto Second = Load(d, in1 + x); | 56 | 99.7k | auto Third = Load(d, in2 + x); | 57 | 99.8k | if (third) Third = Add(Third, First); | 58 | 99.8k | if (second == 1) { | 59 | 99.8k | Second = Add(Second, First); | 60 | 18.4E | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 99.7k | Store(First, d, out0 + x); | 64 | 99.7k | Store(Second, d, out1 + x); | 65 | 99.7k | Store(Third, d, out2 + x); | 66 | 99.7k | } | 67 | 99.7k | } | 68 | 5.79k | for (; x < w; x++) { | 69 | 2.18k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 2.18k | } else { | 81 | 2.18k | pixel_type First = in0[x]; | 82 | 2.18k | pixel_type Second = in1[x]; | 83 | 2.18k | pixel_type Third = in2[x]; | 84 | 2.18k | if (third) Third = PixelAdd(Third, First); | 85 | 2.18k | if (second == 1) { | 86 | 2.18k | Second = PixelAdd(Second, First); | 87 | 2.18k | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 2.18k | out0[x] = First; | 91 | 2.18k | out1[x] = Second; | 92 | 2.18k | out2[x] = Third; | 93 | 2.18k | } | 94 | 2.18k | } | 95 | 3.60k | } |
void jxl::N_SSE4::InvRCTRow<4>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 27.5k | pixel_type* out2, size_t w) { | 33 | 27.5k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 27.5k | "Invalid transform type"); | 35 | 27.5k | int second = transform_type >> 1; | 36 | 27.5k | int third = transform_type & 1; | 37 | | | 38 | 27.5k | size_t x = 0; | 39 | 27.5k | const HWY_FULL(pixel_type) d; | 40 | 27.5k | const size_t N = Lanes(d); | 41 | 908k | for (; x + N - 1 < w; x += N) { | 42 | 881k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 881k | } else { | 54 | 881k | auto First = Load(d, in0 + x); | 55 | 881k | auto Second = Load(d, in1 + x); | 56 | 881k | auto Third = Load(d, in2 + x); | 57 | 881k | if (third) Third = Add(Third, First); | 58 | 881k | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 881k | } else if (second == 2) { | 61 | 880k | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 880k | } | 63 | 881k | Store(First, d, out0 + x); | 64 | 881k | Store(Second, d, out1 + x); | 65 | 881k | Store(Third, d, out2 + x); | 66 | 881k | } | 67 | 881k | } | 68 | 46.9k | for (; x < w; x++) { | 69 | 19.3k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 19.3k | } else { | 81 | 19.3k | pixel_type First = in0[x]; | 82 | 19.3k | pixel_type Second = in1[x]; | 83 | 19.3k | pixel_type Third = in2[x]; | 84 | 19.3k | if (third) Third = PixelAdd(Third, First); | 85 | 19.3k | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 19.3k | } else if (second == 2) { | 88 | 19.3k | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 19.3k | } | 90 | 19.3k | out0[x] = First; | 91 | 19.3k | out1[x] = Second; | 92 | 19.3k | out2[x] = Third; | 93 | 19.3k | } | 94 | 19.3k | } | 95 | 27.5k | } |
void jxl::N_SSE4::InvRCTRow<5>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 157k | pixel_type* out2, size_t w) { | 33 | 157k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 157k | "Invalid transform type"); | 35 | 157k | int second = transform_type >> 1; | 36 | 157k | int third = transform_type & 1; | 37 | | | 38 | 157k | size_t x = 0; | 39 | 157k | const HWY_FULL(pixel_type) d; | 40 | 157k | const size_t N = Lanes(d); | 41 | 8.36M | for (; x + N - 1 < w; x += N) { | 42 | 8.21M | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 8.21M | } else { | 54 | 8.21M | auto First = Load(d, in0 + x); | 55 | 8.21M | auto Second = Load(d, in1 + x); | 56 | 8.21M | auto Third = Load(d, in2 + x); | 57 | 8.24M | if (third) Third = Add(Third, First); | 58 | 8.21M | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 8.21M | } else if (second == 2) { | 61 | 8.20M | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 8.20M | } | 63 | 8.21M | Store(First, d, out0 + x); | 64 | 8.21M | Store(Second, d, out1 + x); | 65 | 8.21M | Store(Third, d, out2 + x); | 66 | 8.21M | } | 67 | 8.21M | } | 68 | 158k | for (; x < w; x++) { | 69 | 1.69k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 1.69k | } else { | 81 | 1.69k | pixel_type First = in0[x]; | 82 | 1.69k | pixel_type Second = in1[x]; | 83 | 1.69k | pixel_type Third = in2[x]; | 84 | 1.69k | if (third) Third = PixelAdd(Third, First); | 85 | 1.69k | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 1.69k | } else if (second == 2) { | 88 | 1.69k | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 1.69k | } | 90 | 1.69k | out0[x] = First; | 91 | 1.69k | out1[x] = Second; | 92 | 1.69k | out2[x] = Third; | 93 | 1.69k | } | 94 | 1.69k | } | 95 | 157k | } |
void jxl::N_SSE4::InvRCTRow<6>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 45.8k | pixel_type* out2, size_t w) { | 33 | 45.8k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 45.8k | "Invalid transform type"); | 35 | 45.8k | int second = transform_type >> 1; | 36 | 45.8k | int third = transform_type & 1; | 37 | | | 38 | 45.8k | size_t x = 0; | 39 | 45.8k | const HWY_FULL(pixel_type) d; | 40 | 45.8k | const size_t N = Lanes(d); | 41 | 1.57M | for (; x + N - 1 < w; x += N) { | 42 | 1.53M | if (transform_type == 6) { | 43 | 1.53M | auto Y = Load(d, in0 + x); | 44 | 1.53M | auto Co = Load(d, in1 + x); | 45 | 1.53M | auto Cg = Load(d, in2 + x); | 46 | 1.53M | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 1.53M | auto G = Add(Cg, Y); | 48 | 1.53M | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 1.53M | auto R = Add(Y, Co); | 50 | 1.53M | Store(R, d, out0 + x); | 51 | 1.53M | Store(G, d, out1 + x); | 52 | 1.53M | Store(Y, d, out2 + x); | 53 | 18.4E | } else { | 54 | 18.4E | auto First = Load(d, in0 + x); | 55 | 18.4E | auto Second = Load(d, in1 + x); | 56 | 18.4E | auto Third = Load(d, in2 + x); | 57 | 18.4E | if (third) Third = Add(Third, First); | 58 | 18.4E | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 18.4E | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 18.4E | Store(First, d, out0 + x); | 64 | 18.4E | Store(Second, d, out1 + x); | 65 | 18.4E | Store(Third, d, out2 + x); | 66 | 18.4E | } | 67 | 1.52M | } | 68 | 87.4k | for (; x < w; x++) { | 69 | 41.6k | if (transform_type == 6) { | 70 | 41.6k | pixel_type Y = in0[x]; | 71 | 41.6k | pixel_type Co = in1[x]; | 72 | 41.6k | pixel_type Cg = in2[x]; | 73 | 41.6k | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 41.6k | pixel_type G = PixelAdd(Cg, tmp); | 75 | 41.6k | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 41.6k | pixel_type R = PixelAdd(B, Co); | 77 | 41.6k | out0[x] = R; | 78 | 41.6k | out1[x] = G; | 79 | 41.6k | out2[x] = B; | 80 | 18.4E | } else { | 81 | 18.4E | pixel_type First = in0[x]; | 82 | 18.4E | pixel_type Second = in1[x]; | 83 | 18.4E | pixel_type Third = in2[x]; | 84 | 18.4E | if (third) Third = PixelAdd(Third, First); | 85 | 18.4E | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 18.4E | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 18.4E | out0[x] = First; | 91 | 18.4E | out1[x] = Second; | 92 | 18.4E | out2[x] = Third; | 93 | 18.4E | } | 94 | 41.5k | } | 95 | 45.8k | } |
Unexecuted instantiation: void jxl::N_AVX2::InvRCTRow<0>(int const*, int const*, int const*, int*, int*, int*, unsigned long) void jxl::N_AVX2::InvRCTRow<1>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 67.2k | pixel_type* out2, size_t w) { | 33 | 67.2k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 67.2k | "Invalid transform type"); | 35 | 67.2k | int second = transform_type >> 1; | 36 | 67.2k | int third = transform_type & 1; | 37 | | | 38 | 67.2k | size_t x = 0; | 39 | 67.2k | const HWY_FULL(pixel_type) d; | 40 | 67.2k | const size_t N = Lanes(d); | 41 | 646k | for (; x + N - 1 < w; x += N) { | 42 | 579k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 579k | } else { | 54 | 579k | auto First = Load(d, in0 + x); | 55 | 579k | auto Second = Load(d, in1 + x); | 56 | 579k | auto Third = Load(d, in2 + x); | 57 | 579k | if (third) Third = Add(Third, First); | 58 | 579k | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 579k | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 579k | Store(First, d, out0 + x); | 64 | 579k | Store(Second, d, out1 + x); | 65 | 579k | Store(Third, d, out2 + x); | 66 | 579k | } | 67 | 579k | } | 68 | 204k | for (; x < w; x++) { | 69 | 137k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 137k | } else { | 81 | 137k | pixel_type First = in0[x]; | 82 | 137k | pixel_type Second = in1[x]; | 83 | 137k | pixel_type Third = in2[x]; | 84 | 137k | if (third) Third = PixelAdd(Third, First); | 85 | 137k | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 137k | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 137k | out0[x] = First; | 91 | 137k | out1[x] = Second; | 92 | 137k | out2[x] = Third; | 93 | 137k | } | 94 | 137k | } | 95 | 67.2k | } |
void jxl::N_AVX2::InvRCTRow<2>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 222k | pixel_type* out2, size_t w) { | 33 | 222k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 222k | "Invalid transform type"); | 35 | 222k | int second = transform_type >> 1; | 36 | 222k | int third = transform_type & 1; | 37 | | | 38 | 222k | size_t x = 0; | 39 | 222k | const HWY_FULL(pixel_type) d; | 40 | 222k | const size_t N = Lanes(d); | 41 | 3.45M | for (; x + N - 1 < w; x += N) { | 42 | 3.22M | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 3.22M | } else { | 54 | 3.22M | auto First = Load(d, in0 + x); | 55 | 3.22M | auto Second = Load(d, in1 + x); | 56 | 3.22M | auto Third = Load(d, in2 + x); | 57 | 3.22M | if (third) Third = Add(Third, First); | 58 | 3.22M | if (second == 1) { | 59 | 3.22M | Second = Add(Second, First); | 60 | 3.22M | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 3.22M | Store(First, d, out0 + x); | 64 | 3.22M | Store(Second, d, out1 + x); | 65 | 3.22M | Store(Third, d, out2 + x); | 66 | 3.22M | } | 67 | 3.22M | } | 68 | 495k | for (; x < w; x++) { | 69 | 273k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 273k | } else { | 81 | 273k | pixel_type First = in0[x]; | 82 | 273k | pixel_type Second = in1[x]; | 83 | 273k | pixel_type Third = in2[x]; | 84 | 273k | if (third) Third = PixelAdd(Third, First); | 85 | 273k | if (second == 1) { | 86 | 273k | Second = PixelAdd(Second, First); | 87 | 273k | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 273k | out0[x] = First; | 91 | 273k | out1[x] = Second; | 92 | 273k | out2[x] = Third; | 93 | 273k | } | 94 | 273k | } | 95 | 222k | } |
void jxl::N_AVX2::InvRCTRow<3>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 81.9k | pixel_type* out2, size_t w) { | 33 | 81.9k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 81.9k | "Invalid transform type"); | 35 | 81.9k | int second = transform_type >> 1; | 36 | 81.9k | int third = transform_type & 1; | 37 | | | 38 | 81.9k | size_t x = 0; | 39 | 81.9k | const HWY_FULL(pixel_type) d; | 40 | 81.9k | const size_t N = Lanes(d); | 41 | 956k | for (; x + N - 1 < w; x += N) { | 42 | 874k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 874k | } else { | 54 | 874k | auto First = Load(d, in0 + x); | 55 | 874k | auto Second = Load(d, in1 + x); | 56 | 874k | auto Third = Load(d, in2 + x); | 57 | 874k | if (third) Third = Add(Third, First); | 58 | 874k | if (second == 1) { | 59 | 874k | Second = Add(Second, First); | 60 | 18.4E | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 874k | Store(First, d, out0 + x); | 64 | 874k | Store(Second, d, out1 + x); | 65 | 874k | Store(Third, d, out2 + x); | 66 | 874k | } | 67 | 874k | } | 68 | 428k | for (; x < w; x++) { | 69 | 346k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 346k | } else { | 81 | 346k | pixel_type First = in0[x]; | 82 | 346k | pixel_type Second = in1[x]; | 83 | 346k | pixel_type Third = in2[x]; | 84 | 346k | if (third) Third = PixelAdd(Third, First); | 85 | 346k | if (second == 1) { | 86 | 346k | Second = PixelAdd(Second, First); | 87 | 346k | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 346k | out0[x] = First; | 91 | 346k | out1[x] = Second; | 92 | 346k | out2[x] = Third; | 93 | 346k | } | 94 | 346k | } | 95 | 81.9k | } |
void jxl::N_AVX2::InvRCTRow<4>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 82.7k | pixel_type* out2, size_t w) { | 33 | 82.7k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 82.7k | "Invalid transform type"); | 35 | 82.7k | int second = transform_type >> 1; | 36 | 82.7k | int third = transform_type & 1; | 37 | | | 38 | 82.7k | size_t x = 0; | 39 | 82.7k | const HWY_FULL(pixel_type) d; | 40 | 82.7k | const size_t N = Lanes(d); | 41 | 1.48M | for (; x + N - 1 < w; x += N) { | 42 | 1.40M | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 1.40M | } else { | 54 | 1.40M | auto First = Load(d, in0 + x); | 55 | 1.40M | auto Second = Load(d, in1 + x); | 56 | 1.40M | auto Third = Load(d, in2 + x); | 57 | 1.40M | if (third) Third = Add(Third, First); | 58 | 1.40M | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 1.40M | } else if (second == 2) { | 61 | 1.40M | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 1.40M | } | 63 | 1.40M | Store(First, d, out0 + x); | 64 | 1.40M | Store(Second, d, out1 + x); | 65 | 1.40M | Store(Third, d, out2 + x); | 66 | 1.40M | } | 67 | 1.40M | } | 68 | 193k | for (; x < w; x++) { | 69 | 111k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 111k | } else { | 81 | 111k | pixel_type First = in0[x]; | 82 | 111k | pixel_type Second = in1[x]; | 83 | 111k | pixel_type Third = in2[x]; | 84 | 111k | if (third) Third = PixelAdd(Third, First); | 85 | 111k | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 111k | } else if (second == 2) { | 88 | 111k | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 111k | } | 90 | 111k | out0[x] = First; | 91 | 111k | out1[x] = Second; | 92 | 111k | out2[x] = Third; | 93 | 111k | } | 94 | 111k | } | 95 | 82.7k | } |
void jxl::N_AVX2::InvRCTRow<5>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 185k | pixel_type* out2, size_t w) { | 33 | 185k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 185k | "Invalid transform type"); | 35 | 185k | int second = transform_type >> 1; | 36 | 185k | int third = transform_type & 1; | 37 | | | 38 | 185k | size_t x = 0; | 39 | 185k | const HWY_FULL(pixel_type) d; | 40 | 185k | const size_t N = Lanes(d); | 41 | 4.67M | for (; x + N - 1 < w; x += N) { | 42 | 4.49M | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 4.49M | } else { | 54 | 4.49M | auto First = Load(d, in0 + x); | 55 | 4.49M | auto Second = Load(d, in1 + x); | 56 | 4.49M | auto Third = Load(d, in2 + x); | 57 | 4.49M | if (third) Third = Add(Third, First); | 58 | 4.49M | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 4.49M | } else if (second == 2) { | 61 | 4.49M | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 4.49M | } | 63 | 4.49M | Store(First, d, out0 + x); | 64 | 4.49M | Store(Second, d, out1 + x); | 65 | 4.49M | Store(Third, d, out2 + x); | 66 | 4.49M | } | 67 | 4.49M | } | 68 | 567k | for (; x < w; x++) { | 69 | 382k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 382k | } else { | 81 | 382k | pixel_type First = in0[x]; | 82 | 382k | pixel_type Second = in1[x]; | 83 | 382k | pixel_type Third = in2[x]; | 84 | 382k | if (third) Third = PixelAdd(Third, First); | 85 | 382k | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 382k | } else if (second == 2) { | 88 | 382k | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 382k | } | 90 | 382k | out0[x] = First; | 91 | 382k | out1[x] = Second; | 92 | 382k | out2[x] = Third; | 93 | 382k | } | 94 | 382k | } | 95 | 185k | } |
void jxl::N_AVX2::InvRCTRow<6>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 6.11M | pixel_type* out2, size_t w) { | 33 | 6.11M | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 6.11M | "Invalid transform type"); | 35 | 6.11M | int second = transform_type >> 1; | 36 | 6.11M | int third = transform_type & 1; | 37 | | | 38 | 6.11M | size_t x = 0; | 39 | 6.11M | const HWY_FULL(pixel_type) d; | 40 | 6.11M | const size_t N = Lanes(d); | 41 | 10.0M | for (; x + N - 1 < w; x += N) { | 42 | 3.97M | if (transform_type == 6) { | 43 | 3.97M | auto Y = Load(d, in0 + x); | 44 | 3.97M | auto Co = Load(d, in1 + x); | 45 | 3.97M | auto Cg = Load(d, in2 + x); | 46 | 3.97M | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 3.97M | auto G = Add(Cg, Y); | 48 | 3.97M | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 3.97M | auto R = Add(Y, Co); | 50 | 3.97M | Store(R, d, out0 + x); | 51 | 3.97M | Store(G, d, out1 + x); | 52 | 3.97M | Store(Y, d, out2 + x); | 53 | 3.97M | } else { | 54 | 12 | auto First = Load(d, in0 + x); | 55 | 12 | auto Second = Load(d, in1 + x); | 56 | 12 | auto Third = Load(d, in2 + x); | 57 | 12 | if (third) Third = Add(Third, First); | 58 | 12 | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 12 | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 12 | Store(First, d, out0 + x); | 64 | 12 | Store(Second, d, out1 + x); | 65 | 12 | Store(Third, d, out2 + x); | 66 | 12 | } | 67 | 3.97M | } | 68 | 45.2M | for (; x < w; x++) { | 69 | 39.1M | if (transform_type == 6) { | 70 | 39.1M | pixel_type Y = in0[x]; | 71 | 39.1M | pixel_type Co = in1[x]; | 72 | 39.1M | pixel_type Cg = in2[x]; | 73 | 39.1M | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 39.1M | pixel_type G = PixelAdd(Cg, tmp); | 75 | 39.1M | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 39.1M | pixel_type R = PixelAdd(B, Co); | 77 | 39.1M | out0[x] = R; | 78 | 39.1M | out1[x] = G; | 79 | 39.1M | out2[x] = B; | 80 | 39.1M | } else { | 81 | 0 | pixel_type First = in0[x]; | 82 | 0 | pixel_type Second = in1[x]; | 83 | 0 | pixel_type Third = in2[x]; | 84 | 0 | if (third) Third = PixelAdd(Third, First); | 85 | 0 | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 0 | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 0 | out0[x] = First; | 91 | 0 | out1[x] = Second; | 92 | 0 | out2[x] = Third; | 93 | 0 | } | 94 | 39.1M | } | 95 | 6.11M | } |
Unexecuted instantiation: void jxl::N_SSE2::InvRCTRow<0>(int const*, int const*, int const*, int*, int*, int*, unsigned long) void jxl::N_SSE2::InvRCTRow<1>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 5.05k | pixel_type* out2, size_t w) { | 33 | 5.05k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 5.05k | "Invalid transform type"); | 35 | 5.05k | int second = transform_type >> 1; | 36 | 5.05k | int third = transform_type & 1; | 37 | | | 38 | 5.05k | size_t x = 0; | 39 | 5.05k | const HWY_FULL(pixel_type) d; | 40 | 5.05k | const size_t N = Lanes(d); | 41 | 147k | for (; x + N - 1 < w; x += N) { | 42 | 142k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 142k | } else { | 54 | 142k | auto First = Load(d, in0 + x); | 55 | 142k | auto Second = Load(d, in1 + x); | 56 | 142k | auto Third = Load(d, in2 + x); | 57 | 142k | if (third) Third = Add(Third, First); | 58 | 142k | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 142k | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 142k | Store(First, d, out0 + x); | 64 | 142k | Store(Second, d, out1 + x); | 65 | 142k | Store(Third, d, out2 + x); | 66 | 142k | } | 67 | 142k | } | 68 | 13.9k | for (; x < w; x++) { | 69 | 8.88k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 8.88k | } else { | 81 | 8.88k | pixel_type First = in0[x]; | 82 | 8.88k | pixel_type Second = in1[x]; | 83 | 8.88k | pixel_type Third = in2[x]; | 84 | 8.88k | if (third) Third = PixelAdd(Third, First); | 85 | 8.88k | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 8.88k | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 8.88k | out0[x] = First; | 91 | 8.88k | out1[x] = Second; | 92 | 8.88k | out2[x] = Third; | 93 | 8.88k | } | 94 | 8.88k | } | 95 | 5.05k | } |
void jxl::N_SSE2::InvRCTRow<2>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 3.75k | pixel_type* out2, size_t w) { | 33 | 3.75k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 3.75k | "Invalid transform type"); | 35 | 3.75k | int second = transform_type >> 1; | 36 | 3.75k | int third = transform_type & 1; | 37 | | | 38 | 3.75k | size_t x = 0; | 39 | 3.75k | const HWY_FULL(pixel_type) d; | 40 | 3.75k | const size_t N = Lanes(d); | 41 | 101k | for (; x + N - 1 < w; x += N) { | 42 | 97.5k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 97.5k | } else { | 54 | 97.5k | auto First = Load(d, in0 + x); | 55 | 97.5k | auto Second = Load(d, in1 + x); | 56 | 97.5k | auto Third = Load(d, in2 + x); | 57 | 97.5k | if (third) Third = Add(Third, First); | 58 | 97.5k | if (second == 1) { | 59 | 97.5k | Second = Add(Second, First); | 60 | 18.4E | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 97.5k | Store(First, d, out0 + x); | 64 | 97.5k | Store(Second, d, out1 + x); | 65 | 97.5k | Store(Third, d, out2 + x); | 66 | 97.5k | } | 67 | 97.5k | } | 68 | 6.63k | for (; x < w; x++) { | 69 | 2.88k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 2.88k | } else { | 81 | 2.88k | pixel_type First = in0[x]; | 82 | 2.88k | pixel_type Second = in1[x]; | 83 | 2.88k | pixel_type Third = in2[x]; | 84 | 2.88k | if (third) Third = PixelAdd(Third, First); | 85 | 2.88k | if (second == 1) { | 86 | 2.88k | Second = PixelAdd(Second, First); | 87 | 2.88k | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 2.88k | out0[x] = First; | 91 | 2.88k | out1[x] = Second; | 92 | 2.88k | out2[x] = Third; | 93 | 2.88k | } | 94 | 2.88k | } | 95 | 3.75k | } |
void jxl::N_SSE2::InvRCTRow<3>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 3.35k | pixel_type* out2, size_t w) { | 33 | 3.35k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 3.35k | "Invalid transform type"); | 35 | 3.35k | int second = transform_type >> 1; | 36 | 3.35k | int third = transform_type & 1; | 37 | | | 38 | 3.35k | size_t x = 0; | 39 | 3.35k | const HWY_FULL(pixel_type) d; | 40 | 3.35k | const size_t N = Lanes(d); | 41 | 80.5k | for (; x + N - 1 < w; x += N) { | 42 | 77.2k | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 77.2k | } else { | 54 | 77.2k | auto First = Load(d, in0 + x); | 55 | 77.2k | auto Second = Load(d, in1 + x); | 56 | 77.2k | auto Third = Load(d, in2 + x); | 57 | 77.2k | if (third) Third = Add(Third, First); | 58 | 77.3k | if (second == 1) { | 59 | 77.3k | Second = Add(Second, First); | 60 | 18.4E | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 77.2k | Store(First, d, out0 + x); | 64 | 77.2k | Store(Second, d, out1 + x); | 65 | 77.2k | Store(Third, d, out2 + x); | 66 | 77.2k | } | 67 | 77.2k | } | 68 | 6.73k | for (; x < w; x++) { | 69 | 3.37k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 3.37k | } else { | 81 | 3.37k | pixel_type First = in0[x]; | 82 | 3.37k | pixel_type Second = in1[x]; | 83 | 3.37k | pixel_type Third = in2[x]; | 84 | 3.37k | if (third) Third = PixelAdd(Third, First); | 85 | 3.37k | if (second == 1) { | 86 | 3.37k | Second = PixelAdd(Second, First); | 87 | 3.37k | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 3.37k | out0[x] = First; | 91 | 3.37k | out1[x] = Second; | 92 | 3.37k | out2[x] = Third; | 93 | 3.37k | } | 94 | 3.37k | } | 95 | 3.35k | } |
void jxl::N_SSE2::InvRCTRow<4>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 63.6k | pixel_type* out2, size_t w) { | 33 | 63.6k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 63.6k | "Invalid transform type"); | 35 | 63.6k | int second = transform_type >> 1; | 36 | 63.6k | int third = transform_type & 1; | 37 | | | 38 | 63.6k | size_t x = 0; | 39 | 63.6k | const HWY_FULL(pixel_type) d; | 40 | 63.6k | const size_t N = Lanes(d); | 41 | 3.61M | for (; x + N - 1 < w; x += N) { | 42 | 3.55M | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 3.55M | } else { | 54 | 3.55M | auto First = Load(d, in0 + x); | 55 | 3.55M | auto Second = Load(d, in1 + x); | 56 | 3.55M | auto Third = Load(d, in2 + x); | 57 | 3.55M | if (third) Third = Add(Third, First); | 58 | 3.55M | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 3.55M | } else if (second == 2) { | 61 | 3.55M | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 3.55M | } | 63 | 3.55M | Store(First, d, out0 + x); | 64 | 3.55M | Store(Second, d, out1 + x); | 65 | 3.55M | Store(Third, d, out2 + x); | 66 | 3.55M | } | 67 | 3.55M | } | 68 | 71.1k | for (; x < w; x++) { | 69 | 7.52k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 7.52k | } else { | 81 | 7.52k | pixel_type First = in0[x]; | 82 | 7.52k | pixel_type Second = in1[x]; | 83 | 7.52k | pixel_type Third = in2[x]; | 84 | 7.52k | if (third) Third = PixelAdd(Third, First); | 85 | 7.52k | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 7.52k | } else if (second == 2) { | 88 | 7.52k | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 7.52k | } | 90 | 7.52k | out0[x] = First; | 91 | 7.52k | out1[x] = Second; | 92 | 7.52k | out2[x] = Third; | 93 | 7.52k | } | 94 | 7.52k | } | 95 | 63.6k | } |
void jxl::N_SSE2::InvRCTRow<5>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 69.5k | pixel_type* out2, size_t w) { | 33 | 69.5k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 69.5k | "Invalid transform type"); | 35 | 69.5k | int second = transform_type >> 1; | 36 | 69.5k | int third = transform_type & 1; | 37 | | | 38 | 69.5k | size_t x = 0; | 39 | 69.5k | const HWY_FULL(pixel_type) d; | 40 | 69.5k | const size_t N = Lanes(d); | 41 | 3.51M | for (; x + N - 1 < w; x += N) { | 42 | 3.44M | if (transform_type == 6) { | 43 | 0 | auto Y = Load(d, in0 + x); | 44 | 0 | auto Co = Load(d, in1 + x); | 45 | 0 | auto Cg = Load(d, in2 + x); | 46 | 0 | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 0 | auto G = Add(Cg, Y); | 48 | 0 | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 0 | auto R = Add(Y, Co); | 50 | 0 | Store(R, d, out0 + x); | 51 | 0 | Store(G, d, out1 + x); | 52 | 0 | Store(Y, d, out2 + x); | 53 | 3.44M | } else { | 54 | 3.44M | auto First = Load(d, in0 + x); | 55 | 3.44M | auto Second = Load(d, in1 + x); | 56 | 3.44M | auto Third = Load(d, in2 + x); | 57 | 3.44M | if (third) Third = Add(Third, First); | 58 | 3.44M | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 3.44M | } else if (second == 2) { | 61 | 3.44M | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 3.44M | } | 63 | 3.44M | Store(First, d, out0 + x); | 64 | 3.44M | Store(Second, d, out1 + x); | 65 | 3.44M | Store(Third, d, out2 + x); | 66 | 3.44M | } | 67 | 3.44M | } | 68 | 72.6k | for (; x < w; x++) { | 69 | 3.10k | if (transform_type == 6) { | 70 | 0 | pixel_type Y = in0[x]; | 71 | 0 | pixel_type Co = in1[x]; | 72 | 0 | pixel_type Cg = in2[x]; | 73 | 0 | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 0 | pixel_type G = PixelAdd(Cg, tmp); | 75 | 0 | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 0 | pixel_type R = PixelAdd(B, Co); | 77 | 0 | out0[x] = R; | 78 | 0 | out1[x] = G; | 79 | 0 | out2[x] = B; | 80 | 3.10k | } else { | 81 | 3.10k | pixel_type First = in0[x]; | 82 | 3.10k | pixel_type Second = in1[x]; | 83 | 3.10k | pixel_type Third = in2[x]; | 84 | 3.10k | if (third) Third = PixelAdd(Third, First); | 85 | 3.10k | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 3.10k | } else if (second == 2) { | 88 | 3.10k | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 3.10k | } | 90 | 3.10k | out0[x] = First; | 91 | 3.10k | out1[x] = Second; | 92 | 3.10k | out2[x] = Third; | 93 | 3.10k | } | 94 | 3.10k | } | 95 | 69.5k | } |
void jxl::N_SSE2::InvRCTRow<6>(int const*, int const*, int const*, int*, int*, int*, unsigned long) Line | Count | Source | 32 | 87.5k | pixel_type* out2, size_t w) { | 33 | 87.5k | static_assert(transform_type >= 0 && transform_type < 7, | 34 | 87.5k | "Invalid transform type"); | 35 | 87.5k | int second = transform_type >> 1; | 36 | 87.5k | int third = transform_type & 1; | 37 | | | 38 | 87.5k | size_t x = 0; | 39 | 87.5k | const HWY_FULL(pixel_type) d; | 40 | 87.5k | const size_t N = Lanes(d); | 41 | 4.71M | for (; x + N - 1 < w; x += N) { | 42 | 4.62M | if (transform_type == 6) { | 43 | 4.62M | auto Y = Load(d, in0 + x); | 44 | 4.62M | auto Co = Load(d, in1 + x); | 45 | 4.62M | auto Cg = Load(d, in2 + x); | 46 | 4.62M | Y = Sub(Y, ShiftRight<1>(Cg)); | 47 | 4.62M | auto G = Add(Cg, Y); | 48 | 4.62M | Y = Sub(Y, ShiftRight<1>(Co)); | 49 | 4.62M | auto R = Add(Y, Co); | 50 | 4.62M | Store(R, d, out0 + x); | 51 | 4.62M | Store(G, d, out1 + x); | 52 | 4.62M | Store(Y, d, out2 + x); | 53 | 18.4E | } else { | 54 | 18.4E | auto First = Load(d, in0 + x); | 55 | 18.4E | auto Second = Load(d, in1 + x); | 56 | 18.4E | auto Third = Load(d, in2 + x); | 57 | 18.4E | if (third) Third = Add(Third, First); | 58 | 18.4E | if (second == 1) { | 59 | 0 | Second = Add(Second, First); | 60 | 18.4E | } else if (second == 2) { | 61 | 0 | Second = Add(Second, ShiftRight<1>(Add(First, Third))); | 62 | 0 | } | 63 | 18.4E | Store(First, d, out0 + x); | 64 | 18.4E | Store(Second, d, out1 + x); | 65 | 18.4E | Store(Third, d, out2 + x); | 66 | 18.4E | } | 67 | 4.62M | } | 68 | 121k | for (; x < w; x++) { | 69 | 33.8k | if (transform_type == 6) { | 70 | 33.8k | pixel_type Y = in0[x]; | 71 | 33.8k | pixel_type Co = in1[x]; | 72 | 33.8k | pixel_type Cg = in2[x]; | 73 | 33.8k | pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); | 74 | 33.8k | pixel_type G = PixelAdd(Cg, tmp); | 75 | 33.8k | pixel_type B = PixelAdd(tmp, -(Co >> 1)); | 76 | 33.8k | pixel_type R = PixelAdd(B, Co); | 77 | 33.8k | out0[x] = R; | 78 | 33.8k | out1[x] = G; | 79 | 33.8k | out2[x] = B; | 80 | 33.8k | } else { | 81 | 0 | pixel_type First = in0[x]; | 82 | 0 | pixel_type Second = in1[x]; | 83 | 0 | pixel_type Third = in2[x]; | 84 | 0 | if (third) Third = PixelAdd(Third, First); | 85 | 0 | if (second == 1) { | 86 | 0 | Second = PixelAdd(Second, First); | 87 | 0 | } else if (second == 2) { | 88 | 0 | Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); | 89 | 0 | } | 90 | 0 | out0[x] = First; | 91 | 0 | out1[x] = Second; | 92 | 0 | out2[x] = Third; | 93 | 0 | } | 94 | 33.8k | } | 95 | 87.5k | } |
|