/src/libjxl/lib/jpegli/upsample.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jpegli/upsample.h" |
7 | | |
8 | | #include <string.h> |
9 | | |
10 | | #undef HWY_TARGET_INCLUDE |
11 | | #define HWY_TARGET_INCLUDE "lib/jpegli/upsample.cc" |
12 | | #include <hwy/foreach_target.h> |
13 | | #include <hwy/highway.h> |
14 | | |
15 | | HWY_BEFORE_NAMESPACE(); |
16 | | namespace jpegli { |
17 | | namespace HWY_NAMESPACE { |
18 | | |
19 | | // These templates are not found via ADL. |
20 | | using hwy::HWY_NAMESPACE::Mul; |
21 | | using hwy::HWY_NAMESPACE::MulAdd; |
22 | | using hwy::HWY_NAMESPACE::Vec; |
23 | | |
24 | | #if HWY_CAP_GE512 |
25 | | using hwy::HWY_NAMESPACE::Half; |
26 | | using hwy::HWY_NAMESPACE::Vec; |
27 | | template <size_t i, class DF, class V> |
28 | 0 | HWY_INLINE Vec<Half<Half<DF>>> Quarter(const DF df, V v) { |
29 | 0 | using HF = Half<DF>; |
30 | 0 | using HHF = Half<HF>; |
31 | 0 | auto half = i >= 2 ? UpperHalf(HF(), v) : LowerHalf(HF(), v); |
32 | 0 | return i & 1 ? UpperHalf(HHF(), half) : LowerHalf(HHF(), half); |
33 | 0 | } Unexecuted instantiation: decltype (Zero((hwy::N_AVX3::Simd<float, 16ul, 0>::Half::Half)())) jpegli::N_AVX3::Quarter<0ul, hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>) Unexecuted instantiation: decltype (Zero((hwy::N_AVX3::Simd<float, 16ul, 0>::Half::Half)())) jpegli::N_AVX3::Quarter<1ul, hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>) Unexecuted instantiation: decltype (Zero((hwy::N_AVX3::Simd<float, 16ul, 0>::Half::Half)())) jpegli::N_AVX3::Quarter<2ul, hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>) Unexecuted instantiation: decltype (Zero((hwy::N_AVX3::Simd<float, 16ul, 0>::Half::Half)())) jpegli::N_AVX3::Quarter<3ul, hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>) Unexecuted instantiation: decltype (Zero((hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>::Half::Half)())) jpegli::N_AVX3_ZEN4::Quarter<0ul, hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float> >(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float>) Unexecuted instantiation: decltype (Zero((hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>::Half::Half)())) jpegli::N_AVX3_ZEN4::Quarter<1ul, hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float> >(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float>) Unexecuted instantiation: decltype (Zero((hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>::Half::Half)())) jpegli::N_AVX3_ZEN4::Quarter<2ul, hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float> >(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float>) Unexecuted instantiation: decltype (Zero((hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>::Half::Half)())) jpegli::N_AVX3_ZEN4::Quarter<3ul, hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float> >(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float>) Unexecuted instantiation: decltype (Zero((hwy::N_AVX3_SPR::Simd<float, 16ul, 0>::Half::Half)())) jpegli::N_AVX3_SPR::Quarter<0ul, hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float> >(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float>) Unexecuted instantiation: decltype (Zero((hwy::N_AVX3_SPR::Simd<float, 16ul, 0>::Half::Half)())) jpegli::N_AVX3_SPR::Quarter<1ul, hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float> >(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float>) Unexecuted instantiation: decltype (Zero((hwy::N_AVX3_SPR::Simd<float, 16ul, 0>::Half::Half)())) jpegli::N_AVX3_SPR::Quarter<2ul, hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float> >(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float>) Unexecuted instantiation: decltype (Zero((hwy::N_AVX3_SPR::Simd<float, 16ul, 0>::Half::Half)())) jpegli::N_AVX3_SPR::Quarter<3ul, hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float> >(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float>) |
34 | | |
35 | | template <class DF, class V> |
36 | 0 | HWY_INLINE Vec<DF> Concat4(const DF df, V v0, V v1, V v2, V v3) { |
37 | 0 | using HF = Half<DF>; |
38 | 0 | return Combine(DF(), Combine(HF(), v3, v2), Combine(HF(), v1, v0)); |
39 | 0 | } Unexecuted instantiation: decltype (Zero((hwy::N_AVX3::Simd<float, 16ul, 0>)())) jpegli::N_AVX3::Concat4<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec128<float, 4ul> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec128<float, 4ul>, hwy::N_AVX3::Vec128<float, 4ul>, hwy::N_AVX3::Vec128<float, 4ul>, hwy::N_AVX3::Vec128<float, 4ul>) Unexecuted instantiation: decltype (Zero((hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>)())) jpegli::N_AVX3_ZEN4::Concat4<hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec128<float, 4ul> >(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec128<float, 4ul>, hwy::N_AVX3_ZEN4::Vec128<float, 4ul>, hwy::N_AVX3_ZEN4::Vec128<float, 4ul>, hwy::N_AVX3_ZEN4::Vec128<float, 4ul>) Unexecuted instantiation: decltype (Zero((hwy::N_AVX3_SPR::Simd<float, 16ul, 0>)())) jpegli::N_AVX3_SPR::Concat4<hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec128<float, 4ul> >(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec128<float, 4ul>, hwy::N_AVX3_SPR::Vec128<float, 4ul>, hwy::N_AVX3_SPR::Vec128<float, 4ul>, hwy::N_AVX3_SPR::Vec128<float, 4ul>) |
40 | | |
41 | | #endif |
42 | | |
43 | | // Stores v0[0], v1[0], v0[1], v1[1], ... to mem, in this order. Mem must be |
44 | | // aligned. |
45 | | template <class DF, class V, typename T> |
46 | 162M | void StoreInterleaved(const DF df, V v0, V v1, T* mem) { |
47 | 162M | static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types"); |
48 | | #if HWY_TARGET == HWY_SCALAR |
49 | | Store(v0, df, mem); |
50 | | Store(v1, df, mem + 1); |
51 | | #elif !HWY_CAP_GE256 |
52 | | Store(InterleaveLower(df, v0, v1), df, mem); |
53 | | Store(InterleaveUpper(df, v0, v1), df, mem + Lanes(df)); |
54 | | #else |
55 | 50.9M | if (!HWY_CAP_GE512 || Lanes(df) == 8) { |
56 | 50.9M | auto t0 = InterleaveLower(df, v0, v1); |
57 | 50.9M | auto t1 = InterleaveUpper(df, v0, v1); |
58 | 50.9M | Store(ConcatLowerLower(df, t1, t0), df, mem); |
59 | 50.9M | Store(ConcatUpperUpper(df, t1, t0), df, mem + Lanes(df)); |
60 | 50.9M | } else { |
61 | | #if HWY_CAP_GE512 |
62 | | auto t0 = InterleaveLower(df, v0, v1); |
63 | | auto t1 = InterleaveUpper(df, v0, v1); |
64 | | Store(Concat4(df, Quarter<0>(df, t0), Quarter<0>(df, t1), |
65 | | Quarter<1>(df, t0), Quarter<1>(df, t1)), |
66 | | df, mem); |
67 | | Store(Concat4(df, Quarter<2>(df, t0), Quarter<2>(df, t1), |
68 | | Quarter<3>(df, t0), Quarter<3>(df, t1)), |
69 | | df, mem + Lanes(df)); |
70 | | #endif |
71 | 0 | } |
72 | | #endif |
73 | 162M | } void jpegli::N_SSE4::StoreInterleaved<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>, float>(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, float*) Line | Count | Source | 46 | 54.3M | void StoreInterleaved(const DF df, V v0, V v1, T* mem) { | 47 | 54.3M | static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types"); | 48 | | #if HWY_TARGET == HWY_SCALAR | 49 | | Store(v0, df, mem); | 50 | | Store(v1, df, mem + 1); | 51 | | #elif !HWY_CAP_GE256 | 52 | 54.3M | Store(InterleaveLower(df, v0, v1), df, mem); | 53 | 54.3M | Store(InterleaveUpper(df, v0, v1), df, mem + Lanes(df)); | 54 | | #else | 55 | | if (!HWY_CAP_GE512 || Lanes(df) == 8) { | 56 | | auto t0 = InterleaveLower(df, v0, v1); | 57 | | auto t1 = InterleaveUpper(df, v0, v1); | 58 | | Store(ConcatLowerLower(df, t1, t0), df, mem); | 59 | | Store(ConcatUpperUpper(df, t1, t0), df, mem + Lanes(df)); | 60 | | } else { | 61 | | #if HWY_CAP_GE512 | 62 | | auto t0 = InterleaveLower(df, v0, v1); | 63 | | auto t1 = InterleaveUpper(df, v0, v1); | 64 | | Store(Concat4(df, Quarter<0>(df, t0), Quarter<0>(df, t1), | 65 | | Quarter<1>(df, t0), Quarter<1>(df, t1)), | 66 | | df, mem); | 67 | | Store(Concat4(df, Quarter<2>(df, t0), Quarter<2>(df, t1), | 68 | | Quarter<3>(df, t0), Quarter<3>(df, t1)), | 69 | | df, mem + Lanes(df)); | 70 | | #endif | 71 | | } | 72 | | #endif | 73 | 54.3M | } |
void jpegli::N_AVX2::StoreInterleaved<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>, float>(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, float*) Line | Count | Source | 46 | 50.9M | void StoreInterleaved(const DF df, V v0, V v1, T* mem) { | 47 | 50.9M | static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types"); | 48 | | #if HWY_TARGET == HWY_SCALAR | 49 | | Store(v0, df, mem); | 50 | | Store(v1, df, mem + 1); | 51 | | #elif !HWY_CAP_GE256 | 52 | | Store(InterleaveLower(df, v0, v1), df, mem); | 53 | | Store(InterleaveUpper(df, v0, v1), df, mem + Lanes(df)); | 54 | | #else | 55 | 50.9M | if (!HWY_CAP_GE512 || Lanes(df) == 8) { | 56 | 50.9M | auto t0 = InterleaveLower(df, v0, v1); | 57 | 50.9M | auto t1 = InterleaveUpper(df, v0, v1); | 58 | 50.9M | Store(ConcatLowerLower(df, t1, t0), df, mem); | 59 | 50.9M | Store(ConcatUpperUpper(df, t1, t0), df, mem + Lanes(df)); | 60 | 50.9M | } else { | 61 | | #if HWY_CAP_GE512 | 62 | | auto t0 = InterleaveLower(df, v0, v1); | 63 | | auto t1 = InterleaveUpper(df, v0, v1); | 64 | | Store(Concat4(df, Quarter<0>(df, t0), Quarter<0>(df, t1), | 65 | | Quarter<1>(df, t0), Quarter<1>(df, t1)), | 66 | | df, mem); | 67 | | Store(Concat4(df, Quarter<2>(df, t0), Quarter<2>(df, t1), | 68 | | Quarter<3>(df, t0), Quarter<3>(df, t1)), | 69 | | df, mem + Lanes(df)); | 70 | | #endif | 71 | 0 | } | 72 | 50.9M | #endif | 73 | 50.9M | } |
Unexecuted instantiation: void jpegli::N_AVX3::StoreInterleaved<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>, float>(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, float*) Unexecuted instantiation: void jpegli::N_AVX3_ZEN4::StoreInterleaved<hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float>, float>(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, float*) Unexecuted instantiation: void jpegli::N_AVX3_SPR::StoreInterleaved<hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float>, float>(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, float*) void jpegli::N_SSE2::StoreInterleaved<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>, float>(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, float*) Line | Count | Source | 46 | 56.7M | void StoreInterleaved(const DF df, V v0, V v1, T* mem) { | 47 | 56.7M | static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types"); | 48 | | #if HWY_TARGET == HWY_SCALAR | 49 | | Store(v0, df, mem); | 50 | | Store(v1, df, mem + 1); | 51 | | #elif !HWY_CAP_GE256 | 52 | 56.7M | Store(InterleaveLower(df, v0, v1), df, mem); | 53 | 56.7M | Store(InterleaveUpper(df, v0, v1), df, mem + Lanes(df)); | 54 | | #else | 55 | | if (!HWY_CAP_GE512 || Lanes(df) == 8) { | 56 | | auto t0 = InterleaveLower(df, v0, v1); | 57 | | auto t1 = InterleaveUpper(df, v0, v1); | 58 | | Store(ConcatLowerLower(df, t1, t0), df, mem); | 59 | | Store(ConcatUpperUpper(df, t1, t0), df, mem + Lanes(df)); | 60 | | } else { | 61 | | #if HWY_CAP_GE512 | 62 | | auto t0 = InterleaveLower(df, v0, v1); | 63 | | auto t1 = InterleaveUpper(df, v0, v1); | 64 | | Store(Concat4(df, Quarter<0>(df, t0), Quarter<0>(df, t1), | 65 | | Quarter<1>(df, t0), Quarter<1>(df, t1)), | 66 | | df, mem); | 67 | | Store(Concat4(df, Quarter<2>(df, t0), Quarter<2>(df, t1), | 68 | | Quarter<3>(df, t0), Quarter<3>(df, t1)), | 69 | | df, mem + Lanes(df)); | 70 | | #endif | 71 | | } | 72 | | #endif | 73 | 56.7M | } |
|
74 | | |
75 | | void Upsample2Horizontal(float* JXL_RESTRICT row, |
76 | 14.4M | float* JXL_RESTRICT scratch_space, size_t len_out) { |
77 | 14.4M | HWY_FULL(float) df; |
78 | 14.4M | auto threefour = Set(df, 0.75f); |
79 | 14.4M | auto onefour = Set(df, 0.25f); |
80 | 14.4M | const size_t len_in = (len_out + 1) >> 1; |
81 | 14.4M | memcpy(scratch_space, row, len_in * sizeof(row[0])); |
82 | 14.4M | scratch_space[-1] = scratch_space[0]; |
83 | 14.4M | scratch_space[len_in] = scratch_space[len_in - 1]; |
84 | 176M | for (size_t x = 0; x < len_in; x += Lanes(df)) { |
85 | 162M | auto current = Mul(Load(df, scratch_space + x), threefour); |
86 | 162M | auto prev = LoadU(df, scratch_space + x - 1); |
87 | 162M | auto next = LoadU(df, scratch_space + x + 1); |
88 | 162M | auto left = MulAdd(onefour, prev, current); |
89 | 162M | auto right = MulAdd(onefour, next, current); |
90 | 162M | StoreInterleaved(df, left, right, row + x * 2); |
91 | 162M | } |
92 | 14.4M | } jpegli::N_SSE4::Upsample2Horizontal(float*, float*, unsigned long) Line | Count | Source | 76 | 4.74M | float* JXL_RESTRICT scratch_space, size_t len_out) { | 77 | 4.74M | HWY_FULL(float) df; | 78 | 4.74M | auto threefour = Set(df, 0.75f); | 79 | 4.74M | auto onefour = Set(df, 0.25f); | 80 | 4.74M | const size_t len_in = (len_out + 1) >> 1; | 81 | 4.74M | memcpy(scratch_space, row, len_in * sizeof(row[0])); | 82 | 4.74M | scratch_space[-1] = scratch_space[0]; | 83 | 4.74M | scratch_space[len_in] = scratch_space[len_in - 1]; | 84 | 59.1M | for (size_t x = 0; x < len_in; x += Lanes(df)) { | 85 | 54.3M | auto current = Mul(Load(df, scratch_space + x), threefour); | 86 | 54.3M | auto prev = LoadU(df, scratch_space + x - 1); | 87 | 54.3M | auto next = LoadU(df, scratch_space + x + 1); | 88 | 54.3M | auto left = MulAdd(onefour, prev, current); | 89 | 54.3M | auto right = MulAdd(onefour, next, current); | 90 | 54.3M | StoreInterleaved(df, left, right, row + x * 2); | 91 | 54.3M | } | 92 | 4.74M | } |
jpegli::N_AVX2::Upsample2Horizontal(float*, float*, unsigned long) Line | Count | Source | 76 | 3.28M | float* JXL_RESTRICT scratch_space, size_t len_out) { | 77 | 3.28M | HWY_FULL(float) df; | 78 | 3.28M | auto threefour = Set(df, 0.75f); | 79 | 3.28M | auto onefour = Set(df, 0.25f); | 80 | 3.28M | const size_t len_in = (len_out + 1) >> 1; | 81 | 3.28M | memcpy(scratch_space, row, len_in * sizeof(row[0])); | 82 | 3.28M | scratch_space[-1] = scratch_space[0]; | 83 | 3.28M | scratch_space[len_in] = scratch_space[len_in - 1]; | 84 | 54.2M | for (size_t x = 0; x < len_in; x += Lanes(df)) { | 85 | 50.9M | auto current = Mul(Load(df, scratch_space + x), threefour); | 86 | 50.9M | auto prev = LoadU(df, scratch_space + x - 1); | 87 | 50.9M | auto next = LoadU(df, scratch_space + x + 1); | 88 | 50.9M | auto left = MulAdd(onefour, prev, current); | 89 | 50.9M | auto right = MulAdd(onefour, next, current); | 90 | 50.9M | StoreInterleaved(df, left, right, row + x * 2); | 91 | 50.9M | } | 92 | 3.28M | } |
Unexecuted instantiation: jpegli::N_AVX3::Upsample2Horizontal(float*, float*, unsigned long) Unexecuted instantiation: jpegli::N_AVX3_ZEN4::Upsample2Horizontal(float*, float*, unsigned long) Unexecuted instantiation: jpegli::N_AVX3_SPR::Upsample2Horizontal(float*, float*, unsigned long) jpegli::N_SSE2::Upsample2Horizontal(float*, float*, unsigned long) Line | Count | Source | 76 | 6.39M | float* JXL_RESTRICT scratch_space, size_t len_out) { | 77 | 6.39M | HWY_FULL(float) df; | 78 | 6.39M | auto threefour = Set(df, 0.75f); | 79 | 6.39M | auto onefour = Set(df, 0.25f); | 80 | 6.39M | const size_t len_in = (len_out + 1) >> 1; | 81 | 6.39M | memcpy(scratch_space, row, len_in * sizeof(row[0])); | 82 | 6.39M | scratch_space[-1] = scratch_space[0]; | 83 | 6.39M | scratch_space[len_in] = scratch_space[len_in - 1]; | 84 | 63.1M | for (size_t x = 0; x < len_in; x += Lanes(df)) { | 85 | 56.7M | auto current = Mul(Load(df, scratch_space + x), threefour); | 86 | 56.7M | auto prev = LoadU(df, scratch_space + x - 1); | 87 | 56.7M | auto next = LoadU(df, scratch_space + x + 1); | 88 | 56.7M | auto left = MulAdd(onefour, prev, current); | 89 | 56.7M | auto right = MulAdd(onefour, next, current); | 90 | 56.7M | StoreInterleaved(df, left, right, row + x * 2); | 91 | 56.7M | } | 92 | 6.39M | } |
|
93 | | |
94 | | void Upsample2Vertical(const float* JXL_RESTRICT row_top, |
95 | | const float* JXL_RESTRICT row_mid, |
96 | | const float* JXL_RESTRICT row_bot, |
97 | | float* JXL_RESTRICT row_out0, |
98 | 6.10M | float* JXL_RESTRICT row_out1, size_t len) { |
99 | 6.10M | HWY_FULL(float) df; |
100 | 6.10M | auto threefour = Set(df, 0.75f); |
101 | 6.10M | auto onefour = Set(df, 0.25f); |
102 | 106M | for (size_t x = 0; x < len; x += Lanes(df)) { |
103 | 100M | auto it = Load(df, row_top + x); |
104 | 100M | auto im = Load(df, row_mid + x); |
105 | 100M | auto ib = Load(df, row_bot + x); |
106 | 100M | auto im_scaled = Mul(im, threefour); |
107 | 100M | Store(MulAdd(it, onefour, im_scaled), df, row_out0 + x); |
108 | 100M | Store(MulAdd(ib, onefour, im_scaled), df, row_out1 + x); |
109 | 100M | } |
110 | 6.10M | } jpegli::N_SSE4::Upsample2Vertical(float const*, float const*, float const*, float*, float*, unsigned long) Line | Count | Source | 98 | 1.84M | float* JXL_RESTRICT row_out1, size_t len) { | 99 | 1.84M | HWY_FULL(float) df; | 100 | 1.84M | auto threefour = Set(df, 0.75f); | 101 | 1.84M | auto onefour = Set(df, 0.25f); | 102 | 36.4M | for (size_t x = 0; x < len; x += Lanes(df)) { | 103 | 34.6M | auto it = Load(df, row_top + x); | 104 | 34.6M | auto im = Load(df, row_mid + x); | 105 | 34.6M | auto ib = Load(df, row_bot + x); | 106 | 34.6M | auto im_scaled = Mul(im, threefour); | 107 | 34.6M | Store(MulAdd(it, onefour, im_scaled), df, row_out0 + x); | 108 | 34.6M | Store(MulAdd(ib, onefour, im_scaled), df, row_out1 + x); | 109 | 34.6M | } | 110 | 1.84M | } |
jpegli::N_AVX2::Upsample2Vertical(float const*, float const*, float const*, float*, float*, unsigned long) Line | Count | Source | 98 | 1.98M | float* JXL_RESTRICT row_out1, size_t len) { | 99 | 1.98M | HWY_FULL(float) df; | 100 | 1.98M | auto threefour = Set(df, 0.75f); | 101 | 1.98M | auto onefour = Set(df, 0.25f); | 102 | 38.4M | for (size_t x = 0; x < len; x += Lanes(df)) { | 103 | 36.4M | auto it = Load(df, row_top + x); | 104 | 36.4M | auto im = Load(df, row_mid + x); | 105 | 36.4M | auto ib = Load(df, row_bot + x); | 106 | 36.4M | auto im_scaled = Mul(im, threefour); | 107 | 36.4M | Store(MulAdd(it, onefour, im_scaled), df, row_out0 + x); | 108 | 36.4M | Store(MulAdd(ib, onefour, im_scaled), df, row_out1 + x); | 109 | 36.4M | } | 110 | 1.98M | } |
Unexecuted instantiation: jpegli::N_AVX3::Upsample2Vertical(float const*, float const*, float const*, float*, float*, unsigned long) Unexecuted instantiation: jpegli::N_AVX3_ZEN4::Upsample2Vertical(float const*, float const*, float const*, float*, float*, unsigned long) Unexecuted instantiation: jpegli::N_AVX3_SPR::Upsample2Vertical(float const*, float const*, float const*, float*, float*, unsigned long) jpegli::N_SSE2::Upsample2Vertical(float const*, float const*, float const*, float*, float*, unsigned long) Line | Count | Source | 98 | 2.27M | float* JXL_RESTRICT row_out1, size_t len) { | 99 | 2.27M | HWY_FULL(float) df; | 100 | 2.27M | auto threefour = Set(df, 0.75f); | 101 | 2.27M | auto onefour = Set(df, 0.25f); | 102 | 31.9M | for (size_t x = 0; x < len; x += Lanes(df)) { | 103 | 29.6M | auto it = Load(df, row_top + x); | 104 | 29.6M | auto im = Load(df, row_mid + x); | 105 | 29.6M | auto ib = Load(df, row_bot + x); | 106 | 29.6M | auto im_scaled = Mul(im, threefour); | 107 | 29.6M | Store(MulAdd(it, onefour, im_scaled), df, row_out0 + x); | 108 | 29.6M | Store(MulAdd(ib, onefour, im_scaled), df, row_out1 + x); | 109 | 29.6M | } | 110 | 2.27M | } |
|
111 | | |
112 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
113 | | } // namespace HWY_NAMESPACE |
114 | | } // namespace jpegli |
115 | | HWY_AFTER_NAMESPACE(); |
116 | | |
117 | | #if HWY_ONCE |
118 | | namespace jpegli { |
119 | | |
120 | | HWY_EXPORT(Upsample2Horizontal); |
121 | | HWY_EXPORT(Upsample2Vertical); |
122 | | |
123 | | void Upsample2Horizontal(float* JXL_RESTRICT row, |
124 | 14.4M | float* JXL_RESTRICT scratch_space, size_t len_out) { |
125 | 14.4M | return HWY_DYNAMIC_DISPATCH(Upsample2Horizontal)(row, scratch_space, len_out); |
126 | 14.4M | } |
127 | | |
128 | | void Upsample2Vertical(const float* JXL_RESTRICT row_top, |
129 | | const float* JXL_RESTRICT row_mid, |
130 | | const float* JXL_RESTRICT row_bot, |
131 | | float* JXL_RESTRICT row_out0, |
132 | 6.10M | float* JXL_RESTRICT row_out1, size_t len) { |
133 | 6.10M | return HWY_DYNAMIC_DISPATCH(Upsample2Vertical)(row_top, row_mid, row_bot, |
134 | 6.10M | row_out0, row_out1, len); |
135 | 6.10M | } |
136 | | } // namespace jpegli |
137 | | #endif // HWY_ONCE |