/src/libjxl/lib/jxl/simd_util.cc
Line | Count | Source |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/simd_util.h" |
7 | | |
8 | | #include <cstddef> |
9 | | #include <cstdint> |
10 | | |
11 | | #include "lib/jxl/base/compiler_specific.h" |
12 | | |
13 | | #undef HWY_TARGET_INCLUDE |
14 | | #define HWY_TARGET_INCLUDE "lib/jxl/simd_util.cc" |
15 | | #include <hwy/foreach_target.h> |
16 | | #include <hwy/highway.h> |
17 | | |
18 | | HWY_BEFORE_NAMESPACE(); |
19 | | namespace jxl { |
20 | | namespace HWY_NAMESPACE { |
21 | | |
22 | | using hwy::HWY_NAMESPACE::GetLane; |
23 | | using hwy::HWY_NAMESPACE::IfThenElseZero; |
24 | | using hwy::HWY_NAMESPACE::Iota; |
25 | | using hwy::HWY_NAMESPACE::LoadU; |
26 | | using hwy::HWY_NAMESPACE::Lt; |
27 | | using hwy::HWY_NAMESPACE::Max; |
28 | | using hwy::HWY_NAMESPACE::MaxOfLanes; |
29 | | using hwy::HWY_NAMESPACE::Set; |
30 | | |
31 | 40.2M | size_t MaxVectorSize() { |
32 | 40.2M | HWY_FULL(float) df; |
33 | 40.2M | return Lanes(df) * sizeof(float); |
34 | 40.2M | } Unexecuted instantiation: jxl::N_SSE4::MaxVectorSize() jxl::N_AVX2::MaxVectorSize() Line | Count | Source | 31 | 40.2M | size_t MaxVectorSize() { | 32 | 40.2M | HWY_FULL(float) df; | 33 | 40.2M | return Lanes(df) * sizeof(float); | 34 | 40.2M | } |
Unexecuted instantiation: jxl::N_AVX3::MaxVectorSize() Unexecuted instantiation: jxl::N_AVX3_ZEN4::MaxVectorSize() Unexecuted instantiation: jxl::N_AVX3_SPR::MaxVectorSize() Unexecuted instantiation: jxl::N_SSE2::MaxVectorSize() |
35 | | |
36 | 290k | uint32_t MaxValue(uint32_t* JXL_RESTRICT data, size_t len) { |
37 | 290k | HWY_FULL(uint32_t) du; |
38 | 290k | size_t last_full = Lanes(du) * (len / Lanes(du)); |
39 | 290k | auto max = Set(du, 0); |
40 | 8.49M | for (size_t i = 0; i < last_full; i += Lanes(du)) { |
41 | 8.20M | max = Max(max, LoadU(du, data + i)); |
42 | 8.20M | } |
43 | 290k | if (last_full < len) { |
44 | 88.5k | const auto stop = Set(du, len); |
45 | 88.5k | const auto fence = Iota(du, last_full); |
46 | 88.5k | const auto take = Lt(fence, stop); |
47 | 88.5k | max = Max(max, IfThenElseZero(take, LoadU(du, data + last_full))); |
48 | 88.5k | } |
49 | 290k | return GetLane(MaxOfLanes(du, max)); |
50 | 290k | } Unexecuted instantiation: jxl::N_SSE4::MaxValue(unsigned int*, unsigned long) jxl::N_AVX2::MaxValue(unsigned int*, unsigned long) Line | Count | Source | 36 | 290k | uint32_t MaxValue(uint32_t* JXL_RESTRICT data, size_t len) { | 37 | 290k | HWY_FULL(uint32_t) du; | 38 | 290k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 39 | 290k | auto max = Set(du, 0); | 40 | 8.49M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 41 | 8.20M | max = Max(max, LoadU(du, data + i)); | 42 | 8.20M | } | 43 | 290k | if (last_full < len) { | 44 | 88.5k | const auto stop = Set(du, len); | 45 | 88.5k | const auto fence = Iota(du, last_full); | 46 | 88.5k | const auto take = Lt(fence, stop); | 47 | 88.5k | max = Max(max, IfThenElseZero(take, LoadU(du, data + last_full))); | 48 | 88.5k | } | 49 | 290k | return GetLane(MaxOfLanes(du, max)); | 50 | 290k | } |
Unexecuted instantiation: jxl::N_AVX3::MaxValue(unsigned int*, unsigned long) Unexecuted instantiation: jxl::N_AVX3_ZEN4::MaxValue(unsigned int*, unsigned long) Unexecuted instantiation: jxl::N_AVX3_SPR::MaxValue(unsigned int*, unsigned long) Unexecuted instantiation: jxl::N_SSE2::MaxValue(unsigned int*, unsigned long) |
51 | | |
52 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
53 | | } // namespace HWY_NAMESPACE |
54 | | } // namespace jxl |
55 | | HWY_AFTER_NAMESPACE(); |
56 | | |
57 | | #if HWY_ONCE |
58 | | namespace jxl { |
59 | | |
60 | | HWY_EXPORT(MaxVectorSize); |
61 | | HWY_EXPORT(MaxValue); |
62 | | |
63 | 40.2M | size_t MaxVectorSize() { |
64 | | // Ideally HWY framework should provide us this value. |
65 | | // Less than ideal is to check all available targets and choose maximal. |
66 | | // As for now, we just ask current active target, assuming it won't change. |
67 | 40.2M | return HWY_DYNAMIC_DISPATCH(MaxVectorSize)(); |
68 | 40.2M | } |
69 | | |
70 | 290k | uint32_t MaxValue(uint32_t* JXL_RESTRICT data, size_t len) { |
71 | 290k | return HWY_DYNAMIC_DISPATCH(MaxValue)(data, len); |
72 | 290k | } |
73 | | |
74 | | } // namespace jxl |
75 | | #endif |