/src/Simd/src/Simd/SimdSse41ReduceGray4x4.cpp
Line | Count | Source |
1 | | /* |
2 | | * Simd Library (http://ermig1979.github.io/Simd). |
3 | | * |
4 | | * Copyright (c) 2011-2022 Yermalayeu Ihar. |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person obtaining a copy |
7 | | * of this software and associated documentation files (the "Software"), to deal |
8 | | * in the Software without restriction, including without limitation the rights |
9 | | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | | * copies of the Software, and to permit persons to whom the Software is |
11 | | * furnished to do so, subject to the following conditions: |
12 | | * |
13 | | * The above copyright notice and this permission notice shall be included in |
14 | | * all copies or substantial portions of the Software. |
15 | | * |
16 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
19 | | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
21 | | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | #include "Simd/SimdMemory.h" |
25 | | #include "Simd/SimdStore.h" |
26 | | |
27 | | namespace Simd |
28 | | { |
29 | | #ifdef SIMD_SSE41_ENABLE |
30 | | namespace Sse41 |
31 | | { |
32 | | namespace |
33 | | { |
34 | | struct Buffer |
35 | | { |
36 | | Buffer(size_t width) |
37 | 0 | { |
38 | 0 | _p = Allocate(sizeof(uint16_t) * 4 * width); |
39 | 0 | src0 = (uint16_t*)_p; |
40 | 0 | src1 = src0 + width; |
41 | 0 | src2 = src1 + width; |
42 | 0 | src3 = src2 + width; |
43 | 0 | } |
44 | | |
45 | | ~Buffer() |
46 | 0 | { |
47 | 0 | Free(_p); |
48 | 0 | } |
49 | | |
50 | | uint16_t * src0; |
51 | | uint16_t * src1; |
52 | | uint16_t * src2; |
53 | | uint16_t * src3; |
54 | | private: |
55 | | void * _p; |
56 | | }; |
57 | | } |
58 | | |
59 | | SIMD_INLINE __m128i DivideBy64(__m128i value) |
60 | 0 | { |
61 | 0 | return _mm_srli_epi16(_mm_add_epi16(value, K16_0020), 6); |
62 | 0 | } |
63 | | |
64 | | const __m128i K8_01_03 = SIMD_MM_SET2_EPI8(1, 3); |
65 | | const __m128i K8_03_01 = SIMD_MM_SET2_EPI8(3, 1); |
66 | | |
67 | | SIMD_INLINE __m128i BinomialSum16(const __m128i & ab, const __m128i & cd) |
68 | 0 | { |
69 | 0 | return _mm_add_epi16(_mm_maddubs_epi16(ab, K8_01_03), _mm_maddubs_epi16(cd, K8_03_01)); |
70 | 0 | } |
71 | | |
72 | | SIMD_INLINE __m128i ReduceColNose(const uint8_t *src) |
73 | 0 | { |
74 | 0 | const __m128i t1 = _mm_loadu_si128((__m128i*)src); |
75 | 0 | const __m128i t2 = _mm_loadu_si128((__m128i*)(src + 1)); |
76 | 0 | return BinomialSum16(LoadBeforeFirst<1>(t1), t2); |
77 | 0 | } |
78 | | |
79 | | SIMD_INLINE __m128i ReduceColBody(const uint8_t *src) |
80 | 0 | { |
81 | 0 | const __m128i t0 = _mm_loadu_si128((__m128i*)(src - 1)); |
82 | 0 | const __m128i t2 = _mm_loadu_si128((__m128i*)(src + 1)); |
83 | 0 | return BinomialSum16(t0, t2); |
84 | 0 | } |
85 | | |
86 | | template <bool even> SIMD_INLINE __m128i ReduceColTail(const uint8_t *src); |
87 | | |
88 | | template <> SIMD_INLINE __m128i ReduceColTail<true>(const uint8_t *src) |
89 | 0 | { |
90 | 0 | const __m128i t0 = _mm_loadu_si128((__m128i*)(src - 1)); |
91 | 0 | const __m128i t1 = _mm_loadu_si128((__m128i*)src); |
92 | 0 | return BinomialSum16(t0, LoadAfterLast<1>(t1)); |
93 | 0 | } |
94 | | |
95 | | template <> SIMD_INLINE __m128i ReduceColTail<false>(const uint8_t *src) |
96 | 0 | { |
97 | 0 | const __m128i t0 = _mm_loadu_si128((__m128i*)(src - 1)); |
98 | 0 | return BinomialSum16(t0, LoadAfterLast<1>(LoadAfterLast<1>(t0))); |
99 | 0 | } |
100 | | |
101 | | template <bool align> SIMD_INLINE __m128i ReduceRow(const Buffer & buffer, size_t offset) |
102 | 0 | { |
103 | 0 | return _mm_packus_epi16(_mm_and_si128(DivideBy64(BinomialSum16( |
104 | 0 | Load<align>((__m128i*)(buffer.src0 + offset)), Load<align>((__m128i*)(buffer.src1 + offset)), |
105 | 0 | Load<align>((__m128i*)(buffer.src2 + offset)), Load<align>((__m128i*)(buffer.src3 + offset)))), K16_00FF), K_ZERO); |
106 | 0 | } Unexecuted instantiation: SimdSse41ReduceGray4x4.cpp:long long __vector(2) Simd::Sse41::ReduceRow<true>(Simd::Sse41::(anonymous namespace)::Buffer const&, unsigned long) Unexecuted instantiation: SimdSse41ReduceGray4x4.cpp:long long __vector(2) Simd::Sse41::ReduceRow<false>(Simd::Sse41::(anonymous namespace)::Buffer const&, unsigned long) |
107 | | |
108 | | template <bool even> void ReduceGray4x4(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, |
109 | | uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) |
110 | 0 | { |
111 | 0 | assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth > A); |
112 | |
|
113 | 0 | size_t alignedDstWidth = Simd::AlignLo(dstWidth, HA); |
114 | 0 | size_t srcTail = Simd::AlignHi(srcWidth - A, 2); |
115 | |
|
116 | 0 | Buffer buffer(Simd::AlignHi(dstWidth, A)); |
117 | |
|
118 | 0 | __m128i tmp = ReduceColNose(src); |
119 | 0 | Store<true>((__m128i*)buffer.src0, tmp); |
120 | 0 | Store<true>((__m128i*)buffer.src1, tmp); |
121 | 0 | size_t srcCol = A, dstCol = HA; |
122 | 0 | for (; srcCol < srcWidth - A; srcCol += A, dstCol += HA) |
123 | 0 | { |
124 | 0 | tmp = ReduceColBody(src + srcCol); |
125 | 0 | Store<true>((__m128i*)(buffer.src0 + dstCol), tmp); |
126 | 0 | Store<true>((__m128i*)(buffer.src1 + dstCol), tmp); |
127 | 0 | } |
128 | 0 | tmp = ReduceColTail<even>(src + srcTail); |
129 | 0 | Store<false>((__m128i*)(buffer.src0 + dstWidth - HA), tmp); |
130 | 0 | Store<false>((__m128i*)(buffer.src1 + dstWidth - HA), tmp); |
131 | |
|
132 | 0 | for (size_t row = 0; row < srcHeight; row += 2, dst += dstStride) |
133 | 0 | { |
134 | 0 | const uint8_t *src2 = src + srcStride*(row + 1); |
135 | 0 | const uint8_t *src3 = src2 + srcStride; |
136 | 0 | if (row >= srcHeight - 2) |
137 | 0 | { |
138 | 0 | src2 = src + srcStride*(srcHeight - 1); |
139 | 0 | src3 = src2; |
140 | 0 | } |
141 | |
|
142 | 0 | Store<true>((__m128i*)buffer.src2, ReduceColNose(src2)); |
143 | 0 | Store<true>((__m128i*)buffer.src3, ReduceColNose(src3)); |
144 | 0 | size_t srcCol = A, dstCol = HA; |
145 | 0 | for (; srcCol < srcWidth - A; srcCol += A, dstCol += HA) |
146 | 0 | { |
147 | 0 | Store<true>((__m128i*)(buffer.src2 + dstCol), ReduceColBody(src2 + srcCol)); |
148 | 0 | Store<true>((__m128i*)(buffer.src3 + dstCol), ReduceColBody(src3 + srcCol)); |
149 | 0 | } |
150 | 0 | Store<false>((__m128i*)(buffer.src2 + dstWidth - HA), ReduceColTail<even>(src2 + srcTail)); |
151 | 0 | Store<false>((__m128i*)(buffer.src3 + dstWidth - HA), ReduceColTail<even>(src3 + srcTail)); |
152 | |
|
153 | 0 | for (size_t col = 0; col < alignedDstWidth; col += HA) |
154 | 0 | _mm_storel_epi64((__m128i*)(dst + col), ReduceRow<true>(buffer, col)); |
155 | |
|
156 | 0 | if (alignedDstWidth != dstWidth) |
157 | 0 | _mm_storel_epi64((__m128i*)(dst + dstWidth - HA), ReduceRow<false>(buffer, dstWidth - HA)); |
158 | |
|
159 | 0 | Swap(buffer.src0, buffer.src2); |
160 | 0 | Swap(buffer.src1, buffer.src3); |
161 | 0 | } |
162 | 0 | } Unexecuted instantiation: void Simd::Sse41::ReduceGray4x4<true>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char*, unsigned long, unsigned long, unsigned long) Unexecuted instantiation: void Simd::Sse41::ReduceGray4x4<false>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char*, unsigned long, unsigned long, unsigned long) |
163 | | |
164 | | void ReduceGray4x4(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, |
165 | | uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) |
166 | 0 | { |
167 | 0 | if (Aligned(srcWidth, 2)) |
168 | 0 | ReduceGray4x4<true>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); |
169 | 0 | else |
170 | 0 | ReduceGray4x4<false>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); |
171 | 0 | } |
172 | | } |
173 | | #endif |
174 | | } |