/src/vvenc/source/Lib/CommonLib/x86/SampleAdaptiveOffsetX86.h
Line | Count | Source |
1 | | /* ----------------------------------------------------------------------------- |
2 | | The copyright in this software is being made available under the Clear BSD |
3 | | License, included below. No patent rights, trademark rights and/or |
4 | | other Intellectual Property Rights other than the copyrights concerning |
5 | | the Software are granted under this license. |
6 | | |
7 | | The Clear BSD License |
8 | | |
9 | | Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. |
10 | | All rights reserved. |
11 | | |
12 | | Redistribution and use in source and binary forms, with or without modification, |
13 | | are permitted (subject to the limitations in the disclaimer below) provided that |
14 | | the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the copyright holder nor the names of its |
24 | | contributors may be used to endorse or promote products derived from this |
25 | | software without specific prior written permission. |
26 | | |
27 | | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY |
28 | | THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND |
29 | | CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
30 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A |
31 | | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR |
32 | | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
33 | | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
34 | | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
35 | | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER |
36 | | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
37 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
38 | | POSSIBILITY OF SUCH DAMAGE. |
39 | | |
40 | | |
41 | | ------------------------------------------------------------------------------------------- */ |
42 | | /** \file SampleAdaptiveOffsetX86.h |
43 | | \brief SAO filter class |
44 | | */ |
45 | | |
46 | | #pragma once |
47 | | |
48 | | #include "CommonDefX86.h" |
49 | | #include "SampleAdaptiveOffset.h" |
50 | | |
51 | | #if defined(TARGET_SIMD_X86) && ENABLE_SIMD_OPT_SAO |
52 | | |
53 | | //! \ingroup CommonLib |
54 | | //! \{ |
55 | | |
56 | | namespace vvenc { |
57 | | |
58 | 0 | #define SAO_NUM_OFFSETS 4 /* number of SAO offset values */ |
59 | 0 | #define SAO_EO_NUM_CATEGORIES (SAO_NUM_OFFSETS + 1) /* number of different eo categories */ |
60 | | |
61 | | template <X86_VEXT vext> |
62 | | void offsetBlock_SIMD( const int channelBitDepth, |
63 | | const ClpRng& clpRng, |
64 | | int typeIdx, |
65 | | int* offset, |
66 | | int startIdx, |
67 | | const Pel* srcBlk, |
68 | | Pel* resBlk, |
69 | | ptrdiff_t srcStride, |
70 | | ptrdiff_t resStride, |
71 | | int width, |
72 | | int height, |
73 | | uint8_t availMask, |
74 | | // bool isLeftAvail, |
75 | | // bool isRightAvail, |
76 | | // bool isAboveAvail, |
77 | | // bool isBelowAvail, |
78 | | // bool isAboveLeftAvail, |
79 | | // bool isAboveRightAvail, |
80 | | // bool isBelowLeftAvail, |
81 | | // bool isBelowRightAvail, |
82 | | std::vector<int8_t> &signLineBuf1, |
83 | | std::vector<int8_t> &signLineBuf2) |
84 | 0 | { |
85 | |
|
86 | 0 | int x,y, startX, startY, endX, endY, edgeType; |
87 | 0 | int firstLineStartX, firstLineEndX, lastLineStartX, lastLineEndX; |
88 | 0 | int8_t signLeft, signRight, signDown; |
89 | |
|
90 | 0 | const Pel* srcLine = srcBlk; |
91 | 0 | Pel* resLine = resBlk; |
92 | |
|
93 | 0 | switch(typeIdx) |
94 | 0 | { |
95 | 0 | case SAO_TYPE_EO_0: |
96 | 0 | { |
97 | 0 | if (availMask&LeftAvail && availMask&RightAvail) |
98 | 0 | { |
99 | |
|
100 | 0 | int8_t p_eo_offsets[16] = {0,}; |
101 | 0 | for (int i = 0; i < SAO_EO_NUM_CATEGORIES; i++) |
102 | 0 | { |
103 | 0 | p_eo_offsets[i] = offset[i]; |
104 | 0 | } |
105 | |
|
106 | | #ifdef USE_AVX2 |
107 | | // AVX2 |
108 | 0 | if( ( width & 15 ) == 0 && vext >= AVX2 ) |
109 | 0 | { |
110 | 0 | __m256i vsrca,vsrcal,vsrcar; |
111 | 0 | __m256i vbaseoffset = _mm256_set1_epi16(2) ; |
112 | 0 | __m256i vplusone = _mm256_set1_epi16(1); |
113 | 0 | __m256i vzero = _mm256_set1_epi8(0); |
114 | 0 | __m256i vibdimax = _mm256_set1_epi16((1<<channelBitDepth) -1 ); |
115 | 0 | __m256i voffsettbl = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)p_eo_offsets)); |
116 | | |
117 | 0 | for (y=0; y< height; y++) |
118 | 0 | { |
119 | 0 | for (x=0; x< width; x+=16) |
120 | 0 | { |
121 | 0 | vsrca = _mm256_loadu_si256((__m256i*)&srcLine[x]); |
122 | 0 | vsrcal = _mm256_loadu_si256((__m256i*)&srcLine[x-1]); |
123 | 0 | vsrcar = _mm256_loadu_si256((__m256i*)&srcLine[x+1]); |
124 | 0 | vsrcal = _mm256_sub_epi16(vsrca, vsrcal); |
125 | 0 | vsrcar = _mm256_sub_epi16(vsrca, vsrcar); |
126 | 0 | __m256i vsignl = _mm256_sign_epi16(vplusone, vsrcal); |
127 | 0 | __m256i vsignr = _mm256_sign_epi16(vplusone, vsrcar); |
128 | 0 | __m256i vsign = _mm256_add_epi16(_mm256_add_epi16(vsignl, vsignr), vbaseoffset); |
129 | 0 | __m256i veoffsets = _mm256_shuffle_epi8(voffsettbl, vsign); |
130 | 0 | veoffsets = _mm256_slli_epi16 (veoffsets,8); |
131 | 0 | veoffsets = _mm256_srai_epi16 (veoffsets,8); |
132 | |
|
133 | 0 | vsrca = _mm256_add_epi16(vsrca, veoffsets); |
134 | 0 | vsrca = _mm256_min_epi16(_mm256_max_epi16(vsrca, vzero), vibdimax); |
135 | 0 | _mm256_storeu_si256((__m256i*)&resLine[x], vsrca); |
136 | 0 | } |
137 | 0 | srcLine += srcStride; |
138 | 0 | resLine += resStride; |
139 | 0 | } |
140 | 0 | } |
141 | 0 | else |
142 | 0 | #endif |
143 | 0 | { |
144 | 0 | __m128i vsrca,vsrcal,vsrcar; |
145 | 0 | __m128i vbaseoffset = _mm_set1_epi16(2) ; |
146 | 0 | __m128i vplusone = _mm_set1_epi16(1); |
147 | 0 | __m128i vzero = _mm_set1_epi8(0); |
148 | 0 | __m128i vibdimax = _mm_set1_epi16((1<<channelBitDepth) -1 ); |
149 | 0 | __m128i voffsettbl = _mm_loadu_si128((__m128i*)p_eo_offsets); |
150 | | |
151 | |
|
152 | 0 | for (y=0; y< height; y++) |
153 | 0 | { |
154 | |
|
155 | 0 | for (x=0; x< width; x+=8) |
156 | 0 | { |
157 | 0 | vsrca = _mm_loadu_si128((__m128i*)&srcLine[x]); |
158 | 0 | vsrcal = _mm_loadu_si128((__m128i*)&srcLine[x-1]); |
159 | 0 | vsrcar = _mm_loadu_si128((__m128i*)&srcLine[x+1]); |
160 | 0 | vsrcal = _mm_sub_epi16(vsrca, vsrcal); |
161 | 0 | vsrcar = _mm_sub_epi16(vsrca, vsrcar); |
162 | 0 | __m128i vsignl = _mm_sign_epi16(vplusone, vsrcal); |
163 | 0 | __m128i vsignr = _mm_sign_epi16(vplusone, vsrcar); |
164 | 0 | __m128i vsign = _mm_add_epi16(_mm_add_epi16(vsignl, vsignr), vbaseoffset); |
165 | 0 | __m128i veoffsets = _mm_shuffle_epi8(voffsettbl, vsign); |
166 | 0 | veoffsets = _mm_slli_epi16 (veoffsets,8); |
167 | 0 | veoffsets = _mm_srai_epi16 (veoffsets,8); |
168 | |
|
169 | 0 | vsrca = _mm_add_epi16(vsrca, veoffsets); |
170 | 0 | vsrca = _mm_min_epi16(_mm_max_epi16(vsrca, vzero), vibdimax); |
171 | 0 | _mm_store_si128((__m128i*)&resLine[x], vsrca); |
172 | 0 | } |
173 | 0 | srcLine += srcStride; |
174 | 0 | resLine += resStride; |
175 | 0 | } |
176 | 0 | } |
177 | 0 | } |
178 | 0 | else |
179 | 0 | { |
180 | 0 | offset += 2; |
181 | 0 | startX = availMask&LeftAvail ? 0 : 1; |
182 | 0 | endX = availMask&RightAvail ? width : (width -1); |
183 | 0 | for (y=0; y< height; y++) |
184 | 0 | { |
185 | 0 | signLeft = (int8_t)sgn(srcLine[startX] - srcLine[startX-1]); |
186 | 0 | for (x=startX; x< endX; x++) |
187 | 0 | { |
188 | 0 | signRight = (int8_t)sgn(srcLine[x] - srcLine[x+1]); |
189 | 0 | edgeType = signRight + signLeft; |
190 | 0 | signLeft = -signRight; |
191 | |
|
192 | 0 | resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng); |
193 | 0 | } |
194 | 0 | srcLine += srcStride; |
195 | 0 | resLine += resStride; |
196 | 0 | } |
197 | |
|
198 | 0 | } |
199 | 0 | } |
200 | 0 | break; |
201 | 0 | case SAO_TYPE_EO_90: |
202 | 0 | { |
203 | |
|
204 | 0 | int8_t p_eo_offsets[16] = {0,}; |
205 | 0 | for (int i = 0; i < SAO_EO_NUM_CATEGORIES; i++) |
206 | 0 | { |
207 | 0 | p_eo_offsets[i] = offset[i]; |
208 | 0 | } |
209 | 0 | const Pel* srcLineAbove= srcLine- srcStride; |
210 | 0 | const Pel* srcLineBelow= srcLine+ srcStride; |
211 | 0 | startY=0; |
212 | 0 | if (!(availMask&AboveAvail)) |
213 | 0 | { |
214 | 0 | startY=1; |
215 | 0 | srcLineAbove= srcLine; |
216 | 0 | srcLine += srcStride; |
217 | 0 | resLine += resStride; |
218 | 0 | srcLineBelow= srcLine+ srcStride; |
219 | 0 | } |
220 | 0 | endY=height; |
221 | 0 | if (!(availMask&BelowAvail)) |
222 | 0 | { |
223 | 0 | endY=height-1; |
224 | 0 | } |
225 | | #ifdef USE_AVX2 |
226 | | // AVX2 |
227 | 0 | if( ( width & 15 ) == 0 && ( vext >= AVX2 ) ) |
228 | 0 | { |
229 | 0 | __m256i vsrca,vsrcat,vsrcab; |
230 | | |
231 | | __m256i vbaseoffset = _mm256_set1_epi16(2) ; |
232 | | __m256i vplusone = _mm256_set1_epi16(1); |
233 | | __m256i vzero = _mm256_set1_epi8(0); |
234 | | __m256i vibdimax = _mm256_set1_epi16((1<<channelBitDepth) -1 ); |
235 | | __m256i voffsettbl = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)p_eo_offsets)); |
236 | | const Pel* srcLineBelow= srcLine+ srcStride; |
237 | | |
238 | 0 | for (y=startY; y< endY; y++) |
239 | 0 | { |
240 | 0 | for (x=0; x< width; x+=16) |
241 | 0 | { |
242 | 0 | vsrca = _mm256_loadu_si256((__m256i*)&srcLine[x]); |
243 | 0 | vsrcat = _mm256_loadu_si256((__m256i*)&srcLineAbove[x]); |
244 | 0 | vsrcab = _mm256_loadu_si256((__m256i*)&srcLineBelow[x]); |
245 | 0 | vsrcat = _mm256_sub_epi16(vsrca, vsrcat); |
246 | 0 | vsrcab = _mm256_sub_epi16(vsrca, vsrcab); |
247 | 0 | __m256i vsignt = _mm256_sign_epi16(vplusone, vsrcat); |
248 | 0 | __m256i vsignb = _mm256_sign_epi16(vplusone, vsrcab); |
249 | 0 | __m256i vsign = _mm256_add_epi16(_mm256_add_epi16(vsignt, vsignb), vbaseoffset); |
250 | 0 | __m256i veoffsets = _mm256_shuffle_epi8(voffsettbl, vsign); |
251 | 0 | veoffsets = _mm256_slli_epi16 (veoffsets,8); |
252 | 0 | veoffsets = _mm256_srai_epi16 (veoffsets,8); |
253 | |
|
254 | 0 | vsrca = _mm256_add_epi16(vsrca, veoffsets); |
255 | 0 | vsrca = _mm256_min_epi16(_mm256_max_epi16(vsrca, vzero), vibdimax); |
256 | 0 | _mm256_storeu_si256((__m256i*)&resLine[x], vsrca); |
257 | 0 | } |
258 | 0 | srcLine += srcStride; |
259 | 0 | srcLineBelow += srcStride; |
260 | 0 | srcLineAbove += srcStride; |
261 | 0 | resLine += resStride; |
262 | 0 | } |
263 | 0 | } |
264 | 0 | else |
265 | 0 | #endif |
266 | 0 | { |
267 | 0 | __m128i vsrca,vsrcat,vsrcab; |
268 | 0 | __m128i vbaseoffset = _mm_set1_epi16(2) ; |
269 | 0 | __m128i vplusone = _mm_set1_epi16(1); |
270 | 0 | __m128i vzero = _mm_set1_epi8(0); |
271 | 0 | __m128i vibdimax = _mm_set1_epi16((1<<channelBitDepth) -1 ); |
272 | 0 | __m128i voffsettbl = _mm_loadu_si128((__m128i*)p_eo_offsets); |
273 | | |
274 | |
|
275 | 0 | for (y=startY; y< endY; y++) |
276 | 0 | { |
277 | 0 | for (x=0; x< width; x+=8) |
278 | 0 | { |
279 | 0 | vsrca = _mm_loadu_si128((__m128i*)&srcLine[x]); |
280 | 0 | vsrcat = _mm_loadu_si128((__m128i*)&srcLineAbove[x]); |
281 | 0 | vsrcab = _mm_loadu_si128((__m128i*)&srcLineBelow[x]); |
282 | 0 | vsrcat = _mm_sub_epi16(vsrca, vsrcat); |
283 | 0 | vsrcab = _mm_sub_epi16(vsrca, vsrcab); |
284 | 0 | __m128i vsignt = _mm_sign_epi16(vplusone, vsrcat); |
285 | 0 | __m128i vsignb = _mm_sign_epi16(vplusone, vsrcab); |
286 | 0 | __m128i vsign = _mm_add_epi16(_mm_add_epi16(vsignt, vsignb), vbaseoffset); |
287 | 0 | __m128i veoffsets = _mm_shuffle_epi8(voffsettbl, vsign); |
288 | 0 | veoffsets = _mm_slli_epi16 (veoffsets,8); |
289 | 0 | veoffsets = _mm_srai_epi16 (veoffsets,8); |
290 | |
|
291 | 0 | vsrca = _mm_add_epi16(vsrca, veoffsets); |
292 | 0 | vsrca = _mm_min_epi16(_mm_max_epi16(vsrca, vzero), vibdimax); |
293 | 0 | _mm_store_si128((__m128i*)&resLine[x], vsrca); |
294 | 0 | } |
295 | 0 | srcLine += srcStride; |
296 | 0 | srcLineBelow += srcStride; |
297 | 0 | srcLineAbove += srcStride; |
298 | 0 | resLine += resStride; |
299 | 0 | } |
300 | 0 | } |
301 | 0 | } |
302 | 0 | break; |
303 | 0 | case SAO_TYPE_EO_135: |
304 | 0 | { |
305 | | // if (isLeftAvail && isRightAvail && isAboveLeftAvail && isBelowRightAvail ) |
306 | 0 | if((LeftAvail|RightAvail|AboveLeftAvail|BelowRightAvail) == (int)(availMask&(LeftAvail|RightAvail|AboveLeftAvail|BelowRightAvail))) |
307 | 0 | { |
308 | |
|
309 | 0 | int8_t p_eo_offsets[16] = {0,}; |
310 | 0 | for (int i = 0; i < SAO_EO_NUM_CATEGORIES; i++) |
311 | 0 | { |
312 | 0 | p_eo_offsets[i] = offset[i]; |
313 | 0 | } |
314 | 0 | const Pel* srcLineAbove= srcLine- srcStride; |
315 | 0 | const Pel* srcLineBelow= srcLine+ srcStride; |
316 | 0 | startY=0; |
317 | 0 | if (!(availMask&AboveAvail)) |
318 | 0 | { |
319 | 0 | startY=1; |
320 | 0 | srcLineAbove= srcLine; |
321 | 0 | srcLine += srcStride; |
322 | 0 | resLine += resStride; |
323 | 0 | srcLineBelow= srcLine+ srcStride; |
324 | 0 | } |
325 | 0 | endY=height; |
326 | 0 | if (!(availMask&BelowAvail)) |
327 | 0 | { |
328 | 0 | endY=height-1; |
329 | 0 | } |
330 | | #ifdef USE_AVX2 |
331 | | // AVX2 |
332 | 0 | if( ( width & 15 ) == 0 && vext >= AVX2 ) |
333 | 0 | { |
334 | 0 | __m256i vsrca,vsrcat,vsrcab; |
335 | | |
336 | | __m256i vbaseoffset = _mm256_set1_epi16(2) ; |
337 | | __m256i vplusone = _mm256_set1_epi16(1); |
338 | | __m256i vzero = _mm256_set1_epi8(0); |
339 | | __m256i vibdimax = _mm256_set1_epi16((1<<channelBitDepth) -1 ); |
340 | | __m256i voffsettbl = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)p_eo_offsets)); |
341 | | const Pel* srcLineBelow= srcLine+ srcStride; |
342 | | |
343 | 0 | for (y=startY; y< endY; y++) |
344 | 0 | { |
345 | 0 | for (x=0; x< width; x+=16) |
346 | 0 | { |
347 | 0 | vsrca = _mm256_loadu_si256((__m256i*)&srcLine[x]); |
348 | 0 | vsrcat = _mm256_loadu_si256((__m256i*)&srcLineAbove[x-1]); |
349 | 0 | vsrcab = _mm256_loadu_si256((__m256i*)&srcLineBelow[x+1]); |
350 | 0 | vsrcat = _mm256_sub_epi16(vsrca, vsrcat); |
351 | 0 | vsrcab = _mm256_sub_epi16(vsrca, vsrcab); |
352 | 0 | __m256i vsignt = _mm256_sign_epi16(vplusone, vsrcat); |
353 | 0 | __m256i vsignb = _mm256_sign_epi16(vplusone, vsrcab); |
354 | 0 | __m256i vsign = _mm256_add_epi16(_mm256_add_epi16(vsignt, vsignb), vbaseoffset); |
355 | 0 | __m256i veoffsets = _mm256_shuffle_epi8(voffsettbl, vsign); |
356 | 0 | veoffsets = _mm256_slli_epi16 (veoffsets,8); |
357 | 0 | veoffsets = _mm256_srai_epi16 (veoffsets,8); |
358 | |
|
359 | 0 | vsrca = _mm256_add_epi16(vsrca, veoffsets); |
360 | 0 | vsrca = _mm256_min_epi16(_mm256_max_epi16(vsrca, vzero), vibdimax); |
361 | 0 | _mm256_storeu_si256((__m256i*)&resLine[x], vsrca); |
362 | 0 | } |
363 | 0 | srcLine += srcStride; |
364 | 0 | srcLineBelow += srcStride; |
365 | 0 | srcLineAbove += srcStride; |
366 | 0 | resLine += resStride; |
367 | 0 | } |
368 | 0 | } |
369 | 0 | else |
370 | 0 | #endif |
371 | 0 | { |
372 | 0 | __m128i vsrca,vsrcat,vsrcab; |
373 | 0 | __m128i vbaseoffset = _mm_set1_epi16(2) ; |
374 | 0 | __m128i vplusone = _mm_set1_epi16(1); |
375 | 0 | __m128i vzero = _mm_set1_epi8(0); |
376 | 0 | __m128i vibdimax = _mm_set1_epi16((1<<channelBitDepth) -1 ); |
377 | 0 | __m128i voffsettbl = _mm_loadu_si128((__m128i*)p_eo_offsets); |
378 | | |
379 | |
|
380 | 0 | for (y=startY; y< endY; y++) |
381 | 0 | { |
382 | 0 | for (x=0; x< width; x+=8) |
383 | 0 | { |
384 | 0 | vsrca = _mm_loadu_si128((__m128i*)&srcLine[x]); |
385 | 0 | vsrcat = _mm_loadu_si128((__m128i*)&srcLineAbove[x-1]); |
386 | 0 | vsrcab = _mm_loadu_si128((__m128i*)&srcLineBelow[x+1]); |
387 | 0 | vsrcat = _mm_sub_epi16(vsrca, vsrcat); |
388 | 0 | vsrcab = _mm_sub_epi16(vsrca, vsrcab); |
389 | 0 | __m128i vsignt = _mm_sign_epi16(vplusone, vsrcat); |
390 | 0 | __m128i vsignb = _mm_sign_epi16(vplusone, vsrcab); |
391 | 0 | __m128i vsign = _mm_add_epi16(_mm_add_epi16(vsignt, vsignb), vbaseoffset); |
392 | 0 | __m128i veoffsets = _mm_shuffle_epi8(voffsettbl, vsign); |
393 | 0 | veoffsets = _mm_slli_epi16 (veoffsets,8); |
394 | 0 | veoffsets = _mm_srai_epi16 (veoffsets,8); |
395 | |
|
396 | 0 | vsrca = _mm_add_epi16(vsrca, veoffsets); |
397 | 0 | vsrca = _mm_min_epi16(_mm_max_epi16(vsrca, vzero), vibdimax); |
398 | 0 | _mm_store_si128((__m128i*)&resLine[x], vsrca); |
399 | 0 | } |
400 | 0 | srcLine += srcStride; |
401 | 0 | srcLineBelow += srcStride; |
402 | 0 | srcLineAbove += srcStride; |
403 | 0 | resLine += resStride; |
404 | 0 | } |
405 | 0 | } |
406 | | |
407 | |
|
408 | 0 | } |
409 | 0 | else |
410 | 0 | { |
411 | 0 | offset += 2; |
412 | 0 | int8_t *signUpLine, *signDownLine, *signTmpLine; |
413 | |
|
414 | 0 | signUpLine = &signLineBuf1[0]; |
415 | 0 | signDownLine= &signLineBuf2[0]; |
416 | |
|
417 | 0 | startX = availMask&LeftAvail ? 0 : 1 ; |
418 | 0 | endX = availMask&RightAvail ? width : (width-1); |
419 | | |
420 | | //prepare 2nd line's upper sign |
421 | 0 | const Pel* srcLineBelow= srcLine+ srcStride; |
422 | 0 | for (x=startX; x< endX+1; x++) |
423 | 0 | { |
424 | 0 | signUpLine[x] = (int8_t)sgn(srcLineBelow[x] - srcLine[x- 1]); |
425 | 0 | } |
426 | | |
427 | | //1st line |
428 | 0 | const Pel* srcLineAbove= srcLine- srcStride; |
429 | 0 | firstLineStartX = availMask&AboveLeftAvail ? 0 : 1; |
430 | 0 | firstLineEndX = availMask&AboveAvail? endX: 1; |
431 | 0 | for(x= firstLineStartX; x< firstLineEndX; x++) |
432 | 0 | { |
433 | 0 | edgeType = sgn(srcLine[x] - srcLineAbove[x- 1]) - signUpLine[x+1]; |
434 | |
|
435 | 0 | resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng); |
436 | 0 | } |
437 | 0 | srcLine += srcStride; |
438 | 0 | resLine += resStride; |
439 | | |
440 | | |
441 | | //middle lines |
442 | 0 | for (y= 1; y< height-1; y++) |
443 | 0 | { |
444 | 0 | srcLineBelow= srcLine+ srcStride; |
445 | |
|
446 | 0 | for (x=startX; x<endX; x++) |
447 | 0 | { |
448 | 0 | signDown = (int8_t)sgn(srcLine[x] - srcLineBelow[x+ 1]); |
449 | 0 | edgeType = signDown + signUpLine[x]; |
450 | 0 | resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng); |
451 | |
|
452 | 0 | signDownLine[x+1] = -signDown; |
453 | 0 | } |
454 | 0 | signDownLine[startX] = (int8_t)sgn(srcLineBelow[startX] - srcLine[startX-1]); |
455 | |
|
456 | 0 | signTmpLine = signUpLine; |
457 | 0 | signUpLine = signDownLine; |
458 | 0 | signDownLine = signTmpLine; |
459 | |
|
460 | 0 | srcLine += srcStride; |
461 | 0 | resLine += resStride; |
462 | 0 | } |
463 | | |
464 | | //last line |
465 | 0 | srcLineBelow= srcLine+ srcStride; |
466 | 0 | lastLineStartX = availMask&BelowAvail ? startX : (width -1); |
467 | 0 | lastLineEndX = availMask&BelowRightAvail ? width : (width -1); |
468 | 0 | for(x= lastLineStartX; x< lastLineEndX; x++) |
469 | 0 | { |
470 | 0 | edgeType = sgn(srcLine[x] - srcLineBelow[x+ 1]) + signUpLine[x]; |
471 | 0 | resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng); |
472 | |
|
473 | 0 | } |
474 | |
|
475 | 0 | } |
476 | 0 | } |
477 | 0 | break; |
478 | 0 | case SAO_TYPE_EO_45: |
479 | 0 | { |
480 | | // if (isLeftAvail && isRightAvail && isAboveLeftAvail && isBelowRightAvail ) |
481 | 0 | if((LeftAvail|RightAvail|AboveLeftAvail|BelowRightAvail) == ((int)availMask&(LeftAvail|RightAvail|AboveLeftAvail|BelowRightAvail))) |
482 | 0 | { |
483 | |
|
484 | 0 | int8_t p_eo_offsets[16] = {0,}; |
485 | 0 | for (int i = 0; i < SAO_EO_NUM_CATEGORIES; i++) |
486 | 0 | { |
487 | 0 | p_eo_offsets[i] = offset[i]; |
488 | 0 | } |
489 | 0 | const Pel* srcLineAbove= srcLine- srcStride; |
490 | 0 | const Pel* srcLineBelow= srcLine+ srcStride; |
491 | 0 | startY=0; |
492 | 0 | if (!(availMask&AboveAvail)) |
493 | 0 | { |
494 | 0 | startY=1; |
495 | 0 | srcLineAbove= srcLine; |
496 | 0 | srcLine += srcStride; |
497 | 0 | resLine += resStride; |
498 | 0 | srcLineBelow= srcLine+ srcStride; |
499 | 0 | } |
500 | 0 | endY=height; |
501 | 0 | if (!(availMask&BelowAvail)) |
502 | 0 | { |
503 | 0 | endY=height-1; |
504 | 0 | } |
505 | | #ifdef USE_AVX2 |
506 | | // AVX2 |
507 | 0 | if( ( width & 15 ) == 0 && vext >= AVX2 ) |
508 | 0 | { |
509 | 0 | __m256i vsrca,vsrcat,vsrcab; |
510 | 0 | __m256i vbaseoffset = _mm256_set1_epi16(2) ; |
511 | 0 | __m256i vplusone = _mm256_set1_epi16(1); |
512 | 0 | __m256i vzero = _mm256_set1_epi8(0); |
513 | 0 | __m256i vibdimax = _mm256_set1_epi16((1<<channelBitDepth) -1 ); |
514 | 0 | __m256i voffsettbl = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)p_eo_offsets)); |
515 | 0 | const Pel* srcLineBelow= srcLine+ srcStride; |
516 | | |
517 | 0 | for (y=startY; y< endY; y++) |
518 | 0 | { |
519 | 0 | for (x=0; x< width; x+=16) |
520 | 0 | { |
521 | 0 | vsrca = _mm256_loadu_si256((__m256i*)&srcLine[x]); |
522 | 0 | vsrcat = _mm256_loadu_si256((__m256i*)&srcLineAbove[x+1]); |
523 | 0 | vsrcab = _mm256_loadu_si256((__m256i*)&srcLineBelow[x-1]); |
524 | 0 | vsrcat = _mm256_sub_epi16(vsrca, vsrcat); |
525 | 0 | vsrcab = _mm256_sub_epi16(vsrca, vsrcab); |
526 | 0 | __m256i vsignt = _mm256_sign_epi16(vplusone, vsrcat); |
527 | 0 | __m256i vsignb = _mm256_sign_epi16(vplusone, vsrcab); |
528 | 0 | __m256i vsign = _mm256_add_epi16(_mm256_add_epi16(vsignt, vsignb), vbaseoffset); |
529 | 0 | __m256i veoffsets = _mm256_shuffle_epi8(voffsettbl, vsign); |
530 | 0 | veoffsets = _mm256_slli_epi16 (veoffsets,8); |
531 | 0 | veoffsets = _mm256_srai_epi16 (veoffsets,8); |
532 | |
|
533 | 0 | vsrca = _mm256_add_epi16(vsrca, veoffsets); |
534 | 0 | vsrca = _mm256_min_epi16(_mm256_max_epi16(vsrca, vzero), vibdimax); |
535 | 0 | _mm256_storeu_si256((__m256i*)&resLine[x], vsrca); |
536 | 0 | } |
537 | 0 | srcLine += srcStride; |
538 | 0 | srcLineBelow += srcStride; |
539 | 0 | srcLineAbove += srcStride; |
540 | 0 | resLine += resStride; |
541 | 0 | } |
542 | 0 | } |
543 | 0 | else |
544 | 0 | #endif |
545 | 0 | { |
546 | 0 | __m128i vsrca,vsrcat,vsrcab; |
547 | 0 | __m128i vbaseoffset = _mm_set1_epi16(2) ; |
548 | 0 | __m128i vplusone = _mm_set1_epi16(1); |
549 | 0 | __m128i vzero = _mm_set1_epi8(0); |
550 | 0 | __m128i vibdimax = _mm_set1_epi16((1<<channelBitDepth) -1 ); |
551 | 0 | __m128i voffsettbl = _mm_loadu_si128((__m128i*)p_eo_offsets); |
552 | |
|
553 | 0 | for (y=startY; y< endY; y++) |
554 | 0 | { |
555 | 0 | for (x=0; x< width; x+=8) |
556 | 0 | { |
557 | 0 | vsrca = _mm_loadu_si128((__m128i*)&srcLine[x]); |
558 | 0 | vsrcat = _mm_loadu_si128((__m128i*)&srcLineAbove[x+1]); |
559 | 0 | vsrcab = _mm_loadu_si128((__m128i*)&srcLineBelow[x-1]); |
560 | 0 | vsrcat = _mm_sub_epi16(vsrca, vsrcat); |
561 | 0 | vsrcab = _mm_sub_epi16(vsrca, vsrcab); |
562 | 0 | __m128i vsignt = _mm_sign_epi16(vplusone, vsrcat); |
563 | 0 | __m128i vsignb = _mm_sign_epi16(vplusone, vsrcab); |
564 | 0 | __m128i vsign = _mm_add_epi16(_mm_add_epi16(vsignt, vsignb), vbaseoffset); |
565 | 0 | __m128i veoffsets = _mm_shuffle_epi8(voffsettbl, vsign); |
566 | 0 | veoffsets = _mm_slli_epi16 (veoffsets,8); |
567 | 0 | veoffsets = _mm_srai_epi16 (veoffsets,8); |
568 | |
|
569 | 0 | vsrca = _mm_add_epi16(vsrca, veoffsets); |
570 | 0 | vsrca = _mm_min_epi16(_mm_max_epi16(vsrca, vzero), vibdimax); |
571 | 0 | _mm_store_si128((__m128i*)&resLine[x], vsrca); |
572 | 0 | } |
573 | 0 | srcLine += srcStride; |
574 | 0 | srcLineBelow += srcStride; |
575 | 0 | srcLineAbove += srcStride; |
576 | 0 | resLine += resStride; |
577 | 0 | } |
578 | 0 | } |
579 | 0 | } |
580 | 0 | else |
581 | 0 | { |
582 | 0 | offset += 2; |
583 | 0 | int8_t *signUpLine = &signLineBuf1[1]; |
584 | |
|
585 | 0 | startX = availMask&LeftAvail ? 0 : 1; |
586 | 0 | endX = availMask&RightAvail ? width : (width -1); |
587 | | |
588 | | //prepare 2nd line upper sign |
589 | 0 | const Pel* srcLineBelow= srcLine+ srcStride; |
590 | 0 | for (x=startX-1; x< endX; x++) |
591 | 0 | { |
592 | 0 | signUpLine[x] = (int8_t)sgn(srcLineBelow[x] - srcLine[x+1]); |
593 | 0 | } |
594 | | //first line |
595 | 0 | const Pel* srcLineAbove= srcLine- srcStride; |
596 | 0 | firstLineStartX = availMask&AboveAvail ? startX : (width -1 ); |
597 | 0 | firstLineEndX = availMask&AboveRightAvail ? width : (width-1); |
598 | 0 | for(x= firstLineStartX; x< firstLineEndX; x++) |
599 | 0 | { |
600 | 0 | edgeType = sgn(srcLine[x] - srcLineAbove[x+1]) -signUpLine[x-1]; |
601 | 0 | resLine[x] = ClipPel<int>(srcLine[x] + offset[edgeType], clpRng); |
602 | 0 | } |
603 | 0 | srcLine += srcStride; |
604 | 0 | resLine += resStride; |
605 | | |
606 | | //middle lines |
607 | 0 | for (y= 1; y< height-1; y++) |
608 | 0 | { |
609 | 0 | srcLineBelow= srcLine+ srcStride; |
610 | |
|
611 | 0 | for(x= startX; x< endX; x++) |
612 | 0 | { |
613 | 0 | signDown = (int8_t)sgn(srcLine[x] - srcLineBelow[x-1]); |
614 | 0 | edgeType = signDown + signUpLine[x]; |
615 | 0 | resLine[x] = ClipPel<int>(srcLine[x] + offset[edgeType], clpRng); |
616 | 0 | signUpLine[x-1] = -signDown; |
617 | 0 | } |
618 | 0 | signUpLine[endX-1] = (int8_t)sgn(srcLineBelow[endX-1] - srcLine[endX]); |
619 | 0 | srcLine += srcStride; |
620 | 0 | resLine += resStride; |
621 | 0 | } |
622 | | |
623 | | //last line |
624 | 0 | srcLineBelow= srcLine+ srcStride; |
625 | 0 | lastLineStartX = availMask&BelowLeftAvail ? 0 : 1; |
626 | 0 | lastLineEndX = availMask&BelowAvail ? endX : 1; |
627 | 0 | for(x= lastLineStartX; x< lastLineEndX; x++) |
628 | 0 | { |
629 | 0 | edgeType = sgn(srcLine[x] - srcLineBelow[x-1]) + signUpLine[x]; |
630 | 0 | resLine[x] = ClipPel<int>(srcLine[x] + offset[edgeType], clpRng); |
631 | |
|
632 | 0 | } |
633 | |
|
634 | 0 | } |
635 | 0 | } |
636 | 0 | break; |
637 | 0 | case SAO_TYPE_BO: |
638 | 0 | { |
639 | 0 | const int shiftBits = channelBitDepth - NUM_SAO_BO_CLASSES_LOG2; |
640 | 0 | int8_t p_eo_offsets[16] = {0,}; |
641 | 0 | for (int i = 0; i < 4; i++) |
642 | 0 | { |
643 | 0 | p_eo_offsets[i] = offset[( startIdx + i ) % MAX_NUM_SAO_CLASSES]; |
644 | 0 | } |
645 | | #ifdef USE_AVX2 |
646 | | // AVX2 |
647 | 0 | if( ( width & 15 ) == 0 && vext >= AVX2 ) |
648 | 0 | { |
649 | 0 | __m256i vsrc; |
650 | 0 | __m256i vbaseoffset = _mm256_set1_epi16(startIdx - MAX_NUM_SAO_CLASSES) ; |
651 | 0 | __m256i vminus = _mm256_set1_epi8(-1); |
652 | 0 | __m256i vzero = _mm256_set1_epi8(0); |
653 | | |
654 | | __m256i vfour = _mm256_set1_epi16(4); |
655 | | __m256i vibdimax = _mm256_set1_epi16((1<<channelBitDepth) -1 ); |
656 | | __m256i voffsettbl = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)p_eo_offsets)); |
657 | | |
658 | 0 | for (y=0; y< height; y++) |
659 | 0 | { |
660 | 0 | for (x=0; x< width; x+=16) |
661 | 0 | { |
662 | 0 | vsrc = _mm256_loadu_si256((__m256i*)&srcLine[x]); |
663 | 0 | __m256i bands = _mm256_srai_epi16(vsrc, shiftBits); |
664 | 0 | bands = _mm256_sub_epi16(bands, vbaseoffset); |
665 | 0 | bands = _mm256_and_si256(bands, _mm256_set1_epi16( MAX_NUM_SAO_CLASSES - 1 )); // modulo 32 = modulo NUM_SAO_BO_CLASSES_LOG2 |
666 | 0 | __m256i mask1 = _mm256_cmpgt_epi16(bands,vminus); |
667 | 0 | __m256i mask2 = _mm256_cmpgt_epi16(vfour,bands); |
668 | |
|
669 | 0 | __m256i veoffsets = _mm256_shuffle_epi8(voffsettbl, bands); |
670 | 0 | veoffsets = _mm256_slli_epi16 (veoffsets,8); |
671 | 0 | veoffsets = _mm256_srai_epi16 (veoffsets,8); |
672 | |
|
673 | 0 | veoffsets = _mm256_and_si256(veoffsets,mask1); |
674 | 0 | veoffsets = _mm256_and_si256(veoffsets,mask2); |
675 | |
|
676 | 0 | vsrc = _mm256_add_epi16(vsrc, veoffsets); |
677 | 0 | vsrc = _mm256_min_epi16(_mm256_max_epi16(vsrc, vzero), vibdimax); |
678 | 0 | _mm256_storeu_si256((__m256i*)&resLine[x], vsrc); |
679 | 0 | } |
680 | 0 | srcLine += srcStride; |
681 | 0 | resLine += resStride; |
682 | 0 | } |
683 | |
|
684 | 0 | } |
685 | 0 | else |
686 | 0 | #endif |
687 | 0 | { |
688 | 0 | __m128i vsrc; |
689 | 0 | __m128i vbaseoffset = _mm_set1_epi16(startIdx - MAX_NUM_SAO_CLASSES) ; |
690 | 0 | __m128i vminus = _mm_set1_epi8(-1); |
691 | 0 | __m128i vzero = _mm_set1_epi8(0); |
692 | |
|
693 | 0 | __m128i vfour = _mm_set1_epi16(4); |
694 | 0 | __m128i vibdimax = _mm_set1_epi16((1<<channelBitDepth) -1 ); |
695 | 0 | __m128i voffsettbl = _mm_loadu_si128((__m128i*)p_eo_offsets); |
696 | 0 | for (y=0; y< height; y++) |
697 | 0 | { |
698 | 0 | for (x=0; x< width; x+=8) |
699 | 0 | { |
700 | 0 | vsrc = _mm_loadu_si128((__m128i*)&srcLine[x]); |
701 | 0 | __m128i bands = _mm_srai_epi16(vsrc, shiftBits); |
702 | 0 | bands = _mm_sub_epi16(bands, vbaseoffset); |
703 | 0 | bands = _mm_and_si128(bands, _mm_set1_epi16( MAX_NUM_SAO_CLASSES - 1 )); // modulo 32 = modulo NUM_SAO_BO_CLASSES_LOG2 |
704 | 0 | __m128i mask1 = _mm_cmpgt_epi16(bands,vminus); |
705 | 0 | __m128i mask2 = _mm_cmplt_epi16(bands,vfour); |
706 | |
|
707 | 0 | __m128i veoffsets = _mm_shuffle_epi8(voffsettbl, bands); |
708 | 0 | veoffsets = _mm_slli_epi16 (veoffsets,8); |
709 | 0 | veoffsets = _mm_srai_epi16 (veoffsets,8); |
710 | |
|
711 | 0 | veoffsets = _mm_and_si128(veoffsets,mask1); |
712 | 0 | veoffsets = _mm_and_si128(veoffsets,mask2); |
713 | |
|
714 | 0 | vsrc = _mm_add_epi16(vsrc, veoffsets); |
715 | 0 | vsrc = _mm_min_epi16(_mm_max_epi16(vsrc, vzero), vibdimax); |
716 | 0 | _mm_store_si128((__m128i*)&resLine[x], vsrc); |
717 | 0 | } |
718 | 0 | srcLine += srcStride; |
719 | 0 | resLine += resStride; |
720 | 0 | } |
721 | 0 | } |
722 | 0 | } |
723 | 0 | break; |
724 | 0 | default: |
725 | 0 | { |
726 | 0 | THROW("Not a supported SAO types\n"); |
727 | 0 | } |
728 | 0 | } |
729 | | #if USE_AVX2 |
730 | | |
731 | 0 | _mm256_zeroupper(); |
732 | 0 | #endif |
733 | 0 | } Unexecuted instantiation: void vvenc::offsetBlock_SIMD<(vvenc::x86_simd::X86_VEXT)1>(int, vvenc::ClpRng const&, int, int*, int, short const*, short*, long, long, int, int, unsigned char, std::__1::vector<signed char, std::__1::allocator<signed char> >&, std::__1::vector<signed char, std::__1::allocator<signed char> >&) Unexecuted instantiation: void vvenc::offsetBlock_SIMD<(vvenc::x86_simd::X86_VEXT)4>(int, vvenc::ClpRng const&, int, int*, int, short const*, short*, long, long, int, int, unsigned char, std::__1::vector<signed char, std::__1::allocator<signed char> >&, std::__1::vector<signed char, std::__1::allocator<signed char> >&) |
734 | | |
735 | | template <X86_VEXT vext> |
736 | | void calcSaoStatisticsBo_SIMD(int width,int endX,int endY,Pel* srcLine,Pel* orgLine,int srcStride,int orgStride,int channelBitDepth, int64_t *count,int64_t *diff) |
737 | 0 | { |
738 | 0 | if ( width % 16 == 0 ) |
739 | 0 | { |
740 | 0 | int iNaRight=width-endX; |
741 | 0 | int x; |
742 | 0 | int i_bo_range_shift = channelBitDepth - NUM_SAO_BO_CLASSES_LOG2; |
743 | 0 | __m128i vzero = _mm_setzero_si128(); |
744 | 0 | for (int y=0; y<endY; y++) |
745 | 0 | { |
746 | 0 | for (x=0; x<endX-16; x+=16) |
747 | 0 | { |
748 | 0 | __m128i vsrca, vsrcb; |
749 | 0 | __m128i vdiffa,vdiffb; |
750 | 0 | if (sizeof(Pel) == 1){ |
751 | 0 | __m128i vsrc = _mm_loadu_si128((__m128i*)&srcLine[x]); |
752 | 0 | vsrca = _mm_unpacklo_epi8(vsrc, vzero); |
753 | 0 | vsrcb = _mm_unpackhi_epi8(vsrc, vzero); |
754 | 0 | __m128i vorg = _mm_loadu_si128((__m128i*)&orgLine[x]); |
755 | 0 | __m128i vorga = _mm_unpacklo_epi8(vorg, vzero); |
756 | 0 | __m128i vorgb = _mm_unpackhi_epi8(vorg, vzero); |
757 | 0 | vdiffa = _mm_sub_epi16(vorga, vsrca); |
758 | 0 | vdiffb = _mm_sub_epi16(vorgb, vsrcb); |
759 | 0 | } |
760 | 0 | else |
761 | 0 | { |
762 | 0 | vsrca = _mm_loadu_si128((__m128i*)&srcLine[x]); |
763 | 0 | vsrcb = _mm_loadu_si128((__m128i*)&srcLine[x+8]); |
764 | 0 | __m128i vorga = _mm_loadu_si128((__m128i*)&orgLine[x]); |
765 | 0 | __m128i vorgb = _mm_loadu_si128((__m128i*)&orgLine[x+8]); |
766 | 0 | vdiffa = _mm_sub_epi16(vorga, vsrca); |
767 | 0 | vdiffb = _mm_sub_epi16(vorgb, vsrcb); |
768 | 0 | } |
769 | 0 | __m128i vbanda = _mm_srai_epi16(vsrca, i_bo_range_shift); |
770 | 0 | __m128i vbandb = _mm_srai_epi16(vsrcb, i_bo_range_shift); |
771 | 0 | int iBand; |
772 | | // since gcc 4.6 synopsis of _mm_extract_epi16 has changed to (int)(unsigned short)_mm_extract_epi16() |
773 | | // therefore cast result to short to have signed values |
774 | 0 | short iDiff; |
775 | 0 | iBand = _mm_extract_epi16(vbanda, 0); |
776 | 0 | iDiff = (short)_mm_extract_epi16(vdiffa, 0); |
777 | 0 | diff[iBand] += iDiff; |
778 | 0 | count[iBand] += 1; |
779 | 0 | iBand = _mm_extract_epi16(vbanda, 1); |
780 | 0 | iDiff = (short)_mm_extract_epi16(vdiffa, 1); |
781 | 0 | diff[iBand] += iDiff; |
782 | 0 | count[iBand] += 1; |
783 | 0 | iBand = _mm_extract_epi16(vbanda, 2); |
784 | 0 | iDiff = (short)_mm_extract_epi16(vdiffa, 2); |
785 | 0 | diff[iBand] += iDiff; |
786 | 0 | count[iBand] += 1; |
787 | 0 | iBand = _mm_extract_epi16(vbanda, 3); |
788 | 0 | iDiff = (short)_mm_extract_epi16(vdiffa, 3); |
789 | 0 | diff[iBand] += iDiff; |
790 | 0 | count[iBand] += 1; |
791 | 0 | iBand = _mm_extract_epi16(vbanda, 4); |
792 | 0 | iDiff = (short)_mm_extract_epi16(vdiffa, 4); |
793 | 0 | diff[iBand] += iDiff; |
794 | 0 | count[iBand] += 1; |
795 | 0 | iBand = _mm_extract_epi16(vbanda, 5); |
796 | 0 | iDiff = (short)_mm_extract_epi16(vdiffa, 5); |
797 | 0 | diff[iBand] += iDiff; |
798 | 0 | count[iBand] += 1; |
799 | 0 | iBand = _mm_extract_epi16(vbanda, 6); |
800 | 0 | iDiff = (short)_mm_extract_epi16(vdiffa, 6); |
801 | 0 | diff[iBand] += iDiff; |
802 | 0 | count[iBand] += 1; |
803 | 0 | iBand = _mm_extract_epi16(vbanda, 7); |
804 | 0 | iDiff = (short)_mm_extract_epi16(vdiffa, 7); |
805 | 0 | diff[iBand] += iDiff; |
806 | 0 | count[iBand] += 1; |
807 | 0 | iBand = _mm_extract_epi16(vbandb, 0); |
808 | 0 | iDiff = (short)_mm_extract_epi16(vdiffb, 0); |
809 | 0 | diff[iBand] += iDiff; |
810 | 0 | count[iBand] += 1; |
811 | 0 | iBand = _mm_extract_epi16(vbandb, 1); |
812 | 0 | iDiff = (short)_mm_extract_epi16(vdiffb, 1); |
813 | 0 | diff[iBand] += iDiff; |
814 | 0 | count[iBand] += 1; |
815 | 0 | iBand = _mm_extract_epi16(vbandb, 2); |
816 | 0 | iDiff = (short)_mm_extract_epi16(vdiffb, 2); |
817 | 0 | diff[iBand] += iDiff; |
818 | 0 | count[iBand] += 1; |
819 | 0 | iBand = _mm_extract_epi16(vbandb, 3); |
820 | 0 | iDiff = (short)_mm_extract_epi16(vdiffb, 3); |
821 | 0 | diff[iBand] += iDiff; |
822 | 0 | count[iBand] += 1; |
823 | 0 | iBand = _mm_extract_epi16(vbandb, 4); |
824 | 0 | iDiff = (short)_mm_extract_epi16(vdiffb, 4); |
825 | 0 | diff[iBand] += iDiff; |
826 | 0 | count[iBand] += 1; |
827 | 0 | iBand = _mm_extract_epi16(vbandb, 5); |
828 | 0 | iDiff = (short)_mm_extract_epi16(vdiffb, 5); |
829 | 0 | diff[iBand] += iDiff; |
830 | 0 | count[iBand] += 1; |
831 | 0 | iBand = _mm_extract_epi16(vbandb, 6); |
832 | 0 | iDiff = (short)_mm_extract_epi16(vdiffb, 6); |
833 | 0 | diff[iBand] += iDiff; |
834 | 0 | count[iBand] += 1; |
835 | 0 | iBand = _mm_extract_epi16(vbandb, 7); |
836 | 0 | iDiff = (short)_mm_extract_epi16(vdiffb, 7); |
837 | 0 | diff[iBand] += iDiff; |
838 | 0 | count[iBand] += 1; |
839 | 0 | } |
840 | | //last colum |
841 | 0 | { |
842 | 0 | __m128i vsrca, vsrcb; |
843 | 0 | __m128i vdiffa,vdiffb; |
844 | 0 | if (sizeof(Pel) == 1){ |
845 | 0 | __m128i vsrc = _mm_loadu_si128((__m128i*)&srcLine[x]); |
846 | 0 | vsrca = _mm_unpacklo_epi8(vsrc, vzero); |
847 | 0 | vsrcb = _mm_unpackhi_epi8(vsrc, vzero); |
848 | 0 | __m128i vorg = _mm_loadu_si128((__m128i*)&orgLine[x]); |
849 | 0 | __m128i vorga = _mm_unpacklo_epi8(vorg, vzero); |
850 | 0 | __m128i vorgb = _mm_unpackhi_epi8(vorg, vzero); |
851 | 0 | vdiffa = _mm_sub_epi16(vorga, vsrca); |
852 | 0 | vdiffb = _mm_sub_epi16(vorgb, vsrcb); |
853 | 0 | } |
854 | 0 | else |
855 | 0 | { |
856 | 0 | vsrca = _mm_loadu_si128((__m128i*)&srcLine[x]); |
857 | 0 | vsrcb = _mm_loadu_si128((__m128i*)&srcLine[x+8]); |
858 | 0 | __m128i vorga = _mm_loadu_si128((__m128i*)&orgLine[x]); |
859 | 0 | __m128i vorgb = _mm_loadu_si128((__m128i*)&orgLine[x+8]); |
860 | 0 | vdiffa = _mm_sub_epi16(vorga, vsrca); |
861 | 0 | vdiffb = _mm_sub_epi16(vorgb, vsrcb); |
862 | 0 | } |
863 | 0 | __m128i vbanda = _mm_srai_epi16(vsrca, i_bo_range_shift); |
864 | 0 | __m128i vbandb = _mm_srai_epi16(vsrcb, i_bo_range_shift); |
865 | 0 | int iBand; |
866 | | // since gcc 4.6 synopsis of _mm_extract_epi16 has changed to (int)(unsigned short)_mm_extract_epi16() |
867 | | // therefore cast result to short to have signed values |
868 | 0 | short iDiff; |
869 | 0 | iBand = _mm_extract_epi16(vbanda, 0); |
870 | 0 | iDiff = (short)_mm_extract_epi16(vdiffa, 0); |
871 | 0 | diff[iBand] += iDiff; |
872 | 0 | count[iBand] += 1; |
873 | 0 | if (iNaRight<15) |
874 | 0 | { |
875 | 0 | iBand = _mm_extract_epi16(vbanda, 1); |
876 | 0 | iDiff = (short)_mm_extract_epi16(vdiffa, 1); |
877 | 0 | diff[iBand] += iDiff; |
878 | 0 | count[iBand] += 1; |
879 | 0 | } |
880 | 0 | if (iNaRight<14) |
881 | 0 | { |
882 | 0 | iBand = _mm_extract_epi16(vbanda, 2); |
883 | 0 | iDiff = (short)_mm_extract_epi16(vdiffa, 2); |
884 | 0 | diff[iBand] += iDiff; |
885 | 0 | count[iBand] += 1; |
886 | 0 | } |
887 | 0 | if (iNaRight<13) |
888 | 0 | { |
889 | 0 | iBand = _mm_extract_epi16(vbanda, 3); |
890 | 0 | iDiff = (short)_mm_extract_epi16(vdiffa, 3); |
891 | 0 | diff[iBand] += iDiff; |
892 | 0 | count[iBand] += 1; |
893 | 0 | } |
894 | 0 | if (iNaRight<12) |
895 | 0 | { |
896 | 0 | iBand = _mm_extract_epi16(vbanda, 4); |
897 | 0 | iDiff = (short)_mm_extract_epi16(vdiffa, 4); |
898 | 0 | diff[iBand] += iDiff; |
899 | 0 | count[iBand] += 1; |
900 | 0 | } |
901 | 0 | if (iNaRight<11) |
902 | 0 | { |
903 | 0 | iBand = _mm_extract_epi16(vbanda, 5); |
904 | 0 | iDiff = (short)_mm_extract_epi16(vdiffa, 5); |
905 | 0 | diff[iBand] += iDiff; |
906 | 0 | count[iBand] += 1; |
907 | 0 | } |
908 | 0 | if (iNaRight<10) |
909 | 0 | { |
910 | 0 | iBand = _mm_extract_epi16(vbanda, 6); |
911 | 0 | iDiff = (short)_mm_extract_epi16(vdiffa, 6); |
912 | 0 | diff[iBand] += iDiff; |
913 | 0 | count[iBand] += 1; |
914 | 0 | } |
915 | 0 | if (iNaRight<9) |
916 | 0 | { |
917 | 0 | iBand = _mm_extract_epi16(vbanda, 7); |
918 | 0 | iDiff = (short)_mm_extract_epi16(vdiffa, 7); |
919 | 0 | diff[iBand] += iDiff; |
920 | 0 | count[iBand] += 1; |
921 | 0 | } |
922 | 0 | if (iNaRight<8) |
923 | 0 | { |
924 | 0 | iBand = _mm_extract_epi16(vbandb, 0); |
925 | 0 | iDiff = (short)_mm_extract_epi16(vdiffb, 0); |
926 | 0 | diff[iBand] += iDiff; |
927 | 0 | count[iBand] += 1; |
928 | 0 | } |
929 | 0 | if (iNaRight<7) |
930 | 0 | { |
931 | 0 | iBand = _mm_extract_epi16(vbandb, 1); |
932 | 0 | iDiff = (short)_mm_extract_epi16(vdiffb, 1); |
933 | 0 | diff[iBand] += iDiff; |
934 | 0 | count[iBand] += 1; |
935 | 0 | } |
936 | 0 | if (iNaRight<6) |
937 | 0 | { |
938 | 0 | iBand = _mm_extract_epi16(vbandb, 2); |
939 | 0 | iDiff = (short)_mm_extract_epi16(vdiffb, 2); |
940 | 0 | diff[iBand] += iDiff; |
941 | 0 | count[iBand] += 1; |
942 | 0 | } |
943 | 0 | if (iNaRight<5) |
944 | 0 | { |
945 | 0 | iBand = _mm_extract_epi16(vbandb, 3); |
946 | 0 | iDiff = (short)_mm_extract_epi16(vdiffb, 3); |
947 | 0 | diff[iBand] += iDiff; |
948 | 0 | count[iBand] += 1; |
949 | 0 | } |
950 | 0 | if (iNaRight<=4) |
951 | 0 | { |
952 | 0 | iBand = _mm_extract_epi16(vbandb, 4); |
953 | 0 | iDiff = (short)_mm_extract_epi16(vdiffb, 4); |
954 | 0 | diff[iBand] += iDiff; |
955 | 0 | count[iBand] += 1; |
956 | 0 | } |
957 | 0 | if (iNaRight<3) |
958 | 0 | { |
959 | 0 | iBand = _mm_extract_epi16(vbandb, 5); |
960 | 0 | iDiff = (short)_mm_extract_epi16(vdiffb, 5); |
961 | 0 | diff[iBand] += iDiff; |
962 | 0 | count[iBand] += 1; |
963 | 0 | } |
964 | 0 | if (iNaRight<2) |
965 | 0 | { |
966 | 0 | iBand = _mm_extract_epi16(vbandb, 6); |
967 | 0 | iDiff = (short)_mm_extract_epi16(vdiffb, 6); |
968 | 0 | diff[iBand] += iDiff; |
969 | 0 | count[iBand] += 1; |
970 | 0 | } |
971 | 0 | if (iNaRight<1) |
972 | 0 | { |
973 | 0 | iBand = _mm_extract_epi16(vbandb, 7); |
974 | 0 | iDiff = (short)_mm_extract_epi16(vdiffb, 7); |
975 | 0 | diff[iBand] += iDiff; |
976 | 0 | count[iBand] += 1; |
977 | 0 | } |
978 | 0 | } |
979 | 0 | srcLine += srcStride; |
980 | 0 | orgLine += orgStride; |
981 | 0 | } |
982 | 0 | } |
983 | 0 | else |
984 | 0 | { |
985 | 0 | int i,j; |
986 | 0 | int iBoRangeShift = channelBitDepth - NUM_SAO_BO_CLASSES_LOG2; |
987 | 0 | for ( i = 0; i < endY; i++ ) |
988 | 0 | { |
989 | 0 | for ( j = 0; j < endX; j++, srcLine++, orgLine++ ) |
990 | 0 | { |
991 | 0 | int iBand = *srcLine >> iBoRangeShift; |
992 | 0 | diff[iBand] += (*orgLine - *srcLine); |
993 | 0 | count[iBand] += 1; |
994 | 0 | } |
995 | 0 | srcLine += srcStride - endX; |
996 | 0 | orgLine += orgStride - endX; |
997 | 0 | } |
998 | 0 | } |
999 | 0 | } Unexecuted instantiation: void vvenc::calcSaoStatisticsBo_SIMD<(vvenc::x86_simd::X86_VEXT)1>(int, int, int, short*, short*, int, int, int, long*, long*) Unexecuted instantiation: void vvenc::calcSaoStatisticsBo_SIMD<(vvenc::x86_simd::X86_VEXT)4>(int, int, int, short*, short*, int, int, int, long*, long*) |
1000 | | |
// Collect SAO edge-offset class EO-0 (horizontal; left/right neighbours)
// statistics for endY lines: each sample is classified by
// sgn(cur-left) + sgn(cur-right) into one of NUM_SAO_EO_CLASSES edge types,
// and the (original - reconstruction) difference and occurrence count are
// accumulated per type.
// startX marks a not-available first column, width-endX not-available right
// columns; those samples are excluded from the statistics.
// NOTE(review): the SIMD path masks only one leading lane and at most 8
// trailing lanes, i.e. it appears to assume startX <= 1 and width-endX <= 8
// -- confirm at the call site.
template <X86_VEXT vext>
void calcSaoStatisticsEo0_SIMD(int width,int startX,int endX,int endY,Pel* srcLine,Pel* orgLine,int srcStride,int orgStride,int64_t *count, int64_t *diff)
{
  int iNaRight=width-endX;   // not-available samples at the right border

  int iNaWidth = startX + iNaRight;   // total not-available samples per line
  int i,j;
  if ( width % 16 == 0 )
  {
    __m128i vzero = _mm_set1_epi8(0);
    __m128i vplusone = _mm_set1_epi8(1);
    __m128i vbaseoffset = _mm_set1_epi8(2);   // rebases edge type from [-2,2] to [0,4]
    // store intermediate results in 32bit partial sums for each EO type
    __m128i vdiffsum[NUM_SAO_EO_CLASSES];
    __m128i vcountsum[NUM_SAO_EO_CLASSES];
    __m128i vconst[NUM_SAO_EO_CLASSES];   // per-class match constants; vconst[1] (all ones) doubles as madd multiplier
    for ( int i = 0; i < NUM_SAO_EO_CLASSES; i++ )
    {
      vdiffsum[i] = _mm_set1_epi32(0);
      vcountsum[i] = _mm_set1_epi32(0);
      vconst[i] = _mm_set1_epi16(i);
    }
    // create masks for first and last pixel row
    const unsigned short mask[16]={0,0,0,0,0,0,0,0,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff};
    __m128i vmaskgs = _mm_set1_epi16(0);
    __m128i vmaskge= _mm_set1_epi16(0);
    if ( startX )
    {
      // invalidate the very first lane of each line
      vmaskgs = _mm_insert_epi16( vmaskgs, 0xffff, 0);
    }
    if ( iNaRight )
    {
      // 0xffff in the trailing iNaRight lanes of the high 8-lane half
      vmaskge = _mm_loadu_si128((__m128i*)&mask[iNaRight]);
    }
    for ( int y = 0; y < endY; y++)
    {
      __m128i vmaskga = vmaskgs;   // invalid-lane mask for the low half of the current group
      __m128i vmaskgb = vzero;     // invalid-lane mask for the high half; set on the last group
      for ( int x= 0; x < width; x+=16 )
      {
        __m128i vsrcal,vsrcar;
        __m128i vsrcbl,vsrcbr;
        __m128i vdiffa,vdiffb;
        // set mask for last pixel
        if ( x >= width - 16 )
        {
          vmaskgb = vmaskge;
        }
        // load reconstruction and compute difference between original signal and reconstruction
        if (sizeof(Pel) ==1)
        {
          // 8-bit samples: widen current, left- and right-shifted loads to 16 bit
          __m128i vsrc = _mm_loadu_si128((__m128i*)&srcLine[x]);
          __m128i vsrcl = _mm_loadu_si128((__m128i*)&srcLine[x-1]);
          __m128i vsrcr = _mm_loadu_si128((__m128i*)&srcLine[x+1]);
          __m128i vsrca = _mm_unpacklo_epi8(vsrc, vzero);
          __m128i vsrcb = _mm_unpackhi_epi8(vsrc, vzero);
          vsrcal = _mm_unpacklo_epi8(vsrcl, vzero);
          vsrcbl = _mm_unpackhi_epi8(vsrcl, vzero);
          vsrcar = _mm_unpacklo_epi8(vsrcr, vzero);
          vsrcbr = _mm_unpackhi_epi8(vsrcr, vzero);
          vsrcal = _mm_sub_epi16(vsrca, vsrcal);   // cur - left
          vsrcar = _mm_sub_epi16(vsrca, vsrcar);   // cur - right
          vsrcbl = _mm_sub_epi16(vsrcb, vsrcbl);
          vsrcbr = _mm_sub_epi16(vsrcb, vsrcbr);
          __m128i vorg = _mm_loadu_si128((__m128i*)&orgLine[x]);
          __m128i vorga = _mm_unpacklo_epi8(vorg, vzero);
          __m128i vorgb = _mm_unpackhi_epi8(vorg, vzero);
          vdiffa = _mm_sub_epi16(vorga, vsrca);
          vdiffb = _mm_sub_epi16(vorgb, vsrcb);
        }
        else
        {
          __m128i vsrca = _mm_loadu_si128((__m128i*)&srcLine[x]);
          vsrcal = _mm_loadu_si128((__m128i*)&srcLine[x-1]);
          vsrcar = _mm_loadu_si128((__m128i*)&srcLine[x+1]);
          vsrcal = _mm_sub_epi16(vsrca, vsrcal);   // cur - left
          vsrcar = _mm_sub_epi16(vsrca, vsrcar);   // cur - right
          __m128i vsrcb = _mm_loadu_si128((__m128i*)&srcLine[x+8]);
          vsrcbl = _mm_loadu_si128((__m128i*)&srcLine[x+8-1]);
          vsrcbr = _mm_loadu_si128((__m128i*)&srcLine[x+8+1]);
          vsrcbl = _mm_sub_epi16(vsrcb, vsrcbl);
          vsrcbr = _mm_sub_epi16(vsrcb, vsrcbr);
          __m128i vorga = _mm_loadu_si128((__m128i*)&orgLine[x]);
          __m128i vorgb = _mm_loadu_si128((__m128i*)&orgLine[x+8]);
          vdiffa = _mm_sub_epi16(vorga, vsrca);
          vdiffb = _mm_sub_epi16(vorgb, vsrcb);
        }
        // compute sign and type for 16 pixels
        __m128i vsignl = _mm_packs_epi16(vsrcal, vsrcbl);   // saturating pack preserves the sign of the differences
        __m128i vsignr = _mm_packs_epi16(vsrcar, vsrcbr);
        vsignl = _mm_sign_epi8(vplusone, vsignl);           // -> sgn(cur-left) in {-1,0,1}
        vsignr = _mm_sign_epi8(vplusone, vsignr);           // -> sgn(cur-right) in {-1,0,1}
        __m128i vtype = _mm_add_epi8(_mm_add_epi8(vsignl, vsignr), vbaseoffset);
        __m128i vtypea = _mm_unpacklo_epi8(vtype, vzero);
        __m128i vtypeb = _mm_unpackhi_epi8(vtype, vzero);
        // force not-available lanes to 0xffff so they match no class below
        vtypea = _mm_or_si128(vtypea, vmaskga);
        vtypeb = _mm_or_si128(vtypeb, vmaskgb);
        // count occurrence of each type and accumulate partial sums for each type
        for ( int i = 0; i < NUM_SAO_EO_CLASSES; i++ )
        {
          __m128i vmaska = _mm_cmpeq_epi16(vtypea, vconst[i]);
          __m128i vmaskb = _mm_cmpeq_epi16(vtypeb, vconst[i]);
          __m128i vdiffma = _mm_and_si128(vmaska, vdiffa);
          __m128i vdiffmb = _mm_and_si128(vmaskb, vdiffb);
          // madd with all-ones (vconst[1]) widens and pairwise-adds 16 -> 32 bit
          vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_madd_epi16(vdiffma, vconst[1]));
          vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_madd_epi16(vdiffmb, vconst[1]));
          __m128i vcountma = _mm_srli_epi16(vmaska,15);   // 0xffff -> 1 per matching lane
          __m128i vcountmb = _mm_srli_epi16(vmaskb,15);
          vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_madd_epi16(vcountma, vconst[1]));
          vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_madd_epi16(vcountmb, vconst[1]));
        }
        // clear mask for first pixel
        vmaskga = vzero;
      }
      // next pixel line
      srcLine += srcStride;
      orgLine += orgStride;
    }
    // horizontal add of four 32 bit partial sums (results are assigned, not accumulated)
    for ( int i = 0; i < NUM_SAO_EO_CLASSES; i++ )
    {
      vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_srli_si128(vdiffsum[i], 8));
      vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_srli_si128(vdiffsum[i], 4));
      diff[i] = _mm_cvtsi128_si32(vdiffsum[i]);
      vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_srli_si128(vcountsum[i], 8));
      vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_srli_si128(vcountsum[i], 4));
      count[i] = _mm_cvtsi128_si32(vcountsum[i]);
    }
  }
  else
  {
    // scalar fallback: classify samples [startX, endX) of each line
    srcLine = srcLine + startX;
    orgLine =orgLine + startX;
    // shift base pointers so edge types in [-2,2] index elements [0,4]
    diff +=2;
    count+=2;

    for ( i = 0; i < endY; i++ )
    {
      int iSignLeft = sgn( *srcLine - *(srcLine - 1) );
      for ( j = 0; j < width - iNaWidth; j++, srcLine++, orgLine++ )
      {
        int iSignRight = sgn( *srcLine - *(srcLine + 1) );
        int iType = iSignLeft + iSignRight;
        iSignLeft = -1 * iSignRight;   // reuse: sgn(next-cur) == -sgn(cur-next)
        diff[iType] += (*orgLine - *srcLine);
        count[iType] += 1;
      }
      srcLine += srcStride - ( width - iNaWidth );
      orgLine += orgStride - ( width - iNaWidth );
    }
  }
}
// Collect SAO edge-offset class EO-90 (vertical; above/below neighbours)
// statistics for lines [startY, endY): each sample is classified by
// sgn(cur-above) + sgn(cur-below) and the (original - reconstruction)
// difference and occurrence count are accumulated per edge type.
// width-endX right samples are excluded in the SIMD path via a lane mask;
// signUpLine is a per-column scratch buffer of sgn(cur-above) used by the
// scalar fallback only.
// NOTE(review): the SIMD path masks at most 8 trailing lanes, i.e. it appears
// to assume width-endX <= 8 -- confirm at the call site.
template <X86_VEXT vext>
void calcSaoStatisticsEo90_SIMD(int width,int endX,int startY,int endY,Pel* srcLine,Pel* orgLine,int srcStride,int orgStride,int64_t *count, int64_t *diff,int8_t *signUpLine)
{
  if ( width % 16 == 0 )
  {
    int iNaRight=width-endX;   // not-available samples at the right border
    __m128i vzero = _mm_set1_epi8(0);
    __m128i vplusone = _mm_set1_epi8(1);
    __m128i vbaseoffset = _mm_set1_epi8(2);   // rebases edge type from [-2,2] to [0,4]
    // store intermediate results in 32bit partial sums for each EO type
    __m128i vdiffsum[NUM_SAO_EO_CLASSES];
    __m128i vcountsum[NUM_SAO_EO_CLASSES];
    __m128i vconst[NUM_SAO_EO_CLASSES];   // per-class match constants; vconst[1] (all ones) doubles as madd multiplier
    for ( int i = 0; i < NUM_SAO_EO_CLASSES; i++ )
    {
      vdiffsum[i] = _mm_set1_epi32(0);
      vcountsum[i] = _mm_set1_epi32(0);
      vconst[i] = _mm_set1_epi16(i);
    }
    // create masks for first and last pixel row
    const unsigned short mask[16]={0,0,0,0,0,0,0,0,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff};
    __m128i vmaskge= _mm_set1_epi16(0);
    if ( iNaRight )
    {
      // 0xffff in the trailing iNaRight lanes of the high 8-lane half
      vmaskge = _mm_loadu_si128((__m128i*)&mask[iNaRight]);
    }

    // per-16-column-group sign of (above - cur), seeded from the line above the
    // block and carried from one line to the next
    __m128i vsigns[MAX_CU_SIZE/16 +1]; //+1 to avoid MSVC error
    for (int x=0; x< endX; x+=16)
    {
      __m128i vsrca,vsrcb;
      __m128i vsrcat,vsrcbt;
      if (sizeof(Pel) == 1)
      {
        // 8-bit samples: widen current line and line above to 16 bit
        __m128i vsrc = _mm_loadu_si128((__m128i*)&srcLine[x]);
        __m128i vsrct = _mm_loadu_si128((__m128i*)&srcLine[x-srcStride]);
        vsrca = _mm_unpacklo_epi8(vsrc, vzero);
        vsrcb = _mm_unpackhi_epi8(vsrc, vzero);
        vsrcat = _mm_unpacklo_epi8(vsrct, vzero);
        vsrcbt = _mm_unpackhi_epi8(vsrct, vzero);
      }
      else
      {
        vsrca = _mm_loadu_si128((__m128i*)&srcLine[x]);
        vsrcb = _mm_loadu_si128((__m128i*)&srcLine[x+8]);
        vsrcat = _mm_loadu_si128((__m128i*)&srcLine[x - srcStride]);
        vsrcbt = _mm_loadu_si128((__m128i*)&srcLine[x+8 - srcStride]);
      }
      vsrcat = _mm_sub_epi16(vsrcat, vsrca);   // above - cur
      vsrcbt = _mm_sub_epi16(vsrcbt, vsrcb);
      vsigns[x/16] = _mm_sign_epi8(vplusone, _mm_packs_epi16(vsrcat, vsrcbt));   // sgn(above-cur) = -sgn(cur-above)
    }
    /* filter all lines */
    for (int j = startY; j < endY ; j++)
    {
      __m128i vmaskgb = vzero;   // invalid-lane mask for the high half; set on the last group

      /* start with first pixel */
      /* filter all pixels of this line */
      for (int x = 0; x < endX; x+=16)
      {
        __m128i vsrca,vsrcb;
        __m128i vsrcad, vsrcbd;
        __m128i vdiffa,vdiffb;
        // set mask for last pixel
        if ( x >= width - 16 )
        {
          vmaskgb = vmaskge;
        }
        // load reconstruction and compute difference between original signal and reconstruction
        if (sizeof(Pel) == 1)
        {
          __m128i vsrc = _mm_loadu_si128((__m128i*)&srcLine[x]);
          __m128i vsrcd = _mm_loadu_si128((__m128i*)&srcLine[x+srcStride]);
          vsrca = _mm_unpacklo_epi8(vsrc, vzero);
          vsrcb = _mm_unpackhi_epi8(vsrc, vzero);
          vsrcad = _mm_unpacklo_epi8(vsrcd, vzero);
          vsrcbd = _mm_unpackhi_epi8(vsrcd, vzero);

          __m128i vorg = _mm_loadu_si128((__m128i*)&orgLine[x]);
          __m128i vorga = _mm_unpacklo_epi8(vorg, vzero);
          __m128i vorgb = _mm_unpackhi_epi8(vorg, vzero);
          vdiffa = _mm_sub_epi16(vorga, vsrca);
          vdiffb = _mm_sub_epi16(vorgb, vsrcb);
        }
        else
        {
          vsrca = _mm_loadu_si128((__m128i*)&srcLine[x]);
          vsrcb = _mm_loadu_si128((__m128i*)&srcLine[x+8]);
          vsrcad = _mm_loadu_si128((__m128i*)&srcLine[x + srcStride]);
          vsrcbd = _mm_loadu_si128((__m128i*)&srcLine[x+8 + srcStride]);
          __m128i vorga = _mm_loadu_si128((__m128i*)&orgLine[x]);
          __m128i vorgb = _mm_loadu_si128((__m128i*)&orgLine[x+8]);
          vdiffa = _mm_sub_epi16(vorga, vsrca);
          vdiffb = _mm_sub_epi16(vorgb, vsrcb);
        }
        // compute sign and type for 16 pixels
        vsrcad = _mm_sub_epi16(vsrca, vsrcad);   // cur - below
        vsrcbd = _mm_sub_epi16(vsrcb, vsrcbd);
        __m128i vsignd = _mm_sign_epi8(vplusone, _mm_packs_epi16(vsrcad, vsrcbd));   // sgn(cur-below)
        __m128i vsignt = vsigns[x/16];   // carried sgn(above-cur) from the previous line
        vsigns[x/16] = vsignd;           // becomes sgn(above-cur) of the next line (negated by the sub below)
        // type = sgn(cur-below) - sgn(above-cur) + 2 = sgn(cur-below) + sgn(cur-above) + 2
        __m128i vtype = _mm_add_epi8(_mm_sub_epi8(vsignd, vsignt), vbaseoffset);
        __m128i vtypea = _mm_unpacklo_epi8(vtype, vzero);
        __m128i vtypeb = _mm_unpackhi_epi8(vtype, vzero);
        // force not-available lanes to 0xffff so they match no class below
        vtypeb = _mm_or_si128(vtypeb, vmaskgb);

        // count occurrence of each type and accumulate partial sums for each type
        for ( int i = 0; i < NUM_SAO_EO_CLASSES; i++ )
        {
          __m128i vmaska = _mm_cmpeq_epi16(vtypea, vconst[i]);
          __m128i vmaskb = _mm_cmpeq_epi16(vtypeb, vconst[i]);
          __m128i vdiffma = _mm_and_si128(vmaska, vdiffa);
          __m128i vdiffmb = _mm_and_si128(vmaskb, vdiffb);
          // madd with all-ones (vconst[1]) widens and pairwise-adds 16 -> 32 bit
          vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_madd_epi16(vdiffma, vconst[1]));
          vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_madd_epi16(vdiffmb, vconst[1]));
          __m128i vcountma = _mm_srli_epi16(vmaska,15);   // 0xffff -> 1 per matching lane
          __m128i vcountmb = _mm_srli_epi16(vmaskb,15);
          vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_madd_epi16(vcountma, vconst[1]));
          vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_madd_epi16(vcountmb, vconst[1]));
        }
      }
      // next pixel line
      srcLine += srcStride;
      orgLine += orgStride;
    }
    // horizontal add of four 32 bit partial sums (results are assigned, not accumulated)
    for ( int i = 0; i < NUM_SAO_EO_CLASSES; i++ )
    {
      vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_srli_si128(vdiffsum[i], 8));
      vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_srli_si128(vdiffsum[i], 4));
      diff[i] = _mm_cvtsi128_si32(vdiffsum[i]);
      vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_srli_si128(vcountsum[i], 8));
      vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_srli_si128(vcountsum[i], 4));
      count[i] = _mm_cvtsi128_si32(vcountsum[i]);
    }
  }
  else
  {
    // scalar fallback; shift base pointers so edge types in [-2,2] index [0,4]
    diff +=2;
    count+=2;
    int x,y,edgeType;
    Pel* srcLineAbove = srcLine - srcStride;
    int8_t signDown;
    // seed the running sgn(cur-above) column buffer from the line above
    for (x=0; x<endX; x++)
    {
      signUpLine[x] = (int8_t)sgn(srcLine[x] - srcLineAbove[x]);
    }
    Pel* srcLineBelow;
    for (y=startY; y<endY; y++)
    {
      srcLineBelow = srcLine + srcStride;

      for (x=0; x<endX; x++)
      {
        signDown = (int8_t)sgn(srcLine[x] - srcLineBelow[x]);
        edgeType = signDown + signUpLine[x];
        signUpLine[x]= -signDown;   // sgn(next-cur) == -sgn(cur-next), reused on the next line

        diff [edgeType] += (orgLine[x] - srcLine[x]);
        count[edgeType] ++;
      }
      srcLine += srcStride;
      orgLine += orgStride;
    }
  }
}
// Collect SAO edge-offset class EO-135 (diagonal; above-left/below-right
// neighbours) statistics: each sample is classified by
// sgn(cur-aboveLeft) + sgn(cur-belowRight) and the (original - reconstruction)
// difference and occurrence count are accumulated per edge type.
// Both code paths process lines [1, endY) only; the first/last lines and the
// signUpLine seeding are handled by the caller. Both paths expect diff/count
// pre-offset to the centre class (see the -=2 rebase in the SIMD path and the
// [-2,2] indexing in the scalar path).
// NOTE(review): signDownLine is unused in both paths -- kept for interface
// compatibility. The SIMD path masks only one leading lane and at most 8
// trailing lanes (assumes startX <= 1, width-endX <= 8), and its unaligned
// loads touch one sample outside the block on each side of the diagonal --
// presumably covered by picture-buffer margins; confirm at the call site.
template <X86_VEXT vext>
void calcSaoStatisticsEo135_SIMD(int width,int startX,int endX,int endY,Pel* srcLine,Pel* orgLine,int srcStride,int orgStride,int64_t *count, int64_t *diff,int8_t *signUpLine,int8_t *signDownLine)
{
  if ( width % 16 == 0 )
  {
    int iNaRight=width-endX;   // not-available samples at the right border
    // caller passes diff/count offset to the centre class; rebase so the SIMD
    // class indices [0,4] land on edge types [-2,2]
    diff -=2;
    count-=2;
    __m128i vzero = _mm_set1_epi8(0);
    __m128i vplusone = _mm_set1_epi8(1);
    __m128i vbaseoffset = _mm_set1_epi8(2);   // rebases edge type from [-2,2] to [0,4]
    // store intermediate results in 32bit partial sums for each EO type
    __m128i vdiffsum[NUM_SAO_EO_CLASSES];
    __m128i vcountsum[NUM_SAO_EO_CLASSES];
    __m128i vconst[NUM_SAO_EO_CLASSES];   // per-class match constants; vconst[1] (all ones) doubles as madd multiplier
    for ( int i = 0; i < NUM_SAO_EO_CLASSES; i++ )
    {
      vdiffsum[i] = _mm_set1_epi32(0);
      vcountsum[i] = _mm_set1_epi32(0);
      vconst[i] = _mm_set1_epi16(i);
    }
    // create masks for first and last pixel row
    const unsigned short mask[16]={0,0,0,0,0,0,0,0,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff};
    __m128i vmaskgs = _mm_set1_epi16(0);
    __m128i vmaskge = _mm_set1_epi16(0);
    if ( startX )
    {
      // invalidate the very first lane of each line
      vmaskgs = _mm_insert_epi16( vmaskgs, 0xffff, 0);
    }
    if ( iNaRight )
    {
      // 0xffff in the trailing iNaRight lanes of the high 8-lane half
      vmaskge = _mm_loadu_si128((__m128i*)&mask[iNaRight]);
    }
    /* filter all lines */
    for (int j = 1; j < endY; j++)
    {
      __m128i vmaskga = vmaskgs;    // invalid-lane mask for the low half of the current group
      __m128i vmaskgb = vconst[0];  // invalid-lane mask for the high half; set on the last group
      /* start with first pixel */
      /* filter all pixels of this line */
      for (int x = 0; x < width; x+=16)
      {
        __m128i vsrca,vsrcb;
        __m128i vsrcad,vsrcbd;
        __m128i vsrcat,vsrcbt;
        __m128i vdiffa,vdiffb;
        // set mask for last pixel
        if ( x >= width - 16 )
        {
          vmaskgb = vmaskge;
        }
        if (sizeof(Pel) == 1)
        {
          // 8-bit samples: widen current, above-left and below-right loads to 16 bit
          __m128i vsrct = _mm_loadu_si128((__m128i*)&srcLine[ x-srcStride-1 ]);
          __m128i vsrc = _mm_loadu_si128((__m128i*)&srcLine[ x ]);
          __m128i vsrcd = _mm_loadu_si128((__m128i*)&srcLine[ x+srcStride+1 ]);
          vsrcat = _mm_unpacklo_epi8(vsrct, vzero);
          vsrcbt = _mm_unpackhi_epi8(vsrct, vzero);
          vsrca = _mm_unpacklo_epi8(vsrc, vzero);
          vsrcb = _mm_unpackhi_epi8(vsrc, vzero);
          vsrcad = _mm_unpacklo_epi8(vsrcd, vzero);
          vsrcbd = _mm_unpackhi_epi8(vsrcd, vzero);
          __m128i vorg = _mm_loadu_si128((__m128i*)&orgLine[x]);
          __m128i vorga = _mm_unpacklo_epi8(vorg, vzero);
          __m128i vorgb = _mm_unpackhi_epi8(vorg, vzero);
          vdiffa = _mm_sub_epi16(vorga, vsrca);
          vdiffb = _mm_sub_epi16(vorgb, vsrcb);
        }
        else
        {
          vsrcat = _mm_loadu_si128((__m128i*)&srcLine[x - 1 - srcStride ]);
          vsrcbt = _mm_loadu_si128((__m128i*)&srcLine[x - 1 + 8 - srcStride]);
          vsrca = _mm_loadu_si128((__m128i*)&srcLine[x]);
          vsrcb = _mm_loadu_si128((__m128i*)&srcLine[x+8]);
          vsrcad = _mm_loadu_si128((__m128i*)&srcLine[x + 1 + srcStride ]);
          vsrcbd = _mm_loadu_si128((__m128i*)&srcLine[x + 1 + 8 + srcStride]);
          __m128i vorga = _mm_loadu_si128((__m128i*)&orgLine[x]);
          __m128i vorgb = _mm_loadu_si128((__m128i*)&orgLine[x+8]);
          vdiffa = _mm_sub_epi16(vorga, vsrca);
          vdiffb = _mm_sub_epi16(vorgb, vsrcb);
        }
        // compute sign and type for 16 pixels
        vsrcat = _mm_sub_epi16(vsrca, vsrcat);   // cur - above-left
        vsrcbt = _mm_sub_epi16(vsrcb, vsrcbt);
        vsrcad = _mm_sub_epi16(vsrca, vsrcad);   // cur - below-right
        vsrcbd = _mm_sub_epi16(vsrcb, vsrcbd);
        __m128i vsignt = _mm_sign_epi8(vplusone, _mm_packs_epi16(vsrcat, vsrcbt));   // sgn(cur-above-left)
        __m128i vsignd = _mm_sign_epi8(vplusone, _mm_packs_epi16(vsrcad, vsrcbd));   // sgn(cur-below-right)
        __m128i vtype = _mm_add_epi8(_mm_add_epi8(vsignd, vsignt), vbaseoffset);
        __m128i vtypea = _mm_unpacklo_epi8(vtype, vzero);
        __m128i vtypeb = _mm_unpackhi_epi8(vtype, vzero);
        // force not-available lanes to 0xffff so they match no class below
        vtypea = _mm_or_si128(vtypea, vmaskga);
        vtypeb = _mm_or_si128(vtypeb, vmaskgb);
        // count occurrence of each type and accumulate partial sums for each type
        for ( int i = 0; i < NUM_SAO_EO_CLASSES; i++ )
        {
          __m128i vmaska = _mm_cmpeq_epi16(vtypea, vconst[i]);
          __m128i vmaskb = _mm_cmpeq_epi16(vtypeb, vconst[i]);
          __m128i vdiffma = _mm_and_si128(vmaska, vdiffa);
          __m128i vdiffmb = _mm_and_si128(vmaskb, vdiffb);
          // madd with all-ones (vconst[1]) widens and pairwise-adds 16 -> 32 bit
          vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_madd_epi16(vdiffma, vconst[1]));
          vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_madd_epi16(vdiffmb, vconst[1]));
          __m128i vcountma = _mm_srli_epi16(vmaska,15);   // 0xffff -> 1 per matching lane
          __m128i vcountmb = _mm_srli_epi16(vmaskb,15);
          vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_madd_epi16(vcountma, vconst[1]));
          vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_madd_epi16(vcountmb, vconst[1]));
        }
        // clear mask for first pixel
        vmaskga = vconst[0];
      }
      // next pixel line
      srcLine += srcStride;
      orgLine += orgStride;
    }
    // horizontal add of four 32 bit partial sums (accumulated into existing statistics)
    for ( int i = 0; i < NUM_SAO_EO_CLASSES; i++ )
    {
      vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_srli_si128(vdiffsum[i], 8));
      vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_srli_si128(vdiffsum[i], 4));
      diff[i] += _mm_cvtsi128_si32(vdiffsum[i]);
      vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_srli_si128(vcountsum[i], 8));
      vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_srli_si128(vcountsum[i], 4));
      count[i] += _mm_cvtsi128_si32(vcountsum[i]);
    }
  }
  else
  {
    // scalar fallback; edge types in [-2,2] index the pre-offset diff/count
    int x,y,edgeType;
    int8_t signDown;
    //middle lines
    for (y=1; y<endY; y++)
    {
      int8_t* pTopSign = NULL;
      Pel* srcLineBelow = srcLine + srcStride;
      // sign for the column that becomes the above-left neighbour on the next line
      int8_t iTmpSign = (int8_t)sgn( srcLineBelow[startX] - srcLine[startX-1] );
      for ( x=startX,pTopSign = &signUpLine[startX]; x<endX; x++ , pTopSign++ )
      {
        signDown = (int8_t)sgn(srcLine[x] - srcLineBelow[x+1]);
        edgeType = signDown + *pTopSign;
        *pTopSign = iTmpSign;      // store sign for next line's above-left neighbour
        iTmpSign = -signDown;      // sgn(below-cur) of column x+1 == -sgn(cur-below)
        diff [edgeType] += (orgLine[x] - srcLine[x]);
        count[edgeType] ++;
      }
      srcLine += srcStride;
      orgLine += orgStride;
    }
  }
}
// Gather SAO Edge-Offset class EO_45 (diagonal "/": neighbors are top-right and
// bottom-left) statistics for one CTU region: for every interior pixel the edge
// class is sgn(cur - topRight) + sgn(cur - bottomLeft), and per class the
// count of pixels and the sum of (org - rec) differences are accumulated.
//
// Params:
//   width                 full line width in pixels; a multiple of 16 selects the SIMD path
//   startX, endX          first/one-past-last pixel column to include (border exclusion)
//   endY                  one-past-last line index (lines 1..endY-1 are processed)
//   srcLine, orgLine      reconstructed / original samples of the first processed line
//   srcStride, orgStride  line strides in Pel units
//   count, diff           per-edge-class accumulators; both paths index them with
//                         offsets in [-2, +2] relative to the passed pointers
//                         (SIMD path rebases via diff -= 2 / count -= 2, the scalar
//                         path indexes diff[edgeType] with edgeType in [-2, +2]),
//                         so the caller must pass pointers to the middle element
//   signUpLine            scalar path only: running sgn() column state carried
//                         from line to line (updated in place)
template <X86_VEXT vext>
void calcSaoStatisticsEo45_SIMD(int width,int startX,int endX,int endY,Pel* srcLine,Pel* orgLine,int srcStride,int orgStride,int64_t *count, int64_t *diff,int8_t *signUpLine)
{
  Pel* pRec = srcLine;
  Pel* pOrg = orgLine;
  Pel* srcLineBelow = srcLine + srcStride;
  if (width % 16 == 0 )
  {
    //const unsigned short mask[16]={0,0,0,0,0,0,0,0,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff};
    // number of not-available pixels at the right edge of each line
    int iNaRight=width-endX;
    // rebase so edge classes 0..NUM_SAO_EO_CLASSES-1 map onto caller indices -2..+2
    diff -=2;
    count-=2;
    __m128i vzero = _mm_set1_epi8(0);
    __m128i vplusone = _mm_set1_epi8(1);
    __m128i vbaseoffset = _mm_set1_epi8(2);   // shifts sign sum [-2,2] into class range [0,4]
    // store intermediate results in 32bit partial sums for each EO type
    __m128i vdiffsum[NUM_SAO_EO_CLASSES];
    __m128i vcountsum[NUM_SAO_EO_CLASSES];
    __m128i vconst[NUM_SAO_EO_CLASSES];       // vconst[i] = 16-bit lanes of value i; vconst[1] doubles as madd multiplier
    for ( int i = 0; i < NUM_SAO_EO_CLASSES; i++ )
    {
      vdiffsum[i] = _mm_set1_epi32(0);
      vcountsum[i] = _mm_set1_epi32(0);
      vconst[i] = _mm_set1_epi16(i);
    }
    // create masks for first and last pixel row
    // masked-out lanes are OR'ed to 0xffff, which never equals any class constant 0..4,
    // so those pixels are excluded from every count/diff accumulation below
    const unsigned short mask[16]={0,0,0,0,0,0,0,0,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff};
    __m128i vmaskgs = _mm_set1_epi16(0);
    __m128i vmaskge = _mm_set1_epi16(0);
    if ( startX )
    {
      // exclude the very first pixel of each line
      vmaskgs = _mm_insert_epi16( vmaskgs, 0xffff, 0);
    }
    if ( iNaRight )
    {
      // sliding window into mask[] yields 0xffff in exactly the last iNaRight lanes
      vmaskge = _mm_loadu_si128((__m128i*)&mask[iNaRight]);
    }
    /* filter all lines */
    for (int j = 1; j < endY; j++)
    {
      __m128i vmaskga = vmaskgs;   // active only for the first 16-pixel group of the line
      __m128i vmaskgb = vconst[0];
      /* start with first pixel */
      /* filter all pixels of this line */
      for (int x = 0; x < width; x+=16)
      {
        __m128i vsrca,vsrcb;     // current pixels, low/high 8 of the 16-pixel group (16-bit lanes)
        __m128i vsrcad,vsrcbd;   // bottom-left neighbors
        __m128i vsrcat,vsrcbt;   // top-right neighbors
        __m128i vdiffa,vdiffb;   // org - rec per pixel
        // set mask for last pixel
        if ( x >= width - 16 )
        {
          vmaskgb = vmaskge;
        }
        if (sizeof(Pel) == 1)
        {
          // 8-bit samples: load 16 bytes and widen to two 8x16-bit halves
          __m128i vsrct = _mm_loadu_si128((__m128i*)&pRec[ x-srcStride+1 ]);
          __m128i vsrc = _mm_loadu_si128((__m128i*)&pRec[ x ]);
          __m128i vsrcd = _mm_loadu_si128((__m128i*)&pRec[ x+srcStride-1 ]);
          vsrcat = _mm_unpacklo_epi8(vsrct, vzero);
          vsrcbt = _mm_unpackhi_epi8(vsrct, vzero);
          vsrca = _mm_unpacklo_epi8(vsrc, vzero);
          vsrcb = _mm_unpackhi_epi8(vsrc, vzero);
          vsrcad = _mm_unpacklo_epi8(vsrcd, vzero);
          vsrcbd = _mm_unpackhi_epi8(vsrcd, vzero);
          __m128i vorg = _mm_loadu_si128((__m128i*)&pOrg[x]);
          __m128i vorga = _mm_unpacklo_epi8(vorg, vzero);
          __m128i vorgb = _mm_unpackhi_epi8(vorg, vzero);
          vdiffa = _mm_sub_epi16(vorga, vsrca);
          vdiffb = _mm_sub_epi16(vorgb, vsrcb);
        }
        else
        {
          // 16-bit samples: two 8-lane loads per row; +1-srcStride = top-right, -1+srcStride = bottom-left
          vsrcat = _mm_loadu_si128((__m128i*)&pRec[x + 1 - srcStride ]);
          vsrcbt = _mm_loadu_si128((__m128i*)&pRec[x + 1 + 8 - srcStride]);
          vsrca = _mm_loadu_si128((__m128i*)&pRec[x]);
          vsrcb = _mm_loadu_si128((__m128i*)&pRec[x+8]);
          vsrcad = _mm_loadu_si128((__m128i*)&pRec[x - 1 + srcStride ]);
          vsrcbd = _mm_loadu_si128((__m128i*)&pRec[x - 1 + 8 + srcStride]);
          __m128i vorga = _mm_loadu_si128((__m128i*)&pOrg[x]);
          __m128i vorgb = _mm_loadu_si128((__m128i*)&pOrg[x+8]);
          vdiffa = _mm_sub_epi16(vorga, vsrca);
          vdiffb = _mm_sub_epi16(vorgb, vsrcb);
        }
        // compute sign and type for 16 pixels
        vsrcat = _mm_sub_epi16(vsrca, vsrcat);
        vsrcbt = _mm_sub_epi16(vsrcb, vsrcbt);
        vsrcad = _mm_sub_epi16(vsrca, vsrcad);
        vsrcbd = _mm_sub_epi16(vsrcb, vsrcbd);
        // _mm_sign_epi8(1, x) yields sgn(x) per byte; packs_epi16 saturates the 16-bit
        // differences to int8, which preserves their sign
        __m128i vsignt = _mm_sign_epi8(vplusone, _mm_packs_epi16(vsrcat, vsrcbt));
        __m128i vsignd = _mm_sign_epi8(vplusone, _mm_packs_epi16(vsrcad, vsrcbd));
        __m128i vtype = _mm_add_epi8(_mm_add_epi8(vsignd, vsignt), vbaseoffset);
        __m128i vtypea = _mm_unpacklo_epi8(vtype, vzero);
        __m128i vtypeb = _mm_unpackhi_epi8(vtype, vzero);
        // force masked-out (unavailable) pixels to type 0xffff so they match no class
        vtypea = _mm_or_si128(vtypea, vmaskga);
        vtypeb = _mm_or_si128(vtypeb, vmaskgb);
        // count occurrence of each type and accumulate partial sums for each type
        for ( int i = 0; i < NUM_SAO_EO_CLASSES; i++ )
        {
          __m128i vmaska = _mm_cmpeq_epi16(vtypea, vconst[i]);
          __m128i vmaskb = _mm_cmpeq_epi16(vtypeb, vconst[i]);
          __m128i vdiffma = _mm_and_si128(vmaska, vdiffa);
          __m128i vdiffmb = _mm_and_si128(vmaskb, vdiffb);
          // madd with all-ones 16-bit lanes (vconst[1]) pairwise-widens int16 -> int32 sums
          vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_madd_epi16(vdiffma, vconst[1]));
          vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_madd_epi16(vdiffmb, vconst[1]));
          // shift 0xffff compare mask down to 0/1 per lane before widening/summing
          __m128i vcountma = _mm_srli_epi16(vmaska,15);
          __m128i vcountmb = _mm_srli_epi16(vmaskb,15);
          vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_madd_epi16(vcountma, vconst[1]));
          vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_madd_epi16(vcountmb, vconst[1]));
        }
        // clear mask for first pixel
        vmaskga = vconst[0];
      }
      // next pixel line
      pRec += srcStride;
      pOrg += orgStride;
    }

    // horizontal add of four 32 bit partial sums
    for ( int i = 0; i < NUM_SAO_EO_CLASSES; i++ )
    {
      vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_srli_si128(vdiffsum[i], 8));
      vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_srli_si128(vdiffsum[i], 4));
      diff[i] += _mm_cvtsi128_si32(vdiffsum[i]);
      vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_srli_si128(vcountsum[i], 8));
      vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_srli_si128(vcountsum[i], 4));
      count[i] += _mm_cvtsi128_si32(vcountsum[i]);
    }
  }
  else
  {
    // Scalar fallback for widths not a multiple of 16.
    int x,y,edgeType;
    int8_t signDown;
    //middle lines
    for (y=1; y<endY; y++)
    {
      srcLineBelow = srcLine + srcStride;
      for(x=startX; x<endX; x++)
      {
        // signUpLine[x] caches -sgn(cur - topRight) computed on the previous line,
        // so edgeType = sgn(cur - bottomLeft) + sgn(cur - topRight) here
        signDown = (int8_t)sgn(srcLine[x] - srcLineBelow[x-1]);
        edgeType = signDown + signUpLine[x];
        diff [edgeType] += (orgLine[x] - srcLine[x]);
        count[edgeType] ++;
        // bottom-left neighbor of this pixel is the top-right neighbor of
        // pixel (x-1) on the next line
        signUpLine[x-1] = -signDown;
      }
      // seed the rightmost column state for the next line
      signUpLine[endX-1] = (int8_t)sgn(srcLineBelow[endX-1] - srcLine[endX]);
      srcLine += srcStride;
      orgLine += orgStride;
    }
  }
}
1622 | | template <X86_VEXT vext> |
1623 | | void SampleAdaptiveOffset::_initSampleAdaptiveOffsetX86() |
1624 | 0 | { |
1625 | 0 | offsetBlock= offsetBlock_SIMD<vext>; |
1626 | 0 | calcSaoStatisticsEo0 = calcSaoStatisticsEo0_SIMD<vext>; |
1627 | 0 | calcSaoStatisticsEo90 = calcSaoStatisticsEo90_SIMD<vext>; |
1628 | 0 | calcSaoStatisticsEo135 = calcSaoStatisticsEo135_SIMD<vext>; |
1629 | 0 | calcSaoStatisticsEo45 = calcSaoStatisticsEo45_SIMD<vext>; |
1630 | 0 | calcSaoStatisticsBo = calcSaoStatisticsBo_SIMD<vext>; |
1631 | |
|
1632 | 0 | } Unexecuted instantiation: void vvenc::SampleAdaptiveOffset::_initSampleAdaptiveOffsetX86<(vvenc::x86_simd::X86_VEXT)1>() Unexecuted instantiation: void vvenc::SampleAdaptiveOffset::_initSampleAdaptiveOffsetX86<(vvenc::x86_simd::X86_VEXT)4>() |
1633 | | |
1634 | | template void SampleAdaptiveOffset::_initSampleAdaptiveOffsetX86<SIMDX86>(); |
1635 | | |
1636 | | } // namespace vvenc |
1637 | | |
1638 | | //! \} |
1639 | | |
1640 | | #endif // TARGET_SIMD_X86 |
1641 | | |