/src/libvpx/vpx_dsp/x86/avg_pred_avx2.c
/*
 *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <immintrin.h>

#include "./vpx_dsp_rtcd.h"

void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
                            int height, const uint8_t *ref, int ref_stride) {
  int row = 0;
  // comp_pred and pred must be 32 byte aligned.
  assert(((intptr_t)comp_pred % 32) == 0);
  assert(((intptr_t)pred % 32) == 0);

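  // Each branch below widens the per-iteration work so that full 256-bit
  // registers are used: four rows for widths 8 and 16, two rows for width
  // 32, and 64 bytes at a time for larger widths.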
  if (width == 8) {
    assert(height % 4 == 0);
    do {
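      // Gather four 8-byte rows of ref: two rows per 128-bit half via
      // low/high 64-bit loads, then combine the halves so a single
      // _mm256_avg_epu8 averages all four rows against pred.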
      const __m256i p = _mm256_load_si256((const __m256i *)pred);
      const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref);
      const __m128i r_1 =
          _mm_loadl_epi64((const __m128i *)(ref + 2 * ref_stride));

      const __m128i r1 = _mm_castps_si128(_mm_loadh_pi(
          _mm_castsi128_ps(r_0), (const __m64 *)(ref + ref_stride)));
      const __m128i r2 = _mm_castps_si128(_mm_loadh_pi(
          _mm_castsi128_ps(r_1), (const __m64 *)(ref + 3 * ref_stride)));

      const __m256i ref_0123 =
          _mm256_inserti128_si256(_mm256_castsi128_si256(r1), r2, 1);
      const __m256i avg = _mm256_avg_epu8(p, ref_0123);

      _mm256_store_si256((__m256i *)comp_pred, avg);

      row += 4;
      pred += 32;
      comp_pred += 32;
      ref += 4 * ref_stride;
    } while (row < height);
  } else if (width == 16) {
    assert(height % 4 == 0);
    do {
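      // Four 16-byte rows per iteration: rows 0/1 and rows 2/3 of ref are
      // each packed into one 256-bit register to match the contiguous
      // layout of pred.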
      const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred);
      const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32));
      const __m256i tmp0 =
          _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)ref));
      const __m256i ref_0 = _mm256_inserti128_si256(
          tmp0, _mm_loadu_si128((const __m128i *)(ref + ref_stride)), 1);
      const __m256i tmp1 = _mm256_castsi128_si256(
          _mm_loadu_si128((const __m128i *)(ref + 2 * ref_stride)));
      const __m256i ref_1 = _mm256_inserti128_si256(
          tmp1, _mm_loadu_si128((const __m128i *)(ref + 3 * ref_stride)), 1);
      const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
      const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
      _mm256_store_si256((__m256i *)comp_pred, average_0);
      _mm256_store_si256((__m256i *)(comp_pred + 32), average_1);

      row += 4;
      pred += 64;
      comp_pred += 64;
      ref += 4 * ref_stride;
    } while (row < height);
  } else if (width == 32) {
    assert(height % 2 == 0);
    do {
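      // Each row is exactly one 256-bit vector; process two rows per
      // iteration. ref may be unaligned, so it uses loadu.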
      const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred);
      const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32));
      const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)ref);
      const __m256i ref_1 =
          _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
      const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
      const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
      _mm256_store_si256((__m256i *)comp_pred, average_0);
      _mm256_store_si256((__m256i *)(comp_pred + 32), average_1);

      row += 2;
      pred += 64;
      comp_pred += 64;
      ref += 2 * ref_stride;
    } while (row < height);
  } else if (width % 64 == 0) {
    do {
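      // One row at a time, averaging 64 bytes (two 256-bit registers) per
      // inner-loop step across the row.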
      int x;
      for (x = 0; x < width; x += 64) {
        const __m256i pred_0 = _mm256_load_si256((const __m256i *)(pred + x));
        const __m256i pred_1 =
            _mm256_load_si256((const __m256i *)(pred + x + 32));
        const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)(ref + x));
        const __m256i ref_1 =
            _mm256_loadu_si256((const __m256i *)(ref + x + 32));
        const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
        const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
        _mm256_store_si256((__m256i *)(comp_pred + x), average_0);
        _mm256_store_si256((__m256i *)(comp_pred + x + 32), average_1);
      }
      row++;
      pred += width;
      comp_pred += width;
      ref += ref_stride;
    } while (row < height);
  } else {
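    // Fall back to the SSE2 implementation for widths not handled above.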
    vpx_comp_avg_pred_sse2(comp_pred, pred, width, height, ref, ref_stride);
  }
}