/src/libmpeg2/common/x86/icv_variance_ssse3.c
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | * |
3 | | * Copyright (C) 2015 The Android Open Source Project |
4 | | * |
5 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
6 | | * you may not use this file except in compliance with the License. |
7 | | * You may obtain a copy of the License at: |
8 | | * |
9 | | * http://www.apache.org/licenses/LICENSE-2.0 |
10 | | * |
11 | | * Unless required by applicable law or agreed to in writing, software |
12 | | * distributed under the License is distributed on an "AS IS" BASIS, |
13 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | * See the License for the specific language governing permissions and |
15 | | * limitations under the License. |
16 | | * |
17 | | ***************************************************************************** |
18 | | * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
19 | | */ |
20 | | /** |
21 | | ******************************************************************************* |
22 | | * @file |
23 | | * icv_variance_ssse3.c |
24 | | * |
25 | | * @brief |
26 | | * This file contains the functions to compute variance |
27 | | * |
28 | | * @author |
29 | | * Ittiam |
30 | | * |
31 | | * @par List of Functions: |
32 | | * icv_variance_8x4_ssse3() |
33 | | * |
34 | | * @remarks |
35 | | * None |
36 | | * |
37 | | ******************************************************************************* |
38 | | */ |
39 | | /*****************************************************************************/ |
40 | | /* File Includes */ |
41 | | /*****************************************************************************/ |
42 | | /* System include files */ |
43 | | #include <stdio.h> |
44 | | #include <stdint.h> |
45 | | #include <string.h> |
46 | | #include <stdlib.h> |
47 | | #include <assert.h> |
48 | | #include <immintrin.h> |
49 | | |
50 | | /* User include files */ |
51 | | #include "icv_datatypes.h" |
52 | | #include "icv_macros.h" |
53 | | #include "icv_platform_macros.h" |
54 | | #include "icv.h" |
55 | | |
56 | | /** |
57 | | ******************************************************************************* |
58 | | * |
59 | | * @brief |
60 | | * Computes variance of a given 8x4 block |
61 | | * |
62 | | * @par Description |
63 | | * Compute variance of a given 8x4 block |
64 | | * |
65 | | * @param[in] pu1_src |
66 | | * Source |
67 | | * |
68 | | * @param[in] src_strd |
69 | | * Source stride |
70 | | * |
71 | | * @param[in] wd |
72 | | * Assumed to be 8 |
73 | | * |
74 | | * @param[in] ht |
75 | | * Assumed to be 4 |
76 | | * |
77 | | * @returns |
78 | | * Variance |
79 | | * |
80 | | * @remarks |
81 | | * |
82 | | ******************************************************************************* |
83 | | */ |
84 | | WORD32 icv_variance_8x4_ssse3(UWORD8 *pu1_src, WORD32 src_strd, WORD32 wd, WORD32 ht) |
85 | 42.8M | { |
86 | 42.8M | WORD32 sum; |
87 | 42.8M | WORD32 sum_sqr; |
88 | 42.8M | WORD32 blk_sz; |
89 | 42.8M | WORD32 vrnc; |
90 | 42.8M | __m128 src_r0, src_r1; |
91 | 42.8M | __m128i ssrc_r0, ssrc_r1, ssrc_r2, ssrc_r3; |
92 | 42.8M | __m128i sum_r0, sum_r1; |
93 | 42.8M | __m128i sqr_r0, sqr_r1, sqr_r2, sqr_r3; |
94 | 42.8M | __m128i vsum, vsum_sqr; |
95 | 42.8M | __m128i zero; |
96 | 42.8M | UNUSED(wd); |
97 | 42.8M | UNUSED(ht); |
98 | | |
99 | 42.8M | ASSERT(wd == 8); |
100 | 42.8M | ASSERT(ht == 4); |
101 | | |
102 | 42.9M | sum = 0; |
103 | 42.9M | sum_sqr = 0; |
104 | | |
105 | 42.9M | blk_sz = 8 * 4; |
106 | | |
107 | 42.9M | zero = _mm_setzero_si128(); |
108 | | |
109 | | /* Load source */ |
110 | 42.9M | src_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src)); |
111 | 42.9M | pu1_src += src_strd; |
112 | | |
113 | 42.9M | src_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src)); |
114 | 42.9M | pu1_src += src_strd; |
115 | | |
116 | 42.9M | src_r0 = _mm_loadh_pi (src_r0, (__m64 *) (pu1_src)); |
117 | 42.9M | pu1_src += src_strd; |
118 | | |
119 | 42.9M | src_r1 = _mm_loadh_pi (src_r1, (__m64 *) (pu1_src)); |
120 | 42.9M | pu1_src += src_strd; |
121 | | |
122 | | /* Compute sum of all elements */ |
123 | | /* Use SAD with 0, since there is no pairwise addition */ |
124 | 42.9M | sum_r0 = _mm_sad_epu8((__m128i)src_r0, zero); |
125 | 42.9M | sum_r1 = _mm_sad_epu8((__m128i)src_r1, zero); |
126 | | |
127 | | /* Accumulate SAD */ |
128 | 42.9M | vsum = _mm_add_epi64(sum_r0, sum_r1); |
129 | 42.9M | vsum = _mm_add_epi64(vsum, _mm_srli_si128(vsum, 8)); |
130 | | |
131 | 42.9M | sum = _mm_cvtsi128_si32(vsum); |
132 | | |
133 | | /* Unpack to 16 bits */ |
134 | 42.9M | ssrc_r0 = _mm_unpacklo_epi8((__m128i)src_r0, zero); |
135 | 42.9M | ssrc_r1 = _mm_unpacklo_epi8((__m128i)src_r1, zero); |
136 | 42.9M | ssrc_r2 = _mm_unpackhi_epi8((__m128i)src_r0, zero); |
137 | 42.9M | ssrc_r3 = _mm_unpackhi_epi8((__m128i)src_r1, zero); |
138 | | |
139 | | /* Compute sum of squares */ |
140 | 42.9M | sqr_r0 = _mm_madd_epi16(ssrc_r0, ssrc_r0); |
141 | 42.9M | sqr_r1 = _mm_madd_epi16(ssrc_r1, ssrc_r1); |
142 | 42.9M | sqr_r2 = _mm_madd_epi16(ssrc_r2, ssrc_r2); |
143 | 42.9M | sqr_r3 = _mm_madd_epi16(ssrc_r3, ssrc_r3); |
144 | | |
145 | 42.9M | vsum_sqr = _mm_add_epi32(sqr_r0, sqr_r1); |
146 | 42.9M | vsum_sqr = _mm_add_epi32(vsum_sqr, sqr_r2); |
147 | 42.9M | vsum_sqr = _mm_add_epi32(vsum_sqr, sqr_r3); |
148 | | |
149 | 42.9M | vsum_sqr = _mm_add_epi32(vsum_sqr, _mm_srli_si128(vsum_sqr, 8)); |
150 | 42.9M | vsum_sqr = _mm_add_epi32(vsum_sqr, _mm_srli_si128(vsum_sqr, 4)); |
151 | 42.9M | sum_sqr = _mm_cvtsi128_si32(vsum_sqr); |
152 | | |
153 | | /* Compute variance */ |
154 | 42.9M | vrnc = ((sum_sqr * blk_sz) - (sum * sum)) / (blk_sz * blk_sz); |
155 | | |
156 | 42.9M | return vrnc; |
157 | 42.8M | } |
158 | | |