/src/libmpeg2/common/x86/ideint_cac_ssse3.c
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | * |
3 | | * Copyright (C) 2015 The Android Open Source Project |
4 | | * |
5 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
6 | | * you may not use this file except in compliance with the License. |
7 | | * You may obtain a copy of the License at: |
8 | | * |
9 | | * http://www.apache.org/licenses/LICENSE-2.0 |
10 | | * |
11 | | * Unless required by applicable law or agreed to in writing, software |
12 | | * distributed under the License is distributed on an "AS IS" BASIS, |
13 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | * See the License for the specific language governing permissions and |
15 | | * limitations under the License. |
16 | | * |
17 | | ***************************************************************************** |
18 | | * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
19 | | */ |
20 | | /** |
21 | | ******************************************************************************* |
22 | | * @file |
23 | | * ideint_cac_ssse3.c |
24 | | * |
25 | | * @brief |
26 | | * This file includes the definitions of the combing artifact check function |
27 | | * of the de-interlacer and some variants of it. |
28 | | * |
29 | | * @author |
30 | | * Ittiam |
31 | | * |
32 | | * @par List of Functions: |
33 | | * ideint_cac_8x8_ssse3() |
34 | | * |
35 | | * |
36 | | * @remarks |
37 | | * In the de-interlacer workspace, cac is not a separate assembly module as |
38 | | * it comes along with the de_int_decision() function. But in C-Model, to |
39 | | * keep the things cleaner, it was made to be a separate function during |
40 | | * cac experiments long after the assembly was written by Mudit. |
41 | | * |
42 | | ******************************************************************************* |
43 | | */ |
44 | | /*****************************************************************************/ |
45 | | /* File Includes */ |
46 | | /*****************************************************************************/ |
47 | | /* System include files */ |
48 | | #include <stdio.h> |
49 | | #include <stdint.h> |
50 | | #include <string.h> |
51 | | #include <stdlib.h> |
52 | | #include <immintrin.h> |
53 | | |
54 | | /* User include files */ |
55 | | #include "icv_datatypes.h" |
56 | | #include "icv_macros.h" |
57 | | #include "icv.h" |
58 | | #include "icv_variance.h" |
59 | | #include "icv_sad.h" |
60 | | #include "ideint.h" |
61 | | #include "ideint_defs.h" |
62 | | #include "ideint_structs.h" |
63 | | #include "ideint_cac.h" |
64 | | |
65 | | /** |
66 | | ******************************************************************************* |
67 | | * |
68 | | * @brief |
69 | | * Combing artifact check function for 8x8 block |
70 | | * |
71 | | * @par Description |
72 | | * Determines CAC for an 8x8 block (4 rows from each field) by evaluating its two 4-column halves inline |
73 | | * |
74 | | * @param[in] pu1_top |
75 | | * Top field |
76 | | * |
77 | | * @param[in] pu1_bot |
78 | | * Bottom field |
79 | | * |
80 | | * @param[in] top_strd |
81 | | * Top field Stride |
82 | | * |
83 | | * @param[in] bot_strd |
84 | | * Bottom field stride |
85 | | * |
86 | | * @returns |
87 | | * combing artifact flag (1 = detected, 0 = not detected) |
88 | | * |
89 | | * @remarks |
90 | | * |
91 | | ******************************************************************************* |
92 | | */ |
93 | | WORD32 ideint_cac_8x8_ssse3(UWORD8 *pu1_top, |
94 | | UWORD8 *pu1_bot, |
95 | | WORD32 top_strd, |
96 | | WORD32 bot_strd) |
97 | 30.2M | { |
98 | 30.2M | WORD32 ca; /* combing artifact result: 1 if either 4-column half is flagged, else 0 */ |
99 | 30.2M | WORD32 i; |
100 | 30.2M | WORD32 adj[2] = {0}; /* top-vs-bottom differences; [0] = columns 0-3, [1] = columns 4-7 */ |
101 | 30.2M | WORD32 alt[2] = {0}; /* row-vs-next-row differences; [0] = columns 0-3, [1] = columns 4-7 */ |
102 | 30.2M | WORD32 sum_1, sum_2, sum_3, sum_4; |
103 | 30.2M | WORD32 sum_diff, diff_sum; |
104 | | /* top[]/bot[]: 4 rows per field widened to 16-bit lanes; sum_t[]/sum_b[]: per-half row sums */ |
105 | 30.2M | __m128i top[4]; |
106 | 30.2M | __m128i bot[4]; |
107 | 30.2M | __m128i sum_t[4]; |
108 | 30.2M | __m128i sum_b[4]; |
109 | 30.2M | __m128i zero; |
110 | | |
111 | | |
112 | 30.2M | zero = _mm_setzero_si128(); |
113 | | |
114 | 132M | for(i = 0; i < 4; i++) |
115 | 102M | { |
116 | | /* Load 8 bytes of the current top-field row, then advance by the top stride */ |
117 | 102M | top[i] = (__m128i)_mm_loadl_epi64((__m128i *) (pu1_top)); |
118 | 102M | pu1_top += top_strd; |
119 | | |
120 | | /* Load 8 bytes of the current bottom-field row, then advance by the bottom stride */ |
121 | 102M | bot[i] = (__m128i)_mm_loadl_epi64((__m128i *) (pu1_bot)); |
122 | 102M | pu1_bot += bot_strd; |
123 | | |
124 | | /* Interleave with zero: each of the 8 bytes becomes a zero-extended 16-bit lane */ |
125 | 102M | top[i] = _mm_unpacklo_epi8(top[i], zero); |
126 | 102M | bot[i] = _mm_unpacklo_epi8(bot[i], zero); |
127 | | |
128 | | /* Row sums: SAD against zero adds the bytes of each 64-bit half, giving the sum of pixels 0-3 in the low half and of pixels 4-7 in the high half */ |
129 | 102M | sum_t[i] = _mm_sad_epu8(top[i], zero); |
130 | 102M | sum_b[i] = _mm_sad_epu8(bot[i], zero); |
131 | 102M | } |
132 | | |
133 | | /* Row based alt and adj, two rows per iteration: adj accumulates abs(top sum - bottom sum) of the same row index when it reaches RSUM_CSUM_THRESH; alt accumulates abs(row i - row i+1) within each field */ |
134 | 81.4M | for(i = 0; i < 4; i += 2) |
135 | 51.1M | { |
136 | 51.1M | sum_1 = _mm_cvtsi128_si32(sum_t[i + 0]); |
137 | 51.1M | sum_2 = _mm_cvtsi128_si32(sum_b[i + 0]); |
138 | 51.1M | sum_diff = ABS_DIF(sum_1, sum_2); |
139 | 51.1M | if(sum_diff >= RSUM_CSUM_THRESH) |
140 | 52.3k | adj[0] += sum_diff; |
141 | | |
142 | 51.1M | sum_3 = _mm_cvtsi128_si32(sum_t[i + 1]); |
143 | 51.1M | sum_4 = _mm_cvtsi128_si32(sum_b[i + 1]); |
144 | 51.1M | sum_diff = ABS_DIF(sum_3, sum_4); |
145 | 51.1M | if(sum_diff >= RSUM_CSUM_THRESH) |
146 | 59.6k | adj[0] += sum_diff; |
147 | | |
148 | 51.1M | alt[0] += ABS_DIF(sum_1, sum_3); |
149 | 51.1M | alt[0] += ABS_DIF(sum_2, sum_4); |
150 | | /* Same computation for columns 4-7: shift the upper 64 bits of each row sum down before extracting */ |
151 | 51.1M | sum_1 = _mm_cvtsi128_si32(_mm_srli_si128(sum_t[i + 0], 8)); |
152 | 51.1M | sum_2 = _mm_cvtsi128_si32(_mm_srli_si128(sum_b[i + 0], 8)); |
153 | 51.1M | sum_diff = ABS_DIF(sum_1, sum_2); |
154 | 51.1M | if(sum_diff >= RSUM_CSUM_THRESH) |
155 | 51.2k | adj[1] += sum_diff; |
156 | | |
157 | 51.1M | sum_3 = _mm_cvtsi128_si32(_mm_srli_si128(sum_t[i + 1], 8)); |
158 | 51.1M | sum_4 = _mm_cvtsi128_si32(_mm_srli_si128(sum_b[i + 1], 8)); |
159 | 51.1M | sum_diff = ABS_DIF(sum_3, sum_4); |
160 | 51.1M | if(sum_diff >= RSUM_CSUM_THRESH) |
161 | 57.2k | adj[1] += sum_diff; |
162 | | |
163 | 51.1M | alt[1] += ABS_DIF(sum_1, sum_3); |
164 | 51.1M | alt[1] += ABS_DIF(sum_2, sum_4); |
165 | 51.1M | } |
166 | | |
167 | | /* Column based adj: per-column rounded average of the 4 top rows versus the 4 bottom rows; differences of at least (RSUM_CSUM_THRESH >> 2) are summed per half and scaled by 4 */ |
168 | 30.2M | { |
169 | 30.2M | __m128i avg1, avg2; |
170 | 30.2M | __m128i top_avg, bot_avg; |
171 | 30.2M | __m128i min, max, diff, thresh; |
172 | 30.2M | __m128i mask; |
173 | 30.2M | avg1 = _mm_avg_epu8(top[0], top[1]); |
174 | 30.2M | avg2 = _mm_avg_epu8(top[2], top[3]); |
175 | 30.2M | top_avg = _mm_avg_epu8(avg1, avg2); |
176 | | |
177 | 30.2M | avg1 = _mm_avg_epu8(bot[0], bot[1]); |
178 | 30.2M | avg2 = _mm_avg_epu8(bot[2], bot[3]); |
179 | 30.2M | bot_avg = _mm_avg_epu8(avg1, avg2); |
180 | | /* Absolute difference per column computed as max - min; upper bytes of the lanes are zero so the 16-bit subtract is exact */ |
181 | 30.2M | min = _mm_min_epu8(top_avg, bot_avg); |
182 | 30.2M | max = _mm_max_epu8(top_avg, bot_avg); |
183 | | |
184 | 30.2M | diff = _mm_sub_epi16(max, min); |
185 | 30.2M | thresh = _mm_set1_epi16((RSUM_CSUM_THRESH >> 2) - 1); |
186 | | /* Zero out lanes whose difference does not exceed thresh */ |
187 | 30.2M | mask = _mm_cmpgt_epi16(diff, thresh); |
188 | 30.2M | diff = _mm_and_si128(diff, mask); |
189 | | /* Lanes 0-3 are columns 0-3 */ |
190 | 30.2M | diff_sum = _mm_extract_epi16(diff, 0); |
191 | 30.2M | diff_sum += _mm_extract_epi16(diff, 1); |
192 | 30.2M | diff_sum += _mm_extract_epi16(diff, 2); |
193 | 30.2M | diff_sum += _mm_extract_epi16(diff, 3); |
194 | | |
195 | 30.2M | adj[0] += diff_sum << 2; |
196 | | /* Lanes 4-7 are columns 4-7 */ |
197 | 30.2M | diff_sum = _mm_extract_epi16(diff, 4); |
198 | 30.2M | diff_sum += _mm_extract_epi16(diff, 5); |
199 | 30.2M | diff_sum += _mm_extract_epi16(diff, 6); |
200 | 30.2M | diff_sum += _mm_extract_epi16(diff, 7); |
201 | | |
202 | 30.2M | adj[1] += diff_sum << 2; |
203 | | |
204 | 30.2M | } |
205 | | |
206 | | /* Column based alt: per-column rounded average of rows 0 and 2 of both fields versus rows 1 and 3 of both fields; the absolute differences are summed per half and scaled by 4 */ |
207 | 30.2M | { |
208 | 30.2M | __m128i avg1, avg2; |
209 | 30.2M | __m128i even_avg, odd_avg, diff; |
210 | 30.2M | avg1 = _mm_avg_epu8(top[0], bot[0]); |
211 | 30.2M | avg2 = _mm_avg_epu8(top[2], bot[2]); |
212 | 30.2M | even_avg = _mm_avg_epu8(avg1, avg2); |
213 | | |
214 | 30.2M | avg1 = _mm_avg_epu8(top[1], bot[1]); |
215 | 30.2M | avg2 = _mm_avg_epu8(top[3], bot[3]); |
216 | 30.2M | odd_avg = _mm_avg_epu8(avg1, avg2); |
217 | | /* SAD yields one sum of absolute differences per 64-bit half: low = columns 0-3, high = columns 4-7 */ |
218 | 30.2M | diff = _mm_sad_epu8(even_avg, odd_avg); |
219 | | |
220 | | |
221 | 30.2M | diff_sum = _mm_cvtsi128_si32(diff); |
222 | 30.2M | alt[0] += diff_sum << 2; |
223 | | |
224 | 30.2M | diff_sum = _mm_cvtsi128_si32(_mm_srli_si128(diff, 8)); |
225 | 30.2M | alt[1] += diff_sum << 2; |
226 | | |
227 | 30.2M | } |
228 | 30.2M | alt[0] += (alt[0] >> SAD_BIAS_MULT_SHIFT) + (SAD_BIAS_ADDITIVE >> 1); /* bias alt upwards: proportional term plus half of SAD_BIAS_ADDITIVE */ |
229 | 30.2M | alt[1] += (alt[1] >> SAD_BIAS_MULT_SHIFT) + (SAD_BIAS_ADDITIVE >> 1); |
230 | | /* Flag combing when the biased alt is smaller than adj in either half */ |
231 | 30.2M | ca = (alt[0] < adj[0]); |
232 | 30.2M | ca |= (alt[1] < adj[1]); |
233 | | |
234 | 30.2M | return ca; |
235 | 30.2M | } |
236 | | |