Coverage Report

Created: 2024-09-06 07:53

/src/libvpx/vp8/encoder/x86/denoising_sse2.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3
 *
4
 *  Use of this source code is governed by a BSD-style license
5
 *  that can be found in the LICENSE file in the root of the source
6
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS.  All contributing project authors may
8
 *  be found in the AUTHORS file in the root of the source tree.
9
 */
10
11
#include "vp8/encoder/denoising.h"
12
#include "vp8/common/reconinter.h"
13
#include "vpx/vpx_integer.h"
14
#include "vpx_mem/vpx_mem.h"
15
#include "vp8_rtcd.h"
16
17
#include <emmintrin.h>
18
#include "vpx_ports/emmintrin_compat.h"
19
20
/* Compute the sum of all pixel differences of this MB. */
21
0
static INLINE unsigned int abs_sum_diff_16x1(__m128i acc_diff) {
22
0
  const __m128i k_1 = _mm_set1_epi16(1);
23
0
  const __m128i acc_diff_lo =
24
0
      _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
25
0
  const __m128i acc_diff_hi =
26
0
      _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
27
0
  const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
28
0
  const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
29
0
  const __m128i hgfe_dcba =
30
0
      _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
31
0
  const __m128i hgfedcba =
32
0
      _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
33
0
  unsigned int sum_diff = (unsigned int)abs(_mm_cvtsi128_si32(hgfedcba));
34
35
0
  return sum_diff;
36
0
}
37
38
/* Denoise one 16x16 luma macroblock with SSE2.
 *
 * Blends the source block (sig) toward the motion-compensated running
 * average (mc_running_avg_y), writing the filtered result to running_avg_y.
 * The per-pixel adjustment magnitude is a step function of the absolute
 * pixel difference (levels l0..l3), reduced when motion_magnitude is high.
 *
 * Returns FILTER_BLOCK if the filtered block was accepted (it is also
 * copied back over sig via vp8_copy_mem16x16), or COPY_BLOCK if the total
 * change exceeded the sum-diff threshold and the caller should keep the
 * original pixels.
 */
int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
                             int mc_avg_y_stride, unsigned char *running_avg_y,
                             int avg_y_stride, unsigned char *sig,
                             int sig_stride, unsigned int motion_magnitude,
                             int increase_denoising) {
  unsigned char *running_avg_y_start = running_avg_y;
  unsigned char *sig_start = sig;
  unsigned int sum_diff_thresh;
  int r;
  /* Stronger filtering (wider l0 band, larger l3 step) only when requested
   * AND motion is low enough that ghosting is unlikely. */
  int shift_inc =
      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
          ? 1
          : 0;
  /* Running per-pixel adjustment totals; each lane stays within signed
   * char range because single-step adjustments are <= 7+shift_inc. */
  __m128i acc_diff = _mm_setzero_si128();
  const __m128i k_0 = _mm_setzero_si128();
  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
  const __m128i k_8 = _mm_set1_epi8(8);
  const __m128i k_16 = _mm_set1_epi8(16);
  /* Modify each level's adjustment according to motion_magnitude. */
  const __m128i l3 = _mm_set1_epi8(
      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
  /* Difference between level 3 and level 2 is 2. */
  const __m128i l32 = _mm_set1_epi8(2);
  /* Difference between level 2 and level 1 is 1. */
  const __m128i l21 = _mm_set1_epi8(1);

  /* Filter all 16 rows, one 16-pixel row per iteration. */
  for (r = 0; r < 16; ++r) {
    /* Calculate differences */
    const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
    const __m128i v_mc_running_avg_y =
        _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0]));
    __m128i v_running_avg_y;
    /* Saturating unsigned subtraction: exactly one of pdiff/ndiff is
     * nonzero per lane, so (pdiff | ndiff) is the absolute difference. */
    const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
    const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
    /* Obtain the sign. FF if diff is negative. */
    const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
    /* Clamp absolute difference to 16 to be used to get mask. Doing this
     * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */
    const __m128i clamped_absdiff =
        _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_16);
    /* Get masks for l2 l1 and l0 adjustments */
    const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
    const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
    const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
    /* Get adjustments for l2, l1, and l0 */
    __m128i adj2 = _mm_and_si128(mask2, l32);
    const __m128i adj1 = _mm_and_si128(mask1, l21);
    const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
    __m128i adj, padj, nadj;

    /* Combine the adjustments and get absolute adjustments:
     * diff >= 16 -> l3; 8..15 -> l3-2; 4+shift_inc..7 -> l3-3;
     * below that the adjustment is the difference itself (adj0). */
    adj2 = _mm_add_epi8(adj2, adj1);
    adj = _mm_sub_epi8(l3, adj2);
    adj = _mm_andnot_si128(mask0, adj);
    adj = _mm_or_si128(adj, adj0);

    /* Restore the sign and get positive and negative adjustments. */
    padj = _mm_andnot_si128(diff_sign, adj);
    nadj = _mm_and_si128(diff_sign, adj);

    /* Calculate filtered value: move sig toward mc_running_avg_y with
     * unsigned saturation so pixels stay in [0,255]. */
    v_running_avg_y = _mm_adds_epu8(v_sig, padj);
    v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
    _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);

    /* Adjustments <=7, and each element in acc_diff can fit in signed
     * char.
     */
    acc_diff = _mm_adds_epi8(acc_diff, padj);
    acc_diff = _mm_subs_epi8(acc_diff, nadj);

    /* Update pointers for next iteration. */
    sig += sig_stride;
    mc_running_avg_y += mc_avg_y_stride;
    running_avg_y += avg_y_stride;
  }

  {
    /* Compute the sum of all pixel differences of this MB. */
    unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);
    sum_diff_thresh = SUM_DIFF_THRESHOLD;
    if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
    if (abs_sum_diff > sum_diff_thresh) {
      // Before returning to copy the block (i.e., apply no denoising),
      // check if we can still apply some (weaker) temporal filtering to
      // this block, that would otherwise not be denoised at all. Simplest
      // is to apply an additional adjustment to running_avg_y to bring it
      // closer to sig. The adjustment is capped by a maximum delta, and
      // chosen such that in most cases the resulting sum_diff will be
      // within the acceptable range given by sum_diff_thresh.

      // The delta is set by the excess of absolute pixel diff over the
      // threshold. (abs_sum_diff > sum_diff_thresh here, so the unsigned
      // subtraction cannot wrap.)
      int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
      // Only apply the adjustment for max delta up to 3.
      if (delta < 4) {
        const __m128i k_delta = _mm_set1_epi8(delta);
        /* Rewind the pointers back to the top of the 16-row block. */
        sig -= sig_stride * 16;
        mc_running_avg_y -= mc_avg_y_stride * 16;
        running_avg_y -= avg_y_stride * 16;
        for (r = 0; r < 16; ++r) {
          __m128i v_running_avg_y =
              _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
          // Calculate differences.
          const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
          const __m128i v_mc_running_avg_y =
              _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0]));
          const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
          const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
          // Obtain the sign. FF if diff is negative.
          const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
          // Clamp absolute difference to delta to get the adjustment.
          const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
          // Restore the sign and get positive and negative adjustments.
          __m128i padj, nadj;
          padj = _mm_andnot_si128(diff_sign, adj);
          nadj = _mm_and_si128(diff_sign, adj);
          // Calculate filtered value: note the signs are applied in the
          // opposite direction from the first pass, pulling the filtered
          // block back toward the source.
          v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
          v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
          _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);

          // Accumulate the adjustments.
          acc_diff = _mm_subs_epi8(acc_diff, padj);
          acc_diff = _mm_adds_epi8(acc_diff, nadj);

          // Update pointers for next iteration.
          sig += sig_stride;
          mc_running_avg_y += mc_avg_y_stride;
          running_avg_y += avg_y_stride;
        }
        // Re-check the total adjustment after the weaker second pass.
        abs_sum_diff = abs_sum_diff_16x1(acc_diff);
        if (abs_sum_diff > sum_diff_thresh) {
          return COPY_BLOCK;
        }
      } else {
        return COPY_BLOCK;
      }
    }
  }

  /* Accept the filtered block: copy running_avg back over the source.
   * NOTE(review): assumes vp8_copy_mem16x16 argument order is
   * (src, src_stride, dst, dst_stride) — confirm against reconinter.h. */
  vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride);
  return FILTER_BLOCK;
}
182
183
/* Denoise one 8x8 chroma (U or V) block with SSE2.
 *
 * Same level-based temporal filter as vp8_denoiser_filter_sse2, adapted to
 * 8x8 blocks: two 8-pixel rows are packed into one 128-bit register per
 * iteration. Additionally skips (returns COPY_BLOCK) when the block's mean
 * is close to mid-gray (128), to avoid disturbing near-neutral color.
 *
 * Returns FILTER_BLOCK if the filtered block was accepted (also copied
 * back over sig via vp8_copy_mem8x8), or COPY_BLOCK otherwise.
 */
int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg,
                                int mc_avg_stride, unsigned char *running_avg,
                                int avg_stride, unsigned char *sig,
                                int sig_stride, unsigned int motion_magnitude,
                                int increase_denoising) {
  unsigned char *running_avg_start = running_avg;
  unsigned char *sig_start = sig;
  unsigned int sum_diff_thresh;
  int r;
  /* Stronger filtering only when requested AND motion is low. */
  int shift_inc =
      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV)
          ? 1
          : 0;
  /* Per-pixel adjustment totals; lanes stay within signed char range. */
  __m128i acc_diff = _mm_setzero_si128();
  const __m128i k_0 = _mm_setzero_si128();
  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
  const __m128i k_8 = _mm_set1_epi8(8);
  const __m128i k_16 = _mm_set1_epi8(16);
  /* Modify each level's adjustment according to motion_magnitude. */
  const __m128i l3 = _mm_set1_epi8(
      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 7 + shift_inc : 6);
  /* Difference between level 3 and level 2 is 2. */
  const __m128i l32 = _mm_set1_epi8(2);
  /* Difference between level 2 and level 1 is 1. */
  const __m128i l21 = _mm_set1_epi8(1);

  {
    const __m128i k_1 = _mm_set1_epi16(1);
    __m128i vec_sum_block = _mm_setzero_si128();

    // Avoid denoising color signal if its close to average level.
    for (r = 0; r < 8; ++r) {
      /* Load one 8-pixel row and widen to 16-bit before accumulating. */
      const __m128i v_sig = _mm_loadl_epi64((__m128i *)(&sig[0]));
      const __m128i v_sig_unpack = _mm_unpacklo_epi8(v_sig, k_0);
      vec_sum_block = _mm_add_epi16(vec_sum_block, v_sig_unpack);
      sig += sig_stride;
    }
    sig -= sig_stride * 8; /* rewind to the top of the block */
    {
      /* Horizontal reduction of the 8 word sums to a single total. */
      const __m128i hg_fe_dc_ba = _mm_madd_epi16(vec_sum_block, k_1);
      const __m128i hgfe_dcba =
          _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
      const __m128i hgfedcba =
          _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
      const int sum_block = _mm_cvtsi128_si32(hgfedcba);
      /* 128 * 8 * 8 is the block sum of a uniformly mid-gray 8x8 block. */
      if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
        return COPY_BLOCK;
      }
    }
  }

  /* Filter the 8x8 block, two rows (16 pixels) per iteration. */
  for (r = 0; r < 4; ++r) {
    /* Calculate differences. The load_sd/loadh_pd pair packs two 8-byte
     * rows into the low and high halves of one register. */
    const __m128i v_sig_low =
        _mm_castpd_si128(_mm_load_sd((double *)(&sig[0])));
    const __m128i v_sig = _mm_castpd_si128(_mm_loadh_pd(
        _mm_castsi128_pd(v_sig_low), (double *)(&sig[sig_stride])));
    const __m128i v_mc_running_avg_low =
        _mm_castpd_si128(_mm_load_sd((double *)(&mc_running_avg[0])));
    const __m128i v_mc_running_avg = _mm_castpd_si128(
        _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low),
                     (double *)(&mc_running_avg[mc_avg_stride])));
    /* Exactly one of pdiff/ndiff is nonzero per lane, so their OR is the
     * absolute difference. */
    const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig);
    const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg);
    /* Obtain the sign. FF if diff is negative. */
    const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
    /* Clamp absolute difference to 16 to be used to get mask. Doing this
     * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */
    const __m128i clamped_absdiff =
        _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_16);
    /* Get masks for l2 l1 and l0 adjustments */
    const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
    const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
    const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
    /* Get adjustments for l2, l1, and l0 */
    __m128i adj2 = _mm_and_si128(mask2, l32);
    const __m128i adj1 = _mm_and_si128(mask1, l21);
    const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
    __m128i adj, padj, nadj;
    __m128i v_running_avg;

    /* Combine the adjustments and get absolute adjustments. */
    adj2 = _mm_add_epi8(adj2, adj1);
    adj = _mm_sub_epi8(l3, adj2);
    adj = _mm_andnot_si128(mask0, adj);
    adj = _mm_or_si128(adj, adj0);

    /* Restore the sign and get positive and negative adjustments. */
    padj = _mm_andnot_si128(diff_sign, adj);
    nadj = _mm_and_si128(diff_sign, adj);

    /* Calculate filtered value with unsigned saturation. */
    v_running_avg = _mm_adds_epu8(v_sig, padj);
    v_running_avg = _mm_subs_epu8(v_running_avg, nadj);

    /* Store the two filtered rows from the low/high register halves. */
    _mm_storel_pd((double *)&running_avg[0], _mm_castsi128_pd(v_running_avg));
    _mm_storeh_pd((double *)&running_avg[avg_stride],
                  _mm_castsi128_pd(v_running_avg));

    /* Adjustments <=7, and each element in acc_diff can fit in signed
     * char.
     */
    acc_diff = _mm_adds_epi8(acc_diff, padj);
    acc_diff = _mm_subs_epi8(acc_diff, nadj);

    /* Update pointers for next iteration (two rows at a time). */
    sig += sig_stride * 2;
    mc_running_avg += mc_avg_stride * 2;
    running_avg += avg_stride * 2;
  }

  {
    /* Compute the sum of all pixel differences of this block. */
    unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);
    sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
    if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
    if (abs_sum_diff > sum_diff_thresh) {
      // Before returning to copy the block (i.e., apply no denoising),
      // check if we can still apply some (weaker) temporal filtering to
      // this block, that would otherwise not be denoised at all. Simplest
      // is to apply an additional adjustment to running_avg to bring it
      // closer to sig. The adjustment is capped by a maximum delta, and
      // chosen such that in most cases the resulting sum_diff will be
      // within the acceptable range given by sum_diff_thresh.

      // The delta is set by the excess of absolute pixel diff over the
      // threshold. (abs_sum_diff > sum_diff_thresh here, so the unsigned
      // subtraction cannot wrap.)
      int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
      // Only apply the adjustment for max delta up to 3.
      if (delta < 4) {
        const __m128i k_delta = _mm_set1_epi8(delta);
        /* Rewind the pointers back to the top of the 8-row block. */
        sig -= sig_stride * 8;
        mc_running_avg -= mc_avg_stride * 8;
        running_avg -= avg_stride * 8;
        for (r = 0; r < 4; ++r) {
          // Calculate differences.
          const __m128i v_sig_low =
              _mm_castpd_si128(_mm_load_sd((double *)(&sig[0])));
          const __m128i v_sig = _mm_castpd_si128(_mm_loadh_pd(
              _mm_castsi128_pd(v_sig_low), (double *)(&sig[sig_stride])));
          const __m128i v_mc_running_avg_low =
              _mm_castpd_si128(_mm_load_sd((double *)(&mc_running_avg[0])));
          const __m128i v_mc_running_avg = _mm_castpd_si128(
              _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low),
                           (double *)(&mc_running_avg[mc_avg_stride])));
          const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig);
          const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg);
          // Obtain the sign. FF if diff is negative.
          const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
          // Clamp absolute difference to delta to get the adjustment.
          const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
          // Restore the sign and get positive and negative adjustments.
          __m128i padj, nadj;
          const __m128i v_running_avg_low =
              _mm_castpd_si128(_mm_load_sd((double *)(&running_avg[0])));
          __m128i v_running_avg = _mm_castpd_si128(
              _mm_loadh_pd(_mm_castsi128_pd(v_running_avg_low),
                           (double *)(&running_avg[avg_stride])));
          padj = _mm_andnot_si128(diff_sign, adj);
          nadj = _mm_and_si128(diff_sign, adj);
          // Calculate filtered value: signs applied in the opposite
          // direction from the first pass, pulling the filtered block
          // back toward the source.
          v_running_avg = _mm_subs_epu8(v_running_avg, padj);
          v_running_avg = _mm_adds_epu8(v_running_avg, nadj);

          _mm_storel_pd((double *)&running_avg[0],
                        _mm_castsi128_pd(v_running_avg));
          _mm_storeh_pd((double *)&running_avg[avg_stride],
                        _mm_castsi128_pd(v_running_avg));

          // Accumulate the adjustments.
          acc_diff = _mm_subs_epi8(acc_diff, padj);
          acc_diff = _mm_adds_epi8(acc_diff, nadj);

          // Update pointers for next iteration.
          sig += sig_stride * 2;
          mc_running_avg += mc_avg_stride * 2;
          running_avg += avg_stride * 2;
        }
        // Re-check the total adjustment after the weaker second pass.
        abs_sum_diff = abs_sum_diff_16x1(acc_diff);
        if (abs_sum_diff > sum_diff_thresh) {
          return COPY_BLOCK;
        }
      } else {
        return COPY_BLOCK;
      }
    }
  }

  /* Accept the filtered block: copy running_avg back over the source.
   * NOTE(review): assumes vp8_copy_mem8x8 argument order is
   * (src, src_stride, dst, dst_stride) — confirm against reconinter.h. */
  vp8_copy_mem8x8(running_avg_start, avg_stride, sig_start, sig_stride);
  return FILTER_BLOCK;
}