Coverage Report

Created: 2026-06-07 07:20

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libwebp/src/dsp/ssim_sse2.c
Line
Count
Source
1
// Copyright 2017 Google Inc. All Rights Reserved.
2
//
3
// Use of this source code is governed by a BSD-style license
4
// that can be found in the COPYING file in the root of the source
5
// tree. An additional intellectual property rights grant can be found
6
// in the file PATENTS. All contributing project authors may
7
// be found in the AUTHORS file in the root of the source tree.
8
// -----------------------------------------------------------------------------
9
//
10
// SSE2 version of distortion calculation
11
//
12
// Author: Skal (pascal.massimino@gmail.com)
13
14
#include "src/dsp/dsp.h"
15
16
#if defined(WEBP_USE_SSE2)
17
#include <assert.h>
18
#include <emmintrin.h>
19
20
#include "src/dsp/common_sse2.h"
21
#include "src/dsp/cpu.h"
22
#include "src/webp/types.h"
23
24
#if !defined(WEBP_DISABLE_STATS)
25
26
// Helper function
27
// Computes the squared byte-wise differences of 'a' and 'b' (sixteen unsigned
// 8-bit values each) and writes them to '*sum' as four 32-bit partial sums.
// '*sum' is overwritten, not accumulated into.
static WEBP_INLINE void SubtractAndSquare_SSE2(const __m128i a, const __m128i b,
                                               __m128i* const sum) {
  const __m128i zero = _mm_setzero_si128();
  // |a - b| per byte: of the two saturating subtractions at most one is
  // non-zero, so OR-ing them gives the absolute difference.
  const __m128i abs_diff =
      _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
  // Zero-extend the low and high eight bytes to 16-bit lanes.
  const __m128i lo = _mm_unpacklo_epi8(abs_diff, zero);
  const __m128i hi = _mm_unpackhi_epi8(abs_diff, zero);
  // Multiplying each half with itself squares every lane; madd also adds
  // adjacent products, producing 32-bit lanes.
  const __m128i lo_sq = _mm_madd_epi16(lo, lo);
  const __m128i hi_sq = _mm_madd_epi16(hi, hi);
  *sum = _mm_add_epi32(lo_sq, hi_sq);
}
42
43
//------------------------------------------------------------------------------
44
// SSIM / PSNR entry point
45
46
// Returns the sum of squared differences between the first 'len' bytes of
// 'src1' and 'src2'. Inputs of at least 16 bytes are processed 16 bytes at a
// time with SSE2; any remaining bytes (and inputs shorter than 16 bytes) are
// handled by the scalar loop. The result is accumulated modulo 2^32.
static uint32_t AccumulateSSE_SSE2(const uint8_t* src1, const uint8_t* src2,
                                   int len) {
  int i = 0;
  uint32_t sse2 = 0;
  if (len >= 16) {
    // While i <= limit, i + 32 <= len: both 16-byte loads of one loop
    // iteration are in bounds.
    const int limit = len - 32;
    // The lanes are reduced as unsigned values so that the horizontal sum
    // wraps modulo 2^32 instead of risking signed-integer overflow.
    uint32_t tmp[4];
    __m128i sum1;
    __m128i sum = _mm_setzero_si128();
    // (a0, b0) always holds a loaded-but-not-yet-accumulated 16-byte chunk.
    __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
    __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
    i += 16;
    while (i <= limit) {
      const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
      const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
      __m128i sum2;
      i += 16;
      SubtractAndSquare_SSE2(a0, b0, &sum1);
      sum = _mm_add_epi32(sum, sum1);
      a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
      b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
      i += 16;
      SubtractAndSquare_SSE2(a1, b1, &sum2);
      sum = _mm_add_epi32(sum, sum2);
    }
    // Flush the chunk that is still pending in (a0, b0).
    SubtractAndSquare_SSE2(a0, b0, &sum1);
    sum = _mm_add_epi32(sum, sum1);
    _mm_storeu_si128((__m128i*)tmp, sum);
    sse2 += tmp[3] + tmp[2] + tmp[1] + tmp[0];
  }

  // Scalar tail: fewer than 16 bytes left past the vector part.
  for (; i < len; ++i) {
    const int32_t diff = src1[i] - src2[i];
    sse2 += (uint32_t)(diff * diff);
  }
  return sse2;
}
83
#endif  // !defined(WEBP_DISABLE_STATS)
84
85
#if !defined(WEBP_REDUCE_SIZE)
86
87
0
// Sums the eight 16-bit lanes of '*m'. Lane k and lane k+4 are first added
// with 16-bit wraparound; the four folded values are then summed in 32 bits.
static uint32_t HorizontalAdd16b_SSE2(const __m128i* const m) {
  uint16_t lanes[4];
  uint32_t total = 0;
  int k;
  // Fold the upper 64 bits onto the lower 64 bits.
  const __m128i folded = _mm_add_epi16(*m, _mm_srli_si128(*m, 8));
  // Only the four low lanes carry the result, so store just those.
  _mm_storel_epi64((__m128i*)lanes, folded);
  for (k = 0; k < 4; ++k) total += lanes[k];
  return total;
}
94
95
0
// Sums the four 32-bit lanes of '*m' (modulo 2^32).
static uint32_t HorizontalAdd32b_SSE2(const __m128i* const m) {
  // Step 1: bring lanes 2 and 3 down onto lanes 0 and 1.
  const __m128i upper_half = _mm_srli_si128(*m, 8);
  const __m128i pairs = _mm_add_epi32(upper_half, *m);
  // Step 2: bring lane 1 down onto lane 0; lane 0 now holds the full sum.
  const __m128i odd_lane = _mm_srli_si128(pairs, 4);
  const __m128i total = _mm_add_epi32(odd_lane, pairs);
  return (uint32_t)_mm_cvtsi128_si32(total);
}
101
102
// Horizontal weights, loaded as eight 16-bit lanes into 'Wx' by SSIMGet_SSE2.
// Rows are read 8 bytes at a time but only 7 are meaningful, hence the
// trailing zero weight that cancels the 8th byte.
static const uint16_t kWeight[] = {1, 2, 3, 4, 3, 2, 1, 0};
103
104
// Accumulates one row of both sources into the running statistics.
// Expects the following names in the enclosing scope: 'zero', 'Wx', the
// accumulators 'xm', 'ym', 'xxm', 'xym', 'yym', and 'src1'/'src2' with their
// 'stride1'/'stride2'. Both source pointers are advanced to the next row.
#define ACCUMULATE_ROW(WEIGHT)                                          \
  do {                                                                  \
    /* per-pixel weight for this row: Wx * Wy */                        \
    const __m128i row_w = _mm_set1_epi16((WEIGHT));                     \
    const __m128i pix_w = _mm_mullo_epi16(Wx, row_w);                   \
    /* load 8 bytes per source (only 7 carry a non-zero weight) */      \
    const __m128i x8 = _mm_loadl_epi64((const __m128i*)src1);           \
    const __m128i y8 = _mm_loadl_epi64((const __m128i*)src2);           \
    /* widen to 16b, then apply the weights */                          \
    const __m128i x16 = _mm_unpacklo_epi8(x8, zero);                    \
    const __m128i y16 = _mm_unpacklo_epi8(y8, zero);                    \
    const __m128i wx16 = _mm_mullo_epi16(x16, pix_w);                   \
    const __m128i wy16 = _mm_mullo_epi16(y16, pix_w);                   \
    /* first-order sums (16b lanes) */                                  \
    xm = _mm_add_epi16(xm, wx16);                                       \
    ym = _mm_add_epi16(ym, wy16);                                       \
    /* second-order sums (32b lanes) */                                 \
    xxm = _mm_add_epi32(xxm, _mm_madd_epi16(x16, wx16));                \
    xym = _mm_add_epi32(xym, _mm_madd_epi16(x16, wy16));                \
    yym = _mm_add_epi32(yym, _mm_madd_epi16(y16, wy16));                \
    /* move on to the next row */                                       \
    src1 += stride1;                                                    \
    src2 += stride2;                                                    \
  } while (0)
126
127
static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
128
0
                           const uint8_t* src2, int stride2) {
129
0
  VP8DistoStats stats;
130
0
  const __m128i zero = _mm_setzero_si128();
131
0
  __m128i xm = zero, ym = zero;                // 16b accums
132
0
  __m128i xxm = zero, yym = zero, xym = zero;  // 32b accum
133
0
  const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
134
0
  assert(2 * VP8_SSIM_KERNEL + 1 == 7);
135
0
  ACCUMULATE_ROW(1);
136
0
  ACCUMULATE_ROW(2);
137
0
  ACCUMULATE_ROW(3);
138
0
  ACCUMULATE_ROW(4);
139
0
  ACCUMULATE_ROW(3);
140
0
  ACCUMULATE_ROW(2);
141
0
  ACCUMULATE_ROW(1);
142
0
  stats.xm = HorizontalAdd16b_SSE2(&xm);
143
0
  stats.ym = HorizontalAdd16b_SSE2(&ym);
144
0
  stats.xxm = HorizontalAdd32b_SSE2(&xxm);
145
0
  stats.xym = HorizontalAdd32b_SSE2(&xym);
146
0
  stats.yym = HorizontalAdd32b_SSE2(&yym);
147
0
  return VP8SSIMFromStats(&stats);
148
0
}
149
150
#endif  // !defined(WEBP_REDUCE_SIZE)
151
152
extern void VP8SSIMDspInitSSE2(void);
153
154
0
WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
155
0
#if !defined(WEBP_DISABLE_STATS)
156
0
  VP8AccumulateSSE = AccumulateSSE_SSE2;
157
0
#endif
158
0
#if !defined(WEBP_REDUCE_SIZE)
159
0
  VP8SSIMGet = SSIMGet_SSE2;
160
0
#endif
161
0
}
162
163
#else  // !WEBP_USE_SSE2
164
165
// Build without SSE2: WEBP_DSP_INIT_STUB is defined outside this file;
// presumably it emits an empty VP8SSIMDspInitSSE2() so the symbol still
// exists -- TODO confirm against src/dsp/dsp.h.
WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
166
167
#endif  // WEBP_USE_SSE2