/src/libwebp/src/dsp/ssim_sse2.c
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2017 Google Inc. All Rights Reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style license |
4 | | // that can be found in the COPYING file in the root of the source |
5 | | // tree. An additional intellectual property rights grant can be found |
6 | | // in the file PATENTS. All contributing project authors may |
7 | | // be found in the AUTHORS file in the root of the source tree. |
8 | | // ----------------------------------------------------------------------------- |
9 | | // |
10 | | // SSE2 version of distortion calculation |
11 | | // |
12 | | // Author: Skal (pascal.massimino@gmail.com) |
13 | | |
14 | | #include "src/dsp/dsp.h" |
15 | | |
16 | | #if defined(WEBP_USE_SSE2) |
17 | | #include <emmintrin.h> |
18 | | |
19 | | #include <assert.h> |
20 | | |
21 | | #include "src/dsp/common_sse2.h" |
22 | | #include "src/dsp/cpu.h" |
23 | | #include "src/webp/types.h" |
24 | | |
25 | | #if !defined(WEBP_DISABLE_STATS) |
26 | | |
27 | | // Helper function |
28 | | static WEBP_INLINE void SubtractAndSquare_SSE2(const __m128i a, const __m128i b, |
29 | 0 | __m128i* const sum) { |
30 | | // take abs(a-b) in 8b |
31 | 0 | const __m128i a_b = _mm_subs_epu8(a, b); |
32 | 0 | const __m128i b_a = _mm_subs_epu8(b, a); |
33 | 0 | const __m128i abs_a_b = _mm_or_si128(a_b, b_a); |
34 | | // zero-extend to 16b |
35 | 0 | const __m128i zero = _mm_setzero_si128(); |
36 | 0 | const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero); |
37 | 0 | const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero); |
38 | | // multiply with self |
39 | 0 | const __m128i sum1 = _mm_madd_epi16(C0, C0); |
40 | 0 | const __m128i sum2 = _mm_madd_epi16(C1, C1); |
41 | 0 | *sum = _mm_add_epi32(sum1, sum2); |
42 | 0 | } |
43 | | |
44 | | //------------------------------------------------------------------------------ |
45 | | // SSIM / PSNR entry point |
46 | | |
// Returns the sum of squared differences between src1[] and src2[] over
// 'len' bytes. SIMD path handles 16-byte chunks; the scalar tail below
// finishes whatever remains.
static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
                                   const uint8_t* src2, int len) {
  int i = 0;
  uint32_t sse2 = 0;
  if (len >= 16) {
    // 'limit' leaves room for the one pair of loads issued ahead of the
    // loop body (software pipelining: loads for iteration k+1 are in
    // flight while iteration k's squares are computed).
    const int limit = len - 32;
    int32_t tmp[4];
    __m128i sum1;
    __m128i sum = _mm_setzero_si128();
    // prime the pipeline with the first 16 bytes
    __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
    __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
    i += 16;
    while (i <= limit) {
      // process two 16-byte chunks per iteration, interleaving the next
      // loads with the current squaring work
      const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
      const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
      __m128i sum2;
      i += 16;
      SubtractAndSquare_SSE2(a0, b0, &sum1);
      sum = _mm_add_epi32(sum, sum1);
      a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
      b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
      i += 16;
      SubtractAndSquare_SSE2(a1, b1, &sum2);
      sum = _mm_add_epi32(sum, sum2);
    }
    // flush the chunk still in flight from the pipelined loads
    SubtractAndSquare_SSE2(a0, b0, &sum1);
    sum = _mm_add_epi32(sum, sum1);
    // horizontal add of the four 32-bit partial sums
    _mm_storeu_si128((__m128i*)tmp, sum);
    sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
  }

  // scalar tail: remaining (len % 16) bytes, or the whole input if len < 16
  for (; i < len; ++i) {
    const int32_t diff = src1[i] - src2[i];
    sse2 += diff * diff;
  }
  return sse2;
}
84 | | #endif // !defined(WEBP_DISABLE_STATS) |
85 | | |
86 | | #if !defined(WEBP_REDUCE_SIZE) |
87 | | |
// Returns the sum of the eight 16-bit lanes of *m, widened to 32 bits.
static uint32_t HorizontalAdd16b_SSE2(const __m128i* const m) {
  uint16_t lanes[8];
  uint32_t total = 0;
  int k;
  _mm_storeu_si128((__m128i*)lanes, *m);
  for (k = 0; k < 8; ++k) total += lanes[k];
  return total;
}
95 | | |
// Returns the sum (mod 2^32) of the four 32-bit lanes of *m.
static uint32_t HorizontalAdd32b_SSE2(const __m128i* const m) {
  // fold the upper pair onto the lower pair, then the two remaining lanes
  const __m128i hi = _mm_shuffle_epi32(*m, _MM_SHUFFLE(1, 0, 3, 2));
  const __m128i pair = _mm_add_epi32(*m, hi);
  const __m128i odd = _mm_shuffle_epi32(pair, _MM_SHUFFLE(2, 3, 0, 1));
  const __m128i total = _mm_add_epi32(pair, odd);
  return (uint32_t)_mm_cvtsi128_si32(total);
}
102 | | |
103 | | static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 }; |
104 | | |
// Accumulates one row of the SSIM window, weighted by WEIGHT (the vertical
// weight for that row). Relies on names from the enclosing scope:
//   Wx (horizontal weight vector), zero, src1/src2, stride1/stride2,
//   xm/ym (16b weighted-pixel sums), xxm/yym/xym (32b weighted
//   square/cross-product sums). Advances src1/src2 to the next row.
#define ACCUMULATE_ROW(WEIGHT) do {                         \
  /* compute row weight (Wx * Wy) */                        \
  const __m128i Wy = _mm_set1_epi16((WEIGHT));              \
  const __m128i W = _mm_mullo_epi16(Wx, Wy);                \
  /* process 8 bytes at a time (7 bytes, actually) */       \
  const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
  const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
  /* convert to 16b and multiply by weight */               \
  const __m128i a1 = _mm_unpacklo_epi8(a0, zero);           \
  const __m128i b1 = _mm_unpacklo_epi8(b0, zero);           \
  const __m128i wa1 = _mm_mullo_epi16(a1, W);               \
  const __m128i wb1 = _mm_mullo_epi16(b1, W);               \
  /* accumulate */                                          \
  xm = _mm_add_epi16(xm, wa1);                              \
  ym = _mm_add_epi16(ym, wb1);                              \
  xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1));        \
  xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1));        \
  yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1));        \
  src1 += stride1;                                          \
  src2 += stride2;                                          \
} while (0)
126 | | |
// Computes the SSIM score of the 7x7 window at src1/src2 (pointers to the
// window's top-left corner; rows are stride1/stride2 bytes apart). Gathers
// the weighted moment sums via ACCUMULATE_ROW, then hands them to
// VP8SSIMFromStats for the final SSIM formula.
static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
                           const uint8_t* src2, int stride2) {
  VP8DistoStats stats;
  const __m128i zero = _mm_setzero_si128();
  __m128i xm = zero, ym = zero;                // 16b accums
  __m128i xxm = zero, yym = zero, xym = zero;  // 32b accum
  const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
  assert(2 * VP8_SSIM_KERNEL + 1 == 7);
  // one call per row, with the triangular vertical weights 1..4..1
  ACCUMULATE_ROW(1);
  ACCUMULATE_ROW(2);
  ACCUMULATE_ROW(3);
  ACCUMULATE_ROW(4);
  ACCUMULATE_ROW(3);
  ACCUMULATE_ROW(2);
  ACCUMULATE_ROW(1);
  // reduce each vector accumulator to a scalar moment
  stats.xm = HorizontalAdd16b_SSE2(&xm);
  stats.ym = HorizontalAdd16b_SSE2(&ym);
  stats.xxm = HorizontalAdd32b_SSE2(&xxm);
  stats.xym = HorizontalAdd32b_SSE2(&xym);
  stats.yym = HorizontalAdd32b_SSE2(&yym);
  return VP8SSIMFromStats(&stats);
}
149 | | |
150 | | #endif // !defined(WEBP_REDUCE_SIZE) |
151 | | |
152 | | extern void VP8SSIMDspInitSSE2(void); |
153 | | |
// Installs the SSE2 implementations into the dsp function pointers.
// Each assignment is compiled out when the corresponding feature
// (stats / full-size build) is disabled.
WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
#if !defined(WEBP_DISABLE_STATS)
  VP8AccumulateSSE = AccumulateSSE_SSE2;
#endif
#if !defined(WEBP_REDUCE_SIZE)
  VP8SSIMGet = SSIMGet_SSE2;
#endif
}
162 | | |
163 | | #else // !WEBP_USE_SSE2 |
164 | | |
165 | | WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2) |
166 | | |
167 | | #endif // WEBP_USE_SSE2 |