/src/libvpx/vp8/encoder/x86/vp8_quantize_sse2.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #include "vpx_config.h" |
12 | | #include "vp8_rtcd.h" |
13 | | #include "vpx_ports/x86.h" |
14 | | #include "vpx_mem/vpx_mem.h" |
15 | | #include "vp8/encoder/block.h" |
16 | | #include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */ |
17 | | |
18 | | #include <mmintrin.h> /* MMX */ |
19 | | #include <xmmintrin.h> /* SSE */ |
20 | | #include <emmintrin.h> /* SSE2 */ |
21 | | |
/* Examine one coefficient of the scan and keep it if it survives.
 *
 *   i - 1-based scan position; assigned to eob when the coefficient is kept
 *   z - index of that coefficient inside x[], y[] and qcoeff_ptr[]
 *
 * The macro is not self-contained: the expanding scope must provide x, y,
 * qcoeff_ptr, zbin_boost_ptr, eob and b.  Every expansion consumes one entry
 * of the boost table.  The coefficient is kept only when x[z] is at least
 * that boost and y[z] is non-zero; keeping it rewinds zbin_boost_ptr to
 * b->zrun_zbin_boost, otherwise the pointer simply stays advanced by one. */
#define SELECT_EOB(i, z)                        \
  do {                                          \
    const short zrun_boost = *zbin_boost_ptr++; \
    if (x[(z)] >= zrun_boost && y[(z)] != 0) {  \
      qcoeff_ptr[(z)] = y[(z)];                 \
      eob = (i);                                \
      zbin_boost_ptr = b->zrun_zbin_boost;      \
    }                                           \
  } while (0)
32 | | |
33 | 0 | void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d) { |
34 | 0 | char eob = 0; |
35 | 0 | short *zbin_boost_ptr; |
36 | 0 | short *qcoeff_ptr = d->qcoeff; |
37 | 0 | DECLARE_ALIGNED(16, short, x[16]); |
38 | 0 | DECLARE_ALIGNED(16, short, y[16]); |
39 | |
|
40 | 0 | __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1; |
41 | 0 | __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift)); |
42 | 0 | __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8)); |
43 | 0 | __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); |
44 | 0 | __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8)); |
45 | 0 | __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra); |
46 | 0 | __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin)); |
47 | 0 | __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8)); |
48 | 0 | __m128i round0 = _mm_load_si128((__m128i *)(b->round)); |
49 | 0 | __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); |
50 | 0 | __m128i quant0 = _mm_load_si128((__m128i *)(b->quant)); |
51 | 0 | __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8)); |
52 | 0 | __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); |
53 | 0 | __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); |
54 | |
|
55 | 0 | memset(qcoeff_ptr, 0, 32); |
56 | | |
57 | | /* Duplicate to all lanes. */ |
58 | 0 | zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0); |
59 | 0 | zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra); |
60 | | |
61 | | /* Sign of z: z >> 15 */ |
62 | 0 | sz0 = _mm_srai_epi16(z0, 15); |
63 | 0 | sz1 = _mm_srai_epi16(z1, 15); |
64 | | |
65 | | /* x = abs(z): (z ^ sz) - sz */ |
66 | 0 | x0 = _mm_xor_si128(z0, sz0); |
67 | 0 | x1 = _mm_xor_si128(z1, sz1); |
68 | 0 | x0 = _mm_sub_epi16(x0, sz0); |
69 | 0 | x1 = _mm_sub_epi16(x1, sz1); |
70 | | |
71 | | /* zbin[] + zbin_extra */ |
72 | 0 | zbin0 = _mm_add_epi16(zbin0, zbin_extra); |
73 | 0 | zbin1 = _mm_add_epi16(zbin1, zbin_extra); |
74 | | |
75 | | /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance |
76 | | * the equation because boost is the only value which can change: |
77 | | * x - (zbin[] + extra) >= boost */ |
78 | 0 | x_minus_zbin0 = _mm_sub_epi16(x0, zbin0); |
79 | 0 | x_minus_zbin1 = _mm_sub_epi16(x1, zbin1); |
80 | |
|
81 | 0 | _mm_store_si128((__m128i *)(x), x_minus_zbin0); |
82 | 0 | _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1); |
83 | | |
84 | | /* All the remaining calculations are valid whether they are done now with |
85 | | * simd or later inside the loop one at a time. */ |
86 | 0 | x0 = _mm_add_epi16(x0, round0); |
87 | 0 | x1 = _mm_add_epi16(x1, round1); |
88 | |
|
89 | 0 | y0 = _mm_mulhi_epi16(x0, quant0); |
90 | 0 | y1 = _mm_mulhi_epi16(x1, quant1); |
91 | |
|
92 | 0 | y0 = _mm_add_epi16(y0, x0); |
93 | 0 | y1 = _mm_add_epi16(y1, x1); |
94 | | |
95 | | /* Instead of shifting each value independently we convert the scaling |
96 | | * factor with 1 << (16 - shift) so we can use multiply/return high half. */ |
97 | 0 | y0 = _mm_mulhi_epi16(y0, quant_shift0); |
98 | 0 | y1 = _mm_mulhi_epi16(y1, quant_shift1); |
99 | | |
100 | | /* Return the sign: (y ^ sz) - sz */ |
101 | 0 | y0 = _mm_xor_si128(y0, sz0); |
102 | 0 | y1 = _mm_xor_si128(y1, sz1); |
103 | 0 | y0 = _mm_sub_epi16(y0, sz0); |
104 | 0 | y1 = _mm_sub_epi16(y1, sz1); |
105 | |
|
106 | 0 | _mm_store_si128((__m128i *)(y), y0); |
107 | 0 | _mm_store_si128((__m128i *)(y + 8), y1); |
108 | |
|
109 | 0 | zbin_boost_ptr = b->zrun_zbin_boost; |
110 | | |
111 | | /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */ |
112 | 0 | SELECT_EOB(1, 0); |
113 | 0 | SELECT_EOB(2, 1); |
114 | 0 | SELECT_EOB(3, 4); |
115 | 0 | SELECT_EOB(4, 8); |
116 | 0 | SELECT_EOB(5, 5); |
117 | 0 | SELECT_EOB(6, 2); |
118 | 0 | SELECT_EOB(7, 3); |
119 | 0 | SELECT_EOB(8, 6); |
120 | 0 | SELECT_EOB(9, 9); |
121 | 0 | SELECT_EOB(10, 12); |
122 | 0 | SELECT_EOB(11, 13); |
123 | 0 | SELECT_EOB(12, 10); |
124 | 0 | SELECT_EOB(13, 7); |
125 | 0 | SELECT_EOB(14, 11); |
126 | 0 | SELECT_EOB(15, 14); |
127 | 0 | SELECT_EOB(16, 15); |
128 | | |
129 | 0 | y0 = _mm_load_si128((__m128i *)(d->qcoeff)); |
130 | 0 | y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8)); |
131 | | |
132 | | /* dqcoeff = qcoeff * dequant */ |
133 | 0 | y0 = _mm_mullo_epi16(y0, dequant0); |
134 | 0 | y1 = _mm_mullo_epi16(y1, dequant1); |
135 | |
|
136 | 0 | _mm_store_si128((__m128i *)(d->dqcoeff), y0); |
137 | 0 | _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1); |
138 | |
|
139 | 0 | *d->eob = eob; |
140 | 0 | } |
141 | | |
142 | 0 | void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d) { |
143 | 0 | __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); |
144 | 0 | __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8)); |
145 | 0 | __m128i round0 = _mm_load_si128((__m128i *)(b->round)); |
146 | 0 | __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); |
147 | 0 | __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast)); |
148 | 0 | __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8)); |
149 | 0 | __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); |
150 | 0 | __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); |
151 | 0 | __m128i inv_zig_zag0 = |
152 | 0 | _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag)); |
153 | 0 | __m128i inv_zig_zag1 = |
154 | 0 | _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8)); |
155 | |
|
156 | 0 | __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones; |
157 | | |
158 | | /* sign of z: z >> 15 */ |
159 | 0 | sz0 = _mm_srai_epi16(z0, 15); |
160 | 0 | sz1 = _mm_srai_epi16(z1, 15); |
161 | | |
162 | | /* x = abs(z): (z ^ sz) - sz */ |
163 | 0 | x0 = _mm_xor_si128(z0, sz0); |
164 | 0 | x1 = _mm_xor_si128(z1, sz1); |
165 | 0 | x0 = _mm_sub_epi16(x0, sz0); |
166 | 0 | x1 = _mm_sub_epi16(x1, sz1); |
167 | | |
168 | | /* x += round */ |
169 | 0 | x0 = _mm_add_epi16(x0, round0); |
170 | 0 | x1 = _mm_add_epi16(x1, round1); |
171 | | |
172 | | /* y = (x * quant) >> 16 */ |
173 | 0 | y0 = _mm_mulhi_epi16(x0, quant_fast0); |
174 | 0 | y1 = _mm_mulhi_epi16(x1, quant_fast1); |
175 | | |
176 | | /* x = abs(y) = (y ^ sz) - sz */ |
177 | 0 | y0 = _mm_xor_si128(y0, sz0); |
178 | 0 | y1 = _mm_xor_si128(y1, sz1); |
179 | 0 | x0 = _mm_sub_epi16(y0, sz0); |
180 | 0 | x1 = _mm_sub_epi16(y1, sz1); |
181 | | |
182 | | /* qcoeff = x */ |
183 | 0 | _mm_store_si128((__m128i *)(d->qcoeff), x0); |
184 | 0 | _mm_store_si128((__m128i *)(d->qcoeff + 8), x1); |
185 | | |
186 | | /* x * dequant */ |
187 | 0 | xdq0 = _mm_mullo_epi16(x0, dequant0); |
188 | 0 | xdq1 = _mm_mullo_epi16(x1, dequant1); |
189 | | |
190 | | /* dqcoeff = x * dequant */ |
191 | 0 | _mm_store_si128((__m128i *)(d->dqcoeff), xdq0); |
192 | 0 | _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1); |
193 | | |
194 | | /* build a mask for the zig zag */ |
195 | 0 | zeros = _mm_setzero_si128(); |
196 | |
|
197 | 0 | x0 = _mm_cmpeq_epi16(x0, zeros); |
198 | 0 | x1 = _mm_cmpeq_epi16(x1, zeros); |
199 | |
|
200 | 0 | ones = _mm_cmpeq_epi16(zeros, zeros); |
201 | |
|
202 | 0 | x0 = _mm_xor_si128(x0, ones); |
203 | 0 | x1 = _mm_xor_si128(x1, ones); |
204 | |
|
205 | 0 | x0 = _mm_and_si128(x0, inv_zig_zag0); |
206 | 0 | x1 = _mm_and_si128(x1, inv_zig_zag1); |
207 | |
|
208 | 0 | x0 = _mm_max_epi16(x0, x1); |
209 | | |
210 | | /* now down to 8 */ |
211 | 0 | x1 = _mm_shuffle_epi32(x0, 0xE); // 0b00001110 |
212 | |
|
213 | 0 | x0 = _mm_max_epi16(x0, x1); |
214 | | |
215 | | /* only 4 left */ |
216 | 0 | x1 = _mm_shufflelo_epi16(x0, 0xE); // 0b00001110 |
217 | |
|
218 | 0 | x0 = _mm_max_epi16(x0, x1); |
219 | | |
220 | | /* okay, just 2! */ |
221 | 0 | x1 = _mm_shufflelo_epi16(x0, 0x1); // 0b00000001 |
222 | |
|
223 | 0 | x0 = _mm_max_epi16(x0, x1); |
224 | |
|
225 | 0 | *d->eob = 0xFF & _mm_cvtsi128_si32(x0); |
226 | 0 | } |