/src/libvpx/vp8/encoder/x86/quantize_sse4.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #include <smmintrin.h> /* SSE4.1 */ |
12 | | |
13 | | #include "./vp8_rtcd.h" |
14 | | #include "vp8/encoder/block.h" |
15 | | #include "vpx_ports/bitops.h" /* get_lsb */ |
16 | | #include "vpx_ports/compiler_attributes.h" |
17 | | |
18 | | // Unsigned shift overflow is disabled for the use of ~1U << eob with ymask. |
19 | | VPX_NO_UNSIGNED_SHIFT_CHECK void vp8_regular_quantize_b_sse4_1(BLOCK *b, |
20 | 95.9M | BLOCKD *d) { |
21 | 95.9M | int eob = -1; |
22 | 95.9M | short *zbin_boost_ptr = b->zrun_zbin_boost; |
23 | 95.9M | __m128i zbin_boost0 = _mm_load_si128((__m128i *)(zbin_boost_ptr)); |
24 | 95.9M | __m128i zbin_boost1 = _mm_load_si128((__m128i *)(zbin_boost_ptr + 8)); |
25 | 95.9M | __m128i x0, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0, dqcoeff1; |
26 | 95.9M | __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift)); |
27 | 95.9M | __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8)); |
28 | 95.9M | __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); |
29 | 95.9M | __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8)); |
30 | 95.9M | __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra); |
31 | 95.9M | __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin)); |
32 | 95.9M | __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8)); |
33 | 95.9M | __m128i round0 = _mm_load_si128((__m128i *)(b->round)); |
34 | 95.9M | __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); |
35 | 95.9M | __m128i quant0 = _mm_load_si128((__m128i *)(b->quant)); |
36 | 95.9M | __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8)); |
37 | 95.9M | __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); |
38 | 95.9M | __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); |
39 | 95.9M | __m128i qcoeff0, qcoeff1, t0, t1, x_shuf0, x_shuf1; |
40 | 95.9M | uint32_t mask, ymask; |
41 | 95.9M | DECLARE_ALIGNED(16, static const uint8_t, |
42 | 95.9M | zig_zag_mask[16]) = { 0, 1, 4, 8, 5, 2, 3, 6, |
43 | 95.9M | 9, 12, 13, 10, 7, 11, 14, 15 }; |
44 | 95.9M | DECLARE_ALIGNED(16, uint16_t, qcoeff[16]) = { 0 }; |
45 | | |
46 | | /* Duplicate to all lanes. */ |
47 | 95.9M | zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0); |
48 | 95.9M | zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra); |
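/* The shufflelo(, 0) + unpacklo pair above broadcasts the single 16-bit
 * b->zbin_extra value into all eight lanes; a sketch of the same effect,
 * assuming SSE2 semantics, would simply be
 *   zbin_extra = _mm_set1_epi16(b->zbin_extra);
 * (illustrative alternative, not what this file does). */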
49 | | |
50 | | /* x = abs(z) */ |
51 | 95.9M | x0 = _mm_abs_epi16(z0); |
52 | 95.9M | x1 = _mm_abs_epi16(z1); |
53 | | |
54 | | /* zbin[] + zbin_extra */ |
55 | 95.9M | zbin0 = _mm_add_epi16(zbin0, zbin_extra); |
56 | 95.9M | zbin1 = _mm_add_epi16(zbin1, zbin_extra); |
57 | | |
58 | | /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance |
59 | | * the equation because boost is the only value which can change: |
60 | | * x - (zbin[] + extra) >= boost */ |
61 | 95.9M | x_minus_zbin0 = _mm_sub_epi16(x0, zbin0); |
62 | 95.9M | x_minus_zbin1 = _mm_sub_epi16(x1, zbin1); |
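/* A sketch of the rebalancing described above: the scalar reference tests
 *   abs(z) >= zbin[rc] + boost + zbin_extra
 * and only |boost| changes while scanning, so the constant part is folded
 * into the left-hand side once,
 *   abs(z) - (zbin[rc] + zbin_extra) >= boost
 * which is the quantity held per lane in x_minus_zbin0/1. */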
63 | | |
64 | | /* All the remaining calculations are valid whether they are done now with |
65 | | * simd or later inside the loop one at a time. */ |
66 | 95.9M | x0 = _mm_add_epi16(x0, round0); |
67 | 95.9M | x1 = _mm_add_epi16(x1, round1); |
68 | | |
69 | 95.9M | y0 = _mm_mulhi_epi16(x0, quant0); |
70 | 95.9M | y1 = _mm_mulhi_epi16(x1, quant1); |
71 | | |
72 | 95.9M | y0 = _mm_add_epi16(y0, x0); |
73 | 95.9M | y1 = _mm_add_epi16(y1, x1); |
74 | | |
75 | | /* Instead of shifting each value independently we convert the scaling |
76 | | * factor with 1 << (16 - shift) so we can use multiply/return high half. */ |
77 | 95.9M | y0 = _mm_mulhi_epi16(y0, quant_shift0); |
78 | 95.9M | y1 = _mm_mulhi_epi16(y1, quant_shift1); |
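/* A sketch of the identity behind the multiply-high trick, assuming
 * b->quant_shift was pre-filled with 1 << (16 - shift) as the comment above
 * describes: for a 16-bit value v and a shift s in [1, 16],
 *   v >> s  ==  (v * (1 << (16 - s))) >> 16
 * and _mm_mulhi_epi16 returns exactly that signed high half of the 32-bit
 * product for all eight lanes at once. */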
79 | | |
80 | | /* Restore the sign. */ |
81 | 95.9M | y0 = _mm_sign_epi16(y0, z0); |
82 | 95.9M | y1 = _mm_sign_epi16(y1, z1); |
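/* Note on the intrinsic: _mm_sign_epi16(y, z) gives -y where z < 0, y where
 * z > 0, and 0 where z == 0, so lanes with a zero input coefficient also end
 * up with a zero |y| and drop out of |ymask| below. */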
83 | | |
84 | 95.9M | { |
85 | 95.9M | const __m128i zig_zag_i16_0 = |
86 | 95.9M | _mm_setr_epi8(0, 1, 2, 3, 8, 9, 14, 15, 10, 11, 4, 5, 6, 7, 12, 13); |
87 | 95.9M | const __m128i zig_zag_i16_1 = |
88 | 95.9M | _mm_setr_epi8(0, 1, 6, 7, 8, 9, 2, 3, 14, 15, 4, 5, 10, 11, 12, 13); |
89 | | |
90 | | /* The first part of the zig zag needs a value |
91 | | * from x_minus_zbin1 and vice versa. */ |
92 | 95.9M | t1 = _mm_alignr_epi8(x_minus_zbin1, x_minus_zbin1, 2); |
93 | 95.9M | t0 = _mm_blend_epi16(x_minus_zbin0, t1, 0x80); |
94 | 95.9M | t1 = _mm_blend_epi16(t1, x_minus_zbin0, 0x80); |
95 | 95.9M | x_shuf0 = _mm_shuffle_epi8(t0, zig_zag_i16_0); |
96 | 95.9M | x_shuf1 = _mm_shuffle_epi8(t1, zig_zag_i16_1); |
97 | 95.9M | } |
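/* Why one lane crosses registers: in zig-zag order the first eight scan
 * positions use raster indices { 0, 1, 4, 8, 5, 2, 3, 6 } (only 8 lives in
 * the high register) and the last eight use { 9, 12, 13, 10, 7, 11, 14, 15 }
 * (only 7 lives in the low register), so the alignr/blend pair swaps exactly
 * one 16-bit value each way before the per-register byte shuffles. */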
98 | | |
99 | | /* Check if y is nonzero and put it in zig zag order. */ |
100 | 95.9M | t0 = _mm_packs_epi16(y0, y1); |
101 | 95.9M | t0 = _mm_cmpeq_epi8(t0, _mm_setzero_si128()); |
102 | 95.9M | t0 = _mm_shuffle_epi8(t0, _mm_load_si128((const __m128i *)zig_zag_mask)); |
103 | 95.9M | ymask = _mm_movemask_epi8(t0) ^ 0xffff; |
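/* After the xor, bit i of |ymask| is set iff the quantized value at zig-zag
 * position i is nonzero; a rough scalar sketch (y[] is an illustrative array
 * holding the sixteen values of y0/y1, not a variable in this file):
 *   for (int i = 0; i < 16; ++i)
 *     if (y[zig_zag_mask[i]] != 0) ymask |= 1u << i;
 * The packs/cmpeq pair works because a nonzero 16-bit value saturates to a
 * nonzero byte. */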
104 | | |
105 | 585M | for (;;) { |
106 | 585M | t0 = _mm_cmpgt_epi16(zbin_boost0, x_shuf0); |
107 | 585M | t1 = _mm_cmpgt_epi16(zbin_boost1, x_shuf1); |
108 | 585M | t0 = _mm_packs_epi16(t0, t1); |
109 | 585M | mask = _mm_movemask_epi8(t0); |
110 | 585M | mask = ~mask & ymask; |
111 | 585M | if (!mask) break; |
112 | | /* |eob| will contain the index of the next found element where: |
113 | | * boost[i - old_eob - 1] <= x[zigzag[i]] && y[zigzag[i]] != 0 */ |
114 | 489M | eob = get_lsb(mask); |
115 | | /* Need to clear the mask from processed elements so that |
116 | | * they are no longer counted in the next iteration. */ |
117 | 489M | ymask &= ~1U << eob; |
118 | | /* It's safe to read ahead of this buffer if struct VP8_COMP has at |
119 | | * least 32 bytes before the zrun_zbin_boost_* fields (it has 384). |
120 | | * Any data read outside of the buffer is masked by the updated |ymask|. */ |
121 | 489M | zbin_boost0 = _mm_loadu_si128((__m128i *)(zbin_boost_ptr - eob - 1)); |
122 | 489M | zbin_boost1 = _mm_loadu_si128((__m128i *)(zbin_boost_ptr - eob + 7)); |
123 | 489M | qcoeff[zig_zag_mask[eob]] = 0xffff; |
124 | 489M | } |
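/* A rough scalar sketch of what the loop above computes, using illustrative
 * array names x_minus_zbin[] and y[] (not variables in this file):
 *   int zrun = 0;
 *   for (int i = 0; i < 16; ++i) {
 *     const int rc = zig_zag_mask[i];
 *     if (x_minus_zbin[rc] >= zbin_boost_ptr[zrun] && y[rc] != 0) {
 *       eob = i;             // last surviving zig-zag position so far
 *       qcoeff[rc] = 0xffff; // lane is kept when masking below
 *       zrun = 0;            // zero-run boost restarts after a hit
 *     } else {
 *       ++zrun;              // longer zero run, larger boost next time
 *     }
 *   }
 * The vector loop reaches the same |eob| and qcoeff[] mask but jumps straight
 * to the next candidate with get_lsb(mask) instead of visiting every
 * position. */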
125 | | |
126 | 95.9M | qcoeff0 = _mm_load_si128((__m128i *)(qcoeff)); |
127 | 95.9M | qcoeff1 = _mm_load_si128((__m128i *)(qcoeff + 8)); |
128 | 95.9M | qcoeff0 = _mm_and_si128(qcoeff0, y0); |
129 | 95.9M | qcoeff1 = _mm_and_si128(qcoeff1, y1); |
130 | | |
131 | 95.9M | _mm_store_si128((__m128i *)(d->qcoeff), qcoeff0); |
132 | 95.9M | _mm_store_si128((__m128i *)(d->qcoeff + 8), qcoeff1); |
133 | | |
134 | 95.9M | dqcoeff0 = _mm_mullo_epi16(qcoeff0, dequant0); |
135 | 95.9M | dqcoeff1 = _mm_mullo_epi16(qcoeff1, dequant1); |
136 | | |
137 | 95.9M | _mm_store_si128((__m128i *)(d->dqcoeff), dqcoeff0); |
138 | 95.9M | _mm_store_si128((__m128i *)(d->dqcoeff + 8), dqcoeff1); |
139 | | |
140 | 95.9M | *d->eob = eob + 1; |
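/* |eob| is the zig-zag index of the last surviving coefficient, or -1 when
 * every coefficient quantized to zero, so *d->eob receives the coefficient
 * count: 0 for an all-zero block, up to 16. */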
141 | 95.9M | } |