/src/libwebp/src/dsp/enc_sse41.c
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE4 version of some encoding functions.
//
// Author: Skal (pascal.massimino@gmail.com)

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_SSE41)
#include <emmintrin.h>
#include <smmintrin.h>

#include <stdlib.h>  // for abs()

#include "src/dsp/common_sse2.h"
#include "src/dsp/cpu.h"
#include "src/enc/vp8i_enc.h"
#include "src/webp/types.h"

//------------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms.
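// Each 4x4 block of residuals is forward-transformed and the magnitude of its
// coefficients is bucketed into a histogram, from which the encoder derives a
// "susceptibility" (texture complexity) estimate.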

static void CollectHistogram_SSE41(const uint8_t* WEBP_RESTRICT ref,
                                   const uint8_t* WEBP_RESTRICT pred,
                                   int start_block, int end_block,
                                   VP8Histogram* WEBP_RESTRICT const histo) {
  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
    int k;

    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);

    // Convert coefficients to bin (within out[]).
    {
      // Load.
      const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
      const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
      // v = abs(out) >> 3
      const __m128i abs0 = _mm_abs_epi16(out0);
      const __m128i abs1 = _mm_abs_epi16(out1);
      const __m128i v0 = _mm_srai_epi16(abs0, 3);
      const __m128i v1 = _mm_srai_epi16(abs1, 3);
      // bin = min(v, MAX_COEFF_THRESH)
      const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
      const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
      // Store.
      _mm_storeu_si128((__m128i*)&out[0], bin0);
      _mm_storeu_si128((__m128i*)&out[8], bin1);
    }
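    // Scalar equivalent of the SIMD block above, for reference:
    //   for (k = 0; k < 16; ++k) {
    //     const int v = abs(out[k]) >> 3;
    //     out[k] = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v;
    //   }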

    // Accumulate the bins into the histogram.
    for (k = 0; k < 16; ++k) {
      ++distribution[out[k]];
    }
  }
  VP8SetHistogramData(distribution, histo);
}

//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.

// Hadamard transform
// Returns the difference between the weighted sums of the absolute values of
// the transformed coefficients of inA and inB.
// w[] contains a row-major 4 by 4 symmetric matrix.
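// Concretely, with H() denoting the (unnormalized) 4x4 Hadamard transform,
// the function computes
//   sum_i w[i] * |H(inA)[i]|  -  sum_i w[i] * |H(inB)[i]|
// for both input blocks in a single pass.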
static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB,
                            const uint16_t* const w) {
  int32_t sum[4];
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;

  // Load and combine inputs.
  {
    const __m128i inA_0 = _mm_loadu_si128((const __m128i*)&inA[BPS * 0]);
    const __m128i inA_1 = _mm_loadu_si128((const __m128i*)&inA[BPS * 1]);
    const __m128i inA_2 = _mm_loadu_si128((const __m128i*)&inA[BPS * 2]);
    // In SSE4.1, with gcc 4.8 at least (maybe other versions),
    // _mm_loadu_si128 is faster than _mm_loadl_epi64. But for the last row
    // of inA and inB, _mm_loadl_epi64 is still used to avoid an
    // out-of-bounds read.
    const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
    const __m128i inB_0 = _mm_loadu_si128((const __m128i*)&inB[BPS * 0]);
    const __m128i inB_1 = _mm_loadu_si128((const __m128i*)&inB[BPS * 1]);
    const __m128i inB_2 = _mm_loadu_si128((const __m128i*)&inB[BPS * 2]);
    const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);

    // Combine inA and inB (we'll do two transforms in parallel).
    const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0);
    const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1);
    const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2);
    const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3);
    tmp_0 = _mm_cvtepu8_epi16(inAB_0);
    tmp_1 = _mm_cvtepu8_epi16(inAB_1);
    tmp_2 = _mm_cvtepu8_epi16(inAB_2);
    tmp_3 = _mm_cvtepu8_epi16(inAB_3);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
  }

  // Vertical pass first to avoid a transpose (vertical and horizontal passes
  // are commutative because w/kWeightY is symmetric) and subsequent transpose.
  {
    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33

    // Transpose the two 4x4.
    VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3);
  }
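  // (VP8Transpose_2_4x4_16b(), from common_sse2.h, transposes both interleaved
  // 4x4 halves at once; the transposed rows land in tmp_0..tmp_3 for the
  // horizontal pass below.)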

  // Horizontal pass and difference of weighted sums.
  {
    // Load all inputs.
    const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
    const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);

    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);

    // Separate the transforms of inA and inB.
    __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
    __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
    __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
    __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);

    A_b0 = _mm_abs_epi16(A_b0);
    A_b2 = _mm_abs_epi16(A_b2);
    B_b0 = _mm_abs_epi16(B_b0);
    B_b2 = _mm_abs_epi16(B_b2);

    // weighted sums
    A_b0 = _mm_madd_epi16(A_b0, w_0);
    A_b2 = _mm_madd_epi16(A_b2, w_8);
    B_b0 = _mm_madd_epi16(B_b0, w_0);
    B_b2 = _mm_madd_epi16(B_b2, w_8);
    A_b0 = _mm_add_epi32(A_b0, A_b2);
    B_b0 = _mm_add_epi32(B_b0, B_b2);

    // difference of weighted sums
    A_b2 = _mm_sub_epi32(A_b0, B_b0);
    _mm_storeu_si128((__m128i*)&sum[0], A_b2);
  }
  return sum[0] + sum[1] + sum[2] + sum[3];
}

static int Disto4x4_SSE41(const uint8_t* WEBP_RESTRICT const a,
                          const uint8_t* WEBP_RESTRICT const b,
                          const uint16_t* WEBP_RESTRICT const w) {
  const int diff_sum = TTransform_SSE41(a, b, w);
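  // The transform is left unnormalized; >> 5 rescales the weighted difference
  // to the range used by the rest of the encoder (same as the C version's
  // abs(sum2 - sum1) >> 5).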
  return abs(diff_sum) >> 5;
}

static int Disto16x16_SSE41(const uint8_t* WEBP_RESTRICT const a,
                            const uint8_t* WEBP_RESTRICT const b,
                            const uint16_t* WEBP_RESTRICT const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
      D += Disto4x4_SSE41(a + x + y, b + x + y, w);
    }
  }
  return D;
}

//------------------------------------------------------------------------------
// Quantization
//

// Generates a pshufb constant for shuffling 16b words.
#define PSHUFB_CST(A,B,C,D,E,F,G,H) \
  _mm_set_epi8(2 * (H) + 1, 2 * (H) + 0, 2 * (G) + 1, 2 * (G) + 0, \
               2 * (F) + 1, 2 * (F) + 0, 2 * (E) + 1, 2 * (E) + 0, \
               2 * (D) + 1, 2 * (D) + 0, 2 * (C) + 1, 2 * (C) + 0, \
               2 * (B) + 1, 2 * (B) + 0, 2 * (A) + 1, 2 * (A) + 0)
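// Each argument selects the source 16b word for one destination word. A
// negative argument yields byte indices with the high bit set, which pshufb
// maps to a zeroed output lane.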

static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
                                             const uint16_t* const sharpen,
                                             const VP8Matrix* const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
  const __m128i zero = _mm_setzero_si128();
  __m128i out0, out8;
  __m128i packed_out;

  // Load all inputs.
  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
  const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq[0]);
  const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq[8]);
  const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q[0]);
  const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q[8]);

  // coeff = abs(in)
  __m128i coeff0 = _mm_abs_epi16(in0);
  __m128i coeff8 = _mm_abs_epi16(in8);

  // coeff = abs(in) + sharpen
  if (sharpen != NULL) {
    const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
    const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
    coeff0 = _mm_add_epi16(coeff0, sharpen0);
    coeff8 = _mm_add_epi16(coeff8, sharpen8);
  }

  // out = (coeff * iQ + B) >> QFIX
  {
    // doing calculations with 32b precision (QFIX=17)
    // out = (coeff * iQ)
    const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
    const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
    const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
    const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
    // out = (coeff * iQ + B)
    const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias[0]);
    const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias[4]);
    const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias[8]);
    const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias[12]);
    out_00 = _mm_add_epi32(out_00, bias_00);
    out_04 = _mm_add_epi32(out_04, bias_04);
    out_08 = _mm_add_epi32(out_08, bias_08);
    out_12 = _mm_add_epi32(out_12, bias_12);
    // out = QUANTDIV(coeff, iQ, B, QFIX)
    out_00 = _mm_srai_epi32(out_00, QFIX);
    out_04 = _mm_srai_epi32(out_04, QFIX);
    out_08 = _mm_srai_epi32(out_08, QFIX);
    out_12 = _mm_srai_epi32(out_12, QFIX);

    // pack result as 16b
    out0 = _mm_packs_epi32(out_00, out_04);
    out8 = _mm_packs_epi32(out_08, out_12);

    // if (coeff > 2047) coeff = 2047
    out0 = _mm_min_epi16(out0, max_coeff_2047);
    out8 = _mm_min_epi16(out8, max_coeff_2047);
  }
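  // Scalar equivalent of the block above (pseudocode, per coefficient k):
  //   level = (coeff[k] * mtx->iq[k] + mtx->bias[k]) >> QFIX;  // QUANTDIV()
  //   if (level > MAX_LEVEL) level = MAX_LEVEL;                // i.e. 2047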

  // put sign back
  out0 = _mm_sign_epi16(out0, in0);
  out8 = _mm_sign_epi16(out8, in8);

  // in = out * Q
  in0 = _mm_mullo_epi16(out0, q0);
  in8 = _mm_mullo_epi16(out8, q8);

  _mm_storeu_si128((__m128i*)&in[0], in0);
  _mm_storeu_si128((__m128i*)&in[8], in8);

  // Zigzag the output before storing it. The re-ordering is:
  //    0 1 2 3 4 5 6 7 | 8  9 10 11 12 13 14 15
  // -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15
  // Only two misplaced entries ([8] and [7]) cross the register boundary.
  // We use pshufb instead of pshuflo/pshufhi.
  {
    const __m128i kCst_lo = PSHUFB_CST(0, 1, 4, -1, 5, 2, 3, 6);
    const __m128i kCst_7 = PSHUFB_CST(-1, -1, -1, -1, 7, -1, -1, -1);
    const __m128i tmp_lo = _mm_shuffle_epi8(out0, kCst_lo);
    const __m128i tmp_7 = _mm_shuffle_epi8(out0, kCst_7);  // extract #7
    const __m128i kCst_hi = PSHUFB_CST(1, 4, 5, 2, -1, 3, 6, 7);
    const __m128i kCst_8 = PSHUFB_CST(-1, -1, -1, 0, -1, -1, -1, -1);
    const __m128i tmp_hi = _mm_shuffle_epi8(out8, kCst_hi);
    const __m128i tmp_8 = _mm_shuffle_epi8(out8, kCst_8);  // extract #8
    const __m128i out_z0 = _mm_or_si128(tmp_lo, tmp_8);
    const __m128i out_z8 = _mm_or_si128(tmp_hi, tmp_7);
    _mm_storeu_si128((__m128i*)&out[0], out_z0);
    _mm_storeu_si128((__m128i*)&out[8], out_z8);
    packed_out = _mm_packs_epi16(out_z0, out_z8);
  }

  // detect if all 'out' values are zeroes or not
  return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
}

#undef PSHUFB_CST

static int QuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
                               const VP8Matrix* WEBP_RESTRICT const mtx) {
  return DoQuantizeBlock_SSE41(in, out, &mtx->sharpen[0], mtx);
}

static int QuantizeBlockWHT_SSE41(int16_t in[16], int16_t out[16],
                                  const VP8Matrix* WEBP_RESTRICT const mtx) {
  return DoQuantizeBlock_SSE41(in, out, NULL, mtx);
}

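// Quantizes two consecutive 4x4 blocks; bit n of the returned value is set
// iff block n has at least one non-zero quantized coefficient.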
static int Quantize2Blocks_SSE41(int16_t in[32], int16_t out[32],
                                 const VP8Matrix* WEBP_RESTRICT const mtx) {
  int nz;
  const uint16_t* const sharpen = &mtx->sharpen[0];
  nz  = DoQuantizeBlock_SSE41(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
  nz |= DoQuantizeBlock_SSE41(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
  return nz;
}

//------------------------------------------------------------------------------
// Entry point
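//
// VP8EncDspInitSSE41() is invoked by the runtime dispatcher when SSE4.1
// support is detected; it overrides the function pointers installed by the
// generic C (and SSE2) implementations.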

extern void VP8EncDspInitSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE41(void) {
  VP8CollectHistogram = CollectHistogram_SSE41;
  VP8EncQuantizeBlock = QuantizeBlock_SSE41;
  VP8EncQuantize2Blocks = Quantize2Blocks_SSE41;
  VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE41;
  VP8TDisto4x4 = Disto4x4_SSE41;
  VP8TDisto16x16 = Disto16x16_SSE41;
}

#else  // !WEBP_USE_SSE41

WEBP_DSP_INIT_STUB(VP8EncDspInitSSE41)

#endif  // WEBP_USE_SSE41