Coverage Report

Created: 2024-09-06 07:53

/src/theora/lib/x86/sse2encfrag.c
Line
Count
Source
1
/********************************************************************
2
 *                                                                  *
3
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7
 *                                                                  *
8
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
9
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
10
 *                                                                  *
11
 ********************************************************************
12
13
  function:
14
  last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
15
16
 ********************************************************************/
17
#include <stddef.h>
18
#include "x86enc.h"
19
#include "sse2trans.h"
20
21
#if defined(OC_X86_ASM)
22
23
/*Load a 4x8 array of pixel values from %[src] and %[ref] and compute their
   16-bit differences.
  Each source byte is interleaved with the matching reference byte, the
   reference is interleaved with itself, and the two are subtracted as words;
   the high bytes cancel, leaving src-ref in every 16-bit lane.
  On output, the four rows of differences are stored in _m0, xmm1, xmm2, and
   xmm3.
  xmm4 and xmm5 are clobbered.
  %[src] and %[ref] are both advanced by four rows, so any asm statement using
   this macro must declare them as read-write operands.*/
#define OC_LOAD_SUB_4x8(_m0) \
 "#OC_LOAD_SUB_4x8\n\t" \
 /*Load the first three rows.*/ \
 "movq (%[src]),"_m0"\n\t" \
 "movq (%[ref]),%%xmm4\n\t" \
 "movq (%[src],%[ystride]),%%xmm1\n\t" \
 "movq (%[ref],%[ystride]),%%xmm3\n\t" \
 "movq (%[src],%[ystride],2),%%xmm2\n\t" \
 "movq (%[ref],%[ystride],2),%%xmm5\n\t" \
 /*Unpack and subtract.*/ \
 "punpcklbw %%xmm4,"_m0"\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "punpcklbw %%xmm3,%%xmm1\n\t" \
 "punpcklbw %%xmm3,%%xmm3\n\t" \
 "psubw %%xmm4,"_m0"\n\t" \
 "psubw %%xmm3,%%xmm1\n\t" \
 /*Load the last row.*/ \
 "movq (%[src],%[ystride3]),%%xmm3\n\t" \
 "movq (%[ref],%[ystride3]),%%xmm4\n\t" \
 /*Unpack, subtract, and advance the pointers.*/ \
 "punpcklbw %%xmm5,%%xmm2\n\t" \
 "punpcklbw %%xmm5,%%xmm5\n\t" \
 "lea (%[src],%[ystride],4),%[src]\n\t" \
 "psubw %%xmm5,%%xmm2\n\t" \
 "punpcklbw %%xmm4,%%xmm3\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "lea (%[ref],%[ystride],4),%[ref]\n\t" \
 "psubw %%xmm4,%%xmm3\n\t"

/*Square and accumulate four rows of differences in _m0, xmm1, xmm2, and xmm3.
  On output, _m0 contains the sum of two of the rows (the first and second),
   and the other two are added to xmm7.
  When _m0 is itself xmm7, all four rows end up in xmm7.*/
#define OC_SSD_4x8(_m0) \
 "pmaddwd "_m0","_m0"\n\t" \
 "pmaddwd %%xmm1,%%xmm1\n\t" \
 "pmaddwd %%xmm2,%%xmm2\n\t" \
 "pmaddwd %%xmm3,%%xmm3\n\t" \
 "paddd %%xmm1,"_m0"\n\t" \
 "paddd %%xmm3,%%xmm2\n\t" \
 "paddd %%xmm2,%%xmm7\n\t"

/*Computes the sum of squared differences between two 8x8 blocks of pixels.
  _src:     The first row of the source block.
  _ref:     The first row of the reference block.
  _ystride: The offset in bytes between consecutive rows; the same stride is
             used for both blocks.
  Return: The sum over all 64 pixels of (_src[i]-_ref[i])**2.*/
unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned ret;
  __asm__ __volatile__(
    /*Rows 0...3: all four rows are accumulated directly into xmm7.*/
    OC_LOAD_SUB_4x8("%%xmm7")
    OC_SSD_4x8("%%xmm7")
    /*Rows 4...7: two rows land in xmm0, the other two are added to xmm7.*/
    OC_LOAD_SUB_4x8("%%xmm0")
    OC_SSD_4x8("%%xmm0")
    "paddd %%xmm0,%%xmm7\n\t"
    /*Horizontally add the four dword partial sums.*/
    "movdqa %%xmm7,%%xmm6\n\t"
    "punpckhqdq %%xmm7,%%xmm7\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "pshufd $1,%%xmm7,%%xmm6\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "movd %%xmm7,%[ret]\n\t"
    /*OC_LOAD_SUB_4x8 advances %[src] and %[ref], so they must be read-write
       operands: the compiler assumes input-only operands still hold their
       original values when the asm statement finishes.*/
    :[ret]"=a"(ret),[src]"+r"(_src),[ref]"+r"(_ref)
    :[ystride]"r"((ptrdiff_t)_ystride),
     [ystride3]"r"((ptrdiff_t)_ystride*3)
    /*The pixels are read through pointers the compiler cannot see being
       dereferenced, so make sure pending stores to them are not dropped or
       reordered past this statement.*/
    :"memory"
  );
  return ret;
}
89
90
/*One single-bit mask per byte: entry i has only bit i set.
  oc_enc_frag_border_ssd_sse2() loads these eight bytes into xmm6 so that a
   row's 8-bit pixel mask can be expanded into a per-byte 0x00/0xFF mask.*/
static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={
  1<<0,1<<1,1<<2,1<<3,1<<4,1<<5,1<<6,1<<7
};
93
94
/*Load a 2x8 array of pixel values from %[src] and %[ref] and compute their
   16-bit differences subject to a mask.
  The low 16 bits of %[m] hold the mask: bits 0...7 select pixels of the first
   row and bits 8...15 select pixels of the second row.
  Pixels whose mask bit is clear are zeroed in both the source and the
   reference, so their difference is 0.
  %%xmm6 must contain OC_MASK_CONSTS[0...7] (it is used both as the AND mask
   and as the comparison value).
  On output, the differences for the two rows are in xmm0 and xmm1.
  xmm2, xmm3, and xmm4 are clobbered, and %[m] is modified; %[m] must live in
   a register with an addressable high byte (the "Q" constraint), since %h[m]
   is used to replicate mask bytes.*/
#define OC_LOAD_SUB_MASK_2x8 \
 "#OC_LOAD_SUB_MASK_2x8\n\t" \
 /*Start the loads and expand the next 8 bits of the mask.*/ \
 "shl $8,%[m]\n\t" \
 "movq (%[src]),%%xmm0\n\t" \
 "mov %h[m],%b[m]\n\t" \
 "movq (%[ref]),%%xmm2\n\t" \
 "movd %[m],%%xmm4\n\t" \
 "shr $8,%[m]\n\t" \
 "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
 "mov %h[m],%b[m]\n\t" \
 "pand %%xmm6,%%xmm4\n\t" \
 "pcmpeqb %%xmm6,%%xmm4\n\t" \
 /*Perform the masking.*/ \
 "pand %%xmm4,%%xmm0\n\t" \
 "pand %%xmm4,%%xmm2\n\t" \
 /*Finish the loads while unpacking the first set of rows, and expand the*/ \
 /* next 8 bits of the mask.*/ \
 "movd %[m],%%xmm4\n\t" \
 "movq (%[src],%[ystride]),%%xmm1\n\t" \
 "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
 "movq (%[ref],%[ystride]),%%xmm3\n\t" \
 "pand %%xmm6,%%xmm4\n\t" \
 "punpcklbw %%xmm2,%%xmm0\n\t" \
 "pcmpeqb %%xmm6,%%xmm4\n\t" \
 "punpcklbw %%xmm2,%%xmm2\n\t" \
 /*Mask and unpack the second set of rows.*/ \
 "pand %%xmm4,%%xmm1\n\t" \
 "pand %%xmm4,%%xmm3\n\t" \
 "punpcklbw %%xmm3,%%xmm1\n\t" \
 "punpcklbw %%xmm3,%%xmm3\n\t" \
 "psubw %%xmm2,%%xmm0\n\t" \
 "psubw %%xmm3,%%xmm1\n\t"
130
131
/*Computes the sum of squared differences between two 8x8 blocks, counting
   only the pixels selected by a 64-bit mask.
  _src:     The first row of the source block.
  _ref:     The first row of the reference block.
  _ystride: The offset in bytes between consecutive rows of both blocks.
  _mask:    One bit per pixel, consumed 16 bits (two rows) at a time starting
             from the least significant bits; within each row's byte, bit x
             selects pixel x.
  Return: The sum of (_src[i]-_ref[i])**2 over the selected pixels.
  The running total is kept in xmm7 and the mask constants in xmm6 across
   several separate asm statements, so this relies on the compiler not touching
   those registers in between.*/
unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
  ptrdiff_t stride;
  unsigned  ssd;
  int       pairs_left;
  stride=_ystride;
  /*Clear the accumulator and load the single-bit constants.*/
  __asm__ __volatile__(
    "pxor %%xmm7,%%xmm7\n\t"
    "movq %[c],%%xmm6\n\t"
    :
    :[c]"m"(OC_CONST_ARRAY_OPERAND(unsigned char,OC_MASK_CONSTS,8))
  );
  /*Process the block two rows at a time.*/
  for(pairs_left=4;pairs_left>0;pairs_left--){
    unsigned row_bits;
    row_bits=_mask&0xFFFF;
    _mask>>=16;
    /*Skip the loads entirely when neither row has a selected pixel.*/
    if(row_bits!=0){
      __asm__ __volatile__(
        OC_LOAD_SUB_MASK_2x8
        "pmaddwd %%xmm0,%%xmm0\n\t"
        "pmaddwd %%xmm1,%%xmm1\n\t"
        "paddd %%xmm0,%%xmm7\n\t"
        "paddd %%xmm1,%%xmm7\n\t"
        :[src]"+r"(_src),[ref]"+r"(_ref),[ystride]"+r"(stride),
         [m]"+Q"(row_bits)
      );
    }
    _src+=2*stride;
    _ref+=2*stride;
  }
  /*Horizontally add the four dword partial sums.*/
  __asm__ __volatile__(
    "movdqa %%xmm7,%%xmm6\n\t"
    "punpckhqdq %%xmm7,%%xmm7\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "pshufd $1,%%xmm7,%%xmm6\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "movd %%xmm7,%[ret]\n\t"
    :[ret]"=a"(ssd)
  );
  return ssd;
}
171
172
173
/*Load an 8x8 array of pixel values from %[src] and %[ref] and compute their
   16-bit difference in %%xmm0...%%xmm7 (one row per register).
  All eight registers are needed for the interleaved loads, so the row 0
   difference is parked in the first 16 bytes of %[buf] and reloaded into xmm0
   at the end.
  This is destructive to its operands: %[src] and %[ref] are each advanced by
   eight rows, and %[src_ystride] and %[ref_ystride] are both negated (the last
   row is addressed backwards from the advanced pointer).
  The condition codes are clobbered by the neg instructions.*/
#define OC_LOAD_SUB_8x8 \
 "#OC_LOAD_SUB_8x8\n\t" \
 "movq (%[src]),%%xmm0\n\t" \
 "movq (%[ref]),%%xmm4\n\t" \
 "movq (%[src],%[src_ystride]),%%xmm1\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "movq (%[ref],%[ref_ystride]),%%xmm5\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "movq (%[src]),%%xmm2\n\t" \
 "movq (%[ref]),%%xmm7\n\t" \
 "movq (%[src],%[src_ystride]),%%xmm3\n\t" \
 "movq (%[ref],%[ref_ystride]),%%xmm6\n\t" \
 "punpcklbw %%xmm4,%%xmm0\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "psubw %%xmm4,%%xmm0\n\t" \
 "movq (%[src]),%%xmm4\n\t" \
 /*Park the row 0 difference so xmm0 can be reused for reference rows.*/ \
 "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
 "movq (%[ref]),%%xmm0\n\t" \
 "punpcklbw %%xmm5,%%xmm1\n\t" \
 "punpcklbw %%xmm5,%%xmm5\n\t" \
 "psubw %%xmm5,%%xmm1\n\t" \
 "movq (%[src],%[src_ystride]),%%xmm5\n\t" \
 "punpcklbw %%xmm7,%%xmm2\n\t" \
 "punpcklbw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm7,%%xmm2\n\t" \
 "movq (%[ref],%[ref_ystride]),%%xmm7\n\t" \
 "punpcklbw %%xmm6,%%xmm3\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "punpcklbw %%xmm6,%%xmm6\n\t" \
 "psubw %%xmm6,%%xmm3\n\t" \
 "movq (%[src]),%%xmm6\n\t" \
 "punpcklbw %%xmm0,%%xmm4\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "punpcklbw %%xmm0,%%xmm0\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "psubw %%xmm0,%%xmm4\n\t" \
 "movq (%[ref]),%%xmm0\n\t" \
 "punpcklbw %%xmm7,%%xmm5\n\t" \
 /*%[src] now points one row past the block; step back for the last row.*/ \
 "neg %[src_ystride]\n\t" \
 "punpcklbw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm7,%%xmm5\n\t" \
 "movq (%[src],%[src_ystride]),%%xmm7\n\t" \
 "punpcklbw %%xmm0,%%xmm6\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "punpcklbw %%xmm0,%%xmm0\n\t" \
 "neg %[ref_ystride]\n\t" \
 "psubw %%xmm0,%%xmm6\n\t" \
 "movq (%[ref],%[ref_ystride]),%%xmm0\n\t" \
 "punpcklbw %%xmm0,%%xmm7\n\t" \
 "punpcklbw %%xmm0,%%xmm0\n\t" \
 "psubw %%xmm0,%%xmm7\n\t" \
 /*Restore the row 0 difference.*/ \
 "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t"
229
230
/*Load an 8x8 array of pixel values from %[src] into %%xmm0...%%xmm7, one row
   per register, zero-extended to 16 bits.
  %[src4] must point at row 4 and %[ystride3] must be three times %[ystride].
  Rows 0...3 are widened by interleaving with a zeroed xmm7.
  xmm7 is then needed for row 7, so rows 4...7 are widened by interleaving each
   register with itself and shifting every word right by 8 instead.
  None of the operands are modified.*/
#define OC_LOAD_8x8 \
 "#OC_LOAD_8x8\n\t" \
 "movq (%[src]),%%xmm0\n\t" \
 "movq (%[src],%[ystride]),%%xmm1\n\t" \
 "movq (%[src],%[ystride],2),%%xmm2\n\t" \
 "pxor %%xmm7,%%xmm7\n\t" \
 "movq (%[src],%[ystride3]),%%xmm3\n\t" \
 "punpcklbw %%xmm7,%%xmm0\n\t" \
 "movq (%[src4]),%%xmm4\n\t" \
 "punpcklbw %%xmm7,%%xmm1\n\t" \
 "movq (%[src4],%[ystride]),%%xmm5\n\t" \
 "punpcklbw %%xmm7,%%xmm2\n\t" \
 "movq (%[src4],%[ystride],2),%%xmm6\n\t" \
 "punpcklbw %%xmm7,%%xmm3\n\t" \
 "movq (%[src4],%[ystride3]),%%xmm7\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "punpcklbw %%xmm5,%%xmm5\n\t" \
 "psrlw $8,%%xmm4\n\t" \
 "psrlw $8,%%xmm5\n\t" \
 "punpcklbw %%xmm6,%%xmm6\n\t" \
 "punpcklbw %%xmm7,%%xmm7\n\t" \
 "psrlw $8,%%xmm6\n\t" \
 "psrlw $8,%%xmm7\n\t"
254
255
/*Performs the first two stages of an 8-point 1-D Hadamard transform in place
   on %%xmm0...%%xmm7.
  Each butterfly (a,b)->(a+b,b-a) is computed as b+=a... no: as
   a+=b; b+=b; b-=a, which needs no scratch register but leaves the difference
   output with its sign flipped relative to a-b.
  As a result, outputs 1, 3, 4, and 5 from the second stage are negated.*/
#define OC_HADAMARD_AB_8x8 \
 "#OC_HADAMARD_AB_8x8\n\t" \
 /*Stage A:*/ \
 "paddw %%xmm5,%%xmm1\n\t" \
 "paddw %%xmm6,%%xmm2\n\t" \
 "paddw %%xmm5,%%xmm5\n\t" \
 "paddw %%xmm6,%%xmm6\n\t" \
 "psubw %%xmm1,%%xmm5\n\t" \
 "psubw %%xmm2,%%xmm6\n\t" \
 "paddw %%xmm7,%%xmm3\n\t" \
 "paddw %%xmm4,%%xmm0\n\t" \
 "paddw %%xmm7,%%xmm7\n\t" \
 "paddw %%xmm4,%%xmm4\n\t" \
 "psubw %%xmm3,%%xmm7\n\t" \
 "psubw %%xmm0,%%xmm4\n\t" \
 /*Stage B:*/ \
 "paddw %%xmm2,%%xmm0\n\t" \
 "paddw %%xmm3,%%xmm1\n\t" \
 "paddw %%xmm6,%%xmm4\n\t" \
 "paddw %%xmm7,%%xmm5\n\t" \
 "paddw %%xmm2,%%xmm2\n\t" \
 "paddw %%xmm3,%%xmm3\n\t" \
 "paddw %%xmm6,%%xmm6\n\t" \
 "paddw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm0,%%xmm2\n\t" \
 "psubw %%xmm1,%%xmm3\n\t" \
 "psubw %%xmm4,%%xmm6\n\t" \
 "psubw %%xmm5,%%xmm7\n\t"
286
287
/*Performs the last stage of an 8-point 1-D Hadamard transform in place on
   %%xmm0...%%xmm7, pairing each even register with the odd one after it.
  The butterflies use the same add/double/subtract sequence as the earlier
   stages, so no temporary registers are needed; the price is that outputs
   1, 3, 5, and 7 are negated.*/
#define OC_HADAMARD_C_8x8 \
 "#OC_HADAMARD_C_8x8\n\t" \
 /*Stage C:*/ \
 "paddw %%xmm1,%%xmm0\n\t" \
 "paddw %%xmm3,%%xmm2\n\t" \
 "paddw %%xmm5,%%xmm4\n\t" \
 "paddw %%xmm7,%%xmm6\n\t" \
 "paddw %%xmm1,%%xmm1\n\t" \
 "paddw %%xmm3,%%xmm3\n\t" \
 "paddw %%xmm5,%%xmm5\n\t" \
 "paddw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm0,%%xmm1\n\t" \
 "psubw %%xmm2,%%xmm3\n\t" \
 "psubw %%xmm4,%%xmm5\n\t" \
 "psubw %%xmm6,%%xmm7\n\t"
305
306
/*Performs a complete 8-point 1-D Hadamard transform in place on
   %%xmm0...%%xmm7 by running stages A and B followed by stage C.
  Outputs 1, 2, 4, and 7 are negated (which allows us to perform the transform
   in place with no temporary registers).*/
#define OC_HADAMARD_8x8 \
 OC_HADAMARD_AB_8x8 \
 OC_HADAMARD_C_8x8
312
313
/*Performs the first part of the final stage of the Hadamard transform and
   summing of absolute values.
  At the end of this part, %%xmm1 will contain the DC coefficient of the
   transform.
  xmm6 and xmm7 are spilled to the first 32 bytes of %[buf] and come back in
   xmm5 and xmm3 respectively; xmm7 is left holding 0x7FFF in every word for
   the second part.

  We use the fact that
    (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b))
   to merge the final butterfly with the abs and the first stage of
   accumulation.
  Thus we can avoid using pabsw, which is not available until SSSE3.
  Emulating pabsw takes 3 instructions, so the straightforward SSE2
   implementation would be (3+3)*8+7=55 instructions (+4 for spilling
   registers).
  Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills).
  This implementation is only 26 (+4 for spilling registers).*/
#define OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
 "#OC_HADAMARD_C_ABS_ACCUM_A_8x8\n\t" \
 "movdqa %%xmm7,"OC_MEM_OFFS(0x10,buf)"\n\t" \
 "movdqa %%xmm6,"OC_MEM_OFFS(0x00,buf)"\n\t" \
 /*xmm7={0x7FFF}x8*/ \
 /*xmm4=max(abs(xmm4),abs(xmm5))-0x7FFF*/ \
 "pcmpeqb %%xmm7,%%xmm7\n\t" \
 "movdqa %%xmm4,%%xmm6\n\t" \
 "psrlw $1,%%xmm7\n\t" \
 "paddw %%xmm5,%%xmm6\n\t" \
 "pmaxsw %%xmm5,%%xmm4\n\t" \
 "paddsw %%xmm7,%%xmm6\n\t" \
 "psubw %%xmm6,%%xmm4\n\t" \
 /*xmm2=max(abs(xmm2),abs(xmm3))-0x7FFF*/ \
 /*xmm0=max(abs(xmm0),abs(xmm1))-0x7FFF*/ \
 /*(both are only started here; the second part finishes them).*/ \
 "movdqa %%xmm2,%%xmm6\n\t" \
 "movdqa %%xmm0,%%xmm5\n\t" \
 "pmaxsw %%xmm3,%%xmm2\n\t" \
 "pmaxsw %%xmm1,%%xmm0\n\t" \
 "paddw %%xmm3,%%xmm6\n\t" \
 "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm3\n\t" \
 "paddw %%xmm5,%%xmm1\n\t" \
 "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm5\n\t"
350
351
/*Performs the second part of the final stage of the Hadamard transform and
   summing of absolute values.
  This must directly follow OC_HADAMARD_C_ABS_ACCUM_A_8x8 (optionally with the
   DC coefficient being copied out of xmm1 in between): it expects xmm7 to hold
   0x7FFF in every word and the partially processed pairs left by that macro.
  On output, xmm0 holds the per-column accumulated sums and xmm7 holds 1 in
   every word.*/
#define OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
 "#OC_HADAMARD_C_ABS_ACCUM_B_8x8\n\t" \
 "paddsw %%xmm7,%%xmm6\n\t" \
 "paddsw %%xmm7,%%xmm1\n\t" \
 "psubw %%xmm6,%%xmm2\n\t" \
 "psubw %%xmm1,%%xmm0\n\t" \
 /*xmm7={1}x8 (needed for the horizontal add that follows)*/ \
 /*xmm0+=xmm2+xmm4+max(abs(xmm3),abs(xmm5))-0x7FFF*/ \
 "movdqa %%xmm3,%%xmm6\n\t" \
 "pmaxsw %%xmm5,%%xmm3\n\t" \
 "paddw %%xmm2,%%xmm0\n\t" \
 "paddw %%xmm5,%%xmm6\n\t" \
 "paddw %%xmm4,%%xmm0\n\t" \
 "paddsw %%xmm7,%%xmm6\n\t" \
 "paddw %%xmm3,%%xmm0\n\t" \
 "psrlw $14,%%xmm7\n\t" \
 "psubw %%xmm6,%%xmm0\n\t"
370
371
/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
   absolute value of each component, and accumulates everything into xmm0.
  This is simply the A part immediately followed by the B part.*/
#define OC_HADAMARD_C_ABS_ACCUM_8x8 \
 OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
 OC_HADAMARD_C_ABS_ACCUM_B_8x8
376
377
/*Performs an 8-point 1-D Hadamard transform (stages A and B, then the merged
   final stage), takes the absolute value of each component, and accumulates
   everything into xmm0.
  Note that xmm0 will have an extra 4 added to each column, and that after
   removing this value, the remainder will be half the conventional value.*/
#define OC_HADAMARD_ABS_ACCUM_8x8 \
 OC_HADAMARD_AB_8x8 \
 OC_HADAMARD_C_ABS_ACCUM_8x8
384
385
static unsigned oc_int_frag_satd_sse2(int *_dc,
386
 const unsigned char *_src,int _src_ystride,
387
40.1M
 const unsigned char *_ref,int _ref_ystride){
388
40.1M
  OC_ALIGN16(ogg_int16_t buf[16]);
389
40.1M
  unsigned ret;
390
40.1M
  unsigned ret2;
391
40.1M
  int      dc;
392
40.1M
  __asm__ __volatile__(
393
40.1M
    OC_LOAD_SUB_8x8
394
40.1M
    OC_HADAMARD_8x8
395
40.1M
    OC_TRANSPOSE_8x8
396
    /*We split out the stages here so we can save the DC coefficient in the
397
       middle.*/
398
40.1M
    OC_HADAMARD_AB_8x8
399
40.1M
    OC_HADAMARD_C_ABS_ACCUM_A_8x8
400
40.1M
    "movd %%xmm1,%[dc]\n\t"
401
40.1M
    OC_HADAMARD_C_ABS_ACCUM_B_8x8
402
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
403
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
404
       for the factor of two we dropped + 3 for the vertical accumulation).
405
      Now we finally have to promote things to dwords.
406
      We break this part out of OC_HADAMARD_ABS_ACCUM_8x8 to hide the long
407
       latency of pmaddwd by starting to compute abs(dc) here.*/
408
40.1M
    "pmaddwd %%xmm7,%%xmm0\n\t"
409
40.1M
    "movsx %w[dc],%[dc]\n\t"
410
40.1M
    "cdq\n\t"
411
40.1M
    "movdqa %%xmm0,%%xmm1\n\t"
412
40.1M
    "punpckhqdq %%xmm0,%%xmm0\n\t"
413
40.1M
    "paddd %%xmm1,%%xmm0\n\t"
414
40.1M
    "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
415
40.1M
    "paddd %%xmm1,%%xmm0\n\t"
416
40.1M
    "movd %%xmm0,%[ret]\n\t"
417
    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x8 each have an extra 4
418
       added to them, a factor of two removed, and the DC value included;
419
       correct the final sum here.*/
420
40.1M
    "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
421
40.1M
    "xor %[dc],%[ret2]\n\t"
422
40.1M
    "sub %[ret2],%[ret]\n\t"
423
    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
424
       and %[dc] with some of the inputs, since for once we don't write to
425
       them until after we're done using everything but %[buf].*/
426
    /*Note that _src_ystride and _ref_ystride must be given non-overlapping
427
       constraints, otherewise if gcc can prove they're equal it will allocate
428
       them to the same register (which is bad); _src and _ref face a similar
429
       problem.
430
      All four are destructively modified, but if we list them as output
431
       constraints, gcc can't alias them with other outputs.*/
432
40.1M
    :[ret]"=r"(ret),[ret2]"=d"(ret2),[dc]"=a"(dc),
433
40.1M
     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
434
40.1M
    :[src]"S"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
435
40.1M
     [ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
436
    /*We have to use neg, so we actually clobber the condition codes for once
437
       (not to mention sub, and add).*/
438
40.1M
    :"cc"
439
40.1M
  );
440
40.1M
  *_dc=dc;
441
40.1M
  return ret;
442
40.1M
}
443
444
/*Computes the SATD between a source block and a single reference block that
   share the same row stride.
  _dc:      Receives the DC coefficient of the transformed difference.
  _src:     The first row of the source block.
  _ref:     The first row of the reference block.
  _ystride: The offset in bytes between rows of both blocks.
  Return: Whatever oc_int_frag_satd_sse2() returns for these blocks.*/
unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned satd;
  /*The source and the reference use the same stride here.*/
  satd=oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);
  return satd;
}
448
449
unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
450
19.7M
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
451
19.7M
  OC_ALIGN8(unsigned char ref[64]);
452
19.7M
  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
453
19.7M
  return oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8);
454
19.7M
}
455
456
unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
457
25.1M
 const unsigned char *_src,int _ystride){
458
25.1M
  OC_ALIGN16(ogg_int16_t buf[16]);
459
25.1M
  unsigned ret;
460
25.1M
  int      dc;
461
25.1M
  __asm__ __volatile__(
462
25.1M
    OC_LOAD_8x8
463
25.1M
    OC_HADAMARD_8x8
464
25.1M
    OC_TRANSPOSE_8x8
465
    /*We split out the stages here so we can save the DC coefficient in the
466
       middle.*/
467
25.1M
    OC_HADAMARD_AB_8x8
468
25.1M
    OC_HADAMARD_C_ABS_ACCUM_A_8x8
469
25.1M
    "movd %%xmm1,%[dc]\n\t"
470
25.1M
    OC_HADAMARD_C_ABS_ACCUM_B_8x8
471
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
472
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
473
       for the factor of two we dropped + 3 for the vertical accumulation).
474
      Now we finally have to promote things to dwords.*/
475
25.1M
    "pmaddwd %%xmm7,%%xmm0\n\t"
476
    /*We assume that the DC coefficient is always positive (which is true,
477
       because the input to the INTRA transform was not a difference).*/
478
25.1M
    "movzx %w[dc],%[dc]\n\t"
479
25.1M
    "movdqa %%xmm0,%%xmm1\n\t"
480
25.1M
    "punpckhqdq %%xmm0,%%xmm0\n\t"
481
25.1M
    "paddd %%xmm1,%%xmm0\n\t"
482
25.1M
    "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
483
25.1M
    "paddd %%xmm1,%%xmm0\n\t"
484
25.1M
    "movd %%xmm0,%[ret]\n\t"
485
25.1M
    "lea -64(%[ret],%[ret]),%[ret]\n\t"
486
25.1M
    "sub %[dc],%[ret]\n\t"
487
    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
488
       and %[dc] with some of the inputs, since for once we don't write to
489
       them until after we're done using everything but %[buf].*/
490
25.1M
    :[ret]"=a"(ret),[dc]"=r"(dc),
491
25.1M
     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
492
25.1M
    :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
493
25.1M
     [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
494
    /*We have to use sub, so we actually clobber the condition codes for once.*/
495
25.1M
    :"cc"
496
25.1M
  );
497
25.1M
  *_dc=dc;
498
25.1M
  return ret;
499
25.1M
}
500
501
#endif