Coverage Report

Created: 2026-05-16 07:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/theora/lib/x86/sse2encfrag.c
Line
Count
Source
1
/********************************************************************
2
 *                                                                  *
3
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7
 *                                                                  *
8
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
9
 * by the Xiph.Org Foundation https://www.xiph.org/                 *
10
 *                                                                  *
11
 ********************************************************************
12
13
  function:
14
15
 ********************************************************************/
16
#include <stddef.h>
17
#include "x86enc.h"
18
#include "sse2trans.h"
19
20
#if defined(OC_X86_ASM)
21
22
/*Load a 4x8 array of pixel values from %[src] and %[ref] and compute their
   16-bit differences.
  Each source row is byte-interleaved with the matching reference row, the
   reference row is interleaved with itself, and the two are subtracted as
   words; the (ref<<8) terms cancel, leaving src-ref in every 16-bit lane.
  On output, the four rows of differences are stored in _m0, xmm1, xmm2, and
   xmm3, and %[src] and %[ref] have both been advanced by 4*%[ystride].
  %[ystride3] must contain 3*%[ystride].
  xmm4 and xmm5 are clobbered.*/
#define OC_LOAD_SUB_4x8(_m0) \
 "#OC_LOAD_SUB_4x8\n\t" \
 /*Load the first three rows.*/ \
 "movq (%[src])," _m0 "\n\t" \
 "movq (%[ref]),%%xmm4\n\t" \
 "movq (%[src],%[ystride]),%%xmm1\n\t" \
 "movq (%[ref],%[ystride]),%%xmm3\n\t" \
 "movq (%[src],%[ystride],2),%%xmm2\n\t" \
 "movq (%[ref],%[ystride],2),%%xmm5\n\t" \
 /*Unpack and subtract.*/ \
 "punpcklbw %%xmm4," _m0 "\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "punpcklbw %%xmm3,%%xmm1\n\t" \
 "punpcklbw %%xmm3,%%xmm3\n\t" \
 "psubw %%xmm4," _m0 "\n\t" \
 "psubw %%xmm3,%%xmm1\n\t" \
 /*Load the last row.*/ \
 "movq (%[src],%[ystride3]),%%xmm3\n\t" \
 "movq (%[ref],%[ystride3]),%%xmm4\n\t" \
 /*Unpack, subtract, and advance the pointers.*/ \
 "punpcklbw %%xmm5,%%xmm2\n\t" \
 "punpcklbw %%xmm5,%%xmm5\n\t" \
 "lea (%[src],%[ystride],4),%[src]\n\t" \
 "psubw %%xmm5,%%xmm2\n\t" \
 "punpcklbw %%xmm4,%%xmm3\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "lea (%[ref],%[ystride],4),%[ref]\n\t" \
 "psubw %%xmm4,%%xmm3\n\t"

/*Square and accumulate four rows of differences in _m0, xmm1, xmm2, and xmm3.
  On output, _m0 contains the (pairwise-summed) squares of the rows that were
   in _m0 and xmm1, and the squares of the rows that were in xmm2 and xmm3
   have been added to xmm7.
  When _m0 is xmm7 itself, all four rows therefore end up in xmm7.*/
#define OC_SSD_4x8(_m0) \
 "pmaddwd " _m0 "," _m0 "\n\t" \
 "pmaddwd %%xmm1,%%xmm1\n\t" \
 "pmaddwd %%xmm2,%%xmm2\n\t" \
 "pmaddwd %%xmm3,%%xmm3\n\t" \
 "paddd %%xmm1," _m0 "\n\t" \
 "paddd %%xmm3,%%xmm2\n\t" \
 "paddd %%xmm2,%%xmm7\n\t"

/*Computes the sum of squared differences between two 8x8 blocks of 8-bit
   pixels.
  _src:     The first row of the source block.
  _ref:     The first row of the reference block.
  _ystride: The offset in bytes between rows; shared by both blocks.
  Return: The sum over all 64 pixels of (_src[i]-_ref[i])**2.*/
unsigned __attribute__((target("sse2"))) oc_enc_frag_ssd_sse2(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned ret;
  __asm__ __volatile__(
    /*Rows 0...3: everything is accumulated into xmm7.*/
    OC_LOAD_SUB_4x8("%%xmm7")
    OC_SSD_4x8("%%xmm7")
    /*Rows 4...7: half lands in xmm0, the other half is added to xmm7.*/
    OC_LOAD_SUB_4x8("%%xmm0")
    OC_SSD_4x8("%%xmm0")
    "paddd %%xmm0,%%xmm7\n\t"
    /*Horizontally add the four dword partial sums in xmm7.*/
    "movdqa %%xmm7,%%xmm6\n\t"
    "punpckhqdq %%xmm7,%%xmm7\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "pshufd $1,%%xmm7,%%xmm6\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "movd %%xmm7,%[ret]\n\t"
    /*OC_LOAD_SUB_4x8 advances %[src] and %[ref] in place, so they must be
       read-write operands: the compiler is entitled to assume that an
       input-only register still holds its original value afterwards.*/
    :[ret]"=a"(ret),[src]"+r"(_src),[ref]"+r"(_ref)
    :[ystride]"r"((ptrdiff_t)_ystride),
     [ystride3]"r"((ptrdiff_t)_ystride*3)
    /*The pixel data is read through %[src] and %[ref] rather than through
       memory operands, so tell the compiler that memory is accessed.*/
    :"memory",
     "%xmm0", "%xmm1", "%xmm2", "%xmm3",
     "%xmm4", "%xmm5", "%xmm6", "%xmm7"
  );
  return ret;
}
90
91
/*One single-bit mask per byte lane: entry i has only bit i set.
  OC_LOAD_SUB_MASK_2x8 ANDs and compares a broadcast mask byte against this
   table to turn bit i of the mask into an all-ones/all-zeros byte i.*/
static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={
  1<<0,1<<1,1<<2,1<<3,
  1<<4,1<<5,1<<6,1<<7
};
94
95
/*Load a 2x8 array of pixel values from %[src] and %[ref], zero the pixels
   that are not selected by a mask, and compute the 16-bit differences of what
   remains.
  %[m] holds 16 mask bits: bit i of the low byte selects pixel i of the row at
   (%[src]), and bit i of the next byte selects pixel i of the row at
   (%[src],%[ystride]).
  %[m] must live in a register with an addressable high byte, and is modified.
  %[cr] must contain OC_MASK_CONSTS[0...7] in its low 8 bytes.
  On output, xmm0 and xmm1 contain the masked differences of the two rows.
  xmm2, xmm3, and xmm4 are clobbered.*/
#define OC_LOAD_SUB_MASK_2x8 \
 "#OC_LOAD_SUB_MASK_2x8\n\t" \
 /*Begin loading the first row while duplicating the low mask byte into the \
    low word of %[m], then restore %[m] and duplicate the second mask byte \
    the same way for later.*/ \
 "shl $8,%[m]\n\t" \
 "movq (%[src]),%%xmm0\n\t" \
 "mov %h[m],%b[m]\n\t" \
 "movq (%[ref]),%%xmm2\n\t" \
 "movd %[m],%%xmm4\n\t" \
 "shr $8,%[m]\n\t" \
 /*Broadcast the mask byte to 8 byte lanes and test one bit per lane.*/ \
 "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
 "mov %h[m],%b[m]\n\t" \
 "pand %[cr],%%xmm4\n\t" \
 "pcmpeqb %[cr],%%xmm4\n\t" \
 /*Zero the unselected pixels of the first row.*/ \
 "pand %%xmm4,%%xmm0\n\t" \
 "pand %%xmm4,%%xmm2\n\t" \
 /*Load the second row and expand its mask byte, interleaved with the \
    unpacking of the first row.*/ \
 "movd %[m],%%xmm4\n\t" \
 "movq (%[src],%[ystride]),%%xmm1\n\t" \
 "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
 "movq (%[ref],%[ystride]),%%xmm3\n\t" \
 "pand %[cr],%%xmm4\n\t" \
 "punpcklbw %%xmm2,%%xmm0\n\t" \
 "pcmpeqb %[cr],%%xmm4\n\t" \
 "punpcklbw %%xmm2,%%xmm2\n\t" \
 /*Zero the unselected pixels of the second row, unpack it, and subtract.*/ \
 "pand %%xmm4,%%xmm1\n\t" \
 "pand %%xmm4,%%xmm3\n\t" \
 "punpcklbw %%xmm3,%%xmm1\n\t" \
 "punpcklbw %%xmm3,%%xmm3\n\t" \
 "psubw %%xmm2,%%xmm0\n\t" \
 "psubw %%xmm3,%%xmm1\n\t"
131
132
/*Computes the sum of squared differences between two 8x8 blocks of 8-bit
   pixels, restricted to the pixels selected by a bit mask.
  _src:     The first row of the source block.
  _ref:     The first row of the reference block.
  _ystride: The offset in bytes between rows; shared by both blocks.
  _mask:    One bit per pixel: bit 8*y+x selects the pixel in row y, column x.
            Pixels whose bit is clear contribute nothing to the sum.
  Return: The sum of (_src[i]-_ref[i])**2 over the selected pixels.*/
unsigned __attribute__((target("sse2"))) oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
  ptrdiff_t ystride;
  unsigned  ret;
  int       i;
  /*Store intermediate values across __asm__ blocks:
    cr holds OC_MASK_CONSTS and mr holds the four dword partial sums.*/
  register sse2_reg cr;
  register sse2_reg mr;
  ystride=_ystride;
  __asm__ __volatile__(
    "pxor %[mr],%[mr]\n\t"
    "movq %[c],%[cr]\n\t"
    :[cr]"=x"(cr), [mr]"=x"(mr)
    :[c]"m"(OC_CONST_ARRAY_OPERAND(unsigned char,OC_MASK_CONSTS,8))
  );
  /*Process two rows (16 mask bits) per iteration.*/
  for(i=0;i<4;i++){
    unsigned m;
    m=_mask&0xFFFF;
    _mask>>=16;
    /*Row pairs with no selected pixels cannot change the sum; skip them.*/
    if(m){
      __asm__ __volatile__(
        OC_LOAD_SUB_MASK_2x8
        "pmaddwd %%xmm0,%%xmm0\n\t"
        "pmaddwd %%xmm1,%%xmm1\n\t"
        "paddd %%xmm0,%[mr]\n\t"
        "paddd %%xmm1,%[mr]\n\t"
        :[src]"+r"(_src),[ref]"+r"(_ref),[ystride]"+r"(ystride),[m]"+Q"(m),[mr]"+x"(mr)
        :[cr]"x"(cr)
        /*shl/shr modify the flags, and the pixels are read through %[src]
           and %[ref] rather than through memory operands.*/
        :"cc","memory",
         "%xmm0", "%xmm1", "%xmm2", "%xmm3",
         "%xmm4"
      );
    }
    _src+=2*ystride;
    _ref+=2*ystride;
  }
  __asm__ __volatile__(
    /*Horizontally add the four dword partial sums in %[mr].
      xmm7 must be loaded from %[mr] before the punpckhqdq: that instruction
       keeps the destination's own high qword as the new low qword, so with
       an uninitialized xmm7 the result would contain stale register contents
       and would drop the upper two partial sums.*/
    "movdqa %[mr],%%xmm6\n\t"
    "movdqa %[mr],%%xmm7\n\t"
    "punpckhqdq %%xmm7,%%xmm7\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "pshufd $1,%%xmm7,%%xmm6\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "movd %%xmm7,%[ret]\n\t"
    :[ret]"=a"(ret)
    :[mr]"x"(mr)
    :"%xmm6", "%xmm7"
  );
  return ret;
}
180
181
182
/*Load an 8x8 array of pixel values from %[src] and %[ref] and compute their
   16-bit difference in %%xmm0...%%xmm7 (one row per register).
  The source and reference may use different strides, %[src_ystride] and
   %[ref_ystride].
  All four of those operands are modified: on exit %[src] and %[ref] have been
   advanced by 8 rows and both strides have been negated (the negated stride
   is what addresses the final row).
  The first 16 bytes of %[buf] are used to spill row 0 while xmm0 serves as a
   temporary.*/
#define OC_LOAD_SUB_8x8 \
 "#OC_LOAD_SUB_8x8\n\t" \
 "movq (%[src]),%%xmm0\n\t" \
 "movq (%[ref]),%%xmm4\n\t" \
 "movq (%[src],%[src_ystride]),%%xmm1\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "movq (%[ref],%[ref_ystride]),%%xmm5\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "movq (%[src]),%%xmm2\n\t" \
 "movq (%[ref]),%%xmm7\n\t" \
 "movq (%[src],%[src_ystride]),%%xmm3\n\t" \
 "movq (%[ref],%[ref_ystride]),%%xmm6\n\t" \
 "punpcklbw %%xmm4,%%xmm0\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "psubw %%xmm4,%%xmm0\n\t" \
 "movq (%[src]),%%xmm4\n\t" \
 /*Row 0 is finished; park it so xmm0 can hold reference rows.*/ \
 "movdqa %%xmm0," OC_MEM_OFFS(0x00,buf) "\n\t" \
 "movq (%[ref]),%%xmm0\n\t" \
 "punpcklbw %%xmm5,%%xmm1\n\t" \
 "punpcklbw %%xmm5,%%xmm5\n\t" \
 "psubw %%xmm5,%%xmm1\n\t" \
 "movq (%[src],%[src_ystride]),%%xmm5\n\t" \
 "punpcklbw %%xmm7,%%xmm2\n\t" \
 "punpcklbw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm7,%%xmm2\n\t" \
 "movq (%[ref],%[ref_ystride]),%%xmm7\n\t" \
 "punpcklbw %%xmm6,%%xmm3\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "punpcklbw %%xmm6,%%xmm6\n\t" \
 "psubw %%xmm6,%%xmm3\n\t" \
 "movq (%[src]),%%xmm6\n\t" \
 "punpcklbw %%xmm0,%%xmm4\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "punpcklbw %%xmm0,%%xmm0\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "psubw %%xmm0,%%xmm4\n\t" \
 "movq (%[ref]),%%xmm0\n\t" \
 "punpcklbw %%xmm7,%%xmm5\n\t" \
 /*%[src] now points one row past the block; step back to reach row 7.*/ \
 "neg %[src_ystride]\n\t" \
 "punpcklbw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm7,%%xmm5\n\t" \
 "movq (%[src],%[src_ystride]),%%xmm7\n\t" \
 "punpcklbw %%xmm0,%%xmm6\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "punpcklbw %%xmm0,%%xmm0\n\t" \
 /*Likewise for %[ref].*/ \
 "neg %[ref_ystride]\n\t" \
 "psubw %%xmm0,%%xmm6\n\t" \
 "movq (%[ref],%[ref_ystride]),%%xmm0\n\t" \
 "punpcklbw %%xmm0,%%xmm7\n\t" \
 "punpcklbw %%xmm0,%%xmm0\n\t" \
 "psubw %%xmm0,%%xmm7\n\t" \
 /*Bring row 0 back.*/ \
 "movdqa " OC_MEM_OFFS(0x00,buf) ",%%xmm0\n\t"
238
239
/*Load an 8x8 array of pixel values from %[src] into %%xmm0...%%xmm7, one row
   per register, zero-extended to 16 bits.
  %[src4] must point at row 4 of the block and %[ystride3] must contain
   3*%[ystride].
  None of the general-purpose operands are modified.*/
#define OC_LOAD_8x8 \
 "#OC_LOAD_8x8\n\t" \
 "movq (%[src]),%%xmm0\n\t" \
 "movq (%[src],%[ystride]),%%xmm1\n\t" \
 "movq (%[src],%[ystride],2),%%xmm2\n\t" \
 /*Rows 0...3 are widened by interleaving with a zeroed xmm7.*/ \
 "pxor %%xmm7,%%xmm7\n\t" \
 "movq (%[src],%[ystride3]),%%xmm3\n\t" \
 "punpcklbw %%xmm7,%%xmm0\n\t" \
 "movq (%[src4]),%%xmm4\n\t" \
 "punpcklbw %%xmm7,%%xmm1\n\t" \
 "movq (%[src4],%[ystride]),%%xmm5\n\t" \
 "punpcklbw %%xmm7,%%xmm2\n\t" \
 "movq (%[src4],%[ystride],2),%%xmm6\n\t" \
 "punpcklbw %%xmm7,%%xmm3\n\t" \
 /*xmm7 now receives row 7, so rows 4...7 are widened by duplicating each \
    byte into a word and shifting the copy in the high byte down.*/ \
 "movq (%[src4],%[ystride3]),%%xmm7\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "punpcklbw %%xmm5,%%xmm5\n\t" \
 "psrlw $8,%%xmm4\n\t" \
 "psrlw $8,%%xmm5\n\t" \
 "punpcklbw %%xmm6,%%xmm6\n\t" \
 "punpcklbw %%xmm7,%%xmm7\n\t" \
 "psrlw $8,%%xmm6\n\t" \
 "psrlw $8,%%xmm7\n\t"
263
264
/*Performs the first two stages of an 8-point 1-D Hadamard transform in place
   on %%xmm0...%%xmm7.
  Each butterfly (a,b)->(a+b,b-a) is computed as a+=b; b+=b; b-=a, which is
   what lets the stage run in place with no temporary registers; the price is
   that outputs 1, 3, 4, and 5 of the second stage come out negated.*/
#define OC_HADAMARD_AB_8x8 \
 "#OC_HADAMARD_AB_8x8\n\t" \
 /*Stage A: pair register i with register i+4.*/ \
 "paddw %%xmm5,%%xmm1\n\t" \
 "paddw %%xmm6,%%xmm2\n\t" \
 "paddw %%xmm5,%%xmm5\n\t" \
 "paddw %%xmm6,%%xmm6\n\t" \
 "psubw %%xmm1,%%xmm5\n\t" \
 "psubw %%xmm2,%%xmm6\n\t" \
 "paddw %%xmm7,%%xmm3\n\t" \
 "paddw %%xmm4,%%xmm0\n\t" \
 "paddw %%xmm7,%%xmm7\n\t" \
 "paddw %%xmm4,%%xmm4\n\t" \
 "psubw %%xmm3,%%xmm7\n\t" \
 "psubw %%xmm0,%%xmm4\n\t" \
 /*Stage B: pair register i with register i+2.*/ \
 "paddw %%xmm2,%%xmm0\n\t" \
 "paddw %%xmm3,%%xmm1\n\t" \
 "paddw %%xmm6,%%xmm4\n\t" \
 "paddw %%xmm7,%%xmm5\n\t" \
 "paddw %%xmm2,%%xmm2\n\t" \
 "paddw %%xmm3,%%xmm3\n\t" \
 "paddw %%xmm6,%%xmm6\n\t" \
 "paddw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm0,%%xmm2\n\t" \
 "psubw %%xmm1,%%xmm3\n\t" \
 "psubw %%xmm4,%%xmm6\n\t" \
 "psubw %%xmm5,%%xmm7\n\t"
295
296
/*Performs the last stage of an 8-point 1-D Hadamard transform in place on
   %%xmm0...%%xmm7, pairing each even register with the odd one after it.
  The a+=b; b+=b; b-=a form needs no temporary registers, but leaves outputs
   1, 3, 5, and 7 negated.*/
#define OC_HADAMARD_C_8x8 \
 "#OC_HADAMARD_C_8x8\n\t" \
 /*Stage C:*/ \
 "paddw %%xmm1,%%xmm0\n\t" \
 "paddw %%xmm3,%%xmm2\n\t" \
 "paddw %%xmm5,%%xmm4\n\t" \
 "paddw %%xmm7,%%xmm6\n\t" \
 "paddw %%xmm1,%%xmm1\n\t" \
 "paddw %%xmm3,%%xmm3\n\t" \
 "paddw %%xmm5,%%xmm5\n\t" \
 "paddw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm0,%%xmm1\n\t" \
 "psubw %%xmm2,%%xmm3\n\t" \
 "psubw %%xmm4,%%xmm5\n\t" \
 "psubw %%xmm6,%%xmm7\n\t"
314
315
/*Performs a complete 8-point 1-D Hadamard transform in place on
   %%xmm0...%%xmm7 by running stages A and B followed by stage C.
  Because no temporary registers are used, outputs 1, 2, 4, and 7 are
   negated.*/
#define OC_HADAMARD_8x8 \
 OC_HADAMARD_AB_8x8 \
 OC_HADAMARD_C_8x8
321
322
/*Performs the first part of the final stage of the Hadamard transform and
   summing of absolute values.
  xmm6 and xmm7 are spilled to the first 32 bytes of %[buf] on entry and
   reloaded into xmm5 and xmm3, respectively, on exit.
  At the end of this part, %%xmm1 will contain the DC coefficient of the
   transform, and xmm7 will contain 0x7FFF in every word.*/
#define OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
 /*We use the fact that \
     (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
    to merge the final butterfly with the abs and the first stage of \
    accumulation. \
   Thus we can avoid using pabsw, which is not available until SSSE3. \
   Emulating pabsw takes 3 instructions, so the straightforward SSE2 \
    implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
    registers). \
   Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
   This implementation is only 26 (+4 for spilling registers).*/ \
 "#OC_HADAMARD_C_ABS_ACCUM_A_8x8\n\t" \
 "movdqa %%xmm7," OC_MEM_OFFS(0x10,buf) "\n\t" \
 "movdqa %%xmm6," OC_MEM_OFFS(0x00,buf) "\n\t" \
 /*xmm7={0x7FFF}x8 \
   xmm4=max(abs(xmm4),abs(xmm5))-0x7FFF*/ \
 "pcmpeqb %%xmm7,%%xmm7\n\t" \
 "movdqa %%xmm4,%%xmm6\n\t" \
 "psrlw $1,%%xmm7\n\t" \
 "paddw %%xmm5,%%xmm6\n\t" \
 "pmaxsw %%xmm5,%%xmm4\n\t" \
 "paddsw %%xmm7,%%xmm6\n\t" \
 "psubw %%xmm6,%%xmm4\n\t" \
 /*Start on \
    xmm2=max(abs(xmm2),abs(xmm3))-0x7FFF \
    xmm0=max(abs(xmm0),abs(xmm1))-0x7FFF \
   and stop once xmm1 holds the plain sum xmm0+xmm1; the saturating add and \
    the subtraction are finished in part B.*/ \
 "movdqa %%xmm2,%%xmm6\n\t" \
 "movdqa %%xmm0,%%xmm5\n\t" \
 "pmaxsw %%xmm3,%%xmm2\n\t" \
 "pmaxsw %%xmm1,%%xmm0\n\t" \
 "paddw %%xmm3,%%xmm6\n\t" \
 "movdqa " OC_MEM_OFFS(0x10,buf) ",%%xmm3\n\t" \
 "paddw %%xmm5,%%xmm1\n\t" \
 "movdqa " OC_MEM_OFFS(0x00,buf) ",%%xmm5\n\t"
359
360
/*Performs the second part of the final stage of the Hadamard transform and
   summing of absolute values.
  This must directly follow OC_HADAMARD_C_ABS_ACCUM_A_8x8: it expects xmm7 to
   contain 0x7FFF in every word and the half-finished sums from that macro in
   xmm1 and xmm6.
  On exit the accumulated result is in xmm0 and xmm7 contains 1 in every
   word.*/
#define OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
 "#OC_HADAMARD_C_ABS_ACCUM_B_8x8\n\t" \
 /*Finish the two max-abs computations begun in part A.*/ \
 "paddsw %%xmm7,%%xmm6\n\t" \
 "paddsw %%xmm7,%%xmm1\n\t" \
 "psubw %%xmm6,%%xmm2\n\t" \
 "psubw %%xmm1,%%xmm0\n\t" \
 /*xmm7={1}x8 (needed for the horizontal add that follows) \
   xmm0+=xmm2+xmm4+max(abs(xmm3),abs(xmm5))-0x7FFF*/ \
 "movdqa %%xmm3,%%xmm6\n\t" \
 "pmaxsw %%xmm5,%%xmm3\n\t" \
 "paddw %%xmm2,%%xmm0\n\t" \
 "paddw %%xmm5,%%xmm6\n\t" \
 "paddw %%xmm4,%%xmm0\n\t" \
 "paddsw %%xmm7,%%xmm6\n\t" \
 "paddw %%xmm3,%%xmm0\n\t" \
 "psrlw $14,%%xmm7\n\t" \
 "psubw %%xmm6,%%xmm0\n\t"
379
380
/*Runs both halves of the merged final stage back to back: performs the last
   stage of an 8-point 1-D Hadamard transform, takes the absolute value of
   each component, and accumulates everything into xmm0.*/
#define OC_HADAMARD_C_ABS_ACCUM_8x8 \
 OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
 OC_HADAMARD_C_ABS_ACCUM_B_8x8
385
386
/*Performs a full 8-point 1-D Hadamard transform (stages A and B, then the
   merged stage C), takes the absolute value of each component, and
   accumulates everything into xmm0.
  Note that xmm0 will have an extra 4 added to each column, and that after
   removing this value, the remainder will be half the conventional value.*/
#define OC_HADAMARD_ABS_ACCUM_8x8 \
 OC_HADAMARD_AB_8x8 \
 OC_HADAMARD_C_ABS_ACCUM_8x8
393
394
static unsigned __attribute__((target("sse2"))) oc_int_frag_satd_sse2(int *_dc,
395
 const unsigned char *_src,int _src_ystride,
396
21.4M
 const unsigned char *_ref,int _ref_ystride){
397
21.4M
  OC_ALIGN16(ogg_int16_t buf[16]);
398
21.4M
  unsigned ret;
399
21.4M
  unsigned ret2;
400
21.4M
  int      dc;
401
21.4M
  __asm__ __volatile__(
402
21.4M
    OC_LOAD_SUB_8x8
403
21.4M
    OC_HADAMARD_8x8
404
21.4M
    OC_TRANSPOSE_8x8
405
    /*We split out the stages here so we can save the DC coefficient in the
406
       middle.*/
407
21.4M
    OC_HADAMARD_AB_8x8
408
21.4M
    OC_HADAMARD_C_ABS_ACCUM_A_8x8
409
21.4M
    "movd %%xmm1,%[dc]\n\t"
410
21.4M
    OC_HADAMARD_C_ABS_ACCUM_B_8x8
411
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
412
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
413
       for the factor of two we dropped + 3 for the vertical accumulation).
414
      Now we finally have to promote things to dwords.
415
      We break this part out of OC_HADAMARD_ABS_ACCUM_8x8 to hide the long
416
       latency of pmaddwd by starting to compute abs(dc) here.*/
417
21.4M
    "pmaddwd %%xmm7,%%xmm0\n\t"
418
21.4M
    "movsx %w[dc],%[dc]\n\t"
419
21.4M
    "cdq\n\t"
420
21.4M
    "movdqa %%xmm0,%%xmm1\n\t"
421
21.4M
    "punpckhqdq %%xmm0,%%xmm0\n\t"
422
21.4M
    "paddd %%xmm1,%%xmm0\n\t"
423
21.4M
    "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
424
21.4M
    "paddd %%xmm1,%%xmm0\n\t"
425
21.4M
    "movd %%xmm0,%[ret]\n\t"
426
    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x8 each have an extra 4
427
       added to them, a factor of two removed, and the DC value included;
428
       correct the final sum here.*/
429
21.4M
    "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
430
21.4M
    "xor %[dc],%[ret2]\n\t"
431
21.4M
    "sub %[ret2],%[ret]\n\t"
432
    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
433
       and %[dc] with some of the inputs, since for once we don't write to
434
       them until after we're done using everything but %[buf].*/
435
    /*Note that _src_ystride and _ref_ystride must be given non-overlapping
436
       constraints, otherwise if gcc can prove they're equal it will allocate
437
       them to the same register (which is bad); _src and _ref face a similar
438
       problem.
439
      All four are destructively modified, but if we list them as output
440
       constraints, gcc can't alias them with other outputs.*/
441
21.4M
    :[ret]"=r"(ret),[ret2]"=d"(ret2),[dc]"=a"(dc),
442
21.4M
     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
443
21.4M
    :[src]"S"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
444
21.4M
     [ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
445
    /*We have to use neg, so we actually clobber the condition codes for once
446
       (not to mention sub, and add).*/
447
21.4M
    :"cc",
448
21.4M
     "%xmm0", "%xmm1", "%xmm2", "%xmm3",
449
21.4M
     "%xmm4", "%xmm5", "%xmm6", "%xmm7"
450
21.4M
  );
451
21.4M
  *_dc=dc;
452
21.4M
  return ret;
453
21.4M
}
454
455
/*Computes the SATD between a source block and a single reference block that
   share one row stride, by forwarding to oc_int_frag_satd_sse2 with _ystride
   used for both.
  _dc:      Returns the DC coefficient of the transform.
  _src:     The first row of the source block.
  _ref:     The first row of the reference block.
  _ystride: The offset in bytes between rows of both blocks.
  Return: The value returned by oc_int_frag_satd_sse2.*/
unsigned __attribute__((target("sse2"))) oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned satd;
  satd=oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);
  return satd;
}
459
460
unsigned __attribute__((target("sse2"))) oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
461
10.9M
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
462
10.9M
  OC_ALIGN8(unsigned char ref[64]);
463
10.9M
  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
464
10.9M
  return oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8);
465
10.9M
}
466
467
unsigned __attribute__((target("sse2"))) oc_enc_frag_intra_satd_sse2(int *_dc,
468
20.6M
 const unsigned char *_src,int _ystride){
469
20.6M
  OC_ALIGN16(ogg_int16_t buf[16]);
470
20.6M
  unsigned ret;
471
20.6M
  int      dc;
472
20.6M
  __asm__ __volatile__(
473
20.6M
    OC_LOAD_8x8
474
20.6M
    OC_HADAMARD_8x8
475
20.6M
    OC_TRANSPOSE_8x8
476
    /*We split out the stages here so we can save the DC coefficient in the
477
       middle.*/
478
20.6M
    OC_HADAMARD_AB_8x8
479
20.6M
    OC_HADAMARD_C_ABS_ACCUM_A_8x8
480
20.6M
    "movd %%xmm1,%[dc]\n\t"
481
20.6M
    OC_HADAMARD_C_ABS_ACCUM_B_8x8
482
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
483
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
484
       for the factor of two we dropped + 3 for the vertical accumulation).
485
      Now we finally have to promote things to dwords.*/
486
20.6M
    "pmaddwd %%xmm7,%%xmm0\n\t"
487
    /*We assume that the DC coefficient is always positive (which is true,
488
       because the input to the INTRA transform was not a difference).*/
489
20.6M
    "movzx %w[dc],%[dc]\n\t"
490
20.6M
    "movdqa %%xmm0,%%xmm1\n\t"
491
20.6M
    "punpckhqdq %%xmm0,%%xmm0\n\t"
492
20.6M
    "paddd %%xmm1,%%xmm0\n\t"
493
20.6M
    "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
494
20.6M
    "paddd %%xmm1,%%xmm0\n\t"
495
20.6M
    "movd %%xmm0,%[ret]\n\t"
496
20.6M
    "lea -64(%[ret],%[ret]),%[ret]\n\t"
497
20.6M
    "sub %[dc],%[ret]\n\t"
498
    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
499
       and %[dc] with some of the inputs, since for once we don't write to
500
       them until after we're done using everything but %[buf].*/
501
20.6M
    :[ret]"=a"(ret),[dc]"=r"(dc),
502
20.6M
     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
503
20.6M
    :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
504
20.6M
     [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
505
    /*We have to use sub, so we actually clobber the condition codes for once.*/
506
20.6M
    :"cc",
507
20.6M
     "%xmm0", "%xmm1", "%xmm2", "%xmm3",
508
20.6M
     "%xmm4", "%xmm5", "%xmm6", "%xmm7"
509
20.6M
  );
510
20.6M
  *_dc=dc;
511
20.6M
  return ret;
512
20.6M
}
513
514
#endif