Coverage Report

Created: 2024-09-06 07:53

/src/theora/lib/x86/mmxencfrag.c
Line
Count
Source
1
/********************************************************************
2
 *                                                                  *
3
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7
 *                                                                  *
8
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
9
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
10
 *                                                                  *
11
 ********************************************************************
12
13
  function:
14
  last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
15
16
 ********************************************************************/
17
#include <stddef.h>
18
#include "x86enc.h"
19
20
#if defined(OC_X86_ASM)
21
22
unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
23
19.3M
 const unsigned char *_ref,int _ystride){
24
19.3M
  ptrdiff_t ystride3;
25
19.3M
  ptrdiff_t ret;
26
19.3M
  __asm__ __volatile__(
27
    /*Load the first 4 rows of each block.*/
28
19.3M
    "movq (%[src]),%%mm0\n\t"
29
19.3M
    "movq (%[ref]),%%mm1\n\t"
30
19.3M
    "movq (%[src],%[ystride]),%%mm2\n\t"
31
19.3M
    "movq (%[ref],%[ystride]),%%mm3\n\t"
32
19.3M
    "lea (%[ystride],%[ystride],2),%[ystride3]\n\t"
33
19.3M
    "movq (%[src],%[ystride],2),%%mm4\n\t"
34
19.3M
    "movq (%[ref],%[ystride],2),%%mm5\n\t"
35
19.3M
    "movq (%[src],%[ystride3]),%%mm6\n\t"
36
19.3M
    "movq (%[ref],%[ystride3]),%%mm7\n\t"
37
    /*Compute their SADs and add them in %%mm0*/
38
19.3M
    "psadbw %%mm1,%%mm0\n\t"
39
19.3M
    "psadbw %%mm3,%%mm2\n\t"
40
19.3M
    "lea (%[src],%[ystride],4),%[src]\n\t"
41
19.3M
    "paddw %%mm2,%%mm0\n\t"
42
19.3M
    "lea (%[ref],%[ystride],4),%[ref]\n\t"
43
    /*Load the next 3 rows as registers become available.*/
44
19.3M
    "movq (%[src]),%%mm2\n\t"
45
19.3M
    "movq (%[ref]),%%mm3\n\t"
46
19.3M
    "psadbw %%mm5,%%mm4\n\t"
47
19.3M
    "psadbw %%mm7,%%mm6\n\t"
48
19.3M
    "paddw %%mm4,%%mm0\n\t"
49
19.3M
    "movq (%[ref],%[ystride]),%%mm5\n\t"
50
19.3M
    "movq (%[src],%[ystride]),%%mm4\n\t"
51
19.3M
    "paddw %%mm6,%%mm0\n\t"
52
19.3M
    "movq (%[ref],%[ystride],2),%%mm7\n\t"
53
19.3M
    "movq (%[src],%[ystride],2),%%mm6\n\t"
54
    /*Start adding their SADs to %%mm0*/
55
19.3M
    "psadbw %%mm3,%%mm2\n\t"
56
19.3M
    "psadbw %%mm5,%%mm4\n\t"
57
19.3M
    "paddw %%mm2,%%mm0\n\t"
58
19.3M
    "psadbw %%mm7,%%mm6\n\t"
59
    /*Load last row as registers become available.*/
60
19.3M
    "movq (%[src],%[ystride3]),%%mm2\n\t"
61
19.3M
    "movq (%[ref],%[ystride3]),%%mm3\n\t"
62
    /*And finish adding up their SADs.*/
63
19.3M
    "paddw %%mm4,%%mm0\n\t"
64
19.3M
    "psadbw %%mm3,%%mm2\n\t"
65
19.3M
    "paddw %%mm6,%%mm0\n\t"
66
19.3M
    "paddw %%mm2,%%mm0\n\t"
67
19.3M
    "movd %%mm0,%[ret]\n\t"
68
19.3M
    :[ret]"=a"(ret),[src]"+r"(_src),[ref]"+r"(_ref),[ystride3]"=&r"(ystride3)
69
19.3M
    :[ystride]"r"((ptrdiff_t)_ystride)
70
19.3M
  );
71
19.3M
  return (unsigned)ret;
72
19.3M
}
73
74
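For reference, the psadbw-based routine above is equivalent to the following portable sketch (illustrative only; the helper name is not part of mmxencfrag.c): sum the absolute differences of an 8x8 block of pixels whose rows are _ystride bytes apart. In the MMX version each psadbw folds eight byte differences into a single word, and the running total stays in %%mm0 until the final movd.

/*Illustrative scalar equivalent of oc_enc_frag_sad_mmxext().*/
static unsigned frag_sad_ref(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned sad;
  int      i;
  int      j;
  sad=0;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int d;
      d=_src[j]-_ref[j];
      sad+=d<0?-d:d;
    }
    _src+=_ystride;
    _ref+=_ystride;
  }
  return sad;
}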
unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
75
0
 const unsigned char *_ref,int _ystride,unsigned _thresh){
76
  /*Early termination is for suckers.*/
77
0
  return oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
78
0
}
79
80
/*Assumes the first two rows of %[ref1] and %[ref2] are in %%mm0...%%mm3, the
81
   first two rows of %[src] are in %%mm4,%%mm5, and {1}x8 is in %%mm7.
82
  We pre-load the next two rows of data as registers become available.*/
83
#define OC_SAD2_LOOP \
84
 "#OC_SAD2_LOOP\n\t" \
85
 /*We want to compute (%%mm0+%%mm1)>>1 on unsigned bytes without overflow, but \
86
    pavgb computes (%%mm0+%%mm1+1)>>1. \
87
   The latter is exactly 1 too large when the low bit of two corresponding \
88
    bytes is only set in one of them. \
89
   Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
90
    correct the output of pavgb. \
91
   TODO: This should be rewritten to compute ~pavgb(~a,~b) instead, which \
92
    schedules better; currently, however, this function is unused.*/ \
93
 "movq %%mm0,%%mm6\n\t" \
94
 "lea (%[ref1],%[ystride],2),%[ref1]\n\t" \
95
 "pxor %%mm1,%%mm0\n\t" \
96
 "pavgb %%mm1,%%mm6\n\t" \
97
 "lea (%[ref2],%[ystride],2),%[ref2]\n\t" \
98
 "movq %%mm2,%%mm1\n\t" \
99
 "pand %%mm7,%%mm0\n\t" \
100
 "pavgb %%mm3,%%mm2\n\t" \
101
 "pxor %%mm3,%%mm1\n\t" \
102
 "movq (%[ref2],%[ystride]),%%mm3\n\t" \
103
 "psubb %%mm0,%%mm6\n\t" \
104
 "movq (%[ref1]),%%mm0\n\t" \
105
 "pand %%mm7,%%mm1\n\t" \
106
 "psadbw %%mm6,%%mm4\n\t" \
107
 "movd %[ret],%%mm6\n\t" \
108
 "psubb %%mm1,%%mm2\n\t" \
109
 "movq (%[ref2]),%%mm1\n\t" \
110
 "lea (%[src],%[ystride],2),%[src]\n\t" \
111
 "psadbw %%mm2,%%mm5\n\t" \
112
 "movq (%[ref1],%[ystride]),%%mm2\n\t" \
113
 "paddw %%mm4,%%mm5\n\t" \
114
 "movq (%[src]),%%mm4\n\t" \
115
 "paddw %%mm5,%%mm6\n\t" \
116
 "movq (%[src],%[ystride]),%%mm5\n\t" \
117
 "movd %%mm6,%[ret]\n\t" \
118
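The correction described at the top of OC_SAD2_LOOP can be modeled with plain integer code. The sketch below (illustrative only) shows the per-byte fix-up that turns the rounding-up average pavgb provides into the truncating average the comment asks for; the same trick reappears in OC_SAD2_TAIL below and in oc_int_frag_copy2_mmxext() later in the file.

/*pavgb computes (a+b+1)>>1; it is exactly one too large when the low bits
   of a and b differ, so subtracting (a^b)&1 yields (a+b)>>1 without ever
   forming the 9-bit sum a+b.*/
static unsigned char avg_down(unsigned char _a,unsigned char _b){
  unsigned char avg_up;
  avg_up=(unsigned char)((_a+_b+1)>>1);
  return (unsigned char)(avg_up-((_a^_b)&1));
}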
119
/*Same as above, but does not pre-load the next two rows.*/
120
#define OC_SAD2_TAIL \
121
 "#OC_SAD2_TAIL\n\t" \
122
 "movq %%mm0,%%mm6\n\t" \
123
 "pavgb %%mm1,%%mm0\n\t" \
124
 "pxor %%mm1,%%mm6\n\t" \
125
 "movq %%mm2,%%mm1\n\t" \
126
 "pand %%mm7,%%mm6\n\t" \
127
 "pavgb %%mm3,%%mm2\n\t" \
128
 "pxor %%mm3,%%mm1\n\t" \
129
 "psubb %%mm6,%%mm0\n\t" \
130
 "pand %%mm7,%%mm1\n\t" \
131
 "psadbw %%mm0,%%mm4\n\t" \
132
 "psubb %%mm1,%%mm2\n\t" \
133
 "movd %[ret],%%mm6\n\t" \
134
 "psadbw %%mm2,%%mm5\n\t" \
135
 "paddw %%mm4,%%mm5\n\t" \
136
 "paddw %%mm5,%%mm6\n\t" \
137
 "movd %%mm6,%[ret]\n\t" \
138
139
unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
140
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
141
0
 unsigned _thresh){
142
0
  ptrdiff_t ret;
143
0
  __asm__ __volatile__(
144
0
    "movq (%[ref1]),%%mm0\n\t"
145
0
    "movq (%[ref2]),%%mm1\n\t"
146
0
    "movq (%[ref1],%[ystride]),%%mm2\n\t"
147
0
    "movq (%[ref2],%[ystride]),%%mm3\n\t"
148
0
    "xor %[ret],%[ret]\n\t"
149
0
    "movq (%[src]),%%mm4\n\t"
150
0
    "pxor %%mm7,%%mm7\n\t"
151
0
    "pcmpeqb %%mm6,%%mm6\n\t"
152
0
    "movq (%[src],%[ystride]),%%mm5\n\t"
153
0
    "psubb %%mm6,%%mm7\n\t"
154
0
    OC_SAD2_LOOP
155
0
    OC_SAD2_LOOP
156
0
    OC_SAD2_LOOP
157
0
    OC_SAD2_TAIL
158
0
    :[ret]"=&a"(ret),[src]"+r"(_src),[ref1]"+r"(_ref1),[ref2]"+r"(_ref2)
159
0
    :[ystride]"r"((ptrdiff_t)_ystride)
160
0
  );
161
0
  return (unsigned)ret;
162
0
}
163
164
/*Load an 8x4 array of pixel values from %[src] and %[ref] and compute their
165
   16-bit difference in %%mm0...%%mm7.*/
166
#define OC_LOAD_SUB_8x4(_off) \
167
 "#OC_LOAD_SUB_8x4\n\t" \
168
 "movd "#_off"(%[src]),%%mm0\n\t" \
169
 "movd "#_off"(%[ref]),%%mm4\n\t" \
170
 "movd "#_off"(%[src],%[src_ystride]),%%mm1\n\t" \
171
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
172
 "movd "#_off"(%[ref],%[ref_ystride]),%%mm5\n\t" \
173
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
174
 "movd "#_off"(%[src]),%%mm2\n\t" \
175
 "movd "#_off"(%[ref]),%%mm7\n\t" \
176
 "movd "#_off"(%[src],%[src_ystride]),%%mm3\n\t" \
177
 "movd "#_off"(%[ref],%[ref_ystride]),%%mm6\n\t" \
178
 "punpcklbw %%mm4,%%mm0\n\t" \
179
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
180
 "punpcklbw %%mm4,%%mm4\n\t" \
181
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
182
 "psubw %%mm4,%%mm0\n\t" \
183
 "movd "#_off"(%[src]),%%mm4\n\t" \
184
 "movq %%mm0,"OC_MEM_OFFS(_off*2,buf)"\n\t" \
185
 "movd "#_off"(%[ref]),%%mm0\n\t" \
186
 "punpcklbw %%mm5,%%mm1\n\t" \
187
 "punpcklbw %%mm5,%%mm5\n\t" \
188
 "psubw %%mm5,%%mm1\n\t" \
189
 "movd "#_off"(%[src],%[src_ystride]),%%mm5\n\t" \
190
 "punpcklbw %%mm7,%%mm2\n\t" \
191
 "punpcklbw %%mm7,%%mm7\n\t" \
192
 "psubw %%mm7,%%mm2\n\t" \
193
 "movd "#_off"(%[ref],%[ref_ystride]),%%mm7\n\t" \
194
 "punpcklbw %%mm6,%%mm3\n\t" \
195
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
196
 "punpcklbw %%mm6,%%mm6\n\t" \
197
 "psubw %%mm6,%%mm3\n\t" \
198
 "movd "#_off"(%[src]),%%mm6\n\t" \
199
 "punpcklbw %%mm0,%%mm4\n\t" \
200
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
201
 "punpcklbw %%mm0,%%mm0\n\t" \
202
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
203
 "psubw %%mm0,%%mm4\n\t" \
204
 "movd "#_off"(%[ref]),%%mm0\n\t" \
205
 "punpcklbw %%mm7,%%mm5\n\t" \
206
 "neg %[src_ystride]\n\t" \
207
 "punpcklbw %%mm7,%%mm7\n\t" \
208
 "psubw %%mm7,%%mm5\n\t" \
209
 "movd "#_off"(%[src],%[src_ystride]),%%mm7\n\t" \
210
 "punpcklbw %%mm0,%%mm6\n\t" \
211
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
212
 "punpcklbw %%mm0,%%mm0\n\t" \
213
 "neg %[ref_ystride]\n\t" \
214
 "psubw %%mm0,%%mm6\n\t" \
215
 "movd "#_off"(%[ref],%[ref_ystride]),%%mm0\n\t" \
216
 "lea (%[src],%[src_ystride],8),%[src]\n\t" \
217
 "punpcklbw %%mm0,%%mm7\n\t" \
218
 "neg %[src_ystride]\n\t" \
219
 "punpcklbw %%mm0,%%mm0\n\t" \
220
 "lea (%[ref],%[ref_ystride],8),%[ref]\n\t" \
221
 "psubw %%mm0,%%mm7\n\t" \
222
 "neg %[ref_ystride]\n\t" \
223
 "movq "OC_MEM_OFFS(_off*2,buf)",%%mm0\n\t" \
224
225
/*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/
226
#define OC_LOAD_8x4(_off) \
227
 "#OC_LOAD_8x4\n\t" \
228
 "movd "#_off"(%[src]),%%mm0\n\t" \
229
 "movd "#_off"(%[src],%[ystride]),%%mm1\n\t" \
230
 "movd "#_off"(%[src],%[ystride],2),%%mm2\n\t" \
231
 "pxor %%mm7,%%mm7\n\t" \
232
 "movd "#_off"(%[src],%[ystride3]),%%mm3\n\t" \
233
 "punpcklbw %%mm7,%%mm0\n\t" \
234
 "movd "#_off"(%[src4]),%%mm4\n\t" \
235
 "punpcklbw %%mm7,%%mm1\n\t" \
236
 "movd "#_off"(%[src4],%[ystride]),%%mm5\n\t" \
237
 "punpcklbw %%mm7,%%mm2\n\t" \
238
 "movd "#_off"(%[src4],%[ystride],2),%%mm6\n\t" \
239
 "punpcklbw %%mm7,%%mm3\n\t" \
240
 "movd "#_off"(%[src4],%[ystride3]),%%mm7\n\t" \
241
 "punpcklbw %%mm4,%%mm4\n\t" \
242
 "punpcklbw %%mm5,%%mm5\n\t" \
243
 "psrlw $8,%%mm4\n\t" \
244
 "psrlw $8,%%mm5\n\t" \
245
 "punpcklbw %%mm6,%%mm6\n\t" \
246
 "punpcklbw %%mm7,%%mm7\n\t" \
247
 "psrlw $8,%%mm6\n\t" \
248
 "psrlw $8,%%mm7\n\t" \
249
250
/*Performs the first two stages of an 8-point 1-D Hadamard transform.
251
  The transform is performed in place, except that outputs 0-3 are swapped with
252
   outputs 4-7.
253
  Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to
254
   perform this stage in place with no temporary registers).*/
255
#define OC_HADAMARD_AB_8x4 \
256
 "#OC_HADAMARD_AB_8x4\n\t" \
257
 /*Stage A: \
258
   Outputs 0-3 are swapped with 4-7 here.*/ \
259
 "paddw %%mm1,%%mm5\n\t" \
260
 "paddw %%mm2,%%mm6\n\t" \
261
 "paddw %%mm1,%%mm1\n\t" \
262
 "paddw %%mm2,%%mm2\n\t" \
263
 "psubw %%mm5,%%mm1\n\t" \
264
 "psubw %%mm6,%%mm2\n\t" \
265
 "paddw %%mm3,%%mm7\n\t" \
266
 "paddw %%mm0,%%mm4\n\t" \
267
 "paddw %%mm3,%%mm3\n\t" \
268
 "paddw %%mm0,%%mm0\n\t" \
269
 "psubw %%mm7,%%mm3\n\t" \
270
 "psubw %%mm4,%%mm0\n\t" \
271
 /*Stage B:*/ \
272
 "paddw %%mm2,%%mm0\n\t" \
273
 "paddw %%mm3,%%mm1\n\t" \
274
 "paddw %%mm6,%%mm4\n\t" \
275
 "paddw %%mm7,%%mm5\n\t" \
276
 "paddw %%mm2,%%mm2\n\t" \
277
 "paddw %%mm3,%%mm3\n\t" \
278
 "paddw %%mm6,%%mm6\n\t" \
279
 "paddw %%mm7,%%mm7\n\t" \
280
 "psubw %%mm0,%%mm2\n\t" \
281
 "psubw %%mm1,%%mm3\n\t" \
282
 "psubw %%mm4,%%mm6\n\t" \
283
 "psubw %%mm5,%%mm7\n\t" \
284
285
/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
286
  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
287
   place with no temporary registers).*/
288
#define OC_HADAMARD_C_8x4 \
289
 "#OC_HADAMARD_C_8x4\n\t" \
290
 /*Stage C:*/ \
291
 "paddw %%mm1,%%mm0\n\t" \
292
 "paddw %%mm3,%%mm2\n\t" \
293
 "paddw %%mm5,%%mm4\n\t" \
294
 "paddw %%mm7,%%mm6\n\t" \
295
 "paddw %%mm1,%%mm1\n\t" \
296
 "paddw %%mm3,%%mm3\n\t" \
297
 "paddw %%mm5,%%mm5\n\t" \
298
 "paddw %%mm7,%%mm7\n\t" \
299
 "psubw %%mm0,%%mm1\n\t" \
300
 "psubw %%mm2,%%mm3\n\t" \
301
 "psubw %%mm4,%%mm5\n\t" \
302
 "psubw %%mm6,%%mm7\n\t" \
303
304
/*Performs an 8-point 1-D Hadamard transform.
305
  The transform is performed in place, except that outputs 0-3 are swapped with
306
   outputs 4-7.
307
  Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform
308
   in place with no temporary registers).*/
309
#define OC_HADAMARD_8x4 \
310
 OC_HADAMARD_AB_8x4 \
311
 OC_HADAMARD_C_8x4 \
312
313
/*Performs the first part of the final stage of the Hadamard transform and
314
   summing of absolute values.
315
  At the end of this part, %%mm1 will contain the DC coefficient of the
316
   transform.*/
317
#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
318
 /*We use the fact that \
319
     (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
320
    to merge the final butterfly with the abs and the first stage of \
321
    accumulation. \
322
   Thus we can avoid using pabsw, which is not available until SSSE3. \
323
   Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
324
    implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
325
    registers). \
326
   Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
327
   This implementation is only 26 (+4 for spilling registers).*/ \
328
 "#OC_HADAMARD_C_ABS_ACCUM_A_8x4\n\t" \
329
 "movq %%mm7,"OC_MEM_OFFS(_r7,buf)"\n\t" \
330
 "movq %%mm6,"OC_MEM_OFFS(_r6,buf)"\n\t" \
331
 /*mm7={0x7FFF}x4 \
332
   mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
333
 "pcmpeqb %%mm7,%%mm7\n\t" \
334
 "movq %%mm0,%%mm6\n\t" \
335
 "psrlw $1,%%mm7\n\t" \
336
 "paddw %%mm1,%%mm6\n\t" \
337
 "pmaxsw %%mm1,%%mm0\n\t" \
338
 "paddsw %%mm7,%%mm6\n\t" \
339
 "psubw %%mm6,%%mm0\n\t" \
340
 /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
341
   mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \
342
 "movq %%mm2,%%mm6\n\t" \
343
 "movq %%mm4,%%mm1\n\t" \
344
 "pmaxsw %%mm3,%%mm2\n\t" \
345
 "pmaxsw %%mm5,%%mm4\n\t" \
346
 "paddw %%mm3,%%mm6\n\t" \
347
 "paddw %%mm5,%%mm1\n\t" \
348
 "movq "OC_MEM_OFFS(_r7,buf)",%%mm3\n\t" \
349
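The identity invoked at the top of OC_HADAMARD_C_ABS_ACCUM_A_8x4 is easy to verify: when |a|>=|b|, both a+b and a-b share the sign of a (or are zero), so |a+b|+|a-b|=|2a|=2*max(|a|,|b|), and symmetrically when |b|>|a|. A scalar sketch (illustrative only) of the per-lane quantity being folded into the accumulator:

/*|a+b|+|a-b|==2*max(|a|,|b|) for all integers a and b.
  This is what lets the code above replace the final butterfly plus two
   absolute values with a single pmaxsw per pair.*/
static int abs_butterfly_sum(int _a,int _b){
  int s;
  int d;
  s=_a+_b;
  d=_a-_b;
  if(s<0)s=-s;
  if(d<0)d=-d;
  return s+d;
}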
350
/*Performs the second part of the final stage of the Hadamard transform and
351
   summing of absolute values.*/
352
#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
353
 "#OC_HADAMARD_C_ABS_ACCUM_B_8x4\n\t" \
354
 "paddsw %%mm7,%%mm6\n\t" \
355
 "movq "OC_MEM_OFFS(_r6,buf)",%%mm5\n\t" \
356
 "paddsw %%mm7,%%mm1\n\t" \
357
 "psubw %%mm6,%%mm2\n\t" \
358
 "psubw %%mm1,%%mm4\n\t" \
359
 /*mm7={1}x4 (needed for the horizontal add that follows) \
360
   mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
361
 "movq %%mm3,%%mm6\n\t" \
362
 "pmaxsw %%mm5,%%mm3\n\t" \
363
 "paddw %%mm2,%%mm0\n\t" \
364
 "paddw %%mm5,%%mm6\n\t" \
365
 "paddw %%mm4,%%mm0\n\t" \
366
 "paddsw %%mm7,%%mm6\n\t" \
367
 "paddw %%mm3,%%mm0\n\t" \
368
 "psrlw $14,%%mm7\n\t" \
369
 "psubw %%mm6,%%mm0\n\t" \
370
371
/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
372
   absolute value of each component, and accumulates everything into mm0.
373
  This is the only portion of SATD which requires MMXEXT (we could use plain
374
   MMX, but it takes 4 instructions and an extra register to work around the
375
   lack of a pmaxsw, which is a pretty serious penalty).*/
376
#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \
377
 OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
378
 OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
379
380
/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
381
   component, and accumulates everything into mm0.
382
  Note that mm0 will have an extra 4 added to each column, and that after
383
   removing this value, the remainder will be half the conventional value.*/
384
#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) \
385
 OC_HADAMARD_AB_8x4 \
386
 OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7)
387
388
/*Performs two 4x4 transposes (mostly) in place.
389
  On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7}
390
   contains rows {a,b,c,d}.
391
  On output, {0x40,0x50,0x60,0x70}+_off(%[buf]) contains {e,f,g,h}^T, and
392
   {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/
393
#define OC_TRANSPOSE_4x4x2(_off) \
394
 "#OC_TRANSPOSE_4x4x2\n\t" \
395
 /*First 4x4 transpose:*/ \
396
 "movq %%mm5,"OC_MEM_OFFS(0x10+(_off),buf)"\n\t" \
397
 /*mm0 = e3 e2 e1 e0 \
398
   mm1 = f3 f2 f1 f0 \
399
   mm2 = g3 g2 g1 g0 \
400
   mm3 = h3 h2 h1 h0*/ \
401
 "movq %%mm2,%%mm5\n\t" \
402
 "punpcklwd %%mm3,%%mm2\n\t" \
403
 "punpckhwd %%mm3,%%mm5\n\t" \
404
 "movq %%mm0,%%mm3\n\t" \
405
 "punpcklwd %%mm1,%%mm0\n\t" \
406
 "punpckhwd %%mm1,%%mm3\n\t" \
407
 /*mm0 = f1 e1 f0 e0 \
408
   mm3 = f3 e3 f2 e2 \
409
   mm2 = h1 g1 h0 g0 \
410
   mm5 = h3 g3 h2 g2*/ \
411
 "movq %%mm0,%%mm1\n\t" \
412
 "punpckldq %%mm2,%%mm0\n\t" \
413
 "punpckhdq %%mm2,%%mm1\n\t" \
414
 "movq %%mm3,%%mm2\n\t" \
415
 "punpckhdq %%mm5,%%mm3\n\t" \
416
 "movq %%mm0,"OC_MEM_OFFS(0x40+(_off),buf)"\n\t" \
417
 "punpckldq %%mm5,%%mm2\n\t" \
418
 /*mm0 = h0 g0 f0 e0 \
419
   mm1 = h1 g1 f1 e1 \
420
   mm2 = h2 g2 f2 e2 \
421
   mm3 = h3 g3 f3 e3*/ \
422
 "movq "OC_MEM_OFFS(0x10+(_off),buf)",%%mm5\n\t" \
423
 /*Second 4x4 transpose:*/ \
424
 /*mm4 = a3 a2 a1 a0 \
425
   mm5 = b3 b2 b1 b0 \
426
   mm6 = c3 c2 c1 c0 \
427
   mm7 = d3 d2 d1 d0*/ \
428
 "movq %%mm6,%%mm0\n\t" \
429
 "punpcklwd %%mm7,%%mm6\n\t" \
430
 "movq %%mm1,"OC_MEM_OFFS(0x50+(_off),buf)"\n\t" \
431
 "punpckhwd %%mm7,%%mm0\n\t" \
432
 "movq %%mm4,%%mm7\n\t" \
433
 "punpcklwd %%mm5,%%mm4\n\t" \
434
 "movq %%mm2,"OC_MEM_OFFS(0x60+(_off),buf)"\n\t" \
435
 "punpckhwd %%mm5,%%mm7\n\t" \
436
 /*mm4 = b1 a1 b0 a0 \
437
   mm7 = b3 a3 b2 a2 \
438
   mm6 = d1 c1 d0 c0 \
439
   mm0 = d3 c3 d2 c2*/ \
440
 "movq %%mm4,%%mm5\n\t" \
441
 "punpckldq %%mm6,%%mm4\n\t" \
442
 "movq %%mm3,"OC_MEM_OFFS(0x70+(_off),buf)"\n\t" \
443
 "punpckhdq %%mm6,%%mm5\n\t" \
444
 "movq %%mm7,%%mm6\n\t" \
445
 "punpckhdq %%mm0,%%mm7\n\t" \
446
 "punpckldq %%mm0,%%mm6\n\t" \
447
 /*mm4 = d0 c0 b0 a0 \
448
   mm5 = d1 c1 b1 a1 \
449
   mm6 = d2 c2 b2 a2 \
450
   mm7 = d3 c3 b3 a3*/ \
451
452
static unsigned oc_int_frag_satd_mmxext(int *_dc,
453
 const unsigned char *_src,int _src_ystride,
454
0
 const unsigned char *_ref,int _ref_ystride){
455
0
  OC_ALIGN8(ogg_int16_t buf[64]);
456
0
  unsigned ret;
457
0
  unsigned ret2;
458
0
  int      dc;
459
0
  __asm__ __volatile__(
460
0
    OC_LOAD_SUB_8x4(0x00)
461
0
    OC_HADAMARD_8x4
462
0
    OC_TRANSPOSE_4x4x2(0x00)
463
    /*Finish swapping out this 8x4 block to make room for the next one.
464
      mm0...mm3 have been swapped out already.*/
465
0
    "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
466
0
    "movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t"
467
0
    "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
468
0
    "movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t"
469
0
    OC_LOAD_SUB_8x4(0x04)
470
0
    OC_HADAMARD_8x4
471
0
    OC_TRANSPOSE_4x4x2(0x08)
472
    /*Here the first 4x4 block of output from the last transpose is the second
473
       4x4 block of input for the next transform.
474
      We have cleverly arranged that it already be in the appropriate place, so
475
       we only have to do half the loads.*/
476
0
    "movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t"
477
0
    "movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t"
478
0
    "movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t"
479
0
    "movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t"
480
    /*We split out the stages here so we can save the DC coefficient in the
481
       middle.*/
482
0
    OC_HADAMARD_AB_8x4
483
0
    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
484
0
    "movd %%mm1,%[dc]\n\t"
485
0
    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
486
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
487
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
488
       for the factor of two we dropped + 3 for the vertical accumulation).
489
      Now we finally have to promote things to dwords.
490
      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
491
       latency of pmaddwd by starting the next series of loads now.*/
492
0
    "pmaddwd %%mm7,%%mm0\n\t"
493
0
    "movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"
494
0
    "movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"
495
0
    "movq %%mm0,%%mm4\n\t"
496
0
    "movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"
497
0
    "punpckhdq %%mm0,%%mm0\n\t"
498
0
    "movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"
499
0
    "paddd %%mm0,%%mm4\n\t"
500
0
    "movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
501
0
    "movd %%mm4,%[ret2]\n\t"
502
0
    "movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
503
0
    "movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
504
0
    "movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
505
0
    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
506
0
    "pmaddwd %%mm7,%%mm0\n\t"
507
    /*Subtract abs(dc) from 2*ret2.*/
508
0
    "movsx %w[dc],%[dc]\n\t"
509
0
    "cdq\n\t"
510
0
    "lea (%[ret],%[ret2],2),%[ret2]\n\t"
511
0
    "movq %%mm0,%%mm4\n\t"
512
0
    "punpckhdq %%mm0,%%mm0\n\t"
513
0
    "xor %[dc],%[ret]\n\t"
514
0
    "paddd %%mm0,%%mm4\n\t"
515
    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
516
       added to them, a factor of two removed, and the DC value included;
517
       correct the final sum here.*/
518
0
    "sub %[ret],%[ret2]\n\t"
519
0
    "movd %%mm4,%[ret]\n\t"
520
0
    "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
521
    /*Although it looks like we're using 8 registers here, gcc can alias %[ret]
522
       and %[ret2] with some of the inputs, since for once we don't write to
523
       them until after we're done using everything but %[buf].*/
524
    /*Note that _src_ystride and _ref_ystride must be given non-overlapping
525
       constraints, otherwise if gcc can prove they're equal it will allocate
526
       them to the same register (which is bad); _src and _ref face a similar
527
       problem, though those are never actually the same.*/
528
0
    :[ret]"=d"(ret),[ret2]"=r"(ret2),[dc]"=a"(dc),
529
0
     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
530
0
    :[src]"r"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
531
0
     [ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
532
    /*We have to use neg, so we actually clobber the condition codes for once
533
       (not to mention cmp, sub, and add).*/
534
0
    :"cc"
535
0
  );
536
0
  *_dc=dc;
537
0
  return ret;
538
0
}
539
540
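Putting the pieces together, the routine above is built around the conventional SATD of the fragment difference: apply the 1-D Hadamard to the rows and then the columns of the 8x8 difference block and sum the absolute values of the result. A portable sketch (illustrative only, reusing the hadamard8_ref() helper sketched earlier) follows; per the comments above, the optimized routine keeps everything in 16 bits, reports the DC term separately, subtracts abs(dc) from the doubled sum, and applies the -64 bias correction, so consult those comments for its exact scaling.

/*Illustrative SATD reference: 2-D Hadamard of the 8x8 difference followed
   by a sum of absolute values.
  The DC term (the sum of all 64 differences) is returned through *_dc;
   the optimized routine extracts its DC at the point and scale described
   in its own comments.*/
static unsigned frag_satd_ref(int *_dc,const unsigned char *_src,
 int _src_ystride,const unsigned char *_ref,int _ref_ystride){
  short    diff[8][8];
  short    col[8];
  unsigned satd;
  int      i;
  int      j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)diff[i][j]=(short)(_src[j]-_ref[j]);
    _src+=_src_ystride;
    _ref+=_ref_ystride;
  }
  /*Transform the rows, then the columns.*/
  for(i=0;i<8;i++)hadamard8_ref(diff[i]);
  for(j=0;j<8;j++){
    for(i=0;i<8;i++)col[i]=diff[i][j];
    hadamard8_ref(col);
    for(i=0;i<8;i++)diff[i][j]=col[i];
  }
  *_dc=diff[0][0];
  satd=0;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)satd+=diff[i][j]<0?-diff[i][j]:diff[i][j];
  }
  return satd;
}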
unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
541
0
 const unsigned char *_ref,int _ystride){
542
0
  return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);
543
0
}
544
545
/*Our internal implementation of frag_copy2 takes an extra stride parameter so
546
   we can share code with oc_enc_frag_satd2_mmxext().*/
547
void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
548
20.1M
 const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
549
20.1M
  __asm__ __volatile__(
550
    /*Load the first 3 rows.*/
551
20.1M
    "movq (%[src1]),%%mm0\n\t"
552
20.1M
    "movq (%[src2]),%%mm1\n\t"
553
20.1M
    "movq (%[src1],%[src_ystride]),%%mm2\n\t"
554
20.1M
    "lea (%[src1],%[src_ystride],2),%[src1]\n\t"
555
20.1M
    "movq (%[src2],%[src_ystride]),%%mm3\n\t"
556
20.1M
    "lea (%[src2],%[src_ystride],2),%[src2]\n\t"
557
20.1M
    "pxor %%mm7,%%mm7\n\t"
558
20.1M
    "movq (%[src1]),%%mm4\n\t"
559
20.1M
    "pcmpeqb %%mm6,%%mm6\n\t"
560
20.1M
    "movq (%[src2]),%%mm5\n\t"
561
    /*mm7={1}x8.*/
562
20.1M
    "psubb %%mm6,%%mm7\n\t"
563
    /*Start averaging %%mm0 and %%mm1 into %%mm6.*/
564
20.1M
    "movq %%mm0,%%mm6\n\t"
565
20.1M
    "pxor %%mm1,%%mm0\n\t"
566
20.1M
    "pavgb %%mm1,%%mm6\n\t"
567
    /*%%mm1 is free, start averaging %%mm3 into %%mm2 using %%mm1.*/
568
20.1M
    "movq %%mm2,%%mm1\n\t"
569
20.1M
    "pand %%mm7,%%mm0\n\t"
570
20.1M
    "pavgb %%mm3,%%mm2\n\t"
571
20.1M
    "pxor %%mm3,%%mm1\n\t"
572
    /*%%mm3 is free.*/
573
20.1M
    "psubb %%mm0,%%mm6\n\t"
574
    /*%%mm0 is free, start loading the next row.*/
575
20.1M
    "movq (%[src1],%[src_ystride]),%%mm0\n\t"
576
    /*Start averaging %%mm5 and %%mm4 using %%mm3.*/
577
20.1M
    "movq %%mm4,%%mm3\n\t"
578
    /*%%mm6 (row 0) is done; write it out.*/
579
20.1M
    "movq %%mm6,(%[dst])\n\t"
580
20.1M
    "pand %%mm7,%%mm1\n\t"
581
20.1M
    "pavgb %%mm5,%%mm4\n\t"
582
20.1M
    "psubb %%mm1,%%mm2\n\t"
583
    /*%%mm1 is free, continue loading the next row.*/
584
20.1M
    "movq (%[src2],%[src_ystride]),%%mm1\n\t"
585
20.1M
    "pxor %%mm5,%%mm3\n\t"
586
20.1M
    "lea (%[src1],%[src_ystride],2),%[src1]\n\t"
587
    /*%%mm2 (row 1) is done; write it out.*/
588
20.1M
    "movq %%mm2,(%[dst],%[dst_ystride])\n\t"
589
20.1M
    "pand %%mm7,%%mm3\n\t"
590
    /*Start loading the next row.*/
591
20.1M
    "movq (%[src1]),%%mm2\n\t"
592
20.1M
    "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
593
20.1M
    "psubb %%mm3,%%mm4\n\t"
594
20.1M
    "lea (%[src2],%[src_ystride],2),%[src2]\n\t"
595
    /*%%mm4 (row 2) is done; write it out.*/
596
20.1M
    "movq %%mm4,(%[dst])\n\t"
597
    /*Continue loading the next row.*/
598
20.1M
    "movq (%[src2]),%%mm3\n\t"
599
    /*Start averaging %%mm0 and %%mm1 into %%mm6.*/
600
20.1M
    "movq %%mm0,%%mm6\n\t"
601
20.1M
    "pxor %%mm1,%%mm0\n\t"
602
    /*Start loading the next row.*/
603
20.1M
    "movq (%[src1],%[src_ystride]),%%mm4\n\t"
604
20.1M
    "pavgb %%mm1,%%mm6\n\t"
605
    /*%%mm1 is free; start averaging %%mm3 into %%mm2 using %%mm1.*/
606
20.1M
    "movq %%mm2,%%mm1\n\t"
607
20.1M
    "pand %%mm7,%%mm0\n\t"
608
    /*Continue loading the next row.*/
609
20.1M
    "movq (%[src2],%[src_ystride]),%%mm5\n\t"
610
20.1M
    "pavgb %%mm3,%%mm2\n\t"
611
20.1M
    "lea (%[src1],%[src_ystride],2),%[src1]\n\t"
612
20.1M
    "pxor %%mm3,%%mm1\n\t"
613
    /*%%mm3 is free.*/
614
20.1M
    "psubb %%mm0,%%mm6\n\t"
615
    /*%%mm0 is free, start loading the next row.*/
616
20.1M
    "movq (%[src1]),%%mm0\n\t"
617
    /*Start averaging %%mm5 into %%mm4 using %%mm3.*/
618
20.1M
    "movq %%mm4,%%mm3\n\t"
619
    /*%%mm6 (row 3) is done; write it out.*/
620
20.1M
    "movq %%mm6,(%[dst],%[dst_ystride])\n\t"
621
20.1M
    "pand %%mm7,%%mm1\n\t"
622
20.1M
    "lea (%[src2],%[src_ystride],2),%[src2]\n\t"
623
20.1M
    "pavgb %%mm5,%%mm4\n\t"
624
20.1M
    "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
625
20.1M
    "psubb %%mm1,%%mm2\n\t"
626
    /*%%mm1 is free; continue loading the next row.*/
627
20.1M
    "movq (%[src2]),%%mm1\n\t"
628
20.1M
    "pxor %%mm5,%%mm3\n\t"
629
    /*%%mm2 (row 4) is done; write it out.*/
630
20.1M
    "movq %%mm2,(%[dst])\n\t"
631
20.1M
    "pand %%mm7,%%mm3\n\t"
632
    /*Start loading the next row.*/
633
20.1M
    "movq (%[src1],%[src_ystride]),%%mm2\n\t"
634
20.1M
    "psubb %%mm3,%%mm4\n\t"
635
    /*Start averaging %%mm0 and %%mm1 into %%mm6.*/
636
20.1M
    "movq %%mm0,%%mm6\n\t"
637
    /*Continue loading the next row.*/
638
20.1M
    "movq (%[src2],%[src_ystride]),%%mm3\n\t"
639
    /*%%mm4 (row 5) is done; write it out.*/
640
20.1M
    "movq %%mm4,(%[dst],%[dst_ystride])\n\t"
641
20.1M
    "pxor %%mm1,%%mm0\n\t"
642
20.1M
    "pavgb %%mm1,%%mm6\n\t"
643
    /*%%mm4 is free; start averaging %%mm3 into %%mm2 using %%mm4.*/
644
20.1M
    "movq %%mm2,%%mm4\n\t"
645
20.1M
    "pand %%mm7,%%mm0\n\t"
646
20.1M
    "pavgb %%mm3,%%mm2\n\t"
647
20.1M
    "pxor %%mm3,%%mm4\n\t"
648
20.1M
    "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
649
20.1M
    "psubb %%mm0,%%mm6\n\t"
650
20.1M
    "pand %%mm7,%%mm4\n\t"
651
    /*%%mm6 (row 6) is done, write it out.*/
652
20.1M
    "movq %%mm6,(%[dst])\n\t"
653
20.1M
    "psubb %%mm4,%%mm2\n\t"
654
    /*%%mm2 (row 7) is done, write it out.*/
655
20.1M
    "movq %%mm2,(%[dst],%[dst_ystride])\n\t"
656
20.1M
    :[dst]"+r"(_dst),[src1]"+r"(_src1),[src2]"+r"(_src2)
657
20.1M
    :[dst_ystride]"r"((ptrdiff_t)_dst_ystride),
658
20.1M
     [src_ystride]"r"((ptrdiff_t)_src_ystride)
659
20.1M
    :"memory"
660
20.1M
  );
661
20.1M
}
662
663
unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
664
0
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
665
0
  OC_ALIGN8(unsigned char ref[64]);
666
0
  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
667
0
  return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);
668
0
}
669
670
unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,
671
0
 const unsigned char *_src,int _ystride){
672
0
  OC_ALIGN8(ogg_int16_t buf[64]);
673
0
  unsigned ret;
674
0
  unsigned ret2;
675
0
  int      dc;
676
0
  __asm__ __volatile__(
677
0
    OC_LOAD_8x4(0x00)
678
0
    OC_HADAMARD_8x4
679
0
    OC_TRANSPOSE_4x4x2(0x00)
680
    /*Finish swapping out this 8x4 block to make room for the next one.
681
      mm0...mm3 have been swapped out already.*/
682
0
    "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
683
0
    "movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t"
684
0
    "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
685
0
    "movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t"
686
0
    OC_LOAD_8x4(0x04)
687
0
    OC_HADAMARD_8x4
688
0
    OC_TRANSPOSE_4x4x2(0x08)
689
    /*Here the first 4x4 block of output from the last transpose is the second
690
       4x4 block of input for the next transform.
691
      We have cleverly arranged that it already be in the appropriate place, so
692
       we only have to do half the loads.*/
693
0
    "movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t"
694
0
    "movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t"
695
0
    "movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t"
696
0
    "movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t"
697
    /*We split out the stages here so we can save the DC coefficient in the
698
       middle.*/
699
0
    OC_HADAMARD_AB_8x4
700
0
    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
701
0
    "movd %%mm1,%[dc]\n\t"
702
0
    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
703
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
704
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
705
       for the factor of two we dropped + 3 for the vertical accumulation).
706
      Now we finally have to promote things to dwords.
707
      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
708
       latency of pmaddwd by starting the next series of loads now.*/
709
0
    "pmaddwd %%mm7,%%mm0\n\t"
710
0
    "movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"
711
0
    "movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"
712
0
    "movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"
713
0
    "movq %%mm0,%%mm4\n\t"
714
0
    "movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"
715
0
    "punpckhdq %%mm0,%%mm0\n\t"
716
0
    "movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
717
0
    "paddd %%mm0,%%mm4\n\t"
718
0
    "movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
719
0
    "movd %%mm4,%[ret]\n\t"
720
0
    "movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
721
0
    "movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
722
0
    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
723
0
    "pmaddwd %%mm7,%%mm0\n\t"
724
    /*We assume that the DC coefficient is always positive (which is true,
725
       because the input to the INTRA transform was not a difference).*/
726
0
    "movzx %w[dc],%[dc]\n\t"
727
0
    "add %[ret],%[ret]\n\t"
728
0
    "sub %[dc],%[ret]\n\t"
729
0
    "movq %%mm0,%%mm4\n\t"
730
0
    "punpckhdq %%mm0,%%mm0\n\t"
731
0
    "paddd %%mm0,%%mm4\n\t"
732
0
    "movd %%mm4,%[ret2]\n\t"
733
0
    "lea -64(%[ret],%[ret2],2),%[ret]\n\t"
734
    /*Although it looks like we're using 8 registers here, gcc can alias %[ret]
735
       and %[ret2] with some of the inputs, since for once we don't write to
736
       them until after we're done using everything but %[buf] (which is also
737
       listed as an output to ensure gcc _doesn't_ alias them against it).*/
738
0
    :[ret]"=a"(ret),[ret2]"=r"(ret2),[dc]"=r"(dc),
739
0
     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
740
0
    :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
741
0
     [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
742
    /*We have to use sub, so we actually clobber the condition codes for once
743
       (not to mention add).*/
744
0
    :"cc"
745
0
  );
746
0
  *_dc=dc;
747
0
  return ret;
748
0
}
749
750
void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
751
834k
 const unsigned char *_src,const unsigned char *_ref,int _ystride){
752
834k
  int i;
753
834k
  __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::);
754
4.17M
  for(i=4;i-->0;){
755
3.33M
    __asm__ __volatile__(
756
      /*mm0=[src]*/
757
3.33M
      "movq (%[src]),%%mm0\n\t"
758
      /*mm1=[ref]*/
759
3.33M
      "movq (%[ref]),%%mm1\n\t"
760
      /*mm4=[src+ystride]*/
761
3.33M
      "movq (%[src],%[ystride]),%%mm4\n\t"
762
      /*mm5=[ref+ystride]*/
763
3.33M
      "movq (%[ref],%[ystride]),%%mm5\n\t"
764
      /*Compute [src]-[ref].*/
765
3.33M
      "movq %%mm0,%%mm2\n\t"
766
3.33M
      "punpcklbw %%mm7,%%mm0\n\t"
767
3.33M
      "movq %%mm1,%%mm3\n\t"
768
3.33M
      "punpckhbw %%mm7,%%mm2\n\t"
769
3.33M
      "punpcklbw %%mm7,%%mm1\n\t"
770
3.33M
      "punpckhbw %%mm7,%%mm3\n\t"
771
3.33M
      "psubw %%mm1,%%mm0\n\t"
772
3.33M
      "psubw %%mm3,%%mm2\n\t"
773
      /*Compute [src+ystride]-[ref+ystride].*/
774
3.33M
      "movq %%mm4,%%mm1\n\t"
775
3.33M
      "punpcklbw %%mm7,%%mm4\n\t"
776
3.33M
      "movq %%mm5,%%mm3\n\t"
777
3.33M
      "punpckhbw %%mm7,%%mm1\n\t"
778
3.33M
      "lea (%[src],%[ystride],2),%[src]\n\t"
779
3.33M
      "punpcklbw %%mm7,%%mm5\n\t"
780
3.33M
      "lea (%[ref],%[ystride],2),%[ref]\n\t"
781
3.33M
      "punpckhbw %%mm7,%%mm3\n\t"
782
3.33M
      "psubw %%mm5,%%mm4\n\t"
783
3.33M
      "psubw %%mm3,%%mm1\n\t"
784
      /*Write the answer out.*/
785
3.33M
      "movq %%mm0,0x00(%[residue])\n\t"
786
3.33M
      "movq %%mm2,0x08(%[residue])\n\t"
787
3.33M
      "movq %%mm4,0x10(%[residue])\n\t"
788
3.33M
      "movq %%mm1,0x18(%[residue])\n\t"
789
3.33M
      "lea 0x20(%[residue]),%[residue]\n\t"
790
3.33M
      :[residue]"+r"(_residue),[src]"+r"(_src),[ref]"+r"(_ref)
791
3.33M
      :[ystride]"r"((ptrdiff_t)_ystride)
792
3.33M
      :"memory"
793
3.33M
    );
794
3.33M
  }
795
834k
}
796
797
void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
798
23.7M
 const unsigned char *_src,int _ystride){
799
23.7M
  ptrdiff_t ystride3;
800
23.7M
  __asm__ __volatile__(
801
    /*mm0=[src]*/
802
23.7M
    "movq (%[src]),%%mm0\n\t"
803
    /*mm1=[src+ystride]*/
804
23.7M
    "movq (%[src],%[ystride]),%%mm1\n\t"
805
    /*mm6={-1}x4*/
806
23.7M
    "pcmpeqw %%mm6,%%mm6\n\t"
807
    /*mm2=[src+2*ystride]*/
808
23.7M
    "movq (%[src],%[ystride],2),%%mm2\n\t"
809
    /*[ystride3]=3*[ystride]*/
810
23.7M
    "lea (%[ystride],%[ystride],2),%[ystride3]\n\t"
811
    /*mm6={0x8000}x4*/
812
23.7M
    "psllw $15,%%mm6\n\t"
813
    /*mm3=[src+3*ystride]*/
814
23.7M
    "movq (%[src],%[ystride3]),%%mm3\n\t"
815
    /*mm6={128}x4*/
816
23.7M
    "psrlw $8,%%mm6\n\t"
817
    /*mm7=0*/
818
23.7M
    "pxor %%mm7,%%mm7\n\t"
819
    /*[src]=[src]+4*[ystride]*/
820
23.7M
    "lea (%[src],%[ystride],4),%[src]\n\t"
821
    /*Compute [src]-128 and [src+ystride]-128*/
822
23.7M
    "movq %%mm0,%%mm4\n\t"
823
23.7M
    "punpcklbw %%mm7,%%mm0\n\t"
824
23.7M
    "movq %%mm1,%%mm5\n\t"
825
23.7M
    "punpckhbw %%mm7,%%mm4\n\t"
826
23.7M
    "psubw %%mm6,%%mm0\n\t"
827
23.7M
    "punpcklbw %%mm7,%%mm1\n\t"
828
23.7M
    "psubw %%mm6,%%mm4\n\t"
829
23.7M
    "punpckhbw %%mm7,%%mm5\n\t"
830
23.7M
    "psubw %%mm6,%%mm1\n\t"
831
23.7M
    "psubw %%mm6,%%mm5\n\t"
832
    /*Write the answer out.*/
833
23.7M
    "movq %%mm0,0x00(%[residue])\n\t"
834
23.7M
    "movq %%mm4,0x08(%[residue])\n\t"
835
23.7M
    "movq %%mm1,0x10(%[residue])\n\t"
836
23.7M
    "movq %%mm5,0x18(%[residue])\n\t"
837
    /*mm0=[src+4*ystride]*/
838
23.7M
    "movq (%[src]),%%mm0\n\t"
839
    /*mm1=[src+5*ystride]*/
840
23.7M
    "movq (%[src],%[ystride]),%%mm1\n\t"
841
    /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/
842
23.7M
    "movq %%mm2,%%mm4\n\t"
843
23.7M
    "punpcklbw %%mm7,%%mm2\n\t"
844
23.7M
    "movq %%mm3,%%mm5\n\t"
845
23.7M
    "punpckhbw %%mm7,%%mm4\n\t"
846
23.7M
    "psubw %%mm6,%%mm2\n\t"
847
23.7M
    "punpcklbw %%mm7,%%mm3\n\t"
848
23.7M
    "psubw %%mm6,%%mm4\n\t"
849
23.7M
    "punpckhbw %%mm7,%%mm5\n\t"
850
23.7M
    "psubw %%mm6,%%mm3\n\t"
851
23.7M
    "psubw %%mm6,%%mm5\n\t"
852
    /*Write the answer out.*/
853
23.7M
    "movq %%mm2,0x20(%[residue])\n\t"
854
23.7M
    "movq %%mm4,0x28(%[residue])\n\t"
855
23.7M
    "movq %%mm3,0x30(%[residue])\n\t"
856
23.7M
    "movq %%mm5,0x38(%[residue])\n\t"
857
    /*mm2=[src+6*ystride]*/
858
23.7M
    "movq (%[src],%[ystride],2),%%mm2\n\t"
859
    /*mm3=[src+7*ystride]*/
860
23.7M
    "movq (%[src],%[ystride3]),%%mm3\n\t"
861
    /*Compute [src+4*ystride]-128 and [src+5*ystride]-128*/
862
23.7M
    "movq %%mm0,%%mm4\n\t"
863
23.7M
    "punpcklbw %%mm7,%%mm0\n\t"
864
23.7M
    "movq %%mm1,%%mm5\n\t"
865
23.7M
    "punpckhbw %%mm7,%%mm4\n\t"
866
23.7M
    "psubw %%mm6,%%mm0\n\t"
867
23.7M
    "punpcklbw %%mm7,%%mm1\n\t"
868
23.7M
    "psubw %%mm6,%%mm4\n\t"
869
23.7M
    "punpckhbw %%mm7,%%mm5\n\t"
870
23.7M
    "psubw %%mm6,%%mm1\n\t"
871
23.7M
    "psubw %%mm6,%%mm5\n\t"
872
    /*Write the answer out.*/
873
23.7M
    "movq %%mm0,0x40(%[residue])\n\t"
874
23.7M
    "movq %%mm4,0x48(%[residue])\n\t"
875
23.7M
    "movq %%mm1,0x50(%[residue])\n\t"
876
23.7M
    "movq %%mm5,0x58(%[residue])\n\t"
877
    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
878
23.7M
    "movq %%mm2,%%mm4\n\t"
879
23.7M
    "punpcklbw %%mm7,%%mm2\n\t"
880
23.7M
    "movq %%mm3,%%mm5\n\t"
881
23.7M
    "punpckhbw %%mm7,%%mm4\n\t"
882
23.7M
    "psubw %%mm6,%%mm2\n\t"
883
23.7M
    "punpcklbw %%mm7,%%mm3\n\t"
884
23.7M
    "psubw %%mm6,%%mm4\n\t"
885
23.7M
    "punpckhbw %%mm7,%%mm5\n\t"
886
23.7M
    "psubw %%mm6,%%mm3\n\t"
887
23.7M
    "psubw %%mm6,%%mm5\n\t"
888
    /*Write the answer out.*/
889
23.7M
    "movq %%mm2,0x60(%[residue])\n\t"
890
23.7M
    "movq %%mm4,0x68(%[residue])\n\t"
891
23.7M
    "movq %%mm3,0x70(%[residue])\n\t"
892
23.7M
    "movq %%mm5,0x78(%[residue])\n\t"
893
23.7M
    :[src]"+r"(_src),[ystride3]"=&r"(ystride3)
894
23.7M
    :[residue]"r"(_residue),[ystride]"r"((ptrdiff_t)_ystride)
895
23.7M
    :"memory"
896
23.7M
  );
897
23.7M
}
898
899
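The routine above reduces to the scalar sketch below (illustrative only): the residue of an intra fragment is each source pixel minus the nominal offset of 128. The detail worth noting in the MMX version is how that constant is built without a memory load: pcmpeqw sets every word to 0xFFFF, psllw $15 leaves 0x8000, and psrlw $8 leaves 0x0080, i.e. {128}x4.

/*Illustrative scalar equivalent of oc_enc_frag_sub_128_mmx().*/
static void frag_sub_128_ref(ogg_int16_t _residue[64],
 const unsigned char *_src,int _ystride){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)_residue[i*8+j]=(ogg_int16_t)(_src[j]-128);
    _src+=_ystride;
  }
}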
void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
900
0
 const unsigned char *_src1,const unsigned char *_src2,int _ystride){
901
0
  oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride);
902
0
}
903
904
#endif