Coverage Report

Created: 2025-11-16 07:20

/src/theora/lib/x86/mmxencfrag.c
 Line|  Count|Source
    1|       |/********************************************************************
    2|       | *                                                                  *
    3|       | * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
    4|       | * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
    5|       | * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
    6|       | * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
    7|       | *                                                                  *
    8|       | * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
    9|       | * by the Xiph.Org Foundation https://www.xiph.org/                 *
   10|       | *                                                                  *
   11|       | ********************************************************************
   12|       |
   13|       |  function:
   14|       |
   15|       | ********************************************************************/
   16|       |#include <stddef.h>
   17|       |#include "x86enc.h"
   18|       |
   19|       |#if defined(OC_X86_ASM)
   20|       |
   21|       |unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
   22|  16.3M| const unsigned char *_ref,int _ystride){
   23|  16.3M|  ptrdiff_t ystride3;
   24|  16.3M|  ptrdiff_t ret;
   25|  16.3M|  __asm__ __volatile__(
   26|       |    /*Load the first 4 rows of each block.*/
   27|  16.3M|    "movq (%[src]),%%mm0\n\t"
   28|  16.3M|    "movq (%[ref]),%%mm1\n\t"
   29|  16.3M|    "movq (%[src],%[ystride]),%%mm2\n\t"
   30|  16.3M|    "movq (%[ref],%[ystride]),%%mm3\n\t"
   31|  16.3M|    "lea (%[ystride],%[ystride],2),%[ystride3]\n\t"
   32|  16.3M|    "movq (%[src],%[ystride],2),%%mm4\n\t"
   33|  16.3M|    "movq (%[ref],%[ystride],2),%%mm5\n\t"
   34|  16.3M|    "movq (%[src],%[ystride3]),%%mm6\n\t"
   35|  16.3M|    "movq (%[ref],%[ystride3]),%%mm7\n\t"
   36|       |    /*Compute their SADs and add them in %%mm0*/
   37|  16.3M|    "psadbw %%mm1,%%mm0\n\t"
   38|  16.3M|    "psadbw %%mm3,%%mm2\n\t"
   39|  16.3M|    "lea (%[src],%[ystride],4),%[src]\n\t"
   40|  16.3M|    "paddw %%mm2,%%mm0\n\t"
   41|  16.3M|    "lea (%[ref],%[ystride],4),%[ref]\n\t"
   42|       |    /*Load the next 3 rows as registers become available.*/
   43|  16.3M|    "movq (%[src]),%%mm2\n\t"
   44|  16.3M|    "movq (%[ref]),%%mm3\n\t"
   45|  16.3M|    "psadbw %%mm5,%%mm4\n\t"
   46|  16.3M|    "psadbw %%mm7,%%mm6\n\t"
   47|  16.3M|    "paddw %%mm4,%%mm0\n\t"
   48|  16.3M|    "movq (%[ref],%[ystride]),%%mm5\n\t"
   49|  16.3M|    "movq (%[src],%[ystride]),%%mm4\n\t"
   50|  16.3M|    "paddw %%mm6,%%mm0\n\t"
   51|  16.3M|    "movq (%[ref],%[ystride],2),%%mm7\n\t"
   52|  16.3M|    "movq (%[src],%[ystride],2),%%mm6\n\t"
   53|       |    /*Start adding their SADs to %%mm0*/
   54|  16.3M|    "psadbw %%mm3,%%mm2\n\t"
   55|  16.3M|    "psadbw %%mm5,%%mm4\n\t"
   56|  16.3M|    "paddw %%mm2,%%mm0\n\t"
   57|  16.3M|    "psadbw %%mm7,%%mm6\n\t"
   58|       |    /*Load last row as registers become available.*/
   59|  16.3M|    "movq (%[src],%[ystride3]),%%mm2\n\t"
   60|  16.3M|    "movq (%[ref],%[ystride3]),%%mm3\n\t"
   61|       |    /*And finish adding up their SADs.*/
   62|  16.3M|    "paddw %%mm4,%%mm0\n\t"
   63|  16.3M|    "psadbw %%mm3,%%mm2\n\t"
   64|  16.3M|    "paddw %%mm6,%%mm0\n\t"
   65|  16.3M|    "paddw %%mm2,%%mm0\n\t"
   66|  16.3M|    "movd %%mm0,%[ret]\n\t"
   67|  16.3M|    :[ret]"=a"(ret),[src]"+r"(_src),[ref]"+r"(_ref),[ystride3]"=&r"(ystride3)
   68|  16.3M|    :[ystride]"r"((ptrdiff_t)_ystride)
   69|  16.3M|  );
   70|  16.3M|  return (unsigned)ret;
   71|  16.3M|}
   72|       |
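For reference, here is a minimal portable C sketch of what the routine above computes (frag_sad_c is a hypothetical name, not part of this file): the sum of absolute differences over an 8x8 fragment. Each psadbw above sums |src[i]-ref[i]| across 8 bytes, so eight of them plus seven paddw's cover the whole fragment.

    #include <stdlib.h>
    static unsigned frag_sad_c(const unsigned char *_src,
     const unsigned char *_ref,int _ystride){
      unsigned sad;
      int      i;
      int      j;
      sad=0;
      for(i=0;i<8;i++){
        /*Accumulate |_src[j]-_ref[j]| across one row, as psadbw does.*/
        for(j=0;j<8;j++)sad+=abs(_src[j]-_ref[j]);
        _src+=_ystride;
        _ref+=_ystride;
      }
      return sad;
    }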
   73|       |unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
   74|      0| const unsigned char *_ref,int _ystride,unsigned _thresh){
   75|       |  /*Early termination is for suckers.*/
   76|      0|  return oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
   77|      0|}
   78|       |
   79|       |/*Assumes the first two rows of %[ref1] and %[ref2] are in %%mm0...%%mm3, the
   80|       |   first two rows of %[src] are in %%mm4,%%mm5, and {1}x8 is in %%mm7.
   81|       |  We pre-load the next two rows of data as registers become available.*/
   82|       |#define OC_SAD2_LOOP \
   83|       | "#OC_SAD2_LOOP\n\t" \
   84|       | /*We want to compute (%%mm0+%%mm1>>1) on unsigned bytes without overflow, but \
   85|       |    pavgb computes (%%mm0+%%mm1+1>>1). \
   86|       |   The latter is exactly 1 too large when the low bit of two corresponding \
   87|       |    bytes is only set in one of them. \
   88|       |   Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
   89|       |    correct the output of pavgb. \
   90|       |   TODO: This should be rewritten to compute ~pavgb(~a,~b) instead, which \
   91|       |    schedules better; currently, however, this function is unused.*/ \
   92|       | "movq %%mm0,%%mm6\n\t" \
   93|       | "lea (%[ref1],%[ystride],2),%[ref1]\n\t" \
   94|       | "pxor %%mm1,%%mm0\n\t" \
   95|       | "pavgb %%mm1,%%mm6\n\t" \
   96|       | "lea (%[ref2],%[ystride],2),%[ref2]\n\t" \
   97|       | "movq %%mm2,%%mm1\n\t" \
   98|       | "pand %%mm7,%%mm0\n\t" \
   99|       | "pavgb %%mm3,%%mm2\n\t" \
  100|       | "pxor %%mm3,%%mm1\n\t" \
  101|       | "movq (%[ref2],%[ystride]),%%mm3\n\t" \
  102|       | "psubb %%mm0,%%mm6\n\t" \
  103|       | "movq (%[ref1]),%%mm0\n\t" \
  104|       | "pand %%mm7,%%mm1\n\t" \
  105|       | "psadbw %%mm6,%%mm4\n\t" \
  106|       | "movd %[ret],%%mm6\n\t" \
  107|       | "psubb %%mm1,%%mm2\n\t" \
  108|       | "movq (%[ref2]),%%mm1\n\t" \
  109|       | "lea (%[src],%[ystride],2),%[src]\n\t" \
  110|       | "psadbw %%mm2,%%mm5\n\t" \
  111|       | "movq (%[ref1],%[ystride]),%%mm2\n\t" \
  112|       | "paddw %%mm4,%%mm5\n\t" \
  113|       | "movq (%[src]),%%mm4\n\t" \
  114|       | "paddw %%mm5,%%mm6\n\t" \
  115|       | "movq (%[src],%[ystride]),%%mm5\n\t" \
  116|       | "movd %%mm6,%[ret]\n\t" \
  117|       |
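The correction described in the comment above can be checked in the scalar domain; a minimal sketch, assuming nothing beyond the identity the comment states (avg_floor is a hypothetical helper):

    static unsigned char avg_floor(unsigned char _a,unsigned char _b){
      unsigned char avg_round;
      /*What pavgb computes: (a+b+1)>>1.*/
      avg_round=(unsigned char)((_a+_b+1)>>1);
      /*(_a^_b)&1 is the pxor/pand of the macro; the rounded average is 1 too
         large exactly when a+b is odd, so psubb'ing that bit yields the
         truncating average (a+b)>>1.*/
      return (unsigned char)(avg_round-((_a^_b)&1));
    }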
  118|       |/*Same as above, but does not pre-load the next two rows.*/
  119|       |#define OC_SAD2_TAIL \
  120|       | "#OC_SAD2_TAIL\n\t" \
  121|       | "movq %%mm0,%%mm6\n\t" \
  122|       | "pavgb %%mm1,%%mm0\n\t" \
  123|       | "pxor %%mm1,%%mm6\n\t" \
  124|       | "movq %%mm2,%%mm1\n\t" \
  125|       | "pand %%mm7,%%mm6\n\t" \
  126|       | "pavgb %%mm3,%%mm2\n\t" \
  127|       | "pxor %%mm3,%%mm1\n\t" \
  128|       | "psubb %%mm6,%%mm0\n\t" \
  129|       | "pand %%mm7,%%mm1\n\t" \
  130|       | "psadbw %%mm0,%%mm4\n\t" \
  131|       | "psubb %%mm1,%%mm2\n\t" \
  132|       | "movd %[ret],%%mm6\n\t" \
  133|       | "psadbw %%mm2,%%mm5\n\t" \
  134|       | "paddw %%mm4,%%mm5\n\t" \
  135|       | "paddw %%mm5,%%mm6\n\t" \
  136|       | "movd %%mm6,%[ret]\n\t" \
  137|       |
  138|       |unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
  139|       | const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
  140|      0| unsigned _thresh){
  141|      0|  ptrdiff_t ret;
  142|      0|  __asm__ __volatile__(
  143|      0|    "movq (%[ref1]),%%mm0\n\t"
  144|      0|    "movq (%[ref2]),%%mm1\n\t"
  145|      0|    "movq (%[ref1],%[ystride]),%%mm2\n\t"
  146|      0|    "movq (%[ref2],%[ystride]),%%mm3\n\t"
  147|      0|    "xor %[ret],%[ret]\n\t"
  148|      0|    "movq (%[src]),%%mm4\n\t"
  149|      0|    "pxor %%mm7,%%mm7\n\t"
  150|      0|    "pcmpeqb %%mm6,%%mm6\n\t"
  151|      0|    "movq (%[src],%[ystride]),%%mm5\n\t"
  152|      0|    "psubb %%mm6,%%mm7\n\t"
  153|      0|    OC_SAD2_LOOP
  154|      0|    OC_SAD2_LOOP
  155|      0|    OC_SAD2_LOOP
  156|      0|    OC_SAD2_TAIL
  157|      0|    :[ret]"=&a"(ret),[src]"+r"(_src),[ref1]"+r"(_ref1),[ref2]"+r"(_ref2)
  158|      0|    :[ystride]"r"((ptrdiff_t)_ystride)
  159|      0|  );
  160|      0|  return (unsigned)ret;
  161|      0|}
  162|       |
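Behaviorally, the routine above is equivalent to this portable sketch (frag_sad2_thresh_c is a hypothetical name, not part of this file): the SAD of the source fragment against the truncating per-pixel average of the two reference fragments, with _thresh accepted but ignored, just like the assembly.

    #include <stdlib.h>
    static unsigned frag_sad2_thresh_c(const unsigned char *_src,
     const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
     unsigned _thresh){
      unsigned sad;
      int      i;
      int      j;
      (void)_thresh;
      sad=0;
      for(i=0;i<8;i++){
        /*Truncating average of the two references, then SAD against src.*/
        for(j=0;j<8;j++)sad+=abs(_src[j]-((_ref1[j]+_ref2[j])>>1));
        _src+=_ystride;
        _ref1+=_ystride;
        _ref2+=_ystride;
      }
      return sad;
    }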
  163|       |/*Load an 8x4 array of pixel values from %[src] and %[ref] and compute their
  164|       |   16-bit difference in %%mm0...%%mm7.*/
  165|       |#define OC_LOAD_SUB_8x4(_off) \
  166|       | "#OC_LOAD_SUB_8x4\n\t" \
  167|       | "movd "#_off"(%[src]),%%mm0\n\t" \
  168|       | "movd "#_off"(%[ref]),%%mm4\n\t" \
  169|       | "movd "#_off"(%[src],%[src_ystride]),%%mm1\n\t" \
  170|       | "lea (%[src],%[src_ystride],2),%[src]\n\t" \
  171|       | "movd "#_off"(%[ref],%[ref_ystride]),%%mm5\n\t" \
  172|       | "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
  173|       | "movd "#_off"(%[src]),%%mm2\n\t" \
  174|       | "movd "#_off"(%[ref]),%%mm7\n\t" \
  175|       | "movd "#_off"(%[src],%[src_ystride]),%%mm3\n\t" \
  176|       | "movd "#_off"(%[ref],%[ref_ystride]),%%mm6\n\t" \
  177|       | "punpcklbw %%mm4,%%mm0\n\t" \
  178|       | "lea (%[src],%[src_ystride],2),%[src]\n\t" \
  179|       | "punpcklbw %%mm4,%%mm4\n\t" \
  180|       | "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
  181|       | "psubw %%mm4,%%mm0\n\t" \
  182|       | "movd "#_off"(%[src]),%%mm4\n\t" \
  183|       | "movq %%mm0,"OC_MEM_OFFS(_off*2,buf)"\n\t" \
  184|       | "movd "#_off"(%[ref]),%%mm0\n\t" \
  185|       | "punpcklbw %%mm5,%%mm1\n\t" \
  186|       | "punpcklbw %%mm5,%%mm5\n\t" \
  187|       | "psubw %%mm5,%%mm1\n\t" \
  188|       | "movd "#_off"(%[src],%[src_ystride]),%%mm5\n\t" \
  189|       | "punpcklbw %%mm7,%%mm2\n\t" \
  190|       | "punpcklbw %%mm7,%%mm7\n\t" \
  191|       | "psubw %%mm7,%%mm2\n\t" \
  192|       | "movd "#_off"(%[ref],%[ref_ystride]),%%mm7\n\t" \
  193|       | "punpcklbw %%mm6,%%mm3\n\t" \
  194|       | "lea (%[src],%[src_ystride],2),%[src]\n\t" \
  195|       | "punpcklbw %%mm6,%%mm6\n\t" \
  196|       | "psubw %%mm6,%%mm3\n\t" \
  197|       | "movd "#_off"(%[src]),%%mm6\n\t" \
  198|       | "punpcklbw %%mm0,%%mm4\n\t" \
  199|       | "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
  200|       | "punpcklbw %%mm0,%%mm0\n\t" \
  201|       | "lea (%[src],%[src_ystride],2),%[src]\n\t" \
  202|       | "psubw %%mm0,%%mm4\n\t" \
  203|       | "movd "#_off"(%[ref]),%%mm0\n\t" \
  204|       | "punpcklbw %%mm7,%%mm5\n\t" \
  205|       | "neg %[src_ystride]\n\t" \
  206|       | "punpcklbw %%mm7,%%mm7\n\t" \
  207|       | "psubw %%mm7,%%mm5\n\t" \
  208|       | "movd "#_off"(%[src],%[src_ystride]),%%mm7\n\t" \
  209|       | "punpcklbw %%mm0,%%mm6\n\t" \
  210|       | "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
  211|       | "punpcklbw %%mm0,%%mm0\n\t" \
  212|       | "neg %[ref_ystride]\n\t" \
  213|       | "psubw %%mm0,%%mm6\n\t" \
  214|       | "movd "#_off"(%[ref],%[ref_ystride]),%%mm0\n\t" \
  215|       | "lea (%[src],%[src_ystride],8),%[src]\n\t" \
  216|       | "punpcklbw %%mm0,%%mm7\n\t" \
  217|       | "neg %[src_ystride]\n\t" \
  218|       | "punpcklbw %%mm0,%%mm0\n\t" \
  219|       | "lea (%[ref],%[ref_ystride],8),%[ref]\n\t" \
  220|       | "psubw %%mm0,%%mm7\n\t" \
  221|       | "neg %[ref_ystride]\n\t" \
  222|       | "movq "OC_MEM_OFFS(_off*2,buf)",%%mm0\n\t" \
  223|       |
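The repeated punpcklbw/punpcklbw/psubw pattern above widens bytes to words and subtracts in a single pass. A scalar sketch of the idea, under my reading of the macro (sub_widen is a hypothetical helper):

    static short sub_widen(unsigned char _src,unsigned char _ref){
      unsigned short s;
      unsigned short r;
      s=(unsigned short)(_ref<<8|_src);  /*punpcklbw ref,src*/
      r=(unsigned short)(_ref<<8|_ref);  /*punpcklbw ref,ref*/
      /*The high bytes cancel, so this is src-ref as a signed 16-bit value.*/
      return (short)(s-r);
    }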
  224|       |/*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/
  225|       |#define OC_LOAD_8x4(_off) \
  226|       | "#OC_LOAD_8x4\n\t" \
  227|       | "movd "#_off"(%[src]),%%mm0\n\t" \
  228|       | "movd "#_off"(%[src],%[ystride]),%%mm1\n\t" \
  229|       | "movd "#_off"(%[src],%[ystride],2),%%mm2\n\t" \
  230|       | "pxor %%mm7,%%mm7\n\t" \
  231|       | "movd "#_off"(%[src],%[ystride3]),%%mm3\n\t" \
  232|       | "punpcklbw %%mm7,%%mm0\n\t" \
  233|       | "movd "#_off"(%[src4]),%%mm4\n\t" \
  234|       | "punpcklbw %%mm7,%%mm1\n\t" \
  235|       | "movd "#_off"(%[src4],%[ystride]),%%mm5\n\t" \
  236|       | "punpcklbw %%mm7,%%mm2\n\t" \
  237|       | "movd "#_off"(%[src4],%[ystride],2),%%mm6\n\t" \
  238|       | "punpcklbw %%mm7,%%mm3\n\t" \
  239|       | "movd "#_off"(%[src4],%[ystride3]),%%mm7\n\t" \
  240|       | "punpcklbw %%mm4,%%mm4\n\t" \
  241|       | "punpcklbw %%mm5,%%mm5\n\t" \
  242|       | "psrlw $8,%%mm4\n\t" \
  243|       | "psrlw $8,%%mm5\n\t" \
  244|       | "punpcklbw %%mm6,%%mm6\n\t" \
  245|       | "punpcklbw %%mm7,%%mm7\n\t" \
  246|       | "psrlw $8,%%mm6\n\t" \
  247|       | "psrlw $8,%%mm7\n\t" \
  248|       |
  249|       |/*Performs the first two stages of an 8-point 1-D Hadamard transform.
  250|       |  The transform is performed in place, except that outputs 0-3 are swapped with
  251|       |   outputs 4-7.
  252|       |  Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to
  253|       |   perform this stage in place with no temporary registers).*/
  254|       |#define OC_HADAMARD_AB_8x4 \
  255|       | "#OC_HADAMARD_AB_8x4\n\t" \
  256|       | /*Stage A: \
  257|       |   Outputs 0-3 are swapped with 4-7 here.*/ \
  258|       | "paddw %%mm1,%%mm5\n\t" \
  259|       | "paddw %%mm2,%%mm6\n\t" \
  260|       | "paddw %%mm1,%%mm1\n\t" \
  261|       | "paddw %%mm2,%%mm2\n\t" \
  262|       | "psubw %%mm5,%%mm1\n\t" \
  263|       | "psubw %%mm6,%%mm2\n\t" \
  264|       | "paddw %%mm3,%%mm7\n\t" \
  265|       | "paddw %%mm0,%%mm4\n\t" \
  266|       | "paddw %%mm3,%%mm3\n\t" \
  267|       | "paddw %%mm0,%%mm0\n\t" \
  268|       | "psubw %%mm7,%%mm3\n\t" \
  269|       | "psubw %%mm4,%%mm0\n\t" \
  270|       | /*Stage B:*/ \
  271|       | "paddw %%mm2,%%mm0\n\t" \
  272|       | "paddw %%mm3,%%mm1\n\t" \
  273|       | "paddw %%mm6,%%mm4\n\t" \
  274|       | "paddw %%mm7,%%mm5\n\t" \
  275|       | "paddw %%mm2,%%mm2\n\t" \
  276|       | "paddw %%mm3,%%mm3\n\t" \
  277|       | "paddw %%mm6,%%mm6\n\t" \
  278|       | "paddw %%mm7,%%mm7\n\t" \
  279|       | "psubw %%mm0,%%mm2\n\t" \
  280|       | "psubw %%mm1,%%mm3\n\t" \
  281|       | "psubw %%mm4,%%mm6\n\t" \
  282|       | "psubw %%mm5,%%mm7\n\t" \
  283|       |
  284|       |/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
  285|       |  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
  286|       |   place with no temporary registers).*/
  287|       |#define OC_HADAMARD_C_8x4 \
  288|       | "#OC_HADAMARD_C_8x4\n\t" \
  289|       | /*Stage C:*/ \
  290|       | "paddw %%mm1,%%mm0\n\t" \
  291|       | "paddw %%mm3,%%mm2\n\t" \
  292|       | "paddw %%mm5,%%mm4\n\t" \
  293|       | "paddw %%mm7,%%mm6\n\t" \
  294|       | "paddw %%mm1,%%mm1\n\t" \
  295|       | "paddw %%mm3,%%mm3\n\t" \
  296|       | "paddw %%mm5,%%mm5\n\t" \
  297|       | "paddw %%mm7,%%mm7\n\t" \
  298|       | "psubw %%mm0,%%mm1\n\t" \
  299|       | "psubw %%mm2,%%mm3\n\t" \
  300|       | "psubw %%mm4,%%mm5\n\t" \
  301|       | "psubw %%mm6,%%mm7\n\t" \
  302|       |
  303|       |/*Performs an 8-point 1-D Hadamard transform.
  304|       |  The transform is performed in place, except that outputs 0-3 are swapped with
  305|       |   outputs 4-7.
  306|       |  Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform
  307|       |   in place with no temporary registers).*/
  308|       |#define OC_HADAMARD_8x4 \
  309|       | OC_HADAMARD_AB_8x4 \
  310|       | OC_HADAMARD_C_8x4 \
  311|       |
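All of the stages above use the same in-place, two-register butterfly; doubling one operand substitutes for the missing temporary. A scalar sketch (butterfly is a hypothetical helper):

    static void butterfly(short *_a,short *_b){
      *_b=(short)(*_b+*_a);  /*paddw: b=a0+b0.*/
      *_a=(short)(*_a+*_a);  /*paddw: a=2*a0.*/
      *_a=(short)(*_a-*_b);  /*psubw: a=2*a0-(a0+b0)=a0-b0.*/
    }

Doubling the other operand of a pair instead produces b0-a0, which is why the surrounding comments note that some outputs come out negated.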
  312|       |/*Performs the first part of the final stage of the Hadamard transform and
  313|       |   summing of absolute values.
  314|       |  At the end of this part, %%mm1 will contain the DC coefficient of the
  315|       |   transform.*/
  316|       |#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
  317|       | /*We use the fact that \
  318|       |     (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
  319|       |    to merge the final butterfly with the abs and the first stage of \
  320|       |    accumulation. \
  321|       |   Thus we can avoid using pabsw, which is not available until SSSE3. \
  322|       |   Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
  323|       |    implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
  324|       |    registers). \
  325|       |   Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
  326|       |   This implementation is only 26 (+4 for spilling registers).*/ \
  327|       | "#OC_HADAMARD_C_ABS_ACCUM_A_8x4\n\t" \
  328|       | "movq %%mm7,"OC_MEM_OFFS(_r7,buf)"\n\t" \
  329|       | "movq %%mm6,"OC_MEM_OFFS(_r6,buf)"\n\t" \
  330|       | /*mm7={0x7FFF}x4 \
  331|       |   mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
  332|       | "pcmpeqb %%mm7,%%mm7\n\t" \
  333|       | "movq %%mm0,%%mm6\n\t" \
  334|       | "psrlw $1,%%mm7\n\t" \
  335|       | "paddw %%mm1,%%mm6\n\t" \
  336|       | "pmaxsw %%mm1,%%mm0\n\t" \
  337|       | "paddsw %%mm7,%%mm6\n\t" \
  338|       | "psubw %%mm6,%%mm0\n\t" \
  339|       | /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
  340|       |   mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \
  341|       | "movq %%mm2,%%mm6\n\t" \
  342|       | "movq %%mm4,%%mm1\n\t" \
  343|       | "pmaxsw %%mm3,%%mm2\n\t" \
  344|       | "pmaxsw %%mm5,%%mm4\n\t" \
  345|       | "paddw %%mm3,%%mm6\n\t" \
  346|       | "paddw %%mm5,%%mm1\n\t" \
  347|       | "movq "OC_MEM_OFFS(_r7,buf)",%%mm3\n\t" \
  348|       |
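The identity the comment above relies on is easy to verify in scalar code (hypothetical test helper, not part of the file): the operand of larger magnitude fixes the sign of both a+b and a-b, so the two absolute values always sum to twice the larger magnitude.

    #include <assert.h>
    #include <stdlib.h>
    static void check_maxabs(int _a,int _b){
      int m;
      m=abs(_a)>abs(_b)?abs(_a):abs(_b);
      /*(abs(a+b)+abs(a-b))/2==max(abs(a),abs(b)); the sum is always even,
         so the integer division is exact.*/
      assert((abs(_a+_b)+abs(_a-_b))/2==m);
    }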
  349|       |/*Performs the second part of the final stage of the Hadamard transform and
  350|       |   summing of absolute values.*/
  351|       |#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
  352|       | "#OC_HADAMARD_C_ABS_ACCUM_B_8x4\n\t" \
  353|       | "paddsw %%mm7,%%mm6\n\t" \
  354|       | "movq "OC_MEM_OFFS(_r6,buf)",%%mm5\n\t" \
  355|       | "paddsw %%mm7,%%mm1\n\t" \
  356|       | "psubw %%mm6,%%mm2\n\t" \
  357|       | "psubw %%mm1,%%mm4\n\t" \
  358|       | /*mm7={1}x4 (needed for the horizontal add that follows) \
  359|       |   mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
  360|       | "movq %%mm3,%%mm6\n\t" \
  361|       | "pmaxsw %%mm5,%%mm3\n\t" \
  362|       | "paddw %%mm2,%%mm0\n\t" \
  363|       | "paddw %%mm5,%%mm6\n\t" \
  364|       | "paddw %%mm4,%%mm0\n\t" \
  365|       | "paddsw %%mm7,%%mm6\n\t" \
  366|       | "paddw %%mm3,%%mm0\n\t" \
  367|       | "psrlw $14,%%mm7\n\t" \
  368|       | "psubw %%mm6,%%mm0\n\t" \
  369|       |
  370|       |/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
  371|       |   absolute value of each component, and accumulates everything into mm0.
  372|       |  This is the only portion of SATD which requires MMXEXT (we could use plain
  373|       |   MMX, but it takes 4 instructions and an extra register to work around the
  374|       |   lack of a pmaxsw, which is a pretty serious penalty).*/
  375|       |#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \
  376|       | OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
  377|       | OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
  378|       |
  379|       |/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
  380|       |   component, and accumulates everything into mm0.
  381|       |  Note that mm0 will have an extra 4 added to each column, and that after
  382|       |   removing this value, the remainder will be half the conventional value.*/
  383|       |#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) \
  384|       | OC_HADAMARD_AB_8x4 \
  385|       | OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7)
  386|       |
  387|       |/*Performs two 4x4 transposes (mostly) in place.
  388|       |  On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7}
  389|       |   contains rows {a,b,c,d}.
  390|       |  On output, {0x40,0x50,0x60,0x70}+_off(%[buf]) contains {e,f,g,h}^T, and
  391|       |   {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/
  392|       |#define OC_TRANSPOSE_4x4x2(_off) \
  393|       | "#OC_TRANSPOSE_4x4x2\n\t" \
  394|       | /*First 4x4 transpose:*/ \
  395|       | "movq %%mm5,"OC_MEM_OFFS(0x10+(_off),buf)"\n\t" \
  396|       | /*mm0 = e3 e2 e1 e0 \
  397|       |   mm1 = f3 f2 f1 f0 \
  398|       |   mm2 = g3 g2 g1 g0 \
  399|       |   mm3 = h3 h2 h1 h0*/ \
  400|       | "movq %%mm2,%%mm5\n\t" \
  401|       | "punpcklwd %%mm3,%%mm2\n\t" \
  402|       | "punpckhwd %%mm3,%%mm5\n\t" \
  403|       | "movq %%mm0,%%mm3\n\t" \
  404|       | "punpcklwd %%mm1,%%mm0\n\t" \
  405|       | "punpckhwd %%mm1,%%mm3\n\t" \
  406|       | /*mm0 = f1 e1 f0 e0 \
  407|       |   mm3 = f3 e3 f2 e2 \
  408|       |   mm2 = h1 g1 h0 g0 \
  409|       |   mm5 = h3 g3 h2 g2*/ \
  410|       | "movq %%mm0,%%mm1\n\t" \
  411|       | "punpckldq %%mm2,%%mm0\n\t" \
  412|       | "punpckhdq %%mm2,%%mm1\n\t" \
  413|       | "movq %%mm3,%%mm2\n\t" \
  414|       | "punpckhdq %%mm5,%%mm3\n\t" \
  415|       | "movq %%mm0,"OC_MEM_OFFS(0x40+(_off),buf)"\n\t" \
  416|       | "punpckldq %%mm5,%%mm2\n\t" \
  417|       | /*mm0 = h0 g0 f0 e0 \
  418|       |   mm1 = h1 g1 f1 e1 \
  419|       |   mm2 = h2 g2 f2 e2 \
  420|       |   mm3 = h3 g3 f3 e3*/ \
  421|       | "movq "OC_MEM_OFFS(0x10+(_off),buf)",%%mm5\n\t" \
  422|       | /*Second 4x4 transpose:*/ \
  423|       | /*mm4 = a3 a2 a1 a0 \
  424|       |   mm5 = b3 b2 b1 b0 \
  425|       |   mm6 = c3 c2 c1 c0 \
  426|       |   mm7 = d3 d2 d1 d0*/ \
  427|       | "movq %%mm6,%%mm0\n\t" \
  428|       | "punpcklwd %%mm7,%%mm6\n\t" \
  429|       | "movq %%mm1,"OC_MEM_OFFS(0x50+(_off),buf)"\n\t" \
  430|       | "punpckhwd %%mm7,%%mm0\n\t" \
  431|       | "movq %%mm4,%%mm7\n\t" \
  432|       | "punpcklwd %%mm5,%%mm4\n\t" \
  433|       | "movq %%mm2,"OC_MEM_OFFS(0x60+(_off),buf)"\n\t" \
  434|       | "punpckhwd %%mm5,%%mm7\n\t" \
  435|       | /*mm4 = b1 a1 b0 a0 \
  436|       |   mm7 = b3 a3 b2 a2 \
  437|       |   mm6 = d1 c1 d0 c0 \
  438|       |   mm0 = d3 c3 d2 c2*/ \
  439|       | "movq %%mm4,%%mm5\n\t" \
  440|       | "punpckldq %%mm6,%%mm4\n\t" \
  441|       | "movq %%mm3,"OC_MEM_OFFS(0x70+(_off),buf)"\n\t" \
  442|       | "punpckhdq %%mm6,%%mm5\n\t" \
  443|       | "movq %%mm7,%%mm6\n\t" \
  444|       | "punpckhdq %%mm0,%%mm7\n\t" \
  445|       | "punpckldq %%mm0,%%mm6\n\t" \
  446|       | /*mm4 = d0 c0 b0 a0 \
  447|       |   mm5 = d1 c1 b1 a1 \
  448|       |   mm6 = d2 c2 b2 a2 \
  449|       |   mm7 = d3 c3 b3 a3*/ \
  450|       |
  451|       |static unsigned oc_int_frag_satd_mmxext(int *_dc,
  452|       | const unsigned char *_src,int _src_ystride,
  453|      0| const unsigned char *_ref,int _ref_ystride){
  454|      0|  OC_ALIGN8(ogg_int16_t buf[64]);
  455|      0|  unsigned ret;
  456|      0|  unsigned ret2;
  457|      0|  int      dc;
  458|      0|  __asm__ __volatile__(
  459|      0|    OC_LOAD_SUB_8x4(0x00)
  460|      0|    OC_HADAMARD_8x4
  461|      0|    OC_TRANSPOSE_4x4x2(0x00)
  462|       |    /*Finish swapping out this 8x4 block to make room for the next one.
  463|       |      mm0...mm3 have been swapped out already.*/
  464|      0|    "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
  465|      0|    "movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t"
  466|      0|    "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
  467|      0|    "movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t"
  468|      0|    OC_LOAD_SUB_8x4(0x04)
  469|      0|    OC_HADAMARD_8x4
  470|      0|    OC_TRANSPOSE_4x4x2(0x08)
  471|       |    /*Here the first 4x4 block of output from the last transpose is the second
  472|       |       4x4 block of input for the next transform.
  473|       |      We have cleverly arranged that it already be in the appropriate place, so
  474|       |       we only have to do half the loads.*/
  475|      0|    "movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t"
  476|      0|    "movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t"
  477|      0|    "movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t"
  478|      0|    "movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t"
  479|       |    /*We split out the stages here so we can save the DC coefficient in the
  480|       |       middle.*/
  481|      0|    OC_HADAMARD_AB_8x4
  482|      0|    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
  483|      0|    "movd %%mm1,%[dc]\n\t"
  484|      0|    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
  485|       |    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
  486|       |       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
  487|       |       for the factor of two we dropped + 3 for the vertical accumulation).
  488|       |      Now we finally have to promote things to dwords.
  489|       |      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
  490|       |       latency of pmaddwd by starting the next series of loads now.*/
  491|      0|    "pmaddwd %%mm7,%%mm0\n\t"
  492|      0|    "movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"
  493|      0|    "movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"
  494|      0|    "movq %%mm0,%%mm4\n\t"
  495|      0|    "movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"
  496|      0|    "punpckhdq %%mm0,%%mm0\n\t"
  497|      0|    "movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"
  498|      0|    "paddd %%mm0,%%mm4\n\t"
  499|      0|    "movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
  500|      0|    "movd %%mm4,%[ret2]\n\t"
  501|      0|    "movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
  502|      0|    "movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
  503|      0|    "movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
  504|      0|    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
  505|      0|    "pmaddwd %%mm7,%%mm0\n\t"
  506|       |    /*Subtract abs(dc) from 2*ret2.*/
  507|      0|    "movsx %w[dc],%[dc]\n\t"
  508|      0|    "cdq\n\t"
  509|      0|    "lea (%[ret],%[ret2],2),%[ret2]\n\t"
  510|      0|    "movq %%mm0,%%mm4\n\t"
  511|      0|    "punpckhdq %%mm0,%%mm0\n\t"
  512|      0|    "xor %[dc],%[ret]\n\t"
  513|      0|    "paddd %%mm0,%%mm4\n\t"
  514|       |    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
  515|       |       added to them, a factor of two removed, and the DC value included;
  516|       |       correct the final sum here.*/
  517|      0|    "sub %[ret],%[ret2]\n\t"
  518|      0|    "movd %%mm4,%[ret]\n\t"
  519|      0|    "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
  520|       |    /*Although it looks like we're using 8 registers here, gcc can alias %[ret]
  521|       |       and %[ret2] with some of the inputs, since for once we don't write to
  522|       |       them until after we're done using everything but %[buf].*/
  523|       |    /*Note that _src_ystride and _ref_ystride must be given non-overlapping
  524|       |       constraints, otherwise if gcc can prove they're equal it will allocate
  525|       |       them to the same register (which is bad); _src and _ref face a similar
  526|       |       problem, though those are never actually the same.*/
  527|      0|    :[ret]"=d"(ret),[ret2]"=r"(ret2),[dc]"=a"(dc),
  528|      0|     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
  529|      0|    :[src]"r"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
  530|      0|     [ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
  531|       |    /*We have to use neg, so we actually clobber the condition codes for once
  532|       |       (not to mention cmp, sub, and add).*/
  533|      0|    :"cc"
  534|      0|  );
  535|      0|  *_dc=dc;
  536|      0|  return ret;
  537|      0|}
  538|       |
  539|       |unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
  540|      0| const unsigned char *_ref,int _ystride){
  541|      0|  return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);
  542|      0|}
  543|       |
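For orientation, a straightforward (and much slower) C sketch of the quantity computed above (hadamard8 and frag_satd_c are hypothetical names, not this library's C fallback): Hadamard-transform the 8x8 difference block, sum the absolute values of the coefficients, and return the DC coefficient separately, excluding its magnitude from the sum. The assembly keeps intermediates at half scale and rescales at the end, as its comments describe, so the bookkeeping differs even though the intended result is the same.

    #include <stdlib.h>
    static void hadamard8(short _x[8]){
      int s;
      int i;
      int j;
      /*Three stages of butterflies: an unscaled 8-point Hadamard.*/
      for(s=1;s<8;s<<=1){
        for(i=0;i<8;i+=s<<1)for(j=i;j<i+s;j++){
          short a;
          short b;
          a=_x[j];
          b=_x[j+s];
          _x[j]=(short)(a+b);
          _x[j+s]=(short)(a-b);
        }
      }
    }
    static unsigned frag_satd_c(int *_dc,const unsigned char *_src,
     int _src_ystride,const unsigned char *_ref,int _ref_ystride){
      short    buf[8][8];
      short    col[8];
      unsigned satd;
      int      i;
      int      j;
      /*16-bit differences, as OC_LOAD_SUB_8x4 produces.*/
      for(i=0;i<8;i++)for(j=0;j<8;j++){
        buf[i][j]=(short)(_src[i*_src_ystride+j]-_ref[i*_ref_ystride+j]);
      }
      for(i=0;i<8;i++)hadamard8(buf[i]);
      satd=0;
      for(j=0;j<8;j++){
        for(i=0;i<8;i++)col[i]=buf[i][j];
        hadamard8(col);
        for(i=0;i<8;i++)satd+=abs(col[i]);
        /*Column 0 of the row-transformed block carries the overall DC.*/
        if(j==0)*_dc=col[0];
      }
      return satd-(unsigned)abs(*_dc);
    }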
  544|       |/*Our internal implementation of frag_copy2 takes an extra stride parameter so
  545|       |   we can share code with oc_enc_frag_satd2_mmxext().*/
  546|       |void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
  547|  14.3M| const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
  548|  14.3M|  __asm__ __volatile__(
  549|       |    /*Load the first 3 rows.*/
  550|  14.3M|    "movq (%[src1]),%%mm0\n\t"
  551|  14.3M|    "movq (%[src2]),%%mm1\n\t"
  552|  14.3M|    "movq (%[src1],%[src_ystride]),%%mm2\n\t"
  553|  14.3M|    "lea (%[src1],%[src_ystride],2),%[src1]\n\t"
  554|  14.3M|    "movq (%[src2],%[src_ystride]),%%mm3\n\t"
  555|  14.3M|    "lea (%[src2],%[src_ystride],2),%[src2]\n\t"
  556|  14.3M|    "pxor %%mm7,%%mm7\n\t"
  557|  14.3M|    "movq (%[src1]),%%mm4\n\t"
  558|  14.3M|    "pcmpeqb %%mm6,%%mm6\n\t"
  559|  14.3M|    "movq (%[src2]),%%mm5\n\t"
  560|       |    /*mm7={1}x8.*/
  561|  14.3M|    "psubb %%mm6,%%mm7\n\t"
  562|       |    /*Start averaging %%mm0 and %%mm1 into %%mm6.*/
  563|  14.3M|    "movq %%mm0,%%mm6\n\t"
  564|  14.3M|    "pxor %%mm1,%%mm0\n\t"
  565|  14.3M|    "pavgb %%mm1,%%mm6\n\t"
  566|       |    /*%%mm1 is free, start averaging %%mm3 into %%mm2 using %%mm1.*/
  567|  14.3M|    "movq %%mm2,%%mm1\n\t"
  568|  14.3M|    "pand %%mm7,%%mm0\n\t"
  569|  14.3M|    "pavgb %%mm3,%%mm2\n\t"
  570|  14.3M|    "pxor %%mm3,%%mm1\n\t"
  571|       |    /*%%mm3 is free.*/
  572|  14.3M|    "psubb %%mm0,%%mm6\n\t"
  573|       |    /*%%mm0 is free, start loading the next row.*/
  574|  14.3M|    "movq (%[src1],%[src_ystride]),%%mm0\n\t"
  575|       |    /*Start averaging %%mm5 and %%mm4 using %%mm3.*/
  576|  14.3M|    "movq %%mm4,%%mm3\n\t"
  577|       |    /*%%mm6 (row 0) is done; write it out.*/
  578|  14.3M|    "movq %%mm6,(%[dst])\n\t"
  579|  14.3M|    "pand %%mm7,%%mm1\n\t"
  580|  14.3M|    "pavgb %%mm5,%%mm4\n\t"
  581|  14.3M|    "psubb %%mm1,%%mm2\n\t"
  582|       |    /*%%mm1 is free, continue loading the next row.*/
  583|  14.3M|    "movq (%[src2],%[src_ystride]),%%mm1\n\t"
  584|  14.3M|    "pxor %%mm5,%%mm3\n\t"
  585|  14.3M|    "lea (%[src1],%[src_ystride],2),%[src1]\n\t"
  586|       |    /*%%mm2 (row 1) is done; write it out.*/
  587|  14.3M|    "movq %%mm2,(%[dst],%[dst_ystride])\n\t"
  588|  14.3M|    "pand %%mm7,%%mm3\n\t"
  589|       |    /*Start loading the next row.*/
  590|  14.3M|    "movq (%[src1]),%%mm2\n\t"
  591|  14.3M|    "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
  592|  14.3M|    "psubb %%mm3,%%mm4\n\t"
  593|  14.3M|    "lea (%[src2],%[src_ystride],2),%[src2]\n\t"
  594|       |    /*%%mm4 (row 2) is done; write it out.*/
  595|  14.3M|    "movq %%mm4,(%[dst])\n\t"
  596|       |    /*Continue loading the next row.*/
  597|  14.3M|    "movq (%[src2]),%%mm3\n\t"
  598|       |    /*Start averaging %%mm0 and %%mm1 into %%mm6.*/
  599|  14.3M|    "movq %%mm0,%%mm6\n\t"
  600|  14.3M|    "pxor %%mm1,%%mm0\n\t"
  601|       |    /*Start loading the next row.*/
  602|  14.3M|    "movq (%[src1],%[src_ystride]),%%mm4\n\t"
  603|  14.3M|    "pavgb %%mm1,%%mm6\n\t"
  604|       |    /*%%mm1 is free; start averaging %%mm3 into %%mm2 using %%mm1.*/
  605|  14.3M|    "movq %%mm2,%%mm1\n\t"
  606|  14.3M|    "pand %%mm7,%%mm0\n\t"
  607|       |    /*Continue loading the next row.*/
  608|  14.3M|    "movq (%[src2],%[src_ystride]),%%mm5\n\t"
  609|  14.3M|    "pavgb %%mm3,%%mm2\n\t"
  610|  14.3M|    "lea (%[src1],%[src_ystride],2),%[src1]\n\t"
  611|  14.3M|    "pxor %%mm3,%%mm1\n\t"
  612|       |    /*%%mm3 is free.*/
  613|  14.3M|    "psubb %%mm0,%%mm6\n\t"
  614|       |    /*%%mm0 is free, start loading the next row.*/
  615|  14.3M|    "movq (%[src1]),%%mm0\n\t"
  616|       |    /*Start averaging %%mm5 into %%mm4 using %%mm3.*/
  617|  14.3M|    "movq %%mm4,%%mm3\n\t"
  618|       |    /*%%mm6 (row 3) is done; write it out.*/
  619|  14.3M|    "movq %%mm6,(%[dst],%[dst_ystride])\n\t"
  620|  14.3M|    "pand %%mm7,%%mm1\n\t"
  621|  14.3M|    "lea (%[src2],%[src_ystride],2),%[src2]\n\t"
  622|  14.3M|    "pavgb %%mm5,%%mm4\n\t"
  623|  14.3M|    "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
  624|  14.3M|    "psubb %%mm1,%%mm2\n\t"
  625|       |    /*%%mm1 is free; continue loading the next row.*/
  626|  14.3M|    "movq (%[src2]),%%mm1\n\t"
  627|  14.3M|    "pxor %%mm5,%%mm3\n\t"
  628|       |    /*%%mm2 (row 4) is done; write it out.*/
  629|  14.3M|    "movq %%mm2,(%[dst])\n\t"
  630|  14.3M|    "pand %%mm7,%%mm3\n\t"
  631|       |    /*Start loading the next row.*/
  632|  14.3M|    "movq (%[src1],%[src_ystride]),%%mm2\n\t"
  633|  14.3M|    "psubb %%mm3,%%mm4\n\t"
  634|       |    /*Start averaging %%mm0 and %%mm1 into %%mm6.*/
  635|  14.3M|    "movq %%mm0,%%mm6\n\t"
  636|       |    /*Continue loading the next row.*/
  637|  14.3M|    "movq (%[src2],%[src_ystride]),%%mm3\n\t"
  638|       |    /*%%mm4 (row 5) is done; write it out.*/
  639|  14.3M|    "movq %%mm4,(%[dst],%[dst_ystride])\n\t"
  640|  14.3M|    "pxor %%mm1,%%mm0\n\t"
  641|  14.3M|    "pavgb %%mm1,%%mm6\n\t"
  642|       |    /*%%mm4 is free; start averaging %%mm3 into %%mm2 using %%mm4.*/
  643|  14.3M|    "movq %%mm2,%%mm4\n\t"
  644|  14.3M|    "pand %%mm7,%%mm0\n\t"
  645|  14.3M|    "pavgb %%mm3,%%mm2\n\t"
  646|  14.3M|    "pxor %%mm3,%%mm4\n\t"
  647|  14.3M|    "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
  648|  14.3M|    "psubb %%mm0,%%mm6\n\t"
  649|  14.3M|    "pand %%mm7,%%mm4\n\t"
  650|       |    /*%%mm6 (row 6) is done, write it out.*/
  651|  14.3M|    "movq %%mm6,(%[dst])\n\t"
  652|  14.3M|    "psubb %%mm4,%%mm2\n\t"
  653|       |    /*%%mm2 (row 7) is done, write it out.*/
  654|  14.3M|    "movq %%mm2,(%[dst],%[dst_ystride])\n\t"
  655|  14.3M|    :[dst]"+r"(_dst),[src1]"+r"(_src1),[src2]"+r"(_src2)
  656|  14.3M|    :[dst_ystride]"r"((ptrdiff_t)_dst_ystride),
  657|  14.3M|     [src_ystride]"r"((ptrdiff_t)_src_ystride)
  658|  14.3M|    :"memory"
  659|  14.3M|  );
  660|  14.3M|}
  661|       |
  662|       |unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
  663|      0| const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
  664|      0|  OC_ALIGN8(unsigned char ref[64]);
  665|      0|  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
  666|      0|  return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);
  667|      0|}
  668|       |
  669|       |unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,
  670|      0| const unsigned char *_src,int _ystride){
  671|      0|  OC_ALIGN8(ogg_int16_t buf[64]);
  672|      0|  unsigned ret;
  673|      0|  unsigned ret2;
  674|      0|  int      dc;
  675|      0|  __asm__ __volatile__(
  676|      0|    OC_LOAD_8x4(0x00)
  677|      0|    OC_HADAMARD_8x4
  678|      0|    OC_TRANSPOSE_4x4x2(0x00)
  679|       |    /*Finish swapping out this 8x4 block to make room for the next one.
  680|       |      mm0...mm3 have been swapped out already.*/
  681|      0|    "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
  682|      0|    "movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t"
  683|      0|    "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
  684|      0|    "movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t"
  685|      0|    OC_LOAD_8x4(0x04)
  686|      0|    OC_HADAMARD_8x4
  687|      0|    OC_TRANSPOSE_4x4x2(0x08)
  688|       |    /*Here the first 4x4 block of output from the last transpose is the second
  689|       |       4x4 block of input for the next transform.
  690|       |      We have cleverly arranged that it already be in the appropriate place, so
  691|       |       we only have to do half the loads.*/
  692|      0|    "movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t"
  693|      0|    "movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t"
  694|      0|    "movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t"
  695|      0|    "movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t"
  696|       |    /*We split out the stages here so we can save the DC coefficient in the
  697|       |       middle.*/
  698|      0|    OC_HADAMARD_AB_8x4
  699|      0|    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
  700|      0|    "movd %%mm1,%[dc]\n\t"
  701|      0|    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
  702|       |    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
  703|       |       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
  704|       |       for the factor of two we dropped + 3 for the vertical accumulation).
  705|       |      Now we finally have to promote things to dwords.
  706|       |      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
  707|       |       latency of pmaddwd by starting the next series of loads now.*/
  708|      0|    "pmaddwd %%mm7,%%mm0\n\t"
  709|      0|    "movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"
  710|      0|    "movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"
  711|      0|    "movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"
  712|      0|    "movq %%mm0,%%mm4\n\t"
  713|      0|    "movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"
  714|      0|    "punpckhdq %%mm0,%%mm0\n\t"
  715|      0|    "movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
  716|      0|    "paddd %%mm0,%%mm4\n\t"
  717|      0|    "movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
  718|      0|    "movd %%mm4,%[ret]\n\t"
  719|      0|    "movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
  720|      0|    "movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
  721|      0|    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
  722|      0|    "pmaddwd %%mm7,%%mm0\n\t"
  723|       |    /*We assume that the DC coefficient is always positive (which is true,
  724|       |       because the input to the INTRA transform was not a difference).*/
  725|      0|    "movzx %w[dc],%[dc]\n\t"
  726|      0|    "add %[ret],%[ret]\n\t"
  727|      0|    "sub %[dc],%[ret]\n\t"
  728|      0|    "movq %%mm0,%%mm4\n\t"
  729|      0|    "punpckhdq %%mm0,%%mm0\n\t"
  730|      0|    "paddd %%mm0,%%mm4\n\t"
  731|      0|    "movd %%mm4,%[ret2]\n\t"
  732|      0|    "lea -64(%[ret],%[ret2],2),%[ret]\n\t"
  733|       |    /*Although it looks like we're using 8 registers here, gcc can alias %[ret]
  734|       |       and %[ret2] with some of the inputs, since for once we don't write to
  735|       |       them until after we're done using everything but %[buf] (which is also
  736|       |       listed as an output to ensure gcc _doesn't_ alias them against it).*/
  737|      0|    :[ret]"=a"(ret),[ret2]"=r"(ret2),[dc]"=r"(dc),
  738|      0|     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
  739|      0|    :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
  740|      0|     [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
  741|       |    /*We have to use sub, so we actually clobber the condition codes for once
  742|       |       (not to mention add).*/
  743|      0|    :"cc"
  744|      0|  );
  745|      0|  *_dc=dc;
  746|      0|  return ret;
  747|      0|}
  748|       |
  749|       |void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
  750|   394k| const unsigned char *_src,const unsigned char *_ref,int _ystride){
  751|   394k|  int i;
  752|   394k|  __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::);
  753|  1.97M|  for(i=4;i-->0;){
  754|  1.57M|    __asm__ __volatile__(
  755|       |      /*mm0=[src]*/
  756|  1.57M|      "movq (%[src]),%%mm0\n\t"
  757|       |      /*mm1=[ref]*/
  758|  1.57M|      "movq (%[ref]),%%mm1\n\t"
  759|       |      /*mm4=[src+ystride]*/
  760|  1.57M|      "movq (%[src],%[ystride]),%%mm4\n\t"
  761|       |      /*mm5=[ref+ystride]*/
  762|  1.57M|      "movq (%[ref],%[ystride]),%%mm5\n\t"
  763|       |      /*Compute [src]-[ref].*/
  764|  1.57M|      "movq %%mm0,%%mm2\n\t"
  765|  1.57M|      "punpcklbw %%mm7,%%mm0\n\t"
  766|  1.57M|      "movq %%mm1,%%mm3\n\t"
  767|  1.57M|      "punpckhbw %%mm7,%%mm2\n\t"
  768|  1.57M|      "punpcklbw %%mm7,%%mm1\n\t"
  769|  1.57M|      "punpckhbw %%mm7,%%mm3\n\t"
  770|  1.57M|      "psubw %%mm1,%%mm0\n\t"
  771|  1.57M|      "psubw %%mm3,%%mm2\n\t"
  772|       |      /*Compute [src+ystride]-[ref+ystride].*/
  773|  1.57M|      "movq %%mm4,%%mm1\n\t"
  774|  1.57M|      "punpcklbw %%mm7,%%mm4\n\t"
  775|  1.57M|      "movq %%mm5,%%mm3\n\t"
  776|  1.57M|      "punpckhbw %%mm7,%%mm1\n\t"
  777|  1.57M|      "lea (%[src],%[ystride],2),%[src]\n\t"
  778|  1.57M|      "punpcklbw %%mm7,%%mm5\n\t"
  779|  1.57M|      "lea (%[ref],%[ystride],2),%[ref]\n\t"
  780|  1.57M|      "punpckhbw %%mm7,%%mm3\n\t"
  781|  1.57M|      "psubw %%mm5,%%mm4\n\t"
  782|  1.57M|      "psubw %%mm3,%%mm1\n\t"
  783|       |      /*Write the answer out.*/
  784|  1.57M|      "movq %%mm0,0x00(%[residue])\n\t"
  785|  1.57M|      "movq %%mm2,0x08(%[residue])\n\t"
  786|  1.57M|      "movq %%mm4,0x10(%[residue])\n\t"
  787|  1.57M|      "movq %%mm1,0x18(%[residue])\n\t"
  788|  1.57M|      "lea 0x20(%[residue]),%[residue]\n\t"
  789|  1.57M|      :[residue]"+r"(_residue),[src]"+r"(_src),[ref]"+r"(_ref)
  790|  1.57M|      :[ystride]"r"((ptrdiff_t)_ystride)
  791|  1.57M|      :"memory"
  792|  1.57M|    );
  793|  1.57M|  }
  794|   394k|}
  795|       |
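A portable sketch of the residue computation above (frag_sub_c is a hypothetical name; ogg_int16_t is the type the file already uses): src-ref per pixel, widened to 16 bits. oc_enc_frag_sub_128_mmx below handles the intra case the same way, with the reference replaced by the constant 128 it builds in %%mm6.

    static void frag_sub_c(ogg_int16_t _residue[64],const unsigned char *_src,
     const unsigned char *_ref,int _ystride){
      int i;
      int j;
      for(i=0;i<8;i++){
        for(j=0;j<8;j++)_residue[i*8+j]=(ogg_int16_t)(_src[j]-_ref[j]);
        _src+=_ystride;
        _ref+=_ystride;
      }
    }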
  796|       |void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
  797|  21.4M| const unsigned char *_src,int _ystride){
  798|  21.4M|  ptrdiff_t ystride3;
  799|  21.4M|  __asm__ __volatile__(
  800|       |    /*mm0=[src]*/
  801|  21.4M|    "movq (%[src]),%%mm0\n\t"
  802|       |    /*mm1=[src+ystride]*/
  803|  21.4M|    "movq (%[src],%[ystride]),%%mm1\n\t"
  804|       |    /*mm6={-1}x4*/
  805|  21.4M|    "pcmpeqw %%mm6,%%mm6\n\t"
  806|       |    /*mm2=[src+2*ystride]*/
  807|  21.4M|    "movq (%[src],%[ystride],2),%%mm2\n\t"
  808|       |    /*[ystride3]=3*[ystride]*/
  809|  21.4M|    "lea (%[ystride],%[ystride],2),%[ystride3]\n\t"
  810|       |    /*mm6={0x8000}x4*/
  811|  21.4M|    "psllw $15,%%mm6\n\t"
  812|       |    /*mm3=[src+3*ystride]*/
  813|  21.4M|    "movq (%[src],%[ystride3]),%%mm3\n\t"
  814|       |    /*mm6={128}x4*/
  815|  21.4M|    "psrlw $8,%%mm6\n\t"
  816|       |    /*mm7=0*/
  817|  21.4M|    "pxor %%mm7,%%mm7\n\t"
  818|       |    /*[src]=[src]+4*[ystride]*/
  819|  21.4M|    "lea (%[src],%[ystride],4),%[src]\n\t"
  820|       |    /*Compute [src]-128 and [src+ystride]-128*/
  821|  21.4M|    "movq %%mm0,%%mm4\n\t"
  822|  21.4M|    "punpcklbw %%mm7,%%mm0\n\t"
  823|  21.4M|    "movq %%mm1,%%mm5\n\t"
  824|  21.4M|    "punpckhbw %%mm7,%%mm4\n\t"
  825|  21.4M|    "psubw %%mm6,%%mm0\n\t"
  826|  21.4M|    "punpcklbw %%mm7,%%mm1\n\t"
  827|  21.4M|    "psubw %%mm6,%%mm4\n\t"
  828|  21.4M|    "punpckhbw %%mm7,%%mm5\n\t"
  829|  21.4M|    "psubw %%mm6,%%mm1\n\t"
  830|  21.4M|    "psubw %%mm6,%%mm5\n\t"
  831|       |    /*Write the answer out.*/
  832|  21.4M|    "movq %%mm0,0x00(%[residue])\n\t"
  833|  21.4M|    "movq %%mm4,0x08(%[residue])\n\t"
  834|  21.4M|    "movq %%mm1,0x10(%[residue])\n\t"
  835|  21.4M|    "movq %%mm5,0x18(%[residue])\n\t"
  836|       |    /*mm0=[src+4*ystride]*/
  837|  21.4M|    "movq (%[src]),%%mm0\n\t"
  838|       |    /*mm1=[src+5*ystride]*/
  839|  21.4M|    "movq (%[src],%[ystride]),%%mm1\n\t"
  840|       |    /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/
  841|  21.4M|    "movq %%mm2,%%mm4\n\t"
  842|  21.4M|    "punpcklbw %%mm7,%%mm2\n\t"
  843|  21.4M|    "movq %%mm3,%%mm5\n\t"
  844|  21.4M|    "punpckhbw %%mm7,%%mm4\n\t"
  845|  21.4M|    "psubw %%mm6,%%mm2\n\t"
  846|  21.4M|    "punpcklbw %%mm7,%%mm3\n\t"
  847|  21.4M|    "psubw %%mm6,%%mm4\n\t"
  848|  21.4M|    "punpckhbw %%mm7,%%mm5\n\t"
  849|  21.4M|    "psubw %%mm6,%%mm3\n\t"
  850|  21.4M|    "psubw %%mm6,%%mm5\n\t"
  851|       |    /*Write the answer out.*/
  852|  21.4M|    "movq %%mm2,0x20(%[residue])\n\t"
  853|  21.4M|    "movq %%mm4,0x28(%[residue])\n\t"
  854|  21.4M|    "movq %%mm3,0x30(%[residue])\n\t"
  855|  21.4M|    "movq %%mm5,0x38(%[residue])\n\t"
  856|       |    /*mm2=[src+6*ystride]*/
  857|  21.4M|    "movq (%[src],%[ystride],2),%%mm2\n\t"
  858|       |    /*mm3=[src+7*ystride]*/
  859|  21.4M|    "movq (%[src],%[ystride3]),%%mm3\n\t"
  860|       |    /*Compute [src+4*ystride]-128 and [src+5*ystride]-128*/
  861|  21.4M|    "movq %%mm0,%%mm4\n\t"
  862|  21.4M|    "punpcklbw %%mm7,%%mm0\n\t"
  863|  21.4M|    "movq %%mm1,%%mm5\n\t"
  864|  21.4M|    "punpckhbw %%mm7,%%mm4\n\t"
  865|  21.4M|    "psubw %%mm6,%%mm0\n\t"
  866|  21.4M|    "punpcklbw %%mm7,%%mm1\n\t"
  867|  21.4M|    "psubw %%mm6,%%mm4\n\t"
  868|  21.4M|    "punpckhbw %%mm7,%%mm5\n\t"
  869|  21.4M|    "psubw %%mm6,%%mm1\n\t"
  870|  21.4M|    "psubw %%mm6,%%mm5\n\t"
  871|       |    /*Write the answer out.*/
  872|  21.4M|    "movq %%mm0,0x40(%[residue])\n\t"
  873|  21.4M|    "movq %%mm4,0x48(%[residue])\n\t"
  874|  21.4M|    "movq %%mm1,0x50(%[residue])\n\t"
  875|  21.4M|    "movq %%mm5,0x58(%[residue])\n\t"
  876|       |    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
  877|  21.4M|    "movq %%mm2,%%mm4\n\t"
  878|  21.4M|    "punpcklbw %%mm7,%%mm2\n\t"
  879|  21.4M|    "movq %%mm3,%%mm5\n\t"
  880|  21.4M|    "punpckhbw %%mm7,%%mm4\n\t"
  881|  21.4M|    "psubw %%mm6,%%mm2\n\t"
  882|  21.4M|    "punpcklbw %%mm7,%%mm3\n\t"
  883|  21.4M|    "psubw %%mm6,%%mm4\n\t"
  884|  21.4M|    "punpckhbw %%mm7,%%mm5\n\t"
  885|  21.4M|    "psubw %%mm6,%%mm3\n\t"
  886|  21.4M|    "psubw %%mm6,%%mm5\n\t"
  887|       |    /*Write the answer out.*/
  888|  21.4M|    "movq %%mm2,0x60(%[residue])\n\t"
  889|  21.4M|    "movq %%mm4,0x68(%[residue])\n\t"
  890|  21.4M|    "movq %%mm3,0x70(%[residue])\n\t"
  891|  21.4M|    "movq %%mm5,0x78(%[residue])\n\t"
  892|  21.4M|    :[src]"+r"(_src),[ystride3]"=&r"(ystride3)
  893|  21.4M|    :[residue]"r"(_residue),[ystride]"r"((ptrdiff_t)_ystride)
  894|  21.4M|    :"memory"
  895|  21.4M|  );
  896|  21.4M|}
  897|       |
  898|       |void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
  899|      0| const unsigned char *_src1,const unsigned char *_src2,int _ystride){
  900|      0|  oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride);
  901|      0|}
  902|       |
  903|       |#endif