/src/theora/lib/x86/sse2encfrag.c
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
 *                                                                  *
 ********************************************************************

  function:
  last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $

 ********************************************************************/
#include <stddef.h>
#include "x86enc.h"
#include "sse2trans.h"

#if defined(OC_X86_ASM)

/*Load a 4x8 array of pixel values from %[src] and %[ref] and compute their
   16-bit differences.
  On output, these are stored in _m0, xmm1, xmm2, and xmm3.
  xmm4 and xmm5 are clobbered.*/
#define OC_LOAD_SUB_4x8(_m0) \
 "#OC_LOAD_SUB_4x8\n\t" \
 /*Load the first three rows.*/ \
 "movq (%[src]),"_m0"\n\t" \
 "movq (%[ref]),%%xmm4\n\t" \
 "movq (%[src],%[ystride]),%%xmm1\n\t" \
 "movq (%[ref],%[ystride]),%%xmm3\n\t" \
 "movq (%[src],%[ystride],2),%%xmm2\n\t" \
 "movq (%[ref],%[ystride],2),%%xmm5\n\t" \
 /*Unpack and subtract.*/ \
 "punpcklbw %%xmm4,"_m0"\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "punpcklbw %%xmm3,%%xmm1\n\t" \
 "punpcklbw %%xmm3,%%xmm3\n\t" \
 "psubw %%xmm4,"_m0"\n\t" \
 "psubw %%xmm3,%%xmm1\n\t" \
 /*Load the last row.*/ \
 "movq (%[src],%[ystride3]),%%xmm3\n\t" \
 "movq (%[ref],%[ystride3]),%%xmm4\n\t" \
 /*Unpack, subtract, and advance the pointers.*/ \
 "punpcklbw %%xmm5,%%xmm2\n\t" \
 "punpcklbw %%xmm5,%%xmm5\n\t" \
 "lea (%[src],%[ystride],4),%[src]\n\t" \
 "psubw %%xmm5,%%xmm2\n\t" \
 "punpcklbw %%xmm4,%%xmm3\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "lea (%[ref],%[ystride],4),%[ref]\n\t" \
 "psubw %%xmm4,%%xmm3\n\t" \

/*Square and accumulate four rows of differences in _m0, xmm1, xmm2, and xmm3.
  On output, _m0 contains the sum of two of the rows, and the other two are
   added to xmm7.*/
#define OC_SSD_4x8(_m0) \
 "pmaddwd "_m0","_m0"\n\t" \
 "pmaddwd %%xmm1,%%xmm1\n\t" \
 "pmaddwd %%xmm2,%%xmm2\n\t" \
 "pmaddwd %%xmm3,%%xmm3\n\t" \
 "paddd %%xmm1,"_m0"\n\t" \
 "paddd %%xmm3,%%xmm2\n\t" \
 "paddd %%xmm2,%%xmm7\n\t" \

unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned ret;
  __asm__ __volatile__(
    OC_LOAD_SUB_4x8("%%xmm7")
    OC_SSD_4x8("%%xmm7")
    OC_LOAD_SUB_4x8("%%xmm0")
    OC_SSD_4x8("%%xmm0")
    "paddd %%xmm0,%%xmm7\n\t"
    "movdqa %%xmm7,%%xmm6\n\t"
    "punpckhqdq %%xmm7,%%xmm7\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "pshufd $1,%%xmm7,%%xmm6\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "movd %%xmm7,%[ret]\n\t"
    :[ret]"=a"(ret)
    :[src]"r"(_src),[ref]"r"(_ref),[ystride]"r"((ptrdiff_t)_ystride),
     [ystride3]"r"((ptrdiff_t)_ystride*3)
  );
  return ret;
}

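/*A disabled scalar sketch of what oc_enc_frag_ssd_sse2 computes: the sum of
   squared differences over an 8x8 fragment.
  The function name here is only illustrative, and the block is intentionally
   compiled out.*/
#if 0
static unsigned oc_frag_ssd_ref(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned ret;
  int      i;
  int      j;
  ret=0;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int d;
      d=_src[j]-_ref[j];
      ret+=d*d;
    }
    _src+=_ystride;
    _ref+=_ystride;
  }
  return ret;
}
#endif
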
static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={
  0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80
};

/*Load a 2x8 array of pixel values from %[src] and %[ref] and compute their
   16-bit differences, subject to a mask.
  %%xmm6 must contain OC_MASK_CONSTS[0...7].*/
#define OC_LOAD_SUB_MASK_2x8 \
 "#OC_LOAD_SUB_MASK_2x8\n\t" \
 /*Start the loads and expand the next 8 bits of the mask.*/ \
 "shl $8,%[m]\n\t" \
 "movq (%[src]),%%xmm0\n\t" \
 "mov %h[m],%b[m]\n\t" \
 "movq (%[ref]),%%xmm2\n\t" \
 "movd %[m],%%xmm4\n\t" \
 "shr $8,%[m]\n\t" \
 "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
 "mov %h[m],%b[m]\n\t" \
 "pand %%xmm6,%%xmm4\n\t" \
 "pcmpeqb %%xmm6,%%xmm4\n\t" \
 /*Perform the masking.*/ \
 "pand %%xmm4,%%xmm0\n\t" \
 "pand %%xmm4,%%xmm2\n\t" \
 /*Finish the loads while unpacking the first set of rows, and expand the next \
    8 bits of the mask.*/ \
 "movd %[m],%%xmm4\n\t" \
 "movq (%[src],%[ystride]),%%xmm1\n\t" \
 "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
 "movq (%[ref],%[ystride]),%%xmm3\n\t" \
 "pand %%xmm6,%%xmm4\n\t" \
 "punpcklbw %%xmm2,%%xmm0\n\t" \
 "pcmpeqb %%xmm6,%%xmm4\n\t" \
 "punpcklbw %%xmm2,%%xmm2\n\t" \
 /*Mask and unpack the second set of rows.*/ \
 "pand %%xmm4,%%xmm1\n\t" \
 "pand %%xmm4,%%xmm3\n\t" \
 "punpcklbw %%xmm3,%%xmm1\n\t" \
 "punpcklbw %%xmm3,%%xmm3\n\t" \
 "psubw %%xmm2,%%xmm0\n\t" \
 "psubw %%xmm3,%%xmm1\n\t" \

unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
  ptrdiff_t ystride;
  unsigned  ret;
  int       i;
  ystride=_ystride;
  __asm__ __volatile__(
    "pxor %%xmm7,%%xmm7\n\t"
    "movq %[c],%%xmm6\n\t"
    :
    :[c]"m"(OC_CONST_ARRAY_OPERAND(unsigned char,OC_MASK_CONSTS,8))
  );
  for(i=0;i<4;i++){
    unsigned m;
    m=_mask&0xFFFF;
    _mask>>=16;
    if(m){
      __asm__ __volatile__(
        OC_LOAD_SUB_MASK_2x8
        "pmaddwd %%xmm0,%%xmm0\n\t"
        "pmaddwd %%xmm1,%%xmm1\n\t"
        "paddd %%xmm0,%%xmm7\n\t"
        "paddd %%xmm1,%%xmm7\n\t"
        :[src]"+r"(_src),[ref]"+r"(_ref),[ystride]"+r"(ystride),[m]"+Q"(m)
      );
    }
    _src+=2*ystride;
    _ref+=2*ystride;
  }
  __asm__ __volatile__(
    "movdqa %%xmm7,%%xmm6\n\t"
    "punpckhqdq %%xmm7,%%xmm7\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "pshufd $1,%%xmm7,%%xmm6\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "movd %%xmm7,%[ret]\n\t"
    :[ret]"=a"(ret)
  );
  return ret;
}

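/*The mask expansion above broadcasts one byte of %[m] to all eight lanes,
   ANDs it with OC_MASK_CONSTS={1,2,4,...,128}, and compares for equality:
   lane i survives exactly when bit i of the mask byte is set.
  A disabled scalar sketch of the same trick (illustrative only):*/
#if 0
static void oc_expand_mask_byte(unsigned char _lanes[8],unsigned _m){
  int i;
  for(i=0;i<8;i++){
    /*pand with OC_MASK_CONSTS, then pcmpeqb against OC_MASK_CONSTS.*/
    _lanes[i]=(_m&OC_MASK_CONSTS[i])==OC_MASK_CONSTS[i]?0xFF:0x00;
  }
}
#endif
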
/*Load an 8x8 array of pixel values from %[src] and %[ref] and compute their
   16-bit differences in %%xmm0...%%xmm7.*/
#define OC_LOAD_SUB_8x8 \
 "#OC_LOAD_SUB_8x8\n\t" \
 "movq (%[src]),%%xmm0\n\t" \
 "movq (%[ref]),%%xmm4\n\t" \
 "movq (%[src],%[src_ystride]),%%xmm1\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "movq (%[ref],%[ref_ystride]),%%xmm5\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "movq (%[src]),%%xmm2\n\t" \
 "movq (%[ref]),%%xmm7\n\t" \
 "movq (%[src],%[src_ystride]),%%xmm3\n\t" \
 "movq (%[ref],%[ref_ystride]),%%xmm6\n\t" \
 "punpcklbw %%xmm4,%%xmm0\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "psubw %%xmm4,%%xmm0\n\t" \
 "movq (%[src]),%%xmm4\n\t" \
 "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
 "movq (%[ref]),%%xmm0\n\t" \
 "punpcklbw %%xmm5,%%xmm1\n\t" \
 "punpcklbw %%xmm5,%%xmm5\n\t" \
 "psubw %%xmm5,%%xmm1\n\t" \
 "movq (%[src],%[src_ystride]),%%xmm5\n\t" \
 "punpcklbw %%xmm7,%%xmm2\n\t" \
 "punpcklbw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm7,%%xmm2\n\t" \
 "movq (%[ref],%[ref_ystride]),%%xmm7\n\t" \
 "punpcklbw %%xmm6,%%xmm3\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "punpcklbw %%xmm6,%%xmm6\n\t" \
 "psubw %%xmm6,%%xmm3\n\t" \
 "movq (%[src]),%%xmm6\n\t" \
 "punpcklbw %%xmm0,%%xmm4\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "punpcklbw %%xmm0,%%xmm0\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "psubw %%xmm0,%%xmm4\n\t" \
 "movq (%[ref]),%%xmm0\n\t" \
 "punpcklbw %%xmm7,%%xmm5\n\t" \
 "neg %[src_ystride]\n\t" \
 "punpcklbw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm7,%%xmm5\n\t" \
 "movq (%[src],%[src_ystride]),%%xmm7\n\t" \
 "punpcklbw %%xmm0,%%xmm6\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "punpcklbw %%xmm0,%%xmm0\n\t" \
 "neg %[ref_ystride]\n\t" \
 "psubw %%xmm0,%%xmm6\n\t" \
 "movq (%[ref],%[ref_ystride]),%%xmm0\n\t" \
 "punpcklbw %%xmm0,%%xmm7\n\t" \
 "punpcklbw %%xmm0,%%xmm0\n\t" \
 "psubw %%xmm0,%%xmm7\n\t" \
 "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \

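/*Both load-subtract macros lean on the same identity: interleaving src with
   ref builds the 16-bit word src+(ref<<8), interleaving ref with itself
   builds ref+(ref<<8), and one psubw then leaves exactly src-ref, since the
   high bytes cancel modulo 2**16.
  A disabled scalar check of that identity (illustrative only):*/
#if 0
static int oc_unpack_sub_check(unsigned _src,unsigned _ref){
  ogg_int16_t a;
  ogg_int16_t b;
  /*_src and _ref are byte values in the range 0...255.*/
  a=(ogg_int16_t)(_src|_ref<<8);
  b=(ogg_int16_t)(_ref|_ref<<8);
  return (ogg_int16_t)(a-b)==(int)_src-(int)_ref;
}
#endif
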
/*Load an 8x8 array of pixel values from %[src] into %%xmm0...%%xmm7.*/
#define OC_LOAD_8x8 \
 "#OC_LOAD_8x8\n\t" \
 "movq (%[src]),%%xmm0\n\t" \
 "movq (%[src],%[ystride]),%%xmm1\n\t" \
 "movq (%[src],%[ystride],2),%%xmm2\n\t" \
 "pxor %%xmm7,%%xmm7\n\t" \
 "movq (%[src],%[ystride3]),%%xmm3\n\t" \
 "punpcklbw %%xmm7,%%xmm0\n\t" \
 "movq (%[src4]),%%xmm4\n\t" \
 "punpcklbw %%xmm7,%%xmm1\n\t" \
 "movq (%[src4],%[ystride]),%%xmm5\n\t" \
 "punpcklbw %%xmm7,%%xmm2\n\t" \
 "movq (%[src4],%[ystride],2),%%xmm6\n\t" \
 "punpcklbw %%xmm7,%%xmm3\n\t" \
 "movq (%[src4],%[ystride3]),%%xmm7\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "punpcklbw %%xmm5,%%xmm5\n\t" \
 "psrlw $8,%%xmm4\n\t" \
 "psrlw $8,%%xmm5\n\t" \
 "punpcklbw %%xmm6,%%xmm6\n\t" \
 "punpcklbw %%xmm7,%%xmm7\n\t" \
 "psrlw $8,%%xmm6\n\t" \
 "psrlw $8,%%xmm7\n\t" \

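/*The rows loaded from %[src4] above cannot be unpacked against a zero
   register (%%xmm7 already holds pixel data by then), so each byte is paired
   with itself and shifted instead: punpcklbw x,x followed by psrlw $8 is a
   plain zero extension, since ((b|b<<8)>>8)==b.
  Disabled one-line check of that equivalence (illustrative only):*/
#if 0
static int oc_self_unpack_check(unsigned _b){
  /*_b is a byte value in the range 0...255.*/
  return (unsigned short)(_b|_b<<8)>>8==_b;
}
#endif
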
/*Performs the first two stages of an 8-point 1-D Hadamard transform in place.
  Outputs 1, 3, 4, and 5 from the second stage are negated (which allows us to
   perform this stage in place with no temporary registers).*/
#define OC_HADAMARD_AB_8x8 \
 "#OC_HADAMARD_AB_8x8\n\t" \
 /*Stage A:*/ \
 "paddw %%xmm5,%%xmm1\n\t" \
 "paddw %%xmm6,%%xmm2\n\t" \
 "paddw %%xmm5,%%xmm5\n\t" \
 "paddw %%xmm6,%%xmm6\n\t" \
 "psubw %%xmm1,%%xmm5\n\t" \
 "psubw %%xmm2,%%xmm6\n\t" \
 "paddw %%xmm7,%%xmm3\n\t" \
 "paddw %%xmm4,%%xmm0\n\t" \
 "paddw %%xmm7,%%xmm7\n\t" \
 "paddw %%xmm4,%%xmm4\n\t" \
 "psubw %%xmm3,%%xmm7\n\t" \
 "psubw %%xmm0,%%xmm4\n\t" \
 /*Stage B:*/ \
 "paddw %%xmm2,%%xmm0\n\t" \
 "paddw %%xmm3,%%xmm1\n\t" \
 "paddw %%xmm6,%%xmm4\n\t" \
 "paddw %%xmm7,%%xmm5\n\t" \
 "paddw %%xmm2,%%xmm2\n\t" \
 "paddw %%xmm3,%%xmm3\n\t" \
 "paddw %%xmm6,%%xmm6\n\t" \
 "paddw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm0,%%xmm2\n\t" \
 "psubw %%xmm1,%%xmm3\n\t" \
 "psubw %%xmm4,%%xmm6\n\t" \
 "psubw %%xmm5,%%xmm7\n\t" \

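/*Each butterfly above takes three instructions and no temporary register:
   a+=b, then b+=b, then b-=a, leaving a=a0+b0 and b=b0-a0, i.e. the usual
   sum plus a negated difference; that sign flip is what the "outputs ... are
   negated" notes refer to.
  Disabled scalar sketch of one in-place butterfly (illustrative only):*/
#if 0
static void oc_butterfly_in_place(ogg_int16_t *_a,ogg_int16_t *_b){
  *_a=(ogg_int16_t)(*_a+*_b);
  *_b=(ogg_int16_t)(*_b+*_b);
  *_b=(ogg_int16_t)(*_b-*_a);
}
#endif
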
/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
   place with no temporary registers).*/
#define OC_HADAMARD_C_8x8 \
 "#OC_HADAMARD_C_8x8\n\t" \
 /*Stage C:*/ \
 "paddw %%xmm1,%%xmm0\n\t" \
 "paddw %%xmm3,%%xmm2\n\t" \
 "paddw %%xmm5,%%xmm4\n\t" \
 "paddw %%xmm7,%%xmm6\n\t" \
 "paddw %%xmm1,%%xmm1\n\t" \
 "paddw %%xmm3,%%xmm3\n\t" \
 "paddw %%xmm5,%%xmm5\n\t" \
 "paddw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm0,%%xmm1\n\t" \
 "psubw %%xmm2,%%xmm3\n\t" \
 "psubw %%xmm4,%%xmm5\n\t" \
 "psubw %%xmm6,%%xmm7\n\t" \

/*Performs an 8-point 1-D Hadamard transform in place.
  Outputs 1, 2, 4, and 7 are negated (which allows us to perform the transform
   in place with no temporary registers).*/
#define OC_HADAMARD_8x8 \
 OC_HADAMARD_AB_8x8 \
 OC_HADAMARD_C_8x8 \

/*Performs the first part of the final stage of the Hadamard transform and
   summing of absolute values.
  At the end of this part, %%xmm1 will contain the DC coefficient of the
   transform.*/
#define OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
 /*We use the fact that \
     (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
    to merge the final butterfly with the abs and the first stage of \
    accumulation. \
   Thus we can avoid using pabsw, which is not available until SSSE3. \
   Emulating pabsw takes 3 instructions, so the straightforward SSE2 \
    implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
    registers). \
   Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
   This implementation is only 26 (+4 for spilling registers).*/ \
 "#OC_HADAMARD_C_ABS_ACCUM_A_8x8\n\t" \
 "movdqa %%xmm7,"OC_MEM_OFFS(0x10,buf)"\n\t" \
 "movdqa %%xmm6,"OC_MEM_OFFS(0x00,buf)"\n\t" \
 /*xmm7={0x7FFF}x8 \
    xmm4=max(abs(xmm4),abs(xmm5))-0x7FFF*/ \
 "pcmpeqb %%xmm7,%%xmm7\n\t" \
 "movdqa %%xmm4,%%xmm6\n\t" \
 "psrlw $1,%%xmm7\n\t" \
 "paddw %%xmm5,%%xmm6\n\t" \
 "pmaxsw %%xmm5,%%xmm4\n\t" \
 "paddsw %%xmm7,%%xmm6\n\t" \
 "psubw %%xmm6,%%xmm4\n\t" \
 /*xmm2=max(abs(xmm2),abs(xmm3))-0x7FFF \
    xmm0=max(abs(xmm0),abs(xmm1))-0x7FFF*/ \
 "movdqa %%xmm2,%%xmm6\n\t" \
 "movdqa %%xmm0,%%xmm5\n\t" \
 "pmaxsw %%xmm3,%%xmm2\n\t" \
 "pmaxsw %%xmm1,%%xmm0\n\t" \
 "paddw %%xmm3,%%xmm6\n\t" \
 "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm3\n\t" \
 "paddw %%xmm5,%%xmm1\n\t" \
 "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm5\n\t" \

/*Performs the second part of the final stage of the Hadamard transform and
   summing of absolute values.*/
#define OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
 "#OC_HADAMARD_C_ABS_ACCUM_B_8x8\n\t" \
 "paddsw %%xmm7,%%xmm6\n\t" \
 "paddsw %%xmm7,%%xmm1\n\t" \
 "psubw %%xmm6,%%xmm2\n\t" \
 "psubw %%xmm1,%%xmm0\n\t" \
 /*xmm7={1}x8 (needed for the horizontal add that follows) \
    xmm0+=xmm2+xmm4+max(abs(xmm3),abs(xmm5))-0x7FFF*/ \
 "movdqa %%xmm3,%%xmm6\n\t" \
 "pmaxsw %%xmm5,%%xmm3\n\t" \
 "paddw %%xmm2,%%xmm0\n\t" \
 "paddw %%xmm5,%%xmm6\n\t" \
 "paddw %%xmm4,%%xmm0\n\t" \
 "paddsw %%xmm7,%%xmm6\n\t" \
 "paddw %%xmm3,%%xmm0\n\t" \
 "psrlw $14,%%xmm7\n\t" \
 "psubw %%xmm6,%%xmm0\n\t" \

/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
   absolute value of each component, and accumulates everything into xmm0.*/
#define OC_HADAMARD_C_ABS_ACCUM_8x8 \
 OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
 OC_HADAMARD_C_ABS_ACCUM_B_8x8 \

/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
   component, and accumulates everything into xmm0.
  Note that xmm0 will have an extra 4 added to each column, and that after
   removing this value, the remainder will be half the conventional value.*/
#define OC_HADAMARD_ABS_ACCUM_8x8 \
 OC_HADAMARD_AB_8x8 \
 OC_HADAMARD_C_ABS_ACCUM_8x8

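/*The identity behind OC_HADAMARD_C_ABS_ACCUM_A/B_8x8,
   (abs(a+b)+abs(a-b))/2==max(abs(a),abs(b)),
  is what lets the final butterfly, the absolute values, and the first
   accumulation stage merge; the saturating 0x7FFF biases implement it and
   account for the extra 4 per column noted above.
  A disabled scalar check of the identity (illustrative only):*/
#if 0
static int oc_iabs(int _x){
  return _x<0?-_x:_x;
}

static int oc_max_abs_check(int _a,int _b){
  return (oc_iabs(_a+_b)+oc_iabs(_a-_b))/2==
   (oc_iabs(_a)>oc_iabs(_b)?oc_iabs(_a):oc_iabs(_b));
}
#endif
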
static unsigned oc_int_frag_satd_sse2(int *_dc,
 const unsigned char *_src,int _src_ystride,
 const unsigned char *_ref,int _ref_ystride){
  OC_ALIGN16(ogg_int16_t buf[16]);
  unsigned ret;
  unsigned ret2;
  int      dc;
  __asm__ __volatile__(
    OC_LOAD_SUB_8x8
    OC_HADAMARD_8x8
    OC_TRANSPOSE_8x8
    /*We split out the stages here so we can save the DC coefficient in the
       middle.*/
    OC_HADAMARD_AB_8x8
    OC_HADAMARD_C_ABS_ACCUM_A_8x8
    "movd %%xmm1,%[dc]\n\t"
    OC_HADAMARD_C_ABS_ACCUM_B_8x8
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
       for the factor of two we dropped + 3 for the vertical accumulation).
      Now we finally have to promote things to dwords.
      We break this part out of OC_HADAMARD_ABS_ACCUM_8x8 to hide the long
       latency of pmaddwd by starting to compute abs(dc) here.*/
    "pmaddwd %%xmm7,%%xmm0\n\t"
    "movsx %w[dc],%[dc]\n\t"
    "cdq\n\t"
    "movdqa %%xmm0,%%xmm1\n\t"
    "punpckhqdq %%xmm0,%%xmm0\n\t"
    "paddd %%xmm1,%%xmm0\n\t"
    "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
    "paddd %%xmm1,%%xmm0\n\t"
    "movd %%xmm0,%[ret]\n\t"
    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x8 each have an extra 4
       added to them, a factor of two removed, and the DC value included;
       correct the final sum here.*/
    "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
    "xor %[dc],%[ret2]\n\t"
    "sub %[ret2],%[ret]\n\t"
    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
       and %[dc] with some of the inputs, since for once we don't write to
       them until after we're done using everything but %[buf].*/
    /*Note that _src_ystride and _ref_ystride must be given non-overlapping
       constraints, otherwise if gcc can prove they're equal it will allocate
       them to the same register (which is bad); _src and _ref face a similar
       problem.
      All four are destructively modified, but if we list them as output
       constraints, gcc can't alias them with other outputs.*/
    :[ret]"=r"(ret),[ret2]"=d"(ret2),[dc]"=a"(dc),
     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
    :[src]"S"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
     [ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
    /*We have to use neg, so we actually clobber the condition codes for once
       (not to mention sub and add).*/
    :"cc"
  );
  *_dc=dc;
  return ret;
}

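/*The tail above evaluates ret=2*sum-64-abs(dc): cdq leaves the sign of dc
   (0 or -1) in %[ret2], and in two's complement sign-(sign^dc)==-abs(dc), so
   the lea/xor/sub sequence finishes abs(dc) without a branch.
  Disabled scalar sketch of the correction (illustrative only):*/
#if 0
static unsigned oc_satd_correct(unsigned _sum,int _dc){
  int sign;
  /*0 when _dc is non-negative, -1 otherwise: exactly what cdq produces.*/
  sign=_dc>>31;
  return 2*_sum-64+sign-(sign^_dc);
}
#endif
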
unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  return oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);
}

unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
  OC_ALIGN8(unsigned char ref[64]);
  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
  return oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8);
}

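/*oc_enc_frag_satd2_sse2 reuses the single-reference kernel by first merging
   the two references into a stack buffer with a stride of 8.
  A disabled scalar sketch of that merge; the truncating (a+b)>>1 average is
   this sketch's assumption about oc_int_frag_copy2_mmxext, not a normative
   statement of its rounding (illustrative only):*/
#if 0
static void oc_frag_copy2_ref(unsigned char *_dst,int _dst_ystride,
 const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
  int i;
  int j;
  for(i=0;i<8;i++){
    /*Assumed rounding: truncating average of the two references.*/
    for(j=0;j<8;j++)_dst[j]=(unsigned char)((_src1[j]+_src2[j])>>1);
    _dst+=_dst_ystride;
    _src1+=_src_ystride;
    _src2+=_src_ystride;
  }
}
#endif
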
unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
 const unsigned char *_src,int _ystride){
  OC_ALIGN16(ogg_int16_t buf[16]);
  unsigned ret;
  int      dc;
  __asm__ __volatile__(
    OC_LOAD_8x8
    OC_HADAMARD_8x8
    OC_TRANSPOSE_8x8
    /*We split out the stages here so we can save the DC coefficient in the
       middle.*/
    OC_HADAMARD_AB_8x8
    OC_HADAMARD_C_ABS_ACCUM_A_8x8
    "movd %%xmm1,%[dc]\n\t"
    OC_HADAMARD_C_ABS_ACCUM_B_8x8
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
       for the factor of two we dropped + 3 for the vertical accumulation).
      Now we finally have to promote things to dwords.*/
    "pmaddwd %%xmm7,%%xmm0\n\t"
    /*We assume that the DC coefficient is always positive (which is true,
       because the input to the INTRA transform was not a difference).*/
    "movzx %w[dc],%[dc]\n\t"
    "movdqa %%xmm0,%%xmm1\n\t"
    "punpckhqdq %%xmm0,%%xmm0\n\t"
    "paddd %%xmm1,%%xmm0\n\t"
    "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
    "paddd %%xmm1,%%xmm0\n\t"
    "movd %%xmm0,%[ret]\n\t"
    "lea -64(%[ret],%[ret]),%[ret]\n\t"
    "sub %[dc],%[ret]\n\t"
    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
       and %[dc] with some of the inputs, since for once we don't write to
       them until after we're done using everything but %[buf].*/
    :[ret]"=a"(ret),[dc]"=r"(dc),
     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
    :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
     [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
    /*We have to use sub, so we actually clobber the condition codes for
       once.*/
    :"cc"
  );
  *_dc=dc;
  return ret;
}

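/*For validation, a disabled scalar SATD reference: two passes of an 8-point
   Hadamard over a block of 16-bit values (differences for inter, raw pixels
   for intra), then a sum of absolute values.
  Per the notes on OC_HADAMARD_ABS_ACCUM_8x8, the routines above return a
   scaled variant of this sum with the DC term removed, so this sketch is the
   conventional SATD, not a bit-exact model of their return values
   (illustrative only).*/
#if 0
static unsigned oc_satd_ref(const ogg_int16_t _block[64]){
  ogg_int32_t t[64];
  unsigned    satd;
  int         i;
  int         j;
  int         k;
  for(i=0;i<64;i++)t[i]=_block[i];
  /*k==0: transform each row; k==1: transform each column.*/
  for(k=0;k<2;k++){
    for(i=0;i<8;i++){
      ogg_int32_t r[8];
      int         s;
      for(j=0;j<8;j++)r[j]=t[k?j*8+i:i*8+j];
      /*Three stages of in-order butterflies form the 8-point Hadamard.*/
      for(s=1;s<8;s<<=1){
        for(j=0;j<8;j++)if(!(j&s)){
          ogg_int32_t a;
          a=r[j];
          r[j]=a+r[j|s];
          r[j|s]=a-r[j|s];
        }
      }
      for(j=0;j<8;j++)t[k?j*8+i:i*8+j]=r[j];
    }
  }
  satd=0;
  for(i=0;i<64;i++)satd+=(unsigned)(t[i]<0?-t[i]:t[i]);
  return satd;
}
#endif
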
#endif