/src/theora/lib/x86/sse2encfrag.c

Source
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation https://www.xiph.org/                 *
 *                                                                  *
 ********************************************************************

  function:

 ********************************************************************/
#include <stddef.h>
#include "x86enc.h"
#include "sse2trans.h"

#if defined(OC_X86_ASM)

/*Load four rows of eight pixels each from %[src] and %[ref] and compute their
   16-bit differences (src minus ref).
  %[ystride] is the distance in bytes between rows, and %[ystride3] must hold
   three times that value.
  On output, the differences for rows 0...3 are stored in _m0, xmm1, xmm2, and
   xmm3, respectively, and %[src] and %[ref] have both been advanced by four
   rows.
  xmm4 and xmm5 are clobbered.*/
#define OC_LOAD_SUB_4x8(_m0) \
 "#OC_LOAD_SUB_4x8\n\t" \
 /*Load the first three rows.*/ \
 "movq (%[src]),"_m0"\n\t" \
 "movq (%[ref]),%%xmm4\n\t" \
 "movq (%[src],%[ystride]),%%xmm1\n\t" \
 "movq (%[ref],%[ystride]),%%xmm3\n\t" \
 "movq (%[src],%[ystride],2),%%xmm2\n\t" \
 "movq (%[ref],%[ystride],2),%%xmm5\n\t" \
 /*Unpack and subtract. \
   Interleaving src with ref yields words equal to src+256*ref, and \
    interleaving ref with itself yields ref+256*ref, so their word difference \
    is src-ref without needing a zero register.*/ \
 "punpcklbw %%xmm4,"_m0"\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "punpcklbw %%xmm3,%%xmm1\n\t" \
 "punpcklbw %%xmm3,%%xmm3\n\t" \
 "psubw %%xmm4,"_m0"\n\t" \
 "psubw %%xmm3,%%xmm1\n\t" \
 /*Load the last row (xmm3 and xmm4 are free again at this point).*/ \
 "movq (%[src],%[ystride3]),%%xmm3\n\t" \
 "movq (%[ref],%[ystride3]),%%xmm4\n\t" \
 /*Unpack, subtract, and advance the pointers.*/ \
 "punpcklbw %%xmm5,%%xmm2\n\t" \
 "punpcklbw %%xmm5,%%xmm5\n\t" \
 "lea (%[src],%[ystride],4),%[src]\n\t" \
 "psubw %%xmm5,%%xmm2\n\t" \
 "punpcklbw %%xmm4,%%xmm3\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "lea (%[ref],%[ystride],4),%[ref]\n\t" \
 "psubw %%xmm4,%%xmm3\n\t" \

/*Square and accumulate four rows of differences in _m0, xmm1, xmm2, and xmm3.
  Each pmaddwd squares the eight words of a row and adds adjacent pairs,
   leaving four dword partial sums per register.
  On output, _m0 contains the sum of the first two rows, and the other two are
   added to xmm7.
  When _m0 is itself xmm7, all four rows therefore end up in xmm7.
  xmm1, xmm2, and xmm3 are clobbered.*/
#define OC_SSD_4x8(_m0) \
 "pmaddwd "_m0","_m0"\n\t" \
 "pmaddwd %%xmm1,%%xmm1\n\t" \
 "pmaddwd %%xmm2,%%xmm2\n\t" \
 "pmaddwd %%xmm3,%%xmm3\n\t" \
 "paddd %%xmm1,"_m0"\n\t" \
 "paddd %%xmm3,%%xmm2\n\t" \
 "paddd %%xmm2,%%xmm7\n\t" \

/*Computes the sum of squared differences between two 8x8 blocks of pixels.
  _src:     The first row of the source block.
  _ref:     The first row of the reference block.
  _ystride: The distance in bytes between rows (shared by both blocks).
  Return: The sum over all 64 pixels of the squared difference src-ref.*/
unsigned __attribute__((target("sse2"))) oc_enc_frag_ssd_sse2(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned ret;
  __asm__ __volatile__(
    /*Rows 0...3: with _m0==xmm7, all four rows are accumulated into xmm7.*/
    OC_LOAD_SUB_4x8("%%xmm7")
    OC_SSD_4x8("%%xmm7")
    /*Rows 4...7: two rows land in xmm0 and two are added directly to xmm7.*/
    OC_LOAD_SUB_4x8("%%xmm0")
    OC_SSD_4x8("%%xmm0")
    "paddd %%xmm0,%%xmm7\n\t"
    /*Horizontally add the four dword partial sums in xmm7.*/
    "movdqa %%xmm7,%%xmm6\n\t"
    "punpckhqdq %%xmm7,%%xmm7\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "pshufd $1,%%xmm7,%%xmm6\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "movd %%xmm7,%[ret]\n\t"
    /*OC_LOAD_SUB_4x8 advances %[src] and %[ref] with lea, so they must be
       declared as read-write operands: the compiler is entitled to assume an
       input-only operand still holds its original value after the asm.
      %[ret] is only written by the very last instruction, so it does not need
       an early-clobber and may still share a register with a stride input.*/
    :[ret]"=a"(ret),[src]"+r"(_src),[ref]"+r"(_ref)
    :[ystride]"r"((ptrdiff_t)_ystride),
     [ystride3]"r"((ptrdiff_t)_ystride*3)
    /*The pixel data is read through pointers that are not listed as memory
       operands, hence the "memory" clobber.*/
    :"memory",
     "%xmm0", "%xmm1", "%xmm2", "%xmm3",
     "%xmm4", "%xmm5", "%xmm6", "%xmm7"
  );
  return ret;
}

/*Per-column bit selectors: entry i has only bit i set.
  OC_LOAD_SUB_MASK_2x8 ANDs a replicated mask byte with this table and then
   compares the result against it, turning mask bit i into an all-ones or
   all-zeros byte for pixel i.*/
static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={
  1<<0,1<<1,1<<2,1<<3,1<<4,1<<5,1<<6,1<<7
};

/*Load a 2x8 array of pixels values from %[src] and %[ref] and compute their
   horizontal sums as well as their 16-bit differences subject to a mask.
  %[cr] must contain OC_MASK_CONSTS[0...7] and %[mr] must contain 0.*/
#define OC_LOAD_SUB_MASK_2x8 \
 "#OC_LOAD_SUB_MASK_2x8\n\t" \
 /*Start the loads and expand the next 8 bits of the mask.*/ \
 "shl $8,%[m]\n\t" \
 "movq (%[src]),%%xmm0\n\t" \
 "mov %h[m],%b[m]\n\t" \
 "movq (%[ref]),%%xmm2\n\t" \
 "movd %[m],%%xmm4\n\t" \
 "shr $8,%[m]\n\t" \
 "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
 "mov %h[m],%b[m]\n\t" \
 "pand %[cr],%%xmm4\n\t" \
 "pcmpeqb %[cr],%%xmm4\n\t" \
 /*Perform the masking.*/ \
 "pand %%xmm4,%%xmm0\n\t" \
 "pand %%xmm4,%%xmm2\n\t" \
 /*Finish the loads while unpacking the first set of rows, and expand the next
    8 bits of the mask.*/ \
 "movd %[m],%%xmm4\n\t" \
 "movq (%[src],%[ystride]),%%xmm1\n\t" \
 "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
 "movq (%[ref],%[ystride]),%%xmm3\n\t" \
 "pand %[cr],%%xmm4\n\t" \
 "punpcklbw %%xmm2,%%xmm0\n\t" \
 "pcmpeqb %[cr],%%xmm4\n\t" \
 "punpcklbw %%xmm2,%%xmm2\n\t" \
 /*Mask and unpack the second set of rows.*/ \
 "pand %%xmm4,%%xmm1\n\t" \
 "pand %%xmm4,%%xmm3\n\t" \
 "punpcklbw %%xmm3,%%xmm1\n\t" \
 "punpcklbw %%xmm3,%%xmm3\n\t" \
 "psubw %%xmm2,%%xmm0\n\t" \
 "psubw %%xmm3,%%xmm1\n\t" \

/*Computes the sum of squared differences between two 8x8 blocks of pixels,
   counting only the pixels selected by a mask.
  _src:     The first row of the source block.
  _ref:     The first row of the reference block.
  _ystride: The distance in bytes between rows (shared by both blocks).
  _mask:    One bit per pixel, consumed 16 bits (two rows) at a time starting
             from the least significant bits; within each byte, bit i selects
             pixel i of the row.
  Return: The sum of the squared difference src-ref over the selected pixels.*/
unsigned __attribute__((target("sse2"))) oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
  /*Store intermediate values across __asm__ blocks*/
  register sse2_reg cr;
  register sse2_reg mr;
  ptrdiff_t ystride;
  unsigned  ret;
  int       i;
  ystride=_ystride;
  /*mr is the running accumulator (four dword partial sums), cr holds the
     per-column bit selectors from OC_MASK_CONSTS.*/
  __asm__ __volatile__(
    "pxor %[mr],%[mr]\n\t"
    "movq %[c],%[cr]\n\t"
    :[cr]"=x"(cr), [mr]"=x"(mr)
    :[c]"m"(OC_CONST_ARRAY_OPERAND(unsigned char,OC_MASK_CONSTS,8))
  );
  for(i=0;i<4;i++){
    unsigned m;
    m=_mask&0xFFFF;
    _mask>>=16;
    /*A pair of rows with no selected pixels contributes nothing; skip it.*/
    if(m){
      __asm__ __volatile__(
        OC_LOAD_SUB_MASK_2x8
        "pmaddwd %%xmm0,%%xmm0\n\t"
        "pmaddwd %%xmm1,%%xmm1\n\t"
        "paddd %%xmm0,%[mr]\n\t"
        "paddd %%xmm1,%[mr]\n\t"
        :[src]"+r"(_src),[ref]"+r"(_ref),[ystride]"+r"(ystride),[m]"+Q"(m),[mr]"+x"(mr)
        :[cr]"x"(cr)
        /*The pixel data is read through pointers that are not listed as
           memory operands, hence the "memory" clobber.*/
        :"memory",
         "%xmm0", "%xmm1", "%xmm2", "%xmm3",
         "%xmm4"
      );
    }
    /*The asm above does not advance the pointers, so do it here.*/
    _src+=2*ystride;
    _ref+=2*ystride;
  }
  __asm__ __volatile__(
    /*Horizontally add the four dword partial sums in %[mr].
      punpckhqdq moves the destination's own high quadword into its low
       quadword, so xmm7 must hold a copy of %[mr] first; otherwise the low
       half of the result comes from whatever xmm7 happened to contain.*/
    "movdqa %[mr],%%xmm6\n\t"
    "movdqa %[mr],%%xmm7\n\t"
    "punpckhqdq %%xmm7,%%xmm7\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "pshufd $1,%%xmm7,%%xmm6\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "movd %%xmm7,%[ret]\n\t"
    :[ret]"=a"(ret)
    :[mr]"x"(mr)
    :"%xmm6", "%xmm7"
  );
  return ret;
}


/*Load an 8x8 array of pixel values from %[src] and %[ref] and compute their
   16-bit difference in %%xmm0...%%xmm7.*/
#define OC_LOAD_SUB_8x8 \
 "#OC_LOAD_SUB_8x8\n\t" \
 "movq (%[src]),%%xmm0\n\t" \
 "movq (%[ref]),%%xmm4\n\t" \
 "movq (%[src],%[src_ystride]),%%xmm1\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "movq (%[ref],%[ref_ystride]),%%xmm5\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "movq (%[src]),%%xmm2\n\t" \
 "movq (%[ref]),%%xmm7\n\t" \
 "movq (%[src],%[src_ystride]),%%xmm3\n\t" \
 "movq (%[ref],%[ref_ystride]),%%xmm6\n\t" \
 "punpcklbw %%xmm4,%%xmm0\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "psubw %%xmm4,%%xmm0\n\t" \
 "movq (%[src]),%%xmm4\n\t" \
 "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
 "movq (%[ref]),%%xmm0\n\t" \
 "punpcklbw %%xmm5,%%xmm1\n\t" \
 "punpcklbw %%xmm5,%%xmm5\n\t" \
 "psubw %%xmm5,%%xmm1\n\t" \
 "movq (%[src],%[src_ystride]),%%xmm5\n\t" \
 "punpcklbw %%xmm7,%%xmm2\n\t" \
 "punpcklbw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm7,%%xmm2\n\t" \
 "movq (%[ref],%[ref_ystride]),%%xmm7\n\t" \
 "punpcklbw %%xmm6,%%xmm3\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "punpcklbw %%xmm6,%%xmm6\n\t" \
 "psubw %%xmm6,%%xmm3\n\t" \
 "movq (%[src]),%%xmm6\n\t" \
 "punpcklbw %%xmm0,%%xmm4\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "punpcklbw %%xmm0,%%xmm0\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "psubw %%xmm0,%%xmm4\n\t" \
 "movq (%[ref]),%%xmm0\n\t" \
 "punpcklbw %%xmm7,%%xmm5\n\t" \
 "neg %[src_ystride]\n\t" \
 "punpcklbw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm7,%%xmm5\n\t" \
 "movq (%[src],%[src_ystride]),%%xmm7\n\t" \
 "punpcklbw %%xmm0,%%xmm6\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "punpcklbw %%xmm0,%%xmm0\n\t" \
 "neg %[ref_ystride]\n\t" \
 "psubw %%xmm0,%%xmm6\n\t" \
 "movq (%[ref],%[ref_ystride]),%%xmm0\n\t" \
 "punpcklbw %%xmm0,%%xmm7\n\t" \
 "punpcklbw %%xmm0,%%xmm0\n\t" \
 "psubw %%xmm0,%%xmm7\n\t" \
 "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \

/*Load an 8x8 array of pixel values from %[src] into %%xmm0...%%xmm7.*/
#define OC_LOAD_8x8 \
 "#OC_LOAD_8x8\n\t" \
 "movq (%[src]),%%xmm0\n\t" \
 "movq (%[src],%[ystride]),%%xmm1\n\t" \
 "movq (%[src],%[ystride],2),%%xmm2\n\t" \
 "pxor %%xmm7,%%xmm7\n\t" \
 "movq (%[src],%[ystride3]),%%xmm3\n\t" \
 "punpcklbw %%xmm7,%%xmm0\n\t" \
 "movq (%[src4]),%%xmm4\n\t" \
 "punpcklbw %%xmm7,%%xmm1\n\t" \
 "movq (%[src4],%[ystride]),%%xmm5\n\t" \
 "punpcklbw %%xmm7,%%xmm2\n\t" \
 "movq (%[src4],%[ystride],2),%%xmm6\n\t" \
 "punpcklbw %%xmm7,%%xmm3\n\t" \
 "movq (%[src4],%[ystride3]),%%xmm7\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "punpcklbw %%xmm5,%%xmm5\n\t" \
 "psrlw $8,%%xmm4\n\t" \
 "psrlw $8,%%xmm5\n\t" \
 "punpcklbw %%xmm6,%%xmm6\n\t" \
 "punpcklbw %%xmm7,%%xmm7\n\t" \
 "psrlw $8,%%xmm6\n\t" \
 "psrlw $8,%%xmm7\n\t" \

/*Performs the first two stages of an 8-point 1-D Hadamard transform in place.
  The eight inputs are xmm0...xmm7, each treated as packed 16-bit words.
  Every butterfly on a register pair (a,b) is computed as a+=b; b+=b; b-=a,
   which leaves a+b in a and b-a in b.
  Outputs 1, 3, 4, and 5 from the second stage are negated (which allows us to
   perform this stage in place with no temporary registers).*/
#define OC_HADAMARD_AB_8x8 \
 "#OC_HADAMARD_AB_8x8\n\t" \
 /*Stage A: butterflies on (1,5), (2,6), (3,7), and (0,4).*/ \
 "paddw %%xmm5,%%xmm1\n\t" \
 "paddw %%xmm6,%%xmm2\n\t" \
 "paddw %%xmm5,%%xmm5\n\t" \
 "paddw %%xmm6,%%xmm6\n\t" \
 "psubw %%xmm1,%%xmm5\n\t" \
 "psubw %%xmm2,%%xmm6\n\t" \
 "paddw %%xmm7,%%xmm3\n\t" \
 "paddw %%xmm4,%%xmm0\n\t" \
 "paddw %%xmm7,%%xmm7\n\t" \
 "paddw %%xmm4,%%xmm4\n\t" \
 "psubw %%xmm3,%%xmm7\n\t" \
 "psubw %%xmm0,%%xmm4\n\t" \
 /*Stage B: butterflies on (0,2), (1,3), (4,6), and (5,7).*/ \
 "paddw %%xmm2,%%xmm0\n\t" \
 "paddw %%xmm3,%%xmm1\n\t" \
 "paddw %%xmm6,%%xmm4\n\t" \
 "paddw %%xmm7,%%xmm5\n\t" \
 "paddw %%xmm2,%%xmm2\n\t" \
 "paddw %%xmm3,%%xmm3\n\t" \
 "paddw %%xmm6,%%xmm6\n\t" \
 "paddw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm0,%%xmm2\n\t" \
 "psubw %%xmm1,%%xmm3\n\t" \
 "psubw %%xmm4,%%xmm6\n\t" \
 "psubw %%xmm5,%%xmm7\n\t" \

/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
  The inputs are xmm0...xmm7 as left by OC_HADAMARD_AB_8x8, each treated as
   packed 16-bit words.
  Every butterfly on a register pair (a,b) is computed as a+=b; b+=b; b-=a,
   which leaves a+b in a and b-a in b.
  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
   place with no temporary registers).*/
#define OC_HADAMARD_C_8x8 \
 "#OC_HADAMARD_C_8x8\n\t" \
 /*Stage C: butterflies on (0,1), (2,3), (4,5), and (6,7).*/ \
 "paddw %%xmm1,%%xmm0\n\t" \
 "paddw %%xmm3,%%xmm2\n\t" \
 "paddw %%xmm5,%%xmm4\n\t" \
 "paddw %%xmm7,%%xmm6\n\t" \
 "paddw %%xmm1,%%xmm1\n\t" \
 "paddw %%xmm3,%%xmm3\n\t" \
 "paddw %%xmm5,%%xmm5\n\t" \
 "paddw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm0,%%xmm1\n\t" \
 "psubw %%xmm2,%%xmm3\n\t" \
 "psubw %%xmm4,%%xmm5\n\t" \
 "psubw %%xmm6,%%xmm7\n\t" \

/*Performs an 8-point 1-D Hadamard transform in place.
  This is simply OC_HADAMARD_AB_8x8 followed by OC_HADAMARD_C_8x8, operating on
   the packed 16-bit words in xmm0...xmm7.
  Outputs 1, 2, 4, and 7 are negated (which allows us to perform the transform
   in place with no temporary registers).*/
#define OC_HADAMARD_8x8 \
 OC_HADAMARD_AB_8x8 \
 OC_HADAMARD_C_8x8 \

/*Performs the first part of the final stage of the Hadamard transform and
   summing of absolute values.
  xmm6 and xmm7 are spilled to offsets 0x00 and 0x10 of %[buf] to free up two
   working registers, and are reloaded into xmm5 and xmm3, respectively, at
   the end of this part.
  xmm7 is left holding 0x7FFF in every word for use by the second part.
  At the end of this part, %%xmm1 will contain the DC coefficient of the
   transform.*/
#define OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
 /*We use the fact that \
     (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
    to merge the final butterfly with the abs and the first stage of \
    accumulation. \
   Thus we can avoid using pabsw, which is not available until SSSE3. \
   Emulating pabsw takes 3 instructions, so the straightforward SSE2 \
    implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
    registers). \
   Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
   This implementation is only 26 (+4 for spilling registers).*/ \
 "#OC_HADAMARD_C_ABS_ACCUM_A_8x8\n\t" \
 "movdqa %%xmm7,"OC_MEM_OFFS(0x10,buf)"\n\t" \
 "movdqa %%xmm6,"OC_MEM_OFFS(0x00,buf)"\n\t" \
 /*xmm7={0x7FFF}x8 \
   xmm4=max(abs(xmm4),abs(xmm5))-0x7FFF*/ \
 "pcmpeqb %%xmm7,%%xmm7\n\t" \
 "movdqa %%xmm4,%%xmm6\n\t" \
 "psrlw $1,%%xmm7\n\t" \
 "paddw %%xmm5,%%xmm6\n\t" \
 "pmaxsw %%xmm5,%%xmm4\n\t" \
 "paddsw %%xmm7,%%xmm6\n\t" \
 "psubw %%xmm6,%%xmm4\n\t" \
 /*xmm2=max(abs(xmm2),abs(xmm3))-0x7FFF \
   xmm0=max(abs(xmm0),abs(xmm1))-0x7FFF \
   (both are only started here and are completed in the second part).*/ \
 "movdqa %%xmm2,%%xmm6\n\t" \
 "movdqa %%xmm0,%%xmm5\n\t" \
 "pmaxsw %%xmm3,%%xmm2\n\t" \
 "pmaxsw %%xmm1,%%xmm0\n\t" \
 "paddw %%xmm3,%%xmm6\n\t" \
 "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm3\n\t" \
 "paddw %%xmm5,%%xmm1\n\t" \
 "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm5\n\t" \

/*Performs the second part of the final stage of the Hadamard transform and
   summing of absolute values.*/
#define OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
 "#OC_HADAMARD_C_ABS_ACCUM_B_8x8\n\t" \
 "paddsw %%xmm7,%%xmm6\n\t" \
 "paddsw %%xmm7,%%xmm1\n\t" \
 "psubw %%xmm6,%%xmm2\n\t" \
 "psubw %%xmm1,%%xmm0\n\t" \
 /*xmm7={1}x4 (needed for the horizontal add that follows) \
   xmm0+=xmm2+xmm4+max(abs(xmm3),abs(xmm5))-0x7FFF*/ \
 "movdqa %%xmm3,%%xmm6\n\t" \
 "pmaxsw %%xmm5,%%xmm3\n\t" \
 "paddw %%xmm2,%%xmm0\n\t" \
 "paddw %%xmm5,%%xmm6\n\t" \
 "paddw %%xmm4,%%xmm0\n\t" \
 "paddsw %%xmm7,%%xmm6\n\t" \
 "paddw %%xmm3,%%xmm0\n\t" \
 "psrlw $14,%%xmm7\n\t" \
 "psubw %%xmm6,%%xmm0\n\t" \

/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
   absolute value of each component, and accumulates everything into xmm0.
  This is simply the two halves OC_HADAMARD_C_ABS_ACCUM_A_8x8 and
   OC_HADAMARD_C_ABS_ACCUM_B_8x8 run back to back; callers that need the DC
   coefficient expand the halves separately and read xmm1 in between.*/
#define OC_HADAMARD_C_ABS_ACCUM_8x8 \
 OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
 OC_HADAMARD_C_ABS_ACCUM_B_8x8 \

/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
   component, and accumulates everything into xmm0.
  This is OC_HADAMARD_AB_8x8 followed by OC_HADAMARD_C_ABS_ACCUM_8x8, so it
   also uses %[buf] as spill space.
  Note that xmm0 will have an extra 4 added to each column, and that after
   removing this value, the remainder will be half the conventional value.*/
#define OC_HADAMARD_ABS_ACCUM_8x8 \
 OC_HADAMARD_AB_8x8 \
 OC_HADAMARD_C_ABS_ACCUM_8x8

/*Computes the sum of absolute values of the 2-D Hadamard transform of the
   difference between an 8x8 source block and an 8x8 reference block, which
   may use different row strides.
  _dc:          Receives the low 16 bits of xmm1, sign-extended, as captured in
                 the middle of the final transform stage (the DC coefficient).
  _src:         The first row of the source block.
  _src_ystride: The distance in bytes between source rows.
  _ref:         The first row of the reference block.
  _ref_ystride: The distance in bytes between reference rows.
  Return: The accumulated sum after correcting for the bias and scale
           introduced by the accumulation macros, with abs(dc) removed.*/
static unsigned __attribute__((target("sse2"))) oc_int_frag_satd_sse2(int *_dc,
 const unsigned char *_src,int _src_ystride,
 const unsigned char *_ref,int _ref_ystride){
  /*Spill space for the load and accumulation macros (two 16-byte slots).*/
  OC_ALIGN16(ogg_int16_t buf[16]);
  unsigned ret;
  unsigned ret2;
  int      dc;
  __asm__ __volatile__(
    OC_LOAD_SUB_8x8
    OC_HADAMARD_8x8
    OC_TRANSPOSE_8x8
    /*We split out the stages here so we can save the DC coefficient in the
       middle.*/
    OC_HADAMARD_AB_8x8
    OC_HADAMARD_C_ABS_ACCUM_A_8x8
    "movd %%xmm1,%[dc]\n\t"
    OC_HADAMARD_C_ABS_ACCUM_B_8x8
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
       for the factor of two we dropped + 3 for the vertical accumulation).
      Now we finally have to promote things to dwords.
      We break this part out of OC_HADAMARD_ABS_ACCUM_8x8 to hide the long
       latency of pmaddwd by starting to compute abs(dc) here.*/
    "pmaddwd %%xmm7,%%xmm0\n\t"
    "movsx %w[dc],%[dc]\n\t"
    /*%[dc] is constrained to the a register and %[ret2] to the d register, so
       cdq leaves the sign mask of dc (0 or -1) in %[ret2].*/
    "cdq\n\t"
    "movdqa %%xmm0,%%xmm1\n\t"
    "punpckhqdq %%xmm0,%%xmm0\n\t"
    "paddd %%xmm1,%%xmm0\n\t"
    "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
    "paddd %%xmm1,%%xmm0\n\t"
    "movd %%xmm0,%[ret]\n\t"
    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x8 each have an extra 4
       added to them, a factor of two removed, and the DC value included;
       correct the final sum here.
      With s the sign mask in %[ret2], this computes
       2*ret-64+s-(dc^s)=2*ret-64-abs(dc).*/
    "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
    "xor %[dc],%[ret2]\n\t"
    "sub %[ret2],%[ret]\n\t"
    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
       and %[dc] with some of the inputs, since for once we don't write to
       them until after we're done using everything but %[buf].*/
    /*Note that _src_ystride and _ref_ystride must be given non-overlapping
       constraints, otherwise if gcc can prove they're equal it will allocate
       them to the same register (which is bad); _src and _ref face a similar
       problem.
      All four are destructively modified, but if we list them as output
       constraints, gcc can't alias them with other outputs.*/
    :[ret]"=r"(ret),[ret2]"=d"(ret2),[dc]"=a"(dc),
     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
    :[src]"S"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
     [ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
    /*We have to use neg, so we actually clobber the condition codes for once
       (not to mention sub, and add).*/
    :"cc",
     "%xmm0", "%xmm1", "%xmm2", "%xmm3",
     "%xmm4", "%xmm5", "%xmm6", "%xmm7"
  );
  *_dc=dc;
  return ret;
}

/*Computes the SATD between a source block and a reference block that share a
   single row stride.
  This forwards to oc_int_frag_satd_sse2(), passing _ystride as both the
   source and the reference stride; _dc and the return value are exactly those
   of that function.*/
unsigned __attribute__((target("sse2"))) oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned satd;
  satd=oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);
  return satd;
}

/*Computes the SATD between a source block and a reference built from two
   reference blocks.
  oc_int_frag_copy2_mmxext() fills a packed 8x8 temporary (row stride 8) from
   _ref1 and _ref2 (presumably their average; confirm against its
   definition), and the SATD is then taken against that temporary.
  _dc and the return value are those of oc_int_frag_satd_sse2().*/
unsigned __attribute__((target("sse2"))) oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
  OC_ALIGN8(unsigned char ref[64]);
  unsigned satd;
  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
  satd=oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8);
  return satd;
}

/*Computes the sum of absolute values of the 2-D Hadamard transform of an 8x8
   block of pixels (no reference block is subtracted).
  _dc:      Receives the low 16 bits of xmm1, zero-extended, as captured in the
             middle of the final transform stage (the DC coefficient).
  _src:     The first row of the source block.
  _ystride: The distance in bytes between rows.
  Return: The accumulated sum after correcting for the bias and scale
           introduced by the accumulation macros, with dc removed.*/
unsigned __attribute__((target("sse2"))) oc_enc_frag_intra_satd_sse2(int *_dc,
 const unsigned char *_src,int _ystride){
  /*Spill space for the accumulation macros (two 16-byte slots).*/
  OC_ALIGN16(ogg_int16_t buf[16]);
  unsigned ret;
  int      dc;
  __asm__ __volatile__(
    OC_LOAD_8x8
    OC_HADAMARD_8x8
    OC_TRANSPOSE_8x8
    /*We split out the stages here so we can save the DC coefficient in the
       middle.*/
    OC_HADAMARD_AB_8x8
    OC_HADAMARD_C_ABS_ACCUM_A_8x8
    "movd %%xmm1,%[dc]\n\t"
    OC_HADAMARD_C_ABS_ACCUM_B_8x8
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
       for the factor of two we dropped + 3 for the vertical accumulation).
      Now we finally have to promote things to dwords.*/
    "pmaddwd %%xmm7,%%xmm0\n\t"
    /*We assume that the DC coefficient is always positive (which is true,
       because the input to the INTRA transform was not a difference).*/
    "movzx %w[dc],%[dc]\n\t"
    "movdqa %%xmm0,%%xmm1\n\t"
    "punpckhqdq %%xmm0,%%xmm0\n\t"
    "paddd %%xmm1,%%xmm0\n\t"
    "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
    "paddd %%xmm1,%%xmm0\n\t"
    "movd %%xmm0,%[ret]\n\t"
    /*Undo the dropped factor of two and the accumulated bias, then remove the
       DC term: ret=2*ret-64-dc.*/
    "lea -64(%[ret],%[ret]),%[ret]\n\t"
    "sub %[dc],%[ret]\n\t"
    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
       and %[dc] with some of the inputs, since for once we don't write to
       them until after we're done using everything but %[buf].*/
    :[ret]"=a"(ret),[dc]"=r"(dc),
     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
    /*OC_LOAD_8x8 needs a pointer four rows down and a precomputed 3*stride.*/
    :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
     [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
    /*We have to use sub, so we actually clobber the condition codes for once.*/
    :"cc",
     "%xmm0", "%xmm1", "%xmm2", "%xmm3",
     "%xmm4", "%xmm5", "%xmm6", "%xmm7"
  );
  *_dc=dc;
  return ret;
}

#endif

Coverage Report

Created: 2026-05-23 07:06

Line	Count	Source
1		/********************************************************************
2		* *
3		* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4		* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5		* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6		* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
7		* *
8		* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
9		* by the Xiph.Org Foundation https://www.xiph.org/ *
10		* *
11		********************************************************************
12
13		function:
14
15		********************************************************************/
16		#include <stddef.h>
17		#include "x86enc.h"
18		#include "sse2trans.h"
19
20		#if defined(OC_X86_ASM)
21
22		/*Load a 4x8 array of pixels values from %[src] and %[ref] and compute their
23		16-bit differences.
24		On output, these are stored in _m0, xmm1, xmm2, and xmm3.
25		xmm4 and xmm5 are clobbered.*/
26		#define OC_LOAD_SUB_4x8(_m0) \
27		"#OC_LOAD_SUB_4x8\n\t" \
28		/*Load the first three rows.*/ \
29		"movq (%[src]),"_m0"\n\t" \
30		"movq (%[ref]),%%xmm4\n\t" \
31		"movq (%[src],%[ystride]),%%xmm1\n\t" \
32		"movq (%[ref],%[ystride]),%%xmm3\n\t" \
33		"movq (%[src],%[ystride],2),%%xmm2\n\t" \
34		"movq (%[ref],%[ystride],2),%%xmm5\n\t" \
35		/*Unpack and subtract.*/ \
36		"punpcklbw %%xmm4,"_m0"\n\t" \
37		"punpcklbw %%xmm4,%%xmm4\n\t" \
38		"punpcklbw %%xmm3,%%xmm1\n\t" \
39		"punpcklbw %%xmm3,%%xmm3\n\t" \
40		"psubw %%xmm4,"_m0"\n\t" \
41		"psubw %%xmm3,%%xmm1\n\t" \
42		/*Load the last row.*/ \
43		"movq (%[src],%[ystride3]),%%xmm3\n\t" \
44		"movq (%[ref],%[ystride3]),%%xmm4\n\t" \
45		/*Unpack, subtract, and advance the pointers.*/ \
46		"punpcklbw %%xmm5,%%xmm2\n\t" \
47		"punpcklbw %%xmm5,%%xmm5\n\t" \
48		"lea (%[src],%[ystride],4),%[src]\n\t" \
49		"psubw %%xmm5,%%xmm2\n\t" \
50		"punpcklbw %%xmm4,%%xmm3\n\t" \
51		"punpcklbw %%xmm4,%%xmm4\n\t" \
52		"lea (%[ref],%[ystride],4),%[ref]\n\t" \
53		"psubw %%xmm4,%%xmm3\n\t" \
54
55		/*Square and accumulate four rows of differences in _m0, xmm1, xmm2, and xmm3.
56		On output, xmm0 contains the sum of two of the rows, and the other two are
57		added to xmm7.*/
58		#define OC_SSD_4x8(_m0) \
59		"pmaddwd "_m0","_m0"\n\t" \
60		"pmaddwd %%xmm1,%%xmm1\n\t" \
61		"pmaddwd %%xmm2,%%xmm2\n\t" \
62		"pmaddwd %%xmm3,%%xmm3\n\t" \
63		"paddd %%xmm1,"_m0"\n\t" \
64		"paddd %%xmm3,%%xmm2\n\t" \
65		"paddd %%xmm2,%%xmm7\n\t" \
66
67		unsigned __attribute__((target("sse2"))) oc_enc_frag_ssd_sse2(const unsigned char *_src,
68	2.07M	const unsigned char *_ref,int _ystride){
69	2.07M	unsigned ret;
70	2.07M	__asm__ __volatile__(
71	2.07M	OC_LOAD_SUB_4x8("%%xmm7")
72	2.07M	OC_SSD_4x8("%%xmm7")
73	2.07M	OC_LOAD_SUB_4x8("%%xmm0")
74	2.07M	OC_SSD_4x8("%%xmm0")
75	2.07M	"paddd %%xmm0,%%xmm7\n\t"
76	2.07M	"movdqa %%xmm7,%%xmm6\n\t"
77	2.07M	"punpckhqdq %%xmm7,%%xmm7\n\t"
78	2.07M	"paddd %%xmm6,%%xmm7\n\t"
79	2.07M	"pshufd $1,%%xmm7,%%xmm6\n\t"
80	2.07M	"paddd %%xmm6,%%xmm7\n\t"
81	2.07M	"movd %%xmm7,%[ret]\n\t"
82	2.07M	:[ret]"=a"(ret)
83	2.07M	:[src]"r"(_src),[ref]"r"(_ref),[ystride]"r"((ptrdiff_t)_ystride),
84	2.07M	[ystride3]"r"((ptrdiff_t)_ystride*3)
85	2.07M	:"%xmm0", "%xmm1", "%xmm2", "%xmm3",
86	2.07M	"%xmm4", "%xmm5", "%xmm6", "%xmm7"
87	2.07M	);
88	2.07M	return ret;
89	2.07M	}
90
91		static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={
92		0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80
93		};
94
95		/*Load a 2x8 array of pixels values from %[src] and %[ref] and compute their
96		horizontal sums as well as their 16-bit differences subject to a mask.
97		%[cr] must contain OC_MASK_CONSTS[0...7] and %[mr] must contain 0.*/
98		#define OC_LOAD_SUB_MASK_2x8 \
99		"#OC_LOAD_SUB_MASK_2x8\n\t" \
100		/*Start the loads and expand the next 8 bits of the mask.*/ \
101		"shl $8,%[m]\n\t" \
102		"movq (%[src]),%%xmm0\n\t" \
103		"mov %h[m],%b[m]\n\t" \
104		"movq (%[ref]),%%xmm2\n\t" \
105		"movd %[m],%%xmm4\n\t" \
106		"shr $8,%[m]\n\t" \
107		"pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
108		"mov %h[m],%b[m]\n\t" \
109		"pand %[cr],%%xmm4\n\t" \
110		"pcmpeqb %[cr],%%xmm4\n\t" \
111		/*Perform the masking.*/ \
112		"pand %%xmm4,%%xmm0\n\t" \
113		"pand %%xmm4,%%xmm2\n\t" \
114		/*Finish the loads while unpacking the first set of rows, and expand the next
115		8 bits of the mask.*/ \
116		"movd %[m],%%xmm4\n\t" \
117		"movq (%[src],%[ystride]),%%xmm1\n\t" \
118		"pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
119		"movq (%[ref],%[ystride]),%%xmm3\n\t" \
120		"pand %[cr],%%xmm4\n\t" \
121		"punpcklbw %%xmm2,%%xmm0\n\t" \
122		"pcmpeqb %[cr],%%xmm4\n\t" \
123		"punpcklbw %%xmm2,%%xmm2\n\t" \
124		/*Mask and unpack the second set of rows.*/ \
125		"pand %%xmm4,%%xmm1\n\t" \
126		"pand %%xmm4,%%xmm3\n\t" \
127		"punpcklbw %%xmm3,%%xmm1\n\t" \
128		"punpcklbw %%xmm3,%%xmm3\n\t" \
129		"psubw %%xmm2,%%xmm0\n\t" \
130		"psubw %%xmm3,%%xmm1\n\t" \
131
132		unsigned __attribute__((target("sse2"))) oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
133	1.45M	const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
134	1.45M	ptrdiff_t ystride;
135	1.45M	unsigned ret;
136	1.45M	int i;
137	1.45M	ystride=_ystride;
138		/*Store intermediate values across __asm__ blocks*/
139	1.45M	register sse2_reg cr;
140	1.45M	register sse2_reg mr;
141	1.45M	__asm__ __volatile__(
142	1.45M	"pxor %[mr],%[mr]\n\t"
143	1.45M	"movq %[c],%[cr]\n\t"
144	1.45M	:[cr]"=x"(cr), [mr]"=x"(mr)
145	1.45M	:[c]"m"(OC_CONST_ARRAY_OPERAND(unsigned char,OC_MASK_CONSTS,8))
146	1.45M	);
147	7.27M	for(i=0;i<4;i++){
148	5.82M	unsigned m;
149	5.82M	m=_mask&0xFFFF;
150	5.82M	_mask>>=16;
151	5.82M	if(m){
152	3.43M	__asm__ __volatile__(
153	3.43M	OC_LOAD_SUB_MASK_2x8
154	3.43M	"pmaddwd %%xmm0,%%xmm0\n\t"
155	3.43M	"pmaddwd %%xmm1,%%xmm1\n\t"
156	3.43M	"paddd %%xmm0,%[mr]\n\t"
157	3.43M	"paddd %%xmm1,%[mr]\n\t"
158	3.43M	:[src]"+r"(_src),[ref]"+r"(_ref),[ystride]"+r"(ystride),[m]"+Q"(m),[mr]"+x"(mr)
159	3.43M	:[cr]"x"(cr)
160	3.43M	:"%xmm0", "%xmm1", "%xmm2", "%xmm3",
161	3.43M	"%xmm4"/*, "%xmm5", "%xmm6", "%xmm7"*/
162	3.43M	);
163	3.43M	}
164	5.82M	_src+=2*ystride;
165	5.82M	_ref+=2*ystride;
166	5.82M	}
167	1.45M	__asm__ __volatile__(
168	1.45M	"movdqa %[mr],%%xmm6\n\t"
169	1.45M	"punpckhqdq %[mr],%%xmm7\n\t"
170	1.45M	"paddd %%xmm6,%%xmm7\n\t"
171	1.45M	"pshufd $1,%%xmm7,%%xmm6\n\t"
172	1.45M	"paddd %%xmm6,%%xmm7\n\t"
173	1.45M	"movd %%xmm7,%[ret]\n\t"
174	1.45M	:[ret]"=a"(ret)
175	1.45M	:[mr]"x"(mr)
176	1.45M	:"%xmm6", "%xmm7"
177	1.45M	);
178	1.45M	return ret;
179	1.45M	}
180
181
182		/*Load an 8x8 array of pixel values from %[src] and %[ref] and compute their
183		16-bit difference in %%xmm0...%%xmm7.*/
184		#define OC_LOAD_SUB_8x8 \
185		"#OC_LOAD_SUB_8x8\n\t" \
186		"movq (%[src]),%%xmm0\n\t" \
187		"movq (%[ref]),%%xmm4\n\t" \
188		"movq (%[src],%[src_ystride]),%%xmm1\n\t" \
189		"lea (%[src],%[src_ystride],2),%[src]\n\t" \
190		"movq (%[ref],%[ref_ystride]),%%xmm5\n\t" \
191		"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
192		"movq (%[src]),%%xmm2\n\t" \
193		"movq (%[ref]),%%xmm7\n\t" \
194		"movq (%[src],%[src_ystride]),%%xmm3\n\t" \
195		"movq (%[ref],%[ref_ystride]),%%xmm6\n\t" \
196		"punpcklbw %%xmm4,%%xmm0\n\t" \
197		"lea (%[src],%[src_ystride],2),%[src]\n\t" \
198		"punpcklbw %%xmm4,%%xmm4\n\t" \
199		"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
200		"psubw %%xmm4,%%xmm0\n\t" \
201		"movq (%[src]),%%xmm4\n\t" \
202		"movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
203		"movq (%[ref]),%%xmm0\n\t" \
204		"punpcklbw %%xmm5,%%xmm1\n\t" \
205		"punpcklbw %%xmm5,%%xmm5\n\t" \
206		"psubw %%xmm5,%%xmm1\n\t" \
207		"movq (%[src],%[src_ystride]),%%xmm5\n\t" \
208		"punpcklbw %%xmm7,%%xmm2\n\t" \
209		"punpcklbw %%xmm7,%%xmm7\n\t" \
210		"psubw %%xmm7,%%xmm2\n\t" \
211		"movq (%[ref],%[ref_ystride]),%%xmm7\n\t" \
212		"punpcklbw %%xmm6,%%xmm3\n\t" \
213		"lea (%[src],%[src_ystride],2),%[src]\n\t" \
214		"punpcklbw %%xmm6,%%xmm6\n\t" \
215		"psubw %%xmm6,%%xmm3\n\t" \
216		"movq (%[src]),%%xmm6\n\t" \
217		"punpcklbw %%xmm0,%%xmm4\n\t" \
218		"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
219		"punpcklbw %%xmm0,%%xmm0\n\t" \
220		"lea (%[src],%[src_ystride],2),%[src]\n\t" \
221		"psubw %%xmm0,%%xmm4\n\t" \
222		"movq (%[ref]),%%xmm0\n\t" \
223		"punpcklbw %%xmm7,%%xmm5\n\t" \
224		"neg %[src_ystride]\n\t" \
225		"punpcklbw %%xmm7,%%xmm7\n\t" \
226		"psubw %%xmm7,%%xmm5\n\t" \
227		"movq (%[src],%[src_ystride]),%%xmm7\n\t" \
228		"punpcklbw %%xmm0,%%xmm6\n\t" \
229		"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
230		"punpcklbw %%xmm0,%%xmm0\n\t" \
231		"neg %[ref_ystride]\n\t" \
232		"psubw %%xmm0,%%xmm6\n\t" \
233		"movq (%[ref],%[ref_ystride]),%%xmm0\n\t" \
234		"punpcklbw %%xmm0,%%xmm7\n\t" \
235		"punpcklbw %%xmm0,%%xmm0\n\t" \
236		"psubw %%xmm0,%%xmm7\n\t" \
237		"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \
238
239		/*Load an 8x8 array of pixel values from %[src] into %%xmm0...%%xmm7.*/
240		#define OC_LOAD_8x8 \
241		"#OC_LOAD_8x8\n\t" \
242		"movq (%[src]),%%xmm0\n\t" \
243		"movq (%[src],%[ystride]),%%xmm1\n\t" \
244		"movq (%[src],%[ystride],2),%%xmm2\n\t" \
245		"pxor %%xmm7,%%xmm7\n\t" \
246		"movq (%[src],%[ystride3]),%%xmm3\n\t" \
247		"punpcklbw %%xmm7,%%xmm0\n\t" \
248		"movq (%[src4]),%%xmm4\n\t" \
249		"punpcklbw %%xmm7,%%xmm1\n\t" \
250		"movq (%[src4],%[ystride]),%%xmm5\n\t" \
251		"punpcklbw %%xmm7,%%xmm2\n\t" \
252		"movq (%[src4],%[ystride],2),%%xmm6\n\t" \
253		"punpcklbw %%xmm7,%%xmm3\n\t" \
254		"movq (%[src4],%[ystride3]),%%xmm7\n\t" \
255		"punpcklbw %%xmm4,%%xmm4\n\t" \
256		"punpcklbw %%xmm5,%%xmm5\n\t" \
257		"psrlw $8,%%xmm4\n\t" \
258		"psrlw $8,%%xmm5\n\t" \
259		"punpcklbw %%xmm6,%%xmm6\n\t" \
260		"punpcklbw %%xmm7,%%xmm7\n\t" \
261		"psrlw $8,%%xmm6\n\t" \
262		"psrlw $8,%%xmm7\n\t" \
263
264		/*Performs the first two stages of an 8-point 1-D Hadamard transform in place.
265		Outputs 1, 3, 4, and 5 from the second stage are negated (which allows us to
266		perform this stage in place with no temporary registers).*/
267		#define OC_HADAMARD_AB_8x8 \
268		"#OC_HADAMARD_AB_8x8\n\t" \
269		/*Stage A:*/ \
270		"paddw %%xmm5,%%xmm1\n\t" \
271		"paddw %%xmm6,%%xmm2\n\t" \
272		"paddw %%xmm5,%%xmm5\n\t" \
273		"paddw %%xmm6,%%xmm6\n\t" \
274		"psubw %%xmm1,%%xmm5\n\t" \
275		"psubw %%xmm2,%%xmm6\n\t" \
276		"paddw %%xmm7,%%xmm3\n\t" \
277		"paddw %%xmm4,%%xmm0\n\t" \
278		"paddw %%xmm7,%%xmm7\n\t" \
279		"paddw %%xmm4,%%xmm4\n\t" \
280		"psubw %%xmm3,%%xmm7\n\t" \
281		"psubw %%xmm0,%%xmm4\n\t" \
282		/*Stage B:*/ \
283		"paddw %%xmm2,%%xmm0\n\t" \
284		"paddw %%xmm3,%%xmm1\n\t" \
285		"paddw %%xmm6,%%xmm4\n\t" \
286		"paddw %%xmm7,%%xmm5\n\t" \
287		"paddw %%xmm2,%%xmm2\n\t" \
288		"paddw %%xmm3,%%xmm3\n\t" \
289		"paddw %%xmm6,%%xmm6\n\t" \
290		"paddw %%xmm7,%%xmm7\n\t" \
291		"psubw %%xmm0,%%xmm2\n\t" \
292		"psubw %%xmm1,%%xmm3\n\t" \
293		"psubw %%xmm4,%%xmm6\n\t" \
294		"psubw %%xmm5,%%xmm7\n\t" \
295
296		/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
297		Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
298		place with no temporary registers).*/
299		#define OC_HADAMARD_C_8x8 \
300		"#OC_HADAMARD_C_8x8\n\t" \
301		/*Stage C:*/ \
302		"paddw %%xmm1,%%xmm0\n\t" \
303		"paddw %%xmm3,%%xmm2\n\t" \
304		"paddw %%xmm5,%%xmm4\n\t" \
305		"paddw %%xmm7,%%xmm6\n\t" \
306		"paddw %%xmm1,%%xmm1\n\t" \
307		"paddw %%xmm3,%%xmm3\n\t" \
308		"paddw %%xmm5,%%xmm5\n\t" \
309		"paddw %%xmm7,%%xmm7\n\t" \
310		"psubw %%xmm0,%%xmm1\n\t" \
311		"psubw %%xmm2,%%xmm3\n\t" \
312		"psubw %%xmm4,%%xmm5\n\t" \
313		"psubw %%xmm6,%%xmm7\n\t" \
314
315		/*Performs an 8-point 1-D Hadamard transform in place.
316		Outputs 1, 2, 4, and 7 are negated (which allows us to perform the transform
317		in place with no temporary registers).*/
318		#define OC_HADAMARD_8x8 \
319		OC_HADAMARD_AB_8x8 \
320		OC_HADAMARD_C_8x8 \
321
322		/*Performs the first part of the final stage of the Hadamard transform and
323		summing of absolute values.
324		At the end of this part, %%xmm1 will contain the DC coefficient of the
325		transform.*/
326		#define OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
327		/*We use the fact that \
328		(abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
329		to merge the final butterfly with the abs and the first stage of \
330		accumulation. \
331		Thus we can avoid using pabsw, which is not available until SSSE3. \
332		Emulating pabsw takes 3 instructions, so the straightforward SSE2 \
333		implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
334		registers). \
335		Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
336		This implementation is only 26 (+4 for spilling registers).*/ \
337		"#OC_HADAMARD_C_ABS_ACCUM_A_8x8\n\t" \
338		"movdqa %%xmm7,"OC_MEM_OFFS(0x10,buf)"\n\t" \
339		"movdqa %%xmm6,"OC_MEM_OFFS(0x00,buf)"\n\t" \
340		/*xmm7={0x7FFF}x4 \
341		xmm4=max(abs(xmm4),abs(xmm5))-0x7FFF*/ \
342		"pcmpeqb %%xmm7,%%xmm7\n\t" \
343		"movdqa %%xmm4,%%xmm6\n\t" \
344		"psrlw $1,%%xmm7\n\t" \
345		"paddw %%xmm5,%%xmm6\n\t" \
346		"pmaxsw %%xmm5,%%xmm4\n\t" \
347		"paddsw %%xmm7,%%xmm6\n\t" \
348		"psubw %%xmm6,%%xmm4\n\t" \
349		/*xmm2=max(abs(xmm2),abs(xmm3))-0x7FFF \
350		xmm0=max(abs(xmm0),abs(xmm1))-0x7FFF*/ \
351		"movdqa %%xmm2,%%xmm6\n\t" \
352		"movdqa %%xmm0,%%xmm5\n\t" \
353		"pmaxsw %%xmm3,%%xmm2\n\t" \
354		"pmaxsw %%xmm1,%%xmm0\n\t" \
355		"paddw %%xmm3,%%xmm6\n\t" \
356		"movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm3\n\t" \
357		"paddw %%xmm5,%%xmm1\n\t" \
358		"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm5\n\t" \
359
/*Performs the second part of the final stage of the Hadamard transform and
   summing of absolute values.
  On entry xmm7 must hold 0x7FFF in every 16-bit lane, as left by
   OC_HADAMARD_C_ABS_ACCUM_A_8x8.
  On exit xmm0 holds the accumulated column sums and xmm7 holds 1 in every
   16-bit lane.
  The last line deliberately carries no line continuation, so whatever follows
   the macro in the file can never be spliced into it.*/
#define OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
 "#OC_HADAMARD_C_ABS_ACCUM_B_8x8\n\t" \
 /*Finish the two max-minus-biased-sum terms started in part A.*/ \
 "paddsw %%xmm7,%%xmm6\n\t" \
 "paddsw %%xmm7,%%xmm1\n\t" \
 "psubw %%xmm6,%%xmm2\n\t" \
 "psubw %%xmm1,%%xmm0\n\t" \
 /*xmm7={1}x8 (needed for the horizontal add that follows; it is 0x7FFF \
    shifted right by 14 in each word) \
   xmm0+=xmm2+xmm4+max(abs(xmm3),abs(xmm5))-0x7FFF*/ \
 "movdqa %%xmm3,%%xmm6\n\t" \
 "pmaxsw %%xmm5,%%xmm3\n\t" \
 "paddw %%xmm2,%%xmm0\n\t" \
 "paddw %%xmm5,%%xmm6\n\t" \
 "paddw %%xmm4,%%xmm0\n\t" \
 "paddsw %%xmm7,%%xmm6\n\t" \
 "paddw %%xmm3,%%xmm0\n\t" \
 "psrlw $14,%%xmm7\n\t" \
 "psubw %%xmm6,%%xmm0\n\t"
379
/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
   absolute value of each component, and accumulates everything into xmm0.
  This is the two halves OC_HADAMARD_C_ABS_ACCUM_A_8x8 and
   OC_HADAMARD_C_ABS_ACCUM_B_8x8 back to back; callers that need the DC
   coefficient expand the two halves separately and read xmm1 in between.
  The last line deliberately carries no line continuation, so whatever follows
   the macro in the file can never be spliced into it.*/
#define OC_HADAMARD_C_ABS_ACCUM_8x8 \
 OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
 OC_HADAMARD_C_ABS_ACCUM_B_8x8
385
/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
   component, and accumulates everything into xmm0.
  Note that xmm0 will have an extra 4 added to each column, and that after
   removing this value, the remainder will be half the conventional value.
  This is OC_HADAMARD_AB_8x8 followed by OC_HADAMARD_C_ABS_ACCUM_8x8.*/
#define OC_HADAMARD_ABS_ACCUM_8x8 \
 OC_HADAMARD_AB_8x8 \
 OC_HADAMARD_C_ABS_ACCUM_8x8
393
394		static unsigned __attribute__((target("sse2"))) oc_int_frag_satd_sse2(int *_dc,
395		const unsigned char *_src,int _src_ystride,
396	29.2M	const unsigned char *_ref,int _ref_ystride){
397	29.2M	OC_ALIGN16(ogg_int16_t buf[16]);
398	29.2M	unsigned ret;
399	29.2M	unsigned ret2;
400	29.2M	int dc;
401	29.2M	__asm__ __volatile__(
402	29.2M	OC_LOAD_SUB_8x8
403	29.2M	OC_HADAMARD_8x8
404	29.2M	OC_TRANSPOSE_8x8
405		/*We split out the stages here so we can save the DC coefficient in the
406		middle.*/
407	29.2M	OC_HADAMARD_AB_8x8
408	29.2M	OC_HADAMARD_C_ABS_ACCUM_A_8x8
409	29.2M	"movd %%xmm1,%[dc]\n\t"
410	29.2M	OC_HADAMARD_C_ABS_ACCUM_B_8x8
411		/*Up to this point, everything fit in 16 bits (8 input + 1 for the
412		difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
413		for the factor of two we dropped + 3 for the vertical accumulation).
414		Now we finally have to promote things to dwords.
415		We break this part out of OC_HADAMARD_ABS_ACCUM_8x8 to hide the long
416		latency of pmaddwd by starting to compute abs(dc) here.*/
417	29.2M	"pmaddwd %%xmm7,%%xmm0\n\t"
418	29.2M	"movsx %w[dc],%[dc]\n\t"
419	29.2M	"cdq\n\t"
420	29.2M	"movdqa %%xmm0,%%xmm1\n\t"
421	29.2M	"punpckhqdq %%xmm0,%%xmm0\n\t"
422	29.2M	"paddd %%xmm1,%%xmm0\n\t"
423	29.2M	"pshuflw $0xE,%%xmm0,%%xmm1\n\t"
424	29.2M	"paddd %%xmm1,%%xmm0\n\t"
425	29.2M	"movd %%xmm0,%[ret]\n\t"
426		/*The sums produced by OC_HADAMARD_ABS_ACCUM_8x8 each have an extra 4
427		added to them, a factor of two removed, and the DC value included;
428		correct the final sum here.*/
429	29.2M	"lea -64(%[ret2],%[ret],2),%[ret]\n\t"
430	29.2M	"xor %[dc],%[ret2]\n\t"
431	29.2M	"sub %[ret2],%[ret]\n\t"
432		/*Although it looks like we're using 7 registers here, gcc can alias %[ret]
433		and %[dc] with some of the inputs, since for once we don't write to
434		them until after we're done using everything but %[buf].*/
435		/*Note that _src_ystride and _ref_ystride must be given non-overlapping
436		constraints, otherwise if gcc can prove they're equal it will allocate
437		them to the same register (which is bad); _src and _ref face a similar
438		problem.
439		All four are destructively modified, but if we list them as output
440		constraints, gcc can't alias them with other outputs.*/
441	29.2M	:[ret]"=r"(ret),[ret2]"=d"(ret2),[dc]"=a"(dc),
442	29.2M	[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
443	29.2M	:[src]"S"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
444	29.2M	[ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
445		/*We have to use neg, so we actually clobber the condition codes for once
446		(not to mention sub, and add).*/
447	29.2M	:"cc",
448	29.2M	"%xmm0", "%xmm1", "%xmm2", "%xmm3",
449	29.2M	"%xmm4", "%xmm5", "%xmm6", "%xmm7"
450	29.2M	);
451	29.2M	*_dc=dc;
452	29.2M	return ret;
453	29.2M	}
454
/*Computes the SATD of a source block against a single reference block that
   shares the same row stride.
  This is a thin wrapper around oc_int_frag_satd_sse2().
  _dc:      Receives the DC coefficient of the transform.
  _src:     The source block.
  _ref:     The reference block.
  _ystride: The row stride used for both _src and _ref.
  Return: The value returned by oc_int_frag_satd_sse2().*/
unsigned __attribute__((target("sse2"))) oc_enc_frag_satd_sse2(int *_dc,
 const unsigned char *_src,const unsigned char *_ref,int _ystride){
  return oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);
}
459
460		unsigned __attribute__((target("sse2"))) oc_enc_frag_satd2_sse2(int _dc,const unsigned char _src,
461	14.6M	const unsigned char _ref1,const unsigned char _ref2,int _ystride){
462	14.6M	OC_ALIGN8(unsigned char ref[64]);
463	14.6M	oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
464	14.6M	return oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8);
465	14.6M	}
466
467		unsigned __attribute__((target("sse2"))) oc_enc_frag_intra_satd_sse2(int *_dc,
468	21.1M	const unsigned char *_src,int _ystride){
469	21.1M	OC_ALIGN16(ogg_int16_t buf[16]);
470	21.1M	unsigned ret;
471	21.1M	int dc;
472	21.1M	__asm__ __volatile__(
473	21.1M	OC_LOAD_8x8
474	21.1M	OC_HADAMARD_8x8
475	21.1M	OC_TRANSPOSE_8x8
476		/*We split out the stages here so we can save the DC coefficient in the
477		middle.*/
478	21.1M	OC_HADAMARD_AB_8x8
479	21.1M	OC_HADAMARD_C_ABS_ACCUM_A_8x8
480	21.1M	"movd %%xmm1,%[dc]\n\t"
481	21.1M	OC_HADAMARD_C_ABS_ACCUM_B_8x8
482		/*Up to this point, everything fit in 16 bits (8 input + 1 for the
483		difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
484		for the factor of two we dropped + 3 for the vertical accumulation).
485		Now we finally have to promote things to dwords.*/
486	21.1M	"pmaddwd %%xmm7,%%xmm0\n\t"
487		/*We assume that the DC coefficient is always positive (which is true,
488		because the input to the INTRA transform was not a difference).*/
489	21.1M	"movzx %w[dc],%[dc]\n\t"
490	21.1M	"movdqa %%xmm0,%%xmm1\n\t"
491	21.1M	"punpckhqdq %%xmm0,%%xmm0\n\t"
492	21.1M	"paddd %%xmm1,%%xmm0\n\t"
493	21.1M	"pshuflw $0xE,%%xmm0,%%xmm1\n\t"
494	21.1M	"paddd %%xmm1,%%xmm0\n\t"
495	21.1M	"movd %%xmm0,%[ret]\n\t"
496	21.1M	"lea -64(%[ret],%[ret]),%[ret]\n\t"
497	21.1M	"sub %[dc],%[ret]\n\t"
498		/*Although it looks like we're using 7 registers here, gcc can alias %[ret]
499		and %[dc] with some of the inputs, since for once we don't write to
500		them until after we're done using everything but %[buf].*/
501	21.1M	:[ret]"=a"(ret),[dc]"=r"(dc),
502	21.1M	[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
503	21.1M	:[src]"r"(_src),[src4]"r"(_src+4*_ystride),
504	21.1M	[ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
505		/We have to use sub, so we actually clobber the condition codes for once./
506	21.1M	:"cc",
507	21.1M	"%xmm0", "%xmm1", "%xmm2", "%xmm3",
508	21.1M	"%xmm4", "%xmm5", "%xmm6", "%xmm7"
509	21.1M	);
510	21.1M	*_dc=dc;
511	21.1M	return ret;
512	21.1M	}
513
514		#endif