/src/theora/lib/x86/sse2encfrag.c

Source
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation https://www.xiph.org/                 *
 *                                                                  *
 ********************************************************************

  function:

 ********************************************************************/
#include <stddef.h>
#include "x86enc.h"
#include "sse2trans.h"

#if defined(OC_X86_ASM)

/*Load a 4x8 array of pixel values (four rows of eight bytes) from %[src] and
   %[ref] and compute their 16-bit differences (src minus ref).
  On output, the four rows of differences are stored in _m0, xmm1, xmm2, and
   xmm3, in that order.
  xmm4 and xmm5 are clobbered.
  %[src] and %[ref] are both advanced by four rows (4*%[ystride]).
  %[ystride3] is used to address the last of the four rows, so it must hold
   three times %[ystride].
  No zero register is needed for the widening: interleaving src with ref gives
   words of src+256*ref, interleaving ref with itself gives ref+256*ref, and
   subtracting the second from the first leaves src-ref in every word.*/
#define OC_LOAD_SUB_4x8(_m0) \
 "#OC_LOAD_SUB_4x8\n\t" \
 /*Load the first three rows.*/ \
 "movq (%[src]),"_m0"\n\t" \
 "movq (%[ref]),%%xmm4\n\t" \
 "movq (%[src],%[ystride]),%%xmm1\n\t" \
 "movq (%[ref],%[ystride]),%%xmm3\n\t" \
 "movq (%[src],%[ystride],2),%%xmm2\n\t" \
 "movq (%[ref],%[ystride],2),%%xmm5\n\t" \
 /*Unpack and subtract.*/ \
 "punpcklbw %%xmm4,"_m0"\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "punpcklbw %%xmm3,%%xmm1\n\t" \
 "punpcklbw %%xmm3,%%xmm3\n\t" \
 "psubw %%xmm4,"_m0"\n\t" \
 "psubw %%xmm3,%%xmm1\n\t" \
 /*Load the last row.*/ \
 "movq (%[src],%[ystride3]),%%xmm3\n\t" \
 "movq (%[ref],%[ystride3]),%%xmm4\n\t" \
 /*Unpack, subtract, and advance the pointers.*/ \
 "punpcklbw %%xmm5,%%xmm2\n\t" \
 "punpcklbw %%xmm5,%%xmm5\n\t" \
 "lea (%[src],%[ystride],4),%[src]\n\t" \
 "psubw %%xmm5,%%xmm2\n\t" \
 "punpcklbw %%xmm4,%%xmm3\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "lea (%[ref],%[ystride],4),%[ref]\n\t" \
 "psubw %%xmm4,%%xmm3\n\t" \

/*Square and accumulate four rows of differences in _m0, xmm1, xmm2, and xmm3.
  Each pmaddwd squares the eight words of a row and adds adjacent pairs,
   leaving four 32-bit partial sums per register.
  On output, _m0 contains the sum of the first two rows (_m0 and xmm1), and the
   sum of the other two rows (xmm2 and xmm3) is added to xmm7.
  When _m0 is xmm7 itself, xmm7 ends up holding the sum of all four rows.
  xmm1, xmm2, and xmm3 are overwritten.*/
#define OC_SSD_4x8(_m0) \
 "pmaddwd "_m0","_m0"\n\t" \
 "pmaddwd %%xmm1,%%xmm1\n\t" \
 "pmaddwd %%xmm2,%%xmm2\n\t" \
 "pmaddwd %%xmm3,%%xmm3\n\t" \
 "paddd %%xmm1,"_m0"\n\t" \
 "paddd %%xmm3,%%xmm2\n\t" \
 "paddd %%xmm2,%%xmm7\n\t" \

/*Computes the sum of squared differences between an 8x8 block of source
   pixels and an 8x8 block of reference pixels.
  _src:     The first row of the source block.
  _ref:     The first row of the reference block.
  _ystride: The offset in bytes between rows; used for both blocks.
  Return: The sum over all 64 pixels of the squared difference.
  NOTE(review): the xmm registers written here (xmm0...xmm7) are not listed as
   clobbers.*/
unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned ret;
  __asm__ __volatile__(
    /*Rows 0...3: all four rows are summed into xmm7.*/
    OC_LOAD_SUB_4x8("%%xmm7")
    OC_SSD_4x8("%%xmm7")
    /*Rows 4...7: two rows are summed into xmm0, and the other two are added
       to xmm7.*/
    OC_LOAD_SUB_4x8("%%xmm0")
    OC_SSD_4x8("%%xmm0")
    "paddd %%xmm0,%%xmm7\n\t"
    /*Horizontally add the four 32-bit partial sums in xmm7.*/
    "movdqa %%xmm7,%%xmm6\n\t"
    "punpckhqdq %%xmm7,%%xmm7\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "pshufd $1,%%xmm7,%%xmm6\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "movd %%xmm7,%[ret]\n\t"
    /*OC_LOAD_SUB_4x8 advances %[src] and %[ref] with lea, so they must be
       read-write operands: gcc assumes the register backing an input-only
       operand still holds its original value after the asm.*/
    :[ret]"=a"(ret),[src]"+r"(_src),[ref]"+r"(_ref)
    :[ystride]"r"((ptrdiff_t)_ystride),
     [ystride3]"r"((ptrdiff_t)_ystride*3)
    /*The pixels are read through the pointers rather than through memory
       operands, so gcc must be told that the asm accesses memory.*/
    :"memory"
  );
  return ret;
}

/*Per-byte bit selectors: entry i has only bit i set.
  oc_enc_frag_border_ssd_sse2() loads these 8 bytes into xmm6, where
   OC_LOAD_SUB_MASK_2x8 uses them to turn 8 mask bits into 8 byte masks.*/
static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={
  0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80
};

/*Load a 2x8 array of pixel values from %[src] and %[ref] and compute their
   16-bit differences (src minus ref) subject to a mask.
  %[m] holds 16 mask bits: the low byte selects pixels in the first row and the
   next byte selects pixels in the second row, with bit i of each byte
   corresponding to pixel i of that row.
  Pixels whose bit is clear are zeroed in both src and ref before subtracting,
   so their difference is 0.
  %%xmm6 must contain OC_MASK_CONSTS[0...7] in its low 8 bytes: each mask byte
   is broadcast, ANDed with these constants, and compared against them for
   equality to produce an all-ones byte wherever the bit is set.
  On output, the two rows of differences are in %%xmm0 and %%xmm1.
  %%xmm2, %%xmm3, and %%xmm4 are clobbered, and %[m] is destroyed.
  %[src] and %[ref] are not advanced.*/
#define OC_LOAD_SUB_MASK_2x8 \
 "#OC_LOAD_SUB_MASK_2x8\n\t" \
 /*Start the loads and expand the next 8 bits of the mask.*/ \
 "shl $8,%[m]\n\t" \
 "movq (%[src]),%%xmm0\n\t" \
 "mov %h[m],%b[m]\n\t" \
 "movq (%[ref]),%%xmm2\n\t" \
 "movd %[m],%%xmm4\n\t" \
 "shr $8,%[m]\n\t" \
 "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
 "mov %h[m],%b[m]\n\t" \
 "pand %%xmm6,%%xmm4\n\t" \
 "pcmpeqb %%xmm6,%%xmm4\n\t" \
 /*Perform the masking.*/ \
 "pand %%xmm4,%%xmm0\n\t" \
 "pand %%xmm4,%%xmm2\n\t" \
 /*Finish the loads while unpacking the first set of rows, and expand the next
    8 bits of the mask.*/ \
 "movd %[m],%%xmm4\n\t" \
 "movq (%[src],%[ystride]),%%xmm1\n\t" \
 "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
 "movq (%[ref],%[ystride]),%%xmm3\n\t" \
 "pand %%xmm6,%%xmm4\n\t" \
 "punpcklbw %%xmm2,%%xmm0\n\t" \
 "pcmpeqb %%xmm6,%%xmm4\n\t" \
 "punpcklbw %%xmm2,%%xmm2\n\t" \
 /*Mask and unpack the second set of rows.*/ \
 "pand %%xmm4,%%xmm1\n\t" \
 "pand %%xmm4,%%xmm3\n\t" \
 "punpcklbw %%xmm3,%%xmm1\n\t" \
 "punpcklbw %%xmm3,%%xmm3\n\t" \
 "psubw %%xmm2,%%xmm0\n\t" \
 "psubw %%xmm3,%%xmm1\n\t" \

/*Computes the sum of squared differences between two 8x8 blocks of pixels,
   counting only the pixels selected by a bit mask.
  _src:     The first row of the source block.
  _ref:     The first row of the reference block.
  _ystride: The offset in bytes between rows; used for both blocks.
  _mask:    One bit per pixel: bit 8*row+column selects the pixel at that row
             and column; pixels whose bit is clear contribute 0.
  Return: The sum of the squared differences of the selected pixels.
  NOTE(review): the three asm statements below communicate through xmm6 (the
   mask constants) and xmm7 (the running sum), but no operand or clobber tells
   the compiler that these registers are live between them.*/
unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
  ptrdiff_t ystride;
  unsigned  ret;
  int       i;
  ystride=_ystride;
  /*Clear the accumulator (xmm7) and load the bit selectors into xmm6.*/
  __asm__ __volatile__(
    "pxor %%xmm7,%%xmm7\n\t"
    "movq %[c],%%xmm6\n\t"
    :
    :[c]"m"(OC_CONST_ARRAY_OPERAND(unsigned char,OC_MASK_CONSTS,8))
  );
  /*Process the block two rows (16 mask bits) at a time.*/
  for(i=0;i<4;i++){
    unsigned m;
    m=_mask&0xFFFF;
    _mask>>=16;
    /*Skip the loads entirely when no pixel in this pair of rows is selected.*/
    if(m){
      __asm__ __volatile__(
        OC_LOAD_SUB_MASK_2x8
        "pmaddwd %%xmm0,%%xmm0\n\t"
        "pmaddwd %%xmm1,%%xmm1\n\t"
        "paddd %%xmm0,%%xmm7\n\t"
        "paddd %%xmm1,%%xmm7\n\t"
        :[src]"+r"(_src),[ref]"+r"(_ref),[ystride]"+r"(ystride),[m]"+Q"(m)
      );
    }
    /*The asm does not advance the pointers, so step to the next pair of rows
       here.*/
    _src+=2*ystride;
    _ref+=2*ystride;
  }
  /*Horizontally add the four 32-bit partial sums in xmm7.*/
  __asm__ __volatile__(
    "movdqa %%xmm7,%%xmm6\n\t"
    "punpckhqdq %%xmm7,%%xmm7\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "pshufd $1,%%xmm7,%%xmm6\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "movd %%xmm7,%[ret]\n\t"
    :[ret]"=a"(ret)
  );
  return ret;
}


/*Load an 8x8 array of pixel values from %[src] and %[ref] and compute their
   16-bit difference (src minus ref) in %%xmm0...%%xmm7, one row per register.
  %[src] and %[ref] use separate strides, %[src_ystride] and %[ref_ystride].
  All four operands are modified: each pointer is advanced two rows at a time
   until it is eight rows past its starting value, and each stride is negated
   so that the last (eighth) row can be addressed relative to that position.
  Row 0's differences are parked in the first 16 bytes of %[buf] while xmm0 is
   reused as a scratch register, and are reloaded at the end (OC_MEM_OFFS is
   defined elsewhere; presumably it forms the address of a byte offset into
   %[buf]).*/
#define OC_LOAD_SUB_8x8 \
 "#OC_LOAD_SUB_8x8\n\t" \
 "movq (%[src]),%%xmm0\n\t" \
 "movq (%[ref]),%%xmm4\n\t" \
 "movq (%[src],%[src_ystride]),%%xmm1\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "movq (%[ref],%[ref_ystride]),%%xmm5\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "movq (%[src]),%%xmm2\n\t" \
 "movq (%[ref]),%%xmm7\n\t" \
 "movq (%[src],%[src_ystride]),%%xmm3\n\t" \
 "movq (%[ref],%[ref_ystride]),%%xmm6\n\t" \
 "punpcklbw %%xmm4,%%xmm0\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "psubw %%xmm4,%%xmm0\n\t" \
 "movq (%[src]),%%xmm4\n\t" \
 "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
 "movq (%[ref]),%%xmm0\n\t" \
 "punpcklbw %%xmm5,%%xmm1\n\t" \
 "punpcklbw %%xmm5,%%xmm5\n\t" \
 "psubw %%xmm5,%%xmm1\n\t" \
 "movq (%[src],%[src_ystride]),%%xmm5\n\t" \
 "punpcklbw %%xmm7,%%xmm2\n\t" \
 "punpcklbw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm7,%%xmm2\n\t" \
 "movq (%[ref],%[ref_ystride]),%%xmm7\n\t" \
 "punpcklbw %%xmm6,%%xmm3\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "punpcklbw %%xmm6,%%xmm6\n\t" \
 "psubw %%xmm6,%%xmm3\n\t" \
 "movq (%[src]),%%xmm6\n\t" \
 "punpcklbw %%xmm0,%%xmm4\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "punpcklbw %%xmm0,%%xmm0\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "psubw %%xmm0,%%xmm4\n\t" \
 "movq (%[ref]),%%xmm0\n\t" \
 "punpcklbw %%xmm7,%%xmm5\n\t" \
 "neg %[src_ystride]\n\t" \
 "punpcklbw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm7,%%xmm5\n\t" \
 "movq (%[src],%[src_ystride]),%%xmm7\n\t" \
 "punpcklbw %%xmm0,%%xmm6\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "punpcklbw %%xmm0,%%xmm0\n\t" \
 "neg %[ref_ystride]\n\t" \
 "psubw %%xmm0,%%xmm6\n\t" \
 "movq (%[ref],%[ref_ystride]),%%xmm0\n\t" \
 "punpcklbw %%xmm0,%%xmm7\n\t" \
 "punpcklbw %%xmm0,%%xmm0\n\t" \
 "psubw %%xmm0,%%xmm7\n\t" \
 "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \

/*Load an 8x8 array of pixel values from %[src] into %%xmm0...%%xmm7, one row
   per register, zero-extending each byte to a 16-bit word.
  %[src] addresses rows 0...3 and %[src4] addresses rows 4...7, each using
   %[ystride] and %[ystride3] to reach the second through fourth row of its
   group.
  None of the address operands is modified.
  Rows 0...3 are widened by interleaving with a zeroed xmm7.
  Once xmm7 has been reused for the last row there is no zero register left, so
   rows 4...7 are instead interleaved with themselves and shifted right by 8
   bits.*/
#define OC_LOAD_8x8 \
 "#OC_LOAD_8x8\n\t" \
 "movq (%[src]),%%xmm0\n\t" \
 "movq (%[src],%[ystride]),%%xmm1\n\t" \
 "movq (%[src],%[ystride],2),%%xmm2\n\t" \
 "pxor %%xmm7,%%xmm7\n\t" \
 "movq (%[src],%[ystride3]),%%xmm3\n\t" \
 "punpcklbw %%xmm7,%%xmm0\n\t" \
 "movq (%[src4]),%%xmm4\n\t" \
 "punpcklbw %%xmm7,%%xmm1\n\t" \
 "movq (%[src4],%[ystride]),%%xmm5\n\t" \
 "punpcklbw %%xmm7,%%xmm2\n\t" \
 "movq (%[src4],%[ystride],2),%%xmm6\n\t" \
 "punpcklbw %%xmm7,%%xmm3\n\t" \
 "movq (%[src4],%[ystride3]),%%xmm7\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "punpcklbw %%xmm5,%%xmm5\n\t" \
 "psrlw $8,%%xmm4\n\t" \
 "psrlw $8,%%xmm5\n\t" \
 "punpcklbw %%xmm6,%%xmm6\n\t" \
 "punpcklbw %%xmm7,%%xmm7\n\t" \
 "psrlw $8,%%xmm6\n\t" \
 "psrlw $8,%%xmm7\n\t" \

/*Performs the first two stages of an 8-point 1-D Hadamard transform in place.
  The transform runs across the registers %%xmm0...%%xmm7, independently for
   each of their eight 16-bit lanes.
  Every butterfly on a register pair (a,b) is computed as a+=b, b+=b, b-=a,
   which leaves the sum in a and the difference with the opposite sign of the
   usual a-b (that is, b-a) in b.
  Stage A pairs registers (1,5), (2,6), (3,7), and (0,4); stage B pairs
   (0,2), (1,3), (4,6), and (5,7).
  Outputs 1, 3, 4, and 5 from the second stage are negated (which allows us to
   perform this stage in place with no temporary registers).*/
#define OC_HADAMARD_AB_8x8 \
 "#OC_HADAMARD_AB_8x8\n\t" \
 /*Stage A:*/ \
 "paddw %%xmm5,%%xmm1\n\t" \
 "paddw %%xmm6,%%xmm2\n\t" \
 "paddw %%xmm5,%%xmm5\n\t" \
 "paddw %%xmm6,%%xmm6\n\t" \
 "psubw %%xmm1,%%xmm5\n\t" \
 "psubw %%xmm2,%%xmm6\n\t" \
 "paddw %%xmm7,%%xmm3\n\t" \
 "paddw %%xmm4,%%xmm0\n\t" \
 "paddw %%xmm7,%%xmm7\n\t" \
 "paddw %%xmm4,%%xmm4\n\t" \
 "psubw %%xmm3,%%xmm7\n\t" \
 "psubw %%xmm0,%%xmm4\n\t" \
 /*Stage B:*/ \
 "paddw %%xmm2,%%xmm0\n\t" \
 "paddw %%xmm3,%%xmm1\n\t" \
 "paddw %%xmm6,%%xmm4\n\t" \
 "paddw %%xmm7,%%xmm5\n\t" \
 "paddw %%xmm2,%%xmm2\n\t" \
 "paddw %%xmm3,%%xmm3\n\t" \
 "paddw %%xmm6,%%xmm6\n\t" \
 "paddw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm0,%%xmm2\n\t" \
 "psubw %%xmm1,%%xmm3\n\t" \
 "psubw %%xmm4,%%xmm6\n\t" \
 "psubw %%xmm5,%%xmm7\n\t" \

/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
  This stage pairs adjacent registers: (0,1), (2,3), (4,5), and (6,7).
  Each pair (a,b) is updated as a+=b, b+=b, b-=a, so a holds the sum and b
   holds the difference with the opposite sign of the usual a-b.
  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
   place with no temporary registers).*/
#define OC_HADAMARD_C_8x8 \
 "#OC_HADAMARD_C_8x8\n\t" \
 /*Stage C:*/ \
 "paddw %%xmm1,%%xmm0\n\t" \
 "paddw %%xmm3,%%xmm2\n\t" \
 "paddw %%xmm5,%%xmm4\n\t" \
 "paddw %%xmm7,%%xmm6\n\t" \
 "paddw %%xmm1,%%xmm1\n\t" \
 "paddw %%xmm3,%%xmm3\n\t" \
 "paddw %%xmm5,%%xmm5\n\t" \
 "paddw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm0,%%xmm1\n\t" \
 "psubw %%xmm2,%%xmm3\n\t" \
 "psubw %%xmm4,%%xmm5\n\t" \
 "psubw %%xmm6,%%xmm7\n\t" \

/*Performs an 8-point 1-D Hadamard transform in place on %%xmm0...%%xmm7 by
   running stages A and B (OC_HADAMARD_AB_8x8) followed by stage C
   (OC_HADAMARD_C_8x8).
  Outputs 1, 2, 4, and 7 are negated (which allows us to perform the transform
   in place with no temporary registers).*/
#define OC_HADAMARD_8x8 \
 OC_HADAMARD_AB_8x8 \
 OC_HADAMARD_C_8x8 \

/*Performs the first part of the final stage of the Hadamard transform and
   summing of absolute values.
  %%xmm7 and %%xmm6 are spilled to %[buf] (at byte offsets 0x10 and 0x00) to
   free them up as scratch registers, and are reloaded into %%xmm3 and %%xmm5,
   respectively, for the second part to consume.
  On exit %%xmm7 holds 0x7FFF in every word, which the second part relies on.
  At the end of this part, %%xmm1 will contain the DC coefficient of the
   transform.*/
#define OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
 /*We use the fact that \
     (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
    to merge the final butterfly with the abs and the first stage of \
    accumulation. \
   Thus we can avoid using pabsw, which is not available until SSSE3. \
   Emulating pabsw takes 3 instructions, so the straightforward SSE2 \
    implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
    registers). \
   Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
   This implementation is only 26 (+4 for spilling registers).*/ \
 "#OC_HADAMARD_C_ABS_ACCUM_A_8x8\n\t" \
 "movdqa %%xmm7,"OC_MEM_OFFS(0x10,buf)"\n\t" \
 "movdqa %%xmm6,"OC_MEM_OFFS(0x00,buf)"\n\t" \
 /*xmm7={0x7FFF}x8 \
   xmm4=max(abs(xmm4),abs(xmm5))-0x7FFF \
   This is computed as max(a,b)-sat(a+b+0x7FFF): the saturating add clamps \
    to 0x7FFF when a+b>=0 and otherwise leaves a+b+0x7FFF.*/ \
 "pcmpeqb %%xmm7,%%xmm7\n\t" \
 "movdqa %%xmm4,%%xmm6\n\t" \
 "psrlw $1,%%xmm7\n\t" \
 "paddw %%xmm5,%%xmm6\n\t" \
 "pmaxsw %%xmm5,%%xmm4\n\t" \
 "paddsw %%xmm7,%%xmm6\n\t" \
 "psubw %%xmm6,%%xmm4\n\t" \
 /*xmm2=max(abs(xmm2),abs(xmm3))-0x7FFF \
   xmm0=max(abs(xmm0),abs(xmm1))-0x7FFF \
   Only the max and the plain sum are formed here; the saturating add and \
    the subtract are done in the second part.*/ \
 "movdqa %%xmm2,%%xmm6\n\t" \
 "movdqa %%xmm0,%%xmm5\n\t" \
 "pmaxsw %%xmm3,%%xmm2\n\t" \
 "pmaxsw %%xmm1,%%xmm0\n\t" \
 "paddw %%xmm3,%%xmm6\n\t" \
 "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm3\n\t" \
 "paddw %%xmm5,%%xmm1\n\t" \
 "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm5\n\t" \

/*Performs the second part of the final stage of the Hadamard transform and
   summing of absolute values.
  On entry this expects the register state left by
   OC_HADAMARD_C_ABS_ACCUM_A_8x8, including 0x7FFF in every word of %%xmm7.
  On exit %%xmm0 holds the accumulated sums and %%xmm7 holds 1 in every word.
  %%xmm1 is overwritten, so the DC coefficient must be saved before this
   runs.*/
#define OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
 "#OC_HADAMARD_C_ABS_ACCUM_B_8x8\n\t" \
 "paddsw %%xmm7,%%xmm6\n\t" \
 "paddsw %%xmm7,%%xmm1\n\t" \
 "psubw %%xmm6,%%xmm2\n\t" \
 "psubw %%xmm1,%%xmm0\n\t" \
 /*xmm7={1}x8 (needed for the horizontal add that follows) \
   xmm0+=xmm2+xmm4+max(abs(xmm3),abs(xmm5))-0x7FFF*/ \
 "movdqa %%xmm3,%%xmm6\n\t" \
 "pmaxsw %%xmm5,%%xmm3\n\t" \
 "paddw %%xmm2,%%xmm0\n\t" \
 "paddw %%xmm5,%%xmm6\n\t" \
 "paddw %%xmm4,%%xmm0\n\t" \
 "paddsw %%xmm7,%%xmm6\n\t" \
 "paddw %%xmm3,%%xmm0\n\t" \
 "psrlw $14,%%xmm7\n\t" \
 "psubw %%xmm6,%%xmm0\n\t" \

/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
   absolute value of each component, and accumulates everything into xmm0.
  This is simply part A followed immediately by part B; use the two parts
   separately when the DC coefficient must be read out of xmm1 in between.*/
#define OC_HADAMARD_C_ABS_ACCUM_8x8 \
 OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
 OC_HADAMARD_C_ABS_ACCUM_B_8x8 \

/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
   component, and accumulates everything into xmm0.
  Note that xmm0 will have an extra 4 added to each column, and that after
   removing this value, the remainder will be half the conventional value.
  The extra 4 arises because each column sums four terms of the form
   max(abs(a),abs(b))-0x7FFF, and 4*(-0x7FFF) wraps to +4 in 16 bits.
  The halving is because max(abs(a),abs(b)) equals (abs(a+b)+abs(a-b))/2.*/
#define OC_HADAMARD_ABS_ACCUM_8x8 \
 OC_HADAMARD_AB_8x8 \
 OC_HADAMARD_C_ABS_ACCUM_8x8

/*Computes a Hadamard-transform-based sum of absolute transformed differences
   between an 8x8 source block and an 8x8 reference block, each with its own
   row stride.
  The differences are transformed along one dimension, transposed (by
   OC_TRANSPOSE_8x8, which is defined elsewhere), and transformed again, with
   the absolute values summed as part of the last stage.
  _dc:          Receives the DC coefficient of the transformed differences,
                 sign-extended from 16 bits.
  _src:         The first row of the source block.
  _src_ystride: The offset in bytes between source rows.
  _ref:         The first row of the reference block.
  _ref_ystride: The offset in bytes between reference rows.
  Return: Twice the accumulated sum, minus 64, minus abs(dc); that is, the
           accumulated value with its bias and scaling corrected and with the
           magnitude of the DC term removed.*/
static unsigned oc_int_frag_satd_sse2(int *_dc,
 const unsigned char *_src,int _src_ystride,
 const unsigned char *_ref,int _ref_ystride){
  OC_ALIGN16(ogg_int16_t buf[16]);
  unsigned ret;
  unsigned ret2;
  int      dc;
  __asm__ __volatile__(
    OC_LOAD_SUB_8x8
    OC_HADAMARD_8x8
    OC_TRANSPOSE_8x8
    /*We split out the stages here so we can save the DC coefficient in the
       middle.*/
    OC_HADAMARD_AB_8x8
    OC_HADAMARD_C_ABS_ACCUM_A_8x8
    "movd %%xmm1,%[dc]\n\t"
    OC_HADAMARD_C_ABS_ACCUM_B_8x8
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
       for the factor of two we dropped + 3 for the vertical accumulation).
      Now we finally have to promote things to dwords.
      We break this part out of OC_HADAMARD_ABS_ACCUM_8x8 to hide the long
       latency of pmaddwd by starting to compute abs(dc) here.
      xmm7 holds 1 in every word at this point, so pmaddwd adds adjacent pairs
       of words into dwords.
      cdq requires %[dc] to be in eax and fills edx (%[ret2]) with its sign:
       0 if dc is non-negative, and -1 otherwise.*/
    "pmaddwd %%xmm7,%%xmm0\n\t"
    "movsx %w[dc],%[dc]\n\t"
    "cdq\n\t"
    "movdqa %%xmm0,%%xmm1\n\t"
    "punpckhqdq %%xmm0,%%xmm0\n\t"
    "paddd %%xmm1,%%xmm0\n\t"
    "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
    "paddd %%xmm1,%%xmm0\n\t"
    "movd %%xmm0,%[ret]\n\t"
    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x8 each have an extra 4
       added to them, a factor of two removed, and the DC value included;
       correct the final sum here.
      With s the sign word in %[ret2], abs(dc)=(dc^s)-s, so the three
       instructions below compute 2*ret-64+s-(dc^s)=2*ret-64-abs(dc).*/
    "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
    "xor %[dc],%[ret2]\n\t"
    "sub %[ret2],%[ret]\n\t"
    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
       and %[dc] with some of the inputs, since for once we don't write to
       them until after we're done using everything but %[buf].*/
    /*Note that _src_ystride and _ref_ystride must be given non-overlapping
       constraints, otherwise if gcc can prove they're equal it will allocate
       them to the same register (which is bad); _src and _ref face a similar
       problem.
      All four are destructively modified, but if we list them as output
       constraints, gcc can't alias them with other outputs.*/
    /*NOTE(review): %[ref] and %[ref_ystride] share eax and edx with the
       outputs %[dc] and %[ret2], so gcc knows those registers change.
      %[src] (esi) and %[src_ystride] (ecx) are also modified but are
       input-only, and nothing guarantees that %[ret] is allocated to either of
       them; confirm this is safe with the compilers in use.*/
    :[ret]"=r"(ret),[ret2]"=d"(ret2),[dc]"=a"(dc),
     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
    :[src]"S"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
     [ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
    /*We have to use neg, so we actually clobber the condition codes for once
       (not to mention sub, and add).*/
    :"cc"
  );
  *_dc=dc;
  return ret;
}

/*Computes the SATD between an 8x8 source block and a single 8x8 reference
   block when both are laid out with the same row stride.
  This forwards to oc_int_frag_satd_sse2(), passing _ystride as both the source
   stride and the reference stride; _dc and the return value are exactly what
   that function produces.*/
unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned satd;
  satd=oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);
  return satd;
}

unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
  OC_ALIGN8(unsigned char ref[64]);
  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
  return oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8);
}

/*Computes a Hadamard-transform-based sum of absolute transformed values for a
   single 8x8 block of pixels (no reference block is subtracted).
  The pixels are transformed along one dimension, transposed (by
   OC_TRANSPOSE_8x8, which is defined elsewhere), and transformed again, with
   the absolute values summed as part of the last stage.
  _dc:      Receives the DC coefficient of the transform, zero-extended from
             16 bits.
  _src:     The first row of the block.
  _ystride: The offset in bytes between rows.
  Return: Twice the accumulated sum, minus 64, minus dc; that is, the
           accumulated value with its bias and scaling corrected and with the
           DC term removed.*/
unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
 const unsigned char *_src,int _ystride){
  OC_ALIGN16(ogg_int16_t buf[16]);
  unsigned ret;
  int      dc;
  __asm__ __volatile__(
    OC_LOAD_8x8
    OC_HADAMARD_8x8
    OC_TRANSPOSE_8x8
    /*We split out the stages here so we can save the DC coefficient in the
       middle.*/
    OC_HADAMARD_AB_8x8
    OC_HADAMARD_C_ABS_ACCUM_A_8x8
    "movd %%xmm1,%[dc]\n\t"
    OC_HADAMARD_C_ABS_ACCUM_B_8x8
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
       for the factor of two we dropped + 3 for the vertical accumulation).
      Now we finally have to promote things to dwords.
      xmm7 holds 1 in every word at this point, so pmaddwd adds adjacent pairs
       of words into dwords.*/
    "pmaddwd %%xmm7,%%xmm0\n\t"
    /*We assume that the DC coefficient is always positive (which is true,
       because the input to the INTRA transform was not a difference).*/
    "movzx %w[dc],%[dc]\n\t"
    "movdqa %%xmm0,%%xmm1\n\t"
    "punpckhqdq %%xmm0,%%xmm0\n\t"
    "paddd %%xmm1,%%xmm0\n\t"
    "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
    "paddd %%xmm1,%%xmm0\n\t"
    "movd %%xmm0,%[ret]\n\t"
    /*Undo the halving and the extra 4 per column (8 columns, doubled, gives
       64), then remove the DC term.*/
    "lea -64(%[ret],%[ret]),%[ret]\n\t"
    "sub %[dc],%[ret]\n\t"
    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
       and %[dc] with some of the inputs, since for once we don't write to
       them until after we're done using everything but %[buf].*/
    :[ret]"=a"(ret),[dc]"=r"(dc),
     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
    /*%[src4] points at row 4 so that every row is reachable using only
       %[ystride] and %[ystride3], without modifying any operand.*/
    :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
     [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
    /*We have to use sub, so we actually clobber the condition codes for once.*/
    :"cc"
  );
  *_dc=dc;
  return ret;
}

#endif

Coverage Report

Created: 2025-08-28 07:12

Line	Count	Source
1		/********************************************************************
2		* *
3		* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4		* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5		* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6		* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
7		* *
8		* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
9		* by the Xiph.Org Foundation https://www.xiph.org/ *
10		* *
11		********************************************************************
12
13		function:
14
15		********************************************************************/
16		#include <stddef.h>
17		#include "x86enc.h"
18		#include "sse2trans.h"
19
20		#if defined(OC_X86_ASM)
21
22		/*Load a 4x8 array of pixels values from %[src] and %[ref] and compute their
23		16-bit differences.
24		On output, these are stored in _m0, xmm1, xmm2, and xmm3.
25		xmm4 and xmm5 are clobbered.*/
26		#define OC_LOAD_SUB_4x8(_m0) \
27		"#OC_LOAD_SUB_4x8\n\t" \
28		/*Load the first three rows.*/ \
29		"movq (%[src]),"_m0"\n\t" \
30		"movq (%[ref]),%%xmm4\n\t" \
31		"movq (%[src],%[ystride]),%%xmm1\n\t" \
32		"movq (%[ref],%[ystride]),%%xmm3\n\t" \
33		"movq (%[src],%[ystride],2),%%xmm2\n\t" \
34		"movq (%[ref],%[ystride],2),%%xmm5\n\t" \
35		/*Unpack and subtract.*/ \
36		"punpcklbw %%xmm4,"_m0"\n\t" \
37		"punpcklbw %%xmm4,%%xmm4\n\t" \
38		"punpcklbw %%xmm3,%%xmm1\n\t" \
39		"punpcklbw %%xmm3,%%xmm3\n\t" \
40		"psubw %%xmm4,"_m0"\n\t" \
41		"psubw %%xmm3,%%xmm1\n\t" \
42		/*Load the last row.*/ \
43		"movq (%[src],%[ystride3]),%%xmm3\n\t" \
44		"movq (%[ref],%[ystride3]),%%xmm4\n\t" \
45		/*Unpack, subtract, and advance the pointers.*/ \
46		"punpcklbw %%xmm5,%%xmm2\n\t" \
47		"punpcklbw %%xmm5,%%xmm5\n\t" \
48		"lea (%[src],%[ystride],4),%[src]\n\t" \
49		"psubw %%xmm5,%%xmm2\n\t" \
50		"punpcklbw %%xmm4,%%xmm3\n\t" \
51		"punpcklbw %%xmm4,%%xmm4\n\t" \
52		"lea (%[ref],%[ystride],4),%[ref]\n\t" \
53		"psubw %%xmm4,%%xmm3\n\t" \
54
55		/*Square and accumulate four rows of differences in _m0, xmm1, xmm2, and xmm3.
56		On output, xmm0 contains the sum of two of the rows, and the other two are
57		added to xmm7.*/
58		#define OC_SSD_4x8(_m0) \
59		"pmaddwd "_m0","_m0"\n\t" \
60		"pmaddwd %%xmm1,%%xmm1\n\t" \
61		"pmaddwd %%xmm2,%%xmm2\n\t" \
62		"pmaddwd %%xmm3,%%xmm3\n\t" \
63		"paddd %%xmm1,"_m0"\n\t" \
64		"paddd %%xmm3,%%xmm2\n\t" \
65		"paddd %%xmm2,%%xmm7\n\t" \
66
67		unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
68	2.05M	const unsigned char *_ref,int _ystride){
69	2.05M	unsigned ret;
70	2.05M	__asm__ __volatile__(
71	2.05M	OC_LOAD_SUB_4x8("%%xmm7")
72	2.05M	OC_SSD_4x8("%%xmm7")
73	2.05M	OC_LOAD_SUB_4x8("%%xmm0")
74	2.05M	OC_SSD_4x8("%%xmm0")
75	2.05M	"paddd %%xmm0,%%xmm7\n\t"
76	2.05M	"movdqa %%xmm7,%%xmm6\n\t"
77	2.05M	"punpckhqdq %%xmm7,%%xmm7\n\t"
78	2.05M	"paddd %%xmm6,%%xmm7\n\t"
79	2.05M	"pshufd $1,%%xmm7,%%xmm6\n\t"
80	2.05M	"paddd %%xmm6,%%xmm7\n\t"
81	2.05M	"movd %%xmm7,%[ret]\n\t"
82	2.05M	:[ret]"=a"(ret)
83	2.05M	:[src]"r"(_src),[ref]"r"(_ref),[ystride]"r"((ptrdiff_t)_ystride),
84	2.05M	[ystride3]"r"((ptrdiff_t)_ystride*3)
85	2.05M	);
86	2.05M	return ret;
87	2.05M	}
88
89		static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={
90		0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80
91		};
92
93		/*Load a 2x8 array of pixels values from %[src] and %[ref] and compute their
94		horizontal sums as well as their 16-bit differences subject to a mask.
95		%%xmm5 must contain OC_MASK_CONSTS[0...7] and %%xmm6 must contain 0.*/
96		#define OC_LOAD_SUB_MASK_2x8 \
97		"#OC_LOAD_SUB_MASK_2x8\n\t" \
98		/*Start the loads and expand the next 8 bits of the mask.*/ \
99		"shl $8,%[m]\n\t" \
100		"movq (%[src]),%%xmm0\n\t" \
101		"mov %h[m],%b[m]\n\t" \
102		"movq (%[ref]),%%xmm2\n\t" \
103		"movd %[m],%%xmm4\n\t" \
104		"shr $8,%[m]\n\t" \
105		"pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
106		"mov %h[m],%b[m]\n\t" \
107		"pand %%xmm6,%%xmm4\n\t" \
108		"pcmpeqb %%xmm6,%%xmm4\n\t" \
109		/*Perform the masking.*/ \
110		"pand %%xmm4,%%xmm0\n\t" \
111		"pand %%xmm4,%%xmm2\n\t" \
112		/*Finish the loads while unpacking the first set of rows, and expand the next
113		8 bits of the mask.*/ \
114		"movd %[m],%%xmm4\n\t" \
115		"movq (%[src],%[ystride]),%%xmm1\n\t" \
116		"pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
117		"movq (%[ref],%[ystride]),%%xmm3\n\t" \
118		"pand %%xmm6,%%xmm4\n\t" \
119		"punpcklbw %%xmm2,%%xmm0\n\t" \
120		"pcmpeqb %%xmm6,%%xmm4\n\t" \
121		"punpcklbw %%xmm2,%%xmm2\n\t" \
122		/*Mask and unpack the second set of rows.*/ \
123		"pand %%xmm4,%%xmm1\n\t" \
124		"pand %%xmm4,%%xmm3\n\t" \
125		"punpcklbw %%xmm3,%%xmm1\n\t" \
126		"punpcklbw %%xmm3,%%xmm3\n\t" \
127		"psubw %%xmm2,%%xmm0\n\t" \
128		"psubw %%xmm3,%%xmm1\n\t" \
129
130		unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
131	1.48M	const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
132	1.48M	ptrdiff_t ystride;
133	1.48M	unsigned ret;
134	1.48M	int i;
135	1.48M	ystride=_ystride;
136	1.48M	__asm__ __volatile__(
137	1.48M	"pxor %%xmm7,%%xmm7\n\t"
138	1.48M	"movq %[c],%%xmm6\n\t"
139	1.48M	:
140	1.48M	:[c]"m"(OC_CONST_ARRAY_OPERAND(unsigned char,OC_MASK_CONSTS,8))
141	1.48M	);
142	7.41M	for(i=0;i<4;i++){
143	5.93M	unsigned m;
144	5.93M	m=_mask&0xFFFF;
145	5.93M	_mask>>=16;
146	5.93M	if(m){
147	3.64M	__asm__ __volatile__(
148	3.64M	OC_LOAD_SUB_MASK_2x8
149	3.64M	"pmaddwd %%xmm0,%%xmm0\n\t"
150	3.64M	"pmaddwd %%xmm1,%%xmm1\n\t"
151	3.64M	"paddd %%xmm0,%%xmm7\n\t"
152	3.64M	"paddd %%xmm1,%%xmm7\n\t"
153	3.64M	:[src]"+r"(_src),[ref]"+r"(_ref),[ystride]"+r"(ystride),[m]"+Q"(m)
154	3.64M	);
155	3.64M	}
156	5.93M	_src+=2*ystride;
157	5.93M	_ref+=2*ystride;
158	5.93M	}
159	1.48M	__asm__ __volatile__(
160	1.48M	"movdqa %%xmm7,%%xmm6\n\t"
161	1.48M	"punpckhqdq %%xmm7,%%xmm7\n\t"
162	1.48M	"paddd %%xmm6,%%xmm7\n\t"
163	1.48M	"pshufd $1,%%xmm7,%%xmm6\n\t"
164	1.48M	"paddd %%xmm6,%%xmm7\n\t"
165	1.48M	"movd %%xmm7,%[ret]\n\t"
166	1.48M	:[ret]"=a"(ret)
167	1.48M	);
168	1.48M	return ret;
169	1.48M	}
170
171
172		/*Load an 8x8 array of pixel values from %[src] and %[ref] and compute their
173		16-bit difference in %%xmm0...%%xmm7.*/
174		#define OC_LOAD_SUB_8x8 \
175		"#OC_LOAD_SUB_8x8\n\t" \
176		"movq (%[src]),%%xmm0\n\t" \
177		"movq (%[ref]),%%xmm4\n\t" \
178		"movq (%[src],%[src_ystride]),%%xmm1\n\t" \
179		"lea (%[src],%[src_ystride],2),%[src]\n\t" \
180		"movq (%[ref],%[ref_ystride]),%%xmm5\n\t" \
181		"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
182		"movq (%[src]),%%xmm2\n\t" \
183		"movq (%[ref]),%%xmm7\n\t" \
184		"movq (%[src],%[src_ystride]),%%xmm3\n\t" \
185		"movq (%[ref],%[ref_ystride]),%%xmm6\n\t" \
186		"punpcklbw %%xmm4,%%xmm0\n\t" \
187		"lea (%[src],%[src_ystride],2),%[src]\n\t" \
188		"punpcklbw %%xmm4,%%xmm4\n\t" \
189		"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
190		"psubw %%xmm4,%%xmm0\n\t" \
191		"movq (%[src]),%%xmm4\n\t" \
192		"movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
193		"movq (%[ref]),%%xmm0\n\t" \
194		"punpcklbw %%xmm5,%%xmm1\n\t" \
195		"punpcklbw %%xmm5,%%xmm5\n\t" \
196		"psubw %%xmm5,%%xmm1\n\t" \
197		"movq (%[src],%[src_ystride]),%%xmm5\n\t" \
198		"punpcklbw %%xmm7,%%xmm2\n\t" \
199		"punpcklbw %%xmm7,%%xmm7\n\t" \
200		"psubw %%xmm7,%%xmm2\n\t" \
201		"movq (%[ref],%[ref_ystride]),%%xmm7\n\t" \
202		"punpcklbw %%xmm6,%%xmm3\n\t" \
203		"lea (%[src],%[src_ystride],2),%[src]\n\t" \
204		"punpcklbw %%xmm6,%%xmm6\n\t" \
205		"psubw %%xmm6,%%xmm3\n\t" \
206		"movq (%[src]),%%xmm6\n\t" \
207		"punpcklbw %%xmm0,%%xmm4\n\t" \
208		"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
209		"punpcklbw %%xmm0,%%xmm0\n\t" \
210		"lea (%[src],%[src_ystride],2),%[src]\n\t" \
211		"psubw %%xmm0,%%xmm4\n\t" \
212		"movq (%[ref]),%%xmm0\n\t" \
213		"punpcklbw %%xmm7,%%xmm5\n\t" \
214		"neg %[src_ystride]\n\t" \
215		"punpcklbw %%xmm7,%%xmm7\n\t" \
216		"psubw %%xmm7,%%xmm5\n\t" \
217		"movq (%[src],%[src_ystride]),%%xmm7\n\t" \
218		"punpcklbw %%xmm0,%%xmm6\n\t" \
219		"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
220		"punpcklbw %%xmm0,%%xmm0\n\t" \
221		"neg %[ref_ystride]\n\t" \
222		"psubw %%xmm0,%%xmm6\n\t" \
223		"movq (%[ref],%[ref_ystride]),%%xmm0\n\t" \
224		"punpcklbw %%xmm0,%%xmm7\n\t" \
225		"punpcklbw %%xmm0,%%xmm0\n\t" \
226		"psubw %%xmm0,%%xmm7\n\t" \
227		"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \
228
229		/*Load an 8x8 array of pixel values from %[src] into %%xmm0...%%xmm7.*/
230		#define OC_LOAD_8x8 \
231		"#OC_LOAD_8x8\n\t" \
232		"movq (%[src]),%%xmm0\n\t" \
233		"movq (%[src],%[ystride]),%%xmm1\n\t" \
234		"movq (%[src],%[ystride],2),%%xmm2\n\t" \
235		"pxor %%xmm7,%%xmm7\n\t" \
236		"movq (%[src],%[ystride3]),%%xmm3\n\t" \
237		"punpcklbw %%xmm7,%%xmm0\n\t" \
238		"movq (%[src4]),%%xmm4\n\t" \
239		"punpcklbw %%xmm7,%%xmm1\n\t" \
240		"movq (%[src4],%[ystride]),%%xmm5\n\t" \
241		"punpcklbw %%xmm7,%%xmm2\n\t" \
242		"movq (%[src4],%[ystride],2),%%xmm6\n\t" \
243		"punpcklbw %%xmm7,%%xmm3\n\t" \
244		"movq (%[src4],%[ystride3]),%%xmm7\n\t" \
245		"punpcklbw %%xmm4,%%xmm4\n\t" \
246		"punpcklbw %%xmm5,%%xmm5\n\t" \
247		"psrlw $8,%%xmm4\n\t" \
248		"psrlw $8,%%xmm5\n\t" \
249		"punpcklbw %%xmm6,%%xmm6\n\t" \
250		"punpcklbw %%xmm7,%%xmm7\n\t" \
251		"psrlw $8,%%xmm6\n\t" \
252		"psrlw $8,%%xmm7\n\t" \
253
254		/*Performs the first two stages of an 8-point 1-D Hadamard transform in place.
255		Outputs 1, 3, 4, and 5 from the second stage are negated (which allows us to
256		perform this stage in place with no temporary registers).*/
257		#define OC_HADAMARD_AB_8x8 \
258		"#OC_HADAMARD_AB_8x8\n\t" \
259		/*Stage A:*/ \
260		"paddw %%xmm5,%%xmm1\n\t" \
261		"paddw %%xmm6,%%xmm2\n\t" \
262		"paddw %%xmm5,%%xmm5\n\t" \
263		"paddw %%xmm6,%%xmm6\n\t" \
264		"psubw %%xmm1,%%xmm5\n\t" \
265		"psubw %%xmm2,%%xmm6\n\t" \
266		"paddw %%xmm7,%%xmm3\n\t" \
267		"paddw %%xmm4,%%xmm0\n\t" \
268		"paddw %%xmm7,%%xmm7\n\t" \
269		"paddw %%xmm4,%%xmm4\n\t" \
270		"psubw %%xmm3,%%xmm7\n\t" \
271		"psubw %%xmm0,%%xmm4\n\t" \
272		/*Stage B:*/ \
273		"paddw %%xmm2,%%xmm0\n\t" \
274		"paddw %%xmm3,%%xmm1\n\t" \
275		"paddw %%xmm6,%%xmm4\n\t" \
276		"paddw %%xmm7,%%xmm5\n\t" \
277		"paddw %%xmm2,%%xmm2\n\t" \
278		"paddw %%xmm3,%%xmm3\n\t" \
279		"paddw %%xmm6,%%xmm6\n\t" \
280		"paddw %%xmm7,%%xmm7\n\t" \
281		"psubw %%xmm0,%%xmm2\n\t" \
282		"psubw %%xmm1,%%xmm3\n\t" \
283		"psubw %%xmm4,%%xmm6\n\t" \
284		"psubw %%xmm5,%%xmm7\n\t" \
285
286		/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
287		Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
288		place with no temporary registers).*/
289		#define OC_HADAMARD_C_8x8 \
290		"#OC_HADAMARD_C_8x8\n\t" \
291		/*Stage C:*/ \
292		"paddw %%xmm1,%%xmm0\n\t" \
293		"paddw %%xmm3,%%xmm2\n\t" \
294		"paddw %%xmm5,%%xmm4\n\t" \
295		"paddw %%xmm7,%%xmm6\n\t" \
296		"paddw %%xmm1,%%xmm1\n\t" \
297		"paddw %%xmm3,%%xmm3\n\t" \
298		"paddw %%xmm5,%%xmm5\n\t" \
299		"paddw %%xmm7,%%xmm7\n\t" \
300		"psubw %%xmm0,%%xmm1\n\t" \
301		"psubw %%xmm2,%%xmm3\n\t" \
302		"psubw %%xmm4,%%xmm5\n\t" \
303		"psubw %%xmm6,%%xmm7\n\t" \
304
305		/*Performs an 8-point 1-D Hadamard transform in place.
306		Outputs 1, 2, 4, and 7 are negated (which allows us to perform the transform
307		in place with no temporary registers).*/
308		#define OC_HADAMARD_8x8 \
309		OC_HADAMARD_AB_8x8 \
310		OC_HADAMARD_C_8x8 \
311
312		/*Performs the first part of the final stage of the Hadamard transform and
313		summing of absolute values.
314		At the end of this part, %%xmm1 will contain the DC coefficient of the
315		transform.*/
316		#define OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
317		/*We use the fact that \
318		(abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
319		to merge the final butterfly with the abs and the first stage of \
320		accumulation. \
321		Thus we can avoid using pabsw, which is not available until SSSE3. \
322		Emulating pabsw takes 3 instructions, so the straightforward SSE2 \
323		implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
324		registers). \
325		Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
326		This implementation is only 26 (+4 for spilling registers).*/ \
327		"#OC_HADAMARD_C_ABS_ACCUM_A_8x8\n\t" \
328		"movdqa %%xmm7,"OC_MEM_OFFS(0x10,buf)"\n\t" \
329		"movdqa %%xmm6,"OC_MEM_OFFS(0x00,buf)"\n\t" \
330		/*xmm7={0x7FFF}x4 \
331		xmm4=max(abs(xmm4),abs(xmm5))-0x7FFF*/ \
332		"pcmpeqb %%xmm7,%%xmm7\n\t" \
333		"movdqa %%xmm4,%%xmm6\n\t" \
334		"psrlw $1,%%xmm7\n\t" \
335		"paddw %%xmm5,%%xmm6\n\t" \
336		"pmaxsw %%xmm5,%%xmm4\n\t" \
337		"paddsw %%xmm7,%%xmm6\n\t" \
338		"psubw %%xmm6,%%xmm4\n\t" \
339		/*xmm2=max(abs(xmm2),abs(xmm3))-0x7FFF \
340		xmm0=max(abs(xmm0),abs(xmm1))-0x7FFF*/ \
341		"movdqa %%xmm2,%%xmm6\n\t" \
342		"movdqa %%xmm0,%%xmm5\n\t" \
343		"pmaxsw %%xmm3,%%xmm2\n\t" \
344		"pmaxsw %%xmm1,%%xmm0\n\t" \
345		"paddw %%xmm3,%%xmm6\n\t" \
346		"movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm3\n\t" \
347		"paddw %%xmm5,%%xmm1\n\t" \
348		"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm5\n\t" \
349
350		/*Performs the second part of the final stage of the Hadamard transform and
351		summing of absolute values.*/
352		#define OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
353		"#OC_HADAMARD_C_ABS_ACCUM_B_8x8\n\t" \
354		"paddsw %%xmm7,%%xmm6\n\t" \
355		"paddsw %%xmm7,%%xmm1\n\t" \
356		"psubw %%xmm6,%%xmm2\n\t" \
357		"psubw %%xmm1,%%xmm0\n\t" \
358		/*xmm7={1}x4 (needed for the horizontal add that follows) \
359		xmm0+=xmm2+xmm4+max(abs(xmm3),abs(xmm5))-0x7FFF*/ \
360		"movdqa %%xmm3,%%xmm6\n\t" \
361		"pmaxsw %%xmm5,%%xmm3\n\t" \
362		"paddw %%xmm2,%%xmm0\n\t" \
363		"paddw %%xmm5,%%xmm6\n\t" \
364		"paddw %%xmm4,%%xmm0\n\t" \
365		"paddsw %%xmm7,%%xmm6\n\t" \
366		"paddw %%xmm3,%%xmm0\n\t" \
367		"psrlw $14,%%xmm7\n\t" \
368		"psubw %%xmm6,%%xmm0\n\t" \
369
/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
   absolute value of each component, and accumulates everything into xmm0.
  This is part A followed immediately by part B; code that needs to do work
   between the two halves (such as saving the DC coefficient) invokes the two
   parts separately instead of using this macro.*/
#define OC_HADAMARD_C_ABS_ACCUM_8x8 \
 OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
 OC_HADAMARD_C_ABS_ACCUM_B_8x8
375
/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
   component, and accumulates everything into xmm0.
  Note that xmm0 will have an extra 4 added to each column, and that after
   removing this value, the remainder will be half the conventional value.
  The extra 4 comes from the four pairwise terms that are each computed with a
   -0x7FFF offset: 4*(-0x7FFF) is 4 modulo 2**16.*/
#define OC_HADAMARD_ABS_ACCUM_8x8 \
 OC_HADAMARD_AB_8x8 \
 OC_HADAMARD_C_ABS_ACCUM_8x8
383
384		static unsigned oc_int_frag_satd_sse2(int *_dc,
385		const unsigned char *_src,int _src_ystride,
386	30.2M	const unsigned char *_ref,int _ref_ystride){
387	30.2M	OC_ALIGN16(ogg_int16_t buf[16]);
388	30.2M	unsigned ret;
389	30.2M	unsigned ret2;
390	30.2M	int dc;
391	30.2M	__asm__ __volatile__(
392	30.2M	OC_LOAD_SUB_8x8
393	30.2M	OC_HADAMARD_8x8
394	30.2M	OC_TRANSPOSE_8x8
395		/*We split out the stages here so we can save the DC coefficient in the
396		middle.*/
397	30.2M	OC_HADAMARD_AB_8x8
398	30.2M	OC_HADAMARD_C_ABS_ACCUM_A_8x8
399	30.2M	"movd %%xmm1,%[dc]\n\t"
400	30.2M	OC_HADAMARD_C_ABS_ACCUM_B_8x8
401		/*Up to this point, everything fit in 16 bits (8 input + 1 for the
402		difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
403		for the factor of two we dropped + 3 for the vertical accumulation).
404		Now we finally have to promote things to dwords.
405		We break this part out of OC_HADAMARD_ABS_ACCUM_8x8 to hide the long
406		latency of pmaddwd by starting to compute abs(dc) here.*/
407	30.2M	"pmaddwd %%xmm7,%%xmm0\n\t"
408	30.2M	"movsx %w[dc],%[dc]\n\t"
409	30.2M	"cdq\n\t"
410	30.2M	"movdqa %%xmm0,%%xmm1\n\t"
411	30.2M	"punpckhqdq %%xmm0,%%xmm0\n\t"
412	30.2M	"paddd %%xmm1,%%xmm0\n\t"
413	30.2M	"pshuflw $0xE,%%xmm0,%%xmm1\n\t"
414	30.2M	"paddd %%xmm1,%%xmm0\n\t"
415	30.2M	"movd %%xmm0,%[ret]\n\t"
416		/*The sums produced by OC_HADAMARD_ABS_ACCUM_8x8 each have an extra 4
417		added to them, a factor of two removed, and the DC value included;
418		correct the final sum here.*/
419	30.2M	"lea -64(%[ret2],%[ret],2),%[ret]\n\t"
420	30.2M	"xor %[dc],%[ret2]\n\t"
421	30.2M	"sub %[ret2],%[ret]\n\t"
422		/*Although it looks like we're using 7 registers here, gcc can alias %[ret]
423		and %[dc] with some of the inputs, since for once we don't write to
424		them until after we're done using everything but %[buf].*/
425		/*Note that _src_ystride and _ref_ystride must be given non-overlapping
426		constraints, otherwise if gcc can prove they're equal it will allocate
427		them to the same register (which is bad); _src and _ref face a similar
428		problem.
429		All four are destructively modified, but if we list them as output
430		constraints, gcc can't alias them with other outputs.*/
431	30.2M	:[ret]"=r"(ret),[ret2]"=d"(ret2),[dc]"=a"(dc),
432	30.2M	[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
433	30.2M	:[src]"S"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
434	30.2M	[ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
435		/*We have to use neg, so we actually clobber the condition codes for once
436		(not to mention sub, and add).*/
437	30.2M	:"cc"
438	30.2M	);
439	30.2M	*_dc=dc;
440	30.2M	return ret;
441	30.2M	}
442
/*Computes the SATD between an 8x8 source block and an 8x8 reference block
   that share a single row stride.
  _dc:      Returns the DC coefficient (passed straight through to
             oc_int_frag_satd_sse2()).
  _src:     The source block.
  _ref:     The reference block.
  _ystride: The row stride used for both _src and _ref.
  Return: The value returned by oc_int_frag_satd_sse2().*/
unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  return oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);
}
447
448		unsigned oc_enc_frag_satd2_sse2(int _dc,const unsigned char _src,
449	14.8M	const unsigned char _ref1,const unsigned char _ref2,int _ystride){
450	14.8M	OC_ALIGN8(unsigned char ref[64]);
451	14.8M	oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
452	14.8M	return oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8);
453	14.8M	}
454
455		unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
456	23.2M	const unsigned char *_src,int _ystride){
457	23.2M	OC_ALIGN16(ogg_int16_t buf[16]);
458	23.2M	unsigned ret;
459	23.2M	int dc;
460	23.2M	__asm__ __volatile__(
461	23.2M	OC_LOAD_8x8
462	23.2M	OC_HADAMARD_8x8
463	23.2M	OC_TRANSPOSE_8x8
464		/*We split out the stages here so we can save the DC coefficient in the
465		middle.*/
466	23.2M	OC_HADAMARD_AB_8x8
467	23.2M	OC_HADAMARD_C_ABS_ACCUM_A_8x8
468	23.2M	"movd %%xmm1,%[dc]\n\t"
469	23.2M	OC_HADAMARD_C_ABS_ACCUM_B_8x8
470		/*Up to this point, everything fit in 16 bits (8 input + 1 for the
471		difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
472		for the factor of two we dropped + 3 for the vertical accumulation).
473		Now we finally have to promote things to dwords.*/
474	23.2M	"pmaddwd %%xmm7,%%xmm0\n\t"
475		/*We assume that the DC coefficient is always positive (which is true,
476		because the input to the INTRA transform was not a difference).*/
477	23.2M	"movzx %w[dc],%[dc]\n\t"
478	23.2M	"movdqa %%xmm0,%%xmm1\n\t"
479	23.2M	"punpckhqdq %%xmm0,%%xmm0\n\t"
480	23.2M	"paddd %%xmm1,%%xmm0\n\t"
481	23.2M	"pshuflw $0xE,%%xmm0,%%xmm1\n\t"
482	23.2M	"paddd %%xmm1,%%xmm0\n\t"
483	23.2M	"movd %%xmm0,%[ret]\n\t"
484	23.2M	"lea -64(%[ret],%[ret]),%[ret]\n\t"
485	23.2M	"sub %[dc],%[ret]\n\t"
486		/*Although it looks like we're using 7 registers here, gcc can alias %[ret]
487		and %[dc] with some of the inputs, since for once we don't write to
488		them until after we're done using everything but %[buf].*/
489	23.2M	:[ret]"=a"(ret),[dc]"=r"(dc),
490	23.2M	[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
491	23.2M	:[src]"r"(_src),[src4]"r"(_src+4*_ystride),
492	23.2M	[ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
493		/We have to use sub, so we actually clobber the condition codes for once./
494	23.2M	:"cc"
495	23.2M	);
496	23.2M	*_dc=dc;
497	23.2M	return ret;
498	23.2M	}
499
500		#endif