Coverage Report

Created: 2025-08-28 07:12

/src/theora/lib/x86/x86enquant.c
Line
Count
Source
1
/********************************************************************
2
 *                                                                  *
3
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7
 *                                                                  *
8
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
9
 * by the Xiph.Org Foundation and contributors                      *
10
 * https://www.xiph.org/                                            *
11
 *                                                                  *
12
 ********************************************************************
13
14
  function:
15
16
 ********************************************************************/
17
18
#include "x86enc.h"
19
20
#if defined(OC_X86_ASM)
21
22
23
24
/*The default enquant table is not quite suitable for SIMD purposes.
25
  First, the m and l parameters need to be separated so that an entire row full
26
   of m's or l's can be loaded at a time.
27
  Second, x86 SIMD has no element-wise arithmetic right-shift, so we have to
28
   emulate one with a multiply.
29
  Therefore we translate the shift count into a scale factor.*/
30
void oc_enc_enquant_table_init_x86(void *_enquant,
31
1.31M
 const ogg_uint16_t _dequant[64]){
32
1.31M
  ogg_int16_t *m;
33
1.31M
  ogg_int16_t *l;
34
1.31M
  int          zzi;
35
1.31M
  m=(ogg_int16_t *)_enquant;
36
1.31M
  l=m+64;
37
85.2M
  for(zzi=0;zzi<64;zzi++){
38
83.9M
    oc_iquant q;
39
83.9M
    oc_iquant_init(&q,_dequant[zzi]);
40
83.9M
    m[zzi]=q.m;
41
    /*q.l must be at least 2 for this to work; fortunately, once all the scale
42
       factors are baked in, the minimum quantizer is much larger than that.*/
43
83.9M
    l[zzi]=1<<16-q.l;
44
83.9M
  }
45
1.31M
}
46
47
51.0k
void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis){
48
51.0k
  int pli;
49
51.0k
  int qii;
50
51.0k
  int qti;
51
320k
  for(pli=0;pli<3;pli++)for(qii=1;qii<_nqis;qii++)for(qti=0;qti<2;qti++){
52
213k
    ((ogg_int16_t *)_enquant[pli][qii][qti])[0]=
53
213k
     ((ogg_int16_t *)_enquant[pli][0][qti])[0];
54
213k
    ((ogg_int16_t *)_enquant[pli][qii][qti])[64]=
55
213k
     ((ogg_int16_t *)_enquant[pli][0][qti])[64];
56
213k
  }
57
51.0k
}
58
59
int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
60
22.7M
 const ogg_uint16_t _dequant[64],const void *_enquant){
61
22.7M
  ptrdiff_t r;
62
22.7M
  __asm__ __volatile__(
63
22.7M
    "xor %[r],%[r]\n\t"
64
    /*Loop through two rows at a time.*/
65
22.7M
    ".p2align 4\n\t"
66
22.7M
    "0:\n\t"
67
    /*Load the first two rows of the data and the quant matrices.*/
68
22.7M
    "movdqa 0x00(%[dct],%[r]),%%xmm0\n\t"
69
22.7M
    "movdqa 0x10(%[dct],%[r]),%%xmm1\n\t"
70
22.7M
    "movdqa 0x00(%[dq],%[r]),%%xmm2\n\t"
71
22.7M
    "movdqa 0x10(%[dq],%[r]),%%xmm3\n\t"
72
22.7M
    "movdqa 0x00(%[q],%[r]),%%xmm4\n\t"
73
22.7M
    "movdqa 0x10(%[q],%[r]),%%xmm5\n\t"
74
    /*Double the input and propagate its sign to the rounding factor.
75
      Using SSSE3's psignw would help here, but we need the mask later anyway.*/
76
22.7M
    "movdqa %%xmm0,%%xmm6\n\t"
77
22.7M
    "psraw $15,%%xmm0\n\t"
78
22.7M
    "movdqa %%xmm1,%%xmm7\n\t"
79
22.7M
    "paddw %%xmm6,%%xmm6\n\t"
80
22.7M
    "psraw $15,%%xmm1\n\t"
81
22.7M
    "paddw %%xmm7,%%xmm7\n\t"
82
22.7M
    "paddw %%xmm0,%%xmm2\n\t"
83
22.7M
    "paddw %%xmm1,%%xmm3\n\t"
84
22.7M
    "pxor %%xmm0,%%xmm2\n\t"
85
22.7M
    "pxor %%xmm1,%%xmm3\n\t"
86
    /*Add the rounding factor and perform the first multiply.*/
87
22.7M
    "paddw %%xmm2,%%xmm6\n\t"
88
22.7M
    "paddw %%xmm3,%%xmm7\n\t"
89
22.7M
    "pmulhw %%xmm6,%%xmm4\n\t"
90
22.7M
    "pmulhw %%xmm7,%%xmm5\n\t"
91
22.7M
    "movdqa 0x80(%[q],%[r]),%%xmm2\n\t"
92
22.7M
    "movdqa 0x90(%[q],%[r]),%%xmm3\n\t"
93
22.7M
    "paddw %%xmm4,%%xmm6\n\t"
94
22.7M
    "paddw %%xmm5,%%xmm7\n\t"
95
    /*Emulate an element-wise right-shift via a second multiply.*/
96
22.7M
    "pmulhw %%xmm2,%%xmm6\n\t"
97
22.7M
    "pmulhw %%xmm3,%%xmm7\n\t"
98
22.7M
    "add $32,%[r]\n\t"
99
22.7M
    "cmp $96,%[r]\n\t"
100
    /*Correct for the sign.*/
101
22.7M
    "psubw %%xmm0,%%xmm6\n\t"
102
22.7M
    "psubw %%xmm1,%%xmm7\n\t"
103
    /*Save the result.*/
104
22.7M
    "movdqa %%xmm6,-0x20(%[qdct],%[r])\n\t"
105
22.7M
    "movdqa %%xmm7,-0x10(%[qdct],%[r])\n\t"
106
22.7M
    "jle 0b\n\t"
107
    /*Now find the location of the last non-zero value.*/
108
22.7M
    "movdqa 0x50(%[qdct]),%%xmm5\n\t"
109
22.7M
    "movdqa 0x40(%[qdct]),%%xmm4\n\t"
110
22.7M
    "packsswb %%xmm7,%%xmm6\n\t"
111
22.7M
    "packsswb %%xmm5,%%xmm4\n\t"
112
22.7M
    "pxor %%xmm0,%%xmm0\n\t"
113
22.7M
    "mov $-1,%k[dq]\n\t"
114
22.7M
    "pcmpeqb %%xmm0,%%xmm6\n\t"
115
22.7M
    "pcmpeqb %%xmm0,%%xmm4\n\t"
116
22.7M
    "pmovmskb %%xmm6,%k[q]\n\t"
117
22.7M
    "pmovmskb %%xmm4,%k[r]\n\t"
118
22.7M
    "shl $16,%k[q]\n\t"
119
22.7M
    "or %k[r],%k[q]\n\t"
120
22.7M
    "mov $32,%[r]\n\t"
121
    /*We have to use xor here instead of not in order to set the flags.*/
122
22.7M
    "xor %k[dq],%k[q]\n\t"
123
22.7M
    "jnz 1f\n\t"
124
22.7M
    "movdqa 0x30(%[qdct]),%%xmm7\n\t"
125
22.7M
    "movdqa 0x20(%[qdct]),%%xmm6\n\t"
126
22.7M
    "movdqa 0x10(%[qdct]),%%xmm5\n\t"
127
22.7M
    "movdqa 0x00(%[qdct]),%%xmm4\n\t"
128
22.7M
    "packsswb %%xmm7,%%xmm6\n\t"
129
22.7M
    "packsswb %%xmm5,%%xmm4\n\t"
130
22.7M
    "pcmpeqb %%xmm0,%%xmm6\n\t"
131
22.7M
    "pcmpeqb %%xmm0,%%xmm4\n\t"
132
22.7M
    "pmovmskb %%xmm6,%k[q]\n\t"
133
22.7M
    "pmovmskb %%xmm4,%k[r]\n\t"
134
22.7M
    "shl $16,%k[q]\n\t"
135
22.7M
    "or %k[r],%k[q]\n\t"
136
22.7M
    "xor %[r],%[r]\n\t"
137
22.7M
    "not %k[q]\n\t"
138
22.7M
    "or $1,%k[q]\n\t"
139
22.7M
    "1:\n\t"
140
22.7M
    "bsr %k[q],%k[q]\n\t"
141
22.7M
    "add %k[q],%k[r]\n\t"
142
22.7M
    :[r]"=&a"(r),[q]"+r"(_enquant),[dq]"+r"(_dequant)
143
22.7M
    :[dct]"r"(_dct),[qdct]"r"(_qdct)
144
22.7M
    :"cc","memory"
145
22.7M
  );
146
22.7M
  return (int)r;
147
22.7M
}
148
149
#endif