Coverage Report

Created: 2026-05-24 06:16

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libtheora/lib/x86/mmxstate.c
Line
Count
Source
1
/********************************************************************
2
 *                                                                  *
3
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7
 *                                                                  *
8
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
9
 * by the Xiph.Org Foundation and contributors                      *
10
 * https://www.xiph.org/                                            *
11
 *                                                                  *
12
 ********************************************************************
13
14
  function:
15
16
 ********************************************************************/
17
18
/*MMX acceleration of complete fragment reconstruction algorithm.
19
  Originally written by Rudolf Marek.*/
20
#include <string.h>
21
#include "x86int.h"
22
#include "mmxloop.h"
23
24
#if defined(OC_X86_ASM)
25
26
/*Reconstructs one fragment into the OC_FRAME_SELF reference buffer.
  The residue is produced in the upper half of _dct_coeffs (_dct_coeffs+64),
   either by a DC-only fill or by oc_idct8x8(), and is then handed to one of
   the oc_frag_recon_*_mmx() routines together with the destination (and, for
   non-SELF fragments, the predictor) pointers.
  _state:      The state to read fragment, stride and buffer information from.
  _fragi:      The index of the fragment to process.
  _pli:        The color plane index, used for the stride and MV offsets.
  _dct_coeffs: On input, the quantized coefficients in the first 64 entries;
                the last 64 entries are overwritten with the residue.
  _last_zzi:   Values below 2 select the DC-only path.
  _dc_quant:   The multiplier applied to _dct_coeffs[0].*/
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
 int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
  ogg_int16_t   *residue;
  unsigned char *dst;
  ptrdiff_t      frag_buf_off;
  int            ystride;
  int            refi;
  residue=_dct_coeffs+64;
  /*Apply the inverse transform.*/
  if(_last_zzi>=2){
    /*General case: dequantize the DC coefficient, then run the full iDCT.*/
    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
    oc_idct8x8(_state,residue,_dct_coeffs,_last_zzi);
  }
  else{
    /*Special case: only a DC component is present.
      The value must be unsigned, so that the __asm__ block does not
       sign-extend it when loading it into a register.*/
    ogg_uint16_t dc;
    int          qi;
    /*This dequant product (and none of the others) is rounded here, because
       no iDCT rounding takes place on this path.*/
    dc=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
    /*Broadcast dc to all four 16-bit lanes of mm0.*/
    __asm__ __volatile__(
      /*mm0=0000 0000 0000 AAAA*/
      "movd %[p],%%mm0\n\t"
      /*mm0=0000 0000 AAAA AAAA*/
      "punpcklwd %%mm0,%%mm0\n\t"
      /*mm0=AAAA AAAA AAAA AAAA*/
      "punpckldq %%mm0,%%mm0\n\t"
      :
      :[p]"r"((unsigned)dc)
    );
    /*Fill the 64 residue entries, 16 per iteration.
      Each store reads mm0 as left by the asm statement above; nothing here
       reloads it.*/
    for(qi=0;qi<4;qi++){
      __asm__ __volatile__(
        "movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t"
        "movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t"
        "movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t"
        "movq %%mm0,"OC_MEM_OFFS(0x18,y)"\n\t"
        :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,residue+16*qi,16)
      );
    }
  }
  /*Fill in the target buffer.*/
  frag_buf_off=_state->frag_buf_offs[_fragi];
  refi=_state->frags[_fragi].refi;
  ystride=_state->ref_ystride[_pli];
  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
  if(refi==OC_FRAME_SELF){
    /*No predictor frame: the residue alone is passed on.*/
    oc_frag_recon_intra_mmx(dst,ystride,residue);
    return;
  }
  {
    const unsigned char *ref;
    int                  mvoffsets[2];
    int                  nmvoffsets;
    ref=_state->ref_frame_data[refi]+frag_buf_off;
    nmvoffsets=oc_state_get_mv_offsets(_state,mvoffsets,_pli,
     _state->frag_mvs[_fragi]);
    if(nmvoffsets>1){
      /*More than one offset was reported: use the two-predictor routine.*/
      oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
       residue);
    }
    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,residue);
  }
}
86
87
/*We copy the entire function to inline the actual MMX routines so that we
88
   use only a single indirect call.*/
89
90
0
/*Sets up the bounding values used by the MMX loop filter.
  Only the first 8 bytes of _bv are written; each receives _flimit converted
   to an unsigned char (the conversion memset() performs).
  _bv:     The bounding values array to initialize.
  _flimit: The filter limit to replicate.*/
void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
  const int fill=_flimit;
  memset(_bv,fill,8*sizeof(_bv[0]));
}
93
94
/*Apply the loop filter to a given set of fragment rows in the given plane.
95
  The filter may be run on the bottom edge, affecting pixels in the next row of
96
   fragments, so this row also needs to be available.
97
  _bv:        The bounding values array.
98
  _refi:      The index of the frame buffer to filter.
99
  _pli:       The color plane to filter.
100
  _fragy0:    The Y coordinate of the first fragment row to filter.
101
  _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
102
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
103
0
 signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
104
0
  OC_ALIGN8(unsigned char   ll[8]);
105
0
  const oc_fragment_plane *fplane;
106
0
  const oc_fragment       *frags;
107
0
  const ptrdiff_t         *frag_buf_offs;
108
0
  unsigned char           *ref_frame_data;
109
0
  ptrdiff_t                fragi_top;
110
0
  ptrdiff_t                fragi_bot;
111
0
  ptrdiff_t                fragi0;
112
0
  ptrdiff_t                fragi0_end;
113
0
  int                      ystride;
114
0
  int                      nhfrags;
115
0
  memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
116
0
  fplane=_state->fplanes+_pli;
117
0
  nhfrags=fplane->nhfrags;
118
0
  fragi_top=fplane->froffset;
119
0
  fragi_bot=fragi_top+fplane->nfrags;
120
0
  fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
121
0
  fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
122
0
  ystride=_state->ref_ystride[_pli];
123
0
  frags=_state->frags;
124
0
  frag_buf_offs=_state->frag_buf_offs;
125
0
  ref_frame_data=_state->ref_frame_data[_refi];
126
  /*The following loops are constructed somewhat non-intuitively on purpose.
127
    The main idea is: if a block boundary has at least one coded fragment on
128
     it, the filter is applied to it.
129
    However, the order that the filters are applied in matters, and VP3 chose
130
     the somewhat strange ordering used below.*/
131
0
  while(fragi0<fragi0_end){
132
0
    ptrdiff_t fragi;
133
0
    ptrdiff_t fragi_end;
134
0
    fragi=fragi0;
135
0
    fragi_end=fragi+nhfrags;
136
0
    while(fragi<fragi_end){
137
0
      if(frags[fragi].coded){
138
0
        unsigned char *ref;
139
0
        ref=ref_frame_data+frag_buf_offs[fragi];
140
0
        if(fragi>fragi0){
141
0
          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
142
0
        }
143
0
        if(fragi0>fragi_top){
144
0
          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
145
0
        }
146
0
        if(fragi+1<fragi_end&&!frags[fragi+1].coded){
147
0
          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref+8,ystride,ll);
148
0
        }
149
0
        if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
150
0
          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref+(ystride*8),ystride,ll);
151
0
        }
152
0
      }
153
0
      fragi++;
154
0
    }
155
0
    fragi0+=nhfrags;
156
0
  }
157
0
}
158
159
82
/*Sets up the bounding values used by the MMXEXT loop filter.
  Only the first 8 bytes of _bv are written; each receives the bitwise
   complement of twice _flimit, converted to an unsigned char by memset().
  _bv:     The bounding values array to initialize.
  _flimit: The filter limit the stored value is derived from.*/
void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit){
  const int fill=~(_flimit<<1);
  memset(_bv,fill,8*sizeof(_bv[0]));
}
162
163
/*Apply the loop filter to a given set of fragment rows in the given plane.
164
  The filter may be run on the bottom edge, affecting pixels in the next row of
165
   fragments, so this row also needs to be available.
166
  _bv:        The bounding values array.
167
  _refi:      The index of the frame buffer to filter.
168
  _pli:       The color plane to filter.
169
  _fragy0:    The Y coordinate of the first fragment row to filter.
170
  _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
171
void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
172
3.25k
 signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
173
3.25k
  const oc_fragment_plane *fplane;
174
3.25k
  const oc_fragment       *frags;
175
3.25k
  const ptrdiff_t         *frag_buf_offs;
176
3.25k
  unsigned char           *ref_frame_data;
177
3.25k
  ptrdiff_t                fragi_top;
178
3.25k
  ptrdiff_t                fragi_bot;
179
3.25k
  ptrdiff_t                fragi0;
180
3.25k
  ptrdiff_t                fragi0_end;
181
3.25k
  int                      ystride;
182
3.25k
  int                      nhfrags;
183
3.25k
  fplane=_state->fplanes+_pli;
184
3.25k
  nhfrags=fplane->nhfrags;
185
3.25k
  fragi_top=fplane->froffset;
186
3.25k
  fragi_bot=fragi_top+fplane->nfrags;
187
3.25k
  fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
188
3.25k
  fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
189
3.25k
  ystride=_state->ref_ystride[_pli];
190
3.25k
  frags=_state->frags;
191
3.25k
  frag_buf_offs=_state->frag_buf_offs;
192
3.25k
  ref_frame_data=_state->ref_frame_data[_refi];
193
  /*The following loops are constructed somewhat non-intuitively on purpose.
194
    The main idea is: if a block boundary has at least one coded fragment on
195
     it, the filter is applied to it.
196
    However, the order that the filters are applied in matters, and VP3 chose
197
     the somewhat strange ordering used below.*/
198
16.9k
  while(fragi0<fragi0_end){
199
13.6k
    ptrdiff_t fragi;
200
13.6k
    ptrdiff_t fragi_end;
201
13.6k
    fragi=fragi0;
202
13.6k
    fragi_end=fragi+nhfrags;
203
740k
    while(fragi<fragi_end){
204
727k
      if(frags[fragi].coded){
205
535k
        unsigned char *ref;
206
535k
        ref=ref_frame_data+frag_buf_offs[fragi];
207
535k
        if(fragi>fragi0){
208
525k
          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
209
525k
        }
210
535k
        if(fragi0>fragi_top){
211
527k
          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
212
527k
        }
213
535k
        if(fragi+1<fragi_end&&!frags[fragi+1].coded){
214
91.1k
          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,_bv);
215
91.1k
        }
216
535k
        if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
217
97.8k
          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride*8),ystride,_bv);
218
97.8k
        }
219
535k
      }
220
727k
      fragi++;
221
727k
    }
222
13.6k
    fragi0+=nhfrags;
223
13.6k
  }
224
3.25k
}
225
226
#endif