Coverage Report

Created: 2026-06-15 06:22

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/libde265/libde265/x86/sse-intrapred.cc
Line
Count
Source
1
/*
2
 * H.265 video codec.
3
 * Copyright (c) 2026 Dirk Farin <dirk.farin@gmail.com>
4
 *
5
 * This file is part of libde265.
6
 *
7
 * libde265 is free software: you can redistribute it and/or modify
8
 * it under the terms of the GNU Lesser General Public License as
9
 * published by the Free Software Foundation, either version 3 of
10
 * the License, or (at your option) any later version.
11
 *
12
 * libde265 is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public License
18
 * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
19
 */
20
21
#include "x86/sse-intrapred.h"
22
#include "libde265/util.h"
23
24
#ifdef HAVE_CONFIG_H
25
#include "config.h"
26
#endif
27
28
#include <string.h>
29
#include <emmintrin.h> // SSE2
30
31
#if HAVE_SSE4_1
32
#include <smmintrin.h> // SSE4.1
33
34
// angle / inverse-angle lookup tables, defined in intrapred.cc
35
extern const int intraPredAngle_table[1+34];
36
extern const int invAngle_table[25-10];
37
38
namespace {
39
40
// Block-size bound used to dimension the reference scratch array (4*kMaxBlk+1
// bytes, origin at offset 2*kMaxBlk) in intra_pred_angular_8_sse4().
const int kMaxBlk = 64; // == MAX_INTRA_PRED_BLOCK_SIZE
41
42
// load 4 / 8 consecutive uint8_t and zero-extend to 8x int16
43
0
// Read four consecutive bytes starting at p and zero-extend them into the
// four low 16-bit lanes of the result; the four high lanes come out as zero.
// The bytes are fetched through memcpy, so p needs no particular alignment.
static inline __m128i load4_epi16(const uint8_t* p) {
  int32_t packed;
  memcpy(&packed, p, sizeof(packed));
  const __m128i bytes = _mm_cvtsi32_si128(packed);
  return _mm_cvtepu8_epi16(bytes);
}
47
0
// Read eight consecutive bytes starting at p and zero-extend each one into
// its own 16-bit lane (byte i ends up in lane i).
static inline __m128i load8_epi16(const uint8_t* p) {
  const __m128i low64 = _mm_loadl_epi64((const __m128i*)p);
  return _mm_cvtepu8_epi16(low64);
}
50
51
// store nT bytes from the low lanes of an u8-packed register (nT in {4,8,16,32})
52
0
// Write one row of nT bytes at d, taken from the byte lanes of v:
//   nT == 4   -> the low 4 bytes of v
//   nT == 8   -> the low 8 bytes of v
//   nT == 16  -> all 16 bytes of v
//   otherwise -> v is written twice, to d[0..15] and again to d[16..31]
// All stores are unaligned-safe.
static inline void store_row(uint8_t* d, __m128i v, int nT) {
  if (nT == 4) {
    const int32_t low32 = _mm_cvtsi128_si32(v);
    memcpy(d, &low32, sizeof(low32));
    return;
  }

  if (nT == 8) {
    _mm_storel_epi64((__m128i*)d, v);
    return;
  }

  _mm_storeu_si128((__m128i*)d, v);
  if (nT != 16) {
    // 32-wide rows repeat the same 16 bytes in the second half
    _mm_storeu_si128((__m128i*)(d + 16), v);
  }
}
61
62
// copy exactly nT bytes (nT in {4,8,16,32})
63
0
// Copy one row from s to d: 4, 8 or 16 bytes when nT is 4, 8 or 16, and
// 32 bytes (as two 16-byte halves, low half first) for any other nT.
static inline void copy_row(uint8_t* d, const uint8_t* s, int nT) {
  if (nT == 4 || nT == 8) {
    memcpy(d, s, (size_t)nT);
    return;
  }

  const __m128i lo = _mm_loadu_si128((const __m128i*)s);
  _mm_storeu_si128((__m128i*)d, lo);

  if (nT != 16) {
    const __m128i hi = _mm_loadu_si128((const __m128i*)(s + 16));
    _mm_storeu_si128((__m128i*)(d + 16), hi);
  }
}
72
73
0
// Normalisation shift for a block of width nT: 3, 4 and 5 for nT = 4, 8 and
// 16, and 6 for every other value (which equals Log2(nT)+1 for nT = 32).
static inline int shift_for(int nT) {
  switch (nT) {
    case 4:  return 3;
    case 8:  return 4;
    case 16: return 5;
    default: return 6;
  }
}
74
75
} // namespace
76
77
78
void intra_pred_dc_8_sse4(uint8_t* dst, ptrdiff_t stride, int nT, int cIdx, const uint8_t* border)
79
0
{
80
0
  const int shift = shift_for(nT);
81
82
0
  int dcVal = 0;
83
0
  for (int i=0;i<nT;i++) { dcVal += border[i+1]; dcVal += border[-i-1]; }
84
0
  dcVal += nT;
85
0
  dcVal >>= shift;
86
87
0
  const __m128i v = _mm_set1_epi8((char)dcVal);
88
0
  for (int y=0;y<nT;y++) store_row(dst + y*stride, v, nT);
89
90
  // luma edge smoothing overwrites first row and first column (disjoint cells)
91
0
  if (cIdx==0 && nT<32) {
92
0
    dst[0] = (uint8_t)((border[-1] + 2*dcVal + border[1] + 2) >> 2);
93
0
    for (int x=1;x<nT;x++) dst[x]          = (uint8_t)((border[ x+1] + 3*dcVal + 2) >> 2);
94
0
    for (int y=1;y<nT;y++) dst[y*stride]   = (uint8_t)((border[-y-1] + 3*dcVal + 2) >> 2);
95
0
  }
96
0
}
97
98
99
void intra_pred_planar_8_sse4(uint8_t* dst, ptrdiff_t stride, int nT, int cIdx, const uint8_t* border)
100
0
{
101
0
  const int shift = shift_for(nT);
102
0
  const int TR = border[ 1+nT];   // top-right corner sample
103
0
  const int BL = border[-1-nT];   // bottom-left corner sample
104
105
0
  const __m128i base  = _mm_setr_epi16(0,1,2,3,4,5,6,7);
106
0
  const __m128i vTR   = _mm_set1_epi16((short)TR);
107
0
  const __m128i vNTm1 = _mm_set1_epi16((short)(nT-1));
108
0
  const __m128i one   = _mm_set1_epi16(1);
109
110
0
  for (int y=0;y<nT;y++) {
111
0
    const int left_y = border[-1-y];
112
0
    const int Cy = (y+1)*BL + nT;                 // constant term for this row
113
0
    const __m128i vL    = _mm_set1_epi16((short)left_y);
114
0
    const __m128i vNT1Y = _mm_set1_epi16((short)(nT-1-y));
115
0
    const __m128i vC    = _mm_set1_epi16((short)Cy);
116
117
0
    for (int x=0;x<nT;x+=8) {
118
0
      const __m128i xidx = _mm_add_epi16(_mm_set1_epi16((short)x), base);
119
0
      const __m128i vA   = _mm_sub_epi16(vNTm1, xidx);     // (nT-1-x)
120
0
      const __m128i vB   = _mm_add_epi16(xidx, one);       // (x+1)
121
0
      const __m128i top  = (nT==4) ? load4_epi16(border+1+x) : load8_epi16(border+1+x);
122
123
0
      __m128i acc = _mm_mullo_epi16(vA,  vL);              // (nT-1-x)*border[-1-y]
124
0
      acc = _mm_add_epi16(acc, _mm_mullo_epi16(vB,  vTR)); // (x+1)*border[1+nT]
125
0
      acc = _mm_add_epi16(acc, _mm_mullo_epi16(top, vNT1Y));// (nT-1-y)*border[1+x]
126
0
      acc = _mm_add_epi16(acc, vC);                        // (y+1)*border[-1-nT] + nT
127
0
      acc = _mm_srli_epi16(acc, shift);
128
129
0
      const __m128i p = _mm_packus_epi16(acc, acc);
130
0
      store_row(dst + y*stride + x, p, (nT<8)?nT:8);
131
0
    }
132
0
  }
133
0
}
134
135
136
void intra_pred_angular_8_sse4(uint8_t* dst, ptrdiff_t stride, int bit_depth, int disableBoundaryFilter,
137
                               int xB0, int yB0, int mode, int nT, int cIdx, const uint8_t* border)
138
0
{
139
0
  const int intraPredAngle = intraPredAngle_table[mode];
140
141
0
  uint8_t  ref_mem[4*kMaxBlk+1];
142
0
  uint8_t* ref = &ref_mem[2*kMaxBlk];
143
144
0
  if (mode >= 18) {
145
0
    for (int x=0;x<=nT;x++) ref[x] = border[x];
146
147
0
    if (intraPredAngle<0) {
148
0
      const int invAngle = invAngle_table[mode-11];
149
0
      if (((nT*intraPredAngle)>>5) < -1) {
150
0
        for (int x=(nT*intraPredAngle)>>5; x<=-1; x++)
151
0
          ref[x] = border[0-((x*invAngle+128)>>8)];
152
0
      }
153
0
    } else {
154
0
      for (int x=nT+1; x<=2*nT; x++) ref[x] = border[x];
155
0
    }
156
157
0
    for (int y=0;y<nT;y++) {
158
0
      const int iIdx  = ((y+1)*intraPredAngle)>>5;
159
0
      const int iFact = ((y+1)*intraPredAngle)&31;
160
0
      const uint8_t* src = ref + iIdx + 1;
161
0
      uint8_t* d = dst + y*stride;
162
163
0
      if (iFact==0) {
164
0
        copy_row(d, src, nT);                     // dst[x] = ref[x+iIdx+1]
165
0
      } else {
166
0
        const __m128i w0  = _mm_set1_epi16((short)(32-iFact));
167
0
        const __m128i w1  = _mm_set1_epi16((short)iFact);
168
0
        const __m128i r16 = _mm_set1_epi16(16);
169
0
        if (nT==4) {
170
0
          __m128i a = load4_epi16(src), b = load4_epi16(src+1);
171
0
          __m128i acc = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(a,w0), _mm_mullo_epi16(b,w1)), r16);
172
0
          acc = _mm_srli_epi16(acc, 5);
173
0
          store_row(d, _mm_packus_epi16(acc,acc), 4);
174
0
        } else {
175
0
          for (int x=0;x<nT;x+=8) {
176
0
            __m128i a = load8_epi16(src+x), b = load8_epi16(src+x+1);
177
0
            __m128i acc = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(a,w0), _mm_mullo_epi16(b,w1)), r16);
178
0
            acc = _mm_srli_epi16(acc, 5);
179
0
            store_row(d+x, _mm_packus_epi16(acc,acc), 8);
180
0
          }
181
0
        }
182
0
      }
183
0
    }
184
185
0
    if (mode==26 && cIdx==0 && nT<32 && !disableBoundaryFilter) {
186
0
      for (int y=0;y<nT;y++)
187
0
        dst[y*stride] = (uint8_t)Clip_BitDepth(border[1] + ((border[-1-y] - border[0])>>1), bit_depth);
188
0
    }
189
0
  }
190
0
  else {
191
    // Modes 2..17: the reference projection is transposed (per-column iIdx/iFact and
192
    // a row-indexed reference fetch), which does not map onto contiguous SIMD loads or
193
    // stores. Use the scalar reference path here -- kept bit-identical to
194
    // intra_prediction_angular() in intrapred.h.
195
0
    for (int x=0;x<=nT;x++) ref[x] = border[-x];
196
197
0
    if (intraPredAngle<0) {
198
0
      const int invAngle = invAngle_table[mode-11];
199
0
      if (((nT*intraPredAngle)>>5) < -1) {
200
0
        for (int x=(nT*intraPredAngle)>>5; x<=-1; x++)
201
0
          ref[x] = border[(x*invAngle+128)>>8];
202
0
      }
203
0
    } else {
204
0
      for (int x=nT+1; x<=2*nT; x++) ref[x] = border[-x];
205
0
    }
206
207
0
    for (int y=0;y<nT;y++)
208
0
      for (int x=0;x<nT;x++) {
209
0
        const int iIdx  = ((x+1)*intraPredAngle)>>5;
210
0
        const int iFact = ((x+1)*intraPredAngle)&31;
211
0
        if (iFact != 0)
212
0
          dst[x+y*stride] = (uint8_t)(((32-iFact)*ref[y+iIdx+1] + iFact*ref[y+iIdx+2] + 16)>>5);
213
0
        else
214
0
          dst[x+y*stride] = ref[y+iIdx+1];
215
0
      }
216
217
0
    if (mode==10 && cIdx==0 && nT<32 && !disableBoundaryFilter) {
218
0
      for (int x=0;x<nT;x++)
219
0
        dst[x] = (uint8_t)Clip_BitDepth(border[-1] + ((border[1+x] - border[0])>>1), bit_depth);
220
0
    }
221
0
  }
222
0
}
223
224
#endif // HAVE_SSE4_1