Coverage Report

Created: 2026-06-07 07:20

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libde265/libde265/x86/sse-intrapred.cc
Line
Count
Source
1
/*
2
 * H.265 video codec.
3
 * Copyright (c) 2026 Dirk Farin <dirk.farin@gmail.com>
4
 *
5
 * This file is part of libde265.
6
 *
7
 * libde265 is free software: you can redistribute it and/or modify
8
 * it under the terms of the GNU Lesser General Public License as
9
 * published by the Free Software Foundation, either version 3 of
10
 * the License, or (at your option) any later version.
11
 *
12
 * libde265 is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public License
18
 * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
19
 */
20
21
#include "x86/sse-intrapred.h"
22
#include "libde265/util.h"
23
24
#ifdef HAVE_CONFIG_H
25
#include "config.h"
26
#endif
27
28
#include <string.h>
29
#include <emmintrin.h> // SSE2
30
31
#if HAVE_SSE4_1
32
#include <smmintrin.h> // SSE4.1
33
34
// angle / inverse-angle lookup tables, defined in intrapred.cc
35
extern const int intraPredAngle_table[1+34];
36
extern const int invAngle_table[25-10];
37
38
namespace {
39
40
const int kMaxBlk = 64; // == MAX_INTRA_PRED_BLOCK_SIZE
41
42
// load 4 / 8 consecutive uint8_t and zero-extend to 8x int16
43
7.28M
inline __m128i load4_epi16(const uint8_t* p) {
44
7.28M
  int32_t t; memcpy(&t, p, 4);
45
7.28M
  return _mm_cvtepu8_epi16(_mm_cvtsi32_si128(t));
46
7.28M
}
47
2.39M
inline __m128i load8_epi16(const uint8_t* p) {
48
2.39M
  return _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*)p));
49
2.39M
}
50
51
// store nT bytes from the low lanes of an u8-packed register (nT in {4,8,16,32})
52
9.67M
inline void store_row(uint8_t* d, __m128i v, int nT) {
53
9.67M
  switch (nT) {
54
7.33M
    case 4:  { int32_t t = _mm_cvtsi128_si32(v); memcpy(d, &t, 4); } break;
55
2.26M
    case 8:  _mm_storel_epi64((__m128i*)d, v); break;
56
42.6k
    case 16: _mm_storeu_si128((__m128i*)d, v); break;
57
19.3k
    default: _mm_storeu_si128((__m128i*)d, v);
58
19.3k
             _mm_storeu_si128((__m128i*)(d+16), v); break;
59
9.67M
  }
60
9.67M
}
61
62
// copy exactly nT bytes (nT in {4,8,16,32})
63
1.18M
inline void copy_row(uint8_t* d, const uint8_t* s, int nT) {
64
1.18M
  switch (nT) {
65
886k
    case 4:  memcpy(d, s, 4); break;
66
205k
    case 8:  memcpy(d, s, 8); break;
67
60.7k
    case 16: _mm_storeu_si128((__m128i*)d, _mm_loadu_si128((const __m128i*)s)); break;
68
31.8k
    default: _mm_storeu_si128((__m128i*)d,      _mm_loadu_si128((const __m128i*)s));
69
31.8k
             _mm_storeu_si128((__m128i*)(d+16), _mm_loadu_si128((const __m128i*)(s+16))); break;
70
1.18M
  }
71
1.18M
}
72
73
1.68M
inline int shift_for(int nT) { return (nT==4)?3 : (nT==8)?4 : (nT==16)?5 : 6; } // Log2(nT)+1
74
75
} // namespace
76
77
78
void intra_pred_dc_8_sse4(uint8_t* dst, ptrdiff_t stride, int nT, int cIdx, const uint8_t* border)
79
393k
{
80
393k
  const int shift = shift_for(nT);
81
82
393k
  int dcVal = 0;
83
2.20M
  for (int i=0;i<nT;i++) { dcVal += border[i+1]; dcVal += border[-i-1]; }
84
393k
  dcVal += nT;
85
393k
  dcVal >>= shift;
86
87
393k
  const __m128i v = _mm_set1_epi8((char)dcVal);
88
2.20M
  for (int y=0;y<nT;y++) store_row(dst + y*stride, v, nT);
89
90
  // luma edge smoothing overwrites first row and first column (disjoint cells)
91
393k
  if (cIdx==0 && nT<32) {
92
98.3k
    dst[0] = (uint8_t)((border[-1] + 2*dcVal + border[1] + 2) >> 2);
93
554k
    for (int x=1;x<nT;x++) dst[x]          = (uint8_t)((border[ x+1] + 3*dcVal + 2) >> 2);
94
554k
    for (int y=1;y<nT;y++) dst[y*stride]   = (uint8_t)((border[-y-1] + 3*dcVal + 2) >> 2);
95
98.3k
  }
96
393k
}
97
98
99
void intra_pred_planar_8_sse4(uint8_t* dst, ptrdiff_t stride, int nT, int cIdx, const uint8_t* border)
100
1.28M
{
101
1.28M
  const int shift = shift_for(nT);
102
1.28M
  const int TR = border[ 1+nT];   // top-right corner sample
103
1.28M
  const int BL = border[-1-nT];   // bottom-left corner sample
104
105
1.28M
  const __m128i base  = _mm_setr_epi16(0,1,2,3,4,5,6,7);
106
1.28M
  const __m128i vTR   = _mm_set1_epi16((short)TR);
107
1.28M
  const __m128i vNTm1 = _mm_set1_epi16((short)(nT-1));
108
1.28M
  const __m128i one   = _mm_set1_epi16(1);
109
110
7.05M
  for (int y=0;y<nT;y++) {
111
5.76M
    const int left_y = border[-1-y];
112
5.76M
    const int Cy = (y+1)*BL + nT;                 // constant term for this row
113
5.76M
    const __m128i vL    = _mm_set1_epi16((short)left_y);
114
5.76M
    const __m128i vNT1Y = _mm_set1_epi16((short)(nT-1-y));
115
5.76M
    const __m128i vC    = _mm_set1_epi16((short)Cy);
116
117
11.7M
    for (int x=0;x<nT;x+=8) {
118
6.02M
      const __m128i xidx = _mm_add_epi16(_mm_set1_epi16((short)x), base);
119
6.02M
      const __m128i vA   = _mm_sub_epi16(vNTm1, xidx);     // (nT-1-x)
120
6.02M
      const __m128i vB   = _mm_add_epi16(xidx, one);       // (x+1)
121
6.02M
      const __m128i top  = (nT==4) ? load4_epi16(border+1+x) : load8_epi16(border+1+x);
122
123
6.02M
      __m128i acc = _mm_mullo_epi16(vA,  vL);              // (nT-1-x)*border[-1-y]
124
6.02M
      acc = _mm_add_epi16(acc, _mm_mullo_epi16(vB,  vTR)); // (x+1)*border[1+nT]
125
6.02M
      acc = _mm_add_epi16(acc, _mm_mullo_epi16(top, vNT1Y));// (nT-1-y)*border[1+x]
126
6.02M
      acc = _mm_add_epi16(acc, vC);                        // (y+1)*border[-1-nT] + nT
127
6.02M
      acc = _mm_srli_epi16(acc, shift);
128
129
6.02M
      const __m128i p = _mm_packus_epi16(acc, acc);
130
6.02M
      store_row(dst + y*stride + x, p, (nT<8)?nT:8);
131
6.02M
    }
132
5.76M
  }
133
1.28M
}
134
135
136
void intra_pred_angular_8_sse4(uint8_t* dst, ptrdiff_t stride, int bit_depth, int disableBoundaryFilter,
137
                               int xB0, int yB0, int mode, int nT, int cIdx, const uint8_t* border)
138
1.80M
{
139
1.80M
  const int intraPredAngle = intraPredAngle_table[mode];
140
141
1.80M
  uint8_t  ref_mem[4*kMaxBlk+1];
142
1.80M
  uint8_t* ref = &ref_mem[2*kMaxBlk];
143
144
1.80M
  if (mode >= 18) {
145
4.11M
    for (int x=0;x<=nT;x++) ref[x] = border[x];
146
147
620k
    if (intraPredAngle<0) {
148
187k
      const int invAngle = invAngle_table[mode-11];
149
187k
      if (((nT*intraPredAngle)>>5) < -1) {
150
600k
        for (int x=(nT*intraPredAngle)>>5; x<=-1; x++)
151
464k
          ref[x] = border[0-((x*invAngle+128)>>8)];
152
135k
      }
153
432k
    } else {
154
2.43M
      for (int x=nT+1; x<=2*nT; x++) ref[x] = border[x];
155
432k
    }
156
157
3.48M
    for (int y=0;y<nT;y++) {
158
2.86M
      const int iIdx  = ((y+1)*intraPredAngle)>>5;
159
2.86M
      const int iFact = ((y+1)*intraPredAngle)&31;
160
2.86M
      const uint8_t* src = ref + iIdx + 1;
161
2.86M
      uint8_t* d = dst + y*stride;
162
163
2.86M
      if (iFact==0) {
164
1.18M
        copy_row(d, src, nT);                     // dst[x] = ref[x+iIdx+1]
165
1.68M
      } else {
166
1.68M
        const __m128i w0  = _mm_set1_epi16((short)(32-iFact));
167
1.68M
        const __m128i w1  = _mm_set1_epi16((short)iFact);
168
1.68M
        const __m128i r16 = _mm_set1_epi16(16);
169
1.68M
        if (nT==4) {
170
1.31M
          __m128i a = load4_epi16(src), b = load4_epi16(src+1);
171
1.31M
          __m128i acc = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(a,w0), _mm_mullo_epi16(b,w1)), r16);
172
1.31M
          acc = _mm_srli_epi16(acc, 5);
173
1.31M
          store_row(d, _mm_packus_epi16(acc,acc), 4);
174
1.31M
        } else {
175
876k
          for (int x=0;x<nT;x+=8) {
176
512k
            __m128i a = load8_epi16(src+x), b = load8_epi16(src+x+1);
177
512k
            __m128i acc = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(a,w0), _mm_mullo_epi16(b,w1)), r16);
178
512k
            acc = _mm_srli_epi16(acc, 5);
179
512k
            store_row(d+x, _mm_packus_epi16(acc,acc), 8);
180
512k
          }
181
364k
        }
182
1.68M
      }
183
2.86M
    }
184
185
620k
    if (mode==26 && cIdx==0 && nT<32 && !disableBoundaryFilter) {
186
211k
      for (int y=0;y<nT;y++)
187
178k
        dst[y*stride] = (uint8_t)Clip_BitDepth(border[1] + ((border[-1-y] - border[0])>>1), bit_depth);
188
33.2k
    }
189
620k
  }
190
1.18M
  else {
191
    // Modes 2..17: the reference projection is transposed (per-column iIdx/iFact and
192
    // a row-indexed reference fetch), which does not map onto contiguous SIMD loads or
193
    // stores. Use the scalar reference path here -- kept bit-identical to
194
    // intra_prediction_angular() in intrapred.h.
195
7.68M
    for (int x=0;x<=nT;x++) ref[x] = border[-x];
196
197
1.18M
    if (intraPredAngle<0) {
198
117k
      const int invAngle = invAngle_table[mode-11];
199
117k
      if (((nT*intraPredAngle)>>5) < -1) {
200
377k
        for (int x=(nT*intraPredAngle)>>5; x<=-1; x++)
201
287k
          ref[x] = border[(x*invAngle+128)>>8];
202
90.1k
      }
203
1.06M
    } else {
204
5.82M
      for (int x=nT+1; x<=2*nT; x++) ref[x] = border[-x];
205
1.06M
    }
206
207
6.50M
    for (int y=0;y<nT;y++)
208
33.5M
      for (int x=0;x<nT;x++) {
209
28.2M
        const int iIdx  = ((x+1)*intraPredAngle)>>5;
210
28.2M
        const int iFact = ((x+1)*intraPredAngle)&31;
211
28.2M
        if (iFact != 0)
212
8.25M
          dst[x+y*stride] = (uint8_t)(((32-iFact)*ref[y+iIdx+1] + iFact*ref[y+iIdx+2] + 16)>>5);
213
19.9M
        else
214
19.9M
          dst[x+y*stride] = ref[y+iIdx+1];
215
28.2M
      }
216
217
1.18M
    if (mode==10 && cIdx==0 && nT<32 && !disableBoundaryFilter) {
218
88.2k
      for (int x=0;x<nT;x++)
219
74.7k
        dst[x] = (uint8_t)Clip_BitDepth(border[-1] + ((border[1+x] - border[0])>>1), bit_depth);
220
13.4k
    }
221
1.18M
  }
222
1.80M
}
223
224
#endif // HAVE_SSE4_1