/src/libde265/libde265/x86/sse-intrapred.cc

Source
/*
 * H.265 video codec.
 * Copyright (c) 2026 Dirk Farin <dirk.farin@gmail.com>
 *
 * This file is part of libde265.
 *
 * libde265 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * libde265 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "x86/sse-intrapred.h"
#include "libde265/util.h"

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <string.h>
#include <emmintrin.h> // SSE2

#if HAVE_SSE4_1
#include <smmintrin.h> // SSE4.1

// angle / inverse-angle lookup tables, defined in intrapred.cc
extern const int intraPredAngle_table[1+34];
extern const int invAngle_table[25-10];

namespace {

const int kMaxBlk = 64; // == MAX_INTRA_PRED_BLOCK_SIZE

// load 4 / 8 consecutive uint8_t and zero-extend to 8x int16
inline __m128i load4_epi16(const uint8_t* p) {
  int32_t t; memcpy(&t, p, 4);
  return _mm_cvtepu8_epi16(_mm_cvtsi32_si128(t));
}
inline __m128i load8_epi16(const uint8_t* p) {
  return _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*)p));
}

// store nT bytes from the low lanes of an u8-packed register (nT in {4,8,16,32})
inline void store_row(uint8_t* d, __m128i v, int nT) {
  switch (nT) {
    case 4:  { int32_t t = _mm_cvtsi128_si32(v); memcpy(d, &t, 4); } break;
    case 8:  _mm_storel_epi64((__m128i*)d, v); break;
    case 16: _mm_storeu_si128((__m128i*)d, v); break;
    default: _mm_storeu_si128((__m128i*)d, v);
             _mm_storeu_si128((__m128i*)(d+16), v); break;
  }
}

// copy exactly nT bytes (nT in {4,8,16,32})
inline void copy_row(uint8_t* d, const uint8_t* s, int nT) {
  switch (nT) {
    case 4:  memcpy(d, s, 4); break;
    case 8:  memcpy(d, s, 8); break;
    case 16: _mm_storeu_si128((__m128i*)d, _mm_loadu_si128((const __m128i*)s)); break;
    default: _mm_storeu_si128((__m128i*)d,      _mm_loadu_si128((const __m128i*)s));
             _mm_storeu_si128((__m128i*)(d+16), _mm_loadu_si128((const __m128i*)(s+16))); break;
  }
}

inline int shift_for(int nT) { return (nT==4)?3 : (nT==8)?4 : (nT==16)?5 : 6; } // Log2(nT)+1

} // namespace


// DC intra prediction for an nT x nT block of 8-bit samples.
//
//   dst     top-left sample of the destination block
//   stride  distance in bytes between consecutive destination rows
//   nT      block edge length (4, 8, 16 or 32)
//   cIdx    colour component index; the edge smoothing is applied only for 0
//   border  neighbouring samples: border[1..nT] and border[-1..-nT] are averaged
//
// The block is filled with the rounded mean of the 2*nT neighbours. For
// cIdx==0 and nT<32 the first row and first column are afterwards blended
// with the adjacent neighbour sample.
void intra_pred_dc_8_sse4(uint8_t* dst, ptrdiff_t stride, int nT, int cIdx, const uint8_t* border)
{
  // start the accumulator at nT so the final shift rounds to nearest
  int sum = nT;
  for (int k=1; k<=nT; k++) {
    sum += border[k];
    sum += border[-k];
  }
  const int dc = sum >> shift_for(nT);

  // flat fill of the whole block
  const __m128i fill = _mm_set1_epi8((char)dc);
  for (int row=0; row<nT; row++) {
    store_row(dst + row*stride, fill, nT);
  }

  if (cIdx!=0 || nT>=32) {
    return;   // no edge smoothing for this component / block size
  }

  // corner sample: blended with both adjacent neighbours
  dst[0] = (uint8_t)((border[-1] + 2*dc + border[1] + 2) >> 2);

  // remaining samples of the first row
  for (int col=1; col<nT; col++) {
    dst[col] = (uint8_t)((border[col+1] + 3*dc + 2) >> 2);
  }

  // remaining samples of the first column
  for (int row=1; row<nT; row++) {
    dst[row*stride] = (uint8_t)((border[-row-1] + 3*dc + 2) >> 2);
  }
}


// Planar intra prediction for an nT x nT block of 8-bit samples.
//
//   dst     top-left sample of the destination block
//   stride  distance in bytes between consecutive destination rows
//   nT      block edge length (4, 8, 16 or 32)
//   cIdx    colour component index (not used by this predictor)
//   border  neighbouring samples: border[1..nT+1] and border[-1..-nT-1] are read
//
// Each output sample is
//   ( (nT-1-x)*border[-1-y] + (x+1)*border[1+nT]
//   + (nT-1-y)*border[1+x]  + (y+1)*border[-1-nT] + nT ) >> (Log2(nT)+1)
// evaluated for up to eight x positions at a time in 16-bit lanes.
void intra_pred_planar_8_sse4(uint8_t* dst, ptrdiff_t stride, int nT, int cIdx, const uint8_t* border)
{
  const int shift      = shift_for(nT);
  const int width      = (nT<8) ? nT : 8;   // samples written per vector step
  const int bottomLeft = border[-1-nT];

  const __m128i topRight = _mm_set1_epi16((short)border[1+nT]);
  const __m128i lane     = _mm_setr_epi16(0,1,2,3,4,5,6,7);

  for (int row=0; row<nT; row++) {
    uint8_t* out = dst + row*stride;

    // values that are constant along the row
    const __m128i left = _mm_set1_epi16((short)border[-1-row]);
    const __m128i wTop = _mm_set1_epi16((short)(nT-1-row));
    const __m128i bias = _mm_set1_epi16((short)((row+1)*bottomLeft + nT));

    for (int col=0; col<nT; col+=8) {
      const __m128i colIdx = _mm_add_epi16(_mm_set1_epi16((short)col), lane);
      const __m128i wLeft  = _mm_sub_epi16(_mm_set1_epi16((short)(nT-1)), colIdx); // (nT-1-x)
      const __m128i wRight = _mm_add_epi16(colIdx, _mm_set1_epi16(1));             // (x+1)

      __m128i top;
      if (nT==4) {
        top = load4_epi16(border+1+col);   // only four neighbours are fetched
      } else {
        top = load8_epi16(border+1+col);
      }

      // horizontal interpolation: left neighbour <-> top-right sample
      const __m128i horiz = _mm_add_epi16(_mm_mullo_epi16(wLeft,  left),
                                          _mm_mullo_epi16(wRight, topRight));
      // vertical interpolation: top neighbour <-> bottom-left sample, plus rounding
      const __m128i vert  = _mm_add_epi16(_mm_mullo_epi16(top, wTop), bias);

      const __m128i sum = _mm_srli_epi16(_mm_add_epi16(horiz, vert), shift);
      store_row(out + col, _mm_packus_epi16(sum, sum), width);
    }
  }
}


void intra_pred_angular_8_sse4(uint8_t* dst, ptrdiff_t stride, int bit_depth, int disableBoundaryFilter,
                               int xB0, int yB0, int mode, int nT, int cIdx, const uint8_t* border)
{
  const int intraPredAngle = intraPredAngle_table[mode];

  uint8_t  ref_mem[4*kMaxBlk+1];
  uint8_t* ref = &ref_mem[2*kMaxBlk];

  if (mode >= 18) {
    for (int x=0;x<=nT;x++) ref[x] = border[x];

    if (intraPredAngle<0) {
      const int invAngle = invAngle_table[mode-11];
      if (((nT*intraPredAngle)>>5) < -1) {
        for (int x=(nT*intraPredAngle)>>5; x<=-1; x++)
          ref[x] = border[0-((x*invAngle+128)>>8)];
      }
    } else {
      for (int x=nT+1; x<=2*nT; x++) ref[x] = border[x];
    }

    for (int y=0;y<nT;y++) {
      const int iIdx  = ((y+1)*intraPredAngle)>>5;
      const int iFact = ((y+1)*intraPredAngle)&31;
      const uint8_t* src = ref + iIdx + 1;
      uint8_t* d = dst + y*stride;

      if (iFact==0) {
        copy_row(d, src, nT);                     // dst[x] = ref[x+iIdx+1]
      } else {
        const __m128i w0  = _mm_set1_epi16((short)(32-iFact));
        const __m128i w1  = _mm_set1_epi16((short)iFact);
        const __m128i r16 = _mm_set1_epi16(16);
        if (nT==4) {
          __m128i a = load4_epi16(src), b = load4_epi16(src+1);
          __m128i acc = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(a,w0), _mm_mullo_epi16(b,w1)), r16);
          acc = _mm_srli_epi16(acc, 5);
          store_row(d, _mm_packus_epi16(acc,acc), 4);
        } else {
          for (int x=0;x<nT;x+=8) {
            __m128i a = load8_epi16(src+x), b = load8_epi16(src+x+1);
            __m128i acc = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(a,w0), _mm_mullo_epi16(b,w1)), r16);
            acc = _mm_srli_epi16(acc, 5);
            store_row(d+x, _mm_packus_epi16(acc,acc), 8);
          }
        }
      }
    }

    if (mode==26 && cIdx==0 && nT<32 && !disableBoundaryFilter) {
      for (int y=0;y<nT;y++)
        dst[y*stride] = (uint8_t)Clip_BitDepth(border[1] + ((border[-1-y] - border[0])>>1), bit_depth);
    }
  }
  else {
    // Modes 2..17: the reference projection is transposed (per-column iIdx/iFact and
    // a row-indexed reference fetch), which does not map onto contiguous SIMD loads or
    // stores. Use the scalar reference path here -- kept bit-identical to
    // intra_prediction_angular() in intrapred.h.
    for (int x=0;x<=nT;x++) ref[x] = border[-x];

    if (intraPredAngle<0) {
      const int invAngle = invAngle_table[mode-11];
      if (((nT*intraPredAngle)>>5) < -1) {
        for (int x=(nT*intraPredAngle)>>5; x<=-1; x++)
          ref[x] = border[(x*invAngle+128)>>8];
      }
    } else {
      for (int x=nT+1; x<=2*nT; x++) ref[x] = border[-x];
    }

    for (int y=0;y<nT;y++)
      for (int x=0;x<nT;x++) {
        const int iIdx  = ((x+1)*intraPredAngle)>>5;
        const int iFact = ((x+1)*intraPredAngle)&31;
        if (iFact != 0)
          dst[x+y*stride] = (uint8_t)(((32-iFact)*ref[y+iIdx+1] + iFact*ref[y+iIdx+2] + 16)>>5);
        else
          dst[x+y*stride] = ref[y+iIdx+1];
      }

    if (mode==10 && cIdx==0 && nT<32 && !disableBoundaryFilter) {
      for (int x=0;x<nT;x++)
        dst[x] = (uint8_t)Clip_BitDepth(border[-1] + ((border[1+x] - border[0])>>1), bit_depth);
    }
  }
}

#endif // HAVE_SSE4_1

Coverage Report

Created: 2026-06-07 07:20

Line	Count	Source
1		/*
2		* H.265 video codec.
3		* Copyright (c) 2026 Dirk Farin <dirk.farin@gmail.com>
4		*
5		* This file is part of libde265.
6		*
7		* libde265 is free software: you can redistribute it and/or modify
8		* it under the terms of the GNU Lesser General Public License as
9		* published by the Free Software Foundation, either version 3 of
10		* the License, or (at your option) any later version.
11		*
12		* libde265 is distributed in the hope that it will be useful,
13		* but WITHOUT ANY WARRANTY; without even the implied warranty of
14		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15		* GNU Lesser General Public License for more details.
16		*
17		* You should have received a copy of the GNU Lesser General Public License
18		* along with libde265. If not, see <http://www.gnu.org/licenses/>.
19		*/
20
21		#include "x86/sse-intrapred.h"
22		#include "libde265/util.h"
23
24		#ifdef HAVE_CONFIG_H
25		#include "config.h"
26		#endif
27
28		#include <string.h>
29		#include <emmintrin.h> // SSE2
30
31		#if HAVE_SSE4_1
32		#include <smmintrin.h> // SSE4.1
33
34		// angle / inverse-angle lookup tables, defined in intrapred.cc
35		extern const int intraPredAngle_table[1+34];
36		extern const int invAngle_table[25-10];
37
38		namespace {
39
40		const int kMaxBlk = 64; // == MAX_INTRA_PRED_BLOCK_SIZE
41
42		// load 4 / 8 consecutive uint8_t and zero-extend to 8x int16
43	7.28M	inline __m128i load4_epi16(const uint8_t* p) {
44	7.28M	int32_t t; memcpy(&t, p, 4);
45	7.28M	return _mm_cvtepu8_epi16(_mm_cvtsi32_si128(t));
46	7.28M	}
47	2.39M	inline __m128i load8_epi16(const uint8_t* p) {
48	2.39M	return _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*)p));
49	2.39M	}
50
51		// store nT bytes from the low lanes of an u8-packed register (nT in {4,8,16,32})
52	9.67M	inline void store_row(uint8_t* d, __m128i v, int nT) {
53	9.67M	switch (nT) {
54	7.33M	case 4: { int32_t t = _mm_cvtsi128_si32(v); memcpy(d, &t, 4); } break;
55	2.26M	case 8: _mm_storel_epi64((__m128i*)d, v); break;
56	42.6k	case 16: _mm_storeu_si128((__m128i*)d, v); break;
57	19.3k	default: _mm_storeu_si128((__m128i*)d, v);
58	19.3k	_mm_storeu_si128((__m128i*)(d+16), v); break;
59	9.67M	}
60	9.67M	}
61
62		// copy exactly nT bytes (nT in {4,8,16,32})
63	1.18M	inline void copy_row(uint8_t* d, const uint8_t* s, int nT) {
64	1.18M	switch (nT) {
65	886k	case 4: memcpy(d, s, 4); break;
66	205k	case 8: memcpy(d, s, 8); break;
67	60.7k	case 16: _mm_storeu_si128((__m128i*)d, _mm_loadu_si128((const __m128i*)s)); break;
68	31.8k	default: _mm_storeu_si128((__m128i*)d, _mm_loadu_si128((const __m128i*)s));
69	31.8k	_mm_storeu_si128((__m128i*)(d+16), _mm_loadu_si128((const __m128i*)(s+16))); break;
70	1.18M	}
71	1.18M	}
72
73	1.68M	inline int shift_for(int nT) { return (nT==4)?3 : (nT==8)?4 : (nT==16)?5 : 6; } // Log2(nT)+1
74
75		} // namespace
76
77
78		void intra_pred_dc_8_sse4(uint8_t* dst, ptrdiff_t stride, int nT, int cIdx, const uint8_t* border)
79	393k	{
80	393k	const int shift = shift_for(nT);
81
82	393k	int dcVal = 0;
83	2.20M	for (int i=0;i<nT;i++) { dcVal += border[i+1]; dcVal += border[-i-1]; }
84	393k	dcVal += nT;
85	393k	dcVal >>= shift;
86
87	393k	const __m128i v = _mm_set1_epi8((char)dcVal);
88	2.20M	for (int y=0;y<nT;y++) store_row(dst + y*stride, v, nT);
89
90		// luma edge smoothing overwrites first row and first column (disjoint cells)
91	393k	if (cIdx==0 && nT<32) {
92	98.3k	dst[0] = (uint8_t)((border[-1] + 2*dcVal + border[1] + 2) >> 2);
93	554k	for (int x=1;x<nT;x++) dst[x] = (uint8_t)((border[ x+1] + 3*dcVal + 2) >> 2);
94	554k	for (int y=1;y<nT;y++) dst[y*stride] = (uint8_t)((border[-y-1] + 3*dcVal + 2) >> 2);
95	98.3k	}
96	393k	}
97
98
99		void intra_pred_planar_8_sse4(uint8_t* dst, ptrdiff_t stride, int nT, int cIdx, const uint8_t* border)
100	1.28M	{
101	1.28M	const int shift = shift_for(nT);
102	1.28M	const int TR = border[ 1+nT]; // top-right corner sample
103	1.28M	const int BL = border[-1-nT]; // bottom-left corner sample
104
105	1.28M	const __m128i base = _mm_setr_epi16(0,1,2,3,4,5,6,7);
106	1.28M	const __m128i vTR = _mm_set1_epi16((short)TR);
107	1.28M	const __m128i vNTm1 = _mm_set1_epi16((short)(nT-1));
108	1.28M	const __m128i one = _mm_set1_epi16(1);
109
110	7.05M	for (int y=0;y<nT;y++) {
111	5.76M	const int left_y = border[-1-y];
112	5.76M	const int Cy = (y+1)*BL + nT; // constant term for this row
113	5.76M	const __m128i vL = _mm_set1_epi16((short)left_y);
114	5.76M	const __m128i vNT1Y = _mm_set1_epi16((short)(nT-1-y));
115	5.76M	const __m128i vC = _mm_set1_epi16((short)Cy);
116
117	11.7M	for (int x=0;x<nT;x+=8) {
118	6.02M	const __m128i xidx = _mm_add_epi16(_mm_set1_epi16((short)x), base);
119	6.02M	const __m128i vA = _mm_sub_epi16(vNTm1, xidx); // (nT-1-x)
120	6.02M	const __m128i vB = _mm_add_epi16(xidx, one); // (x+1)
121	6.02M	const __m128i top = (nT==4) ? load4_epi16(border+1+x) : load8_epi16(border+1+x);
122
123	6.02M	__m128i acc = _mm_mullo_epi16(vA, vL); // (nT-1-x)*border[-1-y]
124	6.02M	acc = _mm_add_epi16(acc, _mm_mullo_epi16(vB, vTR)); // (x+1)*border[1+nT]
125	6.02M	acc = _mm_add_epi16(acc, _mm_mullo_epi16(top, vNT1Y));// (nT-1-y)*border[1+x]
126	6.02M	acc = _mm_add_epi16(acc, vC); // (y+1)*border[-1-nT] + nT
127	6.02M	acc = _mm_srli_epi16(acc, shift);
128
129	6.02M	const __m128i p = _mm_packus_epi16(acc, acc);
130	6.02M	store_row(dst + y*stride + x, p, (nT<8)?nT:8);
131	6.02M	}
132	5.76M	}
133	1.28M	}
134
135
136		void intra_pred_angular_8_sse4(uint8_t* dst, ptrdiff_t stride, int bit_depth, int disableBoundaryFilter,
137		int xB0, int yB0, int mode, int nT, int cIdx, const uint8_t* border)
138	1.80M	{
139	1.80M	const int intraPredAngle = intraPredAngle_table[mode];
140
141	1.80M	uint8_t ref_mem[4*kMaxBlk+1];
142	1.80M	uint8_t* ref = &ref_mem[2*kMaxBlk];
143
144	1.80M	if (mode >= 18) {
145	4.11M	for (int x=0;x<=nT;x++) ref[x] = border[x];
146
147	620k	if (intraPredAngle<0) {
148	187k	const int invAngle = invAngle_table[mode-11];
149	187k	if (((nT*intraPredAngle)>>5) < -1) {
150	600k	for (int x=(nT*intraPredAngle)>>5; x<=-1; x++)
151	464k	ref[x] = border[0-((x*invAngle+128)>>8)];
152	135k	}
153	432k	} else {
154	2.43M	for (int x=nT+1; x<=2*nT; x++) ref[x] = border[x];
155	432k	}
156
157	3.48M	for (int y=0;y<nT;y++) {
158	2.86M	const int iIdx = ((y+1)*intraPredAngle)>>5;
159	2.86M	const int iFact = ((y+1)*intraPredAngle)&31;
160	2.86M	const uint8_t* src = ref + iIdx + 1;
161	2.86M	uint8_t* d = dst + y*stride;
162
163	2.86M	if (iFact==0) {
164	1.18M	copy_row(d, src, nT); // dst[x] = ref[x+iIdx+1]
165	1.68M	} else {
166	1.68M	const __m128i w0 = _mm_set1_epi16((short)(32-iFact));
167	1.68M	const __m128i w1 = _mm_set1_epi16((short)iFact);
168	1.68M	const __m128i r16 = _mm_set1_epi16(16);
169	1.68M	if (nT==4) {
170	1.31M	__m128i a = load4_epi16(src), b = load4_epi16(src+1);
171	1.31M	__m128i acc = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(a,w0), _mm_mullo_epi16(b,w1)), r16);
172	1.31M	acc = _mm_srli_epi16(acc, 5);
173	1.31M	store_row(d, _mm_packus_epi16(acc,acc), 4);
174	1.31M	} else {
175	876k	for (int x=0;x<nT;x+=8) {
176	512k	__m128i a = load8_epi16(src+x), b = load8_epi16(src+x+1);
177	512k	__m128i acc = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(a,w0), _mm_mullo_epi16(b,w1)), r16);
178	512k	acc = _mm_srli_epi16(acc, 5);
179	512k	store_row(d+x, _mm_packus_epi16(acc,acc), 8);
180	512k	}
181	364k	}
182	1.68M	}
183	2.86M	}
184
185	620k	if (mode==26 && cIdx==0 && nT<32 && !disableBoundaryFilter) {
186	211k	for (int y=0;y<nT;y++)
187	178k	dst[y*stride] = (uint8_t)Clip_BitDepth(border[1] + ((border[-1-y] - border[0])>>1), bit_depth);
188	33.2k	}
189	620k	}
190	1.18M	else {
191		// Modes 2..17: the reference projection is transposed (per-column iIdx/iFact and
192		// a row-indexed reference fetch), which does not map onto contiguous SIMD loads or
193		// stores. Use the scalar reference path here -- kept bit-identical to
194		// intra_prediction_angular() in intrapred.h.
195	7.68M	for (int x=0;x<=nT;x++) ref[x] = border[-x];
196
197	1.18M	if (intraPredAngle<0) {
198	117k	const int invAngle = invAngle_table[mode-11];
199	117k	if (((nT*intraPredAngle)>>5) < -1) {
200	377k	for (int x=(nT*intraPredAngle)>>5; x<=-1; x++)
201	287k	ref[x] = border[(x*invAngle+128)>>8];
202	90.1k	}
203	1.06M	} else {
204	5.82M	for (int x=nT+1; x<=2*nT; x++) ref[x] = border[-x];
205	1.06M	}
206
207	6.50M	for (int y=0;y<nT;y++)
208	33.5M	for (int x=0;x<nT;x++) {
209	28.2M	const int iIdx = ((x+1)*intraPredAngle)>>5;
210	28.2M	const int iFact = ((x+1)*intraPredAngle)&31;
211	28.2M	if (iFact != 0)
212	8.25M	dst[x+y*stride] = (uint8_t)(((32-iFact)*ref[y+iIdx+1] + iFact*ref[y+iIdx+2] + 16)>>5);
213	19.9M	else
214	19.9M	dst[x+y*stride] = ref[y+iIdx+1];
215	28.2M	}
216
217	1.18M	if (mode==10 && cIdx==0 && nT<32 && !disableBoundaryFilter) {
218	88.2k	for (int x=0;x<nT;x++)
219	74.7k	dst[x] = (uint8_t)Clip_BitDepth(border[-1] + ((border[1+x] - border[0])>>1), bit_depth);
220	13.4k	}
221	1.18M	}
222	1.80M	}
223
224		#endif // HAVE_SSE4_1