/work/libde265/libde265/x86/sse-intrapred.cc

Source
/*
 * H.265 video codec.
 * Copyright (c) 2026 Dirk Farin <dirk.farin@gmail.com>
 *
 * This file is part of libde265.
 *
 * libde265 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * libde265 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "x86/sse-intrapred.h"
#include "libde265/util.h"

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <string.h>
#include <emmintrin.h> // SSE2

#if HAVE_SSE4_1
#include <smmintrin.h> // SSE4.1

// angle / inverse-angle lookup tables, defined in intrapred.cc
extern const int intraPredAngle_table[1+34];
extern const int invAngle_table[25-10];

namespace {

const int kMaxBlk = 64; // == MAX_INTRA_PRED_BLOCK_SIZE

// load 4 / 8 consecutive uint8_t and zero-extend to 8x int16
inline __m128i load4_epi16(const uint8_t* p) {
  int32_t t; memcpy(&t, p, 4);
  return _mm_cvtepu8_epi16(_mm_cvtsi32_si128(t));
}
inline __m128i load8_epi16(const uint8_t* p) {
  return _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*)p));
}

// Write one row of nT bytes at d from register v.
//   nT == 4, 8, 16 : the lowest nT bytes of v are written.
//   any other nT   : treated as 32 -- all 16 bytes of v are written twice, back
//                    to back, so the row is only meaningful when v holds a
//                    repeating pattern (e.g. a broadcast value).
inline void store_row(uint8_t* d, __m128i v, int nT) {
  if (nT == 4) {
    const int32_t low32 = _mm_cvtsi128_si32(v);
    memcpy(d, &low32, sizeof(low32));
    return;
  }
  if (nT == 8) {
    _mm_storel_epi64((__m128i*)d, v);
    return;
  }

  _mm_storeu_si128((__m128i*)d, v);
  if (nT != 16) {
    _mm_storeu_si128((__m128i*)(d+16), v);
  }
}

// Copy nT bytes from s to d. Sizes 4 and 8 go through memcpy, 16 uses a single
// unaligned 128-bit load/store pair, and every other value is handled as 32
// (two consecutive 16-byte pairs).
inline void copy_row(uint8_t* d, const uint8_t* s, int nT) {
  if (nT == 4 || nT == 8) {
    memcpy(d, s, (size_t)nT);
    return;
  }

  const int chunkCount = (nT == 16) ? 1 : 2;
  for (int c = 0; c < chunkCount; c++) {
    const __m128i chunk = _mm_loadu_si128((const __m128i*)(s + 16*c));
    _mm_storeu_si128((__m128i*)(d + 16*c), chunk);
  }
}

// Right-shift amount Log2(nT)+1 for a block of size nT:
// 4 -> 3, 8 -> 4, 16 -> 5; every other value yields 6.
inline int shift_for(int nT) {
  switch (nT) {
    case 4:  return 3;
    case 8:  return 4;
    case 16: return 5;
    default: return 6;
  }
}

} // namespace


// DC intra prediction for 8-bit samples.
//
// dst/stride : output block of nT x nT bytes, rows `stride` bytes apart.
// nT         : block size; the helpers used here handle 4, 8, 16 and 32.
// cIdx       : colour component index; edge smoothing is applied only for 0.
// border     : reference samples centred on border[0]; border[1..nT] and
//              border[-1..-nT] are read (presumably the row above and the
//              column to the left -- confirm against the caller).
void intra_pred_dc_8_sse4(uint8_t* dst, ptrdiff_t stride, int nT, int cIdx, const uint8_t* border)
{
  // Rounded mean of the 2*nT neighbouring samples: (sum + nT) >> (Log2(nT)+1).
  int total = 0;
  for (int k=1; k<=nT; k++) {
    total += border[k];
    total += border[-k];
  }
  const int dcVal = (total + nT) >> shift_for(nT);

  // Flood the whole block with the DC value, one row at a time.
  const __m128i fill = _mm_set1_epi8((char)dcVal);
  for (int row=0; row<nT; row++) {
    store_row(dst + row*stride, fill, nT);
  }

  // Edge smoothing applies only to component 0 and block sizes below 32.
  if (cIdx != 0 || nT >= 32) {
    return;
  }

  // Corner sample blends both neighbours; the rest of the first row and first
  // column blend their single neighbour with a 3x weighted DC value.
  const int dcTimes3 = 3*dcVal;
  dst[0] = (uint8_t)((border[-1] + 2*dcVal + border[1] + 2) >> 2);
  for (int x=1; x<nT; x++) {
    dst[x] = (uint8_t)((border[x+1] + dcTimes3 + 2) >> 2);
  }
  for (int y=1; y<nT; y++) {
    dst[y*stride] = (uint8_t)((border[-y-1] + dcTimes3 + 2) >> 2);
  }
}


// Planar intra prediction for 8-bit samples, eight output samples per vector step.
//
// For every position (x,y) of the nT x nT block the result is
//   ( (nT-1-x)*border[-1-y] + (x+1)*border[1+nT]
//   + (nT-1-y)*border[1+x]  + (y+1)*border[-1-nT] + nT ) >> (Log2(nT)+1)
//
// dst/stride : output block, rows `stride` bytes apart.
// nT         : block size; the helpers used here handle 4, 8, 16 and 32.
// cIdx       : not referenced by this function.
// border     : reference samples centred on border[0]; indices -1-nT .. 1+nT are read.
void intra_pred_planar_8_sse4(uint8_t* dst, ptrdiff_t stride, int nT, int cIdx, const uint8_t* border)
{
  const int log2Shift  = shift_for(nT);
  const int topRight   = border[ 1+nT];
  const int bottomLeft = border[-1-nT];
  const int rowChunk   = (nT<8) ? nT : 8;   // bytes written per vector step

  const __m128i laneIdx   = _mm_setr_epi16(0,1,2,3,4,5,6,7);
  const __m128i vTopRight = _mm_set1_epi16((short)topRight);
  const __m128i vLastCol  = _mm_set1_epi16((short)(nT-1));
  const __m128i vOne      = _mm_set1_epi16(1);

  for (int y=0; y<nT; y++) {
    uint8_t* out = dst + y*stride;

    // Per-row constants: left neighbour, weight of the top sample, and the
    // x-independent term (y+1)*bottomLeft + nT.
    const __m128i vLeft    = _mm_set1_epi16((short)border[-1-y]);
    const __m128i vTopWt   = _mm_set1_epi16((short)(nT-1-y));
    const __m128i vRowBias = _mm_set1_epi16((short)((y+1)*bottomLeft + nT));

    for (int x=0; x<nT; x+=8) {
      const __m128i col     = _mm_add_epi16(laneIdx, _mm_set1_epi16((short)x));
      const __m128i leftWt  = _mm_sub_epi16(vLastCol, col);   // nT-1-x
      const __m128i rightWt = _mm_add_epi16(col, vOne);       // x+1

      // A 4-wide block only has four top samples; lanes 4..7 are computed on
      // zeros and never stored.
      __m128i top;
      if (nT==4) {
        top = load4_epi16(border+1+x);
      } else {
        top = load8_epi16(border+1+x);
      }

      const __m128i horiz = _mm_add_epi16(_mm_mullo_epi16(leftWt,  vLeft),
                                          _mm_mullo_epi16(rightWt, vTopRight));
      const __m128i vert  = _mm_add_epi16(_mm_mullo_epi16(top, vTopWt), vRowBias);
      const __m128i pred  = _mm_srli_epi16(_mm_add_epi16(horiz, vert), log2Shift);

      store_row(out + x, _mm_packus_epi16(pred, pred), rowChunk);
    }
  }
}


void intra_pred_angular_8_sse4(uint8_t* dst, ptrdiff_t stride, int bit_depth, int disableBoundaryFilter,
                               int xB0, int yB0, int mode, int nT, int cIdx, const uint8_t* border)
{
  const int intraPredAngle = intraPredAngle_table[mode];

  uint8_t  ref_mem[4*kMaxBlk+1];
  uint8_t* ref = &ref_mem[2*kMaxBlk];

  if (mode >= 18) {
    for (int x=0;x<=nT;x++) ref[x] = border[x];

    if (intraPredAngle<0) {
      const int invAngle = invAngle_table[mode-11];
      if (((nT*intraPredAngle)>>5) < -1) {
        for (int x=(nT*intraPredAngle)>>5; x<=-1; x++)
          ref[x] = border[0-((x*invAngle+128)>>8)];
      }
    } else {
      for (int x=nT+1; x<=2*nT; x++) ref[x] = border[x];
    }

    for (int y=0;y<nT;y++) {
      const int iIdx  = ((y+1)*intraPredAngle)>>5;
      const int iFact = ((y+1)*intraPredAngle)&31;
      const uint8_t* src = ref + iIdx + 1;
      uint8_t* d = dst + y*stride;

      if (iFact==0) {
        copy_row(d, src, nT);                     // dst[x] = ref[x+iIdx+1]
      } else {
        const __m128i w0  = _mm_set1_epi16((short)(32-iFact));
        const __m128i w1  = _mm_set1_epi16((short)iFact);
        const __m128i r16 = _mm_set1_epi16(16);
        if (nT==4) {
          __m128i a = load4_epi16(src), b = load4_epi16(src+1);
          __m128i acc = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(a,w0), _mm_mullo_epi16(b,w1)), r16);
          acc = _mm_srli_epi16(acc, 5);
          store_row(d, _mm_packus_epi16(acc,acc), 4);
        } else {
          for (int x=0;x<nT;x+=8) {
            __m128i a = load8_epi16(src+x), b = load8_epi16(src+x+1);
            __m128i acc = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(a,w0), _mm_mullo_epi16(b,w1)), r16);
            acc = _mm_srli_epi16(acc, 5);
            store_row(d+x, _mm_packus_epi16(acc,acc), 8);
          }
        }
      }
    }

    if (mode==26 && cIdx==0 && nT<32 && !disableBoundaryFilter) {
      for (int y=0;y<nT;y++)
        dst[y*stride] = (uint8_t)Clip_BitDepth(border[1] + ((border[-1-y] - border[0])>>1), bit_depth);
    }
  }
  else {
    // Modes 2..17: the reference projection is transposed (per-column iIdx/iFact and
    // a row-indexed reference fetch), which does not map onto contiguous SIMD loads or
    // stores. Use the scalar reference path here -- kept bit-identical to
    // intra_prediction_angular() in intrapred.h.
    for (int x=0;x<=nT;x++) ref[x] = border[-x];

    if (intraPredAngle<0) {
      const int invAngle = invAngle_table[mode-11];
      if (((nT*intraPredAngle)>>5) < -1) {
        for (int x=(nT*intraPredAngle)>>5; x<=-1; x++)
          ref[x] = border[(x*invAngle+128)>>8];
      }
    } else {
      for (int x=nT+1; x<=2*nT; x++) ref[x] = border[-x];
    }

    for (int y=0;y<nT;y++)
      for (int x=0;x<nT;x++) {
        const int iIdx  = ((x+1)*intraPredAngle)>>5;
        const int iFact = ((x+1)*intraPredAngle)&31;
        if (iFact != 0)
          dst[x+y*stride] = (uint8_t)(((32-iFact)*ref[y+iIdx+1] + iFact*ref[y+iIdx+2] + 16)>>5);
        else
          dst[x+y*stride] = ref[y+iIdx+1];
      }

    if (mode==10 && cIdx==0 && nT<32 && !disableBoundaryFilter) {
      for (int x=0;x<nT;x++)
        dst[x] = (uint8_t)Clip_BitDepth(border[-1] + ((border[1+x] - border[0])>>1), bit_depth);
    }
  }
}

#endif // HAVE_SSE4_1

Coverage Report

Created: 2026-06-15 06:22

Line	Count	Source
1		/*
2		* H.265 video codec.
3		* Copyright (c) 2026 Dirk Farin <dirk.farin@gmail.com>
4		*
5		* This file is part of libde265.
6		*
7		* libde265 is free software: you can redistribute it and/or modify
8		* it under the terms of the GNU Lesser General Public License as
9		* published by the Free Software Foundation, either version 3 of
10		* the License, or (at your option) any later version.
11		*
12		* libde265 is distributed in the hope that it will be useful,
13		* but WITHOUT ANY WARRANTY; without even the implied warranty of
14		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15		* GNU Lesser General Public License for more details.
16		*
17		* You should have received a copy of the GNU Lesser General Public License
18		* along with libde265. If not, see <http://www.gnu.org/licenses/>.
19		*/
20
21		#include "x86/sse-intrapred.h"
22		#include "libde265/util.h"
23
24		#ifdef HAVE_CONFIG_H
25		#include "config.h"
26		#endif
27
28		#include <string.h>
29		#include <emmintrin.h> // SSE2
30
31		#if HAVE_SSE4_1
32		#include <smmintrin.h> // SSE4.1
33
34		// angle / inverse-angle lookup tables, defined in intrapred.cc
35		extern const int intraPredAngle_table[1+34];
36		extern const int invAngle_table[25-10];
37
38		namespace {
39
40		const int kMaxBlk = 64; // == MAX_INTRA_PRED_BLOCK_SIZE
41
42		// load 4 / 8 consecutive uint8_t and zero-extend to 8x int16
43	0	inline __m128i load4_epi16(const uint8_t* p) {
44	0	int32_t t; memcpy(&t, p, 4);
45	0	return _mm_cvtepu8_epi16(_mm_cvtsi32_si128(t));
46	0	}
47	0	inline __m128i load8_epi16(const uint8_t* p) {
48	0	return _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*)p));
49	0	}
50
51		// store nT bytes from the low lanes of an u8-packed register (nT in {4,8,16,32})
52	0	inline void store_row(uint8_t* d, __m128i v, int nT) {
53	0	switch (nT) {
54	0	case 4: { int32_t t = _mm_cvtsi128_si32(v); memcpy(d, &t, 4); } break;
55	0	case 8: _mm_storel_epi64((__m128i*)d, v); break;
56	0	case 16: _mm_storeu_si128((__m128i*)d, v); break;
57	0	default: _mm_storeu_si128((__m128i*)d, v);
58	0	_mm_storeu_si128((__m128i*)(d+16), v); break;
59	0	}
60	0	}
61
62		// copy exactly nT bytes (nT in {4,8,16,32})
63	0	inline void copy_row(uint8_t* d, const uint8_t* s, int nT) {
64	0	switch (nT) {
65	0	case 4: memcpy(d, s, 4); break;
66	0	case 8: memcpy(d, s, 8); break;
67	0	case 16: _mm_storeu_si128((__m128i*)d, _mm_loadu_si128((const __m128i*)s)); break;
68	0	default: _mm_storeu_si128((__m128i*)d, _mm_loadu_si128((const __m128i*)s));
69	0	_mm_storeu_si128((__m128i*)(d+16), _mm_loadu_si128((const __m128i*)(s+16))); break;
70	0	}
71	0	}
72
73	0	inline int shift_for(int nT) { return (nT==4)?3 : (nT==8)?4 : (nT==16)?5 : 6; } // Log2(nT)+1
74
75		} // namespace
76
77
78		void intra_pred_dc_8_sse4(uint8_t* dst, ptrdiff_t stride, int nT, int cIdx, const uint8_t* border)
79	0	{
80	0	const int shift = shift_for(nT);
81
82	0	int dcVal = 0;
83	0	for (int i=0;i<nT;i++) { dcVal += border[i+1]; dcVal += border[-i-1]; }
84	0	dcVal += nT;
85	0	dcVal >>= shift;
86
87	0	const __m128i v = _mm_set1_epi8((char)dcVal);
88	0	for (int y=0;y<nT;y++) store_row(dst + y*stride, v, nT);
89
90		// luma edge smoothing overwrites first row and first column (disjoint cells)
91	0	if (cIdx==0 && nT<32) {
92	0	dst[0] = (uint8_t)((border[-1] + 2*dcVal + border[1] + 2) >> 2);
93	0	for (int x=1;x<nT;x++) dst[x] = (uint8_t)((border[ x+1] + 3*dcVal + 2) >> 2);
94	0	for (int y=1;y<nT;y++) dst[y*stride] = (uint8_t)((border[-y-1] + 3*dcVal + 2) >> 2);
95	0	}
96	0	}
97
98
99		void intra_pred_planar_8_sse4(uint8_t* dst, ptrdiff_t stride, int nT, int cIdx, const uint8_t* border)
100	0	{
101	0	const int shift = shift_for(nT);
102	0	const int TR = border[ 1+nT]; // top-right corner sample
103	0	const int BL = border[-1-nT]; // bottom-left corner sample
104
105	0	const __m128i base = _mm_setr_epi16(0,1,2,3,4,5,6,7);
106	0	const __m128i vTR = _mm_set1_epi16((short)TR);
107	0	const __m128i vNTm1 = _mm_set1_epi16((short)(nT-1));
108	0	const __m128i one = _mm_set1_epi16(1);
109
110	0	for (int y=0;y<nT;y++) {
111	0	const int left_y = border[-1-y];
112	0	const int Cy = (y+1)*BL + nT; // constant term for this row
113	0	const __m128i vL = _mm_set1_epi16((short)left_y);
114	0	const __m128i vNT1Y = _mm_set1_epi16((short)(nT-1-y));
115	0	const __m128i vC = _mm_set1_epi16((short)Cy);
116
117	0	for (int x=0;x<nT;x+=8) {
118	0	const __m128i xidx = _mm_add_epi16(_mm_set1_epi16((short)x), base);
119	0	const __m128i vA = _mm_sub_epi16(vNTm1, xidx); // (nT-1-x)
120	0	const __m128i vB = _mm_add_epi16(xidx, one); // (x+1)
121	0	const __m128i top = (nT==4) ? load4_epi16(border+1+x) : load8_epi16(border+1+x);
122
123	0	__m128i acc = _mm_mullo_epi16(vA, vL); // (nT-1-x)*border[-1-y]
124	0	acc = _mm_add_epi16(acc, _mm_mullo_epi16(vB, vTR)); // (x+1)*border[1+nT]
125	0	acc = _mm_add_epi16(acc, _mm_mullo_epi16(top, vNT1Y));// (nT-1-y)*border[1+x]
126	0	acc = _mm_add_epi16(acc, vC); // (y+1)*border[-1-nT] + nT
127	0	acc = _mm_srli_epi16(acc, shift);
128
129	0	const __m128i p = _mm_packus_epi16(acc, acc);
130	0	store_row(dst + y*stride + x, p, (nT<8)?nT:8);
131	0	}
132	0	}
133	0	}
134
135
136		void intra_pred_angular_8_sse4(uint8_t* dst, ptrdiff_t stride, int bit_depth, int disableBoundaryFilter,
137		int xB0, int yB0, int mode, int nT, int cIdx, const uint8_t* border)
138	0	{
139	0	const int intraPredAngle = intraPredAngle_table[mode];
140
141	0	uint8_t ref_mem[4*kMaxBlk+1];
142	0	uint8_t* ref = &ref_mem[2*kMaxBlk];
143
144	0	if (mode >= 18) {
145	0	for (int x=0;x<=nT;x++) ref[x] = border[x];
146
147	0	if (intraPredAngle<0) {
148	0	const int invAngle = invAngle_table[mode-11];
149	0	if (((nT*intraPredAngle)>>5) < -1) {
150	0	for (int x=(nT*intraPredAngle)>>5; x<=-1; x++)
151	0	ref[x] = border[0-((x*invAngle+128)>>8)];
152	0	}
153	0	} else {
154	0	for (int x=nT+1; x<=2*nT; x++) ref[x] = border[x];
155	0	}
156
157	0	for (int y=0;y<nT;y++) {
158	0	const int iIdx = ((y+1)*intraPredAngle)>>5;
159	0	const int iFact = ((y+1)*intraPredAngle)&31;
160	0	const uint8_t* src = ref + iIdx + 1;
161	0	uint8_t* d = dst + y*stride;
162
163	0	if (iFact==0) {
164	0	copy_row(d, src, nT); // dst[x] = ref[x+iIdx+1]
165	0	} else {
166	0	const __m128i w0 = _mm_set1_epi16((short)(32-iFact));
167	0	const __m128i w1 = _mm_set1_epi16((short)iFact);
168	0	const __m128i r16 = _mm_set1_epi16(16);
169	0	if (nT==4) {
170	0	__m128i a = load4_epi16(src), b = load4_epi16(src+1);
171	0	__m128i acc = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(a,w0), _mm_mullo_epi16(b,w1)), r16);
172	0	acc = _mm_srli_epi16(acc, 5);
173	0	store_row(d, _mm_packus_epi16(acc,acc), 4);
174	0	} else {
175	0	for (int x=0;x<nT;x+=8) {
176	0	__m128i a = load8_epi16(src+x), b = load8_epi16(src+x+1);
177	0	__m128i acc = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(a,w0), _mm_mullo_epi16(b,w1)), r16);
178	0	acc = _mm_srli_epi16(acc, 5);
179	0	store_row(d+x, _mm_packus_epi16(acc,acc), 8);
180	0	}
181	0	}
182	0	}
183	0	}
184
185	0	if (mode==26 && cIdx==0 && nT<32 && !disableBoundaryFilter) {
186	0	for (int y=0;y<nT;y++)
187	0	dst[y*stride] = (uint8_t)Clip_BitDepth(border[1] + ((border[-1-y] - border[0])>>1), bit_depth);
188	0	}
189	0	}
190	0	else {
191		// Modes 2..17: the reference projection is transposed (per-column iIdx/iFact and
192		// a row-indexed reference fetch), which does not map onto contiguous SIMD loads or
193		// stores. Use the scalar reference path here -- kept bit-identical to
194		// intra_prediction_angular() in intrapred.h.
195	0	for (int x=0;x<=nT;x++) ref[x] = border[-x];
196
197	0	if (intraPredAngle<0) {
198	0	const int invAngle = invAngle_table[mode-11];
199	0	if (((nT*intraPredAngle)>>5) < -1) {
200	0	for (int x=(nT*intraPredAngle)>>5; x<=-1; x++)
201	0	ref[x] = border[(x*invAngle+128)>>8];
202	0	}
203	0	} else {
204	0	for (int x=nT+1; x<=2*nT; x++) ref[x] = border[-x];
205	0	}
206
207	0	for (int y=0;y<nT;y++)
208	0	for (int x=0;x<nT;x++) {
209	0	const int iIdx = ((x+1)*intraPredAngle)>>5;
210	0	const int iFact = ((x+1)*intraPredAngle)&31;
211	0	if (iFact != 0)
212	0	dst[x+y*stride] = (uint8_t)(((32-iFact)*ref[y+iIdx+1] + iFact*ref[y+iIdx+2] + 16)>>5);
213	0	else
214	0	dst[x+y*stride] = ref[y+iIdx+1];
215	0	}
216
217	0	if (mode==10 && cIdx==0 && nT<32 && !disableBoundaryFilter) {
218	0	for (int x=0;x<nT;x++)
219	0	dst[x] = (uint8_t)Clip_BitDepth(border[-1] + ((border[1+x] - border[0])>>1), bit_depth);
220	0	}
221	0	}
222	0	}
223
224		#endif // HAVE_SSE4_1