/src/aom/av1/common/x86/reconinter_ssse3.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2018, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include "config/av1_rtcd.h" |
13 | | |
14 | | #if CONFIG_AV1_HIGHBITDEPTH |
15 | | |
#include <assert.h>
#include <tmmintrin.h>

#include "aom/aom_integer.h"
#include "aom_dsp/blend.h"
#include "aom_dsp/x86/synonyms.h"
#include "av1/common/blockd.h"
22 | | |
23 | | void av1_build_compound_diffwtd_mask_highbd_ssse3( |
24 | | uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, |
25 | | int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, |
26 | 0 | int bd) { |
27 | 0 | if (w < 8) { |
28 | 0 | av1_build_compound_diffwtd_mask_highbd_c(mask, mask_type, src0, src0_stride, |
29 | 0 | src1, src1_stride, h, w, bd); |
30 | 0 | } else { |
31 | 0 | assert(bd >= 8); |
32 | 0 | assert((w % 8) == 0); |
33 | 0 | assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV); |
34 | 0 | const __m128i x0 = _mm_setzero_si128(); |
35 | 0 | const __m128i xAOM_BLEND_A64_MAX_ALPHA = |
36 | 0 | _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); |
37 | 0 | const int mask_base = 38; |
38 | 0 | const __m128i xmask_base = _mm_set1_epi16(mask_base); |
39 | 0 | const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0); |
40 | 0 | const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1); |
41 | 0 | if (bd == 8) { |
42 | 0 | if (mask_type == DIFFWTD_38_INV) { |
43 | 0 | for (int i = 0; i < h; ++i) { |
44 | 0 | for (int j = 0; j < w; j += 8) { |
45 | 0 | __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); |
46 | 0 | __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); |
47 | 0 | __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), |
48 | 0 | DIFF_FACTOR_LOG2); |
49 | 0 | __m128i m = _mm_min_epi16( |
50 | 0 | _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), |
51 | 0 | xAOM_BLEND_A64_MAX_ALPHA); |
52 | 0 | m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m); |
53 | 0 | m = _mm_packus_epi16(m, m); |
54 | 0 | _mm_storel_epi64((__m128i *)&mask[j], m); |
55 | 0 | } |
56 | 0 | ssrc0 += src0_stride; |
57 | 0 | ssrc1 += src1_stride; |
58 | 0 | mask += w; |
59 | 0 | } |
60 | 0 | } else { |
61 | 0 | for (int i = 0; i < h; ++i) { |
62 | 0 | for (int j = 0; j < w; j += 8) { |
63 | 0 | __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); |
64 | 0 | __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); |
65 | 0 | __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), |
66 | 0 | DIFF_FACTOR_LOG2); |
67 | 0 | __m128i m = _mm_min_epi16( |
68 | 0 | _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), |
69 | 0 | xAOM_BLEND_A64_MAX_ALPHA); |
70 | 0 | m = _mm_packus_epi16(m, m); |
71 | 0 | _mm_storel_epi64((__m128i *)&mask[j], m); |
72 | 0 | } |
73 | 0 | ssrc0 += src0_stride; |
74 | 0 | ssrc1 += src1_stride; |
75 | 0 | mask += w; |
76 | 0 | } |
77 | 0 | } |
78 | 0 | } else { |
79 | 0 | const __m128i xshift = _mm_set1_epi64x(bd - 8 + DIFF_FACTOR_LOG2); |
80 | 0 | if (mask_type == DIFFWTD_38_INV) { |
81 | 0 | for (int i = 0; i < h; ++i) { |
82 | 0 | for (int j = 0; j < w; j += 8) { |
83 | 0 | __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); |
84 | 0 | __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); |
85 | 0 | __m128i diff = |
86 | 0 | _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift); |
87 | 0 | __m128i m = _mm_min_epi16( |
88 | 0 | _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), |
89 | 0 | xAOM_BLEND_A64_MAX_ALPHA); |
90 | 0 | m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m); |
91 | 0 | m = _mm_packus_epi16(m, m); |
92 | 0 | _mm_storel_epi64((__m128i *)&mask[j], m); |
93 | 0 | } |
94 | 0 | ssrc0 += src0_stride; |
95 | 0 | ssrc1 += src1_stride; |
96 | 0 | mask += w; |
97 | 0 | } |
98 | 0 | } else { |
99 | 0 | for (int i = 0; i < h; ++i) { |
100 | 0 | for (int j = 0; j < w; j += 8) { |
101 | 0 | __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); |
102 | 0 | __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); |
103 | 0 | __m128i diff = |
104 | 0 | _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift); |
105 | 0 | __m128i m = _mm_min_epi16( |
106 | 0 | _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), |
107 | 0 | xAOM_BLEND_A64_MAX_ALPHA); |
108 | 0 | m = _mm_packus_epi16(m, m); |
109 | 0 | _mm_storel_epi64((__m128i *)&mask[j], m); |
110 | 0 | } |
111 | 0 | ssrc0 += src0_stride; |
112 | 0 | ssrc1 += src1_stride; |
113 | 0 | mask += w; |
114 | 0 | } |
115 | 0 | } |
116 | 0 | } |
117 | 0 | } |
118 | 0 | } |
119 | | |
120 | | #endif // CONFIG_AV1_HIGHBITDEPTH |