Coverage Report

Created: 2024-09-06 07:53

/src/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h
Line
Count
Source
1
/*
2
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3
 *
4
 *  Use of this source code is governed by a BSD-style license
5
 *  that can be found in the LICENSE file in the root of the source
6
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS.  All contributing project authors may
8
 *  be found in the AUTHORS file in the root of the source tree.
9
 */
10
11
#ifndef VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_
12
#define VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_
13
14
#include <tmmintrin.h>
15
16
#include "./vpx_dsp_rtcd.h"
17
#include "vpx_dsp/x86/inv_txfm_sse2.h"
18
#include "vpx_dsp/x86/transpose_sse2.h"
19
#include "vpx_dsp/x86/txfm_common_sse2.h"
20
21
695k
// In-place inverse 8x8 DCT kernel over the eight vectors io[0..7], written as
// two passes of four stages each.
// Pass 1 consumes only io[0] and io[1] as produced by transpose_16bit_4x4()
// and leaves its result in tmp[0..3]; pass 2 consumes io[0..3] as produced by
// idct8x8_12_transpose_16bit_4x8() and overwrites all of io[0..7].
// NOTE(review): the "12" in the name presumably means the caller guarantees
// at most 12 non-zero coefficients, all inside the top-left 4x4 - confirm
// against the call sites.
static INLINE void idct8x8_12_add_kernel_ssse3(__m128i *const io /* io[8] */) {
22
695k
  // The constants built from 2 * cospi_* feed _mm_mulhrs_epi16(), which
  // computes (a * b + 0x4000) >> 15 per 16-bit lane; doubling the cosine turns
  // that into a rounded (a * cospi) >> 14.
  // NOTE(review): dual_set_epi16(a, b) is defined elsewhere; the "4&7"-style
  // comments below suggest it puts a in the low four lanes and b in the high
  // four - confirm.
  const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64);
23
695k
  const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64);
24
695k
  const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64);
25
695k
  // Undoubled constants: these two are only passed to
  // idct_calc_wraplow_sse2(), not to _mm_mulhrs_epi16().
  const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64);
26
695k
  const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
27
695k
  // Doubled constants broadcast to every lane; cospi_16_64d is used in both
  // passes, the rest only in pass 2.
  const __m128i cospi_16_64d = _mm_set1_epi16((int16_t)(2 * cospi_16_64));
28
695k
  const __m128i cospi_28_64d = _mm_set1_epi16((int16_t)(2 * cospi_28_64));
29
695k
  const __m128i cospi_4_64d = _mm_set1_epi16((int16_t)(2 * cospi_4_64));
30
695k
  const __m128i cospi_n20_64d = _mm_set1_epi16((int16_t)(-2 * cospi_20_64));
31
695k
  const __m128i cospi_12_64d = _mm_set1_epi16((int16_t)(2 * cospi_12_64));
32
695k
  const __m128i cospi_24_64d = _mm_set1_epi16((int16_t)(2 * cospi_24_64));
33
695k
  const __m128i cospi_8_64d = _mm_set1_epi16((int16_t)(2 * cospi_8_64));
34
695k
  __m128i step1[8], step2[8], tmp[4];
35
36
  // pass 1
  // Each vector in this pass carries two logical values, one per 64-bit half;
  // the "a&b" trailing comments name the pair held in each result.
37
38
695k
  transpose_16bit_4x4(io, io);
39
  // io[0]: 00 10 20 30  01 11 21 31
40
  // io[1]: 02 12 22 32  03 13 23 33
41
42
  // stage 1
43
695k
  // Copy each 64-bit half of io[0] / io[1] into both halves of its own vector,
  // so a single multiply can apply a different constant to each half.
  tmp[0] = _mm_unpacklo_epi64(io[0], io[0]);
44
695k
  tmp[1] = _mm_unpackhi_epi64(io[0], io[0]);
45
695k
  tmp[2] = _mm_unpacklo_epi64(io[1], io[1]);
46
695k
  tmp[3] = _mm_unpackhi_epi64(io[1], io[1]);
47
695k
  step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d);    // step1 4&7
48
695k
  step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d);  // step1 5&6
49
50
  // stage 2
51
695k
  step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d);  // step2 0&1
52
695k
  step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d);     // step2 3&2
53
695k
  step2[4] = _mm_add_epi16(step1[4], step1[5]);       // step2 4&7
54
695k
  step2[5] = _mm_sub_epi16(step1[4], step1[5]);       // step2 5&6
55
695k
  // Duplicate the high half of step2[5] so it can be interleaved with the low
  // half in stage 3.
  step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]);  // step2 6
56
57
  // stage 3
58
695k
  // Interleave the low four lanes of step2[6] and step2[5] into 16-bit pairs
  // for idct_calc_wraplow_sse2() (defined elsewhere).
  tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]);
59
695k
  step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]);  // step1 5&6
60
695k
  tmp[0] = _mm_add_epi16(step2[0], step2[2]);                      // step1 0&1
61
695k
  tmp[1] = _mm_sub_epi16(step2[0], step2[2]);                      // step1 3&2
62
695k
  // Regroup the halves of the sum and difference so they line up with
  // step1[5] and step2[4] in stage 4.
  step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]);                   // step1 2&1
63
695k
  step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]);                   // step1 3&0
64
65
  // stage 4
66
695k
  tmp[0] = _mm_add_epi16(step1[3], step2[4]);  // output 3&0
67
695k
  tmp[1] = _mm_add_epi16(step1[2], step1[5]);  // output 2&1
68
695k
  tmp[2] = _mm_sub_epi16(step1[3], step2[4]);  // output 4&7
69
695k
  tmp[3] = _mm_sub_epi16(step1[2], step1[5]);  // output 5&6
70
71
  // pass 2
  // From here on each step1[] / step2[] entry is a separate full vector.
72
73
695k
  // Moves the packed pass-1 result in tmp[0..3] back into io; only io[0..3]
  // are read below.
  idct8x8_12_transpose_16bit_4x8(tmp, io);
74
75
  // stage 1
76
695k
  step1[4] = _mm_mulhrs_epi16(io[1], cospi_28_64d);
77
695k
  step1[7] = _mm_mulhrs_epi16(io[1], cospi_4_64d);
78
695k
  step1[5] = _mm_mulhrs_epi16(io[3], cospi_n20_64d);
79
695k
  step1[6] = _mm_mulhrs_epi16(io[3], cospi_12_64d);
80
81
  // stage 2
82
695k
  // step2[1] is never stored: it would equal step2[0], which stage 3 reuses.
  step2[0] = _mm_mulhrs_epi16(io[0], cospi_16_64d);  // step2[1] = step2[0]
83
695k
  step2[2] = _mm_mulhrs_epi16(io[2], cospi_24_64d);
84
695k
  step2[3] = _mm_mulhrs_epi16(io[2], cospi_8_64d);
85
695k
  step2[4] = _mm_add_epi16(step1[4], step1[5]);
86
695k
  step2[5] = _mm_sub_epi16(step1[4], step1[5]);
87
695k
  step2[6] = _mm_sub_epi16(step1[7], step1[6]);
88
695k
  step2[7] = _mm_add_epi16(step1[7], step1[6]);
89
90
  // stage 3
91
695k
  step1[0] = _mm_add_epi16(step2[0], step2[3]);
92
695k
  step1[1] = _mm_add_epi16(step2[0], step2[2]);
93
695k
  step1[2] = _mm_sub_epi16(step2[0], step2[2]);
94
695k
  step1[3] = _mm_sub_epi16(step2[0], step2[3]);
95
695k
  // butterfly() is defined elsewhere; it writes step1[5] and step1[6] from
  // step2[6] and step2[5] using cospi_16_64 for both constants.
  butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
96
97
  // stage 4
98
695k
  // Mirrored outputs: io[k] is the sum and io[7 - k] the difference of the
  // same pair of terms.
  io[0] = _mm_add_epi16(step1[0], step2[7]);
99
695k
  io[1] = _mm_add_epi16(step1[1], step1[6]);
100
695k
  io[2] = _mm_add_epi16(step1[2], step1[5]);
101
695k
  io[3] = _mm_add_epi16(step1[3], step2[4]);
102
695k
  io[4] = _mm_sub_epi16(step1[3], step2[4]);
103
695k
  io[5] = _mm_sub_epi16(step1[2], step1[5]);
104
695k
  io[6] = _mm_sub_epi16(step1[1], step1[6]);
105
695k
  io[7] = _mm_sub_epi16(step1[0], step2[7]);
106
695k
}
inv_txfm_ssse3.c:idct8x8_12_add_kernel_ssse3
Line
Count
Source
21
695k
// In-place inverse 8x8 DCT kernel over the eight vectors io[0..7], written as
// two passes of four stages each.
// Pass 1 consumes only io[0] and io[1] as produced by transpose_16bit_4x4()
// and leaves its result in tmp[0..3]; pass 2 consumes io[0..3] as produced by
// idct8x8_12_transpose_16bit_4x8() and overwrites all of io[0..7].
// NOTE(review): the "12" in the name presumably means the caller guarantees
// at most 12 non-zero coefficients, all inside the top-left 4x4 - confirm
// against the call sites.
static INLINE void idct8x8_12_add_kernel_ssse3(__m128i *const io /* io[8] */) {
22
695k
  // The constants built from 2 * cospi_* feed _mm_mulhrs_epi16(), which
  // computes (a * b + 0x4000) >> 15 per 16-bit lane; doubling the cosine turns
  // that into a rounded (a * cospi) >> 14.
  // NOTE(review): dual_set_epi16(a, b) is defined elsewhere; the "4&7"-style
  // comments below suggest it puts a in the low four lanes and b in the high
  // four - confirm.
  const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64);
23
695k
  const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64);
24
695k
  const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64);
25
695k
  // Undoubled constants: these two are only passed to
  // idct_calc_wraplow_sse2(), not to _mm_mulhrs_epi16().
  const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64);
26
695k
  const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
27
695k
  // Doubled constants broadcast to every lane; cospi_16_64d is used in both
  // passes, the rest only in pass 2.
  const __m128i cospi_16_64d = _mm_set1_epi16((int16_t)(2 * cospi_16_64));
28
695k
  const __m128i cospi_28_64d = _mm_set1_epi16((int16_t)(2 * cospi_28_64));
29
695k
  const __m128i cospi_4_64d = _mm_set1_epi16((int16_t)(2 * cospi_4_64));
30
695k
  const __m128i cospi_n20_64d = _mm_set1_epi16((int16_t)(-2 * cospi_20_64));
31
695k
  const __m128i cospi_12_64d = _mm_set1_epi16((int16_t)(2 * cospi_12_64));
32
695k
  const __m128i cospi_24_64d = _mm_set1_epi16((int16_t)(2 * cospi_24_64));
33
695k
  const __m128i cospi_8_64d = _mm_set1_epi16((int16_t)(2 * cospi_8_64));
34
695k
  __m128i step1[8], step2[8], tmp[4];
35
36
  // pass 1
  // Each vector in this pass carries two logical values, one per 64-bit half;
  // the "a&b" trailing comments name the pair held in each result.
37
38
695k
  transpose_16bit_4x4(io, io);
39
  // io[0]: 00 10 20 30  01 11 21 31
40
  // io[1]: 02 12 22 32  03 13 23 33
41
42
  // stage 1
43
695k
  // Copy each 64-bit half of io[0] / io[1] into both halves of its own vector,
  // so a single multiply can apply a different constant to each half.
  tmp[0] = _mm_unpacklo_epi64(io[0], io[0]);
44
695k
  tmp[1] = _mm_unpackhi_epi64(io[0], io[0]);
45
695k
  tmp[2] = _mm_unpacklo_epi64(io[1], io[1]);
46
695k
  tmp[3] = _mm_unpackhi_epi64(io[1], io[1]);
47
695k
  step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d);    // step1 4&7
48
695k
  step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d);  // step1 5&6
49
50
  // stage 2
51
695k
  step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d);  // step2 0&1
52
695k
  step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d);     // step2 3&2
53
695k
  step2[4] = _mm_add_epi16(step1[4], step1[5]);       // step2 4&7
54
695k
  step2[5] = _mm_sub_epi16(step1[4], step1[5]);       // step2 5&6
55
695k
  // Duplicate the high half of step2[5] so it can be interleaved with the low
  // half in stage 3.
  step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]);  // step2 6
56
57
  // stage 3
58
695k
  // Interleave the low four lanes of step2[6] and step2[5] into 16-bit pairs
  // for idct_calc_wraplow_sse2() (defined elsewhere).
  tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]);
59
695k
  step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]);  // step1 5&6
60
695k
  tmp[0] = _mm_add_epi16(step2[0], step2[2]);                      // step1 0&1
61
695k
  tmp[1] = _mm_sub_epi16(step2[0], step2[2]);                      // step1 3&2
62
695k
  // Regroup the halves of the sum and difference so they line up with
  // step1[5] and step2[4] in stage 4.
  step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]);                   // step1 2&1
63
695k
  step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]);                   // step1 3&0
64
65
  // stage 4
66
695k
  tmp[0] = _mm_add_epi16(step1[3], step2[4]);  // output 3&0
67
695k
  tmp[1] = _mm_add_epi16(step1[2], step1[5]);  // output 2&1
68
695k
  tmp[2] = _mm_sub_epi16(step1[3], step2[4]);  // output 4&7
69
695k
  tmp[3] = _mm_sub_epi16(step1[2], step1[5]);  // output 5&6
70
71
  // pass 2
  // From here on each step1[] / step2[] entry is a separate full vector.
72
73
695k
  // Moves the packed pass-1 result in tmp[0..3] back into io; only io[0..3]
  // are read below.
  idct8x8_12_transpose_16bit_4x8(tmp, io);
74
75
  // stage 1
76
695k
  step1[4] = _mm_mulhrs_epi16(io[1], cospi_28_64d);
77
695k
  step1[7] = _mm_mulhrs_epi16(io[1], cospi_4_64d);
78
695k
  step1[5] = _mm_mulhrs_epi16(io[3], cospi_n20_64d);
79
695k
  step1[6] = _mm_mulhrs_epi16(io[3], cospi_12_64d);
80
81
  // stage 2
82
695k
  // step2[1] is never stored: it would equal step2[0], which stage 3 reuses.
  step2[0] = _mm_mulhrs_epi16(io[0], cospi_16_64d);  // step2[1] = step2[0]
83
695k
  step2[2] = _mm_mulhrs_epi16(io[2], cospi_24_64d);
84
695k
  step2[3] = _mm_mulhrs_epi16(io[2], cospi_8_64d);
85
695k
  step2[4] = _mm_add_epi16(step1[4], step1[5]);
86
695k
  step2[5] = _mm_sub_epi16(step1[4], step1[5]);
87
695k
  step2[6] = _mm_sub_epi16(step1[7], step1[6]);
88
695k
  step2[7] = _mm_add_epi16(step1[7], step1[6]);
89
90
  // stage 3
91
695k
  step1[0] = _mm_add_epi16(step2[0], step2[3]);
92
695k
  step1[1] = _mm_add_epi16(step2[0], step2[2]);
93
695k
  step1[2] = _mm_sub_epi16(step2[0], step2[2]);
94
695k
  step1[3] = _mm_sub_epi16(step2[0], step2[3]);
95
695k
  // butterfly() is defined elsewhere; it writes step1[5] and step1[6] from
  // step2[6] and step2[5] using cospi_16_64 for both constants.
  butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
96
97
  // stage 4
98
695k
  // Mirrored outputs: io[k] is the sum and io[7 - k] the difference of the
  // same pair of terms.
  io[0] = _mm_add_epi16(step1[0], step2[7]);
99
695k
  io[1] = _mm_add_epi16(step1[1], step1[6]);
100
695k
  io[2] = _mm_add_epi16(step1[2], step1[5]);
101
695k
  io[3] = _mm_add_epi16(step1[3], step2[4]);
102
695k
  io[4] = _mm_sub_epi16(step1[3], step2[4]);
103
695k
  io[5] = _mm_sub_epi16(step1[2], step1[5]);
104
695k
  io[6] = _mm_sub_epi16(step1[1], step1[6]);
105
695k
  io[7] = _mm_sub_epi16(step1[0], step2[7]);
106
695k
}
Unexecuted instantiation: highbd_idct8x8_add_sse4.c:idct8x8_12_add_kernel_ssse3
Unexecuted instantiation: highbd_idct32x32_add_sse4.c:idct8x8_12_add_kernel_ssse3
107
108
// Declaration only; the definition is not part of this header.
// `in` is a read-only array of vectors and `out` is the array written to.
// NOTE(review): the name suggests a 32-point inverse DCT for inputs with at
// most 135 non-zero coefficients, applied to an 8x32 slice - confirm the
// meaning and the required array lengths at the definition.
void idct32_135_8x32_ssse3(const __m128i *const in, __m128i *const out);
109
110
#endif  // VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_