/src/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #ifndef VPX_VPX_DSP_X86_INV_TXFM_SSE2_H_ |
12 | | #define VPX_VPX_DSP_X86_INV_TXFM_SSE2_H_ |
13 | | |
#include <emmintrin.h>  // SSE2
#include <string.h>

#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/inv_txfm.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
21 | | |
// Transposes a 4-column by 8-row block of 16-bit values. The eight input rows
// arrive two per register in the permuted order shown below; each of the four
// output registers receives one column (8 values) of the block.
static INLINE void idct8x8_12_transpose_16bit_4x8(const __m128i *const in,
                                                  __m128i *const out) {
  // Step 1: interleave 16-bit lanes so vertically adjacent rows are paired.
  //   in[0]: 30 31 32 33  00 01 02 03
  //   in[1]: 20 21 22 23  10 11 12 13
  //   in[2]: 40 41 42 43  70 71 72 73
  //   in[3]: 50 51 52 53  60 61 62 63
  // becomes
  //   rows01: 00 10 01 11  02 12 03 13
  //   rows23: 20 30 21 31  22 32 23 33
  //   rows45: 40 50 41 51  42 52 43 53
  //   rows67: 60 70 61 71  62 72 63 73
  const __m128i rows01 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i rows23 = _mm_unpacklo_epi16(in[1], in[0]);
  const __m128i rows45 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i rows67 = _mm_unpackhi_epi16(in[3], in[2]);

  // Step 2: interleave 32-bit lanes to gather four rows per column.
  //   cols01_top: 00 10 20 30  01 11 21 31
  //   cols23_top: 02 12 22 32  03 13 23 33
  //   cols01_bot: 40 50 60 70  41 51 61 71
  //   cols23_bot: 42 52 62 72  43 53 63 73
  const __m128i cols01_top = _mm_unpacklo_epi32(rows01, rows23);
  const __m128i cols23_top = _mm_unpackhi_epi32(rows01, rows23);
  const __m128i cols01_bot = _mm_unpacklo_epi32(rows45, rows67);
  const __m128i cols23_bot = _mm_unpackhi_epi32(rows45, rows67);

  // Step 3: join the top and bottom halves of each column.
  //   out[0]: 00 10 20 30  40 50 60 70
  //   out[1]: 01 11 21 31  41 51 61 71
  //   out[2]: 02 12 22 32  42 52 62 72
  //   out[3]: 03 13 23 33  43 53 63 73
  out[0] = _mm_unpacklo_epi64(cols01_top, cols01_bot);
  out[1] = _mm_unpackhi_epi64(cols01_top, cols01_bot);
  out[2] = _mm_unpacklo_epi64(cols23_top, cols23_bot);
  out[3] = _mm_unpackhi_epi64(cols23_top, cols23_bot);
}
59 | | |
60 | 5.78G | static INLINE __m128i dct_const_round_shift_sse2(const __m128i in) { |
61 | 5.78G | const __m128i t = _mm_add_epi32(in, _mm_set1_epi32(DCT_CONST_ROUNDING)); |
62 | 5.78G | return _mm_srai_epi32(t, DCT_CONST_BITS); |
63 | 5.78G | } Unexecuted instantiation: vp9_idct_intrin_sse2.c:dct_const_round_shift_sse2 Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:dct_const_round_shift_sse2 Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:dct_const_round_shift_sse2 Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:dct_const_round_shift_sse2 inv_txfm_sse2.c:dct_const_round_shift_sse2 Line | Count | Source | 60 | 5.77G | static INLINE __m128i dct_const_round_shift_sse2(const __m128i in) { | 61 | 5.77G | const __m128i t = _mm_add_epi32(in, _mm_set1_epi32(DCT_CONST_ROUNDING)); | 62 | 5.77G | return _mm_srai_epi32(t, DCT_CONST_BITS); | 63 | 5.77G | } |
inv_txfm_ssse3.c:dct_const_round_shift_sse2 Line | Count | Source | 60 | 11.1M | static INLINE __m128i dct_const_round_shift_sse2(const __m128i in) { | 61 | 11.1M | const __m128i t = _mm_add_epi32(in, _mm_set1_epi32(DCT_CONST_ROUNDING)); | 62 | 11.1M | return _mm_srai_epi32(t, DCT_CONST_BITS); | 63 | 11.1M | } |
Unexecuted instantiation: highbd_idct4x4_add_sse2.c:dct_const_round_shift_sse2 Unexecuted instantiation: highbd_idct8x8_add_sse2.c:dct_const_round_shift_sse2 Unexecuted instantiation: highbd_idct16x16_add_sse2.c:dct_const_round_shift_sse2 Unexecuted instantiation: highbd_idct32x32_add_sse2.c:dct_const_round_shift_sse2 Unexecuted instantiation: highbd_idct4x4_add_sse4.c:dct_const_round_shift_sse2 Unexecuted instantiation: highbd_idct8x8_add_sse4.c:dct_const_round_shift_sse2 Unexecuted instantiation: highbd_idct16x16_add_sse4.c:dct_const_round_shift_sse2 Unexecuted instantiation: highbd_idct32x32_add_sse4.c:dct_const_round_shift_sse2 |
64 | | |
65 | | static INLINE __m128i idct_madd_round_shift_sse2(const __m128i in, |
66 | 3.66G | const __m128i cospi) { |
67 | 3.66G | const __m128i t = _mm_madd_epi16(in, cospi); |
68 | 3.66G | return dct_const_round_shift_sse2(t); |
69 | 3.66G | } Unexecuted instantiation: vp9_idct_intrin_sse2.c:idct_madd_round_shift_sse2 Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:idct_madd_round_shift_sse2 Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:idct_madd_round_shift_sse2 Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:idct_madd_round_shift_sse2 inv_txfm_sse2.c:idct_madd_round_shift_sse2 Line | Count | Source | 66 | 3.65G | const __m128i cospi) { | 67 | 3.65G | const __m128i t = _mm_madd_epi16(in, cospi); | 68 | 3.65G | return dct_const_round_shift_sse2(t); | 69 | 3.65G | } |
inv_txfm_ssse3.c:idct_madd_round_shift_sse2 Line | Count | Source | 66 | 11.1M | const __m128i cospi) { | 67 | 11.1M | const __m128i t = _mm_madd_epi16(in, cospi); | 68 | 11.1M | return dct_const_round_shift_sse2(t); | 69 | 11.1M | } |
Unexecuted instantiation: highbd_idct4x4_add_sse2.c:idct_madd_round_shift_sse2 Unexecuted instantiation: highbd_idct8x8_add_sse2.c:idct_madd_round_shift_sse2 Unexecuted instantiation: highbd_idct16x16_add_sse2.c:idct_madd_round_shift_sse2 Unexecuted instantiation: highbd_idct32x32_add_sse2.c:idct_madd_round_shift_sse2 Unexecuted instantiation: highbd_idct4x4_add_sse4.c:idct_madd_round_shift_sse2 Unexecuted instantiation: highbd_idct8x8_add_sse4.c:idct_madd_round_shift_sse2 Unexecuted instantiation: highbd_idct16x16_add_sse4.c:idct_madd_round_shift_sse2 Unexecuted instantiation: highbd_idct32x32_add_sse4.c:idct_madd_round_shift_sse2 |
70 | | |
71 | | // Calculate the dot product between in0/1 and x and wrap to short. |
72 | | static INLINE __m128i idct_calc_wraplow_sse2(const __m128i in0, |
73 | | const __m128i in1, |
74 | 1.83G | const __m128i x) { |
75 | 1.83G | const __m128i t0 = idct_madd_round_shift_sse2(in0, x); |
76 | 1.83G | const __m128i t1 = idct_madd_round_shift_sse2(in1, x); |
77 | 1.83G | return _mm_packs_epi32(t0, t1); |
78 | 1.83G | } Unexecuted instantiation: vp9_idct_intrin_sse2.c:idct_calc_wraplow_sse2 Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:idct_calc_wraplow_sse2 Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:idct_calc_wraplow_sse2 Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:idct_calc_wraplow_sse2 inv_txfm_sse2.c:idct_calc_wraplow_sse2 Line | Count | Source | 74 | 1.82G | const __m128i x) { | 75 | 1.82G | const __m128i t0 = idct_madd_round_shift_sse2(in0, x); | 76 | 1.82G | const __m128i t1 = idct_madd_round_shift_sse2(in1, x); | 77 | 1.82G | return _mm_packs_epi32(t0, t1); | 78 | 1.82G | } |
inv_txfm_ssse3.c:idct_calc_wraplow_sse2 Line | Count | Source | 74 | 5.58M | const __m128i x) { | 75 | 5.58M | const __m128i t0 = idct_madd_round_shift_sse2(in0, x); | 76 | 5.58M | const __m128i t1 = idct_madd_round_shift_sse2(in1, x); | 77 | 5.58M | return _mm_packs_epi32(t0, t1); | 78 | 5.58M | } |
Unexecuted instantiation: highbd_idct4x4_add_sse2.c:idct_calc_wraplow_sse2 Unexecuted instantiation: highbd_idct8x8_add_sse2.c:idct_calc_wraplow_sse2 Unexecuted instantiation: highbd_idct16x16_add_sse2.c:idct_calc_wraplow_sse2 Unexecuted instantiation: highbd_idct32x32_add_sse2.c:idct_calc_wraplow_sse2 Unexecuted instantiation: highbd_idct4x4_add_sse4.c:idct_calc_wraplow_sse2 Unexecuted instantiation: highbd_idct8x8_add_sse4.c:idct_calc_wraplow_sse2 Unexecuted instantiation: highbd_idct16x16_add_sse4.c:idct_calc_wraplow_sse2 Unexecuted instantiation: highbd_idct32x32_add_sse4.c:idct_calc_wraplow_sse2 |
79 | | |
80 | | // Multiply elements by constants and add them together. |
81 | | static INLINE void butterfly(const __m128i in0, const __m128i in1, const int c0, |
82 | | const int c1, __m128i *const out0, |
83 | 462M | __m128i *const out1) { |
84 | 462M | const __m128i cst0 = pair_set_epi16(c0, -c1); |
85 | 462M | const __m128i cst1 = pair_set_epi16(c1, c0); |
86 | 462M | const __m128i lo = _mm_unpacklo_epi16(in0, in1); |
87 | 462M | const __m128i hi = _mm_unpackhi_epi16(in0, in1); |
88 | 462M | *out0 = idct_calc_wraplow_sse2(lo, hi, cst0); |
89 | 462M | *out1 = idct_calc_wraplow_sse2(lo, hi, cst1); |
90 | 462M | } Unexecuted instantiation: vp9_idct_intrin_sse2.c:butterfly Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:butterfly Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:butterfly Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:butterfly inv_txfm_sse2.c:butterfly Line | Count | Source | 83 | 459M | __m128i *const out1) { | 84 | 459M | const __m128i cst0 = pair_set_epi16(c0, -c1); | 85 | 459M | const __m128i cst1 = pair_set_epi16(c1, c0); | 86 | 459M | const __m128i lo = _mm_unpacklo_epi16(in0, in1); | 87 | 459M | const __m128i hi = _mm_unpackhi_epi16(in0, in1); | 88 | 459M | *out0 = idct_calc_wraplow_sse2(lo, hi, cst0); | 89 | 459M | *out1 = idct_calc_wraplow_sse2(lo, hi, cst1); | 90 | 459M | } |
inv_txfm_ssse3.c:butterfly Line | Count | Source | 83 | 2.44M | __m128i *const out1) { | 84 | 2.44M | const __m128i cst0 = pair_set_epi16(c0, -c1); | 85 | 2.44M | const __m128i cst1 = pair_set_epi16(c1, c0); | 86 | 2.44M | const __m128i lo = _mm_unpacklo_epi16(in0, in1); | 87 | 2.44M | const __m128i hi = _mm_unpackhi_epi16(in0, in1); | 88 | 2.44M | *out0 = idct_calc_wraplow_sse2(lo, hi, cst0); | 89 | 2.44M | *out1 = idct_calc_wraplow_sse2(lo, hi, cst1); | 90 | 2.44M | } |
Unexecuted instantiation: highbd_idct4x4_add_sse2.c:butterfly Unexecuted instantiation: highbd_idct8x8_add_sse2.c:butterfly Unexecuted instantiation: highbd_idct16x16_add_sse2.c:butterfly Unexecuted instantiation: highbd_idct32x32_add_sse2.c:butterfly Unexecuted instantiation: highbd_idct4x4_add_sse4.c:butterfly Unexecuted instantiation: highbd_idct8x8_add_sse4.c:butterfly Unexecuted instantiation: highbd_idct16x16_add_sse4.c:butterfly Unexecuted instantiation: highbd_idct32x32_add_sse4.c:butterfly |
91 | | |
92 | 52.0k | static INLINE __m128i butterfly_cospi16(const __m128i in) { |
93 | 52.0k | const __m128i cst = pair_set_epi16(cospi_16_64, cospi_16_64); |
94 | 52.0k | const __m128i lo = _mm_unpacklo_epi16(in, _mm_setzero_si128()); |
95 | 52.0k | const __m128i hi = _mm_unpackhi_epi16(in, _mm_setzero_si128()); |
96 | 52.0k | return idct_calc_wraplow_sse2(lo, hi, cst); |
97 | 52.0k | } Unexecuted instantiation: vp9_idct_intrin_sse2.c:butterfly_cospi16 Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:butterfly_cospi16 Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:butterfly_cospi16 Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:butterfly_cospi16 inv_txfm_sse2.c:butterfly_cospi16 Line | Count | Source | 92 | 52.0k | static INLINE __m128i butterfly_cospi16(const __m128i in) { | 93 | 52.0k | const __m128i cst = pair_set_epi16(cospi_16_64, cospi_16_64); | 94 | 52.0k | const __m128i lo = _mm_unpacklo_epi16(in, _mm_setzero_si128()); | 95 | 52.0k | const __m128i hi = _mm_unpackhi_epi16(in, _mm_setzero_si128()); | 96 | 52.0k | return idct_calc_wraplow_sse2(lo, hi, cst); | 97 | 52.0k | } |
Unexecuted instantiation: inv_txfm_ssse3.c:butterfly_cospi16 Unexecuted instantiation: highbd_idct4x4_add_sse2.c:butterfly_cospi16 Unexecuted instantiation: highbd_idct8x8_add_sse2.c:butterfly_cospi16 Unexecuted instantiation: highbd_idct16x16_add_sse2.c:butterfly_cospi16 Unexecuted instantiation: highbd_idct32x32_add_sse2.c:butterfly_cospi16 Unexecuted instantiation: highbd_idct4x4_add_sse4.c:butterfly_cospi16 Unexecuted instantiation: highbd_idct8x8_add_sse4.c:butterfly_cospi16 Unexecuted instantiation: highbd_idct16x16_add_sse4.c:butterfly_cospi16 Unexecuted instantiation: highbd_idct32x32_add_sse4.c:butterfly_cospi16 |
98 | | |
99 | | // Functions to allow 8 bit optimisations to be used when profile 0 is used with |
100 | | // highbitdepth enabled |
101 | 2.88M | static INLINE __m128i load_input_data4(const tran_low_t *data) { |
102 | 2.88M | #if CONFIG_VP9_HIGHBITDEPTH |
103 | 2.88M | const __m128i zero = _mm_setzero_si128(); |
104 | 2.88M | const __m128i in = _mm_load_si128((const __m128i *)data); |
105 | 2.88M | return _mm_packs_epi32(in, zero); |
106 | | #else |
107 | | return _mm_loadl_epi64((const __m128i *)data); |
108 | | #endif |
109 | 2.88M | } Unexecuted instantiation: vp9_idct_intrin_sse2.c:load_input_data4 Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:load_input_data4 Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:load_input_data4 Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:load_input_data4 inv_txfm_sse2.c:load_input_data4 Line | Count | Source | 101 | 104k | static INLINE __m128i load_input_data4(const tran_low_t *data) { | 102 | 104k | #if CONFIG_VP9_HIGHBITDEPTH | 103 | 104k | const __m128i zero = _mm_setzero_si128(); | 104 | 104k | const __m128i in = _mm_load_si128((const __m128i *)data); | 105 | 104k | return _mm_packs_epi32(in, zero); | 106 | | #else | 107 | | return _mm_loadl_epi64((const __m128i *)data); | 108 | | #endif | 109 | 104k | } |
inv_txfm_ssse3.c:load_input_data4 Line | Count | Source | 101 | 2.78M | static INLINE __m128i load_input_data4(const tran_low_t *data) { | 102 | 2.78M | #if CONFIG_VP9_HIGHBITDEPTH | 103 | 2.78M | const __m128i zero = _mm_setzero_si128(); | 104 | 2.78M | const __m128i in = _mm_load_si128((const __m128i *)data); | 105 | 2.78M | return _mm_packs_epi32(in, zero); | 106 | | #else | 107 | | return _mm_loadl_epi64((const __m128i *)data); | 108 | | #endif | 109 | 2.78M | } |
Unexecuted instantiation: highbd_idct4x4_add_sse2.c:load_input_data4 Unexecuted instantiation: highbd_idct8x8_add_sse2.c:load_input_data4 Unexecuted instantiation: highbd_idct16x16_add_sse2.c:load_input_data4 Unexecuted instantiation: highbd_idct32x32_add_sse2.c:load_input_data4 Unexecuted instantiation: highbd_idct4x4_add_sse4.c:load_input_data4 Unexecuted instantiation: highbd_idct8x8_add_sse4.c:load_input_data4 Unexecuted instantiation: highbd_idct16x16_add_sse4.c:load_input_data4 Unexecuted instantiation: highbd_idct32x32_add_sse4.c:load_input_data4 |
110 | | |
111 | 1.00G | static INLINE __m128i load_input_data8(const tran_low_t *data) { |
112 | 1.00G | #if CONFIG_VP9_HIGHBITDEPTH |
113 | 1.00G | const __m128i in0 = _mm_load_si128((const __m128i *)data); |
114 | 1.00G | const __m128i in1 = _mm_load_si128((const __m128i *)(data + 4)); |
115 | 1.00G | return _mm_packs_epi32(in0, in1); |
116 | | #else |
117 | | return _mm_load_si128((const __m128i *)data); |
118 | | #endif |
119 | 1.00G | } vp9_idct_intrin_sse2.c:load_input_data8 Line | Count | Source | 111 | 548M | static INLINE __m128i load_input_data8(const tran_low_t *data) { | 112 | 548M | #if CONFIG_VP9_HIGHBITDEPTH | 113 | 548M | const __m128i in0 = _mm_load_si128((const __m128i *)data); | 114 | 548M | const __m128i in1 = _mm_load_si128((const __m128i *)(data + 4)); | 115 | 548M | return _mm_packs_epi32(in0, in1); | 116 | | #else | 117 | | return _mm_load_si128((const __m128i *)data); | 118 | | #endif | 119 | 548M | } |
Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:load_input_data8 Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:load_input_data8 Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:load_input_data8 inv_txfm_sse2.c:load_input_data8 Line | Count | Source | 111 | 460M | static INLINE __m128i load_input_data8(const tran_low_t *data) { | 112 | 460M | #if CONFIG_VP9_HIGHBITDEPTH | 113 | 460M | const __m128i in0 = _mm_load_si128((const __m128i *)data); | 114 | 460M | const __m128i in1 = _mm_load_si128((const __m128i *)(data + 4)); | 115 | 460M | return _mm_packs_epi32(in0, in1); | 116 | | #else | 117 | | return _mm_load_si128((const __m128i *)data); | 118 | | #endif | 119 | 460M | } |
inv_txfm_ssse3.c:load_input_data8 Line | Count | Source | 111 | 164k | static INLINE __m128i load_input_data8(const tran_low_t *data) { | 112 | 164k | #if CONFIG_VP9_HIGHBITDEPTH | 113 | 164k | const __m128i in0 = _mm_load_si128((const __m128i *)data); | 114 | 164k | const __m128i in1 = _mm_load_si128((const __m128i *)(data + 4)); | 115 | 164k | return _mm_packs_epi32(in0, in1); | 116 | | #else | 117 | | return _mm_load_si128((const __m128i *)data); | 118 | | #endif | 119 | 164k | } |
Unexecuted instantiation: highbd_idct4x4_add_sse2.c:load_input_data8 Unexecuted instantiation: highbd_idct8x8_add_sse2.c:load_input_data8 Unexecuted instantiation: highbd_idct16x16_add_sse2.c:load_input_data8 Unexecuted instantiation: highbd_idct32x32_add_sse2.c:load_input_data8 Unexecuted instantiation: highbd_idct4x4_add_sse4.c:load_input_data8 Unexecuted instantiation: highbd_idct8x8_add_sse4.c:load_input_data8 Unexecuted instantiation: highbd_idct16x16_add_sse4.c:load_input_data8 Unexecuted instantiation: highbd_idct32x32_add_sse4.c:load_input_data8 |
120 | | |
121 | | static INLINE void load_transpose_16bit_8x8(const tran_low_t *input, |
122 | | const int stride, |
123 | 20.5k | __m128i *const in) { |
124 | 20.5k | in[0] = load_input_data8(input + 0 * stride); |
125 | 20.5k | in[1] = load_input_data8(input + 1 * stride); |
126 | 20.5k | in[2] = load_input_data8(input + 2 * stride); |
127 | 20.5k | in[3] = load_input_data8(input + 3 * stride); |
128 | 20.5k | in[4] = load_input_data8(input + 4 * stride); |
129 | 20.5k | in[5] = load_input_data8(input + 5 * stride); |
130 | 20.5k | in[6] = load_input_data8(input + 6 * stride); |
131 | 20.5k | in[7] = load_input_data8(input + 7 * stride); |
132 | 20.5k | transpose_16bit_8x8(in, in); |
133 | 20.5k | } Unexecuted instantiation: vp9_idct_intrin_sse2.c:load_transpose_16bit_8x8 Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:load_transpose_16bit_8x8 Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:load_transpose_16bit_8x8 Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:load_transpose_16bit_8x8 Unexecuted instantiation: inv_txfm_sse2.c:load_transpose_16bit_8x8 inv_txfm_ssse3.c:load_transpose_16bit_8x8 Line | Count | Source | 123 | 20.5k | __m128i *const in) { | 124 | 20.5k | in[0] = load_input_data8(input + 0 * stride); | 125 | 20.5k | in[1] = load_input_data8(input + 1 * stride); | 126 | 20.5k | in[2] = load_input_data8(input + 2 * stride); | 127 | 20.5k | in[3] = load_input_data8(input + 3 * stride); | 128 | 20.5k | in[4] = load_input_data8(input + 4 * stride); | 129 | 20.5k | in[5] = load_input_data8(input + 5 * stride); | 130 | 20.5k | in[6] = load_input_data8(input + 6 * stride); | 131 | 20.5k | in[7] = load_input_data8(input + 7 * stride); | 132 | 20.5k | transpose_16bit_8x8(in, in); | 133 | 20.5k | } |
Unexecuted instantiation: highbd_idct4x4_add_sse2.c:load_transpose_16bit_8x8 Unexecuted instantiation: highbd_idct8x8_add_sse2.c:load_transpose_16bit_8x8 Unexecuted instantiation: highbd_idct16x16_add_sse2.c:load_transpose_16bit_8x8 Unexecuted instantiation: highbd_idct32x32_add_sse2.c:load_transpose_16bit_8x8 Unexecuted instantiation: highbd_idct4x4_add_sse4.c:load_transpose_16bit_8x8 Unexecuted instantiation: highbd_idct8x8_add_sse4.c:load_transpose_16bit_8x8 Unexecuted instantiation: highbd_idct16x16_add_sse4.c:load_transpose_16bit_8x8 Unexecuted instantiation: highbd_idct32x32_add_sse4.c:load_transpose_16bit_8x8 |
134 | | |
// Reconstructs one row of eight pixels: widens the 8 bytes at |dest| to
// 16 bits, adds the residual lanes of |in_x|, clamps each sum to [0, 255]
// (unsigned saturating pack) and writes the 8 bytes back to |dest|.
static INLINE void recon_and_store(uint8_t *const dest, const __m128i in_x) {
  const __m128i pixels8 = _mm_loadl_epi64((__m128i *)(dest));
  const __m128i pixels16 = _mm_unpacklo_epi8(pixels8, _mm_setzero_si128());
  const __m128i sum = _mm_add_epi16(in_x, pixels16);
  _mm_storel_epi64((__m128i *)(dest), _mm_packus_epi16(sum, sum));
}
143 | | |
// Final rounding for an 8x8 block: out[i] = (in[i] + 16) >> 5 on every 16-bit
// lane, using an arithmetic shift. The addition is a plain (wrapping) 16-bit
// add.
static INLINE void round_shift_8x8(const __m128i *const in,
                                   __m128i *const out) {
  const __m128i half = _mm_set1_epi16(1 << 4);
  int i;

  // All additions are done before any shift, as in the unrolled form.
  for (i = 0; i < 8; ++i) {
    out[i] = _mm_add_epi16(in[i], half);
  }
  for (i = 0; i < 8; ++i) {
    out[i] = _mm_srai_epi16(out[i], 5);
  }
}
166 | | |
167 | | static INLINE void write_buffer_8x8(const __m128i *const in, |
168 | 23.7M | uint8_t *const dest, const int stride) { |
169 | 23.7M | __m128i t[8]; |
170 | | |
171 | 23.7M | round_shift_8x8(in, t); |
172 | | |
173 | 23.7M | recon_and_store(dest + 0 * stride, t[0]); |
174 | 23.7M | recon_and_store(dest + 1 * stride, t[1]); |
175 | 23.7M | recon_and_store(dest + 2 * stride, t[2]); |
176 | 23.7M | recon_and_store(dest + 3 * stride, t[3]); |
177 | 23.7M | recon_and_store(dest + 4 * stride, t[4]); |
178 | 23.7M | recon_and_store(dest + 5 * stride, t[5]); |
179 | 23.7M | recon_and_store(dest + 6 * stride, t[6]); |
180 | 23.7M | recon_and_store(dest + 7 * stride, t[7]); |
181 | 23.7M | } Unexecuted instantiation: vp9_idct_intrin_sse2.c:write_buffer_8x8 Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:write_buffer_8x8 Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:write_buffer_8x8 Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:write_buffer_8x8 inv_txfm_sse2.c:write_buffer_8x8 Line | Count | Source | 168 | 23.0M | uint8_t *const dest, const int stride) { | 169 | 23.0M | __m128i t[8]; | 170 | | | 171 | 23.0M | round_shift_8x8(in, t); | 172 | | | 173 | 23.0M | recon_and_store(dest + 0 * stride, t[0]); | 174 | 23.0M | recon_and_store(dest + 1 * stride, t[1]); | 175 | 23.0M | recon_and_store(dest + 2 * stride, t[2]); | 176 | 23.0M | recon_and_store(dest + 3 * stride, t[3]); | 177 | 23.0M | recon_and_store(dest + 4 * stride, t[4]); | 178 | 23.0M | recon_and_store(dest + 5 * stride, t[5]); | 179 | 23.0M | recon_and_store(dest + 6 * stride, t[6]); | 180 | 23.0M | recon_and_store(dest + 7 * stride, t[7]); | 181 | 23.0M | } |
inv_txfm_ssse3.c:write_buffer_8x8 Line | Count | Source | 168 | 695k | uint8_t *const dest, const int stride) { | 169 | 695k | __m128i t[8]; | 170 | | | 171 | 695k | round_shift_8x8(in, t); | 172 | | | 173 | 695k | recon_and_store(dest + 0 * stride, t[0]); | 174 | 695k | recon_and_store(dest + 1 * stride, t[1]); | 175 | 695k | recon_and_store(dest + 2 * stride, t[2]); | 176 | 695k | recon_and_store(dest + 3 * stride, t[3]); | 177 | 695k | recon_and_store(dest + 4 * stride, t[4]); | 178 | 695k | recon_and_store(dest + 5 * stride, t[5]); | 179 | 695k | recon_and_store(dest + 6 * stride, t[6]); | 180 | 695k | recon_and_store(dest + 7 * stride, t[7]); | 181 | 695k | } |
Unexecuted instantiation: highbd_idct4x4_add_sse2.c:write_buffer_8x8 Unexecuted instantiation: highbd_idct8x8_add_sse2.c:write_buffer_8x8 Unexecuted instantiation: highbd_idct16x16_add_sse2.c:write_buffer_8x8 Unexecuted instantiation: highbd_idct32x32_add_sse2.c:write_buffer_8x8 Unexecuted instantiation: highbd_idct4x4_add_sse4.c:write_buffer_8x8 Unexecuted instantiation: highbd_idct8x8_add_sse4.c:write_buffer_8x8 Unexecuted instantiation: highbd_idct16x16_add_sse4.c:write_buffer_8x8 Unexecuted instantiation: highbd_idct32x32_add_sse4.c:write_buffer_8x8 |
182 | | |
// Loads four consecutive pixels starting at p into the low 32 bits of a
// vector, p[0] in the lowest byte; the upper 96 bits are zero. The value is
// assembled byte by byte so that p may have any alignment and the uint8_t
// storage is never read through an int lvalue.
static INLINE __m128i recon4x4_load_px(const uint8_t *const p) {
  const uint32_t v = (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
                     ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
  return _mm_cvtsi32_si128((int)v);
}

// Stores the low 32 bits of v to the four pixels starting at p, lowest byte
// first. Written byte by byte for the same reason as recon4x4_load_px().
static INLINE void recon4x4_store_px(uint8_t *const p, const __m128i v) {
  const uint32_t px = (uint32_t)_mm_cvtsi128_si32(v);
  p[0] = (uint8_t)px;
  p[1] = (uint8_t)(px >> 8);
  p[2] = (uint8_t)(px >> 16);
  p[3] = (uint8_t)(px >> 24);
}

// Adds a 4x4 block of 16-bit values to the 4x4 block of 8-bit pixels at dest
// and writes the result back with unsigned saturation to [0, 255].
//   in[0]: values for rows 0 and 1 (four 16-bit lanes each, row 0 first)
//   in[1]: values for rows 2 and 3
//   dest:  top-left pixel of the block; no alignment is required
//   stride: distance in bytes between consecutive rows of dest
static INLINE void recon_and_store4x4_sse2(const __m128i *const in,
                                           uint8_t *const dest,
                                           const int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i d[2];

  // Gather rows 0|1 into d[0] and rows 2|3 into d[1].
  d[0] = _mm_unpacklo_epi32(recon4x4_load_px(dest),
                            recon4x4_load_px(dest + stride));
  d[1] = _mm_unpacklo_epi32(recon4x4_load_px(dest + stride * 2),
                            recon4x4_load_px(dest + stride * 3));

  // Widen the pixels to 16 bits, add, and pack all 16 results back to bytes.
  d[0] = _mm_unpacklo_epi8(d[0], zero);
  d[1] = _mm_unpacklo_epi8(d[1], zero);
  d[0] = _mm_add_epi16(d[0], in[0]);
  d[1] = _mm_add_epi16(d[1], in[1]);
  d[0] = _mm_packus_epi16(d[0], d[1]);

  // d[0] now holds row 0..3 in consecutive 32-bit lanes; peel one per row.
  recon4x4_store_px(dest, d[0]);
  d[0] = _mm_srli_si128(d[0], 4);
  recon4x4_store_px(dest + stride, d[0]);
  d[0] = _mm_srli_si128(d[0], 4);
  recon4x4_store_px(dest + stride * 2, d[0]);
  d[0] = _mm_srli_si128(d[0], 4);
  recon4x4_store_px(dest + stride * 3, d[0]);
}
210 | | |
211 | 0 | static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) { |
212 | 0 | const __m128i final_rounding = _mm_set1_epi16(1 << 5); |
213 | 0 | int j = 0; |
214 | 0 | while (j < 32) { |
215 | 0 | in[j] = _mm_adds_epi16(in[j], final_rounding); |
216 | 0 | in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding); |
217 | |
|
218 | 0 | in[j] = _mm_srai_epi16(in[j], 6); |
219 | 0 | in[j + 1] = _mm_srai_epi16(in[j + 1], 6); |
220 | |
|
221 | 0 | recon_and_store(dst, in[j]); |
222 | 0 | dst += stride; |
223 | 0 | recon_and_store(dst, in[j + 1]); |
224 | 0 | dst += stride; |
225 | 0 | j += 2; |
226 | 0 | } |
227 | 0 | } Unexecuted instantiation: vp9_idct_intrin_sse2.c:store_buffer_8x32 Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:store_buffer_8x32 Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:store_buffer_8x32 Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:store_buffer_8x32 Unexecuted instantiation: inv_txfm_sse2.c:store_buffer_8x32 Unexecuted instantiation: inv_txfm_ssse3.c:store_buffer_8x32 Unexecuted instantiation: highbd_idct4x4_add_sse2.c:store_buffer_8x32 Unexecuted instantiation: highbd_idct8x8_add_sse2.c:store_buffer_8x32 Unexecuted instantiation: highbd_idct16x16_add_sse2.c:store_buffer_8x32 Unexecuted instantiation: highbd_idct32x32_add_sse2.c:store_buffer_8x32 Unexecuted instantiation: highbd_idct4x4_add_sse4.c:store_buffer_8x32 Unexecuted instantiation: highbd_idct8x8_add_sse4.c:store_buffer_8x32 Unexecuted instantiation: highbd_idct16x16_add_sse4.c:store_buffer_8x32 Unexecuted instantiation: highbd_idct32x32_add_sse4.c:store_buffer_8x32 |
228 | | |
229 | 4.40M | static INLINE void write_buffer_8x1(uint8_t *const dest, const __m128i in) { |
230 | 4.40M | const __m128i final_rounding = _mm_set1_epi16(1 << 5); |
231 | 4.40M | __m128i out; |
232 | 4.40M | out = _mm_adds_epi16(in, final_rounding); |
233 | 4.40M | out = _mm_srai_epi16(out, 6); |
234 | 4.40M | recon_and_store(dest, out); |
235 | 4.40M | } Unexecuted instantiation: vp9_idct_intrin_sse2.c:write_buffer_8x1 Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:write_buffer_8x1 Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:write_buffer_8x1 Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:write_buffer_8x1 inv_txfm_sse2.c:write_buffer_8x1 Line | Count | Source | 229 | 1.77M | static INLINE void write_buffer_8x1(uint8_t *const dest, const __m128i in) { | 230 | 1.77M | const __m128i final_rounding = _mm_set1_epi16(1 << 5); | 231 | 1.77M | __m128i out; | 232 | 1.77M | out = _mm_adds_epi16(in, final_rounding); | 233 | 1.77M | out = _mm_srai_epi16(out, 6); | 234 | 1.77M | recon_and_store(dest, out); | 235 | 1.77M | } |
inv_txfm_ssse3.c:write_buffer_8x1 Line | Count | Source | 229 | 2.63M | static INLINE void write_buffer_8x1(uint8_t *const dest, const __m128i in) { | 230 | 2.63M | const __m128i final_rounding = _mm_set1_epi16(1 << 5); | 231 | 2.63M | __m128i out; | 232 | 2.63M | out = _mm_adds_epi16(in, final_rounding); | 233 | 2.63M | out = _mm_srai_epi16(out, 6); | 234 | 2.63M | recon_and_store(dest, out); | 235 | 2.63M | } |
Unexecuted instantiation: highbd_idct4x4_add_sse2.c:write_buffer_8x1 Unexecuted instantiation: highbd_idct8x8_add_sse2.c:write_buffer_8x1 Unexecuted instantiation: highbd_idct16x16_add_sse2.c:write_buffer_8x1 Unexecuted instantiation: highbd_idct32x32_add_sse2.c:write_buffer_8x1 Unexecuted instantiation: highbd_idct4x4_add_sse4.c:write_buffer_8x1 Unexecuted instantiation: highbd_idct8x8_add_sse4.c:write_buffer_8x1 Unexecuted instantiation: highbd_idct16x16_add_sse4.c:write_buffer_8x1 Unexecuted instantiation: highbd_idct32x32_add_sse4.c:write_buffer_8x1 |
236 | | |
// Only do addition and subtraction butterfly, size = 16, 32.
// For every i in [0, size / 2):
//   out[i]            = in[i] + in[size - 1 - i]
//   out[size - 1 - i] = in[i] - in[size - 1 - i]
// using wrapping 16-bit lane arithmetic. Both operands of a pair are loaded
// before either result is stored, so the transform may be done in place
// (in == out).
static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out,
                                     int size) {
  const int half = size >> 1;
  const int last = size - 1;
  int i;

  for (i = 0; i < half; ++i) {
    const __m128i lo = in[i];
    const __m128i hi = in[last - i];
    out[i] = _mm_add_epi16(lo, hi);
    out[last - i] = _mm_sub_epi16(lo, hi);
  }
}
249 | | |
250 | | static INLINE void idct8(const __m128i *const in /*in[8]*/, |
251 | 66.2M | __m128i *const out /*out[8]*/) { |
252 | 66.2M | __m128i step1[8], step2[8]; |
253 | | |
254 | | // stage 1 |
255 | 66.2M | butterfly(in[1], in[7], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); |
256 | 66.2M | butterfly(in[5], in[3], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); |
257 | | |
258 | | // stage 2 |
259 | 66.2M | butterfly(in[0], in[4], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); |
260 | 66.2M | butterfly(in[2], in[6], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); |
261 | | |
262 | 66.2M | step2[4] = _mm_add_epi16(step1[4], step1[5]); |
263 | 66.2M | step2[5] = _mm_sub_epi16(step1[4], step1[5]); |
264 | 66.2M | step2[6] = _mm_sub_epi16(step1[7], step1[6]); |
265 | 66.2M | step2[7] = _mm_add_epi16(step1[7], step1[6]); |
266 | | |
267 | | // stage 3 |
268 | 66.2M | step1[0] = _mm_add_epi16(step2[0], step2[3]); |
269 | 66.2M | step1[1] = _mm_add_epi16(step2[1], step2[2]); |
270 | 66.2M | step1[2] = _mm_sub_epi16(step2[1], step2[2]); |
271 | 66.2M | step1[3] = _mm_sub_epi16(step2[0], step2[3]); |
272 | 66.2M | butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); |
273 | | |
274 | | // stage 4 |
275 | 66.2M | out[0] = _mm_add_epi16(step1[0], step2[7]); |
276 | 66.2M | out[1] = _mm_add_epi16(step1[1], step1[6]); |
277 | 66.2M | out[2] = _mm_add_epi16(step1[2], step1[5]); |
278 | 66.2M | out[3] = _mm_add_epi16(step1[3], step2[4]); |
279 | 66.2M | out[4] = _mm_sub_epi16(step1[3], step2[4]); |
280 | 66.2M | out[5] = _mm_sub_epi16(step1[2], step1[5]); |
281 | 66.2M | out[6] = _mm_sub_epi16(step1[1], step1[6]); |
282 | 66.2M | out[7] = _mm_sub_epi16(step1[0], step2[7]); |
283 | 66.2M | } Unexecuted instantiation: vp9_idct_intrin_sse2.c:idct8 Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:idct8 Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:idct8 Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:idct8 Line | Count | Source | 251 | 66.2M | __m128i *const out /*out[8]*/) { | 252 | 66.2M | __m128i step1[8], step2[8]; | 253 | | | 254 | | // stage 1 | 255 | 66.2M | butterfly(in[1], in[7], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); | 256 | 66.2M | butterfly(in[5], in[3], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); | 257 | | | 258 | | // stage 2 | 259 | 66.2M | butterfly(in[0], in[4], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); | 260 | 66.2M | butterfly(in[2], in[6], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); | 261 | | | 262 | 66.2M | step2[4] = _mm_add_epi16(step1[4], step1[5]); | 263 | 66.2M | step2[5] = _mm_sub_epi16(step1[4], step1[5]); | 264 | 66.2M | step2[6] = _mm_sub_epi16(step1[7], step1[6]); | 265 | 66.2M | step2[7] = _mm_add_epi16(step1[7], step1[6]); | 266 | | | 267 | | // stage 3 | 268 | 66.2M | step1[0] = _mm_add_epi16(step2[0], step2[3]); | 269 | 66.2M | step1[1] = _mm_add_epi16(step2[1], step2[2]); | 270 | 66.2M | step1[2] = _mm_sub_epi16(step2[1], step2[2]); | 271 | 66.2M | step1[3] = _mm_sub_epi16(step2[0], step2[3]); | 272 | 66.2M | butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); | 273 | | | 274 | | // stage 4 | 275 | 66.2M | out[0] = _mm_add_epi16(step1[0], step2[7]); | 276 | 66.2M | out[1] = _mm_add_epi16(step1[1], step1[6]); | 277 | 66.2M | out[2] = _mm_add_epi16(step1[2], step1[5]); | 278 | 66.2M | out[3] = _mm_add_epi16(step1[3], step2[4]); | 279 | 66.2M | out[4] = _mm_sub_epi16(step1[3], step2[4]); | 280 | 66.2M | out[5] = _mm_sub_epi16(step1[2], step1[5]); | 281 | 66.2M | out[6] = _mm_sub_epi16(step1[1], step1[6]); | 282 | 66.2M | out[7] = _mm_sub_epi16(step1[0], step2[7]); | 283 | 66.2M | } |
Unexecuted instantiation: inv_txfm_ssse3.c:idct8 Unexecuted instantiation: highbd_idct4x4_add_sse2.c:idct8 Unexecuted instantiation: highbd_idct8x8_add_sse2.c:idct8 Unexecuted instantiation: highbd_idct16x16_add_sse2.c:idct8 Unexecuted instantiation: highbd_idct32x32_add_sse2.c:idct8 Unexecuted instantiation: highbd_idct4x4_add_sse4.c:idct8 Unexecuted instantiation: highbd_idct8x8_add_sse4.c:idct8 Unexecuted instantiation: highbd_idct16x16_add_sse4.c:idct8 Unexecuted instantiation: highbd_idct32x32_add_sse4.c:idct8 |
284 | | |
285 | 0 | static INLINE void idct8x8_12_add_kernel_sse2(__m128i *const io /*io[8]*/) { |
286 | 0 | const __m128i zero = _mm_setzero_si128(); |
287 | 0 | const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64); |
288 | 0 | const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
289 | 0 | __m128i step1[8], step2[8], tmp[4]; |
290 | |
|
291 | 0 | transpose_16bit_4x4(io, io); |
292 | | // io[0]: 00 10 20 30 01 11 21 31 |
293 | | // io[1]: 02 12 22 32 03 13 23 33 |
294 | | |
295 | | // stage 1 |
296 | 0 | { |
297 | 0 | const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
298 | 0 | const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64); |
299 | 0 | const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
300 | 0 | const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64); |
301 | 0 | const __m128i lo_1 = _mm_unpackhi_epi16(io[0], zero); |
302 | 0 | const __m128i lo_3 = _mm_unpackhi_epi16(io[1], zero); |
303 | 0 | step1[4] = idct_calc_wraplow_sse2(cp_28_n4, cp_4_28, lo_1); // step1 4&7 |
304 | 0 | step1[5] = idct_calc_wraplow_sse2(cp_n20_12, cp_12_20, lo_3); // step1 5&6 |
305 | 0 | } |
306 | | |
307 | | // stage 2 |
308 | 0 | { |
309 | 0 | const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
310 | 0 | const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64); |
311 | 0 | const __m128i lo_0 = _mm_unpacklo_epi16(io[0], zero); |
312 | 0 | const __m128i lo_2 = _mm_unpacklo_epi16(io[1], zero); |
313 | 0 | const __m128i t = idct_madd_round_shift_sse2(cp_16_16, lo_0); |
314 | 0 | step2[0] = _mm_packs_epi32(t, t); // step2 0&1 |
315 | 0 | step2[2] = idct_calc_wraplow_sse2(cp_8_24, cp_24_n8, lo_2); // step2 3&2 |
316 | 0 | step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7 |
317 | 0 | step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6 |
318 | 0 | step2[6] = _mm_unpackhi_epi64(step2[5], zero); // step2 6 |
319 | 0 | } |
320 | | |
321 | | // stage 3 |
322 | 0 | { |
323 | 0 | const __m128i lo_65 = _mm_unpacklo_epi16(step2[6], step2[5]); |
324 | 0 | tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1 |
325 | 0 | tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2 |
326 | 0 | step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1 |
327 | 0 | step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0 |
328 | 0 | step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, lo_65); // step1 5&6 |
329 | 0 | } |
330 | | |
331 | | // stage 4 |
332 | 0 | tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0 |
333 | 0 | tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1 |
334 | 0 | tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7 |
335 | 0 | tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6 |
336 | |
|
337 | 0 | idct8x8_12_transpose_16bit_4x8(tmp, io); |
338 | 0 | io[4] = io[5] = io[6] = io[7] = zero; |
339 | |
|
340 | 0 | idct8(io, io); |
341 | 0 | } Unexecuted instantiation: vp9_idct_intrin_sse2.c:idct8x8_12_add_kernel_sse2 Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:idct8x8_12_add_kernel_sse2 Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:idct8x8_12_add_kernel_sse2 Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:idct8x8_12_add_kernel_sse2 Unexecuted instantiation: inv_txfm_sse2.c:idct8x8_12_add_kernel_sse2 Unexecuted instantiation: inv_txfm_ssse3.c:idct8x8_12_add_kernel_sse2 Unexecuted instantiation: highbd_idct4x4_add_sse2.c:idct8x8_12_add_kernel_sse2 Unexecuted instantiation: highbd_idct8x8_add_sse2.c:idct8x8_12_add_kernel_sse2 Unexecuted instantiation: highbd_idct16x16_add_sse2.c:idct8x8_12_add_kernel_sse2 Unexecuted instantiation: highbd_idct32x32_add_sse2.c:idct8x8_12_add_kernel_sse2 Unexecuted instantiation: highbd_idct4x4_add_sse4.c:idct8x8_12_add_kernel_sse2 Unexecuted instantiation: highbd_idct8x8_add_sse4.c:idct8x8_12_add_kernel_sse2 Unexecuted instantiation: highbd_idct16x16_add_sse4.c:idct8x8_12_add_kernel_sse2 Unexecuted instantiation: highbd_idct32x32_add_sse4.c:idct8x8_12_add_kernel_sse2 |
342 | | |
343 | | static INLINE void idct16_8col(const __m128i *const in /*in[16]*/, |
344 | 9.83M | __m128i *const out /*out[16]*/) { |
345 | 9.83M | __m128i step1[16], step2[16]; |
346 | | |
347 | | // stage 2 |
348 | 9.83M | butterfly(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); |
349 | 9.83M | butterfly(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); |
350 | 9.83M | butterfly(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); |
351 | 9.83M | butterfly(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); |
352 | | |
353 | | // stage 3 |
354 | 9.83M | butterfly(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); |
355 | 9.83M | butterfly(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); |
356 | 9.83M | step1[8] = _mm_add_epi16(step2[8], step2[9]); |
357 | 9.83M | step1[9] = _mm_sub_epi16(step2[8], step2[9]); |
358 | 9.83M | step1[10] = _mm_sub_epi16(step2[11], step2[10]); |
359 | 9.83M | step1[11] = _mm_add_epi16(step2[10], step2[11]); |
360 | 9.83M | step1[12] = _mm_add_epi16(step2[12], step2[13]); |
361 | 9.83M | step1[13] = _mm_sub_epi16(step2[12], step2[13]); |
362 | 9.83M | step1[14] = _mm_sub_epi16(step2[15], step2[14]); |
363 | 9.83M | step1[15] = _mm_add_epi16(step2[14], step2[15]); |
364 | | |
365 | | // stage 4 |
366 | 9.83M | butterfly(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); |
367 | 9.83M | butterfly(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); |
368 | 9.83M | butterfly(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], |
369 | 9.83M | &step2[14]); |
370 | 9.83M | butterfly(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13], |
371 | 9.83M | &step2[10]); |
372 | 9.83M | step2[5] = _mm_sub_epi16(step1[4], step1[5]); |
373 | 9.83M | step1[4] = _mm_add_epi16(step1[4], step1[5]); |
374 | 9.83M | step2[6] = _mm_sub_epi16(step1[7], step1[6]); |
375 | 9.83M | step1[7] = _mm_add_epi16(step1[6], step1[7]); |
376 | 9.83M | step2[8] = step1[8]; |
377 | 9.83M | step2[11] = step1[11]; |
378 | 9.83M | step2[12] = step1[12]; |
379 | 9.83M | step2[15] = step1[15]; |
380 | | |
381 | | // stage 5 |
382 | 9.83M | step1[0] = _mm_add_epi16(step2[0], step2[3]); |
383 | 9.83M | step1[1] = _mm_add_epi16(step2[1], step2[2]); |
384 | 9.83M | step1[2] = _mm_sub_epi16(step2[1], step2[2]); |
385 | 9.83M | step1[3] = _mm_sub_epi16(step2[0], step2[3]); |
386 | 9.83M | butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); |
387 | 9.83M | step1[8] = _mm_add_epi16(step2[8], step2[11]); |
388 | 9.83M | step1[9] = _mm_add_epi16(step2[9], step2[10]); |
389 | 9.83M | step1[10] = _mm_sub_epi16(step2[9], step2[10]); |
390 | 9.83M | step1[11] = _mm_sub_epi16(step2[8], step2[11]); |
391 | 9.83M | step1[12] = _mm_sub_epi16(step2[15], step2[12]); |
392 | 9.83M | step1[13] = _mm_sub_epi16(step2[14], step2[13]); |
393 | 9.83M | step1[14] = _mm_add_epi16(step2[14], step2[13]); |
394 | 9.83M | step1[15] = _mm_add_epi16(step2[15], step2[12]); |
395 | | |
396 | | // stage 6 |
397 | 9.83M | step2[0] = _mm_add_epi16(step1[0], step1[7]); |
398 | 9.83M | step2[1] = _mm_add_epi16(step1[1], step1[6]); |
399 | 9.83M | step2[2] = _mm_add_epi16(step1[2], step1[5]); |
400 | 9.83M | step2[3] = _mm_add_epi16(step1[3], step1[4]); |
401 | 9.83M | step2[4] = _mm_sub_epi16(step1[3], step1[4]); |
402 | 9.83M | step2[5] = _mm_sub_epi16(step1[2], step1[5]); |
403 | 9.83M | step2[6] = _mm_sub_epi16(step1[1], step1[6]); |
404 | 9.83M | step2[7] = _mm_sub_epi16(step1[0], step1[7]); |
405 | 9.83M | butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10], |
406 | 9.83M | &step2[13]); |
407 | 9.83M | butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11], |
408 | 9.83M | &step2[12]); |
409 | | |
410 | | // stage 7 |
411 | 9.83M | out[0] = _mm_add_epi16(step2[0], step1[15]); |
412 | 9.83M | out[1] = _mm_add_epi16(step2[1], step1[14]); |
413 | 9.83M | out[2] = _mm_add_epi16(step2[2], step2[13]); |
414 | 9.83M | out[3] = _mm_add_epi16(step2[3], step2[12]); |
415 | 9.83M | out[4] = _mm_add_epi16(step2[4], step2[11]); |
416 | 9.83M | out[5] = _mm_add_epi16(step2[5], step2[10]); |
417 | 9.83M | out[6] = _mm_add_epi16(step2[6], step1[9]); |
418 | 9.83M | out[7] = _mm_add_epi16(step2[7], step1[8]); |
419 | 9.83M | out[8] = _mm_sub_epi16(step2[7], step1[8]); |
420 | 9.83M | out[9] = _mm_sub_epi16(step2[6], step1[9]); |
421 | 9.83M | out[10] = _mm_sub_epi16(step2[5], step2[10]); |
422 | 9.83M | out[11] = _mm_sub_epi16(step2[4], step2[11]); |
423 | 9.83M | out[12] = _mm_sub_epi16(step2[3], step2[12]); |
424 | 9.83M | out[13] = _mm_sub_epi16(step2[2], step2[13]); |
425 | 9.83M | out[14] = _mm_sub_epi16(step2[1], step1[14]); |
426 | 9.83M | out[15] = _mm_sub_epi16(step2[0], step1[15]); |
427 | 9.83M | } Unexecuted instantiation: vp9_idct_intrin_sse2.c:idct16_8col Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:idct16_8col Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:idct16_8col Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:idct16_8col inv_txfm_sse2.c:idct16_8col Line | Count | Source | 344 | 9.83M | __m128i *const out /*out[16]*/) { | 345 | 9.83M | __m128i step1[16], step2[16]; | 346 | | | 347 | | // stage 2 | 348 | 9.83M | butterfly(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); | 349 | 9.83M | butterfly(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); | 350 | 9.83M | butterfly(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); | 351 | 9.83M | butterfly(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); | 352 | | | 353 | | // stage 3 | 354 | 9.83M | butterfly(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); | 355 | 9.83M | butterfly(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); | 356 | 9.83M | step1[8] = _mm_add_epi16(step2[8], step2[9]); | 357 | 9.83M | step1[9] = _mm_sub_epi16(step2[8], step2[9]); | 358 | 9.83M | step1[10] = _mm_sub_epi16(step2[11], step2[10]); | 359 | 9.83M | step1[11] = _mm_add_epi16(step2[10], step2[11]); | 360 | 9.83M | step1[12] = _mm_add_epi16(step2[12], step2[13]); | 361 | 9.83M | step1[13] = _mm_sub_epi16(step2[12], step2[13]); | 362 | 9.83M | step1[14] = _mm_sub_epi16(step2[15], step2[14]); | 363 | 9.83M | step1[15] = _mm_add_epi16(step2[14], step2[15]); | 364 | | | 365 | | // stage 4 | 366 | 9.83M | butterfly(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); | 367 | 9.83M | butterfly(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); | 368 | 9.83M | butterfly(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], | 369 | 9.83M | &step2[14]); | 370 | 9.83M | butterfly(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13], | 371 | 9.83M | &step2[10]); | 372 | 9.83M | 
step2[5] = _mm_sub_epi16(step1[4], step1[5]); | 373 | 9.83M | step1[4] = _mm_add_epi16(step1[4], step1[5]); | 374 | 9.83M | step2[6] = _mm_sub_epi16(step1[7], step1[6]); | 375 | 9.83M | step1[7] = _mm_add_epi16(step1[6], step1[7]); | 376 | 9.83M | step2[8] = step1[8]; | 377 | 9.83M | step2[11] = step1[11]; | 378 | 9.83M | step2[12] = step1[12]; | 379 | 9.83M | step2[15] = step1[15]; | 380 | | | 381 | | // stage 5 | 382 | 9.83M | step1[0] = _mm_add_epi16(step2[0], step2[3]); | 383 | 9.83M | step1[1] = _mm_add_epi16(step2[1], step2[2]); | 384 | 9.83M | step1[2] = _mm_sub_epi16(step2[1], step2[2]); | 385 | 9.83M | step1[3] = _mm_sub_epi16(step2[0], step2[3]); | 386 | 9.83M | butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); | 387 | 9.83M | step1[8] = _mm_add_epi16(step2[8], step2[11]); | 388 | 9.83M | step1[9] = _mm_add_epi16(step2[9], step2[10]); | 389 | 9.83M | step1[10] = _mm_sub_epi16(step2[9], step2[10]); | 390 | 9.83M | step1[11] = _mm_sub_epi16(step2[8], step2[11]); | 391 | 9.83M | step1[12] = _mm_sub_epi16(step2[15], step2[12]); | 392 | 9.83M | step1[13] = _mm_sub_epi16(step2[14], step2[13]); | 393 | 9.83M | step1[14] = _mm_add_epi16(step2[14], step2[13]); | 394 | 9.83M | step1[15] = _mm_add_epi16(step2[15], step2[12]); | 395 | | | 396 | | // stage 6 | 397 | 9.83M | step2[0] = _mm_add_epi16(step1[0], step1[7]); | 398 | 9.83M | step2[1] = _mm_add_epi16(step1[1], step1[6]); | 399 | 9.83M | step2[2] = _mm_add_epi16(step1[2], step1[5]); | 400 | 9.83M | step2[3] = _mm_add_epi16(step1[3], step1[4]); | 401 | 9.83M | step2[4] = _mm_sub_epi16(step1[3], step1[4]); | 402 | 9.83M | step2[5] = _mm_sub_epi16(step1[2], step1[5]); | 403 | 9.83M | step2[6] = _mm_sub_epi16(step1[1], step1[6]); | 404 | 9.83M | step2[7] = _mm_sub_epi16(step1[0], step1[7]); | 405 | 9.83M | butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10], | 406 | 9.83M | &step2[13]); | 407 | 9.83M | butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, 
&step2[11], | 408 | 9.83M | &step2[12]); | 409 | | | 410 | | // stage 7 | 411 | 9.83M | out[0] = _mm_add_epi16(step2[0], step1[15]); | 412 | 9.83M | out[1] = _mm_add_epi16(step2[1], step1[14]); | 413 | 9.83M | out[2] = _mm_add_epi16(step2[2], step2[13]); | 414 | 9.83M | out[3] = _mm_add_epi16(step2[3], step2[12]); | 415 | 9.83M | out[4] = _mm_add_epi16(step2[4], step2[11]); | 416 | 9.83M | out[5] = _mm_add_epi16(step2[5], step2[10]); | 417 | 9.83M | out[6] = _mm_add_epi16(step2[6], step1[9]); | 418 | 9.83M | out[7] = _mm_add_epi16(step2[7], step1[8]); | 419 | 9.83M | out[8] = _mm_sub_epi16(step2[7], step1[8]); | 420 | 9.83M | out[9] = _mm_sub_epi16(step2[6], step1[9]); | 421 | 9.83M | out[10] = _mm_sub_epi16(step2[5], step2[10]); | 422 | 9.83M | out[11] = _mm_sub_epi16(step2[4], step2[11]); | 423 | 9.83M | out[12] = _mm_sub_epi16(step2[3], step2[12]); | 424 | 9.83M | out[13] = _mm_sub_epi16(step2[2], step2[13]); | 425 | 9.83M | out[14] = _mm_sub_epi16(step2[1], step1[14]); | 426 | 9.83M | out[15] = _mm_sub_epi16(step2[0], step1[15]); | 427 | 9.83M | } |
Unexecuted instantiation: inv_txfm_ssse3.c:idct16_8col Unexecuted instantiation: highbd_idct4x4_add_sse2.c:idct16_8col Unexecuted instantiation: highbd_idct8x8_add_sse2.c:idct16_8col Unexecuted instantiation: highbd_idct16x16_add_sse2.c:idct16_8col Unexecuted instantiation: highbd_idct32x32_add_sse2.c:idct16_8col Unexecuted instantiation: highbd_idct4x4_add_sse4.c:idct16_8col Unexecuted instantiation: highbd_idct8x8_add_sse4.c:idct16_8col Unexecuted instantiation: highbd_idct16x16_add_sse4.c:idct16_8col Unexecuted instantiation: highbd_idct32x32_add_sse4.c:idct16_8col |
428 | | |
429 | | static INLINE void idct16x16_10_pass1(const __m128i *const input /*input[4]*/, |
430 | 26.0k | __m128i *const output /*output[16]*/) { |
431 | 26.0k | const __m128i zero = _mm_setzero_si128(); |
432 | 26.0k | const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); |
433 | 26.0k | const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); |
434 | 26.0k | __m128i step1[16], step2[16]; |
435 | | |
436 | 26.0k | transpose_16bit_4x4(input, output); |
437 | | |
438 | | // stage 2 |
439 | 26.0k | { |
440 | 26.0k | const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
441 | 26.0k | const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); |
442 | 26.0k | const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); |
443 | 26.0k | const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); |
444 | 26.0k | const __m128i lo_1_15 = _mm_unpackhi_epi16(output[0], zero); |
445 | 26.0k | const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, output[1]); |
446 | 26.0k | step2[8] = idct_calc_wraplow_sse2(k__cospi_p30_m02, k__cospi_p02_p30, |
447 | 26.0k | lo_1_15); // step2 8&15 |
448 | 26.0k | step2[11] = idct_calc_wraplow_sse2(k__cospi_p06_m26, k__cospi_p26_p06, |
449 | 26.0k | lo_13_3); // step2 11&12 |
450 | 26.0k | } |
451 | | |
452 | | // stage 3 |
453 | 26.0k | { |
454 | 26.0k | const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
455 | 26.0k | const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); |
456 | 26.0k | const __m128i lo_2_14 = _mm_unpacklo_epi16(output[1], zero); |
457 | 26.0k | step1[4] = idct_calc_wraplow_sse2(k__cospi_p28_m04, k__cospi_p04_p28, |
458 | 26.0k | lo_2_14); // step1 4&7 |
459 | 26.0k | step1[13] = _mm_unpackhi_epi64(step2[11], zero); |
460 | 26.0k | step1[14] = _mm_unpackhi_epi64(step2[8], zero); |
461 | 26.0k | } |
462 | | |
463 | | // stage 4 |
464 | 26.0k | { |
465 | 26.0k | const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
466 | 26.0k | const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
467 | 26.0k | const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); |
468 | 26.0k | const __m128i lo_0_8 = _mm_unpacklo_epi16(output[0], zero); |
469 | 26.0k | const __m128i lo_9_14 = _mm_unpacklo_epi16(step2[8], step1[14]); |
470 | 26.0k | const __m128i lo_10_13 = _mm_unpacklo_epi16(step2[11], step1[13]); |
471 | 26.0k | const __m128i t = idct_madd_round_shift_sse2(lo_0_8, k__cospi_p16_p16); |
472 | 26.0k | step1[0] = _mm_packs_epi32(t, t); // step2 0&1 |
473 | 26.0k | step2[9] = idct_calc_wraplow_sse2(k__cospi_m08_p24, k__cospi_p24_p08, |
474 | 26.0k | lo_9_14); // step2 9&14 |
475 | 26.0k | step2[10] = idct_calc_wraplow_sse2(k__cospi_m24_m08, k__cospi_m08_p24, |
476 | 26.0k | lo_10_13); // step2 10&13 |
477 | 26.0k | step2[6] = _mm_unpackhi_epi64(step1[4], zero); |
478 | 26.0k | } |
479 | | |
480 | | // stage 5 |
481 | 26.0k | { |
482 | 26.0k | const __m128i lo_5_6 = _mm_unpacklo_epi16(step1[4], step2[6]); |
483 | 26.0k | step1[6] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_m16_p16, |
484 | 26.0k | lo_5_6); // step1 6&5 |
485 | 26.0k | step1[8] = _mm_add_epi16(step2[8], step2[11]); |
486 | 26.0k | step1[9] = _mm_add_epi16(step2[9], step2[10]); |
487 | 26.0k | step1[10] = _mm_sub_epi16(step2[9], step2[10]); |
488 | 26.0k | step1[11] = _mm_sub_epi16(step2[8], step2[11]); |
489 | 26.0k | step1[12] = _mm_unpackhi_epi64(step1[11], zero); |
490 | 26.0k | step1[13] = _mm_unpackhi_epi64(step1[10], zero); |
491 | 26.0k | step1[14] = _mm_unpackhi_epi64(step1[9], zero); |
492 | 26.0k | step1[15] = _mm_unpackhi_epi64(step1[8], zero); |
493 | 26.0k | } |
494 | | |
495 | | // stage 6 |
496 | 26.0k | { |
497 | 26.0k | const __m128i lo_10_13 = _mm_unpacklo_epi16(step1[10], step1[13]); |
498 | 26.0k | const __m128i lo_11_12 = _mm_unpacklo_epi16(step1[11], step1[12]); |
499 | 26.0k | step2[10] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16, |
500 | 26.0k | lo_10_13); // step2 10&13 |
501 | 26.0k | step2[11] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16, |
502 | 26.0k | lo_11_12); // step2 11&12 |
503 | 26.0k | step2[13] = _mm_unpackhi_epi64(step2[10], zero); |
504 | 26.0k | step2[12] = _mm_unpackhi_epi64(step2[11], zero); |
505 | 26.0k | step2[3] = _mm_add_epi16(step1[0], step1[4]); |
506 | 26.0k | step2[1] = _mm_add_epi16(step1[0], step1[6]); |
507 | 26.0k | step2[6] = _mm_sub_epi16(step1[0], step1[6]); |
508 | 26.0k | step2[4] = _mm_sub_epi16(step1[0], step1[4]); |
509 | 26.0k | step2[0] = _mm_unpackhi_epi64(step2[3], zero); |
510 | 26.0k | step2[2] = _mm_unpackhi_epi64(step2[1], zero); |
511 | 26.0k | step2[5] = _mm_unpackhi_epi64(step2[6], zero); |
512 | 26.0k | step2[7] = _mm_unpackhi_epi64(step2[4], zero); |
513 | 26.0k | } |
514 | | |
515 | | // stage 7. Left 8x16 only. |
516 | 26.0k | output[0] = _mm_add_epi16(step2[0], step1[15]); |
517 | 26.0k | output[1] = _mm_add_epi16(step2[1], step1[14]); |
518 | 26.0k | output[2] = _mm_add_epi16(step2[2], step2[13]); |
519 | 26.0k | output[3] = _mm_add_epi16(step2[3], step2[12]); |
520 | 26.0k | output[4] = _mm_add_epi16(step2[4], step2[11]); |
521 | 26.0k | output[5] = _mm_add_epi16(step2[5], step2[10]); |
522 | 26.0k | output[6] = _mm_add_epi16(step2[6], step1[9]); |
523 | 26.0k | output[7] = _mm_add_epi16(step2[7], step1[8]); |
524 | 26.0k | output[8] = _mm_sub_epi16(step2[7], step1[8]); |
525 | 26.0k | output[9] = _mm_sub_epi16(step2[6], step1[9]); |
526 | 26.0k | output[10] = _mm_sub_epi16(step2[5], step2[10]); |
527 | 26.0k | output[11] = _mm_sub_epi16(step2[4], step2[11]); |
528 | 26.0k | output[12] = _mm_sub_epi16(step2[3], step2[12]); |
529 | 26.0k | output[13] = _mm_sub_epi16(step2[2], step2[13]); |
530 | 26.0k | output[14] = _mm_sub_epi16(step2[1], step1[14]); |
531 | 26.0k | output[15] = _mm_sub_epi16(step2[0], step1[15]); |
532 | 26.0k | } Unexecuted instantiation: vp9_idct_intrin_sse2.c:idct16x16_10_pass1 Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:idct16x16_10_pass1 Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:idct16x16_10_pass1 Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:idct16x16_10_pass1 inv_txfm_sse2.c:idct16x16_10_pass1 Line | Count | Source | 430 | 26.0k | __m128i *const output /*output[16]*/) { | 431 | 26.0k | const __m128i zero = _mm_setzero_si128(); | 432 | 26.0k | const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); | 433 | 26.0k | const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); | 434 | 26.0k | __m128i step1[16], step2[16]; | 435 | | | 436 | 26.0k | transpose_16bit_4x4(input, output); | 437 | | | 438 | | // stage 2 | 439 | 26.0k | { | 440 | 26.0k | const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); | 441 | 26.0k | const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); | 442 | 26.0k | const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); | 443 | 26.0k | const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); | 444 | 26.0k | const __m128i lo_1_15 = _mm_unpackhi_epi16(output[0], zero); | 445 | 26.0k | const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, output[1]); | 446 | 26.0k | step2[8] = idct_calc_wraplow_sse2(k__cospi_p30_m02, k__cospi_p02_p30, | 447 | 26.0k | lo_1_15); // step2 8&15 | 448 | 26.0k | step2[11] = idct_calc_wraplow_sse2(k__cospi_p06_m26, k__cospi_p26_p06, | 449 | 26.0k | lo_13_3); // step2 11&12 | 450 | 26.0k | } | 451 | | | 452 | | // stage 3 | 453 | 26.0k | { | 454 | 26.0k | const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); | 455 | 26.0k | const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); | 456 | 26.0k | const __m128i lo_2_14 = _mm_unpacklo_epi16(output[1], zero); | 457 | 26.0k | step1[4] = idct_calc_wraplow_sse2(k__cospi_p28_m04, k__cospi_p04_p28, 
| 458 | 26.0k | lo_2_14); // step1 4&7 | 459 | 26.0k | step1[13] = _mm_unpackhi_epi64(step2[11], zero); | 460 | 26.0k | step1[14] = _mm_unpackhi_epi64(step2[8], zero); | 461 | 26.0k | } | 462 | | | 463 | | // stage 4 | 464 | 26.0k | { | 465 | 26.0k | const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 466 | 26.0k | const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); | 467 | 26.0k | const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); | 468 | 26.0k | const __m128i lo_0_8 = _mm_unpacklo_epi16(output[0], zero); | 469 | 26.0k | const __m128i lo_9_14 = _mm_unpacklo_epi16(step2[8], step1[14]); | 470 | 26.0k | const __m128i lo_10_13 = _mm_unpacklo_epi16(step2[11], step1[13]); | 471 | 26.0k | const __m128i t = idct_madd_round_shift_sse2(lo_0_8, k__cospi_p16_p16); | 472 | 26.0k | step1[0] = _mm_packs_epi32(t, t); // step2 0&1 | 473 | 26.0k | step2[9] = idct_calc_wraplow_sse2(k__cospi_m08_p24, k__cospi_p24_p08, | 474 | 26.0k | lo_9_14); // step2 9&14 | 475 | 26.0k | step2[10] = idct_calc_wraplow_sse2(k__cospi_m24_m08, k__cospi_m08_p24, | 476 | 26.0k | lo_10_13); // step2 10&13 | 477 | 26.0k | step2[6] = _mm_unpackhi_epi64(step1[4], zero); | 478 | 26.0k | } | 479 | | | 480 | | // stage 5 | 481 | 26.0k | { | 482 | 26.0k | const __m128i lo_5_6 = _mm_unpacklo_epi16(step1[4], step2[6]); | 483 | 26.0k | step1[6] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_m16_p16, | 484 | 26.0k | lo_5_6); // step1 6&5 | 485 | 26.0k | step1[8] = _mm_add_epi16(step2[8], step2[11]); | 486 | 26.0k | step1[9] = _mm_add_epi16(step2[9], step2[10]); | 487 | 26.0k | step1[10] = _mm_sub_epi16(step2[9], step2[10]); | 488 | 26.0k | step1[11] = _mm_sub_epi16(step2[8], step2[11]); | 489 | 26.0k | step1[12] = _mm_unpackhi_epi64(step1[11], zero); | 490 | 26.0k | step1[13] = _mm_unpackhi_epi64(step1[10], zero); | 491 | 26.0k | step1[14] = _mm_unpackhi_epi64(step1[9], zero); | 492 | 26.0k | step1[15] = _mm_unpackhi_epi64(step1[8], zero); 
| 493 | 26.0k | } | 494 | | | 495 | | // stage 6 | 496 | 26.0k | { | 497 | 26.0k | const __m128i lo_10_13 = _mm_unpacklo_epi16(step1[10], step1[13]); | 498 | 26.0k | const __m128i lo_11_12 = _mm_unpacklo_epi16(step1[11], step1[12]); | 499 | 26.0k | step2[10] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16, | 500 | 26.0k | lo_10_13); // step2 10&13 | 501 | 26.0k | step2[11] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16, | 502 | 26.0k | lo_11_12); // step2 11&12 | 503 | 26.0k | step2[13] = _mm_unpackhi_epi64(step2[10], zero); | 504 | 26.0k | step2[12] = _mm_unpackhi_epi64(step2[11], zero); | 505 | 26.0k | step2[3] = _mm_add_epi16(step1[0], step1[4]); | 506 | 26.0k | step2[1] = _mm_add_epi16(step1[0], step1[6]); | 507 | 26.0k | step2[6] = _mm_sub_epi16(step1[0], step1[6]); | 508 | 26.0k | step2[4] = _mm_sub_epi16(step1[0], step1[4]); | 509 | 26.0k | step2[0] = _mm_unpackhi_epi64(step2[3], zero); | 510 | 26.0k | step2[2] = _mm_unpackhi_epi64(step2[1], zero); | 511 | 26.0k | step2[5] = _mm_unpackhi_epi64(step2[6], zero); | 512 | 26.0k | step2[7] = _mm_unpackhi_epi64(step2[4], zero); | 513 | 26.0k | } | 514 | | | 515 | | // stage 7. Left 8x16 only. 
| 516 | 26.0k | output[0] = _mm_add_epi16(step2[0], step1[15]); | 517 | 26.0k | output[1] = _mm_add_epi16(step2[1], step1[14]); | 518 | 26.0k | output[2] = _mm_add_epi16(step2[2], step2[13]); | 519 | 26.0k | output[3] = _mm_add_epi16(step2[3], step2[12]); | 520 | 26.0k | output[4] = _mm_add_epi16(step2[4], step2[11]); | 521 | 26.0k | output[5] = _mm_add_epi16(step2[5], step2[10]); | 522 | 26.0k | output[6] = _mm_add_epi16(step2[6], step1[9]); | 523 | 26.0k | output[7] = _mm_add_epi16(step2[7], step1[8]); | 524 | 26.0k | output[8] = _mm_sub_epi16(step2[7], step1[8]); | 525 | 26.0k | output[9] = _mm_sub_epi16(step2[6], step1[9]); | 526 | 26.0k | output[10] = _mm_sub_epi16(step2[5], step2[10]); | 527 | 26.0k | output[11] = _mm_sub_epi16(step2[4], step2[11]); | 528 | 26.0k | output[12] = _mm_sub_epi16(step2[3], step2[12]); | 529 | 26.0k | output[13] = _mm_sub_epi16(step2[2], step2[13]); | 530 | 26.0k | output[14] = _mm_sub_epi16(step2[1], step1[14]); | 531 | 26.0k | output[15] = _mm_sub_epi16(step2[0], step1[15]); | 532 | 26.0k | } |
Unexecuted instantiation: inv_txfm_ssse3.c:idct16x16_10_pass1 Unexecuted instantiation: highbd_idct4x4_add_sse2.c:idct16x16_10_pass1 Unexecuted instantiation: highbd_idct8x8_add_sse2.c:idct16x16_10_pass1 Unexecuted instantiation: highbd_idct16x16_add_sse2.c:idct16x16_10_pass1 Unexecuted instantiation: highbd_idct32x32_add_sse2.c:idct16x16_10_pass1 Unexecuted instantiation: highbd_idct4x4_add_sse4.c:idct16x16_10_pass1 Unexecuted instantiation: highbd_idct8x8_add_sse4.c:idct16x16_10_pass1 Unexecuted instantiation: highbd_idct16x16_add_sse4.c:idct16x16_10_pass1 Unexecuted instantiation: highbd_idct32x32_add_sse4.c:idct16x16_10_pass1 |
533 | | |
534 | | static INLINE void idct16x16_10_pass2(__m128i *const l /*l[8]*/, |
535 | 52.0k | __m128i *const io /*io[16]*/) { |
536 | 52.0k | const __m128i zero = _mm_setzero_si128(); |
537 | 52.0k | __m128i step1[16], step2[16]; |
538 | | |
539 | 52.0k | transpose_16bit_4x8(l, io); |
540 | | |
541 | | // stage 2 |
542 | 52.0k | butterfly(io[1], zero, cospi_30_64, cospi_2_64, &step2[8], &step2[15]); |
543 | 52.0k | butterfly(zero, io[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); |
544 | | |
545 | | // stage 3 |
546 | 52.0k | butterfly(io[2], zero, cospi_28_64, cospi_4_64, &step1[4], &step1[7]); |
547 | | |
548 | | // stage 4 |
549 | 52.0k | step1[0] = butterfly_cospi16(io[0]); |
550 | 52.0k | butterfly(step2[15], step2[8], cospi_24_64, cospi_8_64, &step2[9], |
551 | 52.0k | &step2[14]); |
552 | 52.0k | butterfly(step2[11], step2[12], -cospi_8_64, -cospi_24_64, &step2[13], |
553 | 52.0k | &step2[10]); |
554 | | |
555 | | // stage 5 |
556 | 52.0k | butterfly(step1[7], step1[4], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); |
557 | 52.0k | step1[8] = _mm_add_epi16(step2[8], step2[11]); |
558 | 52.0k | step1[9] = _mm_add_epi16(step2[9], step2[10]); |
559 | 52.0k | step1[10] = _mm_sub_epi16(step2[9], step2[10]); |
560 | 52.0k | step1[11] = _mm_sub_epi16(step2[8], step2[11]); |
561 | 52.0k | step1[12] = _mm_sub_epi16(step2[15], step2[12]); |
562 | 52.0k | step1[13] = _mm_sub_epi16(step2[14], step2[13]); |
563 | 52.0k | step1[14] = _mm_add_epi16(step2[14], step2[13]); |
564 | 52.0k | step1[15] = _mm_add_epi16(step2[15], step2[12]); |
565 | | |
566 | | // stage 6 |
567 | 52.0k | step2[0] = _mm_add_epi16(step1[0], step1[7]); |
568 | 52.0k | step2[1] = _mm_add_epi16(step1[0], step1[6]); |
569 | 52.0k | step2[2] = _mm_add_epi16(step1[0], step1[5]); |
570 | 52.0k | step2[3] = _mm_add_epi16(step1[0], step1[4]); |
571 | 52.0k | step2[4] = _mm_sub_epi16(step1[0], step1[4]); |
572 | 52.0k | step2[5] = _mm_sub_epi16(step1[0], step1[5]); |
573 | 52.0k | step2[6] = _mm_sub_epi16(step1[0], step1[6]); |
574 | 52.0k | step2[7] = _mm_sub_epi16(step1[0], step1[7]); |
575 | 52.0k | butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10], |
576 | 52.0k | &step2[13]); |
577 | 52.0k | butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11], |
578 | 52.0k | &step2[12]); |
579 | | |
580 | | // stage 7 |
581 | 52.0k | io[0] = _mm_add_epi16(step2[0], step1[15]); |
582 | 52.0k | io[1] = _mm_add_epi16(step2[1], step1[14]); |
583 | 52.0k | io[2] = _mm_add_epi16(step2[2], step2[13]); |
584 | 52.0k | io[3] = _mm_add_epi16(step2[3], step2[12]); |
585 | 52.0k | io[4] = _mm_add_epi16(step2[4], step2[11]); |
586 | 52.0k | io[5] = _mm_add_epi16(step2[5], step2[10]); |
587 | 52.0k | io[6] = _mm_add_epi16(step2[6], step1[9]); |
588 | 52.0k | io[7] = _mm_add_epi16(step2[7], step1[8]); |
589 | 52.0k | io[8] = _mm_sub_epi16(step2[7], step1[8]); |
590 | 52.0k | io[9] = _mm_sub_epi16(step2[6], step1[9]); |
591 | 52.0k | io[10] = _mm_sub_epi16(step2[5], step2[10]); |
592 | 52.0k | io[11] = _mm_sub_epi16(step2[4], step2[11]); |
593 | 52.0k | io[12] = _mm_sub_epi16(step2[3], step2[12]); |
594 | 52.0k | io[13] = _mm_sub_epi16(step2[2], step2[13]); |
595 | 52.0k | io[14] = _mm_sub_epi16(step2[1], step1[14]); |
596 | 52.0k | io[15] = _mm_sub_epi16(step2[0], step1[15]); |
597 | 52.0k | } Unexecuted instantiation: vp9_idct_intrin_sse2.c:idct16x16_10_pass2 Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:idct16x16_10_pass2 Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:idct16x16_10_pass2 Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:idct16x16_10_pass2 inv_txfm_sse2.c:idct16x16_10_pass2 Line | Count | Source | 535 | 52.0k | __m128i *const io /*io[16]*/) { | 536 | 52.0k | const __m128i zero = _mm_setzero_si128(); | 537 | 52.0k | __m128i step1[16], step2[16]; | 538 | | | 539 | 52.0k | transpose_16bit_4x8(l, io); | 540 | | | 541 | | // stage 2 | 542 | 52.0k | butterfly(io[1], zero, cospi_30_64, cospi_2_64, &step2[8], &step2[15]); | 543 | 52.0k | butterfly(zero, io[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); | 544 | | | 545 | | // stage 3 | 546 | 52.0k | butterfly(io[2], zero, cospi_28_64, cospi_4_64, &step1[4], &step1[7]); | 547 | | | 548 | | // stage 4 | 549 | 52.0k | step1[0] = butterfly_cospi16(io[0]); | 550 | 52.0k | butterfly(step2[15], step2[8], cospi_24_64, cospi_8_64, &step2[9], | 551 | 52.0k | &step2[14]); | 552 | 52.0k | butterfly(step2[11], step2[12], -cospi_8_64, -cospi_24_64, &step2[13], | 553 | 52.0k | &step2[10]); | 554 | | | 555 | | // stage 5 | 556 | 52.0k | butterfly(step1[7], step1[4], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); | 557 | 52.0k | step1[8] = _mm_add_epi16(step2[8], step2[11]); | 558 | 52.0k | step1[9] = _mm_add_epi16(step2[9], step2[10]); | 559 | 52.0k | step1[10] = _mm_sub_epi16(step2[9], step2[10]); | 560 | 52.0k | step1[11] = _mm_sub_epi16(step2[8], step2[11]); | 561 | 52.0k | step1[12] = _mm_sub_epi16(step2[15], step2[12]); | 562 | 52.0k | step1[13] = _mm_sub_epi16(step2[14], step2[13]); | 563 | 52.0k | step1[14] = _mm_add_epi16(step2[14], step2[13]); | 564 | 52.0k | step1[15] = _mm_add_epi16(step2[15], step2[12]); | 565 | | | 566 | | // stage 6 | 567 | 52.0k | step2[0] = _mm_add_epi16(step1[0], step1[7]); | 568 | 52.0k | step2[1] = _mm_add_epi16(step1[0], 
step1[6]); | 569 | 52.0k | step2[2] = _mm_add_epi16(step1[0], step1[5]); | 570 | 52.0k | step2[3] = _mm_add_epi16(step1[0], step1[4]); | 571 | 52.0k | step2[4] = _mm_sub_epi16(step1[0], step1[4]); | 572 | 52.0k | step2[5] = _mm_sub_epi16(step1[0], step1[5]); | 573 | 52.0k | step2[6] = _mm_sub_epi16(step1[0], step1[6]); | 574 | 52.0k | step2[7] = _mm_sub_epi16(step1[0], step1[7]); | 575 | 52.0k | butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10], | 576 | 52.0k | &step2[13]); | 577 | 52.0k | butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11], | 578 | 52.0k | &step2[12]); | 579 | | | 580 | | // stage 7 | 581 | 52.0k | io[0] = _mm_add_epi16(step2[0], step1[15]); | 582 | 52.0k | io[1] = _mm_add_epi16(step2[1], step1[14]); | 583 | 52.0k | io[2] = _mm_add_epi16(step2[2], step2[13]); | 584 | 52.0k | io[3] = _mm_add_epi16(step2[3], step2[12]); | 585 | 52.0k | io[4] = _mm_add_epi16(step2[4], step2[11]); | 586 | 52.0k | io[5] = _mm_add_epi16(step2[5], step2[10]); | 587 | 52.0k | io[6] = _mm_add_epi16(step2[6], step1[9]); | 588 | 52.0k | io[7] = _mm_add_epi16(step2[7], step1[8]); | 589 | 52.0k | io[8] = _mm_sub_epi16(step2[7], step1[8]); | 590 | 52.0k | io[9] = _mm_sub_epi16(step2[6], step1[9]); | 591 | 52.0k | io[10] = _mm_sub_epi16(step2[5], step2[10]); | 592 | 52.0k | io[11] = _mm_sub_epi16(step2[4], step2[11]); | 593 | 52.0k | io[12] = _mm_sub_epi16(step2[3], step2[12]); | 594 | 52.0k | io[13] = _mm_sub_epi16(step2[2], step2[13]); | 595 | 52.0k | io[14] = _mm_sub_epi16(step2[1], step1[14]); | 596 | 52.0k | io[15] = _mm_sub_epi16(step2[0], step1[15]); | 597 | 52.0k | } |
Unexecuted instantiation: inv_txfm_ssse3.c:idct16x16_10_pass2 Unexecuted instantiation: highbd_idct4x4_add_sse2.c:idct16x16_10_pass2 Unexecuted instantiation: highbd_idct8x8_add_sse2.c:idct16x16_10_pass2 Unexecuted instantiation: highbd_idct16x16_add_sse2.c:idct16x16_10_pass2 Unexecuted instantiation: highbd_idct32x32_add_sse2.c:idct16x16_10_pass2 Unexecuted instantiation: highbd_idct4x4_add_sse4.c:idct16x16_10_pass2 Unexecuted instantiation: highbd_idct8x8_add_sse4.c:idct16x16_10_pass2 Unexecuted instantiation: highbd_idct16x16_add_sse4.c:idct16x16_10_pass2 Unexecuted instantiation: highbd_idct32x32_add_sse4.c:idct16x16_10_pass2 |
598 | | |
599 | | static INLINE void idct32_8x32_quarter_2_stage_4_to_6( |
600 | 102k | __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) { |
601 | 102k | __m128i step2[32]; |
602 | | |
603 | | // stage 4 |
604 | 102k | step2[8] = step1[8]; |
605 | 102k | step2[15] = step1[15]; |
606 | 102k | butterfly(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], |
607 | 102k | &step2[14]); |
608 | 102k | butterfly(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10], |
609 | 102k | &step2[13]); |
610 | 102k | step2[11] = step1[11]; |
611 | 102k | step2[12] = step1[12]; |
612 | | |
613 | | // stage 5 |
614 | 102k | step1[8] = _mm_add_epi16(step2[8], step2[11]); |
615 | 102k | step1[9] = _mm_add_epi16(step2[9], step2[10]); |
616 | 102k | step1[10] = _mm_sub_epi16(step2[9], step2[10]); |
617 | 102k | step1[11] = _mm_sub_epi16(step2[8], step2[11]); |
618 | 102k | step1[12] = _mm_sub_epi16(step2[15], step2[12]); |
619 | 102k | step1[13] = _mm_sub_epi16(step2[14], step2[13]); |
620 | 102k | step1[14] = _mm_add_epi16(step2[14], step2[13]); |
621 | 102k | step1[15] = _mm_add_epi16(step2[15], step2[12]); |
622 | | |
623 | | // stage 6 |
624 | 102k | out[8] = step1[8]; |
625 | 102k | out[9] = step1[9]; |
626 | 102k | butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10], &out[13]); |
627 | 102k | butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11], &out[12]); |
628 | 102k | out[14] = step1[14]; |
629 | 102k | out[15] = step1[15]; |
630 | 102k | } Unexecuted instantiation: vp9_idct_intrin_sse2.c:idct32_8x32_quarter_2_stage_4_to_6 Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:idct32_8x32_quarter_2_stage_4_to_6 Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:idct32_8x32_quarter_2_stage_4_to_6 Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:idct32_8x32_quarter_2_stage_4_to_6 Unexecuted instantiation: inv_txfm_sse2.c:idct32_8x32_quarter_2_stage_4_to_6 inv_txfm_ssse3.c:idct32_8x32_quarter_2_stage_4_to_6 Line | Count | Source | 600 | 102k | __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) { | 601 | 102k | __m128i step2[32]; | 602 | | | 603 | | // stage 4 | 604 | 102k | step2[8] = step1[8]; | 605 | 102k | step2[15] = step1[15]; | 606 | 102k | butterfly(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], | 607 | 102k | &step2[14]); | 608 | 102k | butterfly(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10], | 609 | 102k | &step2[13]); | 610 | 102k | step2[11] = step1[11]; | 611 | 102k | step2[12] = step1[12]; | 612 | | | 613 | | // stage 5 | 614 | 102k | step1[8] = _mm_add_epi16(step2[8], step2[11]); | 615 | 102k | step1[9] = _mm_add_epi16(step2[9], step2[10]); | 616 | 102k | step1[10] = _mm_sub_epi16(step2[9], step2[10]); | 617 | 102k | step1[11] = _mm_sub_epi16(step2[8], step2[11]); | 618 | 102k | step1[12] = _mm_sub_epi16(step2[15], step2[12]); | 619 | 102k | step1[13] = _mm_sub_epi16(step2[14], step2[13]); | 620 | 102k | step1[14] = _mm_add_epi16(step2[14], step2[13]); | 621 | 102k | step1[15] = _mm_add_epi16(step2[15], step2[12]); | 622 | | | 623 | | // stage 6 | 624 | 102k | out[8] = step1[8]; | 625 | 102k | out[9] = step1[9]; | 626 | 102k | butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10], &out[13]); | 627 | 102k | butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11], &out[12]); | 628 | 102k | out[14] = step1[14]; | 629 | 102k | out[15] = step1[15]; | 630 | 102k | } |
Unexecuted instantiation: highbd_idct4x4_add_sse2.c:idct32_8x32_quarter_2_stage_4_to_6 Unexecuted instantiation: highbd_idct8x8_add_sse2.c:idct32_8x32_quarter_2_stage_4_to_6 Unexecuted instantiation: highbd_idct16x16_add_sse2.c:idct32_8x32_quarter_2_stage_4_to_6 Unexecuted instantiation: highbd_idct32x32_add_sse2.c:idct32_8x32_quarter_2_stage_4_to_6 Unexecuted instantiation: highbd_idct4x4_add_sse4.c:idct32_8x32_quarter_2_stage_4_to_6 Unexecuted instantiation: highbd_idct8x8_add_sse4.c:idct32_8x32_quarter_2_stage_4_to_6 Unexecuted instantiation: highbd_idct16x16_add_sse4.c:idct32_8x32_quarter_2_stage_4_to_6 Unexecuted instantiation: highbd_idct32x32_add_sse4.c:idct32_8x32_quarter_2_stage_4_to_6 |
631 | | |
632 | | static INLINE void idct32_8x32_quarter_3_4_stage_4_to_7( |
633 | 102k | __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) { |
634 | 102k | __m128i step2[32]; |
635 | | |
636 | | // stage 4 |
637 | 102k | step2[16] = _mm_add_epi16(step1[16], step1[19]); |
638 | 102k | step2[17] = _mm_add_epi16(step1[17], step1[18]); |
639 | 102k | step2[18] = _mm_sub_epi16(step1[17], step1[18]); |
640 | 102k | step2[19] = _mm_sub_epi16(step1[16], step1[19]); |
641 | 102k | step2[20] = _mm_sub_epi16(step1[23], step1[20]); |
642 | 102k | step2[21] = _mm_sub_epi16(step1[22], step1[21]); |
643 | 102k | step2[22] = _mm_add_epi16(step1[22], step1[21]); |
644 | 102k | step2[23] = _mm_add_epi16(step1[23], step1[20]); |
645 | | |
646 | 102k | step2[24] = _mm_add_epi16(step1[24], step1[27]); |
647 | 102k | step2[25] = _mm_add_epi16(step1[25], step1[26]); |
648 | 102k | step2[26] = _mm_sub_epi16(step1[25], step1[26]); |
649 | 102k | step2[27] = _mm_sub_epi16(step1[24], step1[27]); |
650 | 102k | step2[28] = _mm_sub_epi16(step1[31], step1[28]); |
651 | 102k | step2[29] = _mm_sub_epi16(step1[30], step1[29]); |
652 | 102k | step2[30] = _mm_add_epi16(step1[29], step1[30]); |
653 | 102k | step2[31] = _mm_add_epi16(step1[28], step1[31]); |
654 | | |
655 | | // stage 5 |
656 | 102k | step1[16] = step2[16]; |
657 | 102k | step1[17] = step2[17]; |
658 | 102k | butterfly(step2[29], step2[18], cospi_24_64, cospi_8_64, &step1[18], |
659 | 102k | &step1[29]); |
660 | 102k | butterfly(step2[28], step2[19], cospi_24_64, cospi_8_64, &step1[19], |
661 | 102k | &step1[28]); |
662 | 102k | butterfly(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20], |
663 | 102k | &step1[27]); |
664 | 102k | butterfly(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21], |
665 | 102k | &step1[26]); |
666 | 102k | step1[22] = step2[22]; |
667 | 102k | step1[23] = step2[23]; |
668 | 102k | step1[24] = step2[24]; |
669 | 102k | step1[25] = step2[25]; |
670 | 102k | step1[30] = step2[30]; |
671 | 102k | step1[31] = step2[31]; |
672 | | |
673 | | // stage 6 |
674 | 102k | out[16] = _mm_add_epi16(step1[16], step1[23]); |
675 | 102k | out[17] = _mm_add_epi16(step1[17], step1[22]); |
676 | 102k | out[18] = _mm_add_epi16(step1[18], step1[21]); |
677 | 102k | out[19] = _mm_add_epi16(step1[19], step1[20]); |
678 | 102k | step2[20] = _mm_sub_epi16(step1[19], step1[20]); |
679 | 102k | step2[21] = _mm_sub_epi16(step1[18], step1[21]); |
680 | 102k | step2[22] = _mm_sub_epi16(step1[17], step1[22]); |
681 | 102k | step2[23] = _mm_sub_epi16(step1[16], step1[23]); |
682 | | |
683 | 102k | step2[24] = _mm_sub_epi16(step1[31], step1[24]); |
684 | 102k | step2[25] = _mm_sub_epi16(step1[30], step1[25]); |
685 | 102k | step2[26] = _mm_sub_epi16(step1[29], step1[26]); |
686 | 102k | step2[27] = _mm_sub_epi16(step1[28], step1[27]); |
687 | 102k | out[28] = _mm_add_epi16(step1[27], step1[28]); |
688 | 102k | out[29] = _mm_add_epi16(step1[26], step1[29]); |
689 | 102k | out[30] = _mm_add_epi16(step1[25], step1[30]); |
690 | 102k | out[31] = _mm_add_epi16(step1[24], step1[31]); |
691 | | |
692 | | // stage 7 |
693 | 102k | butterfly(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20], &out[27]); |
694 | 102k | butterfly(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21], &out[26]); |
695 | 102k | butterfly(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22], &out[25]); |
696 | 102k | butterfly(step2[24], step2[23], cospi_16_64, cospi_16_64, &out[23], &out[24]); |
697 | 102k | } Unexecuted instantiation: vp9_idct_intrin_sse2.c:idct32_8x32_quarter_3_4_stage_4_to_7 Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:idct32_8x32_quarter_3_4_stage_4_to_7 Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:idct32_8x32_quarter_3_4_stage_4_to_7 Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:idct32_8x32_quarter_3_4_stage_4_to_7 Unexecuted instantiation: inv_txfm_sse2.c:idct32_8x32_quarter_3_4_stage_4_to_7 inv_txfm_ssse3.c:idct32_8x32_quarter_3_4_stage_4_to_7 Line | Count | Source | 633 | 102k | __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) { | 634 | 102k | __m128i step2[32]; | 635 | | | 636 | | // stage 4 | 637 | 102k | step2[16] = _mm_add_epi16(step1[16], step1[19]); | 638 | 102k | step2[17] = _mm_add_epi16(step1[17], step1[18]); | 639 | 102k | step2[18] = _mm_sub_epi16(step1[17], step1[18]); | 640 | 102k | step2[19] = _mm_sub_epi16(step1[16], step1[19]); | 641 | 102k | step2[20] = _mm_sub_epi16(step1[23], step1[20]); | 642 | 102k | step2[21] = _mm_sub_epi16(step1[22], step1[21]); | 643 | 102k | step2[22] = _mm_add_epi16(step1[22], step1[21]); | 644 | 102k | step2[23] = _mm_add_epi16(step1[23], step1[20]); | 645 | | | 646 | 102k | step2[24] = _mm_add_epi16(step1[24], step1[27]); | 647 | 102k | step2[25] = _mm_add_epi16(step1[25], step1[26]); | 648 | 102k | step2[26] = _mm_sub_epi16(step1[25], step1[26]); | 649 | 102k | step2[27] = _mm_sub_epi16(step1[24], step1[27]); | 650 | 102k | step2[28] = _mm_sub_epi16(step1[31], step1[28]); | 651 | 102k | step2[29] = _mm_sub_epi16(step1[30], step1[29]); | 652 | 102k | step2[30] = _mm_add_epi16(step1[29], step1[30]); | 653 | 102k | step2[31] = _mm_add_epi16(step1[28], step1[31]); | 654 | | | 655 | | // stage 5 | 656 | 102k | step1[16] = step2[16]; | 657 | 102k | step1[17] = step2[17]; | 658 | 102k | butterfly(step2[29], step2[18], cospi_24_64, cospi_8_64, &step1[18], | 659 | 102k | &step1[29]); | 660 | 102k | butterfly(step2[28], step2[19], cospi_24_64, 
cospi_8_64, &step1[19], | 661 | 102k | &step1[28]); | 662 | 102k | butterfly(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20], | 663 | 102k | &step1[27]); | 664 | 102k | butterfly(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21], | 665 | 102k | &step1[26]); | 666 | 102k | step1[22] = step2[22]; | 667 | 102k | step1[23] = step2[23]; | 668 | 102k | step1[24] = step2[24]; | 669 | 102k | step1[25] = step2[25]; | 670 | 102k | step1[30] = step2[30]; | 671 | 102k | step1[31] = step2[31]; | 672 | | | 673 | | // stage 6 | 674 | 102k | out[16] = _mm_add_epi16(step1[16], step1[23]); | 675 | 102k | out[17] = _mm_add_epi16(step1[17], step1[22]); | 676 | 102k | out[18] = _mm_add_epi16(step1[18], step1[21]); | 677 | 102k | out[19] = _mm_add_epi16(step1[19], step1[20]); | 678 | 102k | step2[20] = _mm_sub_epi16(step1[19], step1[20]); | 679 | 102k | step2[21] = _mm_sub_epi16(step1[18], step1[21]); | 680 | 102k | step2[22] = _mm_sub_epi16(step1[17], step1[22]); | 681 | 102k | step2[23] = _mm_sub_epi16(step1[16], step1[23]); | 682 | | | 683 | 102k | step2[24] = _mm_sub_epi16(step1[31], step1[24]); | 684 | 102k | step2[25] = _mm_sub_epi16(step1[30], step1[25]); | 685 | 102k | step2[26] = _mm_sub_epi16(step1[29], step1[26]); | 686 | 102k | step2[27] = _mm_sub_epi16(step1[28], step1[27]); | 687 | 102k | out[28] = _mm_add_epi16(step1[27], step1[28]); | 688 | 102k | out[29] = _mm_add_epi16(step1[26], step1[29]); | 689 | 102k | out[30] = _mm_add_epi16(step1[25], step1[30]); | 690 | 102k | out[31] = _mm_add_epi16(step1[24], step1[31]); | 691 | | | 692 | | // stage 7 | 693 | 102k | butterfly(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20], &out[27]); | 694 | 102k | butterfly(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21], &out[26]); | 695 | 102k | butterfly(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22], &out[25]); | 696 | 102k | butterfly(step2[24], step2[23], cospi_16_64, cospi_16_64, &out[23], &out[24]); | 697 | 102k | } |
Unexecuted instantiation: highbd_idct4x4_add_sse2.c:idct32_8x32_quarter_3_4_stage_4_to_7 Unexecuted instantiation: highbd_idct8x8_add_sse2.c:idct32_8x32_quarter_3_4_stage_4_to_7 Unexecuted instantiation: highbd_idct16x16_add_sse2.c:idct32_8x32_quarter_3_4_stage_4_to_7 Unexecuted instantiation: highbd_idct32x32_add_sse2.c:idct32_8x32_quarter_3_4_stage_4_to_7 Unexecuted instantiation: highbd_idct4x4_add_sse4.c:idct32_8x32_quarter_3_4_stage_4_to_7 Unexecuted instantiation: highbd_idct8x8_add_sse4.c:idct32_8x32_quarter_3_4_stage_4_to_7 Unexecuted instantiation: highbd_idct16x16_add_sse4.c:idct32_8x32_quarter_3_4_stage_4_to_7 Unexecuted instantiation: highbd_idct32x32_add_sse4.c:idct32_8x32_quarter_3_4_stage_4_to_7 |
698 | | |
// Transform routines that are only declared in this header; their bodies are
// defined elsewhere (presumably in the matching vpx_dsp/x86 .c files --
// confirm). The array lengths expected behind each __m128i pointer are not
// visible from these declarations; check the definitions before calling.

// idct* routines operating in place on one or two __m128i arrays.
void idct4_sse2(__m128i *const in);
void vpx_idct8_sse2(__m128i *const in);
void idct16_sse2(__m128i *const in0, __m128i *const in1);

// iadst* routines operating in place on one or two __m128i arrays.
void iadst4_sse2(__m128i *const in);
void iadst8_sse2(__m128i *const in);
void vpx_iadst16_8col_sse2(__m128i *const in);
void iadst16_sse2(__m128i *const in0, __m128i *const in1);

// idct32 8x32 routines: read from in (const) and write to out.
void idct32_1024_8x32(const __m128i *const in, __m128i *const out);
void idct32_34_8x32_sse2(const __m128i *const in, __m128i *const out);
void idct32_34_8x32_ssse3(const __m128i *const in, __m128i *const out);
709 | | |
710 | | #endif // VPX_VPX_DSP_X86_INV_TXFM_SSE2_H_ |