/src/ffmpeg/libavcodec/simple_idct_template.c
Line | Count | Source |
1 | | /* |
2 | | * Simple IDCT |
3 | | * |
4 | | * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> |
5 | | * |
6 | | * This file is part of FFmpeg. |
7 | | * |
8 | | * FFmpeg is free software; you can redistribute it and/or |
9 | | * modify it under the terms of the GNU Lesser General Public |
10 | | * License as published by the Free Software Foundation; either |
11 | | * version 2.1 of the License, or (at your option) any later version. |
12 | | * |
13 | | * FFmpeg is distributed in the hope that it will be useful, |
14 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
16 | | * Lesser General Public License for more details. |
17 | | * |
18 | | * You should have received a copy of the GNU Lesser General Public |
19 | | * License along with FFmpeg; if not, write to the Free Software |
20 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
21 | | */ |
22 | | |
23 | | /** |
24 | | * @file |
25 | | * simpleidct in C. |
26 | | */ |
27 | | |
28 | | /* Based upon some commented-out C code from mpeg2dec (idct_mmx.c |
29 | | * written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>). */ |
30 | | |
31 | | #include "bit_depth_template.c" |
32 | | |
33 | | #undef W1 |
34 | | #undef W2 |
35 | | #undef W3 |
36 | | #undef W4 |
37 | | #undef W5 |
38 | | #undef W6 |
39 | | #undef W7 |
40 | | #undef ROW_SHIFT |
41 | | #undef COL_SHIFT |
42 | | #undef DC_SHIFT |
43 | | #undef MUL |
44 | | #undef MAC |
45 | | |
46 | | #if BIT_DEPTH == 8 |
47 | | |
48 | | #define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
49 | 3.56G | #define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
50 | | #define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
51 | 3.88G | #define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
52 | | #define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
53 | 3.56G | #define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
54 | | #define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
55 | | |
56 | 1.07G | #define ROW_SHIFT 11 |
57 | 3.08G | #define COL_SHIFT 20 |
58 | 2.83G | #define DC_SHIFT 3 |
59 | | |
60 | 6.58G | #define MUL(a, b) MUL16(a, b) |
61 | 7.62G | #define MAC(a, b, c) MAC16(a, b, c) |
62 | | |
63 | | #elif BIT_DEPTH == 10 || BIT_DEPTH == 12 |
64 | | |
65 | | # if BIT_DEPTH == 10 |
66 | | #define W1 22725 // 90901 |
67 | 40.3M | #define W2 21407 // 85627 |
68 | | #define W3 19265 // 77062 |
69 | 41.7M | #define W4 16384 // 65535 |
70 | | #define W5 12873 // 51491 |
71 | 40.3M | #define W6 8867 // 35468 |
72 | | #define W7 4520 // 18081 |
73 | | |
74 | | # ifdef EXTRA_SHIFT |
75 | 2.15M | #define ROW_SHIFT 13 |
76 | 44.0M | #define COL_SHIFT 18 |
77 | 13.9M | #define DC_SHIFT 1 |
78 | | # elif IN_IDCT_DEPTH == 32 |
79 | 8.23M | #define ROW_SHIFT 13 |
80 | 914k | #define COL_SHIFT 21 |
81 | | #define DC_SHIFT 2 |
82 | | # else |
83 | 8.69M | #define ROW_SHIFT 12 |
84 | 11.0M | #define COL_SHIFT 19 |
85 | 20.0M | #define DC_SHIFT 2 |
86 | | # endif |
87 | | |
88 | | # else |
89 | | #define W1 45451 |
90 | 13.4M | #define W2 42813 |
91 | | #define W3 38531 |
92 | 14.2M | #define W4 32767 |
93 | | #define W5 25746 |
94 | 13.4M | #define W6 17734 |
95 | | #define W7 9041 |
96 | | |
97 | 2.86M | #define ROW_SHIFT 16 |
98 | 12.9M | #define COL_SHIFT 17 |
99 | 16.7M | #define DC_SHIFT -1 |
100 | | # endif |
101 | | |
102 | 100M | #define MUL(a, b) ((int)((SUINT)(a) * (b))) |
103 | 117M | #define MAC(a, b, c) ((a) += (SUINT)(b) * (c)) |
104 | | |
105 | | #else |
106 | | |
107 | | #error "Unsupported bitdepth" |
108 | | |
109 | | #endif |
110 | | |
111 | | #ifdef EXTRA_SHIFT |
112 | | static inline void FUNC(idctRowCondDC_extrashift)(int16_t *row, int extra_shift) |
113 | | #else |
114 | | static inline void FUNC6(idctRowCondDC)(idctin *row, int extra_shift) |
115 | | #endif |
116 | 1.55G | { |
117 | 1.55G | SUINT a0, a1, a2, a3, b0, b1, b2, b3; |
118 | | |
119 | | // TODO: Add DC-only support for int32_t input |
120 | | #if IN_IDCT_DEPTH == 16 |
121 | | #if HAVE_FAST_64BIT |
122 | 1.55G | #define ROW0_MASK (0xffffULL << 48 * HAVE_BIGENDIAN) |
123 | 1.55G | if (((AV_RN64A(row) & ~ROW0_MASK) | AV_RN64A(row+4)) == 0) { |
124 | 1.43G | uint64_t temp; |
125 | 1.43G | if (DC_SHIFT - extra_shift >= 0) { |
126 | 1.42G | temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff; |
127 | 1.42G | } else { |
128 | 10.2M | temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff; |
129 | 10.2M | } |
130 | 1.43G | temp += temp * (1 << 16); |
131 | 1.43G | temp += temp * ((uint64_t) 1 << 32); |
132 | 1.43G | AV_WN64A(row, temp); |
133 | 1.43G | AV_WN64A(row + 4, temp); |
134 | 1.43G | return; |
135 | 1.43G | } |
136 | | #else |
137 | | if (!(AV_RN32A(row+2) | |
138 | | AV_RN32A(row+4) | |
139 | | AV_RN32A(row+6) | |
140 | | row[1])) { |
141 | | uint32_t temp; |
142 | | if (DC_SHIFT - extra_shift >= 0) { |
143 | | temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff; |
144 | | } else { |
145 | | temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff; |
146 | | } |
147 | | temp += temp * (1 << 16); |
148 | | AV_WN32A(row, temp); |
149 | | AV_WN32A(row+2, temp); |
150 | | AV_WN32A(row+4, temp); |
151 | | AV_WN32A(row+6, temp); |
152 | | return; |
153 | | } |
154 | | #endif |
155 | | #endif |
156 | | |
157 | 121M | a0 = ((SUINT)W4 * row[0]) + (1 << (ROW_SHIFT + extra_shift - 1)); |
158 | 120M | a1 = a0; |
159 | 120M | a2 = a0; |
160 | 120M | a3 = a0; |
161 | | |
162 | 121M | a0 += (SUINT)W2 * row[2]; |
163 | 121M | a1 += (SUINT)W6 * row[2]; |
164 | 121M | a2 -= (SUINT)W6 * row[2]; |
165 | 121M | a3 -= (SUINT)W2 * row[2]; |
166 | | |
167 | 121M | b0 = MUL(W1, row[1]); |
168 | 121M | MAC(b0, W3, row[3]); |
169 | 121M | b1 = MUL(W3, row[1]); |
170 | 121M | MAC(b1, -W7, row[3]); |
171 | 121M | b2 = MUL(W5, row[1]); |
172 | 121M | MAC(b2, -W1, row[3]); |
173 | 121M | b3 = MUL(W7, row[1]); |
174 | 121M | MAC(b3, -W5, row[3]); |
175 | | |
176 | | #if IN_IDCT_DEPTH == 32 |
177 | 914k | if (AV_RN64A(row + 4) | AV_RN64A(row + 6)) { |
178 | | #else |
179 | 120M | if (AV_RN64A(row + 4)) { |
180 | 55.9M | #endif |
181 | 56.0M | a0 += (SUINT) W4*row[4] + (SUINT)W6*row[6]; |
182 | 56.0M | a1 += (SUINT)- W4*row[4] - (SUINT)W2*row[6]; |
183 | 56.0M | a2 += (SUINT)- W4*row[4] + (SUINT)W2*row[6]; |
184 | 56.0M | a3 += (SUINT) W4*row[4] - (SUINT)W6*row[6]; |
185 | | |
186 | 56.0M | MAC(b0, W5, row[5]); |
187 | 56.0M | MAC(b0, W7, row[7]); |
188 | | |
189 | 56.0M | MAC(b1, -W1, row[5]); |
190 | 56.0M | MAC(b1, -W5, row[7]); |
191 | | |
192 | 56.0M | MAC(b2, W7, row[5]); |
193 | 56.0M | MAC(b2, W3, row[7]); |
194 | | |
195 | 56.0M | MAC(b3, W3, row[5]); |
196 | 56.0M | MAC(b3, -W1, row[7]); |
197 | 55.9M | } |
198 | | |
199 | 121M | row[0] = (int)(a0 + b0) >> (ROW_SHIFT + extra_shift); |
200 | 121M | row[7] = (int)(a0 - b0) >> (ROW_SHIFT + extra_shift); |
201 | 121M | row[1] = (int)(a1 + b1) >> (ROW_SHIFT + extra_shift); |
202 | 121M | row[6] = (int)(a1 - b1) >> (ROW_SHIFT + extra_shift); |
203 | 121M | row[2] = (int)(a2 + b2) >> (ROW_SHIFT + extra_shift); |
204 | 121M | row[5] = (int)(a2 - b2) >> (ROW_SHIFT + extra_shift); |
205 | 121M | row[3] = (int)(a3 + b3) >> (ROW_SHIFT + extra_shift); |
206 | 121M | row[4] = (int)(a3 - b3) >> (ROW_SHIFT + extra_shift); |
207 | 120M | } simple_idct.c:idctRowCondDC_int16_8bit Line | Count | Source | 116 | 1.53G | { | 117 | 1.53G | SUINT a0, a1, a2, a3, b0, b1, b2, b3; | 118 | | | 119 | | // TODO: Add DC-only support for int32_t input | 120 | 1.53G | #if IN_IDCT_DEPTH == 16 | 121 | 1.53G | #if HAVE_FAST_64BIT | 122 | 1.53G | #define ROW0_MASK (0xffffULL << 48 * HAVE_BIGENDIAN) | 123 | 1.53G | if (((AV_RN64A(row) & ~ROW0_MASK) | AV_RN64A(row+4)) == 0) { | 124 | 1.41G | uint64_t temp; | 125 | 1.41G | if (DC_SHIFT - extra_shift >= 0) { | 126 | 1.41G | temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff; | 127 | 1.41G | } else { | 128 | 0 | temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff; | 129 | 0 | } | 130 | 1.41G | temp += temp * (1 << 16); | 131 | 1.41G | temp += temp * ((uint64_t) 1 << 32); | 132 | 1.41G | AV_WN64A(row, temp); | 133 | 1.41G | AV_WN64A(row + 4, temp); | 134 | 1.41G | return; | 135 | 1.41G | } | 136 | | #else | 137 | | if (!(AV_RN32A(row+2) | | 138 | | AV_RN32A(row+4) | | 139 | | AV_RN32A(row+6) | | 140 | | row[1])) { | 141 | | uint32_t temp; | 142 | | if (DC_SHIFT - extra_shift >= 0) { | 143 | | temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff; | 144 | | } else { | 145 | | temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff; | 146 | | } | 147 | | temp += temp * (1 << 16); | 148 | | AV_WN32A(row, temp); | 149 | | AV_WN32A(row+2, temp); | 150 | | AV_WN32A(row+4, temp); | 151 | | AV_WN32A(row+6, temp); | 152 | | return; | 153 | | } | 154 | | #endif | 155 | 118M | #endif | 156 | | | 157 | 118M | a0 = ((SUINT)W4 * row[0]) + (1 << (ROW_SHIFT + extra_shift - 1)); | 158 | 118M | a1 = a0; | 159 | 118M | a2 = a0; | 160 | 118M | a3 = a0; | 161 | | | 162 | 118M | a0 += (SUINT)W2 * row[2]; | 163 | 118M | a1 += (SUINT)W6 * row[2]; | 164 | 118M | a2 -= (SUINT)W6 * row[2]; | 165 | 118M | a3 -= (SUINT)W2 * row[2]; | 166 | | | 167 | 118M | b0 = MUL(W1, row[1]); | 168 | 118M | MAC(b0, W3, row[3]); | 169 | 118M | b1 = MUL(W3, row[1]); | 170 | 118M | MAC(b1, -W7, row[3]); | 171 | 118M | b2 = MUL(W5, row[1]); | 172 | 118M | MAC(b2, -W1, row[3]); | 173 | 118M | b3 = MUL(W7, row[1]); | 174 | 118M | MAC(b3, -W5, row[3]); | 175 | | | 176 | | #if IN_IDCT_DEPTH == 32 | 177 | | if (AV_RN64A(row + 4) | AV_RN64A(row + 6)) { | 178 | | #else | 179 | 118M | if (AV_RN64A(row + 4)) { | 180 | 55.3M | #endif | 181 | 55.3M | a0 += (SUINT) W4*row[4] + (SUINT)W6*row[6]; | 182 | 55.3M | a1 += (SUINT)- W4*row[4] - (SUINT)W2*row[6]; | 183 | 55.3M | a2 += (SUINT)- W4*row[4] + (SUINT)W2*row[6]; | 184 | 55.3M | a3 += (SUINT) W4*row[4] - (SUINT)W6*row[6]; | 185 | | | 186 | 55.3M | MAC(b0, W5, row[5]); | 187 | 55.3M | MAC(b0, W7, row[7]); | 188 | | | 189 | 55.3M | MAC(b1, -W1, row[5]); | 190 | 55.3M | MAC(b1, -W5, row[7]); | 191 | | | 192 | 55.3M | MAC(b2, W7, row[5]); | 193 | 55.3M | MAC(b2, W3, row[7]); | 194 | | | 195 | 55.3M | MAC(b3, W3, row[5]); | 196 | 55.3M | MAC(b3, -W1, row[7]); | 197 | 55.3M | } | 198 | | | 199 | 118M | row[0] = (int)(a0 + b0) >> (ROW_SHIFT + extra_shift); | 200 | 118M | row[7] = (int)(a0 - b0) >> (ROW_SHIFT + extra_shift); | 201 | 118M | row[1] = (int)(a1 + b1) >> (ROW_SHIFT + extra_shift); | 202 | 118M | row[6] = (int)(a1 - b1) >> (ROW_SHIFT + extra_shift); | 203 | 118M | row[2] = (int)(a2 + b2) >> (ROW_SHIFT + extra_shift); | 204 | 118M | row[5] = (int)(a2 - b2) >> (ROW_SHIFT + extra_shift); | 205 | 118M | row[3] = (int)(a3 + b3) >> (ROW_SHIFT + extra_shift); | 206 | 118M | row[4] = (int)(a3 - b3) >> (ROW_SHIFT + extra_shift); | 207 | 118M | } |
simple_idct.c:idctRowCondDC_int16_10bit Line | Count | Source | 116 | 11.0M | { | 117 | 11.0M | SUINT a0, a1, a2, a3, b0, b1, b2, b3; | 118 | | | 119 | | // TODO: Add DC-only support for int32_t input | 120 | 11.0M | #if IN_IDCT_DEPTH == 16 | 121 | 11.0M | #if HAVE_FAST_64BIT | 122 | 11.0M | #define ROW0_MASK (0xffffULL << 48 * HAVE_BIGENDIAN) | 123 | 11.0M | if (((AV_RN64A(row) & ~ROW0_MASK) | AV_RN64A(row+4)) == 0) { | 124 | 10.0M | uint64_t temp; | 125 | 10.0M | if (DC_SHIFT - extra_shift >= 0) { | 126 | 10.0M | temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff; | 127 | 10.0M | } else { | 128 | 0 | temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff; | 129 | 0 | } | 130 | 10.0M | temp += temp * (1 << 16); | 131 | 10.0M | temp += temp * ((uint64_t) 1 << 32); | 132 | 10.0M | AV_WN64A(row, temp); | 133 | 10.0M | AV_WN64A(row + 4, temp); | 134 | 10.0M | return; | 135 | 10.0M | } | 136 | | #else | 137 | | if (!(AV_RN32A(row+2) | | 138 | | AV_RN32A(row+4) | | 139 | | AV_RN32A(row+6) | | 140 | | row[1])) { | 141 | | uint32_t temp; | 142 | | if (DC_SHIFT - extra_shift >= 0) { | 143 | | temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff; | 144 | | } else { | 145 | | temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff; | 146 | | } | 147 | | temp += temp * (1 << 16); | 148 | | AV_WN32A(row, temp); | 149 | | AV_WN32A(row+2, temp); | 150 | | AV_WN32A(row+4, temp); | 151 | | AV_WN32A(row+6, temp); | 152 | | return; | 153 | | } | 154 | | #endif | 155 | 966k | #endif | 156 | | | 157 | 966k | a0 = ((SUINT)W4 * row[0]) + (1 << (ROW_SHIFT + extra_shift - 1)); | 158 | 966k | a1 = a0; | 159 | 966k | a2 = a0; | 160 | 966k | a3 = a0; | 161 | | | 162 | 966k | a0 += (SUINT)W2 * row[2]; | 163 | 966k | a1 += (SUINT)W6 * row[2]; | 164 | 966k | a2 -= (SUINT)W6 * row[2]; | 165 | 966k | a3 -= (SUINT)W2 * row[2]; | 166 | | | 167 | 966k | b0 = MUL(W1, row[1]); | 168 | 966k | MAC(b0, W3, row[3]); | 169 | 966k | b1 = MUL(W3, row[1]); | 170 | 966k | MAC(b1, -W7, row[3]); | 171 | 966k | b2 = MUL(W5, row[1]); | 172 | 966k | MAC(b2, -W1, row[3]); | 173 | 966k | b3 = MUL(W7, row[1]); | 174 | 966k | MAC(b3, -W5, row[3]); | 175 | | | 176 | | #if IN_IDCT_DEPTH == 32 | 177 | | if (AV_RN64A(row + 4) | AV_RN64A(row + 6)) { | 178 | | #else | 179 | 966k | if (AV_RN64A(row + 4)) { | 180 | 333k | #endif | 181 | 333k | a0 += (SUINT) W4*row[4] + (SUINT)W6*row[6]; | 182 | 333k | a1 += (SUINT)- W4*row[4] - (SUINT)W2*row[6]; | 183 | 333k | a2 += (SUINT)- W4*row[4] + (SUINT)W2*row[6]; | 184 | 333k | a3 += (SUINT) W4*row[4] - (SUINT)W6*row[6]; | 185 | | | 186 | 333k | MAC(b0, W5, row[5]); | 187 | 333k | MAC(b0, W7, row[7]); | 188 | | | 189 | 333k | MAC(b1, -W1, row[5]); | 190 | 333k | MAC(b1, -W5, row[7]); | 191 | | | 192 | 333k | MAC(b2, W7, row[5]); | 193 | 333k | MAC(b2, W3, row[7]); | 194 | | | 195 | 333k | MAC(b3, W3, row[5]); | 196 | 333k | MAC(b3, -W1, row[7]); | 197 | 333k | } | 198 | | | 199 | 966k | row[0] = (int)(a0 + b0) >> (ROW_SHIFT + extra_shift); | 200 | 966k | row[7] = (int)(a0 - b0) >> (ROW_SHIFT + extra_shift); | 201 | 966k | row[1] = (int)(a1 + b1) >> (ROW_SHIFT + extra_shift); | 202 | 966k | row[6] = (int)(a1 - b1) >> (ROW_SHIFT + extra_shift); | 203 | 966k | row[2] = (int)(a2 + b2) >> (ROW_SHIFT + extra_shift); | 204 | 966k | row[5] = (int)(a2 - b2) >> (ROW_SHIFT + extra_shift); | 205 | 966k | row[3] = (int)(a3 + b3) >> (ROW_SHIFT + extra_shift); | 206 | 966k | row[4] = (int)(a3 - b3) >> (ROW_SHIFT + extra_shift); | 207 | 966k | } |
simple_idct.c:idctRowCondDC_int16_12bit Line | Count | Source | 116 | 5.01M | { | 117 | 5.01M | SUINT a0, a1, a2, a3, b0, b1, b2, b3; | 118 | | | 119 | | // TODO: Add DC-only support for int32_t input | 120 | 5.01M | #if IN_IDCT_DEPTH == 16 | 121 | 5.01M | #if HAVE_FAST_64BIT | 122 | 5.01M | #define ROW0_MASK (0xffffULL << 48 * HAVE_BIGENDIAN) | 123 | 5.01M | if (((AV_RN64A(row) & ~ROW0_MASK) | AV_RN64A(row+4)) == 0) { | 124 | 4.74M | uint64_t temp; | 125 | 4.74M | if (DC_SHIFT - extra_shift >= 0) { | 126 | 0 | temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff; | 127 | 4.74M | } else { | 128 | 4.74M | temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff; | 129 | 4.74M | } | 130 | 4.74M | temp += temp * (1 << 16); | 131 | 4.74M | temp += temp * ((uint64_t) 1 << 32); | 132 | 4.74M | AV_WN64A(row, temp); | 133 | 4.74M | AV_WN64A(row + 4, temp); | 134 | 4.74M | return; | 135 | 4.74M | } | 136 | | #else | 137 | | if (!(AV_RN32A(row+2) | | 138 | | AV_RN32A(row+4) | | 139 | | AV_RN32A(row+6) | | 140 | | row[1])) { | 141 | | uint32_t temp; | 142 | | if (DC_SHIFT - extra_shift >= 0) { | 143 | | temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff; | 144 | | } else { | 145 | | temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff; | 146 | | } | 147 | | temp += temp * (1 << 16); | 148 | | AV_WN32A(row, temp); | 149 | | AV_WN32A(row+2, temp); | 150 | | AV_WN32A(row+4, temp); | 151 | | AV_WN32A(row+6, temp); | 152 | | return; | 153 | | } | 154 | | #endif | 155 | 275k | #endif | 156 | | | 157 | 275k | a0 = ((SUINT)W4 * row[0]) + (1 << (ROW_SHIFT + extra_shift - 1)); | 158 | 275k | a1 = a0; | 159 | 275k | a2 = a0; | 160 | 275k | a3 = a0; | 161 | | | 162 | 275k | a0 += (SUINT)W2 * row[2]; | 163 | 275k | a1 += (SUINT)W6 * row[2]; | 164 | 275k | a2 -= (SUINT)W6 * row[2]; | 165 | 275k | a3 -= (SUINT)W2 * row[2]; | 166 | | | 167 | 275k | b0 = MUL(W1, row[1]); | 168 | 275k | MAC(b0, W3, row[3]); | 169 | 275k | b1 = MUL(W3, row[1]); | 170 | 275k | MAC(b1, -W7, row[3]); | 171 | 275k | b2 = MUL(W5, row[1]); | 172 | 275k | MAC(b2, -W1, row[3]); | 173 | 275k | b3 = MUL(W7, row[1]); | 174 | 275k | MAC(b3, -W5, row[3]); | 175 | | | 176 | | #if IN_IDCT_DEPTH == 32 | 177 | | if (AV_RN64A(row + 4) | AV_RN64A(row + 6)) { | 178 | | #else | 179 | 275k | if (AV_RN64A(row + 4)) { | 180 | 207k | #endif | 181 | 207k | a0 += (SUINT) W4*row[4] + (SUINT)W6*row[6]; | 182 | 207k | a1 += (SUINT)- W4*row[4] - (SUINT)W2*row[6]; | 183 | 207k | a2 += (SUINT)- W4*row[4] + (SUINT)W2*row[6]; | 184 | 207k | a3 += (SUINT) W4*row[4] - (SUINT)W6*row[6]; | 185 | | | 186 | 207k | MAC(b0, W5, row[5]); | 187 | 207k | MAC(b0, W7, row[7]); | 188 | | | 189 | 207k | MAC(b1, -W1, row[5]); | 190 | 207k | MAC(b1, -W5, row[7]); | 191 | | | 192 | 207k | MAC(b2, W7, row[5]); | 193 | 207k | MAC(b2, W3, row[7]); | 194 | | | 195 | 207k | MAC(b3, W3, row[5]); | 196 | 207k | MAC(b3, -W1, row[7]); | 197 | 207k | } | 198 | | | 199 | 275k | row[0] = (int)(a0 + b0) >> (ROW_SHIFT + extra_shift); | 200 | 275k | row[7] = (int)(a0 - b0) >> (ROW_SHIFT + extra_shift); | 201 | 275k | row[1] = (int)(a1 + b1) >> (ROW_SHIFT + extra_shift); | 202 | 275k | row[6] = (int)(a1 - b1) >> (ROW_SHIFT + extra_shift); | 203 | 275k | row[2] = (int)(a2 + b2) >> (ROW_SHIFT + extra_shift); | 204 | 275k | row[5] = (int)(a2 - b2) >> (ROW_SHIFT + extra_shift); | 205 | 275k | row[3] = (int)(a3 + b3) >> (ROW_SHIFT + extra_shift); | 206 | 275k | row[4] = (int)(a3 - b3) >> (ROW_SHIFT + extra_shift); | 207 | 275k | } |
simple_idct.c:idctRowCondDC_int32_10bit Line | Count | Source | 116 | 914k | { | 117 | 914k | SUINT a0, a1, a2, a3, b0, b1, b2, b3; | 118 | | | 119 | | // TODO: Add DC-only support for int32_t input | 120 | | #if IN_IDCT_DEPTH == 16 | 121 | | #if HAVE_FAST_64BIT | 122 | | #define ROW0_MASK (0xffffULL << 48 * HAVE_BIGENDIAN) | 123 | | if (((AV_RN64A(row) & ~ROW0_MASK) | AV_RN64A(row+4)) == 0) { | 124 | | uint64_t temp; | 125 | | if (DC_SHIFT - extra_shift >= 0) { | 126 | | temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff; | 127 | | } else { | 128 | | temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff; | 129 | | } | 130 | | temp += temp * (1 << 16); | 131 | | temp += temp * ((uint64_t) 1 << 32); | 132 | | AV_WN64A(row, temp); | 133 | | AV_WN64A(row + 4, temp); | 134 | | return; | 135 | | } | 136 | | #else | 137 | | if (!(AV_RN32A(row+2) | | 138 | | AV_RN32A(row+4) | | 139 | | AV_RN32A(row+6) | | 140 | | row[1])) { | 141 | | uint32_t temp; | 142 | | if (DC_SHIFT - extra_shift >= 0) { | 143 | | temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff; | 144 | | } else { | 145 | | temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff; | 146 | | } | 147 | | temp += temp * (1 << 16); | 148 | | AV_WN32A(row, temp); | 149 | | AV_WN32A(row+2, temp); | 150 | | AV_WN32A(row+4, temp); | 151 | | AV_WN32A(row+6, temp); | 152 | | return; | 153 | | } | 154 | | #endif | 155 | | #endif | 156 | | | 157 | 914k | a0 = ((SUINT)W4 * row[0]) + (1 << (ROW_SHIFT + extra_shift - 1)); | 158 | 914k | a1 = a0; | 159 | 914k | a2 = a0; | 160 | 914k | a3 = a0; | 161 | | | 162 | 914k | a0 += (SUINT)W2 * row[2]; | 163 | 914k | a1 += (SUINT)W6 * row[2]; | 164 | 914k | a2 -= (SUINT)W6 * row[2]; | 165 | 914k | a3 -= (SUINT)W2 * row[2]; | 166 | | | 167 | 914k | b0 = MUL(W1, row[1]); | 168 | 914k | MAC(b0, W3, row[3]); | 169 | 914k | b1 = MUL(W3, row[1]); | 170 | 914k | MAC(b1, -W7, row[3]); | 171 | 914k | b2 = MUL(W5, row[1]); | 172 | 914k | MAC(b2, -W1, row[3]); | 173 | 914k | b3 = MUL(W7, row[1]); | 174 | 914k | MAC(b3, -W5, row[3]); | 175 | | | 176 | 914k | #if IN_IDCT_DEPTH == 32 | 177 | 914k | if (AV_RN64A(row + 4) | AV_RN64A(row + 6)) { | 178 | | #else | 179 | | if (AV_RN64A(row + 4)) { | 180 | | #endif | 181 | 98.3k | a0 += (SUINT) W4*row[4] + (SUINT)W6*row[6]; | 182 | 98.3k | a1 += (SUINT)- W4*row[4] - (SUINT)W2*row[6]; | 183 | 98.3k | a2 += (SUINT)- W4*row[4] + (SUINT)W2*row[6]; | 184 | 98.3k | a3 += (SUINT) W4*row[4] - (SUINT)W6*row[6]; | 185 | | | 186 | 98.3k | MAC(b0, W5, row[5]); | 187 | 98.3k | MAC(b0, W7, row[7]); | 188 | | | 189 | 98.3k | MAC(b1, -W1, row[5]); | 190 | 98.3k | MAC(b1, -W5, row[7]); | 191 | | | 192 | 98.3k | MAC(b2, W7, row[5]); | 193 | 98.3k | MAC(b2, W3, row[7]); | 194 | | | 195 | 98.3k | MAC(b3, W3, row[5]); | 196 | 98.3k | MAC(b3, -W1, row[7]); | 197 | 98.3k | } | 198 | | | 199 | 914k | row[0] = (int)(a0 + b0) >> (ROW_SHIFT + extra_shift); | 200 | 914k | row[7] = (int)(a0 - b0) >> (ROW_SHIFT + extra_shift); | 201 | 914k | row[1] = (int)(a1 + b1) >> (ROW_SHIFT + extra_shift); | 202 | 914k | row[6] = (int)(a1 - b1) >> (ROW_SHIFT + extra_shift); | 203 | 914k | row[2] = (int)(a2 + b2) >> (ROW_SHIFT + extra_shift); | 204 | 914k | row[5] = (int)(a2 - b2) >> (ROW_SHIFT + extra_shift); | 205 | 914k | row[3] = (int)(a3 + b3) >> (ROW_SHIFT + extra_shift); | 206 | 914k | row[4] = (int)(a3 - b3) >> (ROW_SHIFT + extra_shift); | 207 | 914k | } |
proresdsp.c:idctRowCondDC_extrashift_10 Line | Count | Source | 116 | 4.89M | { | 117 | 4.89M | SUINT a0, a1, a2, a3, b0, b1, b2, b3; | 118 | | | 119 | | // TODO: Add DC-only support for int32_t input | 120 | 4.89M | #if IN_IDCT_DEPTH == 16 | 121 | 4.89M | #if HAVE_FAST_64BIT | 122 | 4.89M | #define ROW0_MASK (0xffffULL << 48 * HAVE_BIGENDIAN) | 123 | 4.89M | if (((AV_RN64A(row) & ~ROW0_MASK) | AV_RN64A(row+4)) == 0) { | 124 | 4.65M | uint64_t temp; | 125 | 4.65M | if (DC_SHIFT - extra_shift >= 0) { | 126 | 0 | temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff; | 127 | 4.65M | } else { | 128 | 4.65M | temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff; | 129 | 4.65M | } | 130 | 4.65M | temp += temp * (1 << 16); | 131 | 4.65M | temp += temp * ((uint64_t) 1 << 32); | 132 | 4.65M | AV_WN64A(row, temp); | 133 | 4.65M | AV_WN64A(row + 4, temp); | 134 | 4.65M | return; | 135 | 4.65M | } | 136 | | #else | 137 | | if (!(AV_RN32A(row+2) | | 138 | | AV_RN32A(row+4) | | 139 | | AV_RN32A(row+6) | | 140 | | row[1])) { | 141 | | uint32_t temp; | 142 | | if (DC_SHIFT - extra_shift >= 0) { | 143 | | temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff; | 144 | | } else { | 145 | | temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff; | 146 | | } | 147 | | temp += temp * (1 << 16); | 148 | | AV_WN32A(row, temp); | 149 | | AV_WN32A(row+2, temp); | 150 | | AV_WN32A(row+4, temp); | 151 | | AV_WN32A(row+6, temp); | 152 | | return; | 153 | | } | 154 | | #endif | 155 | 238k | #endif | 156 | | | 157 | 238k | a0 = ((SUINT)W4 * row[0]) + (1 << (ROW_SHIFT + extra_shift - 1)); | 158 | 238k | a1 = a0; | 159 | 238k | a2 = a0; | 160 | 238k | a3 = a0; | 161 | | | 162 | 238k | a0 += (SUINT)W2 * row[2]; | 163 | 238k | a1 += (SUINT)W6 * row[2]; | 164 | 238k | a2 -= (SUINT)W6 * row[2]; | 165 | 238k | a3 -= (SUINT)W2 * row[2]; | 166 | | | 167 | 238k | b0 = MUL(W1, row[1]); | 168 | 238k | MAC(b0, W3, row[3]); | 169 | 238k | b1 = MUL(W3, row[1]); | 170 | 238k | MAC(b1, -W7, row[3]); | 171 | 238k | b2 = MUL(W5, row[1]); | 172 | 238k | MAC(b2, -W1, row[3]); | 173 | 238k | b3 = MUL(W7, row[1]); | 174 | 238k | MAC(b3, -W5, row[3]); | 175 | | | 176 | | #if IN_IDCT_DEPTH == 32 | 177 | | if (AV_RN64A(row + 4) | AV_RN64A(row + 6)) { | 178 | | #else | 179 | 238k | if (AV_RN64A(row + 4)) { | 180 | 97.1k | #endif | 181 | 97.1k | a0 += (SUINT) W4*row[4] + (SUINT)W6*row[6]; | 182 | 97.1k | a1 += (SUINT)- W4*row[4] - (SUINT)W2*row[6]; | 183 | 97.1k | a2 += (SUINT)- W4*row[4] + (SUINT)W2*row[6]; | 184 | 97.1k | a3 += (SUINT) W4*row[4] - (SUINT)W6*row[6]; | 185 | | | 186 | 97.1k | MAC(b0, W5, row[5]); | 187 | 97.1k | MAC(b0, W7, row[7]); | 188 | | | 189 | 97.1k | MAC(b1, -W1, row[5]); | 190 | 97.1k | MAC(b1, -W5, row[7]); | 191 | | | 192 | 97.1k | MAC(b2, W7, row[5]); | 193 | 97.1k | MAC(b2, W3, row[7]); | 194 | | | 195 | 97.1k | MAC(b3, W3, row[5]); | 196 | 97.1k | MAC(b3, -W1, row[7]); | 197 | 97.1k | } | 198 | | | 199 | 238k | row[0] = (int)(a0 + b0) >> (ROW_SHIFT + extra_shift); | 200 | 238k | row[7] = (int)(a0 - b0) >> (ROW_SHIFT + extra_shift); | 201 | 238k | row[1] = (int)(a1 + b1) >> (ROW_SHIFT + extra_shift); | 202 | 238k | row[6] = (int)(a1 - b1) >> (ROW_SHIFT + extra_shift); | 203 | 238k | row[2] = (int)(a2 + b2) >> (ROW_SHIFT + extra_shift); | 204 | 238k | row[5] = (int)(a2 - b2) >> (ROW_SHIFT + extra_shift); | 205 | 238k | row[3] = (int)(a3 + b3) >> (ROW_SHIFT + extra_shift); | 206 | 238k | row[4] = (int)(a3 - b3) >> (ROW_SHIFT + extra_shift); | 207 | 238k | } |
proresdsp.c:idctRowCondDC_int16_12bit Line | Count | Source | 116 | 886k | { | 117 | 886k | SUINT a0, a1, a2, a3, b0, b1, b2, b3; | 118 | | | 119 | | // TODO: Add DC-only support for int32_t input | 120 | 886k | #if IN_IDCT_DEPTH == 16 | 121 | 886k | #if HAVE_FAST_64BIT | 122 | 886k | #define ROW0_MASK (0xffffULL << 48 * HAVE_BIGENDIAN) | 123 | 886k | if (((AV_RN64A(row) & ~ROW0_MASK) | AV_RN64A(row+4)) == 0) { | 124 | 842k | uint64_t temp; | 125 | 842k | if (DC_SHIFT - extra_shift >= 0) { | 126 | 0 | temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff; | 127 | 842k | } else { | 128 | 842k | temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff; | 129 | 842k | } | 130 | 842k | temp += temp * (1 << 16); | 131 | 842k | temp += temp * ((uint64_t) 1 << 32); | 132 | 842k | AV_WN64A(row, temp); | 133 | 842k | AV_WN64A(row + 4, temp); | 134 | 842k | return; | 135 | 842k | } | 136 | | #else | 137 | | if (!(AV_RN32A(row+2) | | 138 | | AV_RN32A(row+4) | | 139 | | AV_RN32A(row+6) | | 140 | | row[1])) { | 141 | | uint32_t temp; | 142 | | if (DC_SHIFT - extra_shift >= 0) { | 143 | | temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff; | 144 | | } else { | 145 | | temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff; | 146 | | } | 147 | | temp += temp * (1 << 16); | 148 | | AV_WN32A(row, temp); | 149 | | AV_WN32A(row+2, temp); | 150 | | AV_WN32A(row+4, temp); | 151 | | AV_WN32A(row+6, temp); | 152 | | return; | 153 | | } | 154 | | #endif | 155 | 43.3k | #endif | 156 | | | 157 | 43.3k | a0 = ((SUINT)W4 * row[0]) + (1 << (ROW_SHIFT + extra_shift - 1)); | 158 | 43.3k | a1 = a0; | 159 | 43.3k | a2 = a0; | 160 | 43.3k | a3 = a0; | 161 | | | 162 | 43.3k | a0 += (SUINT)W2 * row[2]; | 163 | 43.3k | a1 += (SUINT)W6 * row[2]; | 164 | 43.3k | a2 -= (SUINT)W6 * row[2]; | 165 | 43.3k | a3 -= (SUINT)W2 * row[2]; | 166 | | | 167 | 43.3k | b0 = MUL(W1, row[1]); | 168 | 43.3k | MAC(b0, W3, row[3]); | 169 | 43.3k | b1 = MUL(W3, row[1]); | 170 | 43.3k | MAC(b1, -W7, row[3]); | 171 | 43.3k | b2 = MUL(W5, row[1]); | 172 | 43.3k | MAC(b2, -W1, row[3]); | 173 | 43.3k | b3 = MUL(W7, row[1]); | 174 | 43.3k | MAC(b3, -W5, row[3]); | 175 | | | 176 | | #if IN_IDCT_DEPTH == 32 | 177 | | if (AV_RN64A(row + 4) | AV_RN64A(row + 6)) { | 178 | | #else | 179 | 43.3k | if (AV_RN64A(row + 4)) { | 180 | 21.3k | #endif | 181 | 21.3k | a0 += (SUINT) W4*row[4] + (SUINT)W6*row[6]; | 182 | 21.3k | a1 += (SUINT)- W4*row[4] - (SUINT)W2*row[6]; | 183 | 21.3k | a2 += (SUINT)- W4*row[4] + (SUINT)W2*row[6]; | 184 | 21.3k | a3 += (SUINT) W4*row[4] - (SUINT)W6*row[6]; | 185 | | | 186 | 21.3k | MAC(b0, W5, row[5]); | 187 | 21.3k | MAC(b0, W7, row[7]); | 188 | | | 189 | 21.3k | MAC(b1, -W1, row[5]); | 190 | 21.3k | MAC(b1, -W5, row[7]); | 191 | | | 192 | 21.3k | MAC(b2, W7, row[5]); | 193 | 21.3k | MAC(b2, W3, row[7]); | 194 | | | 195 | 21.3k | MAC(b3, W3, row[5]); | 196 | 21.3k | MAC(b3, -W1, row[7]); | 197 | 21.3k | } | 198 | | | 199 | 43.3k | row[0] = (int)(a0 + b0) >> (ROW_SHIFT + extra_shift); | 200 | 43.3k | row[7] = (int)(a0 - b0) >> (ROW_SHIFT + extra_shift); | 201 | 43.3k | row[1] = (int)(a1 + b1) >> (ROW_SHIFT + extra_shift); | 202 | 43.3k | row[6] = (int)(a1 - b1) >> (ROW_SHIFT + extra_shift); | 203 | 43.3k | row[2] = (int)(a2 + b2) >> (ROW_SHIFT + extra_shift); | 204 | 43.3k | row[5] = (int)(a2 - b2) >> (ROW_SHIFT + extra_shift); | 205 | 43.3k | row[3] = (int)(a3 + b3) >> (ROW_SHIFT + extra_shift); | 206 | 43.3k | row[4] = (int)(a3 - b3) >> (ROW_SHIFT + extra_shift); | 207 | 43.3k | } |
|
208 | | |
209 | 1.54G | #define IDCT_COLS do { \ |
210 | 1.54G | a0 = (SUINT)W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4)); \ |
211 | 1.54G | a1 = a0; \ |
212 | 1.54G | a2 = a0; \ |
213 | 1.54G | a3 = a0; \ |
214 | 1.54G | \ |
215 | 1.54G | a0 += (SUINT) W2*col[8*2]; \ |
216 | 1.54G | a1 += (SUINT) W6*col[8*2]; \ |
217 | 1.54G | a2 += (SUINT)-W6*col[8*2]; \ |
218 | 1.54G | a3 += (SUINT)-W2*col[8*2]; \ |
219 | 1.54G | \ |
220 | 1.54G | b0 = MUL(W1, col[8*1]); \ |
221 | 1.54G | b1 = MUL(W3, col[8*1]); \ |
222 | 1.54G | b2 = MUL(W5, col[8*1]); \ |
223 | 1.54G | b3 = MUL(W7, col[8*1]); \ |
224 | 1.54G | \ |
225 | 1.54G | MAC(b0, W3, col[8*3]); \ |
226 | 1.54G | MAC(b1, -W7, col[8*3]); \ |
227 | 1.54G | MAC(b2, -W1, col[8*3]); \ |
228 | 1.54G | MAC(b3, -W5, col[8*3]); \ |
229 | 1.54G | \ |
230 | 1.54G | if (col[8*4]) { \ |
231 | 125M | a0 += (SUINT) W4*col[8*4]; \ |
232 | 125M | a1 += (SUINT)-W4*col[8*4]; \ |
233 | 125M | a2 += (SUINT)-W4*col[8*4]; \ |
234 | 125M | a3 += (SUINT) W4*col[8*4]; \ |
235 | 125M | } \ |
236 | 1.54G | \ |
237 | 1.54G | if (col[8*5]) { \ |
238 | 93.2M | MAC(b0, W5, col[8*5]); \ |
239 | 93.2M | MAC(b1, -W1, col[8*5]); \ |
240 | 93.2M | MAC(b2, W7, col[8*5]); \ |
241 | 93.2M | MAC(b3, W3, col[8*5]); \ |
242 | 93.2M | } \ |
243 | 1.54G | \ |
244 | 1.54G | if (col[8*6]) { \ |
245 | 83.0M | a0 += (SUINT) W6*col[8*6]; \ |
246 | 83.0M | a1 += (SUINT)-W2*col[8*6]; \ |
247 | 83.0M | a2 += (SUINT) W2*col[8*6]; \ |
248 | 83.0M | a3 += (SUINT)-W6*col[8*6]; \ |
249 | 83.0M | } \ |
250 | 1.54G | \ |
251 | 1.54G | if (col[8*7]) { \ |
252 | 60.6M | MAC(b0, W7, col[8*7]); \ |
253 | 60.6M | MAC(b1, -W5, col[8*7]); \ |
254 | 60.6M | MAC(b2, W3, col[8*7]); \ |
255 | 60.6M | MAC(b3, -W1, col[8*7]); \ |
256 | 60.6M | } \ |
257 | 1.54G | } while (0) |
258 | | |
259 | | #ifdef EXTRA_SHIFT |
260 | | static inline void FUNC(idctSparseCol_extrashift)(int16_t *col) |
261 | | #else |
262 | | static inline void FUNC6(idctSparseCol)(idctin *col) |
263 | | #endif |
264 | 200M | { |
265 | 200M | unsigned a0, a1, a2, a3, b0, b1, b2, b3; |
266 | | |
267 | 200M | IDCT_COLS; |
268 | | |
269 | 200M | col[0 ] = ((int)(a0 + b0) >> COL_SHIFT); |
270 | 200M | col[8 ] = ((int)(a1 + b1) >> COL_SHIFT); |
271 | 200M | col[16] = ((int)(a2 + b2) >> COL_SHIFT); |
272 | 200M | col[24] = ((int)(a3 + b3) >> COL_SHIFT); |
273 | 200M | col[32] = ((int)(a3 - b3) >> COL_SHIFT); |
274 | 200M | col[40] = ((int)(a2 - b2) >> COL_SHIFT); |
275 | 200M | col[48] = ((int)(a1 - b1) >> COL_SHIFT); |
276 | 200M | col[56] = ((int)(a0 - b0) >> COL_SHIFT); |
277 | 200M | } simple_idct.c:idctSparseCol_int16_8bit Line | Count | Source | 264 | 194M | { | 265 | 194M | unsigned a0, a1, a2, a3, b0, b1, b2, b3; | 266 | | | 267 | 194M | IDCT_COLS; | 268 | | | 269 | 194M | col[0 ] = ((int)(a0 + b0) >> COL_SHIFT); | 270 | 194M | col[8 ] = ((int)(a1 + b1) >> COL_SHIFT); | 271 | 194M | col[16] = ((int)(a2 + b2) >> COL_SHIFT); | 272 | 194M | col[24] = ((int)(a3 + b3) >> COL_SHIFT); | 273 | 194M | col[32] = ((int)(a3 - b3) >> COL_SHIFT); | 274 | 194M | col[40] = ((int)(a2 - b2) >> COL_SHIFT); | 275 | 194M | col[48] = ((int)(a1 - b1) >> COL_SHIFT); | 276 | 194M | col[56] = ((int)(a0 - b0) >> COL_SHIFT); | 277 | 194M | } |
Unexecuted instantiation: simple_idct.c:idctSparseCol_int16_10bit Unexecuted instantiation: simple_idct.c:idctSparseCol_int16_12bit Unexecuted instantiation: simple_idct.c:idctSparseCol_int32_10bit proresdsp.c:idctSparseCol_extrashift_10 Line | Count | Source | 264 | 4.89M | { | 265 | 4.89M | unsigned a0, a1, a2, a3, b0, b1, b2, b3; | 266 | | | 267 | 4.89M | IDCT_COLS; | 268 | | | 269 | 4.89M | col[0 ] = ((int)(a0 + b0) >> COL_SHIFT); | 270 | 4.89M | col[8 ] = ((int)(a1 + b1) >> COL_SHIFT); | 271 | 4.89M | col[16] = ((int)(a2 + b2) >> COL_SHIFT); | 272 | 4.89M | col[24] = ((int)(a3 + b3) >> COL_SHIFT); | 273 | 4.89M | col[32] = ((int)(a3 - b3) >> COL_SHIFT); | 274 | 4.89M | col[40] = ((int)(a2 - b2) >> COL_SHIFT); | 275 | 4.89M | col[48] = ((int)(a1 - b1) >> COL_SHIFT); | 276 | 4.89M | col[56] = ((int)(a0 - b0) >> COL_SHIFT); | 277 | 4.89M | } |
proresdsp.c:idctSparseCol_int16_12bit Line | Count | Source | 264 | 886k | { | 265 | 886k | unsigned a0, a1, a2, a3, b0, b1, b2, b3; | 266 | | | 267 | 886k | IDCT_COLS; | 268 | | | 269 | 886k | col[0 ] = ((int)(a0 + b0) >> COL_SHIFT); | 270 | 886k | col[8 ] = ((int)(a1 + b1) >> COL_SHIFT); | 271 | 886k | col[16] = ((int)(a2 + b2) >> COL_SHIFT); | 272 | 886k | col[24] = ((int)(a3 + b3) >> COL_SHIFT); | 273 | 886k | col[32] = ((int)(a3 - b3) >> COL_SHIFT); | 274 | 886k | col[40] = ((int)(a2 - b2) >> COL_SHIFT); | 275 | 886k | col[48] = ((int)(a1 - b1) >> COL_SHIFT); | 276 | 886k | col[56] = ((int)(a0 - b0) >> COL_SHIFT); | 277 | 886k | } |
|
278 | | |
279 | | #ifndef PRORES_ONLY |
280 | | #ifndef EXTRA_SHIFT |
281 | | static inline void FUNC6(idctSparseColPut)(pixel *dest, ptrdiff_t line_size, |
282 | | idctin *col) |
283 | 1.25G | { |
284 | 1.25G | SUINT a0, a1, a2, a3, b0, b1, b2, b3; |
285 | | |
286 | 1.25G | IDCT_COLS; |
287 | | |
288 | 1.25G | dest[0] = av_clip_pixel((int)(a0 + b0) >> COL_SHIFT); |
289 | 1.25G | dest += line_size; |
290 | 1.25G | dest[0] = av_clip_pixel((int)(a1 + b1) >> COL_SHIFT); |
291 | 1.25G | dest += line_size; |
292 | 1.25G | dest[0] = av_clip_pixel((int)(a2 + b2) >> COL_SHIFT); |
293 | 1.25G | dest += line_size; |
294 | 1.25G | dest[0] = av_clip_pixel((int)(a3 + b3) >> COL_SHIFT); |
295 | 1.25G | dest += line_size; |
296 | 1.25G | dest[0] = av_clip_pixel((int)(a3 - b3) >> COL_SHIFT); |
297 | 1.25G | dest += line_size; |
298 | 1.25G | dest[0] = av_clip_pixel((int)(a2 - b2) >> COL_SHIFT); |
299 | 1.25G | dest += line_size; |
300 | 1.25G | dest[0] = av_clip_pixel((int)(a1 - b1) >> COL_SHIFT); |
301 | 1.25G | dest += line_size; |
302 | 1.25G | dest[0] = av_clip_pixel((int)(a0 - b0) >> COL_SHIFT); |
303 | 1.25G | } simple_idct.c:idctSparseColPut_int16_8bit Line | Count | Source | 283 | 1.23G | { | 284 | 1.23G | SUINT a0, a1, a2, a3, b0, b1, b2, b3; | 285 | | | 286 | 1.23G | IDCT_COLS; | 287 | | | 288 | 1.23G | dest[0] = av_clip_pixel((int)(a0 + b0) >> COL_SHIFT); | 289 | 1.23G | dest += line_size; | 290 | 1.23G | dest[0] = av_clip_pixel((int)(a1 + b1) >> COL_SHIFT); | 291 | 1.23G | dest += line_size; | 292 | 1.23G | dest[0] = av_clip_pixel((int)(a2 + b2) >> COL_SHIFT); | 293 | 1.23G | dest += line_size; | 294 | 1.23G | dest[0] = av_clip_pixel((int)(a3 + b3) >> COL_SHIFT); | 295 | 1.23G | dest += line_size; | 296 | 1.23G | dest[0] = av_clip_pixel((int)(a3 - b3) >> COL_SHIFT); | 297 | 1.23G | dest += line_size; | 298 | 1.23G | dest[0] = av_clip_pixel((int)(a2 - b2) >> COL_SHIFT); | 299 | 1.23G | dest += line_size; | 300 | 1.23G | dest[0] = av_clip_pixel((int)(a1 - b1) >> COL_SHIFT); | 301 | 1.23G | dest += line_size; | 302 | 1.23G | dest[0] = av_clip_pixel((int)(a0 - b0) >> COL_SHIFT); | 303 | 1.23G | } |
simple_idct.c:idctSparseColPut_int16_10bit Line | Count | Source | 283 | 11.0M | { | 284 | 11.0M | SUINT a0, a1, a2, a3, b0, b1, b2, b3; | 285 | | | 286 | 11.0M | IDCT_COLS; | 287 | | | 288 | 11.0M | dest[0] = av_clip_pixel((int)(a0 + b0) >> COL_SHIFT); | 289 | 11.0M | dest += line_size; | 290 | 11.0M | dest[0] = av_clip_pixel((int)(a1 + b1) >> COL_SHIFT); | 291 | 11.0M | dest += line_size; | 292 | 11.0M | dest[0] = av_clip_pixel((int)(a2 + b2) >> COL_SHIFT); | 293 | 11.0M | dest += line_size; | 294 | 11.0M | dest[0] = av_clip_pixel((int)(a3 + b3) >> COL_SHIFT); | 295 | 11.0M | dest += line_size; | 296 | 11.0M | dest[0] = av_clip_pixel((int)(a3 - b3) >> COL_SHIFT); | 297 | 11.0M | dest += line_size; | 298 | 11.0M | dest[0] = av_clip_pixel((int)(a2 - b2) >> COL_SHIFT); | 299 | 11.0M | dest += line_size; | 300 | 11.0M | dest[0] = av_clip_pixel((int)(a1 - b1) >> COL_SHIFT); | 301 | 11.0M | dest += line_size; | 302 | 11.0M | dest[0] = av_clip_pixel((int)(a0 - b0) >> COL_SHIFT); | 303 | 11.0M | } |
simple_idct.c:idctSparseColPut_int16_12bit Line | Count | Source | 283 | 5.01M | { | 284 | 5.01M | SUINT a0, a1, a2, a3, b0, b1, b2, b3; | 285 | | | 286 | 5.01M | IDCT_COLS; | 287 | | | 288 | 5.01M | dest[0] = av_clip_pixel((int)(a0 + b0) >> COL_SHIFT); | 289 | 5.01M | dest += line_size; | 290 | 5.01M | dest[0] = av_clip_pixel((int)(a1 + b1) >> COL_SHIFT); | 291 | 5.01M | dest += line_size; | 292 | 5.01M | dest[0] = av_clip_pixel((int)(a2 + b2) >> COL_SHIFT); | 293 | 5.01M | dest += line_size; | 294 | 5.01M | dest[0] = av_clip_pixel((int)(a3 + b3) >> COL_SHIFT); | 295 | 5.01M | dest += line_size; | 296 | 5.01M | dest[0] = av_clip_pixel((int)(a3 - b3) >> COL_SHIFT); | 297 | 5.01M | dest += line_size; | 298 | 5.01M | dest[0] = av_clip_pixel((int)(a2 - b2) >> COL_SHIFT); | 299 | 5.01M | dest += line_size; | 300 | 5.01M | dest[0] = av_clip_pixel((int)(a1 - b1) >> COL_SHIFT); | 301 | 5.01M | dest += line_size; | 302 | 5.01M | dest[0] = av_clip_pixel((int)(a0 - b0) >> COL_SHIFT); | 303 | 5.01M | } |
simple_idct.c:idctSparseColPut_int32_10bit Line | Count | Source | 283 | 914k | { | 284 | 914k | SUINT a0, a1, a2, a3, b0, b1, b2, b3; | 285 | | | 286 | 914k | IDCT_COLS; | 287 | | | 288 | 914k | dest[0] = av_clip_pixel((int)(a0 + b0) >> COL_SHIFT); | 289 | 914k | dest += line_size; | 290 | 914k | dest[0] = av_clip_pixel((int)(a1 + b1) >> COL_SHIFT); | 291 | 914k | dest += line_size; | 292 | 914k | dest[0] = av_clip_pixel((int)(a2 + b2) >> COL_SHIFT); | 293 | 914k | dest += line_size; | 294 | 914k | dest[0] = av_clip_pixel((int)(a3 + b3) >> COL_SHIFT); | 295 | 914k | dest += line_size; | 296 | 914k | dest[0] = av_clip_pixel((int)(a3 - b3) >> COL_SHIFT); | 297 | 914k | dest += line_size; | 298 | 914k | dest[0] = av_clip_pixel((int)(a2 - b2) >> COL_SHIFT); | 299 | 914k | dest += line_size; | 300 | 914k | dest[0] = av_clip_pixel((int)(a1 - b1) >> COL_SHIFT); | 301 | 914k | dest += line_size; | 302 | 914k | dest[0] = av_clip_pixel((int)(a0 - b0) >> COL_SHIFT); | 303 | 914k | } |
|
304 | | |
305 | | static inline void FUNC6(idctSparseColAdd)(pixel *dest, ptrdiff_t line_size, |
306 | | idctin *col) |
307 | 96.4M | { |
308 | 96.4M | unsigned a0, a1, a2, a3, b0, b1, b2, b3; |
309 | | |
310 | 96.4M | IDCT_COLS; |
311 | | |
312 | 96.4M | dest[0] = av_clip_pixel(dest[0] + ((int)(a0 + b0) >> COL_SHIFT)); |
313 | 96.4M | dest += line_size; |
314 | 96.4M | dest[0] = av_clip_pixel(dest[0] + ((int)(a1 + b1) >> COL_SHIFT)); |
315 | 96.4M | dest += line_size; |
316 | 96.4M | dest[0] = av_clip_pixel(dest[0] + ((int)(a2 + b2) >> COL_SHIFT)); |
317 | 96.4M | dest += line_size; |
318 | 96.4M | dest[0] = av_clip_pixel(dest[0] + ((int)(a3 + b3) >> COL_SHIFT)); |
319 | 96.4M | dest += line_size; |
320 | 96.4M | dest[0] = av_clip_pixel(dest[0] + ((int)(a3 - b3) >> COL_SHIFT)); |
321 | 96.4M | dest += line_size; |
322 | 96.4M | dest[0] = av_clip_pixel(dest[0] + ((int)(a2 - b2) >> COL_SHIFT)); |
323 | 96.4M | dest += line_size; |
324 | 96.4M | dest[0] = av_clip_pixel(dest[0] + ((int)(a1 - b1) >> COL_SHIFT)); |
325 | 96.4M | dest += line_size; |
326 | 96.4M | dest[0] = av_clip_pixel(dest[0] + ((int)(a0 - b0) >> COL_SHIFT)); |
327 | 96.4M | } simple_idct.c:idctSparseColAdd_int16_8bit Line | Count | Source | 307 | 96.4M | { | 308 | 96.4M | unsigned a0, a1, a2, a3, b0, b1, b2, b3; | 309 | | | 310 | 96.4M | IDCT_COLS; | 311 | | | 312 | 96.4M | dest[0] = av_clip_pixel(dest[0] + ((int)(a0 + b0) >> COL_SHIFT)); | 313 | 96.4M | dest += line_size; | 314 | 96.4M | dest[0] = av_clip_pixel(dest[0] + ((int)(a1 + b1) >> COL_SHIFT)); | 315 | 96.4M | dest += line_size; | 316 | 96.4M | dest[0] = av_clip_pixel(dest[0] + ((int)(a2 + b2) >> COL_SHIFT)); | 317 | 96.4M | dest += line_size; | 318 | 96.4M | dest[0] = av_clip_pixel(dest[0] + ((int)(a3 + b3) >> COL_SHIFT)); | 319 | 96.4M | dest += line_size; | 320 | 96.4M | dest[0] = av_clip_pixel(dest[0] + ((int)(a3 - b3) >> COL_SHIFT)); | 321 | 96.4M | dest += line_size; | 322 | 96.4M | dest[0] = av_clip_pixel(dest[0] + ((int)(a2 - b2) >> COL_SHIFT)); | 323 | 96.4M | dest += line_size; | 324 | 96.4M | dest[0] = av_clip_pixel(dest[0] + ((int)(a1 - b1) >> COL_SHIFT)); | 325 | 96.4M | dest += line_size; | 326 | 96.4M | dest[0] = av_clip_pixel(dest[0] + ((int)(a0 - b0) >> COL_SHIFT)); | 327 | 96.4M | } |
Unexecuted instantiation: simple_idct.c:idctSparseColAdd_int16_10bit Unexecuted instantiation: simple_idct.c:idctSparseColAdd_int16_12bit Unexecuted instantiation: simple_idct.c:idctSparseColAdd_int32_10bit |
328 | | |
329 | | void FUNC6(ff_simple_idct_put)(uint8_t *dest_, ptrdiff_t line_size, int16_t *block_) |
330 | 156M | { |
331 | 156M | idctin *block = (idctin *)block_; |
332 | 156M | pixel *dest = (pixel *)dest_; |
333 | 156M | int i; |
334 | | |
335 | 156M | line_size /= sizeof(pixel); |
336 | | |
337 | 1.40G | for (i = 0; i < 8; i++) |
338 | 1.25G | FUNC6(idctRowCondDC)(block + i*8, 0); |
339 | | |
340 | 1.40G | for (i = 0; i < 8; i++) |
341 | 1.25G | FUNC6(idctSparseColPut)(dest + i, line_size, block + i); |
342 | 156M | } ff_simple_idct_put_int16_8bit Line | Count | Source | 330 | 154M | { | 331 | 154M | idctin *block = (idctin *)block_; | 332 | 154M | pixel *dest = (pixel *)dest_; | 333 | 154M | int i; | 334 | | | 335 | 154M | line_size /= sizeof(pixel); | 336 | | | 337 | 1.38G | for (i = 0; i < 8; i++) | 338 | 1.23G | FUNC6(idctRowCondDC)(block + i*8, 0); | 339 | | | 340 | 1.38G | for (i = 0; i < 8; i++) | 341 | 1.23G | FUNC6(idctSparseColPut)(dest + i, line_size, block + i); | 342 | 154M | } |
ff_simple_idct_put_int16_10bit Line | Count | Source | 330 | 1.37M | { | 331 | 1.37M | idctin *block = (idctin *)block_; | 332 | 1.37M | pixel *dest = (pixel *)dest_; | 333 | 1.37M | int i; | 334 | | | 335 | 1.37M | line_size /= sizeof(pixel); | 336 | | | 337 | 12.3M | for (i = 0; i < 8; i++) | 338 | 11.0M | FUNC6(idctRowCondDC)(block + i*8, 0); | 339 | | | 340 | 12.3M | for (i = 0; i < 8; i++) | 341 | 11.0M | FUNC6(idctSparseColPut)(dest + i, line_size, block + i); | 342 | 1.37M | } |
ff_simple_idct_put_int16_12bit Line | Count | Source | 330 | 626k | { | 331 | 626k | idctin *block = (idctin *)block_; | 332 | 626k | pixel *dest = (pixel *)dest_; | 333 | 626k | int i; | 334 | | | 335 | 626k | line_size /= sizeof(pixel); | 336 | | | 337 | 5.64M | for (i = 0; i < 8; i++) | 338 | 5.01M | FUNC6(idctRowCondDC)(block + i*8, 0); | 339 | | | 340 | 5.64M | for (i = 0; i < 8; i++) | 341 | 5.01M | FUNC6(idctSparseColPut)(dest + i, line_size, block + i); | 342 | 626k | } |
ff_simple_idct_put_int32_10bit Line | Count | Source | 330 | 114k | { | 331 | 114k | idctin *block = (idctin *)block_; | 332 | 114k | pixel *dest = (pixel *)dest_; | 333 | 114k | int i; | 334 | | | 335 | 114k | line_size /= sizeof(pixel); | 336 | | | 337 | 1.02M | for (i = 0; i < 8; i++) | 338 | 914k | FUNC6(idctRowCondDC)(block + i*8, 0); | 339 | | | 340 | 1.02M | for (i = 0; i < 8; i++) | 341 | 914k | FUNC6(idctSparseColPut)(dest + i, line_size, block + i); | 342 | 114k | } |
|
343 | | |
344 | | #if IN_IDCT_DEPTH == 16 |
345 | | void FUNC6(ff_simple_idct_add)(uint8_t *dest_, ptrdiff_t line_size, int16_t *block) |
346 | 6.91M | { |
347 | 6.91M | pixel *dest = (pixel *)dest_; |
348 | 6.91M | int i; |
349 | | |
350 | 6.91M | line_size /= sizeof(pixel); |
351 | | |
352 | 62.2M | for (i = 0; i < 8; i++) |
353 | 55.2M | FUNC6(idctRowCondDC)(block + i*8, 0); |
354 | | |
355 | 62.2M | for (i = 0; i < 8; i++) |
356 | 55.2M | FUNC6(idctSparseColAdd)(dest + i, line_size, block + i); |
357 | 6.91M | } ff_simple_idct_add_int16_8bit Line | Count | Source | 346 | 6.91M | { | 347 | 6.91M | pixel *dest = (pixel *)dest_; | 348 | 6.91M | int i; | 349 | | | 350 | 6.91M | line_size /= sizeof(pixel); | 351 | | | 352 | 62.2M | for (i = 0; i < 8; i++) | 353 | 55.2M | FUNC6(idctRowCondDC)(block + i*8, 0); | 354 | | | 355 | 62.2M | for (i = 0; i < 8; i++) | 356 | 55.2M | FUNC6(idctSparseColAdd)(dest + i, line_size, block + i); | 357 | 6.91M | } |
Unexecuted instantiation: ff_simple_idct_add_int16_10bit Unexecuted instantiation: ff_simple_idct_add_int16_12bit |
358 | | |
359 | | void FUNC6(ff_simple_idct)(int16_t *block) |
360 | 24.3M | { |
361 | 24.3M | int i; |
362 | | |
363 | 218M | for (i = 0; i < 8; i++) |
364 | 194M | FUNC6(idctRowCondDC)(block + i*8, 0); |
365 | | |
366 | 218M | for (i = 0; i < 8; i++) |
367 | 194M | FUNC6(idctSparseCol)(block + i); |
368 | 24.3M | } ff_simple_idct_int16_8bit Line | Count | Source | 360 | 24.3M | { | 361 | 24.3M | int i; | 362 | | | 363 | 218M | for (i = 0; i < 8; i++) | 364 | 194M | FUNC6(idctRowCondDC)(block + i*8, 0); | 365 | | | 366 | 218M | for (i = 0; i < 8; i++) | 367 | 194M | FUNC6(idctSparseCol)(block + i); | 368 | 24.3M | } |
Unexecuted instantiation: ff_simple_idct_int16_10bit Unexecuted instantiation: ff_simple_idct_int16_12bit |
369 | | #endif |
370 | | #endif |
371 | | #endif /* PRORES_ONLY */ |