/src/x265/source/common/primitives.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /***************************************************************************** |
2 | | * Copyright (C) 2013-2020 MulticoreWare, Inc |
3 | | * |
4 | | * Authors: Steve Borho <steve@borho.org> |
5 | | * |
6 | | * This program is free software; you can redistribute it and/or modify |
7 | | * it under the terms of the GNU General Public License as published by |
8 | | * the Free Software Foundation; either version 2 of the License, or |
9 | | * (at your option) any later version. |
10 | | * |
11 | | * This program is distributed in the hope that it will be useful, |
12 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | | * GNU General Public License for more details. |
15 | | * |
16 | | * You should have received a copy of the GNU General Public License |
17 | | * along with this program; if not, write to the Free Software |
18 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
19 | | * |
20 | | * This program is also available under a commercial proprietary license. |
21 | | * For more information, contact us at license @ x265.com. |
22 | | *****************************************************************************/ |
23 | | |
24 | | #include "common.h" |
25 | | #include "primitives.h" |
26 | | |
27 | | namespace X265_NS { |
28 | | // x265 private namespace |
29 | | |
/* Lookup table mapping a luma block's (width, height) to its LUMA_NxM
 * partition enum.  Rows are indexed by width and columns by height, both in
 * steps of 4 pixels (4, 8, 12, ... 64); presumably callers compute the index
 * as ((width >> 2) - 1) * 16 + ((height >> 2) - 1) -- TODO confirm against
 * the partitionFromSizes() user.  Entries of 255 mark width/height
 * combinations that are not legal HEVC luma partitions. */
extern const uint8_t lumaPartitionMapTable[] =
{
//  4               8          12          16          20   24         28   32          36   40   44   48          52   56   60   64
    LUMA_4x4,  LUMA_4x8,  255,        LUMA_4x16,  255, 255,       255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 4
    LUMA_8x4,  LUMA_8x8,  255,        LUMA_8x16,  255, 255,       255, LUMA_8x32,  255, 255, 255, 255,        255, 255, 255, 255,        // 8
    255,       255,       255,        LUMA_12x16, 255, 255,       255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 12
    LUMA_16x4, LUMA_16x8, LUMA_16x12, LUMA_16x16, 255, 255,       255, LUMA_16x32, 255, 255, 255, 255,        255, 255, 255, LUMA_16x64, // 16
    255,       255,       255,        255,        255, 255,       255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 20
    255,       255,       255,        255,        255, 255,       255, LUMA_24x32, 255, 255, 255, 255,        255, 255, 255, 255,        // 24
    255,       255,       255,        255,        255, 255,       255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 28
    255,       LUMA_32x8, 255,        LUMA_32x16, 255, LUMA_32x24, 255, LUMA_32x32, 255, 255, 255, 255,       255, 255, 255, LUMA_32x64, // 32
    255,       255,       255,        255,        255, 255,       255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 36
    255,       255,       255,        255,        255, 255,       255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 40
    255,       255,       255,        255,        255, 255,       255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 44
    255,       255,       255,        255,        255, 255,       255, 255,        255, 255, 255, 255,        255, 255, 255, LUMA_48x64, // 48
    255,       255,       255,        255,        255, 255,       255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 52
    255,       255,       255,        255,        255, 255,       255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 56
    255,       255,       255,        255,        255, 255,       255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 60
    255,       255,       255,        LUMA_64x16, 255, 255,       255, LUMA_64x32, 255, 255, 255, LUMA_64x48, 255, 255, 255, LUMA_64x64  // 64
};
50 | | |
/* the "authoritative" set of encoder primitives; filled in once by
 * x265_setup_primitives() and consulted by the whole encoder */
EncoderPrimitives primitives;

/* forward declarations of the per-module C reference initializers; each
 * populates its portion of an EncoderPrimitives table (the implementing
 * source file is noted where each is called in setupCPrimitives()) */
void setupPixelPrimitives_c(EncoderPrimitives &p);
void setupDCTPrimitives_c(EncoderPrimitives &p);
void setupFilterPrimitives_c(EncoderPrimitives &p);
void setupIntraPrimitives_c(EncoderPrimitives &p);
void setupLoopFilterPrimitives_c(EncoderPrimitives &p);
void setupSaoPrimitives_c(EncoderPrimitives &p);
void setupSeaIntegralPrimitives_c(EncoderPrimitives &p);
void setupLowPassPrimitives_c(EncoderPrimitives& p);
62 | | |
/* Install the portable C reference implementation for every primitive.
 * These are installed first; optimized versions (assembly/intrinsics,
 * installed later in x265_setup_primitives) overwrite individual entries. */
void setupCPrimitives(EncoderPrimitives &p)
{
    setupPixelPrimitives_c(p);       // pixel.cpp
    setupDCTPrimitives_c(p);         // dct.cpp
    setupLowPassPrimitives_c(p);     // lowpassdct.cpp
    setupFilterPrimitives_c(p);      // ipfilter.cpp
    setupIntraPrimitives_c(p);       // intrapred.cpp
    setupLoopFilterPrimitives_c(p);  // loopfilter.cpp
    setupSaoPrimitives_c(p);         // sao.cpp
    setupSeaIntegralPrimitives_c(p); // framefilter.cpp
}
74 | | |
/* Swap the 16x16 and 32x32 forward-DCT entry points for their lowpass
 * approximations.  The currently-active dct pointers are saved into
 * standard_dct first (for all four CU sizes) so the full transform remains
 * reachable after the swap.  NOTE(review): must run after any asm setup,
 * since it snapshots whatever dct pointer is active at call time. */
void enableLowpassDCTPrimitives(EncoderPrimitives &p)
{
    // update copies of the standard dct transform (before overwriting below)
    p.cu[BLOCK_4x4].standard_dct = p.cu[BLOCK_4x4].dct;
    p.cu[BLOCK_8x8].standard_dct = p.cu[BLOCK_8x8].dct;
    p.cu[BLOCK_16x16].standard_dct = p.cu[BLOCK_16x16].dct;
    p.cu[BLOCK_32x32].standard_dct = p.cu[BLOCK_32x32].dct;

    // replace active dct by lowpass dct for high dct transforms
    p.cu[BLOCK_16x16].dct = p.cu[BLOCK_16x16].lowpass_dct;
    p.cu[BLOCK_32x32].dct = p.cu[BLOCK_32x32].lowpass_dct;
}
87 | | |
/* Fill primitive table slots that can reuse an existing function pointer
 * instead of a dedicated implementation (e.g. chroma 4:4:4 reuses luma,
 * many chroma satd slots reuse same-sized luma satd).  Order of the
 * assignments below is significant: later statements intentionally
 * overwrite earlier ones, so this must run after all other primitive
 * setup for the aliases to capture the final (optimized) pointers. */
void setupAliasPrimitives(EncoderPrimitives &p)
{
#if HIGH_BIT_DEPTH
    /* at HIGH_BIT_DEPTH, pixel == short so we can alias many primitives */
    for (int i = 0; i < NUM_CU_SIZES; i++)
    {
#if !defined(X265_ARCH_ARM64)
        p.cu[i].sse_pp = (pixel_sse_t)p.cu[i].sse_ss;
#endif

        /* pixel and short copies are the same routine when pixel == short */
        p.cu[i].copy_ps = (copy_ps_t)p.pu[i].copy_pp;
        p.cu[i].copy_sp = (copy_sp_t)p.pu[i].copy_pp;
        p.cu[i].copy_ss = (copy_ss_t)p.pu[i].copy_pp;

        p.chroma[X265_CSP_I420].cu[i].copy_ps = (copy_ps_t)p.chroma[X265_CSP_I420].pu[i].copy_pp;
        p.chroma[X265_CSP_I420].cu[i].copy_sp = (copy_sp_t)p.chroma[X265_CSP_I420].pu[i].copy_pp;
        p.chroma[X265_CSP_I420].cu[i].copy_ss = (copy_ss_t)p.chroma[X265_CSP_I420].pu[i].copy_pp;

        p.chroma[X265_CSP_I422].cu[i].copy_ps = (copy_ps_t)p.chroma[X265_CSP_I422].pu[i].copy_pp;
        p.chroma[X265_CSP_I422].cu[i].copy_sp = (copy_sp_t)p.chroma[X265_CSP_I422].pu[i].copy_pp;
        p.chroma[X265_CSP_I422].cu[i].copy_ss = (copy_ss_t)p.chroma[X265_CSP_I422].pu[i].copy_pp;
    }
#endif

    /* alias chroma 4:4:4 from luma primitives (all but chroma filters) */

    p.chroma[X265_CSP_I444].cu[BLOCK_4x4].sa8d = NULL;

    for (int i = 0; i < NUM_PU_SIZES; i++)
    {
        p.chroma[X265_CSP_I444].pu[i].copy_pp = p.pu[i].copy_pp;
        p.chroma[X265_CSP_I444].pu[i].addAvg[NONALIGNED] = p.pu[i].addAvg[NONALIGNED];
        p.chroma[X265_CSP_I444].pu[i].addAvg[ALIGNED] = p.pu[i].addAvg[ALIGNED];
        p.chroma[X265_CSP_I444].pu[i].satd = p.pu[i].satd;
        p.chroma[X265_CSP_I444].pu[i].p2s[NONALIGNED] = p.pu[i].convert_p2s[NONALIGNED];
        p.chroma[X265_CSP_I444].pu[i].p2s[ALIGNED] = p.pu[i].convert_p2s[ALIGNED];
    }

    for (int i = 0; i < NUM_CU_SIZES; i++)
    {
        p.chroma[X265_CSP_I444].cu[i].sa8d = p.cu[i].sa8d;
        p.chroma[X265_CSP_I444].cu[i].sse_pp = p.cu[i].sse_pp;
        p.chroma[X265_CSP_I444].cu[i].sub_ps = p.cu[i].sub_ps;
        p.chroma[X265_CSP_I444].cu[i].add_ps[NONALIGNED] = p.cu[i].add_ps[NONALIGNED];
        p.chroma[X265_CSP_I444].cu[i].add_ps[ALIGNED] = p.cu[i].add_ps[ALIGNED];
        p.chroma[X265_CSP_I444].cu[i].copy_ps = p.cu[i].copy_ps;
        p.chroma[X265_CSP_I444].cu[i].copy_sp = p.cu[i].copy_sp;
        p.chroma[X265_CSP_I444].cu[i].copy_ss = p.cu[i].copy_ss;
    }

    /* a 4x4 block has no 8x8 sub-blocks, so its sa8d is plain satd */
    p.cu[BLOCK_4x4].sa8d = p.pu[LUMA_4x4].satd;

    /* Chroma PU can often use luma satd primitives */
    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = p.pu[LUMA_4x4].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = p.pu[LUMA_8x8].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = p.pu[LUMA_16x16].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = p.pu[LUMA_32x32].satd;

    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = p.pu[LUMA_8x4].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = p.pu[LUMA_4x8].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = p.pu[LUMA_16x8].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = p.pu[LUMA_8x16].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = p.pu[LUMA_32x16].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = p.pu[LUMA_16x32].satd;

    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = p.pu[LUMA_16x12].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = p.pu[LUMA_12x16].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd = p.pu[LUMA_16x4].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = p.pu[LUMA_4x16].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = p.pu[LUMA_32x24].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = p.pu[LUMA_24x32].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = p.pu[LUMA_32x8].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = p.pu[LUMA_8x32].satd;

    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = p.pu[LUMA_4x8].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = p.pu[LUMA_8x16].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = p.pu[LUMA_16x32].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = p.pu[LUMA_32x64].satd;

    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = p.pu[LUMA_4x4].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = p.pu[LUMA_8x8].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = p.pu[LUMA_4x16].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = p.pu[LUMA_16x16].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = p.pu[LUMA_8x32].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = p.pu[LUMA_32x32].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = p.pu[LUMA_16x64].satd;

    /* non-square 4:2:2 sizes with no luma twin keep their own satd
     * (commented entries record the C templates they would map to) */
    //p.chroma[X265_CSP_I422].satd[CHROMA_422_8x12] = satd4<8, 12>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = p.pu[LUMA_8x4].satd;
    //p.chroma[X265_CSP_I422].satd[CHROMA_422_16x24] = satd8<16, 24>;
    //p.chroma[X265_CSP_I422].satd[CHROMA_422_12x32] = satd4<12, 32>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = p.pu[LUMA_16x8].satd;
    //p.chroma[X265_CSP_I422].satd[CHROMA_422_4x32] = satd4<4, 32>;
    //p.chroma[X265_CSP_I422].satd[CHROMA_422_32x48] = satd8<32, 48>;
    //p.chroma[X265_CSP_I422].satd[CHROMA_422_24x64] = satd8<24, 64>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = p.pu[LUMA_32x16].satd;
    //p.chroma[X265_CSP_I422].satd[CHROMA_422_8x64] = satd8<8, 64>;

    /* blocks narrower than 4 pixels have no sa8d; callers must check NULL */
    p.chroma[X265_CSP_I420].cu[BLOCK_420_2x2].sa8d = NULL;
    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sa8d = p.pu[LUMA_4x4].satd;
    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = p.cu[BLOCK_8x8].sa8d;
    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = p.cu[BLOCK_16x16].sa8d;
    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = p.cu[BLOCK_32x32].sa8d;

    p.chroma[X265_CSP_I422].cu[BLOCK_422_2x4].sa8d = NULL;
    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sa8d = p.pu[LUMA_4x8].satd;

    /* alias CU copy_pp from square PU copy_pp */
    for (int i = 0; i < NUM_CU_SIZES; i++)
    {
        p.cu[i].copy_pp = p.pu[i].copy_pp;

        for (int c = 0; c < X265_CSP_COUNT; c++)
            p.chroma[c].cu[i].copy_pp = p.chroma[c].pu[i].copy_pp;
    }

    p.chroma[X265_CSP_I420].cu[BLOCK_420_2x2].sse_pp = NULL;
    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sse_pp = p.cu[BLOCK_4x4].sse_pp;
    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sse_pp = p.cu[BLOCK_8x8].sse_pp;
    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = p.cu[BLOCK_16x16].sse_pp;
    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = p.cu[BLOCK_32x32].sse_pp;

    p.chroma[X265_CSP_I422].cu[BLOCK_422_2x4].sse_pp = NULL;
}
212 | | |
213 | | void x265_report_simd(x265_param* param) |
214 | 0 | { |
215 | 0 | if (param->logLevel >= X265_LOG_INFO) |
216 | 0 | { |
217 | 0 | int cpuid = param->cpuid; |
218 | |
|
219 | 0 | char buf[1000]; |
220 | 0 | char *p = buf + snprintf(buf, sizeof(buf), "using cpu capabilities:"); |
221 | 0 | char *none = p; |
222 | 0 | for (int i = 0; X265_NS::cpu_names[i].flags; i++) |
223 | 0 | { |
224 | 0 | if (!strcmp(X265_NS::cpu_names[i].name, "SSE") |
225 | 0 | && (cpuid & X265_CPU_SSE2)) |
226 | 0 | continue; |
227 | 0 | if (!strcmp(X265_NS::cpu_names[i].name, "SSE2") |
228 | 0 | && (cpuid & (X265_CPU_SSE2_IS_FAST | X265_CPU_SSE2_IS_SLOW))) |
229 | 0 | continue; |
230 | 0 | if (!strcmp(X265_NS::cpu_names[i].name, "SSE3") |
231 | 0 | && (cpuid & X265_CPU_SSSE3 || !(cpuid & X265_CPU_CACHELINE_64))) |
232 | 0 | continue; |
233 | 0 | if (!strcmp(X265_NS::cpu_names[i].name, "SSE4.1") |
234 | 0 | && (cpuid & X265_CPU_SSE42)) |
235 | 0 | continue; |
236 | 0 | if (!strcmp(X265_NS::cpu_names[i].name, "BMI1") |
237 | 0 | && (cpuid & X265_CPU_BMI2)) |
238 | 0 | continue; |
239 | 0 | if ((cpuid & X265_NS::cpu_names[i].flags) == X265_NS::cpu_names[i].flags |
240 | 0 | && (!i || X265_NS::cpu_names[i].flags != X265_NS::cpu_names[i - 1].flags)) |
241 | 0 | p += snprintf(p, sizeof(buf) - (p - buf), " %s", X265_NS::cpu_names[i].name); |
242 | 0 | } |
243 | |
|
244 | 0 | if (p == none) |
245 | 0 | snprintf(p, sizeof(buf) - (p - buf), " none!"); |
246 | 0 | x265_log(param, X265_LOG_INFO, "%s\n", buf); |
247 | 0 | } |
248 | 0 | } |
249 | | |
/* One-time initialization of the global `primitives` table, then report the
 * active SIMD capabilities.  Idempotent: the pu[0].sad check skips re-init
 * on subsequent calls (NOTE(review): check itself is not thread-safe;
 * presumably serialized by the caller -- confirm).  Layering order matters:
 * C references first, then platform-optimized overrides, then aliases,
 * then the optional lowpass-DCT swap. */
void x265_setup_primitives(x265_param *param)
{
    if (!primitives.pu[0].sad)
    {
        setupCPrimitives(primitives);

        /* We do not want the encoder to use the un-optimized intra all-angles
         * C references. It is better to call the individual angle functions
         * instead. We must check for NULL before using this primitive */
        for (int i = 0; i < NUM_TR_SIZE; i++)
            primitives.cu[i].intra_pred_allangs = NULL;

#if ENABLE_ASSEMBLY
#if defined(X265_ARCH_X86) || defined(X265_ARCH_ARM64)
        setupIntrinsicPrimitives(primitives, param->cpuid);
#endif
        setupAssemblyPrimitives(primitives, param->cpuid);
#endif
#if HAVE_ALTIVEC
        if (param->cpuid & X265_CPU_ALTIVEC)
        {
            setupPixelPrimitives_altivec(primitives);  // pixel_altivec.cpp, overwrite the initialization for altivec optimizated functions
            setupDCTPrimitives_altivec(primitives);    // dct_altivec.cpp, overwrite the initialization for altivec optimizated functions
            setupFilterPrimitives_altivec(primitives); // ipfilter.cpp, overwrite the initialization for altivec optimizated functions
            setupIntraPrimitives_altivec(primitives);  // intrapred_altivec.cpp, overwrite the initialization for altivec optimizated functions
        }
#endif

        /* aliases must be installed last so they capture the final pointers */
        setupAliasPrimitives(primitives);

        if (param->bLowPassDct)
        {
            enableLowpassDCTPrimitives(primitives);
        }
    }

    x265_report_simd(param);
}
288 | | } |
289 | | |
#if ENABLE_ASSEMBLY && X265_ARCH_X86
/* these functions are implemented in assembly. When assembly is not being
 * compiled, they are unnecessary and can be NOPs */
#else
extern "C" {
/* no-op stand-ins so the rest of the code can call them unconditionally */
int PFX(cpu_cpuid_test)(void) { return 0; }
void PFX(cpu_emms)(void) {}
/* reports no CPU features: only *eax is written, other outputs untouched */
void PFX(cpu_cpuid)(uint32_t, uint32_t *eax, uint32_t *, uint32_t *, uint32_t *) { *eax = 0; }
void PFX(cpu_xgetbv)(uint32_t, uint32_t *, uint32_t *) {}

#if X265_ARCH_ARM == 0
void PFX(cpu_neon_test)(void) {}
int PFX(cpu_fast_neon_mrc_test)(void) { return 0; }
#endif // X265_ARCH_ARM
}
#endif
305 | | #endif |