/src/x265/source/common/primitives.cpp
Line | Count | Source |
1 | | /***************************************************************************** |
2 | | * Copyright (C) 2013-2020 MulticoreWare, Inc |
3 | | * |
4 | | * Authors: Steve Borho <steve@borho.org> |
5 | | * |
6 | | * This program is free software; you can redistribute it and/or modify |
7 | | * it under the terms of the GNU General Public License as published by |
8 | | * the Free Software Foundation; either version 2 of the License, or |
9 | | * (at your option) any later version. |
10 | | * |
11 | | * This program is distributed in the hope that it will be useful, |
12 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | | * GNU General Public License for more details. |
15 | | * |
16 | | * You should have received a copy of the GNU General Public License |
17 | | * along with this program; if not, write to the Free Software |
18 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
19 | | * |
20 | | * This program is also available under a commercial proprietary license. |
21 | | * For more information, contact us at license @ x265.com. |
22 | | *****************************************************************************/ |
23 | | |
24 | | #include "common.h" |
25 | | #include "primitives.h" |
26 | | |
27 | | namespace X265_NS { |
28 | | // x265 private namespace |
29 | | |
30 | | extern const uint8_t lumaPartitionMapTable[] = |
31 | | { |
32 | | // 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60 64 |
33 | | LUMA_4x4, LUMA_4x8, 255, LUMA_4x16, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 4 |
34 | | LUMA_8x4, LUMA_8x8, 255, LUMA_8x16, 255, 255, 255, LUMA_8x32, 255, 255, 255, 255, 255, 255, 255, 255, // 8 |
35 | | 255, 255, 255, LUMA_12x16, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 12 |
36 | | LUMA_16x4, LUMA_16x8, LUMA_16x12, LUMA_16x16, 255, 255, 255, LUMA_16x32, 255, 255, 255, 255, 255, 255, 255, LUMA_16x64, // 16 |
37 | | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 20 |
38 | | 255, 255, 255, 255, 255, 255, 255, LUMA_24x32, 255, 255, 255, 255, 255, 255, 255, 255, // 24 |
39 | | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 28 |
40 | | 255, LUMA_32x8, 255, LUMA_32x16, 255, LUMA_32x24, 255, LUMA_32x32, 255, 255, 255, 255, 255, 255, 255, LUMA_32x64, // 32 |
41 | | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 36 |
42 | | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 40 |
43 | | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 44 |
44 | | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, LUMA_48x64, // 48 |
45 | | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 52 |
46 | | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 56 |
47 | | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 60 |
48 | | 255, 255, 255, LUMA_64x16, 255, 255, 255, LUMA_64x32, 255, 255, 255, LUMA_64x48, 255, 255, 255, LUMA_64x64 // 64 |
49 | | }; |
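/* Reading the table above: each row covers one block width (4 through 64 in
 * steps of 4, as labelled by the trailing comments) and the 16 columns cover
 * the block heights in the same steps; 255 marks a width/height pair that is
 * not a valid luma partition. A minimal sketch of the lookup, assuming the
 * partitionFromSizes() helper declared in primitives.h:
 *
 *     int partitionFromSizes(int width, int height)
 *     {
 *         int w = (width  >> 2) - 1;                           // row index
 *         int h = (height >> 2) - 1;                           // column index
 *         int part = (int)lumaPartitionMapTable[(w << 4) + h]; // 16 entries per row
 *         return part;                                         // 255 == invalid pairing
 *     }
 */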
50 | | |
51 | | /* the "authoritative" set of encoder primitives */ |
52 | | EncoderPrimitives primitives; |
53 | | |
54 | | void setupPixelPrimitives_c(EncoderPrimitives &p); |
55 | | void setupDCTPrimitives_c(EncoderPrimitives &p); |
56 | | void setupFilterPrimitives_c(EncoderPrimitives &p); |
57 | | void setupIntraPrimitives_c(EncoderPrimitives &p); |
58 | | void setupLoopFilterPrimitives_c(EncoderPrimitives &p); |
59 | | void setupSaoPrimitives_c(EncoderPrimitives &p); |
60 | | void setupSeaIntegralPrimitives_c(EncoderPrimitives &p); |
61 | | void setupLowPassPrimitives_c(EncoderPrimitives& p); |
62 | | |
63 | | void setupCPrimitives(EncoderPrimitives &p) |
64 | 1 | { |
65 | 1 | setupPixelPrimitives_c(p); // pixel.cpp |
66 | 1 | setupDCTPrimitives_c(p); // dct.cpp |
67 | 1 | setupLowPassPrimitives_c(p); // lowpassdct.cpp |
68 | 1 | setupFilterPrimitives_c(p); // ipfilter.cpp |
69 | 1 | setupIntraPrimitives_c(p); // intrapred.cpp |
70 | 1 | setupLoopFilterPrimitives_c(p); // loopfilter.cpp |
71 | 1 | setupSaoPrimitives_c(p); // sao.cpp |
72 | 1 | setupSeaIntegralPrimitives_c(p); // framefilter.cpp |
73 | 1 | } |
74 | | |
75 | | void enableLowpassDCTPrimitives(EncoderPrimitives &p) |
76 | 0 | { |
77 | | // save copies of the standard dct transforms before they are replaced |
78 | 0 | p.cu[BLOCK_4x4].standard_dct = p.cu[BLOCK_4x4].dct; |
79 | 0 | p.cu[BLOCK_8x8].standard_dct = p.cu[BLOCK_8x8].dct; |
80 | 0 | p.cu[BLOCK_16x16].standard_dct = p.cu[BLOCK_16x16].dct; |
81 | 0 | p.cu[BLOCK_32x32].standard_dct = p.cu[BLOCK_32x32].dct; |
82 | | |
83 | | // replace the active dct with the lowpass dct for the larger transform sizes |
84 | 0 | p.cu[BLOCK_16x16].dct = p.cu[BLOCK_16x16].lowpass_dct; |
85 | 0 | p.cu[BLOCK_32x32].dct = p.cu[BLOCK_32x32].lowpass_dct; |
86 | 0 | } |
87 | | |
88 | | void setupAliasPrimitives(EncoderPrimitives &p) |
89 | 1 | { |
90 | | #if HIGH_BIT_DEPTH |
91 | | /* at HIGH_BIT_DEPTH, pixel == short so we can alias many primitives */ |
92 | | for (int i = 0; i < NUM_CU_SIZES; i++) |
93 | | { |
94 | | p.cu[i].sse_pp = (pixel_sse_t)p.cu[i].sse_ss; |
95 | | |
96 | | p.cu[i].copy_ps = (copy_ps_t)p.pu[i].copy_pp; |
97 | | p.cu[i].copy_sp = (copy_sp_t)p.pu[i].copy_pp; |
98 | | p.cu[i].copy_ss = (copy_ss_t)p.pu[i].copy_pp; |
99 | | |
100 | | p.chroma[X265_CSP_I420].cu[i].copy_ps = (copy_ps_t)p.chroma[X265_CSP_I420].pu[i].copy_pp; |
101 | | p.chroma[X265_CSP_I420].cu[i].copy_sp = (copy_sp_t)p.chroma[X265_CSP_I420].pu[i].copy_pp; |
102 | | p.chroma[X265_CSP_I420].cu[i].copy_ss = (copy_ss_t)p.chroma[X265_CSP_I420].pu[i].copy_pp; |
103 | | |
104 | | p.chroma[X265_CSP_I422].cu[i].copy_ps = (copy_ps_t)p.chroma[X265_CSP_I422].pu[i].copy_pp; |
105 | | p.chroma[X265_CSP_I422].cu[i].copy_sp = (copy_sp_t)p.chroma[X265_CSP_I422].pu[i].copy_pp; |
106 | | p.chroma[X265_CSP_I422].cu[i].copy_ss = (copy_ss_t)p.chroma[X265_CSP_I422].pu[i].copy_pp; |
107 | | } |
108 | | #endif |
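/* The casts above rely on pixel being a 16-bit type when HIGH_BIT_DEPTH is
 * set, so pixel* and int16_t* buffers share one memory layout. A sketch of
 * the assumed typedef (it lives in common.h, not in this file):
 *
 *     #if HIGH_BIT_DEPTH
 *     typedef uint16_t pixel;   // 10/12-bit samples stored in 16 bits
 *     #else
 *     typedef uint8_t  pixel;   // 8-bit samples
 *     #endif
 */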
109 | | |
110 | | /* alias chroma 4:4:4 from luma primitives (all but chroma filters) */ |
111 | | |
112 | 1 | p.chroma[X265_CSP_I444].cu[BLOCK_4x4].sa8d = NULL; |
113 | | |
114 | 26 | for (int i = 0; i < NUM_PU_SIZES; i++) |
115 | 25 | { |
116 | 25 | p.chroma[X265_CSP_I444].pu[i].copy_pp = p.pu[i].copy_pp; |
117 | 25 | p.chroma[X265_CSP_I444].pu[i].addAvg[NONALIGNED] = p.pu[i].addAvg[NONALIGNED]; |
118 | 25 | p.chroma[X265_CSP_I444].pu[i].addAvg[ALIGNED] = p.pu[i].addAvg[ALIGNED]; |
119 | 25 | p.chroma[X265_CSP_I444].pu[i].satd = p.pu[i].satd; |
120 | 25 | p.chroma[X265_CSP_I444].pu[i].p2s[NONALIGNED] = p.pu[i].convert_p2s[NONALIGNED]; |
121 | 25 | p.chroma[X265_CSP_I444].pu[i].p2s[ALIGNED] = p.pu[i].convert_p2s[ALIGNED]; |
122 | 25 | } |
123 | | |
124 | 6 | for (int i = 0; i < NUM_CU_SIZES; i++) |
125 | 5 | { |
126 | 5 | p.chroma[X265_CSP_I444].cu[i].sa8d = p.cu[i].sa8d; |
127 | 5 | p.chroma[X265_CSP_I444].cu[i].sse_pp = p.cu[i].sse_pp; |
128 | 5 | p.chroma[X265_CSP_I444].cu[i].sub_ps = p.cu[i].sub_ps; |
129 | 5 | p.chroma[X265_CSP_I444].cu[i].add_ps[NONALIGNED] = p.cu[i].add_ps[NONALIGNED]; |
130 | 5 | p.chroma[X265_CSP_I444].cu[i].add_ps[ALIGNED] = p.cu[i].add_ps[ALIGNED]; |
131 | 5 | p.chroma[X265_CSP_I444].cu[i].copy_ps = p.cu[i].copy_ps; |
132 | 5 | p.chroma[X265_CSP_I444].cu[i].copy_sp = p.cu[i].copy_sp; |
133 | 5 | p.chroma[X265_CSP_I444].cu[i].copy_ss = p.cu[i].copy_ss; |
134 | 5 | } |
135 | | |
136 | 1 | p.cu[BLOCK_4x4].sa8d = p.pu[LUMA_4x4].satd; |
137 | | |
138 | | /* Chroma PU can often use luma satd primitives */ |
139 | 1 | p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = p.pu[LUMA_4x4].satd; |
140 | 1 | p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = p.pu[LUMA_8x8].satd; |
141 | 1 | p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = p.pu[LUMA_16x16].satd; |
142 | 1 | p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = p.pu[LUMA_32x32].satd; |
143 | | |
144 | 1 | p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = p.pu[LUMA_8x4].satd; |
145 | 1 | p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = p.pu[LUMA_4x8].satd; |
146 | 1 | p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = p.pu[LUMA_16x8].satd; |
147 | 1 | p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = p.pu[LUMA_8x16].satd; |
148 | 1 | p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = p.pu[LUMA_32x16].satd; |
149 | 1 | p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = p.pu[LUMA_16x32].satd; |
150 | | |
151 | 1 | p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = p.pu[LUMA_16x12].satd; |
152 | 1 | p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = p.pu[LUMA_12x16].satd; |
153 | 1 | p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd = p.pu[LUMA_16x4].satd; |
154 | 1 | p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = p.pu[LUMA_4x16].satd; |
155 | 1 | p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = p.pu[LUMA_32x24].satd; |
156 | 1 | p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = p.pu[LUMA_24x32].satd; |
157 | 1 | p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = p.pu[LUMA_32x8].satd; |
158 | 1 | p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = p.pu[LUMA_8x32].satd; |
159 | | |
160 | 1 | p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = p.pu[LUMA_4x8].satd; |
161 | 1 | p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = p.pu[LUMA_8x16].satd; |
162 | 1 | p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = p.pu[LUMA_16x32].satd; |
163 | 1 | p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = p.pu[LUMA_32x64].satd; |
164 | | |
165 | 1 | p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = p.pu[LUMA_4x4].satd; |
166 | 1 | p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = p.pu[LUMA_8x8].satd; |
167 | 1 | p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = p.pu[LUMA_4x16].satd; |
168 | 1 | p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = p.pu[LUMA_16x16].satd; |
169 | 1 | p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = p.pu[LUMA_8x32].satd; |
170 | 1 | p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = p.pu[LUMA_32x32].satd; |
171 | 1 | p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = p.pu[LUMA_16x64].satd; |
172 | | |
173 | | //p.chroma[X265_CSP_I422].satd[CHROMA_422_8x12] = satd4<8, 12>; |
174 | 1 | p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = p.pu[LUMA_8x4].satd; |
175 | | //p.chroma[X265_CSP_I422].satd[CHROMA_422_16x24] = satd8<16, 24>; |
176 | | //p.chroma[X265_CSP_I422].satd[CHROMA_422_12x32] = satd4<12, 32>; |
177 | 1 | p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = p.pu[LUMA_16x8].satd; |
178 | | //p.chroma[X265_CSP_I422].satd[CHROMA_422_4x32] = satd4<4, 32>; |
179 | | //p.chroma[X265_CSP_I422].satd[CHROMA_422_32x48] = satd8<32, 48>; |
180 | | //p.chroma[X265_CSP_I422].satd[CHROMA_422_24x64] = satd8<24, 64>; |
181 | 1 | p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = p.pu[LUMA_32x16].satd; |
182 | | //p.chroma[X265_CSP_I422].satd[CHROMA_422_8x64] = satd8<8, 64>; |
183 | | |
184 | 1 | p.chroma[X265_CSP_I420].cu[BLOCK_420_2x2].sa8d = NULL; |
185 | 1 | p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sa8d = p.pu[LUMA_4x4].satd; |
186 | 1 | p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = p.cu[BLOCK_8x8].sa8d; |
187 | 1 | p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = p.cu[BLOCK_16x16].sa8d; |
188 | 1 | p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = p.cu[BLOCK_32x32].sa8d; |
189 | | |
190 | 1 | p.chroma[X265_CSP_I422].cu[BLOCK_422_2x4].sa8d = NULL; |
191 | 1 | p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sa8d = p.pu[LUMA_4x8].satd; |
192 | | |
193 | | /* alias CU copy_pp from square PU copy_pp */ |
194 | 6 | for (int i = 0; i < NUM_CU_SIZES; i++) |
195 | 5 | { |
196 | 5 | p.cu[i].copy_pp = p.pu[i].copy_pp; |
197 | | |
198 | 25 | for (int c = 0; c < X265_CSP_COUNT; c++) |
199 | 20 | p.chroma[c].cu[i].copy_pp = p.chroma[c].pu[i].copy_pp; |
200 | 5 | } |
201 | | |
202 | 1 | p.chroma[X265_CSP_I420].cu[BLOCK_420_2x2].sse_pp = NULL; |
203 | 1 | p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sse_pp = p.cu[BLOCK_4x4].sse_pp; |
204 | 1 | p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sse_pp = p.cu[BLOCK_8x8].sse_pp; |
205 | 1 | p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = p.cu[BLOCK_16x16].sse_pp; |
206 | 1 | p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = p.cu[BLOCK_32x32].sse_pp; |
207 | | |
208 | 1 | p.chroma[X265_CSP_I422].cu[BLOCK_422_2x4].sse_pp = NULL; |
209 | 1 | } |
210 | | |
211 | | void x265_report_simd(x265_param* param) |
212 | 698 | { |
213 | 698 | if (param->logLevel >= X265_LOG_INFO) |
214 | 0 | { |
215 | 0 | int cpuid = param->cpuid; |
216 | |
217 | 0 | char buf[1000]; |
218 | 0 | char *p = buf + sprintf(buf, "using cpu capabilities:"); |
219 | 0 | char *none = p; |
220 | 0 | for (int i = 0; X265_NS::cpu_names[i].flags; i++) |
221 | 0 | { |
222 | 0 | if (!strcmp(X265_NS::cpu_names[i].name, "SSE") |
223 | 0 | && (cpuid & X265_CPU_SSE2)) |
224 | 0 | continue; |
225 | 0 | if (!strcmp(X265_NS::cpu_names[i].name, "SSE2") |
226 | 0 | && (cpuid & (X265_CPU_SSE2_IS_FAST | X265_CPU_SSE2_IS_SLOW))) |
227 | 0 | continue; |
228 | 0 | if (!strcmp(X265_NS::cpu_names[i].name, "SSE3") |
229 | 0 | && (cpuid & X265_CPU_SSSE3 || !(cpuid & X265_CPU_CACHELINE_64))) |
230 | 0 | continue; |
231 | 0 | if (!strcmp(X265_NS::cpu_names[i].name, "SSE4.1") |
232 | 0 | && (cpuid & X265_CPU_SSE42)) |
233 | 0 | continue; |
234 | 0 | if (!strcmp(X265_NS::cpu_names[i].name, "BMI1") |
235 | 0 | && (cpuid & X265_CPU_BMI2)) |
236 | 0 | continue; |
237 | 0 | if ((cpuid & X265_NS::cpu_names[i].flags) == X265_NS::cpu_names[i].flags |
238 | 0 | && (!i || X265_NS::cpu_names[i].flags != X265_NS::cpu_names[i - 1].flags)) |
239 | 0 | p += sprintf(p, " %s", X265_NS::cpu_names[i].name); |
240 | 0 | } |
241 | |
242 | 0 | if (p == none) |
243 | 0 | sprintf(p, " none!"); |
244 | 0 | x265_log(param, X265_LOG_INFO, "%s\n", buf); |
245 | 0 | } |
246 | 698 | } |
247 | | |
248 | | void x265_setup_primitives(x265_param *param) |
249 | 698 | { |
250 | 698 | if (!primitives.pu[0].sad) |
251 | 1 | { |
252 | 1 | setupCPrimitives(primitives); |
253 | | |
254 | | /* We do not want the encoder to use the un-optimized intra all-angles |
255 | | * C references. It is better to call the individual angle functions |
256 | | * instead. We must check for NULL before using this primitive */ |
257 | 5 | for (int i = 0; i < NUM_TR_SIZE; i++) |
258 | 4 | primitives.cu[i].intra_pred_allangs = NULL; |
259 | | |
260 | | #if ENABLE_ASSEMBLY |
261 | | #if X265_ARCH_X86 |
262 | | setupInstrinsicPrimitives(primitives, param->cpuid); |
263 | | #endif |
264 | | setupAssemblyPrimitives(primitives, param->cpuid); |
265 | | #endif |
266 | | #if HAVE_ALTIVEC |
267 | | if (param->cpuid & X265_CPU_ALTIVEC) |
268 | | { |
269 | | setupPixelPrimitives_altivec(primitives); // pixel_altivec.cpp, overwrite the initialization for altivec optimized functions |
270 | | setupDCTPrimitives_altivec(primitives); // dct_altivec.cpp, overwrite the initialization for altivec optimized functions |
271 | | setupFilterPrimitives_altivec(primitives); // ipfilter.cpp, overwrite the initialization for altivec optimized functions |
272 | | setupIntraPrimitives_altivec(primitives); // intrapred_altivec.cpp, overwrite the initialization for altivec optimized functions |
273 | | } |
274 | | #endif |
275 | | |
276 | 1 | setupAliasPrimitives(primitives); |
277 | | |
278 | 1 | if (param->bLowPassDct) |
279 | 0 | { |
280 | 0 | enableLowpassDCTPrimitives(primitives); |
281 | 0 | } |
282 | 1 | } |
283 | | |
284 | 698 | x265_report_simd(param); |
285 | 698 | } |
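/* Usage sketch (pixelcmp_t and the buffer/stride names are assumptions taken
 * from primitives.h, not defined in this file): once x265_setup_primitives()
 * has run, hot paths index the global table directly, e.g. an 8x8 luma SATD
 * cost:
 *
 *     int cost = primitives.pu[LUMA_8x8].satd(fenc, fencStride, fref, frefStride);
 *
 * where satd is a pixelcmp_t:
 *     int (*)(const pixel* fenc, intptr_t stride0, const pixel* fref, intptr_t stride1);
 */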
286 | | } |
287 | | |
288 | | #if ENABLE_ASSEMBLY && X265_ARCH_X86 |
289 | | /* these functions are implemented in assembly. When assembly is not being |
290 | | * compiled, they are unnecessary and can be NOPs */ |
291 | | #else |
292 | | extern "C" { |
293 | 0 | int PFX(cpu_cpuid_test)(void) { return 0; } |
294 | 102k | void PFX(cpu_emms)(void) {} |
295 | 2.79k | void PFX(cpu_cpuid)(uint32_t, uint32_t *eax, uint32_t *, uint32_t *, uint32_t *) { *eax = 0; } |
296 | 0 | void PFX(cpu_xgetbv)(uint32_t, uint32_t *, uint32_t *) {} |
297 | | |
298 | | #if X265_ARCH_ARM == 0 |
299 | 0 | void PFX(cpu_neon_test)(void) {} |
300 | 0 | int PFX(cpu_fast_neon_mrc_test)(void) { return 0; } |
301 | | #endif // X265_ARCH_ARM |
302 | | } |
303 | | #endif |