/work/x264/common/opencl.c
Line | Count | Source |
1 | | /***************************************************************************** |
2 | | * opencl.c: OpenCL initialization and kernel compilation |
3 | | ***************************************************************************** |
4 | | * Copyright (C) 2012-2025 x264 project |
5 | | * |
6 | | * Authors: Steve Borho <sborho@multicorewareinc.com> |
7 | | * Anton Mitrofanov <BugMaster@narod.ru> |
8 | | * |
9 | | * This program is free software; you can redistribute it and/or modify |
10 | | * it under the terms of the GNU General Public License as published by |
11 | | * the Free Software Foundation; either version 2 of the License, or |
12 | | * (at your option) any later version. |
13 | | * |
14 | | * This program is distributed in the hope that it will be useful, |
15 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | | * GNU General Public License for more details. |
18 | | * |
19 | | * You should have received a copy of the GNU General Public License |
20 | | * along with this program; if not, write to the Free Software |
21 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
22 | | * |
23 | | * This program is also available under a commercial proprietary license. |
24 | | * For more information, contact us at licensing@x264.com. |
25 | | *****************************************************************************/ |
26 | | |
27 | | #include "common.h" |
28 | | |
29 | | #ifdef _WIN32 |
30 | | #include <windows.h> |
31 | | #define ocl_open LoadLibraryW( L"OpenCL" ) |
32 | | #define ocl_close FreeLibrary |
33 | | #define ocl_address GetProcAddress |
34 | | #else |
35 | | #include <dlfcn.h> //dlopen, dlsym, dlclose |
36 | | #if SYS_MACOSX |
37 | | #define ocl_open dlopen( "/System/Library/Frameworks/OpenCL.framework/OpenCL", RTLD_NOW ) |
38 | | #else |
39 | 0 | #define ocl_open dlopen( "libOpenCL.so", RTLD_NOW ) |
40 | | #endif |
41 | 0 | #define ocl_close dlclose |
42 | 0 | #define ocl_address dlsym |
43 | | #endif |
44 | | |
45 | 0 | #define LOAD_OCL_FUNC(name, continue_on_fail)\ |
46 | 0 | {\ |
47 | 0 | ocl->name = (void*)ocl_address( ocl->library, #name );\ |
48 | 0 | if( !continue_on_fail && !ocl->name )\ |
49 | 0 | goto fail;\ |
50 | 0 | } |
51 | | |
52 | | /* load the library and functions we require from it */ |
53 | | x264_opencl_function_t *x264_opencl_load_library( void ) |
54 | 0 | { |
55 | 0 | x264_opencl_function_t *ocl; |
56 | 0 | #undef fail |
57 | 0 | #define fail fail0 |
58 | 0 | CHECKED_MALLOCZERO( ocl, sizeof(x264_opencl_function_t) ); |
59 | 0 | #undef fail |
60 | 0 | #define fail fail1 |
61 | 0 | ocl->library = ocl_open; |
62 | 0 | if( !ocl->library ) |
63 | 0 | goto fail; |
64 | 0 | #undef fail |
65 | 0 | #define fail fail2 |
66 | 0 | LOAD_OCL_FUNC( clBuildProgram, 0 ); |
67 | 0 | LOAD_OCL_FUNC( clCreateBuffer, 0 ); |
68 | 0 | LOAD_OCL_FUNC( clCreateCommandQueue, 0 ); |
69 | 0 | LOAD_OCL_FUNC( clCreateContext, 0 ); |
70 | 0 | LOAD_OCL_FUNC( clCreateImage2D, 0 ); |
71 | 0 | LOAD_OCL_FUNC( clCreateKernel, 0 ); |
72 | 0 | LOAD_OCL_FUNC( clCreateProgramWithBinary, 0 ); |
73 | 0 | LOAD_OCL_FUNC( clCreateProgramWithSource, 0 ); |
74 | 0 | LOAD_OCL_FUNC( clEnqueueCopyBuffer, 0 ); |
75 | 0 | LOAD_OCL_FUNC( clEnqueueMapBuffer, 0 ); |
76 | 0 | LOAD_OCL_FUNC( clEnqueueNDRangeKernel, 0 ); |
77 | 0 | LOAD_OCL_FUNC( clEnqueueReadBuffer, 0 ); |
78 | 0 | LOAD_OCL_FUNC( clEnqueueWriteBuffer, 0 ); |
79 | 0 | LOAD_OCL_FUNC( clFinish, 0 ); |
80 | 0 | LOAD_OCL_FUNC( clGetCommandQueueInfo, 0 ); |
81 | 0 | LOAD_OCL_FUNC( clGetDeviceIDs, 0 ); |
82 | 0 | LOAD_OCL_FUNC( clGetDeviceInfo, 0 ); |
83 | 0 | LOAD_OCL_FUNC( clGetKernelWorkGroupInfo, 0 ); |
84 | 0 | LOAD_OCL_FUNC( clGetPlatformIDs, 0 ); |
85 | 0 | LOAD_OCL_FUNC( clGetProgramBuildInfo, 0 ); |
86 | 0 | LOAD_OCL_FUNC( clGetProgramInfo, 0 ); |
87 | 0 | LOAD_OCL_FUNC( clGetSupportedImageFormats, 0 ); |
88 | 0 | LOAD_OCL_FUNC( clReleaseCommandQueue, 0 ); |
89 | 0 | LOAD_OCL_FUNC( clReleaseContext, 0 ); |
90 | 0 | LOAD_OCL_FUNC( clReleaseKernel, 0 ); |
91 | 0 | LOAD_OCL_FUNC( clReleaseMemObject, 0 ); |
92 | 0 | LOAD_OCL_FUNC( clReleaseProgram, 0 ); |
93 | 0 | LOAD_OCL_FUNC( clSetKernelArg, 0 ); |
94 | 0 | return ocl; |
95 | 0 | #undef fail |
96 | 0 | fail2: |
97 | 0 | ocl_close( ocl->library ); |
98 | 0 | fail1: |
99 | 0 | x264_free( ocl ); |
100 | 0 | fail0: |
101 | 0 | return NULL; |
102 | 0 | } |
103 | | |
104 | | void x264_opencl_close_library( x264_opencl_function_t *ocl ) |
105 | 0 | { |
106 | 0 | if( !ocl ) |
107 | 0 | return; |
108 | 0 | ocl_close( ocl->library ); |
109 | 0 | x264_free( ocl ); |
110 | 0 | } |
111 | | |
112 | | /* define from recent cl_ext.h, copied here in case headers are old */ |
113 | 0 | #define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042 |
114 | | |
115 | | /* Requires full include path in case of out-of-tree builds */ |
116 | | #include "common/oclobj.h" |
117 | | |
118 | | static int detect_switchable_graphics( void ); |
119 | | |
120 | | /* Try to load the cached compiled program binary, verify the device context is |
121 | | * still valid before reuse */ |
122 | | static cl_program opencl_cache_load( x264_t *h, const char *dev_name, const char *dev_vendor, const char *driver_version ) |
123 | 0 | { |
124 | | /* try to load cached program binary */ |
125 | 0 | FILE *fp = x264_fopen( h->param.psz_clbin_file, "rb" ); |
126 | 0 | if( !fp ) |
127 | 0 | return NULL; |
128 | | |
129 | 0 | x264_opencl_function_t *ocl = h->opencl.ocl; |
130 | 0 | cl_program program = NULL; |
131 | 0 | uint8_t *binary = NULL; |
132 | |
|
133 | 0 | fseek( fp, 0, SEEK_END ); |
134 | 0 | int64_t file_size = ftell( fp ); |
135 | 0 | fseek( fp, 0, SEEK_SET ); |
136 | 0 | if( file_size < 0 || (uint64_t)file_size > SIZE_MAX ) |
137 | 0 | goto fail; |
138 | 0 | size_t size = file_size; |
139 | 0 | CHECKED_MALLOC( binary, size ); |
140 | | |
141 | 0 | if( fread( binary, 1, size, fp ) != size ) |
142 | 0 | goto fail; |
143 | 0 | const uint8_t *ptr = (const uint8_t*)binary; |
144 | |
|
145 | 0 | #define CHECK_STRING( STR )\ |
146 | 0 | do {\ |
147 | 0 | size_t len = strlen( STR );\ |
148 | 0 | if( size <= len || strncmp( (char*)ptr, STR, len ) )\ |
149 | 0 | goto fail;\ |
150 | 0 | else {\ |
151 | 0 | size -= (len+1); ptr += (len+1);\ |
152 | 0 | }\ |
153 | 0 | } while( 0 ) |
154 | |
|
155 | 0 | CHECK_STRING( dev_name ); |
156 | 0 | CHECK_STRING( dev_vendor ); |
157 | 0 | CHECK_STRING( driver_version ); |
158 | 0 | CHECK_STRING( x264_opencl_source_hash ); |
159 | 0 | #undef CHECK_STRING |
160 | | |
161 | 0 | cl_int status; |
162 | 0 | program = ocl->clCreateProgramWithBinary( h->opencl.context, 1, &h->opencl.device, &size, &ptr, NULL, &status ); |
163 | 0 | if( status != CL_SUCCESS ) |
164 | 0 | program = NULL; |
165 | |
|
166 | 0 | fail: |
167 | 0 | fclose( fp ); |
168 | 0 | x264_free( binary ); |
169 | 0 | return program; |
170 | 0 | } |
171 | | |
172 | | /* Save the compiled program binary to a file for later reuse. Device context |
173 | | * is also saved in the cache file so we do not reuse stale binaries */ |
174 | | static void opencl_cache_save( x264_t *h, cl_program program, const char *dev_name, const char *dev_vendor, const char *driver_version ) |
175 | 0 | { |
176 | 0 | FILE *fp = x264_fopen( h->param.psz_clbin_file, "wb" ); |
177 | 0 | if( !fp ) |
178 | 0 | { |
179 | 0 | x264_log( h, X264_LOG_INFO, "OpenCL: unable to open clbin file for write\n" ); |
180 | 0 | return; |
181 | 0 | } |
182 | | |
183 | 0 | x264_opencl_function_t *ocl = h->opencl.ocl; |
184 | 0 | uint8_t *binary = NULL; |
185 | |
|
186 | 0 | size_t size = 0; |
187 | 0 | cl_int status = ocl->clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL ); |
188 | 0 | if( status != CL_SUCCESS || !size ) |
189 | 0 | { |
190 | 0 | x264_log( h, X264_LOG_INFO, "OpenCL: Unable to query program binary size, no cache file generated\n" ); |
191 | 0 | goto fail; |
192 | 0 | } |
193 | | |
194 | 0 | CHECKED_MALLOC( binary, size ); |
195 | 0 | status = ocl->clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &binary, NULL ); |
196 | 0 | if( status != CL_SUCCESS ) |
197 | 0 | { |
198 | 0 | x264_log( h, X264_LOG_INFO, "OpenCL: Unable to query program binary, no cache file generated\n" ); |
199 | 0 | goto fail; |
200 | 0 | } |
201 | | |
202 | 0 | fputs( dev_name, fp ); |
203 | 0 | fputc( '\n', fp ); |
204 | 0 | fputs( dev_vendor, fp ); |
205 | 0 | fputc( '\n', fp ); |
206 | 0 | fputs( driver_version, fp ); |
207 | 0 | fputc( '\n', fp ); |
208 | 0 | fputs( x264_opencl_source_hash, fp ); |
209 | 0 | fputc( '\n', fp ); |
210 | 0 | fwrite( binary, 1, size, fp ); |
211 | |
|
212 | 0 | fail: |
213 | 0 | fclose( fp ); |
214 | 0 | x264_free( binary ); |
215 | 0 | return; |
216 | 0 | } |
217 | | |
218 | | /* The OpenCL source under common/opencl will be merged into common/oclobj.h by |
219 | | * the Makefile. It defines a x264_opencl_source byte array which we will pass |
220 | | * to clCreateProgramWithSource(). We also attempt to use a cache file for the |
221 | | * compiled binary, stored in the current working folder. */ |
222 | | static cl_program opencl_compile( x264_t *h ) |
223 | 0 | { |
224 | 0 | x264_opencl_function_t *ocl = h->opencl.ocl; |
225 | 0 | cl_program program = NULL; |
226 | 0 | char *build_log = NULL; |
227 | |
|
228 | 0 | char dev_name[64]; |
229 | 0 | char dev_vendor[64]; |
230 | 0 | char driver_version[64]; |
231 | 0 | cl_int status; |
232 | 0 | status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME, sizeof(dev_name), dev_name, NULL ); |
233 | 0 | status |= ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_VENDOR, sizeof(dev_vendor), dev_vendor, NULL ); |
234 | 0 | status |= ocl->clGetDeviceInfo( h->opencl.device, CL_DRIVER_VERSION, sizeof(driver_version), driver_version, NULL ); |
235 | 0 | if( status != CL_SUCCESS ) |
236 | 0 | return NULL; |
237 | | |
238 | | // Most AMD GPUs have vector registers |
239 | 0 | int vectorize = !strcmp( dev_vendor, "Advanced Micro Devices, Inc." ); |
240 | 0 | h->opencl.b_device_AMD_SI = 0; |
241 | |
|
242 | 0 | if( vectorize ) |
243 | 0 | { |
244 | | /* Disable OpenCL on Intel/AMD switchable graphics devices */ |
245 | 0 | if( detect_switchable_graphics() ) |
246 | 0 | { |
247 | 0 | x264_log( h, X264_LOG_INFO, "OpenCL acceleration disabled, switchable graphics detected\n" ); |
248 | 0 | return NULL; |
249 | 0 | } |
250 | | |
251 | | /* Detect AMD SouthernIsland or newer device (single-width registers) */ |
252 | 0 | cl_uint simdwidth = 4; |
253 | 0 | status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, sizeof(cl_uint), &simdwidth, NULL ); |
254 | 0 | if( status == CL_SUCCESS && simdwidth == 1 ) |
255 | 0 | { |
256 | 0 | vectorize = 0; |
257 | 0 | h->opencl.b_device_AMD_SI = 1; |
258 | 0 | } |
259 | 0 | } |
260 | | |
261 | 0 | x264_log( h, X264_LOG_INFO, "OpenCL acceleration enabled with %s %s %s\n", dev_vendor, dev_name, h->opencl.b_device_AMD_SI ? "(SI)" : "" ); |
262 | |
|
263 | 0 | program = opencl_cache_load( h, dev_name, dev_vendor, driver_version ); |
264 | 0 | if( !program ) |
265 | 0 | { |
266 | | /* clCreateProgramWithSource() requires a pointer variable, you cannot just use &x264_opencl_source */ |
267 | 0 | x264_log( h, X264_LOG_INFO, "Compiling OpenCL kernels...\n" ); |
268 | 0 | const char *strptr = (const char*)x264_opencl_source; |
269 | 0 | size_t size = sizeof(x264_opencl_source); |
270 | 0 | program = ocl->clCreateProgramWithSource( h->opencl.context, 1, &strptr, &size, &status ); |
271 | 0 | if( status != CL_SUCCESS || !program ) |
272 | 0 | { |
273 | 0 | x264_log( h, X264_LOG_WARNING, "OpenCL: unable to create program\n" ); |
274 | 0 | return NULL; |
275 | 0 | } |
276 | 0 | } |
277 | | |
278 | | /* Build the program binary for the OpenCL device */ |
279 | 0 | const char *buildopts = vectorize ? "-DVECTORIZE=1" : ""; |
280 | 0 | status = ocl->clBuildProgram( program, 1, &h->opencl.device, buildopts, NULL, NULL ); |
281 | 0 | if( status == CL_SUCCESS ) |
282 | 0 | { |
283 | 0 | opencl_cache_save( h, program, dev_name, dev_vendor, driver_version ); |
284 | 0 | return program; |
285 | 0 | } |
286 | | |
287 | | /* Compile failure, should not happen with production code. */ |
288 | | |
289 | 0 | size_t build_log_len = 0; |
290 | 0 | status = ocl->clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_len ); |
291 | 0 | if( status != CL_SUCCESS || !build_log_len ) |
292 | 0 | { |
293 | 0 | x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to query build log\n" ); |
294 | 0 | goto fail; |
295 | 0 | } |
296 | | |
297 | 0 | build_log = x264_malloc( build_log_len ); |
298 | 0 | if( !build_log ) |
299 | 0 | { |
300 | 0 | x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to alloc build log\n" ); |
301 | 0 | goto fail; |
302 | 0 | } |
303 | | |
304 | 0 | status = ocl->clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, build_log_len, build_log, NULL ); |
305 | 0 | if( status != CL_SUCCESS ) |
306 | 0 | { |
307 | 0 | x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to get build log\n" ); |
308 | 0 | goto fail; |
309 | 0 | } |
310 | | |
311 | 0 | FILE *log_file = x264_fopen( "x264_kernel_build_log.txt", "w" ); |
312 | 0 | if( !log_file ) |
313 | 0 | { |
314 | 0 | x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to create file x264_kernel_build_log.txt\n" ); |
315 | 0 | goto fail; |
316 | 0 | } |
317 | 0 | fwrite( build_log, 1, build_log_len, log_file ); |
318 | 0 | fclose( log_file ); |
319 | 0 | x264_log( h, X264_LOG_WARNING, "OpenCL: kernel build errors written to x264_kernel_build_log.txt\n" ); |
320 | |
|
321 | 0 | fail: |
322 | 0 | x264_free( build_log ); |
323 | 0 | if( program ) |
324 | 0 | ocl->clReleaseProgram( program ); |
325 | 0 | return NULL; |
326 | 0 | } |
327 | | |
328 | | static int opencl_lookahead_alloc( x264_t *h ) |
329 | 0 | { |
330 | 0 | if( !h->param.rc.i_lookahead ) |
331 | 0 | return -1; |
332 | | |
333 | 0 | static const char *kernelnames[] = { |
334 | 0 | "mb_intra_cost_satd_8x8", |
335 | 0 | "sum_intra_cost", |
336 | 0 | "downscale_hpel", |
337 | 0 | "downscale1", |
338 | 0 | "downscale2", |
339 | 0 | "memset_int16", |
340 | 0 | "weightp_scaled_images", |
341 | 0 | "weightp_hpel", |
342 | 0 | "hierarchical_motion", |
343 | 0 | "subpel_refine", |
344 | 0 | "mode_selection", |
345 | 0 | "sum_inter_cost" |
346 | 0 | }; |
347 | |
|
348 | 0 | cl_kernel *kernels[] = { |
349 | 0 | &h->opencl.intra_kernel, |
350 | 0 | &h->opencl.rowsum_intra_kernel, |
351 | 0 | &h->opencl.downscale_hpel_kernel, |
352 | 0 | &h->opencl.downscale_kernel1, |
353 | 0 | &h->opencl.downscale_kernel2, |
354 | 0 | &h->opencl.memset_kernel, |
355 | 0 | &h->opencl.weightp_scaled_images_kernel, |
356 | 0 | &h->opencl.weightp_hpel_kernel, |
357 | 0 | &h->opencl.hme_kernel, |
358 | 0 | &h->opencl.subpel_refine_kernel, |
359 | 0 | &h->opencl.mode_select_kernel, |
360 | 0 | &h->opencl.rowsum_inter_kernel |
361 | 0 | }; |
362 | |
|
363 | 0 | x264_opencl_function_t *ocl = h->opencl.ocl; |
364 | 0 | cl_int status; |
365 | |
|
366 | 0 | h->opencl.lookahead_program = opencl_compile( h ); |
367 | 0 | if( !h->opencl.lookahead_program ) |
368 | 0 | goto fail; |
369 | | |
370 | 0 | for( int i = 0; i < ARRAY_ELEMS(kernelnames); i++ ) |
371 | 0 | { |
372 | 0 | *kernels[i] = ocl->clCreateKernel( h->opencl.lookahead_program, kernelnames[i], &status ); |
373 | 0 | if( status != CL_SUCCESS ) |
374 | 0 | { |
375 | 0 | x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to compile kernel '%s' (%d)\n", kernelnames[i], status ); |
376 | 0 | goto fail; |
377 | 0 | } |
378 | 0 | } |
379 | | |
380 | 0 | h->opencl.page_locked_buffer = ocl->clCreateBuffer( h->opencl.context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, PAGE_LOCKED_BUF_SIZE, NULL, &status ); |
381 | 0 | if( status != CL_SUCCESS ) |
382 | 0 | { |
383 | 0 | x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to allocate page-locked buffer, error '%d'\n", status ); |
384 | 0 | goto fail; |
385 | 0 | } |
386 | 0 | h->opencl.page_locked_ptr = ocl->clEnqueueMapBuffer( h->opencl.queue, h->opencl.page_locked_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, |
387 | 0 | 0, PAGE_LOCKED_BUF_SIZE, 0, NULL, NULL, &status ); |
388 | 0 | if( status != CL_SUCCESS ) |
389 | 0 | { |
390 | 0 | x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to map page-locked buffer, error '%d'\n", status ); |
391 | 0 | goto fail; |
392 | 0 | } |
393 | | |
394 | 0 | return 0; |
395 | 0 | fail: |
396 | 0 | x264_opencl_lookahead_delete( h ); |
397 | 0 | return -1; |
398 | 0 | } |
399 | | |
400 | | static void CL_CALLBACK opencl_error_notify( const char *errinfo, const void *private_info, size_t cb, void *user_data ) |
401 | 0 | { |
402 | | /* Any error notification can be assumed to be fatal to the OpenCL context. |
403 | | * We need to stop using it immediately to prevent further damage. */ |
404 | 0 | x264_t *h = (x264_t*)user_data; |
405 | 0 | h->param.b_opencl = 0; |
406 | 0 | h->opencl.b_fatal_error = 1; |
407 | 0 | x264_log( h, X264_LOG_ERROR, "OpenCL: %s\n", errinfo ); |
408 | 0 | x264_log( h, X264_LOG_ERROR, "OpenCL: fatal error, aborting encode\n" ); |
409 | 0 | } |
410 | | |
411 | | int x264_opencl_lookahead_init( x264_t *h ) |
412 | 0 | { |
413 | 0 | x264_opencl_function_t *ocl = h->opencl.ocl; |
414 | 0 | cl_platform_id *platforms = NULL; |
415 | 0 | cl_device_id *devices = NULL; |
416 | 0 | cl_image_format *imageType = NULL; |
417 | 0 | cl_context context = NULL; |
418 | 0 | int ret = -1; |
419 | |
|
420 | 0 | cl_uint numPlatforms = 0; |
421 | 0 | cl_int status = ocl->clGetPlatformIDs( 0, NULL, &numPlatforms ); |
422 | 0 | if( status != CL_SUCCESS || !numPlatforms ) |
423 | 0 | { |
424 | 0 | x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to query installed platforms\n" ); |
425 | 0 | goto fail; |
426 | 0 | } |
427 | 0 | platforms = (cl_platform_id*)x264_malloc( sizeof(cl_platform_id) * numPlatforms ); |
428 | 0 | if( !platforms ) |
429 | 0 | { |
430 | 0 | x264_log( h, X264_LOG_WARNING, "OpenCL: malloc of installed platforms buffer failed\n" ); |
431 | 0 | goto fail; |
432 | 0 | } |
433 | 0 | status = ocl->clGetPlatformIDs( numPlatforms, platforms, NULL ); |
434 | 0 | if( status != CL_SUCCESS ) |
435 | 0 | { |
436 | 0 | x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to query installed platforms\n" ); |
437 | 0 | goto fail; |
438 | 0 | } |
439 | | |
440 | | /* Select the first OpenCL platform with a GPU device that supports our |
441 | | * required image (texture) formats */ |
442 | 0 | for( cl_uint i = 0; i < numPlatforms; i++ ) |
443 | 0 | { |
444 | 0 | cl_uint gpu_count = 0; |
445 | 0 | status = ocl->clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &gpu_count ); |
446 | 0 | if( status != CL_SUCCESS || !gpu_count ) |
447 | 0 | continue; |
448 | | |
449 | 0 | x264_free( devices ); |
450 | 0 | devices = x264_malloc( sizeof(cl_device_id) * gpu_count ); |
451 | 0 | if( !devices ) |
452 | 0 | continue; |
453 | | |
454 | 0 | status = ocl->clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, gpu_count, devices, NULL ); |
455 | 0 | if( status != CL_SUCCESS ) |
456 | 0 | continue; |
457 | | |
458 | | /* Find a GPU device that supports our image formats */ |
459 | 0 | for( cl_uint gpu = 0; gpu < gpu_count; gpu++ ) |
460 | 0 | { |
461 | 0 | h->opencl.device = devices[gpu]; |
462 | | |
463 | | /* if the user has specified an exact device ID, skip all other |
464 | | * GPUs. If this device matches, allow it to continue through the |
465 | | * checks for supported images, etc. */ |
466 | 0 | if( h->param.opencl_device_id && devices[gpu] != (cl_device_id)h->param.opencl_device_id ) |
467 | 0 | continue; |
468 | | |
469 | 0 | cl_bool image_support = 0; |
470 | 0 | status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &image_support, NULL ); |
471 | 0 | if( status != CL_SUCCESS || !image_support ) |
472 | 0 | continue; |
473 | | |
474 | 0 | if( context ) |
475 | 0 | ocl->clReleaseContext( context ); |
476 | 0 | context = ocl->clCreateContext( NULL, 1, &h->opencl.device, (void*)opencl_error_notify, (void*)h, &status ); |
477 | 0 | if( status != CL_SUCCESS || !context ) |
478 | 0 | continue; |
479 | | |
480 | 0 | cl_uint imagecount = 0; |
481 | 0 | status = ocl->clGetSupportedImageFormats( context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, 0, NULL, &imagecount ); |
482 | 0 | if( status != CL_SUCCESS || !imagecount ) |
483 | 0 | continue; |
484 | | |
485 | 0 | x264_free( imageType ); |
486 | 0 | imageType = x264_malloc( sizeof(cl_image_format) * imagecount ); |
487 | 0 | if( !imageType ) |
488 | 0 | continue; |
489 | | |
490 | 0 | status = ocl->clGetSupportedImageFormats( context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, imagecount, imageType, NULL ); |
491 | 0 | if( status != CL_SUCCESS ) |
492 | 0 | continue; |
493 | | |
494 | 0 | int b_has_r = 0; |
495 | 0 | int b_has_rgba = 0; |
496 | 0 | for( cl_uint j = 0; j < imagecount; j++ ) |
497 | 0 | { |
498 | 0 | if( imageType[j].image_channel_order == CL_R && |
499 | 0 | imageType[j].image_channel_data_type == CL_UNSIGNED_INT32 ) |
500 | 0 | b_has_r = 1; |
501 | 0 | else if( imageType[j].image_channel_order == CL_RGBA && |
502 | 0 | imageType[j].image_channel_data_type == CL_UNSIGNED_INT8 ) |
503 | 0 | b_has_rgba = 1; |
504 | 0 | } |
505 | 0 | if( !b_has_r || !b_has_rgba ) |
506 | 0 | { |
507 | 0 | char dev_name[64]; |
508 | 0 | status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME, sizeof(dev_name), dev_name, NULL ); |
509 | 0 | if( status == CL_SUCCESS ) |
510 | 0 | { |
511 | | /* emit warning if we are discarding the user's explicit choice */ |
512 | 0 | int level = h->param.opencl_device_id ? X264_LOG_WARNING : X264_LOG_DEBUG; |
513 | 0 | x264_log( h, level, "OpenCL: %s does not support required image formats\n", dev_name ); |
514 | 0 | } |
515 | 0 | continue; |
516 | 0 | } |
517 | | |
518 | | /* user selection of GPU device, skip N first matches */ |
519 | 0 | if( h->param.i_opencl_device ) |
520 | 0 | { |
521 | 0 | h->param.i_opencl_device--; |
522 | 0 | continue; |
523 | 0 | } |
524 | | |
525 | 0 | h->opencl.queue = ocl->clCreateCommandQueue( context, h->opencl.device, 0, &status ); |
526 | 0 | if( status != CL_SUCCESS || !h->opencl.queue ) |
527 | 0 | continue; |
528 | | |
529 | 0 | h->opencl.context = context; |
530 | 0 | context = NULL; |
531 | |
|
532 | 0 | ret = 0; |
533 | 0 | break; |
534 | 0 | } |
535 | |
|
536 | 0 | if( !ret ) |
537 | 0 | break; |
538 | 0 | } |
539 | |
|
540 | 0 | if( !h->param.psz_clbin_file ) |
541 | 0 | h->param.psz_clbin_file = "x264_lookahead.clbin"; |
542 | |
|
543 | 0 | if( ret ) |
544 | 0 | x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to find a compatible device\n" ); |
545 | 0 | else |
546 | 0 | ret = opencl_lookahead_alloc( h ); |
547 | |
|
548 | 0 | fail: |
549 | 0 | if( context ) |
550 | 0 | ocl->clReleaseContext( context ); |
551 | 0 | x264_free( imageType ); |
552 | 0 | x264_free( devices ); |
553 | 0 | x264_free( platforms ); |
554 | 0 | return ret; |
555 | 0 | } |
556 | | |
557 | | static void opencl_lookahead_free( x264_t *h ) |
558 | 0 | { |
559 | 0 | x264_opencl_function_t *ocl = h->opencl.ocl; |
560 | |
|
561 | 0 | #define RELEASE( a, f ) do { if( a ) { ocl->f( a ); a = NULL; } } while( 0 ) |
562 | 0 | RELEASE( h->opencl.downscale_hpel_kernel, clReleaseKernel ); |
563 | 0 | RELEASE( h->opencl.downscale_kernel1, clReleaseKernel ); |
564 | 0 | RELEASE( h->opencl.downscale_kernel2, clReleaseKernel ); |
565 | 0 | RELEASE( h->opencl.weightp_hpel_kernel, clReleaseKernel ); |
566 | 0 | RELEASE( h->opencl.weightp_scaled_images_kernel, clReleaseKernel ); |
567 | 0 | RELEASE( h->opencl.memset_kernel, clReleaseKernel ); |
568 | 0 | RELEASE( h->opencl.intra_kernel, clReleaseKernel ); |
569 | 0 | RELEASE( h->opencl.rowsum_intra_kernel, clReleaseKernel ); |
570 | 0 | RELEASE( h->opencl.hme_kernel, clReleaseKernel ); |
571 | 0 | RELEASE( h->opencl.subpel_refine_kernel, clReleaseKernel ); |
572 | 0 | RELEASE( h->opencl.mode_select_kernel, clReleaseKernel ); |
573 | 0 | RELEASE( h->opencl.rowsum_inter_kernel, clReleaseKernel ); |
574 | |
|
575 | 0 | RELEASE( h->opencl.lookahead_program, clReleaseProgram ); |
576 | |
|
577 | 0 | RELEASE( h->opencl.page_locked_buffer, clReleaseMemObject ); |
578 | 0 | RELEASE( h->opencl.luma_16x16_image[0], clReleaseMemObject ); |
579 | 0 | RELEASE( h->opencl.luma_16x16_image[1], clReleaseMemObject ); |
580 | 0 | for( int i = 0; i < NUM_IMAGE_SCALES; i++ ) |
581 | 0 | RELEASE( h->opencl.weighted_scaled_images[i], clReleaseMemObject ); |
582 | 0 | RELEASE( h->opencl.weighted_luma_hpel, clReleaseMemObject ); |
583 | 0 | RELEASE( h->opencl.row_satds[0], clReleaseMemObject ); |
584 | 0 | RELEASE( h->opencl.row_satds[1], clReleaseMemObject ); |
585 | 0 | RELEASE( h->opencl.mv_buffers[0], clReleaseMemObject ); |
586 | 0 | RELEASE( h->opencl.mv_buffers[1], clReleaseMemObject ); |
587 | 0 | RELEASE( h->opencl.lowres_mv_costs, clReleaseMemObject ); |
588 | 0 | RELEASE( h->opencl.mvp_buffer, clReleaseMemObject ); |
589 | 0 | RELEASE( h->opencl.lowres_costs[0], clReleaseMemObject ); |
590 | 0 | RELEASE( h->opencl.lowres_costs[1], clReleaseMemObject ); |
591 | 0 | RELEASE( h->opencl.frame_stats[0], clReleaseMemObject ); |
592 | 0 | RELEASE( h->opencl.frame_stats[1], clReleaseMemObject ); |
593 | 0 | #undef RELEASE |
594 | 0 | } |
595 | | |
596 | | void x264_opencl_lookahead_delete( x264_t *h ) |
597 | 0 | { |
598 | 0 | x264_opencl_function_t *ocl = h->opencl.ocl; |
599 | |
|
600 | 0 | if( !ocl ) |
601 | 0 | return; |
602 | | |
603 | 0 | if( h->opencl.queue ) |
604 | 0 | ocl->clFinish( h->opencl.queue ); |
605 | |
|
606 | 0 | opencl_lookahead_free( h ); |
607 | |
|
608 | 0 | if( h->opencl.queue ) |
609 | 0 | { |
610 | 0 | ocl->clReleaseCommandQueue( h->opencl.queue ); |
611 | 0 | h->opencl.queue = NULL; |
612 | 0 | } |
613 | 0 | if( h->opencl.context ) |
614 | 0 | { |
615 | 0 | ocl->clReleaseContext( h->opencl.context ); |
616 | 0 | h->opencl.context = NULL; |
617 | 0 | } |
618 | 0 | } |
619 | | |
620 | | void x264_opencl_frame_delete( x264_frame_t *frame ) |
621 | 0 | { |
622 | 0 | x264_opencl_function_t *ocl = frame->opencl.ocl; |
623 | |
|
624 | 0 | if( !ocl ) |
625 | 0 | return; |
626 | | |
627 | 0 | #define RELEASEBUF(mem) do { if( mem ) { ocl->clReleaseMemObject( mem ); mem = NULL; } } while( 0 ) |
628 | 0 | for( int j = 0; j < NUM_IMAGE_SCALES; j++ ) |
629 | 0 | RELEASEBUF( frame->opencl.scaled_image2Ds[j] ); |
630 | 0 | RELEASEBUF( frame->opencl.luma_hpel ); |
631 | 0 | RELEASEBUF( frame->opencl.inv_qscale_factor ); |
632 | 0 | RELEASEBUF( frame->opencl.intra_cost ); |
633 | 0 | RELEASEBUF( frame->opencl.lowres_mvs0 ); |
634 | 0 | RELEASEBUF( frame->opencl.lowres_mvs1 ); |
635 | 0 | RELEASEBUF( frame->opencl.lowres_mv_costs0 ); |
636 | 0 | RELEASEBUF( frame->opencl.lowres_mv_costs1 ); |
637 | 0 | #undef RELEASEBUF |
638 | 0 | } |
639 | | |
640 | | /* OpenCL misbehaves on hybrid laptops with Intel iGPU and AMD dGPU, so |
641 | | * we consult AMD's ADL interface to detect this situation and disable |
642 | | * OpenCL on these machines (Linux and Windows) */ |
643 | | #ifdef _WIN32 |
644 | | #define ADL_API_CALL |
645 | | #define ADL_CALLBACK __stdcall |
646 | | #define adl_close FreeLibrary |
647 | | #define adl_address GetProcAddress |
648 | | #else |
649 | | #define ADL_API_CALL |
650 | | #define ADL_CALLBACK |
651 | 0 | #define adl_close dlclose |
652 | 0 | #define adl_address dlsym |
653 | | #endif |
654 | | |
655 | | typedef void* ( ADL_CALLBACK *ADL_MAIN_MALLOC_CALLBACK )( int ); |
656 | | typedef int ( ADL_API_CALL *ADL_MAIN_CONTROL_CREATE )( ADL_MAIN_MALLOC_CALLBACK, int ); |
657 | | typedef int ( ADL_API_CALL *ADL_ADAPTER_NUMBEROFADAPTERS_GET )( int * ); |
658 | | typedef int ( ADL_API_CALL *ADL_POWERXPRESS_SCHEME_GET )( int, int *, int *, int * ); |
659 | | typedef int ( ADL_API_CALL *ADL_MAIN_CONTROL_DESTROY )( void ); |
660 | | |
661 | 0 | #define ADL_OK 0 |
662 | 0 | #define ADL_PX_SCHEME_DYNAMIC 2 |
663 | | |
664 | | static void* ADL_CALLBACK adl_malloc_wrapper( int iSize ) |
665 | 0 | { |
666 | 0 | return x264_malloc( iSize ); |
667 | 0 | } |
668 | | |
669 | | static int detect_switchable_graphics( void ) |
670 | 0 | { |
671 | 0 | void *hDLL; |
672 | 0 | ADL_MAIN_CONTROL_CREATE ADL_Main_Control_Create; |
673 | 0 | ADL_ADAPTER_NUMBEROFADAPTERS_GET ADL_Adapter_NumberOfAdapters_Get; |
674 | 0 | ADL_POWERXPRESS_SCHEME_GET ADL_PowerXpress_Scheme_Get; |
675 | 0 | ADL_MAIN_CONTROL_DESTROY ADL_Main_Control_Destroy; |
676 | 0 | int ret = 0; |
677 | |
|
678 | | #ifdef _WIN32 |
679 | | hDLL = LoadLibraryW( L"atiadlxx.dll" ); |
680 | | if( !hDLL ) |
681 | | hDLL = LoadLibraryW( L"atiadlxy.dll" ); |
682 | | #else |
683 | 0 | hDLL = dlopen( "libatiadlxx.so", RTLD_LAZY|RTLD_GLOBAL ); |
684 | 0 | #endif |
685 | 0 | if( !hDLL ) |
686 | 0 | goto fail0; |
687 | | |
688 | 0 | ADL_Main_Control_Create = (ADL_MAIN_CONTROL_CREATE)adl_address(hDLL, "ADL_Main_Control_Create"); |
689 | 0 | ADL_Main_Control_Destroy = (ADL_MAIN_CONTROL_DESTROY)adl_address(hDLL, "ADL_Main_Control_Destroy"); |
690 | 0 | ADL_Adapter_NumberOfAdapters_Get = (ADL_ADAPTER_NUMBEROFADAPTERS_GET)adl_address(hDLL, "ADL_Adapter_NumberOfAdapters_Get"); |
691 | 0 | ADL_PowerXpress_Scheme_Get = (ADL_POWERXPRESS_SCHEME_GET)adl_address(hDLL, "ADL_PowerXpress_Scheme_Get"); |
692 | 0 | if( !ADL_Main_Control_Create || !ADL_Main_Control_Destroy || !ADL_Adapter_NumberOfAdapters_Get || |
693 | 0 | !ADL_PowerXpress_Scheme_Get ) |
694 | 0 | goto fail1; |
695 | | |
696 | 0 | if( ADL_OK != ADL_Main_Control_Create( adl_malloc_wrapper, 1 ) ) |
697 | 0 | goto fail1; |
698 | | |
699 | 0 | int numAdapters = 0; |
700 | 0 | if( ADL_OK != ADL_Adapter_NumberOfAdapters_Get( &numAdapters ) ) |
701 | 0 | goto fail2; |
702 | | |
703 | 0 | for( int i = 0; i < numAdapters; i++ ) |
704 | 0 | { |
705 | 0 | int PXSchemeRange, PXSchemeCurrentState, PXSchemeDefaultState; |
706 | 0 | if( ADL_OK != ADL_PowerXpress_Scheme_Get( i, &PXSchemeRange, &PXSchemeCurrentState, &PXSchemeDefaultState) ) |
707 | 0 | break; |
708 | | |
709 | 0 | if( PXSchemeRange >= ADL_PX_SCHEME_DYNAMIC ) |
710 | 0 | { |
711 | 0 | ret = 1; |
712 | 0 | break; |
713 | 0 | } |
714 | 0 | } |
715 | |
|
716 | 0 | fail2: |
717 | 0 | ADL_Main_Control_Destroy(); |
718 | 0 | fail1: |
719 | 0 | adl_close( hDLL ); |
720 | 0 | fail0: |
721 | 0 | return ret; |
722 | 0 | } |