Coverage Report

Created: 2026-05-16 06:28

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/x264/common/opencl.c
Line
Count
Source
1
/*****************************************************************************
2
 * opencl.c: OpenCL initialization and kernel compilation
3
 *****************************************************************************
4
 * Copyright (C) 2012-2025 x264 project
5
 *
6
 * Authors: Steve Borho <sborho@multicorewareinc.com>
7
 *          Anton Mitrofanov <BugMaster@narod.ru>
8
 *
9
 * This program is free software; you can redistribute it and/or modify
10
 * it under the terms of the GNU General Public License as published by
11
 * the Free Software Foundation; either version 2 of the License, or
12
 * (at your option) any later version.
13
 *
14
 * This program is distributed in the hope that it will be useful,
15
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
 * GNU General Public License for more details.
18
 *
19
 * You should have received a copy of the GNU General Public License
20
 * along with this program; if not, write to the Free Software
21
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22
 *
23
 * This program is also available under a commercial proprietary license.
24
 * For more information, contact us at licensing@x264.com.
25
 *****************************************************************************/
26
27
#include "common.h"
28
29
#ifdef _WIN32
30
#include <windows.h>
31
#define ocl_open LoadLibraryW( L"OpenCL" )
32
#define ocl_close FreeLibrary
33
#define ocl_address GetProcAddress
34
#else
35
#include <dlfcn.h> //dlopen, dlsym, dlclose
36
#if SYS_MACOSX
37
#define ocl_open dlopen( "/System/Library/Frameworks/OpenCL.framework/OpenCL", RTLD_NOW )
38
#else
39
0
#define ocl_open dlopen( "libOpenCL.so", RTLD_NOW )
40
#endif
41
0
#define ocl_close dlclose
42
0
#define ocl_address dlsym
43
#endif
44
45
0
#define LOAD_OCL_FUNC(name, continue_on_fail)\
46
0
{\
47
0
    ocl->name = (void*)ocl_address( ocl->library, #name );\
48
0
    if( !continue_on_fail && !ocl->name )\
49
0
        goto fail;\
50
0
}
51
52
/* load the library and functions we require from it */
53
x264_opencl_function_t *x264_opencl_load_library( void )
54
0
{
55
0
    x264_opencl_function_t *ocl;
56
0
#undef fail
57
0
#define fail fail0
58
0
    CHECKED_MALLOCZERO( ocl, sizeof(x264_opencl_function_t) );
59
0
#undef fail
60
0
#define fail fail1
61
0
    ocl->library = ocl_open;
62
0
    if( !ocl->library )
63
0
        goto fail;
64
0
#undef fail
65
0
#define fail fail2
66
0
    LOAD_OCL_FUNC( clBuildProgram, 0 );
67
0
    LOAD_OCL_FUNC( clCreateBuffer, 0 );
68
0
    LOAD_OCL_FUNC( clCreateCommandQueue, 0 );
69
0
    LOAD_OCL_FUNC( clCreateContext, 0 );
70
0
    LOAD_OCL_FUNC( clCreateImage2D, 0 );
71
0
    LOAD_OCL_FUNC( clCreateKernel, 0 );
72
0
    LOAD_OCL_FUNC( clCreateProgramWithBinary, 0 );
73
0
    LOAD_OCL_FUNC( clCreateProgramWithSource, 0 );
74
0
    LOAD_OCL_FUNC( clEnqueueCopyBuffer, 0 );
75
0
    LOAD_OCL_FUNC( clEnqueueMapBuffer, 0 );
76
0
    LOAD_OCL_FUNC( clEnqueueNDRangeKernel, 0 );
77
0
    LOAD_OCL_FUNC( clEnqueueReadBuffer, 0 );
78
0
    LOAD_OCL_FUNC( clEnqueueWriteBuffer, 0 );
79
0
    LOAD_OCL_FUNC( clFinish, 0 );
80
0
    LOAD_OCL_FUNC( clGetCommandQueueInfo, 0 );
81
0
    LOAD_OCL_FUNC( clGetDeviceIDs, 0 );
82
0
    LOAD_OCL_FUNC( clGetDeviceInfo, 0 );
83
0
    LOAD_OCL_FUNC( clGetKernelWorkGroupInfo, 0 );
84
0
    LOAD_OCL_FUNC( clGetPlatformIDs, 0 );
85
0
    LOAD_OCL_FUNC( clGetProgramBuildInfo, 0 );
86
0
    LOAD_OCL_FUNC( clGetProgramInfo, 0 );
87
0
    LOAD_OCL_FUNC( clGetSupportedImageFormats, 0 );
88
0
    LOAD_OCL_FUNC( clReleaseCommandQueue, 0 );
89
0
    LOAD_OCL_FUNC( clReleaseContext, 0 );
90
0
    LOAD_OCL_FUNC( clReleaseKernel, 0 );
91
0
    LOAD_OCL_FUNC( clReleaseMemObject, 0 );
92
0
    LOAD_OCL_FUNC( clReleaseProgram, 0 );
93
0
    LOAD_OCL_FUNC( clSetKernelArg, 0 );
94
0
    return ocl;
95
0
#undef fail
96
0
fail2:
97
0
    ocl_close( ocl->library );
98
0
fail1:
99
0
    x264_free( ocl );
100
0
fail0:
101
0
    return NULL;
102
0
}
103
104
void x264_opencl_close_library( x264_opencl_function_t *ocl )
105
0
{
106
0
    if( !ocl )
107
0
        return;
108
0
    ocl_close( ocl->library );
109
0
    x264_free( ocl );
110
0
}
111
112
/* define from recent cl_ext.h, copied here in case headers are old */
113
0
#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD        0x4042
114
115
/* Requires full include path in case of out-of-tree builds */
116
#include "common/oclobj.h"
117
118
static int detect_switchable_graphics( void );
119
120
/* Try to load the cached compiled program binary, verify the device context is
121
 * still valid before reuse */
122
static cl_program opencl_cache_load( x264_t *h, const char *dev_name, const char *dev_vendor, const char *driver_version )
123
0
{
124
    /* try to load cached program binary */
125
0
    FILE *fp = x264_fopen( h->param.psz_clbin_file, "rb" );
126
0
    if( !fp )
127
0
        return NULL;
128
129
0
    x264_opencl_function_t *ocl = h->opencl.ocl;
130
0
    cl_program program = NULL;
131
0
    uint8_t *binary = NULL;
132
133
0
    fseek( fp, 0, SEEK_END );
134
0
    int64_t file_size = ftell( fp );
135
0
    fseek( fp, 0, SEEK_SET );
136
0
    if( file_size < 0 || (uint64_t)file_size > SIZE_MAX )
137
0
        goto fail;
138
0
    size_t size = file_size;
139
0
    CHECKED_MALLOC( binary, size );
140
141
0
    if( fread( binary, 1, size, fp ) != size )
142
0
        goto fail;
143
0
    const uint8_t *ptr = (const uint8_t*)binary;
144
145
0
#define CHECK_STRING( STR )\
146
0
    do {\
147
0
        size_t len = strlen( STR );\
148
0
        if( size <= len || strncmp( (char*)ptr, STR, len ) )\
149
0
            goto fail;\
150
0
        else {\
151
0
            size -= (len+1); ptr += (len+1);\
152
0
        }\
153
0
    } while( 0 )
154
155
0
    CHECK_STRING( dev_name );
156
0
    CHECK_STRING( dev_vendor );
157
0
    CHECK_STRING( driver_version );
158
0
    CHECK_STRING( x264_opencl_source_hash );
159
0
#undef CHECK_STRING
160
161
0
    cl_int status;
162
0
    program = ocl->clCreateProgramWithBinary( h->opencl.context, 1, &h->opencl.device, &size, &ptr, NULL, &status );
163
0
    if( status != CL_SUCCESS )
164
0
        program = NULL;
165
166
0
fail:
167
0
    fclose( fp );
168
0
    x264_free( binary );
169
0
    return program;
170
0
}
171
172
/* Save the compiled program binary to a file for later reuse.  Device context
173
 * is also saved in the cache file so we do not reuse stale binaries */
174
static void opencl_cache_save( x264_t *h, cl_program program, const char *dev_name, const char *dev_vendor, const char *driver_version )
175
0
{
176
0
    FILE *fp = x264_fopen( h->param.psz_clbin_file, "wb" );
177
0
    if( !fp )
178
0
    {
179
0
        x264_log( h, X264_LOG_INFO, "OpenCL: unable to open clbin file for write\n" );
180
0
        return;
181
0
    }
182
183
0
    x264_opencl_function_t *ocl = h->opencl.ocl;
184
0
    uint8_t *binary = NULL;
185
186
0
    size_t size = 0;
187
0
    cl_int status = ocl->clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL );
188
0
    if( status != CL_SUCCESS || !size )
189
0
    {
190
0
        x264_log( h, X264_LOG_INFO, "OpenCL: Unable to query program binary size, no cache file generated\n" );
191
0
        goto fail;
192
0
    }
193
194
0
    CHECKED_MALLOC( binary, size );
195
0
    status = ocl->clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &binary, NULL );
196
0
    if( status != CL_SUCCESS )
197
0
    {
198
0
        x264_log( h, X264_LOG_INFO, "OpenCL: Unable to query program binary, no cache file generated\n" );
199
0
        goto fail;
200
0
    }
201
202
0
    fputs( dev_name, fp );
203
0
    fputc( '\n', fp );
204
0
    fputs( dev_vendor, fp );
205
0
    fputc( '\n', fp );
206
0
    fputs( driver_version, fp );
207
0
    fputc( '\n', fp );
208
0
    fputs( x264_opencl_source_hash, fp );
209
0
    fputc( '\n', fp );
210
0
    fwrite( binary, 1, size, fp );
211
212
0
fail:
213
0
    fclose( fp );
214
0
    x264_free( binary );
215
0
    return;
216
0
}
217
218
/* The OpenCL source under common/opencl will be merged into common/oclobj.h by
219
 * the Makefile. It defines a x264_opencl_source byte array which we will pass
220
 * to clCreateProgramWithSource().  We also attempt to use a cache file for the
221
 * compiled binary, stored in the current working folder. */
222
static cl_program opencl_compile( x264_t *h )
223
0
{
224
0
    x264_opencl_function_t *ocl = h->opencl.ocl;
225
0
    cl_program program = NULL;
226
0
    char *build_log = NULL;
227
228
0
    char dev_name[64];
229
0
    char dev_vendor[64];
230
0
    char driver_version[64];
231
0
    cl_int status;
232
0
    status  = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME,    sizeof(dev_name), dev_name, NULL );
233
0
    status |= ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_VENDOR,  sizeof(dev_vendor), dev_vendor, NULL );
234
0
    status |= ocl->clGetDeviceInfo( h->opencl.device, CL_DRIVER_VERSION, sizeof(driver_version), driver_version, NULL );
235
0
    if( status != CL_SUCCESS )
236
0
        return NULL;
237
238
    // Most AMD GPUs have vector registers
239
0
    int vectorize = !strcmp( dev_vendor, "Advanced Micro Devices, Inc." );
240
0
    h->opencl.b_device_AMD_SI = 0;
241
242
0
    if( vectorize )
243
0
    {
244
        /* Disable OpenCL on Intel/AMD switchable graphics devices */
245
0
        if( detect_switchable_graphics() )
246
0
        {
247
0
            x264_log( h, X264_LOG_INFO, "OpenCL acceleration disabled, switchable graphics detected\n" );
248
0
            return NULL;
249
0
        }
250
251
        /* Detect AMD SouthernIsland or newer device (single-width registers) */
252
0
        cl_uint simdwidth = 4;
253
0
        status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, sizeof(cl_uint), &simdwidth, NULL );
254
0
        if( status == CL_SUCCESS && simdwidth == 1 )
255
0
        {
256
0
            vectorize = 0;
257
0
            h->opencl.b_device_AMD_SI = 1;
258
0
        }
259
0
    }
260
261
0
    x264_log( h, X264_LOG_INFO, "OpenCL acceleration enabled with %s %s %s\n", dev_vendor, dev_name, h->opencl.b_device_AMD_SI ? "(SI)" : "" );
262
263
0
    program = opencl_cache_load( h, dev_name, dev_vendor, driver_version );
264
0
    if( !program )
265
0
    {
266
        /* clCreateProgramWithSource() requires a pointer variable, you cannot just use &x264_opencl_source */
267
0
        x264_log( h, X264_LOG_INFO, "Compiling OpenCL kernels...\n" );
268
0
        const char *strptr = (const char*)x264_opencl_source;
269
0
        size_t size = sizeof(x264_opencl_source);
270
0
        program = ocl->clCreateProgramWithSource( h->opencl.context, 1, &strptr, &size, &status );
271
0
        if( status != CL_SUCCESS || !program )
272
0
        {
273
0
            x264_log( h, X264_LOG_WARNING, "OpenCL: unable to create program\n" );
274
0
            return NULL;
275
0
        }
276
0
    }
277
278
    /* Build the program binary for the OpenCL device */
279
0
    const char *buildopts = vectorize ? "-DVECTORIZE=1" : "";
280
0
    status = ocl->clBuildProgram( program, 1, &h->opencl.device, buildopts, NULL, NULL );
281
0
    if( status == CL_SUCCESS )
282
0
    {
283
0
        opencl_cache_save( h, program, dev_name, dev_vendor, driver_version );
284
0
        return program;
285
0
    }
286
287
    /* Compile failure, should not happen with production code. */
288
289
0
    size_t build_log_len = 0;
290
0
    status = ocl->clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_len );
291
0
    if( status != CL_SUCCESS || !build_log_len )
292
0
    {
293
0
        x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to query build log\n" );
294
0
        goto fail;
295
0
    }
296
297
0
    build_log = x264_malloc( build_log_len );
298
0
    if( !build_log )
299
0
    {
300
0
        x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to alloc build log\n" );
301
0
        goto fail;
302
0
    }
303
304
0
    status = ocl->clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, build_log_len, build_log, NULL );
305
0
    if( status != CL_SUCCESS )
306
0
    {
307
0
        x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to get build log\n" );
308
0
        goto fail;
309
0
    }
310
311
0
    FILE *log_file = x264_fopen( "x264_kernel_build_log.txt", "w" );
312
0
    if( !log_file )
313
0
    {
314
0
        x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to create file x264_kernel_build_log.txt\n" );
315
0
        goto fail;
316
0
    }
317
0
    fwrite( build_log, 1, build_log_len, log_file );
318
0
    fclose( log_file );
319
0
    x264_log( h, X264_LOG_WARNING, "OpenCL: kernel build errors written to x264_kernel_build_log.txt\n" );
320
321
0
fail:
322
0
    x264_free( build_log );
323
0
    if( program )
324
0
        ocl->clReleaseProgram( program );
325
0
    return NULL;
326
0
}
327
328
static int opencl_lookahead_alloc( x264_t *h )
329
0
{
330
0
    if( !h->param.rc.i_lookahead )
331
0
        return -1;
332
333
0
    static const char *kernelnames[] = {
334
0
        "mb_intra_cost_satd_8x8",
335
0
        "sum_intra_cost",
336
0
        "downscale_hpel",
337
0
        "downscale1",
338
0
        "downscale2",
339
0
        "memset_int16",
340
0
        "weightp_scaled_images",
341
0
        "weightp_hpel",
342
0
        "hierarchical_motion",
343
0
        "subpel_refine",
344
0
        "mode_selection",
345
0
        "sum_inter_cost"
346
0
    };
347
348
0
    cl_kernel *kernels[] = {
349
0
        &h->opencl.intra_kernel,
350
0
        &h->opencl.rowsum_intra_kernel,
351
0
        &h->opencl.downscale_hpel_kernel,
352
0
        &h->opencl.downscale_kernel1,
353
0
        &h->opencl.downscale_kernel2,
354
0
        &h->opencl.memset_kernel,
355
0
        &h->opencl.weightp_scaled_images_kernel,
356
0
        &h->opencl.weightp_hpel_kernel,
357
0
        &h->opencl.hme_kernel,
358
0
        &h->opencl.subpel_refine_kernel,
359
0
        &h->opencl.mode_select_kernel,
360
0
        &h->opencl.rowsum_inter_kernel
361
0
    };
362
363
0
    x264_opencl_function_t *ocl = h->opencl.ocl;
364
0
    cl_int status;
365
366
0
    h->opencl.lookahead_program = opencl_compile( h );
367
0
    if( !h->opencl.lookahead_program )
368
0
        goto fail;
369
370
0
    for( int i = 0; i < ARRAY_ELEMS(kernelnames); i++ )
371
0
    {
372
0
        *kernels[i] = ocl->clCreateKernel( h->opencl.lookahead_program, kernelnames[i], &status );
373
0
        if( status != CL_SUCCESS )
374
0
        {
375
0
            x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to compile kernel '%s' (%d)\n", kernelnames[i], status );
376
0
            goto fail;
377
0
        }
378
0
    }
379
380
0
    h->opencl.page_locked_buffer = ocl->clCreateBuffer( h->opencl.context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, PAGE_LOCKED_BUF_SIZE, NULL, &status );
381
0
    if( status != CL_SUCCESS )
382
0
    {
383
0
        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to allocate page-locked buffer, error '%d'\n", status );
384
0
        goto fail;
385
0
    }
386
0
    h->opencl.page_locked_ptr = ocl->clEnqueueMapBuffer( h->opencl.queue, h->opencl.page_locked_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
387
0
                                                         0, PAGE_LOCKED_BUF_SIZE, 0, NULL, NULL, &status );
388
0
    if( status != CL_SUCCESS )
389
0
    {
390
0
        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to map page-locked buffer, error '%d'\n", status );
391
0
        goto fail;
392
0
    }
393
394
0
    return 0;
395
0
fail:
396
0
    x264_opencl_lookahead_delete( h );
397
0
    return -1;
398
0
}
399
400
static void CL_CALLBACK opencl_error_notify( const char *errinfo, const void *private_info, size_t cb, void *user_data )
401
0
{
402
    /* Any error notification can be assumed to be fatal to the OpenCL context.
403
     * We need to stop using it immediately to prevent further damage. */
404
0
    x264_t *h = (x264_t*)user_data;
405
0
    h->param.b_opencl = 0;
406
0
    h->opencl.b_fatal_error = 1;
407
0
    x264_log( h, X264_LOG_ERROR, "OpenCL: %s\n", errinfo );
408
0
    x264_log( h, X264_LOG_ERROR, "OpenCL: fatal error, aborting encode\n" );
409
0
}
410
411
int x264_opencl_lookahead_init( x264_t *h )
412
0
{
413
0
    x264_opencl_function_t *ocl = h->opencl.ocl;
414
0
    cl_platform_id *platforms = NULL;
415
0
    cl_device_id *devices = NULL;
416
0
    cl_image_format *imageType = NULL;
417
0
    cl_context context = NULL;
418
0
    int ret = -1;
419
420
0
    cl_uint numPlatforms = 0;
421
0
    cl_int status = ocl->clGetPlatformIDs( 0, NULL, &numPlatforms );
422
0
    if( status != CL_SUCCESS || !numPlatforms )
423
0
    {
424
0
        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to query installed platforms\n" );
425
0
        goto fail;
426
0
    }
427
0
    platforms = (cl_platform_id*)x264_malloc( sizeof(cl_platform_id) * numPlatforms );
428
0
    if( !platforms )
429
0
    {
430
0
        x264_log( h, X264_LOG_WARNING, "OpenCL: malloc of installed platforms buffer failed\n" );
431
0
        goto fail;
432
0
    }
433
0
    status = ocl->clGetPlatformIDs( numPlatforms, platforms, NULL );
434
0
    if( status != CL_SUCCESS )
435
0
    {
436
0
        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to query installed platforms\n" );
437
0
        goto fail;
438
0
    }
439
440
    /* Select the first OpenCL platform with a GPU device that supports our
441
     * required image (texture) formats */
442
0
    for( cl_uint i = 0; i < numPlatforms; i++ )
443
0
    {
444
0
        cl_uint gpu_count = 0;
445
0
        status = ocl->clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &gpu_count );
446
0
        if( status != CL_SUCCESS || !gpu_count )
447
0
            continue;
448
449
0
        x264_free( devices );
450
0
        devices = x264_malloc( sizeof(cl_device_id) * gpu_count );
451
0
        if( !devices )
452
0
            continue;
453
454
0
        status = ocl->clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, gpu_count, devices, NULL );
455
0
        if( status != CL_SUCCESS )
456
0
            continue;
457
458
        /* Find a GPU device that supports our image formats */
459
0
        for( cl_uint gpu = 0; gpu < gpu_count; gpu++ )
460
0
        {
461
0
            h->opencl.device = devices[gpu];
462
463
            /* if the user has specified an exact device ID, skip all other
464
             * GPUs.  If this device matches, allow it to continue through the
465
             * checks for supported images, etc.  */
466
0
            if( h->param.opencl_device_id && devices[gpu] != (cl_device_id)h->param.opencl_device_id )
467
0
                continue;
468
469
0
            cl_bool image_support = 0;
470
0
            status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &image_support, NULL );
471
0
            if( status != CL_SUCCESS || !image_support )
472
0
                continue;
473
474
0
            if( context )
475
0
                ocl->clReleaseContext( context );
476
0
            context = ocl->clCreateContext( NULL, 1, &h->opencl.device, (void*)opencl_error_notify, (void*)h, &status );
477
0
            if( status != CL_SUCCESS || !context )
478
0
                continue;
479
480
0
            cl_uint imagecount = 0;
481
0
            status = ocl->clGetSupportedImageFormats( context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, 0, NULL, &imagecount );
482
0
            if( status != CL_SUCCESS || !imagecount )
483
0
                continue;
484
485
0
            x264_free( imageType );
486
0
            imageType = x264_malloc( sizeof(cl_image_format) * imagecount );
487
0
            if( !imageType )
488
0
                continue;
489
490
0
            status = ocl->clGetSupportedImageFormats( context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, imagecount, imageType, NULL );
491
0
            if( status != CL_SUCCESS )
492
0
                continue;
493
494
0
            int b_has_r = 0;
495
0
            int b_has_rgba = 0;
496
0
            for( cl_uint j = 0; j < imagecount; j++ )
497
0
            {
498
0
                if( imageType[j].image_channel_order == CL_R &&
499
0
                    imageType[j].image_channel_data_type == CL_UNSIGNED_INT32 )
500
0
                    b_has_r = 1;
501
0
                else if( imageType[j].image_channel_order == CL_RGBA &&
502
0
                         imageType[j].image_channel_data_type == CL_UNSIGNED_INT8 )
503
0
                    b_has_rgba = 1;
504
0
            }
505
0
            if( !b_has_r || !b_has_rgba )
506
0
            {
507
0
                char dev_name[64];
508
0
                status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME, sizeof(dev_name), dev_name, NULL );
509
0
                if( status == CL_SUCCESS )
510
0
                {
511
                    /* emit warning if we are discarding the user's explicit choice */
512
0
                    int level = h->param.opencl_device_id ? X264_LOG_WARNING : X264_LOG_DEBUG;
513
0
                    x264_log( h, level, "OpenCL: %s does not support required image formats\n", dev_name );
514
0
                }
515
0
                continue;
516
0
            }
517
518
            /* user selection of GPU device, skip N first matches */
519
0
            if( h->param.i_opencl_device )
520
0
            {
521
0
                h->param.i_opencl_device--;
522
0
                continue;
523
0
            }
524
525
0
            h->opencl.queue = ocl->clCreateCommandQueue( context, h->opencl.device, 0, &status );
526
0
            if( status != CL_SUCCESS || !h->opencl.queue )
527
0
                continue;
528
529
0
            h->opencl.context = context;
530
0
            context = NULL;
531
532
0
            ret = 0;
533
0
            break;
534
0
        }
535
536
0
        if( !ret )
537
0
            break;
538
0
    }
539
540
0
    if( !h->param.psz_clbin_file )
541
0
        h->param.psz_clbin_file = "x264_lookahead.clbin";
542
543
0
    if( ret )
544
0
        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to find a compatible device\n" );
545
0
    else
546
0
        ret = opencl_lookahead_alloc( h );
547
548
0
fail:
549
0
    if( context )
550
0
        ocl->clReleaseContext( context );
551
0
    x264_free( imageType );
552
0
    x264_free( devices );
553
0
    x264_free( platforms );
554
0
    return ret;
555
0
}
556
557
static void opencl_lookahead_free( x264_t *h )
558
0
{
559
0
    x264_opencl_function_t *ocl = h->opencl.ocl;
560
561
0
#define RELEASE( a, f ) do { if( a ) { ocl->f( a ); a = NULL; } } while( 0 )
562
0
    RELEASE( h->opencl.downscale_hpel_kernel, clReleaseKernel );
563
0
    RELEASE( h->opencl.downscale_kernel1, clReleaseKernel );
564
0
    RELEASE( h->opencl.downscale_kernel2, clReleaseKernel );
565
0
    RELEASE( h->opencl.weightp_hpel_kernel, clReleaseKernel );
566
0
    RELEASE( h->opencl.weightp_scaled_images_kernel, clReleaseKernel );
567
0
    RELEASE( h->opencl.memset_kernel, clReleaseKernel );
568
0
    RELEASE( h->opencl.intra_kernel, clReleaseKernel );
569
0
    RELEASE( h->opencl.rowsum_intra_kernel, clReleaseKernel );
570
0
    RELEASE( h->opencl.hme_kernel, clReleaseKernel );
571
0
    RELEASE( h->opencl.subpel_refine_kernel, clReleaseKernel );
572
0
    RELEASE( h->opencl.mode_select_kernel, clReleaseKernel );
573
0
    RELEASE( h->opencl.rowsum_inter_kernel, clReleaseKernel );
574
575
0
    RELEASE( h->opencl.lookahead_program, clReleaseProgram );
576
577
0
    RELEASE( h->opencl.page_locked_buffer, clReleaseMemObject );
578
0
    RELEASE( h->opencl.luma_16x16_image[0], clReleaseMemObject );
579
0
    RELEASE( h->opencl.luma_16x16_image[1], clReleaseMemObject );
580
0
    for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
581
0
        RELEASE( h->opencl.weighted_scaled_images[i], clReleaseMemObject );
582
0
    RELEASE( h->opencl.weighted_luma_hpel, clReleaseMemObject );
583
0
    RELEASE( h->opencl.row_satds[0], clReleaseMemObject );
584
0
    RELEASE( h->opencl.row_satds[1], clReleaseMemObject );
585
0
    RELEASE( h->opencl.mv_buffers[0], clReleaseMemObject );
586
0
    RELEASE( h->opencl.mv_buffers[1], clReleaseMemObject );
587
0
    RELEASE( h->opencl.lowres_mv_costs, clReleaseMemObject );
588
0
    RELEASE( h->opencl.mvp_buffer, clReleaseMemObject );
589
0
    RELEASE( h->opencl.lowres_costs[0], clReleaseMemObject );
590
0
    RELEASE( h->opencl.lowres_costs[1], clReleaseMemObject );
591
0
    RELEASE( h->opencl.frame_stats[0], clReleaseMemObject );
592
0
    RELEASE( h->opencl.frame_stats[1], clReleaseMemObject );
593
0
#undef RELEASE
594
0
}
595
596
void x264_opencl_lookahead_delete( x264_t *h )
597
0
{
598
0
    x264_opencl_function_t *ocl = h->opencl.ocl;
599
600
0
    if( !ocl )
601
0
        return;
602
603
0
    if( h->opencl.queue )
604
0
        ocl->clFinish( h->opencl.queue );
605
606
0
    opencl_lookahead_free( h );
607
608
0
    if( h->opencl.queue )
609
0
    {
610
0
        ocl->clReleaseCommandQueue( h->opencl.queue );
611
0
        h->opencl.queue = NULL;
612
0
    }
613
0
    if( h->opencl.context )
614
0
    {
615
0
        ocl->clReleaseContext( h->opencl.context );
616
0
        h->opencl.context = NULL;
617
0
    }
618
0
}
619
620
void x264_opencl_frame_delete( x264_frame_t *frame )
621
0
{
622
0
    x264_opencl_function_t *ocl = frame->opencl.ocl;
623
624
0
    if( !ocl )
625
0
        return;
626
627
0
#define RELEASEBUF(mem) do { if( mem ) { ocl->clReleaseMemObject( mem ); mem = NULL; } } while( 0 )
628
0
    for( int j = 0; j < NUM_IMAGE_SCALES; j++ )
629
0
        RELEASEBUF( frame->opencl.scaled_image2Ds[j] );
630
0
    RELEASEBUF( frame->opencl.luma_hpel );
631
0
    RELEASEBUF( frame->opencl.inv_qscale_factor );
632
0
    RELEASEBUF( frame->opencl.intra_cost );
633
0
    RELEASEBUF( frame->opencl.lowres_mvs0 );
634
0
    RELEASEBUF( frame->opencl.lowres_mvs1 );
635
0
    RELEASEBUF( frame->opencl.lowres_mv_costs0 );
636
0
    RELEASEBUF( frame->opencl.lowres_mv_costs1 );
637
0
#undef RELEASEBUF
638
0
}
639
640
/* OpenCL misbehaves on hybrid laptops with Intel iGPU and AMD dGPU, so
641
 * we consult AMD's ADL interface to detect this situation and disable
642
 * OpenCL on these machines (Linux and Windows) */
643
#ifdef _WIN32
644
#define ADL_API_CALL
645
#define ADL_CALLBACK __stdcall
646
#define adl_close FreeLibrary
647
#define adl_address GetProcAddress
648
#else
649
#define ADL_API_CALL
650
#define ADL_CALLBACK
651
0
#define adl_close dlclose
652
0
#define adl_address dlsym
653
#endif
654
655
typedef void* ( ADL_CALLBACK *ADL_MAIN_MALLOC_CALLBACK )( int );
656
typedef int   ( ADL_API_CALL *ADL_MAIN_CONTROL_CREATE )( ADL_MAIN_MALLOC_CALLBACK, int );
657
typedef int   ( ADL_API_CALL *ADL_ADAPTER_NUMBEROFADAPTERS_GET )( int * );
658
typedef int   ( ADL_API_CALL *ADL_POWERXPRESS_SCHEME_GET )( int, int *, int *, int * );
659
typedef int   ( ADL_API_CALL *ADL_MAIN_CONTROL_DESTROY )( void );
660
661
0
#define ADL_OK 0
662
0
#define ADL_PX_SCHEME_DYNAMIC 2
663
664
static void* ADL_CALLBACK adl_malloc_wrapper( int iSize )
665
0
{
666
0
    return x264_malloc( iSize );
667
0
}
668
669
static int detect_switchable_graphics( void )
670
0
{
671
0
    void *hDLL;
672
0
    ADL_MAIN_CONTROL_CREATE          ADL_Main_Control_Create;
673
0
    ADL_ADAPTER_NUMBEROFADAPTERS_GET ADL_Adapter_NumberOfAdapters_Get;
674
0
    ADL_POWERXPRESS_SCHEME_GET       ADL_PowerXpress_Scheme_Get;
675
0
    ADL_MAIN_CONTROL_DESTROY         ADL_Main_Control_Destroy;
676
0
    int ret = 0;
677
678
#ifdef _WIN32
679
    hDLL = LoadLibraryW( L"atiadlxx.dll" );
680
    if( !hDLL )
681
        hDLL = LoadLibraryW( L"atiadlxy.dll" );
682
#else
683
0
    hDLL = dlopen( "libatiadlxx.so", RTLD_LAZY|RTLD_GLOBAL );
684
0
#endif
685
0
    if( !hDLL )
686
0
        goto fail0;
687
688
0
    ADL_Main_Control_Create          = (ADL_MAIN_CONTROL_CREATE)adl_address(hDLL, "ADL_Main_Control_Create");
689
0
    ADL_Main_Control_Destroy         = (ADL_MAIN_CONTROL_DESTROY)adl_address(hDLL, "ADL_Main_Control_Destroy");
690
0
    ADL_Adapter_NumberOfAdapters_Get = (ADL_ADAPTER_NUMBEROFADAPTERS_GET)adl_address(hDLL, "ADL_Adapter_NumberOfAdapters_Get");
691
0
    ADL_PowerXpress_Scheme_Get       = (ADL_POWERXPRESS_SCHEME_GET)adl_address(hDLL, "ADL_PowerXpress_Scheme_Get");
692
0
    if( !ADL_Main_Control_Create || !ADL_Main_Control_Destroy || !ADL_Adapter_NumberOfAdapters_Get ||
693
0
        !ADL_PowerXpress_Scheme_Get )
694
0
        goto fail1;
695
696
0
    if( ADL_OK != ADL_Main_Control_Create( adl_malloc_wrapper, 1 ) )
697
0
        goto fail1;
698
699
0
    int numAdapters = 0;
700
0
    if( ADL_OK != ADL_Adapter_NumberOfAdapters_Get( &numAdapters ) )
701
0
        goto fail2;
702
703
0
    for( int i = 0; i < numAdapters; i++ )
704
0
    {
705
0
        int PXSchemeRange, PXSchemeCurrentState, PXSchemeDefaultState;
706
0
        if( ADL_OK != ADL_PowerXpress_Scheme_Get( i, &PXSchemeRange, &PXSchemeCurrentState, &PXSchemeDefaultState) )
707
0
            break;
708
709
0
        if( PXSchemeRange >= ADL_PX_SCHEME_DYNAMIC )
710
0
        {
711
0
            ret = 1;
712
0
            break;
713
0
        }
714
0
    }
715
716
0
fail2:
717
0
    ADL_Main_Control_Destroy();
718
0
fail1:
719
0
    adl_close( hDLL );
720
0
fail0:
721
0
    return ret;
722
0
}