1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Third party copyrights are property of their respective owners.
15 //
16 // Redistribution and use in source and binary forms, with or without modification,
17 // are permitted provided that the following conditions are met:
18 //
19 // * Redistribution's of source code must retain the above copyright notice,
20 // this list of conditions and the following disclaimer.
21 //
22 // * Redistribution's in binary form must reproduce the above copyright notice,
23 // this list of conditions and the following disclaimer in the documentation
24 // and/or other materials provided with the distribution.
25 //
26 // * The name of the copyright holders may not be used to endorse or promote products
27 // derived from this software without specific prior written permission.
28 //
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the OpenCV Foundation or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
39 //
40 //M*/
41
42 #include "precomp.hpp"
43 #include <list>
44 #include <map>
45 #include <string>
46 #include <sstream>
47 #include <iostream> // std::cerr
48
49 #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG 0
50 #define CV_OPENCL_SHOW_RUN_ERRORS 0
51 #define CV_OPENCL_SHOW_SVM_ERROR_LOG 1
52 #define CV_OPENCL_SHOW_SVM_LOG 0
53
54 #include "opencv2/core/bufferpool.hpp"
55 #ifndef LOG_BUFFER_POOL
56 # if 0
57 # define LOG_BUFFER_POOL printf
58 # else
59 # define LOG_BUFFER_POOL(...)
60 # endif
61 #endif
62
63
64 // TODO Move to some common place
65 static bool getBoolParameter(const char* name, bool defaultValue)
66 {
67 /*
68 * If your system doesn't support getenv(), define NO_GETENV to disable
69 * this feature.
70 */
71 #ifdef NO_GETENV
72 const char* envValue = NULL;
73 #else
74 const char* envValue = getenv(name);
75 #endif
76 if (envValue == NULL)
77 {
78 return defaultValue;
79 }
80 cv::String value = envValue;
81 if (value == "1" || value == "True" || value == "true" || value == "TRUE")
82 {
83 return true;
84 }
85 if (value == "0" || value == "False" || value == "false" || value == "FALSE")
86 {
87 return false;
88 }
89 CV_ErrorNoReturn(cv::Error::StsBadArg, cv::format("Invalid value for %s parameter: %s", name, value.c_str()));
90 }
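// Illustrative sketch (not part of the original source): how a boolean
// environment switch is typically consumed by this helper. The variable
// name below is the one used later in this file for CV_OclDbgAssert.
#if 0
    // e.g. launched with: OPENCV_OPENCL_RAISE_ERROR=true ./my_app
    bool raiseErrors = getBoolParameter("OPENCV_OPENCL_RAISE_ERROR", false);
    // "1"/"true"/"True"/"TRUE" -> true, "0"/"false"/"False"/"FALSE" -> false,
    // any other value triggers CV_ErrorNoReturn(StsBadArg, ...).
#endif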
91
92
93 // TODO Move to some common place
94 static size_t getConfigurationParameterForSize(const char* name, size_t defaultValue)
95 {
96 #ifdef NO_GETENV
97 const char* envValue = NULL;
98 #else
99 const char* envValue = getenv(name);
100 #endif
101 if (envValue == NULL)
102 {
103 return defaultValue;
104 }
105 cv::String value = envValue;
106 size_t pos = 0;
107 for (; pos < value.size(); pos++)
108 {
109 if (!isdigit(value[pos]))
110 break;
111 }
112 cv::String valueStr = value.substr(0, pos);
113 cv::String suffixStr = value.substr(pos, value.length() - pos);
114 int v = atoi(valueStr.c_str());
115 if (suffixStr.length() == 0)
116 return v;
117 else if (suffixStr == "MB" || suffixStr == "Mb" || suffixStr == "mb")
118 return v * 1024 * 1024;
119 else if (suffixStr == "KB" || suffixStr == "Kb" || suffixStr == "kb")
120 return v * 1024;
121 CV_ErrorNoReturn(cv::Error::StsBadArg, cv::format("Invalid value for %s parameter: %s", name, value.c_str()));
122 }
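// Illustrative sketch (not part of the original source): size parameters
// accept an optional KB/MB suffix, so "32MB" and "32768KB" parse to the
// same byte count. The parameter name below is hypothetical.
#if 0
    size_t poolSize = getConfigurationParameterForSize("OPENCV_EXAMPLE_POOL_SIZE", 0);
    // "32"   -> 32 bytes
    // "32KB" -> 32 * 1024
    // "32MB" -> 32 * 1024 * 1024; any other suffix raises StsBadArg.
#endif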
123
124 #if CV_OPENCL_SHOW_SVM_LOG
125 // TODO add timestamp logging
126 #define CV_OPENCL_SVM_TRACE_P printf("line %d (ocl.cpp): ", __LINE__); printf
127 #else
128 #define CV_OPENCL_SVM_TRACE_P(...)
129 #endif
130
131 #if CV_OPENCL_SHOW_SVM_ERROR_LOG
132 // TODO add timestamp logging
133 #define CV_OPENCL_SVM_TRACE_ERROR_P printf("Error on line %d (ocl.cpp): ", __LINE__); printf
134 #else
135 #define CV_OPENCL_SVM_TRACE_ERROR_P(...)
136 #endif
137
138 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
139 #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
140
141 #ifdef HAVE_OPENCL
142 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
143 #else
144 // TODO FIXIT: This file can't be built without OpenCL
145
146 /*
147 Part of the file is an extract from the standard OpenCL headers from Khronos site.
148 Below is the original copyright.
149 */
150
151 /*******************************************************************************
152 * Copyright (c) 2008 - 2012 The Khronos Group Inc.
153 *
154 * Permission is hereby granted, free of charge, to any person obtaining a
155 * copy of this software and/or associated documentation files (the
156 * "Materials"), to deal in the Materials without restriction, including
157 * without limitation the rights to use, copy, modify, merge, publish,
158 * distribute, sublicense, and/or sell copies of the Materials, and to
159 * permit persons to whom the Materials are furnished to do so, subject to
160 * the following conditions:
161 *
162 * The above copyright notice and this permission notice shall be included
163 * in all copies or substantial portions of the Materials.
164 *
165 * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
166 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
167 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
168 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
169 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
170 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
171 * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
172 ******************************************************************************/
173
174 #if 0 //defined __APPLE__
175 #define HAVE_OPENCL 1
176 #else
177 #undef HAVE_OPENCL
178 #endif
179
180 #define OPENCV_CL_NOT_IMPLEMENTED -1000
181
182 #ifdef HAVE_OPENCL
183
184 #if defined __APPLE__
185 #include <OpenCL/opencl.h>
186 #else
187 #include <CL/opencl.h>
188 #endif
189
190 static const bool g_haveOpenCL = true;
191
192 #else
193
194 extern "C" {
195
196 struct _cl_platform_id { int dummy; };
197 struct _cl_device_id { int dummy; };
198 struct _cl_context { int dummy; };
199 struct _cl_command_queue { int dummy; };
200 struct _cl_mem { int dummy; };
201 struct _cl_program { int dummy; };
202 struct _cl_kernel { int dummy; };
203 struct _cl_event { int dummy; };
204 struct _cl_sampler { int dummy; };
205
206 typedef struct _cl_platform_id * cl_platform_id;
207 typedef struct _cl_device_id * cl_device_id;
208 typedef struct _cl_context * cl_context;
209 typedef struct _cl_command_queue * cl_command_queue;
210 typedef struct _cl_mem * cl_mem;
211 typedef struct _cl_program * cl_program;
212 typedef struct _cl_kernel * cl_kernel;
213 typedef struct _cl_event * cl_event;
214 typedef struct _cl_sampler * cl_sampler;
215
216 typedef int cl_int;
217 typedef unsigned cl_uint;
218 #if defined (_WIN32) && defined(_MSC_VER)
219 typedef __int64 cl_long;
220 typedef unsigned __int64 cl_ulong;
221 #else
222 typedef long cl_long;
223 typedef unsigned long cl_ulong;
224 #endif
225
226 typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */
227 typedef cl_ulong cl_bitfield;
228 typedef cl_bitfield cl_device_type;
229 typedef cl_uint cl_platform_info;
230 typedef cl_uint cl_device_info;
231 typedef cl_bitfield cl_device_fp_config;
232 typedef cl_uint cl_device_mem_cache_type;
233 typedef cl_uint cl_device_local_mem_type;
234 typedef cl_bitfield cl_device_exec_capabilities;
235 typedef cl_bitfield cl_command_queue_properties;
236 typedef intptr_t cl_device_partition_property;
237 typedef cl_bitfield cl_device_affinity_domain;
238
239 typedef intptr_t cl_context_properties;
240 typedef cl_uint cl_context_info;
241 typedef cl_uint cl_command_queue_info;
242 typedef cl_uint cl_channel_order;
243 typedef cl_uint cl_channel_type;
244 typedef cl_bitfield cl_mem_flags;
245 typedef cl_uint cl_mem_object_type;
246 typedef cl_uint cl_mem_info;
247 typedef cl_bitfield cl_mem_migration_flags;
248 typedef cl_uint cl_image_info;
249 typedef cl_uint cl_buffer_create_type;
250 typedef cl_uint cl_addressing_mode;
251 typedef cl_uint cl_filter_mode;
252 typedef cl_uint cl_sampler_info;
253 typedef cl_bitfield cl_map_flags;
254 typedef cl_uint cl_program_info;
255 typedef cl_uint cl_program_build_info;
256 typedef cl_uint cl_program_binary_type;
257 typedef cl_int cl_build_status;
258 typedef cl_uint cl_kernel_info;
259 typedef cl_uint cl_kernel_arg_info;
260 typedef cl_uint cl_kernel_arg_address_qualifier;
261 typedef cl_uint cl_kernel_arg_access_qualifier;
262 typedef cl_bitfield cl_kernel_arg_type_qualifier;
263 typedef cl_uint cl_kernel_work_group_info;
264 typedef cl_uint cl_event_info;
265 typedef cl_uint cl_command_type;
266 typedef cl_uint cl_profiling_info;
267
268
269 typedef struct _cl_image_format {
270 cl_channel_order image_channel_order;
271 cl_channel_type image_channel_data_type;
272 } cl_image_format;
273
274 typedef struct _cl_image_desc {
275 cl_mem_object_type image_type;
276 size_t image_width;
277 size_t image_height;
278 size_t image_depth;
279 size_t image_array_size;
280 size_t image_row_pitch;
281 size_t image_slice_pitch;
282 cl_uint num_mip_levels;
283 cl_uint num_samples;
284 cl_mem buffer;
285 } cl_image_desc;
286
287 typedef struct _cl_buffer_region {
288 size_t origin;
289 size_t size;
290 } cl_buffer_region;
291
292
293 //////////////////////////////////////////////////////////
294
295 #define CL_SUCCESS 0
296 #define CL_DEVICE_NOT_FOUND -1
297 #define CL_DEVICE_NOT_AVAILABLE -2
298 #define CL_COMPILER_NOT_AVAILABLE -3
299 #define CL_MEM_OBJECT_ALLOCATION_FAILURE -4
300 #define CL_OUT_OF_RESOURCES -5
301 #define CL_OUT_OF_HOST_MEMORY -6
302 #define CL_PROFILING_INFO_NOT_AVAILABLE -7
303 #define CL_MEM_COPY_OVERLAP -8
304 #define CL_IMAGE_FORMAT_MISMATCH -9
305 #define CL_IMAGE_FORMAT_NOT_SUPPORTED -10
306 #define CL_BUILD_PROGRAM_FAILURE -11
307 #define CL_MAP_FAILURE -12
308 #define CL_MISALIGNED_SUB_BUFFER_OFFSET -13
309 #define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
310 #define CL_COMPILE_PROGRAM_FAILURE -15
311 #define CL_LINKER_NOT_AVAILABLE -16
312 #define CL_LINK_PROGRAM_FAILURE -17
313 #define CL_DEVICE_PARTITION_FAILED -18
314 #define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19
315
316 #define CL_INVALID_VALUE -30
317 #define CL_INVALID_DEVICE_TYPE -31
318 #define CL_INVALID_PLATFORM -32
319 #define CL_INVALID_DEVICE -33
320 #define CL_INVALID_CONTEXT -34
321 #define CL_INVALID_QUEUE_PROPERTIES -35
322 #define CL_INVALID_COMMAND_QUEUE -36
323 #define CL_INVALID_HOST_PTR -37
324 #define CL_INVALID_MEM_OBJECT -38
325 #define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39
326 #define CL_INVALID_IMAGE_SIZE -40
327 #define CL_INVALID_SAMPLER -41
328 #define CL_INVALID_BINARY -42
329 #define CL_INVALID_BUILD_OPTIONS -43
330 #define CL_INVALID_PROGRAM -44
331 #define CL_INVALID_PROGRAM_EXECUTABLE -45
332 #define CL_INVALID_KERNEL_NAME -46
333 #define CL_INVALID_KERNEL_DEFINITION -47
334 #define CL_INVALID_KERNEL -48
335 #define CL_INVALID_ARG_INDEX -49
336 #define CL_INVALID_ARG_VALUE -50
337 #define CL_INVALID_ARG_SIZE -51
338 #define CL_INVALID_KERNEL_ARGS -52
339 #define CL_INVALID_WORK_DIMENSION -53
340 #define CL_INVALID_WORK_GROUP_SIZE -54
341 #define CL_INVALID_WORK_ITEM_SIZE -55
342 #define CL_INVALID_GLOBAL_OFFSET -56
343 #define CL_INVALID_EVENT_WAIT_LIST -57
344 #define CL_INVALID_EVENT -58
345 #define CL_INVALID_OPERATION -59
346 #define CL_INVALID_GL_OBJECT -60
347 #define CL_INVALID_BUFFER_SIZE -61
348 #define CL_INVALID_MIP_LEVEL -62
349 #define CL_INVALID_GLOBAL_WORK_SIZE -63
350 #define CL_INVALID_PROPERTY -64
351 #define CL_INVALID_IMAGE_DESCRIPTOR -65
352 #define CL_INVALID_COMPILER_OPTIONS -66
353 #define CL_INVALID_LINKER_OPTIONS -67
354 #define CL_INVALID_DEVICE_PARTITION_COUNT -68
355
356 /*#define CL_VERSION_1_0 1
357 #define CL_VERSION_1_1 1
358 #define CL_VERSION_1_2 1*/
359
360 #define CL_FALSE 0
361 #define CL_TRUE 1
362 #define CL_BLOCKING CL_TRUE
363 #define CL_NON_BLOCKING CL_FALSE
364
365 #define CL_PLATFORM_PROFILE 0x0900
366 #define CL_PLATFORM_VERSION 0x0901
367 #define CL_PLATFORM_NAME 0x0902
368 #define CL_PLATFORM_VENDOR 0x0903
369 #define CL_PLATFORM_EXTENSIONS 0x0904
370
371 #define CL_DEVICE_TYPE_DEFAULT (1 << 0)
372 #define CL_DEVICE_TYPE_CPU (1 << 1)
373 #define CL_DEVICE_TYPE_GPU (1 << 2)
374 #define CL_DEVICE_TYPE_ACCELERATOR (1 << 3)
375 #define CL_DEVICE_TYPE_CUSTOM (1 << 4)
376 #define CL_DEVICE_TYPE_ALL 0xFFFFFFFF
377 #define CL_DEVICE_TYPE 0x1000
378 #define CL_DEVICE_VENDOR_ID 0x1001
379 #define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002
380 #define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003
381 #define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004
382 #define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005
383 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006
384 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007
385 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008
386 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009
387 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A
388 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B
389 #define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C
390 #define CL_DEVICE_ADDRESS_BITS 0x100D
391 #define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E
392 #define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F
393 #define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010
394 #define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011
395 #define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012
396 #define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013
397 #define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014
398 #define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015
399 #define CL_DEVICE_IMAGE_SUPPORT 0x1016
400 #define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017
401 #define CL_DEVICE_MAX_SAMPLERS 0x1018
402 #define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019
403 #define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A
404 #define CL_DEVICE_SINGLE_FP_CONFIG 0x101B
405 #define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C
406 #define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D
407 #define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E
408 #define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F
409 #define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020
410 #define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021
411 #define CL_DEVICE_LOCAL_MEM_TYPE 0x1022
412 #define CL_DEVICE_LOCAL_MEM_SIZE 0x1023
413 #define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024
414 #define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025
415 #define CL_DEVICE_ENDIAN_LITTLE 0x1026
416 #define CL_DEVICE_AVAILABLE 0x1027
417 #define CL_DEVICE_COMPILER_AVAILABLE 0x1028
418 #define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029
419 #define CL_DEVICE_QUEUE_PROPERTIES 0x102A
420 #define CL_DEVICE_NAME 0x102B
421 #define CL_DEVICE_VENDOR 0x102C
422 #define CL_DRIVER_VERSION 0x102D
423 #define CL_DEVICE_PROFILE 0x102E
424 #define CL_DEVICE_VERSION 0x102F
425 #define CL_DEVICE_EXTENSIONS 0x1030
426 #define CL_DEVICE_PLATFORM 0x1031
427 #define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
428 #define CL_DEVICE_HALF_FP_CONFIG 0x1033
429 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034
430 #define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035
431 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036
432 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037
433 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038
434 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039
435 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A
436 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B
437 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C
438 #define CL_DEVICE_OPENCL_C_VERSION 0x103D
439 #define CL_DEVICE_LINKER_AVAILABLE 0x103E
440 #define CL_DEVICE_BUILT_IN_KERNELS 0x103F
441 #define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040
442 #define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041
443 #define CL_DEVICE_PARENT_DEVICE 0x1042
444 #define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043
445 #define CL_DEVICE_PARTITION_PROPERTIES 0x1044
446 #define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045
447 #define CL_DEVICE_PARTITION_TYPE 0x1046
448 #define CL_DEVICE_REFERENCE_COUNT 0x1047
449 #define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048
450 #define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049
451 #define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A
452 #define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B
453
454 #define CL_FP_DENORM (1 << 0)
455 #define CL_FP_INF_NAN (1 << 1)
456 #define CL_FP_ROUND_TO_NEAREST (1 << 2)
457 #define CL_FP_ROUND_TO_ZERO (1 << 3)
458 #define CL_FP_ROUND_TO_INF (1 << 4)
459 #define CL_FP_FMA (1 << 5)
460 #define CL_FP_SOFT_FLOAT (1 << 6)
461 #define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7)
462
463 #define CL_NONE 0x0
464 #define CL_READ_ONLY_CACHE 0x1
465 #define CL_READ_WRITE_CACHE 0x2
466 #define CL_LOCAL 0x1
467 #define CL_GLOBAL 0x2
468 #define CL_EXEC_KERNEL (1 << 0)
469 #define CL_EXEC_NATIVE_KERNEL (1 << 1)
470 #define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0)
471 #define CL_QUEUE_PROFILING_ENABLE (1 << 1)
472
473 #define CL_CONTEXT_REFERENCE_COUNT 0x1080
474 #define CL_CONTEXT_DEVICES 0x1081
475 #define CL_CONTEXT_PROPERTIES 0x1082
476 #define CL_CONTEXT_NUM_DEVICES 0x1083
477 #define CL_CONTEXT_PLATFORM 0x1084
478 #define CL_CONTEXT_INTEROP_USER_SYNC 0x1085
479
480 #define CL_DEVICE_PARTITION_EQUALLY 0x1086
481 #define CL_DEVICE_PARTITION_BY_COUNTS 0x1087
482 #define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0
483 #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088
484 #define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0)
485 #define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1)
486 #define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2)
487 #define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3)
488 #define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4)
489 #define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5)
490 #define CL_QUEUE_CONTEXT 0x1090
491 #define CL_QUEUE_DEVICE 0x1091
492 #define CL_QUEUE_REFERENCE_COUNT 0x1092
493 #define CL_QUEUE_PROPERTIES 0x1093
494 #define CL_MEM_READ_WRITE (1 << 0)
495 #define CL_MEM_WRITE_ONLY (1 << 1)
496 #define CL_MEM_READ_ONLY (1 << 2)
497 #define CL_MEM_USE_HOST_PTR (1 << 3)
498 #define CL_MEM_ALLOC_HOST_PTR (1 << 4)
499 #define CL_MEM_COPY_HOST_PTR (1 << 5)
500 // reserved (1 << 6)
501 #define CL_MEM_HOST_WRITE_ONLY (1 << 7)
502 #define CL_MEM_HOST_READ_ONLY (1 << 8)
503 #define CL_MEM_HOST_NO_ACCESS (1 << 9)
504 #define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0)
505 #define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1)
506
507 #define CL_R 0x10B0
508 #define CL_A 0x10B1
509 #define CL_RG 0x10B2
510 #define CL_RA 0x10B3
511 #define CL_RGB 0x10B4
512 #define CL_RGBA 0x10B5
513 #define CL_BGRA 0x10B6
514 #define CL_ARGB 0x10B7
515 #define CL_INTENSITY 0x10B8
516 #define CL_LUMINANCE 0x10B9
517 #define CL_Rx 0x10BA
518 #define CL_RGx 0x10BB
519 #define CL_RGBx 0x10BC
520 #define CL_DEPTH 0x10BD
521 #define CL_DEPTH_STENCIL 0x10BE
522
523 #define CL_SNORM_INT8 0x10D0
524 #define CL_SNORM_INT16 0x10D1
525 #define CL_UNORM_INT8 0x10D2
526 #define CL_UNORM_INT16 0x10D3
527 #define CL_UNORM_SHORT_565 0x10D4
528 #define CL_UNORM_SHORT_555 0x10D5
529 #define CL_UNORM_INT_101010 0x10D6
530 #define CL_SIGNED_INT8 0x10D7
531 #define CL_SIGNED_INT16 0x10D8
532 #define CL_SIGNED_INT32 0x10D9
533 #define CL_UNSIGNED_INT8 0x10DA
534 #define CL_UNSIGNED_INT16 0x10DB
535 #define CL_UNSIGNED_INT32 0x10DC
536 #define CL_HALF_FLOAT 0x10DD
537 #define CL_FLOAT 0x10DE
538 #define CL_UNORM_INT24 0x10DF
539
540 #define CL_MEM_OBJECT_BUFFER 0x10F0
541 #define CL_MEM_OBJECT_IMAGE2D 0x10F1
542 #define CL_MEM_OBJECT_IMAGE3D 0x10F2
543 #define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3
544 #define CL_MEM_OBJECT_IMAGE1D 0x10F4
545 #define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5
546 #define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6
547
548 #define CL_MEM_TYPE 0x1100
549 #define CL_MEM_FLAGS 0x1101
550 #define CL_MEM_SIZE 0x1102
551 #define CL_MEM_HOST_PTR 0x1103
552 #define CL_MEM_MAP_COUNT 0x1104
553 #define CL_MEM_REFERENCE_COUNT 0x1105
554 #define CL_MEM_CONTEXT 0x1106
555 #define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107
556 #define CL_MEM_OFFSET 0x1108
557
558 #define CL_IMAGE_FORMAT 0x1110
559 #define CL_IMAGE_ELEMENT_SIZE 0x1111
560 #define CL_IMAGE_ROW_PITCH 0x1112
561 #define CL_IMAGE_SLICE_PITCH 0x1113
562 #define CL_IMAGE_WIDTH 0x1114
563 #define CL_IMAGE_HEIGHT 0x1115
564 #define CL_IMAGE_DEPTH 0x1116
565 #define CL_IMAGE_ARRAY_SIZE 0x1117
566 #define CL_IMAGE_BUFFER 0x1118
567 #define CL_IMAGE_NUM_MIP_LEVELS 0x1119
568 #define CL_IMAGE_NUM_SAMPLES 0x111A
569
570 #define CL_ADDRESS_NONE 0x1130
571 #define CL_ADDRESS_CLAMP_TO_EDGE 0x1131
572 #define CL_ADDRESS_CLAMP 0x1132
573 #define CL_ADDRESS_REPEAT 0x1133
574 #define CL_ADDRESS_MIRRORED_REPEAT 0x1134
575
576 #define CL_FILTER_NEAREST 0x1140
577 #define CL_FILTER_LINEAR 0x1141
578
579 #define CL_SAMPLER_REFERENCE_COUNT 0x1150
580 #define CL_SAMPLER_CONTEXT 0x1151
581 #define CL_SAMPLER_NORMALIZED_COORDS 0x1152
582 #define CL_SAMPLER_ADDRESSING_MODE 0x1153
583 #define CL_SAMPLER_FILTER_MODE 0x1154
584
585 #define CL_MAP_READ (1 << 0)
586 #define CL_MAP_WRITE (1 << 1)
587 #define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2)
588
589 #define CL_PROGRAM_REFERENCE_COUNT 0x1160
590 #define CL_PROGRAM_CONTEXT 0x1161
591 #define CL_PROGRAM_NUM_DEVICES 0x1162
592 #define CL_PROGRAM_DEVICES 0x1163
593 #define CL_PROGRAM_SOURCE 0x1164
594 #define CL_PROGRAM_BINARY_SIZES 0x1165
595 #define CL_PROGRAM_BINARIES 0x1166
596 #define CL_PROGRAM_NUM_KERNELS 0x1167
597 #define CL_PROGRAM_KERNEL_NAMES 0x1168
598 #define CL_PROGRAM_BUILD_STATUS 0x1181
599 #define CL_PROGRAM_BUILD_OPTIONS 0x1182
600 #define CL_PROGRAM_BUILD_LOG 0x1183
601 #define CL_PROGRAM_BINARY_TYPE 0x1184
602 #define CL_PROGRAM_BINARY_TYPE_NONE 0x0
603 #define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1
604 #define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2
605 #define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4
606
607 #define CL_BUILD_SUCCESS 0
608 #define CL_BUILD_NONE -1
609 #define CL_BUILD_ERROR -2
610 #define CL_BUILD_IN_PROGRESS -3
611
612 #define CL_KERNEL_FUNCTION_NAME 0x1190
613 #define CL_KERNEL_NUM_ARGS 0x1191
614 #define CL_KERNEL_REFERENCE_COUNT 0x1192
615 #define CL_KERNEL_CONTEXT 0x1193
616 #define CL_KERNEL_PROGRAM 0x1194
617 #define CL_KERNEL_ATTRIBUTES 0x1195
618 #define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196
619 #define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197
620 #define CL_KERNEL_ARG_TYPE_NAME 0x1198
621 #define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199
622 #define CL_KERNEL_ARG_NAME 0x119A
623 #define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B
624 #define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C
625 #define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D
626 #define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E
627 #define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0
628 #define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1
629 #define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2
630 #define CL_KERNEL_ARG_ACCESS_NONE 0x11A3
631 #define CL_KERNEL_ARG_TYPE_NONE 0
632 #define CL_KERNEL_ARG_TYPE_CONST (1 << 0)
633 #define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1)
634 #define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2)
635 #define CL_KERNEL_WORK_GROUP_SIZE 0x11B0
636 #define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1
637 #define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2
638 #define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
639 #define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4
640 #define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5
641
642 #define CL_EVENT_COMMAND_QUEUE 0x11D0
643 #define CL_EVENT_COMMAND_TYPE 0x11D1
644 #define CL_EVENT_REFERENCE_COUNT 0x11D2
645 #define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3
646 #define CL_EVENT_CONTEXT 0x11D4
647
648 #define CL_COMMAND_NDRANGE_KERNEL 0x11F0
649 #define CL_COMMAND_TASK 0x11F1
650 #define CL_COMMAND_NATIVE_KERNEL 0x11F2
651 #define CL_COMMAND_READ_BUFFER 0x11F3
652 #define CL_COMMAND_WRITE_BUFFER 0x11F4
653 #define CL_COMMAND_COPY_BUFFER 0x11F5
654 #define CL_COMMAND_READ_IMAGE 0x11F6
655 #define CL_COMMAND_WRITE_IMAGE 0x11F7
656 #define CL_COMMAND_COPY_IMAGE 0x11F8
657 #define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9
658 #define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA
659 #define CL_COMMAND_MAP_BUFFER 0x11FB
660 #define CL_COMMAND_MAP_IMAGE 0x11FC
661 #define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD
662 #define CL_COMMAND_MARKER 0x11FE
663 #define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF
664 #define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200
665 #define CL_COMMAND_READ_BUFFER_RECT 0x1201
666 #define CL_COMMAND_WRITE_BUFFER_RECT 0x1202
667 #define CL_COMMAND_COPY_BUFFER_RECT 0x1203
668 #define CL_COMMAND_USER 0x1204
669 #define CL_COMMAND_BARRIER 0x1205
670 #define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206
671 #define CL_COMMAND_FILL_BUFFER 0x1207
672 #define CL_COMMAND_FILL_IMAGE 0x1208
673
674 #define CL_COMPLETE 0x0
675 #define CL_RUNNING 0x1
676 #define CL_SUBMITTED 0x2
677 #define CL_QUEUED 0x3
678 #define CL_BUFFER_CREATE_TYPE_REGION 0x1220
679
680 #define CL_PROFILING_COMMAND_QUEUED 0x1280
681 #define CL_PROFILING_COMMAND_SUBMIT 0x1281
682 #define CL_PROFILING_COMMAND_START 0x1282
683 #define CL_PROFILING_COMMAND_END 0x1283
684
685 #define CL_CALLBACK CV_STDCALL
686
687 static volatile bool g_haveOpenCL = false;
688 static const char* oclFuncToCheck = "clEnqueueReadBufferRect";
689
690 #if defined(__APPLE__)
691 #include <dlfcn.h>
692
693 static void* initOpenCLAndLoad(const char* funcname)
694 {
695 static bool initialized = false;
696 static void* handle = 0;
697 if (!handle)
698 {
699 if(!initialized)
700 {
701 const char* oclpath = getenv("OPENCV_OPENCL_RUNTIME");
702 oclpath = oclpath && strlen(oclpath) > 0 ? oclpath :
703 "/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL";
704 handle = dlopen(oclpath, RTLD_LAZY);
705 initialized = true;
706 g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
707 if( g_haveOpenCL )
708 fprintf(stderr, "Successfully loaded OpenCL v1.1+ runtime from %s\n", oclpath);
709 else
710 fprintf(stderr, "Failed to load OpenCL runtime\n");
711 }
712 if(!handle)
713 return 0;
714 }
715
716 return funcname && handle ? dlsym(handle, funcname) : 0;
717 }
718
719 #elif defined WIN32 || defined _WIN32
720
721 #ifndef _WIN32_WINNT // This is needed for the declaration of TryEnterCriticalSection in winbase.h with Visual Studio 2005 (and older?)
722 #define _WIN32_WINNT 0x0400 // http://msdn.microsoft.com/en-us/library/ms686857(VS.85).aspx
723 #endif
724 #include <windows.h>
725 #if (_WIN32_WINNT >= 0x0602)
726 #include <synchapi.h>
727 #endif
728 #undef small
729 #undef min
730 #undef max
731 #undef abs
732
733 static void* initOpenCLAndLoad(const char* funcname)
734 {
735 static bool initialized = false;
736 static HMODULE handle = 0;
737 if (!handle)
738 {
739 #ifndef WINRT
740 if(!initialized)
741 {
742 handle = LoadLibraryA("OpenCL.dll");
743 initialized = true;
744 g_haveOpenCL = handle != 0 && GetProcAddress(handle, oclFuncToCheck) != 0;
745 }
746 #endif
747 if(!handle)
748 return 0;
749 }
750
751 return funcname ? (void*)GetProcAddress(handle, funcname) : 0;
752 }
753
754 #elif defined(__linux)
755
756 #include <dlfcn.h>
757 #include <stdio.h>
758
759 static void* initOpenCLAndLoad(const char* funcname)
760 {
761 static bool initialized = false;
762 static void* handle = 0;
763 if (!handle)
764 {
765 if(!initialized)
766 {
767 handle = dlopen("libOpenCL.so", RTLD_LAZY);
768 if(!handle)
769 handle = dlopen("libCL.so", RTLD_LAZY);
770 initialized = true;
771 g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
772 }
773 if(!handle)
774 return 0;
775 }
776
777 return funcname ? (void*)dlsym(handle, funcname) : 0;
778 }
779
780 #else
781
782 static void* initOpenCLAndLoad(const char*)
783 {
784 return 0;
785 }
786
787 #endif
788
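// Illustrative sketch (not part of the original source): on Apple the
// OPENCV_OPENCL_RUNTIME environment variable can point the loader at an
// alternative OpenCL library; on the other platforms the stock library
// names (OpenCL.dll / libOpenCL.so / libCL.so) are tried.
#if 0
    // e.g. OPENCV_OPENCL_RUNTIME=/path/to/OpenCL ./my_app   (Apple only)
    void* fn = initOpenCLAndLoad("clGetPlatformIDs");  // NULL if no runtime was found
#endif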
789
790 #define OCL_FUNC(rettype, funcname, argsdecl, args) \
791 typedef rettype (CV_STDCALL * funcname##_t) argsdecl; \
792 static rettype funcname argsdecl \
793 { \
794 static funcname##_t funcname##_p = 0; \
795 if( !funcname##_p ) \
796 { \
797 funcname##_p = (funcname##_t)initOpenCLAndLoad(#funcname); \
798 if( !funcname##_p ) \
799 return OPENCV_CL_NOT_IMPLEMENTED; \
800 } \
801 return funcname##_p args; \
802 }
803
804
805 #define OCL_FUNC_P(rettype, funcname, argsdecl, args) \
806 typedef rettype (CV_STDCALL * funcname##_t) argsdecl; \
807 static rettype funcname argsdecl \
808 { \
809 static funcname##_t funcname##_p = 0; \
810 if( !funcname##_p ) \
811 { \
812 funcname##_p = (funcname##_t)initOpenCLAndLoad(#funcname); \
813 if( !funcname##_p ) \
814 { \
815 if( errcode_ret ) \
816 *errcode_ret = OPENCV_CL_NOT_IMPLEMENTED; \
817 return 0; \
818 } \
819 } \
820 return funcname##_p args; \
821 }
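// Illustrative sketch (not part of the original source): roughly what
// OCL_FUNC(cl_int, clFinish, (cl_command_queue command_queue), (command_queue))
// expands to -- a lazily bound stub that resolves the symbol on first call
// and reports OPENCV_CL_NOT_IMPLEMENTED when the runtime is missing.
#if 0
typedef cl_int (CV_STDCALL * clFinish_t) (cl_command_queue command_queue);
static cl_int clFinish(cl_command_queue command_queue)
{
    static clFinish_t clFinish_p = 0;
    if( !clFinish_p )
    {
        clFinish_p = (clFinish_t)initOpenCLAndLoad("clFinish");
        if( !clFinish_p )
            return OPENCV_CL_NOT_IMPLEMENTED;
    }
    return clFinish_p(command_queue);
}
#endif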
822
823 OCL_FUNC(cl_int, clGetPlatformIDs,
824 (cl_uint num_entries, cl_platform_id* platforms, cl_uint* num_platforms),
825 (num_entries, platforms, num_platforms))
826
827 OCL_FUNC(cl_int, clGetPlatformInfo,
828 (cl_platform_id platform, cl_platform_info param_name,
829 size_t param_value_size, void * param_value,
830 size_t * param_value_size_ret),
831 (platform, param_name, param_value_size, param_value, param_value_size_ret))
832
833 OCL_FUNC(cl_int, clGetDeviceInfo,
834 (cl_device_id device,
835 cl_device_info param_name,
836 size_t param_value_size,
837 void * param_value,
838 size_t * param_value_size_ret),
839 (device, param_name, param_value_size, param_value, param_value_size_ret))
840
841
842 OCL_FUNC(cl_int, clGetDeviceIDs,
843 (cl_platform_id platform,
844 cl_device_type device_type,
845 cl_uint num_entries,
846 cl_device_id * devices,
847 cl_uint * num_devices),
848 (platform, device_type, num_entries, devices, num_devices))
849
850 OCL_FUNC_P(cl_context, clCreateContext,
851 (const cl_context_properties * properties,
852 cl_uint num_devices,
853 const cl_device_id * devices,
854 void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *),
855 void * user_data,
856 cl_int * errcode_ret),
857 (properties, num_devices, devices, pfn_notify, user_data, errcode_ret))
858
859 OCL_FUNC(cl_int, clReleaseContext, (cl_context context), (context))
860
861 /*
862 OCL_FUNC(cl_int, clRetainContext, (cl_context context), (context))
863
864 OCL_FUNC_P(cl_context, clCreateContextFromType,
865 (const cl_context_properties * properties,
866 cl_device_type device_type,
867 void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *),
868 void * user_data,
869 cl_int * errcode_ret),
870 (properties, device_type, pfn_notify, user_data, errcode_ret))
871
872 OCL_FUNC(cl_int, clGetContextInfo,
873 (cl_context context,
874 cl_context_info param_name,
875 size_t param_value_size,
876 void * param_value,
877 size_t * param_value_size_ret),
878 (context, param_name, param_value_size,
879 param_value, param_value_size_ret))
880 */
881 OCL_FUNC_P(cl_command_queue, clCreateCommandQueue,
882 (cl_context context,
883 cl_device_id device,
884 cl_command_queue_properties properties,
885 cl_int * errcode_ret),
886 (context, device, properties, errcode_ret))
887
888 OCL_FUNC(cl_int, clReleaseCommandQueue, (cl_command_queue command_queue), (command_queue))
889
890 OCL_FUNC_P(cl_mem, clCreateBuffer,
891 (cl_context context,
892 cl_mem_flags flags,
893 size_t size,
894 void * host_ptr,
895 cl_int * errcode_ret),
896 (context, flags, size, host_ptr, errcode_ret))
897
898 /*
899 OCL_FUNC(cl_int, clRetainCommandQueue, (cl_command_queue command_queue), (command_queue))
900
901 OCL_FUNC(cl_int, clGetCommandQueueInfo,
902 (cl_command_queue command_queue,
903 cl_command_queue_info param_name,
904 size_t param_value_size,
905 void * param_value,
906 size_t * param_value_size_ret),
907 (command_queue, param_name, param_value_size, param_value, param_value_size_ret))
908
909 OCL_FUNC_P(cl_mem, clCreateSubBuffer,
910 (cl_mem buffer,
911 cl_mem_flags flags,
912 cl_buffer_create_type buffer_create_type,
913 const void * buffer_create_info,
914 cl_int * errcode_ret),
915 (buffer, flags, buffer_create_type, buffer_create_info, errcode_ret))
916 */
917
918 OCL_FUNC_P(cl_mem, clCreateImage,
919 (cl_context context,
920 cl_mem_flags flags,
921 const cl_image_format * image_format,
922 const cl_image_desc * image_desc,
923 void * host_ptr,
924 cl_int * errcode_ret),
925 (context, flags, image_format, image_desc, host_ptr, errcode_ret))
926
927 OCL_FUNC_P(cl_mem, clCreateImage2D,
928 (cl_context context,
929 cl_mem_flags flags,
930 const cl_image_format * image_format,
931 size_t image_width,
932 size_t image_height,
933 size_t image_row_pitch,
934 void * host_ptr,
935 cl_int *errcode_ret),
936 (context, flags, image_format, image_width, image_height, image_row_pitch, host_ptr, errcode_ret))
937
938 OCL_FUNC(cl_int, clGetSupportedImageFormats,
939 (cl_context context,
940 cl_mem_flags flags,
941 cl_mem_object_type image_type,
942 cl_uint num_entries,
943 cl_image_format * image_formats,
944 cl_uint * num_image_formats),
945 (context, flags, image_type, num_entries, image_formats, num_image_formats))
946
947
948 /*
949 OCL_FUNC(cl_int, clGetMemObjectInfo,
950 (cl_mem memobj,
951 cl_mem_info param_name,
952 size_t param_value_size,
953 void * param_value,
954 size_t * param_value_size_ret),
955 (memobj, param_name, param_value_size, param_value, param_value_size_ret))
956
957 OCL_FUNC(cl_int, clGetImageInfo,
958 (cl_mem image,
959 cl_image_info param_name,
960 size_t param_value_size,
961 void * param_value,
962 size_t * param_value_size_ret),
963 (image, param_name, param_value_size, param_value, param_value_size_ret))
964
965 OCL_FUNC(cl_int, clCreateKernelsInProgram,
966 (cl_program program,
967 cl_uint num_kernels,
968 cl_kernel * kernels,
969 cl_uint * num_kernels_ret),
970 (program, num_kernels, kernels, num_kernels_ret))
971
972 OCL_FUNC(cl_int, clRetainKernel, (cl_kernel kernel), (kernel))
973
974 OCL_FUNC(cl_int, clGetKernelArgInfo,
975 (cl_kernel kernel,
976 cl_uint arg_indx,
977 cl_kernel_arg_info param_name,
978 size_t param_value_size,
979 void * param_value,
980 size_t * param_value_size_ret),
981 (kernel, arg_indx, param_name, param_value_size, param_value, param_value_size_ret))
982
983 OCL_FUNC(cl_int, clEnqueueReadImage,
984 (cl_command_queue command_queue,
985 cl_mem image,
986 cl_bool blocking_read,
987 const size_t * origin[3],
988 const size_t * region[3],
989 size_t row_pitch,
990 size_t slice_pitch,
991 void * ptr,
992 cl_uint num_events_in_wait_list,
993 const cl_event * event_wait_list,
994 cl_event * event),
995 (command_queue, image, blocking_read, origin, region,
996 row_pitch, slice_pitch,
997 ptr,
998 num_events_in_wait_list,
999 event_wait_list,
1000 event))
1001
1002 OCL_FUNC(cl_int, clEnqueueWriteImage,
1003 (cl_command_queue command_queue,
1004 cl_mem image,
1005 cl_bool blocking_write,
1006 const size_t * origin[3],
1007 const size_t * region[3],
1008 size_t input_row_pitch,
1009 size_t input_slice_pitch,
1010 const void * ptr,
1011 cl_uint num_events_in_wait_list,
1012 const cl_event * event_wait_list,
1013 cl_event * event),
1014 (command_queue, image, blocking_write, origin, region, input_row_pitch,
1015 input_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event))
1016
1017 OCL_FUNC(cl_int, clEnqueueFillImage,
1018 (cl_command_queue command_queue,
1019 cl_mem image,
1020 const void * fill_color,
1021 const size_t * origin[3],
1022 const size_t * region[3],
1023 cl_uint num_events_in_wait_list,
1024 const cl_event * event_wait_list,
1025 cl_event * event),
1026 (command_queue, image, fill_color, origin, region,
1027 num_events_in_wait_list, event_wait_list, event))
1028
1029 OCL_FUNC(cl_int, clEnqueueCopyImage,
1030 (cl_command_queue command_queue,
1031 cl_mem src_image,
1032 cl_mem dst_image,
1033 const size_t * src_origin[3],
1034 const size_t * dst_origin[3],
1035 const size_t * region[3],
1036 cl_uint num_events_in_wait_list,
1037 const cl_event * event_wait_list,
1038 cl_event * event),
1039 (command_queue, src_image, dst_image, src_origin, dst_origin,
1040 region, num_events_in_wait_list, event_wait_list, event))
1041
1042 OCL_FUNC(cl_int, clEnqueueCopyImageToBuffer,
1043 (cl_command_queue command_queue,
1044 cl_mem src_image,
1045 cl_mem dst_buffer,
1046 const size_t * src_origin[3],
1047 const size_t * region[3],
1048 size_t dst_offset,
1049 cl_uint num_events_in_wait_list,
1050 const cl_event * event_wait_list,
1051 cl_event * event),
1052 (command_queue, src_image, dst_buffer, src_origin, region, dst_offset,
1053 num_events_in_wait_list, event_wait_list, event))
1054 */
1055
1056 OCL_FUNC(cl_int, clEnqueueCopyBufferToImage,
1057 (cl_command_queue command_queue,
1058 cl_mem src_buffer,
1059 cl_mem dst_image,
1060 size_t src_offset,
1061 const size_t dst_origin[3],
1062 const size_t region[3],
1063 cl_uint num_events_in_wait_list,
1064 const cl_event * event_wait_list,
1065 cl_event * event),
1066 (command_queue, src_buffer, dst_image, src_offset, dst_origin,
1067 region, num_events_in_wait_list, event_wait_list, event))
1068
1069 OCL_FUNC(cl_int, clFlush,
1070 (cl_command_queue command_queue),
1071 (command_queue))
1072
1073 /*
1074 OCL_FUNC_P(void*, clEnqueueMapImage,
1075 (cl_command_queue command_queue,
1076 cl_mem image,
1077 cl_bool blocking_map,
1078 cl_map_flags map_flags,
1079 const size_t * origin[3],
1080 const size_t * region[3],
1081 size_t * image_row_pitch,
1082 size_t * image_slice_pitch,
1083 cl_uint num_events_in_wait_list,
1084 const cl_event * event_wait_list,
1085 cl_event * event,
1086 cl_int * errcode_ret),
1087 (command_queue, image, blocking_map, map_flags, origin, region,
1088 image_row_pitch, image_slice_pitch, num_events_in_wait_list,
1089 event_wait_list, event, errcode_ret))
1090 */
1091
1092 /*
1093 OCL_FUNC(cl_int, clRetainProgram, (cl_program program), (program))
1094
1095 OCL_FUNC(cl_int, clGetKernelInfo,
1096 (cl_kernel kernel,
1097 cl_kernel_info param_name,
1098 size_t param_value_size,
1099 void * param_value,
1100 size_t * param_value_size_ret),
1101 (kernel, param_name, param_value_size, param_value, param_value_size_ret))
1102
1103 OCL_FUNC(cl_int, clRetainMemObject, (cl_mem memobj), (memobj))
1104
1105 */
1106
1107 OCL_FUNC(cl_int, clReleaseMemObject, (cl_mem memobj), (memobj))
1108
1109
1110 OCL_FUNC_P(cl_program, clCreateProgramWithSource,
1111 (cl_context context,
1112 cl_uint count,
1113 const char ** strings,
1114 const size_t * lengths,
1115 cl_int * errcode_ret),
1116 (context, count, strings, lengths, errcode_ret))
1117
1118 OCL_FUNC_P(cl_program, clCreateProgramWithBinary,
1119 (cl_context context,
1120 cl_uint num_devices,
1121 const cl_device_id * device_list,
1122 const size_t * lengths,
1123 const unsigned char ** binaries,
1124 cl_int * binary_status,
1125 cl_int * errcode_ret),
1126 (context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret))
1127
1128 OCL_FUNC(cl_int, clReleaseProgram, (cl_program program), (program))
1129
1130 OCL_FUNC(cl_int, clBuildProgram,
1131 (cl_program program,
1132 cl_uint num_devices,
1133 const cl_device_id * device_list,
1134 const char * options,
1135 void (CL_CALLBACK * pfn_notify)(cl_program, void *),
1136 void * user_data),
1137 (program, num_devices, device_list, options, pfn_notify, user_data))
1138
1139 OCL_FUNC(cl_int, clGetProgramInfo,
1140 (cl_program program,
1141 cl_program_info param_name,
1142 size_t param_value_size,
1143 void * param_value,
1144 size_t * param_value_size_ret),
1145 (program, param_name, param_value_size, param_value, param_value_size_ret))
1146
1147 OCL_FUNC(cl_int, clGetProgramBuildInfo,
1148 (cl_program program,
1149 cl_device_id device,
1150 cl_program_build_info param_name,
1151 size_t param_value_size,
1152 void * param_value,
1153 size_t * param_value_size_ret),
1154 (program, device, param_name, param_value_size, param_value, param_value_size_ret))
1155
1156 OCL_FUNC_P(cl_kernel, clCreateKernel,
1157 (cl_program program,
1158 const char * kernel_name,
1159 cl_int * errcode_ret),
1160 (program, kernel_name, errcode_ret))
1161
1162 OCL_FUNC(cl_int, clReleaseKernel, (cl_kernel kernel), (kernel))
1163
1164 OCL_FUNC(cl_int, clSetKernelArg,
1165 (cl_kernel kernel,
1166 cl_uint arg_index,
1167 size_t arg_size,
1168 const void * arg_value),
1169 (kernel, arg_index, arg_size, arg_value))
1170
1171 OCL_FUNC(cl_int, clGetKernelWorkGroupInfo,
1172 (cl_kernel kernel,
1173 cl_device_id device,
1174 cl_kernel_work_group_info param_name,
1175 size_t param_value_size,
1176 void * param_value,
1177 size_t * param_value_size_ret),
1178 (kernel, device, param_name, param_value_size, param_value, param_value_size_ret))
1179
1180 OCL_FUNC(cl_int, clFinish, (cl_command_queue command_queue), (command_queue))
1181
1182 OCL_FUNC(cl_int, clEnqueueReadBuffer,
1183 (cl_command_queue command_queue,
1184 cl_mem buffer,
1185 cl_bool blocking_read,
1186 size_t offset,
1187 size_t size,
1188 void * ptr,
1189 cl_uint num_events_in_wait_list,
1190 const cl_event * event_wait_list,
1191 cl_event * event),
1192 (command_queue, buffer, blocking_read, offset, size, ptr,
1193 num_events_in_wait_list, event_wait_list, event))
1194
1195 OCL_FUNC(cl_int, clEnqueueReadBufferRect,
1196 (cl_command_queue command_queue,
1197 cl_mem buffer,
1198 cl_bool blocking_read,
1199 const size_t * buffer_offset,
1200 const size_t * host_offset,
1201 const size_t * region,
1202 size_t buffer_row_pitch,
1203 size_t buffer_slice_pitch,
1204 size_t host_row_pitch,
1205 size_t host_slice_pitch,
1206 void * ptr,
1207 cl_uint num_events_in_wait_list,
1208 const cl_event * event_wait_list,
1209 cl_event * event),
1210 (command_queue, buffer, blocking_read, buffer_offset, host_offset, region, buffer_row_pitch,
1211 buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, num_events_in_wait_list,
1212 event_wait_list, event))
1213
1214 OCL_FUNC(cl_int, clEnqueueWriteBuffer,
1215 (cl_command_queue command_queue,
1216 cl_mem buffer,
1217 cl_bool blocking_write,
1218 size_t offset,
1219 size_t size,
1220 const void * ptr,
1221 cl_uint num_events_in_wait_list,
1222 const cl_event * event_wait_list,
1223 cl_event * event),
1224 (command_queue, buffer, blocking_write, offset, size, ptr,
1225 num_events_in_wait_list, event_wait_list, event))
1226
1227 OCL_FUNC(cl_int, clEnqueueWriteBufferRect,
1228 (cl_command_queue command_queue,
1229 cl_mem buffer,
1230 cl_bool blocking_write,
1231 const size_t * buffer_offset,
1232 const size_t * host_offset,
1233 const size_t * region,
1234 size_t buffer_row_pitch,
1235 size_t buffer_slice_pitch,
1236 size_t host_row_pitch,
1237 size_t host_slice_pitch,
1238 const void * ptr,
1239 cl_uint num_events_in_wait_list,
1240 const cl_event * event_wait_list,
1241 cl_event * event),
1242 (command_queue, buffer, blocking_write, buffer_offset, host_offset,
1243 region, buffer_row_pitch, buffer_slice_pitch, host_row_pitch,
1244 host_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event))
1245
1246 /*OCL_FUNC(cl_int, clEnqueueFillBuffer,
1247 (cl_command_queue command_queue,
1248 cl_mem buffer,
1249 const void * pattern,
1250 size_t pattern_size,
1251 size_t offset,
1252 size_t size,
1253 cl_uint num_events_in_wait_list,
1254 const cl_event * event_wait_list,
1255 cl_event * event),
1256 (command_queue, buffer, pattern, pattern_size, offset, size,
1257 num_events_in_wait_list, event_wait_list, event))*/
1258
1259 OCL_FUNC(cl_int, clEnqueueCopyBuffer,
1260 (cl_command_queue command_queue,
1261 cl_mem src_buffer,
1262 cl_mem dst_buffer,
1263 size_t src_offset,
1264 size_t dst_offset,
1265 size_t size,
1266 cl_uint num_events_in_wait_list,
1267 const cl_event * event_wait_list,
1268 cl_event * event),
1269 (command_queue, src_buffer, dst_buffer, src_offset, dst_offset,
1270 size, num_events_in_wait_list, event_wait_list, event))
1271
1272 OCL_FUNC(cl_int, clEnqueueCopyBufferRect,
1273 (cl_command_queue command_queue,
1274 cl_mem src_buffer,
1275 cl_mem dst_buffer,
1276 const size_t * src_origin,
1277 const size_t * dst_origin,
1278 const size_t * region,
1279 size_t src_row_pitch,
1280 size_t src_slice_pitch,
1281 size_t dst_row_pitch,
1282 size_t dst_slice_pitch,
1283 cl_uint num_events_in_wait_list,
1284 const cl_event * event_wait_list,
1285 cl_event * event),
1286 (command_queue, src_buffer, dst_buffer, src_origin, dst_origin,
1287 region, src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch,
1288 num_events_in_wait_list, event_wait_list, event))
1289
1290 OCL_FUNC_P(void*, clEnqueueMapBuffer,
1291 (cl_command_queue command_queue,
1292 cl_mem buffer,
1293 cl_bool blocking_map,
1294 cl_map_flags map_flags,
1295 size_t offset,
1296 size_t size,
1297 cl_uint num_events_in_wait_list,
1298 const cl_event * event_wait_list,
1299 cl_event * event,
1300 cl_int * errcode_ret),
1301 (command_queue, buffer, blocking_map, map_flags, offset, size,
1302 num_events_in_wait_list, event_wait_list, event, errcode_ret))
1303
1304 OCL_FUNC(cl_int, clEnqueueUnmapMemObject,
1305 (cl_command_queue command_queue,
1306 cl_mem memobj,
1307 void * mapped_ptr,
1308 cl_uint num_events_in_wait_list,
1309 const cl_event * event_wait_list,
1310 cl_event * event),
1311 (command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event))
1312
1313 OCL_FUNC(cl_int, clEnqueueNDRangeKernel,
1314 (cl_command_queue command_queue,
1315 cl_kernel kernel,
1316 cl_uint work_dim,
1317 const size_t * global_work_offset,
1318 const size_t * global_work_size,
1319 const size_t * local_work_size,
1320 cl_uint num_events_in_wait_list,
1321 const cl_event * event_wait_list,
1322 cl_event * event),
1323 (command_queue, kernel, work_dim, global_work_offset, global_work_size,
1324 local_work_size, num_events_in_wait_list, event_wait_list, event))
1325
1326 OCL_FUNC(cl_int, clEnqueueTask,
1327 (cl_command_queue command_queue,
1328 cl_kernel kernel,
1329 cl_uint num_events_in_wait_list,
1330 const cl_event * event_wait_list,
1331 cl_event * event),
1332 (command_queue, kernel, num_events_in_wait_list, event_wait_list, event))
1333
1334 OCL_FUNC(cl_int, clSetEventCallback,
1335 (cl_event event,
1336 cl_int command_exec_callback_type ,
1337 void (CL_CALLBACK *pfn_event_notify) (cl_event event, cl_int event_command_exec_status, void *user_data),
1338 void *user_data),
1339 (event, command_exec_callback_type, pfn_event_notify, user_data))
1340
1341 OCL_FUNC(cl_int, clReleaseEvent, (cl_event event), (event))
1342
1343 }
1344
1345 #endif
1346
1347 #ifndef CL_VERSION_1_2
1348 #define CL_VERSION_1_2
1349 #endif
1350
1351 #endif
1352
1353 #ifdef _DEBUG
1354 #define CV_OclDbgAssert CV_DbgAssert
1355 #else
1356 static bool isRaiseError()
1357 {
1358 static bool initialized = false;
1359 static bool value = false;
1360 if (!initialized)
1361 {
1362 value = getBoolParameter("OPENCV_OPENCL_RAISE_ERROR", false);
1363 initialized = true;
1364 }
1365 return value;
1366 }
1367 #define CV_OclDbgAssert(expr) do { if (isRaiseError()) { CV_Assert(expr); } else { (void)(expr); } } while ((void)0, 0)
1368 #endif
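// Illustrative sketch (not part of the original source): in release builds
// the wrapped expression is still evaluated, but it only turns into a hard
// CV_Assert failure when OPENCV_OPENCL_RAISE_ERROR is set in the environment;
// in _DEBUG builds it is a plain CV_DbgAssert.
#if 0
    CV_OclDbgAssert(clFinish(queue) == CL_SUCCESS);  // 'queue' is hypothetical
#endif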
1369
1370 #ifdef HAVE_OPENCL_SVM
1371 #include "opencv2/core/opencl/runtime/opencl_svm_20.hpp"
1372 #include "opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp"
1373 #include "opencv2/core/opencl/opencl_svm.hpp"
1374 #endif
1375
1376 namespace cv { namespace ocl {
1377
1378 struct UMat2D
1379 {
1380 UMat2D(const UMat& m)
1381 {
1382 offset = (int)m.offset;
1383 step = (int)m.step;
1384 rows = m.rows;
1385 cols = m.cols;
1386 }
1387 int offset;
1388 int step;
1389 int rows;
1390 int cols;
1391 };
1392
1393 struct UMat3D
1394 {
1395 UMat3D(const UMat& m)
1396 {
1397 offset = (int)m.offset;
1398 step = (int)m.step.p[1];
1399 slicestep = (int)m.step.p[0];
1400 slices = (int)m.size.p[0];
1401 rows = m.size.p[1];
1402 cols = m.size.p[2];
1403 }
1404 int offset;
1405 int slicestep;
1406 int step;
1407 int slices;
1408 int rows;
1409 int cols;
1410 };
1411
1412 // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
1413 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 )
1414 {
1415 static uint64 table[256];
1416 static bool initialized = false;
1417
1418 if( !initialized )
1419 {
1420 for( int i = 0; i < 256; i++ )
1421 {
1422 uint64 c = i;
1423 for( int j = 0; j < 8; j++ )
1424 c = ((c & 1) ? CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1);
1425 table[i] = c;
1426 }
1427 initialized = true;
1428 }
1429
1430 uint64 crc = ~crc0;
1431 for( size_t idx = 0; idx < size; idx++ )
1432 crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8);
1433
1434 return ~crc;
1435 }
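// Illustrative sketch (not part of the original source): the CRC is used as a
// cheap content hash; feeding it a byte buffer yields a 64-bit key, and the
// crc0 argument lets the hash be chained across several buffers.
#if 0
    const char* src = "__kernel void noop() {}";            // hypothetical data
    uint64 h1 = crc64((const uchar*)src, strlen(src));      // hash of one buffer
    uint64 h2 = crc64((const uchar*)src, strlen(src), h1);  // chained update
#endif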
1436
1437 struct HashKey
1438 {
1439 typedef uint64 part;
1440 HashKey(part _a, part _b) : a(_a), b(_b) {}
1441 part a, b;
1442 };
1443
1444 inline bool operator == (const HashKey& h1, const HashKey& h2)
1445 {
1446 return h1.a == h2.a && h1.b == h2.b;
1447 }
1448
1449 inline bool operator < (const HashKey& h1, const HashKey& h2)
1450 {
1451 return h1.a < h2.a || (h1.a == h2.a && h1.b < h2.b);
1452 }
1453
1454
1455 bool haveOpenCL()
1456 {
1457 #ifdef HAVE_OPENCL
1458 static bool g_isOpenCLInitialized = false;
1459 static bool g_isOpenCLAvailable = false;
1460
1461 if (!g_isOpenCLInitialized)
1462 {
1463 try
1464 {
1465 cl_uint n = 0;
1466 g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS;
1467 }
1468 catch (...)
1469 {
1470 g_isOpenCLAvailable = false;
1471 }
1472 g_isOpenCLInitialized = true;
1473 }
1474 return g_isOpenCLAvailable;
1475 #else
1476 return false;
1477 #endif
1478 }
1479
1480 bool useOpenCL()
1481 {
1482 CoreTLSData* data = getCoreTlsData().get();
1483 if( data->useOpenCL < 0 )
1484 {
1485 try
1486 {
1487 data->useOpenCL = (int)haveOpenCL() && Device::getDefault().ptr() && Device::getDefault().available();
1488 }
1489 catch (...)
1490 {
1491 data->useOpenCL = 0;
1492 }
1493 }
1494 return data->useOpenCL > 0;
1495 }
1496
1497 void setUseOpenCL(bool flag)
1498 {
1499 if( haveOpenCL() )
1500 {
1501 CoreTLSData* data = getCoreTlsData().get();
1502 data->useOpenCL = (flag && Device::getDefault().ptr() != NULL) ? 1 : 0;
1503 }
1504 }
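// Illustrative sketch (not part of the original source): typical use of the
// public toggles defined above from application code; the setting is stored
// per thread via CoreTLSData.
#if 0
    if (cv::ocl::haveOpenCL())        // runtime and a platform were detected
        cv::ocl::setUseOpenCL(true);  // opt in for the current thread
    bool enabled = cv::ocl::useOpenCL();
#endif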
1505
1506 #ifdef HAVE_CLAMDBLAS
1507
1508 class AmdBlasHelper
1509 {
1510 public:
1511 static AmdBlasHelper & getInstance()
1512 {
1513 static AmdBlasHelper amdBlas;
1514 return amdBlas;
1515 }
1516
1517 bool isAvailable() const
1518 {
1519 return g_isAmdBlasAvailable;
1520 }
1521
1522 ~AmdBlasHelper()
1523 {
1524 try
1525 {
1526 clAmdBlasTeardown();
1527 }
1528 catch (...) { }
1529 }
1530
1531 protected:
1532 AmdBlasHelper()
1533 {
1534 if (!g_isAmdBlasInitialized)
1535 {
1536 AutoLock lock(m);
1537
1538 if (!g_isAmdBlasInitialized && haveOpenCL())
1539 {
1540 try
1541 {
1542 g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess;
1543 }
1544 catch (...)
1545 {
1546 g_isAmdBlasAvailable = false;
1547 }
1548 }
1549 else
1550 g_isAmdBlasAvailable = false;
1551
1552 g_isAmdBlasInitialized = true;
1553 }
1554 }
1555
1556 private:
1557 static Mutex m;
1558 static bool g_isAmdBlasInitialized;
1559 static bool g_isAmdBlasAvailable;
1560 };
1561
1562 bool AmdBlasHelper::g_isAmdBlasAvailable = false;
1563 bool AmdBlasHelper::g_isAmdBlasInitialized = false;
1564 Mutex AmdBlasHelper::m;
1565
1566 bool haveAmdBlas()
1567 {
1568 return AmdBlasHelper::getInstance().isAvailable();
1569 }
1570
1571 #else
1572
1573 bool haveAmdBlas()
1574 {
1575 return false;
1576 }
1577
1578 #endif
1579
1580 #ifdef HAVE_CLAMDFFT
1581
1582 class AmdFftHelper
1583 {
1584 public:
1585 static AmdFftHelper & getInstance()
1586 {
1587 static AmdFftHelper amdFft;
1588 return amdFft;
1589 }
1590
1591 bool isAvailable() const
1592 {
1593 return g_isAmdFftAvailable;
1594 }
1595
1596 ~AmdFftHelper()
1597 {
1598 try
1599 {
1600 // clAmdFftTeardown();
1601 }
1602 catch (...) { }
1603 }
1604
1605 protected:
1606 AmdFftHelper()
1607 {
1608 if (!g_isAmdFftInitialized)
1609 {
1610 AutoLock lock(m);
1611
1612 if (!g_isAmdFftInitialized && haveOpenCL())
1613 {
1614 try
1615 {
1616 cl_uint major, minor, patch;
1617 CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS);
1618
1619 // it throws an exception if the AmdFft binaries are not found
1620 CV_Assert(clAmdFftGetVersion(&major, &minor, &patch) == CLFFT_SUCCESS);
1621 g_isAmdFftAvailable = true;
1622 }
1623 catch (const Exception &)
1624 {
1625 g_isAmdFftAvailable = false;
1626 }
1627 }
1628 else
1629 g_isAmdFftAvailable = false;
1630
1631 g_isAmdFftInitialized = true;
1632 }
1633 }
1634
1635 private:
1636 static clAmdFftSetupData setupData;
1637 static Mutex m;
1638 static bool g_isAmdFftInitialized;
1639 static bool g_isAmdFftAvailable;
1640 };
1641
1642 clAmdFftSetupData AmdFftHelper::setupData;
1643 bool AmdFftHelper::g_isAmdFftAvailable = false;
1644 bool AmdFftHelper::g_isAmdFftInitialized = false;
1645 Mutex AmdFftHelper::m;
1646
1647 bool haveAmdFft()
1648 {
1649 return AmdFftHelper::getInstance().isAvailable();
1650 }
1651
1652 #else
1653
1654 bool haveAmdFft()
1655 {
1656 return false;
1657 }
1658
1659 #endif
1660
1661 bool haveSVM()
1662 {
1663 #ifdef HAVE_OPENCL_SVM
1664 return true;
1665 #else
1666 return false;
1667 #endif
1668 }
1669
1670 void finish()
1671 {
1672 Queue::getDefault().finish();
1673 }
1674
1675 #define IMPLEMENT_REFCOUNTABLE() \
1676 void addref() { CV_XADD(&refcount, 1); } \
1677 void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \
1678 int refcount
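// Illustrative sketch (not part of the original source): inside an Impl
// struct the macro above expands to an intrusive reference count plus
// addref()/release(), which the public wrappers below (Platform, Device, ...)
// drive from their copy constructors, assignment operators and destructors.
#if 0
struct ExampleImpl             // hypothetical holder
{
    IMPLEMENT_REFCOUNTABLE();  // -> addref(), release(), int refcount;
};
#endif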
1679
1680 /////////////////////////////////////////// Platform /////////////////////////////////////////////
1681
1682 struct Platform::Impl
1683 {
1684 Impl()
1685 {
1686 refcount = 1;
1687 handle = 0;
1688 initialized = false;
1689 }
1690
1691 ~Impl() {}
1692
1693 void init()
1694 {
1695 if( !initialized )
1696 {
1697 //cl_uint num_entries
1698 cl_uint n = 0;
1699 if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 )
1700 handle = 0;
1701 if( handle != 0 )
1702 {
1703 char buf[1000];
1704 size_t len = 0;
1705 CV_OclDbgAssert(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len) == CL_SUCCESS);
1706 buf[len] = '\0';
1707 vendor = String(buf);
1708 }
1709
1710 initialized = true;
1711 }
1712 }
1713
1714 IMPLEMENT_REFCOUNTABLE();
1715
1716 cl_platform_id handle;
1717 String vendor;
1718 bool initialized;
1719 };
1720
1721 Platform::Platform()
1722 {
1723 p = 0;
1724 }
1725
1726 Platform::~Platform()
1727 {
1728 if(p)
1729 p->release();
1730 }
1731
1732 Platform::Platform(const Platform& pl)
1733 {
1734 p = (Impl*)pl.p;
1735 if(p)
1736 p->addref();
1737 }
1738
1739 Platform& Platform::operator = (const Platform& pl)
1740 {
1741 Impl* newp = (Impl*)pl.p;
1742 if(newp)
1743 newp->addref();
1744 if(p)
1745 p->release();
1746 p = newp;
1747 return *this;
1748 }
1749
1750 void* Platform::ptr() const
1751 {
1752 return p ? p->handle : 0;
1753 }
1754
1755 Platform& Platform::getDefault()
1756 {
1757 static Platform p;
1758 if( !p.p )
1759 {
1760 p.p = new Impl;
1761 p.p->init();
1762 }
1763 return p;
1764 }
1765
1766 /////////////////////////////////////// Device ////////////////////////////////////////////
1767
1768 // deviceVersion has format
1769 // OpenCL<space><major_version.minor_version><space><vendor-specific information>
1770 // by specification
1771 // http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
1772 // http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
1773 static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor)
1774 {
1775 major = minor = 0;
1776 if (10 >= deviceVersion.length())
1777 return;
1778 const char *pstr = deviceVersion.c_str();
1779 if (0 != strncmp(pstr, "OpenCL ", 7))
1780 return;
1781 size_t ppos = deviceVersion.find('.', 7);
1782 if (String::npos == ppos)
1783 return;
1784 String temp = deviceVersion.substr(7, ppos - 7);
1785 major = atoi(temp.c_str());
1786 temp = deviceVersion.substr(ppos + 1);
1787 minor = atoi(temp.c_str());
1788 }
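// Illustrative example of the parsing above (the version string is an assumed value,
// not taken from a real device):
//
//   int major = 0, minor = 0;
//   parseDeviceVersion("OpenCL 1.2 AMD-APP (1445.5)", major, minor);  // -> major == 1, minor == 2
//
// Strings that do not start with "OpenCL " or that contain no '.' after the prefix
// leave both outputs at 0.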
1789
1790 struct Device::Impl
1791 {
1792 Impl(void* d)
1793 {
1794 handle = (cl_device_id)d;
1795 refcount = 1;
1796
1797 name_ = getStrProp(CL_DEVICE_NAME);
1798 version_ = getStrProp(CL_DEVICE_VERSION);
1799 doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
1800 hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
1801 maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
1802 maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
1803 type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE);
1804 driverVersion_ = getStrProp(CL_DRIVER_VERSION);
1805
1806 String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
1807 parseDeviceVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
1808
1809 vendorName_ = getStrProp(CL_DEVICE_VENDOR);
1810 if (vendorName_ == "Advanced Micro Devices, Inc." ||
1811 vendorName_ == "AMD")
1812 vendorID_ = VENDOR_AMD;
1813 else if (vendorName_ == "Intel(R) Corporation" || vendorName_ == "Intel" || strstr(name_.c_str(), "Iris") != 0)
1814 vendorID_ = VENDOR_INTEL;
1815 else if (vendorName_ == "NVIDIA Corporation")
1816 vendorID_ = VENDOR_NVIDIA;
1817 else
1818 vendorID_ = UNKNOWN_VENDOR;
1819 }
1820
1821 template<typename _TpCL, typename _TpOut>
1822 _TpOut getProp(cl_device_info prop) const
1823 {
1824 _TpCL temp=_TpCL();
1825 size_t sz = 0;
1826
1827 return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1828 sz == sizeof(temp) ? _TpOut(temp) : _TpOut();
1829 }
1830
1831 bool getBoolProp(cl_device_info prop) const
1832 {
1833 cl_bool temp = CL_FALSE;
1834 size_t sz = 0;
1835
1836 return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1837 sz == sizeof(temp) ? temp != 0 : false;
1838 }
1839
1840 String getStrProp(cl_device_info prop) const
1841 {
1842 char buf[1024];
1843 size_t sz=0;
1844 return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
1845 sz < sizeof(buf) ? String(buf) : String();
1846 }
1847
1848 IMPLEMENT_REFCOUNTABLE();
1849 cl_device_id handle;
1850
1851 String name_;
1852 String version_;
1853 int doubleFPConfig_;
1854 bool hostUnifiedMemory_;
1855 int maxComputeUnits_;
1856 size_t maxWorkGroupSize_;
1857 int type_;
1858 int deviceVersionMajor_;
1859 int deviceVersionMinor_;
1860 String driverVersion_;
1861 String vendorName_;
1862 int vendorID_;
1863 };
1864
1865
Device()1866 Device::Device()
1867 {
1868 p = 0;
1869 }
1870
Device(void * d)1871 Device::Device(void* d)
1872 {
1873 p = 0;
1874 set(d);
1875 }
1876
Device(const Device & d)1877 Device::Device(const Device& d)
1878 {
1879 p = d.p;
1880 if(p)
1881 p->addref();
1882 }
1883
operator =(const Device & d)1884 Device& Device::operator = (const Device& d)
1885 {
1886 Impl* newp = (Impl*)d.p;
1887 if(newp)
1888 newp->addref();
1889 if(p)
1890 p->release();
1891 p = newp;
1892 return *this;
1893 }
1894
~Device()1895 Device::~Device()
1896 {
1897 if(p)
1898 p->release();
1899 }
1900
set(void * d)1901 void Device::set(void* d)
1902 {
1903 if(p)
1904 p->release();
1905 p = new Impl(d);
1906 }
1907
ptr() const1908 void* Device::ptr() const
1909 {
1910 return p ? p->handle : 0;
1911 }
1912
name() const1913 String Device::name() const
1914 { return p ? p->name_ : String(); }
1915
extensions() const1916 String Device::extensions() const
1917 { return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }
1918
version() const1919 String Device::version() const
1920 { return p ? p->version_ : String(); }
1921
vendorName() const1922 String Device::vendorName() const
1923 { return p ? p->vendorName_ : String(); }
1924
vendorID() const1925 int Device::vendorID() const
1926 { return p ? p->vendorID_ : 0; }
1927
OpenCL_C_Version() const1928 String Device::OpenCL_C_Version() const
1929 { return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); }
1930
1931 String Device::OpenCLVersion() const
1932 { return p ? p->getStrProp(CL_DEVICE_VERSION) : String(); }
1933
deviceVersionMajor() const1934 int Device::deviceVersionMajor() const
1935 { return p ? p->deviceVersionMajor_ : 0; }
1936
deviceVersionMinor() const1937 int Device::deviceVersionMinor() const
1938 { return p ? p->deviceVersionMinor_ : 0; }
1939
driverVersion() const1940 String Device::driverVersion() const
1941 { return p ? p->driverVersion_ : String(); }
1942
type() const1943 int Device::type() const
1944 { return p ? p->type_ : 0; }
1945
addressBits() const1946 int Device::addressBits() const
1947 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS) : 0; }
1948
available() const1949 bool Device::available() const
1950 { return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; }
1951
compilerAvailable() const1952 bool Device::compilerAvailable() const
1953 { return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; }
1954
linkerAvailable() const1955 bool Device::linkerAvailable() const
1956 #ifdef CL_VERSION_1_2
1957 { return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; }
1958 #else
1959 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1960 #endif
1961
doubleFPConfig() const1962 int Device::doubleFPConfig() const
1963 { return p ? p->doubleFPConfig_ : 0; }
1964
singleFPConfig() const1965 int Device::singleFPConfig() const
1966 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
1967
halfFPConfig() const1968 int Device::halfFPConfig() const
1969 #ifdef CL_VERSION_1_2
1970 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; }
1971 #else
1972 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1973 #endif
1974
endianLittle() const1975 bool Device::endianLittle() const
1976 { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
1977
errorCorrectionSupport() const1978 bool Device::errorCorrectionSupport() const
1979 { return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; }
1980
executionCapabilities() const1981 int Device::executionCapabilities() const
1982 { return p ? p->getProp<cl_device_exec_capabilities, int>(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; }
1983
globalMemCacheSize() const1984 size_t Device::globalMemCacheSize() const
1985 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; }
1986
globalMemCacheType() const1987 int Device::globalMemCacheType() const
1988 { return p ? p->getProp<cl_device_mem_cache_type, int>(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; }
1989
globalMemCacheLineSize() const1990 int Device::globalMemCacheLineSize() const
1991 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; }
1992
globalMemSize() const1993 size_t Device::globalMemSize() const
1994 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; }
1995
localMemSize() const1996 size_t Device::localMemSize() const
1997 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_LOCAL_MEM_SIZE) : 0; }
1998
localMemType() const1999 int Device::localMemType() const
2000 { return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
2001
hostUnifiedMemory() const2002 bool Device::hostUnifiedMemory() const
2003 { return p ? p->hostUnifiedMemory_ : false; }
2004
imageSupport() const2005 bool Device::imageSupport() const
2006 { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
2007
imageFromBufferSupport() const2008 bool Device::imageFromBufferSupport() const
2009 {
2010 bool ret = false;
2011 if (p)
2012 {
2013 size_t pos = p->getStrProp(CL_DEVICE_EXTENSIONS).find("cl_khr_image2d_from_buffer");
2014 if (pos != String::npos)
2015 {
2016 ret = true;
2017 }
2018 }
2019 return ret;
2020 }
2021
imagePitchAlignment() const2022 uint Device::imagePitchAlignment() const
2023 {
2024 #ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
2025 return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0;
2026 #else
2027 return 0;
2028 #endif
2029 }
2030
imageBaseAddressAlignment() const2031 uint Device::imageBaseAddressAlignment() const
2032 {
2033 #ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT
2034 return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0;
2035 #else
2036 return 0;
2037 #endif
2038 }
2039
image2DMaxWidth() const2040 size_t Device::image2DMaxWidth() const
2041 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
2042
image2DMaxHeight() const2043 size_t Device::image2DMaxHeight() const
2044 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; }
2045
image3DMaxWidth() const2046 size_t Device::image3DMaxWidth() const
2047 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; }
2048
image3DMaxHeight() const2049 size_t Device::image3DMaxHeight() const
2050 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; }
2051
image3DMaxDepth() const2052 size_t Device::image3DMaxDepth() const
2053 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; }
2054
imageMaxBufferSize() const2055 size_t Device::imageMaxBufferSize() const
2056 #ifdef CL_VERSION_1_2
2057 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; }
2058 #else
2059 { CV_REQUIRE_OPENCL_1_2_ERROR; }
2060 #endif
2061
imageMaxArraySize() const2062 size_t Device::imageMaxArraySize() const
2063 #ifdef CL_VERSION_1_2
2064 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; }
2065 #else
2066 { CV_REQUIRE_OPENCL_1_2_ERROR; }
2067 #endif
2068
maxClockFrequency() const2069 int Device::maxClockFrequency() const
2070 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
2071
maxComputeUnits() const2072 int Device::maxComputeUnits() const
2073 { return p ? p->maxComputeUnits_ : 0; }
2074
maxConstantArgs() const2075 int Device::maxConstantArgs() const
2076 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; }
2077
maxConstantBufferSize() const2078 size_t Device::maxConstantBufferSize() const
2079 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; }
2080
maxMemAllocSize() const2081 size_t Device::maxMemAllocSize() const
2082 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; }
2083
maxParameterSize() const2084 size_t Device::maxParameterSize() const
2085 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; }
2086
maxReadImageArgs() const2087 int Device::maxReadImageArgs() const
2088 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; }
2089
maxWriteImageArgs() const2090 int Device::maxWriteImageArgs() const
2091 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; }
2092
maxSamplers() const2093 int Device::maxSamplers() const
2094 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; }
2095
maxWorkGroupSize() const2096 size_t Device::maxWorkGroupSize() const
2097 { return p ? p->maxWorkGroupSize_ : 0; }
2098
maxWorkItemDims() const2099 int Device::maxWorkItemDims() const
2100 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; }
2101
2102 void Device::maxWorkItemSizes(size_t* sizes) const
2103 {
2104 if(p)
2105 {
2106 const int MAX_DIMS = 32;
2107 size_t retsz = 0;
2108 CV_OclDbgAssert(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
2109 MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz) == CL_SUCCESS);
2110 }
2111 }
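// Note: the call above advertises a buffer of MAX_DIMS (32) size_t entries to
// clGetDeviceInfo, so the caller's 'sizes' array should provide at least
// maxWorkItemDims() elements (32 is always safe). Hedged usage sketch:
//
//   size_t sizes[32] = {0};
//   cv::ocl::Device::getDefault().maxWorkItemSizes(sizes);
//   // sizes[0..maxWorkItemDims()-1] now hold the per-dimension limits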
2112
memBaseAddrAlign() const2113 int Device::memBaseAddrAlign() const
2114 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; }
2115
nativeVectorWidthChar() const2116 int Device::nativeVectorWidthChar() const
2117 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; }
2118
nativeVectorWidthShort() const2119 int Device::nativeVectorWidthShort() const
2120 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; }
2121
nativeVectorWidthInt() const2122 int Device::nativeVectorWidthInt() const
2123 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; }
2124
nativeVectorWidthLong() const2125 int Device::nativeVectorWidthLong() const
2126 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; }
2127
nativeVectorWidthFloat() const2128 int Device::nativeVectorWidthFloat() const
2129 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; }
2130
nativeVectorWidthDouble() const2131 int Device::nativeVectorWidthDouble() const
2132 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; }
2133
nativeVectorWidthHalf() const2134 int Device::nativeVectorWidthHalf() const
2135 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; }
2136
preferredVectorWidthChar() const2137 int Device::preferredVectorWidthChar() const
2138 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; }
2139
preferredVectorWidthShort() const2140 int Device::preferredVectorWidthShort() const
2141 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; }
2142
preferredVectorWidthInt() const2143 int Device::preferredVectorWidthInt() const
2144 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; }
2145
preferredVectorWidthLong() const2146 int Device::preferredVectorWidthLong() const
2147 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; }
2148
preferredVectorWidthFloat() const2149 int Device::preferredVectorWidthFloat() const
2150 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; }
2151
preferredVectorWidthDouble() const2152 int Device::preferredVectorWidthDouble() const
2153 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; }
2154
preferredVectorWidthHalf() const2155 int Device::preferredVectorWidthHalf() const
2156 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; }
2157
printfBufferSize() const2158 size_t Device::printfBufferSize() const
2159 #ifdef CL_VERSION_1_2
2160 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; }
2161 #else
2162 { CV_REQUIRE_OPENCL_1_2_ERROR; }
2163 #endif
2164
2165
profilingTimerResolution() const2166 size_t Device::profilingTimerResolution() const
2167 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; }
2168
2169 const Device& Device::getDefault()
2170 {
2171 const Context& ctx = Context::getDefault();
2172 int idx = getCoreTlsData().get()->device;
2173 const Device& device = ctx.device(idx);
2174 return device;
2175 }
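// Typical capability check against the default device (illustrative pattern only,
// not a prescribed API sequence):
//
//   const cv::ocl::Device& dev = cv::ocl::Device::getDefault();
//   bool haveDouble = dev.doubleFPConfig() > 0;   // non-zero FP config => fp64 usable
//   size_t wgs      = dev.maxWorkGroupSize();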
2176
2177 ////////////////////////////////////// Context ///////////////////////////////////////////////////
2178
2179 template <typename Functor, typename ObjectType>
getStringInfo(Functor f,ObjectType obj,cl_uint name,std::string & param)2180 inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
2181 {
2182 ::size_t required;
2183 cl_int err = f(obj, name, 0, NULL, &required);
2184 if (err != CL_SUCCESS)
2185 return err;
2186
2187 param.clear();
2188 if (required > 0)
2189 {
2190 AutoBuffer<char> buf(required + 1);
2191 char* ptr = (char*)buf; // cleanup is not needed
2192 err = f(obj, name, required, ptr, NULL);
2193 if (err != CL_SUCCESS)
2194 return err;
2195 param = ptr;
2196 }
2197
2198 return CL_SUCCESS;
2199 }
2200
split(const std::string & s,char delim,std::vector<std::string> & elems)2201 static void split(const std::string &s, char delim, std::vector<std::string> &elems)
2202 {
2203 elems.clear();
2204 if (s.size() == 0)
2205 return;
2206 std::istringstream ss(s);
2207 std::string item;
2208 while (!ss.eof())
2209 {
2210 std::getline(ss, item, delim);
2211 elems.push_back(item);
2212 }
2213 }
2214
2215 // Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
2216 // Sample: AMD:GPU:
2217 // Sample: AMD:GPU:Tahiti
2218 // Sample: :GPU|CPU: = '' = ':' = '::'
2219 static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
2220 std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
2221 {
2222 std::vector<std::string> parts;
2223 split(configurationStr, ':', parts);
2224 if (parts.size() > 3)
2225 {
2226 std::cerr << "ERROR: Invalid configuration string for OpenCL device" << std::endl;
2227 return false;
2228 }
2229 if (parts.size() > 2)
2230 deviceNameOrID = parts[2];
2231 if (parts.size() > 1)
2232 {
2233 split(parts[1], '|', deviceTypes);
2234 }
2235 if (parts.size() > 0)
2236 {
2237 platform = parts[0];
2238 }
2239 return true;
2240 }
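// Illustrative shell usage of the OPENCV_OPENCL_DEVICE variable parsed above
// (device names are examples only):
//
//   export OPENCV_OPENCL_DEVICE="AMD:GPU:Tahiti"  # platform containing "AMD", GPU named "Tahiti"
//   export OPENCV_OPENCL_DEVICE=":CPU:"           # any platform, any CPU device
//   export OPENCV_OPENCL_DEVICE="Intel:GPU:1"     # a single digit is treated as a device index
//   export OPENCV_OPENCL_DEVICE=disabled          # selectOpenCLDevice() below returns NULL
//
// For "AMD:GPU:Tahiti" the parser yields platform == "AMD", deviceTypes == {"GPU"},
// deviceNameOrID == "Tahiti".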
2241
2242 #ifdef WINRT
selectOpenCLDevice()2243 static cl_device_id selectOpenCLDevice()
2244 {
2245 return NULL;
2246 }
2247 #else
2248 static cl_device_id selectOpenCLDevice()
2249 {
2250 std::string platform, deviceName;
2251 std::vector<std::string> deviceTypes;
2252
2253 const char* configuration = getenv("OPENCV_OPENCL_DEVICE");
2254 if (configuration &&
2255 (strcmp(configuration, "disabled") == 0 ||
2256 !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName)
2257 ))
2258 return NULL;
2259
2260 bool isID = false;
2261 int deviceID = -1;
2262 if (deviceName.length() == 1)
2263 // We limit ID range to 0..9, because we want to write:
2264 // - '2500' to mean i5-2500
2265 // - '8350' to mean AMD FX-8350
2266 // - '650' to mean GeForce 650
2267 // To extend ID range change condition to '> 0'
2268 {
2269 isID = true;
2270 for (size_t i = 0; i < deviceName.length(); i++)
2271 {
2272 if (!isdigit(deviceName[i]))
2273 {
2274 isID = false;
2275 break;
2276 }
2277 }
2278 if (isID)
2279 {
2280 deviceID = atoi(deviceName.c_str());
2281 if (deviceID < 0)
2282 return NULL;
2283 }
2284 }
2285
2286 std::vector<cl_platform_id> platforms;
2287 {
2288 cl_uint numPlatforms = 0;
2289 CV_OclDbgAssert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS);
2290
2291 if (numPlatforms == 0)
2292 return NULL;
2293 platforms.resize((size_t)numPlatforms);
2294 CV_OclDbgAssert(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms) == CL_SUCCESS);
2295 platforms.resize(numPlatforms);
2296 }
2297
2298 int selectedPlatform = -1;
2299 if (platform.length() > 0)
2300 {
2301 for (size_t i = 0; i < platforms.size(); i++)
2302 {
2303 std::string name;
2304 CV_OclDbgAssert(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name) == CL_SUCCESS);
2305 if (name.find(platform) != std::string::npos)
2306 {
2307 selectedPlatform = (int)i;
2308 break;
2309 }
2310 }
2311 if (selectedPlatform == -1)
2312 {
2313 std::cerr << "ERROR: Can't find OpenCL platform by name: " << platform << std::endl;
2314 goto not_found;
2315 }
2316 }
2317 if (deviceTypes.size() == 0)
2318 {
2319 if (!isID)
2320 {
2321 deviceTypes.push_back("GPU");
2322 if (configuration)
2323 deviceTypes.push_back("CPU");
2324 }
2325 else
2326 deviceTypes.push_back("ALL");
2327 }
2328 for (size_t t = 0; t < deviceTypes.size(); t++)
2329 {
2330 int deviceType = 0;
2331 std::string tempStrDeviceType = deviceTypes[t];
2332 std::transform( tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), tolower );
2333
2334 if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2335 deviceType = Device::TYPE_GPU;
2336 else if (tempStrDeviceType == "cpu")
2337 deviceType = Device::TYPE_CPU;
2338 else if (tempStrDeviceType == "accelerator")
2339 deviceType = Device::TYPE_ACCELERATOR;
2340 else if (tempStrDeviceType == "all")
2341 deviceType = Device::TYPE_ALL;
2342 else
2343 {
2344 std::cerr << "ERROR: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t] << std::endl;
2345 goto not_found;
2346 }
2347
2348 std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
2349 for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
2350 (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
2351 i++)
2352 {
2353 cl_uint count = 0;
2354 cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
2355 CV_OclDbgAssert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND);
2356 if (count == 0)
2357 continue;
2358 size_t base = devices.size();
2359 devices.resize(base + count);
2360 status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
2361 CV_OclDbgAssert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND);
2362 }
2363
2364 for (size_t i = (isID ? deviceID : 0);
2365 (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
2366 i++)
2367 {
2368 std::string name;
2369 CV_OclDbgAssert(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name) == CL_SUCCESS);
2370 cl_bool useGPU = true;
2371 if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2372 {
2373 cl_bool isIGPU = CL_FALSE;
2374 clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL);
2375 useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
2376 }
2377 if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
2378 {
2379 // TODO check for OpenCL 1.1
2380 return devices[i];
2381 }
2382 }
2383 }
2384
2385 not_found:
2386 if (!configuration)
2387 return NULL; // suppress messages on stderr
2388
2389 std::cerr << "ERROR: Requested OpenCL device not found, check configuration: " << (configuration == NULL ? "" : configuration) << std::endl
2390 << " Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
2391 << " Device types: ";
2392 for (size_t t = 0; t < deviceTypes.size(); t++)
2393 std::cerr << deviceTypes[t] << " ";
2394
2395 std::cerr << std::endl << " Device name: " << (deviceName.length() == 0 ? "any" : deviceName) << std::endl;
2396 return NULL;
2397 }
2398 #endif
2399
2400 #ifdef HAVE_OPENCL_SVM
2401 namespace svm {
2402
2403 enum AllocatorFlags { // don't use first 16 bits
2404 OPENCL_SVM_COARSE_GRAIN_BUFFER = 1 << 16, // clSVMAlloc + SVM map/unmap
2405 OPENCL_SVM_FINE_GRAIN_BUFFER = 2 << 16, // clSVMAlloc
2406 OPENCL_SVM_FINE_GRAIN_SYSTEM = 3 << 16, // direct access
2407 OPENCL_SVM_BUFFER_MASK = 3 << 16,
2408 OPENCL_SVM_BUFFER_MAP = 4 << 16
2409 };
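// Bit layout note: these values occupy bits 16..18 of UMatData::allocatorFlags_
// (the comment above reserves the low 16 bits for other uses). OPENCL_SVM_BUFFER_MASK
// (3 << 16) extracts which of the three SVM allocation kinds was used, while
// OPENCL_SVM_BUFFER_MAP (4 << 16) is a separate flag bit; see e.g. the
// allocatorFlags_ check in Kernel::set(int, const KernelArg&) further below.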
2410
2411 static bool checkForceSVMUmatUsage()
2412 {
2413 static bool initialized = false;
2414 static bool force = false;
2415 if (!initialized)
2416 {
2417 force = getBoolParameter("OPENCV_OPENCL_SVM_FORCE_UMAT_USAGE", false);
2418 initialized = true;
2419 }
2420 return force;
2421 }
2422 static bool checkDisableSVMUMatUsage()
2423 {
2424 static bool initialized = false;
2425 static bool force = false;
2426 if (!initialized)
2427 {
2428 force = getBoolParameter("OPENCV_OPENCL_SVM_DISABLE_UMAT_USAGE", false);
2429 initialized = true;
2430 }
2431 return force;
2432 }
2433 static bool checkDisableSVM()
2434 {
2435 static bool initialized = false;
2436 static bool force = false;
2437 if (!initialized)
2438 {
2439 force = getBoolParameter("OPENCV_OPENCL_SVM_DISABLE", false);
2440 initialized = true;
2441 }
2442 return force;
2443 }
2444 // see SVMCapabilities
2445 static unsigned int getSVMCapabilitiesMask()
2446 {
2447 static bool initialized = false;
2448 static unsigned int mask = 0;
2449 if (!initialized)
2450 {
2451 const char* envValue = getenv("OPENCV_OPENCL_SVM_CAPABILITIES_MASK");
2452 if (envValue == NULL)
2453 {
2454 return ~0U; // all bits 1
2455 }
2456 mask = atoi(envValue);
2457 initialized = true;
2458 }
2459 return mask;
2460 }
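// Environment variables consulted by the helpers above (shell usage is illustrative):
//
//   OPENCV_OPENCL_SVM_FORCE_UMAT_USAGE=1      # prefer SVM-backed UMat buffers
//   OPENCV_OPENCL_SVM_DISABLE_UMAT_USAGE=1    # never back UMat with SVM
//   OPENCV_OPENCL_SVM_DISABLE=1               # disable SVM support entirely
//   OPENCV_OPENCL_SVM_CAPABILITIES_MASK=<int> # decimal bit mask AND-ed with the detected capabilities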
2461 } // namespace
2462 #endif
2463
2464 struct Context::Impl
2465 {
2466 static Context::Impl* get(Context& context) { return context.p; }
2467
2468 void __init()
2469 {
2470 refcount = 1;
2471 handle = 0;
2472 #ifdef HAVE_OPENCL_SVM
2473 svmInitialized = false;
2474 #endif
2475 }
2476
2477 Impl()
2478 {
2479 __init();
2480 }
2481
2482 void setDefault()
2483 {
2484 CV_Assert(handle == NULL);
2485
2486 cl_device_id d = selectOpenCLDevice();
2487
2488 if (d == NULL)
2489 return;
2490
2491 cl_platform_id pl = NULL;
2492 CV_OclDbgAssert(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL) == CL_SUCCESS);
2493
2494 cl_context_properties prop[] =
2495 {
2496 CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
2497 0
2498 };
2499
2500 // !!! in the current implementation force the number of devices to 1 !!!
2501 cl_uint nd = 1;
2502 cl_int status;
2503
2504 handle = clCreateContext(prop, nd, &d, 0, 0, &status);
2505
2506 bool ok = handle != 0 && status == CL_SUCCESS;
2507 if( ok )
2508 {
2509 devices.resize(nd);
2510 devices[0].set(d);
2511 }
2512 else
2513 handle = NULL;
2514 }
2515
2516 Impl(int dtype0)
2517 {
2518 __init();
2519
2520 cl_int retval = 0;
2521 cl_platform_id pl = (cl_platform_id)Platform::getDefault().ptr();
2522 cl_context_properties prop[] =
2523 {
2524 CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
2525 0
2526 };
2527
2528 cl_uint i, nd0 = 0, nd = 0;
2529 int dtype = dtype0 & 15;
2530 CV_OclDbgAssert(clGetDeviceIDs( pl, dtype, 0, 0, &nd0 ) == CL_SUCCESS);
2531
2532 AutoBuffer<void*> dlistbuf(nd0*2+1);
2533 cl_device_id* dlist = (cl_device_id*)(void**)dlistbuf;
2534 cl_device_id* dlist_new = dlist + nd0;
2535 CV_OclDbgAssert(clGetDeviceIDs( pl, dtype, nd0, dlist, &nd0 ) == CL_SUCCESS);
2536 String name0;
2537
2538 for(i = 0; i < nd0; i++)
2539 {
2540 Device d(dlist[i]);
2541 if( !d.available() || !d.compilerAvailable() )
2542 continue;
2543 if( dtype0 == Device::TYPE_DGPU && d.hostUnifiedMemory() )
2544 continue;
2545 if( dtype0 == Device::TYPE_IGPU && !d.hostUnifiedMemory() )
2546 continue;
2547 String name = d.name();
2548 if( nd != 0 && name != name0 )
2549 continue;
2550 name0 = name;
2551 dlist_new[nd++] = dlist[i];
2552 }
2553
2554 if(nd == 0)
2555 return;
2556
2557 // !!! in the current implementation force the number of devices to 1 !!!
2558 nd = 1;
2559
2560 handle = clCreateContext(prop, nd, dlist_new, 0, 0, &retval);
2561 bool ok = handle != 0 && retval == CL_SUCCESS;
2562 if( ok )
2563 {
2564 devices.resize(nd);
2565 for( i = 0; i < nd; i++ )
2566 devices[i].set(dlist_new[i]);
2567 }
2568 }
2569
2570 ~Impl()
2571 {
2572 if(handle)
2573 {
2574 clReleaseContext(handle);
2575 handle = NULL;
2576 }
2577 devices.clear();
2578 }
2579
2580 Program getProg(const ProgramSource& src,
2581 const String& buildflags, String& errmsg)
2582 {
2583 String prefix = Program::getPrefix(buildflags);
2584 HashKey k(src.hash(), crc64((const uchar*)prefix.c_str(), prefix.size()));
2585 phash_t::iterator it = phash.find(k);
2586 if( it != phash.end() )
2587 return it->second;
2588 //String filename = format("%08x%08x_%08x%08x.clb2",
2589 Program prog(src, buildflags, errmsg);
2590 if(prog.ptr())
2591 phash.insert(std::pair<HashKey,Program>(k, prog));
2592 return prog;
2593 }
2594
2595 IMPLEMENT_REFCOUNTABLE();
2596
2597 cl_context handle;
2598 std::vector<Device> devices;
2599
2600 typedef ProgramSource::hash_t hash_t;
2601
2602 struct HashKey
2603 {
2604 HashKey(hash_t _a, hash_t _b) : a(_a), b(_b) {}
2605 bool operator < (const HashKey& k) const { return a < k.a || (a == k.a && b < k.b); }
2606 bool operator == (const HashKey& k) const { return a == k.a && b == k.b; }
2607 bool operator != (const HashKey& k) const { return a != k.a || b != k.b; }
2608 hash_t a, b;
2609 };
2610 typedef std::map<HashKey, Program> phash_t;
2611 phash_t phash;
2612
2613 #ifdef HAVE_OPENCL_SVM
2614 bool svmInitialized;
2615 bool svmAvailable;
2616 bool svmEnabled;
2617 svm::SVMCapabilities svmCapabilities;
2618 svm::SVMFunctions svmFunctions;
2619
2620 void svmInit()
2621 {
2622 CV_Assert(handle != NULL);
2623 const Device& device = devices[0];
2624 cl_device_svm_capabilities deviceCaps = 0;
2625 CV_Assert(((void)0, CL_DEVICE_SVM_CAPABILITIES == CL_DEVICE_SVM_CAPABILITIES_AMD)); // Check assumption
2626 cl_int status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_SVM_CAPABILITIES, sizeof(deviceCaps), &deviceCaps, NULL);
2627 if (status != CL_SUCCESS)
2628 {
2629 CV_OPENCL_SVM_TRACE_ERROR_P("CL_DEVICE_SVM_CAPABILITIES via clGetDeviceInfo failed: %d\n", status);
2630 goto noSVM;
2631 }
2632 CV_OPENCL_SVM_TRACE_P("CL_DEVICE_SVM_CAPABILITIES returned: 0x%x\n", (int)deviceCaps);
2633 CV_Assert(((void)0, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER == CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_AMD)); // Check assumption
2634 svmCapabilities.value_ =
2635 ((deviceCaps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_COARSE_GRAIN_BUFFER : 0) |
2636 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_FINE_GRAIN_BUFFER : 0) |
2637 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) ? svm::SVMCapabilities::SVM_FINE_GRAIN_SYSTEM : 0) |
2638 ((deviceCaps & CL_DEVICE_SVM_ATOMICS) ? svm::SVMCapabilities::SVM_ATOMICS : 0);
2639 svmCapabilities.value_ &= svm::getSVMCapabilitiesMask();
2640 if (svmCapabilities.value_ == 0)
2641 {
2642 CV_OPENCL_SVM_TRACE_ERROR_P("svmCapabilities is empty\n");
2643 goto noSVM;
2644 }
2645 try
2646 {
2647 // Try OpenCL 2.0
2648 CV_OPENCL_SVM_TRACE_P("Try SVM from OpenCL 2.0 ...\n");
2649 void* ptr = clSVMAlloc(handle, CL_MEM_READ_WRITE, 100, 0);
2650 if (!ptr)
2651 {
2652 CV_OPENCL_SVM_TRACE_ERROR_P("clSVMAlloc returned NULL...\n");
2653 CV_ErrorNoReturn(Error::StsBadArg, "clSVMAlloc returned NULL");
2654 }
2655 try
2656 {
2657 bool error = false;
2658 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
2659 if (CL_SUCCESS != clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE, ptr, 100, 0, NULL, NULL))
2660 {
2661 CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMMap failed...\n");
2662 CV_ErrorNoReturn(Error::StsBadArg, "clEnqueueSVMMap FAILED");
2663 }
2664 clFinish(q);
2665 try
2666 {
2667 ((int*)ptr)[0] = 100;
2668 }
2669 catch (...)
2670 {
2671 CV_OPENCL_SVM_TRACE_ERROR_P("SVM buffer access test FAILED\n");
2672 error = true;
2673 }
2674 if (CL_SUCCESS != clEnqueueSVMUnmap(q, ptr, 0, NULL, NULL))
2675 {
2676 CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMUnmap failed...\n");
2677 CV_ErrorNoReturn(Error::StsBadArg, "clEnqueueSVMUnmap FAILED");
2678 }
2679 clFinish(q);
2680 if (error)
2681 {
2682 CV_ErrorNoReturn(Error::StsBadArg, "OpenCL SVM buffer access test was FAILED");
2683 }
2684 }
2685 catch (...)
2686 {
2687 CV_OPENCL_SVM_TRACE_ERROR_P("OpenCL SVM buffer access test was FAILED\n");
2688 clSVMFree(handle, ptr);
2689 throw;
2690 }
2691 clSVMFree(handle, ptr);
2692 svmFunctions.fn_clSVMAlloc = clSVMAlloc;
2693 svmFunctions.fn_clSVMFree = clSVMFree;
2694 svmFunctions.fn_clSetKernelArgSVMPointer = clSetKernelArgSVMPointer;
2695 //svmFunctions.fn_clSetKernelExecInfo = clSetKernelExecInfo;
2696 //svmFunctions.fn_clEnqueueSVMFree = clEnqueueSVMFree;
2697 svmFunctions.fn_clEnqueueSVMMemcpy = clEnqueueSVMMemcpy;
2698 svmFunctions.fn_clEnqueueSVMMemFill = clEnqueueSVMMemFill;
2699 svmFunctions.fn_clEnqueueSVMMap = clEnqueueSVMMap;
2700 svmFunctions.fn_clEnqueueSVMUnmap = clEnqueueSVMUnmap;
2701 }
2702 catch (...)
2703 {
2704 CV_OPENCL_SVM_TRACE_P("clSVMAlloc failed, trying HSA extension...\n");
2705 try
2706 {
2707 // Try HSA extension
2708 String extensions = device.extensions();
2709 if (extensions.find("cl_amd_svm") == String::npos)
2710 {
2711 CV_OPENCL_SVM_TRACE_P("Device extension doesn't have cl_amd_svm: %s\n", extensions.c_str());
2712 goto noSVM;
2713 }
2714 cl_platform_id p = NULL;
2715 status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &p, NULL);
2716 CV_Assert(status == CL_SUCCESS);
2717 svmFunctions.fn_clSVMAlloc = (clSVMAllocAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMAllocAMD");
2718 svmFunctions.fn_clSVMFree = (clSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMFreeAMD");
2719 svmFunctions.fn_clSetKernelArgSVMPointer = (clSetKernelArgSVMPointerAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelArgSVMPointerAMD");
2720 //svmFunctions.fn_clSetKernelExecInfo = (clSetKernelExecInfoAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelExecInfoAMD");
2721 //svmFunctions.fn_clEnqueueSVMFree = (clEnqueueSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMFreeAMD");
2722 svmFunctions.fn_clEnqueueSVMMemcpy = (clEnqueueSVMMemcpyAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemcpyAMD");
2723 svmFunctions.fn_clEnqueueSVMMemFill = (clEnqueueSVMMemFillAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemFillAMD");
2724 svmFunctions.fn_clEnqueueSVMMap = (clEnqueueSVMMapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMapAMD");
2725 svmFunctions.fn_clEnqueueSVMUnmap = (clEnqueueSVMUnmapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMUnmapAMD");
2726 CV_Assert(svmFunctions.isValid());
2727 }
2728 catch (...)
2729 {
2730 CV_OPENCL_SVM_TRACE_P("Something is totally wrong\n");
2731 goto noSVM;
2732 }
2733 }
2734
2735 svmAvailable = true;
2736 svmEnabled = !svm::checkDisableSVM();
2737 svmInitialized = true;
2738 CV_OPENCL_SVM_TRACE_P("OpenCV OpenCL SVM support initialized\n");
2739 return;
2740 noSVM:
2741 CV_OPENCL_SVM_TRACE_P("OpenCL SVM is not detected\n");
2742 svmAvailable = false;
2743 svmEnabled = false;
2744 svmCapabilities.value_ = 0;
2745 svmInitialized = true;
2746 svmFunctions.fn_clSVMAlloc = NULL;
2747 return;
2748 }
2749 #endif
2750 };
2751
2752
Context()2753 Context::Context()
2754 {
2755 p = 0;
2756 }
2757
Context(int dtype)2758 Context::Context(int dtype)
2759 {
2760 p = 0;
2761 create(dtype);
2762 }
2763
create()2764 bool Context::create()
2765 {
2766 if( !haveOpenCL() )
2767 return false;
2768 if(p)
2769 p->release();
2770 p = new Impl();
2771 if(!p->handle)
2772 {
2773 delete p;
2774 p = 0;
2775 }
2776 return p != 0;
2777 }
2778
create(int dtype0)2779 bool Context::create(int dtype0)
2780 {
2781 if( !haveOpenCL() )
2782 return false;
2783 if(p)
2784 p->release();
2785 p = new Impl(dtype0);
2786 if(!p->handle)
2787 {
2788 delete p;
2789 p = 0;
2790 }
2791 return p != 0;
2792 }
2793
~Context()2794 Context::~Context()
2795 {
2796 if (p)
2797 {
2798 p->release();
2799 p = NULL;
2800 }
2801 }
2802
Context(const Context & c)2803 Context::Context(const Context& c)
2804 {
2805 p = (Impl*)c.p;
2806 if(p)
2807 p->addref();
2808 }
2809
operator =(const Context & c)2810 Context& Context::operator = (const Context& c)
2811 {
2812 Impl* newp = (Impl*)c.p;
2813 if(newp)
2814 newp->addref();
2815 if(p)
2816 p->release();
2817 p = newp;
2818 return *this;
2819 }
2820
ptr() const2821 void* Context::ptr() const
2822 {
2823 return p == NULL ? NULL : p->handle;
2824 }
2825
ndevices() const2826 size_t Context::ndevices() const
2827 {
2828 return p ? p->devices.size() : 0;
2829 }
2830
device(size_t idx) const2831 const Device& Context::device(size_t idx) const
2832 {
2833 static Device dummy;
2834 return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
2835 }
2836
2837 Context& Context::getDefault(bool initialize)
2838 {
2839 static Context* ctx = new Context();
2840 if(!ctx->p && haveOpenCL())
2841 {
2842 if (!ctx->p)
2843 ctx->p = new Impl();
2844 if (initialize)
2845 {
2846 // do not create new Context right away.
2847 // First, try to retrieve existing context of the same type.
2848 // In its turn, Platform::getContext() may call Context::create()
2849 // if there is no such context.
2850 if (ctx->p->handle == NULL)
2851 ctx->p->setDefault();
2852 }
2853 }
2854
2855 return *ctx;
2856 }
2857
getProg(const ProgramSource & prog,const String & buildopts,String & errmsg)2858 Program Context::getProg(const ProgramSource& prog,
2859 const String& buildopts, String& errmsg)
2860 {
2861 return p ? p->getProg(prog, buildopts, errmsg) : Program();
2862 }
2863
2864
2865
2866 #ifdef HAVE_OPENCL_SVM
2867 bool Context::useSVM() const
2868 {
2869 Context::Impl* i = p;
2870 CV_Assert(i);
2871 if (!i->svmInitialized)
2872 i->svmInit();
2873 return i->svmEnabled;
2874 }
2875 void Context::setUseSVM(bool enabled)
2876 {
2877 Context::Impl* i = p;
2878 CV_Assert(i);
2879 if (!i->svmInitialized)
2880 i->svmInit();
2881 if (enabled && !i->svmAvailable)
2882 {
2883 CV_ErrorNoReturn(Error::StsError, "OpenCL Shared Virtual Memory (SVM) is not supported by OpenCL device");
2884 }
2885 i->svmEnabled = enabled;
2886 }
2887 #else
2888 bool Context::useSVM() const { return false; }
2889 void Context::setUseSVM(bool enabled) { CV_Assert(!enabled); }
2890 #endif
2891
2892 #ifdef HAVE_OPENCL_SVM
2893 namespace svm {
2894
getSVMCapabilitites(const ocl::Context & context)2895 const SVMCapabilities getSVMCapabilitites(const ocl::Context& context)
2896 {
2897 Context::Impl* i = context.p;
2898 CV_Assert(i);
2899 if (!i->svmInitialized)
2900 i->svmInit();
2901 return i->svmCapabilities;
2902 }
2903
getSVMFunctions(const ocl::Context & context)2904 CV_EXPORTS const SVMFunctions* getSVMFunctions(const ocl::Context& context)
2905 {
2906 Context::Impl* i = context.p;
2907 CV_Assert(i);
2908 CV_Assert(i->svmInitialized); // getSVMCapabilitites() must be called first
2909 CV_Assert(i->svmFunctions.fn_clSVMAlloc != NULL);
2910 return &i->svmFunctions;
2911 }
2912
useSVM(UMatUsageFlags usageFlags)2913 CV_EXPORTS bool useSVM(UMatUsageFlags usageFlags)
2914 {
2915 if (checkForceSVMUmatUsage())
2916 return true;
2917 if (checkDisableSVMUMatUsage())
2918 return false;
2919 if ((usageFlags & USAGE_ALLOCATE_SHARED_MEMORY) != 0)
2920 return true;
2921 return false; // don't use SVM by default
2922 }
2923
2924 } // namespace cv::ocl::svm
2925 #endif // HAVE_OPENCL_SVM
2926
2927
2928
2929 void initializeContextFromHandle(Context& ctx, void* platform, void* _context, void* _device)
2930 {
2931 cl_context context = (cl_context)_context;
2932 cl_device_id device = (cl_device_id)_device;
2933
2934 // cleanup old context
2935 Context::Impl * impl = ctx.p;
2936 if (impl->handle)
2937 {
2938 CV_OclDbgAssert(clReleaseContext(impl->handle) == CL_SUCCESS);
2939 }
2940 impl->devices.clear();
2941
2942 impl->handle = context;
2943 impl->devices.resize(1);
2944 impl->devices[0].set(device);
2945
2946 Platform& p = Platform::getDefault();
2947 Platform::Impl* pImpl = p.p;
2948 pImpl->handle = (cl_platform_id)platform;
2949 }
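// initializeContextFromHandle() rebinds the default Context/Platform wrappers to
// externally created OpenCL handles (intended for the interop paths that accept a
// user-supplied cl_context). Note that the previous cl_context is released and the
// devices vector is rebuilt with exactly one device, matching the single-device
// assumption used throughout this file.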
2950
2951 /////////////////////////////////////////// Queue /////////////////////////////////////////////
2952
2953 struct Queue::Impl
2954 {
2955 Impl(const Context& c, const Device& d)
2956 {
2957 refcount = 1;
2958 const Context* pc = &c;
2959 cl_context ch = (cl_context)pc->ptr();
2960 if( !ch )
2961 {
2962 pc = &Context::getDefault();
2963 ch = (cl_context)pc->ptr();
2964 }
2965 cl_device_id dh = (cl_device_id)d.ptr();
2966 if( !dh )
2967 dh = (cl_device_id)pc->device(0).ptr();
2968 cl_int retval = 0;
2969 handle = clCreateCommandQueue(ch, dh, 0, &retval);
2970 CV_OclDbgAssert(retval == CL_SUCCESS);
2971 }
2972
2973 ~Impl()
2974 {
2975 #ifdef _WIN32
2976 if (!cv::__termination)
2977 #endif
2978 {
2979 if(handle)
2980 {
2981 clFinish(handle);
2982 clReleaseCommandQueue(handle);
2983 handle = NULL;
2984 }
2985 }
2986 }
2987
2988 IMPLEMENT_REFCOUNTABLE();
2989
2990 cl_command_queue handle;
2991 };
2992
Queue()2993 Queue::Queue()
2994 {
2995 p = 0;
2996 }
2997
Queue(const Context & c,const Device & d)2998 Queue::Queue(const Context& c, const Device& d)
2999 {
3000 p = 0;
3001 create(c, d);
3002 }
3003
Queue(const Queue & q)3004 Queue::Queue(const Queue& q)
3005 {
3006 p = q.p;
3007 if(p)
3008 p->addref();
3009 }
3010
operator =(const Queue & q)3011 Queue& Queue::operator = (const Queue& q)
3012 {
3013 Impl* newp = (Impl*)q.p;
3014 if(newp)
3015 newp->addref();
3016 if(p)
3017 p->release();
3018 p = newp;
3019 return *this;
3020 }
3021
~Queue()3022 Queue::~Queue()
3023 {
3024 if(p)
3025 p->release();
3026 }
3027
create(const Context & c,const Device & d)3028 bool Queue::create(const Context& c, const Device& d)
3029 {
3030 if(p)
3031 p->release();
3032 p = new Impl(c, d);
3033 return p->handle != 0;
3034 }
3035
finish()3036 void Queue::finish()
3037 {
3038 if(p && p->handle)
3039 {
3040 CV_OclDbgAssert(clFinish(p->handle) == CL_SUCCESS);
3041 }
3042 }
3043
ptr() const3044 void* Queue::ptr() const
3045 {
3046 return p ? p->handle : 0;
3047 }
3048
getDefault()3049 Queue& Queue::getDefault()
3050 {
3051 Queue& q = getCoreTlsData().get()->oclQueue;
3052 if( !q.p && haveOpenCL() )
3053 q.create(Context::getDefault());
3054 return q;
3055 }
3056
getQueue(const Queue & q)3057 static cl_command_queue getQueue(const Queue& q)
3058 {
3059 cl_command_queue qq = (cl_command_queue)q.ptr();
3060 if(!qq)
3061 qq = (cl_command_queue)Queue::getDefault().ptr();
3062 return qq;
3063 }
3064
3065 /////////////////////////////////////////// KernelArg /////////////////////////////////////////////
3066
KernelArg()3067 KernelArg::KernelArg()
3068 : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
3069 {
3070 }
3071
KernelArg(int _flags,UMat * _m,int _wscale,int _iwscale,const void * _obj,size_t _sz)3072 KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
3073 : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
3074 {
3075 }
3076
3077 KernelArg KernelArg::Constant(const Mat& m)
3078 {
3079 CV_Assert(m.isContinuous());
3080 return KernelArg(CONSTANT, 0, 0, 0, m.ptr(), m.total()*m.elemSize());
3081 }
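// Hedged usage sketch (assumes a cv::ocl::Kernel 'k' whose OpenCL kernel takes a small
// constant table, plus UMats 'src'/'dst'; KernelArg::Constant passes the raw Mat buffer
// and its byte size straight to clSetKernelArg, hence the isContinuous() requirement above):
//
//   cv::Mat coeffs = (cv::Mat_<float>(1, 3) << 0.299f, 0.587f, 0.114f);
//   k.args(cv::ocl::KernelArg::ReadOnlyNoSize(src),
//          cv::ocl::KernelArg::WriteOnly(dst),
//          cv::ocl::KernelArg::Constant(coeffs));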
3082
3083 /////////////////////////////////////////// Kernel /////////////////////////////////////////////
3084
3085 struct Kernel::Impl
3086 {
3087 Impl(const char* kname, const Program& prog) :
3088 refcount(1), e(0), nu(0)
3089 {
3090 cl_program ph = (cl_program)prog.ptr();
3091 cl_int retval = 0;
3092 handle = ph != 0 ?
3093 clCreateKernel(ph, kname, &retval) : 0;
3094 CV_OclDbgAssert(retval == CL_SUCCESS);
3095 for( int i = 0; i < MAX_ARRS; i++ )
3096 u[i] = 0;
3097 haveTempDstUMats = false;
3098 }
3099
3100 void cleanupUMats()
3101 {
3102 for( int i = 0; i < MAX_ARRS; i++ )
3103 if( u[i] )
3104 {
3105 if( CV_XADD(&u[i]->urefcount, -1) == 1 )
3106 u[i]->currAllocator->deallocate(u[i]);
3107 u[i] = 0;
3108 }
3109 nu = 0;
3110 haveTempDstUMats = false;
3111 }
3112
3113 void addUMat(const UMat& m, bool dst)
3114 {
3115 CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0);
3116 u[nu] = m.u;
3117 CV_XADD(&m.u->urefcount, 1);
3118 nu++;
3119 if(dst && m.u->tempUMat())
3120 haveTempDstUMats = true;
3121 }
3122
3123 void addImage(const Image2D& image)
3124 {
3125 images.push_back(image);
3126 }
3127
3128 void finit()
3129 {
3130 cleanupUMats();
3131 images.clear();
3132 if(e) { clReleaseEvent(e); e = 0; }
3133 release();
3134 }
3135
3136 ~Impl()
3137 {
3138 if(handle)
3139 clReleaseKernel(handle);
3140 }
3141
3142 IMPLEMENT_REFCOUNTABLE();
3143
3144 cl_kernel handle;
3145 cl_event e;
3146 enum { MAX_ARRS = 16 };
3147 UMatData* u[MAX_ARRS];
3148 int nu;
3149 std::list<Image2D> images;
3150 bool haveTempDstUMats;
3151 };
3152
3153 }}
3154
3155 extern "C"
3156 {
3157 static void CL_CALLBACK oclCleanupCallback(cl_event, cl_int, void *p)
3158 {
3159 ((cv::ocl::Kernel::Impl*)p)->finit();
3160 }
3161
3162 }
3163
3164 namespace cv { namespace ocl {
3165
Kernel()3166 Kernel::Kernel()
3167 {
3168 p = 0;
3169 }
3170
Kernel(const char * kname,const Program & prog)3171 Kernel::Kernel(const char* kname, const Program& prog)
3172 {
3173 p = 0;
3174 create(kname, prog);
3175 }
3176
Kernel(const char * kname,const ProgramSource & src,const String & buildopts,String * errmsg)3177 Kernel::Kernel(const char* kname, const ProgramSource& src,
3178 const String& buildopts, String* errmsg)
3179 {
3180 p = 0;
3181 create(kname, src, buildopts, errmsg);
3182 }
3183
Kernel(const Kernel & k)3184 Kernel::Kernel(const Kernel& k)
3185 {
3186 p = k.p;
3187 if(p)
3188 p->addref();
3189 }
3190
operator =(const Kernel & k)3191 Kernel& Kernel::operator = (const Kernel& k)
3192 {
3193 Impl* newp = (Impl*)k.p;
3194 if(newp)
3195 newp->addref();
3196 if(p)
3197 p->release();
3198 p = newp;
3199 return *this;
3200 }
3201
~Kernel()3202 Kernel::~Kernel()
3203 {
3204 if(p)
3205 p->release();
3206 }
3207
create(const char * kname,const Program & prog)3208 bool Kernel::create(const char* kname, const Program& prog)
3209 {
3210 if(p)
3211 p->release();
3212 p = new Impl(kname, prog);
3213 if(p->handle == 0)
3214 {
3215 p->release();
3216 p = 0;
3217 }
3218 #ifdef CV_OPENCL_RUN_ASSERT // check kernel compilation fails
3219 CV_Assert(p);
3220 #endif
3221 return p != 0;
3222 }
3223
create(const char * kname,const ProgramSource & src,const String & buildopts,String * errmsg)3224 bool Kernel::create(const char* kname, const ProgramSource& src,
3225 const String& buildopts, String* errmsg)
3226 {
3227 if(p)
3228 {
3229 p->release();
3230 p = 0;
3231 }
3232 String tempmsg;
3233 if( !errmsg ) errmsg = &tempmsg;
3234 const Program& prog = Context::getDefault().getProg(src, buildopts, *errmsg);
3235 return create(kname, prog);
3236 }
3237
ptr() const3238 void* Kernel::ptr() const
3239 {
3240 return p ? p->handle : 0;
3241 }
3242
empty() const3243 bool Kernel::empty() const
3244 {
3245 return ptr() == 0;
3246 }
3247
set(int i,const void * value,size_t sz)3248 int Kernel::set(int i, const void* value, size_t sz)
3249 {
3250 if (!p || !p->handle)
3251 return -1;
3252 if (i < 0)
3253 return i;
3254 if( i == 0 )
3255 p->cleanupUMats();
3256
3257 cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value);
3258 CV_OclDbgAssert(retval == CL_SUCCESS);
3259 if (retval != CL_SUCCESS)
3260 return -1;
3261 return i+1;
3262 }
3263
set(int i,const Image2D & image2D)3264 int Kernel::set(int i, const Image2D& image2D)
3265 {
3266 p->addImage(image2D);
3267 cl_mem h = (cl_mem)image2D.ptr();
3268 return set(i, &h, sizeof(h));
3269 }
3270
set(int i,const UMat & m)3271 int Kernel::set(int i, const UMat& m)
3272 {
3273 return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0));
3274 }
3275
set(int i,const KernelArg & arg)3276 int Kernel::set(int i, const KernelArg& arg)
3277 {
3278 if( !p || !p->handle )
3279 return -1;
3280 if (i < 0)
3281 return i;
3282 if( i == 0 )
3283 p->cleanupUMats();
3284 if( arg.m )
3285 {
3286 int accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) +
3287 ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : 0);
3288 bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
3289 cl_mem h = (cl_mem)arg.m->handle(accessFlags);
3290
3291 if (!h)
3292 {
3293 p->release();
3294 p = 0;
3295 return -1;
3296 }
3297
3298 #ifdef HAVE_OPENCL_SVM
3299 if ((arg.m->u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
3300 {
3301 const Context& ctx = Context::getDefault();
3302 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
3303 uchar*& svmDataPtr = (uchar*&)arg.m->u->handle;
3304 CV_OPENCL_SVM_TRACE_P("clSetKernelArgSVMPointer: %p\n", svmDataPtr);
3305 #if 1 // TODO
3306 cl_int status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, svmDataPtr);
3307 #else
3308 cl_int status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, &svmDataPtr);
3309 #endif
3310 CV_Assert(status == CL_SUCCESS);
3311 }
3312 else
3313 #endif
3314 {
3315 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h) == CL_SUCCESS);
3316 }
3317
3318 if (ptronly)
3319 {
3320 i++;
3321 }
3322 else if( arg.m->dims <= 2 )
3323 {
3324 UMat2D u2d(*arg.m);
3325 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step) == CL_SUCCESS);
3326 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset) == CL_SUCCESS);
3327 i += 3;
3328
3329 if( !(arg.flags & KernelArg::NO_SIZE) )
3330 {
3331 int cols = u2d.cols*arg.wscale/arg.iwscale;
3332 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows) == CL_SUCCESS);
3333 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols) == CL_SUCCESS);
3334 i += 2;
3335 }
3336 }
3337 else
3338 {
3339 UMat3D u3d(*arg.m);
3340 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep) == CL_SUCCESS);
3341 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step) == CL_SUCCESS);
3342 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset) == CL_SUCCESS);
3343 i += 4;
3344 if( !(arg.flags & KernelArg::NO_SIZE) )
3345 {
3346 int cols = u3d.cols*arg.wscale/arg.iwscale;
3347 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.slices) == CL_SUCCESS);
3348 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows) == CL_SUCCESS);
3349 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols) == CL_SUCCESS);
3350 i += 3;
3351 }
3352 }
3353 p->addUMat(*arg.m, (accessFlags & ACCESS_WRITE) != 0);
3354 return i;
3355 }
3356 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj) == CL_SUCCESS);
3357 return i+1;
3358 }
3359
3360
3361 bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
3362 bool sync, const Queue& q)
3363 {
3364 if(!p || !p->handle || p->e != 0)
3365 return false;
3366
3367 cl_command_queue qq = getQueue(q);
3368 size_t offset[CV_MAX_DIM] = {0}, globalsize[CV_MAX_DIM] = {1,1,1};
3369 size_t total = 1;
3370 CV_Assert(_globalsize != 0);
3371 for (int i = 0; i < dims; i++)
3372 {
3373 size_t val = _localsize ? _localsize[i] :
3374 dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
3375 CV_Assert( val > 0 );
3376 total *= _globalsize[i];
3377 globalsize[i] = ((_globalsize[i] + val - 1)/val)*val;
3378 }
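// e.g. with _globalsize[i] == 1000 and an effective local size of 64 the loop above
// enqueues globalsize[i] == ((1000 + 63) / 64) * 64 == 1024, i.e. the global range is
// rounded up to a multiple of the (possibly defaulted) local size.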
3379 if( total == 0 )
3380 return true;
3381 if( p->haveTempDstUMats )
3382 sync = true;
3383 cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
3384 offset, globalsize, _localsize, 0, 0,
3385 sync ? 0 : &p->e);
3386 #if CV_OPENCL_SHOW_RUN_ERRORS
3387 if (retval != CL_SUCCESS)
3388 {
3389 printf("OpenCL program returns error: %d\n", retval);
3390 fflush(stdout);
3391 }
3392 #endif
3393 if( sync || retval != CL_SUCCESS )
3394 {
3395 CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
3396 p->cleanupUMats();
3397 }
3398 else
3399 {
3400 p->addref();
3401 CV_OclDbgAssert(clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
3402 }
3403 return retval == CL_SUCCESS;
3404 }
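// Hedged launch sketch (a 2D kernel over an image-sized range; because _localsize is
// NULL, the per-dimension defaults chosen above are used, and the queue argument is
// assumed to default to the thread's default queue):
//
//   size_t globalsize[2] = { (size_t)src.cols, (size_t)src.rows };
//   bool ok = k.run(2, globalsize, NULL, false);   // asynchronous run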
3405
runTask(bool sync,const Queue & q)3406 bool Kernel::runTask(bool sync, const Queue& q)
3407 {
3408 if(!p || !p->handle || p->e != 0)
3409 return false;
3410
3411 cl_command_queue qq = getQueue(q);
3412 cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &p->e);
3413 if( sync || retval != CL_SUCCESS )
3414 {
3415 CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
3416 p->cleanupUMats();
3417 }
3418 else
3419 {
3420 p->addref();
3421 CV_OclDbgAssert(clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
3422 }
3423 return retval == CL_SUCCESS;
3424 }
3425
3426
workGroupSize() const3427 size_t Kernel::workGroupSize() const
3428 {
3429 if(!p || !p->handle)
3430 return 0;
3431 size_t val = 0, retsz = 0;
3432 cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3433 return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE,
3434 sizeof(val), &val, &retsz) == CL_SUCCESS ? val : 0;
3435 }
3436
preferedWorkGroupSizeMultiple() const3437 size_t Kernel::preferedWorkGroupSizeMultiple() const
3438 {
3439 if(!p || !p->handle)
3440 return 0;
3441 size_t val = 0, retsz = 0;
3442 cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3443 return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
3444 sizeof(val), &val, &retsz) == CL_SUCCESS ? val : 0;
3445 }
3446
3447 bool Kernel::compileWorkGroupSize(size_t wsz[]) const
3448 {
3449 if(!p || !p->handle || !wsz)
3450 return false;
3451 size_t retsz = 0;
3452 cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3453 return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
3454 sizeof(wsz[0])*3, wsz, &retsz) == CL_SUCCESS;
3455 }
3456
localMemSize() const3457 size_t Kernel::localMemSize() const
3458 {
3459 if(!p || !p->handle)
3460 return 0;
3461 size_t retsz = 0;
3462 cl_ulong val = 0;
3463 cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3464 return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE,
3465 sizeof(val), &val, &retsz) == CL_SUCCESS ? (size_t)val : 0;
3466 }
3467
3468 /////////////////////////////////////////// Program /////////////////////////////////////////////
3469
3470 struct Program::Impl
3471 {
3472 Impl(const ProgramSource& _src,
3473 const String& _buildflags, String& errmsg)
3474 {
3475 refcount = 1;
3476 const Context& ctx = Context::getDefault();
3477 src = _src;
3478 buildflags = _buildflags;
3479 const String& srcstr = src.source();
3480 const char* srcptr = srcstr.c_str();
3481 size_t srclen = srcstr.size();
3482 cl_int retval = 0;
3483
3484 handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
3485 if( handle && retval == CL_SUCCESS )
3486 {
3487 int i, n = (int)ctx.ndevices();
3488 AutoBuffer<void*> deviceListBuf(n+1);
3489 void** deviceList = deviceListBuf;
3490 for( i = 0; i < n; i++ )
3491 deviceList[i] = ctx.device(i).ptr();
3492
3493 Device device = Device::getDefault();
3494 if (device.isAMD())
3495 buildflags += " -D AMD_DEVICE";
3496 else if (device.isIntel())
3497 buildflags += " -D INTEL_DEVICE";
3498
3499 retval = clBuildProgram(handle, n,
3500 (const cl_device_id*)deviceList,
3501 buildflags.c_str(), 0, 0);
3502 #if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG
3503 if( retval != CL_SUCCESS )
3504 #endif
3505 {
3506 size_t retsz = 0;
3507 cl_int buildInfo_retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
3508 CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
3509 if (buildInfo_retval == CL_SUCCESS && retsz > 1)
3510 {
3511 AutoBuffer<char> bufbuf(retsz + 16);
3512 char* buf = bufbuf;
3513 buildInfo_retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
3514 CL_PROGRAM_BUILD_LOG, retsz+1, buf, &retsz);
3515 if (buildInfo_retval == CL_SUCCESS)
3516 {
3517 // TODO It is useful to see kernel name & program file name also
3518 errmsg = String(buf);
3519 printf("OpenCL program build log: %s\n%s\n", buildflags.c_str(), errmsg.c_str());
3520 fflush(stdout);
3521 }
3522 }
3523 if (retval != CL_SUCCESS && handle)
3524 {
3525 clReleaseProgram(handle);
3526 handle = NULL;
3527 }
3528 }
3529 }
3530 }
3531
3532 Impl(const String& _buf, const String& _buildflags)
3533 {
3534 refcount = 1;
3535 handle = 0;
3536 buildflags = _buildflags;
3537 if(_buf.empty())
3538 return;
3539 String prefix0 = Program::getPrefix(buildflags);
3540 const Context& ctx = Context::getDefault();
3541 const Device& dev = Device::getDefault();
3542 const char* pos0 = _buf.c_str();
3543 const char* pos1 = strchr(pos0, '\n');
3544 if(!pos1)
3545 return;
3546 const char* pos2 = strchr(pos1+1, '\n');
3547 if(!pos2)
3548 return;
3549 const char* pos3 = strchr(pos2+1, '\n');
3550 if(!pos3)
3551 return;
3552 size_t prefixlen = (pos3 - pos0)+1;
3553 String prefix(pos0, prefixlen);
3554 if( prefix != prefix0 )
3555 return;
3556 const uchar* bin = (uchar*)(pos3+1);
3557 void* devid = dev.ptr();
3558 size_t codelen = _buf.length() - prefixlen;
3559 cl_int binstatus = 0, retval = 0;
3560 handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), 1, (cl_device_id*)&devid,
3561 &codelen, &bin, &binstatus, &retval);
3562 CV_OclDbgAssert(retval == CL_SUCCESS);
3563 }
3564
3565 String store()
3566 {
3567 if(!handle)
3568 return String();
3569 size_t progsz = 0, retsz = 0;
3570 String prefix = Program::getPrefix(buildflags);
3571 size_t prefixlen = prefix.length();
3572 if(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(progsz), &progsz, &retsz) != CL_SUCCESS)
3573 return String();
3574 AutoBuffer<uchar> bufbuf(prefixlen + progsz + 16);
3575 uchar* buf = bufbuf;
3576 memcpy(buf, prefix.c_str(), prefixlen);
3577 buf += prefixlen;
3578 if(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(buf), &buf, &retsz) != CL_SUCCESS)
3579 return String();
3580 buf[progsz] = (uchar)'\0';
3581 return String((const char*)(uchar*)bufbuf, prefixlen + progsz);
3582 }
3583
3584 ~Impl()
3585 {
3586 if( handle )
3587 {
3588 #ifdef _WIN32
3589 if (!cv::__termination)
3590 #endif
3591 {
3592 clReleaseProgram(handle);
3593 }
3594 handle = NULL;
3595 }
3596 }
3597
3598 IMPLEMENT_REFCOUNTABLE();
3599
3600 ProgramSource src;
3601 String buildflags;
3602 cl_program handle;
3603 };
3604
3605
3606 Program::Program() { p = 0; }
3607
3608 Program::Program(const ProgramSource& src,
3609 const String& buildflags, String& errmsg)
3610 {
3611 p = 0;
3612 create(src, buildflags, errmsg);
3613 }
3614
3615 Program::Program(const Program& prog)
3616 {
3617 p = prog.p;
3618 if(p)
3619 p->addref();
3620 }
3621
3622 Program& Program::operator = (const Program& prog)
3623 {
3624 Impl* newp = (Impl*)prog.p;
3625 if(newp)
3626 newp->addref();
3627 if(p)
3628 p->release();
3629 p = newp;
3630 return *this;
3631 }
3632
3633 Program::~Program()
3634 {
3635 if(p)
3636 p->release();
3637 }
3638
3639 bool Program::create(const ProgramSource& src,
3640 const String& buildflags, String& errmsg)
3641 {
3642 if(p)
3643 p->release();
3644 p = new Impl(src, buildflags, errmsg);
3645 if(!p->handle)
3646 {
3647 p->release();
3648 p = 0;
3649 }
3650 return p != 0;
3651 }
3652
3653 const ProgramSource& Program::source() const
3654 {
3655 static ProgramSource dummy;
3656 return p ? p->src : dummy;
3657 }
3658
3659 void* Program::ptr() const
3660 {
3661 return p ? p->handle : 0;
3662 }
3663
3664 bool Program::read(const String& bin, const String& buildflags)
3665 {
3666 if(p)
3667 p->release();
3668 p = new Impl(bin, buildflags);
3669 return p->handle != 0;
3670 }
3671
3672 bool Program::write(String& bin) const
3673 {
3674 if(!p)
3675 return false;
3676 bin = p->store();
3677 return !bin.empty();
3678 }
3679
3680 String Program::getPrefix() const
3681 {
3682 if(!p)
3683 return String();
3684 return getPrefix(p->buildflags);
3685 }
3686
3687 String Program::getPrefix(const String& buildflags)
3688 {
3689 const Context& ctx = Context::getDefault();
3690 const Device& dev = ctx.device(0);
3691 return format("name=%s\ndriver=%s\nbuildflags=%s\n",
3692 dev.name().c_str(), dev.driverVersion().c_str(), buildflags.c_str());
3693 }
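/*
 * Illustrative sketch (comment only, not part of the build): how getPrefix() ties into the
 * Program::write()/Program::read() binary round trip implemented above. The cached blob is
 * the three-line prefix ("name=...\ndriver=...\nbuildflags=...\n") followed by the raw
 * program binary, so read() can reject a cache entry produced on a different device, driver
 * version or set of build flags. The persistence helpers below are hypothetical.
 *
 *     cv::ocl::Program prog(source, buildflags, errmsg);   // compiled from source
 *     cv::String blob;
 *     if (prog.write(blob))                  // blob = prefix + binary
 *         saveBlobSomewhere(blob);           // hypothetical helper
 *
 *     cv::ocl::Program cached;
 *     if (cached.read(loadBlobFromSomewhere(), buildflags)) // prefix must match getPrefix(buildflags)
 *         ; // use the pre-built program, skipping clBuildProgram
 */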
3694
3695 ///////////////////////////////////////// ProgramSource ///////////////////////////////////////////////
3696
3697 struct ProgramSource::Impl
3698 {
3699 Impl(const char* _src)
3700 {
3701 init(String(_src));
3702 }
3703 Impl(const String& _src)
3704 {
3705 init(_src);
3706 }
3707 void init(const String& _src)
3708 {
3709 refcount = 1;
3710 src = _src;
3711 h = crc64((uchar*)src.c_str(), src.size());
3712 }
3713
3714 IMPLEMENT_REFCOUNTABLE();
3715 String src;
3716 ProgramSource::hash_t h;
3717 };
3718
3719
3720 ProgramSource::ProgramSource()
3721 {
3722 p = 0;
3723 }
3724
3725 ProgramSource::ProgramSource(const char* prog)
3726 {
3727 p = new Impl(prog);
3728 }
3729
3730 ProgramSource::ProgramSource(const String& prog)
3731 {
3732 p = new Impl(prog);
3733 }
3734
3735 ProgramSource::~ProgramSource()
3736 {
3737 if(p)
3738 p->release();
3739 }
3740
3741 ProgramSource::ProgramSource(const ProgramSource& prog)
3742 {
3743 p = prog.p;
3744 if(p)
3745 p->addref();
3746 }
3747
3748 ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
3749 {
3750 Impl* newp = (Impl*)prog.p;
3751 if(newp)
3752 newp->addref();
3753 if(p)
3754 p->release();
3755 p = newp;
3756 return *this;
3757 }
3758
3759 const String& ProgramSource::source() const
3760 {
3761 static String dummy;
3762 return p ? p->src : dummy;
3763 }
3764
3765 ProgramSource::hash_t ProgramSource::hash() const
3766 {
3767 return p ? p->h : 0;
3768 }
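/*
 * Illustrative sketch (comment only): ProgramSource is the usual entry point for running a
 * hand-written kernel, and its crc64-based hash() together with the build flags is what the
 * engine uses to key compiled programs. The cv::ocl::Kernel constructor used below is the
 * one declared in opencv2/core/ocl.hpp; the kernel body itself is a toy example.
 *
 *     static const char* code =
 *         "__kernel void fill42(__global uchar* dst) { dst[get_global_id(0)] = 42; }";
 *     cv::ocl::ProgramSource src(code);
 *     cv::String errmsg;
 *     cv::ocl::Kernel k("fill42", src, "", &errmsg);  // compiled now or taken from the cache
 *     // src.hash() stays constant for identical source text, so repeated construction
 *     // with the same code and flags does not trigger a rebuild.
 */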
3769
3770 //////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////
3771
3772 template<typename T>
3773 class OpenCLBufferPool
3774 {
3775 protected:
3776 ~OpenCLBufferPool() { }
3777 public:
3778 virtual T allocate(size_t size) = 0;
3779 virtual void release(T buffer) = 0;
3780 };
3781
3782 template <typename Derived, typename BufferEntry, typename T>
3783 class OpenCLBufferPoolBaseImpl : public BufferPoolController, public OpenCLBufferPool<T>
3784 {
3785 private:
3786 inline Derived& derived() { return *static_cast<Derived*>(this); }
3787 protected:
3788 Mutex mutex_;
3789
3790 size_t currentReservedSize;
3791 size_t maxReservedSize;
3792
3793 std::list<BufferEntry> allocatedEntries_; // Allocated and used entries
3794 std::list<BufferEntry> reservedEntries_; // LRU order. Allocated, but not used entries
3795
3796 // synchronized
3797 bool _findAndRemoveEntryFromAllocatedList(CV_OUT BufferEntry& entry, T buffer)
3798 {
3799 typename std::list<BufferEntry>::iterator i = allocatedEntries_.begin();
3800 for (; i != allocatedEntries_.end(); ++i)
3801 {
3802 BufferEntry& e = *i;
3803 if (e.clBuffer_ == buffer)
3804 {
3805 entry = e;
3806 allocatedEntries_.erase(i);
3807 return true;
3808 }
3809 }
3810 return false;
3811 }
3812
3813 // synchronized
3814 bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
3815 {
3816 if (reservedEntries_.empty())
3817 return false;
3818 typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
3819 typename std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
3820 BufferEntry result;
3821 size_t minDiff = (size_t)(-1);
3822 for (; i != reservedEntries_.end(); ++i)
3823 {
3824 BufferEntry& e = *i;
3825 if (e.capacity_ >= size)
3826 {
3827 size_t diff = e.capacity_ - size;
3828 if (diff < size / 8 && (result_pos == reservedEntries_.end() || diff < minDiff))
3829 {
3830 minDiff = diff;
3831 result_pos = i;
3832 result = e;
3833 if (diff == 0)
3834 break;
3835 }
3836 }
3837 }
3838 if (result_pos != reservedEntries_.end())
3839 {
3840 //CV_DbgAssert(result == *result_pos);
3841 reservedEntries_.erase(result_pos);
3842 entry = result;
3843 currentReservedSize -= entry.capacity_;
3844 allocatedEntries_.push_back(entry);
3845 return true;
3846 }
3847 return false;
3848 }
3849
3850 // synchronized
3851 void _checkSizeOfReservedEntries()
3852 {
3853 while (currentReservedSize > maxReservedSize)
3854 {
3855 CV_DbgAssert(!reservedEntries_.empty());
3856 const BufferEntry& entry = reservedEntries_.back();
3857 CV_DbgAssert(currentReservedSize >= entry.capacity_);
3858 currentReservedSize -= entry.capacity_;
3859 derived()._releaseBufferEntry(entry);
3860 reservedEntries_.pop_back();
3861 }
3862 }
3863
3864 inline size_t _allocationGranularity(size_t size)
3865 {
3866 // heuristic values
3867 if (size < 1024)
3868 return 16;
3869 else if (size < 64*1024)
3870 return 64;
3871 else if (size < 1024*1024)
3872 return 4096;
3873 else if (size < 16*1024*1024)
3874 return 64*1024;
3875 else
3876 return 1024*1024;
3877 }
3878
3879 public:
3880 OpenCLBufferPoolBaseImpl()
3881 : currentReservedSize(0),
3882 maxReservedSize(0)
3883 {
3884 // nothing
3885 }
3886 virtual ~OpenCLBufferPoolBaseImpl()
3887 {
3888 freeAllReservedBuffers();
3889 CV_Assert(reservedEntries_.empty());
3890 }
3891 public:
3892 virtual T allocate(size_t size)
3893 {
3894 AutoLock locker(mutex_);
3895 BufferEntry entry;
3896 if (maxReservedSize > 0 && _findAndRemoveEntryFromReservedList(entry, size))
3897 {
3898 CV_DbgAssert(size <= entry.capacity_);
3899 LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
3900 }
3901 else
3902 {
3903 derived()._allocateBufferEntry(entry, size);
3904 }
3905 return entry.clBuffer_;
3906 }
3907 virtual void release(T buffer)
3908 {
3909 AutoLock locker(mutex_);
3910 BufferEntry entry;
3911 CV_Assert(_findAndRemoveEntryFromAllocatedList(entry, buffer));
3912 if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
3913 {
3914 derived()._releaseBufferEntry(entry);
3915 }
3916 else
3917 {
3918 reservedEntries_.push_front(entry);
3919 currentReservedSize += entry.capacity_;
3920 _checkSizeOfReservedEntries();
3921 }
3922 }
3923
3924 virtual size_t getReservedSize() const { return currentReservedSize; }
3925 virtual size_t getMaxReservedSize() const { return maxReservedSize; }
3926 virtual void setMaxReservedSize(size_t size)
3927 {
3928 AutoLock locker(mutex_);
3929 size_t oldMaxReservedSize = maxReservedSize;
3930 maxReservedSize = size;
3931 if (maxReservedSize < oldMaxReservedSize)
3932 {
3933 typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
3934 for (; i != reservedEntries_.end();)
3935 {
3936 const BufferEntry& entry = *i;
3937 if (entry.capacity_ > maxReservedSize / 8)
3938 {
3939 CV_DbgAssert(currentReservedSize >= entry.capacity_);
3940 currentReservedSize -= entry.capacity_;
3941 derived()._releaseBufferEntry(entry);
3942 i = reservedEntries_.erase(i);
3943 continue;
3944 }
3945 ++i;
3946 }
3947 _checkSizeOfReservedEntries();
3948 }
3949 }
3950 virtual void freeAllReservedBuffers()
3951 {
3952 AutoLock locker(mutex_);
3953 typename std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
3954 for (; i != reservedEntries_.end(); ++i)
3955 {
3956 const BufferEntry& entry = *i;
3957 derived()._releaseBufferEntry(entry);
3958 }
3959 reservedEntries_.clear();
3960 currentReservedSize = 0;
3961 }
3962 };
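/*
 * Sketch of the pool policy implemented above (comment only; the driver code is
 * hypothetical and uses the OpenCLBufferPoolImpl specialization defined just below):
 * allocate() reuses a reserved buffer only when its capacity exceeds the request by less
 * than size/8 (best fit wins); release() parks a buffer on the reserved LRU list unless it
 * is larger than maxReservedSize/8, then trims the list until currentReservedSize fits
 * under maxReservedSize again.
 *
 *     OpenCLBufferPoolImpl pool;
 *     pool.setMaxReservedSize(16 * 1024 * 1024);  // keep at most ~16 MiB reserved
 *     cl_mem a = pool.allocate(1 << 20);          // fresh clCreateBuffer
 *     pool.release(a);                            // goes to the reserved list
 *     cl_mem b = pool.allocate(1 << 20);          // same cl_mem handed back (diff == 0)
 *     pool.release(b);
 *     pool.freeAllReservedBuffers();              // drop everything still cached
 */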
3963
3964 struct CLBufferEntry
3965 {
3966 cl_mem clBuffer_;
3967 size_t capacity_;
3968 CLBufferEntry() : clBuffer_((cl_mem)NULL), capacity_(0) { }
3969 };
3970
3971 class OpenCLBufferPoolImpl : public OpenCLBufferPoolBaseImpl<OpenCLBufferPoolImpl, CLBufferEntry, cl_mem>
3972 {
3973 public:
3974 typedef struct CLBufferEntry BufferEntry;
3975 protected:
3976 int createFlags_;
3977 public:
3978 OpenCLBufferPoolImpl(int createFlags = 0)
3979 : createFlags_(createFlags)
3980 {
3981 }
3982
3983 void _allocateBufferEntry(BufferEntry& entry, size_t size)
3984 {
3985 CV_DbgAssert(entry.clBuffer_ == NULL);
3986 entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
3987 Context& ctx = Context::getDefault();
3988 cl_int retval = CL_SUCCESS;
3989 entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE|createFlags_, entry.capacity_, 0, &retval);
3990 CV_Assert(retval == CL_SUCCESS);
3991 CV_Assert(entry.clBuffer_ != NULL);
3992 if(retval == CL_SUCCESS)
3993 {
3994 CV_IMPL_ADD(CV_IMPL_OCL);
3995 }
3996 LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
3997 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
3998 allocatedEntries_.push_back(entry);
3999 }
4000
4001 void _releaseBufferEntry(const BufferEntry& entry)
4002 {
4003 CV_Assert(entry.capacity_ != 0);
4004 CV_Assert(entry.clBuffer_ != NULL);
4005 LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
4006 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4007 clReleaseMemObject(entry.clBuffer_);
4008 }
4009 };
4010
4011 #ifdef HAVE_OPENCL_SVM
4012 struct CLSVMBufferEntry
4013 {
4014 void* clBuffer_;
4015 size_t capacity_;
4016 CLSVMBufferEntry() : clBuffer_(NULL), capacity_(0) { }
4017 };
4018 class OpenCLSVMBufferPoolImpl : public OpenCLBufferPoolBaseImpl<OpenCLSVMBufferPoolImpl, CLSVMBufferEntry, void*>
4019 {
4020 public:
4021 typedef struct CLSVMBufferEntry BufferEntry;
4022 public:
4023 OpenCLSVMBufferPoolImpl()
4024 {
4025 }
4026
4027 void _allocateBufferEntry(BufferEntry& entry, size_t size)
4028 {
4029 CV_DbgAssert(entry.clBuffer_ == NULL);
4030 entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4031
4032 Context& ctx = Context::getDefault();
4033 const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4034 bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4035 cl_svm_mem_flags memFlags = CL_MEM_READ_WRITE |
4036 (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
4037
4038 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4039 CV_DbgAssert(svmFns->isValid());
4040
4041 CV_OPENCL_SVM_TRACE_P("clSVMAlloc: %d\n", (int)entry.capacity_);
4042 void *buf = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, entry.capacity_, 0);
4043 CV_Assert(buf);
4044
4045 entry.clBuffer_ = buf;
4046 {
4047 CV_IMPL_ADD(CV_IMPL_OCL);
4048 }
4049 LOG_BUFFER_POOL("OpenCL SVM allocate %lld (0x%llx) bytes: %p\n",
4050 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4051 allocatedEntries_.push_back(entry);
4052 }
4053
4054 void _releaseBufferEntry(const BufferEntry& entry)
4055 {
4056 CV_Assert(entry.capacity_ != 0);
4057 CV_Assert(entry.clBuffer_ != NULL);
4058 LOG_BUFFER_POOL("OpenCL release SVM buffer: %p, %lld (0x%llx) bytes\n",
4059 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4060 Context& ctx = Context::getDefault();
4061 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4062 CV_DbgAssert(svmFns->isValid());
4063 CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", entry.clBuffer_);
4064 svmFns->fn_clSVMFree((cl_context)ctx.ptr(), entry.clBuffer_);
4065 }
4066 };
4067 #endif
4068
4069
4070
4071 #if defined _MSC_VER
4072 #pragma warning(disable:4127) // conditional expression is constant
4073 #endif
4074 template <bool readAccess, bool writeAccess>
4075 class AlignedDataPtr
4076 {
4077 protected:
4078 const size_t size_;
4079 uchar* const originPtr_;
4080 const size_t alignment_;
4081 uchar* ptr_;
4082 uchar* allocatedPtr_;
4083
4084 public:
4085 AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
4086 : size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
4087 {
4088 CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
4089 if (((size_t)ptr_ & (alignment - 1)) != 0)
4090 {
4091 allocatedPtr_ = new uchar[size_ + alignment - 1];
4092 ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
4093 if (readAccess)
4094 {
4095 memcpy(ptr_, originPtr_, size_);
4096 }
4097 }
4098 }
4099
4100 uchar* getAlignedPtr() const
4101 {
4102 CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
4103 return ptr_;
4104 }
4105
4106 ~AlignedDataPtr()
4107 {
4108 if (allocatedPtr_)
4109 {
4110 if (writeAccess)
4111 {
4112 memcpy(originPtr_, ptr_, size_);
4113 }
4114 delete[] allocatedPtr_;
4115 allocatedPtr_ = NULL;
4116 }
4117 ptr_ = NULL;
4118 }
4119 private:
4120 AlignedDataPtr(const AlignedDataPtr&); // disabled
4121 AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
4122 };
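/*
 * Illustrative use of AlignedDataPtr (comment only): it hands clEnqueueRead/WriteBuffer a
 * pointer aligned to CV_OPENCL_DATA_PTR_ALIGNMENT even when the caller's buffer is not,
 * copying into a temporary on construction (readAccess) and copying back on destruction
 * (writeAccess). The host buffer and queue below are hypothetical.
 *
 *     uchar* hostPtr = somePossiblyUnalignedBuffer;  // e.g. an offset into user memory
 *     {
 *         AlignedDataPtr<false, true> p(hostPtr, byteCount, CV_OPENCL_DATA_PTR_ALIGNMENT);
 *         clEnqueueReadBuffer(q, clbuf, CL_TRUE, 0, byteCount, p.getAlignedPtr(), 0, 0, 0);
 *     }   // destructor copies the temporary back into hostPtr if one was needed
 */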
4123 #if defined _MSC_VER
4124 #pragma warning(default:4127) // conditional expression is constant
4125 #endif
4126
4127 #ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
4128 #define CV_OPENCL_DATA_PTR_ALIGNMENT 16
4129 #endif
4130
4131 class OpenCLAllocator : public MatAllocator
4132 {
4133 mutable OpenCLBufferPoolImpl bufferPool;
4134 mutable OpenCLBufferPoolImpl bufferPoolHostPtr;
4135 #ifdef HAVE_OPENCL_SVM
4136 mutable OpenCLSVMBufferPoolImpl bufferPoolSVM;
4137 #endif
4138
4139 enum AllocatorFlags
4140 {
4141 ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0,
4142 ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED = 1 << 1
4143 #ifdef HAVE_OPENCL_SVM
4144 ,ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED = 1 << 2
4145 #endif
4146 };
4147 public:
4148 OpenCLAllocator()
4149 : bufferPool(0),
4150 bufferPoolHostPtr(CL_MEM_ALLOC_HOST_PTR)
4151 {
4152 size_t defaultPoolSize, poolSize;
4153 defaultPoolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
4154 poolSize = getConfigurationParameterForSize("OPENCV_OPENCL_BUFFERPOOL_LIMIT", defaultPoolSize);
4155 bufferPool.setMaxReservedSize(poolSize);
4156 poolSize = getConfigurationParameterForSize("OPENCV_OPENCL_HOST_PTR_BUFFERPOOL_LIMIT", defaultPoolSize);
4157 bufferPoolHostPtr.setMaxReservedSize(poolSize);
4158 #ifdef HAVE_OPENCL_SVM
4159 poolSize = getConfigurationParameterForSize("OPENCV_OPENCL_SVM_BUFFERPOOL_LIMIT", defaultPoolSize);
4160 bufferPoolSVM.setMaxReservedSize(poolSize);
4161 #endif
4162
4163 matStdAllocator = Mat::getStdAllocator();
4164 }
4165
4166 UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
4167 int flags, UMatUsageFlags usageFlags) const
4168 {
4169 UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
4170 return u;
4171 }
4172
4173 void getBestFlags(const Context& ctx, int /*flags*/, UMatUsageFlags usageFlags, int& createFlags, int& flags0) const
4174 {
4175 const Device& dev = ctx.device(0);
4176 createFlags = 0;
4177 if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
4178 createFlags |= CL_MEM_ALLOC_HOST_PTR;
4179
4180 if( dev.hostUnifiedMemory() )
4181 flags0 = 0;
4182 else
4183 flags0 = UMatData::COPY_ON_MAP;
4184 }
4185
4186 UMatData* allocate(int dims, const int* sizes, int type,
4187 void* data, size_t* step, int flags, UMatUsageFlags usageFlags) const
4188 {
4189 if(!useOpenCL())
4190 return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
4191 CV_Assert(data == 0);
4192 size_t total = CV_ELEM_SIZE(type);
4193 for( int i = dims-1; i >= 0; i-- )
4194 {
4195 if( step )
4196 step[i] = total;
4197 total *= sizes[i];
4198 }
4199
4200 Context& ctx = Context::getDefault();
4201
4202 int createFlags = 0, flags0 = 0;
4203 getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
4204
4205 void* handle = NULL;
4206 int allocatorFlags = 0;
4207
4208 #ifdef HAVE_OPENCL_SVM
4209 const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4210 if (ctx.useSVM() && svm::useSVM(usageFlags) && !svmCaps.isNoSVMSupport())
4211 {
4212 allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED;
4213 handle = bufferPoolSVM.allocate(total);
4214
4215 // this property is constant, so a single buffer pool can be used here
4216 bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4217 allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
4218 }
4219 else
4220 #endif
4221 if (createFlags == 0)
4222 {
4223 allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
4224 handle = bufferPool.allocate(total);
4225 }
4226 else if (createFlags == CL_MEM_ALLOC_HOST_PTR)
4227 {
4228 allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED;
4229 handle = bufferPoolHostPtr.allocate(total);
4230 }
4231 else
4232 {
4233 CV_Assert(handle != NULL); // Unsupported, throw
4234 }
4235
4236 if (!handle)
4237 return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
4238
4239 UMatData* u = new UMatData(this);
4240 u->data = 0;
4241 u->size = total;
4242 u->handle = handle;
4243 u->flags = flags0;
4244 u->allocatorFlags_ = allocatorFlags;
4245 CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
4246 return u;
4247 }
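/*
 * Worked example for the step/total computation above (comment only, hypothetical sizes):
 * for a 2-D CV_8UC3 matrix with sizes = {480, 640} the loop runs from the innermost
 * dimension outwards, so
 *
 *     step[1] = CV_ELEM_SIZE(CV_8UC3) = 3      // bytes per element
 *     step[0] = 3 * 640 = 1920                 // bytes per row
 *     total   = 1920 * 480 = 921600            // bytes requested from the buffer pool
 *
 * i.e. buffers created here are always densely packed; user data with its own padding is
 * only wrapped by the second allocate() overload below.
 */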
4248
4249 bool allocate(UMatData* u, int accessFlags, UMatUsageFlags usageFlags) const
4250 {
4251 if(!u)
4252 return false;
4253
4254 UMatDataAutoLock lock(u);
4255
4256 if(u->handle == 0)
4257 {
4258 CV_Assert(u->origdata != 0);
4259 Context& ctx = Context::getDefault();
4260 int createFlags = 0, flags0 = 0;
4261 getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
4262
4263 cl_context ctx_handle = (cl_context)ctx.ptr();
4264 int allocatorFlags = 0;
4265 int tempUMatFlags = 0;
4266 void* handle = NULL;
4267 cl_int retval = CL_SUCCESS;
4268
4269 #ifdef HAVE_OPENCL_SVM
4270 svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4271 bool useSVM = ctx.useSVM() && svm::useSVM(usageFlags);
4272 if (useSVM && svmCaps.isSupportFineGrainSystem())
4273 {
4274 allocatorFlags = svm::OPENCL_SVM_FINE_GRAIN_SYSTEM;
4275 tempUMatFlags = UMatData::TEMP_UMAT;
4276 handle = u->origdata;
4277 CV_OPENCL_SVM_TRACE_P("Use fine grain system: %d (%p)\n", (int)u->size, handle);
4278 }
4279 else if (useSVM && (svmCaps.isSupportFineGrainBuffer() || svmCaps.isSupportCoarseGrainBuffer()))
4280 {
4281 if (!(accessFlags & ACCESS_FAST)) // memcpy used
4282 {
4283 bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4284
4285 cl_svm_mem_flags memFlags = createFlags |
4286 (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
4287
4288 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4289 CV_DbgAssert(svmFns->isValid());
4290
4291 CV_OPENCL_SVM_TRACE_P("clSVMAlloc + copy: %d\n", (int)u->size);
4292 handle = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, u->size, 0);
4293 CV_Assert(handle);
4294
4295 cl_command_queue q = NULL;
4296 if (!isFineGrainBuffer)
4297 {
4298 q = (cl_command_queue)Queue::getDefault().ptr();
4299 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", handle, (int)u->size);
4300 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE,
4301 handle, u->size,
4302 0, NULL, NULL);
4303 CV_Assert(status == CL_SUCCESS);
4304
4305 }
4306 memcpy(handle, u->origdata, u->size);
4307 if (!isFineGrainBuffer)
4308 {
4309 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", handle);
4310 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, handle, 0, NULL, NULL);
4311 CV_Assert(status == CL_SUCCESS);
4312 }
4313
4314 tempUMatFlags = UMatData::TEMP_UMAT | UMatData::TEMP_COPIED_UMAT;
4315 allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER
4316 : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
4317 }
4318 }
4319 else
4320 #endif
4321 {
4322 tempUMatFlags = UMatData::TEMP_UMAT;
4323 handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags,
4324 u->size, u->origdata, &retval);
4325 if((!handle || retval < 0) && !(accessFlags & ACCESS_FAST))
4326 {
4327 handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
4328 u->size, u->origdata, &retval);
4329 tempUMatFlags |= UMatData::TEMP_COPIED_UMAT;
4330 }
4331 }
4332 if(!handle || retval != CL_SUCCESS)
4333 return false;
4334 u->handle = handle;
4335 u->prevAllocator = u->currAllocator;
4336 u->currAllocator = this;
4337 u->flags |= tempUMatFlags;
4338 u->allocatorFlags_ = allocatorFlags;
4339 }
4340 if(accessFlags & ACCESS_WRITE)
4341 u->markHostCopyObsolete(true);
4342 return true;
4343 }
4344
4345 /*void sync(UMatData* u) const
4346 {
4347 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4348 UMatDataAutoLock lock(u);
4349
4350 if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
4351 {
4352 if( u->tempCopiedUMat() )
4353 {
4354 clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4355 u->size, u->origdata, 0, 0, 0);
4356 }
4357 else
4358 {
4359 cl_int retval = 0;
4360 void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
4361 (CL_MAP_READ | CL_MAP_WRITE),
4362 0, u->size, 0, 0, 0, &retval);
4363 clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
4364 clFinish(q);
4365 }
4366 u->markHostCopyObsolete(false);
4367 }
4368 else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
4369 {
4370 clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4371 u->size, u->data, 0, 0, 0);
4372 }
4373 }*/
4374
4375 void deallocate(UMatData* u) const
4376 {
4377 if(!u)
4378 return;
4379
4380 CV_Assert(u->urefcount >= 0);
4381 CV_Assert(u->refcount >= 0);
4382
4383 CV_Assert(u->handle != 0 && u->urefcount == 0);
4384 if(u->tempUMat())
4385 {
4386 // UMatDataAutoLock lock(u);
4387
4388 if( u->hostCopyObsolete() && u->refcount > 0 )
4389 {
4390 #ifdef HAVE_OPENCL_SVM
4391 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4392 {
4393 Context& ctx = Context::getDefault();
4394 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4395 CV_DbgAssert(svmFns->isValid());
4396
4397 if( u->tempCopiedUMat() )
4398 {
4399 CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
4400 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER);
4401 bool isFineGrainBuffer = (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER;
4402 cl_command_queue q = NULL;
4403 if (!isFineGrainBuffer)
4404 {
4405 CV_DbgAssert(((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0));
4406 q = (cl_command_queue)Queue::getDefault().ptr();
4407 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
4408 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
4409 u->handle, u->size,
4410 0, NULL, NULL);
4411 CV_Assert(status == CL_SUCCESS);
4412 }
4413 clFinish(q);
4414 memcpy(u->origdata, u->handle, u->size);
4415 if (!isFineGrainBuffer)
4416 {
4417 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
4418 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
4419 CV_Assert(status == CL_SUCCESS);
4420 }
4421 }
4422 else
4423 {
4424 CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM);
4425 // nothing
4426 }
4427 }
4428 else
4429 #endif
4430 {
4431 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4432 if( u->tempCopiedUMat() )
4433 {
4434 AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
4435 CV_OclDbgAssert(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4436 u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS);
4437 }
4438 else
4439 {
4440 // TODO Is it really needed for clCreateBuffer with CL_MEM_USE_HOST_PTR?
4441 cl_int retval = 0;
4442 void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
4443 (CL_MAP_READ | CL_MAP_WRITE),
4444 0, u->size, 0, 0, 0, &retval);
4445 CV_OclDbgAssert(retval == CL_SUCCESS);
4446 CV_OclDbgAssert(clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0) == CL_SUCCESS);
4447 CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
4448 }
4449 }
4450 u->markHostCopyObsolete(false);
4451 }
4452 #ifdef HAVE_OPENCL_SVM
4453 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4454 {
4455 if( u->tempCopiedUMat() )
4456 {
4457 Context& ctx = Context::getDefault();
4458 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4459 CV_DbgAssert(svmFns->isValid());
4460
4461 CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", u->handle);
4462 svmFns->fn_clSVMFree((cl_context)ctx.ptr(), u->handle);
4463 }
4464 }
4465 else
4466 #endif
4467 {
4468 clReleaseMemObject((cl_mem)u->handle);
4469 }
4470 u->handle = 0;
4471 u->currAllocator = u->prevAllocator;
4472 if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED))
4473 fastFree(u->data);
4474 u->data = u->origdata;
4475 if(u->refcount == 0)
4476 u->currAllocator->deallocate(u);
4477 }
4478 else
4479 {
4480 CV_Assert(u->refcount == 0);
4481 if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED))
4482 {
4483 fastFree(u->data);
4484 u->data = 0;
4485 }
4486 if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
4487 {
4488 bufferPool.release((cl_mem)u->handle);
4489 }
4490 else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED)
4491 {
4492 bufferPoolHostPtr.release((cl_mem)u->handle);
4493 }
4494 #ifdef HAVE_OPENCL_SVM
4495 else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED)
4496 {
4497 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
4498 {
4499 //nothing
4500 }
4501 else if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
4502 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
4503 {
4504 Context& ctx = Context::getDefault();
4505 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4506 CV_DbgAssert(svmFns->isValid());
4507 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4508
4509 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0)
4510 {
4511 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
4512 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
4513 CV_Assert(status == CL_SUCCESS);
4514 }
4515 }
4516 bufferPoolSVM.release((void*)u->handle);
4517 }
4518 #endif
4519 else
4520 {
4521 clReleaseMemObject((cl_mem)u->handle);
4522 }
4523 u->handle = 0;
4524 delete u;
4525 }
4526 }
4527
4528 void map(UMatData* u, int accessFlags) const
4529 {
4530 if(!u)
4531 return;
4532
4533 CV_Assert( u->handle != 0 );
4534
4535 UMatDataAutoLock autolock(u);
4536
4537 if(accessFlags & ACCESS_WRITE)
4538 u->markDeviceCopyObsolete(true);
4539
4540 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4541
4542 // FIXIT Workaround for UMat synchronization issue
4543 // if( u->refcount == 0 )
4544 {
4545 if( !u->copyOnMap() )
4546 {
4547 // TODO
4548 // because there can be other map requests for the same UMat with different access flags,
4549 // we use the universal (read-write) access mode.
4550 #ifdef HAVE_OPENCL_SVM
4551 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4552 {
4553 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
4554 {
4555 Context& ctx = Context::getDefault();
4556 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4557 CV_DbgAssert(svmFns->isValid());
4558
4559 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0)
4560 {
4561 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
4562 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
4563 u->handle, u->size,
4564 0, NULL, NULL);
4565 CV_Assert(status == CL_SUCCESS);
4566 u->allocatorFlags_ |= svm::OPENCL_SVM_BUFFER_MAP;
4567 }
4568 }
4569 clFinish(q);
4570 u->data = (uchar*)u->handle;
4571 u->markHostCopyObsolete(false);
4572 u->markDeviceMemMapped(true);
4573 return;
4574 }
4575 #endif
4576 if (u->data) // FIXIT Workaround for UMat synchronization issue
4577 {
4578 //CV_Assert(u->hostCopyObsolete() == false);
4579 return;
4580 }
4581
4582 cl_int retval = 0;
4583 u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
4584 (CL_MAP_READ | CL_MAP_WRITE),
4585 0, u->size, 0, 0, 0, &retval);
4586 if(u->data && retval == CL_SUCCESS)
4587 {
4588 u->markHostCopyObsolete(false);
4589 u->markDeviceMemMapped(true);
4590 return;
4591 }
4592
4593 // TODO Is it really a good idea and was it tested well?
4594 // if map failed, switch to copy-on-map mode for the particular buffer
4595 u->flags |= UMatData::COPY_ON_MAP;
4596 }
4597
4598 if(!u->data)
4599 {
4600 u->data = (uchar*)fastMalloc(u->size);
4601 u->markHostCopyObsolete(true);
4602 }
4603 }
4604
4605 if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() )
4606 {
4607 AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
4608 #ifdef HAVE_OPENCL_SVM
4609 CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
4610 #endif
4611 CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4612 u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
4613 u->markHostCopyObsolete(false);
4614 }
4615 }
4616
4617 void unmap(UMatData* u) const
4618 {
4619 if(!u)
4620 return;
4621
4622
4623 CV_Assert(u->handle != 0);
4624
4625 UMatDataAutoLock autolock(u);
4626
4627 // FIXIT Workaround for UMat synchronization issue
4628 if(u->refcount > 0)
4629 return;
4630
4631 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4632 cl_int retval = 0;
4633 if( !u->copyOnMap() && u->deviceMemMapped() )
4634 {
4635 CV_Assert(u->data != NULL);
4636 u->markDeviceMemMapped(false);
4637 #ifdef HAVE_OPENCL_SVM
4638 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4639 {
4640 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
4641 {
4642 Context& ctx = Context::getDefault();
4643 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4644 CV_DbgAssert(svmFns->isValid());
4645
4646 CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0);
4647 {
4648 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
4649 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
4650 0, NULL, NULL);
4651 CV_Assert(status == CL_SUCCESS);
4652 clFinish(q);
4653 u->allocatorFlags_ &= ~svm::OPENCL_SVM_BUFFER_MAP;
4654 }
4655 }
4656 u->data = 0;
4657 u->markDeviceCopyObsolete(false);
4658 u->markHostCopyObsolete(false);
4659 return;
4660 }
4661 #endif
4662 CV_Assert( (retval = clEnqueueUnmapMemObject(q,
4663 (cl_mem)u->handle, u->data, 0, 0, 0)) == CL_SUCCESS );
4664 if (Device::getDefault().isAMD())
4665 {
4666 // required for multithreaded applications (see stitching test)
4667 CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
4668 }
4669 u->data = 0;
4670 }
4671 else if( u->copyOnMap() && u->deviceCopyObsolete() )
4672 {
4673 AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
4674 #ifdef HAVE_OPENCL_SVM
4675 CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
4676 #endif
4677 CV_Assert( (retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4678 u->size, alignedPtr.getAlignedPtr(), 0, 0, 0)) == CL_SUCCESS );
4679 }
4680 u->markDeviceCopyObsolete(false);
4681 u->markHostCopyObsolete(false);
4682 }
4683
4684 bool checkContinuous(int dims, const size_t sz[],
4685 const size_t srcofs[], const size_t srcstep[],
4686 const size_t dstofs[], const size_t dststep[],
4687 size_t& total, size_t new_sz[],
4688 size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
4689 size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
4690 {
4691 bool iscontinuous = true;
4692 srcrawofs = srcofs ? srcofs[dims-1] : 0;
4693 dstrawofs = dstofs ? dstofs[dims-1] : 0;
4694 total = sz[dims-1];
4695 for( int i = dims-2; i >= 0; i-- )
4696 {
4697 if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
4698 iscontinuous = false;
4699 total *= sz[i];
4700 if( srcofs )
4701 srcrawofs += srcofs[i]*srcstep[i];
4702 if( dstofs )
4703 dstrawofs += dstofs[i]*dststep[i];
4704 }
4705
4706 if( !iscontinuous )
4707 {
4708 // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
4709 if( dims == 2 )
4710 {
4711 new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
4712 // we assume that new_... arrays are initialized by caller
4713 // with 0's, so there is no else branch
4714 if( srcofs )
4715 {
4716 new_srcofs[0] = srcofs[1];
4717 new_srcofs[1] = srcofs[0];
4718 new_srcofs[2] = 0;
4719 }
4720
4721 if( dstofs )
4722 {
4723 new_dstofs[0] = dstofs[1];
4724 new_dstofs[1] = dstofs[0];
4725 new_dstofs[2] = 0;
4726 }
4727
4728 new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
4729 new_dststep[0] = dststep[0]; new_dststep[1] = 0;
4730 }
4731 else
4732 {
4733 // we could check for dims == 3 here,
4734 // but from the user's perspective this one is more informative
4735 CV_Assert(dims <= 3);
4736 new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
4737 if( srcofs )
4738 {
4739 new_srcofs[0] = srcofs[2];
4740 new_srcofs[1] = srcofs[1];
4741 new_srcofs[2] = srcofs[0];
4742 }
4743
4744 if( dstofs )
4745 {
4746 new_dstofs[0] = dstofs[2];
4747 new_dstofs[1] = dstofs[1];
4748 new_dstofs[2] = dstofs[0];
4749 }
4750
4751 new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
4752 new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
4753 }
4754 }
4755 return iscontinuous;
4756 }
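/*
 * Worked example for checkContinuous() (comment only, hypothetical numbers): downloading a
 * 100x200 byte ROI at offset (10, 20) from a source whose rows are 1000 bytes apart into a
 * densely packed destination (dstofs == 0) gives
 *
 *     sz = {100, 200}, srcstep = {1000, 1}, dststep = {200, 1}, srcofs = {10, 20}
 *     total starts as sz[1] = 200; 200 != srcstep[0] (1000)  ->  iscontinuous = false
 *     final total = 200 * 100 = 20000, srcrawofs = 20 + 10*1000 = 10020, dstrawofs = 0
 *     new_sz = {200, 100, 1}                   // reordered to OpenCL {x, y, z}
 *     new_srcofs = {20, 10, 0}, new_dstofs = {0, 0, 0}
 *     new_srcstep = {1000, 0}, new_dststep = {200, 0}
 *
 * which is exactly the layout clEnqueueReadBufferRect / clEnqueueWriteBufferRect /
 * clEnqueueCopyBufferRect expect in download()/upload()/copy() below.
 */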
4757
4758 void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
4759 const size_t srcofs[], const size_t srcstep[],
4760 const size_t dststep[]) const
4761 {
4762 if(!u)
4763 return;
4764 UMatDataAutoLock autolock(u);
4765
4766 if( u->data && !u->hostCopyObsolete() )
4767 {
4768 Mat::getStdAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
4769 return;
4770 }
4771 CV_Assert( u->handle != 0 );
4772
4773 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4774
4775 size_t total = 0, new_sz[] = {0, 0, 0};
4776 size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
4777 size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
4778
4779 bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
4780 total, new_sz,
4781 srcrawofs, new_srcofs, new_srcstep,
4782 dstrawofs, new_dstofs, new_dststep);
4783
4784 #ifdef HAVE_OPENCL_SVM
4785 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4786 {
4787 CV_DbgAssert(u->data == NULL || u->data == u->handle);
4788 Context& ctx = Context::getDefault();
4789 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4790 CV_DbgAssert(svmFns->isValid());
4791
4792 CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
4793 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
4794 {
4795 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
4796 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
4797 u->handle, u->size,
4798 0, NULL, NULL);
4799 CV_Assert(status == CL_SUCCESS);
4800 }
4801 clFinish(q);
4802 if( iscontinuous )
4803 {
4804 memcpy(dstptr, (uchar*)u->handle + srcrawofs, total);
4805 }
4806 else
4807 {
4808 // This code is from MatAllocator::download()
4809 int isz[CV_MAX_DIM];
4810 uchar* srcptr = (uchar*)u->handle;
4811 for( int i = 0; i < dims; i++ )
4812 {
4813 CV_Assert( sz[i] <= (size_t)INT_MAX );
4814 if( sz[i] == 0 )
4815 return;
4816 if( srcofs )
4817 srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
4818 isz[i] = (int)sz[i];
4819 }
4820
4821 Mat src(dims, isz, CV_8U, srcptr, srcstep);
4822 Mat dst(dims, isz, CV_8U, dstptr, dststep);
4823
4824 const Mat* arrays[] = { &src, &dst };
4825 uchar* ptrs[2];
4826 NAryMatIterator it(arrays, ptrs, 2);
4827 size_t j, planesz = it.size;
4828
4829 for( j = 0; j < it.nplanes; j++, ++it )
4830 memcpy(ptrs[1], ptrs[0], planesz);
4831 }
4832 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
4833 {
4834 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
4835 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
4836 0, NULL, NULL);
4837 CV_Assert(status == CL_SUCCESS);
4838 clFinish(q);
4839 }
4840 }
4841 else
4842 #endif
4843 {
4844 AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, sz[0] * dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
4845 if( iscontinuous )
4846 {
4847 CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
4848 srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0) >= 0 );
4849 }
4850 else
4851 {
4852 CV_Assert( clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
4853 new_srcofs, new_dstofs, new_sz, new_srcstep[0], new_srcstep[1],
4854 new_dststep[0], new_dststep[1], alignedPtr.getAlignedPtr(), 0, 0, 0) >= 0 );
4855 }
4856 }
4857 }
4858
4859 void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
4860 const size_t dstofs[], const size_t dststep[],
4861 const size_t srcstep[]) const
4862 {
4863 if(!u)
4864 return;
4865
4866 // there should be no user-visible CPU copies of the UMat which we are going to copy to
4867 CV_Assert(u->refcount == 0 || u->tempUMat());
4868
4869 size_t total = 0, new_sz[] = {0, 0, 0};
4870 size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
4871 size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
4872
4873 bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
4874 total, new_sz,
4875 srcrawofs, new_srcofs, new_srcstep,
4876 dstrawofs, new_dstofs, new_dststep);
4877
4878 UMatDataAutoLock autolock(u);
4879
4880 // if there is a cached CPU copy of the GPU matrix,
4881 // we can use it as the destination.
4882 // we can do it in 2 cases:
4883 // 1. we overwrite the whole content
4884 // 2. we overwrite part of the matrix, but the GPU copy is out-of-date
4885 if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
4886 {
4887 Mat::getStdAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
4888 u->markHostCopyObsolete(false);
4889 u->markDeviceCopyObsolete(true);
4890 return;
4891 }
4892
4893 CV_Assert( u->handle != 0 );
4894 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4895
4896 #ifdef HAVE_OPENCL_SVM
4897 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4898 {
4899 CV_DbgAssert(u->data == NULL || u->data == u->handle);
4900 Context& ctx = Context::getDefault();
4901 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4902 CV_DbgAssert(svmFns->isValid());
4903
4904 CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
4905 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
4906 {
4907 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
4908 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_WRITE,
4909 u->handle, u->size,
4910 0, NULL, NULL);
4911 CV_Assert(status == CL_SUCCESS);
4912 }
4913 clFinish(q);
4914 if( iscontinuous )
4915 {
4916 memcpy((uchar*)u->handle + dstrawofs, srcptr, total);
4917 }
4918 else
4919 {
4920 // This code is from MatAllocator::upload()
4921 int isz[CV_MAX_DIM];
4922 uchar* dstptr = (uchar*)u->handle;
4923 for( int i = 0; i < dims; i++ )
4924 {
4925 CV_Assert( sz[i] <= (size_t)INT_MAX );
4926 if( sz[i] == 0 )
4927 return;
4928 if( dstofs )
4929 dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
4930 isz[i] = (int)sz[i];
4931 }
4932
4933 Mat src(dims, isz, CV_8U, (void*)srcptr, srcstep);
4934 Mat dst(dims, isz, CV_8U, dstptr, dststep);
4935
4936 const Mat* arrays[] = { &src, &dst };
4937 uchar* ptrs[2];
4938 NAryMatIterator it(arrays, ptrs, 2);
4939 size_t j, planesz = it.size;
4940
4941 for( j = 0; j < it.nplanes; j++, ++it )
4942 memcpy(ptrs[1], ptrs[0], planesz);
4943 }
4944 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
4945 {
4946 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
4947 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
4948 0, NULL, NULL);
4949 CV_Assert(status == CL_SUCCESS);
4950 clFinish(q);
4951 }
4952 }
4953 else
4954 #endif
4955 {
4956 AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, sz[0] * srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
4957 if( iscontinuous )
4958 {
4959 CV_Assert( clEnqueueWriteBuffer(q, (cl_mem)u->handle,
4960 CL_TRUE, dstrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0) >= 0 );
4961 }
4962 else
4963 {
4964 CV_Assert( clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
4965 new_dstofs, new_srcofs, new_sz, new_dststep[0], new_dststep[1],
4966 new_srcstep[0], new_srcstep[1], alignedPtr.getAlignedPtr(), 0, 0, 0) >= 0 );
4967 }
4968 }
4969 u->markHostCopyObsolete(true);
4970 #ifdef HAVE_OPENCL_SVM
4971 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
4972 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
4973 {
4974 // nothing
4975 }
4976 else
4977 #endif
4978 {
4979 u->markHostCopyObsolete(true);
4980 }
4981 u->markDeviceCopyObsolete(false);
4982 }
4983
4984 void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
4985 const size_t srcofs[], const size_t srcstep[],
4986 const size_t dstofs[], const size_t dststep[], bool _sync) const
4987 {
4988 if(!src || !dst)
4989 return;
4990
4991 size_t total = 0, new_sz[] = {0, 0, 0};
4992 size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
4993 size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
4994
4995 bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
4996 total, new_sz,
4997 srcrawofs, new_srcofs, new_srcstep,
4998 dstrawofs, new_dstofs, new_dststep);
4999
5000 UMatDataAutoLock src_autolock(src);
5001 UMatDataAutoLock dst_autolock(dst);
5002
5003 if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
5004 {
5005 upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
5006 return;
5007 }
5008 if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
5009 {
5010 download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
5011 dst->markHostCopyObsolete(false);
5012 #ifdef HAVE_OPENCL_SVM
5013 if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5014 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5015 {
5016 // nothing
5017 }
5018 else
5019 #endif
5020 {
5021 dst->markDeviceCopyObsolete(true);
5022 }
5023 return;
5024 }
5025
5026 // there should be no user-visible CPU copies of the UMat which we are going to copy to
5027 CV_Assert(dst->refcount == 0);
5028 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5029
5030 cl_int retval = CL_SUCCESS;
5031 #ifdef HAVE_OPENCL_SVM
5032 if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 ||
5033 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5034 {
5035 if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 &&
5036 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5037 {
5038 Context& ctx = Context::getDefault();
5039 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5040 CV_DbgAssert(svmFns->isValid());
5041
5042 if( iscontinuous )
5043 {
5044 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMemcpy: %p <-- %p (%d)\n",
5045 (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs, (int)total);
5046 cl_int status = svmFns->fn_clEnqueueSVMMemcpy(q, CL_TRUE,
5047 (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs,
5048 total, 0, NULL, NULL);
5049 CV_Assert(status == CL_SUCCESS);
5050 }
5051 else
5052 {
5053 clFinish(q);
5054 // This code is from MatAllocator::download()/upload()
5055 int isz[CV_MAX_DIM];
5056 uchar* srcptr = (uchar*)src->handle;
5057 for( int i = 0; i < dims; i++ )
5058 {
5059 CV_Assert( sz[i] <= (size_t)INT_MAX );
5060 if( sz[i] == 0 )
5061 return;
5062 if( srcofs )
5063 srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
5064 isz[i] = (int)sz[i];
5065 }
5066 Mat m_src(dims, isz, CV_8U, srcptr, srcstep);
5067
5068 uchar* dstptr = (uchar*)dst->handle;
5069 for( int i = 0; i < dims; i++ )
5070 {
5071 if( dstofs )
5072 dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
5073 }
5074 Mat m_dst(dims, isz, CV_8U, dstptr, dststep);
5075
5076 const Mat* arrays[] = { &m_src, &m_dst };
5077 uchar* ptrs[2];
5078 NAryMatIterator it(arrays, ptrs, 2);
5079 size_t j, planesz = it.size;
5080
5081 for( j = 0; j < it.nplanes; j++, ++it )
5082 memcpy(ptrs[1], ptrs[0], planesz);
5083 }
5084 }
5085 else
5086 {
5087 if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5088 {
5089 map(src, ACCESS_READ);
5090 upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
5091 unmap(src);
5092 }
5093 else
5094 {
5095 map(dst, ACCESS_WRITE);
5096 download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
5097 unmap(dst);
5098 }
5099 }
5100 }
5101 else
5102 #endif
5103 {
5104 if( iscontinuous )
5105 {
5106 CV_Assert( (retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
5107 srcrawofs, dstrawofs, total, 0, 0, 0)) == CL_SUCCESS );
5108 }
5109 else
5110 {
5111 CV_Assert( (retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
5112 new_srcofs, new_dstofs, new_sz,
5113 new_srcstep[0], new_srcstep[1],
5114 new_dststep[0], new_dststep[1],
5115 0, 0, 0)) == CL_SUCCESS );
5116 }
5117 }
5118 if (retval == CL_SUCCESS)
5119 {
5120 CV_IMPL_ADD(CV_IMPL_OCL)
5121 }
5122
5123 #ifdef HAVE_OPENCL_SVM
5124 if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5125 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5126 {
5127 // nothing
5128 }
5129 else
5130 #endif
5131 {
5132 dst->markHostCopyObsolete(true);
5133 }
5134 dst->markDeviceCopyObsolete(false);
5135
5136 if( _sync )
5137 {
5138 CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
5139 }
5140 }
5141
5142 BufferPoolController* getBufferPoolController(const char* id) const {
5143 #ifdef HAVE_OPENCL_SVM
5144 if ((svm::checkForceSVMUmatUsage() && (id == NULL || strcmp(id, "OCL") == 0)) || (id != NULL && strcmp(id, "SVM") == 0))
5145 {
5146 return &bufferPoolSVM;
5147 }
5148 #endif
5149 if (id != NULL && strcmp(id, "HOST_ALLOC") == 0)
5150 {
5151 return &bufferPoolHostPtr;
5152 }
5153 if (id != NULL && strcmp(id, "OCL") != 0)
5154 {
5155 CV_ErrorNoReturn(cv::Error::StsBadArg, "getBufferPoolController(): unknown BufferPool ID\n");
5156 }
5157 return &bufferPool;
5158 }
5159
5160 MatAllocator* matStdAllocator;
5161 };
5162
5163 MatAllocator* getOpenCLAllocator()
5164 {
5165 static MatAllocator * allocator = new OpenCLAllocator();
5166 return allocator;
5167 }
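/*
 * Illustrative sketch (comment only): run-time tuning of the reserve caches goes through
 * MatAllocator::getBufferPoolController(), which OpenCLAllocator implements above with the
 * pool IDs "OCL", "HOST_ALLOC" and (when built with SVM support) "SVM".
 *
 *     cv::BufferPoolController* c =
 *         cv::ocl::getOpenCLAllocator()->getBufferPoolController("OCL");
 *     if (c)
 *     {
 *         size_t oldLimit = c->getMaxReservedSize();
 *         c->setMaxReservedSize(64 * 1024 * 1024);  // allow ~64 MiB of reusable buffers
 *         // ... run OpenCL-heavy code ...
 *         c->freeAllReservedBuffers();              // or drop whatever is currently cached
 *         c->setMaxReservedSize(oldLimit);
 *     }
 */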
5168
5169 ///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////
5170
5171 static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
5172 {
5173 cl_uint numDevices = 0;
5174 CV_OclDbgAssert(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL,
5175 0, NULL, &numDevices) == CL_SUCCESS);
5176
5177 if (numDevices == 0)
5178 {
5179 devices.clear();
5180 return;
5181 }
5182
5183 devices.resize((size_t)numDevices);
5184 CV_OclDbgAssert(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL,
5185 numDevices, &devices[0], &numDevices) == CL_SUCCESS);
5186 }
5187
5188 struct PlatformInfo::Impl
5189 {
5190 Impl(void* id)
5191 {
5192 refcount = 1;
5193 handle = *(cl_platform_id*)id;
5194 getDevices(devices, handle);
5195 }
5196
5197 String getStrProp(cl_device_info prop) const
5198 {
5199 char buf[1024];
5200 size_t sz=0;
5201 return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
5202 sz < sizeof(buf) ? String(buf) : String();
5203 }
5204
5205 IMPLEMENT_REFCOUNTABLE();
5206 std::vector<cl_device_id> devices;
5207 cl_platform_id handle;
5208 };
5209
5210 PlatformInfo::PlatformInfo()
5211 {
5212 p = 0;
5213 }
5214
5215 PlatformInfo::PlatformInfo(void* platform_id)
5216 {
5217 p = new Impl(platform_id);
5218 }
5219
5220 PlatformInfo::~PlatformInfo()
5221 {
5222 if(p)
5223 p->release();
5224 }
5225
5226 PlatformInfo::PlatformInfo(const PlatformInfo& i)
5227 {
5228 if (i.p)
5229 i.p->addref();
5230 p = i.p;
5231 }
5232
5233 PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
5234 {
5235 if (i.p != p)
5236 {
5237 if (i.p)
5238 i.p->addref();
5239 if (p)
5240 p->release();
5241 p = i.p;
5242 }
5243 return *this;
5244 }
5245
5246 int PlatformInfo::deviceNumber() const
5247 {
5248 return p ? (int)p->devices.size() : 0;
5249 }
5250
5251 void PlatformInfo::getDevice(Device& device, int d) const
5252 {
5253 CV_Assert(p && d < (int)p->devices.size() );
5254 if(p)
5255 device.set(p->devices[d]);
5256 }
5257
5258 String PlatformInfo::name() const
5259 {
5260 return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
5261 }
5262
5263 String PlatformInfo::vendor() const
5264 {
5265 return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
5266 }
5267
5268 String PlatformInfo::version() const
5269 {
5270 return p ? p->getStrProp(CL_PLATFORM_VERSION) : String();
5271 }
5272
5273 static void getPlatforms(std::vector<cl_platform_id>& platforms)
5274 {
5275 cl_uint numPlatforms = 0;
5276 CV_OclDbgAssert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS);
5277
5278 if (numPlatforms == 0)
5279 {
5280 platforms.clear();
5281 return;
5282 }
5283
5284 platforms.resize((size_t)numPlatforms);
5285 CV_OclDbgAssert(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms) == CL_SUCCESS);
5286 }
5287
5288 void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
5289 {
5290 std::vector<cl_platform_id> platforms;
5291 getPlatforms(platforms);
5292
5293 for (size_t i = 0; i < platforms.size(); i++)
5294 platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
5295 }
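/*
 * Illustrative sketch (comment only): enumerating OpenCL platforms and their devices with
 * the PlatformInfo API defined above. Note that the public entry point really is spelled
 * getPlatfomsInfo() in the headers.
 *
 *     std::vector<cv::ocl::PlatformInfo> platforms;
 *     cv::ocl::getPlatfomsInfo(platforms);
 *     for (size_t i = 0; i < platforms.size(); i++)
 *     {
 *         const cv::ocl::PlatformInfo& pi = platforms[i];
 *         std::cout << pi.name() << " / " << pi.vendor() << " / " << pi.version() << std::endl;
 *         for (int j = 0; j < pi.deviceNumber(); j++)
 *         {
 *             cv::ocl::Device dev;
 *             pi.getDevice(dev, j);
 *             std::cout << "  device " << j << ": " << dev.name() << std::endl;
 *         }
 *     }
 */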
5296
5297 const char* typeToStr(int type)
5298 {
5299 static const char* tab[]=
5300 {
5301 "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
5302 "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
5303 "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
5304 "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
5305 "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
5306 "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
5307 "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
5308 "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
5309 };
5310 int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
5311 return cn > 16 ? "?" : tab[depth*16 + cn-1];
5312 }
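/*
 * Example values (comment only): typeToStr(CV_8UC1) -> "uchar", typeToStr(CV_32FC4) ->
 * "float4", typeToStr(CV_64FC3) -> "double3"; unsupported channel counts yield "?".
 * memopTypeToStr() and vecopTypeToStr() below use the same table layout but substitute
 * same-sized integer types that are more convenient for raw load/store kernels.
 */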
5313
5314 const char* memopTypeToStr(int type)
5315 {
5316 static const char* tab[] =
5317 {
5318 "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
5319 "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
5320 "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
5321 "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
5322 "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
5323 "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
5324 "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
5325 "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
5326 };
5327 int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
5328 return cn > 16 ? "?" : tab[depth*16 + cn-1];
5329 }
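
// Unlike typeToStr(), the CV_32F and CV_64F rows above map to same-sized integer
// types ("int", "ulong"), presumably because these names are only used for pure
// memory operations (copies, set-to-scalar) where the bit pattern is moved
// verbatim and arithmetic semantics do not matter.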

const char* vecopTypeToStr(int type)
{
    static const char* tab[] =
    {
        "uchar", "short", "uchar3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
        "char", "short", "char3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
        "ushort", "int", "ushort3", "int2", 0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
        "short", "int", "short3", "int2", 0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
        "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
        "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
        "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
        "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
    };
    int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
    return cn > 16 ? "?" : tab[depth*16 + cn-1];
}

const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
{
    if( sdepth == ddepth )
        return "noconvert";
    const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
    if( ddepth >= CV_32F ||
        (ddepth == CV_32S && sdepth < CV_32S) ||
        (ddepth == CV_16S && sdepth <= CV_8S) ||
        (ddepth == CV_16U && sdepth == CV_8U))
    {
        sprintf(buf, "convert_%s", typestr);
    }
    else if( sdepth >= CV_32F )
        sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
    else
        sprintf(buf, "convert_%s_sat", typestr);

    return buf;
}
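
// Examples of the generated conversion names (derived from the branches above):
//   convertTypeStr(CV_8U,  CV_32F, 4, buf) -> "convert_float4"          (widening, no saturation needed)
//   convertTypeStr(CV_32F, CV_8U,  4, buf) -> "convert_uchar4_sat_rte"  (saturate, round to nearest even)
//   convertTypeStr(CV_16U, CV_8U,  1, buf) -> "convert_uchar_sat"
// Identical depths return "noconvert". The caller owns buf and must make it large
// enough for the longest generated name.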

template <typename T>
static std::string kerToStr(const Mat & k)
{
    int width = k.cols - 1, depth = k.depth();
    const T * const data = k.ptr<T>();

    std::ostringstream stream;
    stream.precision(10);

    if (depth <= CV_8S)
    {
        for (int i = 0; i < width; ++i)
            stream << "DIG(" << (int)data[i] << ")";
        stream << "DIG(" << (int)data[width] << ")";
    }
    else if (depth == CV_32F)
    {
        stream.setf(std::ios_base::showpoint);
        for (int i = 0; i < width; ++i)
            stream << "DIG(" << data[i] << "f)";
        stream << "DIG(" << data[width] << "f)";
    }
    else
    {
        for (int i = 0; i < width; ++i)
            stream << "DIG(" << data[i] << ")";
        stream << "DIG(" << data[width] << ")";
    }

    return stream.str();
}

String kernelToStr(InputArray _kernel, int ddepth, const char * name)
{
    Mat kernel = _kernel.getMat().reshape(1, 1);

    int depth = kernel.depth();
    if (ddepth < 0)
        ddepth = depth;

    if (ddepth != depth)
        kernel.convertTo(kernel, ddepth);

    typedef std::string (* func_t)(const Mat &);
    static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
                                    kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
    const func_t func = funcs[ddepth];
    CV_Assert(func != 0);

    return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
}
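
// Example (illustrative): for a 1x3 CV_32F kernel { 0.25f, 0.5f, 0.25f } and
// name == NULL this returns roughly
//     " -D COEFF=DIG(0.2500000000f)DIG(0.5000000000f)DIG(0.2500000000f)"
// The consuming .cl source is expected to define a DIG(x) macro (commonly
// "#define DIG(x) x,") so the list expands into an array initializer.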

#define PROCESS_SRC(src) \
    do \
    { \
        if (!src.empty()) \
        { \
            CV_Assert(src.isMat() || src.isUMat()); \
            Size csize = src.size(); \
            int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \
                ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \
            if (cwidth < ckercn || ckercn <= 0) \
                return 1; \
            cols.push_back(cwidth); \
            if (strat == OCL_VECTOR_OWN && ctype != ref_type) \
                return 1; \
            offsets.push_back(src.offset()); \
            steps.push_back(src.step()); \
            dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \
            kercns.push_back(ckercn); \
        } \
    } \
    while ((void)0, 0)

int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
                              InputArray src4, InputArray src5, InputArray src6,
                              InputArray src7, InputArray src8, InputArray src9,
                              OclVectorStrategy strat)
{
    const ocl::Device & d = ocl::Device::getDefault();

    int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
                           d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
                           d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
                           d.preferredVectorWidthDouble(), -1 };

    // If the device reports a preferred vector width of 1, it is advising
    // against vectorization; fall back to heuristic per-depth widths instead.
    if (vectorWidths[0] == 1)
    {
        vectorWidths[CV_8U] = vectorWidths[CV_8S] = 4;
        vectorWidths[CV_16U] = vectorWidths[CV_16S] = 2;
        vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1;
    }

    return checkOptimalVectorWidth(vectorWidths, src1, src2, src3, src4, src5, src6, src7, src8, src9, strat);
}

int checkOptimalVectorWidth(const int *vectorWidths,
                            InputArray src1, InputArray src2, InputArray src3,
                            InputArray src4, InputArray src5, InputArray src6,
                            InputArray src7, InputArray src8, InputArray src9,
                            OclVectorStrategy strat)
{
    CV_Assert(vectorWidths);

    int ref_type = src1.type();

    std::vector<size_t> offsets, steps, cols;
    std::vector<int> dividers, kercns;
    PROCESS_SRC(src1);
    PROCESS_SRC(src2);
    PROCESS_SRC(src3);
    PROCESS_SRC(src4);
    PROCESS_SRC(src5);
    PROCESS_SRC(src6);
    PROCESS_SRC(src7);
    PROCESS_SRC(src8);
    PROCESS_SRC(src9);

    size_t size = offsets.size();

    for (size_t i = 0; i < size; ++i)
        while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % kercns[i] != 0)
            dividers[i] >>= 1, kercns[i] >>= 1;

    // default strategy
    int kercn = *std::min_element(kercns.begin(), kercns.end());

    return kercn;
}
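
// The halving loop in checkOptimalVectorWidth() shrinks each candidate width until
// the source's byte offset and row step are divisible by ckercn * elemSize1 and the
// row width (in channels) is divisible by ckercn, so vector loads stay aligned and
// never straddle the end of a row. For example, a CV_8UC1 ROI whose data starts at
// byte offset 6 cannot use a width of 4 (6 % 4 != 0) and is reduced to 2. The
// returned width is the minimum over all non-empty sources.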

int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray src3,
                                 InputArray src4, InputArray src5, InputArray src6,
                                 InputArray src7, InputArray src8, InputArray src9)
{
    return predictOptimalVectorWidth(src1, src2, src3, src4, src5, src6, src7, src8, src9, OCL_VECTOR_MAX);
}

#undef PROCESS_SRC


// TODO Make this a method of an OpenCL "BuildOptions" class
void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
{
    if (!buildOptions.empty())
        buildOptions += " ";
    int type = _m.type(), depth = CV_MAT_DEPTH(type);
    buildOptions += format(
            "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
            name.c_str(), ocl::typeToStr(type),
            name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
            name.c_str(), (int)CV_MAT_CN(type),
            name.c_str(), (int)CV_ELEM_SIZE(type),
            name.c_str(), (int)CV_ELEM_SIZE1(type),
            name.c_str(), (int)depth
            );
}
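
// Example (illustrative): for name == "SRC" and a CV_8UC3 matrix this appends
//     "-D SRC_T=uchar3 -D SRC_T1=uchar -D SRC_CN=3 -D SRC_TSIZE=3 -D SRC_T1SIZE=1 -D SRC_DEPTH=0"
// so the generated kernel can refer to the matrix purely through these macros.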


struct Image2D::Impl
{
    Impl(const UMat &src, bool norm, bool alias)
    {
        handle = 0;
        refcount = 1;
        init(src, norm, alias);
    }

    ~Impl()
    {
        if (handle)
            clReleaseMemObject(handle);
    }

    static cl_image_format getImageFormat(int depth, int cn, bool norm)
    {
        cl_image_format format;
        static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
                                            CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
        static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
                                                CL_SNORM_INT16, -1, -1, -1, -1 };
        static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };

        int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
        int channelOrder = channelOrders[cn];
        format.image_channel_data_type = (cl_channel_type)channelType;
        format.image_channel_order = (cl_channel_order)channelOrder;
        return format;
    }
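
    // Mapping examples (derived from the tables above): CV_32FC1 -> { CL_FLOAT, CL_R },
    // CV_8UC4 with norm=true -> { CL_UNORM_INT8, CL_RGBA }. Unsupported combinations
    // (e.g. CV_64F depth, or 3-channel images) produce a -1 entry, which never matches
    // any format reported by the runtime, so isFormatSupported() rejects them.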

    static bool isFormatSupported(cl_image_format format)
    {
        if (!haveOpenCL())
            CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");

        cl_context context = (cl_context)Context::getDefault().ptr();
        // Figure out how many formats are supported by this context.
        cl_uint numFormats = 0;
        cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
                                                CL_MEM_OBJECT_IMAGE2D, numFormats,
                                                NULL, &numFormats);
        AutoBuffer<cl_image_format> formats(numFormats);
        err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
                                         CL_MEM_OBJECT_IMAGE2D, numFormats,
                                         formats, NULL);
        CV_OclDbgAssert(err == CL_SUCCESS);
        for (cl_uint i = 0; i < numFormats; ++i)
        {
            if (!memcmp(&formats[i], &format, sizeof(format)))
            {
                return true;
            }
        }
        return false;
    }

    void init(const UMat &src, bool norm, bool alias)
    {
        if (!haveOpenCL())
            CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");

        CV_Assert(!src.empty());
        CV_Assert(ocl::Device::getDefault().imageSupport());

        int err, depth = src.depth(), cn = src.channels();
        CV_Assert(cn <= 4);
        cl_image_format format = getImageFormat(depth, cn, norm);

        if (!isFormatSupported(format))
            CV_Error(Error::OpenCLApiCallError, "Image format is not supported");

        if (alias && !src.handle(ACCESS_RW))
            CV_Error(Error::OpenCLApiCallError, "Incorrect UMat, handle is null");

        cl_context context = (cl_context)Context::getDefault().ptr();
        cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();

#ifdef CL_VERSION_1_2
        // Keep binaries built against OpenCL 1.2 headers usable on OpenCL 1.1
        // platforms: the 1.2 entry point (clCreateImage) is only taken when the
        // device actually reports version 1.2 or newer.
        const Device & d = ocl::Device::getDefault();
        int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
        CV_Assert(!alias || canCreateAlias(src));
        if (1 < major || (1 == major && 2 <= minor))
        {
            cl_image_desc desc;
            desc.image_type = CL_MEM_OBJECT_IMAGE2D;
            desc.image_width = src.cols;
            desc.image_height = src.rows;
            desc.image_depth = 0;
            desc.image_array_size = 1;
            desc.image_row_pitch = alias ? src.step[0] : 0;
            desc.image_slice_pitch = 0;
            desc.buffer = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
            desc.num_mip_levels = 0;
            desc.num_samples = 0;
            handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
        }
        else
#endif
        {
            CV_SUPPRESS_DEPRECATED_START
            CV_Assert(!alias); // aliasing requires the OpenCL 1.2 image-from-buffer extension
            handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
            CV_SUPPRESS_DEPRECATED_END
        }
        CV_OclDbgAssert(err == CL_SUCCESS);

        size_t origin[] = { 0, 0, 0 };
        size_t region[] = { static_cast<size_t>(src.cols), static_cast<size_t>(src.rows), 1 };

        cl_mem devData;
        if (!alias && !src.isContinuous())
        {
            devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
            CV_OclDbgAssert(err == CL_SUCCESS);

            const size_t roi[3] = {static_cast<size_t>(src.cols) * src.elemSize(), static_cast<size_t>(src.rows), 1};
            CV_Assert(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
                roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL) == CL_SUCCESS);
            CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
        }
        else
        {
            devData = (cl_mem)src.handle(ACCESS_READ);
        }
        CV_Assert(devData != NULL);

        if (!alias)
        {
            CV_OclDbgAssert(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0) == CL_SUCCESS);
            if (!src.isContinuous())
            {
                CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
                CV_OclDbgAssert(clReleaseMemObject(devData) == CL_SUCCESS);
            }
        }
    }

    IMPLEMENT_REFCOUNTABLE();

    cl_mem handle;
};

Image2D::Image2D()
{
    p = NULL;
}

Image2D::Image2D(const UMat &src, bool norm, bool alias)
{
    p = new Impl(src, norm, alias);
}

bool Image2D::canCreateAlias(const UMat &m)
{
    bool ret = false;
    const Device & d = ocl::Device::getDefault();
    if (d.imageFromBufferSupport() && !m.empty())
    {
        // This is the required pitch alignment in pixels
        uint pitchAlign = d.imagePitchAlignment();
        if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
        {
            // We don't currently handle the case where the buffer was created
            // with CL_MEM_USE_HOST_PTR
            if (!m.u->tempUMat())
            {
                ret = true;
            }
        }
    }
    return ret;
}
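
// Example (illustrative; "src" is a hypothetical CV_32FC1 UMat and "k" a
// hypothetical ocl::Kernel): guard creation of an Image2D so unsupported formats
// are skipped and zero-copy aliasing is used when the device allows it.
//
//     if (cv::ocl::Image2D::isFormatSupported(CV_32F, 1, false))
//     {
//         bool alias = cv::ocl::Image2D::canCreateAlias(src); // zero-copy if pitch is aligned
//         cv::ocl::Image2D image(src, false, alias);
//         k.args(image /*, other kernel arguments */);
//     }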

bool Image2D::isFormatSupported(int depth, int cn, bool norm)
{
    cl_image_format format = Impl::getImageFormat(depth, cn, norm);

    return Impl::isFormatSupported(format);
}

Image2D::Image2D(const Image2D & i)
{
    p = i.p;
    if (p)
        p->addref();
}

Image2D & Image2D::operator = (const Image2D & i)
{
    if (i.p != p)
    {
        if (i.p)
            i.p->addref();
        if (p)
            p->release();
        p = i.p;
    }
    return *this;
}

Image2D::~Image2D()
{
    if (p)
        p->release();
}

void* Image2D::ptr() const
{
    return p ? p->handle : 0;
}
bool internal::isPerformanceCheckBypassed()
{
    static bool initialized = false;
    static bool value = false;
    if (!initialized)
    {
        value = getBoolParameter("OPENCV_OPENCL_PERF_CHECK_BYPASS", false);
        initialized = true;
    }
    return value;
}
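
// This reads the OPENCV_OPENCL_PERF_CHECK_BYPASS environment variable once
// (values such as "1" or "true" enable it, per getBoolParameter above) and caches
// the result. Callers typically use it to override device-suitability heuristics,
// forcing the OpenCL code path even where the heuristic would prefer the plain
// CPU implementation.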

bool internal::isCLBuffer(UMat& u)
{
    void* h = u.handle(ACCESS_RW);
    if (!h)
        return true;
    CV_DbgAssert(u.u->currAllocator == getOpenCLAllocator());
#if 1
    if ((u.u->allocatorFlags_ & 0xffff0000) != 0) // OpenCL SVM flags are stored here
        return false;
#else
    cl_mem_object_type type = 0;
    cl_int ret = clGetMemObjectInfo((cl_mem)h, CL_MEM_TYPE, sizeof(type), &type, NULL);
    if (ret != CL_SUCCESS || type != CL_MEM_OBJECT_BUFFER)
        return false;
#endif
    return true;
}

}} // namespace cv::ocl