1/*M/////////////////////////////////////////////////////////////////////////////////////// 2// 3// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4// 5// By downloading, copying, installing or using the software you agree to this license. 6// If you do not agree to this license, do not download, install, 7// copy or use the software. 8// 9// 10// License Agreement 11// For Open Source Computer Vision Library 12// 13// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. 14// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. 15// Copyright (C) 2013, OpenCV Foundation, all rights reserved. 16// Third party copyrights are property of their respective owners. 17// 18// @Authors 19// Jia Haipeng, jiahaipeng95@gmail.com 20// 21// 22// Redistribution and use in source and binary forms, with or without modification, 23// are permitted provided that the following conditions are met: 24// 25// * Redistribution's of source code must retain the above copyright notice, 26// this list of conditions and the following disclaimer. 27// 28// * Redistribution's in binary form must reproduce the above copyright notice, 29// this list of conditions and the following disclaimer in the documentation 30// and/or other materials provided with the distribution. 31// 32// * The name of the copyright holders may not be used to endorse or promote products 33// derived from this software without specific prior written permission. 34// 35// This software is provided by the copyright holders and contributors as is and 36// any express or implied warranties, including, but not limited to, the implied 37// warranties of merchantability and fitness for a particular purpose are disclaimed. 38// In no event shall the copyright holders or contributors be liable for any direct, 39// indirect, incidental, special, exemplary, or consequential damages 40// (including, but not limited to, procurement of substitute goods or services; 41// loss of use, data, or profits; or business interruption) however caused 42// and on any theory of liability, whether in contract, strict liability, 43// or tort (including negligence or otherwise) arising in any way out of 44// the use of this software, even if advised of the possibility of such damage. 45// 46//M*/ 47 48/* 49 Usage: 50 after compiling this program user gets a single kernel called KF. 51 the following flags should be passed: 52 1) one of "-D BINARY_OP", "-D UNARY_OP", "-D MASK_BINARY_OP" or "-D MASK_UNARY_OP" 53 2) the actual operation performed, one of "-D OP_...", see below the list of operations. 54 2a) "-D dstDepth=<destination depth> [-D cn=<num channels]" 55 for some operations, like min/max/and/or/xor it's enough 56 2b) "-D srcDepth1=<source1 depth> -D srcDepth2=<source2 depth> -D dstDepth=<destination depth> 57 -D workDepth=<work depth> [-D cn=<num channels>]" - for mixed-type operations 58*/ 59 60#ifdef DOUBLE_SUPPORT 61#ifdef cl_amd_fp64 62#pragma OPENCL EXTENSION cl_amd_fp64:enable 63#elif defined cl_khr_fp64 64#pragma OPENCL EXTENSION cl_khr_fp64:enable 65#endif 66#endif 67 68#ifdef INTEL_DEVICE 69#pragma OPENCL FP_CONTRACT ON 70#pragma OPENCL FP_FAST_FMAF ON 71#pragma OPENCL FP_FAST_FMA ON 72#endif 73 74#if depth <= 5 75#define CV_PI M_PI_F 76#else 77#define CV_PI M_PI 78#endif 79 80#ifndef cn 81#define cn 1 82#endif 83 84#if cn == 1 85#undef srcT1_C1 86#undef srcT2_C1 87#undef dstT_C1 88#define srcT1_C1 srcT1 89#define srcT2_C1 srcT2 90#define dstT_C1 dstT 91#endif 92 93#if cn != 3 94 #define storedst(val) *(__global dstT *)(dstptr + dst_index) = val 95 #define storedst2(val) *(__global dstT *)(dstptr2 + dst_index2) = val 96#else 97 #define storedst(val) vstore3(val, 0, (__global dstT_C1 *)(dstptr + dst_index)) 98 #define storedst2(val) vstore3(val, 0, (__global dstT_C1 *)(dstptr2 + dst_index2)) 99#endif 100 101#define noconvert 102 103#ifndef workT 104 105 #ifndef srcT1 106 #define srcT1 dstT 107 #endif 108 109 #ifndef srcT1_C1 110 #define srcT1_C1 dstT_C1 111 #endif 112 113 #ifndef srcT2 114 #define srcT2 dstT 115 #endif 116 117 #ifndef srcT2_C1 118 #define srcT2_C1 dstT_C1 119 #endif 120 121 #define workT dstT 122 #if cn != 3 123 #define srcelem1 *(__global srcT1 *)(srcptr1 + src1_index) 124 #define srcelem2 *(__global srcT2 *)(srcptr2 + src2_index) 125 #else 126 #define srcelem1 vload3(0, (__global srcT1_C1 *)(srcptr1 + src1_index)) 127 #define srcelem2 vload3(0, (__global srcT2_C1 *)(srcptr2 + src2_index)) 128 #endif 129 #ifndef convertToDT 130 #define convertToDT noconvert 131 #endif 132 133#else 134 135 #ifndef convertToWT2 136 #define convertToWT2 convertToWT1 137 #endif 138 #if cn != 3 139 #define srcelem1 convertToWT1(*(__global srcT1 *)(srcptr1 + src1_index)) 140 #define srcelem2 convertToWT2(*(__global srcT2 *)(srcptr2 + src2_index)) 141 #else 142 #define srcelem1 convertToWT1(vload3(0, (__global srcT1_C1 *)(srcptr1 + src1_index))) 143 #define srcelem2 convertToWT2(vload3(0, (__global srcT2_C1 *)(srcptr2 + src2_index))) 144 #endif 145 146#endif 147 148#ifndef workST 149#define workST workT 150#endif 151 152#define EXTRA_PARAMS 153#define EXTRA_INDEX 154#define EXTRA_INDEX_ADD 155 156#if defined OP_ADD 157#define PROCESS_ELEM storedst(convertToDT(srcelem1 + srcelem2)) 158 159#elif defined OP_SUB 160#define PROCESS_ELEM storedst(convertToDT(srcelem1 - srcelem2)) 161 162#elif defined OP_RSUB 163#define PROCESS_ELEM storedst(convertToDT(srcelem2 - srcelem1)) 164 165#elif defined OP_ABSDIFF 166#if wdepth <= 4 167#define PROCESS_ELEM \ 168 storedst(convertToDT(convertFromU(abs_diff(srcelem1, srcelem2)))) 169#else 170#define PROCESS_ELEM \ 171 storedst(convertToDT(fabs(srcelem1 - srcelem2))) 172#endif 173 174#elif defined OP_AND 175#define PROCESS_ELEM storedst(srcelem1 & srcelem2) 176 177#elif defined OP_OR 178#define PROCESS_ELEM storedst(srcelem1 | srcelem2) 179 180#elif defined OP_XOR 181#define PROCESS_ELEM storedst(srcelem1 ^ srcelem2) 182 183#elif defined OP_NOT 184#define PROCESS_ELEM storedst(~srcelem1) 185 186#elif defined OP_MIN 187#define PROCESS_ELEM storedst(min(srcelem1, srcelem2)) 188 189#elif defined OP_MAX 190#define PROCESS_ELEM storedst(max(srcelem1, srcelem2)) 191 192#elif defined OP_MUL 193#define PROCESS_ELEM storedst(convertToDT(srcelem1 * srcelem2)) 194 195#elif defined OP_MUL_SCALE 196#undef EXTRA_PARAMS 197#ifdef UNARY_OP 198#define EXTRA_PARAMS , workST srcelem2_, scaleT scale 199#undef srcelem2 200#define srcelem2 srcelem2_ 201#else 202#define EXTRA_PARAMS , scaleT scale 203#endif 204#define PROCESS_ELEM storedst(convertToDT(srcelem1 * scale * srcelem2)) 205 206#elif defined OP_DIV 207#define PROCESS_ELEM \ 208 workT e2 = srcelem2, zero = (workT)(0); \ 209 storedst(convertToDT(e2 != zero ? srcelem1 / e2 : zero)) 210 211#elif defined OP_DIV_SCALE 212#undef EXTRA_PARAMS 213#ifdef UNARY_OP 214#define EXTRA_PARAMS , workST srcelem2_, scaleT scale 215#undef srcelem2 216#define srcelem2 srcelem2_ 217#else 218#define EXTRA_PARAMS , scaleT scale 219#endif 220#define PROCESS_ELEM \ 221 workT e2 = srcelem2, zero = (workT)(0); \ 222 storedst(convertToDT(e2 == zero ? zero : (srcelem1 * (workT)(scale) / e2))) 223 224#elif defined OP_RDIV_SCALE 225#undef EXTRA_PARAMS 226#ifdef UNARY_OP 227#define EXTRA_PARAMS , workST srcelem2_, scaleT scale 228#undef srcelem2 229#define srcelem2 srcelem2_ 230#else 231#define EXTRA_PARAMS , scaleT scale 232#endif 233#define PROCESS_ELEM \ 234 workT e1 = srcelem1, zero = (workT)(0); \ 235 storedst(convertToDT(e1 == zero ? zero : (srcelem2 * (workT)(scale) / e1))) 236 237#elif defined OP_RECIP_SCALE 238#undef EXTRA_PARAMS 239#define EXTRA_PARAMS , scaleT scale 240#define PROCESS_ELEM \ 241 workT e1 = srcelem1, zero = (workT)(0); \ 242 storedst(convertToDT(e1 != zero ? scale / e1 : zero)) 243 244#elif defined OP_ADDW 245#undef EXTRA_PARAMS 246#define EXTRA_PARAMS , scaleT alpha, scaleT beta, scaleT gamma 247#if wdepth <= 4 248#define PROCESS_ELEM storedst(convertToDT(mad24(srcelem1, alpha, mad24(srcelem2, beta, gamma)))) 249#else 250#define PROCESS_ELEM storedst(convertToDT(fma(srcelem1, alpha, fma(srcelem2, beta, gamma)))) 251#endif 252 253#elif defined OP_MAG 254#define PROCESS_ELEM storedst(hypot(srcelem1, srcelem2)) 255 256#elif defined OP_PHASE_RADIANS 257#define PROCESS_ELEM \ 258 workT tmp = atan2(srcelem2, srcelem1); \ 259 if (tmp < 0) \ 260 tmp += 2 * CV_PI; \ 261 storedst(tmp) 262 263#elif defined OP_PHASE_DEGREES 264 #define PROCESS_ELEM \ 265 workT tmp = degrees(atan2(srcelem2, srcelem1)); \ 266 if (tmp < 0) \ 267 tmp += 360; \ 268 storedst(tmp) 269 270#elif defined OP_EXP 271#if wdepth == 5 272#define PROCESS_ELEM storedst(native_exp(srcelem1)) 273#else 274#define PROCESS_ELEM storedst(exp(srcelem1)) 275#endif 276 277#elif defined OP_POW 278#define PROCESS_ELEM storedst(pow(srcelem1, srcelem2)) 279 280#elif defined OP_POWN 281#undef workT 282#define workT int 283#define PROCESS_ELEM storedst(pown(srcelem1, srcelem2)) 284 285#elif defined OP_SQRT 286#if depth <= 5 287#define PROCESS_ELEM storedst(native_sqrt(srcelem1)) 288#else 289#define PROCESS_ELEM storedst(sqrt(srcelem1)) 290#endif 291 292#elif defined OP_LOG 293#define PROCESS_ELEM \ 294 storedst(log(fabs(srcelem1))) 295 296#elif defined OP_CMP 297#define srcT2 srcT1 298#ifndef convertToWT1 299#define convertToWT1 300#endif 301#define PROCESS_ELEM \ 302 storedst(srcelem1 CMP_OPERATOR srcelem2 ? (dstT)(255) : (dstT)(0)) 303 304#elif defined OP_CONVERT_SCALE_ABS 305#undef EXTRA_PARAMS 306#define EXTRA_PARAMS , workT1 alpha, workT1 beta 307#if wdepth <= 4 308#define PROCESS_ELEM \ 309 workT value = mad24(srcelem1, (workT)(alpha), (workT)(beta)); \ 310 storedst(convertToDT(abs(value))) 311#else 312#define PROCESS_ELEM \ 313 workT value = fma(srcelem1, (workT)(alpha), (workT)(beta)); \ 314 storedst(convertToDT(fabs(value))) 315#endif 316 317#elif defined OP_SCALE_ADD 318#undef EXTRA_PARAMS 319#define EXTRA_PARAMS , workT1 alpha 320#if wdepth <= 4 321#define PROCESS_ELEM storedst(convertToDT(mad24(srcelem1, (workT)(alpha), srcelem2))) 322#else 323#define PROCESS_ELEM storedst(convertToDT(fma(srcelem1, (workT)(alpha), srcelem2))) 324#endif 325 326#elif defined OP_CTP_AD || defined OP_CTP_AR 327#if depth <= 5 328#define CV_EPSILON FLT_EPSILON 329#else 330#define CV_EPSILON DBL_EPSILON 331#endif 332#ifdef OP_CTP_AD 333#define TO_DEGREE cartToPolar = degrees(cartToPolar); 334#elif defined OP_CTP_AR 335#define TO_DEGREE 336#endif 337#define PROCESS_ELEM \ 338 dstT x = srcelem1, y = srcelem2; \ 339 dstT x2 = x * x, y2 = y * y; \ 340 dstT magnitude = sqrt(x2 + y2); \ 341 dstT tmp = y >= 0 ? 0 : CV_PI * 2; \ 342 tmp = x < 0 ? CV_PI : tmp; \ 343 dstT tmp1 = y >= 0 ? CV_PI * 0.5f : CV_PI * 1.5f; \ 344 dstT cartToPolar = y2 <= x2 ? x * y / mad((dstT)(0.28f), y2, x2 + CV_EPSILON) + tmp : (tmp1 - x * y / mad((dstT)(0.28f), x2, y2 + CV_EPSILON)); \ 345 TO_DEGREE \ 346 storedst(magnitude); \ 347 storedst2(cartToPolar) 348 349#elif defined OP_PTC_AD || defined OP_PTC_AR 350#ifdef OP_PTC_AD 351#define FROM_DEGREE y = radians(y) 352#else 353#define FROM_DEGREE 354#endif 355#define PROCESS_ELEM \ 356 dstT x = srcelem1, y = srcelem2, cosval; \ 357 FROM_DEGREE; \ 358 storedst2(sincos(y, &cosval) * x); \ 359 storedst(cosval * x); 360 361#elif defined OP_PATCH_NANS 362#undef EXTRA_PARAMS 363#define EXTRA_PARAMS , dstT val 364#define PROCESS_ELEM \ 365 if (isnan(srcelem1)) \ 366 storedst(val) 367 368#else 369#error "unknown op type" 370#endif 371 372#if defined OP_CTP_AD || defined OP_CTP_AR || defined OP_PTC_AD || defined OP_PTC_AR 373 #undef EXTRA_PARAMS 374 #define EXTRA_PARAMS , __global uchar* dstptr2, int dststep2, int dstoffset2 375 #undef EXTRA_INDEX 376 #define EXTRA_INDEX int dst_index2 = mad24(y0, dststep2, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset2)) 377 #undef EXTRA_INDEX_ADD 378 #define EXTRA_INDEX_ADD dst_index2 += dststep2 379#endif 380 381#if defined UNARY_OP || defined MASK_UNARY_OP 382 383#if defined OP_AND || defined OP_OR || defined OP_XOR || defined OP_ADD || defined OP_SAT_ADD || \ 384 defined OP_SUB || defined OP_SAT_SUB || defined OP_RSUB || defined OP_SAT_RSUB || \ 385 defined OP_ABSDIFF || defined OP_CMP || defined OP_MIN || defined OP_MAX || defined OP_POW || \ 386 defined OP_MUL || defined OP_DIV || defined OP_POWN || defined OP_POWR || defined OP_ROOTN 387 #undef EXTRA_PARAMS 388 #define EXTRA_PARAMS , workST srcelem2_ 389 #undef srcelem2 390 #define srcelem2 srcelem2_ 391#endif 392 393#if cn == 3 394#undef srcelem2 395#define srcelem2 (workT)(srcelem2_.x, srcelem2_.y, srcelem2_.z) 396#endif 397 398#endif 399 400#if defined BINARY_OP 401 402__kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1, 403 __global const uchar * srcptr2, int srcstep2, int srcoffset2, 404 __global uchar * dstptr, int dststep, int dstoffset, 405 int rows, int cols EXTRA_PARAMS ) 406{ 407 int x = get_global_id(0); 408 int y0 = get_global_id(1) * rowsPerWI; 409 410 if (x < cols) 411 { 412 int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1)); 413#if !(defined(OP_RECIP_SCALE) || defined(OP_NOT)) 414 int src2_index = mad24(y0, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2)); 415#endif 416 int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset)); 417 EXTRA_INDEX; 418 419 for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, dst_index += dststep) 420 { 421 PROCESS_ELEM; 422#if !(defined(OP_RECIP_SCALE) || defined(OP_NOT)) 423 src2_index += srcstep2; 424#endif 425 EXTRA_INDEX_ADD; 426 } 427 } 428} 429 430#elif defined MASK_BINARY_OP 431 432__kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1, 433 __global const uchar * srcptr2, int srcstep2, int srcoffset2, 434 __global const uchar * mask, int maskstep, int maskoffset, 435 __global uchar * dstptr, int dststep, int dstoffset, 436 int rows, int cols EXTRA_PARAMS ) 437{ 438 int x = get_global_id(0); 439 int y0 = get_global_id(1) * rowsPerWI; 440 441 if (x < cols) 442 { 443 int mask_index = mad24(y0, maskstep, x + maskoffset); 444 int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1)); 445 int src2_index = mad24(y0, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2)); 446 int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset)); 447 448 for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, src2_index += srcstep2, 449 mask_index += maskstep, dst_index += dststep) 450 if (mask[mask_index]) 451 { 452 PROCESS_ELEM; 453 } 454 } 455} 456 457#elif defined UNARY_OP 458 459__kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1, 460 __global uchar * dstptr, int dststep, int dstoffset, 461 int rows, int cols EXTRA_PARAMS ) 462{ 463 int x = get_global_id(0); 464 int y0 = get_global_id(1) * rowsPerWI; 465 466 if (x < cols) 467 { 468 int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1)); 469 int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset)); 470 471 for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, dst_index += dststep) 472 { 473 PROCESS_ELEM; 474 } 475 } 476} 477 478#elif defined MASK_UNARY_OP 479 480__kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1, 481 __global const uchar * mask, int maskstep, int maskoffset, 482 __global uchar * dstptr, int dststep, int dstoffset, 483 int rows, int cols EXTRA_PARAMS ) 484{ 485 int x = get_global_id(0); 486 int y0 = get_global_id(1) * rowsPerWI; 487 488 if (x < cols) 489 { 490 int mask_index = mad24(y0, maskstep, x + maskoffset); 491 int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1)); 492 int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset)); 493 494 for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, mask_index += maskstep, dst_index += dststep) 495 if (mask[mask_index]) 496 { 497 PROCESS_ELEM; 498 } 499 } 500} 501 502#else 503 504#error "Unknown operation type" 505 506#endif 507