1/*M/////////////////////////////////////////////////////////////////////////////////////// 2// 3// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4// 5// By downloading, copying, installing or using the software you agree to this license. 6// If you do not agree to this license, do not download, install, 7// copy or use the software. 8// 9// 10// License Agreement 11// For Open Source Computer Vision Library 12// 13// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. 14// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. 15// Third party copyrights are property of their respective owners. 16// 17// @Authors 18// Sen Liu, swjtuls1987@126.com 19// 20// Redistribution and use in source and binary forms, with or without modification, 21// are permitted provided that the following conditions are met: 22// 23// * Redistribution's of source code must retain the above copyright notice, 24// this list of conditions and the following disclaimer. 25// 26// * Redistribution's in binary form must reproduce the above copyright notice, 27// this list of conditions and the following disclaimer in the documentation 28// and/or other materials provided with the distribution. 29// 30// * The name of the copyright holders may not be used to endorse or promote products 31// derived from this software without specific prior written permission. 32// 33// This software is provided by the copyright holders and contributors as is and 34// any express or implied warranties, including, but not limited to, the implied 35// warranties of merchantability and fitness for a particular purpose are disclaimed. 36// In no event shall the Intel Corporation or contributors be liable for any direct, 37// indirect, incidental, special, exemplary, or consequential damages 38// (including, but not limited to, procurement of substitute goods or services; 39// loss of use, data, or profits; or business interruption) however caused 40// and on any theory of liability, whether in contract, strict liability, 41// or tort (including negligence or otherwise) arising in any way out of 42// the use of this software, even if advised of the possibility of such damage. 43// 44//M*/ 45 46#ifndef WAVE_SIZE 47#define WAVE_SIZE 1 48#endif 49 50inline int calc_lut(__local int* smem, int val, int tid) 51{ 52 smem[tid] = val; 53 barrier(CLK_LOCAL_MEM_FENCE); 54 55 if (tid == 0) 56 for (int i = 1; i < 256; ++i) 57 smem[i] += smem[i - 1]; 58 barrier(CLK_LOCAL_MEM_FENCE); 59 60 return smem[tid]; 61} 62 63#ifdef CPU 64inline void reduce(volatile __local int* smem, int val, int tid) 65{ 66 smem[tid] = val; 67 barrier(CLK_LOCAL_MEM_FENCE); 68 69 if (tid < 128) 70 smem[tid] = val += smem[tid + 128]; 71 barrier(CLK_LOCAL_MEM_FENCE); 72 73 if (tid < 64) 74 smem[tid] = val += smem[tid + 64]; 75 barrier(CLK_LOCAL_MEM_FENCE); 76 77 if (tid < 32) 78 smem[tid] += smem[tid + 32]; 79 barrier(CLK_LOCAL_MEM_FENCE); 80 81 if (tid < 16) 82 smem[tid] += smem[tid + 16]; 83 barrier(CLK_LOCAL_MEM_FENCE); 84 85 if (tid < 8) 86 smem[tid] += smem[tid + 8]; 87 barrier(CLK_LOCAL_MEM_FENCE); 88 89 if (tid < 4) 90 smem[tid] += smem[tid + 4]; 91 barrier(CLK_LOCAL_MEM_FENCE); 92 93 if (tid < 2) 94 smem[tid] += smem[tid + 2]; 95 barrier(CLK_LOCAL_MEM_FENCE); 96 97 if (tid < 1) 98 smem[256] = smem[tid] + smem[tid + 1]; 99 barrier(CLK_LOCAL_MEM_FENCE); 100} 101 102#else 103 104inline void reduce(__local volatile int* smem, int val, int tid) 105{ 106 smem[tid] = val; 107 barrier(CLK_LOCAL_MEM_FENCE); 108 109 if (tid < 128) 110 smem[tid] = val += smem[tid + 128]; 111 barrier(CLK_LOCAL_MEM_FENCE); 112 113 if (tid < 64) 114 smem[tid] = val += smem[tid + 64]; 115 barrier(CLK_LOCAL_MEM_FENCE); 116 117 if (tid < 32) 118 { 119 smem[tid] += smem[tid + 32]; 120#if WAVE_SIZE < 32 121 } barrier(CLK_LOCAL_MEM_FENCE); 122 123 if (tid < 16) 124 { 125#endif 126 smem[tid] += smem[tid + 16]; 127#if WAVE_SIZE < 16 128 } 129 barrier(CLK_LOCAL_MEM_FENCE); 130 131 if (tid < 8) 132 { 133#endif 134 smem[tid] += smem[tid + 8]; 135 smem[tid] += smem[tid + 4]; 136 smem[tid] += smem[tid + 2]; 137 smem[tid] += smem[tid + 1]; 138 } 139} 140#endif 141 142__kernel void calcLut(__global __const uchar * src, const int srcStep, 143 const int src_offset, __global uchar * lut, 144 const int dstStep, const int dst_offset, 145 const int2 tileSize, const int tilesX, 146 const int clipLimit, const float lutScale) 147{ 148 __local int smem[512]; 149 150 int tx = get_group_id(0); 151 int ty = get_group_id(1); 152 int tid = get_local_id(1) * get_local_size(0) 153 + get_local_id(0); 154 smem[tid] = 0; 155 barrier(CLK_LOCAL_MEM_FENCE); 156 157 for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1)) 158 { 159 __global const uchar* srcPtr = src + mad24(ty * tileSize.y + i, srcStep, tx * tileSize.x + src_offset); 160 for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0)) 161 { 162 const int data = srcPtr[j]; 163 atomic_inc(&smem[data]); 164 } 165 } 166 barrier(CLK_LOCAL_MEM_FENCE); 167 168 int tHistVal = smem[tid]; 169 barrier(CLK_LOCAL_MEM_FENCE); 170 171 if (clipLimit > 0) 172 { 173 // clip histogram bar 174 int clipped = 0; 175 if (tHistVal > clipLimit) 176 { 177 clipped = tHistVal - clipLimit; 178 tHistVal = clipLimit; 179 } 180 181 // find number of overall clipped samples 182 reduce(smem, clipped, tid); 183 barrier(CLK_LOCAL_MEM_FENCE); 184#ifdef CPU 185 clipped = smem[256]; 186#else 187 clipped = smem[0]; 188#endif 189 190 // broadcast evaluated value 191 192 __local int totalClipped; 193 194 if (tid == 0) 195 totalClipped = clipped; 196 barrier(CLK_LOCAL_MEM_FENCE); 197 198 // redistribute clipped samples evenly 199 200 int redistBatch = totalClipped / 256; 201 tHistVal += redistBatch; 202 203 int residual = totalClipped - redistBatch * 256; 204 if (tid < residual) 205 ++tHistVal; 206 } 207 208 const int lutVal = calc_lut(smem, tHistVal, tid); 209 uint ires = (uint)convert_int_rte(lutScale * lutVal); 210 lut[(ty * tilesX + tx) * dstStep + tid + dst_offset] = 211 convert_uchar(clamp(ires, (uint)0, (uint)255)); 212} 213 214__kernel void transform(__global __const uchar * src, const int srcStep, const int src_offset, 215 __global uchar * dst, const int dstStep, const int dst_offset, 216 __global uchar * lut, const int lutStep, int lut_offset, 217 const int cols, const int rows, 218 const int2 tileSize, 219 const int tilesX, const int tilesY) 220{ 221 const int x = get_global_id(0); 222 const int y = get_global_id(1); 223 224 if (x >= cols || y >= rows) 225 return; 226 227 const float tyf = (convert_float(y) / tileSize.y) - 0.5f; 228 int ty1 = convert_int_rtn(tyf); 229 int ty2 = ty1 + 1; 230 const float ya = tyf - ty1; 231 ty1 = max(ty1, 0); 232 ty2 = min(ty2, tilesY - 1); 233 234 const float txf = (convert_float(x) / tileSize.x) - 0.5f; 235 int tx1 = convert_int_rtn(txf); 236 int tx2 = tx1 + 1; 237 const float xa = txf - tx1; 238 tx1 = max(tx1, 0); 239 tx2 = min(tx2, tilesX - 1); 240 241 const int srcVal = src[mad24(y, srcStep, x + src_offset)]; 242 243 float res = 0; 244 245 res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (1.0f - ya)); 246 res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (1.0f - ya)); 247 res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (ya)); 248 res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (ya)); 249 250 uint ires = (uint)convert_int_rte(res); 251 dst[mad24(y, dstStep, x + dst_offset)] = convert_uchar(clamp(ires, (uint)0, (uint)255)); 252} 253