• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*M///////////////////////////////////////////////////////////////////////////////////////
2//
3//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4//
5//  By downloading, copying, installing or using the software you agree to this license.
6//  If you do not agree to this license, do not download, install,
7//  copy or use the software.
8//
9//
10//                           License Agreement
11//                For Open Source Computer Vision Library
12//
13// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
14// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15// Third party copyrights are property of their respective owners.
16//
17// @Authors
18//    Sen Liu, swjtuls1987@126.com
19//
20// Redistribution and use in source and binary forms, with or without modification,
21// are permitted provided that the following conditions are met:
22//
23//   * Redistribution's of source code must retain the above copyright notice,
24//     this list of conditions and the following disclaimer.
25//
26//   * Redistribution's in binary form must reproduce the above copyright notice,
27//     this list of conditions and the following disclaimer in the documentation
28//     and/or other materials provided with the distribution.
29//
30//   * The name of the copyright holders may not be used to endorse or promote products
31//     derived from this software without specific prior written permission.
32//
33// This software is provided by the copyright holders and contributors as is and
34// any express or implied warranties, including, but not limited to, the implied
35// warranties of merchantability and fitness for a particular purpose are disclaimed.
36// In no event shall the Intel Corporation or contributors be liable for any direct,
37// indirect, incidental, special, exemplary, or consequential damages
38// (including, but not limited to, procurement of substitute goods or services;
39// loss of use, data, or profits; or business interruption) however caused
40// and on any theory of liability, whether in contract, strict liability,
41// or tort (including negligence or otherwise) arising in any way out of
42// the use of this software, even if advised of the possibility of such damage.
43//
44//M*/
45
46#ifndef WAVE_SIZE
47#define WAVE_SIZE 1
48#endif
49
50inline int calc_lut(__local int* smem, int val, int tid)
51{
52    smem[tid] = val;
53    barrier(CLK_LOCAL_MEM_FENCE);
54
55    if (tid == 0)
56        for (int i = 1; i < 256; ++i)
57            smem[i] += smem[i - 1];
58    barrier(CLK_LOCAL_MEM_FENCE);
59
60    return smem[tid];
61}
62
63#ifdef CPU
64inline void reduce(volatile __local int* smem, int val, int tid)
65{
66    smem[tid] = val;
67    barrier(CLK_LOCAL_MEM_FENCE);
68
69    if (tid < 128)
70        smem[tid] = val += smem[tid + 128];
71    barrier(CLK_LOCAL_MEM_FENCE);
72
73    if (tid < 64)
74        smem[tid] = val += smem[tid + 64];
75    barrier(CLK_LOCAL_MEM_FENCE);
76
77    if (tid < 32)
78        smem[tid] += smem[tid + 32];
79    barrier(CLK_LOCAL_MEM_FENCE);
80
81    if (tid < 16)
82        smem[tid] += smem[tid + 16];
83    barrier(CLK_LOCAL_MEM_FENCE);
84
85    if (tid < 8)
86        smem[tid] += smem[tid + 8];
87    barrier(CLK_LOCAL_MEM_FENCE);
88
89    if (tid < 4)
90        smem[tid] += smem[tid + 4];
91    barrier(CLK_LOCAL_MEM_FENCE);
92
93    if (tid < 2)
94        smem[tid] += smem[tid + 2];
95    barrier(CLK_LOCAL_MEM_FENCE);
96
97    if (tid < 1)
98        smem[256] = smem[tid] + smem[tid + 1];
99    barrier(CLK_LOCAL_MEM_FENCE);
100}
101
102#else
103
104inline void reduce(__local volatile int* smem, int val, int tid)
105{
106    smem[tid] = val;
107    barrier(CLK_LOCAL_MEM_FENCE);
108
109    if (tid < 128)
110        smem[tid] = val += smem[tid + 128];
111    barrier(CLK_LOCAL_MEM_FENCE);
112
113    if (tid < 64)
114        smem[tid] = val += smem[tid + 64];
115    barrier(CLK_LOCAL_MEM_FENCE);
116
117    if (tid < 32)
118    {
119        smem[tid] += smem[tid + 32];
120#if WAVE_SIZE < 32
121    } barrier(CLK_LOCAL_MEM_FENCE);
122
123    if (tid < 16)
124    {
125#endif
126        smem[tid] += smem[tid + 16];
127#if WAVE_SIZE < 16
128    }
129    barrier(CLK_LOCAL_MEM_FENCE);
130
131    if (tid < 8)
132    {
133#endif
134        smem[tid] += smem[tid + 8];
135        smem[tid] += smem[tid + 4];
136        smem[tid] += smem[tid + 2];
137        smem[tid] += smem[tid + 1];
138    }
139}
140#endif
141
142__kernel void calcLut(__global __const uchar * src, const int srcStep,
143                      const int src_offset, __global uchar * lut,
144                      const int dstStep, const int dst_offset,
145                      const int2 tileSize, const int tilesX,
146                      const int clipLimit, const float lutScale)
147{
148    __local int smem[512];
149
150    int tx = get_group_id(0);
151    int ty = get_group_id(1);
152    int tid = get_local_id(1) * get_local_size(0)
153                             + get_local_id(0);
154    smem[tid] = 0;
155    barrier(CLK_LOCAL_MEM_FENCE);
156
157    for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1))
158    {
159        __global const uchar* srcPtr = src + mad24(ty * tileSize.y + i, srcStep, tx * tileSize.x + src_offset);
160        for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0))
161        {
162            const int data = srcPtr[j];
163            atomic_inc(&smem[data]);
164        }
165    }
166    barrier(CLK_LOCAL_MEM_FENCE);
167
168    int tHistVal = smem[tid];
169    barrier(CLK_LOCAL_MEM_FENCE);
170
171    if (clipLimit > 0)
172    {
173        // clip histogram bar
174        int clipped = 0;
175        if (tHistVal > clipLimit)
176        {
177            clipped = tHistVal - clipLimit;
178            tHistVal = clipLimit;
179        }
180
181        // find number of overall clipped samples
182        reduce(smem, clipped, tid);
183        barrier(CLK_LOCAL_MEM_FENCE);
184#ifdef CPU
185        clipped = smem[256];
186#else
187        clipped = smem[0];
188#endif
189
190        // broadcast evaluated value
191
192        __local int totalClipped;
193
194        if (tid == 0)
195            totalClipped = clipped;
196        barrier(CLK_LOCAL_MEM_FENCE);
197
198        // redistribute clipped samples evenly
199
200        int redistBatch = totalClipped / 256;
201        tHistVal += redistBatch;
202
203        int residual = totalClipped - redistBatch * 256;
204        if (tid < residual)
205            ++tHistVal;
206    }
207
208    const int lutVal = calc_lut(smem, tHistVal, tid);
209    uint ires = (uint)convert_int_rte(lutScale * lutVal);
210    lut[(ty * tilesX + tx) * dstStep + tid + dst_offset] =
211        convert_uchar(clamp(ires, (uint)0, (uint)255));
212}
213
214__kernel void transform(__global __const uchar * src, const int srcStep, const int src_offset,
215                        __global uchar * dst, const int dstStep, const int dst_offset,
216                        __global uchar * lut, const int lutStep, int lut_offset,
217                        const int cols, const int rows,
218                        const int2 tileSize,
219                        const int tilesX, const int tilesY)
220{
221    const int x = get_global_id(0);
222    const int y = get_global_id(1);
223
224    if (x >= cols || y >= rows)
225        return;
226
227    const float tyf = (convert_float(y) / tileSize.y) - 0.5f;
228    int ty1 = convert_int_rtn(tyf);
229    int ty2 = ty1 + 1;
230    const float ya = tyf - ty1;
231    ty1 = max(ty1, 0);
232    ty2 = min(ty2, tilesY - 1);
233
234    const float txf = (convert_float(x) / tileSize.x) - 0.5f;
235    int tx1 = convert_int_rtn(txf);
236    int tx2 = tx1 + 1;
237    const float xa = txf - tx1;
238    tx1 = max(tx1, 0);
239    tx2 = min(tx2, tilesX - 1);
240
241    const int srcVal = src[mad24(y, srcStep, x + src_offset)];
242
243    float res = 0;
244
245    res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (1.0f - ya));
246    res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (1.0f - ya));
247    res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (ya));
248    res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (ya));
249
250    uint ires = (uint)convert_int_rte(res);
251    dst[mad24(y, dstStep, x + dst_offset)] = convert_uchar(clamp(ires, (uint)0, (uint)255));
252}
253