/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/warp_shuffle.hpp"

namespace cv { namespace cuda { namespace device
{
    // Other values are not supported
    #define CELL_WIDTH 8
    #define CELL_HEIGHT 8
    #define CELLS_PER_BLOCK_X 2
    #define CELLS_PER_BLOCK_Y 2

    namespace hog
    {
        __constant__ int cnbins;
        __constant__ int cblock_stride_x;
        __constant__ int cblock_stride_y;
        __constant__ int cnblocks_win_x;
        __constant__ int cnblocks_win_y;
        __constant__ int cblock_hist_size;
        __constant__ int cblock_hist_size_2up;
        __constant__ int cdescr_size;
        __constant__ int cdescr_width;


        /* Returns the nearest upper power of two, works only for
           the typical GPU thread count (per block) values */
        int power_2up(unsigned int n)
        {
            if (n < 1) return 1;
            else if (n < 2) return 2;
            else if (n < 4) return 4;
            else if (n < 8) return 8;
            else if (n < 16) return 16;
            else if (n < 32) return 32;
            else if (n < 64) return 64;
            else if (n < 128) return 128;
            else if (n < 256) return 256;
            else if (n < 512) return 512;
            else if (n < 1024) return 1024;
            return -1; // Input is too big
        }


        void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
                              int nblocks_win_x, int nblocks_win_y)
        {
            cudaSafeCall( cudaMemcpyToSymbol(cnbins, &nbins, sizeof(nbins)) );
            cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_x, &block_stride_x, sizeof(block_stride_x)) );
            cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_y, &block_stride_y, sizeof(block_stride_y)) );
            cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x)) );
            cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y)) );

            int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
            cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size, &block_hist_size, sizeof(block_hist_size)) );

            int block_hist_size_2up = power_2up(block_hist_size);
            cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size_2up, &block_hist_size_2up, sizeof(block_hist_size_2up)) );

            int descr_width = nblocks_win_x * block_hist_size;
            cudaSafeCall( cudaMemcpyToSymbol(cdescr_width, &descr_width, sizeof(descr_width)) );

            int descr_size = descr_width * nblocks_win_y;
            cudaSafeCall( cudaMemcpyToSymbol(cdescr_size, &descr_size, sizeof(descr_size)) );
        }
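
        // Illustrative host-side sketch (not used by the CUDA path): for the default
        // Dalal-Triggs pedestrian setup -- 9 bins, 8x8-pixel cells, 2x2 cells per block,
        // an 8-pixel block stride and a 64x128 detection window (nblocks_win_x = 7,
        // nblocks_win_y = 15) -- the constants uploaded above work out as annotated
        // below. The function name and the concrete numbers are this sketch's own,
        // added only as an aid for reasoning about buffer sizes.
        inline int default_hog_descriptor_size()
        {
            const int nbins = 9, nblocks_win_x = 7, nblocks_win_y = 15;
            const int block_hist_size     = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y; // 9 * 2 * 2 = 36
            const int block_hist_size_2up = power_2up(block_hist_size);                    // 64 (threads per block histogram in normalize_hists)
            const int descr_width         = nblocks_win_x * block_hist_size;               // 7 * 36 = 252
            const int descr_size          = descr_width * nblocks_win_y;                   // 252 * 15 = 3780 floats per window
            (void)block_hist_size_2up;
            return descr_size;
        }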


        //----------------------------------------------------------------------------
        // Histogram computation


        template <int nblocks> // Number of histogram blocks processed by single GPU thread block
        __global__ void compute_hists_kernel_many_blocks(const int img_block_width, const PtrStepf grad,
                                                         const PtrStepb qangle, float scale, float* block_hists)
        {
            const int block_x = threadIdx.z;
            const int cell_x = threadIdx.x / 16;
            const int cell_y = threadIdx.y;
            const int cell_thread_x = threadIdx.x & 0xF;

            if (blockIdx.x * blockDim.z + block_x >= img_block_width)
                return;

            extern __shared__ float smem[];
            float* hists = smem;
            float* final_hist = smem + cnbins * 48 * nblocks;

            const int offset_x = (blockIdx.x * blockDim.z + block_x) * cblock_stride_x +
                                 4 * cell_x + cell_thread_x;
            const int offset_y = blockIdx.y * cblock_stride_y + 4 * cell_y;

            const float* grad_ptr = grad.ptr(offset_y) + offset_x * 2;
            const unsigned char* qangle_ptr = qangle.ptr(offset_y) + offset_x * 2;

            // 12 means that 12 pixels affect the block's cell (in one row)
            if (cell_thread_x < 12)
            {
                float* hist = hists + 12 * (cell_y * blockDim.z * CELLS_PER_BLOCK_Y +
                                            cell_x + block_x * CELLS_PER_BLOCK_X) +
                                      cell_thread_x;
                for (int bin_id = 0; bin_id < cnbins; ++bin_id)
                    hist[bin_id * 48 * nblocks] = 0.f;

                const int dist_x = -4 + (int)cell_thread_x - 4 * cell_x;

                const int dist_y_begin = -4 - 4 * (int)threadIdx.y;
                for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
                {
                    float2 vote = *(const float2*)grad_ptr;
                    uchar2 bin = *(const uchar2*)qangle_ptr;

                    grad_ptr += grad.step/sizeof(float);
                    qangle_ptr += qangle.step;

                    int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
                    int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);

                    float gaussian = ::expf(-(dist_center_y * dist_center_y +
                                              dist_center_x * dist_center_x) * scale);
                    float interp_weight = (8.f - ::fabs(dist_y + 0.5f)) *
                                          (8.f - ::fabs(dist_x + 0.5f)) / 64.f;

                    hist[bin.x * 48 * nblocks] += gaussian * interp_weight * vote.x;
                    hist[bin.y * 48 * nblocks] += gaussian * interp_weight * vote.y;
                }

                volatile float* hist_ = hist;
                for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48 * nblocks)
                {
                    if (cell_thread_x < 6) hist_[0] += hist_[6];
                    if (cell_thread_x < 3) hist_[0] += hist_[3];
                    if (cell_thread_x == 0)
                        final_hist[((cell_x + block_x * 2) * 2 + cell_y) * cnbins + bin_id]
                            = hist_[0] + hist_[1] + hist_[2];
                }
            }

            __syncthreads();

            float* block_hist = block_hists + (blockIdx.y * img_block_width +
                                               blockIdx.x * blockDim.z + block_x) *
                                              cblock_hist_size;

            int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 16 + cell_thread_x;
            if (tid < cblock_hist_size)
                block_hist[tid] = final_hist[block_x * cblock_hist_size + tid];
        }


        void compute_hists(int nbins, int block_stride_x, int block_stride_y,
                           int height, int width, const PtrStepSzf& grad,
                           const PtrStepSzb& qangle, float sigma, float* block_hists)
        {
            const int nblocks = 1;

            int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
                                  block_stride_x;
            int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) /
                                   block_stride_y;

            dim3 grid(divUp(img_block_width, nblocks), img_block_height);
            dim3 threads(32, 2, nblocks);

            cudaSafeCall(cudaFuncSetCacheConfig(compute_hists_kernel_many_blocks<nblocks>,
                                                cudaFuncCachePreferL1));

            // Precompute gaussian spatial window parameter
            float scale = 1.f / (2.f * sigma * sigma);

            int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12 * nblocks) * sizeof(float);
            int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * nblocks) * sizeof(float);
            int smem = hists_size + final_hists_size;
            compute_hists_kernel_many_blocks<nblocks><<<grid, threads, smem>>>(
                img_block_width, grad, qangle, scale, block_hists);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
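
        // Illustrative host-side sketch (not called by the CUDA path above): the weight
        // applied to each pixel's gradient-magnitude vote in
        // compute_hists_kernel_many_blocks is a Gaussian spatial window centred on the
        // 16x16 block, multiplied by a bilinear interpolation weight between the block's
        // cells. dist_x/dist_y are pixel offsets from the cell and
        // dist_center_x/dist_center_y from the block centre, exactly as the kernel
        // computes them; the helper name is this sketch's own.
        inline float hog_vote_weight_reference(int dist_x, int dist_y,
                                               int dist_center_x, int dist_center_y,
                                               float sigma)
        {
            float scale = 1.f / (2.f * sigma * sigma); // same precomputation as compute_hists
            float gaussian = ::expf(-(dist_center_y * dist_center_y +
                                      dist_center_x * dist_center_x) * scale);
            float interp_weight = (8.f - ::fabs(dist_y + 0.5f)) *
                                  (8.f - ::fabs(dist_x + 0.5f)) / 64.f;
            return gaussian * interp_weight; // multiply by the gradient magnitude to get the vote
        }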


        //-------------------------------------------------------------
        //  Normalization of histograms via L2Hys_norm
        //


        template<int size>
        __device__ float reduce_smem(float* smem, float val)
        {
            unsigned int tid = threadIdx.x;
            float sum = val;

            reduce<size>(smem, sum, tid, plus<float>());

            if (size == 32)
            {
            #if __CUDA_ARCH__ >= 300
                return shfl(sum, 0);
            #else
                return smem[0];
            #endif
            }
            else
            {
            #if __CUDA_ARCH__ >= 300
                if (threadIdx.x == 0)
                    smem[0] = sum;
            #endif

                __syncthreads();

                return smem[0];
            }
        }


        template <int nthreads, // Number of threads which process one block histogram
                  int nblocks>  // Number of block histograms processed by one GPU thread block
        __global__ void normalize_hists_kernel_many_blocks(const int block_hist_size,
                                                           const int img_block_width,
                                                           float* block_hists, float threshold)
        {
            if (blockIdx.x * blockDim.z + threadIdx.z >= img_block_width)
                return;

            float* hist = block_hists + (blockIdx.y * img_block_width +
                                         blockIdx.x * blockDim.z + threadIdx.z) *
                                        block_hist_size + threadIdx.x;

            __shared__ float sh_squares[nthreads * nblocks];
            float* squares = sh_squares + threadIdx.z * nthreads;

            float elem = 0.f;
            if (threadIdx.x < block_hist_size)
                elem = hist[0];

            float sum = reduce_smem<nthreads>(squares, elem * elem);

            float scale = 1.0f / (::sqrtf(sum) + 0.1f * block_hist_size);
            elem = ::min(elem * scale, threshold);

            sum = reduce_smem<nthreads>(squares, elem * elem);

            scale = 1.0f / (::sqrtf(sum) + 1e-3f);

            if (threadIdx.x < block_hist_size)
                hist[0] = elem * scale;
        }


        void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
                             int height, int width, float* block_hists, float threshold)
        {
            const int nblocks = 1;

            int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
            int nthreads = power_2up(block_hist_size);
            dim3 threads(nthreads, 1, nblocks);

            int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
            int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y;
            dim3 grid(divUp(img_block_width, nblocks), img_block_height);

            if (nthreads == 32)
                normalize_hists_kernel_many_blocks<32, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
            else if (nthreads == 64)
                normalize_hists_kernel_many_blocks<64, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
            else if (nthreads == 128)
                normalize_hists_kernel_many_blocks<128, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
            else if (nthreads == 256)
                normalize_hists_kernel_many_blocks<256, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
            else if (nthreads == 512)
                normalize_hists_kernel_many_blocks<512, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
            else
                CV_Error(cv::Error::StsBadArg, "normalize_hists: histogram's size is too big, try to decrease number of bins");

            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
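
        // Illustrative host-side reference (not called by the CUDA path above) of the
        // L2-Hys normalization the kernel performs per block histogram: L2-normalize
        // with the same regularisation terms the kernel uses, clip every component at
        // `threshold`, then L2-normalize again. The function name and the plain
        // pointer/size interface are this sketch's own.
        inline void l2hys_normalize_reference(float* hist, int size, float threshold)
        {
            float sum = 0.f;
            for (int i = 0; i < size; ++i) sum += hist[i] * hist[i];

            float scale = 1.0f / (::sqrtf(sum) + 0.1f * size);      // first L2 norm, regularised
            for (int i = 0; i < size; ++i)
                hist[i] = ::fminf(hist[i] * scale, threshold);      // hysteresis: clip large components

            sum = 0.f;
            for (int i = 0; i < size; ++i) sum += hist[i] * hist[i];

            scale = 1.0f / (::sqrtf(sum) + 1e-3f);                  // renormalize after clipping
            for (int i = 0; i < size; ++i) hist[i] *= scale;
        }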


        //---------------------------------------------------------------------
        //  Linear SVM based classification
        //

        // Return confidence values, not just the positive locations
        template <int nthreads, // Number of threads per one histogram block
                  int nblocks>  // Number of histogram blocks processed by single GPU thread block
        __global__ void compute_confidence_hists_kernel_many_blocks(const int img_win_width, const int img_block_width,
                                                                    const int win_block_stride_x, const int win_block_stride_y,
                                                                    const float* block_hists, const float* coefs,
                                                                    float free_coef, float threshold, float* confidences)
        {
            const int win_x = threadIdx.z;
            if (blockIdx.x * blockDim.z + win_x >= img_win_width)
                return;

            const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
                                               blockIdx.x * win_block_stride_x * blockDim.z + win_x) *
                                              cblock_hist_size;

            float product = 0.f;
            for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
            {
                int offset_y = i / cdescr_width;
                int offset_x = i - offset_y * cdescr_width;
                product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
            }

            __shared__ float products[nthreads * nblocks];

            const int tid = threadIdx.z * nthreads + threadIdx.x;

            reduce<nthreads>(products, product, tid, plus<float>());

            if (threadIdx.x == 0)
                confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = product + free_coef;
        }

        void compute_confidence_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
                                      int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
                                      float* coefs, float free_coef, float threshold, float *confidences)
        {
            const int nthreads = 256;
            const int nblocks = 1;

            int win_block_stride_x = win_stride_x / block_stride_x;
            int win_block_stride_y = win_stride_y / block_stride_y;
            int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
            int img_win_height = (height - win_height + win_stride_y) / win_stride_y;

            dim3 threads(nthreads, 1, nblocks);
            dim3 grid(divUp(img_win_width, nblocks), img_win_height);

            cudaSafeCall(cudaFuncSetCacheConfig(compute_confidence_hists_kernel_many_blocks<nthreads, nblocks>,
                                                cudaFuncCachePreferL1));

            int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
                                  block_stride_x;
            compute_confidence_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
                img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
                block_hists, coefs, free_coef, threshold, confidences);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }


        template <int nthreads, // Number of threads per one histogram block
                  int nblocks>  // Number of histogram blocks processed by single GPU thread block
        __global__ void classify_hists_kernel_many_blocks(const int img_win_width, const int img_block_width,
                                                          const int win_block_stride_x, const int win_block_stride_y,
                                                          const float* block_hists, const float* coefs,
                                                          float free_coef, float threshold, unsigned char* labels)
        {
            const int win_x = threadIdx.z;
            if (blockIdx.x * blockDim.z + win_x >= img_win_width)
                return;

            const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
                                               blockIdx.x * win_block_stride_x * blockDim.z + win_x) *
                                              cblock_hist_size;

            float product = 0.f;
            for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
            {
                int offset_y = i / cdescr_width;
                int offset_x = i - offset_y * cdescr_width;
                product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
            }

            __shared__ float products[nthreads * nblocks];

            const int tid = threadIdx.z * nthreads + threadIdx.x;

            reduce<nthreads>(products, product, tid, plus<float>());

            if (threadIdx.x == 0)
                labels[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = (product + free_coef >= threshold);
        }


        void classify_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
                            int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
                            float* coefs, float free_coef, float threshold, unsigned char* labels)
        {
            const int nthreads = 256;
            const int nblocks = 1;

            int win_block_stride_x = win_stride_x / block_stride_x;
            int win_block_stride_y = win_stride_y / block_stride_y;
            int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
            int img_win_height = (height - win_height + win_stride_y) / win_stride_y;

            dim3 threads(nthreads, 1, nblocks);
            dim3 grid(divUp(img_win_width, nblocks), img_win_height);

            cudaSafeCall(cudaFuncSetCacheConfig(classify_hists_kernel_many_blocks<nthreads, nblocks>, cudaFuncCachePreferL1));

            int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
            classify_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
                img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
                block_hists, coefs, free_coef, threshold, labels);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
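
        // Illustrative host-side reference (not called by the CUDA path above): both
        // classification kernels evaluate a linear SVM for one detection window as the
        // dot product between the trained coefficients and the window's descriptor,
        // gathered block-row by block-row from the normalized block histograms.
        // compute_confidence_hists stores the raw score, classify_hists stores
        // (score >= threshold). The helper name and the flat-array interface are this
        // sketch's own.
        inline float svm_window_score_reference(const float* win_block_hists, // histogram of the window's top-left block
                                                int img_block_width,          // number of blocks per image row
                                                const float* coefs, float free_coef,
                                                int descr_size, int descr_width, int block_hist_size)
        {
            float score = free_coef;
            for (int i = 0; i < descr_size; ++i)
            {
                int offset_y = i / descr_width;            // block row inside the window
                int offset_x = i - offset_y * descr_width; // float offset inside that block row
                score += coefs[i] * win_block_hists[offset_y * img_block_width * block_hist_size + offset_x];
            }
            return score;
        }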

        //----------------------------------------------------------------------------
        // Extract descriptors


        template <int nthreads>
        __global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
                                                      const float* block_hists, PtrStepf descriptors)
        {
            // Get the top-left corner of the window in src
            const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
                                               blockIdx.x * win_block_stride_x) * cblock_hist_size;

            // Get the top-left corner of the window in dst
            float* descriptor = descriptors.ptr(blockIdx.y * gridDim.x + blockIdx.x);

            // Copy elements from src to dst
            for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
            {
                int offset_y = i / cdescr_width;
                int offset_x = i - offset_y * cdescr_width;
                descriptor[i] = hist[offset_y * img_block_width * cblock_hist_size + offset_x];
            }
        }


        void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,
                                    int height, int width, float* block_hists, PtrStepSzf descriptors)
        {
            const int nthreads = 256;

            int win_block_stride_x = win_stride_x / block_stride_x;
            int win_block_stride_y = win_stride_y / block_stride_y;
            int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
            int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
            dim3 threads(nthreads, 1);
            dim3 grid(img_win_width, img_win_height);

            int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
            extract_descrs_by_rows_kernel<nthreads><<<grid, threads>>>(
                img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }


        template <int nthreads>
        __global__ void extract_descrs_by_cols_kernel(const int img_block_width, const int win_block_stride_x,
                                                      const int win_block_stride_y, const float* block_hists,
                                                      PtrStepf descriptors)
        {
            // Get the top-left corner of the window in src
            const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
                                               blockIdx.x * win_block_stride_x) * cblock_hist_size;

            // Get the top-left corner of the window in dst
            float* descriptor = descriptors.ptr(blockIdx.y * gridDim.x + blockIdx.x);

            // Copy elements from src to dst
            for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
            {
                int block_idx = i / cblock_hist_size;
                int idx_in_block = i - block_idx * cblock_hist_size;

                int y = block_idx / cnblocks_win_x;
                int x = block_idx - y * cnblocks_win_x;

                descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block]
                    = hist[(y * img_block_width + x) * cblock_hist_size + idx_in_block];
            }
        }


        void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
                                    PtrStepSzf descriptors)
        {
            const int nthreads = 256;

            int win_block_stride_x = win_stride_x / block_stride_x;
            int win_block_stride_y = win_stride_y / block_stride_y;
            int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
            int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
            dim3 threads(nthreads, 1);
            dim3 grid(img_win_width, img_win_height);

            int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
            extract_descrs_by_cols_kernel<nthreads><<<grid, threads>>>(
                img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
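
        // Illustrative host-side sketch (not called by the CUDA path above) of the two
        // descriptor layouts produced by the kernels above. Both write one descriptor
        // row per detection window; they differ only in the order of the per-block
        // histograms: by_rows keeps the window's blocks in row-major order, by_cols
        // regroups them column by column. The helper names are this sketch's own.
        inline int descr_offset_by_rows(int block_x, int block_y, int nblocks_win_x, int block_hist_size)
        {
            // block (x, y) of the window lands at this float offset in the descriptor
            return (block_y * nblocks_win_x + block_x) * block_hist_size;
        }

        inline int descr_offset_by_cols(int block_x, int block_y, int nblocks_win_y, int block_hist_size)
        {
            // same block, column-major placement as in extract_descrs_by_cols_kernel
            return (block_x * nblocks_win_y + block_y) * block_hist_size;
        }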

        //----------------------------------------------------------------------------
        // Gradients computation


        template <int nthreads, int correct_gamma>
        __global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrStepb img,
                                                      float angle_scale, PtrStepf grad, PtrStepb qangle)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;

            const uchar4* row = (const uchar4*)img.ptr(blockIdx.y);

            __shared__ float sh_row[(nthreads + 2) * 3];

            uchar4 val;
            if (x < width)
                val = row[x];
            else
                val = row[width - 2];

            sh_row[threadIdx.x + 1] = val.x;
            sh_row[threadIdx.x + 1 + (nthreads + 2)] = val.y;
            sh_row[threadIdx.x + 1 + 2 * (nthreads + 2)] = val.z;

            if (threadIdx.x == 0)
            {
                val = row[::max(x - 1, 1)];
                sh_row[0] = val.x;
                sh_row[(nthreads + 2)] = val.y;
                sh_row[2 * (nthreads + 2)] = val.z;
            }

            if (threadIdx.x == blockDim.x - 1)
            {
                val = row[::min(x + 1, width - 2)];
                sh_row[blockDim.x + 1] = val.x;
                sh_row[blockDim.x + 1 + (nthreads + 2)] = val.y;
                sh_row[blockDim.x + 1 + 2 * (nthreads + 2)] = val.z;
            }

            __syncthreads();
            if (x < width)
            {
                float3 a, b;

                b.x = sh_row[threadIdx.x + 2];
                b.y = sh_row[threadIdx.x + 2 + (nthreads + 2)];
                b.z = sh_row[threadIdx.x + 2 + 2 * (nthreads + 2)];
                a.x = sh_row[threadIdx.x];
                a.y = sh_row[threadIdx.x + (nthreads + 2)];
                a.z = sh_row[threadIdx.x + 2 * (nthreads + 2)];

                float3 dx;
                if (correct_gamma)
                    dx = make_float3(::sqrtf(b.x) - ::sqrtf(a.x), ::sqrtf(b.y) - ::sqrtf(a.y), ::sqrtf(b.z) - ::sqrtf(a.z));
                else
                    dx = make_float3(b.x - a.x, b.y - a.y, b.z - a.z);

                float3 dy = make_float3(0.f, 0.f, 0.f);

                if (blockIdx.y > 0 && blockIdx.y < height - 1)
                {
                    val = ((const uchar4*)img.ptr(blockIdx.y - 1))[x];
                    a = make_float3(val.x, val.y, val.z);

                    val = ((const uchar4*)img.ptr(blockIdx.y + 1))[x];
                    b = make_float3(val.x, val.y, val.z);

                    if (correct_gamma)
                        dy = make_float3(::sqrtf(b.x) - ::sqrtf(a.x), ::sqrtf(b.y) - ::sqrtf(a.y), ::sqrtf(b.z) - ::sqrtf(a.z));
                    else
                        dy = make_float3(b.x - a.x, b.y - a.y, b.z - a.z);
                }

                float best_dx = dx.x;
                float best_dy = dy.x;

                float mag0 = dx.x * dx.x + dy.x * dy.x;
                float mag1 = dx.y * dx.y + dy.y * dy.y;
                if (mag0 < mag1)
                {
                    best_dx = dx.y;
                    best_dy = dy.y;
                    mag0 = mag1;
                }

                mag1 = dx.z * dx.z + dy.z * dy.z;
                if (mag0 < mag1)
                {
                    best_dx = dx.z;
                    best_dy = dy.z;
                    mag0 = mag1;
                }

                mag0 = ::sqrtf(mag0);

                float ang = (::atan2f(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;
                int hidx = (int)::floorf(ang);
                ang -= hidx;
                hidx = (hidx + cnbins) % cnbins;

                ((uchar2*)qangle.ptr(blockIdx.y))[x] = make_uchar2(hidx, (hidx + 1) % cnbins);
                ((float2*)grad.ptr(blockIdx.y))[x] = make_float2(mag0 * (1.f - ang), mag0 * ang);
            }
        }


        void compute_gradients_8UC4(int nbins, int height, int width, const PtrStepSzb& img,
                                    float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma)
        {
            (void)nbins;
            const int nthreads = 256;

            dim3 bdim(nthreads, 1);
            dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));

            if (correct_gamma)
                compute_gradients_8UC4_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
            else
                compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);

            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
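
        // Illustrative host-side sketch (not called by the CUDA path above): for the
        // four-channel kernel the per-pixel orientation is taken from whichever colour
        // channel has the largest gradient magnitude, and "gamma correction" means
        // differencing sqrt-compressed intensities instead of raw ones. The helper
        // name is this sketch's own.
        inline int strongest_channel_reference(const float dx[3], const float dy[3])
        {
            // index of the channel whose squared gradient magnitude wins, as in the kernel above
            int best = 0;
            float best_mag = dx[0] * dx[0] + dy[0] * dy[0];
            for (int c = 1; c < 3; ++c)
            {
                float mag = dx[c] * dx[c] + dy[c] * dy[c];
                if (best_mag < mag) { best = c; best_mag = mag; }
            }
            return best;
        }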

        template <int nthreads, int correct_gamma>
        __global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrStepb img,
                                                      float angle_scale, PtrStepf grad, PtrStepb qangle)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;

            const unsigned char* row = (const unsigned char*)img.ptr(blockIdx.y);

            __shared__ float sh_row[nthreads + 2];

            if (x < width)
                sh_row[threadIdx.x + 1] = row[x];
            else
                sh_row[threadIdx.x + 1] = row[width - 2];

            if (threadIdx.x == 0)
                sh_row[0] = row[::max(x - 1, 1)];

            if (threadIdx.x == blockDim.x - 1)
                sh_row[blockDim.x + 1] = row[::min(x + 1, width - 2)];

            __syncthreads();
            if (x < width)
            {
                float dx;

                if (correct_gamma)
                    dx = ::sqrtf(sh_row[threadIdx.x + 2]) - ::sqrtf(sh_row[threadIdx.x]);
                else
                    dx = sh_row[threadIdx.x + 2] - sh_row[threadIdx.x];

                float dy = 0.f;
                if (blockIdx.y > 0 && blockIdx.y < height - 1)
                {
                    float a = ((const unsigned char*)img.ptr(blockIdx.y + 1))[x];
                    float b = ((const unsigned char*)img.ptr(blockIdx.y - 1))[x];
                    if (correct_gamma)
                        dy = ::sqrtf(a) - ::sqrtf(b);
                    else
                        dy = a - b;
                }
                float mag = ::sqrtf(dx * dx + dy * dy);

                float ang = (::atan2f(dy, dx) + CV_PI_F) * angle_scale - 0.5f;
                int hidx = (int)::floorf(ang);
                ang -= hidx;
                hidx = (hidx + cnbins) % cnbins;

                ((uchar2*)qangle.ptr(blockIdx.y))[x] = make_uchar2(hidx, (hidx + 1) % cnbins);
                ((float2*)grad.ptr(blockIdx.y))[x] = make_float2(mag * (1.f - ang), mag * ang);
            }
        }


        void compute_gradients_8UC1(int nbins, int height, int width, const PtrStepSzb& img,
                                    float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma)
        {
            (void)nbins;
            const int nthreads = 256;

            dim3 bdim(nthreads, 1);
            dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));

            if (correct_gamma)
                compute_gradients_8UC1_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
            else
                compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);

            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
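
        // Illustrative host-side reference (not called by the CUDA path above) of the
        // orientation binning both gradient kernels perform: the angle is shifted into
        // [0, 2*pi), scaled so the full circle spans nbins bins, and the magnitude is
        // split linearly between the two nearest bins. The struct and function names
        // are this sketch's own; angle_scale is assumed to be nbins / (2*pi), matching
        // how the kernels consume it.
        struct OrientationVoteRef
        {
            int bin_lo, bin_hi;     // the two neighbouring orientation bins
            float vote_lo, vote_hi; // magnitude split between them
        };

        inline OrientationVoteRef bin_gradient_reference(float dx, float dy, int nbins, float angle_scale)
        {
            OrientationVoteRef v;
            float mag = ::sqrtf(dx * dx + dy * dy);
            float ang = (::atan2f(dy, dx) + CV_PI_F) * angle_scale - 0.5f;
            int hidx = (int)::floorf(ang);
            ang -= hidx;                     // fractional position between the two bins
            hidx = (hidx + nbins) % nbins;
            v.bin_lo = hidx;
            v.bin_hi = (hidx + 1) % nbins;
            v.vote_lo = mag * (1.f - ang);
            v.vote_hi = mag * ang;
            return v;
        }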


        //-------------------------------------------------------------------
        // Resize

        texture<uchar4, 2, cudaReadModeNormalizedFloat> resize8UC4_tex;
        texture<uchar,  2, cudaReadModeNormalizedFloat> resize8UC1_tex;

        __global__ void resize_for_hog_kernel(float sx, float sy, PtrStepSz<uchar> dst, int colOfs)
        {
            unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
            unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < dst.cols && y < dst.rows)
                dst.ptr(y)[x] = tex2D(resize8UC1_tex, x * sx + colOfs, y * sy) * 255;
        }

        __global__ void resize_for_hog_kernel(float sx, float sy, PtrStepSz<uchar4> dst, int colOfs)
        {
            unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
            unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < dst.cols && y < dst.rows)
            {
                float4 val = tex2D(resize8UC4_tex, x * sx + colOfs, y * sy);
                dst.ptr(y)[x] = make_uchar4(val.x * 255, val.y * 255, val.z * 255, val.w * 255);
            }
        }

        template<class T, class TEX>
        static void resize_for_hog(const PtrStepSzb& src, PtrStepSzb dst, TEX& tex)
        {
            tex.filterMode = cudaFilterModeLinear;

            size_t texOfs = 0;
            int colOfs = 0;

            cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
            cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );

            if (texOfs != 0)
            {
                colOfs = static_cast<int>( texOfs/sizeof(T) );
                cudaSafeCall( cudaUnbindTexture(tex) );
                cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );
            }

            dim3 threads(32, 8);
            dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));

            float sx = static_cast<float>(src.cols) / dst.cols;
            float sy = static_cast<float>(src.rows) / dst.rows;

            resize_for_hog_kernel<<<grid, threads>>>(sx, sy, (PtrStepSz<T>)dst, colOfs);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            cudaSafeCall( cudaUnbindTexture(tex) );
        }

        void resize_8UC1(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }
        void resize_8UC4(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }
    } // namespace hog
}}} // namespace cv { namespace cuda { namespace device


#endif /* CUDA_DISABLER */