/*
 * function: kernel_wavelet_denoise
 *     Runs one decomposition level of a wavelet-style denoise filter.
 *     Each work-item handles 16 consecutive bytes of one image row.  It
 *     low-pass filters them with a 3x3 [1 2 1] x [1 2 1] / 16 kernel whose
 *     taps are spaced by a power of two that grows with the layer, takes
 *     (input - low-pass) as the detail coefficient, soft-thresholds it and
 *     accumulates it into `details`.  Intermediate levels write the low-pass
 *     image to `approxOut`; the final level writes
 *     (accumulated details + low-pass) to `dest`.
 *
 *     Build with WAVELET_DENOISE_UV != 0 to filter the plane that starts
 *     inputUVOffset rows into the buffer (horizontal tap spacing is doubled
 *     there; presumably the plane holds interleaved U/V samples), or with
 *     WAVELET_DENOISE_UV == 0 to filter the plane that starts at the
 *     buffer base.  On layer 1 the plane that is NOT being filtered is
 *     copied from src to dest unchanged.
 *
 *     The global work size defines the plane: width = get_global_size(0) * 16
 *     bytes, height = get_global_size(1) rows.
 *
 * src:             input buffer; read as bytes
 * approxOut:       receives the low-pass image while layer < decomLevels
 * details:         one float per byte of the plane; overwritten on layer 1,
 *                  accumulated on later layers
 * dest:            output buffer
 * inputYOffset:    offset applied to src, in uint (4-byte) elements
 * outputYOffset:   offset applied to dest, in uint (4-byte) elements
 * inputUVOffset:   row at which the second plane starts in the input
 * outputUVOffset:  row at which the second plane starts in the output
 * layer:           current decomposition level, 1-based; clamped to
 *                  [1, decomLevels]
 * decomLevels:     total number of levels; expected to be >= 1
 * hardThresh:      scale applied to the per-level entry of threshConst to
 *                  obtain the threshold
 * softThresh:      gain applied to coefficients whose magnitude is below
 *                  the threshold
 */

#define WAVELET_THRESH_LEVELS 5

/* Per-level threshold scale, indexed by (layer - 1). */
__constant float threshConst[WAVELET_THRESH_LEVELS] = {
    50.430166f, 20.376415f, 10.184031f, 6.640919f, 3.367972f
};

/* Load 16 consecutive bytes starting at base[index] and widen them to ushort. */
ushort16 wavelet_load16(__global const uchar *base, int index)
{
    return convert_ushort16(vload16(0, base + index));
}

__kernel void kernel_wavelet_denoise(__global uint *src, __global uint *approxOut, __global float *details, __global uint *dest,
                                     int inputYOffset, int outputYOffset, uint inputUVOffset, uint outputUVOffset,
                                     int layer, int decomLevels, float hardThresh, float softThresh)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const int imageWidth = (int)get_global_size(0) * 16;
    const int imageHeight = (int)get_global_size(1);
    const int rowStride = imageWidth / 4;       /* uint elements per row */

    layer = (layer > 1) ? layer : 1;
    layer = (layer < decomLevels) ? layer : decomLevels;

    /* decomLevels < 1 is a caller error, but keep the shift amounts and the
     * table lookup below well defined regardless. */
    const int level = (layer > 1) ? layer : 1;

    src += inputYOffset;
    dest += outputYOffset;

    /* Tap spacing in bytes / rows.  Integer shifts give exact powers of two;
     * truncating the float result of pown() relies on it being exact. */
#if WAVELET_DENOISE_UV
    int xScaler = 1 << level;
    int yScaler = 1 << (level - 1);
#else
    int xScaler = 1 << (level - 1);
    int yScaler = xScaler;
#endif

    /* Collapse the taps of an axis onto the centre whenever the outermost
     * tap would fall outside the current row (x) or outside the plane (y).
     * Both tests are done in byte / row units so that no tap ever reads
     * across a row boundary or past the last row. */
    const int column = x * 16;
    if ((column < xScaler) || (column + 16 + xScaler > imageWidth)) {
        xScaler = 0;
    }
    if ((y < yScaler) || (y + yScaler >= imageHeight)) {
        yScaler = 0;
    }

#if WAVELET_DENOISE_UV
    /* Odd layers read at the UV offset, even layers at offset 0; the
     * approximation written below uses the opposite choice. */
    const int srcOffset = (layer % 2) ? (inputUVOffset * imageWidth / 4) : 0;
    __global const uchar *src_p = (__global const uchar *)(src + srcOffset);
#else
    __global const uchar *src_p = (__global const uchar *)(src);
#endif

    const int pixel_index = column + y * imageWidth;    /* byte / float index */
    const int group_index = x * 4 + y * rowStride;      /* uint index */

    const int rowAbove = pixel_index - yScaler * imageWidth;
    const int rowBelow = pixel_index + yScaler * imageWidth;

    /*
     * Neighbourhood and weights:
     *   { tl, tc, tr }     { 1, 2, 1 }
     *   { ml, mc, mr }     { 2, 4, 2 }
     *   { bl, bc, br }     { 1, 2, 1 }
     */
    const ushort16 tl = wavelet_load16(src_p, rowAbove - xScaler);
    const ushort16 tc = wavelet_load16(src_p, rowAbove);
    const ushort16 tr = wavelet_load16(src_p, rowAbove + xScaler);
    const ushort16 ml = wavelet_load16(src_p, pixel_index - xScaler);
    const ushort16 mc = wavelet_load16(src_p, pixel_index);
    const ushort16 mr = wavelet_load16(src_p, pixel_index + xScaler);
    const ushort16 bl = wavelet_load16(src_p, rowBelow - xScaler);
    const ushort16 bc = wavelet_load16(src_p, rowBelow);
    const ushort16 br = wavelet_load16(src_p, rowBelow + xScaler);

    /* Weighted sum is at most 16 * 255 = 4080, so ushort cannot overflow. */
    const ushort16 sum = (tl + tr + bl + br)
                         + (ushort16)2 * (tc + ml + mr + bc)
                         + (ushort16)4 * mc;

    /* Divide by 16 with round-half-up. */
    const uchar16 approxPix = convert_uchar16((sum + (ushort16)8) >> (ushort16)4);
    const uint4 approx = as_uint4(approxPix);
    const float16 approxF = convert_float16(approxPix);

    /* Take the difference in float: an 8-bit subtraction wraps (and flips
     * sign) as soon as |centre - approx| exceeds 127. */
    const float16 raw = convert_float16(mc) - approxF;

    const int threshIndex = ((level < WAVELET_THRESH_LEVELS) ? level : WAVELET_THRESH_LEVELS) - 1;
    const float thold = hardThresh * threshConst[threshIndex];
    const float shrink = thold - thold * softThresh;

    /* Soft threshold.  All three bands are selected from the unmodified
     * coefficient so a value is never shrunk twice; the curve is continuous
     * at +/- thold. */
    float16 detail = raw * softThresh;                      /* |raw| <= thold */
    detail = (raw > thold) ? raw - shrink : detail;
    detail = (raw < -thold) ? raw + shrink : detail;

    float16 total;
    if (layer == 1) {
        total = detail;
#if WAVELET_DENOISE_UV
        /* Pass the two rows at the buffer base that correspond to this
         * row (2y and 2y + 1) through to dest. */
        const int luma_index0 = x * 4 + (2 * y) * rowStride;
        const int luma_index1 = x * 4 + (2 * y + 1) * rowStride;
        vstore4(vload4(0, src + luma_index0), 0, dest + luma_index0);
        vstore4(vload4(0, src + luma_index1), 0, dest + luma_index1);
#else
        /* Pass row y / 2 of the plane at the UV offset through to dest;
         * only even rows do the copy so each row is written once. */
        if (y % 2 == 0) {
            const int chroma_index = x * 4 + (y / 2) * rowStride;
            const uint4 chroma = vload4(0, src + chroma_index + inputUVOffset * rowStride);
            vstore4(chroma, 0, dest + chroma_index + outputUVOffset * rowStride);
        }
#endif
    } else {
        total = vload16(0, details + pixel_index) + detail;
    }
    vstore16(total, 0, details + pixel_index);

    if (layer < decomLevels) {
        /* Intermediate level: hand the low-pass image to the next pass. */
#if WAVELET_DENOISE_UV
        const int approxOffset = (layer % 2) ? 0 : (inputUVOffset * imageWidth / 4);
        vstore4(approx, 0, approxOut + group_index + approxOffset);
#else
        vstore4(approx, 0, approxOut + group_index);
#endif
    } else {
        /* Final level: reconstruct.  Saturate, because the sum of thresholded
         * details and the approximation may leave [0, 255] and an
         * unsaturated float-to-uchar conversion is undefined there. */
        const uint4 result = as_uint4(convert_uchar16_sat(total + approxF));
#if WAVELET_DENOISE_UV
        vstore4(result, 0, dest + group_index + outputUVOffset * imageWidth / 4);
#else
        vstore4(result, 0, dest + group_index);
#endif
    }
}