/*
 * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"

#if defined(DATA_TYPE) && defined(ELEMENT_SIZE)

// COND_DATA_TYPE is an integer type with the same size as DATA_TYPE, used as the
// per-lane mask element type for the vector select() calls below.
#if ELEMENT_SIZE == 1
#define COND_DATA_TYPE char
#elif ELEMENT_SIZE == 2
#define COND_DATA_TYPE short
#elif ELEMENT_SIZE == 4
#define COND_DATA_TYPE int
#else // ELEMENT_SIZE
#error "Element size not supported"
#endif // ELEMENT_SIZE

#if defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH)
/** This opencl kernel performs im2col when the kernel size is 1x1, the stride_x = 1 and the data layout is NCHW
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g.
-DCONVOLVED_WIDTH=34
 * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
 * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
 * @note NOTE(review): the body also reads SRC_WIDTH, which is not checked by the enclosing guard — confirm callers always define it
 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
 * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
 * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
 */
__kernel void im2col1x1_stridex1_nchw(
    TENSOR3D_DECLARATION(src),
#if defined(NUM_GROUPS)
    TENSOR3D_DECLARATION(dst),
#else // defined(NUM_GROUPS)
    IMAGE_DECLARATION(dst),
#endif // defined(NUM_GROUPS)
    uint src_stride_w,
    uint dst_stride_w)
{
    // Each work-item handles 4 consecutive x positions of the convolved output.
    const uint xc    = get_global_id(0) * 4;         // x coordinate in the convolved tensor
    const uint yc    = get_global_id(1);             // y coordinate in the convolved tensor
    const uint ch    = get_global_id(2) % SRC_DEPTH; // input feature map
    const uint batch = get_global_id(2) / SRC_DEPTH; // batch size

    // Clamp xc
    // The strategy clamps at "xc" as it will be a valid value for sure
    uint4 xc_clamped = xc + (uint4)(0, 1, 2, 3);

    // Check which values are valid: all-ones per lane where xc_clamped is inside the image width
    const VEC_DATA_TYPE(COND_DATA_TYPE, 4) cond0 = CONVERT((xc_clamped < SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));

    // Out-of-bounds lanes fall back to "xc" itself, which is always valid
    xc_clamped = select((uint4)xc, xc_clamped, convert_int4(cond0));

    // Calculate input indices
    const uint xi = xc;
    const uint yi = yc * STRIDE_Y;

    // Calculate output indices
#if defined(NUM_GROUPS)
    const uint xo = ch % (SRC_DEPTH / NUM_GROUPS); // channel index within the group
    const uint zo = ch / (SRC_DEPTH / NUM_GROUPS); // group index
#else // defined(NUM_GROUPS)
    const uint xo = ch;
#endif // defined(NUM_GROUPS)
    const uint4 yo = xc_clamped + yc * CONVOLVED_WIDTH; // Index of the convolution

    // Get input and output address
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
#if defined(NUM_GROUPS)
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + zo * dst_stride_z + batch * dst_stride_w;
#else // defined(NUM_GROUPS)
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + batch * dst_stride_w;
#endif // defined(NUM_GROUPS)

    VEC_DATA_TYPE(DATA_TYPE, 4)
    data = vload4(0, (__global DATA_TYPE *)input_ptr);

    // If out-of-bound, overwrite with the first element
    data = select((VEC_DATA_TYPE(DATA_TYPE, 4))data.s0, data, cond0);

    // Scatter the 4 values: each convolution position maps to its own output row
    *(__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) = data.s0;
    *(__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) = data.s1;
    *(__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) = data.s2;
    *(__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) = data.s3;

#ifdef HAS_BIAS
    // Append a trailing 1 to each output row so the following GEMM can fold the bias in;
    // only work-items on the last input channel (of the group/tensor) do this
#if defined(NUM_GROUPS)
    if(xo == (SRC_DEPTH / NUM_GROUPS - 1))
#else // defined(NUM_GROUPS)
    if(ch == (SRC_DEPTH - 1))
#endif // defined(NUM_GROUPS)
    {
        *((__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) + 1) = 1.0f;
        *((__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) + 1) = 1.0f;
        *((__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) + 1) = 1.0f;
        *((__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) + 1) = 1.0f;
    }
#endif // HAS_BIAS
}
#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH)

#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
#if defined(DILATION_X) && defined(DILATION_Y)
/** This opencl kernel performs a generic im2col implementation when the data layout is NCHW
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
 * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
 * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
 * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
 * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
 * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
 *
 * @param[in]  src_ptr                           Pointer to the source tensor.
Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
 * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
 */
__kernel void im2col_generic_nchw(
    TENSOR3D_DECLARATION(src),
#if defined(NUM_GROUPS)
    TENSOR3D_DECLARATION(dst),
#else // defined(NUM_GROUPS)
    IMAGE_DECLARATION(dst),
#endif // defined(NUM_GROUPS)
    uint src_stride_w,
    uint dst_stride_w)
{
    const int xc    = get_global_id(0);             // x coordinate in the convolved tensor
    const int yc    = get_global_id(1);             // y coordinate in the convolved tensor
    const int ch    = get_global_id(2) % SRC_DEPTH; // input feature map
    const int batch = get_global_id(2) / SRC_DEPTH; // batch size

    // Calculate input indices: top-left corner of the receptive field (may be negative because of padding)
    const int xi = xc * STRIDE_X - PAD_LEFT;
    const int yi = yc * STRIDE_Y - PAD_TOP;

    // Calculate output indices
#if defined(NUM_GROUPS)
    const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT; // column offset of this channel within its group
    const int zo = ch / (SRC_DEPTH / NUM_GROUPS);                                  // group index
#else // defined(NUM_GROUPS)
    const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
#endif // defined(NUM_GROUPS)
    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
#if defined(NUM_GROUPS)
    __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo;
#else // defined(NUM_GROUPS)
    __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
#endif // defined(NUM_GROUPS)

    // Linearize convolution elements: walk the kernel window (with dilation), writing one element
    // per output column; output_ptr advances with every element written
    for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
    {
        int y = yi + yk * DILATION_Y;
        for(int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr)
        {
            int x = xi + xk * DILATION_X;
#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
            // No padding: every (x, y) is in-bounds, no check needed
            *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
#else // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
            if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
            {
                *output_ptr = PAD_VALUE;
            }
            else
            {
                *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
            }
#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
        }
    }

#ifdef HAS_BIAS
    // Append a trailing 1 for the bias; output_ptr now points one past the last element written
#if defined(NUM_GROUPS)
    if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1))
#else // defined(NUM_GROUPS)
    if(ch == (SRC_DEPTH - 1))
#endif // defined(NUM_GROUPS)
    {
        *output_ptr = 1.0f;
    }
#endif // HAS_BIAS
}
#endif // defined(DILATION_X) && defined(DILATION_Y)

/** This opencl kernel performs im2col when the kernel size is 3x3 and the data layout is NCHW
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
 * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
 * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
 * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g.
-DSTRIDE_X=1 and -DSTRIDE_Y=1
 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
 * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
 */
__kernel void im2col3x3_nchw(
    TENSOR3D_DECLARATION(src),
#if defined(NUM_GROUPS)
    TENSOR3D_DECLARATION(dst),
#else // defined(NUM_GROUPS)
    IMAGE_DECLARATION(dst),
#endif // defined(NUM_GROUPS)
    uint src_stride_w,
    uint dst_stride_w)
{
    const int xc    = get_global_id(0);             // x coordinate in the convolved tensor
    const int yc    = get_global_id(1);             // y coordinate in the convolved tensor
    const int ch    = get_global_id(2) % SRC_DEPTH; // input feature map
    const int batch = get_global_id(2) / SRC_DEPTH; // batch size

    // Calculate input indices: top-left of the 3x3 window (may be negative because of padding)
    const int xi = xc * STRIDE_X - PAD_LEFT;
    const int yi = yc * STRIDE_Y - PAD_TOP;

    // Calculate output indices
#if defined(NUM_GROUPS)
    const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 9; // 3x3
    const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
#else // defined(NUM_GROUPS)
    const int xo = ch * 9; // 3x3
#endif // defined(NUM_GROUPS)
    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

    // Get input and output address; the (int) casts keep the offset arithmetic signed for negative xi/yi
    __global uchar *input_ptr  = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;
#if defined(NUM_GROUPS)
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
#else // defined(NUM_GROUPS)
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
#endif // defined(NUM_GROUPS)

    // Load the three 3-element rows of the window
    VEC_DATA_TYPE(DATA_TYPE, 3)
    row0 = vload3(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 3)
    row1 = vload3(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 3)
    row2 = vload3(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
    // Put 0 if the value is out-of-bound: build per-lane validity masks for each row
    int3 x = (int3)xi + (int3)(0, 1, 2);
    int3 y = (int3)yi + (int3)(0, 1, 2);

    VEC_DATA_TYPE(COND_DATA_TYPE, 3)
    cond0 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s0 >= 0 && y.s0 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
    VEC_DATA_TYPE(COND_DATA_TYPE, 3)
    cond1 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s1 >= 0 && y.s1 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
    VEC_DATA_TYPE(COND_DATA_TYPE, 3)
    cond2 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s2 >= 0 && y.s2 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));

    // Replace out-of-bounds lanes with PAD_VALUE
    row0 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row0, cond0);
    row1 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row1, cond1);
    row2 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row2, cond2);
#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

    // Store the 9 linearized values: 8 with one vstore8, the 9th separately
    vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, (__global DATA_TYPE *)output_ptr);
    *((__global DATA_TYPE *)output_ptr + 8) = row2.s2;

#ifdef HAS_BIAS
    // Append a trailing 1 for the bias (only for the last channel of the group/tensor)
#if defined(NUM_GROUPS)
    if((xo / 9) == (SRC_DEPTH / NUM_GROUPS - 1))
#else // defined(NUM_GROUPS)
    if(ch == (SRC_DEPTH - 1))
#endif // defined(NUM_GROUPS)
    {
        *((__global DATA_TYPE *)output_ptr + 9) = 1.0f;
    }
#endif // HAS_BIAS
}

/** This opencl kernel performs im2col when the kernel size is 5x5 and the data layout is NCHW
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g.
-DCONVOLVED_WIDTH=34
 * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
 * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
 * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
 * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
 * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
 */
__kernel void im2col5x5_nchw(
    TENSOR3D_DECLARATION(src),
#if defined(NUM_GROUPS)
    TENSOR3D_DECLARATION(dst),
#else // defined(NUM_GROUPS)
    IMAGE_DECLARATION(dst),
#endif // defined(NUM_GROUPS)
    uint src_stride_w,
    uint dst_stride_w)
{
    const int xc    = get_global_id(0);             // x coordinate in the convolved tensor
    const int yc    = get_global_id(1);             // y coordinate in the convolved tensor
    const int ch    = get_global_id(2) % SRC_DEPTH; // input feature map
    const int batch = get_global_id(2) / SRC_DEPTH; // batch size

    // Calculate input indices: top-left of the 5x5 window (may be negative because of padding)
    const int xi = xc * STRIDE_X - PAD_LEFT;
    const int yi = yc * STRIDE_Y - PAD_TOP;

    // Calculate output indices
#if defined(NUM_GROUPS)
    const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 25; // 5x5
    const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
#else // defined(NUM_GROUPS)
    const int xo = ch * 25; // 5x5
#endif // defined(NUM_GROUPS)
    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
    // Put 0 if the value is out-of-bound: columns/rows 0-3 are handled as a vector,
    // column/row 4 as a scalar
    int4 x0 = (int4)xi + (int4)(0, 1, 2, 3);
    int4 y0 = (int4)yi + (int4)(0, 1, 2, 3);
    int  x1 = xi + 4;
    int  y1 = yi + 4;

    // Check if we could have out-of-bounds elements in the x direction
    VEC_DATA_TYPE(COND_DATA_TYPE, 4)
    x0_condition = CONVERT((x0 >= (int4)0 && x0 < (int4)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
    VEC_DATA_TYPE(COND_DATA_TYPE, 4)
    y0_condition                = CONVERT((y0 >= (int4)0 && y0 < (int4)SRC_HEIGHT), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
    COND_DATA_TYPE x1_condition = (COND_DATA_TYPE)(x1 >= 0 && x1 < SRC_WIDTH);
    COND_DATA_TYPE y1_condition = (COND_DATA_TYPE)(y1 >= 0 && y1 < SRC_HEIGHT);
#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

    // Get input and output address; the (int) casts keep the offset arithmetic signed for negative xi/yi
    __global uchar *input_ptr  = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;
#if defined(NUM_GROUPS)
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
#else // defined(NUM_GROUPS)
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
#endif // defined(NUM_GROUPS)

    // Rows 0 and 1 of the window: 10 elements written as vstore8 + vstore2
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
        DATA_TYPE
        row01 = *((__global DATA_TYPE *)input_ptr + 4);

        input_ptr += src_stride_y;

        VEC_DATA_TYPE(DATA_TYPE, 4)
        row10 = vload4(0, (__global DATA_TYPE *)input_ptr);
        DATA_TYPE
        row11 = *((__global DATA_TYPE *)input_ptr + 4);

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
        cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s0;
        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
        cond10                = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s1;
        COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s0);
        COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s1);

        // Replace with 0 if the value is not valid
        row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
        row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);
        row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
        row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);
#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,
                                              row10.s012),
                0, (__global DATA_TYPE *)output_ptr);
        vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);

        input_ptr += src_stride_y;
        output_ptr += 10 * dst_stride_x;
    }

    // Rows 2 and 3 of the window: same layout as rows 0-1
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
        DATA_TYPE
        row01 = *((__global DATA_TYPE *)input_ptr + 4);

        input_ptr += src_stride_y;

        VEC_DATA_TYPE(DATA_TYPE, 4)
        row10 = vload4(0, (__global DATA_TYPE *)input_ptr);
        DATA_TYPE
        row11 = *((__global DATA_TYPE *)input_ptr + 4);

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
        cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s2;
        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
        cond10                = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s3;
        COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s2);
        COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s3);

        // Replace with 0 if the value is not valid
        row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
        row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);
        row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
        row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);
#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,
                                              row10.s012),
                0, (__global DATA_TYPE *)output_ptr);
        vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);

        input_ptr += src_stride_y;
        output_ptr += 10 * dst_stride_x;
    }

    // Row 4 of the window: last 5 elements, validity governed by y1_condition
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
        DATA_TYPE
        row01 = *((__global DATA_TYPE *)input_ptr + 4);

        input_ptr += src_stride_y;

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
        cond00                = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y1_condition;
        COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y1_condition);

        // Replace with 0 if the value is not valid
        row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
        row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

        vstore4(row00, 0, (__global DATA_TYPE *)output_ptr);
        *((__global DATA_TYPE *)output_ptr + 4) = row01;

        output_ptr += 5 * dst_stride_x;
    }

#ifdef HAS_BIAS
    // Append a trailing 1 for the bias; output_ptr now points one past the 25th element
#if defined(NUM_GROUPS)
    if((xo / 25) == (SRC_DEPTH / NUM_GROUPS - 1))
#else // defined(NUM_GROUPS)
    if(ch == (SRC_DEPTH - 1))
#endif // defined(NUM_GROUPS)
    {
        *((__global DATA_TYPE *)output_ptr) = 1.0f;
    }
#endif // HAS_BIAS
}
#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)

#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)
/** This
opencl kernel performs im2col when the kernel size is 11x11, we do not have paddings and the data layout is NCHW
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
 * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
 * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
 * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
 */
__kernel void im2col11x11_padx0_pady0_nchw(
    TENSOR3D_DECLARATION(src),
#if defined(NUM_GROUPS)
    TENSOR3D_DECLARATION(dst),
#else  // defined(NUM_GROUPS)
    IMAGE_DECLARATION(dst),
#endif // defined(NUM_GROUPS)
    uint src_stride_w,
    uint dst_stride_w)
{
    const int xc    = get_global_id(0);             // x coordinate in the convolved tensor
    const int yc    = get_global_id(1);             // y coordinate in the convolved tensor
    const int ch    = get_global_id(2) % SRC_DEPTH; // input feature map
    const int batch = get_global_id(2) / SRC_DEPTH; // batch size

    // Calculate input indices
    const int xi = xc * STRIDE_X;
    const int yi = yc * STRIDE_Y;

    // Calculate output indices
#if defined(NUM_GROUPS)
    const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 121; // 11x11
    const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
#else  // defined(NUM_GROUPS)
    const int xo = ch * 121; // 11x11
#endif // defined(NUM_GROUPS)
    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution

    // Get input and output address
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
#if defined(NUM_GROUPS)
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
#else  // defined(NUM_GROUPS)
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
#endif // defined(NUM_GROUPS)

    // Linearize the 11x11 receptive field: copy 11 rows of 11 elements each
    // (one vload8 + one vload3 per row). The previous code repeated the copy
    // block 11 times verbatim; a fixed trip-count loop performs the identical
    // sequence of loads/stores while removing the duplication.
    for(int row = 0; row < 11; ++row)
    {
        VEC_DATA_TYPE(DATA_TYPE, 8)
        row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
        VEC_DATA_TYPE(DATA_TYPE, 3)
        row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);

        vstore8(row00, 0, (__global DATA_TYPE *)output_ptr);
        vstore3(row01, 0, (__global DATA_TYPE *)output_ptr + 8);

        input_ptr += src_stride_y;
        // Advance by 11 elements in the *destination* tensor. The previous code
        // used src_stride_x here, which is only correct when the two tensors
        // happen to share the same x-stride; the sibling NCHW kernels in this
        // file consistently use dst_stride_x for the output pointer.
        output_ptr += 11 * dst_stride_x;
    }

#ifdef HAS_BIAS
#if defined(NUM_GROUPS)
    if((xo / 121) == (SRC_DEPTH / NUM_GROUPS - 1))
#else  // defined(NUM_GROUPS)
    if(ch == (SRC_DEPTH - 1))
#endif // defined(NUM_GROUPS)
    {
        // Append a 1 to the im2col row so the bias folds into the following GEMM
        *((__global DATA_TYPE *)output_ptr) = 1.0f;
    }
#endif // HAS_BIAS
}
#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)
defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) 762 763#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE) 764/** This opencl kernel performs im2col when the kernel size is greater than 1x1, we do not have paddings and the data layout is NCHW 765 * 766 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. 767 * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=4. 768 * @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3. 769 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 770 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. 771 * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4 772 * 773 * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 774 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) 775 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 776 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) 777 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 778 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 779 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 780 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor 781 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr 782 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) 783 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) 784 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) 785 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) 786 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) 787 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) 788 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor 789 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). 790 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
791 */ 792__kernel void im2col_generic_padx0_pady0_nchw( 793 TENSOR3D_DECLARATION(src), 794#if defined(NUM_GROUPS) 795 TENSOR3D_DECLARATION(dst), 796#else // defined(NUM_GROUPS) 797 IMAGE_DECLARATION(dst), 798#endif // defined(NUM_GROUPS) 799 uint src_stride_w, 800 uint dst_stride_w) 801{ 802 const int xc = get_global_id(0); // x coordinate in the convolved tensor 803 const int yc = get_global_id(1); // y coordinate in the convolved tensor 804 const int ch = get_global_id(2) % SRC_DEPTH; // input feature map 805 const int batch = get_global_id(2) / SRC_DEPTH; // batch size 806 807 // Calculate input indices 808 const int xi = xc * STRIDE_X; 809 const int yi = yc * STRIDE_Y; 810 811 // Calculate output indices 812#if defined(NUM_GROUPS) 813 const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT; 814 const int zo = ch / (SRC_DEPTH / NUM_GROUPS); 815#else // defined(NUM_GROUPS) 816 const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT; 817#endif // defined(NUM_GROUPS) 818 const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution 819 820 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w; 821#if defined(NUM_GROUPS) 822 __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo; 823#else // defined(NUM_GROUPS) 824 __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo; 825#endif // defined(NUM_GROUPS) 826 827 // Linearize convolution elements 828 for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y) 829 { 830 int last_x = 0; 831 for(int x = xi, x_e = xi + KERNEL_WIDTH; x + VECTOR_SIZE <= x_e; x += VECTOR_SIZE, output_ptr += VECTOR_SIZE) 832 { 833 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) 834 row = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * 
src_stride_y)); 835 VSTORE(VECTOR_SIZE) 836 (row, 0, output_ptr); 837 last_x = x; 838 } 839 // Copy the remainder of the row by doing VLOAD(WIDTH_MOD_VECTOR_SIZE) and VSTORE(WIDTH_MOD_VECTOR_SIZE). 840 // Note that x and output_ptr have already been incremented by VECTOR_SIZE by the loop just before exit. 841#if WIDTH_MOD_VECTOR_SIZE == 1 842 *output_ptr = *((__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y)); 843#elif WIDTH_MOD_VECTOR_SIZE > 1 844 VEC_DATA_TYPE(DATA_TYPE, WIDTH_MOD_VECTOR_SIZE) 845 row = VLOAD(WIDTH_MOD_VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y)); 846 VSTORE(WIDTH_MOD_VECTOR_SIZE) 847 (row, 0, output_ptr); 848#endif /* WIDTH_MOD_VECTOR_SIZE */ 849 output_ptr += WIDTH_MOD_VECTOR_SIZE; 850 } /* End of loop over KERNEL_HEIGHT */ 851 852#ifdef HAS_BIAS 853#if defined(NUM_GROUPS) 854 if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1)) 855#else // defined(NUM_GROUPS) 856 if(ch == (SRC_DEPTH - 1)) 857#endif // defined(NUM_GROUPS) 858 { 859 *output_ptr = 1.0f; 860 } 861#endif // HAS_BIAS 862} 863#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE) 864 865#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) 866 867#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) 868#define COND_N VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE) 869 870/** Store a 1x9 row 
or a 3x3 block in a boundary-aware manner to avoid paddings in the channel dimension
 * @name IM2COL1X9_NHWC_STORE
 *
 * @note To use this macro for a 3x3 block, @p ROW has to be 0
 *
 * @param[in] VECTOR_SIZE          The non-boundary vector width of @p DATA. Supported: 1(scalar), 2, 3, 4, 8, 16
 * @param[in] BOUNDARY_VECTOR_SIZE The boundary vector width of @p DATA. Supported: 1-16, but has to be <= @p VECTOR_SIZE
 * @param[in] DATA_TYPE            Data type of @p DATA
 * @param[in] SRC_DEPTH            Input channel size / depth
 * @param[in] DATA                 Value variable base name
 * @param[in] ROW                  The row number to store. Supported: 0-8
 * @param[in] OUTPUT_PTR           Output pointer
 * @{
 */
// Only the first workitem along the channel dimension (get_global_id(0) == 0) can hold a
// partial (BOUNDARY_VECTOR_SIZE-wide) vector; every other workitem stores a full vector.
#if defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR)         \
    const bool at_channel_boundary = get_global_id(0) == 0;                                                          \
    if(at_channel_boundary)                                                                                          \
    {                                                                                                                \
        IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
    }                                                                                                                \
    else                                                                                                             \
    {                                                                                                                \
        IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR)                    \
    }
#else  // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
    IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR)
#endif // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE

// Store DATA0..DATA8 (each a full VECTOR_SIZE vector) at consecutive kernel-element
// positions (ROW * 9 + 0..8), each SRC_DEPTH elements apart in the output row.
#define IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
    VSTORE(VECTOR_SIZE)                                                                           \
    (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH);                 \
    VSTORE(VECTOR_SIZE)                                                                           \
    (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH);                 \
    VSTORE(VECTOR_SIZE)                                                                           \
    (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH);                 \
    VSTORE(VECTOR_SIZE)                                                                           \
    (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH);                 \
    VSTORE(VECTOR_SIZE)                                                                           \
    (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH);                 \
    VSTORE(VECTOR_SIZE)                                                                           \
    (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH);                 \
    VSTORE(VECTOR_SIZE)                                                                           \
    (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH);                 \
    VSTORE(VECTOR_SIZE)                                                                           \
    (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH);                 \
    VSTORE(VECTOR_SIZE)                                                                           \
    (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);

// Same layout as above, but only the leading BOUNDARY_VECTOR_SIZE elements of each
// vector are written (partial store at the channel boundary).
#define IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
    (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH);                                    \
    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
    (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH);                                    \
    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
    (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH);                                    \
    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
    (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH);                                    \
    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
    (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH);                                    \
    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
    (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH);                                    \
    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
    (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH);                                    \
    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
    (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH);                                    \
    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
    (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);
/** @}*/

/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC
 *
 * @note This kernel computes VECTOR_SIZE elements
 * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
 * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
 * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
 * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
 * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor.
Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 954 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) 955 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 956 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) 957 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 958 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 959 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 960 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor 961 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr 962 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) 963 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) 964 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) 965 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) 966 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor 967 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). 968 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
 */
__kernel void im2col3x3_nhwc(
    TENSOR3D_DECLARATION(src),
    IMAGE_DECLARATION(dst),
    uint src_stride_w,
    uint dst_stride_w)
{
    // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
    const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
    const int ch           = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
    const int yo           = get_global_id(1); // linearized output position (one im2col row per convolution step)
    const int batch        = get_global_id(2); // batch size

    // Calculate input indices (top-left corner of the 3x3 receptive field, before padding correction)
    const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
    const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;

    // Get input and output address
    __global uchar *input_ptr  = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;

    int yi_coord = 0;
    int3 offset  = 0;

    // Clamp xi
    int3 xi_offset = ((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT);
#if PAD_LEFT != 0 || PAD_RIGHT != 0
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
    xi_offset = CLAMP(xi_offset, (int3)0, (int3)(SRC_WIDTH - 1));
#endif // PAD_LEFT != 0 || PAD_RIGHT != 0
    // Multiply by src_stride_y as the width (X) dimension here is the second (y) dimension in src NHWC tensor
    xi_offset *= (int3)src_stride_y;

    // Out-of-bound condition for X (true where the clamped load must be replaced by PAD_VALUE)
    int3 x_cond = (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) < (int3)0) || (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) >= (int3)SRC_WIDTH);

    // yi == 0
    // Clamp yi
    // yi_coord is casted to unsigned int in order to use just a min() operation
    // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
    // This is a trick so that the values loaded in the padding areas are always from the last row (SRC_HEIGHT - 1),
    // because of the negative yi_coord wrap-around, but it gets overwritten by PAD_VALUE immediately as the wrap-around
    // also causes y_cond (y padding condition) to be satisfied
    yi_coord = yi - (int)PAD_TOP;

    // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
#if PAD_TOP != 0 || PAD_BOTTOM != 0
    yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
#endif // PAD_TOP != 0 || PAD_BOTTOM != 0

    // Compute offset
    offset = xi_offset + (yi_coord * (int)src_stride_z);

    // Load input values
    VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
    VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
    VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));

#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
    // Replace invalid values with PAD_VALUE
    int y_cond = (int)((uint)(yi - (int)PAD_TOP) >= (uint)(SRC_HEIGHT));
    values0    = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
    values1    = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
    values2    = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0

    // yi == 1
    // Clamp yi_coord (it can be negative if PAD_TOP > 1)
    yi_coord = yi - (int)PAD_TOP + 1 * DILATION_Y;

    // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
#if PAD_TOP != 0 || PAD_BOTTOM != 0
    yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
#endif // PAD_TOP != 0 || PAD_BOTTOM != 0

    // Compute offset
    offset = xi_offset + (yi_coord * (int)src_stride_z);

    // Load input values
    VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
    VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
    VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));

#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
    // Replace invalid values with PAD_VALUE (comment fixed: the code inserts PAD_VALUE, not zeros)
    y_cond  = (int)((uint)(yi - (int)PAD_TOP + 1 * DILATION_Y) >= (uint)(SRC_HEIGHT));
    values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
    values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
    values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0

    // yi == 2
    // Clamp yi_coord
    yi_coord = yi - (int)PAD_TOP + 2 * DILATION_Y;

    // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
#if PAD_TOP != 0 || PAD_BOTTOM != 0
    yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
#endif // PAD_TOP != 0 || PAD_BOTTOM != 0

    // Compute offset
    offset = xi_offset + (yi_coord * (int)src_stride_z);

    // Load input values
    VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
    VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
    VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));

#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
    // Replace invalid values with PAD_VALUE
    y_cond  = (int)((uint)(yi - (int)PAD_TOP + 2 * DILATION_Y) >= (uint)(SRC_HEIGHT));
    values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
    values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
    values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0

    // Store in a boundary-aware way to avoid padding
    IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, 0, output_ptr)

#ifdef HAS_BIAS
    // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
    // added at the end of the channel, while the boundary vec is at the beginning of the channel.
    // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
    // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
    // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
    if((ch + VECTOR_SIZE) >= SRC_DEPTH)
    {
        *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 9) = 1.0f;
    }
#endif // HAS_BIAS
}

// Helper for the 9x9 NHWC kernel below: load, pad (when any PAD_* is non-zero) and store
// one 1x9 kernel row (row index i) in a boundary-aware manner. Uses the same unsigned
// wrap-around clamping trick on yi_coord as the 3x3 kernel above.
#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
#define IM2COL1x9(i)                                                                                                 \
    ({                                                                                                               \
        yi_coord = yi - (int)PAD_TOP + i * DILATION_Y;                                                               \
        yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));                                                      \
        \
        offset0 = xi_offset0 + (yi_coord * (int)src_stride_z);                                                       \
        offset1 = xi_offset1 + (yi_coord * (int)src_stride_z);                                                       \
        \
        VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0));                    \
        VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1));                    \
        VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2));                    \
        VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3));                    \
        VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4));                    \
        VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5));                    \
        VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6));                    \
        VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7));                    \
        VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1));                       \
        \
        int y_cond = (int)((uint)(yi - (int)PAD_TOP + i * DILATION_Y) >= (uint)(SRC_HEIGHT));                        \
        values0    = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s0)));         \
        values1    = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s1)));         \
        values2    = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s2)));         \
        values3    = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s3)));         \
        values4    = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s4)));         \
        values5    = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s5)));         \
        values6    = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s6)));         \
        values7    = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s7)));         \
        values8    = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond1)));            \
        \
        IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr)         \
    })
#else  // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
#define IM2COL1x9(i)                                                                                                 \
    ({                                                                                                               \
        yi_coord = yi - (int)PAD_TOP + i * DILATION_Y;                                                               \
        yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));                                                      \
        \
        offset0 = xi_offset0 + (yi_coord * (int)src_stride_z);                                                       \
        offset1 = xi_offset1 + (yi_coord * (int)src_stride_z);                                                       \
        \
        VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0));                    \
        VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1));                    \
        VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2));                    \
        VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3));                    \
        VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4));                    \
        VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5));                    \
        VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6));                    \
        VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7));                    \
        VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1));                       \
        \
        IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr)         \
    })
#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0

/** This kernel performs im2col when the kernel size is 9x9 and the data layout is NHWC
 *
 * @note This kernel computes VECTOR_SIZE elements
 * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
 * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
 * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
 * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
 * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g.
-DSTRIDE_Y=1 1166 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. 1167 * 1168 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 1169 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) 1170 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 1171 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) 1172 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 1173 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 1174 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 1175 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor 1176 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr 1177 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) 1178 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) 1179 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) 1180 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) 1181 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor 1182 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). 1183 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
 */
__kernel void im2col9x9_nhwc(
    TENSOR3D_DECLARATION(src),
    IMAGE_DECLARATION(dst),
    uint src_stride_w,
    uint dst_stride_w)
{
    // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
    const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
    const int ch           = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
    const int yo           = get_global_id(1); // one output row per (xo, yo) spatial position
    const int batch        = get_global_id(2); // batch size

    // Calculate input indices: global id 1 linearizes the convolved output plane
    const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
    const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;

    // Get input and output address (byte pointers; ch is scaled by the element size)
    __global uchar *input_ptr  = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;

    // Scratch state read/written by the IM2COL1x9 macro below
    int  yi_coord = 0;
    int8 offset0  = 0;
    int  offset1  = 0;

    // Clamp xi: byte offsets of the 9 horizontal kernel taps (8 vectorized + 1 scalar tail)
    int8 xi_offset0 = ((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT);
    int  xi_offset1 = ((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT);

#if PAD_LEFT != 0 || PAD_RIGHT != 0
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
    // Clamp so out-of-bound taps still read valid memory; the loaded value is
    // replaced by PAD_VALUE later via the x_cond*/y_cond selects in IM2COL1x9
    xi_offset0 = CLAMP(xi_offset0, (int8)0, (int8)(SRC_WIDTH - 1));
    xi_offset1 = CLAMP(xi_offset1, (int)0, (int)(SRC_WIDTH - 1));
#endif // PAD_LEFT != 0 || PAD_RIGHT != 0
    xi_offset0 *= (int8)src_stride_y;
    xi_offset1 *= (int)src_stride_y;

    // Out-of-bound condition for X (computed from the unclamped tap coordinates)
    int8 x_cond0 = (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) < (int8)0) || (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) >= (int8)SRC_WIDTH);
    int  x_cond1 = (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) < (int)0) || (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);

    // One macro invocation per kernel row (9 rows for a 9x9 kernel)
    IM2COL1x9(0);
    IM2COL1x9(1);
    IM2COL1x9(2);
    IM2COL1x9(3);
    IM2COL1x9(4);
    IM2COL1x9(5);
    IM2COL1x9(6);
    IM2COL1x9(7);
    IM2COL1x9(8);

#ifdef HAS_BIAS
    // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
    // added at the end of the channel, while the boundary vec is at the beginning of the channel.
    // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
    // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
    // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
    if((ch + VECTOR_SIZE) >= SRC_DEPTH)
    {
        // 81 = KERNEL_WIDTH * KERNEL_HEIGHT for the fixed 9x9 case; the "- ch" rewinds
        // the channel offset baked into output_ptr so the bias lands after all taps
        *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 81) = 1.0f;
    }
#endif // HAS_BIAS
}

/** This opencl kernel performs a generic im2col implementation when the data layout is NHWC
 *
 * @note This kernel computes VECTOR_SIZE elements
 * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
 * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
 * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g.
-DCONVOLVED_WIDTH=34
 * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
 * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
 * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
 * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
 * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
 */
__kernel void im2col_generic_nhwc(
    TENSOR3D_DECLARATION(src),
    IMAGE_DECLARATION(dst),
    uint src_stride_w,
    uint dst_stride_w)
{
    // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
    const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
    const int ch           = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
    const int yo           = get_global_id(1); // one output row per (xo, yo) spatial position
    const int batch        = get_global_id(2); // batch size

    // Calculate input indices: global id 1 linearizes the convolved output plane
    const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
    const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;

    // Get input and output address (byte pointers; ch is scaled by the element size)
    __global uchar *input_ptr  = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;

    // i indexes the kernel tap (yk * KERNEL_WIDTH + xk); each tap writes SRC_DEPTH elements apart
    int i = 0;
    for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
    {
        // Clamp yi_coord so out-of-bound taps still read valid memory; the value is
        // replaced by PAD_VALUE below when the border condition fires
        int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP;
        yi_coord     = CLAMP(yi_coord, (int)0, (int)(SRC_HEIGHT - 1));

        // Out-of-bound condition for Y (computed from the unclamped coordinate)
        int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT);

        for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
        {
            // Clamp xi_coord (same read-valid-then-select scheme as for Y)
            int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT);
            xi_coord     = CLAMP(xi_coord, (int)0, (int)(SRC_WIDTH - 1));

            // Out-of-bound condition for X
            int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);

            int offset = xi_coord * (int)src_stride_y + (yi_coord * (int)src_stride_z);

            VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset));

#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
            // Replace with PAD_VALUE if the value is out-of-bound
            values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)x_border_condition || (COND_N)(y_border_condition)));
#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

            // Store in a boundary-aware way to avoid padding.
            // NOTE: when BOUNDARY_VECTOR_SIZE != VECTOR_SIZE the "else" below pairs with the
            // if() inside the #if block — the full-vector store is the else-branch; otherwise
            // the braces form a plain compound statement and the full store is unconditional.
#if BOUNDARY_VECTOR_SIZE != VECTOR_SIZE
            const bool at_channel_boundary = get_global_id(0) == 0;
            if(at_channel_boundary)
            {
                VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)
                (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
            }
            else // at_channel_boundary
#endif // BOUNDARY_VECTOR_SIZE != VECTOR_SIZE
            {
                VSTORE(VECTOR_SIZE)
                (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
            }
            i++;
        }
    }

#ifdef HAS_BIAS
    // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
    // added at the end of the channel, while the boundary vec is at the beginning of the channel.
    // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
    // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
    // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
    if((ch + VECTOR_SIZE) >= SRC_DEPTH)
    {
        // The "- ch" rewinds the channel offset baked into output_ptr so the bias
        // lands just past the SRC_DEPTH * KW * KH im2col elements of this row
        *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT) = 1.0f;
    }
#endif // HAS_BIAS
}
#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE)
#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)