// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert ROW_TILE >= 1
$assert ACCUMULATORS >= 1
#include <assert.h>

#include <xmmintrin.h>

#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>


void xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_${ROW_TILE}x4${"_acc%d" % ACCUMULATORS if ACCUMULATORS > 1 else ""}(
    size_t input_height,
    size_t input_width,
    const float* input,
    const float* weights,
    const float* zero,
    float* output,
    uint32_t padding_top,
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(float) == 0);
  assert(padding_top >= 1);
  assert(padding_top <= 2);

  const __m128 vmask_even = _mm_load_ps((const float*) params->sse.mask_even);
  const __m128 vmask_odd = _mm_load_ps((const float*) params->sse.mask_odd);
  const __m128 vmax = _mm_load_ps(params->sse.max);
  const __m128 vmin = _mm_load_ps(params->sse.min);

  const __m128 vbias = _mm_load1_ps(weights);
  $for R in range(5):
    $for S in range(5):
      const __m128 vk${R}${S} = _mm_load1_ps(weights + ${R*5+S+1});

  const uint32_t padding_top_less_1 = padding_top - 1;
  const size_t input_decrement = round_up_po2(input_width, 8 * sizeof(float));

  const float* i0 = zero;
  const float* i1 = (const float*) ((uintptr_t) input - ((-padding_top_less_1) & input_width));
  const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
  if XNN_UNPREDICTABLE(padding_top_less_1 != 0) {
    i1 = zero;
  }
  $for M in range(3, 3 + 2 * ROW_TILE):
    const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_width);

  $if ROW_TILE > 1:
    const size_t output_width = round_down_po2((input_width + (2 /* padding */ - 3 /* kernel size */ + 2 /* subsampling */) * sizeof(float)) / 2, sizeof(float));

  float* o0 = output;
  $for M in range(1, ROW_TILE):
    float* o${M} = (float*) ((uintptr_t) o${M-1} + output_width);

  size_t padded_input_height = input_height + (padding_top_less_1 + 1) + 2 /* padding bottom */;
  size_t output_height = (padded_input_height - 5 /* kernel size */ + 2 /* subsampling */) / 2;
  do {
    $for M in range(3, 3 + 2 * ROW_TILE):
      if XNN_UNPREDICTABLE(padded_input_height < ${3 + M}) {
        i${M} = zero;
        $if M % 2 == 0 and M <= 2 * ROW_TILE + 1:
          o${M // 2 - 1} = o${M // 2 - 2};
      }

    $for M in range(3 + 2 * ROW_TILE):
      __m128 vi${M}x6024 = _mm_setzero_ps();

    $for M in range(3 + 2 * ROW_TILE):
      __m128 vi${M}x7135 = _mm_setzero_ps();

    $for M in range(3 + 2 * ROW_TILE):
      const __m128 vi${M}x89AB = _mm_loadu_ps(i${M});
      const __m128 vi${M}xCDEF = _mm_loadu_ps(i${M} + 4);
      i${M} += 8;

    $for M in range(3 + 2 * ROW_TILE):
      __m128 vi${M}x8ACE = _mm_shuffle_ps(vi${M}x89AB, vi${M}xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
      __m128 vi${M}x9BDF = _mm_shuffle_ps(vi${M}x89AB, vi${M}xCDEF, _MM_SHUFFLE(3, 1, 3, 1));

    size_t w = input_width;
    for (; w > 8 * sizeof(float); w -= 8 * sizeof(float)) {
      $for K in range(5):
        $for M in range(ROW_TILE):
          $if K == 0:
            __m128 vo${M}p0 = _mm_add_ps(vbias, _mm_mul_ps(vi${2*M+K}x8ACE, vk${K}2));
          $elif K < ACCUMULATORS:
            __m128 vo${M}p${K} = _mm_mul_ps(vi${2*M+K}x8ACE, vk${K}2);
          $else:
            vo${M}p${K % ACCUMULATORS} = _mm_add_ps(vo${M}p${K % ACCUMULATORS}, _mm_mul_ps(vi${2*M+K}x8ACE, vk${K}2));

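      // The x8ACE/x9BDF vectors hold the even/odd columns of each input row. The rotations
      // below, combined with the columns carried over from the previous iteration in
      // x6024/x7135, form the shifted column vectors for the remaining kernel taps.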
      $for M in range(3 + 2 * ROW_TILE):
        const __m128 vi${M}xE8AC = _mm_shuffle_ps(vi${M}x8ACE, vi${M}x8ACE, _MM_SHUFFLE(2, 1, 0, 3));

      $for K in range(5):
        $for M in range(ROW_TILE):
          vo${M}p${(K+5) % ACCUMULATORS} = _mm_add_ps(vo${M}p${(K+5) % ACCUMULATORS}, _mm_mul_ps(vi${2*M+K}x9BDF, vk${K}3));

      $for M in range(3 + 2 * ROW_TILE):
        const __m128 vi${M}x68AC = _mm_move_ss(vi${M}xE8AC, vi${M}x6024);
        vi${M}x6024 = vi${M}xE8AC;

      $for M in range(3 + 2 * ROW_TILE):
        const __m128 vi${M}xF9BD = _mm_shuffle_ps(vi${M}x9BDF, vi${M}x9BDF, _MM_SHUFFLE(2, 1, 0, 3));

      $for K in range(5):
        $for M in range(ROW_TILE):
          vo${M}p${(K+10) % ACCUMULATORS} = _mm_add_ps(vo${M}p${(K+10) % ACCUMULATORS}, _mm_mul_ps(vi${2*M+K}x68AC, vk${K}0));

      $for M in range(3 + 2 * ROW_TILE):
        const __m128 vi${M}xGHIJ = _mm_loadu_ps(i${M});
        const __m128 vi${M}xKLMN = _mm_loadu_ps(i${M} + 4);
        i${M} += 8;

      $for M in range(3 + 2 * ROW_TILE):
        const __m128 vi${M}x79BD = _mm_move_ss(vi${M}xF9BD, vi${M}x7135);
        vi${M}x7135 = vi${M}xF9BD;

      $for M in range(3 + 2 * ROW_TILE):
        const __m128 vi${M}xGIKM = _mm_shuffle_ps(vi${M}xGHIJ, vi${M}xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
        const __m128 vi${M}xHJLN = _mm_shuffle_ps(vi${M}xGHIJ, vi${M}xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
        vi${M}x9BDF = vi${M}xHJLN;

      $for K in range(5):
        $for M in range(ROW_TILE):
          vo${M}p${(K+15) % ACCUMULATORS} = _mm_add_ps(vo${M}p${(K+15) % ACCUMULATORS}, _mm_mul_ps(vi${2*M+K}x79BD, vk${K}1));

      $for M in range(3 + 2 * ROW_TILE):
        const __m128 vi${M}xGACE = _mm_move_ss(vi${M}x8ACE, vi${M}xGIKM);
        vi${M}x8ACE = vi${M}xGIKM;

      $for M in range(3 + 2 * ROW_TILE):
        const __m128 vi${M}xACEG = _mm_shuffle_ps(vi${M}xGACE, vi${M}xGACE, _MM_SHUFFLE(0, 3, 2, 1));

      $for K in range(5):
        $for M in range(ROW_TILE):
          vo${M}p${(K+20) % ACCUMULATORS} = _mm_add_ps(vo${M}p${(K+20) % ACCUMULATORS}, _mm_mul_ps(vi${2*M+K}xACEG, vk${K}4));

      $if ACCUMULATORS > 1:
        $ACC_SLICE = 1
        $while ACC_SLICE < ACCUMULATORS:
          $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
            $if A + ACC_SLICE < ACCUMULATORS:
              $for M in range(ROW_TILE):
                vo${M}p${A} = _mm_add_ps(vo${M}p${A}, vo${M}p${A + ACC_SLICE});
          $ACC_SLICE *= 2

      $for M in range(ROW_TILE):
        __m128 vo${M} = _mm_max_ps(vo${M}p0, vmin);

      $for M in range(ROW_TILE):
        vo${M} = _mm_min_ps(vo${M}, vmax);

      $for M in reversed(range(ROW_TILE)):
        _mm_storeu_ps(o${M}, vo${M});
        o${M} += 4;
    }
    // Last block has 1-8 pixels to process.
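    // vmask_even/vmask_odd zero out the columns that lie past the end of the row, so the
    // same 25-tap accumulation as above can be reused for this partial block.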
    assert(w <= 8 * sizeof(float));
    assert(w >= 1 * sizeof(float));
    {
      $for M in range(3 + 2 * ROW_TILE):
        vi${M}x8ACE = _mm_and_ps(vi${M}x8ACE, vmask_even);
        vi${M}x9BDF = _mm_and_ps(vi${M}x9BDF, vmask_odd);

      $for K in range(5):
        $for M in range(ROW_TILE):
          $if K == 0:
            __m128 vo${M}p0 = _mm_add_ps(vbias, _mm_mul_ps(vi${2*M+K}x8ACE, vk${K}2));
          $elif K < ACCUMULATORS:
            __m128 vo${M}p${K} = _mm_mul_ps(vi${2*M+K}x8ACE, vk${K}2);
          $else:
            vo${M}p${K % ACCUMULATORS} = _mm_add_ps(vo${M}p${K % ACCUMULATORS}, _mm_mul_ps(vi${2*M+K}x8ACE, vk${K}2));

      $for M in range(3 + 2 * ROW_TILE):
        const __m128 vi${M}xE8AC = _mm_shuffle_ps(vi${M}x8ACE, vi${M}x8ACE, _MM_SHUFFLE(2, 1, 0, 3));

      $for K in range(5):
        $for M in range(ROW_TILE):
          vo${M}p${(K+5) % ACCUMULATORS} = _mm_add_ps(vo${M}p${(K+5) % ACCUMULATORS}, _mm_mul_ps(vi${2*M+K}x9BDF, vk${K}3));

      $for M in range(3 + 2 * ROW_TILE):
        const __m128 vi${M}x68AC = _mm_move_ss(vi${M}xE8AC, vi${M}x6024);

      $for M in range(3 + 2 * ROW_TILE):
        const __m128 vi${M}xF9BD = _mm_shuffle_ps(vi${M}x9BDF, vi${M}x9BDF, _MM_SHUFFLE(2, 1, 0, 3));

      $for K in range(5):
        $for M in range(ROW_TILE):
          vo${M}p${(K+10) % ACCUMULATORS} = _mm_add_ps(vo${M}p${(K+10) % ACCUMULATORS}, _mm_mul_ps(vi${2*M+K}x68AC, vk${K}0));

      $for M in range(3 + 2 * ROW_TILE):
        const __m128 vi${M}x79BD = _mm_move_ss(vi${M}xF9BD, vi${M}x7135);

      $for K in range(5):
        $for M in range(ROW_TILE):
          vo${M}p${(K+15) % ACCUMULATORS} = _mm_add_ps(vo${M}p${(K+15) % ACCUMULATORS}, _mm_mul_ps(vi${2*M+K}x79BD, vk${K}1));

      const __m128 vzero = _mm_setzero_ps();
      $for M in range(3 + 2 * ROW_TILE):
        const __m128 vi${M}xGACE = _mm_move_ss(vi${M}x8ACE, vzero);

      $for M in range(3 + 2 * ROW_TILE):
        const __m128 vi${M}xACEG = _mm_shuffle_ps(vi${M}xGACE, vi${M}xGACE, _MM_SHUFFLE(0, 3, 2, 1));

      $for K in range(5):
        $for M in range(ROW_TILE):
          vo${M}p${(K+20) % ACCUMULATORS} = _mm_add_ps(vo${M}p${(K+20) % ACCUMULATORS}, _mm_mul_ps(vi${2*M+K}xACEG, vk${K}4));

      $if ACCUMULATORS > 1:
        $ACC_SLICE = 1
        $while ACC_SLICE < ACCUMULATORS:
          $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
            $if A + ACC_SLICE < ACCUMULATORS:
              $for M in range(ROW_TILE):
                vo${M}p${A} = _mm_add_ps(vo${M}p${A}, vo${M}p${A + ACC_SLICE});
          $ACC_SLICE *= 2

      $for M in range(ROW_TILE):
        __m128 vo${M} = _mm_max_ps(vo${M}p0, vmin);

      $for M in range(ROW_TILE):
        vo${M} = _mm_min_ps(vo${M}, vmax);

      size_t w_tmp = (w + 1 * sizeof(float)) / (2 * sizeof(float));
      if XNN_LIKELY(w_tmp >= 4) {
        $for M in reversed(range(ROW_TILE)):
          _mm_storeu_ps(o${M}, vo${M});
          o${M} += 4;
      } else {
        if (w_tmp & 2) {
          $for M in reversed(range(ROW_TILE)):
            _mm_storel_pi((__m64*) o${M}, vo${M});
            o${M} += 2;

          $for M in range(ROW_TILE):
            vo${M} = _mm_movehl_ps(vo${M}, vo${M});
        }
        if (w_tmp & 1) {
          $for M in reversed(range(ROW_TILE)):
            _mm_store_ss(o${M}, vo${M});
            o${M} += 1;
        }
      }
    }

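    // Rewind the input pointers, which the inner loops advanced past the end of the row,
    // and move the window down to the rows needed for the next tile of output rows.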
    i0 = (const float*) ((uintptr_t) i${2 * ROW_TILE} - input_decrement);
    i1 = (const float*) ((uintptr_t) i${2 * ROW_TILE + 1} - input_decrement);
    i2 = (const float*) ((uintptr_t) i${2 * ROW_TILE + 2} - input_decrement);
    $for M in range(3, 3 + 2 * ROW_TILE):
      i${M} = (const float*) ((uintptr_t) i${M-1} + input_width);

    $if ROW_TILE > 1:
      o0 = o${ROW_TILE - 1};
      $for M in range(1, ROW_TILE):
        o${M} = (float*) ((uintptr_t) o${M-1} + output_width);

    $if ROW_TILE > 1:
      output_height = doz(output_height, ${ROW_TILE});
      padded_input_height = doz(padded_input_height, ${ROW_TILE * 2});
    $else:
      output_height -= 1;
      padded_input_height -= 2;
  } while (output_height != 0);
}