1// Copyright 2020 Google LLC 2// 3// This source code is licensed under the BSD-style license found in the 4// LICENSE file in the root directory of this source tree. 5 6$assert ROW_TILE >= 1 7$assert ACCUMULATORS >= 1 8#include <assert.h> 9 10#include <wasm_simd128.h> 11 12#include <xnnpack/dwconv.h> 13#include <xnnpack/math.h> 14 15 16$ARCH_SUFFIX = "_x86" if X86 else "_arm" 17 18void xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd${ARCH_SUFFIX}_splat_${ROW_TILE}x4${"_acc%d" % ACCUMULATORS if ACCUMULATORS > 1 else ""}( 19 size_t input_height, 20 size_t input_width, 21 const float* input, 22 const float* weights, 23 const float* zero, 24 float* output, 25 uint32_t padding_top, 26 const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) 27{ 28 assert(input_height != 0); 29 assert(input_width != 0); 30 assert(input_width % sizeof(float) == 0); 31 assert(padding_top >= 1); 32 assert(padding_top <= 2); 33 34 const v128_t vmask_even = wasm_v128_load(params->scalar.mask_even); 35 const v128_t vmask_odd = wasm_v128_load(params->scalar.mask_odd); 36 const v128_t vmax = wasm_v32x4_load_splat(¶ms->scalar.max); 37 const v128_t vmin = wasm_v32x4_load_splat(¶ms->scalar.min); 38 39 const v128_t vw0123 = wasm_v128_load(weights); 40 const v128_t vw4567 = wasm_v128_load(weights + 4); 41 const v128_t vw89AB = wasm_v128_load(weights + 8); 42 const v128_t vwCDEF = wasm_v128_load(weights + 12); 43 const v128_t vwGHIJ = wasm_v128_load(weights + 16); 44 const v128_t vwKLMN = wasm_v128_load(weights + 20); 45 const v128_t vwOP = wasm_v64x2_load_splat(weights + 24); 46 47 const v128_t vzero = wasm_f32x4_splat(0.0f); 48 49 const uint32_t padding_top_less_1 = padding_top - 1; 50 const size_t input_decrement = round_up_po2(input_width, 8 * sizeof(float)); 51 52 const float* i0 = zero; 53 const float* i1 = (const float*) ((uintptr_t) input - ((-padding_top_less_1) & input_width)); 54 const float* i2 = (const float*) ((uintptr_t) i1 + input_width); 55 if XNN_UNPREDICTABLE(padding_top_less_1 != 0) { 56 i1 = zero; 57 } 58 $for M in range(3, 3 + 2 * ROW_TILE): 59 const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_width); 60 61 $if ROW_TILE > 1: 62 const size_t output_width = round_down_po2((input_width + (2 /* padding */ - 3 /* kernel size */ + 2 /* subsampling */) * sizeof(float)) / 2, sizeof(float)); 63 64 float* o0 = output; 65 $for M in range(1, ROW_TILE): 66 float* o${M} = (float*) ((uintptr_t) o${M-1} + output_width); 67 68 size_t padded_input_height = input_height + (padding_top_less_1 + 1) + 2 /* padding bottom */; 69 size_t output_height = (padded_input_height - 5 /* kernel size */ + 2 /* subsampling */) / 2; 70 do { 71 $for M in range(3, 3 + 2 * ROW_TILE): 72 if XNN_UNPREDICTABLE(padded_input_height < ${3 + M}) { 73 i${M} = zero; 74 $if M % 2 == 0 and M <= 2 * ROW_TILE + 1: 75 o${M / 2 - 1} = o${M / 2 - 2}; 76 } 77 78 $for M in range(3 + 2 * ROW_TILE): 79 v128_t vi${M}x0246 = vzero; 80 81 $for M in range(3 + 2 * ROW_TILE): 82 v128_t vi${M}x1357 = vzero; 83 84 $for M in range(3 + 2 * ROW_TILE): 85 const v128_t vi${M}x89AB = wasm_v128_load(i${M}); 86 const v128_t vi${M}xCDEF = wasm_v128_load(i${M} + 4); 87 i${M} += 8; 88 89 $for M in range(3 + 2 * ROW_TILE): 90 v128_t vi${M}x8ACE = wasm_v32x4_shuffle(vi${M}x89AB, vi${M}xCDEF, 0, 2, 4, 6); 91 v128_t vi${M}x9BDF = wasm_v32x4_shuffle(vi${M}x89AB, vi${M}xCDEF, 1, 3, 5, 7); 92 93 size_t w = input_width; 94 for (; w > 8 * sizeof(float); w -= 8 * sizeof(float)) { 95 $for M in range(ROW_TILE): 96 v128_t vo${M}p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); 97 98 $for M in range(ROW_TILE): 99 $if ACCUMULATORS > 1: 100 v128_t vo${M}p1 = wasm_f32x4_mul(vi${2*M}x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); 101 $else: 102 vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${2*M}x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3))); 103 104 $for M in range(ROW_TILE): 105 $if ACCUMULATORS > 2: 106 v128_t vo${M}p2 = wasm_f32x4_mul(vi${2*M+1}x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); 107 $else: 108 vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${2*M+1}x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); 109 110 $for M in range(ROW_TILE): 111 $if ACCUMULATORS > 3: 112 v128_t vo${M}p3 = wasm_f32x4_mul(vi${2*M+2}x8ACE, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1)); 113 $else: 114 vo${M}p${4 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${4 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+2}x8ACE, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); 115 116 $for M in range(ROW_TILE): 117 $if ACCUMULATORS > 4: 118 v128_t vo${M}p4 = wasm_f32x4_mul(vi${2*M+3}x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2)); 119 $else: 120 vo${M}p${5 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${5 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+3}x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); 121 122 $for M in range(ROW_TILE): 123 $if ACCUMULATORS > 5: 124 vo${M}p5 = wasm_f32x4_mul(vi${2*M+4}x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3)); 125 $else: 126 vo${M}p${6 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${6 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+4}x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); 127 128 $for M in range(ROW_TILE): 129 vo${M}p${7 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${7 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M}x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); 130 131 $for M in range(ROW_TILE): 132 vo${M}p${8 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${8 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+1}x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); 133 134 $for M in range(ROW_TILE): 135 vo${M}p${9 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${9 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+2}x9BDF, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); 136 137 $for M in range(ROW_TILE): 138 vo${M}p${10 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${10 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+3}x9BDF, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); 139 140 $for M in range(ROW_TILE): 141 vo${M}p${11 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${11 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+4}x9BDF, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); 142 143 $for M in range(3 + 2 * ROW_TILE): 144 const v128_t vi${M}x68AC = wasm_v32x4_shuffle(vi${M}x0246, vi${M}x8ACE, 3, 4, 5, 6); 145 vi${M}x0246 = vi${M}x8ACE; 146 147 $for M in range(ROW_TILE): 148 vo${M}p${12 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${12 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M}x68AC, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); 149 150 $for M in range(ROW_TILE): 151 vo${M}p${13 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${13 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+1}x68AC, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); 152 153 $for M in range(ROW_TILE): 154 vo${M}p${14 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${14 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+2}x68AC, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); 155 156 $for M in range(ROW_TILE): 157 vo${M}p${15 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${15 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+3}x68AC, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); 158 159 $for M in range(ROW_TILE): 160 vo${M}p${16 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${16 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+4}x68AC, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); 161 162 $for M in range(3 + 2 * ROW_TILE): 163 const v128_t vi${M}x79BD = wasm_v32x4_shuffle(vi${M}x1357, vi${M}x9BDF, 3, 4, 5, 6); 164 vi${M}x1357 = vi${M}x9BDF; 165 166 $for M in range(3 + 2 * ROW_TILE): 167 const v128_t vi${M}xGHIJ = wasm_v128_load(i${M}); 168 const v128_t vi${M}xKLMN = wasm_v128_load(i${M} + 4); 169 i${M} += 8; 170 171 $for M in range(3 + 2 * ROW_TILE): 172 const v128_t vi${M}xGIKM = wasm_v32x4_shuffle(vi${M}xGHIJ, vi${M}xKLMN, 0, 2, 4, 6); 173 const v128_t vi${M}xHJLN = wasm_v32x4_shuffle(vi${M}xGHIJ, vi${M}xKLMN, 1, 3, 5, 7); 174 175 $for M in range(ROW_TILE): 176 vo${M}p${17 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${17 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M}x79BD, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); 177 178 $for M in range(ROW_TILE): 179 vo${M}p${18 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${18 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+1}x79BD, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); 180 181 $for M in range(ROW_TILE): 182 vo${M}p${19 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${19 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+2}x79BD, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0))); 183 184 $for M in range(ROW_TILE): 185 vo${M}p${20 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${20 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+3}x79BD, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1))); 186 187 $for M in range(ROW_TILE): 188 vo${M}p${21 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${21 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+4}x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2))); 189 190 $for M in range(3 + 2 * ROW_TILE): 191 const v128_t vi${M}xACEG = wasm_v32x4_shuffle(vi${M}x8ACE, vi${M}xGIKM, 1, 2, 3, 4); 192 vi${M}x8ACE = vi${M}xGIKM; 193 vi${M}x9BDF = vi${M}xHJLN; 194 195 $for M in range(ROW_TILE): 196 vo${M}p${22 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${22 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M}xACEG, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); 197 198 $for M in range(ROW_TILE): 199 vo${M}p${23 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${23 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+1}xACEG, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2))); 200 201 $for M in range(ROW_TILE): 202 vo${M}p${24 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${24 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+2}xACEG, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3))); 203 204 $for M in range(ROW_TILE): 205 vo${M}p${25 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${25 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+3}xACEG, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0))); 206 207 $for M in range(ROW_TILE): 208 vo${M}p${26 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${26 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+4}xACEG, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))); 209 210 $if ACCUMULATORS > 1: 211 $ACC_SLICE = 1 212 $while ACC_SLICE < ACCUMULATORS: 213 $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): 214 $if A + ACC_SLICE < ACCUMULATORS: 215 $for M in range(ROW_TILE): 216 vo${M}p${A} = wasm_f32x4_add(vo${M}p${A}, vo${M}p${A + ACC_SLICE}); 217 $ACC_SLICE *= 2 218 219 $if X86: 220 $for M in range(ROW_TILE): 221 v128_t vo${M} = wasm_v128_bitselect(vmin, vo${M}p0, wasm_f32x4_lt(vo${M}p0, vmin)); 222 $for M in range(ROW_TILE): 223 vo${M} = wasm_v128_bitselect(vo${M}, vmax, wasm_f32x4_le(vo${M}, vmax)); 224 $else: 225 $for M in range(ROW_TILE): 226 v128_t vo${M} = wasm_f32x4_max(vo${M}p0, vmin); 227 $for M in range(ROW_TILE): 228 vo${M} = wasm_f32x4_min(vo${M}, vmax); 229 230 $for M in reversed(range(ROW_TILE)): 231 wasm_v128_store(o${M}, vo${M}); o${M} += 4; 232 } 233 // Last block has 1-8 pixels to process. 234 assert(w <= 8 * sizeof(float)); 235 assert(w >= 1 * sizeof(float)); 236 { 237 $for M in range(ROW_TILE): 238 v128_t vo${M}p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); 239 240 $for M in range(3 + 2 * ROW_TILE): 241 vi${M}x8ACE = wasm_v128_and(vmask_even, vi${M}x8ACE); 242 243 $for M in range(3 + 2 * ROW_TILE): 244 vi${M}x9BDF = wasm_v128_and(vmask_odd, vi${M}x9BDF); 245 246 $for M in range(ROW_TILE): 247 $if ACCUMULATORS > 1: 248 v128_t vo${M}p1 = wasm_f32x4_mul(vi${2*M}x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); 249 $else: 250 vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${2*M}x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3))); 251 252 $for M in range(ROW_TILE): 253 $if ACCUMULATORS > 2: 254 v128_t vo${M}p2 = wasm_f32x4_mul(vi${2*M+1}x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); 255 $else: 256 vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${2*M+1}x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); 257 258 $for M in range(ROW_TILE): 259 $if ACCUMULATORS > 3: 260 v128_t vo${M}p3 = wasm_f32x4_mul(vi${2*M+2}x8ACE, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1)); 261 $else: 262 vo${M}p${4 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${4 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+2}x8ACE, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); 263 264 $for M in range(ROW_TILE): 265 $if ACCUMULATORS > 4: 266 v128_t vo${M}p4 = wasm_f32x4_mul(vi${2*M+3}x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2)); 267 $else: 268 vo${M}p${5 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${5 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+3}x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); 269 270 $for M in range(ROW_TILE): 271 $if ACCUMULATORS > 5: 272 vo${M}p5 = wasm_f32x4_mul(vi${2*M+4}x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3)); 273 $else: 274 vo${M}p${6 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${6 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+4}x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); 275 276 $for M in range(ROW_TILE): 277 vo${M}p${7 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${7 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M}x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); 278 279 $for M in range(ROW_TILE): 280 vo${M}p${8 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${8 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+1}x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); 281 282 $for M in range(ROW_TILE): 283 vo${M}p${9 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${9 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+2}x9BDF, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); 284 285 $for M in range(ROW_TILE): 286 vo${M}p${10 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${10 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+3}x9BDF, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); 287 288 $for M in range(ROW_TILE): 289 vo${M}p${11 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${11 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+4}x9BDF, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); 290 291 $for M in range(3 + 2 * ROW_TILE): 292 const v128_t vi${M}x68AC = wasm_v32x4_shuffle(vi${M}x0246, vi${M}x8ACE, 3, 4, 5, 6); 293 294 $for M in range(ROW_TILE): 295 vo${M}p${12 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${12 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M}x68AC, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); 296 297 $for M in range(ROW_TILE): 298 vo${M}p${13 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${13 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+1}x68AC, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); 299 300 $for M in range(ROW_TILE): 301 vo${M}p${14 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${14 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+2}x68AC, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); 302 303 $for M in range(ROW_TILE): 304 vo${M}p${15 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${15 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+3}x68AC, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); 305 306 $for M in range(ROW_TILE): 307 vo${M}p${16 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${16 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+4}x68AC, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); 308 309 $for M in range(3 + 2 * ROW_TILE): 310 const v128_t vi${M}x79BD = wasm_v32x4_shuffle(vi${M}x1357, vi${M}x9BDF, 3, 4, 5, 6); 311 312 $for M in range(ROW_TILE): 313 vo${M}p${17 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${17 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M}x79BD, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); 314 315 $for M in range(ROW_TILE): 316 vo${M}p${18 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${18 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+1}x79BD, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); 317 318 $for M in range(ROW_TILE): 319 vo${M}p${19 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${19 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+2}x79BD, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0))); 320 321 $for M in range(ROW_TILE): 322 vo${M}p${20 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${20 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+3}x79BD, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1))); 323 324 $for M in range(ROW_TILE): 325 vo${M}p${21 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${21 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+4}x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2))); 326 327 $for M in range(3 + 2 * ROW_TILE): 328 const v128_t vi${M}xACEG = wasm_v32x4_shuffle(vi${M}x8ACE, vzero, 1, 2, 3, 4); 329 330 $for M in range(ROW_TILE): 331 vo${M}p${22 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${22 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M}xACEG, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); 332 333 $for M in range(ROW_TILE): 334 vo${M}p${23 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${23 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+1}xACEG, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2))); 335 336 $for M in range(ROW_TILE): 337 vo${M}p${24 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${24 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+2}xACEG, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3))); 338 339 $for M in range(ROW_TILE): 340 vo${M}p${25 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${25 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+3}xACEG, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0))); 341 342 $for M in range(ROW_TILE): 343 vo${M}p${26 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${26 % ACCUMULATORS}, wasm_f32x4_mul(vi${2*M+4}xACEG, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))); 344 345 $if ACCUMULATORS > 1: 346 $ACC_SLICE = 1 347 $while ACC_SLICE < ACCUMULATORS: 348 $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): 349 $if A + ACC_SLICE < ACCUMULATORS: 350 $for M in range(ROW_TILE): 351 vo${M}p${A} = wasm_f32x4_add(vo${M}p${A}, vo${M}p${A + ACC_SLICE}); 352 $ACC_SLICE *= 2 353 354 $if X86: 355 $for M in range(ROW_TILE): 356 v128_t vo${M} = wasm_v128_bitselect(vmin, vo${M}p0, wasm_f32x4_lt(vo${M}p0, vmin)); 357 $for M in range(ROW_TILE): 358 vo${M} = wasm_v128_bitselect(vo${M}, vmax, wasm_f32x4_le(vo${M}, vmax)); 359 $else: 360 $for M in range(ROW_TILE): 361 v128_t vo${M} = wasm_f32x4_max(vo${M}p0, vmin); 362 $for M in range(ROW_TILE): 363 vo${M} = wasm_f32x4_min(vo${M}, vmax); 364 365 size_t w_tmp = (w + 1 * sizeof(float)) / (2 * sizeof(float)); 366 if XNN_LIKELY(w_tmp >= 4) { 367 $for M in reversed(range(ROW_TILE)): 368 wasm_v128_store(o${M}, vo${M}); o${M} += 4; 369 } else { 370 if (w_tmp & 2) { 371 $for M in reversed(range(ROW_TILE)): 372 *((double*) o${M}) = wasm_f64x2_extract_lane(vo${M}, 0); o${M} += 2; 373 374 $for M in range(ROW_TILE): 375 vo${M} = wasm_v32x4_shuffle(vo${M}, vo${M}, 2, 3, 0, 1); 376 } 377 if (w_tmp & 1) { 378 $for M in reversed(range(ROW_TILE)): 379 *o${M} = wasm_f32x4_extract_lane(vo${M}, 0); o${M} += 1; 380 } 381 } 382 } 383 384 i0 = (const float*) ((uintptr_t) i${2 * ROW_TILE} - input_decrement); 385 i1 = (const float*) ((uintptr_t) i${2 * ROW_TILE + 1} - input_decrement); 386 i2 = (const float*) ((uintptr_t) i${2 * ROW_TILE + 2} - input_decrement); 387 $for M in range(3, 3 + 2 * ROW_TILE): 388 i${M} = (const float*) ((uintptr_t) i${M-1} + input_width); 389 390 $if ROW_TILE > 1: 391 o0 = o${ROW_TILE - 1}; 392 $for M in range(1, ROW_TILE): 393 o${M} = (float*) ((uintptr_t) o${M-1} + output_width); 394 395 $if ROW_TILE > 1: 396 output_height = doz(output_height, ${ROW_TILE}); 397 padded_input_height = doz(padded_input_height, ${ROW_TILE * 2}); 398 $else: 399 output_height -= 1; 400 padded_input_height -= 2; 401 } while (output_height != 0); 402} 403