1// Copyright 2020 Google LLC 2// 3// This source code is licensed under the BSD-style license found in the 4// LICENSE file in the root directory of this source tree. 5 6$assert ROW_TILE >= 1 7$assert ACCUMULATORS >= 1 8#include <assert.h> 9 10#include <wasm_simd128.h> 11 12#include <xnnpack/dwconv.h> 13#include <xnnpack/math.h> 14 15 16$ARCH_SUFFIX = "_x86" if X86 else "_arm" 17 18void xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd${ARCH_SUFFIX}_loadsplat_${ROW_TILE}x4${"_acc%d" % ACCUMULATORS if ACCUMULATORS > 1 else ""}( 19 size_t input_height, 20 size_t input_width, 21 const float* input, 22 const float* weights, 23 const float* zero, 24 float* output, 25 uint32_t padding_top, 26 const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) 27{ 28 assert(input_height != 0); 29 assert(input_width != 0); 30 assert(input_width % sizeof(float) == 0); 31 assert(padding_top == 2); 32 33 const v128_t vmask = wasm_v128_load(params->scalar.mask); 34 const v128_t vmax = wasm_v32x4_load_splat(¶ms->scalar.max); 35 const v128_t vmin = wasm_v32x4_load_splat(¶ms->scalar.min); 36 37 const v128_t vw0123 = wasm_v128_load(weights); 38 const v128_t vw4567 = wasm_v128_load(weights + 4); 39 const v128_t vw89AB = wasm_v128_load(weights + 8); 40 const v128_t vwCDEF = wasm_v128_load(weights + 12); 41 const v128_t vwGHIJ = wasm_v128_load(weights + 16); 42 const v128_t vwKLMN = wasm_v128_load(weights + 20); 43 const v128_t vwOP = wasm_v64x2_load_splat(weights + 24); 44 const v128_t vbias = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); 45 const v128_t vk00 = wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1); 46 const v128_t vk01 = wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2); 47 const v128_t vk02 = wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3); 48 const v128_t vk03 = wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0); 49 const v128_t vk04 = wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1); 50 const v128_t vk10 = wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2); 51 const v128_t vk11 = wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3); 52 const v128_t vk12 = wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0); 53 const v128_t vk13 = wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1); 54 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); 55 const v128_t vk20 = wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3); 56 const v128_t vk21 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0); 57 const v128_t vk22 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1); 58 const v128_t vk23 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2); 59 const v128_t vk24 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3); 60 const v128_t vk30 = wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0); 61 const v128_t vk31 = wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1); 62 const v128_t vk32 = wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2); 63 const v128_t vk33 = wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3); 64 const v128_t vk34 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0); 65 const v128_t vk40 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1); 66 const v128_t vk41 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2); 67 const v128_t vk42 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3); 68 const v128_t vk43 = wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0); 69 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); 70 71 const v128_t vzero = wasm_f32x4_splat(0.0f); 72 73 const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float)); 74 75 const float* i0 = zero; 76 const float* i1 = zero; 77 const float* i2 = input; 78 $for M in range(3, 4 + ROW_TILE): 79 const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_width); 80 81 float* o0 = output; 82 $for M in range(1, ROW_TILE): 83 float* o${M} = (float*) ((uintptr_t) o${M-1} + input_width); 84 85 size_t output_height = input_height; 86 do { 87 $for M in range(2, 3 + ROW_TILE): 88 if XNN_UNPREDICTABLE(output_height < ${M}) { 89 i${M+1} = zero; 90 $if M <= ROW_TILE: 91 o${M-1} = o${M-2}; 92 } 93 94 $for M in range(4 + ROW_TILE): 95 v128_t vi${M}x0123 = vzero; 96 97 $for M in range(4 + ROW_TILE): 98 v128_t vi${M}x4567 = wasm_v128_load(i${M}); i${M} += 4; 99 100 size_t w = input_width; 101 for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) { 102 $for M in range(ROW_TILE): 103 v128_t vo${M}p0 = vbias; 104 105 $for M in range(4 + ROW_TILE): 106 const v128_t vi${M}x89AB = wasm_v128_load(i${M}); i${M} += 4; 107 108 $for M in range(ROW_TILE): 109 $if ACCUMULATORS > 1: 110 v128_t vo${M}p1 = wasm_f32x4_mul(vi${M}x4567, vk02); 111 $else: 112 vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${M}x4567, vk02)); 113 114 $for M in range(ROW_TILE): 115 $if ACCUMULATORS > 2: 116 v128_t vo${M}p2 = wasm_f32x4_mul(vi${M+1}x4567, vk12); 117 $else: 118 vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${M+1}x4567, vk12)); 119 120 $for M in range(ROW_TILE): 121 $if ACCUMULATORS > 3: 122 v128_t vo${M}p3 = wasm_f32x4_mul(vi${M+2}x4567, vk22); 123 $else: 124 vo${M}p${4 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${4 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x4567, vk22)); 125 126 $for M in range(ROW_TILE): 127 $if ACCUMULATORS > 4: 128 v128_t vo${M}p4 = wasm_f32x4_mul(vi${M+3}x4567, vk32); 129 $else: 130 vo${M}p${5 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${5 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x4567, vk32)); 131 132 $for M in range(ROW_TILE): 133 $if ACCUMULATORS > 6: 134 v128_t vo${M}p5 = wasm_f32x4_mul(vi${M+4}x4567, vk42); 135 $else: 136 vo${M}p${6 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${6 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x4567, vk42)); 137 138 $for M in range(4 + ROW_TILE): 139 const v128_t vi${M}x3456 = wasm_v32x4_shuffle(vi${M}x0123, vi${M}x4567, 3, 4, 5, 6); 140 141 $for M in range(ROW_TILE): 142 vo${M}p${7 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${7 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x3456, vk01)); 143 144 $for M in range(ROW_TILE): 145 vo${M}p${8 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${8 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x3456, vk11)); 146 147 $for M in range(ROW_TILE): 148 vo${M}p${9 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${9 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x3456, vk21)); 149 150 $for M in range(ROW_TILE): 151 vo${M}p${10 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${10 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x3456, vk31)); 152 153 $for M in range(ROW_TILE): 154 vo${M}p${11 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${11 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x3456, vk41)); 155 156 $for M in range(4 + ROW_TILE): 157 const v128_t vi${M}x2345 = wasm_v32x4_shuffle(vi${M}x0123, vi${M}x4567, 2, 3, 4, 5); 158 vi${M}x0123 = vi${M}x4567; 159 160 $for M in range(ROW_TILE): 161 vo${M}p${12 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${12 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x2345, vk00)); 162 163 $for M in range(ROW_TILE): 164 vo${M}p${13 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${13 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x2345, vk10)); 165 166 $for M in range(ROW_TILE): 167 vo${M}p${14 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${14 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x2345, vk20)); 168 169 $for M in range(ROW_TILE): 170 vo${M}p${15 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${15 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x2345, vk30)); 171 172 $for M in range(ROW_TILE): 173 vo${M}p${16 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${16 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x2345, vk40)); 174 175 $for M in range(4 + ROW_TILE): 176 const v128_t vi${M}x5678 = wasm_v32x4_shuffle(vi${M}x4567, vi${M}x89AB, 1, 2, 3, 4); 177 178 $for M in range(ROW_TILE): 179 vo${M}p${17 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${17 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x5678, vk03)); 180 181 $for M in range(ROW_TILE): 182 vo${M}p${18 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${18 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x5678, vk13)); 183 184 $for M in range(ROW_TILE): 185 vo${M}p${19 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${19 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x5678, vk23)); 186 187 $for M in range(ROW_TILE): 188 vo${M}p${20 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${20 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x5678, vk33)); 189 190 $for M in range(ROW_TILE): 191 vo${M}p${21 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${21 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x5678, vk43)); 192 193 $for M in range(4 + ROW_TILE): 194 const v128_t vi${M}x6789 = wasm_v32x4_shuffle(vi${M}x4567, vi${M}x89AB, 2, 3, 4, 5); 195 vi${M}x4567 = vi${M}x89AB; 196 197 $for M in range(ROW_TILE): 198 vo${M}p${22 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${22 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x6789, vk04)); 199 200 $for M in range(ROW_TILE): 201 vo${M}p${23 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${23 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x6789, vk14)); 202 203 $for M in range(ROW_TILE): 204 vo${M}p${24 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${24 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x6789, vk24)); 205 206 $for M in range(ROW_TILE): 207 vo${M}p${25 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${25 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x6789, vk34)); 208 209 $for M in range(ROW_TILE): 210 vo${M}p${26 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${26 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x6789, vk44)); 211 212 $if ACCUMULATORS > 1: 213 $ACC_SLICE = 1 214 $while ACC_SLICE < ACCUMULATORS: 215 $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): 216 $if A + ACC_SLICE < ACCUMULATORS: 217 $for M in range(ROW_TILE): 218 vo${M}p${A} = wasm_f32x4_add(vo${M}p${A}, vo${M}p${A + ACC_SLICE}); 219 $ACC_SLICE *= 2 220 221 $if X86: 222 $for M in range(ROW_TILE): 223 v128_t vo${M} = wasm_v128_bitselect(vmin, vo${M}p0, wasm_f32x4_lt(vo${M}p0, vmin)); 224 $for M in range(ROW_TILE): 225 vo${M} = wasm_v128_bitselect(vo${M}, vmax, wasm_f32x4_le(vo${M}, vmax)); 226 $else: 227 $for M in range(ROW_TILE): 228 v128_t vo${M} = wasm_f32x4_max(vo${M}p0, vmin); 229 $for M in range(ROW_TILE): 230 vo${M} = wasm_f32x4_min(vo${M}, vmax); 231 232 $for M in reversed(range(ROW_TILE)): 233 wasm_v128_store(o${M}, vo${M}); o${M} += 4; 234 } 235 // Always process the last block of 5..8 pixels. 236 if XNN_LIKELY(w > 4 * sizeof(float)) { 237 $for M in range(ROW_TILE): 238 v128_t vo${M}p0 = vbias; 239 240 $for M in range(4 + ROW_TILE): 241 v128_t vi${M}x89AB = wasm_v128_load(i${M}); i${M} += 4; 242 243 $for M in range(4 + ROW_TILE): 244 vi${M}x89AB = wasm_v128_and(vmask, vi${M}x89AB); 245 246 $for M in range(ROW_TILE): 247 $if ACCUMULATORS > 1: 248 v128_t vo${M}p1 = wasm_f32x4_mul(vi${M}x4567, vk02); 249 $else: 250 vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${M}x4567, vk02)); 251 252 $for M in range(ROW_TILE): 253 $if ACCUMULATORS > 2: 254 v128_t vo${M}p2 = wasm_f32x4_mul(vi${M+1}x4567, vk12); 255 $else: 256 vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${M+1}x4567, vk12)); 257 258 $for M in range(ROW_TILE): 259 $if ACCUMULATORS > 3: 260 v128_t vo${M}p3 = wasm_f32x4_mul(vi${M+2}x4567, vk22); 261 $else: 262 vo${M}p${4 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${4 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x4567, vk22)); 263 264 $for M in range(ROW_TILE): 265 $if ACCUMULATORS > 4: 266 v128_t vo${M}p4 = wasm_f32x4_mul(vi${M+3}x4567, vk32); 267 $else: 268 vo${M}p${5 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${5 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x4567, vk32)); 269 270 $for M in range(ROW_TILE): 271 $if ACCUMULATORS > 6: 272 v128_t vo${M}p5 = wasm_f32x4_mul(vi${M+4}x4567, vk42); 273 $else: 274 vo${M}p${6 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${6 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x4567, vk42)); 275 276 $for M in range(4 + ROW_TILE): 277 const v128_t vi${M}x3456 = wasm_v32x4_shuffle(vi${M}x0123, vi${M}x4567, 3, 4, 5, 6); 278 279 $for M in range(ROW_TILE): 280 vo${M}p${7 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${7 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x3456, vk01)); 281 282 $for M in range(ROW_TILE): 283 vo${M}p${8 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${8 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x3456, vk11)); 284 285 $for M in range(ROW_TILE): 286 vo${M}p${9 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${9 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x3456, vk21)); 287 288 $for M in range(ROW_TILE): 289 vo${M}p${10 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${10 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x3456, vk31)); 290 291 $for M in range(ROW_TILE): 292 vo${M}p${11 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${11 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x3456, vk41)); 293 294 $for M in range(4 + ROW_TILE): 295 const v128_t vi${M}x2345 = wasm_v32x4_shuffle(vi${M}x0123, vi${M}x4567, 2, 3, 4, 5); 296 vi${M}x0123 = vi${M}x4567; 297 298 $for M in range(ROW_TILE): 299 vo${M}p${12 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${12 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x2345, vk00)); 300 301 $for M in range(ROW_TILE): 302 vo${M}p${13 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${13 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x2345, vk10)); 303 304 $for M in range(ROW_TILE): 305 vo${M}p${14 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${14 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x2345, vk20)); 306 307 $for M in range(ROW_TILE): 308 vo${M}p${15 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${15 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x2345, vk30)); 309 310 $for M in range(ROW_TILE): 311 vo${M}p${16 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${16 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x2345, vk40)); 312 313 $for M in range(4 + ROW_TILE): 314 const v128_t vi${M}x5678 = wasm_v32x4_shuffle(vi${M}x4567, vi${M}x89AB, 1, 2, 3, 4); 315 316 $for M in range(ROW_TILE): 317 vo${M}p${17 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${17 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x5678, vk03)); 318 319 $for M in range(ROW_TILE): 320 vo${M}p${18 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${18 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x5678, vk13)); 321 322 $for M in range(ROW_TILE): 323 vo${M}p${19 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${19 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x5678, vk23)); 324 325 $for M in range(ROW_TILE): 326 vo${M}p${20 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${20 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x5678, vk33)); 327 328 $for M in range(ROW_TILE): 329 vo${M}p${21 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${21 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x5678, vk43)); 330 331 $for M in range(4 + ROW_TILE): 332 const v128_t vi${M}x6789 = wasm_v32x4_shuffle(vi${M}x4567, vi${M}x89AB, 2, 3, 4, 5); 333 vi${M}x4567 = vi${M}x89AB; 334 335 $for M in range(ROW_TILE): 336 vo${M}p${22 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${22 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x6789, vk04)); 337 338 $for M in range(ROW_TILE): 339 vo${M}p${23 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${23 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x6789, vk14)); 340 341 $for M in range(ROW_TILE): 342 vo${M}p${24 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${24 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x6789, vk24)); 343 344 $for M in range(ROW_TILE): 345 vo${M}p${25 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${25 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x6789, vk34)); 346 347 $for M in range(ROW_TILE): 348 vo${M}p${26 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${26 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x6789, vk44)); 349 350 $if ACCUMULATORS > 1: 351 $ACC_SLICE = 1 352 $while ACC_SLICE < ACCUMULATORS: 353 $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): 354 $if A + ACC_SLICE < ACCUMULATORS: 355 $for M in range(ROW_TILE): 356 vo${M}p${A} = wasm_f32x4_add(vo${M}p${A}, vo${M}p${A + ACC_SLICE}); 357 $ACC_SLICE *= 2 358 359 $if X86: 360 $for M in range(ROW_TILE): 361 v128_t vo${M} = wasm_v128_bitselect(vmin, vo${M}p0, wasm_f32x4_lt(vo${M}p0, vmin)); 362 $for M in range(ROW_TILE): 363 vo${M} = wasm_v128_bitselect(vo${M}, vmax, wasm_f32x4_le(vo${M}, vmax)); 364 $else: 365 $for M in range(ROW_TILE): 366 v128_t vo${M} = wasm_f32x4_max(vo${M}p0, vmin); 367 $for M in range(ROW_TILE): 368 vo${M} = wasm_f32x4_min(vo${M}, vmax); 369 370 $for M in reversed(range(ROW_TILE)): 371 wasm_v128_store(o${M}, vo${M}); o${M} += 4; 372 373 w -= 4 * sizeof(float); 374 } 375 assert(w >= 1 * sizeof(float)); 376 assert(w <= 4 * sizeof(float)); 377 { 378 $for M in range(ROW_TILE): 379 v128_t vo${M}p0 = vbias; 380 381 $for M in range(4 + ROW_TILE): 382 vi${M}x4567 = wasm_v128_and(vmask, vi${M}x4567); 383 384 $for M in range(ROW_TILE): 385 $if ACCUMULATORS > 1: 386 v128_t vo${M}p1 = wasm_f32x4_mul(vi${M}x4567, vk02); 387 $else: 388 vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${M}x4567, vk02)); 389 390 $for M in range(ROW_TILE): 391 $if ACCUMULATORS > 2: 392 v128_t vo${M}p2 = wasm_f32x4_mul(vi${M+1}x4567, vk12); 393 $else: 394 vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${M+1}x4567, vk12)); 395 396 $for M in range(ROW_TILE): 397 $if ACCUMULATORS > 3: 398 v128_t vo${M}p3 = wasm_f32x4_mul(vi${M+2}x4567, vk22); 399 $else: 400 vo${M}p${4 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${4 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x4567, vk22)); 401 402 $for M in range(ROW_TILE): 403 $if ACCUMULATORS > 4: 404 v128_t vo${M}p4 = wasm_f32x4_mul(vi${M+3}x4567, vk32); 405 $else: 406 vo${M}p${5 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${5 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x4567, vk32)); 407 408 $for M in range(ROW_TILE): 409 $if ACCUMULATORS > 6: 410 v128_t vo${M}p5 = wasm_f32x4_mul(vi${M+4}x4567, vk42); 411 $else: 412 vo${M}p${6 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${6 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x4567, vk42)); 413 414 $for M in range(4 + ROW_TILE): 415 const v128_t vi${M}x3456 = wasm_v32x4_shuffle(vi${M}x0123, vi${M}x4567, 3, 4, 5, 6); 416 417 $for M in range(ROW_TILE): 418 vo${M}p${7 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${7 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x3456, vk01)); 419 420 $for M in range(ROW_TILE): 421 vo${M}p${8 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${8 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x3456, vk11)); 422 423 $for M in range(ROW_TILE): 424 vo${M}p${9 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${9 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x3456, vk21)); 425 426 $for M in range(ROW_TILE): 427 vo${M}p${10 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${10 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x3456, vk31)); 428 429 $for M in range(ROW_TILE): 430 vo${M}p${11 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${11 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x3456, vk41)); 431 432 $for M in range(4 + ROW_TILE): 433 const v128_t vi${M}x2345 = wasm_v32x4_shuffle(vi${M}x0123, vi${M}x4567, 2, 3, 4, 5); 434 435 $for M in range(ROW_TILE): 436 vo${M}p${12 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${12 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x2345, vk00)); 437 438 $for M in range(ROW_TILE): 439 vo${M}p${13 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${13 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x2345, vk10)); 440 441 $for M in range(ROW_TILE): 442 vo${M}p${14 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${14 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x2345, vk20)); 443 444 $for M in range(ROW_TILE): 445 vo${M}p${15 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${15 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x2345, vk30)); 446 447 $for M in range(ROW_TILE): 448 vo${M}p${16 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${16 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x2345, vk40)); 449 450 $for M in range(4 + ROW_TILE): 451 const v128_t vi${M}x5678 = wasm_v32x4_shuffle(vi${M}x4567, vzero, 1, 2, 3, 4); 452 453 $for M in range(ROW_TILE): 454 vo${M}p${17 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${17 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x5678, vk03)); 455 456 $for M in range(ROW_TILE): 457 vo${M}p${18 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${18 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x5678, vk13)); 458 459 $for M in range(ROW_TILE): 460 vo${M}p${19 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${19 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x5678, vk23)); 461 462 $for M in range(ROW_TILE): 463 vo${M}p${20 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${20 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x5678, vk33)); 464 465 $for M in range(ROW_TILE): 466 vo${M}p${21 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${21 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x5678, vk43)); 467 468 $for M in range(4 + ROW_TILE): 469 const v128_t vi${M}x6789 = wasm_v32x4_shuffle(vi${M}x5678, vzero, 1, 2, 3, 4); 470 471 $for M in range(ROW_TILE): 472 vo${M}p${22 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${22 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x6789, vk04)); 473 474 $for M in range(ROW_TILE): 475 vo${M}p${23 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${23 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x6789, vk14)); 476 477 $for M in range(ROW_TILE): 478 vo${M}p${24 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${24 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x6789, vk24)); 479 480 $for M in range(ROW_TILE): 481 vo${M}p${25 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${25 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x6789, vk34)); 482 483 $for M in range(ROW_TILE): 484 vo${M}p${26 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${26 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x6789, vk44)); 485 486 $if ACCUMULATORS > 1: 487 $ACC_SLICE = 1 488 $while ACC_SLICE < ACCUMULATORS: 489 $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): 490 $if A + ACC_SLICE < ACCUMULATORS: 491 $for M in range(ROW_TILE): 492 vo${M}p${A} = wasm_f32x4_add(vo${M}p${A}, vo${M}p${A + ACC_SLICE}); 493 $ACC_SLICE *= 2 494 495 $if X86: 496 $for M in range(ROW_TILE): 497 v128_t vo${M} = wasm_v128_bitselect(vmin, vo${M}p0, wasm_f32x4_lt(vo${M}p0, vmin)); 498 $for M in range(ROW_TILE): 499 vo${M} = wasm_v128_bitselect(vo${M}, vmax, wasm_f32x4_le(vo${M}, vmax)); 500 $else: 501 $for M in range(ROW_TILE): 502 v128_t vo${M} = wasm_f32x4_max(vo${M}p0, vmin); 503 $for M in range(ROW_TILE): 504 vo${M} = wasm_f32x4_min(vo${M}, vmax); 505 506 if XNN_LIKELY(w & (4 * sizeof(float))) { 507 $for M in reversed(range(ROW_TILE)): 508 wasm_v128_store(o${M}, vo${M}); o${M} += 4; 509 } else { 510 if (w & (2 * sizeof(float))) { 511 $for M in reversed(range(ROW_TILE)): 512 *((double*) o${M}) = wasm_f64x2_extract_lane(vo${M}, 0); o${M} += 2; 513 514 $for M in range(ROW_TILE): 515 vo${M} = wasm_v32x4_shuffle(vo${M}, vo${M}, 2, 3, 0, 1); 516 } 517 if (w & (1 * sizeof(float))) { 518 $for M in reversed(range(ROW_TILE)): 519 *o${M} = wasm_f32x4_extract_lane(vo${M}, 0); o${M} += 1; 520 } 521 } 522 } 523 524 i0 = (const float*) ((uintptr_t) i${ROW_TILE} - input_decrement); 525 i1 = (const float*) ((uintptr_t) i${ROW_TILE+1} - input_decrement); 526 $for M in range(2, 4 + ROW_TILE): 527 i${M} = (const float*) ((uintptr_t) i${M-1} + input_width); 528 529 $if ROW_TILE > 1: 530 o0 = o${ROW_TILE - 1}; 531 $for M in range(1, ROW_TILE): 532 o${M} = (float*) ((uintptr_t) o${M-1} + input_width); 533 534 $if ROW_TILE > 1: 535 output_height = doz(output_height, ${ROW_TILE}); 536 } while (${"--" if ROW_TILE == 1 else ""}output_height != 0); 537} 538