1// Copyright 2020 Google LLC 2// 3// This source code is licensed under the BSD-style license found in the 4// LICENSE file in the root directory of this source tree. 5 6$assert ROW_TILE >= 1 7$assert ACCUMULATORS >= 1 8#include <assert.h> 9 10#include <wasm_simd128.h> 11 12#include <xnnpack/dwconv.h> 13#include <xnnpack/math.h> 14 15 16$ARCH_SUFFIX = "_x86" if X86 else "_arm" 17 18void xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd${ARCH_SUFFIX}_splat_${ROW_TILE}x4${"_acc%d" % ACCUMULATORS if ACCUMULATORS > 1 else ""}( 19 size_t input_height, 20 size_t input_width, 21 const float* input, 22 const float* weights, 23 const float* zero, 24 float* output, 25 uint32_t padding_top, 26 const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) 27{ 28 assert(input_height != 0); 29 assert(input_width != 0); 30 assert(input_width % sizeof(float) == 0); 31 assert(padding_top == 2); 32 33 const v128_t vmask = wasm_v128_load(params->scalar.mask); 34 const v128_t vmax = wasm_v32x4_load_splat(¶ms->scalar.max); 35 const v128_t vmin = wasm_v32x4_load_splat(¶ms->scalar.min); 36 37 const v128_t vw0123 = wasm_v128_load(weights); 38 const v128_t vw4567 = wasm_v128_load(weights + 4); 39 const v128_t vw89AB = wasm_v128_load(weights + 8); 40 const v128_t vwCDEF = wasm_v128_load(weights + 12); 41 const v128_t vwGHIJ = wasm_v128_load(weights + 16); 42 const v128_t vwKLMN = wasm_v128_load(weights + 20); 43 const v128_t vwOP = wasm_v64x2_load_splat(weights + 24); 44 45 const v128_t vzero = wasm_f32x4_splat(0.0f); 46 47 const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float)); 48 49 const float* i0 = zero; 50 const float* i1 = zero; 51 const float* i2 = input; 52 $for M in range(3, 4 + ROW_TILE): 53 const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_width); 54 55 float* o0 = output; 56 $for M in range(1, ROW_TILE): 57 float* o${M} = (float*) ((uintptr_t) o${M-1} + input_width); 58 59 size_t output_height = input_height; 60 do { 61 $for M in range(2, 3 + ROW_TILE): 62 if XNN_UNPREDICTABLE(output_height < ${M}) { 63 i${M+1} = zero; 64 $if M <= ROW_TILE: 65 o${M-1} = o${M-2}; 66 } 67 68 $for M in range(4 + ROW_TILE): 69 v128_t vi${M}x0123 = vzero; 70 71 $for M in range(4 + ROW_TILE): 72 v128_t vi${M}x4567 = wasm_v128_load(i${M}); i${M} += 4; 73 74 size_t w = input_width; 75 for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) { 76 $for M in range(ROW_TILE): 77 v128_t vo${M}p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); 78 79 $for M in range(4 + ROW_TILE): 80 const v128_t vi${M}x89AB = wasm_v128_load(i${M}); i${M} += 4; 81 82 $for M in range(ROW_TILE): 83 $if ACCUMULATORS > 1: 84 v128_t vo${M}p1 = wasm_f32x4_mul(vi${M}x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); 85 $else: 86 vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${M}x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3))); 87 88 $for M in range(ROW_TILE): 89 $if ACCUMULATORS > 2: 90 v128_t vo${M}p2 = wasm_f32x4_mul(vi${M+1}x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); 91 $else: 92 vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${M+1}x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); 93 94 $for M in range(ROW_TILE): 95 $if ACCUMULATORS > 3: 96 v128_t vo${M}p3 = wasm_f32x4_mul(vi${M+2}x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1)); 97 $else: 98 vo${M}p${4 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${4 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); 99 100 $for M in range(ROW_TILE): 101 $if ACCUMULATORS > 4: 102 v128_t vo${M}p4 = wasm_f32x4_mul(vi${M+3}x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2)); 103 $else: 104 vo${M}p${5 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${5 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); 105 106 $for M in range(ROW_TILE): 107 $if ACCUMULATORS > 6: 108 v128_t vo${M}p5 = wasm_f32x4_mul(vi${M+4}x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3)); 109 $else: 110 vo${M}p${6 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${6 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); 111 112 $for M in range(4 + ROW_TILE): 113 const v128_t vi${M}x3456 = wasm_v32x4_shuffle(vi${M}x0123, vi${M}x4567, 3, 4, 5, 6); 114 115 $for M in range(ROW_TILE): 116 vo${M}p${7 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${7 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); 117 118 $for M in range(ROW_TILE): 119 vo${M}p${8 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${8 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); 120 121 $for M in range(ROW_TILE): 122 vo${M}p${9 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${9 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0))); 123 124 $for M in range(ROW_TILE): 125 vo${M}p${10 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${10 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1))); 126 127 $for M in range(ROW_TILE): 128 vo${M}p${11 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${11 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2))); 129 130 $for M in range(4 + ROW_TILE): 131 const v128_t vi${M}x2345 = wasm_v32x4_shuffle(vi${M}x0123, vi${M}x4567, 2, 3, 4, 5); 132 vi${M}x0123 = vi${M}x4567; 133 134 $for M in range(ROW_TILE): 135 vo${M}p${12 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${12 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x2345, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); 136 137 $for M in range(ROW_TILE): 138 vo${M}p${13 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${13 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); 139 140 $for M in range(ROW_TILE): 141 vo${M}p${14 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${14 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); 142 143 $for M in range(ROW_TILE): 144 vo${M}p${15 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${15 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); 145 146 $for M in range(ROW_TILE): 147 vo${M}p${16 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${16 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); 148 149 $for M in range(4 + ROW_TILE): 150 const v128_t vi${M}x5678 = wasm_v32x4_shuffle(vi${M}x4567, vi${M}x89AB, 1, 2, 3, 4); 151 152 $for M in range(ROW_TILE): 153 vo${M}p${17 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${17 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); 154 155 $for M in range(ROW_TILE): 156 vo${M}p${18 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${18 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); 157 158 $for M in range(ROW_TILE): 159 vo${M}p${19 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${19 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); 160 161 $for M in range(ROW_TILE): 162 vo${M}p${20 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${20 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); 163 164 $for M in range(ROW_TILE): 165 vo${M}p${21 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${21 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); 166 167 $for M in range(4 + ROW_TILE): 168 const v128_t vi${M}x6789 = wasm_v32x4_shuffle(vi${M}x4567, vi${M}x89AB, 2, 3, 4, 5); 169 vi${M}x4567 = vi${M}x89AB; 170 171 $for M in range(ROW_TILE): 172 vo${M}p${22 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${22 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); 173 174 $for M in range(ROW_TILE): 175 vo${M}p${23 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${23 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2))); 176 177 $for M in range(ROW_TILE): 178 vo${M}p${24 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${24 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3))); 179 180 $for M in range(ROW_TILE): 181 vo${M}p${25 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${25 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0))); 182 183 $for M in range(ROW_TILE): 184 vo${M}p${26 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${26 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))); 185 186 $if ACCUMULATORS > 1: 187 $ACC_SLICE = 1 188 $while ACC_SLICE < ACCUMULATORS: 189 $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): 190 $if A + ACC_SLICE < ACCUMULATORS: 191 $for M in range(ROW_TILE): 192 vo${M}p${A} = wasm_f32x4_add(vo${M}p${A}, vo${M}p${A + ACC_SLICE}); 193 $ACC_SLICE *= 2 194 195 $if X86: 196 $for M in range(ROW_TILE): 197 v128_t vo${M} = wasm_v128_bitselect(vmin, vo${M}p0, wasm_f32x4_lt(vo${M}p0, vmin)); 198 $for M in range(ROW_TILE): 199 vo${M} = wasm_v128_bitselect(vo${M}, vmax, wasm_f32x4_le(vo${M}, vmax)); 200 $else: 201 $for M in range(ROW_TILE): 202 v128_t vo${M} = wasm_f32x4_max(vo${M}p0, vmin); 203 $for M in range(ROW_TILE): 204 vo${M} = wasm_f32x4_min(vo${M}, vmax); 205 206 $for M in reversed(range(ROW_TILE)): 207 wasm_v128_store(o${M}, vo${M}); o${M} += 4; 208 } 209 // Always process the last block of 5..8 pixels. 210 if XNN_LIKELY(w > 4 * sizeof(float)) { 211 $for M in range(ROW_TILE): 212 v128_t vo${M}p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); 213 214 $for M in range(4 + ROW_TILE): 215 v128_t vi${M}x89AB = wasm_v128_load(i${M}); i${M} += 4; 216 217 $for M in range(4 + ROW_TILE): 218 vi${M}x89AB = wasm_v128_and(vmask, vi${M}x89AB); 219 220 $for M in range(ROW_TILE): 221 $if ACCUMULATORS > 1: 222 v128_t vo${M}p1 = wasm_f32x4_mul(vi${M}x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); 223 $else: 224 vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${M}x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3))); 225 226 $for M in range(ROW_TILE): 227 $if ACCUMULATORS > 2: 228 v128_t vo${M}p2 = wasm_f32x4_mul(vi${M+1}x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); 229 $else: 230 vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${M+1}x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); 231 232 $for M in range(ROW_TILE): 233 $if ACCUMULATORS > 3: 234 v128_t vo${M}p3 = wasm_f32x4_mul(vi${M+2}x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1)); 235 $else: 236 vo${M}p${4 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${4 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); 237 238 $for M in range(ROW_TILE): 239 $if ACCUMULATORS > 4: 240 v128_t vo${M}p4 = wasm_f32x4_mul(vi${M+3}x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2)); 241 $else: 242 vo${M}p${5 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${5 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); 243 244 $for M in range(ROW_TILE): 245 $if ACCUMULATORS > 6: 246 v128_t vo${M}p5 = wasm_f32x4_mul(vi${M+4}x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3)); 247 $else: 248 vo${M}p${6 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${6 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); 249 250 $for M in range(4 + ROW_TILE): 251 const v128_t vi${M}x3456 = wasm_v32x4_shuffle(vi${M}x0123, vi${M}x4567, 3, 4, 5, 6); 252 253 $for M in range(ROW_TILE): 254 vo${M}p${7 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${7 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); 255 256 $for M in range(ROW_TILE): 257 vo${M}p${8 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${8 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); 258 259 $for M in range(ROW_TILE): 260 vo${M}p${9 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${9 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0))); 261 262 $for M in range(ROW_TILE): 263 vo${M}p${10 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${10 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1))); 264 265 $for M in range(ROW_TILE): 266 vo${M}p${11 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${11 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2))); 267 268 $for M in range(4 + ROW_TILE): 269 const v128_t vi${M}x2345 = wasm_v32x4_shuffle(vi${M}x0123, vi${M}x4567, 2, 3, 4, 5); 270 vi${M}x0123 = vi${M}x4567; 271 272 $for M in range(ROW_TILE): 273 vo${M}p${12 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${12 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x2345, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); 274 275 $for M in range(ROW_TILE): 276 vo${M}p${13 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${13 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); 277 278 $for M in range(ROW_TILE): 279 vo${M}p${14 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${14 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); 280 281 $for M in range(ROW_TILE): 282 vo${M}p${15 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${15 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); 283 284 $for M in range(ROW_TILE): 285 vo${M}p${16 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${16 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); 286 287 $for M in range(4 + ROW_TILE): 288 const v128_t vi${M}x5678 = wasm_v32x4_shuffle(vi${M}x4567, vi${M}x89AB, 1, 2, 3, 4); 289 290 $for M in range(ROW_TILE): 291 vo${M}p${17 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${17 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); 292 293 $for M in range(ROW_TILE): 294 vo${M}p${18 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${18 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); 295 296 $for M in range(ROW_TILE): 297 vo${M}p${19 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${19 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); 298 299 $for M in range(ROW_TILE): 300 vo${M}p${20 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${20 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); 301 302 $for M in range(ROW_TILE): 303 vo${M}p${21 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${21 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); 304 305 $for M in range(4 + ROW_TILE): 306 const v128_t vi${M}x6789 = wasm_v32x4_shuffle(vi${M}x4567, vi${M}x89AB, 2, 3, 4, 5); 307 vi${M}x4567 = vi${M}x89AB; 308 309 $for M in range(ROW_TILE): 310 vo${M}p${22 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${22 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); 311 312 $for M in range(ROW_TILE): 313 vo${M}p${23 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${23 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2))); 314 315 $for M in range(ROW_TILE): 316 vo${M}p${24 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${24 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3))); 317 318 $for M in range(ROW_TILE): 319 vo${M}p${25 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${25 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0))); 320 321 $for M in range(ROW_TILE): 322 vo${M}p${26 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${26 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))); 323 324 $if ACCUMULATORS > 1: 325 $ACC_SLICE = 1 326 $while ACC_SLICE < ACCUMULATORS: 327 $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): 328 $if A + ACC_SLICE < ACCUMULATORS: 329 $for M in range(ROW_TILE): 330 vo${M}p${A} = wasm_f32x4_add(vo${M}p${A}, vo${M}p${A + ACC_SLICE}); 331 $ACC_SLICE *= 2 332 333 $if X86: 334 $for M in range(ROW_TILE): 335 v128_t vo${M} = wasm_v128_bitselect(vmin, vo${M}p0, wasm_f32x4_lt(vo${M}p0, vmin)); 336 $for M in range(ROW_TILE): 337 vo${M} = wasm_v128_bitselect(vo${M}, vmax, wasm_f32x4_le(vo${M}, vmax)); 338 $else: 339 $for M in range(ROW_TILE): 340 v128_t vo${M} = wasm_f32x4_max(vo${M}p0, vmin); 341 $for M in range(ROW_TILE): 342 vo${M} = wasm_f32x4_min(vo${M}, vmax); 343 344 $for M in reversed(range(ROW_TILE)): 345 wasm_v128_store(o${M}, vo${M}); o${M} += 4; 346 347 w -= 4 * sizeof(float); 348 } 349 assert(w >= 1 * sizeof(float)); 350 assert(w <= 4 * sizeof(float)); 351 { 352 $for M in range(ROW_TILE): 353 v128_t vo${M}p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); 354 355 $for M in range(4 + ROW_TILE): 356 vi${M}x4567 = wasm_v128_and(vmask, vi${M}x4567); 357 358 $for M in range(ROW_TILE): 359 $if ACCUMULATORS > 1: 360 v128_t vo${M}p1 = wasm_f32x4_mul(vi${M}x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); 361 $else: 362 vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${M}x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3))); 363 364 $for M in range(ROW_TILE): 365 $if ACCUMULATORS > 2: 366 v128_t vo${M}p2 = wasm_f32x4_mul(vi${M+1}x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); 367 $else: 368 vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${M+1}x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); 369 370 $for M in range(ROW_TILE): 371 $if ACCUMULATORS > 3: 372 v128_t vo${M}p3 = wasm_f32x4_mul(vi${M+2}x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1)); 373 $else: 374 vo${M}p${4 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${4 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); 375 376 $for M in range(ROW_TILE): 377 $if ACCUMULATORS > 4: 378 v128_t vo${M}p4 = wasm_f32x4_mul(vi${M+3}x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2)); 379 $else: 380 vo${M}p${5 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${5 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); 381 382 $for M in range(ROW_TILE): 383 $if ACCUMULATORS > 6: 384 v128_t vo${M}p5 = wasm_f32x4_mul(vi${M+4}x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3)); 385 $else: 386 vo${M}p${6 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${6 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); 387 388 $for M in range(4 + ROW_TILE): 389 const v128_t vi${M}x3456 = wasm_v32x4_shuffle(vi${M}x0123, vi${M}x4567, 3, 4, 5, 6); 390 391 $for M in range(ROW_TILE): 392 vo${M}p${7 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${7 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); 393 394 $for M in range(ROW_TILE): 395 vo${M}p${8 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${8 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); 396 397 $for M in range(ROW_TILE): 398 vo${M}p${9 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${9 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0))); 399 400 $for M in range(ROW_TILE): 401 vo${M}p${10 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${10 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1))); 402 403 $for M in range(ROW_TILE): 404 vo${M}p${11 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${11 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2))); 405 406 $for M in range(4 + ROW_TILE): 407 const v128_t vi${M}x2345 = wasm_v32x4_shuffle(vi${M}x0123, vi${M}x4567, 2, 3, 4, 5); 408 409 $for M in range(ROW_TILE): 410 vo${M}p${12 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${12 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x2345, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); 411 412 $for M in range(ROW_TILE): 413 vo${M}p${13 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${13 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); 414 415 $for M in range(ROW_TILE): 416 vo${M}p${14 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${14 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); 417 418 $for M in range(ROW_TILE): 419 vo${M}p${15 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${15 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); 420 421 $for M in range(ROW_TILE): 422 vo${M}p${16 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${16 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); 423 424 $for M in range(4 + ROW_TILE): 425 const v128_t vi${M}x5678 = wasm_v32x4_shuffle(vi${M}x4567, vzero, 1, 2, 3, 4); 426 427 $for M in range(ROW_TILE): 428 vo${M}p${17 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${17 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); 429 430 $for M in range(ROW_TILE): 431 vo${M}p${18 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${18 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); 432 433 $for M in range(ROW_TILE): 434 vo${M}p${19 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${19 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); 435 436 $for M in range(ROW_TILE): 437 vo${M}p${20 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${20 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); 438 439 $for M in range(ROW_TILE): 440 vo${M}p${21 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${21 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); 441 442 $for M in range(4 + ROW_TILE): 443 const v128_t vi${M}x6789 = wasm_v32x4_shuffle(vi${M}x5678, vzero, 1, 2, 3, 4); 444 445 $for M in range(ROW_TILE): 446 vo${M}p${22 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${22 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); 447 448 $for M in range(ROW_TILE): 449 vo${M}p${23 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${23 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2))); 450 451 $for M in range(ROW_TILE): 452 vo${M}p${24 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${24 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3))); 453 454 $for M in range(ROW_TILE): 455 vo${M}p${25 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${25 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0))); 456 457 $for M in range(ROW_TILE): 458 vo${M}p${26 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${26 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))); 459 460 $if ACCUMULATORS > 1: 461 $ACC_SLICE = 1 462 $while ACC_SLICE < ACCUMULATORS: 463 $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): 464 $if A + ACC_SLICE < ACCUMULATORS: 465 $for M in range(ROW_TILE): 466 vo${M}p${A} = wasm_f32x4_add(vo${M}p${A}, vo${M}p${A + ACC_SLICE}); 467 $ACC_SLICE *= 2 468 469 $if X86: 470 $for M in range(ROW_TILE): 471 v128_t vo${M} = wasm_v128_bitselect(vmin, vo${M}p0, wasm_f32x4_lt(vo${M}p0, vmin)); 472 $for M in range(ROW_TILE): 473 vo${M} = wasm_v128_bitselect(vo${M}, vmax, wasm_f32x4_le(vo${M}, vmax)); 474 $else: 475 $for M in range(ROW_TILE): 476 v128_t vo${M} = wasm_f32x4_max(vo${M}p0, vmin); 477 $for M in range(ROW_TILE): 478 vo${M} = wasm_f32x4_min(vo${M}, vmax); 479 480 if XNN_LIKELY(w & (4 * sizeof(float))) { 481 $for M in reversed(range(ROW_TILE)): 482 wasm_v128_store(o${M}, vo${M}); o${M} += 4; 483 } else { 484 if (w & (2 * sizeof(float))) { 485 $for M in reversed(range(ROW_TILE)): 486 *((double*) o${M}) = wasm_f64x2_extract_lane(vo${M}, 0); o${M} += 2; 487 488 $for M in range(ROW_TILE): 489 vo${M} = wasm_v32x4_shuffle(vo${M}, vo${M}, 2, 3, 0, 1); 490 } 491 if (w & (1 * sizeof(float))) { 492 $for M in reversed(range(ROW_TILE)): 493 *o${M} = wasm_f32x4_extract_lane(vo${M}, 0); o${M} += 1; 494 } 495 } 496 } 497 498 i0 = (const float*) ((uintptr_t) i${ROW_TILE} - input_decrement); 499 i1 = (const float*) ((uintptr_t) i${ROW_TILE+1} - input_decrement); 500 $for M in range(2, 4 + ROW_TILE): 501 i${M} = (const float*) ((uintptr_t) i${M-1} + input_width); 502 503 $if ROW_TILE > 1: 504 o0 = o${ROW_TILE - 1}; 505 $for M in range(1, ROW_TILE): 506 o${M} = (float*) ((uintptr_t) o${M-1} + input_width); 507 508 $if ROW_TILE > 1: 509 output_height = doz(output_height, ${ROW_TILE}); 510 } while (${"--" if ROW_TILE == 1 else ""}output_height != 0); 511} 512