/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5p2-minmax-scalar-3x1.c | 55 const float vk40 = weights[21]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local 145 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 146 vo1p0 += vi5x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 147 vo2p0 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 272 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 273 vo1p0 += vi5x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 274 vo2p0 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 377 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 378 vo1p0 += vi5x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 379 vo2p0 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
|
D | 5x5s2p2-minmax-scalar-3x1.c | 56 const float vk40 = weights[21]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() local 180 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 181 vo1p0 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 182 vo2p0 += vi8x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 314 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 315 vo1p0 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 316 vo2p0 += vi8x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 391 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 392 vo1p0 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 393 vo2p0 += vi8x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
|
D | 5x5s2p2-minmax-scalar-3x1-acc2.c | 56 const float vk40 = weights[21]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() local 180 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 181 vo1p0 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 182 vo2p0 += vi8x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 317 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 318 vo1p0 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 319 vo2p0 += vi8x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 397 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 398 vo1p0 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 399 vo2p0 += vi8x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
|
D | 5x5p2-minmax-scalar-3x1-acc2.c | 55 const float vk40 = weights[21]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() local 145 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 146 vo1p0 += vi5x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 147 vo2p0 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 275 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 276 vo1p0 += vi5x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 277 vo2p0 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 383 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 384 vo1p0 += vi5x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 385 vo2p0 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
|
D | 5x5p2-minmax-scalar-2x1.c | 55 const float vk40 = weights[21]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() local 130 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 131 vo1p0 += vi5x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 225 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 226 vo1p0 += vi5x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 304 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 305 vo1p0 += vi5x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
|
D | 5x5s2p2-minmax-scalar-2x1-acc3.c | 56 const float vk40 = weights[21]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() local 154 vo0p1 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 155 vo1p1 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 256 vo0p1 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 257 vo1p1 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 314 vo0p1 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 315 vo1p1 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3()
|
D | 5x5p2-minmax-scalar-2x1-acc3.c | 55 const float vk40 = weights[21]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() local 130 vo0p1 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 131 vo1p1 += vi5x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 229 vo0p1 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 230 vo1p1 += vi5x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 312 vo0p1 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 313 vo1p1 += vi5x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3()
|
D | 5x5p2-minmax-scalar-2x1-acc2.c | 55 const float vk40 = weights[21]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() local 130 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 131 vo1p0 += vi5x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 227 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 228 vo1p0 += vi5x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 308 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 309 vo1p0 += vi5x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2()
|
D | 5x5s2p2-minmax-scalar-2x1.c | 56 const float vk40 = weights[21]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() local 154 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 155 vo1p0 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 252 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 253 vo1p0 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 306 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 307 vo1p0 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1()
|
D | 5x5s2p2-minmax-scalar-2x1-acc2.c | 56 const float vk40 = weights[21]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() local 154 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() 155 vo1p0 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() 254 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() 255 vo1p0 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() 310 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() 311 vo1p0 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2()
|
D | 5x5p2-minmax-scalar-1x1-acc4.c | 55 const float vk40 = weights[21]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4() local 115 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4() 181 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4() 237 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4()
|
D | 5x5s2p2-minmax-scalar-1x1-acc2.c | 56 const float vk40 = weights[21]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2() local 127 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2() 190 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2() 222 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2()
|
D | 5x5p2-minmax-scalar-1x1-acc3.c | 55 const float vk40 = weights[21]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3() local 115 vo0p1 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3() 180 vo0p1 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3() 235 vo0p1 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3()
|
D | 5x5p2-minmax-scalar-1x1.c | 55 const float vk40 = weights[21]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1() local 115 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1() 178 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1() 231 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1()
|
D | 5x5s2p2-minmax-scalar-1x1-acc4.c | 56 const float vk40 = weights[21]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4() local 127 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4() 192 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4() 226 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4()
|
D | 5x5s2p2-minmax-scalar-1x1.c | 56 const float vk40 = weights[21]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1() local 127 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1() 189 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1() 220 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1()
|
D | 5x5p2-minmax-scalar-1x1-acc2.c | 55 const float vk40 = weights[21]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2() local 115 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2() 179 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2() 233 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2()
|
D | 5x5s2p2-minmax-scalar-1x1-acc3.c | 56 const float vk40 = weights[21]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3() local 127 vo0p1 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3() 191 vo0p1 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3() 224 vo0p1 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3()
|
D | 5x5p2-minmax-scalar-1x1-acc5.c | 55 const float vk40 = weights[21]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5() local 115 float vo0p4 = vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5() 182 float vo0p4 = vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5() 239 float vo0p4 = vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5()
|
D | 5x5s2p2-minmax-scalar-1x1-acc5.c | 56 const float vk40 = weights[21]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5() local 127 float vo0p4 = vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5() 193 float vo0p4 = vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5() 228 float vo0p4 = vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5()
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-5x4.c | 66 const v128_t vk40 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local 266 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 267 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 268 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 269 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 270 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi8x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 520 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 521 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 522 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 523 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-5x4.c | 66 const v128_t vk40 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local 266 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 267 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 268 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 269 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 270 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi8x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 520 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 521 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 522 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 523 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() [all …]
|
D | 5x5p2-minmax-sse-5x4.c | 58 const __m128 vk40 = _mm_load1_ps(weights + 21); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local 287 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 288 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 289 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 290 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 291 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi8x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 540 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 541 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 542 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 543 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-4x4-acc2.c | 66 const v128_t vk40 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local 239 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 240 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 241 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 242 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 460 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 461 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 462 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 463 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 667 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-4x4-acc2.c | 66 const v128_t vk40 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local 239 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 240 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 241 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 242 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 460 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 461 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 462 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 463 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 667 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() [all …]
|