/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5p2-minmax-scalar-3x1.c | 56 const float vk41 = weights[22]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local 169 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 170 vo1p0 += vi5x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 171 vo2p0 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 296 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 297 vo1p0 += vi5x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 298 vo2p0 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 393 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 394 vo1p0 += vi5x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 395 vo2p0 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
|
D | 5x5s2p2-minmax-scalar-3x1.c | 57 const float vk41 = weights[22]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() local 206 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 207 vo1p0 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 208 vo2p0 += vi8x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 330 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 331 vo1p0 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 332 vo2p0 += vi8x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 407 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 408 vo1p0 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 409 vo2p0 += vi8x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
|
D | 5x5s2p2-minmax-scalar-3x1-acc2.c | 57 const float vk41 = weights[22]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() local 206 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 207 vo1p1 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 208 vo2p1 += vi8x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 333 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 334 vo1p1 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 335 vo2p1 += vi8x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 413 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 414 vo1p1 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 415 vo2p1 += vi8x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
|
D | 5x5p2-minmax-scalar-3x1-acc2.c | 56 const float vk41 = weights[22]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() local 169 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 170 vo1p1 += vi5x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 171 vo2p1 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 299 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 300 vo1p1 += vi5x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 301 vo2p1 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 399 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 400 vo1p1 += vi5x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 401 vo2p1 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
|
D | 5x5p2-minmax-scalar-2x1.c | 56 const float vk41 = weights[22]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() local 148 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 149 vo1p0 += vi5x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 243 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 244 vo1p0 += vi5x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 315 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 316 vo1p0 += vi5x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
|
D | 5x5s2p2-minmax-scalar-2x1-acc3.c | 57 const float vk41 = weights[22]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() local 173 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 174 vo1p0 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 267 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 268 vo1p0 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 325 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 326 vo1p0 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3()
|
D | 5x5p2-minmax-scalar-2x1-acc3.c | 56 const float vk41 = weights[22]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() local 148 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 149 vo1p0 += vi5x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 247 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 248 vo1p0 += vi5x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 323 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 324 vo1p0 += vi5x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3()
|
D | 5x5p2-minmax-scalar-2x1-acc2.c | 56 const float vk41 = weights[22]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() local 148 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 149 vo1p1 += vi5x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 245 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 246 vo1p1 += vi5x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 319 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 320 vo1p1 += vi5x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2()
|
D | 5x5s2p2-minmax-scalar-2x1.c | 57 const float vk41 = weights[22]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() local 173 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 174 vo1p0 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 263 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 264 vo1p0 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 317 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 318 vo1p0 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1()
|
D | 5x5s2p2-minmax-scalar-2x1-acc2.c | 57 const float vk41 = weights[22]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() local 173 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() 174 vo1p1 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() 265 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() 266 vo1p1 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() 321 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() 322 vo1p1 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2()
|
D | 5x5p2-minmax-scalar-1x1-acc4.c | 56 const float vk41 = weights[22]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4() local 127 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4() 193 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4() 243 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4()
|
D | 5x5s2p2-minmax-scalar-1x1-acc2.c | 57 const float vk41 = weights[22]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2() local 139 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2() 196 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2() 228 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2()
|
D | 5x5p2-minmax-scalar-1x1-acc3.c | 56 const float vk41 = weights[22]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3() local 127 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3() 192 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3() 241 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3()
|
D | 5x5p2-minmax-scalar-1x1.c | 56 const float vk41 = weights[22]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1() local 127 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1() 190 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1() 237 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1()
|
D | 5x5s2p2-minmax-scalar-1x1-acc4.c | 57 const float vk41 = weights[22]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4() local 139 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4() 198 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4() 232 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4()
|
D | 5x5s2p2-minmax-scalar-1x1.c | 57 const float vk41 = weights[22]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1() local 139 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1() 195 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1() 226 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1()
|
D | 5x5p2-minmax-scalar-1x1-acc2.c | 56 const float vk41 = weights[22]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2() local 127 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2() 191 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2() 239 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2()
|
D | 5x5s2p2-minmax-scalar-1x1-acc3.c | 57 const float vk41 = weights[22]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3() local 139 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3() 197 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3() 230 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3()
|
D | 5x5p2-minmax-scalar-1x1-acc5.c | 56 const float vk41 = weights[22]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5() local 127 vo0p4 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5() 194 vo0p4 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5() 245 vo0p4 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5()
|
D | 5x5s2p2-minmax-scalar-1x1-acc5.c | 57 const float vk41 = weights[22]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5() local 139 vo0p4 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5() 199 vo0p4 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5() 234 vo0p4 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5()
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-5x4.c | 67 const v128_t vk41 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local 217 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 218 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 219 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 220 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 221 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi8x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 471 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 472 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 473 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 474 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-5x4.c | 67 const v128_t vk41 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local 217 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 218 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 219 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 220 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 221 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi8x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 471 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 472 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 473 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 474 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() [all …]
|
D | 5x5p2-minmax-sse-5x4.c | 59 const __m128 vk41 = _mm_load1_ps(weights + 22); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local 223 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 224 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 225 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 226 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 227 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi8x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 476 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 477 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 478 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 479 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-4x4-acc2.c | 67 const v128_t vk41 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local 197 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 198 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 199 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 200 vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi7x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 418 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 419 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 420 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 421 vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi7x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 633 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-4x4-acc2.c | 67 const v128_t vk41 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local 197 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 198 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 199 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 200 vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi7x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 418 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 419 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 420 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 421 vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi7x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 633 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() [all …]
|