/external/XNNPACK/src/f32-dwconv2d-chw/gen/
D | 5x5p2-minmax-scalar-3x1.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1():
     57  const float vk42 = weights[23];  (local)
    193  vo0p0 += vi4x2 * vk42;
    194  vo1p0 += vi5x2 * vk42;
    195  vo2p0 += vi6x2 * vk42;
    320  vo0p0 += vi4x2 * vk42;
    321  vo1p0 += vi5x2 * vk42;
    322  vo2p0 += vi6x2 * vk42;
    409  vo0p0 += vi4x2 * vk42;
    410  vo1p0 += vi5x2 * vk42;
    411  vo2p0 += vi6x2 * vk42;

D | 5x5s2p2-minmax-scalar-3x1.c | in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1():
     58  const float vk42 = weights[23];  (local)
    232  vo0p0 += vi4x2 * vk42;
    233  vo1p0 += vi6x2 * vk42;
    234  vo2p0 += vi8x2 * vk42;
    346  vo0p0 += vi4x2 * vk42;
    347  vo1p0 += vi6x2 * vk42;
    348  vo2p0 += vi8x2 * vk42;
    423  vo0p0 += vi4x2 * vk42;
    424  vo1p0 += vi6x2 * vk42;
    425  vo2p0 += vi8x2 * vk42;

D | 5x5s2p2-minmax-scalar-3x1-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2():
     58  const float vk42 = weights[23];  (local)
    232  vo0p0 += vi4x2 * vk42;
    233  vo1p0 += vi6x2 * vk42;
    234  vo2p0 += vi8x2 * vk42;
    349  vo0p0 += vi4x2 * vk42;
    350  vo1p0 += vi6x2 * vk42;
    351  vo2p0 += vi8x2 * vk42;
    429  vo0p0 += vi4x2 * vk42;
    430  vo1p0 += vi6x2 * vk42;
    431  vo2p0 += vi8x2 * vk42;

D | 5x5p2-minmax-scalar-3x1-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2():
     57  const float vk42 = weights[23];  (local)
    193  vo0p0 += vi4x2 * vk42;
    194  vo1p0 += vi5x2 * vk42;
    195  vo2p0 += vi6x2 * vk42;
    323  vo0p0 += vi4x2 * vk42;
    324  vo1p0 += vi5x2 * vk42;
    325  vo2p0 += vi6x2 * vk42;
    415  vo0p0 += vi4x2 * vk42;
    416  vo1p0 += vi5x2 * vk42;
    417  vo2p0 += vi6x2 * vk42;

D | 5x5p2-minmax-scalar-2x1.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1():
     57  const float vk42 = weights[23];  (local)
    166  vo0p0 += vi4x2 * vk42;
    167  vo1p0 += vi5x2 * vk42;
    261  vo0p0 += vi4x2 * vk42;
    262  vo1p0 += vi5x2 * vk42;
    326  vo0p0 += vi4x2 * vk42;
    327  vo1p0 += vi5x2 * vk42;

D | 5x5s2p2-minmax-scalar-2x1-acc3.c | in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3():
     58  const float vk42 = weights[23];  (local)
    192  vo0p2 += vi4x2 * vk42;
    193  vo1p2 += vi6x2 * vk42;
    278  vo0p2 += vi4x2 * vk42;
    279  vo1p2 += vi6x2 * vk42;
    336  vo0p2 += vi4x2 * vk42;
    337  vo1p2 += vi6x2 * vk42;

D | 5x5p2-minmax-scalar-2x1-acc3.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3():
     57  const float vk42 = weights[23];  (local)
    166  vo0p2 += vi4x2 * vk42;
    167  vo1p2 += vi5x2 * vk42;
    265  vo0p2 += vi4x2 * vk42;
    266  vo1p2 += vi5x2 * vk42;
    334  vo0p2 += vi4x2 * vk42;
    335  vo1p2 += vi5x2 * vk42;

D | 5x5p2-minmax-scalar-2x1-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2():
     57  const float vk42 = weights[23];  (local)
    166  vo0p0 += vi4x2 * vk42;
    167  vo1p0 += vi5x2 * vk42;
    263  vo0p0 += vi4x2 * vk42;
    264  vo1p0 += vi5x2 * vk42;
    330  vo0p0 += vi4x2 * vk42;
    331  vo1p0 += vi5x2 * vk42;

D | 5x5s2p2-minmax-scalar-2x1.c | in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1():
     58  const float vk42 = weights[23];  (local)
    192  vo0p0 += vi4x2 * vk42;
    193  vo1p0 += vi6x2 * vk42;
    274  vo0p0 += vi4x2 * vk42;
    275  vo1p0 += vi6x2 * vk42;
    328  vo0p0 += vi4x2 * vk42;
    329  vo1p0 += vi6x2 * vk42;

D | 5x5s2p2-minmax-scalar-2x1-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2():
     58  const float vk42 = weights[23];  (local)
    192  vo0p0 += vi4x2 * vk42;
    193  vo1p0 += vi6x2 * vk42;
    276  vo0p0 += vi4x2 * vk42;
    277  vo1p0 += vi6x2 * vk42;
    332  vo0p0 += vi4x2 * vk42;
    333  vo1p0 += vi6x2 * vk42;

D | 5x5p2-minmax-scalar-1x1-acc4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4():
     57  const float vk42 = weights[23];  (local)
    139  vo0p2 += vi4x2 * vk42;
    205  vo0p2 += vi4x2 * vk42;
    249  vo0p2 += vi4x2 * vk42;

D | 5x5s2p2-minmax-scalar-1x1-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2():
     58  const float vk42 = weights[23];  (local)
    151  vo0p0 += vi4x2 * vk42;
    202  vo0p0 += vi4x2 * vk42;
    234  vo0p0 += vi4x2 * vk42;

D | 5x5p2-minmax-scalar-1x1-acc3.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3():
     57  const float vk42 = weights[23];  (local)
    139  vo0p2 += vi4x2 * vk42;
    204  vo0p2 += vi4x2 * vk42;
    247  vo0p2 += vi4x2 * vk42;

D | 5x5p2-minmax-scalar-1x1.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1():
     57  const float vk42 = weights[23];  (local)
    139  vo0p0 += vi4x2 * vk42;
    202  vo0p0 += vi4x2 * vk42;
    243  vo0p0 += vi4x2 * vk42;

D | 5x5s2p2-minmax-scalar-1x1-acc4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4():
     58  const float vk42 = weights[23];  (local)
    151  vo0p2 += vi4x2 * vk42;
    204  vo0p2 += vi4x2 * vk42;
    238  vo0p2 += vi4x2 * vk42;

D | 5x5s2p2-minmax-scalar-1x1.c | in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1():
     58  const float vk42 = weights[23];  (local)
    151  vo0p0 += vi4x2 * vk42;
    201  vo0p0 += vi4x2 * vk42;
    232  vo0p0 += vi4x2 * vk42;

D | 5x5p2-minmax-scalar-1x1-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2():
     57  const float vk42 = weights[23];  (local)
    139  vo0p0 += vi4x2 * vk42;
    203  vo0p0 += vi4x2 * vk42;
    245  vo0p0 += vi4x2 * vk42;

D | 5x5s2p2-minmax-scalar-1x1-acc3.c | in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3():
     58  const float vk42 = weights[23];  (local)
    151  vo0p2 += vi4x2 * vk42;
    203  vo0p2 += vi4x2 * vk42;
    236  vo0p2 += vi4x2 * vk42;

D | 5x5p2-minmax-scalar-1x1-acc5.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5():
     57  const float vk42 = weights[23];  (local)
    139  vo0p4 += vi4x2 * vk42;
    206  vo0p4 += vi4x2 * vk42;
    251  vo0p4 += vi4x2 * vk42;

D | 5x5s2p2-minmax-scalar-1x1-acc5.c | in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5():
     58  const float vk42 = weights[23];  (local)
    151  vo0p4 += vi4x2 * vk42;
    205  vo0p4 += vi4x2 * vk42;
    240  vo0p4 += vi4x2 * vk42;

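Every scalar entry above follows the same two-step pattern: the row-4, column-2 tap of the 5x5 depthwise filter is read once into a local (const float vk42 = weights[23]; which is consistent with a packed layout of one bias value followed by the 25 taps in row-major order, since 1 + 4*5 + 2 = 23), and is then multiplied into one accumulator per output row (vo0p0 += vi4x2 * vk42; and so on). Below is a minimal, self-contained sketch of that pattern, not the generated XNNPACK code; the helper name accumulate_tap, the layout assumption, and the test values are illustrative only.

#include <stdio.h>

/*
 * Sketch of the scalar multiply-accumulate pattern seen in the listing above
 * -- NOT the generated XNNPACK kernels.  Assumed packed-weights layout:
 * weights[0] is the bias, weights[1..25] are the 5x5 taps in row-major order,
 * which is what puts the row-4/column-2 tap at weights[1 + 4*5 + 2] == weights[23].
 */
static float accumulate_tap(const float* weights, int row, int col,
                            float input_value, float acc) {
  const float vk = weights[1 + row * 5 + col];  /* row=4, col=2 -> the tap named vk42 */
  return acc + input_value * vk;                /* same shape as "vo0p0 += vi4x2 * vk42;" */
}

int main(void) {
  float weights[26] = {0.0f};  /* hypothetical packed weights: bias + 25 taps */
  weights[0]  = 0.25f;         /* bias */
  weights[23] = 0.5f;          /* the tap the kernels call vk42 */

  float vo0p0 = weights[0];    /* start the accumulator from the bias value */
  vo0p0 = accumulate_tap(weights, 4, 2, /*input_value=*/2.0f, vo0p0);
  printf("vo0p0 = %f\n", vo0p0);  /* 0.25 + 2.0 * 0.5 = 1.25 */
  return 0;
}

The _acc2/_acc3/_acc4/_acc5 variants in the file names keep several such partial accumulators per output pixel before summing them, which is why the same tap lands in vo0p2 or vo0p4 rather than vo0p0 in some of the entries above.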
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-5x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4():
     68  const v128_t vk42 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3);  (local)
    177  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
    178  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42));
    179  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, vk42));
    180  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, vk42));
    181  vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi8x4567, vk42));
    431  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
    432  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42));
    433  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, vk42));
    434  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, vk42));
    [all …]

D | 5x5p2-minmax-wasmsimd-arm-loadsplat-5x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4():
     68  const v128_t vk42 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3);  (local)
    177  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
    178  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42));
    179  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, vk42));
    180  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, vk42));
    181  vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi8x4567, vk42));
    431  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
    432  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42));
    433  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, vk42));
    434  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, vk42));
    [all …]

D | 5x5p2-minmax-sse-5x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4():
     60  const __m128 vk42 = _mm_load1_ps(weights + 23);  (local)
    158  vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
    159  vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
    160  vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
    161  vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));
    162  vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi8x4567, vk42));
    411  vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
    412  vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
    413  vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
    414  vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));
    [all …]

D | 5x5p2-minmax-wasmsimd-arm-loadsplat-4x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2():
     68  const v128_t vk42 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3);  (local)
    163  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
    164  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42));
    165  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, vk42));
    166  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, vk42));
    384  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
    385  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42));
    386  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, vk42));
    387  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, vk42));
    599  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
    [all …]

D | 5x5p2-minmax-wasmsimd-x86-loadsplat-4x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2():
     68  const v128_t vk42 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3);  (local)
    163  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
    164  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42));
    165  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, vk42));
    166  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, vk42));
    384  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
    385  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42));
    386  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, vk42));
    387  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, vk42));
    599  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
    [all …]

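The wasmsimd and SSE entries apply the same tap to four output pixels at a time: the tap is broadcast across a 4-lane vector (wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3) splats lane 3 of a previously loaded group of four weights; _mm_load1_ps(weights + 23) loads weights[23] into all four SSE lanes) and then multiplied and added lane-wise. A small self-contained sketch of the SSE form follows; it is not the generated kernel, and the input values and accumulator setup are illustrative only.

#include <stdio.h>
#include <xmmintrin.h>  /* SSE: _mm_load1_ps, _mm_mul_ps, _mm_add_ps, _mm_storeu_ps */

int main(void) {
  /* Hypothetical packed weights: bias at [0], 5x5 taps at [1..25]. */
  float weights[26] = {0.0f};
  weights[23] = 0.5f;  /* the tap the kernels call vk42 */

  /* Broadcast weights[23] into all four lanes, as in the sse-5x4 entry above. */
  const __m128 vk42 = _mm_load1_ps(weights + 23);

  /* Four neighbouring input values from one row, and one running accumulator. */
  const __m128 vi4x4567 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
  __m128 vo0p0 = _mm_setzero_ps();

  /* Same shape as "vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));" */
  vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));

  float out[4];
  _mm_storeu_ps(out, vo0p0);
  printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);  /* 0.5 1.0 1.5 2.0 */
  return 0;
}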