/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5p2-minmax-scalar-3x1.c | 37 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local 181 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 182 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 183 vo2p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 308 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 309 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 310 vo2p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 397 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 398 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 399 vo2p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
|
D | 5x5p2-minmax-scalar-3x1-acc2.c | 37 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() local 181 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 182 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 183 vo2p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 311 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 312 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 313 vo2p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 403 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 404 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 405 vo2p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
|
D | 5x5s2p2-minmax-scalar-3x1-acc2.c | 38 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() local 220 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 221 vo1p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 222 vo2p0 += vi4x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 337 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 338 vo1p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 339 vo2p0 += vi4x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 417 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 418 vo1p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 419 vo2p0 += vi4x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
|
D | 5x5s2p2-minmax-scalar-3x1.c | 38 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() local 220 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 221 vo1p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 222 vo2p0 += vi4x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 334 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 335 vo1p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 336 vo2p0 += vi4x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 411 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 412 vo1p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 413 vo2p0 += vi4x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
|
D | 3x3p1-minmax-scalar-5x1.c | 37 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() local 155 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 156 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 157 vo2p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 158 vo3p0 += vi3x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 159 vo4p0 += vi4x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1()
|
D | 3x3p1-minmax-scalar-6x1.c | 37 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() local 172 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 173 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 174 vo2p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 175 vo3p0 += vi3x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 176 vo4p0 += vi4x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 177 vo5p0 += vi5x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1()
|
D | 5x5p2-minmax-scalar-2x1.c | 37 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() local 158 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 159 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 253 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 254 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 318 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 319 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-5x4.c | 45 const v128_t vk02 = wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() local 189 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 190 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 191 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 192 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 193 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 296 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 297 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 298 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 299 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() [all …]
|
D | 3x3p1-minmax-ssse3-6x4.c | 40 const __m128 vk02 = _mm_load1_ps(weights + 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() local 202 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 203 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 204 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 205 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 206 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 207 vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi5x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 328 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 329 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 330 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-6x4.c | 45 const v128_t vk02 = wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() local 209 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 210 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 211 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 212 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 213 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 214 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi5x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 333 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 334 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 335 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-6x4.c | 45 const v128_t vk02 = wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() local 209 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 210 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 211 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 212 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 213 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 214 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi5x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 333 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 334 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 335 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() [all …]
|
D | 5x5s2p2-minmax-scalar-2x1-acc3.c | 38 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() local 184 vo0p1 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 185 vo1p1 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 270 vo0p1 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 271 vo1p1 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 328 vo0p1 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 329 vo1p1 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3()
|
D | 5x5p2-minmax-scalar-2x1-acc3.c | 37 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() local 158 vo0p1 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 159 vo1p1 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 257 vo0p1 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 258 vo1p1 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 326 vo0p1 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 327 vo1p1 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3()
|
D | 5x5s2p2-minmax-scalar-2x1-acc2.c | 38 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() local 184 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() 185 vo1p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() 268 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() 269 vo1p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() 324 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() 325 vo1p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2()
|
D | 5x5p2-minmax-scalar-2x1-acc2.c | 37 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() local 158 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 159 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 255 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 256 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 322 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 323 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2()
|
D | 5x5s2p2-minmax-scalar-2x1.c | 38 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() local 184 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 185 vo1p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 266 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 267 vo1p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 320 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 321 vo1p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1()
|
D | 3x3p1-minmax-sse-6x4.c | 40 const __m128 vk02 = _mm_load1_ps(weights + 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() local 268 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 269 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 270 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 271 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 272 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 273 vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi5x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 444 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 445 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 446 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-5x4.c | 45 const v128_t vk02 = wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() local 189 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 190 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 191 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 192 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 193 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 296 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 297 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 298 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 299 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() [all …]
|
D | 3x3p1-minmax-ssse3-5x4.c | 40 const __m128 vk02 = _mm_load1_ps(weights + 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() local 182 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 183 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 184 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 185 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 186 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 291 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 292 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 293 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 294 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() [all …]
|
D | 3x3p1-minmax-sse-5x4.c | 40 const __m128 vk02 = _mm_load1_ps(weights + 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() local 240 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 241 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 242 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 243 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 244 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 393 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 394 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 395 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 396 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() [all …]
|
D | 3x3p1-minmax-scalar-4x1.c | 37 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() local 138 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 139 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 140 vo2p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 141 vo3p0 += vi3x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1()
|
D | 3x3p1-minmax-ssse3-4x4.c | 40 const __m128 vk02 = _mm_load1_ps(weights + 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() local 162 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 163 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 164 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 165 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 254 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 255 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 256 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 257 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-4x4.c | 45 const v128_t vk02 = wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() local 169 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 170 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 171 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 172 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 259 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 260 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 261 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 262 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-4x4.c | 45 const v128_t vk02 = wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() local 169 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 170 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 171 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 172 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 259 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 260 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 261 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 262 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-3x4.c | 45 const v128_t vk02 = wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() local 149 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() 150 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() 151 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() 222 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() 223 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() 224 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4()
|