/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 3x3p1-minmax-scalar-6x1.c | 42 const float vk21 = weights[8]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() local 156 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 157 vo1p0 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 158 vo2p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 159 vo3p0 += vi5x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 160 vo4p0 += vi6x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 161 vo5p0 += vi7x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 247 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 248 vo1p0 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 249 vo2p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() [all …]
|
D | 3x3p1-minmax-scalar-5x1.c | 42 const float vk21 = weights[8]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() local 141 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 142 vo1p0 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 143 vo2p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 144 vo3p0 += vi5x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 145 vo4p0 += vi6x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 219 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 220 vo1p0 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 221 vo2p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 222 vo3p0 += vi5x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() [all …]
|
D | 3x3p1-minmax-scalar-4x1.c | 42 const float vk21 = weights[8]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() local 126 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 127 vo1p0 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 128 vo2p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 129 vo3p0 += vi5x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 191 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 192 vo1p0 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 193 vo2p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 194 vo3p0 += vi5x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1()
|
D | 3x3s2p1-minmax-scalar-4x1.c | 43 const float vk21 = weights[8]; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() local 156 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 157 vo1p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 158 vo2p0 += vi6x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 159 vo3p0 += vi8x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 234 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 235 vo1p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 236 vo2p0 += vi6x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 237 vo3p0 += vi8x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1()
|
D | 3x3p1-minmax-scalar-3x1.c | 42 const float vk21 = weights[8]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() local 111 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 112 vo1p0 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 113 vo2p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 163 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 164 vo1p0 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 165 vo2p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1()
|
D | 3x3s2p1-minmax-scalar-3x1.c | 43 const float vk21 = weights[8]; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() local 133 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() 134 vo1p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() 135 vo2p0 += vi6x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() 195 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() 196 vo1p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() 197 vo2p0 += vi6x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1()
|
D | 3x3p1-minmax-scalar-2x1-acc2.c | 42 const float vk21 = weights[8]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2() local 96 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2() 97 vo1p1 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2() 137 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2() 138 vo1p1 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2()
|
D | 3x3p1-minmax-scalar-2x1.c | 42 const float vk21 = weights[8]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1() local 96 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1() 97 vo1p0 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1() 135 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1() 136 vo1p0 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1()
|
D | 3x3s2p1-minmax-scalar-2x1-acc2.c | 43 const float vk21 = weights[8]; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2() local 110 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2() 111 vo1p1 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2() 158 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2() 159 vo1p1 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2()
|
D | 3x3s2p1-minmax-scalar-2x1.c | 43 const float vk21 = weights[8]; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1() local 110 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1() 111 vo1p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1() 156 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1() 157 vo1p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1()
|
D | 5x5p2-minmax-scalar-3x1.c | 46 const float vk21 = weights[12]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local 163 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 164 vo1p0 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 165 vo2p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 290 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 291 vo1p0 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 292 vo2p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 387 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 388 vo1p0 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 389 vo2p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
|
D | 5x5s2p2-minmax-scalar-3x1.c | 47 const float vk21 = weights[12]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() local 200 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 201 vo1p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 202 vo2p0 += vi6x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 324 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 325 vo1p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 326 vo2p0 += vi6x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 401 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 402 vo1p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 403 vo2p0 += vi6x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
|
D | 5x5s2p2-minmax-scalar-3x1-acc2.c | 47 const float vk21 = weights[12]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() local 200 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 201 vo1p1 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 202 vo2p1 += vi6x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 327 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 328 vo1p1 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 329 vo2p1 += vi6x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 407 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 408 vo1p1 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 409 vo2p1 += vi6x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
|
D | 5x5p2-minmax-scalar-3x1-acc2.c | 46 const float vk21 = weights[12]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() local 163 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 164 vo1p1 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 165 vo2p1 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 293 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 294 vo1p1 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 295 vo2p1 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 393 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 394 vo1p1 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 395 vo2p1 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
|
D | 3x3p1-minmax-ssse3-6x4.c | 45 const __m128 vk21 = _mm_load1_ps(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() local 149 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 150 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 151 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 152 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 153 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 154 vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi7x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 283 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 284 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 285 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-6x4.c | 50 const v128_t vk21 = wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() local 154 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 155 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 156 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 157 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 158 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 159 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi7x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 287 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 288 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 289 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-6x4.c | 50 const v128_t vk21 = wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() local 154 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 155 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 156 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 157 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 158 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 159 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi7x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 287 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 288 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 289 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-5x4.c | 50 const v128_t vk21 = wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() local 141 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 142 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 143 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 144 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 145 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 256 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 257 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 258 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 259 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-5x4.c | 50 const v128_t vk21 = wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() local 141 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 142 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 143 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 144 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 145 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 256 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 257 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 258 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 259 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() [all …]
|
D | 5x5p2-minmax-scalar-2x1.c | 46 const float vk21 = weights[12]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() local 144 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 145 vo1p0 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 239 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 240 vo1p0 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 311 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 312 vo1p0 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
|
D | 3x3p1-minmax-sse-6x4.c | 45 const __m128 vk21 = _mm_load1_ps(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() local 182 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 183 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 184 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 185 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 186 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 187 vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi7x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 366 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 367 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 368 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() [all …]
|
D | 5x5s2p2-minmax-scalar-2x1-acc3.c | 47 const float vk21 = weights[12]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() local 169 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 170 vo1p1 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 263 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 264 vo1p1 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 321 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 322 vo1p1 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3()
|
D | 5x5p2-minmax-scalar-2x1-acc3.c | 46 const float vk21 = weights[12]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() local 144 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 145 vo1p1 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 243 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 244 vo1p1 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 319 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 320 vo1p1 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3()
|
D | 5x5p2-minmax-scalar-2x1-acc2.c | 46 const float vk21 = weights[12]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() local 144 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 145 vo1p1 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 241 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 242 vo1p1 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 315 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 316 vo1p1 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2()
|
D | 5x5s2p2-minmax-scalar-2x1.c | 47 const float vk21 = weights[12]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() local 169 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 170 vo1p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 259 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 260 vo1p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 313 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 314 vo1p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1()
|