/external/XNNPACK/src/f32-dwconv2d-chw/gen/

Cross-reference for vmask_odd in the generated 3x3 stride-2 pad-1 CHW depthwise-convolution kernels. Every file below defines vmask_odd as a function-local vector loaded from the kernel parameters and ANDs it against the odd-column (x9BDF) input vectors; code snippets ending in "…" are truncated in the source listing.
3x3s2p1-minmax-neonfma-4x4.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_4x4)
  definition at line 35 (local):
    const uint32x4_t vmask_odd = vld1q_u32(params->neon.mask_odd);
  uses at lines 218, 220, 222, 224, 226, 228, 230, 232, 234, one per input row (vi0x9BDF through vi8x9BDF):
    const float32x4_t vi0x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi0…

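Across this and the other NEON kernels below, the idiom behind the truncated use lines is the same: vmask_odd carries all-ones bits in the lanes that hold valid odd-indexed input columns (9, B, D, F) and zero bits in lanes past the right edge of the row, and the kernel ANDs it against the odd-column vector so that out-of-bounds values cannot leak into the multiply-accumulates. NEON has no bitwise AND for float vectors, hence the round trip through uint32x4_t. A minimal sketch of the idiom, with a hypothetical make_lane_mask helper standing in for the precomputed params->neon.mask_odd:

    #include <stddef.h>
    #include <stdint.h>
    #include <arm_neon.h>

    /* The masking idiom from the use lines above: reinterpret the float
     * vector as u32, AND with the lane mask, reinterpret back to float. */
    static inline float32x4_t mask_odd_columns(float32x4_t v9BDF, uint32x4_t vmask_odd) {
      return vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(v9BDF)));
    }

    /* Hypothetical helper (not in XNNPACK, which loads a precomputed mask
     * via vld1q_u32(params->neon.mask_odd)): all-ones bits in the first
     * `valid` lanes, zeros elsewhere. */
    static inline uint32x4_t make_lane_mask(size_t valid) {
      const uint32_t bits[4] = {
        valid > 0 ? UINT32_MAX : 0, valid > 1 ? UINT32_MAX : 0,
        valid > 2 ? UINT32_MAX : 0, valid > 3 ? UINT32_MAX : 0,
      };
      return vld1q_u32(bits);
    }

With valid = 2, for example, lanes 2 and 3 of the odd-column vector become zero, so a row tail holding only two odd columns contributes nothing spurious to the accumulators.
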
3x3s2p1-minmax-neon-4x4.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_4x4)
  definition at line 35 (local):
    const uint32x4_t vmask_odd = vld1q_u32(params->neon.mask_odd);
  uses at lines 218, 220, 222, 224, 226, 228, 230, 232, 234, one per input row (vi0x9BDF through vi8x9BDF):
    const float32x4_t vi0x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi0…

3x3s2p1-minmax-neon-3x4.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_3x4)
  definition at line 35 (local):
    const uint32x4_t vmask_odd = vld1q_u32(params->neon.mask_odd);
  uses at lines 184, 186, 188, 190, 192, 194, 196, one per input row (vi0x9BDF through vi6x9BDF):
    const float32x4_t vi0x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi0…

3x3s2p1-minmax-neonfma-3x4.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_3x4)
  definition at line 35 (local):
    const uint32x4_t vmask_odd = vld1q_u32(params->neon.mask_odd);
  uses at lines 184, 186, 188, 190, 192, 194, 196, one per input row (vi0x9BDF through vi6x9BDF):
    const float32x4_t vi0x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi0…

3x3s2p1-minmax-neonfma-2x4.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_2x4)
  definition at line 35 (local):
    const uint32x4_t vmask_odd = vld1q_u32(params->neon.mask_odd);
  uses at lines 150, 152, 154, 156, 158, one per input row (vi0x9BDF through vi4x9BDF):
    const float32x4_t vi0x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi0…

3x3s2p1-minmax-neon-2x4-acc2.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_2x4_acc2)
  definition at line 35 (local):
    const uint32x4_t vmask_odd = vld1q_u32(params->neon.mask_odd);
  uses at lines 152, 154, 156, 158, 160, one per input row (vi0x9BDF through vi4x9BDF):
    const float32x4_t vi0x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi0…

3x3s2p1-minmax-neon-2x4.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_2x4)
  definition at line 35 (local):
    const uint32x4_t vmask_odd = vld1q_u32(params->neon.mask_odd);
  uses at lines 150, 152, 154, 156, 158, one per input row (vi0x9BDF through vi4x9BDF):
    const float32x4_t vi0x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi0…

3x3s2p1-minmax-neonfma-2x4-acc2.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_2x4_acc2)
  definition at line 35 (local):
    const uint32x4_t vmask_odd = vld1q_u32(params->neon.mask_odd);
  uses at lines 152, 154, 156, 158, 160, one per input row (vi0x9BDF through vi4x9BDF):
    const float32x4_t vi0x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi0…

3x3s2p1-minmax-wasmsimd-arm-loadsplat-4x4.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_4x4)
  definition at line 36 (local):
    const v128_t vmask_odd = wasm_v128_load(params->scalar.mask_odd);
  uses at lines 276, 278, 280, 282, 284, 286, 288, 290, 292, one per input row (vi0x9BDF through vi8x9BDF):
    const v128_t vi0x9BDF = wasm_v128_and(vmask_odd, wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 1, 3, 5, …

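The WAsm SIMD variants reach the same masked odd-column vector without a reinterpret, since v128_t is typeless: two plain loads yield columns 8..B and C..F, a shuffle gathers the odd lanes of that 8-float span, and wasm_v128_and applies the mask directly. A minimal sketch; the final shuffle index, elided as "…" in the listing, is assumed to be 7, the only completion that selects columns 9, B, D, F:

    #include <wasm_simd128.h>

    /* The idiom from the use lines above: lane indices 1, 3, 5, 7 pick the
     * odd elements of the concatenated pair {vx89AB, vxCDEF}, and the AND
     * zeroes lanes past the row end. Newer headers spell the shuffle
     * wasm_i32x4_shuffle; the listed kernels use the older v32x4 name. */
    static inline v128_t odd_columns_masked(v128_t vx89AB, v128_t vxCDEF, v128_t vmask_odd) {
      const v128_t vx9BDF = wasm_v32x4_shuffle(vx89AB, vxCDEF, 1, 3, 5, 7);
      return wasm_v128_and(vmask_odd, vx9BDF);
    }
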
3x3s2p1-minmax-neon-1x4-acc3.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4_acc3)
  definition at line 35 (local):
    const uint32x4_t vmask_odd = vld1q_u32(params->neon.mask_odd);
  uses at lines 117, 119, 121, one per input row (vi0x9BDF through vi2x9BDF):
    const float32x4_t vi0x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi0…

3x3s2p1-minmax-neon-1x4-acc2.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4_acc2)
  definition at line 35 (local):
    const uint32x4_t vmask_odd = vld1q_u32(params->neon.mask_odd);
  uses at lines 116, 118, 120, one per input row (vi0x9BDF through vi2x9BDF):
    const float32x4_t vi0x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi0…

3x3s2p1-minmax-neonfma-1x4-acc2.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_1x4_acc2)
  definition at line 35 (local):
    const uint32x4_t vmask_odd = vld1q_u32(params->neon.mask_odd);
  uses at lines 116, 118, 120, one per input row (vi0x9BDF through vi2x9BDF):
    const float32x4_t vi0x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi0…

3x3s2p1-minmax-neon-1x4.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4)
  definition at line 35 (local):
    const uint32x4_t vmask_odd = vld1q_u32(params->neon.mask_odd);
  uses at lines 115, 117, 119, one per input row (vi0x9BDF through vi2x9BDF):
    const float32x4_t vi0x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi0…

3x3s2p1-minmax-neon-1x4-acc4.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4_acc4)
  definition at line 35 (local):
    const uint32x4_t vmask_odd = vld1q_u32(params->neon.mask_odd);
  uses at lines 118, 120, 122, one per input row (vi0x9BDF through vi2x9BDF):
    const float32x4_t vi0x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi0…

3x3s2p1-minmax-neonfma-1x4-acc3.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_1x4_acc3)
  definition at line 35 (local):
    const uint32x4_t vmask_odd = vld1q_u32(params->neon.mask_odd);
  uses at lines 117, 119, 121, one per input row (vi0x9BDF through vi2x9BDF):
    const float32x4_t vi0x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi0…

3x3s2p1-minmax-neonfma-1x4.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_1x4)
  definition at line 35 (local):
    const uint32x4_t vmask_odd = vld1q_u32(params->neon.mask_odd);
  uses at lines 115, 117, 119, one per input row (vi0x9BDF through vi2x9BDF):
    const float32x4_t vi0x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi0…

3x3s2p1-minmax-neonfma-1x4-acc4.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_1x4_acc4)
  definition at line 35 (local):
    const uint32x4_t vmask_odd = vld1q_u32(params->neon.mask_odd);
  uses at lines 118, 120, 122, one per input row (vi0x9BDF through vi2x9BDF):
    const float32x4_t vi0x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi0…

3x3s2p1-minmax-wasmsimd-arm-splat-3x4.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4)
  definition at line 36 (local):
    const v128_t vmask_odd = wasm_v128_load(params->scalar.mask_odd);
  uses at lines 222, 224, 226, 228, 230, 232, 234, one per input row (vi0x9BDF through vi6x9BDF):
    const v128_t vi0x9BDF = wasm_v128_and(vmask_odd, wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 1, 3, 5, …

3x3s2p1-minmax-wasmsimd-arm-loadsplat-3x4.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_3x4)
  definition at line 36 (local):
    const v128_t vmask_odd = wasm_v128_load(params->scalar.mask_odd);
  uses at lines 232, 234, 236, 238, 240, 242, 244, one per input row (vi0x9BDF through vi6x9BDF):
    const v128_t vi0x9BDF = wasm_v128_and(vmask_odd, wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 1, 3, 5, …

3x3s2p1-minmax-wasmsimd-x86-loadsplat-3x4.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_3x4)
  definition at line 36 (local):
    const v128_t vmask_odd = wasm_v128_load(params->scalar.mask_odd);
  uses at lines 232, 234, 236, 238, 240, 242, 244, one per input row (vi0x9BDF through vi6x9BDF):
    const v128_t vi0x9BDF = wasm_v128_and(vmask_odd, wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 1, 3, 5, …

3x3s2p1-minmax-sse-6x4.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_6x4)
  definition at line 35 (local):
    const __m128 vmask_odd = _mm_load_ps((const float*) params->sse.mask_odd);
  uses at lines 360, 362, 364, 366, 368, 370, 372, 374, 376, one per input row (vi0x9BDF through vi8x9BDF):
    const __m128 vi0x9BDF = _mm_and_ps(vmask_odd, _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(3, 1…
  [all …]

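The SSE kernels do the odd-column gather with _mm_shuffle_ps and can AND directly, since SSE has a bitwise AND for float vectors. The truncated selector is presumably _MM_SHUFFLE(3, 1, 3, 1), which yields { a[1], a[3], b[1], b[3] }, i.e. columns 9, B, D, F. A minimal sketch under that assumption:

    #include <xmmintrin.h>

    /* The idiom from the use lines above: _MM_SHUFFLE(3, 1, 3, 1) selects
     * elements 1 and 3 of each operand, producing { a[1], a[3], b[1], b[3] },
     * the odd columns of the 8..F span; _mm_and_ps then zeroes lanes past
     * the row end using the precomputed bit mask. */
    static inline __m128 odd_columns_masked(__m128 vx89AB, __m128 vxCDEF, __m128 vmask_odd) {
      const __m128 vx9BDF = _mm_shuffle_ps(vx89AB, vxCDEF, _MM_SHUFFLE(3, 1, 3, 1));
      return _mm_and_ps(vmask_odd, vx9BDF);
    }
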
3x3s2p1-minmax-wasmsimd-arm-splat-4x4.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4)
  definition at line 36 (local):
    const v128_t vmask_odd = wasm_v128_load(params->scalar.mask_odd);
  uses at lines 266, 268, 270, 272, 274, 276, 278, 280, 282, one per input row (vi0x9BDF through vi8x9BDF):
    const v128_t vi0x9BDF = wasm_v128_and(vmask_odd, wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 1, 3, 5, …

3x3s2p1-minmax-sse-4x4.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_4x4)
  definition at line 35 (local):
    const __m128 vmask_odd = _mm_load_ps((const float*) params->sse.mask_odd);
  uses at lines 270, 272, 274, 276, 278, 280, 282, 284, 286, one per input row (vi0x9BDF through vi8x9BDF):
    const __m128 vi0x9BDF = _mm_and_ps(vmask_odd, _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(3, 1…

3x3s2p1-minmax-wasmsimd-x86-splat-4x4.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4)
  definition at line 36 (local):
    const v128_t vmask_odd = wasm_v128_load(params->scalar.mask_odd);
  uses at lines 266, 268, 270, 272, 274, 276, 278, 280, 282, one per input row (vi0x9BDF through vi8x9BDF):
    const v128_t vi0x9BDF = wasm_v128_and(vmask_odd, wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 1, 3, 5, …

3x3s2p1-minmax-wasmsimd-x86-loadsplat-4x4.c  (xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_4x4)
  definition at line 36 (local):
    const v128_t vmask_odd = wasm_v128_load(params->scalar.mask_odd);
  uses at lines 276, 278, 280, 282, 284, 286, 288, 290, 292, one per input row (vi0x9BDF through vi8x9BDF):
    const v128_t vi0x9BDF = wasm_v128_and(vmask_odd, wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 1, 3, 5, …