/external/XNNPACK/src/f32-dwconv2d-chw/gen/

D | 3x3p1-minmax-neon-1x4-acc3.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc3():
     60  float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4;  [local]
     73  float32x4_t vo0p1 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw4567), 1);
     78  const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
     88  vi1x0123 = vi1x4567;
     92  const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);
    102  vi1x4567 = vi1x89AB;
    121  vi1x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x4567)));
    126  float32x4_t vo0p1 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw4567), 1);
    131  const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
    142  const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1);

D | 3x3p1-minmax-neonfma-1x4-acc3.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc3():
     60  float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4;  [local]
     73  float32x4_t vo0p1 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw4567), 1);
     78  const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
     88  vi1x0123 = vi1x4567;
     92  const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);
    102  vi1x4567 = vi1x89AB;
    121  vi1x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x4567)));
    126  float32x4_t vo0p1 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw4567), 1);
    131  const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
    142  const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1);

D | 3x3p1-minmax-neonfma-1x4-acc2.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc2():
     60  float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4;  [local]
     73  float32x4_t vo0p1 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw4567), 1);
     78  const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
     88  vi1x0123 = vi1x4567;
     92  const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);
    102  vi1x4567 = vi1x89AB;
    120  vi1x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x4567)));
    125  float32x4_t vo0p1 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw4567), 1);
    130  const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
    141  const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1);

D | 3x3p1-minmax-neonfma-1x4.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4():
     60  float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4;  [local]
     73  vo0p0 = vfmaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw4567), 1);
     78  const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
     88  vi1x0123 = vi1x4567;
     92  const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);
    102  vi1x4567 = vi1x89AB;
    119  vi1x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x4567)));
    124  vo0p0 = vfmaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw4567), 1);
    129  const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
    140  const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1);

D | 3x3p1-minmax-neon-1x4.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4():
     60  float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4;  [local]
     73  vo0p0 = vmlaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw4567), 1);
     78  const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
     88  vi1x0123 = vi1x4567;
     92  const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);
    102  vi1x4567 = vi1x89AB;
    119  vi1x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x4567)));
    124  vo0p0 = vmlaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw4567), 1);
    129  const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
    140  const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1);

D | 3x3p1-minmax-neon-1x4-acc2.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc2():
     60  float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4;  [local]
     73  float32x4_t vo0p1 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw4567), 1);
     78  const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
     88  vi1x0123 = vi1x4567;
     92  const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);
    102  vi1x4567 = vi1x89AB;
    120  vi1x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x4567)));
    125  float32x4_t vo0p1 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw4567), 1);
    130  const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
    141  const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1);

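Every NEON entry above follows the same row-handling pattern: keep three
consecutive 4-float windows of input row 1 in registers (vi1x0123, vi1x4567,
vi1x89AB), build the shifted views with vextq_f32, multiply-accumulate them
against the middle filter row, then rotate the registers (vi1x0123 = vi1x4567;
vi1x4567 = vi1x89AB) so the main loop issues only one fresh load per iteration.
In the remainder block, vi1x4567 is masked with vandq_u32 so lanes past the row
end contribute zero, and vzero stands in for the next window. The _accN
variants differ only in splitting the sum across N partial accumulators (vo0p0,
vo0p1, ...) to shorten the dependency chain. Below is a minimal sketch of one
such step, assuming vw4567 packs the taps as k10,k11,k12,k20; the lane choices
match the listing, but the helper name and signature are illustrative, not
XNNPACK's:

    #include <arm_neon.h>

    // Row-1 contribution of a 3x3 filter to 4 adjacent outputs.
    static float32x4_t row1_step(float32x4_t vacc,
                                 float32x4_t vi1x0123,  // previous 4 inputs
                                 float32x4_t vi1x4567,  // current 4 inputs
                                 float32x4_t vi1x89AB,  // next 4 inputs
                                 float32x4_t vw4567) {  // k10,k11,k12,k20
      // Center tap k11: lane 1 of the low half, as on lines 73/126 above.
      vacc = vmlaq_lane_f32(vacc, vi1x4567, vget_low_f32(vw4567), 1);
      // Left neighbors (elements 3..6): splice in the previous window.
      const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
      vacc = vmlaq_lane_f32(vacc, vi1x3456, vget_low_f32(vw4567), 0);
      // Right neighbors (elements 5..8): splice in the next window.
      const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);
      vacc = vmlaq_lane_f32(vacc, vi1x5678, vget_high_f32(vw4567), 0);
      return vacc;
    }
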
D | 3x3p1-minmax-wasmsimd-arm-splat-1x4-acc3.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc3():
     61  v128_t vi1x4567 = wasm_v128_load(i1); i1 += 4;  [local]
     74  v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1));
     79  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
     89  vi1x0123 = vi1x4567;
     93  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);
    103  vi1x4567 = vi1x89AB;
    121  vi1x4567 = wasm_v128_and(vmask, vi1x4567);
    126  v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1));
    131  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
    142  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);

D | 3x3p1-minmax-wasmsimd-arm-splat-1x4-acc4.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc4():
     61  v128_t vi1x4567 = wasm_v128_load(i1); i1 += 4;  [local]
     74  v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1));
     79  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
     89  vi1x0123 = vi1x4567;
     93  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);
    103  vi1x4567 = vi1x89AB;
    122  vi1x4567 = wasm_v128_and(vmask, vi1x4567);
    127  v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1));
    132  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
    143  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);

D | 3x3p1-minmax-wasmsimd-x86-loadsplat-1x4-acc3.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3():
     73  v128_t vi1x4567 = wasm_v128_load(i1);  [local]
     88  v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, vk11);
     92  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
    100  vi1x0123 = vi1x4567;
    104  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);
    112  vi1x4567 = vi1x89AB;
    129  vi1x4567 = wasm_v128_and(vmask, vi1x4567);
    133  v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, vk11);
    137  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
    146  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);

D | 3x3p1-minmax-wasmsimd-arm-loadsplat-1x4-acc2.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2():
     73  v128_t vi1x4567 = wasm_v128_load(i1);  [local]
     88  v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, vk11);
     92  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
    100  vi1x0123 = vi1x4567;
    104  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);
    112  vi1x4567 = vi1x89AB;
    128  vi1x4567 = wasm_v128_and(vmask, vi1x4567);
    132  v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, vk11);
    136  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
    145  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);

D | 3x3p1-minmax-wasmsimd-x86-loadsplat-1x4.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4():
     73  v128_t vi1x4567 = wasm_v128_load(i1);  [local]
     88  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk11));
     92  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
    100  vi1x0123 = vi1x4567;
    104  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);
    112  vi1x4567 = vi1x89AB;
    127  vi1x4567 = wasm_v128_and(vmask, vi1x4567);
    131  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk11));
    135  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
    144  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);

D | 3x3p1-minmax-wasmsimd-x86-splat-1x4-acc4.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc4():
     61  v128_t vi1x4567 = wasm_v128_load(i1); i1 += 4;  [local]
     74  v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1));
     79  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
     89  vi1x0123 = vi1x4567;
     93  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);
    103  vi1x4567 = vi1x89AB;
    122  vi1x4567 = wasm_v128_and(vmask, vi1x4567);
    127  v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1));
    132  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
    143  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);

D | 3x3p1-minmax-wasmsimd-arm-splat-1x4.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4():
     61  v128_t vi1x4567 = wasm_v128_load(i1); i1 += 4;  [local]
     74  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)));
     79  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
     89  vi1x0123 = vi1x4567;
     93  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);
    103  vi1x4567 = vi1x89AB;
    119  vi1x4567 = wasm_v128_and(vmask, vi1x4567);
    124  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)));
    129  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
    140  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);

D | 3x3p1-minmax-wasmsimd-x86-loadsplat-1x4-acc2.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2():
     73  v128_t vi1x4567 = wasm_v128_load(i1);  [local]
     88  v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, vk11);
     92  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
    100  vi1x0123 = vi1x4567;
    104  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);
    112  vi1x4567 = vi1x89AB;
    128  vi1x4567 = wasm_v128_and(vmask, vi1x4567);
    132  v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, vk11);
    136  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
    145  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);

D | 3x3p1-minmax-wasmsimd-x86-splat-1x4-acc2.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2():
     61  v128_t vi1x4567 = wasm_v128_load(i1); i1 += 4;  [local]
     74  v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1));
     79  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
     89  vi1x0123 = vi1x4567;
     93  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);
    103  vi1x4567 = vi1x89AB;
    120  vi1x4567 = wasm_v128_and(vmask, vi1x4567);
    125  v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1));
    130  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
    141  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);

D | 3x3p1-minmax-wasmsimd-arm-splat-1x4-acc2.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc2():
     61  v128_t vi1x4567 = wasm_v128_load(i1); i1 += 4;  [local]
     74  v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1));
     79  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
     89  vi1x0123 = vi1x4567;
     93  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);
    103  vi1x4567 = vi1x89AB;
    120  vi1x4567 = wasm_v128_and(vmask, vi1x4567);
    125  v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1));
    130  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
    141  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);

D | 3x3p1-minmax-wasmsimd-x86-splat-1x4-acc3.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc3():
     61  v128_t vi1x4567 = wasm_v128_load(i1); i1 += 4;  [local]
     74  v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1));
     79  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
     89  vi1x0123 = vi1x4567;
     93  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);
    103  vi1x4567 = vi1x89AB;
    121  vi1x4567 = wasm_v128_and(vmask, vi1x4567);
    126  v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1));
    131  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
    142  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);

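The WASM SIMD entries mirror the NEON pattern with wasm_v32x4_shuffle, which
concatenates its two operands (lanes 0..3 and 4..7) and selects any four
lanes: indices 3,4,5,6 give the left-shifted window and 1,2,3,4 the
right-shifted one. The splat variants re-broadcast a weight lane at each use
(wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)); the loadsplat variants
broadcast each tap once up front into registers such as vk11. A sketch of the
loadsplat flavor follows, with vk10 and vk12 assumed by analogy to the vk11
that the listing shows:

    #include <wasm_simd128.h>

    // Row-1 contribution of a 3x3 filter to 4 adjacent outputs.
    static v128_t row1_step(v128_t vacc, v128_t vi1x0123, v128_t vi1x4567,
                            v128_t vi1x89AB,
                            v128_t vk10, v128_t vk11, v128_t vk12) {
      // Center tap.
      vacc = wasm_f32x4_add(vacc, wasm_f32x4_mul(vi1x4567, vk11));
      // Left neighbors: elements 3,4,5,6 of the concatenated pair.
      const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
      vacc = wasm_f32x4_add(vacc, wasm_f32x4_mul(vi1x3456, vk10));
      // Right neighbors: elements 1,2,3,4 of the concatenated pair.
      const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);
      vacc = wasm_f32x4_add(vacc, wasm_f32x4_mul(vi1x5678, vk12));
      return vacc;
    }
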
D | 3x3p1-minmax-ssse3-1x4.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4():
     68  __m128 vi1x4567 = _mm_loadu_ps(i1);  [local]
     83  vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk11));
     87  const __m128 vi1x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi1x4567), _mm_castps_si128(vi1x0123), 12));
     95  vi1x0123 = vi1x4567;
     99  const __m128 vi1x5678 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi1x89AB), _mm_castps_si128(vi1x4567), 4));
    107  vi1x4567 = vi1x89AB;
    123  vi1x4567 = _mm_and_ps(vmask, vi1x4567);
    127  vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk11));
    131  const __m128 vi1x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi1x4567), _mm_castps_si128(vi1x0123), 12));
    140  const __m128 vi1x5678 = _mm_castsi128_ps(_mm_alignr_epi8(vzero, _mm_castps_si128(vi1x4567), 4));

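The SSSE3 variant builds the same shifted windows with PALIGNR on the integer
view of the registers: _mm_alignr_epi8(hi, lo, n) shifts the concatenated
32-byte pair right by n bytes, so n = 12 yields elements 3,4,5,6 and n = 4
yields elements 5,6,7,8. A self-contained sketch of just that reconstruction
(helper names are illustrative):

    #include <tmmintrin.h>

    // Elements 3,4,5,6: top 4 bytes of the previous window followed by the
    // bottom 12 bytes of the current one.
    static __m128 window3456(__m128 vi1x0123, __m128 vi1x4567) {
      return _mm_castsi128_ps(_mm_alignr_epi8(
          _mm_castps_si128(vi1x4567), _mm_castps_si128(vi1x0123), 12));
    }

    // Elements 5,6,7,8: top 12 bytes of the current window followed by the
    // bottom 4 bytes of the next one.
    static __m128 window5678(__m128 vi1x4567, __m128 vi1x89AB) {
      return _mm_castsi128_ps(_mm_alignr_epi8(
          _mm_castps_si128(vi1x89AB), _mm_castps_si128(vi1x4567), 4));
    }
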
D | 3x3p1-minmax-neonfma-1x4-acc4.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc4():
     60  float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4;  [local]
     73  float32x4_t vo0p1 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw4567), 1);
     78  const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
     88  vi1x0123 = vi1x4567;
     92  const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);
    102  vi1x4567 = vi1x89AB;
    122  vi1x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x4567)));
    127  float32x4_t vo0p1 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw4567), 1);
    132  const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
    143  const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1);

D | 3x3p1-minmax-wasmsimd-arm-loadsplat-1x4-acc3.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3():
     73  v128_t vi1x4567 = wasm_v128_load(i1);  [local]
     88  v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, vk11);
     92  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
    100  vi1x0123 = vi1x4567;
    104  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);
    112  vi1x4567 = vi1x89AB;
    129  vi1x4567 = wasm_v128_and(vmask, vi1x4567);
    133  v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, vk11);
    137  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
    146  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);

D | 3x3p1-minmax-wasmsimd-arm-loadsplat-1x4.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4():
     73  v128_t vi1x4567 = wasm_v128_load(i1);  [local]
     88  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk11));
     92  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
    100  vi1x0123 = vi1x4567;
    104  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);
    112  vi1x4567 = vi1x89AB;
    127  vi1x4567 = wasm_v128_and(vmask, vi1x4567);
    131  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk11));
    135  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
    144  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);

D | 3x3p1-minmax-neon-1x4-acc4.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc4():
     60  float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4;  [local]
     73  float32x4_t vo0p1 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw4567), 1);
     78  const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
     88  vi1x0123 = vi1x4567;
     92  const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);
    102  vi1x4567 = vi1x89AB;
    122  vi1x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x4567)));
    127  float32x4_t vo0p1 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw4567), 1);
    132  const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
    143  const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1);

D | 3x3p1-minmax-wasmsimd-x86-splat-1x4.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4():
     61  v128_t vi1x4567 = wasm_v128_load(i1); i1 += 4;  [local]
     74  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)));
     79  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
     89  vi1x0123 = vi1x4567;
     93  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);
    103  vi1x4567 = vi1x89AB;
    119  vi1x4567 = wasm_v128_and(vmask, vi1x4567);
    124  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)));
    129  const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
    140  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);

D | 3x3p1-minmax-sse-1x4-acc4.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4_acc4():
     71  __m128 vi1x4567 = _mm_loadu_ps(i1);  [local]
     91  const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
     96  __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk11);
    117  const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
    133  vi1x4567 = vi1x89AB;
    152  vi1x4567 = _mm_and_ps(vmask, vi1x4567);
    158  const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
    163  __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk11);
    181  const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vzero);

D | 3x3p1-minmax-sse-1x4.c | all matches in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4():
     71  __m128 vi1x4567 = _mm_loadu_ps(i1);  [local]
     91  const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
     96  vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk11));
    117  const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
    133  vi1x4567 = vi1x89AB;
    149  vi1x4567 = _mm_and_ps(vmask, vi1x4567);
    155  const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
    160  vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk11));
    178  const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vzero);

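The two plain-SSE kernels cannot use PALIGNR, so they compose each window from
a rotate (_mm_shuffle_ps) plus a lane-0 patch (_mm_move_ss), which is what the
vi1x7456 and vi1x8567 temporaries above are for. A sketch under that reading
(the vi1x3012 temporary is inferred for illustration; only vi1x7456 and
vi1x8567 appear in the listing):

    #include <xmmintrin.h>

    // Elements 5,6,7,8 from the current and next windows.
    static __m128 window5678(__m128 vi1x4567, __m128 vi1x89AB) {
      // [8,5,6,7]: replace lane 0 of the current window with element 8.
      const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
      // Rotate left by one lane: [5,6,7,8].
      return _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
    }

    // Elements 3,4,5,6 from the previous and current windows.
    static __m128 window3456(__m128 vi1x0123, __m128 vi1x4567) {
      // Rotate each window right by one lane: [7,4,5,6] and [3,0,1,2].
      const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x3012 = _mm_shuffle_ps(vi1x0123, vi1x0123, _MM_SHUFFLE(2, 1, 0, 3));
      // Pull element 3 into lane 0: [3,4,5,6].
      return _mm_move_ss(vi1x7456, vi1x3012);
    }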