Home
last modified time | relevance | path

Searched refs:vo0p0 (Results 1 – 25 of 380) sorted by relevance

12345678910>>...16

/external/XNNPACK/src/f16-dwconv2d-chw/gen/
D5x5p2-minmax-neonfp16arith-1x4.c76 float16x4_t vo0p0 = vdup_laneq_f16(vw01234567, 0); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4() local
84 vo0p0 = vfma_laneq_f16(vo0p0, vi0x4567, vw01234567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4()
86 vo0p0 = vfma_laneq_f16(vo0p0, vi1x4567, vw89ABCDEF, 0); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4()
88 vo0p0 = vfma_laneq_f16(vo0p0, vi2x4567, vw89ABCDEF, 5); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4()
90 vo0p0 = vfma_laneq_f16(vo0p0, vi3x4567, vwGHIJKLMN, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4()
92 vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4()
100 vo0p0 = vfma_laneq_f16(vo0p0, vi0x3456, vw01234567, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4()
102 vo0p0 = vfma_laneq_f16(vo0p0, vi1x3456, vw01234567, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4()
104 vo0p0 = vfma_laneq_f16(vo0p0, vi2x3456, vw89ABCDEF, 4); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4()
106 vo0p0 = vfma_laneq_f16(vo0p0, vi3x3456, vwGHIJKLMN, 1); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4()
[all …]
D5x5s2p2-minmax-neonfp16arith-1x4.c90 float16x4_t vo0p0 = vdup_laneq_f16(vw01234567, 0); in xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x4() local
92 vo0p0 = vfma_laneq_f16(vo0p0, vi0x8ACE9BDF.val[0], vw01234567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x4()
94 vo0p0 = vfma_laneq_f16(vo0p0, vi1x8ACE9BDF.val[0], vw89ABCDEF, 0); in xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x4()
96 vo0p0 = vfma_laneq_f16(vo0p0, vi2x8ACE9BDF.val[0], vw89ABCDEF, 5); in xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x4()
98 vo0p0 = vfma_laneq_f16(vo0p0, vi3x8ACE9BDF.val[0], vwGHIJKLMN, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x4()
100 vo0p0 = vfma_laneq_f16(vo0p0, vi4x8ACE9BDF.val[0], vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x4()
102 vo0p0 = vfma_laneq_f16(vo0p0, vi0x8ACE9BDF.val[1], vw01234567, 4); in xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x4()
104 vo0p0 = vfma_laneq_f16(vo0p0, vi1x8ACE9BDF.val[1], vw89ABCDEF, 1); in xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x4()
106 vo0p0 = vfma_laneq_f16(vo0p0, vi2x8ACE9BDF.val[1], vw89ABCDEF, 6); in xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x4()
108 vo0p0 = vfma_laneq_f16(vo0p0, vi3x8ACE9BDF.val[1], vwGHIJKLMN, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x4()
[all …]
D5x5p2-minmax-neonfp16arith-2x4.c84 float16x4_t vo0p0 = vdup_laneq_f16(vw01234567, 0); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4() local
94 vo0p0 = vfma_laneq_f16(vo0p0, vi0x4567, vw01234567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4()
97 vo0p0 = vfma_laneq_f16(vo0p0, vi1x4567, vw89ABCDEF, 0); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4()
100 vo0p0 = vfma_laneq_f16(vo0p0, vi2x4567, vw89ABCDEF, 5); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4()
103 vo0p0 = vfma_laneq_f16(vo0p0, vi3x4567, vwGHIJKLMN, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4()
106 vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4()
116 vo0p0 = vfma_laneq_f16(vo0p0, vi0x3456, vw01234567, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4()
119 vo0p0 = vfma_laneq_f16(vo0p0, vi1x3456, vw01234567, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4()
122 vo0p0 = vfma_laneq_f16(vo0p0, vi2x3456, vw89ABCDEF, 4); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4()
125 vo0p0 = vfma_laneq_f16(vo0p0, vi3x3456, vwGHIJKLMN, 1); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4()
[all …]
/external/XNNPACK/src/f32-dwconv2d-chw/gen/
D5x5p2-minmax-neonfma-1x4.c78 float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() local
86 vo0p0 = vfmaq_lane_f32(vo0p0, vi0x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
88 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
90 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
92 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
94 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
102 vo0p0 = vfmaq_lane_f32(vo0p0, vi0x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
104 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
106 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
108 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
[all …]
D5x5p2-minmax-neon-1x4.c78 float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() local
86 vo0p0 = vmlaq_lane_f32(vo0p0, vi0x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
88 vo0p0 = vmlaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
90 vo0p0 = vmlaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
92 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
94 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
102 vo0p0 = vmlaq_lane_f32(vo0p0, vi0x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
104 vo0p0 = vmlaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
106 vo0p0 = vmlaq_lane_f32(vo0p0, vi2x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
108 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-1x4.c105 v128_t vo0p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4() local
113 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
115 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
117 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
119 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
121 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
129 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
131 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
133 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
135 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-1x4.c105 v128_t vo0p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4() local
113 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
115 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
117 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
119 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
121 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
129 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
131 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
133 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
135 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-splat-1x4.c79 v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() local
87vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
89vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
91vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
93vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
95vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
103vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
105vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
107vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
109vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
[all …]
D5x5p2-minmax-wasmsimd-x86-splat-1x4.c79 v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() local
87vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
89vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
91vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
93vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
95vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
103vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
105vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
107vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
109vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
[all …]
D5x5p2-minmax-sse-1x4.c102 __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4() local
103 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
104 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
105 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
106 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
131 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
132 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
133 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
134 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
135 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
[all …]
D5x5s2p2-minmax-neon-1x4.c92 float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4() local
94 vo0p0 = vmlaq_lane_f32(vo0p0, vi0x8ACE9BDF.val[0], vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4()
96 vo0p0 = vmlaq_lane_f32(vo0p0, vi1x8ACE9BDF.val[0], vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4()
98 vo0p0 = vmlaq_lane_f32(vo0p0, vi2x8ACE9BDF.val[0], vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4()
100 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x8ACE9BDF.val[0], vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4()
102 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x8ACE9BDF.val[0], vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4()
104 vo0p0 = vmlaq_lane_f32(vo0p0, vi0x8ACE9BDF.val[1], vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4()
106 vo0p0 = vmlaq_lane_f32(vo0p0, vi1x8ACE9BDF.val[1], vget_low_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4()
108 vo0p0 = vmlaq_lane_f32(vo0p0, vi2x8ACE9BDF.val[1], vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4()
110 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x8ACE9BDF.val[1], vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4()
[all …]
D5x5s2p2-minmax-neonfma-1x4.c92 float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4() local
94 vo0p0 = vfmaq_lane_f32(vo0p0, vi0x8ACE9BDF.val[0], vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4()
96 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x8ACE9BDF.val[0], vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4()
98 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x8ACE9BDF.val[0], vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4()
100 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x8ACE9BDF.val[0], vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4()
102 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x8ACE9BDF.val[0], vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4()
104 vo0p0 = vfmaq_lane_f32(vo0p0, vi0x8ACE9BDF.val[1], vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4()
106 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x8ACE9BDF.val[1], vget_low_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4()
108 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x8ACE9BDF.val[1], vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4()
110 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x8ACE9BDF.val[1], vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4()
[all …]
D5x5p2-minmax-scalar-1x1.c111 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1() local
112 vo0p0 += vi1x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1()
113 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1()
114 vo0p0 += vi3x0 * vk30; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1()
115 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1()
123 vo0p0 += vi0x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1()
124 vo0p0 += vi1x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1()
125 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1()
126 vo0p0 += vi3x1 * vk31; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1()
127 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1()
[all …]
D5x5s2p2-minmax-scalar-1x1.c123 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1() local
124 vo0p0 += vi1x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1()
125 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1()
126 vo0p0 += vi3x0 * vk30; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1()
127 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1()
135 vo0p0 += vi0x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1()
136 vo0p0 += vi1x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1()
137 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1()
138 vo0p0 += vi3x1 * vk31; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1()
139 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1()
[all …]
D5x5s2p2-minmax-wasmsimd-x86-loadsplat-1x4.c134 v128_t vo0p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4() local
136 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x8ACE, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4()
138 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4()
140 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4()
142 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4()
144 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x8ACE, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4()
146 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x9BDF, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4()
148 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x9BDF, vk13)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4()
150 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4()
152 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x9BDF, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4()
[all …]
D5x5s2p2-minmax-wasmsimd-arm-loadsplat-1x4.c134 v128_t vo0p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4() local
136 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x8ACE, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4()
138 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4()
140 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4()
142 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4()
144 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x8ACE, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4()
146 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x9BDF, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4()
148 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x9BDF, vk13)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4()
150 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4()
152 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x9BDF, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4()
[all …]
D5x5p2-minmax-neonfma-2x4.c86 float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() local
96 vo0p0 = vfmaq_lane_f32(vo0p0, vi0x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
99 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
102 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
105 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
108 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
118 vo0p0 = vfmaq_lane_f32(vo0p0, vi0x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
121 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
124 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
127 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
[all …]
D5x5p2-minmax-neon-2x4.c86 float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() local
96 vo0p0 = vmlaq_lane_f32(vo0p0, vi0x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
99 vo0p0 = vmlaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
102 vo0p0 = vmlaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
105 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
108 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
118 vo0p0 = vmlaq_lane_f32(vo0p0, vi0x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
121 vo0p0 = vmlaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
124 vo0p0 = vmlaq_lane_f32(vo0p0, vi2x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
127 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
[all …]
D5x5s2p2-minmax-wasmsimd-arm-splat-1x4.c114 v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4() local
116vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4()
118vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4()
120vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4()
122vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4()
124vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4()
126vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4()
128vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4()
130vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4()
132vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x9BDF, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4()
[all …]
D5x5s2p2-minmax-wasmsimd-x86-splat-1x4.c114 v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4() local
116vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4()
118vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4()
120vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4()
122vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4()
124vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4()
126vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4()
128vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4()
130vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4()
132vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x9BDF, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-2x4.c113 v128_t vo0p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() local
123 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
126 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
129 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
132 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
135 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
145 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
148 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
151 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
154 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-2x4.c113 v128_t vo0p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() local
123 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
126 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
129 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
132 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
135 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
145 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
148 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
151 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
154 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-splat-2x4.c87 v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() local
97vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
100vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
103vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
106vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
109vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
119vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
122vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
125vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
128vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
[all …]
D5x5p2-minmax-wasmsimd-x86-splat-2x4.c87 v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() local
97vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
100vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
103vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
106vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
109vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
119vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
122vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
125vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
128vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
[all …]
D5x5s2p2-minmax-sse-1x4.c132 __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4() local
133 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4()
134 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x8ACE, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4()
135 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4()
136 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x8ACE, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4()
144 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4()
145 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x9BDF, vk13)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4()
146 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x9BDF, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4()
147 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x9BDF, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4()
148 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x9BDF, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4()
[all …]

12345678910>>...16