Home
last modified time | relevance | path

Searched refs:vo3p0 (Results 1 – 25 of 58) sorted by relevance

123

/external/XNNPACK/src/f32-dwconv2d-chw/gen/
D5x5p2-minmax-neon-4x4.c105 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local
119 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
124 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
129 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
134 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
139 vo3p0 = vmlaq_lane_f32(vo3p0, vi7x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
153 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
158 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
163 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
168 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
[all …]
D5x5p2-minmax-neonfma-4x4.c105 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local
119 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
124 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
129 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
134 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
139 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
153 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
158 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
163 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
168 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-4x4.c134 v128_t vo3p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local
148 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
153 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
158 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
163 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
168 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
182 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
187 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
192 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
197 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-4x4.c134 v128_t vo3p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local
148 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
153 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
158 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
163 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
168 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
182 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
187 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
192 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
197 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
[all …]
D5x5p2-minmax-sse-4x4.c132 __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local
136 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
140 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
144 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
148 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
188 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
192 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
196 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
200 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
204 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
[all …]
D5x5p2-minmax-wasmsimd-x86-splat-4x4.c108 v128_t vo3p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() local
122vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4()
127vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4()
132vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4()
137vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4()
142vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4()
156vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4()
161vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4()
166vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4()
171vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-5x4.c142 v128_t vo3p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local
158 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
164 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
170 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
176 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
182 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
198 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
204 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
210 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
216 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
[all …]
D5x5p2-minmax-neonfma-5x4.c113 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() local
129 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
135 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
141 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
147 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
153 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
169 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
175 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
181 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
187 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
[all …]
D5x5p2-minmax-neon-5x4.c113 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() local
129 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
135 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
141 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
147 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
153 vo3p0 = vmlaq_lane_f32(vo3p0, vi7x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
169 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
175 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
181 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
187 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-5x4.c142 v128_t vo3p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local
158 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
164 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
170 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
176 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
182 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
198 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
204 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
210 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
216 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-splat-4x4.c108 v128_t vo3p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() local
122vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4()
127vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4()
132vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4()
137vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4()
142vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4()
156vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4()
161vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4()
166vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4()
171vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4()
[all …]
D5x5p2-minmax-sse-5x4.c141 __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local
146 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
151 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
156 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
161 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
206 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
211 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
216 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
221 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
226 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
[all …]
D5x5p2-minmax-wasmsimd-x86-splat-5x4.c116 v128_t vo3p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() local
132vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
138vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
144vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
150vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
156vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
172vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
178vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
184vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
190vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-splat-5x4.c116 v128_t vo3p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() local
132vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
138vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
144vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
150vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
156vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
172vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
178vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
184vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
190vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
[all …]
D3x3p1-minmax-neonfma-4x4.c92 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() local
104 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x4567, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4()
109 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4()
114 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4()
126 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x3456, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4()
131 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4()
136 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4()
155 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x5678, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4()
160 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x5678, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4()
165 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4()
[all …]
D3x3p1-minmax-neon-4x4.c92 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4() local
104 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x4567, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4()
109 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4()
114 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4()
126 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x3456, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4()
131 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4()
136 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4()
155 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x5678, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4()
160 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x5678, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4()
165 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4()
[all …]
D3x3p1-minmax-wasmsimd-x86-splat-4x4.c95 v128_t vo3p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() local
107vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4()
112vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4()
117vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4()
129vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4()
134vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4()
139vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4()
158vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4()
163vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4()
168vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4()
[all …]
D3x3p1-minmax-wasmsimd-x86-loadsplat-4x4.c125 v128_t vo3p0 = wasm_f32x4_add(vbias, wasm_f32x4_mul(vi3x4567, vk01)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() local
129 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, vk11)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
133 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
145 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
149 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
153 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
172 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
176 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
180 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
193 v128_t vo3 = wasm_v128_bitselect(vmin, vo3p0, wasm_f32x4_lt(vo3p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
[all …]
D5x5p2-minmax-neon-4x4-acc2.c105 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local
124 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
129 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
139 vo3p0 = vmlaq_lane_f32(vo3p0, vi7x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
158 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
168 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
195 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x2345, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
205 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
215 vo3p0 = vmlaq_lane_f32(vo3p0, vi7x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
234 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x5678, vget_low_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
[all …]
D5x5p2-minmax-neonfma-4x4-acc2.c105 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local
124 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
129 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
139 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
158 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
168 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
195 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x2345, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
205 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
215 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
234 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x5678, vget_low_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
[all …]
D3x3p1-minmax-neonfma-5x4.c100 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() local
114 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x4567, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
120 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
126 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
140 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x3456, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
146 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
152 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
174 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x5678, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
180 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x5678, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
186 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
[all …]
D3x3s2p1-minmax-neonfma-4x4.c107 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_4x4() local
122 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x8ACE9BDF.val[0], vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_4x4()
127 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x8ACE9BDF.val[0], vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_4x4()
132 vo3p0 = vfmaq_lane_f32(vo3p0, vi8x8ACE9BDF.val[0], vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_4x4()
156 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x7BDF, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_4x4()
161 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x7BDF, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_4x4()
166 vo3p0 = vfmaq_lane_f32(vo3p0, vi8x7BDF, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_4x4()
171 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x8ACE9BDF.val[1], vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_4x4()
176 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x8ACE9BDF.val[1], vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_4x4()
181 vo3p0 = vfmaq_lane_f32(vo3p0, vi8x8ACE9BDF.val[1], vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_4x4()
[all …]
D3x3p1-minmax-wasmsimd-arm-splat-4x4.c95 v128_t vo3p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() local
107vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4()
112vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4()
117vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4()
129vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4()
134vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4()
139vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4()
158vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4()
163vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4()
168vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4()
[all …]
D3x3p1-minmax-neon-5x4.c100 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() local
114 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x4567, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
120 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
126 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
140 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x3456, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
146 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
152 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
174 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x5678, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
180 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x5678, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
186 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
[all …]
D3x3s2p1-minmax-neon-4x4.c107 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_4x4() local
122 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x8ACE9BDF.val[0], vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_4x4()
127 vo3p0 = vmlaq_lane_f32(vo3p0, vi7x8ACE9BDF.val[0], vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_4x4()
132 vo3p0 = vmlaq_lane_f32(vo3p0, vi8x8ACE9BDF.val[0], vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_4x4()
156 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x7BDF, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_4x4()
161 vo3p0 = vmlaq_lane_f32(vo3p0, vi7x7BDF, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_4x4()
166 vo3p0 = vmlaq_lane_f32(vo3p0, vi8x7BDF, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_4x4()
171 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x8ACE9BDF.val[1], vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_4x4()
176 vo3p0 = vmlaq_lane_f32(vo3p0, vi7x8ACE9BDF.val[1], vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_4x4()
181 vo3p0 = vmlaq_lane_f32(vo3p0, vi8x8ACE9BDF.val[1], vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_4x4()
[all …]

123