/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5p2-minmax-neon-4x4.c | 105 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local 119 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 124 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 129 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 134 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 139 vo3p0 = vmlaq_lane_f32(vo3p0, vi7x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 153 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 158 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 163 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 168 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() [all …]
|
D | 5x5p2-minmax-neonfma-4x4.c | 105 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local 119 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 124 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 129 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 134 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 139 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 153 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 158 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 163 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 168 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-4x4.c | 134 v128_t vo3p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local 148 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 153 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 158 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 163 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 168 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 182 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 187 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 192 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 197 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-4x4.c | 134 v128_t vo3p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local 148 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 153 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 158 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 163 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 168 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 182 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 187 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 192 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 197 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() [all …]
|
D | 5x5p2-minmax-sse-4x4.c | 132 __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local 136 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 140 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 144 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 148 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 188 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 192 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 196 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 200 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 204 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-4x4.c | 108 v128_t vo3p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() local 122 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 127 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 132 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 137 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 142 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 156 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 161 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 166 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 171 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-5x4.c | 142 v128_t vo3p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local 158 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 164 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 170 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 176 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 182 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 198 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 204 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 210 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 216 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() [all …]
|
D | 5x5p2-minmax-neonfma-5x4.c | 113 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() local 129 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 135 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 141 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 147 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 153 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 169 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 175 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 181 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 187 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() [all …]
|
D | 5x5p2-minmax-neon-5x4.c | 113 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() local 129 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 135 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 141 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 147 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 153 vo3p0 = vmlaq_lane_f32(vo3p0, vi7x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 169 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 175 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 181 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 187 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-5x4.c | 142 v128_t vo3p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local 158 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 164 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 170 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 176 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 182 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 198 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 204 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 210 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 216 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-4x4.c | 108 v128_t vo3p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() local 122 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 127 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 132 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 137 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 142 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 156 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 161 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 166 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 171 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() [all …]
|
D | 5x5p2-minmax-sse-5x4.c | 141 __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local 146 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 151 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 156 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 161 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 206 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 211 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 216 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 221 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 226 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-5x4.c | 116 v128_t vo3p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() local 132 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 138 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 144 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 150 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 156 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 172 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 178 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 184 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 190 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-5x4.c | 116 v128_t vo3p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() local 132 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 138 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 144 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 150 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 156 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 172 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 178 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 184 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 190 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() [all …]
|
D | 3x3p1-minmax-neonfma-4x4.c | 92 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() local 104 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x4567, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() 109 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() 114 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() 126 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x3456, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() 131 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() 136 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() 155 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x5678, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() 160 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x5678, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() 165 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() [all …]
|
D | 3x3p1-minmax-neon-4x4.c | 92 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4() local 104 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x4567, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4() 109 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4() 114 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4() 126 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x3456, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4() 131 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4() 136 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4() 155 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x5678, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4() 160 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x5678, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4() 165 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-x86-splat-4x4.c | 95 v128_t vo3p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() local 107 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 112 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 117 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 129 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 134 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 139 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 158 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 163 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 168 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-4x4.c | 125 v128_t vo3p0 = wasm_f32x4_add(vbias, wasm_f32x4_mul(vi3x4567, vk01)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() local 129 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, vk11)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 133 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, vk21)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 145 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 149 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 153 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 172 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 176 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 180 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 193 v128_t vo3 = wasm_v128_bitselect(vmin, vo3p0, wasm_f32x4_lt(vo3p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() [all …]
|
D | 5x5p2-minmax-neon-4x4-acc2.c | 105 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local 124 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 129 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 139 vo3p0 = vmlaq_lane_f32(vo3p0, vi7x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 158 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 168 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 195 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x2345, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 205 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 215 vo3p0 = vmlaq_lane_f32(vo3p0, vi7x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 234 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x5678, vget_low_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() [all …]
|
D | 5x5p2-minmax-neonfma-4x4-acc2.c | 105 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local 124 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 129 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 139 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 158 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 168 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 195 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x2345, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 205 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 215 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 234 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x5678, vget_low_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() [all …]
|
D | 3x3p1-minmax-neonfma-5x4.c | 100 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() local 114 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x4567, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 120 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 126 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 140 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x3456, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 146 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 152 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 174 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x5678, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 180 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x5678, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 186 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() [all …]
|
D | 3x3s2p1-minmax-neonfma-4x4.c | 107 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_4x4() local 122 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x8ACE9BDF.val[0], vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_4x4() 127 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x8ACE9BDF.val[0], vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_4x4() 132 vo3p0 = vfmaq_lane_f32(vo3p0, vi8x8ACE9BDF.val[0], vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_4x4() 156 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x7BDF, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_4x4() 161 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x7BDF, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_4x4() 166 vo3p0 = vfmaq_lane_f32(vo3p0, vi8x7BDF, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_4x4() 171 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x8ACE9BDF.val[1], vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_4x4() 176 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x8ACE9BDF.val[1], vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_4x4() 181 vo3p0 = vfmaq_lane_f32(vo3p0, vi8x8ACE9BDF.val[1], vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_4x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-arm-splat-4x4.c | 95 v128_t vo3p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() local 107 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() 112 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() 117 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() 129 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() 134 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() 139 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() 158 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() 163 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() 168 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() [all …]
|
D | 3x3p1-minmax-neon-5x4.c | 100 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() local 114 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x4567, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 120 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 126 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 140 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x3456, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 146 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 152 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 174 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x5678, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 180 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x5678, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 186 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() [all …]
|
D | 3x3s2p1-minmax-neon-4x4.c | 107 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_4x4() local 122 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x8ACE9BDF.val[0], vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_4x4() 127 vo3p0 = vmlaq_lane_f32(vo3p0, vi7x8ACE9BDF.val[0], vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_4x4() 132 vo3p0 = vmlaq_lane_f32(vo3p0, vi8x8ACE9BDF.val[0], vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_4x4() 156 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x7BDF, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_4x4() 161 vo3p0 = vmlaq_lane_f32(vo3p0, vi7x7BDF, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_4x4() 166 vo3p0 = vmlaq_lane_f32(vo3p0, vi8x7BDF, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_4x4() 171 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x8ACE9BDF.val[1], vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_4x4() 176 vo3p0 = vmlaq_lane_f32(vo3p0, vi7x8ACE9BDF.val[1], vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_4x4() 181 vo3p0 = vmlaq_lane_f32(vo3p0, vi8x8ACE9BDF.val[1], vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_4x4() [all …]
|