/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-3x4-acc2.c | 137 v128_t vo2p1 = wasm_f32x4_mul(vi2x4567, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() local 149 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 165 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 173 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 181 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 204 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 212 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 228 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 236 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 244 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() [all …]
|
D | 5x5p2-minmax-neonfma-3x4-acc2.c | 108 float32x4_t vo2p1 = vmulq_lane_f32(vi2x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() local 120 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() 136 vo2p1 = vfmaq_lane_f32(vo2p1, vi2x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() 144 vo2p1 = vfmaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() 152 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() 175 vo2p1 = vfmaq_lane_f32(vo2p1, vi3x2345, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() 183 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() 199 vo2p1 = vfmaq_lane_f32(vo2p1, vi2x5678, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() 207 vo2p1 = vfmaq_lane_f32(vo2p1, vi4x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() 215 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() [all …]
|
D | 5x5p2-minmax-neon-3x4-acc2.c | 108 float32x4_t vo2p1 = vmulq_lane_f32(vi2x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() local 120 vo2p1 = vmlaq_lane_f32(vo2p1, vi5x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() 136 vo2p1 = vmlaq_lane_f32(vo2p1, vi2x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() 144 vo2p1 = vmlaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() 152 vo2p1 = vmlaq_lane_f32(vo2p1, vi6x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() 175 vo2p1 = vmlaq_lane_f32(vo2p1, vi3x2345, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() 183 vo2p1 = vmlaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() 199 vo2p1 = vmlaq_lane_f32(vo2p1, vi2x5678, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() 207 vo2p1 = vmlaq_lane_f32(vo2p1, vi4x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() 215 vo2p1 = vmlaq_lane_f32(vo2p1, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() [all …]
|
D | 5x5p2-minmax-scalar-3x1-acc2.c | 138 float vo2p1 = vi3x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() local 144 vo2p1 += vi5x0 * vk30; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 159 vo2p1 += vi2x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 165 vo2p1 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 171 vo2p1 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 186 vo2p1 += vi3x2 * vk12; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 192 vo2p1 += vi5x2 * vk32; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 207 vo2p1 += vi2x3 * vk03; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 213 vo2p1 += vi4x3 * vk23; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 219 vo2p1 += vi6x3 * vk43; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() [all …]
|
D | 5x5s2p2-minmax-scalar-3x1-acc2.c | 173 float vo2p1 = vi5x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() local 179 vo2p1 += vi7x0 * vk30; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 196 vo2p1 += vi4x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 202 vo2p1 += vi6x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 208 vo2p1 += vi8x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 225 vo2p1 += vi5x2 * vk12; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 231 vo2p1 += vi7x2 * vk32; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 248 vo2p1 += vi4x3 * vk03; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 254 vo2p1 += vi6x3 * vk23; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 260 vo2p1 += vi8x3 * vk43; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-3x4-acc2.c | 137 v128_t vo2p1 = wasm_f32x4_mul(vi2x4567, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() local 149 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() 165 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() 173 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() 181 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() 204 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() 212 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() 228 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() 236 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() 244 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() [all …]
|
D | 5x5p2-minmax-sse-3x4-acc2.c | 125 __m128 vo2p1 = _mm_mul_ps(vi3x4567, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local 131 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 169 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 175 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 181 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 218 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 224 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 239 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi2x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 245 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 251 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-3x4-acc2.c | 111 v128_t vo2p1 = wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() local 123 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() 139 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() 147 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() 155 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() 178 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() 186 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() 202 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() 210 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() 218 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() [all …]
|
D | 5x5p2-minmax-neon-4x4-acc2.c | 118 float32x4_t vo2p1 = vmulq_lane_f32(vi2x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local 133 vo2p1 = vmlaq_lane_f32(vo2p1, vi5x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 152 vo2p1 = vmlaq_lane_f32(vo2p1, vi2x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 162 vo2p1 = vmlaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 172 vo2p1 = vmlaq_lane_f32(vo2p1, vi6x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 199 vo2p1 = vmlaq_lane_f32(vo2p1, vi3x2345, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 209 vo2p1 = vmlaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 228 vo2p1 = vmlaq_lane_f32(vo2p1, vi2x5678, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 238 vo2p1 = vmlaq_lane_f32(vo2p1, vi4x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 248 vo2p1 = vmlaq_lane_f32(vo2p1, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-3x4-acc2.c | 111 v128_t vo2p1 = wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() local 123 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() 139 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() 147 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() 155 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() 178 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() 186 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() 202 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() 210 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() 218 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() [all …]
|
D | 5x5p2-minmax-neonfma-4x4-acc2.c | 118 float32x4_t vo2p1 = vmulq_lane_f32(vi2x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local 133 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 152 vo2p1 = vfmaq_lane_f32(vo2p1, vi2x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 162 vo2p1 = vfmaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 172 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 199 vo2p1 = vfmaq_lane_f32(vo2p1, vi3x2345, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 209 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 228 vo2p1 = vfmaq_lane_f32(vo2p1, vi2x5678, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 238 vo2p1 = vfmaq_lane_f32(vo2p1, vi4x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 248 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() [all …]
|
D | 5x5s2p2-minmax-neonfma-3x4-acc2.c | 131 float32x4_t vo2p1 = vmulq_lane_f32(vi4x8ACE9BDF.val[0], vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2() local 143 vo2p1 = vfmaq_lane_f32(vo2p1, vi7x8ACE9BDF.val[0], vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2() 151 vo2p1 = vfmaq_lane_f32(vo2p1, vi4x8ACE9BDF.val[1], vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2() 159 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x8ACE9BDF.val[1], vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2() 167 vo2p1 = vfmaq_lane_f32(vo2p1, vi8x8ACE9BDF.val[1], vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2() 194 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x68AC, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2() 202 vo2p1 = vfmaq_lane_f32(vo2p1, vi7x68AC, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2() 239 vo2p1 = vfmaq_lane_f32(vo2p1, vi4x79BD, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2() 247 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x79BD, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2() 255 vo2p1 = vfmaq_lane_f32(vo2p1, vi8x79BD, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2() [all …]
|
D | 5x5s2p2-minmax-neon-3x4-acc2.c | 131 float32x4_t vo2p1 = vmulq_lane_f32(vi4x8ACE9BDF.val[0], vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2() local 143 vo2p1 = vmlaq_lane_f32(vo2p1, vi7x8ACE9BDF.val[0], vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2() 151 vo2p1 = vmlaq_lane_f32(vo2p1, vi4x8ACE9BDF.val[1], vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2() 159 vo2p1 = vmlaq_lane_f32(vo2p1, vi6x8ACE9BDF.val[1], vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2() 167 vo2p1 = vmlaq_lane_f32(vo2p1, vi8x8ACE9BDF.val[1], vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2() 194 vo2p1 = vmlaq_lane_f32(vo2p1, vi5x68AC, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2() 202 vo2p1 = vmlaq_lane_f32(vo2p1, vi7x68AC, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2() 239 vo2p1 = vmlaq_lane_f32(vo2p1, vi4x79BD, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2() 247 vo2p1 = vmlaq_lane_f32(vo2p1, vi6x79BD, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2() 255 vo2p1 = vmlaq_lane_f32(vo2p1, vi8x79BD, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-4x4-acc2.c | 147 v128_t vo2p1 = wasm_f32x4_mul(vi2x4567, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local 162 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 181 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 191 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 201 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 228 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 238 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 257 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 267 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 277 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-4x4-acc2.c | 147 v128_t vo2p1 = wasm_f32x4_mul(vi2x4567, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local 162 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 181 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 191 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 201 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 228 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 238 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 257 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 267 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 277 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-arm-loadsplat-3x4-acc2.c | 191 v128_t vo2p1 = wasm_f32x4_mul(vi4x8ACE, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() local 203 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi7x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() 211 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x9BDF, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() 219 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x9BDF, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() 227 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi8x9BDF, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() 254 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x68AC, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() 262 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi7x68AC, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() 336 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x79BD, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() 344 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x79BD, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() 352 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi8x79BD, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-x86-loadsplat-3x4-acc2.c | 191 v128_t vo2p1 = wasm_f32x4_mul(vi4x8ACE, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() local 203 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi7x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() 211 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x9BDF, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() 219 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x9BDF, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() 227 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi8x9BDF, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() 254 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x68AC, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() 262 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi7x68AC, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() 336 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x79BD, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() 344 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x79BD, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() 352 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi8x79BD, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() [all …]
|
D | 5x5p2-minmax-sse-4x4-acc2.c | 135 __m128 vo2p1 = _mm_mul_ps(vi3x4567, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local 143 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 187 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 195 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 203 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 246 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 254 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 272 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi2x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 280 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 288 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-x86-splat-3x4-acc2.c | 171 v128_t vo2p1 = wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() local 183 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi7x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 191 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 199 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x9BDF, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 207 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi8x9BDF, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 234 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x68AC, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 242 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi7x68AC, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 316 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x79BD, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 324 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x79BD, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 332 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi8x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-arm-splat-3x4-acc2.c | 171 v128_t vo2p1 = wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() local 183 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi7x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 191 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 199 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x9BDF, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 207 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi8x9BDF, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 234 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x68AC, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 242 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi7x68AC, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 316 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x79BD, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 324 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x79BD, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 332 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi8x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() [all …]
|
D | 5x5s2p2-minmax-sse-3x4-acc2.c | 186 __m128 vo2p1 = _mm_mul_ps(vi5x8ACE, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() local 192 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi7x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() 209 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x9BDF, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() 215 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x9BDF, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() 221 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi8x9BDF, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() 257 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x68AC, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() 263 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi7x68AC, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() 345 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x79BD, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() 351 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x79BD, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() 357 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi8x79BD, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-4x4-acc2.c | 121 v128_t vo2p1 = wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() local 136 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 155 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 165 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 175 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 202 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 212 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 231 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 241 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 251 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-4x4-acc2.c | 121 v128_t vo2p1 = wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() local 136 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 155 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 165 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 175 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 202 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 212 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 231 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 241 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 251 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() [all …]
|