/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5p2-minmax-neonfma-1x4-acc2.c | 86 float32x4_t vo0p1 = vmulq_lane_f32(vi0x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() local 92 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 102 vo0p1 = vfmaq_lane_f32(vo0p1, vi0x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 106 vo0p1 = vfmaq_lane_f32(vo0p1, vi2x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 110 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 125 vo0p1 = vfmaq_lane_f32(vo0p1, vi1x2345, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 129 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 139 vo0p1 = vfmaq_lane_f32(vo0p1, vi0x5678, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 143 vo0p1 = vfmaq_lane_f32(vo0p1, vi2x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 147 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() [all …]
|
D | 5x5p2-minmax-neon-1x4-acc2.c | 86 float32x4_t vo0p1 = vmulq_lane_f32(vi0x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() local 92 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 102 vo0p1 = vmlaq_lane_f32(vo0p1, vi0x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 106 vo0p1 = vmlaq_lane_f32(vo0p1, vi2x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 110 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 125 vo0p1 = vmlaq_lane_f32(vo0p1, vi1x2345, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 129 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 139 vo0p1 = vmlaq_lane_f32(vo0p1, vi0x5678, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 143 vo0p1 = vmlaq_lane_f32(vo0p1, vi2x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 147 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc2.c | 115 v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() local 121 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() 131 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() 135 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() 139 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() 154 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() 158 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() 168 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() 172 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() 176 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-1x4-acc2.c | 89 v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2() local 95 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2() 105 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2() 109 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2() 113 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2() 128 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2() 132 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2() 142 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2() 146 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2() 150 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc2.c | 115 v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() local 121 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() 131 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() 135 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() 139 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() 154 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() 158 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() 168 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() 172 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() 176 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-1x4-acc2.c | 89 v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() local 95 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 105 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 109 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 113 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 128 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 132 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 142 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 146 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 150 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() [all …]
|
D | 5x5s2p2-minmax-scalar-1x1-acc2.c | 124 float vo0p1 = vi1x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2() local 126 vo0p1 += vi3x0 * vk30; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2() 135 vo0p1 += vi0x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2() 137 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2() 139 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2() 148 vo0p1 += vi1x2 * vk12; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2() 150 vo0p1 += vi3x2 * vk32; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2() 159 vo0p1 += vi0x3 * vk03; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2() 161 vo0p1 += vi2x3 * vk23; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2() 163 vo0p1 += vi4x3 * vk43; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2() [all …]
|
D | 5x5p2-minmax-scalar-1x1-acc2.c | 112 float vo0p1 = vi1x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2() local 114 vo0p1 += vi3x0 * vk30; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2() 123 vo0p1 += vi0x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2() 125 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2() 127 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2() 136 vo0p1 += vi1x2 * vk12; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2() 138 vo0p1 += vi3x2 * vk32; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2() 147 vo0p1 += vi0x3 * vk03; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2() 149 vo0p1 += vi2x3 * vk23; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2() 151 vo0p1 += vi4x3 * vk43; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2() [all …]
|
D | 5x5p2-minmax-sse-1x4-acc2.c | 103 __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2() local 105 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2() 131 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2() 133 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2() 135 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2() 160 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2() 162 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2() 171 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2() 173 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2() 175 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2() [all …]
|
D | 5x5s2p2-minmax-neon-1x4-acc2.c | 94 float32x4_t vo0p1 = vmulq_lane_f32(vi0x8ACE9BDF.val[0], vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc2() local 100 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x8ACE9BDF.val[0], vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc2() 104 vo0p1 = vmlaq_lane_f32(vo0p1, vi0x8ACE9BDF.val[1], vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc2() 108 vo0p1 = vmlaq_lane_f32(vo0p1, vi2x8ACE9BDF.val[1], vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc2() 112 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x8ACE9BDF.val[1], vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc2() 127 vo0p1 = vmlaq_lane_f32(vo0p1, vi1x68AC, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc2() 131 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x68AC, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc2() 152 vo0p1 = vmlaq_lane_f32(vo0p1, vi0x79BD, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc2() 156 vo0p1 = vmlaq_lane_f32(vo0p1, vi2x79BD, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc2() 160 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x79BD, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc2() [all …]
|
D | 5x5s2p2-minmax-neonfma-1x4-acc2.c | 94 float32x4_t vo0p1 = vmulq_lane_f32(vi0x8ACE9BDF.val[0], vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2() local 100 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x8ACE9BDF.val[0], vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2() 104 vo0p1 = vfmaq_lane_f32(vo0p1, vi0x8ACE9BDF.val[1], vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2() 108 vo0p1 = vfmaq_lane_f32(vo0p1, vi2x8ACE9BDF.val[1], vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2() 112 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x8ACE9BDF.val[1], vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2() 127 vo0p1 = vfmaq_lane_f32(vo0p1, vi1x68AC, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2() 131 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x68AC, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2() 152 vo0p1 = vfmaq_lane_f32(vo0p1, vi0x79BD, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2() 156 vo0p1 = vfmaq_lane_f32(vo0p1, vi2x79BD, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2() 160 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x79BD, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2() [all …]
|
D | 5x5p2-minmax-neonfma-1x4-acc3.c | 86 float32x4_t vo0p1 = vmulq_lane_f32(vi0x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() local 90 vo0p1 = vfmaq_lane_f32(vo0p1, vi2x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 102 vo0p1 = vfmaq_lane_f32(vo0p1, vi0x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 108 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 125 vo0p1 = vfmaq_lane_f32(vo0p1, vi1x2345, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 131 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 143 vo0p1 = vfmaq_lane_f32(vo0p1, vi2x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 160 vo0p1 = vfmaq_lane_f32(vo0p1, vi0x6789, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 166 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 170 vo0p0 = vaddq_f32(vo0p0, vo0p1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() [all …]
|
D | 5x5p2-minmax-neon-1x4-acc3.c | 86 float32x4_t vo0p1 = vmulq_lane_f32(vi0x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() local 90 vo0p1 = vmlaq_lane_f32(vo0p1, vi2x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 102 vo0p1 = vmlaq_lane_f32(vo0p1, vi0x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 108 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 125 vo0p1 = vmlaq_lane_f32(vo0p1, vi1x2345, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 131 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 143 vo0p1 = vmlaq_lane_f32(vo0p1, vi2x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 160 vo0p1 = vmlaq_lane_f32(vo0p1, vi0x6789, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 166 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 170 vo0p0 = vaddq_f32(vo0p0, vo0p1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-arm-loadsplat-1x4-acc2.c | 138 v128_t vo0p1 = wasm_f32x4_mul(vi0x8ACE, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2() local 144 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2() 148 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x9BDF, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2() 152 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x9BDF, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2() 156 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x9BDF, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2() 171 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x68AC, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2() 175 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x68AC, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2() 217 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x79BD, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2() 221 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x79BD, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2() 225 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x79BD, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-1x4-acc3.c | 89 v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() local 93 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 105 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 111 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 128 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 134 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 146 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 163 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 169 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 173 vo0p0 = wasm_f32x4_add(vo0p0, vo0p1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc3.c | 115 v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() local 119 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 131 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 137 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 154 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 160 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 172 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 189 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x6789, vk04)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 195 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 199 vo0p0 = wasm_f32x4_add(vo0p0, vo0p1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc3.c | 115 v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() local 119 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() 131 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() 137 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() 154 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() 160 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() 172 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() 189 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x6789, vk04)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() 195 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() 199 vo0p0 = wasm_f32x4_add(vo0p0, vo0p1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-x86-splat-1x4-acc2.c | 118 v128_t vo0p1 = wasm_f32x4_mul(vi0x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2() local 124 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2() 128 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2() 132 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2() 136 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2() 151 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x68AC, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2() 155 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x68AC, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2() 197 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x79BD, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2() 201 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x79BD, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2() 205 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-arm-splat-1x4-acc2.c | 118 v128_t vo0p1 = wasm_f32x4_mul(vi0x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2() local 124 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2() 128 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2() 132 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2() 136 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2() 151 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x68AC, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2() 155 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x68AC, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2() 197 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x79BD, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2() 201 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x79BD, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2() 205 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-x86-loadsplat-1x4-acc2.c | 138 v128_t vo0p1 = wasm_f32x4_mul(vi0x8ACE, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2() local 144 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2() 148 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x9BDF, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2() 152 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x9BDF, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2() 156 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x9BDF, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2() 171 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x68AC, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2() 175 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x68AC, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2() 217 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x79BD, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2() 221 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x79BD, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2() 225 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x79BD, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2() [all …]
|
D | 5x5p2-minmax-neonfma-2x4-acc2.c | 96 float32x4_t vo0p1 = vmulq_lane_f32(vi0x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() local 105 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() 118 vo0p1 = vfmaq_lane_f32(vo0p1, vi0x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() 124 vo0p1 = vfmaq_lane_f32(vo0p1, vi2x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() 130 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() 149 vo0p1 = vfmaq_lane_f32(vo0p1, vi1x2345, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() 155 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() 168 vo0p1 = vfmaq_lane_f32(vo0p1, vi0x5678, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() 174 vo0p1 = vfmaq_lane_f32(vo0p1, vi2x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() 180 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() [all …]
|
D | 5x5p2-minmax-neon-2x4-acc2.c | 96 float32x4_t vo0p1 = vmulq_lane_f32(vi0x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() local 105 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() 118 vo0p1 = vmlaq_lane_f32(vo0p1, vi0x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() 124 vo0p1 = vmlaq_lane_f32(vo0p1, vi2x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() 130 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() 149 vo0p1 = vmlaq_lane_f32(vo0p1, vi1x2345, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() 155 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() 168 vo0p1 = vmlaq_lane_f32(vo0p1, vi0x5678, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() 174 vo0p1 = vmlaq_lane_f32(vo0p1, vi2x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() 180 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-1x4-acc3.c | 89 v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() local 93 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() 105 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() 111 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() 128 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() 134 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() 146 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() 163 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() 169 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() 173 vo0p0 = wasm_f32x4_add(vo0p0, vo0p1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-2x4-acc2.c | 125 v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() local 134 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() 147 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() 153 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() 159 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() 178 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() 184 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() 197 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() 203 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() 209 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-2x4-acc2.c | 125 v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() local 134 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() 147 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() 153 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() 159 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() 178 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() 184 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() 197 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() 203 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() 209 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() [all …]
|