/external/XNNPACK/src/f32-dwconv2d-chw/gen/
Cross-references for vi4x4567 (the input vector holding row 4, columns 4..7) in the generated 5x5, padding-2 CHW depthwise-convolution microkernels. Each entry lists the matching source lines by file line number; every match falls inside the single microkernel function named in the entry header.

D | 5x5p2-minmax-neonfma-1x4-acc3.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3():
      74  float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4;  (local)
      94  vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);
     100  const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
     120  const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);
     121  vi4x0123 = vi4x4567;
     137  const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1);
     157  const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2);
     158  vi4x4567 = vi4x89AB;
     203  vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);
     209  const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
     [all …]

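Read in order, the matches above trace one 4-pixel step of the sliding-window scheme all of these generated kernels share: row 4 of the input lives in two vector registers (vi4x0123 and vi4x4567), the neighbor columns needed by the 5-wide taps are spliced out with vextq_f32, and the registers rotate after each step. The neon variants differ from the neonfma variants only in using vmlaq_lane_f32 (separate multiply and add) instead of the fused vfmaq_lane_f32. Below is a minimal sketch of that step for a single row and a single accumulator; the function name, the pointer-parameter framing, and the elision of the neighbor-tap multiply-adds are mine, and the real kernels interleave all five input rows.

    #include <arm_neon.h>

    /* One 4-pixel step along input row 4 (sketch only). vwKLMN holds four
     * consecutive filter weights; lane 1 of its high half is the row's
     * center tap, k[4][2] of the 5x5 filter. */
    static void row4_step(const float** i4, float32x4_t* vi4x0123,
                          float32x4_t* vi4x4567, float32x4_t vwKLMN,
                          float32x4_t* vo0p0) {
      /* Load the next four pixels of the row. */
      const float32x4_t vi4x89AB = vld1q_f32(*i4); *i4 += 4;

      /* Columns 4..7 multiply the center tap. */
      *vo0p0 = vfmaq_lane_f32(*vo0p0, *vi4x4567, vget_high_f32(vwKLMN), 1);

      /* Left neighbors come from the previous vector:
       * vextq_f32(a, b, n) yields { a[n..3], b[0..n-1] }. */
      const float32x4_t vi4x3456 = vextq_f32(*vi4x0123, *vi4x4567, 3);
      const float32x4_t vi4x2345 = vextq_f32(*vi4x0123, *vi4x4567, 2);
      /* Right neighbors come from the vector just loaded. */
      const float32x4_t vi4x5678 = vextq_f32(*vi4x4567, vi4x89AB, 1);
      const float32x4_t vi4x6789 = vextq_f32(*vi4x4567, vi4x89AB, 2);
      (void) vi4x3456; (void) vi4x2345;  /* neighbor multiply-adds elided */
      (void) vi4x5678; (void) vi4x6789;

      /* Rotate the window for the next step. */
      *vi4x0123 = *vi4x4567;
      *vi4x4567 = vi4x89AB;
    }
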
D | 5x5p2-minmax-neon-1x4.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4():
      74  float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4;  (local)
      94  vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);
     100  const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
     120  const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);
     121  vi4x0123 = vi4x4567;
     137  const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1);
     157  const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2);
     158  vi4x4567 = vi4x89AB;
     201  vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);
     207  const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
     [all …]

D | 5x5p2-minmax-neonfma-1x4-acc2.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2():
      74  float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4;  (local)
      94  vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);
     100  const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
     120  const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);
     121  vi4x0123 = vi4x4567;
     137  const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1);
     157  const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2);
     158  vi4x4567 = vi4x89AB;
     202  vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);
     208  const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
     [all …]

D | 5x5p2-minmax-neon-1x4-acc2.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2():
      74  float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4;  (local)
      94  vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);
     100  const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
     120  const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);
     121  vi4x0123 = vi4x4567;
     137  const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1);
     157  const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2);
     158  vi4x4567 = vi4x89AB;
     202  vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);
     208  const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
     [all …]

D | 5x5p2-minmax-neon-1x4-acc3.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3():
      74  float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4;  (local)
      94  vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);
     100  const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
     120  const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);
     121  vi4x0123 = vi4x4567;
     137  const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1);
     157  const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2);
     158  vi4x4567 = vi4x89AB;
     203  vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);
     209  const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
     [all …]

D | 5x5p2-minmax-neonfma-1x4.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4():
      74  float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4;  (local)
      94  vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);
     100  const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
     120  const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);
     121  vi4x0123 = vi4x4567;
     137  const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1);
     157  const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2);
     158  vi4x4567 = vi4x89AB;
     201  vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);
     207  const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
     [all …]

D | 5x5p2-minmax-wasmsimd-arm-splat-1x4-acc4.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4():
      77  v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4;  (local)
      97  vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3)));
     103  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     123  const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5);
     124  vi4x0123 = vi4x4567;
     140  const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4);
     160  const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5);
     161  vi4x4567 = vi4x89AB;
     206  vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3)));
     212  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     [all …]

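The wasmsimd variants build the same shifted windows with wasm_v32x4_shuffle, whose lane indices 0..3 select from the first operand and 4..7 from the second. The splat variants re-broadcast the weight at every use, as in wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3) above, while the loadsplat variants (further down) broadcast each scalar weight once during setup into a dedicated register such as vk42 (filter row 4, column 2). Here is a sketch of the loadsplat form under the same assumptions as the NEON sketch; the vk42 setup hinted at in the comment is my assumption about the surrounding code, not a quoted line.

    #include <wasm_simd128.h>

    /* The same step in WebAssembly SIMD, "loadsplat" style: vk42 is the
     * row-4 center tap already broadcast to all four lanes (e.g. with
     * wasm_v128_load32_splat during setup). Sketch only. */
    static void row4_step(const float** i4, v128_t* vi4x0123,
                          v128_t* vi4x4567, v128_t vk42, v128_t* vo0p0) {
      const v128_t vi4x89AB = wasm_v128_load(*i4); *i4 += 4;

      /* Columns 4..7 times the pre-broadcast center tap. */
      *vo0p0 = wasm_f32x4_add(*vo0p0, wasm_f32x4_mul(*vi4x4567, vk42));

      /* Shifted windows; indices 0..3 pick lanes of the first operand,
       * 4..7 lanes of the second. */
      const v128_t vi4x3456 = wasm_v32x4_shuffle(*vi4x0123, *vi4x4567, 3, 4, 5, 6);
      const v128_t vi4x2345 = wasm_v32x4_shuffle(*vi4x0123, *vi4x4567, 2, 3, 4, 5);
      const v128_t vi4x5678 = wasm_v32x4_shuffle(*vi4x4567, vi4x89AB, 1, 2, 3, 4);
      const v128_t vi4x6789 = wasm_v32x4_shuffle(*vi4x4567, vi4x89AB, 2, 3, 4, 5);
      (void) vi4x3456; (void) vi4x2345;  /* neighbor multiply-adds elided */
      (void) vi4x5678; (void) vi4x6789;

      /* Rotate the window registers. */
      *vi4x0123 = *vi4x4567;
      *vi4x4567 = vi4x89AB;
    }
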
D | 5x5p2-minmax-wasmsimd-x86-splat-1x4.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4():
      77  v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4;  (local)
      97  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3)));
     103  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     123  const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5);
     124  vi4x0123 = vi4x4567;
     140  const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4);
     160  const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5);
     161  vi4x4567 = vi4x89AB;
     203  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3)));
     209  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     [all …]

D | 5x5p2-minmax-wasmsimd-arm-splat-1x4-acc3.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3():
      77  v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4;  (local)
      97  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3)));
     103  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     123  const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5);
     124  vi4x0123 = vi4x4567;
     140  const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4);
     160  const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5);
     161  vi4x4567 = vi4x89AB;
     205  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3)));
     211  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     [all …]

D | 5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc3.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3():
     103  v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4;  (local)
     123  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
     129  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     149  const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5);
     150  vi4x0123 = vi4x4567;
     166  const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4);
     186  const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5);
     187  vi4x4567 = vi4x89AB;
     231  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
     237  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     [all …]

D | 5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc2.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2():
     103  v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4;  (local)
     123  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
     129  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     149  const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5);
     150  vi4x0123 = vi4x4567;
     166  const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4);
     186  const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5);
     187  vi4x4567 = vi4x89AB;
     230  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
     236  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     [all …]

D | 5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc4.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4():
     103  v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4;  (local)
     123  vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, vk42));
     129  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     149  const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5);
     150  vi4x0123 = vi4x4567;
     166  const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4);
     186  const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5);
     187  vi4x4567 = vi4x89AB;
     232  vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, vk42));
     238  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     [all …]

D | 5x5p2-minmax-wasmsimd-arm-loadsplat-1x4.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4():
     103  v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4;  (local)
     123  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
     129  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     149  const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5);
     150  vi4x0123 = vi4x4567;
     166  const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4);
     186  const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5);
     187  vi4x4567 = vi4x89AB;
     229  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
     235  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     [all …]

D | 5x5p2-minmax-wasmsimd-x86-splat-1x4-acc2.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2():
      77  v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4;  (local)
      97  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3)));
     103  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     123  const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5);
     124  vi4x0123 = vi4x4567;
     140  const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4);
     160  const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5);
     161  vi4x4567 = vi4x89AB;
     204  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3)));
     210  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     [all …]

D | 5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc2.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2():
     103  v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4;  (local)
     123  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
     129  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     149  const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5);
     150  vi4x0123 = vi4x4567;
     166  const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4);
     186  const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5);
     187  vi4x4567 = vi4x89AB;
     230  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
     236  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     [all …]

D | 5x5p2-minmax-neonfma-1x4-acc5.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5():
      74  float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4;  (local)
      94  vo0p1 = vfmaq_lane_f32(vo0p1, vi4x4567, vget_high_f32(vwKLMN), 1);
     100  const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
     120  const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);
     121  vi4x0123 = vi4x4567;
     137  const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1);
     157  const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2);
     158  vi4x4567 = vi4x89AB;
     205  vo0p1 = vfmaq_lane_f32(vo0p1, vi4x4567, vget_high_f32(vwKLMN), 1);
     211  const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
     [all …]

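The _acc2 through _acc5 suffixes change only how many partial accumulators the 25-tap sum is split across: the same center-tap multiply lands in vo0p0, vo0p1, or vo0p2 depending on the variant, so that consecutive multiply-adds do not all serialize on one register, and the partials are folded together before the min/max clamp. A minimal two-accumulator illustration with placeholder inputs (the names and tap pairings here are illustrative, not the generated code):

    #include <arm_neon.h>

    /* Two-accumulator splitting, as in the _acc2 kernels: independent
     * partial sums let the multiply-adds issue without waiting on each
     * other; they are folded at the end. */
    static float32x4_t dot_acc2(const float32x4_t x[4], const float32x4_t w[4]) {
      float32x4_t vo0p0 = vmulq_f32(x[0], w[0]);
      float32x4_t vo0p1 = vmulq_f32(x[1], w[1]);
      vo0p0 = vfmaq_f32(vo0p0, x[2], w[2]);  /* depends only on vo0p0 */
      vo0p1 = vfmaq_f32(vo0p1, x[3], w[3]);  /* independent of vo0p0 */
      return vaddq_f32(vo0p0, vo0p1);        /* fold the partials */
    }
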
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc3.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3():
     103  v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4;  (local)
     123  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
     129  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     149  const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5);
     150  vi4x0123 = vi4x4567;
     166  const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4);
     186  const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5);
     187  vi4x4567 = vi4x89AB;
     231  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
     237  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     [all …]

D | 5x5p2-minmax-neonfma-1x4-acc4.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4():
      74  float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4;  (local)
      94  vo0p2 = vfmaq_lane_f32(vo0p2, vi4x4567, vget_high_f32(vwKLMN), 1);
     100  const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
     120  const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);
     121  vi4x0123 = vi4x4567;
     137  const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1);
     157  const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2);
     158  vi4x4567 = vi4x89AB;
     204  vo0p2 = vfmaq_lane_f32(vo0p2, vi4x4567, vget_high_f32(vwKLMN), 1);
     210  const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
     [all …]

D | 5x5p2-minmax-neon-1x4-acc5.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5():
      74  float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4;  (local)
      94  vo0p1 = vmlaq_lane_f32(vo0p1, vi4x4567, vget_high_f32(vwKLMN), 1);
     100  const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
     120  const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);
     121  vi4x0123 = vi4x4567;
     137  const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1);
     157  const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2);
     158  vi4x4567 = vi4x89AB;
     205  vo0p1 = vmlaq_lane_f32(vo0p1, vi4x4567, vget_high_f32(vwKLMN), 1);
     211  const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
     [all …]

D | 5x5p2-minmax-wasmsimd-x86-loadsplat-1x4.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4():
     103  v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4;  (local)
     123  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
     129  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     149  const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5);
     150  vi4x0123 = vi4x4567;
     166  const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4);
     186  const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5);
     187  vi4x4567 = vi4x89AB;
     229  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
     235  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     [all …]

D | 5x5p2-minmax-neon-1x4-acc4.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4():
      74  float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4;  (local)
      94  vo0p2 = vmlaq_lane_f32(vo0p2, vi4x4567, vget_high_f32(vwKLMN), 1);
     100  const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
     120  const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);
     121  vi4x0123 = vi4x4567;
     137  const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1);
     157  const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2);
     158  vi4x4567 = vi4x89AB;
     204  vo0p2 = vmlaq_lane_f32(vo0p2, vi4x4567, vget_high_f32(vwKLMN), 1);
     210  const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
     [all …]

D | 5x5p2-minmax-wasmsimd-arm-splat-1x4-acc2.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2():
      77  v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4;  (local)
      97  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3)));
     103  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     123  const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5);
     124  vi4x0123 = vi4x4567;
     140  const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4);
     160  const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5);
     161  vi4x4567 = vi4x89AB;
     204  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3)));
     210  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     [all …]

D | 5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc4.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4():
     103  v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4;  (local)
     123  vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, vk42));
     129  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     149  const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5);
     150  vi4x0123 = vi4x4567;
     166  const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4);
     186  const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5);
     187  vi4x4567 = vi4x89AB;
     232  vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, vk42));
     238  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     [all …]

D | 5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc5.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc5():
     103  v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4;  (local)
     123  vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x4567, vk42));
     129  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     149  const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5);
     150  vi4x0123 = vi4x4567;
     166  const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4);
     186  const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5);
     187  vi4x4567 = vi4x89AB;
     233  vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x4567, vk42));
     239  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     [all …]

D | 5x5p2-minmax-wasmsimd-arm-splat-1x4.c | all matches in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4():
      77  v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4;  (local)
      97  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3)));
     103  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     123  const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5);
     124  vi4x0123 = vi4x4567;
     140  const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4);
     160  const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5);
     161  vi4x4567 = vi4x89AB;
     203  vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3)));
     209  const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
     [all …]