Home
last modified time | relevance | path

Searched refs:vw89 (Results 1 – 25 of 114) sorted by relevance

12345

/external/XNNPACK/src/f32-dwconv2d-chw/gen/
D3x3p1-minmax-wasmsimd-arm-splat-6x4.c40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4() local
138 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4()
139 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4()
140 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4()
141 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4()
142 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4()
143 …vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi7x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4()
207 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4()
208 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4()
209 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4()
[all …]
D3x3p1-minmax-wasmsimd-x86-splat-6x4.c40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4() local
138 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4()
139 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4()
140 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4()
141 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4()
142 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4()
143 …vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi7x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4()
207 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4()
208 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4()
209 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4()
[all …]
D3x3p1-minmax-wasmsimd-arm-splat-5x4.c40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() local
126 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4()
127 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4()
128 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4()
129 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4()
130 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4()
186 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4()
187 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4()
188 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4()
189 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4()
[all …]
D3x3p1-minmax-wasmsimd-x86-splat-5x4.c40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() local
126 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4()
127 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4()
128 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4()
129 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4()
130 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4()
186 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4()
187 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4()
188 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4()
189 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4()
[all …]
D3x3p1-minmax-wasmsimd-x86-splat-4x4.c40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() local
114 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4()
115 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4()
116 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4()
117 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4()
165 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4()
166 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4()
167 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4()
168 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4()
218 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4()
[all …]
D3x3p1-minmax-wasmsimd-arm-splat-4x4.c40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() local
114 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4()
115 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4()
116 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4()
117 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4()
165 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4()
166 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4()
167 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4()
168 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4()
218 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4()
[all …]
D3x3p1-minmax-wasmsimd-arm-splat-3x4.c40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() local
102 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4()
103 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4()
104 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4()
144 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4()
145 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4()
146 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4()
188 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4()
189 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4()
190 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4()
[all …]
D3x3p1-minmax-wasmsimd-x86-splat-3x4.c40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() local
102 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4()
103 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4()
104 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4()
144 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4()
145 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4()
146 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4()
188 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4()
189 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4()
190 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4()
[all …]
D3x3s2p1-minmax-wasmsimd-arm-splat-4x4.c42 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4() local
169 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4()
170 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4()
171 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4()
172 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi8x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4()
218 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4()
219 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4()
220 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4()
221 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi8x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4()
294 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4()
[all …]
D3x3s2p1-minmax-wasmsimd-x86-splat-4x4.c42 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4() local
169 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4()
170 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4()
171 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4()
172 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi8x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4()
218 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4()
219 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4()
220 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4()
221 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi8x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4()
294 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4()
[all …]
D3x3s2p1-minmax-wasmsimd-arm-splat-3x4.c42 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() local
144 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4()
145 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4()
146 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4()
183 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4()
184 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4()
185 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4()
244 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4()
245 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4()
246 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4()
[all …]
D3x3p1-minmax-wasmsimd-arm-splat-2x4-acc2.c40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4_acc2() local
90 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4_acc2()
91 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4_acc2()
123 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4_acc2()
124 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4_acc2()
160 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4_acc2()
161 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4_acc2()
188 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4_acc2()
189 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4_acc2()
D3x3p1-minmax-wasmsimd-x86-splat-2x4-acc2.c40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4_acc2() local
90 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4_acc2()
91 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4_acc2()
123 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4_acc2()
124 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4_acc2()
160 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4_acc2()
161 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4_acc2()
188 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4_acc2()
189 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4_acc2()
D3x3p1-minmax-wasmsimd-x86-splat-2x4.c40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4() local
90 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4()
91 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4()
123 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4()
124 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4()
158 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4()
159 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4()
186 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4()
187 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4()
D3x3p1-minmax-wasmsimd-arm-splat-2x4.c40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4() local
90 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4()
91 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4()
123 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4()
124 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4()
158 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4()
159 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4()
186 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4()
187 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4()
D3x3s2p1-minmax-wasmsimd-x86-splat-3x4.c42 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() local
144 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4()
145 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4()
146 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4()
183 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4()
184 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4()
185 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4()
244 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4()
245 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4()
246 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4()
[all …]
D3x3s2p1-minmax-wasmsimd-arm-splat-2x4-acc2.c42 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4_acc2() local
119 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4_acc2()
120 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4_acc2()
148 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4_acc2()
149 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4_acc2()
196 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4_acc2()
197 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4_acc2()
220 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4_acc2()
221 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4_acc2()
D3x3s2p1-minmax-wasmsimd-arm-splat-2x4.c42 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4() local
119 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4()
120 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4()
148 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4()
149 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4()
194 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4()
195 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4()
218 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4()
219 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4()
D3x3s2p1-minmax-wasmsimd-x86-splat-2x4-acc2.c42 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4_acc2() local
119 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4_acc2()
120 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4_acc2()
148 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4_acc2()
149 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4_acc2()
196 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4_acc2()
197 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4_acc2()
220 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4_acc2()
221 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4_acc2()
D3x3s2p1-minmax-wasmsimd-x86-splat-2x4.c42 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4() local
119 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4()
120 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4()
148 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4()
149 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4()
194 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4()
195 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4()
218 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4()
219 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4()
D3x3p1-minmax-neon-6x4.c39 const float32x2_t vw89 = vld1_f32(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() local
135 vo0p0 = vmlaq_lane_f32(vo0p0, vi2x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
136 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
137 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
138 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
139 vo4p0 = vmlaq_lane_f32(vo4p0, vi6x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
140 vo5p0 = vmlaq_lane_f32(vo5p0, vi7x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
204 vo0p0 = vmlaq_lane_f32(vo0p0, vi2x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
205 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
206 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
[all …]
D3x3p1-minmax-neonfma-6x4.c39 const float32x2_t vw89 = vld1_f32(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() local
135 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
136 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
137 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
138 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
139 vo4p0 = vfmaq_lane_f32(vo4p0, vi6x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
140 vo5p0 = vfmaq_lane_f32(vo5p0, vi7x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
204 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
205 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
206 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
[all …]
D3x3p1-minmax-neonfma-5x4.c39 const float32x2_t vw89 = vld1_f32(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() local
123 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
124 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
125 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
126 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
127 vo4p0 = vfmaq_lane_f32(vo4p0, vi6x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
183 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
184 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
185 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
186 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
[all …]
D3x3p1-minmax-neon-5x4.c39 const float32x2_t vw89 = vld1_f32(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() local
123 vo0p0 = vmlaq_lane_f32(vo0p0, vi2x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
124 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
125 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
126 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
127 vo4p0 = vmlaq_lane_f32(vo4p0, vi6x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
183 vo0p0 = vmlaq_lane_f32(vo0p0, vi2x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
184 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
185 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
186 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
[all …]
D3x3p1-minmax-wasmsimd-x86-splat-1x4.c40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4() local
78 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4()
102 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4()
128 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4()
148 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4()

12345