/external/XNNPACK/src/qs8-gemm/gen/
D  4x8-minmax-neon-mlal-lane.c  (all references in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane())
     68  int32x4_t vacc3x4567 = vacc0x4567;   [local]
     91  vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
    102  vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
    113  vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
    124  vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
    136  vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
    147  vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
    158  vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
    169  vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa3), 3);
    193  vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
    [all …]
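The mlal-lane kernel above widens the int8 inputs to int16 once per load; each k-step then multiplies one broadcast lane of the activation vector against a widened weight vector, accumulating 32-bit products via vmlal_lane_s16. A minimal sketch of the pattern (two k-steps only; names such as acc0123 are illustrative, not from the kernel):

#include <arm_neon.h>
#include <stdint.h>

/* Partial int8 dot products for one row against 8 output columns,
 * covering k = 0..1 of the reduction. */
void s8_mlal_lane_sketch(const int8_t* a, const int8_t* b, int32_t out[8]) {
  int32x4_t acc0123 = vdupq_n_s32(0);
  int32x4_t acc4567 = vdupq_n_s32(0);
  const int16x8_t vxa  = vmovl_s8(vld1_s8(a));      /* 8 activations, widened */
  const int16x8_t vxb0 = vmovl_s8(vld1_s8(b));      /* weights for k = 0 */
  const int16x8_t vxb1 = vmovl_s8(vld1_s8(b + 8));  /* weights for k = 1 */
  /* k = 0: lane 0 of the low half of vxa times all 8 widened weights. */
  acc0123 = vmlal_lane_s16(acc0123, vget_low_s16(vxb0),  vget_low_s16(vxa), 0);
  acc4567 = vmlal_lane_s16(acc4567, vget_high_s16(vxb0), vget_low_s16(vxa), 0);
  /* k = 1: lane 1 of the low half. */
  acc0123 = vmlal_lane_s16(acc0123, vget_low_s16(vxb1),  vget_low_s16(vxa), 1);
  acc4567 = vmlal_lane_s16(acc4567, vget_high_s16(vxb1), vget_low_s16(vxa), 1);
  /* k = 4..7 would switch to vget_high_s16(vxa), lanes 0..3, exactly as
   * the c4..c7 hits above do. */
  vst1q_s32(out,     acc0123);
  vst1q_s32(out + 4, acc4567);
}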
D  4x8-minmax-neon-mull-addw-dup.c  (all references in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup())
     68  int32x4_t vacc3x4567 = vacc0x4567;   [local]
     90  vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c0));
    104  vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c1));
    118  vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c2));
    132  vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c3));
    146  vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c4));
    160  vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c5));
    174  vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c6));
    188  vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c7));
    211  vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c0));
    [all …]
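The mull-addw-dup variant reaches the same int32 accumulators without pre-widening: it broadcasts one activation byte, performs an 8-bit widening multiply into a 16-bit product vector (the vprod3x01234567c* values above), then widening-adds both product halves. A hedged sketch; the vdup_lane_s8 broadcast is inferred from the kernel's "dup" name:

#include <arm_neon.h>
#include <stdint.h>

void s8_mull_addw_dup_sketch(const int8_t* a, const int8_t* b, int32_t out[8]) {
  int32x4_t acc0123 = vdupq_n_s32(0);
  int32x4_t acc4567 = vdupq_n_s32(0);
  const int8x8_t va    = vld1_s8(a);  /* 8 activations of one row */
  const int8x8_t vb_c0 = vld1_s8(b);  /* 8 weights for k = 0 */
  /* k = 0: broadcast activation byte 0 and multiply-widen in one step. */
  const int16x8_t vprod_c0 = vmull_s8(vb_c0, vdup_lane_s8(va, 0));
  acc0123 = vaddw_s16(acc0123, vget_low_s16(vprod_c0));
  acc4567 = vaddw_s16(acc4567, vget_high_s16(vprod_c0));  /* the hits above */
  /* ...repeated for k = 1..7 with vdup_lane_s8(va, k) and the next weights. */
  vst1q_s32(out,     acc0123);
  vst1q_s32(out + 4, acc4567);
}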
/external/XNNPACK/src/qu8-igemm/
D  4x8-minmax-neon.c  (all references in xnn_qu8_igemm_minmax_ukernel_4x8__neon())
     62  int32x4_t vacc3x4567 = vacc0x4567;   [local]
    106  … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 0);
    120  … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 1);
    134  … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 2);
    148  … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 3);
    162  … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 0);
    176  … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 1);
    190  … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 2);
    204  … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 3);
    230  … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 0);
    [all …]
/external/XNNPACK/src/qu8-gemm/
D  4x8-minmax-neon.c  (all references in xnn_qu8_gemm_minmax_ukernel_4x8__neon())
     65  int32x4_t vacc3x4567 = vacc0x4567;   [local]
     88  vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
    100  vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
    112  vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
    124  vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
    136  vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
    148  vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
    160  vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
    172  vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa3), 3);
    196  vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
    [all …]
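The qu8 kernels (this entry and the qu8-igemm one above) reuse the signed vmlal_lane_s16 path for unsigned data. A hedged sketch of how that can work: widen the uint8 inputs to 16 bits, subtract the weight zero point during widening so the values land in signed range, then accumulate exactly as in qs8. The zero-point handling below is an assumption from the kernel family, not read from the source; the real bookkeeping also involves the packed bias.

#include <arm_neon.h>
#include <stdint.h>

void u8_widen_sketch(const uint8_t* a, const uint8_t* b,
                     uint8_t b_zero_point, int32_t out[8]) {
  int32x4_t acc0123 = vdupq_n_s32(0);
  int32x4_t acc4567 = vdupq_n_s32(0);
  const uint8x8_t vzb = vdup_n_u8(b_zero_point);  /* assumed parameter */
  /* Widen activations unsigned; values <= 255 fit in int16, so the
   * reinterpret is value-preserving. */
  const int16x8_t vxa = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(a)));
  /* Widen weights while subtracting the zero point: now signed-ranged. */
  const int16x8_t vxb = vreinterpretq_s16_u16(vsubl_u8(vld1_u8(b), vzb));
  /* From here, accumulation is identical to the qs8 listings. */
  acc0123 = vmlal_lane_s16(acc0123, vget_low_s16(vxb),  vget_low_s16(vxa), 0);
  acc4567 = vmlal_lane_s16(acc4567, vget_high_s16(vxb), vget_low_s16(vxa), 0);
  vst1q_s32(out,     acc0123);
  vst1q_s32(out + 4, acc4567);
}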
/external/XNNPACK/src/qs8-igemm/gen/
D  4x8-minmax-neon-mlal-lane.c  (all references in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane())
     65  int32x4_t vacc3x4567 = vacc0x4567;   [local]
    108  … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
    119  … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
    130  … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
    141  … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
    153  … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
    164  … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
    175  … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
    186  … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa3), 3);
    210  … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
    [all …]
D  4x8-minmax-neon-mull-addw-dup.c  (all references in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup())
     65  int32x4_t vacc3x4567 = vacc0x4567;   [local]
    107  vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c0));
    121  vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c1));
    135  vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c2));
    149  vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c3));
    163  vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c4));
    177  vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c5));
    191  vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c6));
    205  vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c7));
    228  vacc3x4567 = vaddw_s16(vacc3x4567, vget_high_s16(vprod3x01234567c0));
    [all …]
/external/XNNPACK/src/f32-gemm/gen/
D  4x8-minmax-neonfma-lane-ld128.c  (all references in xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128())
     68  float32x4_t vacc3x4567 = vacc0x4567;   [local]
     88  vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0);
    100  vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, vget_low_f32(va3), 1);
    112  vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c2, vget_high_f32(va3), 0);
    124  vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c3, vget_high_f32(va3), 1);
    143  vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
    156  vacc3x4567 = vminq_f32(vacc3x4567, vmax);
    166  vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
    170  vst1q_f32(c3 + 4, vacc3x4567);
    196  vacc3x0123 = vacc3x4567;
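The lane-ld128 f32 kernels pull four consecutive k-values per row in a single 128-bit load, then fuse each step with vfmaq_lane_f32; the neon-lane-ld128 twin below is line-for-line identical except each vfmaq_* becomes vmlaq_* (separate multiply and add, for NEON units without FMA). Sketch of one row's four k-steps, with illustrative names:

#include <arm_neon.h>

void f32_lane_ld128_sketch(const float* a /* 4 activations of one row */,
                           const float* b /* 4 k-steps x 4 columns     */,
                           float32x4_t* acc) {
  const float32x4_t va = vld1q_f32(a);  /* k = 0..3 in one load */
  float32x4_t vacc = *acc;
  vacc = vfmaq_lane_f32(vacc, vld1q_f32(b +  0), vget_low_f32(va),  0); /* k=0 */
  vacc = vfmaq_lane_f32(vacc, vld1q_f32(b +  4), vget_low_f32(va),  1); /* k=1 */
  vacc = vfmaq_lane_f32(vacc, vld1q_f32(b +  8), vget_high_f32(va), 0); /* k=2 */
  vacc = vfmaq_lane_f32(vacc, vld1q_f32(b + 12), vget_high_f32(va), 1); /* k=3 */
  *acc = vacc;
}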
D  4x8-minmax-neon-lane-ld128.c  (all references in xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128())
     68  float32x4_t vacc3x4567 = vacc0x4567;   [local]
     88  vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0);
    100  vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, vget_low_f32(va3), 1);
    112  vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c2, vget_high_f32(va3), 0);
    124  vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c3, vget_high_f32(va3), 1);
    143  vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
    156  vacc3x4567 = vminq_f32(vacc3x4567, vmax);
    166  vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
    170  vst1q_f32(c3 + 4, vacc3x4567);
    196  vacc3x0123 = vacc3x4567;
D  4x8s4-minmax-wasmsimd-x86.c  (all references in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86())
     67  v128_t vacc3x4567 = vacc0x4567;   [local]
     92  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c0));
    109  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c1));
    126  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c2));
    143  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c3));
    171  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567));
    185  vacc3x4567 = wasm_v128_bitselect(vmin, vacc3x4567, wasm_f32x4_lt(vacc3x4567, vmin));
    195  vacc3x4567 = wasm_v128_bitselect(vacc3x4567, vmax, wasm_f32x4_le(vacc3x4567, vmax));
    199  wasm_v128_store(c3 + 4, vacc3x4567);
    224  vacc3x0123 = vacc3x4567;
D  4x8-minmax-wasmsimd-x86-splat.c  (all references in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat())
     67  v128_t vacc3x4567 = vacc0x4567;   [local]
     96  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3c0, vb4567c0));
    112  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3c1, vb4567c1));
    128  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3c2, vb4567c2));
    144  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3c3, vb4567c3));
    171  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567));
    185  vacc3x4567 = wasm_v128_bitselect(vmin, vacc3x4567, wasm_f32x4_lt(vacc3x4567, vmin));
    195  vacc3x4567 = wasm_v128_bitselect(vacc3x4567, vmax, wasm_f32x4_le(vacc3x4567, vmax));
    199  wasm_v128_store(c3 + 4, vacc3x4567);
    224  vacc3x0123 = vacc3x4567;
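The wasmsimd splat kernels materialize the per-lane broadcasts (the va3c0 … va3c3 operands above) explicitly, since WebAssembly SIMD offers neither a lane-broadcast multiply nor a fused multiply-add. The broadcast is presumably a shuffle, as in this sketch (illustrative names; compile with e.g. clang --target=wasm32 -msimd128):

#include <wasm_simd128.h>

v128_t f32_splat_steps_sketch(v128_t vacc, v128_t va, v128_t vb_c0, v128_t vb_c1) {
  const v128_t va_c0 = wasm_v32x4_shuffle(va, va, 0, 0, 0, 0);  /* broadcast lane 0 */
  const v128_t va_c1 = wasm_v32x4_shuffle(va, va, 1, 1, 1, 1);  /* broadcast lane 1 */
  vacc = wasm_f32x4_add(vacc, wasm_f32x4_mul(va_c0, vb_c0));    /* k = 0 */
  vacc = wasm_f32x4_add(vacc, wasm_f32x4_mul(va_c1, vb_c1));    /* k = 1 */
  return vacc;  /* k = 2..3 continue with lanes 2 and 3 */
}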
D  4x8-minmax-neon-dup-ld128.c  (all references in xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128())
     68  float32x4_t vacc3x4567 = vacc0x4567;   [local]
     92  vacc3x4567 = vmlaq_f32(vacc3x4567, va3c0, vb4567c0);
    108  vacc3x4567 = vmlaq_f32(vacc3x4567, va3c1, vb4567c1);
    124  vacc3x4567 = vmlaq_f32(vacc3x4567, va3c2, vb4567c2);
    140  vacc3x4567 = vmlaq_f32(vacc3x4567, va3c3, vb4567c3);
    159  vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
    172  vacc3x4567 = vminq_f32(vacc3x4567, vmax);
    182  vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
    186  vst1q_f32(c3 + 4, vacc3x4567);
    212  vacc3x0123 = vacc3x4567;
D  4x8s4-minmax-wasmsimd-arm.c  (all references in xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm())
     69  v128_t vacc3x4567 = vacc0x4567;   [local]
     94  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c0));
    111  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c1));
    128  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c2));
    145  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c3));
    173  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567));
    186  vacc3x4567 = wasm_f32x4_max(vacc3x4567, vmin);
    195  vacc3x4567 = wasm_f32x4_min(vacc3x4567, vmax);
    199  wasm_v128_store(c3 + 4, vacc3x4567);
    224  vacc3x0123 = vacc3x4567;
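The listings show two output-clamping styles. The …-arm kernels use wasm_f32x4_max/min directly; the …-x86 kernels use a comparison plus wasm_v128_bitselect, which lowers to cmpps/blendvps and sidesteps the extra fix-up x86 needs for wasm's NaN-aware min/max semantics (a hedged reading of the naming). Both styles, side by side:

#include <wasm_simd128.h>

/* Style A (…-wasmsimd-arm): plain min/max clamping. */
v128_t clamp_minmax(v128_t v, v128_t vmin, v128_t vmax) {
  v = wasm_f32x4_max(v, vmin);
  return wasm_f32x4_min(v, vmax);
}

/* Style B (…-wasmsimd-x86): bitselect(a, b, m) takes bits of a where m is
 * set and bits of b where it is clear. First pick vmin where v < vmin,
 * then keep v where v <= vmax, else vmax. */
v128_t clamp_bitselect(v128_t v, v128_t vmin, v128_t vmax) {
  v = wasm_v128_bitselect(vmin, v, wasm_f32x4_lt(v, vmin));
  return wasm_v128_bitselect(v, vmax, wasm_f32x4_le(v, vmax));
}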
D  4x8s4-minmax-neonfma.c  (all references in xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma())
     68  float32x4_t vacc3x4567 = vacc0x4567;   [local]
     88  vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c0);
    105  vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c1);
    122  vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c2);
    139  vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c3);
    162  vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
    175  vacc3x4567 = vminq_f32(vacc3x4567, vmax);
    185  vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
    189  vst1q_f32(c3 + 4, vacc3x4567);
    215  vacc3x0123 = vacc3x4567;
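In the s4 ("shift by 4") kernels the activation register is multiplied whole against the packed panels vb4567c0 … vb4567c3 rather than lane-by-lane. The usual reading: weights are pre-packed so each sub-step pairs a different k with each lane, and the activation vector is rotated one lane between steps, so after four steps every accumulator lane has seen all four k-values. A hedged sketch; the vextq_f32 rotation and the packing scheme are assumptions based on the kernel family:

#include <arm_neon.h>

float32x4_t f32_s4_steps_sketch(float32x4_t vacc, float32x4_t va,
                                const float* b /* 4 packed panels of 4 */) {
  vacc = vfmaq_f32(vacc, va, vld1q_f32(b +  0));  /* c0 */
  va = vextq_f32(va, va, 1);                      /* rotate k-lanes by one */
  vacc = vfmaq_f32(vacc, va, vld1q_f32(b +  4));  /* c1 */
  va = vextq_f32(va, va, 1);
  vacc = vfmaq_f32(vacc, va, vld1q_f32(b +  8));  /* c2 */
  va = vextq_f32(va, va, 1);
  vacc = vfmaq_f32(vacc, va, vld1q_f32(b + 12));  /* c3 */
  return vacc;
}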
/external/XNNPACK/src/f32-gemm/gen-inc/
D  4x8s4inc-minmax-wasmsimd-x86.c  (all references in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_x86())
     69  v128_t vacc3x4567 = wasm_v128_load(acc + 28);   [local]
     94  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c0));
    111  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c1));
    128  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c2));
    145  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c3));
    173  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567));
    187  vacc3x4567 = wasm_v128_bitselect(vmin, vacc3x4567, wasm_f32x4_lt(vacc3x4567, vmin));
    197  vacc3x4567 = wasm_v128_bitselect(vacc3x4567, vmax, wasm_f32x4_le(vacc3x4567, vmax));
    201  wasm_v128_store(c3 + 4, vacc3x4567);
    226  vacc3x0123 = vacc3x4567;
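The gen-inc ("gemminc") variants in this directory differ from their gen/ counterparts only in accumulator startup: instead of copying bias-initialized registers (vacc3x4567 = vacc0x4567), they resume from a caller-provided partial-result buffer. The offsets above imply a row-major 4x8 layout, with row 3's upper half at acc + 28. A minimal sketch of that initialization (layout inferred from the listed offsets):

#include <wasm_simd128.h>

void gemminc_init_sketch(const float* acc, v128_t vacc[4][2]) {
  for (int row = 0; row < 4; row++) {
    vacc[row][0] = wasm_v128_load(acc + row * 8);      /* columns 0..3 */
    vacc[row][1] = wasm_v128_load(acc + row * 8 + 4);  /* columns 4..7; row 3 is acc + 28 */
  }
}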
D  4x8inc-minmax-wasmsimd-x86-splat.c  (all references in xnn_f32_gemminc_minmax_ukernel_4x8__wasmsimd_x86_splat())
     69  v128_t vacc3x4567 = wasm_v128_load(acc + 28);   [local]
     98  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3c0, vb4567c0));
    114  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3c1, vb4567c1));
    130  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3c2, vb4567c2));
    146  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3c3, vb4567c3));
    173  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567));
    187  vacc3x4567 = wasm_v128_bitselect(vmin, vacc3x4567, wasm_f32x4_lt(vacc3x4567, vmin));
    197  vacc3x4567 = wasm_v128_bitselect(vacc3x4567, vmax, wasm_f32x4_le(vacc3x4567, vmax));
    201  wasm_v128_store(c3 + 4, vacc3x4567);
    226  vacc3x0123 = vacc3x4567;
D  4x8s4inc-minmax-neonfma.c  (all references in xnn_f32_gemminc_minmax_ukernel_4x8s4__neonfma())
     70  float32x4_t vacc3x4567 = vld1q_f32(acc); acc += 4;   [local]
     90  vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c0);
    107  vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c1);
    124  vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c2);
    141  vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c3);
    164  vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
    177  vacc3x4567 = vminq_f32(vacc3x4567, vmax);
    187  vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
    191  vst1q_f32(c3 + 4, vacc3x4567);
    217  vacc3x0123 = vacc3x4567;
D  4x8s4inc-minmax-wasmsimd-arm.c  (all references in xnn_f32_gemminc_minmax_ukernel_4x8s4__wasmsimd_arm())
     71  v128_t vacc3x4567 = wasm_v128_load(acc + 28);   [local]
     96  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c0));
    113  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c1));
    130  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c2));
    147  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c3));
    175  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567));
    188  vacc3x4567 = wasm_f32x4_max(vacc3x4567, vmin);
    197  vacc3x4567 = wasm_f32x4_min(vacc3x4567, vmax);
    201  wasm_v128_store(c3 + 4, vacc3x4567);
    226  vacc3x0123 = vacc3x4567;
D  4x8inc-minmax-neon-lane-ld128.c  (all references in xnn_f32_gemminc_minmax_ukernel_4x8__neon_lane_ld128())
     70  float32x4_t vacc3x4567 = vld1q_f32(acc); acc += 4;   [local]
     90  vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0);
    102  vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, vget_low_f32(va3), 1);
    114  vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c2, vget_high_f32(va3), 0);
    126  vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c3, vget_high_f32(va3), 1);
    145  vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
    158  vacc3x4567 = vminq_f32(vacc3x4567, vmax);
    168  vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
    172  vst1q_f32(c3 + 4, vacc3x4567);
    198  vacc3x0123 = vacc3x4567;
D  4x8s4inc-minmax-sse.c  (all references in xnn_f32_gemminc_minmax_ukernel_4x8s4__sse())
     69  __m128 vacc3x4567 = _mm_load_ps(acc + 28);   [local]
     94  vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567c0));
    111  vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567c1));
    128  vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567c2));
    145  vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567c3));
    173  vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567));
    187  vacc3x4567 = _mm_min_ps(vacc3x4567, vmax);
    197  vacc3x4567 = _mm_max_ps(vacc3x4567, vmin);
    201  _mm_storeu_ps(c3 + 4, vacc3x4567);
    226  vacc3x0123 = vacc3x4567;
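Worth noting in the SSE entry above: the partial-accumulator buffer is read with the aligned _mm_load_ps, while results are written with the unaligned _mm_storeu_ps, since the caller's output matrix carries no alignment guarantee (and, SSE having no FMA, each multiply-accumulate is a separate _mm_mul_ps plus _mm_add_ps). A single step of that shape, with illustrative names:

#include <xmmintrin.h>

void sse_step_sketch(const float* acc /* assumed 16-byte aligned */,
                     const float* b, __m128 va, float* c /* any alignment */) {
  __m128 vacc = _mm_load_ps(acc);                            /* aligned load of partials */
  vacc = _mm_add_ps(vacc, _mm_mul_ps(va, _mm_loadu_ps(b)));  /* mul + add, no FMA in SSE */
  _mm_storeu_ps(c, vacc);                                    /* unaligned store to C */
}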
D  4x8s4inc-minmax-neon.c  (all references in xnn_f32_gemminc_minmax_ukernel_4x8s4__neon())
     70  float32x4_t vacc3x4567 = vld1q_f32(acc); acc += 4;   [local]
     90  vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567c0);
    107  vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567c1);
    124  vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567c2);
    141  vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567c3);
    164  vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
    177  vacc3x4567 = vminq_f32(vacc3x4567, vmax);
    187  vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
    191  vst1q_f32(c3 + 4, vacc3x4567);
    217  vacc3x0123 = vacc3x4567;
D  4x8inc-minmax-neon-dup-ld128.c  (all references in xnn_f32_gemminc_minmax_ukernel_4x8__neon_dup_ld128())
     70  float32x4_t vacc3x4567 = vld1q_f32(acc); acc += 4;   [local]
     94  vacc3x4567 = vmlaq_f32(vacc3x4567, va3c0, vb4567c0);
    110  vacc3x4567 = vmlaq_f32(vacc3x4567, va3c1, vb4567c1);
    126  vacc3x4567 = vmlaq_f32(vacc3x4567, va3c2, vb4567c2);
    142  vacc3x4567 = vmlaq_f32(vacc3x4567, va3c3, vb4567c3);
    161  vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
    174  vacc3x4567 = vminq_f32(vacc3x4567, vmax);
    184  vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
    188  vst1q_f32(c3 + 4, vacc3x4567);
    214  vacc3x0123 = vacc3x4567;
/external/XNNPACK/src/f32-igemm/gen/
D  4x8s4-minmax-wasmsimd-x86.c  (all references in xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86())
     65  v128_t vacc3x4567 = vacc0x4567;   [local]
    114  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c0));
    131  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c1));
    148  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c2));
    165  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567c3));
    193  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567));
    208  vacc3x4567 = wasm_v128_bitselect(vmin, vacc3x4567, wasm_f32x4_lt(vacc3x4567, vmin));
    218  vacc3x4567 = wasm_v128_bitselect(vacc3x4567, vmax, wasm_f32x4_le(vacc3x4567, vmax));
    222  wasm_v128_store(c3 + 4, vacc3x4567);
    243  vacc3x0123 = vacc3x4567;
D  4x8-minmax-neon-lane-ld128.c  (all references in xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128())
     66  float32x4_t vacc3x4567 = vacc0x4567;   [local]
    110  vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0);
    122  vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, vget_low_f32(va3), 1);
    134  vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c2, vget_high_f32(va3), 0);
    146  vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c3, vget_high_f32(va3), 1);
    165  vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
    182  vacc3x4567 = vminq_f32(vacc3x4567, vmax);
    192  vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
    196  vst1q_f32(c3 + 4, vacc3x4567);
    217  vacc3x0123 = vacc3x4567;
D  4x8s4-minmax-neonfma.c  (all references in xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma())
     65  float32x4_t vacc3x4567 = vacc0x4567;   [local]
    109  vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c0);
    126  vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c1);
    143  vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c2);
    160  vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c3);
    183  vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
    200  vacc3x4567 = vminq_f32(vacc3x4567, vmax);
    210  vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
    214  vst1q_f32(c3 + 4, vacc3x4567);
    235  vacc3x0123 = vacc3x4567;
D  4x8s4-minmax-neon.c  (all references in xnn_f32_igemm_minmax_ukernel_4x8s4__neon())
     65  float32x4_t vacc3x4567 = vacc0x4567;   [local]
    109  vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567c0);
    126  vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567c1);
    143  vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567c2);
    160  vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567c3);
    183  vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
    200  vacc3x4567 = vminq_f32(vacc3x4567, vmax);
    210  vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
    214  vst1q_f32(c3 + 4, vacc3x4567);
    235  vacc3x0123 = vacc3x4567;
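Finally, the assignment every f32 listing ends with (vacc3x0123 = vacc3x4567;) belongs to the column-remainder epilogue: when fewer than 8 output columns remain, the kernel stores the low four lanes, shifts the upper accumulator down, and keeps halving the remainder. A hedged reconstruction of that tail for one row:

#include <arm_neon.h>
#include <stddef.h>

void store_tail_sketch(float* c3, size_t nc,
                       float32x4_t vacc3x0123, float32x4_t vacc3x4567) {
  if (nc & 4) {
    vst1q_f32(c3, vacc3x0123); c3 += 4;
    vacc3x0123 = vacc3x4567;  /* the line the search keeps finding */
  }
  float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
  if (nc & 2) {
    vst1_f32(c3, vacc3x01); c3 += 2;
    vacc3x01 = vget_high_f32(vacc3x0123);
  }
  if (nc & 1) {
    vst1_lane_f32(c3, vacc3x01, 0);
  }
}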