/external/XNNPACK/src/f32-vmulcaddc/gen/ |
D | c8-minmax-sse-2x.c | occurrences of vacc1x0123 in xnn_f32_vmulcaddc_minmax_ukernel_c8__sse_2x():
     56: __m128 vacc1x0123 = _mm_loadu_ps(i1);   (local declaration)
     62: vacc1x0123 = _mm_mul_ps(vacc1x0123, vscale0123);
     70: vacc1x0123 = _mm_add_ps(vacc1x0123, vbias0123);
     75: vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
     80: vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
     86: _mm_storeu_ps(o1, vacc1x0123);
     97: __m128 vacc1x0123 = _mm_loadu_ps(i1);   (local declaration)
    101: vacc1x0123 = _mm_mul_ps(vacc1x0123, vscale0123);
    106: vacc1x0123 = _mm_add_ps(vacc1x0123, vbias0123);
    109: vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
    [all …]
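All of the f32-vmulcaddc hits in this directory trace the same per-vector recipe: acc = min(max(acc * scale + bias, min), max). A minimal self-contained sketch of that step with SSE intrinsics; the helper name and single-vector shape are illustrative only, since the real ukernels unroll this over two rows and four or eight channels per iteration:

  #include <xmmintrin.h>

  /* One 4-float step of the vmulcaddc pattern: y = clamp(x*scale + bias). */
  static void mulcaddc_step_sse(const float* i1, float* o1,
                                __m128 vscale0123, __m128 vbias0123,
                                __m128 vmin, __m128 vmax) {
    __m128 vacc1x0123 = _mm_loadu_ps(i1);             /* load 4 inputs */
    vacc1x0123 = _mm_mul_ps(vacc1x0123, vscale0123);  /* multiply by per-channel scale */
    vacc1x0123 = _mm_add_ps(vacc1x0123, vbias0123);   /* add per-channel bias */
    vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);        /* clamp from below */
    vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);        /* clamp from above */
    _mm_storeu_ps(o1, vacc1x0123);                    /* store 4 outputs */
  }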
|
D | c8-minmax-neon-2x.c | occurrences of vacc1x0123 in xnn_f32_vmulcaddc_minmax_ukernel_c8__neon_2x():
     55: float32x4_t vacc1x0123 = vld1q_f32(i1); i1 += 4;   (local declaration)
     60: vacc1x0123 = vmulq_f32(vacc1x0123, vscale0123);
     68: vacc1x0123 = vaddq_f32(vacc1x0123, vbias0123);
     73: vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
     78: vacc1x0123 = vminq_f32(vacc1x0123, vmax);
     83: vst1q_f32(o1, vacc1x0123); o1 += 4;
     90: float32x4_t vacc1x0123 = vld1q_f32(i1); i1 += 4;   (local declaration)
     93: vacc1x0123 = vmulq_f32(vacc1x0123, vscale0123);
     98: vacc1x0123 = vaddq_f32(vacc1x0123, vbias0123);
    101: vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
    [all …]
|
D | c4-minmax-sse-2x.c | occurrences of vacc1x0123 in xnn_f32_vmulcaddc_minmax_ukernel_c4__sse_2x():
     54: __m128 vacc1x0123 = _mm_loadu_ps(i1);   (local declaration)
     58: vacc1x0123 = _mm_mul_ps(vacc1x0123, vscale0123);
     63: vacc1x0123 = _mm_add_ps(vacc1x0123, vbias0123);
     66: vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
     69: vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
     73: _mm_storeu_ps(o1, vacc1x0123);
     83: __m128 vacc1x0123 = _mm_loadu_ps(i1);   (local declaration)
     87: vacc1x0123 = _mm_mul_ps(vacc1x0123, vscale0123);
     92: vacc1x0123 = _mm_add_ps(vacc1x0123, vbias0123);
     95: vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
    [all …]
|
D | c8-minmax-neonfma-2x.c | occurrences of vacc1x0123 in xnn_f32_vmulcaddc_minmax_ukernel_c8__neonfma_2x():
     55: float32x4_t vacc1x0123 = vld1q_f32(i1); i1 += 4;   (local declaration)
     64: vacc1x0123 = vfmaq_f32(vbias0123, vscale0123, vacc1x0123);
     69: vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
     74: vacc1x0123 = vminq_f32(vacc1x0123, vmax);
     79: vst1q_f32(o1, vacc1x0123); o1 += 4;
     86: float32x4_t vacc1x0123 = vld1q_f32(i1); i1 += 4;   (local declaration)
     92: vacc1x0123 = vfmaq_f32(vbias0123, vscale0123, vacc1x0123);
     95: vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
     98: vacc1x0123 = vminq_f32(vacc1x0123, vmax);
    101: vst1q_f32(o1, vacc1x0123); o1 += 4;
    [all …]
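The neonfma variants collapse the separate multiply and add into one fused op: vfmaq_f32(a, b, c) computes a + b*c, so bias + scale*x becomes a single instruction. A sketch under the same illustrative shape as above:

  #include <arm_neon.h>

  /* Fused form of the vmulcaddc step used by the neonfma kernels.
     Helper name and single-vector shape are illustrative only. */
  static void mulcaddc_step_neonfma(const float* i1, float* o1,
                                    float32x4_t vscale0123, float32x4_t vbias0123,
                                    float32x4_t vmin, float32x4_t vmax) {
    float32x4_t vacc1x0123 = vld1q_f32(i1);
    vacc1x0123 = vfmaq_f32(vbias0123, vscale0123, vacc1x0123);  /* bias + scale*x */
    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
    vst1q_f32(o1, vacc1x0123);
  }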
|
D | c4-minmax-neon-2x.c | occurrences of vacc1x0123 in xnn_f32_vmulcaddc_minmax_ukernel_c4__neon_2x():
     53: float32x4_t vacc1x0123 = vld1q_f32(i1); i1 += 4;   (local declaration)
     56: vacc1x0123 = vmulq_f32(vacc1x0123, vscale0123);
     61: vacc1x0123 = vaddq_f32(vacc1x0123, vbias0123);
     64: vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
     67: vacc1x0123 = vminq_f32(vacc1x0123, vmax);
     70: vst1q_f32(o1, vacc1x0123); o1 += 4;
     76: float32x4_t vacc1x0123 = vld1q_f32(i1); i1 = (const float*) ((uintptr_t) i1 + c);   (local declaration)
     79: vacc1x0123 = vmulq_f32(vacc1x0123, vscale0123);
     84: vacc1x0123 = vaddq_f32(vacc1x0123, vbias0123);
     87: vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
    [all …]
|
D | c4-minmax-neonfma-2x.c | occurrences of vacc1x0123 in xnn_f32_vmulcaddc_minmax_ukernel_c4__neonfma_2x():
     53: float32x4_t vacc1x0123 = vld1q_f32(i1); i1 += 4;   (local declaration)
     59: vacc1x0123 = vfmaq_f32(vbias0123, vscale0123, vacc1x0123);
     62: vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
     65: vacc1x0123 = vminq_f32(vacc1x0123, vmax);
     68: vst1q_f32(o1, vacc1x0123); o1 += 4;
     74: float32x4_t vacc1x0123 = vld1q_f32(i1); i1 = (const float*) ((uintptr_t) i1 + c);   (local declaration)
     80: vacc1x0123 = vfmaq_f32(vbias0123, vscale0123, vacc1x0123);
     83: vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
     86: vacc1x0123 = vminq_f32(vacc1x0123, vmax);
     89: float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
    [all …]
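The second declaration in each c4 kernel (lines 74/76 above) belongs to the remainder path: a full 4-wide vector is still computed, and the vget_low_f32 hit at line 89 begins a piecewise store of the last 1-3 floats. A hedged reconstruction of that tail; the branch logic past the listed fragments is inferred, and the full-width load assumes 4 readable floats at i1, as the real kernel arranges:

  #include <arm_neon.h>
  #include <stddef.h>
  #include <stdint.h>

  /* Remainder step for c < 16 bytes (1-3 floats left); a reconstruction,
     not the verbatim generated code. */
  static void mulcaddc_tail_neonfma(const float* i1, float* o1, size_t c,
                                    float32x4_t vscale, float32x4_t vbias,
                                    float32x4_t vmin, float32x4_t vmax) {
    float32x4_t vacc1x0123 = vld1q_f32(i1);
    i1 = (const float*) ((uintptr_t) i1 + c);   /* advance by the ragged tail */
    vacc1x0123 = vfmaq_f32(vbias, vscale, vacc1x0123);
    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
    float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
    if (c & (2 * sizeof(float))) {              /* 2 or 3 floats remain */
      vst1_f32(o1, vacc1x01); o1 += 2;
      vacc1x01 = vget_high_f32(vacc1x0123);
    }
    if (c & sizeof(float)) {                    /* odd float remains */
      vst1_lane_f32(o1, vacc1x01, 0);
    }
  }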
|
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 2x8-minmax-neon-mull-addw-dup.c | occurrences of vacc1x0123 in xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup():
     51: int32x4_t vacc1x0123 = vacc0x0123;   (local declaration)
     65: vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
     73: vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
     81: vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
     89: vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
     97: vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
    105: vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
    113: vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
    121: vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c7));
    136: vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
    [all …]
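Each hit here is the second half of a widening multiply-accumulate: a vmull_s8 elsewhere in the kernel forms the 16-bit product vector vprod1x01234567ck, and vaddw_s16 widens its halves into the two int32x4 accumulators. A sketch of one k-step; the vdup_lane_s8 broadcast is a hedged reconstruction of the "dup" part of the kernel name:

  #include <arm_neon.h>

  /* One column step of the mull-addw-dup scheme for row 1.
     Names mirror the fragments; the helper itself is illustrative. */
  static inline void qs8_mull_addw_dup_step(int32x4_t* vacc1x0123, int32x4_t* vacc1x4567,
                                            int8x8_t va1, int8x8_t vb01234567c0) {
    const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
    *vacc1x0123 = vaddw_s16(*vacc1x0123, vget_low_s16(vprod1x01234567c0));
    *vacc1x4567 = vaddw_s16(*vacc1x4567, vget_high_s16(vprod1x01234567c0));
  }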
|
D | 2x8-minmax-neon-mlal-lane.c | occurrences of vacc1x0123 in xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane():
     51: int32x4_t vacc1x0123 = vacc0x0123;   (local declaration)
     66: vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
     73: vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
     80: vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
     87: vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
     95: vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
    102: vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
    109: vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
    116: vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
    132: vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
    [all …]
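The mlal-lane flavor skips the intermediate product vector: vmlal_lane_s16 multiplies four 16-bit weights by one lane of the widened activation and accumulates into int32 in a single intrinsic, marching through lanes 0-3 of the low and then the high half of vxa1 (hits at 66-116). A sketch of the first step; the vmovl_s8 widening is a hedged reconstruction of the surrounding code:

  #include <arm_neon.h>

  /* One column step of the mlal-lane scheme for row 1 (lane 0 shown). */
  static inline void qs8_mlal_lane_step(int32x4_t* vacc1x0123, int32x4_t* vacc1x4567,
                                        int8x8_t va1, int8x8_t vb01234567c0) {
    const int16x8_t vxa1 = vmovl_s8(va1);                    /* widen activations */
    const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);  /* widen weights */
    *vacc1x0123 = vmlal_lane_s16(*vacc1x0123, vget_low_s16(vxb01234567c0),
                                 vget_low_s16(vxa1), 0);
    *vacc1x4567 = vmlal_lane_s16(*vacc1x4567, vget_high_s16(vxb01234567c0),
                                 vget_low_s16(vxa1), 0);
  }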
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 2x8-minmax-neon-mull-addw-dup.c | occurrences of vacc1x0123 in xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup():
     52: int32x4_t vacc1x0123 = vacc0x0123;   (local declaration)
     78: vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
     86: vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
     94: vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
    102: vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
    110: vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
    118: vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
    126: vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
    134: vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c7));
    149: vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
    [all …]
|
D | 2x8-minmax-neon-mlal-lane.c | occurrences of vacc1x0123 in xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane():
     52: int32x4_t vacc1x0123 = vacc0x0123;   (local declaration)
     79: vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
     86: vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
     93: vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
    100: vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
    108: vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
    115: vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
    122: vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
    129: vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
    145: vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
    [all …]
|
/external/XNNPACK/src/f32-gemm/gen/ |
D | 3x8s4-minmax-wasmsimd-x86.c | occurrences of vacc1x0123 in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86():
     56: v128_t vacc1x0123 = vacc0x0123;   (local declaration)
     76: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123c0));
     90: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123c1));
    104: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123c2));
    118: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123c3));
    142: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123));
    154: vacc1x0123 = wasm_v128_bitselect(vmin, vacc1x0123, wasm_f32x4_lt(vacc1x0123, vmin));
    162: vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vmax, wasm_f32x4_le(vacc1x0123, vmax));
    172: wasm_v128_store(c1, vacc1x0123);
    187: wasm_v128_store(c1, vacc1x0123);
    [all …]
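Lines 154/162 show the x86-tuned clamp: wasm_f32x4_min/max carry NaN and signed-zero semantics that lower to multi-instruction sequences on x86, so these kernels compare and select instead. A sketch of just the clamp:

  #include <wasm_simd128.h>

  /* Compare-and-bitselect clamp, as in the wasmsimd-x86 kernels:
     wasm_v128_bitselect(a, b, m) takes bits from a where m is set, else from b. */
  static inline v128_t clamp_wasmsimd_x86(v128_t vacc, v128_t vmin, v128_t vmax) {
    vacc = wasm_v128_bitselect(vmin, vacc, wasm_f32x4_lt(vacc, vmin));  /* max(acc, min) */
    vacc = wasm_v128_bitselect(vacc, vmax, wasm_f32x4_le(vacc, vmax));  /* min(acc, max) */
    return vacc;
  }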
|
D | 3x8-minmax-wasmsimd-x86-splat.c | occurrences of vacc1x0123 in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat():
     56: v128_t vacc1x0123 = vacc0x0123;   (local declaration)
     79: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1c0, vb0123c0));
     92: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1c1, vb0123c1));
    105: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1c2, vb0123c2));
    118: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1c3, vb0123c3));
    141: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123));
    153: vacc1x0123 = wasm_v128_bitselect(vmin, vacc1x0123, wasm_f32x4_lt(vacc1x0123, vmin));
    161: vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vmax, wasm_f32x4_le(vacc1x0123, vmax));
    171: wasm_v128_store(c1, vacc1x0123);
    186: wasm_v128_store(c1, vacc1x0123);
    [all …]
|
D | 3x8-minmax-sse-dup.c | occurrences of vacc1x0123 in xnn_f32_gemm_minmax_ukernel_3x8__sse_dup():
     56: __m128 vacc1x0123 = vacc0x0123;   (local declaration)
     80: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1c0000, vb0123c0));
     94: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1c1111, vb0123c1));
    108: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1c2222, vb0123c2));
    122: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1c3333, vb0123c3));
    145: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123));
    157: vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
    165: vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
    175: _mm_storeu_ps(c1, vacc1x0123);
    190: _mm_storeu_ps(c1, vacc1x0123);
    [all …]
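In the "dup" kernels, va1c0000 and friends are single lanes of va1 broadcast to all four positions. The shuffle producing them sits between the listed hits; a hedged reconstruction of one multiply-accumulate step:

  #include <xmmintrin.h>

  /* Broadcast lane 0 of the activation vector, then multiply-accumulate.
     The self-shuffle is a reconstruction of code between the listed hits. */
  static inline __m128 gemm_dup_step(__m128 vacc1x0123, __m128 va1, __m128 vb0123c0) {
    const __m128 va1c0000 = _mm_shuffle_ps(va1, va1, _MM_SHUFFLE(0, 0, 0, 0));
    return _mm_add_ps(vacc1x0123, _mm_mul_ps(va1c0000, vb0123c0));
  }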
|
D | 3x8-minmax-sse2-dup.c | occurrences of vacc1x0123 in xnn_f32_gemm_minmax_ukernel_3x8__sse2_dup():
     56: __m128 vacc1x0123 = vacc0x0123;   (local declaration)
     80: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1c0000, vb0123c0));
     94: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1c1111, vb0123c1));
    108: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1c2222, vb0123c2));
    122: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1c3333, vb0123c3));
    145: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123));
    157: vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
    165: vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
    175: _mm_storeu_ps(c1, vacc1x0123);
    190: _mm_storeu_ps(c1, vacc1x0123);
    [all …]
|
D | 3x8s4-minmax-wasmsimd-arm.c | occurrences of vacc1x0123 in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm():
     58: v128_t vacc1x0123 = vacc0x0123;   (local declaration)
     78: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123c0));
     92: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123c1));
    106: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123c2));
    120: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123c3));
    144: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123));
    155: vacc1x0123 = wasm_f32x4_max(vacc1x0123, vmin);
    162: vacc1x0123 = wasm_f32x4_min(vacc1x0123, vmax);
    172: wasm_v128_store(c1, vacc1x0123);
    187: wasm_v128_store(c1, vacc1x0123);
    [all …]
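Lines 155/162 are the arm-tuned counterpart of the clamp above: wasm_f32x4_max/min are used directly, since their semantics map cheaply onto AArch64 fmax/fmin. A sketch:

  #include <wasm_simd128.h>

  /* Direct min/max clamp, as in the wasmsimd-arm kernels. */
  static inline v128_t clamp_wasmsimd_arm(v128_t vacc, v128_t vmin, v128_t vmax) {
    vacc = wasm_f32x4_max(vacc, vmin);
    return wasm_f32x4_min(vacc, vmax);
  }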
|
D | 3x8s4-minmax-sse.c | occurrences of vacc1x0123 in xnn_f32_gemm_minmax_ukernel_3x8s4__sse():
     56: __m128 vacc1x0123 = vacc0x0123;   (local declaration)
     76: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123c0));
     90: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123c1));
    104: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123c2));
    118: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123c3));
    142: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123));
    154: vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
    162: vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
    172: _mm_storeu_ps(c1, vacc1x0123);
    187: _mm_storeu_ps(c1, vacc1x0123);
    [all …]
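The s4 kernels avoid lane broadcasts altogether: va1 is multiplied element-wise against four B column vectors in turn, rotating A one lane between steps (the packed weights are laid out to match the rotation). The rotate itself falls between the listed hits; a hedged reconstruction of one quarter-step:

  #include <xmmintrin.h>

  /* One of the four s4 sub-steps: multiply-accumulate, then rotate A lanes. */
  static inline __m128 gemm_s4_quarter(__m128 vacc1x0123, __m128* va1, __m128 vb0123c0) {
    vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(*va1, vb0123c0));
    *va1 = _mm_shuffle_ps(*va1, *va1, _MM_SHUFFLE(0, 3, 2, 1));  /* rotate one lane */
    return vacc1x0123;
  }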
|
D | 3x8-minmax-wasmsimd-arm-splat.c | occurrences of vacc1x0123 in xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat():
     58: v128_t vacc1x0123 = vacc0x0123;   (local declaration)
     81: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1c0, vb0123c0));
     94: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1c1, vb0123c1));
    107: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1c2, vb0123c2));
    120: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1c3, vb0123c3));
    143: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123));
    154: vacc1x0123 = wasm_f32x4_max(vacc1x0123, vmin);
    161: vacc1x0123 = wasm_f32x4_min(vacc1x0123, vmax);
    171: wasm_v128_store(c1, vacc1x0123);
    186: wasm_v128_store(c1, vacc1x0123);
    [all …]
|
/external/XNNPACK/src/f32-gemm/gen-inc/ |
D | 3x8s4inc-minmax-wasmsimd-x86.c | occurrences of vacc1x0123 in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_x86():
     58: v128_t vacc1x0123 = wasm_v128_load(acc + 8);   (local declaration)
     78: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123c0));
     92: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123c1));
    106: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123c2));
    120: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123c3));
    144: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123));
    156: vacc1x0123 = wasm_v128_bitselect(vmin, vacc1x0123, wasm_f32x4_lt(vacc1x0123, vmin));
    164: vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vmax, wasm_f32x4_le(vacc1x0123, vmax));
    174: wasm_v128_store(c1, vacc1x0123);
    189: wasm_v128_store(c1, vacc1x0123);
    [all …]
|
D | 3x8inc-minmax-wasmsimd-x86-splat.c | occurrences of vacc1x0123 in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_x86_splat():
     58: v128_t vacc1x0123 = wasm_v128_load(acc + 8);   (local declaration)
     81: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1c0, vb0123c0));
     94: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1c1, vb0123c1));
    107: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1c2, vb0123c2));
    120: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1c3, vb0123c3));
    143: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123));
    155: vacc1x0123 = wasm_v128_bitselect(vmin, vacc1x0123, wasm_f32x4_lt(vacc1x0123, vmin));
    163: vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vmax, wasm_f32x4_le(vacc1x0123, vmax));
    173: wasm_v128_store(c1, vacc1x0123);
    188: wasm_v128_store(c1, vacc1x0123);
    [all …]
|
D | 3x8s4inc-minmax-sse.c | occurrences of vacc1x0123 in xnn_f32_gemminc_minmax_ukernel_3x8s4__sse():
     58: __m128 vacc1x0123 = _mm_load_ps(acc + 8);   (local declaration)
     78: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123c0));
     92: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123c1));
    106: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123c2));
    120: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123c3));
    144: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123));
    156: vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
    164: vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
    174: _mm_storeu_ps(c1, vacc1x0123);
    189: _mm_storeu_ps(c1, vacc1x0123);
    [all …]
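The gen-inc kernels differ from their f32-gemm/gen twins only at initialization: line 58 loads the accumulator from a caller-supplied partial-sum buffer acc rather than from the packed bias, which lets a K dimension split across calls resume. A sketch of that load for the 3x8 tile implied by the acc + 8 offset (helper name illustrative):

  #include <xmmintrin.h>

  /* Initialize a 3x8 accumulator tile from a packed row-major buffer:
     row r, columns 0-3 at acc + r*8 and columns 4-7 at acc + r*8 + 4. */
  static void gemminc_load_acc_3x8(const float* acc, __m128 vacc[3][2]) {
    for (int r = 0; r < 3; r++) {
      vacc[r][0] = _mm_load_ps(acc + r * 8);      /* row 1, cols 0-3 is acc + 8 */
      vacc[r][1] = _mm_load_ps(acc + r * 8 + 4);
    }
  }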
|
D | 3x8inc-minmax-wasmsimd-arm-splat.c | occurrences of vacc1x0123 in xnn_f32_gemminc_minmax_ukernel_3x8__wasmsimd_arm_splat():
     60: v128_t vacc1x0123 = wasm_v128_load(acc + 8);   (local declaration)
     83: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1c0, vb0123c0));
     96: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1c1, vb0123c1));
    109: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1c2, vb0123c2));
    122: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1c3, vb0123c3));
    145: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123));
    156: vacc1x0123 = wasm_f32x4_max(vacc1x0123, vmin);
    163: vacc1x0123 = wasm_f32x4_min(vacc1x0123, vmax);
    173: wasm_v128_store(c1, vacc1x0123);
    188: wasm_v128_store(c1, vacc1x0123);
    [all …]
|
/external/XNNPACK/src/f32-igemm/gen/ |
D | 3x8-minmax-wasmsimd-x86-splat.c | occurrences of vacc1x0123 in xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_splat():
     56: v128_t vacc1x0123 = vacc0x0123;   (local declaration)
     98: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1c0, vb0123c0));
    111: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1c1, vb0123c1));
    124: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1c2, vb0123c2));
    137: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1c3, vb0123c3));
    161: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123));
    173: vacc1x0123 = wasm_v128_bitselect(vmin, vacc1x0123, wasm_f32x4_lt(vacc1x0123, vmin));
    181: vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vmax, wasm_f32x4_le(vacc1x0123, vmax));
    191: wasm_v128_store(c1, vacc1x0123);
    203: wasm_v128_store(c1, vacc1x0123);
    [all …]
|
D | 3x8s4-minmax-wasmsimd-x86.c | occurrences of vacc1x0123 in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_x86():
     56: v128_t vacc1x0123 = vacc0x0123;   (local declaration)
     95: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123c0));
    109: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123c1));
    123: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123c2));
    137: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123c3));
    162: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123));
    174: vacc1x0123 = wasm_v128_bitselect(vmin, vacc1x0123, wasm_f32x4_lt(vacc1x0123, vmin));
    182: vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vmax, wasm_f32x4_le(vacc1x0123, vmax));
    192: wasm_v128_store(c1, vacc1x0123);
    204: wasm_v128_store(c1, vacc1x0123);
    [all …]
|
D | 3x8s4-minmax-wasmsimd-arm.c | occurrences of vacc1x0123 in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm():
     58: v128_t vacc1x0123 = vacc0x0123;   (local declaration)
     97: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123c0));
    111: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123c1));
    125: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123c2));
    139: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123c3));
    164: vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1, vb0123));
    175: vacc1x0123 = wasm_f32x4_max(vacc1x0123, vmin);
    182: vacc1x0123 = wasm_f32x4_min(vacc1x0123, vmax);
    192: wasm_v128_store(c1, vacc1x0123);
    204: wasm_v128_store(c1, vacc1x0123);
    [all …]
|
D | 3x8s4-minmax-sse.c | occurrences of vacc1x0123 in xnn_f32_igemm_minmax_ukernel_3x8s4__sse():
     56: __m128 vacc1x0123 = vacc0x0123;   (local declaration)
     95: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123c0));
    109: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123c1));
    123: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123c2));
    137: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123c3));
    162: vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123));
    174: vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
    182: vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
    192: _mm_storeu_ps(c1, vacc1x0123);
    204: _mm_storeu_ps(c1, vacc1x0123);
    [all …]
|