/external/XNNPACK/src/bf16-gemm/gen/
1x4c8-minmax-neonbf16-bfmlal.c  (in xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal)
   47  …float32x4_t vacc0x3 = vcvt_f32_bf16(vld1_lane_bf16(w, vreinterpret_bf16_u16(vdup_n_u16(0)), 0)); w…  [local]
   61  vacc0x3 = vbfmlalbq_f32(vacc0x3, va0, vb3);
   66  vacc0x3 = vbfmlaltq_f32(vacc0x3, va0, vb3);
   91  vacc0x3 = vbfmlalbq_f32(vacc0x3, va0x3, vb3);
   92  vacc0x3 = vbfmlaltq_f32(vacc0x3, va0x3, vb3);
   97  const float32x4_t vacc0x23 = vpaddq_f32(vacc0x2, vacc0x3);
  104  const float32x2_t vsum0x3 = vadd_f32(vget_low_f32(vacc0x3), vget_high_f32(vacc0x3));

1x4c8-minmax-neonfma-zip.c  (in xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip)
   48  …float32x4_t vacc0x3 = vreinterpretq_f32_u32(vshll_n_u16(vld1_lane_u16(w, vdup_n_u16(0), 0), 16)); …  [local]
   69  vacc0x3 = vfmaq_f32(vacc0x3, va0e, vb3e);
   81  vacc0x3 = vfmaq_f32(vacc0x3, va0o, vb3o);
  114  vacc0x3 = vfmaq_f32(vacc0x3, va0x3e, vb3e);
  129  vacc0x3 = vfmaq_f32(vacc0x3, va0x3o, vb3o);
  134  const float32x4_t vacc0x23 = vpaddq_f32(vacc0x2, vacc0x3);
  141  const float32x2_t vsum0x3 = vadd_f32(vget_low_f32(vacc0x3), vget_high_f32(vacc0x3));

2x4c8-minmax-neonbf16-bfmlal.c  (in xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal)
   53  …float32x4_t vacc0x3 = vcvt_f32_bf16(vld1_lane_bf16(w, vreinterpret_bf16_u16(vdup_n_u16(0)), 0)); w…  [local]
   57  float32x4_t vacc1x3 = vacc0x3;
   75  vacc0x3 = vbfmlalbq_f32(vacc0x3, va0, vb3);
   84  vacc0x3 = vbfmlaltq_f32(vacc0x3, va0, vb3);
  120  vacc0x3 = vbfmlalbq_f32(vacc0x3, va0x3, vb3);
  121  vacc0x3 = vbfmlaltq_f32(vacc0x3, va0x3, vb3);
  130  const float32x4_t vacc0x23 = vpaddq_f32(vacc0x2, vacc0x3);
  142  const float32x2_t vsum0x3 = vadd_f32(vget_low_f32(vacc0x3), vget_high_f32(vacc0x3));

1x4c8-minmax-neonfma-shland.c  (in xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland)
   48  …float32x4_t vacc0x3 = vreinterpretq_f32_u32(vshll_n_u16(vld1_lane_u16(w, vdup_n_u16(0), 0), 16)); …  [local]
   69  vacc0x3 = vfmaq_f32(vacc0x3, va0e, vb3e);
   81  vacc0x3 = vfmaq_f32(vacc0x3, va0o, vb3o);
  114  vacc0x3 = vfmaq_f32(vacc0x3, va0x3e, vb3e);
  129  vacc0x3 = vfmaq_f32(vacc0x3, va0x3o, vb3o);
  134  const float32x4_t vacc0x23 = vpaddq_f32(vacc0x2, vacc0x3);
  141  const float32x2_t vsum0x3 = vadd_f32(vget_low_f32(vacc0x3), vget_high_f32(vacc0x3));

1x4c8-minmax-neonbf16-bfdot.c  (in xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot)
   47  …float32x4_t vacc0x3 = vcvt_f32_bf16(vld1_lane_bf16(w, vreinterpret_bf16_u16(vdup_n_u16(0)), 0)); w…  [local]
   61  vacc0x3 = vbfdotq_f32(vacc0x3, va0, vb3);
   83  vacc0x3 = vbfdotq_f32(vacc0x3, va0x3, vb3);
   88  const float32x4_t vacc0x23 = vpaddq_f32(vacc0x2, vacc0x3);
   95  const float32x2_t vsum0x3 = vadd_f32(vget_low_f32(vacc0x3), vget_high_f32(vacc0x3));

3x4c8-minmax-neonbf16-bfmlal.c  (in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal)
   59  …float32x4_t vacc0x3 = vcvt_f32_bf16(vld1_lane_bf16(w, vreinterpret_bf16_u16(vdup_n_u16(0)), 0)); w…  [local]
   63  float32x4_t vacc1x3 = vacc0x3;
   67  float32x4_t vacc2x3 = vacc0x3;
   89  vacc0x3 = vbfmlalbq_f32(vacc0x3, va0, vb3);
  102  vacc0x3 = vbfmlaltq_f32(vacc0x3, va0, vb3);
  149  vacc0x3 = vbfmlalbq_f32(vacc0x3, va0x3, vb3);
  150  vacc0x3 = vbfmlaltq_f32(vacc0x3, va0x3, vb3);
  163  const float32x4_t vacc0x23 = vpaddq_f32(vacc0x2, vacc0x3);
  180  const float32x2_t vsum0x3 = vadd_f32(vget_low_f32(vacc0x3), vget_high_f32(vacc0x3));

2x4c8-minmax-neonfma-shland.c  (in xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland)
   54  …float32x4_t vacc0x3 = vreinterpretq_f32_u32(vshll_n_u16(vld1_lane_u16(w, vdup_n_u16(0), 0), 16)); …  [local]
   58  float32x4_t vacc1x3 = vacc0x3;
   84  vacc0x3 = vfmaq_f32(vacc0x3, va0e, vb3e);
  101  vacc0x3 = vfmaq_f32(vacc0x3, va0o, vb3o);
  147  vacc0x3 = vfmaq_f32(vacc0x3, va0x3e, vb3e);
  170  vacc0x3 = vfmaq_f32(vacc0x3, va0x3o, vb3o);
  177  const float32x4_t vacc0x23 = vpaddq_f32(vacc0x2, vacc0x3);
  189  const float32x2_t vsum0x3 = vadd_f32(vget_low_f32(vacc0x3), vget_high_f32(vacc0x3));

2x4c8-minmax-neonfma-zip.c  (in xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip)
   54  …float32x4_t vacc0x3 = vreinterpretq_f32_u32(vshll_n_u16(vld1_lane_u16(w, vdup_n_u16(0), 0), 16)); …  [local]
   58  float32x4_t vacc1x3 = vacc0x3;
   84  vacc0x3 = vfmaq_f32(vacc0x3, va0e, vb3e);
  101  vacc0x3 = vfmaq_f32(vacc0x3, va0o, vb3o);
  147  vacc0x3 = vfmaq_f32(vacc0x3, va0x3e, vb3e);
  170  vacc0x3 = vfmaq_f32(vacc0x3, va0x3o, vb3o);
  177  const float32x4_t vacc0x23 = vpaddq_f32(vacc0x2, vacc0x3);
  189  const float32x2_t vsum0x3 = vadd_f32(vget_low_f32(vacc0x3), vget_high_f32(vacc0x3));

4x4c8-minmax-neonbf16-bfmlal.c  (in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal)
   65  …float32x4_t vacc0x3 = vcvt_f32_bf16(vld1_lane_bf16(w, vreinterpret_bf16_u16(vdup_n_u16(0)), 0)); w…  [local]
   69  float32x4_t vacc1x3 = vacc0x3;
   73  float32x4_t vacc2x3 = vacc0x3;
   77  float32x4_t vacc3x3 = vacc0x3;
  103  vacc0x3 = vbfmlalbq_f32(vacc0x3, va0, vb3);
  120  vacc0x3 = vbfmlaltq_f32(vacc0x3, va0, vb3);
  178  vacc0x3 = vbfmlalbq_f32(vacc0x3, va0x3, vb3);
  179  vacc0x3 = vbfmlaltq_f32(vacc0x3, va0x3, vb3);
  196  const float32x4_t vacc0x23 = vpaddq_f32(vacc0x2, vacc0x3);
  218  const float32x2_t vsum0x3 = vadd_f32(vget_low_f32(vacc0x3), vget_high_f32(vacc0x3));

2x4c8-minmax-neonbf16-bfdot.c  (in xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot)
   53  …float32x4_t vacc0x3 = vcvt_f32_bf16(vld1_lane_bf16(w, vreinterpret_bf16_u16(vdup_n_u16(0)), 0)); w…  [local]
   57  float32x4_t vacc1x3 = vacc0x3;
   75  vacc0x3 = vbfdotq_f32(vacc0x3, va0, vb3);
  105  vacc0x3 = vbfdotq_f32(vacc0x3, va0x3, vb3);
  113  const float32x4_t vacc0x23 = vpaddq_f32(vacc0x2, vacc0x3);
  125  const float32x2_t vsum0x3 = vadd_f32(vget_low_f32(vacc0x3), vget_high_f32(vacc0x3));

3x4c8-minmax-neonfma-shland.c  (in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland)
   60  …float32x4_t vacc0x3 = vreinterpretq_f32_u32(vshll_n_u16(vld1_lane_u16(w, vdup_n_u16(0), 0), 16)); …  [local]
   64  float32x4_t vacc1x3 = vacc0x3;
   68  float32x4_t vacc2x3 = vacc0x3;
   99  vacc0x3 = vfmaq_f32(vacc0x3, va0e, vb3e);
  121  vacc0x3 = vfmaq_f32(vacc0x3, va0o, vb3o);
  180  vacc0x3 = vfmaq_f32(vacc0x3, va0x3e, vb3e);
  211  vacc0x3 = vfmaq_f32(vacc0x3, va0x3o, vb3o);
  220  const float32x4_t vacc0x23 = vpaddq_f32(vacc0x2, vacc0x3);
  237  const float32x2_t vsum0x3 = vadd_f32(vget_low_f32(vacc0x3), vget_high_f32(vacc0x3));

3x4c8-minmax-neonfma-zip.c  (in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip)
   60  …float32x4_t vacc0x3 = vreinterpretq_f32_u32(vshll_n_u16(vld1_lane_u16(w, vdup_n_u16(0), 0), 16)); …  [local]
   64  float32x4_t vacc1x3 = vacc0x3;
   68  float32x4_t vacc2x3 = vacc0x3;
   99  vacc0x3 = vfmaq_f32(vacc0x3, va0e, vb3e);
  121  vacc0x3 = vfmaq_f32(vacc0x3, va0o, vb3o);
  180  vacc0x3 = vfmaq_f32(vacc0x3, va0x3e, vb3e);
  211  vacc0x3 = vfmaq_f32(vacc0x3, va0x3o, vb3o);
  220  const float32x4_t vacc0x23 = vpaddq_f32(vacc0x2, vacc0x3);
  237  const float32x2_t vsum0x3 = vadd_f32(vget_low_f32(vacc0x3), vget_high_f32(vacc0x3));

5x4c8-minmax-neonbf16-bfmlal.c  (in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal)
   71  …float32x4_t vacc0x3 = vcvt_f32_bf16(vld1_lane_bf16(w, vreinterpret_bf16_u16(vdup_n_u16(0)), 0)); w…  [local]
   75  float32x4_t vacc1x3 = vacc0x3;
   79  float32x4_t vacc2x3 = vacc0x3;
   83  float32x4_t vacc3x3 = vacc0x3;
   87  float32x4_t vacc4x3 = vacc0x3;
  117  vacc0x3 = vbfmlalbq_f32(vacc0x3, va0, vb3);
  138  vacc0x3 = vbfmlaltq_f32(vacc0x3, va0, vb3);
  207  vacc0x3 = vbfmlalbq_f32(vacc0x3, va0x3, vb3);
  208  vacc0x3 = vbfmlaltq_f32(vacc0x3, va0x3, vb3);
  229  const float32x4_t vacc0x23 = vpaddq_f32(vacc0x2, vacc0x3);
  [all …]

3x4c8-minmax-neonbf16-bfdot.c  (in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot)
   59  …float32x4_t vacc0x3 = vcvt_f32_bf16(vld1_lane_bf16(w, vreinterpret_bf16_u16(vdup_n_u16(0)), 0)); w…  [local]
   63  float32x4_t vacc1x3 = vacc0x3;
   67  float32x4_t vacc2x3 = vacc0x3;
   89  vacc0x3 = vbfdotq_f32(vacc0x3, va0, vb3);
  127  vacc0x3 = vbfdotq_f32(vacc0x3, va0x3, vb3);
  138  const float32x4_t vacc0x23 = vpaddq_f32(vacc0x2, vacc0x3);
  155  const float32x2_t vsum0x3 = vadd_f32(vget_low_f32(vacc0x3), vget_high_f32(vacc0x3));

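The bf16 GEMM hits above all follow the same shape: each of the four output columns keeps its own float32x4_t accumulator (vacc0x0 through vacc0x3), initialized from the packed weights at w, updated once per K step (vbfmlalbq_f32/vbfmlaltq_f32, vbfdotq_f32, or vfmaq_f32 on even/odd bf16 halves), and finally collapsed by pairwise adds (vpaddq_f32, then vadd_f32 across halves) into the output row. Below is a rough scalar analogue of that accumulate-then-reduce pattern; the function name, arguments, and the dense row-major B layout are illustrative assumptions, and the bias initialization seen in the real kernels is omitted.

    #include <stddef.h>

    /* Scalar sketch: 4 output columns, 4 lanes of partial sums per column,
       reduced at the end the way the NEON tail uses vpaddq_f32/vadd_f32.
       Assumes kc is a multiple of 4. */
    void bf16_gemm_1x4_sketch(size_t kc, const float* a, const float* b, float* c) {
      float acc[4][4] = {{0.0f}};
      for (size_t k = 0; k < kc; k += 4) {
        for (size_t n = 0; n < 4; n++) {
          for (size_t lane = 0; lane < 4; lane++) {
            acc[n][lane] += a[k + lane] * b[n * kc + k + lane];  /* per-lane multiply-add */
          }
        }
      }
      for (size_t n = 0; n < 4; n++) {
        c[n] = (acc[n][0] + acc[n][1]) + (acc[n][2] + acc[n][3]);  /* horizontal reduction */
      }
    }
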
/external/XNNPACK/src/f32-ppmm/gen/
2x4-minmax-scalar.c  (in xnn_f32_ppmm_minmax_ukernel_2x4__scalar)
   43  float vacc0x3 = w[3];  [local]
   47  float vacc1x3 = vacc0x3;
   68  vacc0x3 += va0 * vb3;
   81  vacc0x3 = math_min_f32(vacc0x3, vmax);
   91  vacc0x3 = math_max_f32(vacc0x3, vmin);
  102  c0[3] = vacc0x3;

4x4-minmax-scalar.c  (in xnn_f32_ppmm_minmax_ukernel_4x4__scalar)
   51  float vacc0x3 = w[3];  [local]
   55  float vacc1x3 = vacc0x3;
   59  float vacc2x3 = vacc0x3;
   63  float vacc3x3 = vacc0x3;
   92  vacc0x3 += va0 * vb3;
  113  vacc0x3 = math_min_f32(vacc0x3, vmax);
  131  vacc0x3 = math_max_f32(vacc0x3, vmin);
  152  c0[3] = vacc0x3;

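The PPMM hits show a simpler scalar pattern: vacc0x3 starts from the per-column bias w[3], the remaining rows copy row 0's accumulator, each K step adds va0 * vb3, and the result is clamped with math_min_f32/math_max_f32 before the store to c0[3]. A hedged sketch of that flow, assuming a 2x4 tile and illustrative names:

    #include <stddef.h>
    #include <math.h>

    /* Sketch of the 2x4 scalar PPMM accumulator pattern; "kc", the packed-A layout,
       and fminf/fmaxf in place of math_min_f32/math_max_f32 are assumptions. */
    void ppmm_2x4_sketch(size_t kc, const float* w, const float* a,
                         float* c0, float* c1, float vmin, float vmax) {
      float acc0[4], acc1[4];
      for (int n = 0; n < 4; n++) acc0[n] = acc1[n] = w[n];  /* bias shared down the rows */
      w += 4;
      for (size_t k = 0; k < kc; k++) {
        for (int n = 0; n < 4; n++) {
          acc0[n] += a[2 * k + 0] * w[n];  /* vacc0xN += va0 * vbN */
          acc1[n] += a[2 * k + 1] * w[n];  /* vacc1xN += va1 * vbN */
        }
        w += 4;
      }
      for (int n = 0; n < 4; n++) {
        c0[n] = fmaxf(fminf(acc0[n], vmax), vmin);  /* min with vmax, then max with vmin */
        c1[n] = fmaxf(fminf(acc1[n], vmax), vmin);
      }
    }
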
/external/XNNPACK/src/f32-vmulcaddc/gen/
c4-minmax-scalar-2x.c  (in xnn_f32_vmulcaddc_minmax_ukernel_c4__scalar_2x)
   57  float vacc0x3 = i0[3];  [local]
   73  vacc0x3 = vacc0x3 * vscale3 + vbias3;
   82  vacc0x3 = math_max_f32(vacc0x3, vmin);
   91  vacc0x3 = math_min_f32(vacc0x3, vmax);
  100  o0[3] = vacc0x3;

c4-minmax-wasm-2x.c  (in xnn_f32_vmulcaddc_minmax_ukernel_c4__wasm_2x)
   57  float vacc0x3 = i0[3];  [local]
   73  vacc0x3 = vacc0x3 * vscale3 + vbias3;
   82  vacc0x3 = __builtin_wasm_max_f32(vacc0x3, vmin);
   91  vacc0x3 = __builtin_wasm_min_f32(vacc0x3, vmax);
  100  o0[3] = vacc0x3;

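In the vmulcaddc hits, vacc0x3 is not a dot-product accumulator at all: it is loaded from the input (i0[3]), multiplied by the per-channel scale, biased, clamped to [vmin, vmax], and written back to o0[3]. A minimal sketch of one row of the c4 variant, with illustrative parameter names:

    #include <math.h>

    /* One row, four channels: acc = x * scale + bias, then clamp.
       fminf/fmaxf stand in for math_min_f32/math_max_f32. */
    void vmulcaddc_c4_row_sketch(const float* i0, const float* scale, const float* bias,
                                 float* o0, float vmin, float vmax) {
      for (int ch = 0; ch < 4; ch++) {
        float acc = i0[ch] * scale[ch] + bias[ch];
        acc = fmaxf(acc, vmin);
        acc = fminf(acc, vmax);
        o0[ch] = acc;
      }
    }
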
/external/XNNPACK/src/f32-spmm/gen/
8x4-minmax-scalar.c  (in xnn_f32_spmm_minmax_ukernel_8x4__scalar)
   65  float vacc0x3 = *w++;  [local]
   66  float vacc1x3 = vacc0x3;
   67  float vacc2x3 = vacc0x3;
   68  float vacc3x3 = vacc0x3;
   69  float vacc4x3 = vacc0x3;
   70  float vacc5x3 = vacc0x3;
   71  float vacc6x3 = vacc0x3;
   72  float vacc7x3 = vacc0x3;
  113  vacc0x3 += vi0 * vw3;
  147  float vout0x3 = math_min_f32(vacc0x3, vmax);
  [all …]

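In the 8x4 SpMM hits, each of the four output channels draws one bias from *w++ and copies it into all eight row accumulators (vacc0x3 through vacc7x3); every nonzero weight then updates all eight rows, and the results are clamped into vout values. The sketch below keeps that accumulator shape but omits the real kernel's sparse index handling; "nnz" and the dense vi layout are illustrative assumptions only.

    #include <stddef.h>
    #include <math.h>

    /* Simplified 8x4 sparse-matmul accumulator shape: bias broadcast, vi*vw updates, clamp. */
    void spmm_8x4_sketch(size_t nnz, const float* w, const float* vi /* 8 values per nonzero */,
                         float* out /* 8 rows x 4 channels */, float vmin, float vmax) {
      float acc[8][4];
      for (int n = 0; n < 4; n++) {
        const float vbias = *w++;                       /* like: float vacc0xN = *w++;        */
        for (int m = 0; m < 8; m++) acc[m][n] = vbias;  /* vacc1xN = vacc0xN; ... vacc7xN     */
      }
      for (size_t k = 0; k < nnz; k++) {
        for (int n = 0; n < 4; n++) {
          const float vw = *w++;                        /* one weight per channel per nonzero */
          for (int m = 0; m < 8; m++) acc[m][n] += vi[8 * k + m] * vw;  /* vaccMxN += viM * vwN */
        }
      }
      for (int m = 0; m < 8; m++) {
        for (int n = 0; n < 4; n++) {
          out[4 * m + n] = fmaxf(fminf(acc[m][n], vmax), vmin);  /* min with vmax, max with vmin */
        }
      }
    }
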
/external/XNNPACK/src/qs8-gemm/gen/
1x4c8-xw-minmax-fp32-sse2.c  (in xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2)
   48  __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);  [local]
   68  vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
   75  … vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x…

1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c  (in xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128)
   47  v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);  [local]
   66  vacc0x3 = wasm_i32x4_add(vacc0x3, wasm_i32x4_dot_i16x8(vxa0, vxb3));
   73  …13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x1, vacc0x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x1, …

1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c  (in xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64)
   47  v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);  [local]
   66  vacc0x3 = wasm_i32x4_add(vacc0x3, wasm_i32x4_dot_i16x8(vxa0, vxb3));
   73  …13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x1, vacc0x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x1, …

1x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c  (in xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2)
   47  v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);  [local]
   66  vacc0x3 = wasm_i32x4_add(vacc0x3, wasm_i32x4_dot_i16x8(vxa0, vxb3));
   73  …13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x1, vacc0x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x1, …

/external/XNNPACK/src/qc8-gemm/gen/
1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c  (in xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64)
   47  v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);  [local]
   66  vacc0x3 = wasm_i32x4_add(vacc0x3, wasm_i32x4_dot_i16x8(vxa0, vxb3));
   73  …13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x1, vacc0x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x1, …

1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c  (in xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128)
   47  v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);  [local]
   66  vacc0x3 = wasm_i32x4_add(vacc0x3, wasm_i32x4_dot_i16x8(vxa0, vxb3));
   73  …13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x1, vacc0x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x1, …

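The qs8 and qc8 GEMM hits share one int8 accumulation pattern: vacc0x3 starts from the column-3 bias loaded into a single int32 lane, each K step adds a widened dot product over eight int8 values (_mm_madd_epi16 on SSE2, wasm_i32x4_dot_i16x8 on wasmsimd), and the truncated lines 73/75 begin the unpack/shuffle adds that merge the four per-column vectors ahead of requantization. Below is a scalar analogue of that accumulate-then-merge step; the packed-weight layout (4 int32 biases followed by int8 weights) and all names are illustrative assumptions.

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar sketch of the 1x4c8 int8 GEMM accumulation; assumes kc is a multiple of 8. */
    void qs8_gemm_1x4c8_sketch(size_t kc, const int8_t* a, const void* w, int32_t* acc_out) {
      const int32_t* bias = (const int32_t*) w;
      const int8_t* b = (const int8_t*) (bias + 4);
      int32_t acc[4][4];
      for (int n = 0; n < 4; n++) {
        acc[n][0] = bias[n];                  /* e.g. _mm_cvtsi32_si128(((const int*) w)[3]) */
        acc[n][1] = acc[n][2] = acc[n][3] = 0;
      }
      for (size_t k = 0; k < kc; k += 8) {
        for (int n = 0; n < 4; n++) {
          for (int lane = 0; lane < 4; lane++) {
            /* two widened products summed per lane, like madd_epi16 / dot_i16x8 */
            acc[n][lane] += (int32_t) a[k + 2 * lane] * b[n * kc + k + 2 * lane]
                          + (int32_t) a[k + 2 * lane + 1] * b[n * kc + k + 2 * lane + 1];
          }
        }
      }
      for (int n = 0; n < 4; n++) {
        acc_out[n] = acc[n][0] + acc[n][1] + acc[n][2] + acc[n][3];  /* like the shuffle-adds */
      }
    }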