/external/XNNPACK/src/f16-gemm/gen-inc/ |
D | 4x8inc-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_gemminc_minmax_ukernel_4x8__neonfp16arith_ld64():
     67  …float16x8_t vacc2x01234567 = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16…  [local]
     82  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0);
     92  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0);
    100  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c1, va2, 1);
    110  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c1, vb01234567c1);
    118  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    128  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    136  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    146  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    163  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2, vb01234567);
    [all …]
|
D | 4x16inc-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64():
     69  …float16x8_t vacc2x01234567 = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16…  [local]
     87  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0);
    101  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0);
    114  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c1, va2, 1);
    128  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c1, vb01234567c1);
    141  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    155  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    168  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    182  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    204  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2, vb01234567);
    [all …]
|
D | 6x8inc-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64():
     79  …float16x8_t vacc2x01234567 = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16…  [local]
     98  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0);
    112  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0);
    122  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c1, va2, 1);
    136  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c1, vb01234567c1);
    146  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    160  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    170  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    184  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    205  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2, vb01234567);
    [all …]
|
D | 8x8inc-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64():
     91  …float16x8_t vacc2x01234567 = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16…  [local]
    114  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0);
    132  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0);
    144  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c1, va2, 1);
    162  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c1, vb01234567c1);
    174  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    192  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    204  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    222  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    247  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2, vb01234567);
    [all …]
|
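The gen-inc ("inc") entries above differ from the plain GEMM kernels in one way that is visible in the listing: each row accumulator is seeded from the caller-supplied acc buffer (the vld1q_f16(acc) declarations marked [local]) rather than from the bias row. Below is a minimal sketch of that init idiom, assuming the packed acc layout implied by the pointer bump; the helper name and signature are illustrative, not XNNPACK API.

    #include <arm_neon.h>   /* needs ARMv8.2-A FP16 arithmetic (+fp16) */
    #include <stdint.h>

    /* Sketch, not XNNPACK source: seed one row's accumulator from the
       partial-result buffer `acc`, then advance past the vector just
       read (the line-67/69/79/91 pattern in the listing above). */
    static float16x8_t load_row_accumulator(const void** acc) {
      const float16x8_t vacc = vld1q_f16((const float16_t*) *acc);
      *acc = (const void*) ((uintptr_t) *acc + sizeof(float16x8_t));
      return vacc;
    }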
/external/XNNPACK/src/f16-gemm/gen/ |
D | 4x8-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64():
     65  float16x8_t vacc2x01234567 = vacc0x01234567;  [local]
     80  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0);
     90  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0);
     98  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c1, va2, 1);
    108  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c1, vb01234567c1);
    116  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    126  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    134  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    144  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    161  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2, vb01234567);
    [all …]
|
D | 4x16-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64():
     67  float16x8_t vacc2x01234567 = vacc0x01234567;  [local]
     85  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0);
     99  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0);
    112  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c1, va2, 1);
    126  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c1, vb01234567c1);
    139  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    153  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    166  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    180  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    202  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2, vb01234567);
    [all …]
|
D | 6x8-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64():
     77  float16x8_t vacc2x01234567 = vacc0x01234567;  [local]
     96  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0);
    110  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0);
    120  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c1, va2, 1);
    134  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c1, vb01234567c1);
    144  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    158  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    168  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    182  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    203  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2, vb01234567);
    [all …]
|
D | 8x8-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64():
     89  float16x8_t vacc2x01234567 = vacc0x01234567;  [local]
    112  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0);
    130  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0);
    142  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c1, va2, 1);
    160  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c1, vb01234567c1);
    172  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    190  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    202  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    220  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    245  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2, vb01234567);
    [all …]
|
D | 6x16-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64():
     79  float16x8_t vacc2x01234567 = vacc0x01234567;  [local]
    103  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0);
    123  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0);
    140  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c1, va2, 1);
    160  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c1, vb01234567c1);
    177  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    197  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    214  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    234  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    262  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2, vb01234567);
    [all …]
|
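The ld64 suffix on these f16 kernels matches the lane indices in the listing: four fp16 elements of A are read as one 64-bit float16x4_t, and each lane feeds one vfmaq_lane_f16 against a packed 8-wide column block of B. A sketch of one row's K=4 main-loop step under those assumptions follows; names mirror the listing, and the packed-weight layout shown is the x8 case (the x16 variants interleave a second 8-wide block, which the truncated rows above do not show).

    #include <arm_neon.h>   /* ARMv8.2-A FP16 arithmetic required */

    /* Sketch of one K=4 step for row 2, matching the vfmaq_lane_f16
       sequence above (lanes 0..3 against column blocks c0..c3). */
    static float16x8_t row2_step(float16x8_t vacc2x01234567,
                                 const float16_t* a2, const float16_t* w) {
      const float16x4_t va2 = vld1_f16(a2);               /* 4 fp16, 64 bits */
      const float16x8_t vb01234567c0 = vld1q_f16(w +  0);
      const float16x8_t vb01234567c1 = vld1q_f16(w +  8);
      const float16x8_t vb01234567c2 = vld1q_f16(w + 16);
      const float16x8_t vb01234567c3 = vld1q_f16(w + 24);
      vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0);
      vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c1, va2, 1);
      vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
      vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
      return vacc2x01234567;
    }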
/external/XNNPACK/src/f16-igemm/gen/ |
D | 4x8-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64():
     61  float16x8_t vacc2x01234567 = vacc0x01234567;  [local]
    100  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0);
    110  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0);
    118  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c1, va2, 1);
    128  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c1, vb01234567c1);
    136  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    146  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    154  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    164  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    179  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2, vb01234567);
    [all …]
|
D | 4x16-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64():
     63  float16x8_t vacc2x01234567 = vacc0x01234567;  [local]
    105  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0);
    119  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0);
    132  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c1, va2, 1);
    146  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c1, vb01234567c1);
    159  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    173  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    186  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    200  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    220  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2, vb01234567);
    [all …]
|
D | 6x8-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64():
     69  float16x8_t vacc2x01234567 = vacc0x01234567;  [local]
    122  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0);
    136  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0);
    146  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c1, va2, 1);
    160  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c1, vb01234567c1);
    170  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    184  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    194  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    208  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    227  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2, vb01234567);
    [all …]
|
D | 8x8-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64():
     77  float16x8_t vacc2x01234567 = vacc0x01234567;  [local]
    144  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0);
    162  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0);
    174  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c1, va2, 1);
    192  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c1, vb01234567c1);
    204  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    222  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    234  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    252  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    275  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2, vb01234567);
    [all …]
|
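In the igemm entries the first vacc2x01234567 update sits noticeably later in each file (line 100+ versus line 80 in the direct GEMM above), because before every K pass the kernel resolves one indirection pointer per row of A. A hedged sketch of that row fetch follows, assuming the usual zero-pointer padding convention; the helper name and exact signature are illustrative, not copied from the source.

    #include <arm_neon.h>   /* for float16_t */
    #include <stdint.h>
    #include <stddef.h>

    /* Sketch: igemm reads row pointers from an indirection buffer `a`;
       padding rows point at a shared `zero` buffer and, by the assumed
       convention, must not receive the input offset. */
    static const float16_t* fetch_a_row(const void* const* a, size_t row,
                                        const float16_t* zero,
                                        size_t a_offset) {
      const float16_t* a_row = (const float16_t*) a[row];
      if (a_row != zero) {
        a_row = (const float16_t*) ((uintptr_t) a_row + a_offset);
      }
      return a_row;
    }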
/external/XNNPACK/src/f32-igemm/gen/ |
D | 3x16s4-minmax-fma3-broadcast.c | all matches in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast():
     58  __m256 vacc2x01234567 = vacc0x01234567;  [local]
     96  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c0, vacc2x01234567);
    110  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c1, vacc2x01234567);
    124  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);
    138  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c3, vacc2x01234567);
    162  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
    176  vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
    184  vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
    190  _mm256_storeu_ps(c2, vacc2x01234567);
    204  _mm256_storeu_ps(c2, vacc2x01234567);
    [all …]
|
D | 4x16s4-minmax-fma3-broadcast.c | all matches in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast():
     62  __m256 vacc2x01234567 = vacc0x01234567;  [local]
    109  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c0, vacc2x01234567);
    126  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c1, vacc2x01234567);
    143  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);
    160  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c3, vacc2x01234567);
    188  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
    204  vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
    214  vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
    225  _mm256_storeu_ps(c2, vacc2x01234567);
    240  _mm256_storeu_ps(c2, vacc2x01234567);
    [all …]
|
D | 3x16-minmax-fma3-broadcast.c | all matches in xnn_f32_igemm_minmax_ukernel_3x16__fma3_broadcast():
     58  __m256 vacc2x01234567 = vacc0x01234567;  [local]
     98  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
    108  vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
    116  vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
    122  _mm256_storeu_ps(c2, vacc2x01234567);
    136  _mm256_storeu_ps(c2, vacc2x01234567);
    140  vacc2x01234567 = vacc2x89ABCDEF;
    148  __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
    156  vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
|
D | 3x16-minmax-avx-broadcast.c | all matches in xnn_f32_igemm_minmax_ukernel_3x16__avx_broadcast():
     58  __m256 vacc2x01234567 = vacc0x01234567;  [local]
     98  vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
    108  vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
    116  vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
    122  _mm256_storeu_ps(c2, vacc2x01234567);
    136  _mm256_storeu_ps(c2, vacc2x01234567);
    140  vacc2x01234567 = vacc2x89ABCDEF;
    148  __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
    156  vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
|
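The two 3x16 f32 igemm entries above differ only in how the multiply-accumulate at line 98 is expressed: the fma3 kernel uses the fused _mm256_fmadd_ps, while the avx kernel spells it as an _mm256_add_ps of an _mm256_mul_ps. A side-by-side sketch of one broadcast step for row 2 follows; the broadcast load of A and the weight layout are assumptions for illustration.

    #include <immintrin.h>

    /* Sketch: FMA3 skips the intermediate rounding of the product, so the
       two forms can differ in the last ulp of the accumulator. */
    static __m256 step_fma3(__m256 vacc2x01234567,
                            const float* a2, const float* w) {
      const __m256 va2 = _mm256_broadcast_ss(a2);   /* splat one A element */
      const __m256 vb01234567 = _mm256_load_ps(w);
      return _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
    }

    static __m256 step_avx(__m256 vacc2x01234567,
                           const float* a2, const float* w) {
      const __m256 va2 = _mm256_broadcast_ss(a2);
      const __m256 vb01234567 = _mm256_load_ps(w);
      return _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
    }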
/external/XNNPACK/src/f32-gemm/gen-inc/ |
D | 3x16s4inc-minmax-fma3-broadcast.c | all matches in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast():
     60  __m256 vacc2x01234567 = _mm256_load_ps(acc + 32);  [local]
     79  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c0, vacc2x01234567);
     93  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c1, vacc2x01234567);
    107  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);
    121  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c3, vacc2x01234567);
    145  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
    157  vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
    165  vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
    171  _mm256_storeu_ps(c2, vacc2x01234567);
    188  _mm256_storeu_ps(c2, vacc2x01234567);
    [all …]
|
D | 4x16s4inc-minmax-fma3-broadcast.c | all matches in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast():
     66  __m256 vacc2x01234567 = _mm256_load_ps(acc + 32);  [local]
     89  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c0, vacc2x01234567);
    106  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c1, vacc2x01234567);
    123  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);
    140  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c3, vacc2x01234567);
    168  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
    182  vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
    192  vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
    203  _mm256_storeu_ps(c2, vacc2x01234567);
    222  _mm256_storeu_ps(c2, vacc2x01234567);
    [all …]
|
D | 3x16inc-minmax-fma3-broadcast.c | all matches in xnn_f32_gemminc_minmax_ukernel_3x16__fma3_broadcast():
     60  __m256 vacc2x01234567 = _mm256_load_ps(acc + 32);  [local]
     79  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
     90  vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
     98  vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
    104  _mm256_storeu_ps(c2, vacc2x01234567);
    121  _mm256_storeu_ps(c2, vacc2x01234567);
    125  vacc2x01234567 = vacc2x89ABCDEF;
    133  __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
    141  vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
|
D | 3x16inc-minmax-avx-broadcast.c | all matches in xnn_f32_gemminc_minmax_ukernel_3x16__avx_broadcast():
     60  __m256 vacc2x01234567 = _mm256_load_ps(acc + 32);  [local]
     79  vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
     90  vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
     98  vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
    104  _mm256_storeu_ps(c2, vacc2x01234567);
    121  _mm256_storeu_ps(c2, vacc2x01234567);
    125  vacc2x01234567 = vacc2x89ABCDEF;
    133  __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
    141  vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
|
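For the f32 gen-inc kernels the _mm256_load_ps(acc + 32) declarations make the acc layout visible: each row of a x16 kernel owns 16 consecutive floats, so row 2 starts 32 floats in. The min/max pair that follows (e.g. lines 157/165) applies the minmax activation before the stores. The condensed sketch below combines the two sites (init and post-loop clamp) into one helper for illustration; it assumes acc is 32-byte aligned, as the aligned load requires.

    #include <immintrin.h>

    /* Sketch: seed row 2 of a 3x16 gen-inc kernel from `acc` and clamp.
       The upper bound is applied first, matching the listing's order. */
    static __m256 row2_init_and_clamp(const float* acc,
                                      __m256 vmin, __m256 vmax) {
      __m256 vacc2x01234567 = _mm256_load_ps(acc + 32);  /* row 2, cols 0-7 */
      vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
      vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
      return vacc2x01234567;
    }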
/external/XNNPACK/src/f32-gemm/gen/ |
D | 3x16s4-minmax-fma3-broadcast.c | all matches in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast():
     58  __m256 vacc2x01234567 = vacc0x01234567;  [local]
     77  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c0, vacc2x01234567);
     91  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c1, vacc2x01234567);
    105  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);
    119  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c3, vacc2x01234567);
    143  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
    155  vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
    163  vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
    169  _mm256_storeu_ps(c2, vacc2x01234567);
    186  _mm256_storeu_ps(c2, vacc2x01234567);
    [all …]
|
D | 4x16s4-minmax-fma3-broadcast.c | all matches in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast():
     64  __m256 vacc2x01234567 = vacc0x01234567;  [local]
     87  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c0, vacc2x01234567);
    104  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c1, vacc2x01234567);
    121  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);
    138  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c3, vacc2x01234567);
    166  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
    180  vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
    190  vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
    201  _mm256_storeu_ps(c2, vacc2x01234567);
    220  _mm256_storeu_ps(c2, vacc2x01234567);
    [all …]
|
D | 3x16-minmax-avx-broadcast.c | all matches in xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast():
     58  __m256 vacc2x01234567 = vacc0x01234567;  [local]
     77  vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
     88  vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
     96  vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
    102  _mm256_storeu_ps(c2, vacc2x01234567);
    119  _mm256_storeu_ps(c2, vacc2x01234567);
    123  vacc2x01234567 = vacc2x89ABCDEF;
    131  __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
    139  vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
|
D | 3x16-minmax-fma3-broadcast.c | all matches in xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast():
     58  __m256 vacc2x01234567 = vacc0x01234567;  [local]
     77  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
     88  vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
     96  vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
    102  _mm256_storeu_ps(c2, vacc2x01234567);
    119  _mm256_storeu_ps(c2, vacc2x01234567);
    123  vacc2x01234567 = vacc2x89ABCDEF;
    131  __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
    139  vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
|
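The cast/extract lines at the end of the 3x16 entries (123/131/139) belong to the column-remainder path: after the low 8 lanes are flushed and vacc2x89ABCDEF is rotated down, the surviving __m256 is split into 128-bit halves for 4/2/1-wide stores. Below is a hedged reconstruction of that tail for row 2, with control flow inferred from the visible lines rather than copied from the source.

    #include <immintrin.h>
    #include <stddef.h>

    /* Sketch of the nc-remainder store (names from the listing above). */
    static void store_row2_tail(float* c2, __m256 vacc2x01234567,
                                __m256 vacc2x89ABCDEF, size_t nc) {
      if (nc & 8) {
        _mm256_storeu_ps(c2, vacc2x01234567);   /* flush low 8 columns */
        vacc2x01234567 = vacc2x89ABCDEF;        /* rotate high half down */
        c2 += 8;
      }
      __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
      if (nc & 4) {
        _mm_storeu_ps(c2, vacc2x0123);
        vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
        c2 += 4;
      }
      if (nc & 2) {
        _mm_storel_pi((__m64*) c2, vacc2x0123);               /* 2 floats */
        vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);   /* shift down */
        c2 += 2;
      }
      if (nc & 1) {
        _mm_store_ss(c2, vacc2x0123);                         /* 1 float */
      }
    }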