/external/XNNPACK/src/f16-gemm/gen/

D | 4x8-minmax-neonfp16arith-ld64.c | in xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64():
     66  float16x8_t vacc3x01234567 = vacc0x01234567;   (local)
     81  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0);
     91  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c0, vb01234567c0);
     99  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c1, va3, 1);
    109  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c1, vb01234567c1);
    117  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    127  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);
    135  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    145  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c3, vb01234567c3);
    162  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3, vb01234567);
    [all …]

D | 4x16-minmax-neonfp16arith-ld64.c | in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64():
     69  float16x8_t vacc3x01234567 = vacc0x01234567;   (local)
     86  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0);
    100  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c0, vb01234567c0);
    113  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c1, va3, 1);
    127  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c1, vb01234567c1);
    140  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    154  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);
    167  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    181  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c3, vb01234567c3);
    203  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3, vb01234567);
    [all …]

D | 6x8-minmax-neonfp16arith-ld64.c | in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64():
     78  float16x8_t vacc3x01234567 = vacc0x01234567;   (local)
     97  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0);
    111  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c0, vb01234567c0);
    121  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c1, va3, 1);
    135  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c1, vb01234567c1);
    145  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    159  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);
    169  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    183  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c3, vb01234567c3);
    204  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3, vb01234567);
    [all …]

D | 8x8-minmax-neonfp16arith-ld64.c | in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64():
     90  float16x8_t vacc3x01234567 = vacc0x01234567;   (local)
    113  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0);
    131  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c0, vb01234567c0);
    143  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c1, va3, 1);
    161  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c1, vb01234567c1);
    173  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    191  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);
    203  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    221  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c3, vb01234567c3);
    246  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3, vb01234567);
    [all …]

D | 6x16-minmax-neonfp16arith-ld64.c | in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64():
     81  float16x8_t vacc3x01234567 = vacc0x01234567;   (local)
    104  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0);
    124  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c0, vb01234567c0);
    141  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c1, va3, 1);
    161  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c1, vb01234567c1);
    178  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    198  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);
    215  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    235  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c3, vb01234567c3);
    263  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3, vb01234567);
    [all …]
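All five f16-gemm rows above share one shape: row 3's accumulator starts as a copy of row 0's bias vector, the main loop consumes four fp16 A elements per step (the 64-bit "ld64" load) and issues one vfmaq_lane_f16 per element, and the k remainder falls back to a broadcast vfmaq_f16. The paired vfmaq_lane_f16 / vfmaq_f16(..., va3c0, ...) occurrences a few lines apart look like the two sides of a compile-time branch, one using lane-indexed FMA and the other pre-duplicated A lanes. The sketch below is a single-row (1x8) reduction of that pattern, assuming the usual bias-then-B packing of the weights pointer; the function name and signature are illustrative, not the generated code. Compile for armv8.2-a+fp16 or later.

    #include <arm_neon.h>

    static void f16_gemm_1x8_ld64_sketch(size_t kc, const __fp16 *a, const __fp16 *w,
                                         __fp16 *c, __fp16 min, __fp16 max)
    {
      float16x8_t vacc = vld1q_f16(w); w += 8;       // bias row seeds the accumulator
      size_t k = kc;
      for (; k >= 4; k -= 4) {
        const float16x4_t va = vld1_f16(a); a += 4;  // 64-bit A load covers four k steps
        const float16x8_t vb0 = vld1q_f16(w); w += 8;
        const float16x8_t vb1 = vld1q_f16(w); w += 8;
        const float16x8_t vb2 = vld1q_f16(w); w += 8;
        const float16x8_t vb3 = vld1q_f16(w); w += 8;
        vacc = vfmaq_lane_f16(vacc, vb0, va, 0);     // acc += b(k+0) * a[k+0]
        vacc = vfmaq_lane_f16(vacc, vb1, va, 1);
        vacc = vfmaq_lane_f16(vacc, vb2, va, 2);
        vacc = vfmaq_lane_f16(vacc, vb3, va, 3);
      }
      for (; k != 0; k -= 1) {                       // remainder: broadcast one A element
        const float16x8_t va = vld1q_dup_f16(a); a += 1;
        const float16x8_t vb = vld1q_f16(w); w += 8;
        vacc = vfmaq_f16(vacc, va, vb);
      }
      vacc = vmaxq_f16(vacc, vdupq_n_f16(min));      // minmax clamping
      vacc = vminq_f16(vacc, vdupq_n_f16(max));
      vst1q_f16(c, vacc);
    }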
/external/XNNPACK/src/f16-igemm/gen/

D | 4x8-minmax-neonfp16arith-ld64.c | in xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64():
     62  float16x8_t vacc3x01234567 = vacc0x01234567;   (local)
    101  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0);
    111  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c0, vb01234567c0);
    119  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c1, va3, 1);
    129  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c1, vb01234567c1);
    137  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    147  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);
    155  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    165  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c3, vb01234567c3);
    180  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3, vb01234567);
    [all …]

D | 4x16-minmax-neonfp16arith-ld64.c | in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64():
     65  float16x8_t vacc3x01234567 = vacc0x01234567;   (local)
    106  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0);
    120  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c0, vb01234567c0);
    133  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c1, va3, 1);
    147  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c1, vb01234567c1);
    160  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    174  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);
    187  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    201  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c3, vb01234567c3);
    221  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3, vb01234567);
    [all …]

D | 6x8-minmax-neonfp16arith-ld64.c | in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64():
     70  float16x8_t vacc3x01234567 = vacc0x01234567;   (local)
    123  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0);
    137  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c0, vb01234567c0);
    147  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c1, va3, 1);
    161  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c1, vb01234567c1);
    171  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    185  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);
    195  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    209  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c3, vb01234567c3);
    228  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3, vb01234567);
    [all …]

D | 8x8-minmax-neonfp16arith-ld64.c | in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64():
     78  float16x8_t vacc3x01234567 = vacc0x01234567;   (local)
    145  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0);
    163  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c0, vb01234567c0);
    175  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c1, va3, 1);
    193  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c1, vb01234567c1);
    205  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    223  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);
    235  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    253  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c3, vb01234567c3);
    276  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3, vb01234567);
    [all …]

D | 6x16-minmax-neonfp16arith-ld64.c | in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64():
     73  float16x8_t vacc3x01234567 = vacc0x01234567;   (local)
    130  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0);
    150  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c0, vb01234567c0);
    167  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c1, va3, 1);
    187  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c1, vb01234567c1);
    204  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    224  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);
    241  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    261  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c3, vb01234567c3);
    287  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3, vb01234567);
    [all …]
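The f16-igemm rows list the same FMA body, just at higher line numbers: the extra distance between the declaration and the first FMA (62 versus 101 in the 4x8 kernel) is the indirection setup, since IGEMM reads A through an array of row pointers rather than one dense base pointer. A hedged sketch of that outer loop for a single row; the names (`indirect_a`, `ks`) and the simplified loop bounds are illustrative and not taken from the listing:

    #include <arm_neon.h>

    // One output row of an indirect GEMM: `indirect_a` supplies one A row
    // pointer per kernel-spatial position, `ks` counts those positions, and
    // the multiply-accumulate is the same vfmaq_f16 body as the dense rows.
    static float16x8_t f16_igemm_row_sketch(const __fp16 *const *indirect_a, size_t ks,
                                            size_t kc, const __fp16 *w, float16x8_t vacc)
    {
      do {
        const __fp16 *a3 = *indirect_a++;  // next A row for this output pixel
        for (size_t k = 0; k < kc; k++) {
          const float16x8_t va3 = vld1q_dup_f16(a3 + k);
          const float16x8_t vb01234567 = vld1q_f16(w); w += 8;
          vacc = vfmaq_f16(vacc, va3, vb01234567);  // same shape as the rows above
        }
      } while (--ks != 0);
      return vacc;
    }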
/external/XNNPACK/src/f16-gemm/gen-inc/

D | 4x8inc-minmax-neonfp16arith-ld64.c | in xnn_f16_gemminc_minmax_ukernel_4x8__neonfp16arith_ld64():
     68  …float16x8_t vacc3x01234567 = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16…   (local)
     83  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0);
     93  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c0, vb01234567c0);
    101  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c1, va3, 1);
    111  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c1, vb01234567c1);
    119  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    129  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);
    137  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    147  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c3, vb01234567c3);
    164  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3, vb01234567);
    [all …]

D | 4x16inc-minmax-neonfp16arith-ld64.c | in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64():
     71  …float16x8_t vacc3x01234567 = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16…   (local)
     88  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0);
    102  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c0, vb01234567c0);
    115  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c1, va3, 1);
    129  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c1, vb01234567c1);
    142  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    156  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);
    169  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    183  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c3, vb01234567c3);
    205  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3, vb01234567);
    [all …]

D | 6x8inc-minmax-neonfp16arith-ld64.c | in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64():
     80  …float16x8_t vacc3x01234567 = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16…   (local)
     99  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0);
    113  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c0, vb01234567c0);
    123  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c1, va3, 1);
    137  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c1, vb01234567c1);
    147  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    161  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);
    171  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    185  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c3, vb01234567c3);
    206  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3, vb01234567);
    [all …]

D | 8x8inc-minmax-neonfp16arith-ld64.c | in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64():
     92  …float16x8_t vacc3x01234567 = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16…   (local)
    115  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0);
    133  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c0, vb01234567c0);
    145  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c1, va3, 1);
    163  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c1, vb01234567c1);
    175  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    193  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);
    205  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    223  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c3, vb01234567c3);
    248  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3, vb01234567);
    [all …]

D | 6x16inc-minmax-neonfp16arith-ld64.c | in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64():
     83  …float16x8_t vacc3x01234567 = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16…   (local)
    106  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0);
    126  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c0, vb01234567c0);
    143  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c1, va3, 1);
    163  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c1, vb01234567c1);
    180  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    200  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);
    217  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    237  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c3, vb01234567c3);
    265  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3, vb01234567);
    [all …]
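The gen-inc ("gemminc") variants differ from gen only in how the accumulators are seeded: each declaration loads a float16x8_t of partial sums from an `acc` buffer and bumps the pointer, as the truncated declarations above show, instead of copying row 0's bias. A minimal sketch of that seeding, under the assumption that `acc` holds one 8-wide fp16 vector per row tile; the pointer bump is written as an element advance here, where the generated code goes through a uintptr_t cast:

    #include <arm_neon.h>

    // Resume accumulation from a caller-provided partial-sum buffer and
    // advance past the vector just consumed (helper name is illustrative).
    static float16x8_t seed_row_from_acc_sketch(const __fp16 **acc)
    {
      float16x8_t vacc3x01234567 = vld1q_f16(*acc);
      *acc += 8;  // one float16x8_t of partial sums consumed
      return vacc3x01234567;
    }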
/external/XNNPACK/src/f32-gemm/gen-inc/

D | 4x16s4inc-minmax-fma3-broadcast.c | in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast():
     68  __m256 vacc3x01234567 = _mm256_load_ps(acc + 48);   (local)
     90  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c0, vacc3x01234567);
    107  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c1, vacc3x01234567);
    124  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c2, vacc3x01234567);
    141  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c3, vacc3x01234567);
    169  …vacc3x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb01234567c0, vzero, _CMP_NEQ_OQ…
    186  …vacc3x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb01234567c1, vzero, _CMP_NEQ_OQ…
    203  …vacc3x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ…
    220  …vacc3x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb01234567c3, vzero, _CMP_NEQ_OQ…
    234  vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
    [all …]

D | 4x8inc-minmax-fma3-broadcast.c | in xnn_f32_gemminc_minmax_ukernel_4x8__fma3_broadcast():
     65  __m256 vacc3x01234567 = _mm256_load_ps(acc + 24);   (local)
     85  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
     94  vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
    100  vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
    103  _mm256_storeu_ps(c3, vacc3x01234567);
    119  __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
    129  vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);

D | 4x16inc-minmax-fma3-broadcast.c | in xnn_f32_gemminc_minmax_ukernel_4x16__fma3_broadcast():
     68  __m256 vacc3x01234567 = _mm256_load_ps(acc + 48);   (local)
     90  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
    103  vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
    113  vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
    120  _mm256_storeu_ps(c3, vacc3x01234567);
    141  _mm256_storeu_ps(c3, vacc3x01234567);
    146  vacc3x01234567 = vacc3x89ABCDEF;
    156  __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
    166  vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);

D | 4x16inc-minmax-avx-broadcast.c | in xnn_f32_gemminc_minmax_ukernel_4x16__avx_broadcast():
     68  __m256 vacc3x01234567 = _mm256_load_ps(acc + 48);   (local)
     90  vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
    103  vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
    113  vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
    120  _mm256_storeu_ps(c3, vacc3x01234567);
    141  _mm256_storeu_ps(c3, vacc3x01234567);
    146  vacc3x01234567 = vacc3x89ABCDEF;
    156  __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
    166  vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
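On the f32 side the same seeding shows up as _mm256_load_ps(acc + 48) for the 4x16 tiles and _mm256_load_ps(acc + 24) for the 4x8 tile, which is consistent with a row-major accumulator buffer: row 3's first 8 columns start 3*16 = 48 floats in, or 3*8 = 24 for the 8-wide tile. A sketch of row 3's path through the fma3-broadcast kernel, assuming that layout, a 32-byte-aligned `acc`, and a simple 8-float B stride; names and signature are illustrative:

    #include <immintrin.h>

    static void f32_gemminc_row3_sketch(const float *acc, const float *a3,
                                        const float *w, size_t kc,
                                        float *c3, __m256 vmin, __m256 vmax)
    {
      __m256 vacc = _mm256_load_ps(acc + 48);            // seed from partial sums (aligned)
      for (size_t k = 0; k < kc; k++) {
        const __m256 va3 = _mm256_broadcast_ss(a3 + k);  // the "broadcast" in the file names
        const __m256 vb = _mm256_loadu_ps(w + 8 * k);
        vacc = _mm256_fmadd_ps(va3, vb, vacc);           // FMA3 path
      }
      vacc = _mm256_max_ps(vacc, vmin);                  // minmax clamping
      vacc = _mm256_min_ps(vacc, vmax);
      _mm256_storeu_ps(c3, vacc);
    }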
/external/XNNPACK/src/f32-igemm/gen/

D | 4x16s4-minmax-fma3-broadcast.c | in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast():
     64  __m256 vacc3x01234567 = vacc0x01234567;   (local)
    110  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c0, vacc3x01234567);
    127  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c1, vacc3x01234567);
    144  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c2, vacc3x01234567);
    161  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c3, vacc3x01234567);
    189  …vacc3x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb01234567c0, vzero, _CMP_NEQ_OQ…
    206  …vacc3x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb01234567c1, vzero, _CMP_NEQ_OQ…
    223  …vacc3x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ…
    240  …vacc3x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb01234567c3, vzero, _CMP_NEQ_OQ…
    256  vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
    [all …]
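The truncated _CMP_NEQ_OQ lines in this row (and in the 4x16s4 rows of the two f32-gemm directories) are the remainder path of the s4 "shift by 4" kernels: B is zero-padded past kc, and the A lanes that line up with that padding can hold arbitrary values after the in-register rotation, so a plain FMA could inject NaN via 0 * Inf. Masking A to zero wherever B is zero keeps those lanes exact. A minimal sketch of one such step; the rationale in the comment is my reading of the pattern, not a quote from the source:

    #include <immintrin.h>

    static __m256 s4_masked_fmadd_sketch(__m256 vacc, __m256 va3, __m256 vb)
    {
      const __m256 vzero = _mm256_setzero_ps();
      // All-ones mask where B != 0; zeroing A under B's padding prevents
      // 0 * Inf (or 0 * NaN) from poisoning the accumulator.
      const __m256 vkeep = _mm256_cmp_ps(vb, vzero, _CMP_NEQ_OQ);
      return _mm256_fmadd_ps(_mm256_and_ps(va3, vkeep), vb, vacc);
    }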
/external/XNNPACK/src/f32-gemm/gen/

D | 4x16s4-minmax-fma3-broadcast.c | in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast():
     66  __m256 vacc3x01234567 = vacc0x01234567;   (local)
     88  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c0, vacc3x01234567);
    105  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c1, vacc3x01234567);
    122  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c2, vacc3x01234567);
    139  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c3, vacc3x01234567);
    167  …vacc3x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb01234567c0, vzero, _CMP_NEQ_OQ…
    184  …vacc3x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb01234567c1, vzero, _CMP_NEQ_OQ…
    201  …vacc3x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ…
    218  …vacc3x01234567 = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb01234567c3, vzero, _CMP_NEQ_OQ…
    232  vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
    [all …]

D | 4x8-minmax-avx-broadcast.c | in xnn_f32_gemm_minmax_ukernel_4x8__avx_broadcast():
     63  __m256 vacc3x01234567 = vacc0x01234567;   (local)
     83  vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
     92  vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
     98  vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
    101  _mm256_storeu_ps(c3, vacc3x01234567);
    117  __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
    127  vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);

D | 4x8-minmax-fma3-broadcast.c | in xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast():
     63  __m256 vacc3x01234567 = vacc0x01234567;   (local)
     83  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
     92  vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
     98  vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
    101  _mm256_storeu_ps(c3, vacc3x01234567);
    117  __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
    127  vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);

D | 4x16-minmax-fma3-broadcast.c | in xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast():
     66  __m256 vacc3x01234567 = vacc0x01234567;   (local)
     88  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
    101  vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
    111  vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
    118  _mm256_storeu_ps(c3, vacc3x01234567);
    139  _mm256_storeu_ps(c3, vacc3x01234567);
    144  vacc3x01234567 = vacc3x89ABCDEF;
    154  __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
    164  vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);

D | 4x16-minmax-avx-broadcast.c | in xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast():
     66  __m256 vacc3x01234567 = vacc0x01234567;   (local)
     88  vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
    101  vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
    111  vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
    118  _mm256_storeu_ps(c3, vacc3x01234567);
    139  _mm256_storeu_ps(c3, vacc3x01234567);
    144  vacc3x01234567 = vacc3x89ABCDEF;
    154  __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
    164  vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
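Comparing the paired avx-broadcast and fma3-broadcast rows above, the kernels differ only in the multiply-add: _mm256_add_ps(vacc, _mm256_mul_ps(va, vb)) versus _mm256_fmadd_ps(va, vb, vacc); the FMA form is a single instruction and rounds once instead of twice. The 4x16 rows also show the nc tail: store the low 8 columns, shift the high half down (vacc3x01234567 = vacc3x89ABCDEF), then narrow to 128-bit and smaller stores. A sketch of that tail for one output row, assuming nc < 16 on entry; the helper name is illustrative:

    #include <immintrin.h>

    static void store_row_tail_sketch(float *c3, __m256 vlo, __m256 vhi, size_t nc)
    {
      if (nc & 8) {
        _mm256_storeu_ps(c3, vlo);       // columns 0-7
        vlo = vhi;                       // matches `vacc3x01234567 = vacc3x89ABCDEF`
        c3 += 8;
      }
      __m128 v = _mm256_castps256_ps128(vlo);
      if (nc & 4) {
        _mm_storeu_ps(c3, v);            // next 4 columns
        v = _mm256_extractf128_ps(vlo, 1);
        c3 += 4;
      }
      if (nc & 2) {
        _mm_storel_pi((__m64 *) c3, v);  // next 2 columns
        v = _mm_movehl_ps(v, v);
        c3 += 2;
      }
      if (nc & 1) {
        _mm_store_ss(c3, v);             // last column
      }
    }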