/external/XNNPACK/src/qs8-gemm/gen/ |
D | 4x16c4-minmax-neondot.c | 88 const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 8; in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() local 110 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 111 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 112 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 113 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 126 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 127 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 128 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 129 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 142 const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4; in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() local [all …]
|
D | 4x8c4-minmax-neondot.c | 80 const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 8; in xnn_qs8_gemm_minmax_ukernel_4x8c4__neondot() local 94 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_4x8c4__neondot() 95 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_4x8c4__neondot() 102 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_4x8c4__neondot() 103 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_4x8c4__neondot() 114 const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4; in xnn_qs8_gemm_minmax_ukernel_4x8c4__neondot() local 126 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_4x8c4__neondot() 127 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_4x8c4__neondot()
|
D | 6x16c4-minmax-neondot.c | 108 const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 8; in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() local 132 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() 133 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() 134 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() 135 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() 156 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() 157 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() 158 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() 159 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() 180 const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4; in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot() local [all …]
|
D | 6x8c4-minmax-neondot.c | 96 const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 8; in xnn_qs8_gemm_minmax_ukernel_6x8c4__neondot() local 112 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_6x8c4__neondot() 113 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_6x8c4__neondot() 124 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_6x8c4__neondot() 125 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_6x8c4__neondot() 140 const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4; in xnn_qs8_gemm_minmax_ukernel_6x8c4__neondot() local 154 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_6x8c4__neondot() 155 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_6x8c4__neondot()
|
D | 8x16c4-minmax-neondot.c | 128 const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 8; in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() local 154 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 155 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 156 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 157 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 186 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 187 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 188 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 189 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 218 const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4; in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() local [all …]
|
D | 8x8c4-minmax-neondot.c | 112 const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 8; in xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot() local 130 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot() 131 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot() 146 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot() 147 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot() 166 const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4; in xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot() local 182 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot() 183 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot()
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 4x16c4-minmax-neondot.c | 103 const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 8; in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() local 125 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 126 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 127 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 128 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 141 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 142 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 143 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 144 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 157 const int8x8_t va2x01234567 = vld1_s8(a2); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() local [all …]
|
D | 4x8c4-minmax-neondot.c | 95 const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 8; in xnn_qs8_igemm_minmax_ukernel_4x8c4__neondot() local 109 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_4x8c4__neondot() 110 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_4x8c4__neondot() 117 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_4x8c4__neondot() 118 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_4x8c4__neondot() 129 const int8x8_t va2x01234567 = vld1_s8(a2); in xnn_qs8_igemm_minmax_ukernel_4x8c4__neondot() local 141 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_4x8c4__neondot() 142 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_4x8c4__neondot()
|
D | 6x16c4-minmax-neondot.c | 127 const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 8; in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() local 151 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 152 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 153 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 154 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 175 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 176 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 177 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 178 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() 199 const int8x8_t va2x01234567 = vld1_s8(a2); in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot() local [all …]
|
D | 6x8c4-minmax-neondot.c | 115 const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 8; in xnn_qs8_igemm_minmax_ukernel_6x8c4__neondot() local 131 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_6x8c4__neondot() 132 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_6x8c4__neondot() 143 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_6x8c4__neondot() 144 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_6x8c4__neondot() 159 const int8x8_t va2x01234567 = vld1_s8(a2); in xnn_qs8_igemm_minmax_ukernel_6x8c4__neondot() local 173 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_6x8c4__neondot() 174 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_6x8c4__neondot()
|
D | 8x16c4-minmax-neondot.c | 151 const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 8; in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() local 177 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 178 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 179 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 180 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 209 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 210 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 211 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 212 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 241 const int8x8_t va2x01234567 = vld1_s8(a2); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() local [all …]
|
D | 8x8c4-minmax-neondot.c | 135 const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 8; in xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot() local 153 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot() 154 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot() 169 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot() 170 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot() 189 const int8x8_t va2x01234567 = vld1_s8(a2); in xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot() local 205 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot() 206 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot()
|