/external/XNNPACK/src/qs8-gemm/gen/ |
D | 4x16c4-minmax-neondot.c | all matches in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot():
      87  const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 8;   (local declaration)
     106  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
     107  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
     108  vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb0123x89AB, va1x01234567, 0);
     109  vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb0123xCDEF, va1x01234567, 0);
     122  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
     123  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
     124  vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
     125  vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
     141  const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;   (local declaration)
     [all …]
|
D | 4x8c4-minmax-neondot.c | all matches in xnn_qs8_gemm_minmax_ukernel_4x8c4__neondot():
      79  const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 8;   (local declaration)
      92  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
      93  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
     100  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
     101  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
     113  const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;   (local declaration)
     124  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
     125  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
|
D | 6x16c4-minmax-neondot.c | all matches in xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot():
     107  const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 8;   (local declaration)
     128  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
     129  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
     130  vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb0123x89AB, va1x01234567, 0);
     131  vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb0123xCDEF, va1x01234567, 0);
     152  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
     153  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
     154  vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
     155  vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
     179  const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;   (local declaration)
     [all …]
|
D | 6x8c4-minmax-neondot.c | all matches in xnn_qs8_gemm_minmax_ukernel_6x8c4__neondot():
      95  const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 8;   (local declaration)
     110  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
     111  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
     122  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
     123  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
     139  const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;   (local declaration)
     152  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
     153  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
|
D | 8x16c4-minmax-neondot.c | all matches in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot():
     127  const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 8;   (local declaration)
     150  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
     151  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
     152  vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb0123x89AB, va1x01234567, 0);
     153  vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb0123xCDEF, va1x01234567, 0);
     182  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
     183  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
     184  vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
     185  vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
     217  const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;   (local declaration)
     [all …]
|
D | 8x8c4-minmax-neondot.c | all matches in xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot():
     111  const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 8;   (local declaration)
     128  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
     129  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
     144  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
     145  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
     165  const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;   (local declaration)
     180  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
     181  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
|
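Every qs8-gemm match above is an instance of the same NEON-DOT (SDOT) idiom: eight int8 activations of row 1 are loaded into va1x01234567, and each vdotq_lane_s32 call adds a 4-element dot product into every int32 lane of one accumulator, with lane 0 consuming activations k..k+3 against the vb0123x* weight blocks and lane 1 consuming activations k+4..k+7 against the vb4567x* blocks. The sketch below shows that inner loop for a single row and an 8-column tile. It is only an illustration, not XNNPACK's generated kernel code: the helper name, the packed-weight layout, the zero-initialized accumulators (the real kernels start from packed bias values), and the assumption that the reduction length is a multiple of 8 are all mine. It needs the Arm dot-product extension, e.g. -march=armv8.2-a+dotprod.

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

// Hypothetical 1x8 "c4" inner loop in the style of the matches above.
// k is the reduction length in int8 elements (assumed a multiple of 8),
// a0 points at one row of activations, w at int8 weights packed so that
// each int8x16_t holds 4 output columns x 4 consecutive k values.
static void qs8_gemm_1x8c4_dot_sketch(size_t k, const int8_t* a0,
                                      const int8_t* w,
                                      int32x4_t* out0123, int32x4_t* out4567) {
  int32x4_t vacc0x0123 = vdupq_n_s32(0);
  int32x4_t vacc0x4567 = vdupq_n_s32(0);
  for (size_t kk = 0; kk < k; kk += 8) {
    // 8 activations = two groups of 4 (lanes 0 and 1 of the 8-byte vector).
    const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 8;
    // Weights for k..k+3 (vb0123x*) and k+4..k+7 (vb4567x*), 8 columns each.
    const int8x16_t vb0123x0123 = vld1q_s8(w); w += 16;
    const int8x16_t vb0123x4567 = vld1q_s8(w); w += 16;
    const int8x16_t vb4567x0123 = vld1q_s8(w); w += 16;
    const int8x16_t vb4567x4567 = vld1q_s8(w); w += 16;
    // Each SDOT adds a 4-element int8 dot product into every int32 lane.
    vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
    vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb0123x4567, va0x01234567, 0);
    vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
    vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
  }
  *out0123 = vacc0x0123;  // columns 0..3, still int32; requantization follows
  *out4567 = vacc0x4567;  // columns 4..7
}

The real kernels carry 4, 6, or 8 such rows at once (hence the 4x/6x/8x file names, and the row-1 variables matched here) and finish with requantization plus the output clamp that gives the files their -minmax suffix.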
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 4x16c4-minmax-neondot.c | all matches in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot():
     102  const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 8;   (local declaration)
     121  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
     122  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
     123  vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb0123x89AB, va1x01234567, 0);
     124  vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb0123xCDEF, va1x01234567, 0);
     137  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
     138  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
     139  vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
     140  vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
     156  const int8x8_t va1x01234567 = vld1_s8(a1);   (local declaration)
     [all …]
|
D | 4x8c4-minmax-neondot.c | all matches in xnn_qs8_igemm_minmax_ukernel_4x8c4__neondot():
      94  const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 8;   (local declaration)
     107  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
     108  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
     115  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
     116  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
     128  const int8x8_t va1x01234567 = vld1_s8(a1);   (local declaration)
     139  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
     140  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
|
D | 6x16c4-minmax-neondot.c | all matches in xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot():
     126  const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 8;   (local declaration)
     147  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
     148  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
     149  vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb0123x89AB, va1x01234567, 0);
     150  vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb0123xCDEF, va1x01234567, 0);
     171  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
     172  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
     173  vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
     174  vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
     198  const int8x8_t va1x01234567 = vld1_s8(a1);   (local declaration)
     [all …]
|
D | 6x8c4-minmax-neondot.c | all matches in xnn_qs8_igemm_minmax_ukernel_6x8c4__neondot():
     114  const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 8;   (local declaration)
     129  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
     130  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
     141  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
     142  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
     158  const int8x8_t va1x01234567 = vld1_s8(a1);   (local declaration)
     171  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
     172  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
|
D | 8x16c4-minmax-neondot.c | all matches in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot():
     150  const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 8;   (local declaration)
     173  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
     174  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
     175  vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb0123x89AB, va1x01234567, 0);
     176  vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb0123xCDEF, va1x01234567, 0);
     205  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
     206  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
     207  vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
     208  vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
     240  const int8x8_t va1x01234567 = vld1_s8(a1);   (local declaration)
     [all …]
|
D | 8x8c4-minmax-neondot.c | all matches in xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot():
     134  const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 8;   (local declaration)
     151  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
     152  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
     167  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
     168  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
     188  const int8x8_t va1x01234567 = vld1_s8(a1);   (local declaration)
     203  vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
     204  vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
|
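The qs8-igemm kernels use the same calls; the only visible difference in these matches is that several declarations in their remainder blocks (lines 156, 128, 198, 158, 240 and 188 of the respective files) load va1x01234567 without advancing a1, presumably because the indirect-GEMM variants re-fetch their row pointers before the next pass. For reference, here is a plain-C model of the arithmetic a single vdotq_lane_s32 call performs; the parameter names are mine and this is only an illustration of the semantics the matches rely on, not the ACLE definition or XNNPACK code.

#include <stdint.h>

// Scalar model of vdotq_lane_s32(acc, b, a, lane): every 32-bit accumulator
// lane i gains the dot product of weights b[4*i .. 4*i+3] with the group of
// four activations selected by `lane` from the 8-byte vector a.
static void sdot_lane_model(int32_t acc[4], const int8_t b[16],
                            const int8_t a[8], int lane) {
  for (int i = 0; i < 4; ++i) {      // one output column per accumulator lane
    for (int j = 0; j < 4; ++j) {    // 4-deep slice of the reduction dimension
      acc[i] += (int32_t) b[4 * i + j] * (int32_t) a[4 * lane + j];
    }
  }
}

One 8-byte activation load therefore feeds both a lane-0 and a lane-1 call per weight block, which is why the matches in every file above come in lane-0/lane-1 groups.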