/external/XNNPACK/src/qs8-igemm/gen/
D | 4x16c16-minmax-neon-mlal-padal.c |  all matches in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
    169  int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
    170  int16x8_t vprod1x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va1));
    171  int16x8_t vprod2x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va2));
    172  int16x8_t vprod3x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va3));
    181  int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
    182  int16x8_t vprod1x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va1));
    183  int16x8_t vprod2x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va2));
    184  int16x8_t vprod3x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va3));
    193  int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
    194  int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));
    [all …]
D | 3x16c16-minmax-neon-mlal-padal.c |  all matches in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
    144  int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
    145  int16x8_t vprod1x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va1));
    146  int16x8_t vprod2x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va2));
    153  int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
    154  int16x8_t vprod1x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va1));
    155  int16x8_t vprod2x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va2));
    162  int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
    163  int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));
    164  int16x8_t vprod2x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va2));
    171  int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
    [all …]
D | 4x8c16-minmax-neon-mlal-padal.c |  all matches in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal()
    129  int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
    130  int16x8_t vprod1x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va1));
    131  int16x8_t vprod2x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va2));
    132  int16x8_t vprod3x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va3));
    141  int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
    142  int16x8_t vprod1x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va1));
    143  int16x8_t vprod2x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va2));
    144  int16x8_t vprod3x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va3));
    153  int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
    154  int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));
    [all …]
D | 3x8c16-minmax-neon-mlal-padal.c |  all matches in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()
    112  int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
    113  int16x8_t vprod1x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va1));
    114  int16x8_t vprod2x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va2));
    121  int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
    122  int16x8_t vprod1x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va1));
    123  int16x8_t vprod2x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va2));
    130  int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
    131  int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));
    132  int16x8_t vprod2x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va2));
    139  int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
    [all …]
D | 2x16c16-minmax-neon-mlal-padal.c |  all matches in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal()
    119  int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
    120  int16x8_t vprod1x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va1));
    125  int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
    126  int16x8_t vprod1x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va1));
    131  int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
    132  int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));
    137  int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
    138  int16x8_t vprod1x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va1));
    143  int16x8_t vprod0x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va0));
    144  int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));
    [all …]
D | 2x8c16-minmax-neon-mlal-padal.c |  all matches in xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal()
     95  int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
     96  int16x8_t vprod1x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va1));
    101  int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
    102  int16x8_t vprod1x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va1));
    107  int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
    108  int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));
    113  int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
    114  int16x8_t vprod1x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va1));
    119  int16x8_t vprod0x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va0));
    120  int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));
    [all …]
D | 1x16c16-minmax-neon-mlal-padal.c |  all matches in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal()
     94  int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
     97  int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
    100  int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
    103  int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
    106  int16x8_t vprod0x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va0));
    109  int16x8_t vprod0x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va0));
    112  int16x8_t vprod0x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va0));
    115  int16x8_t vprod0x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va0));
    118  int16x8_t vprod0x8 = vmull_s8(vget_low_s8(vb8), vget_low_s8(va0));
    121  int16x8_t vprod0x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va0));
    [all …]
D | 1x8c16-minmax-neon-mlal-padal.c |  all matches in xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal()
     78  int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
     81  int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
     84  int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
     87  int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
     90  int16x8_t vprod0x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va0));
     93  int16x8_t vprod0x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va0));
     96  int16x8_t vprod0x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va0));
     99  int16x8_t vprod0x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va0));
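Every qs8-igemm match above is one step of the same inner loop: an int8 microkernel widening two 8-lane int8 vectors into 16-bit products. A minimal sketch of the full mull/mlal/padal step these c16 kernels repeat (the qs8-gemm listings below show the identical pattern); variable names mirror the listings, but the helper itself is illustrative, not an XNNPACK API:

#include <arm_neon.h>

// One c16 accumulation step: multiply the low halves (the matches above),
// fold in the high halves ("mlal"), then pairwise-accumulate the eight
// 16-bit products into four 32-bit lanes ("padal"). The chained 16-bit
// sum assumes operand ranges that avoid int16 overflow.
static inline int32x4_t dot16_step(int32x4_t vacc0x0, int8x16_t va0, int8x16_t vb0) {
  int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
  vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0));
  return vpadalq_s16(vacc0x0, vprod0x0);
}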
/external/XNNPACK/src/qs8-gemm/gen/
D | 4x16c16-minmax-neon-mlal-padal.c |  all matches in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
    152  int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
    153  int16x8_t vprod1x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va1));
    154  int16x8_t vprod2x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va2));
    155  int16x8_t vprod3x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va3));
    164  int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
    165  int16x8_t vprod1x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va1));
    166  int16x8_t vprod2x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va2));
    167  int16x8_t vprod3x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va3));
    176  int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
    177  int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));
    [all …]
D | 3x16c16-minmax-neon-mlal-padal.c |  all matches in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
    129  int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
    130  int16x8_t vprod1x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va1));
    131  int16x8_t vprod2x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va2));
    138  int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
    139  int16x8_t vprod1x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va1));
    140  int16x8_t vprod2x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va2));
    147  int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
    148  int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));
    149  int16x8_t vprod2x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va2));
    156  int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
    [all …]
D | 2x16c16-minmax-neon-mlal-padal.c |  all matches in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal()
    106  int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
    107  int16x8_t vprod1x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va1));
    112  int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
    113  int16x8_t vprod1x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va1));
    118  int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
    119  int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));
    124  int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
    125  int16x8_t vprod1x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va1));
    130  int16x8_t vprod0x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va0));
    131  int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));
    [all …]
D | 4x8c16-minmax-neon-mlal-padal.c |  all matches in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal()
    112  int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
    113  int16x8_t vprod1x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va1));
    114  int16x8_t vprod2x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va2));
    115  int16x8_t vprod3x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va3));
    124  int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
    125  int16x8_t vprod1x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va1));
    126  int16x8_t vprod2x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va2));
    127  int16x8_t vprod3x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va3));
    136  int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
    137  int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));
    [all …]
D | 3x8c16-minmax-neon-mlal-padal.c |  all matches in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()
     97  int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
     98  int16x8_t vprod1x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va1));
     99  int16x8_t vprod2x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va2));
    106  int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
    107  int16x8_t vprod1x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va1));
    108  int16x8_t vprod2x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va2));
    115  int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
    116  int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));
    117  int16x8_t vprod2x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va2));
    124  int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
    [all …]
D | 2x8c16-minmax-neon-mlal-padal.c |  all matches in xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal()
     82  int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
     83  int16x8_t vprod1x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va1));
     88  int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
     89  int16x8_t vprod1x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va1));
     94  int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
     95  int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));
    100  int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
    101  int16x8_t vprod1x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va1));
    106  int16x8_t vprod0x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va0));
    107  int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));
    [all …]
D | 1x16c16-minmax-neon-mlal-padal.c |  all matches in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal()
     83  int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
     86  int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
     89  int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
     92  int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
     95  int16x8_t vprod0x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va0));
     98  int16x8_t vprod0x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va0));
    101  int16x8_t vprod0x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va0));
    104  int16x8_t vprod0x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va0));
    107  int16x8_t vprod0x8 = vmull_s8(vget_low_s8(vb8), vget_low_s8(va0));
    110  int16x8_t vprod0x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va0));
    [all …]
D | 1x8c16-minmax-neon-mlal-padal.c |  all matches in xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal()
     67  int16x8_t vprod0x0 = vmull_s8(vget_low_s8(vb0), vget_low_s8(va0));
     70  int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));
     73  int16x8_t vprod0x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va0));
     76  int16x8_t vprod0x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va0));
     79  int16x8_t vprod0x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va0));
     82  int16x8_t vprod0x5 = vmull_s8(vget_low_s8(vb5), vget_low_s8(va0));
     85  int16x8_t vprod0x6 = vmull_s8(vget_low_s8(vb6), vget_low_s8(va0));
     88  int16x8_t vprod0x7 = vmull_s8(vget_low_s8(vb7), vget_low_s8(va0));
/external/tensorflow/tensorflow/lite/kernels/internal/optimized/
D | depthwiseconv_3x3_filter_common.h |
     48  repacked_data.val[0] = vget_low_s8(a.val[0]);                               in vqtbl4q_s8()
     49  repacked_data.val[1] = vget_low_s8(a.val[1]);                               in vqtbl4q_s8()
     50  repacked_data.val[2] = vget_low_s8(a.val[2]);                               in vqtbl4q_s8()
     51  repacked_data.val[3] = vget_low_s8(a.val[3]);                               in vqtbl4q_s8()
     53  vcombine_s8(vtbl4_s8(repacked_data, vget_low_s8(deleted_bit_3)),            in vqtbl4q_s8()
     62  vcombine_s8(vtbl4_s8(repacked_data, vget_low_s8(deleted_bit_3)),            in vqtbl4q_s8()
    132  return vdotq_lane_s32(acc, lhs, vreinterpret_s32_s8(vget_low_s8(rhs)), 0);  in vdotq_four_lane_s32()
    134  return vdotq_lane_s32(acc, lhs, vreinterpret_s32_s8(vget_low_s8(rhs)), 1);  in vdotq_four_lane_s32()
    148  int32x4_t sum0 = vpaddlq_s16(vmull_s8(vget_low_s8(lhs), vget_low_s8(rhs))); in vdotq_s32()
    159  vdup_lane_s32(vreinterpret_s32_s8(vget_low_s8(rhs)), 0));                   in vdotq_four_lane_s32()
    [all …]
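Two patterns show up in this header: an ARMv7 emulation of the AArch64 vqtbl4q_s8() table lookup (lines 48-62) and dot-product helpers with a fallback for cores without the dotprod extension (line 148). A hedged sketch of that fallback, written to match vdotq_s32's lane layout; this is the general technique, not the TFLite function itself:

#include <arm_neon.h>

// Widen and multiply each half, then pairwise-add twice so each 32-bit
// lane ends up holding the sum of four adjacent int8 products, the same
// layout vdotq_s32 would produce on a dotprod-capable core.
static inline int32x4_t vdotq_s32_fallback(int32x4_t acc, int8x16_t lhs, int8x16_t rhs) {
  int32x4_t sum0 = vpaddlq_s16(vmull_s8(vget_low_s8(lhs), vget_low_s8(rhs)));
  int32x4_t sum1 = vpaddlq_s16(vmull_s8(vget_high_s8(lhs), vget_high_s8(rhs)));
  // sum0 lanes cover byte pairs {0,1},{2,3},{4,5},{6,7}; fold pairs again.
  int32x2_t lo = vpadd_s32(vget_low_s32(sum0), vget_high_s32(sum0));
  int32x2_t hi = vpadd_s32(vget_low_s32(sum1), vget_high_s32(sum1));
  return vaddq_s32(acc, vcombine_s32(lo, hi));
}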
/external/libgav1/libgav1/src/dsp/arm/
D | warp_neon.cc |  all matches in HorizontalFilter()
     74  int8x8_t src_row_window = vget_low_s8(src_row_centered);
     77  src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 1));
     80  src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 2));
     83  src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 3));
     86  src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 4));
     89  src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 5));
     92  src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 6));
     95  src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 7));
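Here vget_low_s8 peels eight overlapping 8-byte windows out of one centered 16-byte source row, one per vextq_s8 rotation. A minimal sketch of the window extraction; the running sum is a stand-in for the real per-tap filter arithmetic:

#include <arm_neon.h>

// vextq_s8 offsets must be compile-time constants, which is why the real
// code (and this sketch) unrolls the windows instead of looping.
static inline int16x8_t sum_row_windows(int8x16_t src_row_centered) {
  int16x8_t acc = vmovl_s8(vget_low_s8(src_row_centered));  // bytes 0..7
  acc = vaddw_s8(acc,
      vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 1)));  // bytes 1..8
  acc = vaddw_s8(acc,
      vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 2)));  // bytes 2..9
  // ...warp_neon.cc continues through offset 7, one window per filter tap.
  return acc;
}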
/external/XNNPACK/src/qs8-vaddc/gen/
D | minmax-neon-ld64-x8.c |  all matches in xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x8()
     57  vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
     59  vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));
     81  vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
     82  vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));
D | minmax-neon-ld64-x24.c |  all matches in xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x24()
     77  voutGHIJKLMN = vmax_s8(voutGHIJKLMN, vget_low_s8(voutput_min));
     80  voutGHIJKLMN = vmin_s8(voutGHIJKLMN, vget_low_s8(voutput_max));
    103  vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
    104  vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));
/external/XNNPACK/src/qs8-vadd/gen/
D | minmax-neon-ld64-x8.c |  all matches in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x8()
     57  vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
     59  vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));
     86  vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
     87  vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));
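Both the vadd and vaddc kernels above keep output_min/output_max as 16-lane vectors and clamp 8-lane partial results against their low halves. A minimal sketch of that clamp step, with names taken from the listings:

#include <arm_neon.h>

// Saturating clamp of eight int8 results against the low half of the
// kernel's 16-lane min/max parameter vectors.
static inline int8x8_t clamp8(int8x8_t vout01234567,
                              int8x16_t voutput_min, int8x16_t voutput_max) {
  vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
  vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));
  return vout01234567;
}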
/external/libvpx/libvpx/vp8/common/arm/neon/
D | loopfiltersimplehorizontaledge_neon.c |  all matches in vp8_loop_filter_simple_horizontal_edge_neon()
     52  q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)),
     53                   vget_low_s8(vreinterpretq_s8_u8(q6u8)));
     65  q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8));
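The loop filter widens int8 pixel differences to 16 bits before accumulating, so the subtraction cannot wrap. A hedged sketch of lines 52-65; which rows q6u8/q7u8 hold, and the sign flip the real filter applies beforehand, are assumptions from the surrounding filter code:

#include <arm_neon.h>

// Form the pixel difference in 16-bit lanes (vsubl), then widen the
// int8 filter term in as it is added (vaddw).
static inline int16x8_t filter_delta(uint8x16_t q7u8, uint8x16_t q6u8, int8x16_t q4s8) {
  int16x8_t q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)),
                             vget_low_s8(vreinterpretq_s8_u8(q6u8)));
  return vaddw_s8(q2s16, vget_low_s8(q4s8));
}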
/external/libhevc/common/arm/
D | ihevc_sao_edge_offset_class1_chroma.s |
    179  VTBL.8 D12,{D6},D12    @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    189  VTBL.8 D12,{D7},D12    @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    197  VTBL.8 D22,{D6},D22    @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    209  VTBL.8 D24,{D7},D22    @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    214  @VTBL.8 D24,D7,D22     @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    255  VTBL.8 D22,{D6},D22    @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    263  @VTBL.8 D24,D7,D22     @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    334  VTBL.8 D12,{D6},D12    @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    346  @VTBL.8 D12,D7,D12     @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    352  VTBL.8 D22,{D6},D22    @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    [all …]
D | ihevc_sao_edge_offset_class1.s |
    175  VTBL.8 D12,{D6},D12    @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    184  VTBL.8 D12,{D7},D12    @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    189  VTBL.8 D22,{D6},D22    @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    202  VTBL.8 D24,{D7},D22    @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    243  VTBL.8 D22,{D6},D22    @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    246  VTBL.8 D24,{D7},D22    @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    317  VTBL.8 D12,{D6},D12    @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    321  VTBL.8 D12,{D7},D12    @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    328  VTBL.8 D22,{D6},D22    @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    332  VTBL.8 D24,{D7},D22    @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    [all …]
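Both SAO files implement the same double table lookup that their @comments describe in intrinsic form. A hedged C rendering of that pattern, with table and variable names taken from the comments; the vector types are assumptions, and the real code is the ARM assembly above:

#include <arm_neon.h>

// Two chained VTBL lookups: remap each pixel's raw edge classification
// through edge_idx_tbl, then fetch the signed SAO offset for it.
static inline int8x8_t sao_offset_lookup(int8x16_t edge_idx,
                                         int8x8_t edge_idx_tbl,
                                         int8x8_t offset_tbl) {
  int8x8_t idx = vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx));
  return vtbl1_s8(offset_tbl, idx);
}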
/external/gemmlowp/standalone/
D | neon-gemm-kernel-benchmark.cc |  all matches in Run()
    4577  vmull_s8(vget_low_s8(lhs[i]), vget_low_s8(rhs[j]));
    4766  local_acc[i][0] = vmull_s8(vget_low_s8(lhs[i][0]),
    4767                             vget_low_s8(rhs[0]));
    4768  local_acc[i][0] = vmlal_s8(local_acc[i][0], vget_low_s8(lhs[i][1]),
    4769                             vget_low_s8(rhs[2]));
    4770  local_acc[i][1] = vmull_s8(vget_low_s8(lhs[i][0]),
    4771                             vget_low_s8(rhs[1]));
    4773  vget_low_s8(lhs[i][1]),
    4774  vget_low_s8(rhs[3]));
    4776  local_acc[i][0] = vmlal_s8(local_acc[i][0], vget_low_s8(lhs[i][0]),
    [all …]
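The gemmlowp benchmark kernels chain two widening multiplies into one int16 accumulator with vmull_s8/vmlal_s8 before widening to 32 bits. A minimal sketch of that pairing; the two-slice framing and the final vpadalq_s16 widening step are assumptions drawn from lines 4766-4776:

#include <arm_neon.h>

// Two depth slices share one int16 accumulator: vmull_s8 starts it,
// vmlal_s8 chains the second product, and a single vpadalq_s16 widens
// both at once. Note the chained sum can reach 2 * (-128 * -128) = 32768,
// one past INT16_MAX, so operand ranges must be constrained to avoid
// int16 overflow.
static inline int32x4_t two_slice_step(int32x4_t acc,
                                       int8x16_t lhs0, int8x16_t lhs1,
                                       int8x16_t rhs0, int8x16_t rhs2) {
  int16x8_t local_acc = vmull_s8(vget_low_s8(lhs0), vget_low_s8(rhs0));
  local_acc = vmlal_s8(local_acc, vget_low_s8(lhs1), vget_low_s8(rhs2));
  return vpadalq_s16(acc, local_acc);
}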