1 // 2 // Licensed under the Apache License, Version 2.0 (the "License"); 3 // you may not use this file except in compliance with the License. 4 // You may obtain a copy of the License at 5 // 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 // kernel_SSE.h: a collection of Intel SSE optimized kernels. 15 // Check in kernel_default.h which one(s) are actually used by default. 16 // Others are mere experiments; they are still covered by tests 17 // in case they might be useful some day. 18 // 19 20 #ifndef GEMMLOWP_INTERNAL_KERNEL_AVX_H_ 21 #define GEMMLOWP_INTERNAL_KERNEL_AVX_H_ 22 23 #include "kernel.h" 24 25 #include <string.h> 26 #include <cassert> 27 28 namespace gemmlowp { 29 30 #ifdef GEMMLOWP_AVX2_64 31 struct AVX2_64_Kernel24x8Depth2 : KernelBase { 32 typedef KernelFormat<KernelSideFormat<CellFormat<8, 2, CellOrder::WidthMajor>, 3>, 33 KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1>> 34 Format; 35 NameAVX2_64_Kernel24x8Depth236 const char *Name() const override { return "AVX, 24x8, depth 2"; } 37 RunAVX2_64_Kernel24x8Depth238 void Run(std::int32_t *dst_ptr, std::size_t dst_row_stride, std::size_t dst_col_stride, 39 const std::uint8_t *lhs_ptr, const std::uint8_t *rhs_ptr, std::size_t start_depth, 40 std::size_t run_depth) const override { 41 ScopedProfilingLabel label("optimized kernel"); 42 assert(dst_row_stride == 1); 43 const std::int64_t run_depth_cells = run_depth / Format::kDepth; 44 const std::int64_t dst_col_stride_q = dst_col_stride; 45 46 /* Main loop */ 47 48 // A 2x8 cell of Rhs is stored in 16bit in ymm1 . 49 // A 24x2 block of 3 8x2 cells Lhs is stored in 16bit in ymm0, replaced 50 // every Iteration. 51 // A 8x8 block of accumulators is stored in 32bit in xmm4--xmm15. 52 // 53 // +-------+-------+-------+-------+ 54 // |ymm1[0] |ymm2[2] | 55 // Rhs +-------+---------------+-------+ 56 // |ymm1[1] |ymm1[4] | 57 // +-------+-------+-------+-------+ 58 // 59 // | | | | | 60 // 61 // Lhs | | | | | 62 // 63 // +--+--+ - - - - +-------+-------+-------+-------+ 64 // |ymm0 | | ymm4 | ymm5 | ymm6 | ymm7 | 65 // |ymm0 | (Iter1) | ymm4 | ymm5 | ymm6 | ymm7 | 66 // |ymm0 | | ymm4 | ymm5 | ymm6 | ymm7 | 67 // |ymm0 | | ymm4 | ymm5 | ymm6 | ymm7 | 68 // +--+--+ - - - - +-------+-------+-------+-------+ 69 // |ymm0 | | ymm8 | ymm9 | ymm10 | ymm11 | 70 // |ymm0 | (Iter2) | ymm8 | ymm9 | ymm10 | ymm11 | 71 // |ymm0 | | ymm8 | ymm9 | ymm10 | ymm11 | 72 // |ymm0 | | ymm8 | ymm9 | ymm10 | ymm11 | 73 // +--+--+ - - - - +-------+-------+-------+-------+ 74 // |ymm0 | | ymm12 | ymm13 | ymm14 | ymm15 | 75 // |ymm0 | (Iter3) | ymm12 | ymm13 | ymm14 | ymm15 | 76 // |ymm0 | | ymm12 | ymm13 | ymm14 | ymm15 | 77 // |ymm0 | | ymm12 | ymm13 | ymm14 | ymm15 | 78 // +--+--+ - - - - +-------+-------+-------+-------+ 79 // 80 // Accumulator 81 82 asm volatile( 83 // Set registers for destination 84 "movq %[dst_col_stride_q], %%r12\n\t" // stride is r12 85 "shlq $2, %%r12\n\t" // set stride dword 86 "leaq (%%r12,%%r12,0x2), %%r13\n\t" // load stride aligned r13 87 88 // Set accumulators to zero. 89 "vpxor %%ymm4, %%ymm4, %%ymm4 \n\t" // zero accumulators 90 "vpxor %%ymm5, %%ymm5, %%ymm5 \n\t" // zero accumulators 91 "vpxor %%ymm6, %%ymm6, %%ymm6 \n\t" // zero accumulators 92 "vpxor %%ymm7, %%ymm7, %%ymm7 \n\t" // zero accumulators 93 "vpxor %%ymm8, %%ymm8, %%ymm8 \n\t" // zero accumulators 94 "vpxor %%ymm9, %%ymm9, %%ymm9 \n\t" // zero accumulators 95 "vpxor %%ymm10, %%ymm10, %%ymm10\n\t" // zero accumulators 96 "vpxor %%ymm11, %%ymm11, %%ymm11\n\t" // zero accumulators 97 "vpxor %%ymm12, %%ymm12, %%ymm12\n\t" // zero accumulators 98 "vpxor %%ymm13, %%ymm13, %%ymm13\n\t" // zero accumulators 99 "vpxor %%ymm14, %%ymm14, %%ymm14\n\t" // zero accumulators 100 "vpxor %%ymm15, %%ymm15, %%ymm15\n\t" // zero accumulators 101 102 "movq %[run_depth_cells], %%r14 \n\t" // load cell depth r14 103 "subq $2, %%r14 \n\t" // cell depth is 2 104 "js outerLoop1%= \n\t" // outerloop for matrix 105 106 // Loop for K unrolled by 4 107 "outerLoop2%=: \n\t" // outer loop unroll 108 109 // K = 0,1,2,3 110 // RHS cell to ymm1 111 112 // lower half 113 "vpmovzxbw (%[rhs_ptr]), %%ymm1 \n\t" // mov rhs to ymm1 114 "vpermq $0x44,%%ymm1, %%ymm1 \n\t" 115 // LHS cell elements 0 and 1 116 "vpmovzxbw 0x00(%[lhs_ptr]), %%ymm0\n\t" // mov lhs to ymm0 117 "vpshufd $0x00,%%ymm1,%%ymm2 \n\t" // move rhs 0 element to all ymm2 118 "vpshufd $0x55,%%ymm1,%%ymm3 \n\t" // move rhs 1 element to all ymm3 119 "vpmaddwd %%ymm0, %%ymm2, %%ymm2 \n\t" // mul add lhs rhs0 into ymm2 120 "vpmaddwd %%ymm0, %%ymm3, %%ymm3 \n\t" // mul add lhs rhs1 into ymm3 121 "vpaddd %%ymm2, %%ymm4, %%ymm4 \n\t" // add muladd lhs + rhs0 into ymm4 122 "vpaddd %%ymm3, %%ymm5, %%ymm5 \n\t" // add muladd lhs + rhs1 into ymm5 123 // LHS cell elements 2 and 3 124 "vpshufd $0xaa, %%ymm1, %%ymm2 \n\t" // move rhs 2 element to all ymm2 125 "vpmaddwd %%ymm0, %%ymm2, %%ymm2 \n\t" // mul add lhs rh3 into ymm2 126 "vpshufd $0xff,%%ymm1,%%ymm3 \n\t" // mov rhs 3 element into all ymm3 127 "vpmaddwd %%ymm0, %%ymm3, %%ymm3 \n\t" // mul add lhs rh4 into ymm3 128 "vpaddd %%ymm2, %%ymm6, %%ymm6 \n\t" // add muladd lhs + rhs2 into ymm6 129 "vpaddd %%ymm3, %%ymm7, %%ymm7 \n\t" // add muladd lhs + rhs3 into ymm7 130 131 // cache prefect lhs //see if it works better? 132 //"prefetcht0 0x80(%[lhs_ptr]) \n\t" //prefetch cache lines 133 "vpmovzxbw (%[rhs_ptr]), %%ymm1 \n\t" // mov rhs to ymm1 134 "vpermq $0x44,%%ymm1, %%ymm1 \n\t" 135 136 // K = 5,6,7,8 137 // next LHS cell elements 0 and 1 138 "vpmovzxbw 0x10(%[lhs_ptr]), %%ymm0 \n\t" // mov lhs to ymm0 139 "vpshufd $0x00,%%ymm1,%%ymm2 \n\t" // mov rhs 0 element to all ymm2 140 "vpshufd $0x55,%%ymm1,%%ymm3 \n\t" // mov rhs 1 element to all ymm3 141 "vpmaddwd %%ymm0, %%ymm2, %%ymm2 \n\t" // mul add lhs rhs0 into ymm2 142 "vpmaddwd %%ymm0, %%ymm3, %%ymm3 \n\t" // mul add lhs rhs1 into ymm3 143 "vpaddd %%ymm2, %%ymm8, %%ymm8 \n\t" // add muladd lhs + rhs0 into ymm8 144 "vpaddd %%ymm3, %%ymm9, %%ymm9 \n\t" // add muladd lhs + rhs1 into ymm9 145 // next LHS cell elements 2 and 3 146 "vpshufd $0xaa,%%ymm1,%%ymm2 \n\t" // mov rhs 2 element to all ymm2 147 "vpshufd $0xff,%%ymm1,%%ymm3 \n\t" // mov rhs 3 element to all ymm3 148 "vpmaddwd %%ymm0, %%ymm2, %%ymm2 \n\t" // mul add lhs rhs2 into ymm2 149 "vpmaddwd %%ymm0, %%ymm3, %%ymm3 \n\t" // mul add lhs rhs3 into ymm3 150 "vpaddd %%ymm2, %%ymm10, %%ymm10 \n\t" // add muladd lhs + rhs2 into ymm10 151 "vpaddd %%ymm3, %%ymm11, %%ymm11 \n\t" // add muladd lhs + rhs3 into ymm11 152 153 // rhs lower half 154 "vpmovzxbw (%[rhs_ptr]), %%ymm1 \n\t" // mov rhs to ymm1 155 "vpermq $0x44,%%ymm1, %%ymm1 \n\t" // duplcate lower 16 156 157 // next LHS cell elements 0 and 1 158 "vpmovzxbw 0x20(%[lhs_ptr]), %%ymm0 \n\t" // mov lhs to ymm0 159 "vpshufd $0x00,%%ymm1,%%ymm2 \n\t" // mov rhs 0 element to all ymm2 160 "vpshufd $0x55,%%ymm1,%%ymm3 \n\t" // mov rhs 1 element to all ymm3 161 "vpmaddwd %%ymm0, %%ymm2, %%ymm2 \n\t" // mul add lhs rhs0 into ymm2 162 "vpmaddwd %%ymm0, %%ymm3, %%ymm3 \n\t" // mul add lhs rhs1 into ymm3 163 "vpaddd %%ymm2, %%ymm12, %%ymm12 \n\t" // add muladd lhs + rhs0 into ymm8 164 "vpaddd %%ymm3, %%ymm13, %%ymm13 \n\t" // add muladd lhs + rhs1 into ymm9 165 166 // cache prefetch rhs //see if it works better? 167 //"prefetcht0 0x80(%[rhs_ptr]) \n\t" 168 169 // next LHS cell elements 2 and 3 170 "vpshufd $0xaa,%%ymm1,%%ymm2 \n\t" // mov rhs 2 element to all ymm2 171 "vpshufd $0xff,%%ymm1,%%ymm3 \n\t" // mov rhs 3 element to all ymm3 172 "vpmaddwd %%ymm0, %%ymm2, %%ymm2 \n\t" // mul add lhs rhs2 into ymm2 173 "vpmaddwd %%ymm0, %%ymm3, %%ymm3 \n\t" // mul add lhs rhs3 into ymm3 174 "vpaddd %%ymm2, %%ymm14, %%ymm14 \n\t" // add muladd lhs + rhs2 into ymm10 175 "vpaddd %%ymm3, %%ymm15, %%ymm15 \n\t" // add muladd lhs + rhs3 into ymm11 176 177 // current result in ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10 ymm11 ymm12 ymm13 ymm14 ymm15 178 179 // rhs+10 lower half 180 "vpmovzxbw 0x08(%[rhs_ptr]), %%ymm1 \n\t" // mov rhs to ymm1 181 "vpermq $0x44,%%ymm1, %%ymm1 \n\t" 182 // next LHS cell elements 0 and 1 183 "vpmovzxbw 0x30(%[lhs_ptr]), %%ymm0 \n\t" // mov lhs to ymm0 184 "vpshufd $0x00,%%ymm1,%%ymm2 \n\t" // move rhs 0 element to ymm2 185 "vpshufd $0x55,%%ymm1,%%ymm3 \n\t" // move rhs 1 element to ymm3 186 "vpmaddwd %%ymm0, %%ymm2, %%ymm2 \n\t" // muladd lhs rhs0 into ymm2 187 "vpmaddwd %%ymm0, %%ymm3, %%ymm3 \n\t" // muladd lhs rhs1 into ymm3 188 "vpaddd %%ymm2, %%ymm4, %%ymm4 \n\t" // accumulate to ymm4 189 "vpaddd %%ymm3, %%ymm5, %%ymm5 \n\t" // accumulate to ymm5 190 // next LHS cell elements 2 and 3 191 "vpshufd $0xaa,%%ymm1,%%ymm2 \n\t" // mov rhs 2 element to ymm2 192 "vpshufd $0xff,%%ymm1,%%ymm3 \n\t" // mov rhs 3 element to ymm2 193 "vpmaddwd %%ymm0, %%ymm2, %%ymm2 \n\t" // mul add lhs rhs2 into ymm2 194 "vpmaddwd %%ymm0, %%ymm3, %%ymm3 \n\t" // mull add lhs rhs3 into ymm3 195 "vpaddd %%ymm2, %%ymm6, %%ymm6 \n\t" // add lhs rhs2 to ymm6 196 "vpaddd %%ymm3, %%ymm7, %%ymm7 \n\t" // add lhs rhs3 to ymm7 197 198 // rhs+10 lower half 199 "vpmovzxbw 0x08(%[rhs_ptr]), %%ymm1 \n\t" // mov rhs to ymm1 200 "vpermq $0x44,%%ymm1, %%ymm1 \n\t" 201 202 // next LHS cell elements 4 and 5 203 "vpmovzxbw 0x40(%[lhs_ptr]), %%ymm0 \n\t" // mov lhs to ymm0 204 "vpshufd $0x00,%%ymm1,%%ymm2 \n\t" // move rhs 0 element to ymm2 205 "vpshufd $0x55,%%ymm1,%%ymm3 \n\t" // move rhs 1 element to ymm3 206 "vpmaddwd %%ymm0, %%ymm2, %%ymm2 \n\t" // muladd lhs rhs0 into ymm2 207 "vpmaddwd %%ymm0, %%ymm3, %%ymm3 \n\t" // muladd lhs rhs1 into ymm3 208 "vpaddd %%ymm2, %%ymm8, %%ymm8 \n\t" // accumulate to ymm8 209 "vpaddd %%ymm3, %%ymm9, %%ymm9 \n\t" // accumulate to ymm9 210 // next LHS cell elements 6 and 7 211 "vpshufd $0xaa,%%ymm1,%%ymm2 \n\t" // mov rhs 2 element to ymm2 212 "vpshufd $0xff,%%ymm1,%%ymm3 \n\t" // mov rhs 3 element to ymm2 213 "vpmaddwd %%ymm0, %%ymm2, %%ymm2 \n\t" // mul add lhs rhs2 into ymm2 214 "vpmaddwd %%ymm0, %%ymm3, %%ymm3 \n\t" // mull add lhs rhs3 into ymm3 215 "vpaddd %%ymm2, %%ymm10, %%ymm10 \n\t" // add lhs rhs2 to ymm10 216 "vpaddd %%ymm3, %%ymm11, %%ymm11 \n\t" // add lhs rhs3 to ymm11 217 218 "vpmovzxbw 0x08(%[rhs_ptr]), %%ymm1 \n\t" // mov rhs to ymm1 219 "vpermq $0x44,%%ymm1, %%ymm1 \n\t" 220 // next LHS cell elements 9 and 10 221 "vpmovzxbw 0x50(%[lhs_ptr]), %%ymm0 \n\t" // mov lhs to ymm0 222 "vpshufd $0x00,%%ymm1,%%ymm2 \n\t" // move rhs 0 element to ymm2 223 "vpshufd $0x55,%%ymm1,%%ymm3 \n\t" // move rhs 1 element to ymm3 224 "vpmaddwd %%ymm0, %%ymm2, %%ymm2 \n\t" // muladd lhs rhs0 into ymm2 225 "vpmaddwd %%ymm0, %%ymm3, %%ymm3 \n\t" // muladd lhs rhs1 into ymm3 226 "vpaddd %%ymm2, %%ymm12, %%ymm12 \n\t" // accumulate to ymm12 227 "vpaddd %%ymm3, %%ymm13, %%ymm13 \n\t" // accumulate to ymm13 228 229 // next LHS cell elements 11 and 12 230 "vpshufd $0xaa,%%ymm1,%%ymm2 \n\t" // mov rhs 2 element to ymm2 231 "vpshufd $0xff,%%ymm1,%%ymm3 \n\t" // mov rhs 3 element to ymm2 232 "vpmaddwd %%ymm0, %%ymm2, %%ymm2 \n\t" // mul add lhs rhs2 into ymm2 233 "vpmaddwd %%ymm0, %%ymm3, %%ymm3 \n\t" // mull add lhs rhs3 into ymm3 234 "vpaddd %%ymm2, %%ymm14, %%ymm14 \n\t" // add lhs rhs2 to ymm14 235 "vpaddd %%ymm3, %%ymm15, %%ymm15 \n\t" // add lhs rhs3 to ymm15 236 237 // completed rhs+10 238 "addq $0x60, %[lhs_ptr] \n\t" // increment stride lhs 239 "addq $0x10, %[rhs_ptr] \n\t" // increment stride rhs 240 241 "subq $2, %[run_depth_cells] \n\t" 242 "ja outerLoop2%= \n\t" 243 244 "movq %[run_depth_cells], %%r14 \n\t" 245 "decq %%r14 \n\t" 246 "js finish%= \n\t" 247 248 // Loop for K unrolled by 2 249 "outerLoop1%=: \n\t" 250 251 // rhs lower 252 "vpmovzxbw (%[rhs_ptr]), %%ymm1 \n\t" // get rhs into ymm1 253 "vpermq $0x44,%%ymm1, %%ymm1 \n\t" 254 255 // LHS cell 256 "vpmovzxbw (%[lhs_ptr]), %%ymm0 \n\t" // lhs in into ymm0 257 "vpshufd $0x00,%%ymm1,%%ymm2 \n\t" // rhs element 0 into ymm2 258 "vpshufd $0x55,%%ymm1,%%ymm3 \n\t" // rhs element 1 into ymm3 259 "vpmaddwd %%ymm0, %%ymm2, %%ymm2 \n\t" // muladd lhs rhs element 0 ymm2 260 "vpmaddwd %%ymm0, %%ymm3, %%ymm3 \n\t" // muladd lhs rhs element 1 ymm3 261 "vpaddd %%ymm2, %%ymm4, %%ymm4 \n\t" // acc element 0 ymm4 262 "vpaddd %%ymm3, %%ymm5, %%ymm5 \n\t" // acc element 1 ymm5 263 "vpshufd $0xaa,%%ymm1,%%ymm2 \n\t" // rhs element 2 into ymm2 264 "vpshufd $0xff,%%ymm1,%%ymm3 \n\t" // rhs element 3 into ymm3 265 "vpmaddwd %%ymm0, %%ymm2, %%ymm2 \n\t" // muladd lhs rhs element 2 ymm2 266 "vpmaddwd %%ymm0, %%ymm3, %%ymm3 \n\t" // muladd lhs rhs element 3 ymm3 267 "vpaddd %%ymm2, %%ymm6, %%ymm6 \n\t" // acc element 2 into ymm6 268 "vpaddd %%ymm3, %%ymm7, %%ymm7 \n\t" // acc element 3 into ymm7 269 270 // lhs+10 271 "vpmovzxbw 0x10(%[lhs_ptr]), %%ymm0 \n\t" // lhs in into ymm0 272 "vpshufd $0x00, %%ymm1, %%ymm2 \n\t" // rhs element 0 into ymm2 273 "vpshufd $0x55, %%ymm1, %%ymm3 \n\t" // rhs element 1 into ymm3 274 "vpmaddwd %%ymm0, %%ymm2, %%ymm2 \n\t" // muladd lhs rhs element 0 ymm2 275 "vpmaddwd %%ymm0, %%ymm3, %%ymm3 \n\t" // muladd lhs rhs element 1 ymm3 276 "vpaddd %%ymm2, %%ymm8, %%ymm8 \n\t" // acc element 0 ymm8 277 "vpaddd %%ymm3, %%ymm9, %%ymm9 \n\t" // acc element 1 ymm9 278 "vpshufd $0xaa,%%ymm1,%%ymm2 \n\t" // rhs element 2 into ymm2 279 "vpshufd $0xff,%%ymm1,%%ymm3 \n\t" // rhs element 3 into ymm3 280 "vpmaddwd %%ymm0, %%ymm2, %%ymm2 \n\t" // muladd lhs rhs element 2 ymm2 281 "vpmaddwd %%ymm0, %%ymm3, %%ymm3 \n\t" // muladd lhs rhs element 3 ymm3 282 "vpaddd %%ymm2, %%ymm10, %%ymm10 \n\t" // acc element 2 into ymm10 283 "vpaddd %%ymm3, %%ymm11, %%ymm11 \n\t" // acc element 3 into ymm11 284 285 "vpmovzxbw 0x20(%[lhs_ptr]), %%ymm0 \n\t" 286 "vpshufd $0x00, %%ymm1, %%ymm2 \n\t" // rhs element 0 into ymm2 287 "vpshufd $0x55, %%ymm1, %%ymm3 \n\t" // rhs element 1 into ymm3 288 "vpmaddwd %%ymm0, %%ymm2, %%ymm2 \n\t" // muladd lhs rhs element 0 ymm2 289 "vpmaddwd %%ymm0, %%ymm3, %%ymm3 \n\t" // muladd lhs rhs element 1 ymm3 290 "vpaddd %%ymm2, %%ymm12, %%ymm12 \n\t" // acc element 0 ymm12 291 "vpaddd %%ymm3, %%ymm13, %%ymm13 \n\t" // acc element 1 ymm13 292 "vpshufd $0xaa,%%ymm1,%%ymm2 \n\t" // rhs element 2 into ymm2 293 "vpshufd $0xff,%%ymm1,%%ymm3 \n\t" // rhs element 3 into ymm3 294 "vpmaddwd %%ymm0, %%ymm2, %%ymm2 \n\t" // muladd lhs rhs element 2 ymm2 295 "vpmaddwd %%ymm0, %%ymm3, %%ymm3 \n\t" // muladd lhs rhs element 3 ymm3 296 "vpaddd %%ymm2, %%ymm14, %%ymm14 \n\t" // acc element 2 into ymm14 297 "vpaddd %%ymm3, %%ymm15, %%ymm15 \n\t" // acc element 3 into ymm15 298 299 // update matrix pointers 300 "addq $0x30, %[lhs_ptr] \n\t" 301 "addq $0x08, %[rhs_ptr] \n\t" 302 303 "decq %[run_depth_cells] \n\t" 304 "jnz outerLoop1%= \n\t" 305 306 "finish%=:\n\t" 307 308 "test %[start_depth], %[start_depth] \n\t" 309 "jz storeDst%= \n\t" 310 311 "vpaddd 0x00(%[dst_ptr]), %%ymm4, %%ymm4 \n\t" // rhs0 312 "vpaddd 0x20(%[dst_ptr]), %%ymm8, %%ymm8 \n\t" // rhs0 313 "vpaddd 0x40(%[dst_ptr]), %%ymm12, %%ymm12 \n\t" // rhs0 314 315 "vpaddd 0x00(%[dst_ptr], %%r12, 1) , %%ymm5, %%ymm5 \n\t" // rhs1 316 "vpaddd 0x20(%[dst_ptr], %%r12, 1) , %%ymm9, %%ymm9 \n\t" // rhs1 317 "vpaddd 0x40(%[dst_ptr], %%r12, 1) , %%ymm13, %%ymm13 \n\t" // rhs1 318 319 "vpaddd 0x00(%[dst_ptr], %%r12, 2) , %%ymm6, %%ymm6 \n\t" // rhs2 320 "vpaddd 0x20(%[dst_ptr], %%r12, 2) , %%ymm10, %%ymm10 \n\t" // rhs2 321 "vpaddd 0x40(%[dst_ptr], %%r12, 2) , %%ymm14, %%ymm14 \n\t" // rhs2 322 323 "vpaddd 0x00(%[dst_ptr], %%r13, 1) , %%ymm7, %%ymm7 \n\t" // rhs3 324 "vpaddd 0x20(%[dst_ptr], %%r13, 1) , %%ymm11, %%ymm11 \n\t" // rhs3 325 "vpaddd 0x40(%[dst_ptr], %%r13, 1) , %%ymm15, %%ymm15 \n\t" // rhs3 326 327 "storeDst%=:\n\t" 328 329 "vmovdqu %%ymm4, 0x00(%[dst_ptr]) \n\t" // rhs0 330 "vmovdqu %%ymm8, 0x20(%[dst_ptr]) \n\t" // rhs0 331 "vmovdqu %%ymm12, 0x40(%[dst_ptr]) \n\t" // rhs0 332 333 "vmovdqu %%ymm5, 0x00(%[dst_ptr], %%r12, 1) \n\t" // rhs1 334 "vmovdqu %%ymm9, 0x20(%[dst_ptr], %%r12, 1) \n\t" // rhs1 335 "vmovdqu %%ymm13, 0x40(%[dst_ptr], %%r12, 1) \n\t" // rhs1 336 337 "vmovdqu %%ymm6, 0x00(%[dst_ptr], %%r12, 2) \n\t" // rhs2 338 "vmovdqu %%ymm10, 0x20(%[dst_ptr], %%r12, 2) \n\t" // rhs2 339 "vmovdqu %%ymm14, 0x40(%[dst_ptr], %%r12, 2) \n\t" // rhs2 340 341 "vmovdqu %%ymm7, 0x00(%[dst_ptr], %%r13, 1) \n\t" // rhs3 342 "vmovdqu %%ymm11, 0x20(%[dst_ptr], %%r13, 1) \n\t" // rhs3 343 "vmovdqu %%ymm15, 0x40(%[dst_ptr], %%r13, 1) \n\t" // rhs3 344 345 : // outputs 346 [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 347 [dst_ptr] "+r"(dst_ptr) 348 : // inputs 349 [start_depth] "r"(start_depth), [dst_col_stride_q] "r"(dst_col_stride_q), 350 [run_depth_cells] "r"(run_depth_cells) 351 : // clobbers 352 "cc", "memory", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", 353 "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "%r12", 354 "%r13", "%r14"); 355 } 356 }; 357 #endif 358 359 } // namespace gemmlowp 360 361 #endif // GEMMLOWP_INTERNAL_KERNEL_AVX_H_ 362