//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// kernel_avx.h: a collection of Intel AVX optimized kernels.
// See kernel_default.h for which one(s) are actually used by default.
// Others are mere experiments; they are still covered by tests
// in case they might be useful some day.
//

#ifndef GEMMLOWP_INTERNAL_KERNEL_AVX_H_
#define GEMMLOWP_INTERNAL_KERNEL_AVX_H_

#include "kernel.h"

#include <cassert>
#include <cstring>

namespace gemmlowp {

#ifdef GEMMLOWP_AVX2_64
struct AVX2_64_Kernel24x8Depth2 : KernelBase {
  typedef KernelFormat<KernelSideFormat<CellFormat<8, 2, CellOrder::WidthMajor>, 3>,
                       KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1>>
      Format;

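  // The Format above reads as: 3 Lhs cells of 8 rows x depth 2 (24 rows
  // total) against 1 Rhs cell of 4 columns x depth 2, so each Run() call
  // produces a 24x4 block of int32 accumulators.
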
  const char *Name() const override { return "AVX, 24x8, depth 2"; }

  void Run(std::int32_t *dst_ptr, std::size_t dst_row_stride, std::size_t dst_col_stride,
           const std::uint8_t *lhs_ptr, const std::uint8_t *rhs_ptr, std::size_t start_depth,
           std::size_t run_depth) const override {
    ScopedProfilingLabel label("optimized kernel");
    assert(dst_row_stride == 1);
    std::int64_t run_depth_cells = run_depth / Format::kDepth;  // modified by the asm below
    const std::int64_t dst_col_stride_q = dst_col_stride;

    /* Main loop */

    // A 4x2 cell of Rhs is stored in 16 bit in ymm1.
    // Each 8x2 cell of the 24x2 Lhs block (3 cells) is loaded in turn into
    // ymm0 in 16 bit, replaced every iteration.
    // A 24x4 block of accumulators is stored in 32 bit in ymm4--ymm15.
    //
    //                   +-------+-------+-------+-------+
    //                   |ymm1[0]        |ymm1[2]        |
    //              Rhs  +-------+---------------+-------+
    //                   |ymm1[1]        |ymm1[3]        |
    //                   +-------+-------+-------+-------+
    //
    //                   |       |       |       |       |
    //
    //    Lhs            |       |       |       |       |
    //
    //  +--+--+ - - - -  +-------+-------+-------+-------+
    //  |ymm0 |          | ymm4  | ymm5  | ymm6  | ymm7  |
    //  |ymm0 | (Iter1)  | ymm4  | ymm5  | ymm6  | ymm7  |
    //  |ymm0 |          | ymm4  | ymm5  | ymm6  | ymm7  |
    //  |ymm0 |          | ymm4  | ymm5  | ymm6  | ymm7  |
    //  +--+--+ - - - -  +-------+-------+-------+-------+
    //  |ymm0 |          | ymm8  | ymm9  | ymm10 | ymm11 |
    //  |ymm0 | (Iter2)  | ymm8  | ymm9  | ymm10 | ymm11 |
    //  |ymm0 |          | ymm8  | ymm9  | ymm10 | ymm11 |
    //  |ymm0 |          | ymm8  | ymm9  | ymm10 | ymm11 |
    //  +--+--+ - - - -  +-------+-------+-------+-------+
    //  |ymm0 |          | ymm12 | ymm13 | ymm14 | ymm15 |
    //  |ymm0 | (Iter3)  | ymm12 | ymm13 | ymm14 | ymm15 |
    //  |ymm0 |          | ymm12 | ymm13 | ymm14 | ymm15 |
    //  |ymm0 |          | ymm12 | ymm13 | ymm14 | ymm15 |
    //  +--+--+ - - - -  +-------+-------+-------+-------+
    //
    //                              Accumulator

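    // As a scalar reference (a sketch of the math, not part of the kernel),
    // each depth-2 step below computes, for rows r in [0,24) and columns c
    // in [0,4):
    //
    //   acc[r][c] += int32(lhs[r][k]) * int32(rhs[c][k])
    //              + int32(lhs[r][k + 1]) * int32(rhs[c][k + 1]);
    //
    // vpmaddwd performs exactly this adjacent-pair multiply-add on 16-bit
    // lanes, which is why both operands are zero-extended from 8 bit to
    // 16 bit with vpmovzxbw first.
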
    asm volatile(
        // Set registers for destination
        "movq  %[dst_col_stride_q], %%r12\n\t"  // r12 = column stride in elements
        "shlq $2, %%r12\n\t"                    // r12 = column stride in bytes (x4 for int32)
        "leaq (%%r12,%%r12,0x2), %%r13\n\t"     // r13 = 3 * column stride in bytes

        // Set accumulators to zero.
        "vpxor %%ymm4, %%ymm4, %%ymm4 \n\t"    // zero accumulators
        "vpxor %%ymm5, %%ymm5, %%ymm5 \n\t"    // zero accumulators
        "vpxor %%ymm6, %%ymm6, %%ymm6 \n\t"    // zero accumulators
        "vpxor %%ymm7, %%ymm7, %%ymm7 \n\t"    // zero accumulators
        "vpxor %%ymm8, %%ymm8, %%ymm8 \n\t"    // zero accumulators
        "vpxor %%ymm9, %%ymm9, %%ymm9 \n\t"    // zero accumulators
        "vpxor %%ymm10, %%ymm10, %%ymm10\n\t"  // zero accumulators
        "vpxor %%ymm11, %%ymm11, %%ymm11\n\t"  // zero accumulators
        "vpxor %%ymm12, %%ymm12, %%ymm12\n\t"  // zero accumulators
        "vpxor %%ymm13, %%ymm13, %%ymm13\n\t"  // zero accumulators
        "vpxor %%ymm14, %%ymm14, %%ymm14\n\t"  // zero accumulators
        "vpxor %%ymm15, %%ymm15, %%ymm15\n\t"  // zero accumulators

        "movq  %[run_depth_cells], %%r14 \n\t"  // r14 = depth cells to process
        "subq $2, %%r14 \n\t"                   // enough cells for the unrolled loop?
        "js outerLoop1%= \n\t"                  // fewer than 2 cells: use the short loop

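        // outerLoop2 consumes two depth-2 cells (four depth levels) per
        // pass; outerLoop1 further below handles a remaining single cell.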
        // Loop for K unrolled by 4
        "outerLoop2%=: \n\t"  // outer loop unroll

        // K = 0,1,2,3
        // RHS cell to ymm1

        // lower half
        "vpmovzxbw (%[rhs_ptr]), %%ymm1 \n\t"  // mov rhs to ymm1
        "vpermq $0x44,%%ymm1, %%ymm1 \n\t"     // broadcast low 128 bits to both lanes
        // LHS cell elements 0 and 1
        "vpmovzxbw 0x00(%[lhs_ptr]), %%ymm0\n\t"  // mov lhs to ymm0
        "vpshufd $0x00,%%ymm1,%%ymm2     \n\t"    // move rhs 0 element to all ymm2
        "vpshufd $0x55,%%ymm1,%%ymm3     \n\t"    // move rhs 1 element to all ymm3
        "vpmaddwd %%ymm0, %%ymm2, %%ymm2 \n\t"    // mul add lhs rhs0 into ymm2
        "vpmaddwd %%ymm0, %%ymm3, %%ymm3 \n\t"    // mul add lhs rhs1 into ymm3
        "vpaddd %%ymm2, %%ymm4, %%ymm4   \n\t"    // add muladd lhs + rhs0 into ymm4
        "vpaddd %%ymm3, %%ymm5, %%ymm5   \n\t"    // add muladd lhs + rhs1 into ymm5
        // LHS cell elements 2 and 3
        "vpshufd $0xaa, %%ymm1, %%ymm2   \n\t"  // move rhs 2 element to all ymm2
        "vpmaddwd %%ymm0, %%ymm2, %%ymm2 \n\t"  // mul add lhs rhs2 into ymm2
        "vpshufd $0xff,%%ymm1,%%ymm3     \n\t"  // mov rhs 3 element into all ymm3
        "vpmaddwd %%ymm0, %%ymm3, %%ymm3 \n\t"  // mul add lhs rhs3 into ymm3
        "vpaddd %%ymm2, %%ymm6, %%ymm6   \n\t"  // add muladd lhs + rhs2 into ymm6
        "vpaddd %%ymm3, %%ymm7, %%ymm7   \n\t"  // add muladd lhs + rhs3 into ymm7

        // cache prefetch lhs; see if it works better?
        //"prefetcht0 0x80(%[lhs_ptr]) \n\t" //prefetch cache lines
        "vpmovzxbw (%[rhs_ptr]), %%ymm1 \n\t"  // mov rhs to ymm1
        "vpermq $0x44,%%ymm1, %%ymm1 \n\t"     // broadcast low 128 bits to both lanes

        // second Lhs row cell (rows 8..15), same depth pair
        // next LHS cell elements 0 and 1
        "vpmovzxbw 0x10(%[lhs_ptr]), %%ymm0 \n\t"  // mov lhs to ymm0
        "vpshufd $0x00,%%ymm1,%%ymm2        \n\t"  // mov rhs 0 element to all ymm2
        "vpshufd $0x55,%%ymm1,%%ymm3        \n\t"  // mov rhs 1 element to all ymm3
        "vpmaddwd %%ymm0, %%ymm2, %%ymm2    \n\t"  // mul add lhs rhs0 into ymm2
        "vpmaddwd %%ymm0, %%ymm3, %%ymm3    \n\t"  // mul add lhs rhs1 into ymm3
        "vpaddd %%ymm2, %%ymm8, %%ymm8      \n\t"  // add muladd lhs + rhs0 into ymm8
        "vpaddd %%ymm3, %%ymm9, %%ymm9      \n\t"  // add muladd lhs + rhs1 into ymm9
        // next LHS cell elements 2 and 3
        "vpshufd $0xaa,%%ymm1,%%ymm2        \n\t"  // mov rhs 2 element to all ymm2
        "vpshufd $0xff,%%ymm1,%%ymm3        \n\t"  // mov rhs 3 element to all ymm3
        "vpmaddwd %%ymm0, %%ymm2, %%ymm2    \n\t"  // mul add lhs rhs2 into ymm2
        "vpmaddwd %%ymm0, %%ymm3, %%ymm3    \n\t"  // mul add lhs rhs3 into ymm3
        "vpaddd %%ymm2, %%ymm10, %%ymm10    \n\t"  // add muladd lhs + rhs2 into ymm10
        "vpaddd %%ymm3, %%ymm11, %%ymm11    \n\t"  // add muladd lhs + rhs3 into ymm11

        // rhs lower half
        "vpmovzxbw (%[rhs_ptr]), %%ymm1 \n\t"  // mov rhs to ymm1
        "vpermq $0x44,%%ymm1, %%ymm1 \n\t"     // duplicate lower 128 bits

        // third Lhs row cell: next LHS cell elements 0 and 1
        "vpmovzxbw 0x20(%[lhs_ptr]), %%ymm0 \n\t"    // mov lhs to ymm0
        "vpshufd $0x00,%%ymm1,%%ymm2        \n\t"    // mov rhs 0 element to all ymm2
        "vpshufd $0x55,%%ymm1,%%ymm3        \n\t"    // mov rhs 1 element to all ymm3
        "vpmaddwd %%ymm0, %%ymm2, %%ymm2    \n\t"    // mul add lhs rhs0 into ymm2
        "vpmaddwd %%ymm0, %%ymm3, %%ymm3    \n\t"    // mul add lhs rhs1 into ymm3
        "vpaddd %%ymm2, %%ymm12, %%ymm12      \n\t"  // add muladd lhs + rhs0 into ymm12
        "vpaddd %%ymm3, %%ymm13, %%ymm13      \n\t"  // add muladd lhs + rhs1 into ymm13

        // cache prefetch rhs; see if it works better?
        //"prefetcht0 0x80(%[rhs_ptr]) \n\t"

        // next LHS cell elements 2 and 3
        "vpshufd $0xaa,%%ymm1,%%ymm2        \n\t"  // mov rhs 2 element to all ymm2
        "vpshufd $0xff,%%ymm1,%%ymm3        \n\t"  // mov rhs 3 element to all ymm3
        "vpmaddwd %%ymm0, %%ymm2, %%ymm2    \n\t"  // mul add lhs rhs2 into ymm2
        "vpmaddwd %%ymm0, %%ymm3, %%ymm3    \n\t"  // mul add lhs rhs3 into ymm3
        "vpaddd %%ymm2, %%ymm14, %%ymm14    \n\t"  // add muladd lhs + rhs2 into ymm14
        "vpaddd %%ymm3, %%ymm15, %%ymm15    \n\t"  // add muladd lhs + rhs3 into ymm15

        // current result in ymm4--ymm15

        // rhs+0x08 lower half (next depth pair)
        "vpmovzxbw 0x08(%[rhs_ptr]), %%ymm1 \n\t"  // mov rhs to ymm1
        "vpermq $0x44,%%ymm1, %%ymm1 \n\t"         // duplicate lower 128 bits
        // next LHS cell elements 0 and 1
        "vpmovzxbw 0x30(%[lhs_ptr]), %%ymm0 \n\t"  // mov lhs to ymm0
        "vpshufd $0x00,%%ymm1,%%ymm2        \n\t"  // move rhs 0 element to ymm2
        "vpshufd $0x55,%%ymm1,%%ymm3        \n\t"  // move rhs 1 element to ymm3
        "vpmaddwd %%ymm0, %%ymm2, %%ymm2    \n\t"  // muladd lhs rhs0 into ymm2
        "vpmaddwd %%ymm0, %%ymm3, %%ymm3    \n\t"  // muladd lhs rhs1 into ymm3
        "vpaddd %%ymm2, %%ymm4, %%ymm4      \n\t"  // accumulate to ymm4
        "vpaddd %%ymm3, %%ymm5, %%ymm5      \n\t"  // accumulate to ymm5
        // next LHS cell elements 2 and 3
        "vpshufd $0xaa,%%ymm1,%%ymm2        \n\t"  // mov rhs 2 element to ymm2
        "vpshufd $0xff,%%ymm1,%%ymm3        \n\t"  // mov rhs 3 element to ymm3
        "vpmaddwd %%ymm0, %%ymm2, %%ymm2    \n\t"  // mul add lhs rhs2 into ymm2
        "vpmaddwd %%ymm0, %%ymm3, %%ymm3    \n\t"  // mul add lhs rhs3 into ymm3
        "vpaddd %%ymm2, %%ymm6, %%ymm6      \n\t"  // add lhs rhs2 to ymm6
        "vpaddd %%ymm3, %%ymm7, %%ymm7      \n\t"  // add lhs rhs3 to ymm7

        // rhs+0x08 lower half
        "vpmovzxbw 0x08(%[rhs_ptr]), %%ymm1 \n\t"  // mov rhs to ymm1
        "vpermq $0x44,%%ymm1, %%ymm1 \n\t"         // duplicate lower 128 bits

        // next LHS cell elements 0 and 1
        "vpmovzxbw 0x40(%[lhs_ptr]), %%ymm0 \n\t"  // mov lhs to ymm0
        "vpshufd $0x00,%%ymm1,%%ymm2        \n\t"  // move rhs 0 element to ymm2
        "vpshufd $0x55,%%ymm1,%%ymm3        \n\t"  // move rhs 1 element to ymm3
        "vpmaddwd %%ymm0, %%ymm2, %%ymm2    \n\t"  // muladd lhs rhs0 into ymm2
        "vpmaddwd %%ymm0, %%ymm3, %%ymm3    \n\t"  // muladd lhs rhs1 into ymm3
        "vpaddd %%ymm2, %%ymm8, %%ymm8      \n\t"  // accumulate to ymm8
        "vpaddd %%ymm3, %%ymm9, %%ymm9      \n\t"  // accumulate to ymm9
        // next LHS cell elements 2 and 3
        "vpshufd $0xaa,%%ymm1,%%ymm2        \n\t"  // mov rhs 2 element to ymm2
        "vpshufd $0xff,%%ymm1,%%ymm3        \n\t"  // mov rhs 3 element to ymm3
        "vpmaddwd %%ymm0, %%ymm2, %%ymm2    \n\t"  // mul add lhs rhs2 into ymm2
        "vpmaddwd %%ymm0, %%ymm3, %%ymm3    \n\t"  // mul add lhs rhs3 into ymm3
        "vpaddd %%ymm2, %%ymm10, %%ymm10    \n\t"  // add lhs rhs2 to ymm10
        "vpaddd %%ymm3, %%ymm11, %%ymm11    \n\t"  // add lhs rhs3 to ymm11

        "vpmovzxbw 0x08(%[rhs_ptr]), %%ymm1 \n\t"  // mov rhs to ymm1
        "vpermq $0x44,%%ymm1, %%ymm1 \n\t"         // duplicate lower 128 bits
        // next LHS cell elements 0 and 1
        "vpmovzxbw 0x50(%[lhs_ptr]), %%ymm0 \n\t"  // mov lhs to ymm0
        "vpshufd $0x00,%%ymm1,%%ymm2        \n\t"  // move rhs 0 element to ymm2
        "vpshufd $0x55,%%ymm1,%%ymm3        \n\t"  // move rhs 1 element to ymm3
        "vpmaddwd %%ymm0, %%ymm2, %%ymm2    \n\t"  // muladd lhs rhs0 into ymm2
        "vpmaddwd %%ymm0, %%ymm3, %%ymm3    \n\t"  // muladd lhs rhs1 into ymm3
        "vpaddd %%ymm2, %%ymm12, %%ymm12    \n\t"  // accumulate to ymm12
        "vpaddd %%ymm3, %%ymm13, %%ymm13    \n\t"  // accumulate to ymm13

        // next LHS cell elements 2 and 3
        "vpshufd $0xaa,%%ymm1,%%ymm2        \n\t"  // mov rhs 2 element to ymm2
        "vpshufd $0xff,%%ymm1,%%ymm3        \n\t"  // mov rhs 3 element to ymm3
        "vpmaddwd %%ymm0, %%ymm2, %%ymm2    \n\t"  // mul add lhs rhs2 into ymm2
        "vpmaddwd %%ymm0, %%ymm3, %%ymm3    \n\t"  // mul add lhs rhs3 into ymm3
        "vpaddd %%ymm2, %%ymm14, %%ymm14    \n\t"  // add lhs rhs2 to ymm14
        "vpaddd %%ymm3, %%ymm15, %%ymm15    \n\t"  // add lhs rhs3 to ymm15

        // completed both depth pairs of this pass
        "addq $0x60, %[lhs_ptr]             \n\t"  // advance lhs: 2 depth cells x 48 bytes
        "addq $0x10, %[rhs_ptr]             \n\t"  // advance rhs: 2 depth cells x 8 bytes

        "subq $2, %[run_depth_cells] \n\t"  // two depth cells consumed
        "ja outerLoop2%= \n\t"

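        // Unrolled loop done: if no single depth cell remains, skip ahead.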
        "movq %[run_depth_cells], %%r14 \n\t"
        "decq %%r14 \n\t"
        "js finish%= \n\t"  // no depth cells left

        // Loop for K unrolled by 2
        "outerLoop1%=: \n\t"

        // rhs lower
        "vpmovzxbw (%[rhs_ptr]), %%ymm1 \n\t"  // get rhs into ymm1
        "vpermq $0x44,%%ymm1, %%ymm1 \n\t"     // duplicate lower 128 bits

        // LHS cell
        "vpmovzxbw (%[lhs_ptr]), %%ymm0  \n\t"      // load lhs into ymm0
        "vpshufd $0x00,%%ymm1,%%ymm2         \n\t"  // rhs element 0 into ymm2
        "vpshufd $0x55,%%ymm1,%%ymm3         \n\t"  // rhs element 1 into ymm3
        "vpmaddwd %%ymm0, %%ymm2, %%ymm2     \n\t"  // muladd lhs rhs element 0 ymm2
        "vpmaddwd %%ymm0, %%ymm3, %%ymm3     \n\t"  // muladd lhs rhs element 1 ymm3
        "vpaddd %%ymm2, %%ymm4, %%ymm4       \n\t"  // acc element 0 ymm4
        "vpaddd %%ymm3, %%ymm5, %%ymm5       \n\t"  // acc element 1 ymm5
        "vpshufd $0xaa,%%ymm1,%%ymm2         \n\t"  // rhs element 2 into ymm2
        "vpshufd $0xff,%%ymm1,%%ymm3         \n\t"  // rhs element 3 into ymm3
        "vpmaddwd %%ymm0, %%ymm2, %%ymm2     \n\t"  // muladd lhs rhs element 2 ymm2
        "vpmaddwd %%ymm0, %%ymm3, %%ymm3     \n\t"  // muladd lhs rhs element 3 ymm3
        "vpaddd %%ymm2, %%ymm6, %%ymm6       \n\t"  // acc element 2 into ymm6
        "vpaddd %%ymm3, %%ymm7, %%ymm7       \n\t"  // acc element 3 into ymm7

        // lhs+0x10: second row cell
        "vpmovzxbw 0x10(%[lhs_ptr]), %%ymm0  \n\t"  // load lhs into ymm0
        "vpshufd $0x00, %%ymm1, %%ymm2       \n\t"  // rhs element 0 into ymm2
        "vpshufd $0x55, %%ymm1, %%ymm3       \n\t"  // rhs element 1 into ymm3
        "vpmaddwd %%ymm0, %%ymm2, %%ymm2     \n\t"  // muladd lhs rhs element 0 ymm2
        "vpmaddwd %%ymm0, %%ymm3, %%ymm3     \n\t"  // muladd lhs rhs element 1 ymm3
        "vpaddd %%ymm2, %%ymm8, %%ymm8       \n\t"  // acc element 0 ymm8
        "vpaddd %%ymm3, %%ymm9, %%ymm9       \n\t"  // acc element 1 ymm9
        "vpshufd $0xaa,%%ymm1,%%ymm2         \n\t"  // rhs element 2 into ymm2
        "vpshufd $0xff,%%ymm1,%%ymm3         \n\t"  // rhs element 3 into ymm3
        "vpmaddwd %%ymm0, %%ymm2, %%ymm2     \n\t"  // muladd lhs rhs element 2 ymm2
        "vpmaddwd %%ymm0, %%ymm3, %%ymm3     \n\t"  // muladd lhs rhs element 3 ymm3
        "vpaddd %%ymm2, %%ymm10, %%ymm10     \n\t"  // acc element 2 into ymm10
        "vpaddd %%ymm3, %%ymm11, %%ymm11     \n\t"  // acc element 3 into ymm11

        // lhs+0x20: third row cell
        "vpmovzxbw 0x20(%[lhs_ptr]), %%ymm0  \n\t"
        "vpshufd $0x00, %%ymm1, %%ymm2       \n\t"  // rhs element 0 into ymm2
        "vpshufd $0x55, %%ymm1, %%ymm3       \n\t"  // rhs element 1 into ymm3
        "vpmaddwd %%ymm0, %%ymm2, %%ymm2     \n\t"  // muladd lhs rhs element 0 ymm2
        "vpmaddwd %%ymm0, %%ymm3, %%ymm3     \n\t"  // muladd lhs rhs element 1 ymm3
        "vpaddd %%ymm2, %%ymm12, %%ymm12     \n\t"  // acc element 0 ymm12
        "vpaddd %%ymm3, %%ymm13, %%ymm13     \n\t"  // acc element 1 ymm13
        "vpshufd $0xaa,%%ymm1,%%ymm2         \n\t"  // rhs element 2 into ymm2
        "vpshufd $0xff,%%ymm1,%%ymm3         \n\t"  // rhs element 3 into ymm3
        "vpmaddwd %%ymm0, %%ymm2, %%ymm2     \n\t"  // muladd lhs rhs element 2 ymm2
        "vpmaddwd %%ymm0, %%ymm3, %%ymm3     \n\t"  // muladd lhs rhs element 3 ymm3
        "vpaddd %%ymm2, %%ymm14, %%ymm14     \n\t"  // acc element 2 into ymm14
        "vpaddd %%ymm3, %%ymm15, %%ymm15     \n\t"  // acc element 3 into ymm15

        // update matrix pointers
        "addq $0x30, %[lhs_ptr]              \n\t"  // advance lhs: 1 depth cell x 48 bytes
        "addq $0x08, %[rhs_ptr]              \n\t"  // advance rhs: 1 depth cell x 8 bytes

        "decq %[run_depth_cells]             \n\t"
        "jnz outerLoop1%=                    \n\t"

        "finish%=:\n\t"

        "test %[start_depth], %[start_depth] \n\t"
        "jz storeDst%= \n\t"

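        // start_depth != 0 means earlier depth passes have already written
        // partial accumulators to dst; read them back and add before storing.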
        "vpaddd 0x00(%[dst_ptr]), %%ymm4, %%ymm4 \n\t"    // rhs0
        "vpaddd 0x20(%[dst_ptr]), %%ymm8, %%ymm8 \n\t"    // rhs0
        "vpaddd 0x40(%[dst_ptr]), %%ymm12, %%ymm12 \n\t"  // rhs0

        "vpaddd 0x00(%[dst_ptr], %%r12, 1) , %%ymm5, %%ymm5   \n\t"  // rhs1
        "vpaddd 0x20(%[dst_ptr], %%r12, 1) , %%ymm9, %%ymm9   \n\t"  // rhs1
        "vpaddd 0x40(%[dst_ptr], %%r12, 1) , %%ymm13, %%ymm13 \n\t"  // rhs1

        "vpaddd 0x00(%[dst_ptr], %%r12, 2) , %%ymm6, %%ymm6   \n\t"  // rhs2
        "vpaddd 0x20(%[dst_ptr], %%r12, 2) , %%ymm10, %%ymm10 \n\t"  // rhs2
        "vpaddd 0x40(%[dst_ptr], %%r12, 2) , %%ymm14, %%ymm14 \n\t"  // rhs2

        "vpaddd 0x00(%[dst_ptr], %%r13, 1) , %%ymm7, %%ymm7   \n\t"  // rhs3
        "vpaddd 0x20(%[dst_ptr], %%r13, 1) , %%ymm11, %%ymm11 \n\t"  // rhs3
        "vpaddd 0x40(%[dst_ptr], %%r13, 1) , %%ymm15, %%ymm15 \n\t"  // rhs3

        "storeDst%=:\n\t"

        "vmovdqu %%ymm4, 0x00(%[dst_ptr])            \n\t"  // rhs0
        "vmovdqu %%ymm8, 0x20(%[dst_ptr])            \n\t"  // rhs0
        "vmovdqu %%ymm12, 0x40(%[dst_ptr])           \n\t"  // rhs0

        "vmovdqu %%ymm5, 0x00(%[dst_ptr], %%r12, 1)  \n\t"  // rhs1
        "vmovdqu %%ymm9, 0x20(%[dst_ptr], %%r12, 1)  \n\t"  // rhs1
        "vmovdqu %%ymm13, 0x40(%[dst_ptr], %%r12, 1) \n\t"  // rhs1

        "vmovdqu %%ymm6, 0x00(%[dst_ptr], %%r12, 2)  \n\t"  // rhs2
        "vmovdqu %%ymm10, 0x20(%[dst_ptr], %%r12, 2) \n\t"  // rhs2
        "vmovdqu %%ymm14, 0x40(%[dst_ptr], %%r12, 2) \n\t"  // rhs2

        "vmovdqu %%ymm7, 0x00(%[dst_ptr], %%r13, 1)  \n\t"  // rhs3
        "vmovdqu %%ymm11, 0x20(%[dst_ptr], %%r13, 1) \n\t"  // rhs3
        "vmovdqu %%ymm15, 0x40(%[dst_ptr], %%r13, 1) \n\t"  // rhs3

        :  // outputs
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [dst_ptr] "+r"(dst_ptr),
        // run_depth_cells is decremented by the asm, so it must be
        // an in/out operand rather than an input.
        [run_depth_cells] "+r"(run_depth_cells)
        :  // inputs
        [start_depth] "r"(start_depth), [dst_col_stride_q] "r"(dst_col_stride_q)
        :  // clobbers
        "cc", "memory", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7",
        "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "%r12",
        "%r13", "%r14");
  }
};
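
// A minimal usage sketch (hypothetical values; in gemmlowp the packed
// operand blocks and the int32 destination block are prepared by the
// packing/unpacking stages, not shown here):
//
//   AVX2_64_Kernel24x8Depth2 kernel;
//   // dst: 24 rows x 4 columns of int32, row stride 1, column stride 24.
//   kernel.Run(dst, 1, 24, packed_lhs, packed_rhs, 0, run_depth);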
#endif

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_KERNEL_AVX_H_