• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1From 043e4263bf4bcc6dd31a257da8f2a5f547ad72cd Mon Sep 17 00:00:00 2001
2From: chengfeng27 <chengfeng27@huawei.com>
3Date: Tue, 30 Jul 2024 17:14:13 +0800
4Subject: [PATCH] fix matmul assembly
5
6---
7 .../nnacl/assembly/arm64/BigMatmulFp32Opt.S   | 22 +++++++++----------
8 .../kernel/nnacl/assembly/arm64/MatmulFp32.S  | 14 ++++++------
9 .../nnacl/assembly/arm64/MatmulFp32Opt.S      | 14 ++++++------
10 .../nnacl/assembly/arm64/MatmulFp32OptRow12.S | 14 ++++++------
11 .../nnacl/assembly/arm64/MatmulFp32OptRow4.S  | 16 +++++++-------
12 .../nnacl/assembly/arm64/MatmulFp32OptRow8.S  | 14 ++++++------
13 .../nnacl/assembly/arm64/MatmulWinogradFp32.S |  7 +++---
14 7 files changed, 50 insertions(+), 51 deletions(-)
15
16diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/BigMatmulFp32Opt.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/BigMatmulFp32Opt.S
17index 498038ff..03898585 100644
18--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/BigMatmulFp32Opt.S
19+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/BigMatmulFp32Opt.S
20@@ -33,16 +33,17 @@
21
22 asm_function BigMatmulFloatNeon64Opt
23     sub sp, sp, #224
24-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
25-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
26-    stp x19, x20, [sp], #16
27-    stp x21, x22, [sp], #16
28-    stp x23, x24, [sp], #16
29-    stp x25, x26, [sp], #16
30-    stp x27, x28, [sp], #16
31-    stp x29, x30, [sp], #16
32-
33-    ldr x8, [sp]
34+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
35+    add x9, sp, #64
36+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
37+    stp x19, x20, [sp, #128]
38+    stp x21, x22, [sp, #144]
39+    stp x23, x24, [sp, #160]
40+    stp x25, x26, [sp, #176]
41+    stp x27, x28, [sp, #192]
42+    stp x29, x30, [sp, #208]
43+
44+    ldr x8, [sp, #224]
45     mov x20, #1
46     mov x22, #32
47     mov x23, #48
48@@ -2515,7 +2516,6 @@ Compute4x4Unit:
49         ret
50
51 End:
52-  sub sp, sp, #224
53   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
54   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
55   ldp x19, x20, [sp], #16
56diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32.S
57index 67d20dcc..2dedccd0 100644
58--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32.S
59+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32.S
60@@ -34,17 +34,18 @@
61
62 asm_function MatmulFloatNeon64
63   sub sp, sp, #144
64-  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
65-  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
66-  stp x19, x20, [sp], #16
67+  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
68+  add x9, sp, #64
69+  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
70+  stp x19, x20, [sp, #128]
71
72-  ldr x9, [sp, #8]
73-  ldr x14, [sp, #16]
74+  ldr x9, [sp, #152]
75+  ldr x14, [sp, #160]
76
77   mov w19, #32 // sizeof(float) * 8
78   mul w15, w5, w19 // block stride of lhs/rhs: sizeof(float) * 8 * depth
79   mov x19, #4
80-  ldr x17, [sp]
81+  ldr x17, [sp, #144]
82   cbz x14, NoWinoSteps
83   mul x8, x7, x17
84   mov x11, #8
85@@ -779,7 +780,6 @@ NoDstStep:
86   bgt L1
87
88 End1:
89-  sub sp, sp, #144
90   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
91   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
92   ldp x19, x20, [sp], #16
93diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32Opt.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32Opt.S
94index 6937f4ba..6e2d8846 100644
95--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32Opt.S
96+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32Opt.S
97@@ -34,13 +34,14 @@
98
99 asm_function MatmulFloatNeon64Opt
100     sub sp, sp, #160
101-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
102-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
103-    stp x19, x20, [sp], #16
104-    stp x21, x22, [sp], #16
105+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
106+    add x9, sp, #64
107+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
108+    stp x19, x20, [sp, #128]
109+    stp x21, x22, [sp, #144]
110
111-    ldr x8, [sp]
112-    ldr x9, [sp, #8]
113+    ldr x8, [sp, #160]
114+    ldr x9, [sp, #168]
115
116     mov x21, #48 // sizeof(float) * 12
117     mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth
118@@ -1659,7 +1660,6 @@ LoopColEnd:
119         subs x6, x6, #12
120         bgt LoopRowStart
121
122-  sub sp, sp, #160
123   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
124   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
125   ldp x19, x20, [sp], #16
126diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow12.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow12.S
127index c9151a99..05465bd1 100644
128--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow12.S
129+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow12.S
130@@ -34,13 +34,14 @@
131
132 asm_function MatmulFloatNeon64OptRow12
133     sub sp, sp, #160
134-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
135-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
136-    stp x19, x20, [sp], #16
137-    stp x21, x22, [sp], #16
138+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
139+    add x9, sp, #64
140+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
141+    stp x19, x20, [sp, #128]
142+    stp x21, x22, [sp, #144]
143
144-    ldr x8, [sp]
145-    ldr x9, [sp, #8]
146+    ldr x8, [sp, #160]
147+    ldr x9, [sp, #168]
148
149     mov x21, #48 // sizeof(float) * 12
150     mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth
151@@ -1220,7 +1221,6 @@ LoopColEnd:
152         subs x6, x6, #12
153         bgt LoopRow
154
155-  sub sp, sp, #160
156   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
157   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
158   ldp x19, x20, [sp], #16
159diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow4.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow4.S
160index 0cc49fb9..b984c494 100644
161--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow4.S
162+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow4.S
163@@ -19,7 +19,7 @@
164 .text
165 .align 5
166
167-// void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth
168+// void MatmulFloatNeon64OptRow4(const float *a, const float *b, float *c, const float *bias, int act_type, int depth,
169 //                        int row, int col, size_t stride, size_t writeMode)
170 // x0: a
171 // x1: b
172@@ -34,13 +34,14 @@
173
174 asm_function MatmulFloatNeon64OptRow4
175     sub sp, sp, #160
176-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
177-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
178-    stp x19, x20, [sp], #16
179-    stp x21, x22, [sp], #16
180+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
181+    add x9, sp, #64
182+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
183+    stp x19, x20, [sp, #128]
184+    stp x21, x22, [sp, #144]
185
186-    ldr x8, [sp]
187-    ldr x9, [sp, #8]
188+    ldr x8, [sp, #160]
189+    ldr x9, [sp, #168]
190
191     mov x21, #48 // sizeof(float) * 12
192
193@@ -588,7 +589,6 @@ LoopColEnd:
194         subs x6, x6, #12
195         bgt LoopRow4
196
197-  sub sp, sp, #160
198   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
199   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
200   ldp x19, x20, [sp], #16
201diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow8.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow8.S
202index a9e42a54..c5b260c0 100644
203--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow8.S
204+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow8.S
205@@ -34,13 +34,14 @@
206
207 asm_function MatmulFloatNeon64OptRow8
208     sub sp, sp, #160
209-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
210-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
211-    stp x19, x20, [sp], #16
212-    stp x21, x22, [sp], #16
213+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
214+    add x9, sp, #64
215+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
216+    stp x19, x20, [sp, #128]
217+    stp x21, x22, [sp, #144]
218
219-    ldr x8, [sp]
220-    ldr x9, [sp, #8]
221+    ldr x8, [sp, #160]
222+    ldr x9, [sp, #168]
223
224     mov x21, #48 // sizeof(float) * 12
225     mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth
226@@ -902,7 +903,6 @@ LoopColEnd:
227         subs x6, x6, #12
228         bgt LoopCol8
229
230-  sub sp, sp, #160
231   ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
232   ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
233   ldp x19, x20, [sp], #16
234diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulWinogradFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulWinogradFp32.S
235index 0b814ce4..23032ab9 100644
236--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulWinogradFp32.S
237+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulWinogradFp32.S
238@@ -27,9 +27,9 @@ asm_function MatrixMultiplyWinograd
239     // x19 ~ x29 should be also preserved
240     // whereas our coding style do not permit such amount of parameters
241     sub sp, sp, #48
242-    st1 {v8.4s}, [sp], #16
243-    stp x19, x20, [sp], #16
244-    stp x21, x22, [sp], #16
245+    st1 {v8.4s}, [sp]
246+    stp x19, x20, [sp, #16]
247+    stp x21, x22, [sp, #32]
248     mov x8, #4
249     mul x10, x5, x8
250     mov x17, x3  // m
251@@ -176,7 +176,6 @@ asm_function MatrixMultiplyWinograd
252             add x0, x0, x21
253             b LoopM
254     EndLoopM:
255-        sub sp, sp, #48
256         ld1 {v8.4s}, [sp], #16
257         ldp x19, x20, [sp], #16
258         ldp x21, x22, [sp], #16
259--
2602.17.1
261
262