From 043e4263bf4bcc6dd31a257da8f2a5f547ad72cd Mon Sep 17 00:00:00 2001
From: chengfeng27 <chengfeng27@huawei.com>
Date: Tue, 30 Jul 2024 17:14:13 +0800
Subject: [PATCH] fix matmul assemble

The prologues of the arm64 matmul assembly kernels reserved a frame with
"sub sp, sp, #N" and then saved the callee-saved registers with
post-indexed stores ("[sp], #64" / "[sp], #16"). The post-increments add
up to exactly N, so at the end of the prologue sp was back at its entry
value and the saved registers were located below the stack pointer for
the whole function body. AArch64 has no red zone on most platforms, so
anything that uses the stack asynchronously could overwrite them.

Save the registers at fixed offsets from sp instead, so that sp keeps
pointing at the bottom of the frame:

  * the second st1 goes through "add x9, sp, #64" because the
    multi-register st1 only accepts a plain base register or a
    post-index; the kernels that take x9 as an argument load it from
    the stack after this point;
  * arguments passed on the stack are now read from above the frame
    ([sp, #N], [sp, #N + 8], ...) instead of from [sp];
  * the "sub sp, sp, #N" at the start of every epilogue is removed,
    since sp already points at the save area when the post-indexed
    restores begin.

Also correct the function name in the MatmulFp32OptRow4.S header
comment.

---
 .../nnacl/assembly/arm64/BigMatmulFp32Opt.S   | 22 +++++++++----------
 .../kernel/nnacl/assembly/arm64/MatmulFp32.S  | 14 ++++++------
 .../nnacl/assembly/arm64/MatmulFp32Opt.S      | 14 ++++++------
 .../nnacl/assembly/arm64/MatmulFp32OptRow12.S | 14 ++++++------
 .../nnacl/assembly/arm64/MatmulFp32OptRow4.S  | 16 +++++++-------
 .../nnacl/assembly/arm64/MatmulFp32OptRow8.S  | 14 ++++++------
 .../nnacl/assembly/arm64/MatmulWinogradFp32.S |  7 +++---
 7 files changed, 50 insertions(+), 51 deletions(-)

diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/BigMatmulFp32Opt.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/BigMatmulFp32Opt.S
index 498038ff..03898585 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/BigMatmulFp32Opt.S
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/BigMatmulFp32Opt.S
@@ -33,16 +33,17 @@
 
 asm_function BigMatmulFloatNeon64Opt
     sub sp, sp, #224
-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
-    stp x19, x20, [sp], #16
-    stp x21, x22, [sp], #16
-    stp x23, x24, [sp], #16
-    stp x25, x26, [sp], #16
-    stp x27, x28, [sp], #16
-    stp x29, x30, [sp], #16
-
-    ldr x8, [sp]
+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
+    add x9, sp, #64
+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
+    stp x19, x20, [sp, #128]
+    stp x21, x22, [sp, #144]
+    stp x23, x24, [sp, #160]
+    stp x25, x26, [sp, #176]
+    stp x27, x28, [sp, #192]
+    stp x29, x30, [sp, #208]
+
+    ldr x8, [sp, #224]
     mov x20, #1
     mov x22, #32
     mov x23, #48
@@ -2515,7 +2516,6 @@ Compute4x4Unit:
     ret
 
 End:
-    sub sp, sp, #224
     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
     ldp x19, x20, [sp], #16
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32.S
index 67d20dcc..2dedccd0 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32.S
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32.S
@@ -34,17 +34,18 @@
 
 asm_function MatmulFloatNeon64
     sub sp, sp, #144
-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
-    stp x19, x20, [sp], #16
+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
+    add x9, sp, #64
+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
+    stp x19, x20, [sp, #128]
 
-    ldr x9, [sp, #8]
-    ldr x14, [sp, #16]
+    ldr x9, [sp, #152]
+    ldr x14, [sp, #160]
 
     mov w19, #32 // sizeof(float) * 8
     mul w15, w5, w19 // block stride of lhs/rhs: sizeof(float) * 8 * depth
     mov x19, #4
-    ldr x17, [sp]
+    ldr x17, [sp, #144]
     cbz x14, NoWinoSteps
     mul x8, x7, x17
     mov x11, #8
@@ -779,7 +780,6 @@ NoDstStep:
     bgt L1
 
 End1:
-    sub sp, sp, #144
     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
     ldp x19, x20, [sp], #16
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32Opt.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32Opt.S
index 6937f4ba..6e2d8846 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32Opt.S
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32Opt.S
@@ -34,13 +34,14 @@
 
 asm_function MatmulFloatNeon64Opt
     sub sp, sp, #160
-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
-    stp x19, x20, [sp], #16
-    stp x21, x22, [sp], #16
+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
+    add x9, sp, #64
+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
+    stp x19, x20, [sp, #128]
+    stp x21, x22, [sp, #144]
 
-    ldr x8, [sp]
-    ldr x9, [sp, #8]
+    ldr x8, [sp, #160]
+    ldr x9, [sp, #168]
 
     mov x21, #48 // sizeof(float) * 12
     mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth
@@ -1659,7 +1660,6 @@ LoopColEnd:
     subs x6, x6, #12
     bgt LoopRowStart
 
-    sub sp, sp, #160
     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
     ldp x19, x20, [sp], #16
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow12.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow12.S
index c9151a99..05465bd1 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow12.S
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow12.S
@@ -34,13 +34,14 @@
 
 asm_function MatmulFloatNeon64OptRow12
     sub sp, sp, #160
-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
-    stp x19, x20, [sp], #16
-    stp x21, x22, [sp], #16
+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
+    add x9, sp, #64
+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
+    stp x19, x20, [sp, #128]
+    stp x21, x22, [sp, #144]
 
-    ldr x8, [sp]
-    ldr x9, [sp, #8]
+    ldr x8, [sp, #160]
+    ldr x9, [sp, #168]
 
     mov x21, #48 // sizeof(float) * 12
     mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth
@@ -1220,7 +1221,6 @@ LoopColEnd:
     subs x6, x6, #12
     bgt LoopRow
 
-    sub sp, sp, #160
     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
     ldp x19, x20, [sp], #16
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow4.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow4.S
index 0cc49fb9..b984c494 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow4.S
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow4.S
@@ -19,7 +19,7 @@
 .text
 .align 5
 
-// void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth
+// void MatmulFloatNeon64OptRow4(const float *a, const float *b, float *c, const float *bias, int act_type, int depth
 //                        int row, int col, size_t stride, size_t writeMode)
 // x0: a
 // x1: b
@@ -34,13 +34,14 @@
 
 asm_function MatmulFloatNeon64OptRow4
     sub sp, sp, #160
-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
-    stp x19, x20, [sp], #16
-    stp x21, x22, [sp], #16
+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
+    add x9, sp, #64
+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
+    stp x19, x20, [sp, #128]
+    stp x21, x22, [sp, #144]
 
-    ldr x8, [sp]
-    ldr x9, [sp, #8]
+    ldr x8, [sp, #160]
+    ldr x9, [sp, #168]
 
     mov x21, #48 // sizeof(float) * 12
 
@@ -588,7 +589,6 @@ LoopColEnd:
     subs x6, x6, #12
     bgt LoopRow4
 
-    sub sp, sp, #160
     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
     ldp x19, x20, [sp], #16
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow8.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow8.S
index a9e42a54..c5b260c0 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow8.S
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulFp32OptRow8.S
@@ -34,13 +34,14 @@
 
 asm_function MatmulFloatNeon64OptRow8
     sub sp, sp, #160
-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
-    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
-    stp x19, x20, [sp], #16
-    stp x21, x22, [sp], #16
+    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
+    add x9, sp, #64
+    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]
+    stp x19, x20, [sp, #128]
+    stp x21, x22, [sp, #144]
 
-    ldr x8, [sp]
-    ldr x9, [sp, #8]
+    ldr x8, [sp, #160]
+    ldr x9, [sp, #168]
 
     mov x21, #48 // sizeof(float) * 12
     mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth
@@ -902,7 +903,6 @@ LoopColEnd:
     subs x6, x6, #12
     bgt LoopCol8
 
-    sub sp, sp, #160
     ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
     ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
     ldp x19, x20, [sp], #16
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulWinogradFp32.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulWinogradFp32.S
index 0b814ce4..23032ab9 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulWinogradFp32.S
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/arm64/MatmulWinogradFp32.S
@@ -27,9 +27,9 @@ asm_function MatrixMultiplyWinograd
     // x19 ~ x29 should be also preserved
     // whereas our coding style do not permit such amount of parameters
     sub sp, sp, #48
-    st1 {v8.4s}, [sp], #16
-    stp x19, x20, [sp], #16
-    stp x21, x22, [sp], #16
+    st1 {v8.4s}, [sp]
+    stp x19, x20, [sp, #16]
+    stp x21, x22, [sp, #32]
     mov x8, #4
     mul x10, x5, x8
     mov x17, x3 // m
@@ -176,7 +176,6 @@ asm_function MatrixMultiplyWinograd
         add x0, x0, x21
         b LoopM
     EndLoopM:
-        sub sp, sp, #48
         ld1 {v8.4s}, [sp], #16
         ldp x19, x20, [sp], #16
         ldp x21, x22, [sp], #16
-- 
2.17.1

