1 // Copyright 2015, VIXL authors
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are met:
6 //
7 // * Redistributions of source code must retain the above copyright notice,
8 // this list of conditions and the following disclaimer.
9 // * Redistributions in binary form must reproduce the above copyright notice,
10 // this list of conditions and the following disclaimer in the documentation
11 // and/or other materials provided with the distribution.
12 // * Neither the name of ARM Limited nor the names of its contributors may be
13 // used to endorse or promote products derived from this software without
14 // specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
17 // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
27 #include "examples.h"
28
29 #define __ masm->
30
31 // A vector by scalar multiply helper routine to generate code for
32 // the multiplication of each column of the resulting 4x4 matrix.
33 // This function provides a template for the following pattern:
34 //
35 // __ Fmul(v<v_out>.V4S(), v4.V4S(), v<s_column>.S(), 0);
36 // __ Fmla(v<v_out>.V4S(), v5.V4S(), v<s_column>.S(), 1);
37 // __ Fmla(v<v_out>.V4S(), v6.V4S(), v<s_column>.S(), 2);
38 // __ Fmla(v<v_out>.V4S(), v7.V4S(), v<s_column>.S(), 3);
39 //
40 // v<v_out> corresponds to a column of the output matrix (v0, v1, v2 or v3).
41 // v<s_column> corresponds to a column of the 2nd input (v16, v17, v18 or v19).
42 //
GenerateMultiplyColumn(MacroAssembler * masm,unsigned out_column,unsigned in_column)43 static void GenerateMultiplyColumn(MacroAssembler* masm,
44 unsigned out_column,
45 unsigned in_column) {
46 // 'v_out' splits a Q register into 4 lanes of 32 bits each.
47 VRegister v_out = VRegister(out_column, kQRegSize, 4);
48 // 'v_in' refers to a single 32 bit 'S' lane.
49 VRegister v_in = VRegister(in_column, kSRegSize);
50
51 __ Fmul(v_out, v4.V4S(), v_in, 0); // e.g. (v0.V4S(), v4.V4S(), v8.S(), 0).
52 __ Fmla(v_out, v5.V4S(), v_in, 1);
53 __ Fmla(v_out, v6.V4S(), v_in, 2);
54 __ Fmla(v_out, v7.V4S(), v_in, 3);
55 }
56
GenerateNEONMatrixMultiply(MacroAssembler * masm)57 void GenerateNEONMatrixMultiply(MacroAssembler* masm) {
58 // Argument location:
59 // dst -> x0
60 // mat1 -> x1
61 // mat2 -> x2
62
63 Label end;
64
65 __ And(x3, x0, x1);
66 __ And(x3, x3, x2);
67 __ Cbz(x3, &end); // Nothing to do if an input is null.
68
69 // Load the first matrix into v4, v5, v6 and v7.
70 __ Ld1(v4.V4S(), v5.V4S(), v6.V4S(), v7.V4S(), MemOperand(x1));
71 // Load the first matrix into v16, v17, v18 and v19.
72 __ Ld1(v16.V4S(), v17.V4S(), v18.V4S(), v19.V4S(), MemOperand(x2));
73
74 // Initialise vectors of the output matrix with zeros.
75 // This is only for the purposes of showing how this can be achived
76 // but technically this is not required because we overwrite all lanes
77 // of the output vectors.
78 __ Movi(v0.V16B(), 0);
79 __ Movi(v1.V16B(), 0);
80 __ Movi(v2.V16B(), 0);
81 __ Movi(v3.V16B(), 0);
82
83 GenerateMultiplyColumn(masm, 0, 16);
84 GenerateMultiplyColumn(masm, 1, 17);
85 GenerateMultiplyColumn(masm, 2, 18);
86 GenerateMultiplyColumn(masm, 3, 19);
87
88 // Store the resulting matrix from v0, v1, v2 and v3.
89 __ St1(v0.V4S(), v1.V4S(), v2.V4S(), v3.V4S(), MemOperand(x0));
90
91 __ Bind(&end);
92 __ Ret();
93 }
94
95
96 #ifndef TEST_EXAMPLES
97 #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
main(void)98 int main(void) {
99 MacroAssembler masm;
100 Decoder decoder;
101 Simulator simulator(&decoder);
102
103 // Generate the code for the example function.
104 Label neon_matrix_multiply;
105 masm.Bind(&neon_matrix_multiply);
106 GenerateNEONMatrixMultiply(&masm);
107 masm.FinalizeCode();
108
109 // Define the required variables and run the example function.
110 const int kRowSize = 4;
111 const int kColSize = 4;
112 const int kLength = kRowSize * kColSize;
113
114 float mat1[kLength], mat2[kLength], output[kLength];
115
116 // Initialise the output matrix to the zero matrix.
117 memset(output, 0, sizeof(output[0]) * kLength);
118
119 // Fill the two input matrices with some 32 bit floating point values.
120 // Array initialisation using curly brackets is also possible like so:
121 // float mat1[kLength] = { 1.0f, 52.03f, 4.43f, ... };
122 // However, the following way better shows the "column-major" arrangement.
123
124 mat1[0] = 1.0f;
125 mat1[4] = 2.0f;
126 mat1[8] = 3.0f;
127 mat1[12] = 4.0f;
128 mat1[1] = 52.03f;
129 mat1[5] = 12.24f;
130 mat1[9] = 53.56f;
131 mat1[13] = 22.22f;
132 mat1[2] = 4.43f;
133 mat1[6] = 5.00f;
134 mat1[10] = 7.00f;
135 mat1[14] = 3.11f;
136 mat1[3] = 43.47f;
137 mat1[7] = 10.97f;
138 mat1[11] = 37.78f;
139 mat1[15] = 90.91f;
140
141 mat2[0] = 1.0f;
142 mat2[4] = 11.24f;
143 mat2[8] = 21.00f;
144 mat2[12] = 21.31f;
145 mat2[1] = 2.0f;
146 mat2[5] = 2.24f;
147 mat2[9] = 8.56f;
148 mat2[13] = 52.03f;
149 mat2[2] = 3.0f;
150 mat2[6] = 51.00f;
151 mat2[10] = 21.00f;
152 mat2[14] = 33.11f;
153 mat2[3] = 4.0f;
154 mat2[7] = 0.00f;
155 mat2[11] = 84.00f;
156 mat2[15] = 1.97f;
157
158 simulator.ResetState();
159 simulator.WriteXRegister(0, reinterpret_cast<uintptr_t>(output));
160 simulator.WriteXRegister(1, reinterpret_cast<uintptr_t>(mat1));
161 simulator.WriteXRegister(2, reinterpret_cast<uintptr_t>(mat2));
162 simulator.RunFrom(masm.GetLabelAddress<Instruction*>(&neon_matrix_multiply));
163
164 // Print the 4x4 output matrix along with both 4x4 input matrices.
165 for (int i = 0; i < kRowSize; i++) {
166 printf(
167 "| %8.2f %8.2f %8.2f %8.2f | "
168 "| %8.2f %8.2f %8.2f %8.2f | "
169 "| %8.2f %8.2f %8.2f %8.2f |\n",
170 mat1[i],
171 mat1[4 + i],
172 mat1[8 + i],
173 mat1[12 + i],
174 mat2[i],
175 mat2[4 + i],
176 mat2[8 + i],
177 mat2[12 + i],
178 output[i],
179 output[4 + i],
180 output[8 + i],
181 output[12 + i]);
182 if (i == 0 || i == 2) {
183 printf(
184 "| | "
185 "| | "
186 "| |\n");
187 } else if (i == 1) {
188 printf(
189 "| | x "
190 "| | = "
191 "| |\n");
192 }
193 }
194
195 return 0;
196 }
197 #else
// There is nothing to run or test when the simulator is not available, so
// simply report success.
int main(void) {
  return 0;
}
200 #endif // VIXL_INCLUDE_SIMULATOR_AARCH64
201 #endif // TEST_EXAMPLES
202