• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2015, VIXL authors
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are met:
6 //
7 //   * Redistributions of source code must retain the above copyright notice,
8 //     this list of conditions and the following disclaimer.
9 //   * Redistributions in binary form must reproduce the above copyright notice,
10 //     this list of conditions and the following disclaimer in the documentation
11 //     and/or other materials provided with the distribution.
12 //   * Neither the name of ARM Limited nor the names of its contributors may be
13 //     used to endorse or promote products derived from this software without
14 //     specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
17 // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 
27 #include "examples.h"
28 
29 using namespace vixl;
30 using namespace vixl::aarch64;
31 
32 #define __ masm->
33 
34 // A vector by scalar multiply helper routine to generate code for
35 // the multiplication of each column of the resulting 4x4 matrix.
36 // This function provides a template for the following pattern:
37 //
38 // __ Fmul(v<v_out>.V4S(), v4.V4S(),  v<s_column>.S(), 0);
39 // __ Fmla(v<v_out>.V4S(), v5.V4S(),  v<s_column>.S(), 1);
40 // __ Fmla(v<v_out>.V4S(), v6.V4S(),  v<s_column>.S(), 2);
41 // __ Fmla(v<v_out>.V4S(), v7.V4S(),  v<s_column>.S(), 3);
42 //
43 // v<v_out> corresponds to a column of the output matrix (v0, v1, v2 or v3).
44 // v<s_column> corresponds to a column of the 2nd input (v16, v17, v18 or v19).
45 //
GenerateMultiplyColumn(MacroAssembler * masm,unsigned out_column,unsigned in_column)46 static void GenerateMultiplyColumn(MacroAssembler* masm,
47                                    unsigned out_column,
48                                    unsigned in_column) {
49   // 'v_out' splits a Q register into 4 lanes of 32 bits each.
50   VRegister v_out = VRegister(out_column, kQRegSize, 4);
51   // 'v_in' refers to a single 32 bit 'S' lane.
52   VRegister v_in = VRegister(in_column, kSRegSize);
53 
54   __ Fmul(v_out, v4.V4S(), v_in, 0);  // e.g. (v0.V4S(), v4.V4S(),  v8.S(), 0).
55   __ Fmla(v_out, v5.V4S(), v_in, 1);
56   __ Fmla(v_out, v6.V4S(), v_in, 2);
57   __ Fmla(v_out, v7.V4S(), v_in, 3);
58 }
59 
GenerateNEONMatrixMultiply(MacroAssembler * masm)60 void GenerateNEONMatrixMultiply(MacroAssembler* masm) {
61   // Argument location:
62   //   dst  -> x0
63   //   mat1 -> x1
64   //   mat2 -> x2
65 
66   Label end;
67 
68   __ And(x3, x0, x1);
69   __ And(x3, x3, x2);
70   __ Cbz(x3, &end);  // Nothing to do if an input is null.
71 
72   // Load the first matrix into v4, v5, v6 and v7.
73   __ Ld1(v4.V4S(), v5.V4S(), v6.V4S(), v7.V4S(), MemOperand(x1));
74   // Load the first matrix into v16, v17, v18 and v19.
75   __ Ld1(v16.V4S(), v17.V4S(), v18.V4S(), v19.V4S(), MemOperand(x2));
76 
77   // Initialise vectors of the output matrix with zeros.
78   // This is only for the purposes of showing how this can be achived
79   // but technically this is not required because we overwrite all lanes
80   // of the output vectors.
81   __ Movi(v0.V16B(), 0);
82   __ Movi(v1.V16B(), 0);
83   __ Movi(v2.V16B(), 0);
84   __ Movi(v3.V16B(), 0);
85 
86   GenerateMultiplyColumn(masm, 0, 16);
87   GenerateMultiplyColumn(masm, 1, 17);
88   GenerateMultiplyColumn(masm, 2, 18);
89   GenerateMultiplyColumn(masm, 3, 19);
90 
91   // Store the resulting matrix from v0, v1, v2 and v3.
92   __ St1(v0.V4S(), v1.V4S(), v2.V4S(), v3.V4S(), MemOperand(x0));
93 
94   __ Bind(&end);
95   __ Ret();
96 }
97 
98 
99 #ifndef TEST_EXAMPLES
100 #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
main(void)101 int main(void) {
102   MacroAssembler masm;
103   Decoder decoder;
104   Simulator simulator(&decoder);
105 
106   // Generate the code for the example function.
107   Label neon_matrix_multiply;
108   masm.Bind(&neon_matrix_multiply);
109   GenerateNEONMatrixMultiply(&masm);
110   masm.FinalizeCode();
111 
112   // Define the required variables and run the example function.
113   const int kRowSize = 4;
114   const int kColSize = 4;
115   const int kLength = kRowSize * kColSize;
116 
117   float mat1[kLength], mat2[kLength], output[kLength];
118 
119   // Initialise the output matrix to the zero matrix.
120   memset(output, 0, sizeof(output[0]) * kLength);
121 
122   // Fill the two input matrices with some 32 bit floating point values.
123   // Array initialisation using curly brackets is also possible like so:
124   //   float mat1[kLength] = { 1.0f, 52.03f, 4.43f, ... };
125   // However, the following way better shows the "column-major" arrangement.
126 
127   mat1[0] = 1.0f;
128   mat1[4] = 2.0f;
129   mat1[8] = 3.0f;
130   mat1[12] = 4.0f;
131   mat1[1] = 52.03f;
132   mat1[5] = 12.24f;
133   mat1[9] = 53.56f;
134   mat1[13] = 22.22f;
135   mat1[2] = 4.43f;
136   mat1[6] = 5.00f;
137   mat1[10] = 7.00f;
138   mat1[14] = 3.11f;
139   mat1[3] = 43.47f;
140   mat1[7] = 10.97f;
141   mat1[11] = 37.78f;
142   mat1[15] = 90.91f;
143 
144   mat2[0] = 1.0f;
145   mat2[4] = 11.24f;
146   mat2[8] = 21.00f;
147   mat2[12] = 21.31f;
148   mat2[1] = 2.0f;
149   mat2[5] = 2.24f;
150   mat2[9] = 8.56f;
151   mat2[13] = 52.03f;
152   mat2[2] = 3.0f;
153   mat2[6] = 51.00f;
154   mat2[10] = 21.00f;
155   mat2[14] = 33.11f;
156   mat2[3] = 4.0f;
157   mat2[7] = 0.00f;
158   mat2[11] = 84.00f;
159   mat2[15] = 1.97f;
160 
161   simulator.ResetState();
162   simulator.WriteXRegister(0, reinterpret_cast<uintptr_t>(output));
163   simulator.WriteXRegister(1, reinterpret_cast<uintptr_t>(mat1));
164   simulator.WriteXRegister(2, reinterpret_cast<uintptr_t>(mat2));
165   simulator.RunFrom(masm.GetLabelAddress<Instruction*>(&neon_matrix_multiply));
166 
167   // Print the 4x4 output matrix along with both 4x4 input matrices.
168   for (int i = 0; i < kRowSize; i++) {
169     printf(
170         "| %8.2f %8.2f %8.2f %8.2f |   "
171         "| %8.2f %8.2f %8.2f %8.2f |       "
172         "| %8.2f %8.2f %8.2f %8.2f |\n",
173         mat1[i],
174         mat1[4 + i],
175         mat1[8 + i],
176         mat1[12 + i],
177         mat2[i],
178         mat2[4 + i],
179         mat2[8 + i],
180         mat2[12 + i],
181         output[i],
182         output[4 + i],
183         output[8 + i],
184         output[12 + i]);
185     if (i == 0 || i == 2) {
186       printf(
187           "|                                     |   "
188           "|                                     |       "
189           "|                                     |\n");
190     } else if (i == 1) {
191       printf(
192           "|                                     | x "
193           "|                                     |   =   "
194           "|                                     |\n");
195     }
196   }
197 
198   return 0;
199 }
200 #else
201 // Without the simulator there is nothing to test.
int main(void) {
  // This variant is compiled when VIXL_INCLUDE_SIMULATOR_AARCH64 is not
  // defined: it does nothing and reports success.
  return 0;
}
203 #endif  // VIXL_INCLUDE_SIMULATOR_AARCH64
204 #endif  // TEST_EXAMPLES
205