1 /* Copyright 2019 Google LLC. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 // Internal types and helpers for matrices.
17 //
18 // Ruy has a couple slightly different notions of matrices, besides the
19 // Matrix<T> class that we expose to the user-facing API.
20 //
21 // TODO(silvasean): Put parts of this architecture description somewhere more
22 // prominent.
23 //
24 // The 4 main matrix types are:
25 // - Matrix<T>: This is a user-facing type on Ruy's external API boundary. It is
26 // also used internally.
27 // - DMatrix: This is a type-erased version of Matrix<T>. "D" = "dynamic".
28 // - PMatrix: This represents a packed matrix, which requires tracking kernel
29 // layout and row/column sums for quantization. It is type-erased.
30 // - PackedMatrix<T>: This is a statically typed variant of PMatrix for
31 // convenience inside typed routines.
32 //
33 // Note that Matrix<T> is *not* implemented in terms of the internal types. It
34 // is an independent, simple, and user-facing type.
35 //
36 // The use of type-erasure might seem surprising for a library like Ruy with a
37 // heavily-templated entry point, but it is motivated by the desire for most of
38 // Ruy's "middle-end" to be non-templated. Ruy can be thought of as having 3
39 // main parts:
40 // - "front-end" (dispatch.h) - this is the highly templated ruy::Mul entry
41 // point, along with routines that select RunKernel and RunPack implementations
42 // statically based on those template parameters.
43 // - "back-end" (kernel.h, pack.h)- this consists of the implementations of
44 // RunKernel and RunPack, often in assembly code, which are the building blocks
45 // that Ruy calls to perform matrix multiplication. These are templated so that
46 // only the requested types/Path's are actually emitted by the compiler.
47 // - "middle-end" (trmul.h) - this is the part of Ruy that orchestrates the
48 // calls to the "back-end" optimized building blocks. This layer has to deal
49 // with issues like cache locality and low-overhead multi-threading.
50 //
51 // There is a desire for the "middle-end" to be non-templated in order to
52 // simplify the implementation and reduce code-size. We type-erase when going
53 // from the "front-end" to the "middle-end", and un-type-erase going from the
54 // "middle-end" to the "back-end". The un-type-erasure is possible because the
55 // "front-end" is responsible for instantiating the needed "back-end" templates,
56 // and thus the static type information is still present.
57 //
58 // Each layer of Ruy uses matrix types:
59 // - "front-end": Matrix<T>
60 // - "middle-end": DMatrix, PMatrix
61 // - "back-end": Matrix<T>, PackedMatrix<T>
62 //
63 // The use of separate types for packed matrices is not essential, but makes it
64 // obvious at a glance whether a matrix is a packed matrix or not. We would
65 // reconsider this decision if there was significant duplication between packed
66 // and unpacked matrices, but that doesn't seem to be the case at the moment.
67 //
68 // Another goal is to keep the user-facing Matrix<T> as simple and
69 // understandable as possible. Ideally, a user should be able to read the struct
70 // definition for Matrix<T> and see a very simple definition with no internal
71 // details like sums and kernel block layout.
72 //
73 // To present another structured view of our various matrix types, here's a
74 // table:
75 // Plain matrices Packed matrices
76 // +----------------------------------
77 // Templated | Matrix<T> PackedMatrix<T>
78 // Type-erased | DMatrix PMatrix
79 //
80 //
81 // There is 1 additional matrix type not mentioned above, due to its low
82 // importance:
83 // - PrepackedMatrix: This is a user-facing version of PMatrix. It has the bare
84 // minimum of fields needed for representing the raw data and sums buffers of a
85 // packed matrix for the "advanced" explicit pre-packing API. This type plays no
86 // role in Ruy's internals and can generally by ignored. The only reason it
87 // exists is so that PMatrix is not exposed to users -- we prefer to keep the
88 // internal matrix types hidden, even from "advanced" users.
89
90 #ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_INTERNAL_MATRIX_H_
91 #define TENSORFLOW_LITE_EXPERIMENTAL_RUY_INTERNAL_MATRIX_H_
92
93 #include <cstddef>
94 #include <cstdint>
95 #include <type_traits>
96 #include <utility>
97
98 #include "tensorflow/lite/experimental/ruy/check_macros.h"
99 #include "tensorflow/lite/experimental/ruy/common.h"
100 #include "tensorflow/lite/experimental/ruy/matrix.h"
101 #include "tensorflow/lite/experimental/ruy/size_util.h"
102
103 namespace ruy {
104
105 // KernelLayout describes small-scale block structure in a packed matrix layout.
106 // It's a runtime (as opposed to compile-time-constant) version of the
107 // FixedKernelLayout struct used to declare kernel layouts.
108 //
109 // This is is sometimes known as "tiling" in other contexts.
110 //
111 // For example, consider a packed matrix in column-major format with a
112 // column-major KernelLayout. The matrix logically has a shape of
113 // `[cols, rows]`. However, the matrix is laid out as though it were a 4D array
114 // of shape `[cols / kcols, rows / krows, kcols, krows]`.
115 //
116 // Note that in the case of kcols=1, krows=1, this degenerates to
117 // `[cols, rows, 1, 1]` which is equivalent to having no small-scale block
118 // structure.
119 struct KernelLayout {
120 Order order = Order::kColMajor;
121 std::uint8_t rows = 1;
122 std::uint8_t cols = 1;
123 };
124
125 // A packed matrix has a small-scale block structure that is not present in in
126 // the input matrices. This block structure is necessary for the kernels to
127 // process data efficiently.
128 //
129 // This struct is very similar to Layout, but has the extra KernelLayout field.
130 struct PackedLayout {
131 std::int32_t rows = 0;
132 std::int32_t cols = 0;
133 // Stride is the offset between two adjacent matrix elements
134 // in the non-contiguous direction.
135 std::int32_t stride = 0;
136 Order order = Order::kColMajor;
137 // Small scale layout shuffling, potentially departing from
138 // linear row-major or column-major storage. See KernelLayout.
139 KernelLayout kernel;
140 };
141
142 // Dynamic representation for a type.
143 //
144 // The most important field in this struct is the size, which Ruy uses to know
145 // how much memory to allocate without having to be templated on a type.
146 // Signed-ness and floating-point-ness are mainly present as debugging checks.
147 //
148 // Note: Ruy does not use this struct to to dynamically dispatch between
149 // different typed implementations. As described in the comment at the top of
150 // this file, Ruy's "front-end", which is templated, instantiates all the
151 // necessary "back-end" routines with complete static knowledge of all the
152 // types.
153 struct Type {
154 template <typename T>
CreateType155 static Type Create() {
156 Type ret;
157 ret.is_signed = std::is_signed<T>::value;
158 ret.is_floating_point = std::is_floating_point<T>::value;
159 ret.size = sizeof(T);
160 return ret;
161 }
162
163 template <typename T>
AssertIsType164 void AssertIs() const {
165 RUY_DCHECK_EQ(is_signed, Create<T>().is_signed);
166 RUY_DCHECK_EQ(is_floating_point, Create<T>().is_floating_point);
167 RUY_DCHECK_EQ(size, Create<T>().size);
168 }
169
170 bool is_signed = false;
171 bool is_floating_point = false;
172 std::uint8_t size = 0;
173 };
174
175 // Type-erased matrix.
176 struct DMatrix {
177 Type data_type;
178 void* data = nullptr;
179 Layout layout;
180 std::int32_t zero_point = 0;
181 };
182
183 // Type-erased packed matrix.
184 struct PMatrix {
185 Type data_type;
186 void* data = nullptr;
187 Type sums_type;
188 void* sums = nullptr;
189 PackedLayout layout;
190 std::int32_t zero_point = 0;
191 };
192
193 // Convenient typed helper for packed matrices.
194 template <typename Scalar>
195 struct PackedMatrix {
196 // The row/column sums needed for quantized matrix multiplication when
197 // the opposite operand of the multiplication uses a non-symmetric zero
198 // point.
199 // This member is only relevant for packed matrices.
200 // Additionally, Ruy always uses 32-bit signed accumulators for quantized
201 // matrix multiplication.
202 // For floating point types, there is no quantization, so this pointer
203 // will always be null. We still need code referencing it to compile
204 // though, even if it is always branched around. Hence we use Scalar*
205 // itself as the type in that case.
206 using SumsType =
207 typename std::conditional<std::is_floating_point<Scalar>::value, Scalar,
208 std::int32_t>::type;
209
210 Scalar* data = nullptr;
211 SumsType* sums = nullptr;
212 PackedLayout layout;
213 std::int32_t zero_point = 0;
214 };
215
216 template <typename T>
ToDMatrix(const Matrix<T> & matrix)217 DMatrix ToDMatrix(const Matrix<T>& matrix) {
218 DMatrix ret;
219 ret.data_type = Type::Create<T>();
220 ret.data = ToVoidPtr(matrix.data.get());
221 ret.layout = matrix.layout;
222 ret.zero_point = matrix.zero_point;
223 return ret;
224 }
225
226 template <typename T>
ToMatrix(const DMatrix & dmatrix)227 Matrix<T> ToMatrix(const DMatrix& dmatrix) {
228 dmatrix.data_type.AssertIs<T>();
229 Matrix<T> ret;
230 ret.data = static_cast<T*>(dmatrix.data);
231 ret.layout = dmatrix.layout;
232 ret.zero_point = dmatrix.zero_point;
233 return ret;
234 }
235
236 template <typename T>
ToPackedMatrix(const PMatrix & pmatrix)237 PackedMatrix<T> ToPackedMatrix(const PMatrix& pmatrix) {
238 using SumsType = typename PackedMatrix<T>::SumsType;
239 pmatrix.data_type.AssertIs<T>();
240 pmatrix.sums_type.AssertIs<SumsType>();
241 PackedMatrix<T> ret;
242 ret.data = static_cast<T*>(pmatrix.data);
243 ret.sums = static_cast<SumsType*>(pmatrix.sums);
244 ret.layout = pmatrix.layout;
245 ret.zero_point = pmatrix.zero_point;
246 return ret;
247 }
248
249 // Helpers for Layout / PackedLayout.
250
IsPacked(const Layout & layout)251 inline bool IsPacked(const Layout& layout) {
252 if (layout.order == Order::kColMajor) {
253 return layout.stride == layout.rows;
254 } else {
255 return layout.stride == layout.cols;
256 }
257 }
258
IsRowMajor(const Layout & layout)259 inline bool IsRowMajor(const Layout& layout) {
260 return layout.order == Order::kRowMajor;
261 }
262
263 template <typename LayoutOrPackedLayout>
IsColMajor(const LayoutOrPackedLayout & layout)264 inline bool IsColMajor(const LayoutOrPackedLayout& layout) {
265 return layout.order == Order::kColMajor;
266 }
267
268 template <typename LayoutOrPackedLayout>
FlatSize(const LayoutOrPackedLayout & layout)269 inline int FlatSize(const LayoutOrPackedLayout& layout) {
270 const int outerdim =
271 layout.order == Order::kColMajor ? layout.cols : layout.rows;
272 return layout.stride * outerdim;
273 }
274
275 // TODO(b/130417400) add a unit test
Offset(const Layout & layout,int row,int col)276 inline int Offset(const Layout& layout, int row, int col) {
277 // TODO(benoitjacob) - should check this but this make the _slow tests take
278 // 5x longer. Find a mitigation like in Eigen with an 'internal' variant
279 // bypassing the check?
280 // RUY_DCHECK_GE(row, 0);
281 // RUY_DCHECK_GE(col, 0);
282 // RUY_DCHECK_LT(row, layout.rows);
283 // RUY_DCHECK_LT(col, layout.cols);
284 int row_stride = layout.order == Order::kColMajor ? 1 : layout.stride;
285 int col_stride = layout.order == Order::kRowMajor ? 1 : layout.stride;
286 return row * row_stride + col * col_stride;
287 }
288
289 // TODO(b/130417400) add a unit test
Offset(const PackedLayout & layout,int row,int col)290 inline int Offset(const PackedLayout& layout, int row, int col) {
291 RUY_DCHECK(is_pot(layout.kernel.rows));
292 RUY_DCHECK(is_pot(layout.kernel.cols));
293 int row_outer = row & ~(layout.kernel.rows - 1);
294 int col_outer = col & ~(layout.kernel.cols - 1);
295 int row_stride_outer =
296 layout.order == Order::kColMajor ? layout.kernel.cols : layout.stride;
297 int col_stride_outer =
298 layout.order == Order::kRowMajor ? layout.kernel.rows : layout.stride;
299 int offset_outer =
300 row_outer * row_stride_outer + col_outer * col_stride_outer;
301 int row_inner = row - row_outer;
302 int col_inner = col - col_outer;
303 int row_stride_inner =
304 layout.kernel.order == Order::kColMajor ? 1 : layout.kernel.cols;
305 int col_stride_inner =
306 layout.kernel.order == Order::kRowMajor ? 1 : layout.kernel.rows;
307 int offset_inner =
308 row_inner * row_stride_inner + col_inner * col_stride_inner;
309 return offset_outer + offset_inner;
310 }
311
312 // Helpers for Matrix<T>.
313
314 template <typename Scalar>
ElementPtr(const Matrix<Scalar> & mat,int row,int col)315 const Scalar* ElementPtr(const Matrix<Scalar>& mat, int row, int col) {
316 return mat.data.get() + Offset(mat.layout, row, col);
317 }
318
319 template <typename Scalar>
ElementPtr(Matrix<Scalar> * mat,int row,int col)320 Scalar* ElementPtr(Matrix<Scalar>* mat, int row, int col) {
321 return mat->data.get() + Offset(mat->layout, row, col);
322 }
323
324 template <typename Scalar>
Element(const Matrix<Scalar> & mat,int row,int col)325 Scalar Element(const Matrix<Scalar>& mat, int row, int col) {
326 return *ElementPtr(mat, row, col);
327 }
328
329 // Helpers for PackedMatrix<T>.
330 // Duplicated from Matrix<T>, but the duplication seems acceptable.
331
332 template <typename Scalar>
ElementPtr(const PackedMatrix<Scalar> & mat,int row,int col)333 const Scalar* ElementPtr(const PackedMatrix<Scalar>& mat, int row, int col) {
334 return mat.data + Offset(mat.layout, row, col);
335 }
336
337 template <typename Scalar>
ElementPtr(PackedMatrix<Scalar> * mat,int row,int col)338 Scalar* ElementPtr(PackedMatrix<Scalar>* mat, int row, int col) {
339 return mat->data + Offset(mat->layout, row, col);
340 }
341
342 template <typename Scalar>
Element(const PackedMatrix<Scalar> & mat,int row,int col)343 Scalar Element(const PackedMatrix<Scalar>& mat, int row, int col) {
344 return *ElementPtr(mat, row, col);
345 }
346
347 // Helpers for PMatrix.
348
DataSize(const PMatrix & packed)349 inline std::size_t DataSize(const PMatrix& packed) {
350 return FlatSize(packed.layout) * packed.data_type.size;
351 }
352
SumsSize(const PMatrix & packed)353 inline std::size_t SumsSize(const PMatrix& packed) {
354 // Packed matrices are only relevant for Ruy's TrMul implementations. For
355 // TrMul, the number of sums is always equal to the number of columns.
356 return packed.layout.cols * packed.sums_type.size;
357 }
358
359 // Transpose helpers.
360
Transpose(Order * order)361 inline void Transpose(Order* order) {
362 *order = *order == Order::kColMajor ? Order::kRowMajor : Order::kColMajor;
363 }
364
Transpose(Layout * layout)365 inline void Transpose(Layout* layout) {
366 Transpose(&layout->order);
367 std::swap(layout->rows, layout->cols);
368 }
369
370 template <typename Scalar>
Transpose(Matrix<Scalar> * matrix)371 inline void Transpose(Matrix<Scalar>* matrix) {
372 Transpose(&matrix->layout);
373 }
374
375 // Helpers for KernelLayout.
376
377 template <typename FixedKernelLayout>
ToKernelLayout()378 KernelLayout ToKernelLayout() {
379 KernelLayout ret;
380 ret.order = FixedKernelLayout::kOrder;
381 ret.rows = FixedKernelLayout::kRows;
382 ret.cols = FixedKernelLayout::kCols;
383 return ret;
384 }
385
386 } // namespace ruy
387
388 #endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_INTERNAL_MATRIX_H_
389