• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2019 Google LLC. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // Internal types and helpers for matrices.
17 //
18 // Ruy has a couple slightly different notions of matrices, besides the
19 // Matrix<T> class that we expose to the user-facing API.
20 //
21 // TODO(silvasean): Put parts of this architecture description somewhere more
22 // prominent.
23 //
24 // The 4 main matrix types are:
25 // - Matrix<T>: This is a user-facing type on Ruy's external API boundary. It is
26 // also used internally.
27 // - DMatrix: This is a type-erased version of Matrix<T>. "D" = "dynamic".
28 // - PMatrix: This represents a packed matrix, which requires tracking kernel
29 // layout and row/column sums for quantization. It is type-erased.
30 // - PackedMatrix<T>: This is a statically typed variant of PMatrix for
31 // convenience inside typed routines.
32 //
33 // Note that Matrix<T> is *not* implemented in terms of the internal types. It
34 // is an independent, simple, and user-facing type.
35 //
36 // The use of type-erasure might seem surprising for a library like Ruy with a
37 // heavily-templated entry point, but it is motivated by the desire for most of
38 // Ruy's "middle-end" to be non-templated. Ruy can be thought of as having 3
39 // main parts:
40 // - "front-end" (dispatch.h) - this is the highly templated ruy::Mul entry
41 // point, along with routines that select RunKernel and RunPack implementations
42 // statically based on those template parameters.
43 // - "back-end" (kernel.h, pack.h)- this consists of the implementations of
44 // RunKernel and RunPack, often in assembly code, which are the building blocks
45 // that Ruy calls to perform matrix multiplication.  These are templated so that
46 // only the requested types/Path's are actually emitted by the compiler.
47 // - "middle-end" (trmul.h) - this is the part of Ruy that orchestrates the
48 // calls to the "back-end" optimized building blocks. This layer has to deal
49 // with issues like cache locality and low-overhead multi-threading.
50 //
51 // There is a desire for the "middle-end" to be non-templated in order to
52 // simplify the implementation and reduce code-size. We type-erase when going
53 // from the "front-end" to the "middle-end", and un-type-erase going from the
54 // "middle-end" to the "back-end". The un-type-erasure is possible because the
55 // "front-end" is responsible for instantiating the needed "back-end" templates,
56 // and thus the static type information is still present.
57 //
58 // Each layer of Ruy uses matrix types:
59 // - "front-end": Matrix<T>
60 // - "middle-end": DMatrix, PMatrix
61 // - "back-end": Matrix<T>, PackedMatrix<T>
62 //
63 // The use of separate types for packed matrices is not essential, but makes it
64 // obvious at a glance whether a matrix is a packed matrix or not. We would
65 // reconsider this decision if there was significant duplication between packed
66 // and unpacked matrices, but that doesn't seem to be the case at the moment.
67 //
68 // Another goal is to keep the user-facing Matrix<T> as simple and
69 // understandable as possible. Ideally, a user should be able to read the struct
70 // definition for Matrix<T> and see a very simple definition with no internal
71 // details like sums and kernel block layout.
72 //
73 // To present another structured view of our various matrix types, here's a
74 // table:
75 //                Plain matrices   Packed matrices
76 //             +----------------------------------
77 // Templated   |  Matrix<T>        PackedMatrix<T>
78 // Type-erased |  DMatrix          PMatrix
79 //
80 //
81 // There is 1 additional matrix type not mentioned above, due to its low
82 // importance:
83 // - PrepackedMatrix: This is a user-facing version of PMatrix. It has the bare
84 // minimum of fields needed for representing the raw data and sums buffers of a
85 // packed matrix for the "advanced" explicit pre-packing API. This type plays no
86 // role in Ruy's internals and can generally by ignored. The only reason it
87 // exists is so that PMatrix is not exposed to users -- we prefer to keep the
88 // internal matrix types hidden, even from "advanced" users.
89 
90 #ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_INTERNAL_MATRIX_H_
91 #define TENSORFLOW_LITE_EXPERIMENTAL_RUY_INTERNAL_MATRIX_H_
92 
93 #include <cstddef>
94 #include <cstdint>
95 #include <type_traits>
96 #include <utility>
97 
98 #include "tensorflow/lite/experimental/ruy/check_macros.h"
99 #include "tensorflow/lite/experimental/ruy/common.h"
100 #include "tensorflow/lite/experimental/ruy/matrix.h"
101 #include "tensorflow/lite/experimental/ruy/size_util.h"
102 
103 namespace ruy {
104 
105 // KernelLayout describes small-scale block structure in a packed matrix layout.
106 // It's a runtime (as opposed to compile-time-constant) version of the
107 // FixedKernelLayout struct used to declare kernel layouts.
108 //
109 // This is is sometimes known as "tiling" in other contexts.
110 //
111 // For example, consider a packed matrix in column-major format with a
112 // column-major KernelLayout. The matrix logically has a shape of
113 // `[cols, rows]`. However, the matrix is laid out as though it were a 4D array
114 // of shape `[cols / kcols, rows / krows, kcols, krows]`.
115 //
116 // Note that in the case of kcols=1, krows=1, this degenerates to
117 // `[cols, rows, 1, 1]` which is equivalent to having no small-scale block
118 // structure.
119 struct KernelLayout {
120   Order order = Order::kColMajor;
121   std::uint8_t rows = 1;
122   std::uint8_t cols = 1;
123 };
124 
125 // A packed matrix has a small-scale block structure that is not present in in
126 // the input matrices. This block structure is necessary for the kernels to
127 // process data efficiently.
128 //
129 // This struct is very similar to Layout, but has the extra KernelLayout field.
130 struct PackedLayout {
131   std::int32_t rows = 0;
132   std::int32_t cols = 0;
133   // Stride is the offset between two adjacent matrix elements
134   // in the non-contiguous direction.
135   std::int32_t stride = 0;
136   Order order = Order::kColMajor;
137   // Small scale layout shuffling, potentially departing from
138   // linear row-major or column-major storage. See KernelLayout.
139   KernelLayout kernel;
140 };
141 
142 // Dynamic representation for a type.
143 //
144 // The most important field in this struct is the size, which Ruy uses to know
145 // how much memory to allocate without having to be templated on a type.
146 // Signed-ness and floating-point-ness are mainly present as debugging checks.
147 //
148 // Note: Ruy does not use this struct to to dynamically dispatch between
149 // different typed implementations. As described in the comment at the top of
150 // this file, Ruy's "front-end", which is templated, instantiates all the
151 // necessary "back-end" routines with complete static knowledge of all the
152 // types.
153 struct Type {
154   template <typename T>
CreateType155   static Type Create() {
156     Type ret;
157     ret.is_signed = std::is_signed<T>::value;
158     ret.is_floating_point = std::is_floating_point<T>::value;
159     ret.size = sizeof(T);
160     return ret;
161   }
162 
163   template <typename T>
AssertIsType164   void AssertIs() const {
165     RUY_DCHECK_EQ(is_signed, Create<T>().is_signed);
166     RUY_DCHECK_EQ(is_floating_point, Create<T>().is_floating_point);
167     RUY_DCHECK_EQ(size, Create<T>().size);
168   }
169 
170   bool is_signed = false;
171   bool is_floating_point = false;
172   std::uint8_t size = 0;
173 };
174 
175 // Type-erased matrix.
176 struct DMatrix {
177   Type data_type;
178   void* data = nullptr;
179   Layout layout;
180   std::int32_t zero_point = 0;
181 };
182 
183 // Type-erased packed matrix.
184 struct PMatrix {
185   Type data_type;
186   void* data = nullptr;
187   Type sums_type;
188   void* sums = nullptr;
189   PackedLayout layout;
190   std::int32_t zero_point = 0;
191 };
192 
193 // Convenient typed helper for packed matrices.
194 template <typename Scalar>
195 struct PackedMatrix {
196   // The row/column sums needed for quantized matrix multiplication when
197   // the opposite operand of the multiplication uses a non-symmetric zero
198   // point.
199   // This member is only relevant for packed matrices.
200   // Additionally, Ruy always uses 32-bit signed accumulators for quantized
201   // matrix multiplication.
202   // For floating point types, there is no quantization, so this pointer
203   // will always be null. We still need code referencing it to compile
204   // though, even if it is always branched around. Hence we use Scalar*
205   // itself as the type in that case.
206   using SumsType =
207       typename std::conditional<std::is_floating_point<Scalar>::value, Scalar,
208                                 std::int32_t>::type;
209 
210   Scalar* data = nullptr;
211   SumsType* sums = nullptr;
212   PackedLayout layout;
213   std::int32_t zero_point = 0;
214 };
215 
216 template <typename T>
ToDMatrix(const Matrix<T> & matrix)217 DMatrix ToDMatrix(const Matrix<T>& matrix) {
218   DMatrix ret;
219   ret.data_type = Type::Create<T>();
220   ret.data = ToVoidPtr(matrix.data.get());
221   ret.layout = matrix.layout;
222   ret.zero_point = matrix.zero_point;
223   return ret;
224 }
225 
226 template <typename T>
ToMatrix(const DMatrix & dmatrix)227 Matrix<T> ToMatrix(const DMatrix& dmatrix) {
228   dmatrix.data_type.AssertIs<T>();
229   Matrix<T> ret;
230   ret.data = static_cast<T*>(dmatrix.data);
231   ret.layout = dmatrix.layout;
232   ret.zero_point = dmatrix.zero_point;
233   return ret;
234 }
235 
236 template <typename T>
ToPackedMatrix(const PMatrix & pmatrix)237 PackedMatrix<T> ToPackedMatrix(const PMatrix& pmatrix) {
238   using SumsType = typename PackedMatrix<T>::SumsType;
239   pmatrix.data_type.AssertIs<T>();
240   pmatrix.sums_type.AssertIs<SumsType>();
241   PackedMatrix<T> ret;
242   ret.data = static_cast<T*>(pmatrix.data);
243   ret.sums = static_cast<SumsType*>(pmatrix.sums);
244   ret.layout = pmatrix.layout;
245   ret.zero_point = pmatrix.zero_point;
246   return ret;
247 }
248 
249 // Helpers for Layout / PackedLayout.
250 
IsPacked(const Layout & layout)251 inline bool IsPacked(const Layout& layout) {
252   if (layout.order == Order::kColMajor) {
253     return layout.stride == layout.rows;
254   } else {
255     return layout.stride == layout.cols;
256   }
257 }
258 
IsRowMajor(const Layout & layout)259 inline bool IsRowMajor(const Layout& layout) {
260   return layout.order == Order::kRowMajor;
261 }
262 
263 template <typename LayoutOrPackedLayout>
IsColMajor(const LayoutOrPackedLayout & layout)264 inline bool IsColMajor(const LayoutOrPackedLayout& layout) {
265   return layout.order == Order::kColMajor;
266 }
267 
268 template <typename LayoutOrPackedLayout>
FlatSize(const LayoutOrPackedLayout & layout)269 inline int FlatSize(const LayoutOrPackedLayout& layout) {
270   const int outerdim =
271       layout.order == Order::kColMajor ? layout.cols : layout.rows;
272   return layout.stride * outerdim;
273 }
274 
275 // TODO(b/130417400) add a unit test
Offset(const Layout & layout,int row,int col)276 inline int Offset(const Layout& layout, int row, int col) {
277   // TODO(benoitjacob)  - should check this but this make the _slow tests take
278   // 5x longer.  Find a mitigation like in Eigen with an 'internal' variant
279   // bypassing the check?
280   // RUY_DCHECK_GE(row, 0);
281   // RUY_DCHECK_GE(col, 0);
282   // RUY_DCHECK_LT(row, layout.rows);
283   // RUY_DCHECK_LT(col, layout.cols);
284   int row_stride = layout.order == Order::kColMajor ? 1 : layout.stride;
285   int col_stride = layout.order == Order::kRowMajor ? 1 : layout.stride;
286   return row * row_stride + col * col_stride;
287 }
288 
289 // TODO(b/130417400) add a unit test
Offset(const PackedLayout & layout,int row,int col)290 inline int Offset(const PackedLayout& layout, int row, int col) {
291   RUY_DCHECK(is_pot(layout.kernel.rows));
292   RUY_DCHECK(is_pot(layout.kernel.cols));
293   int row_outer = row & ~(layout.kernel.rows - 1);
294   int col_outer = col & ~(layout.kernel.cols - 1);
295   int row_stride_outer =
296       layout.order == Order::kColMajor ? layout.kernel.cols : layout.stride;
297   int col_stride_outer =
298       layout.order == Order::kRowMajor ? layout.kernel.rows : layout.stride;
299   int offset_outer =
300       row_outer * row_stride_outer + col_outer * col_stride_outer;
301   int row_inner = row - row_outer;
302   int col_inner = col - col_outer;
303   int row_stride_inner =
304       layout.kernel.order == Order::kColMajor ? 1 : layout.kernel.cols;
305   int col_stride_inner =
306       layout.kernel.order == Order::kRowMajor ? 1 : layout.kernel.rows;
307   int offset_inner =
308       row_inner * row_stride_inner + col_inner * col_stride_inner;
309   return offset_outer + offset_inner;
310 }
311 
312 // Helpers for Matrix<T>.
313 
314 template <typename Scalar>
ElementPtr(const Matrix<Scalar> & mat,int row,int col)315 const Scalar* ElementPtr(const Matrix<Scalar>& mat, int row, int col) {
316   return mat.data.get() + Offset(mat.layout, row, col);
317 }
318 
319 template <typename Scalar>
ElementPtr(Matrix<Scalar> * mat,int row,int col)320 Scalar* ElementPtr(Matrix<Scalar>* mat, int row, int col) {
321   return mat->data.get() + Offset(mat->layout, row, col);
322 }
323 
324 template <typename Scalar>
Element(const Matrix<Scalar> & mat,int row,int col)325 Scalar Element(const Matrix<Scalar>& mat, int row, int col) {
326   return *ElementPtr(mat, row, col);
327 }
328 
329 // Helpers for PackedMatrix<T>.
330 // Duplicated from Matrix<T>, but the duplication seems acceptable.
331 
332 template <typename Scalar>
ElementPtr(const PackedMatrix<Scalar> & mat,int row,int col)333 const Scalar* ElementPtr(const PackedMatrix<Scalar>& mat, int row, int col) {
334   return mat.data + Offset(mat.layout, row, col);
335 }
336 
337 template <typename Scalar>
ElementPtr(PackedMatrix<Scalar> * mat,int row,int col)338 Scalar* ElementPtr(PackedMatrix<Scalar>* mat, int row, int col) {
339   return mat->data + Offset(mat->layout, row, col);
340 }
341 
342 template <typename Scalar>
Element(const PackedMatrix<Scalar> & mat,int row,int col)343 Scalar Element(const PackedMatrix<Scalar>& mat, int row, int col) {
344   return *ElementPtr(mat, row, col);
345 }
346 
347 // Helpers for PMatrix.
348 
DataSize(const PMatrix & packed)349 inline std::size_t DataSize(const PMatrix& packed) {
350   return FlatSize(packed.layout) * packed.data_type.size;
351 }
352 
SumsSize(const PMatrix & packed)353 inline std::size_t SumsSize(const PMatrix& packed) {
354   // Packed matrices are only relevant for Ruy's TrMul implementations. For
355   // TrMul, the number of sums is always equal to the number of columns.
356   return packed.layout.cols * packed.sums_type.size;
357 }
358 
359 // Transpose helpers.
360 
Transpose(Order * order)361 inline void Transpose(Order* order) {
362   *order = *order == Order::kColMajor ? Order::kRowMajor : Order::kColMajor;
363 }
364 
Transpose(Layout * layout)365 inline void Transpose(Layout* layout) {
366   Transpose(&layout->order);
367   std::swap(layout->rows, layout->cols);
368 }
369 
370 template <typename Scalar>
Transpose(Matrix<Scalar> * matrix)371 inline void Transpose(Matrix<Scalar>* matrix) {
372   Transpose(&matrix->layout);
373 }
374 
375 // Helpers for KernelLayout.
376 
377 template <typename FixedKernelLayout>
ToKernelLayout()378 KernelLayout ToKernelLayout() {
379   KernelLayout ret;
380   ret.order = FixedKernelLayout::kOrder;
381   ret.rows = FixedKernelLayout::kRows;
382   ret.cols = FixedKernelLayout::kCols;
383   return ret;
384 }
385 
386 }  // namespace ruy
387 
388 #endif  // TENSORFLOW_LITE_EXPERIMENTAL_RUY_INTERNAL_MATRIX_H_
389