1 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 // block_params.h: Logic to choose L1 and L2 block sizes
16 // to optimize cache-friendliness.
17
18 #ifndef GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_
19 #define GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_
20
21 #include "common.h"
22
23 namespace gemmlowp {
24
25 // A BlockParams instance contains a full description of all the block size
26 // parameters to be used by a Gemm.
27 // There are two nested levels of block subdivisions: first a subdivision
28 // into large blocks that should fit in last-level cache (what we call L2 here)
29 // and then another subdivision into smaller blocks that should fit in
30 // L1 cache. There is then actually a third level of subdivision to fit
31 // in registers, but we are not concerned with that here.
32 struct BlockParams {
33 // L1 block parameters determine the size of small blocks that should
34 // fit in L1 cache.
35 int l1_rows;
36 int l1_cols;
37 int l1_depth;
38
39 // L2 block parameters determine the size of larger blocks that should
40 // fit in L2 cache.
41 int l2_rows;
42 int l2_cols;
43 int l2_depth;
44
45 template <typename KernelFormat>
InitBlockParams46 void Init(int rows, int cols, int depth, int num_threads, int l1_bytes_to_use,
47 int l2_bytes_to_use, float l2_rhs_factor) {
48 FindL2BlockSizes<KernelFormat>(rows, cols, depth, num_threads,
49 l2_bytes_to_use, l2_rhs_factor, &l2_rows,
50 &l2_cols, &l2_depth);
51 FindL1BlockSizes<KernelFormat>(l2_rows, l2_cols, l2_depth, l1_bytes_to_use,
52 &l1_rows, &l1_cols, &l1_depth);
53 }
54
55 template <typename KernelFormat>
FindL2BlockSizesBlockParams56 static void FindL2BlockSizes(int rows, int cols, int depth, int num_threads,
57 int l2_bytes_to_use, float l2_rhs_factor,
58 int* out_l2_rows, int* out_l2_cols,
59 int* out_l2_depth) {
60 int l2_rows = 0;
61 int l2_cols = 0;
62 int l2_depth = 0;
63
64 int per_thread_rows =
65 std::max(1, RoundUp<KernelFormat::kRows>(rows) / num_threads);
66
67 // No L2 blocking in the depth dimension at the moment.
68 // Too much loss of accuracy due to storing intermediate results in
69 // low precision.
70 // However, we still want to round l2_depth up to the next multiple
71 // of register size, so as to avoid having to special-case unaligned depths.
72 l2_depth = RoundUp<kRegisterSize>(depth);
73
74 {
75 int max_cache_friendly_l2_cols = std::max(
76 1, static_cast<int>(l2_rhs_factor * (l2_bytes_to_use / l2_depth)));
77 int min_l2_cols_blocks =
78 std::max(1, CeilQuotient(cols, max_cache_friendly_l2_cols));
79 l2_cols =
80 RoundUp<KernelFormat::kCols>(CeilQuotient(cols, min_l2_cols_blocks));
81 }
82
83 // No L2 blocking in the row dimension if l2_rhs_factor is 1.0 as the row
84 // dimension concerns only the LHS. Blocking only RHS matrix for L2 enhances
85 // the performance on x86.
86 if (l2_rhs_factor == 1.0f) {
87 l2_rows = RoundUp<KernelFormat::kRows>(per_thread_rows);
88 } else {
89 int max_cache_friendly_l2_rows =
90 std::max(1, (l2_bytes_to_use - l2_depth * l2_cols) /
91 (num_threads * (l2_depth + 4 * l2_cols)));
92 int min_l2_rows_blocks = std::max(
93 1, CeilQuotient(per_thread_rows, max_cache_friendly_l2_rows));
94 l2_rows = RoundUp<KernelFormat::kRows>(
95 CeilQuotient(per_thread_rows, min_l2_rows_blocks));
96 }
97
98 *out_l2_rows = l2_rows;
99 *out_l2_cols = l2_cols;
100 *out_l2_depth = l2_depth;
101 }
102
103 template <typename KernelFormat>
FindL1BlockSizesBlockParams104 static void FindL1BlockSizes(int rows, int cols, int depth,
105 int l1_bytes_to_use, int* out_l1_rows,
106 int* out_l1_cols, int* out_l1_depth) {
107 int l1_rows = 0;
108 int l1_cols = 0;
109 int l1_depth = 0;
110
111 // L2 block sizes should already be multiples of kernel block sizes.
112 assert(rows % KernelFormat::kRows == 0);
113 assert(cols % KernelFormat::kCols == 0);
114 assert(depth % KernelFormat::kDepth == 0);
115
116 // No L1 blocking in the columns dimension at the moment.
117 // Thought not to be needed. Similar to Eigen.
118 l1_cols = cols;
119
120 {
121 int max_cache_friendly_l1_depth = std::max(
122 1, (l1_bytes_to_use - 4 * KernelFormat::kRows * KernelFormat::kCols) /
123 (KernelFormat::kRows + KernelFormat::kCols));
124 int min_l1_depth_blocks =
125 std::max(1, CeilQuotient(depth, max_cache_friendly_l1_depth));
126 l1_depth =
127 RoundUp<kRegisterSize>(CeilQuotient(depth, min_l1_depth_blocks));
128 }
129
130 {
131 int max_cache_friendly_l1_rows =
132 std::max(1, l1_bytes_to_use / (l1_depth + 4 * l1_cols));
133 int min_l1_rows_blocks =
134 std::max(1, CeilQuotient(rows, max_cache_friendly_l1_rows));
135 l1_rows =
136 RoundUp<KernelFormat::kRows>(CeilQuotient(rows, min_l1_rows_blocks));
137 }
138
139 *out_l1_rows = l1_rows;
140 *out_l1_cols = l1_cols;
141 *out_l1_depth = l1_depth;
142 }
143 };
144
145 // A SideBlockParams instance contains only the block params relevant to
146 // one side (LHS or RHS), expressed in terms of 'width' instead of
147 // rows/colums. See the explanation in kernel.h: in the LHS, 'width' means
148 // the number of rows, while in the RHS, 'width' means the number of columns.
149 // That allows us to write generic code that applies to either LHS or RHS.
150 struct SideBlockParams {
151 // L1 block parameters determine the size of small blocks that should
152 // fit in L1 cache.
153 int l1_width;
154 int l1_depth;
155
156 // L2 block parameters determine the size of larger blocks that should
157 // fit in L2 cache.
158 int l2_width;
159 int l2_depth;
160 };
161
162 enum class Side { Lhs, Rhs };
163
GetSideBlockParams(Side side,SideBlockParams * side_block_params,const BlockParams & block_params)164 inline void GetSideBlockParams(Side side, SideBlockParams* side_block_params,
165 const BlockParams& block_params) {
166 side_block_params->l1_width =
167 side == Side::Lhs ? block_params.l1_rows : block_params.l1_cols;
168 side_block_params->l2_width =
169 side == Side::Lhs ? block_params.l2_rows : block_params.l2_cols;
170
171 side_block_params->l1_depth = block_params.l1_depth;
172 side_block_params->l2_depth = block_params.l2_depth;
173 }
174
175 } // namespace gemmlowp
176
177 #endif // GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_
178