// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// pack.h: packing blocks of the LHS and RHS into the data layout
// that is expected by compute.h and eventually by kernels.
// Because this data layout depends on the kernel format, code here
// is templated in KernelLhsFormat/KernelRhsFormat.
//
// Readers note: an important theme around here is that we try hard
// to handle both the Lhs and the Rhs with a single piece of code. We refer
// to either of them generically as a 'Side'. Instead of addressing matrices
// by (row, column) indices, we address them by (width, depth), as explained
// in kernel.h. This allows us to handle the Lhs and the Rhs on an equal
// footing.
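//
// For example (illustrative numbers only): for a 64x128 block of the Lhs,
// 'width' is the rows dimension (64) and 'depth' is the columns dimension
// (128); for a 128x96 block of the Rhs, 'width' is the columns dimension (96)
// and 'depth' is the rows dimension (128). Both Sides thus share the same
// 'depth', which is the dimension along which the GEMM accumulates.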

#ifndef GEMMLOWP_INTERNAL_PACK_H_
#define GEMMLOWP_INTERNAL_PACK_H_

#include <cstring>

#include "allocator.h"
#include "block_params.h"
#include "common.h"
#include "kernel.h"

namespace gemmlowp {

// A PackedSideBlock instance is a packed block of either the LHS or RHS
// (whence the generic 'Side' name).
//
// 'Packed' means that it is laid out in the storage order that
// is expected by the specified kernel format. From a block of the input
// LHS or RHS matrix, one obtains a PackedSideBlock by calling PackLhs()
// or PackRhs().
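//
// A minimal construction sketch (hypothetical names: it assumes an Allocator
// 'allocator' and BlockParams 'block_params' set up as in block_params.h, and
// a kernel format 'KernelFormat' as described in kernel.h):
//
//   PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(
//       Side::Lhs, &allocator, block_params);
//   allocator.Commit();  // the constructor only Reserve()s its buffers
//
// After that, PackLhs() below fills packed_lhs from a block of the Lhs.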
template <typename tKernelSideFormat>
class PackedSideBlock {
 public:
  typedef tKernelSideFormat KernelSideFormat;

  PackedSideBlock(Side side, Allocator* allocator,
                  const BlockParams& block_params)
      : allocator_(allocator), pos_(0) {
    GetSideBlockParams(side, &params_, block_params);
    data_handle_ =
        allocator_->Reserve<std::uint8_t>(params_.l2_width * params_.l2_depth);
    sums_of_each_slice_handle_ =
        allocator_->Reserve<std::int32_t>(params_.l2_width);
  }

  ~PackedSideBlock() {}

  void seek_run(int start_width, int start_depth) const {
    int kernel_run_depth =
        std::min<int>(params_.l1_depth, params_.l2_depth - start_depth);
    pos_ = params_.l2_width * start_depth + start_width * kernel_run_depth;
  }

  void seek_next_cell() const { pos_ += KernelSideFormat::Cell::kSize; }

  void seek_forward_n_cells(int n) const {
    pos_ += n * KernelSideFormat::Cell::kSize;
  }

  const std::uint8_t* current_data() const {
    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
  }

  std::uint8_t* current_data() {
    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
  }

  std::int32_t* sums_of_each_slice() {
    return allocator_->GetPointer<std::int32_t>(sums_of_each_slice_handle_);
  }

  const std::int32_t* sums_of_each_slice() const {
    return allocator_->GetPointer<const std::int32_t>(
        sums_of_each_slice_handle_);
  }

  const SideBlockParams& params() const { return params_; }

 private:
  // The block size parameters that this PackedSideBlock follows.
  // The L2 parameters determine its overall size, while the L1 parameters,
  // together with the kernel format template parameter, determine
  // the fine details of the storage/traversal order.
  SideBlockParams params_;

  // Pointer to the allocator provided by the caller. Not owned.
  // The Allocator is assumed to outlive the PackedSideBlock.
  Allocator* const allocator_;

  // Handle on the buffer backing this packed block. Owned.
  Allocator::Handle data_handle_;

  // Handle on the additional buffer backing the vector of sums of slices
  // associated with this block. Owned.
  Allocator::Handle sums_of_each_slice_handle_;

  // pos_ is the current position in the buffer, which we access
  // sequentially, like a file.
  // The idea is that we pack data in the same order as it is
  // going to be traversed during the computation. That order is chosen
  // for cache-friendliness, which makes random-access offset calculations
  // intricate; so we give up random-access addressing and content
  // ourselves with sequential access.
  //
  // pos_ is mutable because during the computation we will want to
  // be able to iterate on the data in a const PackedSideBlock.
  mutable int pos_;
};

// WidthMajor and DepthMajor are custom phrases modelled after the
// standard terminology 'row-major' and 'column-major'. Their meaning
// should be transparent once one has read the explanation in kernel.h:
// in the Lhs, the 'width' dimension is the rows dimension, so for the Lhs
// WidthMajor means RowMajor, while for the Rhs it is the opposite.
// Another way to put it: WidthMajor means that contiguous storage is used
// for entries having the same 'width' index.
enum class SideMapOrder { WidthMajor, DepthMajor };

// Similar to MatrixMap from map.h, but in terms of width/depth instead of
// rows/columns. Used to address blocks of the input LHS/RHS matrices when
// packing them.
template <typename tScalar, SideMapOrder tOrder>
class SideMap {
 public:
  typedef tScalar Scalar;
  static const SideMapOrder kOrder = tOrder;

  SideMap(Scalar* data, int width, int depth, int stride)
      : data_(data), width_(width), depth_(depth), stride_(stride) {}

  SideMap(Scalar* data, int width, int depth)
      : data_(data), width_(width), depth_(depth) {
    stride_ = kOrder == SideMapOrder::WidthMajor ? depth_ : width_;
  }

  SideMap(const SideMap& other)
      : data_(other.data_),
        width_(other.width_),
        depth_(other.depth_),
        stride_(other.stride_) {}

  int width() const { return width_; }
  int depth() const { return depth_; }
  int stride() const { return stride_; }
  int width_stride() const {
    return kOrder == SideMapOrder::DepthMajor ? 1 : stride_;
  }
  int depth_stride() const {
    return kOrder == SideMapOrder::WidthMajor ? 1 : stride_;
  }
  Scalar* data() const { return data_; }
  Scalar* data(int w, int d) const {
    return data_ + w * width_stride() + d * depth_stride();
  }
  Scalar operator()(int w, int d) const { return *data(w, d); }
  Scalar& operator()(int w, int d) { return *data(w, d); }

  SideMap block(int start_width, int start_depth, int block_width,
                int block_depth) const {
    assert(start_width >= 0);
    assert(start_width + block_width <= width_);
    assert(start_depth >= 0);
    assert(start_depth + block_depth <= depth_);

    return SideMap(data(start_width, start_depth), block_width, block_depth,
                   stride_);
  }

 private:
  Scalar* data_;  // not owned.
  int width_, depth_, stride_;
};
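
// An illustrative example (not used by the library itself): a WidthMajor
// SideMap over an 8 (width) x 16 (depth) buffer gets stride 16, so
// width_stride() == 16 and depth_stride() == 1, and entries sharing the same
// 'width' index are contiguous:
//
//   std::uint8_t buf[8 * 16];
//   SideMap<std::uint8_t, SideMapOrder::WidthMajor> map(buf, 8, 16);
//   assert(map.data(3, 5) == buf + 3 * 16 + 5);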

// A PackingRegisterBlock is a small fixed-size block of a matrix being
// packed. This class, PackingRegisterBlockBase, is the generic non-optimized
// implementation; it is inherited by PackingRegisterBlock below, which may be
// overridden by template specialization. Overriding it is how one may provide
// optimized packing code paths.
//
// The packing of a block proceeds in two steps:
// 1. Ensuring that we have a complete block of source data, i.e. a block of
//    the compile-time prescribed size. This is where we handle unaligned
//    boundaries: if we don't have a complete block of source data, then
//    we copy and zero-extend it into a local temporary (complete_src_),
//    see MakeCompleteSrc. In the generic case, we do have a complete block,
//    so we just use it in-place, see UseCompleteSrcInPlace.
// 2. Packing a complete block into the destination, see Pack. This is the
//    most critical part, so it's convenient that unaligned boundaries have
//    already been handled in step 1.
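//
// For instance, PackRun() further below drives these two steps roughly like
// this (simplified sketch; 'w' and 'd' stand for the current width/depth
// offsets, and the remaining_* names are illustrative):
//
//   PackingRegisterBlockType b;
//   if (/* a complete kKernelWidth x kRegisterSize source block remains */) {
//     b.UseCompleteSrcInPlace(
//         src_map.block(w, d, kKernelWidth, kRegisterSize));
//   } else {
//     b.MakeCompleteSrc(
//         src_map.block(w, d, remaining_width, remaining_depth));
//   }
//   b.Pack(packed_side_block, w);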
template <typename SrcMapType, typename PackedSideBlock>
class PackingRegisterBlockBase {
 public:
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  typedef typename KernelSideFormat::Scalar KernelScalar;
  static const int kCells = KernelSideFormat::kCells;
  static const int kCellWidth = CellFormat::kWidth;
  static const int kKernelWidth = CellFormat::kWidth * kCells;
  static const int kCellDepth = CellFormat::kDepth;
  static const int kCellSize = CellFormat::kSize;
  static const SideMapOrder kSrcOrder = SrcMapType::kOrder;
  static const int kZeroPointInputValue =
      ZeroPointInputValue<KernelScalar>::kValue;

  PackingRegisterBlockBase() : complete_src_(nullptr, 0, 0, 0) {}

 protected:
  // The source data that's ready for packing. May point in-place to the
  // actual source data if it's already a complete block
  // (see UseCompleteSrcInPlace),
  // or to the local buf_ below, into which we copy incomplete blocks
  // (see MakeCompleteSrc).
  SrcMapType complete_src_;

  // Temporary buffer for loading incomplete blocks into,
  // in the source storage order.
  std::uint8_t buf_[kKernelWidth * kRegisterSize];

 public:
  // Selects a block of in-place source data that's already a complete block.
  void UseCompleteSrcInPlace(const SrcMapType& src) { complete_src_ = src; }
  // Copies an incomplete block of source data into a local temporary
  // complete block by zero-extending it.
  void MakeCompleteSrc(const SrcMapType& src) {
    memset(buf_, kZeroPointInputValue, kKernelWidth * kRegisterSize);
    if (kSrcOrder == SideMapOrder::WidthMajor) {
      for (int w = 0; w < src.width(); w++) {
        memcpy(buf_ + w * kRegisterSize, src.data(w, 0), src.depth());
      }
    } else {
      assert(kSrcOrder == SideMapOrder::DepthMajor);
      for (int d = 0; d < src.depth(); d++) {
        memcpy(buf_ + d * kKernelWidth, src.data(0, d), src.width());
      }
    }
    complete_src_ = SrcMapType(buf_, kKernelWidth, kRegisterSize);
  }
  // Packs a complete block into the destination. This is the most
  // critical part and the part that we most typically want to
  // override in architecture-specific optimized specializations.
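  //
  // A worked example of the arithmetic below (assuming the
  // ZeroPointInputValue values from kernel.h: 0 for uint8 kernel formats,
  // 128 for int8 ones): with an int8 kernel, a source value of 131 is stored
  // as the wrapped uint8 value 3, which the kernel will reinterpret as the
  // int8 value 3 (i.e. the source value minus 128), and it contributes 3 to
  // the corresponding slice sum. With a uint8 kernel, values are stored
  // unchanged.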
  void Pack(PackedSideBlock* dst, int start_width) {
    std::uint8_t* dst_ptr = dst->current_data();
    for (int cell_start_depth = 0; cell_start_depth < kRegisterSize;
         cell_start_depth += kCellDepth) {
      for (int cell_start_width = 0; cell_start_width < kKernelWidth;
           cell_start_width += kCellWidth) {
        std::int32_t* cell_sums_of_each_slice_ptr =
            dst->sums_of_each_slice() + start_width + cell_start_width;
        const SideMap<const std::uint8_t, kSrcOrder> src_cell_map(
            complete_src_.block(cell_start_width, cell_start_depth, kCellWidth,
                                kCellDepth));
        for (int w = 0; w < kCellWidth; w++) {
          std::int32_t sum = 0;
          for (int d = 0; d < kCellDepth; d++) {
            const std::uint8_t src_val = src_cell_map(w, d);
            const std::int16_t kernel_val_unwrapped =
                src_val - kZeroPointInputValue;
            const std::uint8_t kernel_val_uint8 = kernel_val_unwrapped;
            dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = kernel_val_uint8;
            sum += kernel_val_unwrapped;
          }
          cell_sums_of_each_slice_ptr[w] += sum;
        }
        dst_ptr += kCellSize;
      }
    }
    dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth);
  }
};

template <typename SrcMapType, typename PackedSideBlock>
class PackingRegisterBlock
    : public PackingRegisterBlockBase<SrcMapType, PackedSideBlock> {};

// Large-scale implementation of packing.
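// PackL2() below iterates over L1-sized sub-blocks, PackL1() iterates over
// kernel-width-sized runs within them, and PackRun() hands register-sized
// blocks of source data to the PackingRegisterBlock above.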
template <typename SrcMapType, typename PackedSideBlock>
class PackSideBlockImpl {
 public:
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  static const int kCells = KernelSideFormat::kCells;
  static const int kCellWidth = CellFormat::kWidth;
  static const int kKernelWidth = CellFormat::kWidth * kCells;
  static const int kCellDepth = CellFormat::kDepth;

  typedef PackingRegisterBlock<SrcMapType, PackedSideBlock>
      PackingRegisterBlockType;

  PackSideBlockImpl(PackedSideBlock* packed_side_block,
                    const SrcMapType& src_map)
      : packed_side_block_(packed_side_block), src_map_(src_map) {}

  PackedSideBlock* packed_side_block() const { return packed_side_block_; }

  const SrcMapType& src_map() const { return src_map_; }

  // The public entry point to pack a block.
  void PackL2() {
    memset(packed_side_block_->sums_of_each_slice(), 0,
           sizeof(std::int32_t) * packed_side_block_->params().l2_width);
    for (int d = 0; d < src_map_.depth();
         d += packed_side_block_->params().l1_depth) {
      int ds = std::min<int>(packed_side_block_->params().l1_depth,
                             src_map_.depth() - d);

      for (int w = 0; w < src_map_.width();
           w += packed_side_block_->params().l1_width) {
        int ws = std::min<int>(packed_side_block_->params().l1_width,
                               src_map_.width() - w);

        PrefetchL1(w, ws, d, ds);
        PackL1(w, ws, d, ds);
      }
    }
  }

 protected:
  // The intermediate-level loops, between PackL2 and PackRun.
  void PackL1(int start_width, int width, int start_depth, int depth) {
    for (int w = 0; w < width; w += kKernelWidth) {
      int ws = std::min(+kKernelWidth, width - w);
      packed_side_block_->seek_run(start_width + w, start_depth);
      PackRun(start_width + w, ws, start_depth, depth);
    }
  }

  // Prefetches the data that will be read by PackL1.
  void PrefetchL1(int start_width, int width, int start_depth, int depth) {
    if (SrcMapType::kOrder == SideMapOrder::WidthMajor) {
      for (int d = 0; d < depth; d += kDefaultCacheLineSize) {
        for (int w = 0; w < width; w += 1) {
          Prefetch(src_map_.data(start_width + w, start_depth + d));
        }
      }
    } else {
      for (int d = 0; d < depth; d++) {
        for (int w = 0; w < width; w += kDefaultCacheLineSize) {
          Prefetch(src_map_.data(start_width + w, start_depth + d));
        }
      }
    }
  }

  // PackRun packs only a run, i.e. it is the inner loop in the depth
  // dimension.
  void PackRun(int start_width, int width, int start_depth, int depth) {
    PackingRegisterBlockType b;
    if (width == kKernelWidth) {
      const int register_aligned_depth = RoundDown<kRegisterSize>(depth);
      if (register_aligned_depth) {
        for (int d = 0; d < register_aligned_depth; d += kRegisterSize) {
          b.UseCompleteSrcInPlace(src_map_.block(start_width, start_depth + d,
                                                 width, kRegisterSize));
          b.Pack(packed_side_block_, start_width);
        }
      }
      if (register_aligned_depth < depth) {
        b.MakeCompleteSrc(
            src_map_.block(start_width, start_depth + register_aligned_depth,
                           width, depth - register_aligned_depth));
        b.Pack(packed_side_block_, start_width);
      }
    } else {
      assert(width < kKernelWidth);
      for (int d = 0; d < depth; d += kRegisterSize) {
        const int ds = std::min(+kRegisterSize, depth - d);
        b.MakeCompleteSrc(
            src_map_.block(start_width, start_depth + d, width, ds));
        b.Pack(packed_side_block_, start_width);
      }
    }
  }

  // The PackedSideBlock being packed, i.e. the 'destination'.
  PackedSideBlock* const packed_side_block_;

  // A map on the block of the original matrix being packed,
  // i.e. the 'source'.
  const SrcMapType& src_map_;
};

// Packs a block of the input LHS matrix into a PackedSideBlock.
template <typename PackedSideBlock, typename MatrixMapType>
void PackLhs(PackedSideBlock* dst, const MatrixMapType& src) {
  ScopedProfilingLabel label("pack LHS");
  static const SideMapOrder kSideMapOrder =
      MatrixMapType::kOrder == MapOrder::RowMajor ? SideMapOrder::WidthMajor
                                                  : SideMapOrder::DepthMajor;
  typedef typename MatrixMapType::Scalar Scalar;
  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
  SideMapType src_side_map(src.data(), src.rows(), src.cols(), src.stride());
  typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
  ImplType impl(dst, src_side_map);
  impl.PackL2();
}

// Packs a block of the input RHS matrix into a PackedSideBlock.
template <typename PackedSideBlock, typename MatrixMapType>
void PackRhs(PackedSideBlock* dst, const MatrixMapType& src) {
  ScopedProfilingLabel label("pack RHS");
  static const SideMapOrder kSideMapOrder =
      MatrixMapType::kOrder == MapOrder::ColMajor ? SideMapOrder::WidthMajor
                                                  : SideMapOrder::DepthMajor;
  typedef typename MatrixMapType::Scalar Scalar;
  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
  SideMapType src_side_map(src.data(), src.cols(), src.rows(), src.stride());
  typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
  ImplType impl(dst, src_side_map);
  impl.PackL2();
}
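
// A hedged usage sketch (illustrative names; the real call sites live in the
// GEMM drivers): given MatrixMap blocks of the original matrices and a
// PackedSideBlock for each side, as constructed above,
//
//   PackLhs(&packed_lhs, lhs.block(start_row, 0, block_rows, depth));
//   PackRhs(&packed_rhs, rhs.block(0, start_col, depth, block_cols));
//
// after which packed_lhs and packed_rhs are laid out as the kernel expects.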

}  // namespace gemmlowp

#ifdef GEMMLOWP_NEON
#include "pack_neon.h"
#elif defined(GEMMLOWP_SSE4)
#include "pack_sse.h"
#elif defined(GEMMLOWP_MSA)
#include "pack_msa.h"
#endif

#endif  // GEMMLOWP_INTERNAL_PACK_H_