// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// pack.h: packing blocks of the LHS and RHS into the data layout
// that is expected by compute.h and eventually by kernels.
// Because this data layout depends on the kernel format, code here
// is templated on KernelLhsFormat/KernelRhsFormat.
//
// Reader's note: an important theme around here is that we try hard
// to handle both the Lhs and Rhs with a single piece of code, so we
// refer to either of them generically as a 'Side'. Instead of addressing
// matrices by (row, column) indices, we address them by (width, depth),
// as explained in kernel.h. This lets us treat the Lhs and Rhs on an
// equal footing, with a single code path.
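// For instance, in a product Lhs (rows x depth) times Rhs (depth x cols),
// the 'width' of the Lhs is its number of rows, the 'width' of the Rhs is
// its number of columns, and 'depth' is the common dimension being summed
// over.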

#ifndef GEMMLOWP_INTERNAL_PACK_H_
#define GEMMLOWP_INTERNAL_PACK_H_

#include <cstring>

#include "../public/bit_depth.h"
#include "allocator.h"
#include "block_params.h"
#include "common.h"
#include "kernel.h"

namespace gemmlowp {

// A PackedSideBlock instance is a packed block of either the LHS or RHS
// (whence the generic 'Side' name).
//
// 'Packed' means that it is laid out in the storage order that
// is expected by the specified kernel format. From a block of the input
// LHS or RHS matrix, one obtains a PackedSideBlock by calling PackLhs()
// or PackRhs().
template <typename tKernelSideFormat>
class PackedSideBlock {
 public:
  typedef tKernelSideFormat KernelSideFormat;

  PackedSideBlock(Side side, Allocator* allocator,
                  const BlockParams& block_params)
      : allocator_(allocator),
        pos_(0) {
    GetSideBlockParams(side, &params_, block_params);
    data_handle_ =
        allocator_->Reserve<std::uint8_t>(params_.l2_width * params_.l2_depth);
    sums_of_each_slice_handle_ =
        allocator_->Reserve<std::int32_t>(params_.l2_width);
  }

  ~PackedSideBlock() {}

  void seek_run(int start_width, int start_depth) const {
    int kernel_run_depth =
        std::min<int>(params_.l1_depth, params_.l2_depth - start_depth);
    pos_ = params_.l2_width * start_depth + start_width * kernel_run_depth;
  }

  void seek_next_cell() const { pos_ += KernelSideFormat::Cell::kSize; }

  void seek_forward_n_cells(int n) const {
    pos_ += n * KernelSideFormat::Cell::kSize;
  }

  const std::uint8_t* current_data() const {
    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
  }

  std::uint8_t* current_data() {
    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
  }

  std::int32_t* sums_of_each_slice() {
    return allocator_->GetPointer<std::int32_t>(sums_of_each_slice_handle_);
  }

  const std::int32_t* sums_of_each_slice() const {
    return allocator_->GetPointer<const std::int32_t>(
        sums_of_each_slice_handle_);
  }

  const SideBlockParams& params() const { return params_; }

 private:
  // The block size parameters that this PackedSideBlock follows.
  // The L2 parameters determine its overall size, while the L1 parameters,
  // together with the kernel format template parameter, determine
  // the fine details of the storage/traversal order.
  SideBlockParams params_;

  // Pointer to the allocator provided by the caller. Not owned.
  // The Allocator is assumed to outlive the PackedSideBlock.
  Allocator* const allocator_;

  // Handle on the buffer backing this packed block. Owned.
  Allocator::Handle data_handle_;

  // Handle on the additional buffer backing the vector of sums of slices
  // associated with this block. Owned.
  Allocator::Handle sums_of_each_slice_handle_;

  // pos_ is the current position in the buffer, which we access
  // sequentially, like a file.
  // The idea is that we pack data in the same order as it is
  // going to be traversed during the computation. For cache-friendliness
  // reasons, that order is awkward to random-access, as the offset
  // calculations would be intricate. So we give up random-access
  // addressing and instead content ourselves with sequential access.
  //
  // pos_ is mutable because during the computation we will want to
  // be able to iterate on the data in a const PackedSideBlock.
  mutable int pos_;
};
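
// A minimal usage sketch (not part of the library), assuming an already
// constructed Allocator 'allocator', a BlockParams 'block_params', a Side
// value 'side', and the Reserve/Commit workflow from allocator.h;
// 'SomeKernelSideFormat' is a placeholder for a real kernel side format:
//
//   PackedSideBlock<SomeKernelSideFormat> packed(side, &allocator,
//                                                block_params);
//   allocator.Commit();  // makes the Reserve()'d buffers available
//   // ... packing fills the buffer through current_data() ...
//   packed.seek_run(0, 0);                     // rewind to the first run
//   const std::uint8_t* cell = packed.current_data();
//   packed.seek_next_cell();                   // advance by one cell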

// WidthMajor and DepthMajor are custom phrases modelled after the
// standard terminology 'row-major' and 'column-major'. Their meaning
// should be transparent once one has read the explanation in kernel.h:
// for example, in the Lhs, the 'width' dimension is the rows dimension,
// so there WidthMajor means RowMajor, while in the Rhs it is the opposite.
// Another way to put it: WidthMajor means that contiguous storage is used
// for entries having the same 'width' index.
enum class SideMapOrder { WidthMajor, DepthMajor };
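
// For instance, a WidthMajor SideMap over an Lhs block is laid out like a
// RowMajor matrix: entry (w, d) lives at offset w * stride + d, whereas in
// a DepthMajor map it lives at offset d * stride + w (see width_stride()
// and depth_stride() in SideMap below).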

// Similar to MatrixMap from map.h, but in terms of width/depth instead of
// rows/columns. Used to address blocks of the input LHS/RHS matrices when
// packing them.
template <typename tScalar, SideMapOrder tOrder>
class SideMap {
 public:
  typedef tScalar Scalar;
  static const SideMapOrder kOrder = tOrder;

  SideMap(Scalar* data, int width, int depth, int stride)
      : data_(data), width_(width), depth_(depth), stride_(stride) {}

  SideMap(Scalar* data, int width, int depth)
      : data_(data), width_(width), depth_(depth) {
    stride_ = kOrder == SideMapOrder::WidthMajor ? depth_ : width_;
  }

  SideMap(const SideMap& other)
      : data_(other.data_),
        width_(other.width_),
        depth_(other.depth_),
        stride_(other.stride_) {}

  int width() const { return width_; }
  int depth() const { return depth_; }
  int stride() const { return stride_; }
  int width_stride() const {
    return kOrder == SideMapOrder::DepthMajor ? 1 : stride_;
  }
  int depth_stride() const {
    return kOrder == SideMapOrder::WidthMajor ? 1 : stride_;
  }
  Scalar* data() const { return data_; }
  Scalar* data(int w, int d) const {
    return data_ + w * width_stride() + d * depth_stride();
  }
  Scalar operator()(int w, int d) const { return *data(w, d); }
  Scalar& operator()(int w, int d) { return *data(w, d); }

  SideMap block(int start_width, int start_depth, int block_width,
                int block_depth) const {
    assert(start_width >= 0);
    assert(start_width + block_width <= width_);
    assert(start_depth >= 0);
    assert(start_depth + block_depth <= depth_);

    return SideMap(data(start_width, start_depth), block_width, block_depth,
                   stride_);
  }

 private:
  Scalar* data_;  // not owned.
  int width_, depth_, stride_;
};
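
// A brief note on usage (not part of the library): given a SideMap 'src',
// src.block(w0, d0, bw, bd) is a bw x bd view starting at (w0, d0); it
// shares the same underlying storage and stride, so taking a block copies
// no data. The packing code below relies on this to walk the source matrix.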

template <RoundingMode tRoundingMode>
class ScalarRoundingOffsetGenerator {
 public:
  std::uint8_t get() {
    assert(false);  // This generic path should never be called.
    return 0;
  }
};

// A RoundingOffsetGenerator for rounding-to-nearest, always returning
// the midpoint value 127.
template <>
class ScalarRoundingOffsetGenerator<RoundingMode::Nearest> {
 public:
  std::uint8_t get() { return 127; }
};

// A RoundingOffsetGenerator based on an 8-bit Xorshift.
// This gives good results as Xorshift naturally generates
// uniform random *nonzero* bytes, i.e. 255 different values,
// so it only remains for us to subtract one.
template <>
class ScalarRoundingOffsetGenerator<RoundingMode::ProbabilisticXorshift> {
 public:
  ScalarRoundingOffsetGenerator() { x_ = 128; }

  std::uint8_t get() {
    std::uint8_t result = x_ - 1;
    // Xorshift8(7,5,3)
    x_ ^= x_ << 7;
    x_ ^= x_ >> 5;
    x_ ^= x_ << 3;
    return result;
  }

 private:
  // State
  std::uint8_t x_;
};

// A RoundingOffsetGenerator based on an 8-bit add/mod
// low-discrepancy sequence. See less-than-8-bit.txt for
// an explanation. The constant 97 is important: it must be
// relatively prime to 255 so that the sequence has full period,
// and 97/255 is close to 0.38, which gives low discrepancy.
// Uses a small bit hack to avoid expensive % operations.
template <>
class ScalarRoundingOffsetGenerator<RoundingMode::ProbabilisticAddmod> {
  static const std::uint8_t AddConst = 97;

 public:
  ScalarRoundingOffsetGenerator() { x_ = 1; }  // Start must be non-zero

  std::uint8_t get() {
    // The +'d boolean term causes the increment to skip over 255
    // (recalling that 255 + 1 = 256 = 0 for an 8-bit uint),
    // thus implementing % 255.
    x_ += (AddConst + (x_ >= (255 - AddConst)));
    return x_;
  }

 private:
  // State
  std::uint8_t x_;
};
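
// As a worked example of the add/mod sequence above, starting from x_ = 1,
// the first few returned offsets are 98, 195, 37, 134, 231, 73, ...
// For instance, the third step is 195 + 97 = 292, and 292 mod 255 = 37,
// which the bit hack reproduces as (195 + 98) mod 256 = 37.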

// Requantizes a source uint8 value in the [0..255] range to the range
// specified by BitDepth, [0..((2^bits)-1)]. Bias must be avoided; currently
// this is achieved by probabilistic rounding.
template <typename QuantizationParams>
std::uint8_t Requantize(
    std::uint8_t raw_src_val,
    ScalarRoundingOffsetGenerator<QuantizationParams::kRoundingMode>*
        rounding_offset_generator) {
  static const int kBits = QuantizationParams::BitDepth::kBits;
  static const std::uint8_t kMaxVal = (1 << kBits) - 1;

  if (kBits == 8) {
    return raw_src_val;
  }

  std::uint16_t scaled = static_cast<std::uint16_t>(raw_src_val) * kMaxVal;
  std::uint8_t rounding_offset = rounding_offset_generator->get();
  return (scaled + rounding_offset) / 255;
}
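
// A worked example of the arithmetic above, assuming a hypothetical 4-bit
// BitDepth (kBits = 4, kMaxVal = 15) and raw_src_val = 200:
//   scaled = 200 * 15 = 3000, and (3000 + rounding_offset) / 255 is 11 for
//   offsets 0..59 and 12 for offsets 60..254. With offsets uniform in
//   [0..254], the expected result is (60 * 11 + 195 * 12) / 255 = 3000 / 255,
//   i.e. exactly 200 * 15 / 255 ~= 11.76: the requantization is unbiased.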

// A PackingRegisterBlock is a small fixed-size block of a matrix being
// packed. This class is the generic non-optimized base implementation;
// it is inherited by the generic PackingRegisterBlock below, which may be
// overridden by template specialization. Overriding it is how one may
// provide optimized packing code paths.
//
// The packing of a block proceeds in two steps:
//   1. Ensuring that we have a complete block of source data, i.e. a block of
//      the compile-time prescribed size. This is where we handle unaligned
//      boundaries: if we don't have a complete block of source data, then
//      we copy and zero-extend it into a local temporary (complete_src_),
//      see MakeCompleteSrc. In the generic case, we do have a complete block,
//      so we just use it in-place, see UseCompleteSrcInPlace.
//   2. Packing a complete block into the destination, see Pack. This is the
//      most critical part, so it's convenient that unaligned boundaries have
//      already been handled in step 1.
template <typename QuantizationParams, typename SrcMapType,
          typename PackedSideBlock>
class PackingRegisterBlockBase {
 public:
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  static const int kCells = KernelSideFormat::kCells;
  static const int kCellWidth = CellFormat::kWidth;
  static const int kKernelWidth = CellFormat::kWidth * kCells;
  static const int kCellDepth = CellFormat::kDepth;
  static const int kCellSize = CellFormat::kSize;
  static const SideMapOrder kSrcOrder = SrcMapType::kOrder;

  typedef ScalarRoundingOffsetGenerator<QuantizationParams::kRoundingMode>
      RoundingOffsetGenerator;

  PackingRegisterBlockBase() : complete_src_(nullptr, 0, 0, 0) {}

 protected:
  // The source data that's ready for packing. May point in-place to the
  // actual source data if it's already a complete block
  // (see UseCompleteSrcInPlace), or to the local buf_ below, into which we
  // copy incomplete blocks (see MakeCompleteSrc).
  SrcMapType complete_src_;

  // Temporary buffer for loading incomplete blocks to,
  // in the source storage order
  std::uint8_t buf_[kKernelWidth * kRegisterSize];

 public:
  // Selects a block of in-place source data that's already a complete block.
  void UseCompleteSrcInPlace(const SrcMapType& src) { complete_src_ = src; }
  // Copies an incomplete block of source data into a local temporary
  // complete block by zero-extending it.
  void MakeCompleteSrc(const SrcMapType& src) {
    memset(buf_, 0, kKernelWidth * kRegisterSize);
    if (kSrcOrder == SideMapOrder::WidthMajor) {
      for (int w = 0; w < src.width(); w++) {
        memcpy(buf_ + w * kRegisterSize, src.data(w, 0), src.depth());
      }
    } else {
      assert(kSrcOrder == SideMapOrder::DepthMajor);
      for (int d = 0; d < src.depth(); d++) {
        memcpy(buf_ + d * kKernelWidth, src.data(0, d), src.width());
      }
    }
    complete_src_ = SrcMapType(buf_, kKernelWidth, kRegisterSize);
  }
  // Packs a complete block into the destination. This is the most
  // critical part and the part that we most typically want to
  // override in architecture-specific optimized specializations.
  void Pack(PackedSideBlock* dst, int start_width,
            RoundingOffsetGenerator* rounding_offset_generator) {
    std::uint8_t* dst_ptr = dst->current_data();
    for (int cell_start_depth = 0; cell_start_depth < kRegisterSize;
         cell_start_depth += kCellDepth) {
      for (int cell_start_width = 0; cell_start_width < kKernelWidth;
           cell_start_width += kCellWidth) {
        std::int32_t* cell_sums_of_each_slice_ptr =
            dst->sums_of_each_slice() + start_width + cell_start_width;
        const SideMap<const std::uint8_t, kSrcOrder> src_cell_map(
            complete_src_.block(cell_start_width, cell_start_depth, kCellWidth,
                                kCellDepth));
        for (int w = 0; w < kCellWidth; w++) {
          std::int32_t sum = 0;
          for (int d = 0; d < kCellDepth; d++) {
            const std::uint8_t raw_src_val = src_cell_map(w, d);
            const std::uint8_t requantized = Requantize<QuantizationParams>(
                raw_src_val, rounding_offset_generator);
            dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = requantized;
            sum += requantized;
          }
          cell_sums_of_each_slice_ptr[w] += sum;
        }
        dst_ptr += kCellSize;
      }
    }
    dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth);
  }
};

template <typename QuantizationParams, typename SrcMapType,
          typename PackedSideBlock>
class PackingRegisterBlock
    : public PackingRegisterBlockBase<QuantizationParams, SrcMapType,
                                      PackedSideBlock> {};
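
// A sketch (not part of the library) of how an optimized path hooks in: an
// architecture-specific header (e.g. pack_neon.h, included at the bottom of
// this file) partially specializes PackingRegisterBlock for the source
// orders and kernel formats it supports, and overrides Pack() with a SIMD
// implementation of the same loop. The template arguments below, including
// the hypothetical 'WidthMajorUint8SideMap' typedef, are assumptions for
// illustration only:
//
//   template <typename QuantizationParams, typename PackedSideBlock>
//   class PackingRegisterBlock<QuantizationParams, WidthMajorUint8SideMap,
//                              PackedSideBlock>
//       : public PackingRegisterBlockBase<QuantizationParams,
//                                         WidthMajorUint8SideMap,
//                                         PackedSideBlock> {
//    public:
//     void Pack(PackedSideBlock* dst, int start_width,
//               RoundingOffsetGenerator* rounding_offset_generator) {
//       // SIMD version of the generic Pack() loop above.
//     }
//   };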

// Large-scale implementation of packing.
template <typename QuantizationParams, typename SrcMapType,
          typename PackedSideBlock>
class PackSideBlockImpl {
 public:
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  static const int kCells = KernelSideFormat::kCells;
  static const int kCellWidth = CellFormat::kWidth;
  static const int kKernelWidth = CellFormat::kWidth * kCells;
  static const int kCellDepth = CellFormat::kDepth;

  typedef PackingRegisterBlock<QuantizationParams, SrcMapType, PackedSideBlock>
      PackingRegisterBlockType;
  typedef typename PackingRegisterBlockType::RoundingOffsetGenerator
      RoundingOffsetGenerator;

  PackSideBlockImpl(PackedSideBlock* packed_side_block,
                    const SrcMapType& src_map)
      : packed_side_block_(packed_side_block), src_map_(src_map) {}

  PackedSideBlock* packed_side_block() const { return packed_side_block_; }

  const SrcMapType& src_map() const { return src_map_; }

  // The public entry point to pack a block.
  void PackL2() {
    memset(packed_side_block_->sums_of_each_slice(), 0,
           sizeof(std::int32_t) * packed_side_block_->params().l2_width);
    for (int d = 0; d < src_map_.depth();
         d += packed_side_block_->params().l1_depth) {
      int ds = std::min<int>(packed_side_block_->params().l1_depth,
                             src_map_.depth() - d);

      for (int w = 0; w < src_map_.width();
           w += packed_side_block_->params().l1_width) {
        int ws = std::min<int>(packed_side_block_->params().l1_width,
                               src_map_.width() - w);

        PrefetchL1(w, ws, d, ds);
        PackL1(w, ws, d, ds);
      }
    }
  }

 protected:
  // The intermediate-level loops, between PackL2 and PackRun.
  void PackL1(int start_width, int width, int start_depth, int depth) {
    for (int w = 0; w < width; w += kKernelWidth) {
      int ws = std::min(+kKernelWidth, width - w);
      packed_side_block_->seek_run(start_width + w, start_depth);
      PackRun(start_width + w, ws, start_depth, depth);
    }
  }

  // Prefetches the data that will be read by PackL1
  void PrefetchL1(int start_width, int width, int start_depth, int depth) {
    if (SrcMapType::kOrder == SideMapOrder::WidthMajor) {
      for (int d = 0; d < depth; d += kDefaultCacheLineSize) {
        for (int w = 0; w < width; w += 1) {
          Prefetch(src_map_.data(start_width + w, start_depth + d));
        }
      }
    } else {
      for (int d = 0; d < depth; d++) {
        for (int w = 0; w < width; w += kDefaultCacheLineSize) {
          Prefetch(src_map_.data(start_width + w, start_depth + d));
        }
      }
    }
  }

  // PackRun packs only a run, i.e. it is the inner loop over the depth
  // dimension.
  void PackRun(int start_width, int width, int start_depth, int depth) {
    PackingRegisterBlockType b;
    if (width == kKernelWidth) {
      const int register_aligned_depth = RoundDown<kRegisterSize>(depth);
      if (register_aligned_depth) {
        for (int d = 0; d < register_aligned_depth; d += kRegisterSize) {
          b.UseCompleteSrcInPlace(src_map_.block(start_width, start_depth + d,
                                                 width, kRegisterSize));
          b.Pack(packed_side_block_, start_width, &rounding_offset_generator_);
        }
      }
      if (register_aligned_depth < depth) {
        b.MakeCompleteSrc(
            src_map_.block(start_width, start_depth + register_aligned_depth,
                           width, depth - register_aligned_depth));
        b.Pack(packed_side_block_, start_width, &rounding_offset_generator_);
      }
    } else {
      assert(width < kKernelWidth);
      for (int d = 0; d < depth; d += kRegisterSize) {
        const int ds = std::min(+kRegisterSize, depth - d);
        b.MakeCompleteSrc(
            src_map_.block(start_width, start_depth + d, width, ds));
        b.Pack(packed_side_block_, start_width, &rounding_offset_generator_);
      }
    }
  }

  // The PackedSideBlock being packed, i.e. the 'destination'.
  PackedSideBlock* const packed_side_block_;

  // A map over the block of the original matrix being packed,
  // i.e. the 'source'.
  const SrcMapType& src_map_;

  // Used for requantization in the less-than-8-bit case.
  // Otherwise unused.
  RoundingOffsetGenerator rounding_offset_generator_;
};

// Quantization parameters for the side (LHS or RHS) being packed,
// with the rounding strategy having been already resolved to a specific
// rounding mode.
template <typename tBitDepth, RoundingMode tRoundingMode>
struct QuantizationParams {
  typedef tBitDepth BitDepth;
  static const RoundingMode kRoundingMode = tRoundingMode;
};
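
// For example, packing a side at 5-bit depth with the add/mod rounding mode
// would be expressed as something like the following (assuming the
// BitDepth<N> helper from ../public/bit_depth.h):
//
//   typedef QuantizationParams<BitDepth<5>,
//                              RoundingMode::ProbabilisticAddmod> QParams;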

// Packs a block of the input LHS matrix into a PackedSideBlock.
template <typename BitDepthParams, typename PackedSideBlock,
          typename MatrixMapType>
void PackLhs(PackedSideBlock* dst, const MatrixMapType& src) {
  ScopedProfilingLabel label("pack LHS");
  static const SideMapOrder kSideMapOrder =
      MatrixMapType::kOrder == MapOrder::RowMajor ? SideMapOrder::WidthMajor
                                                  : SideMapOrder::DepthMajor;
  typedef typename MatrixMapType::Scalar Scalar;
  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
  SideMapType src_side_map(src.data(), src.rows(), src.cols(), src.stride());
  typedef typename BitDepthParams::LhsBitDepth BitDepth;
  typedef typename BitDepthParams::RoundingStrategy RoundingStrategy;
  const int accumulation_depth = src_side_map.depth();
  if (accumulation_depth < RoundingStrategy::kRoundingModeSizeThreshold) {
    typedef QuantizationParams<BitDepth,
                               RoundingStrategy::kRoundingModeForSmallSizes>
        QParams;
    typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
    ImplType impl(dst, src_side_map);
    impl.PackL2();
  } else {
    typedef QuantizationParams<BitDepth,
                               RoundingStrategy::kRoundingModeForLargeSizes>
        QParams;
    typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
    ImplType impl(dst, src_side_map);
    impl.PackL2();
  }
}
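
// A minimal call-site sketch (not part of the library), assuming a MatrixMap
// 'lhs' over the LHS matrix and a previously constructed PackedSideBlock
// 'packed_lhs' (see the sketch near the top of this file); 'r' and 'rs' are
// assumed block coordinates:
//
//   PackLhs<BitDepthParams>(&packed_lhs, lhs.block(r, 0, rs, depth));
//
// PackRhs below is used symmetrically, with the roles of rows and columns
// swapped.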

// Packs a block of the input RHS matrix into a PackedSideBlock.
template <typename BitDepthParams, typename PackedSideBlock,
          typename MatrixMapType>
void PackRhs(PackedSideBlock* dst, const MatrixMapType& src) {
  ScopedProfilingLabel label("pack RHS");
  static const SideMapOrder kSideMapOrder =
      MatrixMapType::kOrder == MapOrder::ColMajor ? SideMapOrder::WidthMajor
                                                  : SideMapOrder::DepthMajor;
  typedef typename MatrixMapType::Scalar Scalar;
  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
  SideMapType src_side_map(src.data(), src.cols(), src.rows(), src.stride());
  typedef typename BitDepthParams::RhsBitDepth BitDepth;
  typedef typename BitDepthParams::RoundingStrategy RoundingStrategy;
  const int accumulation_depth = src_side_map.depth();
  if (accumulation_depth < RoundingStrategy::kRoundingModeSizeThreshold) {
    typedef QuantizationParams<BitDepth,
                               RoundingStrategy::kRoundingModeForSmallSizes>
        QParams;
    typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
    ImplType impl(dst, src_side_map);
    impl.PackL2();
  } else {
    typedef QuantizationParams<BitDepth,
                               RoundingStrategy::kRoundingModeForLargeSizes>
        QParams;
    typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
    ImplType impl(dst, src_side_map);
    impl.PackL2();
  }
}

}  // namespace gemmlowp

#ifdef GEMMLOWP_NEON
#include "pack_neon.h"
#elif defined(GEMMLOWP_SSE4)
#include "pack_SSE.h"
#endif

#endif  // GEMMLOWP_INTERNAL_PACK_H_