// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// single_thread_gemm.h: Single-threaded GEMM implementation.
// This is a good place to start reading code, as it shows the overall
// structure of a GEMM and is much simpler than multi_thread_gemm.h.

#ifndef GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_
#define GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_

#include <algorithm>  // for std::min
#include <cassert>

#include "../public/map.h"
#include "allocator.h"
#include "compute.h"
#include "kernel.h"
#include "pack.h"
#include "unpack.h"

#ifdef GEMMLOWP_PROFILING_SIZES
#ifndef GEMMLOWP_PROFILING
#error GEMMLOWP_PROFILING_SIZES without GEMMLOWP_PROFILING
#endif
#include <cstdint>  // for std::uint64_t
#include <cstdio>   // for snprintf
#include <string>
#include <unordered_map>
#endif

namespace gemmlowp {

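// Context for a single-threaded GEMM. It owns the Allocator used for the
// packed blocks and the cache-size parameters that drive the choice of
// block sizes.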
class SingleThreadGemmContext {
 public:
  Allocator* allocator() { return &allocator_; }

  void set_l1_bytes_to_use(int n) { l1_bytes_to_use_ = n; }
  void set_l2_bytes_to_use(int n) { l2_bytes_to_use_ = n; }
  void set_l2_rhs_factor(float n) { l2_rhs_factor_ = n; }

  int l1_bytes_to_use() const { return l1_bytes_to_use_; }
  int l2_bytes_to_use() const { return l2_bytes_to_use_; }
  float l2_rhs_factor() const { return l2_rhs_factor_; }

 protected:
  Allocator allocator_;

  // The cache configuration to use.
  int l1_bytes_to_use_ = kDefaultL1CacheSize;
  int l2_bytes_to_use_ = kDefaultL2CacheSize;
  float l2_rhs_factor_ = kDefaultL2RhsFactor;
};

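// Single-threaded GEMM. The overall structure is:
//   1. Choose l1/l2 block sizes for the given (rows, cols, depth) shape.
//   2. Pack blocks of the LHS and RHS into contiguous storage.
//   3. Run the kernel on each pair of packed blocks (Compute).
//   4. Unpack the accumulators into the result, through the output pipeline.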
template <typename KernelFormat, typename InputScalar, typename OutputScalar,
          typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder,
          MapOrder ResultOrder, typename LhsOffset, typename RhsOffset,
          typename OutputPipelineType>
void SingleThreadGemm(SingleThreadGemmContext* context,
                      const KernelBase& kernel,
                      const MatrixMap<const InputScalar, LhsOrder>& lhs,
                      const MatrixMap<const InputScalar, RhsOrder>& rhs,
                      MatrixMap<OutputScalar, ResultOrder>* result,
                      const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
                      const OutputPipelineType& output_pipeline) {
  ScopedProfilingLabel label("gemmlowp::SingleThreadGemm");

  assert(lhs.cols() == rhs.rows());

  int rows = result->rows();
  int cols = result->cols();
  int depth = lhs.cols();

  // zero sizes should have been caught earlier and early-returned.
  assert(rows > 0);
  assert(cols > 0);
  assert(depth > 0);

  // The case of rows<cols should have been caught earlier and transposed.
  assert(rows >= cols);

  Allocator* allocator = context->allocator();

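  // Choose the l1/l2 block sizes for this (rows, cols, depth) shape from the
  // context's cache-size settings.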
  BlockParams block_params;
  block_params.Init<KernelFormat>(rows, cols, depth, 1,
                                  context->l1_bytes_to_use(),
                                  context->l2_bytes_to_use(),
                                  context->l2_rhs_factor());

#ifdef GEMMLOWP_PROFILING_SIZES
  // Using a static map of label strings. Not reentrant at all!
  static std::unordered_map<std::uint64_t, std::string> labels_map;
  std::uint64_t sizes_hash = static_cast<std::uint64_t>(rows) ^
                             (static_cast<std::uint64_t>(depth) << 16) ^
                             (static_cast<std::uint64_t>(cols) << 32);
  if (!labels_map.count(sizes_hash)) {
    char label[256];
    snprintf(label, sizeof(label),
             "(rows = %d, depth = %d, cols = %d, l2_rows = %d, l2_depth = %d, "
             "l2_cols = %d, l1_rows = %d, l1_depth = %d, l1_cols = %d)",
             rows, depth, cols, block_params.l2_rows, block_params.l2_depth,
             block_params.l2_cols, block_params.l1_rows, block_params.l1_depth,
             block_params.l1_cols);
    labels_map[sizes_hash] = label;
  }
  ScopedProfilingLabel size_label(labels_map[sizes_hash].c_str());
#endif

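  // Set up packed storage for the LHS and RHS blocks and for the accumulators
  // (the packed result), then commit the allocations.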
  PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(Side::Lhs, allocator,
                                                         block_params);
  PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(Side::Rhs, allocator,
                                                         block_params);

  PackedResult packed_result(allocator, block_params);

  allocator->Commit();

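  // If a single l2 block spans all of the columns, the RHS only needs to be
  // packed once, ahead of the loop below.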
  const bool pack_rhs_once = block_params.l2_cols >= cols;

  if (pack_rhs_once) {
    PackRhs(&packed_rhs, rhs);
  }

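  // Outer loop over l2-sized row blocks, inner loop over l2-sized column
  // blocks: the LHS is packed once per row block, the RHS once per column
  // block (unless it was already packed in full above).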
  for (int r = 0; r < rows; r += block_params.l2_rows) {
    int rs = std::min(block_params.l2_rows, rows - r);

    PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth));

    for (int c = 0; c < cols; c += block_params.l2_cols) {
      int cs = std::min(block_params.l2_cols, cols - c);

      if (!pack_rhs_once) {
        PackRhs(&packed_rhs, rhs.block(0, c, depth, cs));
      }

      Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs,
              depth);

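      // Write the accumulated block back into the destination matrix,
      // applying the offsets and the output pipeline.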
      UnpackResult<KernelFormat>(
          result, MatrixBlockBounds(r, c, rs, cs), packed_result, depth,
          packed_lhs.sums_of_each_slice(), packed_rhs.sums_of_each_slice(),
          lhs_offset.block(r, rs), rhs_offset.block(c, cs), output_pipeline);
    }
  }

  allocator->Decommit();
}

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_