/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// The 'middle-end' in ruy. See TrMul function comment.

#include "ruy/trmul.h"

#include <algorithm>
#include <atomic>
#include <cstdint>
#include <cstring>
#include <memory>
#include <vector>

#include "ruy/allocator.h"
#include "ruy/block_map.h"
#include "ruy/check_macros.h"
#include "ruy/cpu_cache_params.h"
#include "ruy/cpuinfo.h"
#include "ruy/ctx.h"
#include "ruy/mat.h"
#include "ruy/matrix.h"
#include "ruy/mul_params.h"
#include "ruy/opt_set.h"
#include "ruy/profiler/instrumentation.h"
#include "ruy/side_pair.h"
#include "ruy/size_util.h"
#include "ruy/thread_pool.h"
#include "ruy/trace.h"
#include "ruy/tune.h"

namespace ruy {

namespace {

// Enum to track the packing status of a block of the LHS or RHS matrix.
enum class PackingStatus : std::uint8_t {
  kNotStarted,  // No thread has started packing this block yet.
  kInProgress,  // Some thread is currently packing this block.
  kFinished     // This block has already been packed.
};

// TrMulTask is the task that a ruy thread runs to perform the TrMul operation.
class TrMulTask final : public Task {
 public:
  TrMulTask(TrMulParams* params, const BlockMap& block_map,
            std::atomic<int>* atomic_block_id, int thread_id, bool need_atomics,
            SidePair<std::atomic<PackingStatus>*> packing_status,
            TuningResolver* tuning_resolver, Allocator* local_allocator,
            CpuInfo* cpuinfo)
      : params_(params),
        block_map_(block_map),
        atomic_block_id_(atomic_block_id),
        thread_id_(thread_id),
        need_atomics_(need_atomics),
        packing_status_(packing_status),
        tuning_resolver_(tuning_resolver),
        local_allocator_(local_allocator),
        local_already_packed_{nullptr, nullptr},
        cpuinfo_(cpuinfo) {}

  // Thread main function. This is one thread's share of the TrMul work.
  void Run() override {
    RUY_TRACE_SCOPE_NAME("TrMulTask::Run");
    RUY_TRACE_SET_THEAD_ID(thread_id_);
    // Allocate and initialize `local_already_packed_`.
    for (Side side : {Side::kLhs, Side::kRhs}) {
      if (!params_->is_prepacked[side]) {
        const int size = NumBlocksPerSide(side, block_map_);
        local_allocator_->Allocate(size, &local_already_packed_[side]);
        memset(local_already_packed_[side], 0, size * sizeof(bool));
      }
    }

    const Tuning tuning = tuning_resolver_->Resolve(cpuinfo_);
    const int num_blocks = NumBlocks(block_map_);

    // Each thread starts by initially reserving the block whose id
    // is the thread id.
    int block_id = thread_id_;
    // Loop until all blocks have been computed.
    while (block_id < num_blocks) {
      RUY_TRACE_SCOPE_NAME("Main loop iteration");
      // Reserve the next block to handle, hiding the latency of this atomic op.
      const int next_block_id =
          atomic_block_id_->fetch_add(1, std::memory_order_relaxed);
      // Get coordinates of the current block to handle, in "block space".
      SidePair<int> block;
      GetBlockByIndex(block_map_, block_id, &block);
      // Get coordinates of the current block to handle, in matrix space.
      SidePair<int> start, end;
      GetBlockMatrixCoords(block_map_, block, &start, &end);
      RUY_TRACE_INFO(TRMUL_TASK_MAIN_LOOP_GOT_BLOCK_COORDS);
      // Maybe pack the current LHS/RHS block, if not already packed.
      EnsurePacked(block, start, end, tuning);
      // Actually do matrix multiplication work
      params_->RunKernel(tuning, start, end);
      // Move on to the next block as obtained by the atomic increment
      // at the start of this while loop iteration.
      block_id = next_block_id;
    }

    local_allocator_->FreeAll();
  }

 private:
  // Tries to pack a block, without blocking.
  // If the block was already packed, returns true.
  // If the block was not started packing, packs it and returns true.
  // If the block was being packed by another thread, returns false.
  bool TryPack(Side side, int block, int start, int end, Tuning tuning) {
    if (params_->is_prepacked[side]) {
      return true;
    }
    if (!local_already_packed_[side][block]) {
      if (need_atomics_) {
        // Explanation of this compare_exchange_strong operation:
        // This atomically performs all of the following:
        // 1. Read `status` with "acquire" memory order.
        //    * That this read uses "acquire" is because both memory orders
        //      specified have "acquire" as their read-component.
        // 2. Compare (bitwise) with `exchanged_status`.
        // 3. If equal, stores the value kInProgress to `status` with "release"
        //    memory order, and returns true, so we take this 'if' branch.
        //    * That this store uses "release" is because of the _rel part in
        //      memory_order_acq_rel passed as the first memory order argument.
        // 4. If not equal, stores the loaded value of `status` to
        //    `exchanged_status` with "relaxed" semantics, and returns false,
        //    so we take the 'else' branch.
        //    * That this store uses "relaxed" is because the second memory
        //      order argument, memory_order_acquire, implies no particular
        //      store semantics. "relaxed" is acceptable here because this
        //      stores to a local stack variable.
        //
        // Rationale for compare_exchange_strong as opposed to
        // compare_exchange_weak:
        // The spurious-failure case with compare_exchange_weak will actually
        // happen a lot here, because the atomic 'status' bytes are stored
        // contiguously in arrays and neighboring values will be accessed
        // by multiple threads concurrently. On a typical ARM CPU, an exclusives
        // reservation granule is 64 bytes, so a lot of false-sharing may
        // happen. Using compare_exchange_weak would thus result in often having
        // TryPack return 'false' when it could instead have done the packing
        // work and returned 'true'. Heuristically, that is not a good thing.
        // Moreover, this changes the TryPack contract, loosening it and making
        // it harder for the caller to reason about. Finally, the overhead of
        // atomic operations is mitigated by the enclosing check on
        // local_already_packed, so maybe the overhead of
        // compare_exchange_strong isn't such a problem. But we don't really
        // know for sure; that would be interesting to experiment more with.
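        //
        // Illustrative sketch: ignoring memory-ordering details, the
        // compare_exchange_strong call below behaves as if the following ran
        // as a single atomic step:
        //
        //   if (status == exchanged_status) {       // expected: kNotStarted
        //     status = PackingStatus::kInProgress;  // claim the block
        //     // compare_exchange_strong returns true -> we pack it ourselves.
        //   } else {
        //     exchanged_status = status;            // observe the actual value
        //     // compare_exchange_strong returns false -> check what it was.
        //   }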
        PackingStatus exchanged_status = PackingStatus::kNotStarted;
        std::atomic<PackingStatus>& status = packing_status_[side][block];
        if (status.compare_exchange_strong(
                exchanged_status, PackingStatus::kInProgress,
                std::memory_order_acq_rel, std::memory_order_acquire)) {
          // In this branch, the status was kNotStarted and we just atomically
          // changed it to kInProgress as we are about to handle the packing
          // ourselves.
          RUY_TRACE_INFO(TRYPACK_PACKING);
          params_->RunPack(side, tuning, start, end);
          status.store(PackingStatus::kFinished, std::memory_order_release);
        } else if (exchanged_status == PackingStatus::kInProgress) {
          // Another thread is currently packing this block.
          RUY_TRACE_INFO(TRYPACK_ANOTHER_THREAD_PACKING);
          return false;
        } else {
          RUY_TRACE_INFO(TRYPACK_PACKED_BY_ANOTHER_THREAD);
        }
        RUY_DCHECK(status.load(std::memory_order_acquire) ==
                   PackingStatus::kFinished);
      } else {
        // Single-threaded case: no need for expensive atomics,
        // local_already_packed is the truth already.
        params_->RunPack(side, tuning, start, end);
      }
      local_already_packed_[side][block] = true;
    } else {
      RUY_TRACE_INFO(TRYPACK_PREVIOUSLY_PACKED);
    }
    return true;
  }

  // Ensures that both the LHS and RHS blocks required by the specified block
  // are packed. In the event that they are already being packed on another
  // thread, this function may perform the packing of some other block while
  // waiting for that other thread to finish packing the requested block.
  void EnsurePacked(const SidePair<int>& block, const SidePair<int>& start,
                    const SidePair<int>& end, Tuning tuning) {
#if RUY_OPT(PACK_AHEAD)
    SidePair<int> next_runahead_block{block[Side::kLhs] + 1,
                                      block[Side::kRhs] + 1};
    Side next_runahead_side = Side::kLhs;
#endif
    while (true) {
      bool both_sides_packed = true;
      for (Side side : {Side::kLhs, Side::kRhs}) {
        both_sides_packed &=
            TryPack(side, block[side], start[side], end[side], tuning);
      }
      if (both_sides_packed) {
        break;
      }
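      // At this point, at least one of the two sides is being packed by
      // another thread. Rather than just spinning, when RUY_OPT(PACK_AHEAD)
      // is enabled we try to make useful progress by packing a block that has
      // not been requested yet, alternating between the LHS and RHS sides.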
#if RUY_OPT(PACK_AHEAD)
      RUY_TRACE_INFO(ENSURE_PACKED_ENTER_RUN_AHEAD);
      const Side runahead_side = next_runahead_side;
      const int runahead_block = next_runahead_block[runahead_side];
      next_runahead_side = OtherSide(next_runahead_side);
      if (runahead_block >= NumBlocksPerSide(runahead_side, block_map_)) {
        continue;
      }
      int runahead_block_start, runahead_block_end;
      GetBlockMatrixCoords(runahead_side, block_map_, runahead_block,
                           &runahead_block_start, &runahead_block_end);
      TryPack(runahead_side, runahead_block, runahead_block_start,
              runahead_block_end, tuning);
      next_runahead_block[runahead_side] = runahead_block + 1;
#endif
    }
    RUY_TRACE_INFO(ENSURE_PACKED_END);
  }

  TrMulParams* params_;
  const BlockMap& block_map_;
  std::atomic<int>* atomic_block_id_;
  int thread_id_;
  bool need_atomics_;
  SidePair<std::atomic<PackingStatus>*> packing_status_;
  TuningResolver* tuning_resolver_;
  Allocator* local_allocator_;

  // Local indicators of packedness to avoid the overhead of atomic ops.
  SidePair<bool*> local_already_packed_;

  CpuInfo* cpuinfo_;
};

int GetTentativeThreadCount(Ctx* ctx, int rows, int cols, int depth) {
#if RUY_PLATFORM_EMSCRIPTEN
  // b/139927184: the std::thread constructor raises an exception.
  return 1;
#endif
  RUY_TRACE_SCOPE;
  // Empirically determined rule for a reasonable number of threads to use.
  // This is proportional to the number of arithmetic ops in this Mul
  // (the product of the 3 sizes).
  static constexpr int kDivisorLog2 = 15;
  const int guess_log2 = std::max(
      0, ceil_log2(rows) + ceil_log2(cols) + ceil_log2(depth) - kDivisorLog2);
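  // Worked example (illustrative only): for rows = cols = depth = 64,
  // ceil_log2 of each dimension is 6, so guess_log2 = max(0, 18 - 15) = 3,
  // giving a tentative thread count of 1 << 3 = 8 before clamping to
  // ctx->max_num_threads() below.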
  int tentative_thread_count =
      std::min(1 << guess_log2, ctx->max_num_threads());
  RUY_TRACE_INFO(GET_TENTATIVE_THREAD_COUNT);
  return tentative_thread_count;
}

bool GetUseSimpleLoop(int tentative_thread_count, int rows, int cols, int depth,
                      int lhs_scalar_size, int rhs_scalar_size,
                      const CpuCacheParams& cpu_cache_params) {
  RUY_TRACE_SCOPE;
  if (tentative_thread_count == 1) {
    if (IsObviouslyLinearTraversal(rows, cols, depth, lhs_scalar_size,
                                   rhs_scalar_size, cpu_cache_params)) {
      RUY_TRACE_INFO(GET_USE_SIMPLE_LOOP_RETURNS_TRUE);
      return true;
    }
  }
  RUY_TRACE_INFO(GET_USE_SIMPLE_LOOP_RETURNS_FALSE);
  return false;
}

}  // namespace

// TrMul is the ruy middle-end. It contains the high-level logic to perform
// a ruy::Mul's work, down to calls to back-end Kernel and Pack functions.
// This includes determining how many threads to use, computing the BlockMap,
// and executing tasks on a thread pool. The TrMul function itself runs on the
// main thread; the code that potentially runs on worker threads is in
// TrMulTask::Run().
void TrMul(Ctx* ctx, TrMulParams* params) {
  RUY_TRACE_SCOPE;
  profiler::ScopeLabel label(
      "TrMul (Path=0x%x, max_num_threads=%d, is_prepacked=(%d,%d))",
      static_cast<int>(params->path), ctx->max_num_threads(),
      params->is_prepacked[Side::kLhs], params->is_prepacked[Side::kRhs]);

  PEMat& packed_lhs = params->packed_matrix[Side::kLhs];
  PEMat& packed_rhs = params->packed_matrix[Side::kRhs];
  EMat& lhs = params->src[Side::kLhs];
  EMat& rhs = params->src[Side::kRhs];

  const int rows = lhs.layout.cols;
  const int cols = rhs.layout.cols;
  const int depth = lhs.layout.rows;
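  // Note: TrMul works on a transposed LHS, which is why the destination's
  // row count is read from lhs.layout.cols and the accumulation depth from
  // lhs.layout.rows.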

  const int tentative_thread_count =
      GetTentativeThreadCount(ctx, rows, cols, depth);
  const auto& cpu_cache_params = ctx->mutable_cpuinfo()->CacheParams();

  // Case of running this TrMul as a simple loop.
  // This is a good place to start reading this function: all the rest
  // of this function is just an optimized, but functionally equivalent,
  // version of that.
  if (GetUseSimpleLoop(tentative_thread_count, rows, cols, depth,
                       lhs.data_type.size, rhs.data_type.size,
                       cpu_cache_params)) {
    profiler::ScopeLabel label_simple("TrMulImpl, simple loop");
    Tuning tuning = ctx->GetMainThreadTuning();
    RUY_TRACE_INFO(TRMUL_SIMPLE_LOOP);

    const SidePair<int> origin{0, 0};
    const SidePair<int> rounded_dims{packed_lhs.layout.cols,
                                     packed_rhs.layout.cols};
    for (Side side : {Side::kLhs, Side::kRhs}) {
      if (!params->is_prepacked[side]) {
        params->RunPack(side, tuning, origin[side], rounded_dims[side]);
      }
    }
    params->RunKernel(tuning, origin, rounded_dims);
    return;
  }

  profiler::ScopeLabel label_general("TrMulImpl, general case");
  RUY_TRACE_INFO(TRMUL_GENERAL_CASE);
  Allocator* main_allocator = ctx->GetMainAllocator();

  // Initialize block map.
  BlockMap block_map;
  MakeBlockMap(packed_lhs.layout.cols, packed_rhs.layout.cols, depth,
               packed_lhs.layout.kernel.cols, packed_rhs.layout.kernel.cols,
               packed_lhs.data_type.size, packed_rhs.data_type.size,
               tentative_thread_count, cpu_cache_params, &block_map);

  // Initialize per-thread state.
  const int thread_count = block_map.thread_count;
  const bool need_atomics = thread_count > 1;
  ctx->EnsureThreadSpecificResources(thread_count);
  for (int i = 0; i < thread_count; i++) {
    ctx->GetThreadSpecificTuningResolver(i)->SetTuning(ctx->explicit_tuning());
  }

  // In the need_atomics case, allocate and initialize atomic values tracking
  // the packing status of blocks.
  SidePair<std::atomic<PackingStatus>*> packing_status{nullptr, nullptr};
  if (need_atomics) {
    for (Side side : {Side::kLhs, Side::kRhs}) {
      if (!params->is_prepacked[side]) {
        const int size = NumBlocksPerSide(side, block_map);
        main_allocator->Allocate(size, &packing_status[side]);
        for (int i = 0; i < size; i++) {
          packing_status[side][i].store(PackingStatus::kNotStarted,
                                        std::memory_order_relaxed);
        }
      }
    }
  }

  // Create the atomic block id. Allocate it using the Allocator so that
  // we get the alignment ensuring that it sits alone in its exclusives
  // reservation granule.
  std::atomic<int>* atomic_block_id;
  main_allocator->Allocate(1, &atomic_block_id);

  // Create task objects.
  TrMulTask* tasks;
  main_allocator->Allocate(thread_count, &tasks);

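  // Each thread's task starts by handling the block whose id equals its
  // thread id, so the shared counter of the next block to hand out starts
  // at thread_count.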
  atomic_block_id->store(thread_count);

  for (int i = 0; i < thread_count; i++) {
    auto* allocator = ctx->GetThreadSpecificAllocator(i);
    auto* tuning_resolver = ctx->GetThreadSpecificTuningResolver(i);
    new (tasks + i) TrMulTask(params, block_map, atomic_block_id, i,
                              need_atomics, packing_status, tuning_resolver,
                              allocator, ctx->mutable_cpuinfo());
  }

  // Do the computation.
  ctx->mutable_thread_pool()->Execute(thread_count, tasks);

  // Finish up.
  for (int i = 0; i < thread_count; i++) {
    tasks[i].~TrMulTask();
  }
}

}  // namespace ruy