/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// The 'middle-end' in ruy. See TrMul function comment.

#include "ruy/trmul.h"

#include <algorithm>
#include <atomic>
#include <cstdint>
#include <cstring>
#include <memory>
#include <vector>

#include "ruy/allocator.h"
#include "ruy/block_map.h"
#include "ruy/check_macros.h"
#include "ruy/cpu_cache_params.h"
#include "ruy/cpuinfo.h"
#include "ruy/ctx.h"
#include "ruy/mat.h"
#include "ruy/matrix.h"
#include "ruy/mul_params.h"
#include "ruy/opt_set.h"
#include "ruy/profiler/instrumentation.h"
#include "ruy/side_pair.h"
#include "ruy/size_util.h"
#include "ruy/thread_pool.h"
#include "ruy/trace.h"
#include "ruy/tune.h"

namespace ruy {

namespace {

// Enum to track the packing status of a block of the LHS or RHS matrix.
enum class PackingStatus : std::uint8_t {
  kNotStarted,  // No thread has started packing this block yet.
  kInProgress,  // Some thread is currently packing this block.
  kFinished     // This block has already been packed.
};

// TrMulTask is the task that a ruy thread runs to perform the TrMul operation.
class TrMulTask final : public Task {
 public:
  TrMulTask(TrMulParams* params, const BlockMap& block_map,
            std::atomic<int>* atomic_block_id, int thread_id,
            bool need_atomics,
            SidePair<std::atomic<PackingStatus>*> packing_status,
            TuningResolver* tuning_resolver, Allocator* local_allocator,
            CpuInfo* cpuinfo)
      : params_(params),
        block_map_(block_map),
        atomic_block_id_(atomic_block_id),
        thread_id_(thread_id),
        need_atomics_(need_atomics),
        packing_status_(packing_status),
        tuning_resolver_(tuning_resolver),
        local_allocator_(local_allocator),
        local_already_packed_{nullptr, nullptr},
        cpuinfo_(cpuinfo) {}

  // Thread main function. This is one thread's share of the TrMul work.
  void Run() override {
    RUY_TRACE_SCOPE_NAME("TrMulTask::Run");
    RUY_TRACE_SET_THEAD_ID(thread_id_);
    // Allocate and initialize `local_already_packed_`.
    for (Side side : {Side::kLhs, Side::kRhs}) {
      if (!params_->is_prepacked[side]) {
        const int size = NumBlocksPerSide(side, block_map_);
        local_allocator_->Allocate(size, &local_already_packed_[side]);
        memset(local_already_packed_[side], 0, size * sizeof(bool));
      }
    }

    const Tuning tuning = tuning_resolver_->Resolve(cpuinfo_);
    const int num_blocks = NumBlocks(block_map_);

    // Each thread starts by initially reserving the block whose id
    // is the thread id.
    int block_id = thread_id_;
    // Loop until all blocks have been computed.
    while (block_id < num_blocks) {
      RUY_TRACE_SCOPE_NAME("Main loop iteration");
      // Reserve the next block to handle, hiding the latency of this atomic
      // op.
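      // memory_order_relaxed is enough for this counter: it only hands out
      // block ids to threads. Visibility of the packed data itself is ensured
      // separately by the acquire/release operations on packing_status in
      // TryPack below.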
      const int next_block_id =
          atomic_block_id_->fetch_add(1, std::memory_order_relaxed);
      // Get coordinates of the current block to handle, in "block space".
      SidePair<int> block;
      GetBlockByIndex(block_map_, block_id, &block);
      // Get coordinates of the current block to handle, in matrix space.
      SidePair<int> start, end;
      GetBlockMatrixCoords(block_map_, block, &start, &end);
      RUY_TRACE_INFO(TRMUL_TASK_MAIN_LOOP_GOT_BLOCK_COORDS);
      // Maybe pack the current LHS/RHS block, if not already packed.
      EnsurePacked(block, start, end, tuning);
      // Actually do matrix multiplication work
      params_->RunKernel(tuning, start, end);
      // Move on to the next block as obtained by the atomic increment
      // at the start of this while loop iteration.
      block_id = next_block_id;
    }

    local_allocator_->FreeAll();
  }

 private:
  // Tries to pack a block, without blocking.
  // If the block was already packed, returns true.
  // If the block was not started packing, packs it and returns true.
  // If the block was being packed by another thread, returns false.
  bool TryPack(Side side, int block, int start, int end, Tuning tuning) {
    if (params_->is_prepacked[side]) {
      return true;
    }
    if (!local_already_packed_[side][block]) {
      if (need_atomics_) {
        // Explanation of this compare_exchange_strong operation:
        // This atomically performs all of the following:
        // 1. Read `status` with "acquire" memory order.
        //    * That this read uses "acquire" is because both memory orders
        //      specified have "acquire" as their read-component.
        // 2. Compare (bitwise) with `exchanged_status`.
        // 3. If equal, stores the value kInProgress to `status` with "release"
        //    memory order, and returns true, so we take this 'if' branch.
        //    * That this store uses "release" is because of the _rel part in
        //      memory_order_acq_rel passed as the first memory order argument.
        // 4. If not equal, stores the loaded value of `status` to
        //    `exchanged_status` with "relaxed" semantics, and returns false,
        //    so we take the 'else' branch.
        //    * That this store uses "relaxed" is because the second memory
        //      order argument, memory_order_acquire, implies no particular
        //      store semantics. "relaxed" is acceptable here because this
        //      stores to a local stack variable.
        //
        // Rationale for compare_exchange_strong as opposed to
        // compare_exchange_weak:
        // The spurious-failure case with compare_exchange_weak will actually
        // happen a lot here, because the atomic 'status' bytes are stored
        // contiguously in arrays and neighboring values will be accessed
        // by multiple threads concurrently. On a typical ARM CPU, an
        // exclusives reservation granule is 64 bytes, so a lot of
        // false-sharing may happen. Using compare_exchange_weak would thus
        // result in often having TryPack return 'false' when it could
        // instead have done the packing work and returned 'true'.
        // Heuristically, that is not a good thing. Moreover, this changes
        // the TryPack contract, loosening it and making it harder for the
        // caller to reason about. Finally, the overhead of atomic operations
        // is mitigated by the enclosing check on local_already_packed, so
        // maybe the overhead of compare_exchange_strong isn't such a
        // problem. But we don't really know for sure, that would be
        // interesting to experiment more with.
        PackingStatus exchanged_status = PackingStatus::kNotStarted;
        std::atomic<PackingStatus>& status = packing_status_[side][block];
        if (status.compare_exchange_strong(
                exchanged_status, PackingStatus::kInProgress,
                std::memory_order_acq_rel, std::memory_order_acquire)) {
          // In this branch, the status was kNotStarted and we just atomically
          // changed it to kInProgress as we are about to handle the packing
          // ourselves.
          RUY_TRACE_INFO(TRYPACK_PACKING);
          params_->RunPack(side, tuning, start, end);
          status.store(PackingStatus::kFinished, std::memory_order_release);
        } else if (exchanged_status == PackingStatus::kInProgress) {
          // Another thread is currently packing this block.
          RUY_TRACE_INFO(TRYPACK_ANOTHER_THREAD_PACKING);
          return false;
        } else {
          RUY_TRACE_INFO(TRYPACK_PACKED_BY_ANOTHER_THREAD);
        }
        RUY_DCHECK(status.load(std::memory_order_acquire) ==
                   PackingStatus::kFinished);
      } else {
        // Single-threaded case: no need for expensive atomics,
        // local_already_packed is the truth already.
        params_->RunPack(side, tuning, start, end);
      }
      local_already_packed_[side][block] = true;
    } else {
      RUY_TRACE_INFO(TRYPACK_PREVIOUSLY_PACKED);
    }
    return true;
  }

  // Ensures that both the LHS and RHS blocks required by the specified block
  // are packed. In the event that they are already being packed by another
  // thread, this function may perform the packing of some other block while
  // waiting for that other thread to finish packing the requested block.
  void EnsurePacked(const SidePair<int>& block, const SidePair<int>& start,
                    const SidePair<int>& end, Tuning tuning) {
#if RUY_OPT(PACK_AHEAD)
    SidePair<int> next_runahead_block{block[Side::kLhs] + 1,
                                      block[Side::kRhs] + 1};
    Side next_runahead_side = Side::kLhs;
#endif
    while (true) {
      bool both_sides_packed = true;
      for (Side side : {Side::kLhs, Side::kRhs}) {
        both_sides_packed &=
            TryPack(side, block[side], start[side], end[side], tuning);
      }
      if (both_sides_packed) {
        break;
      }
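      // At this point, at least one side is still being packed by another
      // thread. Rather than busy-wait for it, opportunistically pack a
      // subsequent block (alternating between the LHS and RHS sides), so the
      // waiting time is put to use.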
#if RUY_OPT(PACK_AHEAD)
      RUY_TRACE_INFO(ENSURE_PACKED_ENTER_RUN_AHEAD);
      const Side runahead_side = next_runahead_side;
      const int runahead_block = next_runahead_block[runahead_side];
      next_runahead_side = OtherSide(next_runahead_side);
      if (runahead_block >= NumBlocksPerSide(runahead_side, block_map_)) {
        continue;
      }
      int runahead_block_start, runahead_block_end;
      GetBlockMatrixCoords(runahead_side, block_map_, runahead_block,
                           &runahead_block_start, &runahead_block_end);
      TryPack(runahead_side, runahead_block, runahead_block_start,
              runahead_block_end, tuning);
      next_runahead_block[runahead_side] = runahead_block + 1;
#endif
    }
    RUY_TRACE_INFO(ENSURE_PACKED_END);
  }

  TrMulParams* params_;
  const BlockMap& block_map_;
  std::atomic<int>* atomic_block_id_;
  int thread_id_;
  bool need_atomics_;
  SidePair<std::atomic<PackingStatus>*> packing_status_;
  TuningResolver* tuning_resolver_;
  Allocator* local_allocator_;

  // Local indicators of packedness to avoid the overhead of atomic ops.
  SidePair<bool*> local_already_packed_;

  CpuInfo* cpuinfo_;
};

int GetTentativeThreadCount(Ctx* ctx, int rows, int cols, int depth) {
#if RUY_PLATFORM_EMSCRIPTEN
  // b/139927184, std::thread constructor raises exception
  return 1;
#endif
  RUY_TRACE_SCOPE;
  // Empirically determined rule for reasonable number of
  // threads to use. This is proportional to the number of arithmetic ops
  // in this Mul (product of the 3 sizes).
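  // For instance (illustrative numbers): rows = cols = depth = 256 gives
  // ceil_log2 contributions of 8 + 8 + 8 = 24, so guess_log2 below is
  // 24 - kDivisorLog2 = 9, i.e. a tentative 1 << 9 = 512 threads, which is
  // then capped by ctx->max_num_threads().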
  static constexpr int kDivisorLog2 = 15;
  const int guess_log2 = std::max(
      0, ceil_log2(rows) + ceil_log2(cols) + ceil_log2(depth) - kDivisorLog2);
  int tentative_thread_count =
      std::min(1 << guess_log2, ctx->max_num_threads());
  RUY_TRACE_INFO(GET_TENTATIVE_THREAD_COUNT);
  return tentative_thread_count;
}

bool GetUseSimpleLoop(int tentative_thread_count, int rows, int cols, int depth,
                      int lhs_scalar_size, int rhs_scalar_size,
                      const CpuCacheParams& cpu_cache_params) {
  RUY_TRACE_SCOPE;
  if (tentative_thread_count == 1) {
    if (IsObviouslyLinearTraversal(rows, cols, depth, lhs_scalar_size,
                                   rhs_scalar_size, cpu_cache_params)) {
      RUY_TRACE_INFO(GET_USE_SIMPLE_LOOP_RETURNS_TRUE);
      return true;
    }
  }
  RUY_TRACE_INFO(GET_USE_SIMPLE_LOOP_RETURNS_FALSE);
  return false;
}

}  // namespace

// TrMul is the ruy middle-end. It contains the high-level logic to perform
// a ruy::Mul's work, down to calls to back-end Kernel and Pack functions.
// This includes determining how many threads to use, computing the BlockMap,
// and executing tasks on a thread pool. The TrMul function itself runs on the
// main thread; the code that potentially runs on worker threads is in
// TrMulTask::Run().
void TrMul(Ctx* ctx, TrMulParams* params) {
  RUY_TRACE_SCOPE;
  profiler::ScopeLabel label(
      "TrMul (Path=0x%x, max_num_threads=%d, is_prepacked=(%d,%d))",
      static_cast<int>(params->path), ctx->max_num_threads(),
      params->is_prepacked[Side::kLhs], params->is_prepacked[Side::kRhs]);

  PEMat& packed_lhs = params->packed_matrix[Side::kLhs];
  PEMat& packed_rhs = params->packed_matrix[Side::kRhs];
  EMat& lhs = params->src[Side::kLhs];
  EMat& rhs = params->src[Side::kRhs];

  const int rows = lhs.layout.cols;
  const int cols = rhs.layout.cols;
  const int depth = lhs.layout.rows;
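  // The LHS here is the already-transposed LHS of the original Mul (hence
  // 'TrMul'): its columns correspond to destination rows and its rows to the
  // accumulation depth, which is why `rows` and `depth` are read from
  // lhs.layout.cols and lhs.layout.rows respectively.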

  const int tentative_thread_count =
      GetTentativeThreadCount(ctx, rows, cols, depth);
  const auto& cpu_cache_params = ctx->mutable_cpuinfo()->CacheParams();

  // Case of running this TrMul as a simple loop.
  // This is a good place to start reading this function: all the rest
  // of this function is just an optimized, but functionally equivalent,
  // version of that.
  if (GetUseSimpleLoop(tentative_thread_count, rows, cols, depth,
                       lhs.data_type.size, rhs.data_type.size,
                       cpu_cache_params)) {
    profiler::ScopeLabel label_simple("TrMulImpl, simple loop");
    Tuning tuning = ctx->GetMainThreadTuning();
    RUY_TRACE_INFO(TRMUL_SIMPLE_LOOP);

    const SidePair<int> origin{0, 0};
    const SidePair<int> rounded_dims{packed_lhs.layout.cols,
                                     packed_rhs.layout.cols};
    for (Side side : {Side::kLhs, Side::kRhs}) {
      if (!params->is_prepacked[side]) {
        params->RunPack(side, tuning, origin[side], rounded_dims[side]);
      }
    }
    params->RunKernel(tuning, origin, rounded_dims);
    return;
  }

  profiler::ScopeLabel label_general("TrMulImpl, general case");
  RUY_TRACE_INFO(TRMUL_GENERAL_CASE);
  Allocator* main_allocator = ctx->GetMainAllocator();

  // Initialize block map.
  BlockMap block_map;
  MakeBlockMap(packed_lhs.layout.cols, packed_rhs.layout.cols, depth,
               packed_lhs.layout.kernel.cols, packed_rhs.layout.kernel.cols,
               packed_lhs.data_type.size, packed_rhs.data_type.size,
               tentative_thread_count, cpu_cache_params, &block_map);

  // Initialize per-thread state.
  const int thread_count = block_map.thread_count;
  const bool need_atomics = thread_count > 1;
  ctx->EnsureThreadSpecificResources(thread_count);
  for (int i = 0; i < thread_count; i++) {
    ctx->GetThreadSpecificTuningResolver(i)->SetTuning(ctx->explicit_tuning());
  }

  // In the need_atomics case, allocate and initialize atomic values tracking
  // the packing status of blocks.
  SidePair<std::atomic<PackingStatus>*> packing_status{nullptr, nullptr};
  if (need_atomics) {
    for (Side side : {Side::kLhs, Side::kRhs}) {
      if (!params->is_prepacked[side]) {
        const int size = NumBlocksPerSide(side, block_map);
        main_allocator->Allocate(size, &packing_status[side]);
        for (int i = 0; i < size; i++) {
          packing_status[side][i].store(PackingStatus::kNotStarted,
                                        std::memory_order_relaxed);
        }
      }
    }
  }

  // Create the atomic block id. Allocate it using the Allocator so that
  // we get the alignment ensuring that it sits alone in its exclusives
  // reservation granule.
  std::atomic<int>* atomic_block_id;
  main_allocator->Allocate(1, &atomic_block_id);

  // Create task objects.
  TrMulTask* tasks;
  main_allocator->Allocate(thread_count, &tasks);

  atomic_block_id->store(thread_count);
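  // The counter starts at thread_count because each task implicitly reserves
  // the block whose id equals its thread id (see TrMulTask::Run), so the
  // first fetch_add must hand out block id thread_count.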

  for (int i = 0; i < thread_count; i++) {
    auto* allocator = ctx->GetThreadSpecificAllocator(i);
    auto* tuning_resolver = ctx->GetThreadSpecificTuningResolver(i);
    new (tasks + i) TrMulTask(params, block_map, atomic_block_id, i,
                              need_atomics, packing_status, tuning_resolver,
                              allocator, ctx->mutable_cpuinfo());
  }

  // Do the computation.
  ctx->mutable_thread_pool()->Execute(thread_count, tasks);

  // Finish up.
  for (int i = 0; i < thread_count; i++) {
    tasks[i].~TrMulTask();
  }
}

}  // namespace ruy
