1 /**
2 * Copyright 2022 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "src/litert/thread_cost_model.h"
18 #include <map>
19 #include "src/common/log_util.h"
20 #include "src/litert/inner_context.h"
21 #include "thread/threadpool.h"
22 #include "nnacl/op_base.h"
23
24 namespace mindspore::lite {
25 const std::map<int32_t, float> kernel_compute_cost_map_ = {
26 {TC_TYPE(schema::PrimitiveType_Activation, schema::ActivationType_RELU), 1.806f}, // dataNum about 100k
27 {TC_TYPE(schema::PrimitiveType_Activation, schema::ActivationType_RELU6), 1.806f}, // dataNum about 100k
28 {TC_TYPE(schema::PrimitiveType_Activation, schema::ActivationType_LEAKY_RELU), 1.806f}, // dataNum about 100k
29 {TC_TYPE(schema::PrimitiveType_Activation, schema::ActivationType_TANH), 41.65625}, // dataNum about 5k
30 {TC_TYPE(schema::PrimitiveType_Activation, schema::ActivationType_SIGMOID), 59.65625f}, // dataNum about 3.5k
31 {TC_TYPE(schema::PrimitiveType_Activation, schema::ActivationType_GELU), 83.65625f}, // dataNum about 2.5k
32
33 {TC_TYPE(schema::PrimitiveType_Sqrt, 0), 1.806f}, // dataNum about 100k
34 {TC_TYPE(schema::PrimitiveType_Split, 0), 21.573f}, // dataNum about 8k
35 {TC_TYPE(schema::PrimitiveType_Stack, 0), 9.286}, // dataNum about 12k
36 {TC_TYPE(schema::PrimitiveType_Softmax, 0), 521.0}, // dataNum about 0.5k
37
38 {TC_TYPE(schema::PrimitiveType_MulFusion, schema::ActivationType_RELU), 2.288f}, // dataNum about 80k
39 {TC_TYPE(schema::PrimitiveType_MulFusion, schema::ActivationType_RELU6), 2.288f}, // dataNum about 80k
40 {TC_TYPE(schema::PrimitiveType_MulFusion, schema::ActivationType_NO_ACTIVATION), 1.806f}, // dataNum about 100k
41
42 {TC_TYPE(schema::PrimitiveType_AddFusion, schema::ActivationType_RELU), 2.288f}, // dataNum about 80k
43 {TC_TYPE(schema::PrimitiveType_AddFusion, schema::ActivationType_RELU6), 2.288f}, // dataNum about 80k
44 {TC_TYPE(schema::PrimitiveType_AddFusion, schema::ActivationType_NO_ACTIVATION), 1.806f}, // dataNum about 100k
45
46 {TC_TYPE(schema::PrimitiveType_SubFusion, schema::ActivationType_RELU), 2.288f}, // dataNum about 80k
47 {TC_TYPE(schema::PrimitiveType_SubFusion, schema::ActivationType_RELU6), 2.288f}, // dataNum about 80k
48 {TC_TYPE(schema::PrimitiveType_SubFusion, schema::ActivationType_NO_ACTIVATION), 1.806f}, // dataNum about 100k
49
50 {TC_TYPE(schema::PrimitiveType_DivFusion, schema::ActivationType_RELU), 2.288f}, // dataNum about 15k
51 {TC_TYPE(schema::PrimitiveType_DivFusion, schema::ActivationType_RELU6), 2.288f}, // dataNum about 15k
52 {TC_TYPE(schema::PrimitiveType_DivFusion, schema::ActivationType_NO_ACTIVATION), 1.806f}, // dataNum about 30k
53
54 {TC_TYPE(schema::PrimitiveType_RealDiv, schema::ActivationType_RELU), 13.65625f}, // dataNum about 15k
55 {TC_TYPE(schema::PrimitiveType_RealDiv, schema::ActivationType_RELU6), 13.65625f}, // dataNum about 15k
56 {TC_TYPE(schema::PrimitiveType_RealDiv, schema::ActivationType_NO_ACTIVATION), 6.65625f}, // dataNum about 30k
57
58 {TC_TYPE(schema::PrimitiveType_Minimum, schema::ActivationType_NO_ACTIVATION), 6.65625f}, // dataNum about 30k
59 {TC_TYPE(schema::PrimitiveType_Maximum, schema::ActivationType_NO_ACTIVATION), 6.65625f}, // dataNum about 30k
60
61 {TC_TYPE(schema::PrimitiveType_GreaterEqual, schema::ActivationType_NO_ACTIVATION), 4.90625f}, // dataNum about 40k
62 {TC_TYPE(schema::PrimitiveType_LessEqual, schema::ActivationType_NO_ACTIVATION), 4.90625f}, // dataNum about 40k
63
64 {TC_TYPE(schema::PrimitiveType_StridedSlice, 0), 38.027f}, // type 0 : parallel on outer tile, dataNum about 5.2k
65 {TC_TYPE(schema::PrimitiveType_StridedSlice, 1), 42.042f}, // type 1 : parallel on split axis, dataNum about 4.5k
66
67 {TC_TYPE(schema::PrimitiveType_BiasAdd, 0), 2.723f}, // dataNum about 65k
68 {TC_TYPE(schema::PrimitiveType_Gather, 0), 11.438f}, // dataNum about 16k
69
70 {TC_TYPE(schema::PrimitiveType_Fill, 0), 0.181f}, // dataNum about 260k(float/int IO : load 0, store 1)
71 {TC_TYPE(schema::PrimitiveType_Cast, 0), 0.181f}, // dataNum about 100k(float/int IO : load 1, store 1)
72
73 {TC_TYPE(schema::PrimitiveType_LayerNormFusion, 0), 507.812f}, // dataNum about 0.5k
74 {TC_TYPE(schema::PrimitiveType_OneHot, 0), 136.562f}, // dataNum about 1.5k
75 {TC_TYPE(schema::PrimitiveType_TileFusion, 0), 259.0625f}, // dataNum about 0.8k
76
77 {TC_TYPE(schema::PrimitiveType_ReduceFusion, schema::ReduceMode_ReduceAll), 66.5625f}, // dataNum about 3k
78 {TC_TYPE(schema::PrimitiveType_ReduceFusion, schema::ReduceMode_ReduceASum), 206.5625f}, // dataNum about 1k
79 {TC_TYPE(schema::PrimitiveType_ReduceFusion, schema::ReduceMode_ReduceL2), 259.0625f}, // dataNum about 0.8k
80 {TC_TYPE(schema::PrimitiveType_ReduceFusion, schema::ReduceMode_ReduceMax), 66.5625f}, // dataNum about 3k
81 {TC_TYPE(schema::PrimitiveType_ReduceFusion, schema::ReduceMode_ReduceMean), 259.0625f}, // dataNum about 0.8k
82 {TC_TYPE(schema::PrimitiveType_ReduceFusion, schema::ReduceMode_ReduceMin), 66.5625f}, // dataNum about 3k
83 {TC_TYPE(schema::PrimitiveType_ReduceFusion, schema::ReduceMode_ReduceProd), 206.5625f}, // dataNum about 1k
84 {TC_TYPE(schema::PrimitiveType_ReduceFusion, schema::ReduceMode_ReduceSum), 206.5625f}, // dataNum about 1k
85 {TC_TYPE(schema::PrimitiveType_ReduceFusion, schema::ReduceMode_ReduceSumSquare), 206.5625f}, // dataNum about 1k
86 {TC_TYPE(schema::PrimitiveType_ReduceFusion, (schema::ReduceMode_MAX + 1)), 259.0625f}, // dataNum about 0.8k
87 };
88
// Global cost-model parameters shared by all ThreadCostModel computations.
// Load/store costs are per data unit; the inline notes say the constants were
// calibrated against a Haswell-class L2 cache — TODO(review): confirm they still
// hold on the targets this runtime ships to.
float ThreadCostModel::per_unit_load_cost_ = 1.0 / 64 * 11;   // 64: L2 cache size, 11 : L2 cache latency on Haswell
float ThreadCostModel::per_unit_store_cost_ = 1.0 / 64 * 11;  // 64: L2 cache size, 11 : L2 cache latency on Haswell
int64_t ThreadCostModel::per_unit_compute_num_ = 1;           // 1 : per unit compute num

// Fixed overheads (same cost units as above): spawning a thread, and the minimum
// total work below which single-threaded / per-thread parallel execution is not worthwhile.
float ThreadCostModel::thread_startup_cost_ = 100000.0f;  // 100000 : thread startup inherent cost
float ThreadCostModel::single_thread_cost_ = 100000.0f;   // 100000 : Minimum cost of single-threaded
float ThreadCostModel::parallel_thread_cost_ = 40000.0f;  // 40000 : Minimum cost of per thread in parallel-thread
96
// Searches for the work partition (block size / block count) that maximizes thread
// utilization, and returns the resulting block count, which the caller uses as the
// parallel task count. The search mirrors Eigen's cost-model block sizing: starting
// from the cost-model block size, it tries progressively larger blocks (fewer blocks)
// and keeps the partition whose last scheduling round wastes the fewest thread slots.
int ThreadCostModel::GetOptimalThreadNum(const ThreadCostContext *thread_cost_context, const int thread_num) {
  // Cap the number of blocks at 4x the thread count so per-block overhead stays bounded.
  const int64_t max_oversharding_factor = 4;

  // Initial block size from the cost model, clamped to [max_oversharding_factor * thread_num,
  // total_unit_num_] — assumes MSVALID(lo, v, hi) clamps v into [lo, hi]; see nnacl/op_base.h.
  int64_t block_size = MSVALID(max_oversharding_factor * thread_num, ThreadBlockSize(thread_cost_context),
                               thread_cost_context->total_unit_num_);
  int64_t block_count = UP_DIV(thread_cost_context->total_unit_num_, block_size);
  // the maximum block size should be 2 times of the regular block size.
  int64_t max_block_size = MSMIN(thread_cost_context->total_unit_num_, 2 * block_size);
  // Efficiency = block_count / (ceil(block_count / thread_num) * thread_num): the fraction
  // of thread slots that actually receive a block; 1.0 means no idle threads in any round.
  double max_efficiency = static_cast<double>(block_count) / (UP_DIV(block_count, thread_num) * thread_num);
  for (int64_t prev_block_count = block_count; max_efficiency < 1.0 && prev_block_count > 1;) {
    // Try the next coarser partition: one block fewer than the previous attempt.
    int64_t cur_block_size = UP_DIV(thread_cost_context->total_unit_num_, prev_block_count - 1);
    if (cur_block_size > max_block_size) {
      break;  // blocks would grow beyond the 2x limit — stop coarsening
    }
    const int64_t cur_block_count = UP_DIV(thread_cost_context->total_unit_num_, cur_block_size);
    MS_ASSERT(cur_block_count < prev_block_count);
    prev_block_count = cur_block_count;
    const double cur_efficiency =
      static_cast<double>(cur_block_count) / (UP_DIV(cur_block_count, thread_num) * thread_num);
    // Accept the coarser partition unless it is noticeably (> 0.01) less efficient;
    // fewer blocks means less scheduling overhead at equal utilization.
    if (cur_efficiency + 0.01 >= max_efficiency) {  // update threshold : 0.01
      block_size = cur_block_size;
      block_count = cur_block_count;
      if (max_efficiency < cur_efficiency) {
        max_efficiency = cur_efficiency;
      }
    }
  }
  return block_count;
}
126
ThreadNumUpdateStrategy(const ThreadCostContext * thread_cost_context,int task_num)127 int ThreadNumUpdateStrategy(const ThreadCostContext *thread_cost_context, int task_num) {
128 if (task_num <= 1) {
129 return task_num;
130 }
131
132 if (thread_cost_context != nullptr) {
133 if (ThreadCostModel::ThreadNum(thread_cost_context) <= 1) {
134 return 1;
135 }
136 int opt_thread = static_cast<int>(ThreadCostModel::ParallelDegree(thread_cost_context));
137 task_num = MSVALID(1, opt_thread, task_num);
138 task_num = MSMIN(task_num, thread_cost_context->total_unit_num_);
139 }
140 return task_num;
141 }
142
UpdateThreadNum(int32_t kernel_type,int64_t per_unit_load_num,int64_t per_unit_store_num,int64_t unit_num,int thread_num)143 int UpdateThreadNum(int32_t kernel_type, int64_t per_unit_load_num, int64_t per_unit_store_num, int64_t unit_num,
144 int thread_num) {
145 if (kernel_compute_cost_map_.count(kernel_type) > 0) {
146 lite::ThreadCostContext thread_cost_context;
147 thread_cost_context.per_unit_compute_cost_ = kernel_compute_cost_map_.at(kernel_type);
148 thread_cost_context.per_unit_load_num_ = per_unit_load_num;
149 thread_cost_context.per_unit_store_num_ = per_unit_store_num;
150 thread_cost_context.total_unit_num_ = unit_num;
151 return ThreadNumUpdateStrategy(&thread_cost_context, thread_num);
152 }
153 return thread_num;
154 }
155 } // namespace mindspore::lite
156