/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/litert/thread_cost_model.h"
#include <map>
#include "src/common/log_util.h"
#include "src/litert/inner_context.h"
#include "thread/threadpool.h"
#include "nnacl/op_base.h"

namespace mindspore::lite {
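// Empirical per-unit compute cost for each kernel, keyed by
// TC_TYPE(primitive type, sub-type/activation type). The trailing
// "dataNum about Nk" comments appear to record the approximate element
// count at which splitting the kernel across threads starts to pay off;
// note that cost * dataNum is roughly constant across entries.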
const std::map<int32_t, float> kernel_compute_cost_map_ = {
  {TC_TYPE(schema::PrimitiveType_Activation, schema::ActivationType_RELU), 1.806f},        // dataNum about 100k
  {TC_TYPE(schema::PrimitiveType_Activation, schema::ActivationType_RELU6), 1.806f},       // dataNum about 100k
  {TC_TYPE(schema::PrimitiveType_Activation, schema::ActivationType_LEAKY_RELU), 1.806f},  // dataNum about 100k
  {TC_TYPE(schema::PrimitiveType_Activation, schema::ActivationType_TANH), 41.65625f},     // dataNum about 5k
  {TC_TYPE(schema::PrimitiveType_Activation, schema::ActivationType_SIGMOID), 59.65625f},  // dataNum about 3.5k
  {TC_TYPE(schema::PrimitiveType_Activation, schema::ActivationType_GELU), 83.65625f},     // dataNum about 2.5k

  {TC_TYPE(schema::PrimitiveType_Sqrt, 0), 1.806f},    // dataNum about 100k
  {TC_TYPE(schema::PrimitiveType_Split, 0), 21.573f},  // dataNum about 8k
  {TC_TYPE(schema::PrimitiveType_Stack, 0), 9.286f},   // dataNum about 12k
  {TC_TYPE(schema::PrimitiveType_Softmax, 0), 521.0f}, // dataNum about 0.5k

  {TC_TYPE(schema::PrimitiveType_MulFusion, schema::ActivationType_RELU), 2.288f},           // dataNum about 80k
  {TC_TYPE(schema::PrimitiveType_MulFusion, schema::ActivationType_RELU6), 2.288f},          // dataNum about 80k
  {TC_TYPE(schema::PrimitiveType_MulFusion, schema::ActivationType_NO_ACTIVATION), 1.806f},  // dataNum about 100k

  {TC_TYPE(schema::PrimitiveType_AddFusion, schema::ActivationType_RELU), 2.288f},           // dataNum about 80k
  {TC_TYPE(schema::PrimitiveType_AddFusion, schema::ActivationType_RELU6), 2.288f},          // dataNum about 80k
  {TC_TYPE(schema::PrimitiveType_AddFusion, schema::ActivationType_NO_ACTIVATION), 1.806f},  // dataNum about 100k

  {TC_TYPE(schema::PrimitiveType_SubFusion, schema::ActivationType_RELU), 2.288f},           // dataNum about 80k
  {TC_TYPE(schema::PrimitiveType_SubFusion, schema::ActivationType_RELU6), 2.288f},          // dataNum about 80k
  {TC_TYPE(schema::PrimitiveType_SubFusion, schema::ActivationType_NO_ACTIVATION), 1.806f},  // dataNum about 100k

  {TC_TYPE(schema::PrimitiveType_DivFusion, schema::ActivationType_RELU), 2.288f},           // dataNum about 15k
  {TC_TYPE(schema::PrimitiveType_DivFusion, schema::ActivationType_RELU6), 2.288f},          // dataNum about 15k
  {TC_TYPE(schema::PrimitiveType_DivFusion, schema::ActivationType_NO_ACTIVATION), 1.806f},  // dataNum about 30k

  {TC_TYPE(schema::PrimitiveType_RealDiv, schema::ActivationType_RELU), 13.65625f},          // dataNum about 15k
  {TC_TYPE(schema::PrimitiveType_RealDiv, schema::ActivationType_RELU6), 13.65625f},         // dataNum about 15k
  {TC_TYPE(schema::PrimitiveType_RealDiv, schema::ActivationType_NO_ACTIVATION), 6.65625f},  // dataNum about 30k

  {TC_TYPE(schema::PrimitiveType_Minimum, schema::ActivationType_NO_ACTIVATION), 6.65625f},  // dataNum about 30k
  {TC_TYPE(schema::PrimitiveType_Maximum, schema::ActivationType_NO_ACTIVATION), 6.65625f},  // dataNum about 30k

  {TC_TYPE(schema::PrimitiveType_GreaterEqual, schema::ActivationType_NO_ACTIVATION), 4.90625f},  // dataNum about 40k
  {TC_TYPE(schema::PrimitiveType_LessEqual, schema::ActivationType_NO_ACTIVATION), 4.90625f},     // dataNum about 40k

  {TC_TYPE(schema::PrimitiveType_StridedSlice, 0), 38.027f},  // type 0 : parallel on outer tile, dataNum about 5.2k
  {TC_TYPE(schema::PrimitiveType_StridedSlice, 1), 42.042f},  // type 1 : parallel on split axis, dataNum about 4.5k

  {TC_TYPE(schema::PrimitiveType_BiasAdd, 0), 2.723f},  // dataNum about 65k
  {TC_TYPE(schema::PrimitiveType_Gather, 0), 11.438f},  // dataNum about 16k

  {TC_TYPE(schema::PrimitiveType_Fill, 0), 0.181f},  // dataNum about 260k(float/int IO : load 0, store 1)
  {TC_TYPE(schema::PrimitiveType_Cast, 0), 0.181f},  // dataNum about 100k(float/int IO : load 1, store 1)

  {TC_TYPE(schema::PrimitiveType_LayerNormFusion, 0), 507.812f},  // dataNum about 0.5k
  {TC_TYPE(schema::PrimitiveType_OneHot, 0), 136.562f},           // dataNum about 1.5k
  {TC_TYPE(schema::PrimitiveType_TileFusion, 0), 259.0625f},      // dataNum about 0.8k

  {TC_TYPE(schema::PrimitiveType_ReduceFusion, schema::ReduceMode_ReduceAll), 66.5625f},         // dataNum about 3k
  {TC_TYPE(schema::PrimitiveType_ReduceFusion, schema::ReduceMode_ReduceASum), 206.5625f},       // dataNum about 1k
  {TC_TYPE(schema::PrimitiveType_ReduceFusion, schema::ReduceMode_ReduceL2), 259.0625f},         // dataNum about 0.8k
  {TC_TYPE(schema::PrimitiveType_ReduceFusion, schema::ReduceMode_ReduceMax), 66.5625f},         // dataNum about 3k
  {TC_TYPE(schema::PrimitiveType_ReduceFusion, schema::ReduceMode_ReduceMean), 259.0625f},       // dataNum about 0.8k
  {TC_TYPE(schema::PrimitiveType_ReduceFusion, schema::ReduceMode_ReduceMin), 66.5625f},         // dataNum about 3k
  {TC_TYPE(schema::PrimitiveType_ReduceFusion, schema::ReduceMode_ReduceProd), 206.5625f},       // dataNum about 1k
  {TC_TYPE(schema::PrimitiveType_ReduceFusion, schema::ReduceMode_ReduceSum), 206.5625f},        // dataNum about 1k
  {TC_TYPE(schema::PrimitiveType_ReduceFusion, schema::ReduceMode_ReduceSumSquare), 206.5625f},  // dataNum about 1k
  {TC_TYPE(schema::PrimitiveType_ReduceFusion, (schema::ReduceMode_MAX + 1)), 259.0625f},        // dataNum about 0.8k
};

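// Cost-model constants, in abstract cycle units. Per the inline comments,
// the load/store cost is modeled as one L2 access (11 cycles) amortized over
// a 64-byte cache line; the thresholds below gate when parallel execution is
// worth its startup overhead.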
float ThreadCostModel::per_unit_load_cost_ = 1.0 / 64 * 11;   // 64: L2 cache line size, 11: L2 cache latency on Haswell
float ThreadCostModel::per_unit_store_cost_ = 1.0 / 64 * 11;  // 64: L2 cache line size, 11: L2 cache latency on Haswell
int64_t ThreadCostModel::per_unit_compute_num_ = 1;           // 1: per unit compute num

float ThreadCostModel::thread_startup_cost_ = 100000.0f;  // 100000: inherent cost of starting a thread
float ThreadCostModel::single_thread_cost_ = 100000.0f;   // 100000: minimum cost of single-threaded execution
float ThreadCostModel::parallel_thread_cost_ = 40000.0f;  // 40000: minimum cost per thread in parallel execution

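// Chooses how many blocks (tasks) the work should be split into. Starting
// from the block size suggested by the cost model (clamped by the
// oversharding factor and the total unit count), it tries progressively
// larger blocks, up to twice the starting size, and keeps the block count
// that maximizes thread occupancy:
//   block_count / (ceil(block_count / thread_num) * thread_num).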
int ThreadCostModel::GetOptimalThreadNum(const ThreadCostContext *thread_cost_context, const int thread_num) {
  const int64_t max_oversharding_factor = 4;

  int64_t block_size = MSVALID(max_oversharding_factor * thread_num, ThreadBlockSize(thread_cost_context),
                               thread_cost_context->total_unit_num_);
  int64_t block_count = UP_DIV(thread_cost_context->total_unit_num_, block_size);
  // The maximum block size is at most twice the regular block size.
  int64_t max_block_size = MSMIN(thread_cost_context->total_unit_num_, 2 * block_size);
  double max_efficiency = static_cast<double>(block_count) / (UP_DIV(block_count, thread_num) * thread_num);
  for (int64_t prev_block_count = block_count; max_efficiency < 1.0 && prev_block_count > 1;) {
    int64_t cur_block_size = UP_DIV(thread_cost_context->total_unit_num_, prev_block_count - 1);
    if (cur_block_size > max_block_size) {
      break;
    }
    const int64_t cur_block_count = UP_DIV(thread_cost_context->total_unit_num_, cur_block_size);
    MS_ASSERT(cur_block_count < prev_block_count);
    prev_block_count = cur_block_count;
    const double cur_efficiency =
      static_cast<double>(cur_block_count) / (UP_DIV(cur_block_count, thread_num) * thread_num);
    if (cur_efficiency + 0.01 >= max_efficiency) {  // 0.01: tolerance for preferring fewer, larger blocks
      block_size = cur_block_size;
      block_count = cur_block_count;
      if (max_efficiency < cur_efficiency) {
        max_efficiency = cur_efficiency;
      }
    }
  }
  return block_count;
}

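// Clamps task_num using the cost model: if the modeled work is too small for
// more than one thread, fall back to 1; otherwise cap task_num by the
// model's parallel degree and by the number of work units, so no task is
// left without data.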
int ThreadNumUpdateStrategy(const ThreadCostContext *thread_cost_context, int task_num) {
  if (task_num <= 1) {
    return task_num;
  }

  if (thread_cost_context != nullptr) {
    if (ThreadCostModel::ThreadNum(thread_cost_context) <= 1) {
      return 1;
    }
    int opt_thread = static_cast<int>(ThreadCostModel::ParallelDegree(thread_cost_context));
    task_num = MSVALID(1, opt_thread, task_num);
    task_num = MSMIN(task_num, thread_cost_context->total_unit_num_);
  }
  return task_num;
}

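// Entry point used by kernels: if the kernel type has a measured per-unit
// cost, build a ThreadCostContext and let the cost model shrink thread_num;
// otherwise return the caller's thread_num unchanged.
//
// Illustrative call (hypothetical caller and parameter values, shown only
// to document the signature):
//   int n = UpdateThreadNum(TC_TYPE(schema::PrimitiveType_Activation,
//                                   schema::ActivationType_RELU),
//                           /*per_unit_load_num=*/1, /*per_unit_store_num=*/1,
//                           /*unit_num=*/100000, /*thread_num=*/4);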
int UpdateThreadNum(int32_t kernel_type, int64_t per_unit_load_num, int64_t per_unit_store_num, int64_t unit_num,
                    int thread_num) {
  if (kernel_compute_cost_map_.count(kernel_type) > 0) {
    lite::ThreadCostContext thread_cost_context;
    thread_cost_context.per_unit_compute_cost_ = kernel_compute_cost_map_.at(kernel_type);
    thread_cost_context.per_unit_load_num_ = per_unit_load_num;
    thread_cost_context.per_unit_store_num_ = per_unit_store_num;
    thread_cost_context.total_unit_num_ = unit_num;
    return ThreadNumUpdateStrategy(&thread_cost_context, thread_num);
  }
  return thread_num;
}
}  // namespace mindspore::lite