1 /*
2 * Copyright (C) 2021 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "TaskProcessor.h"
18
19 #include <assert.h>
20 #include <sys/prctl.h>
21
22 #include "RenderScriptToolkit.h"
23 #include "Utils.h"
24
25 #define LOG_TAG "renderscript.toolkit.TaskProcessor"
26
27 namespace android {
28 namespace renderscript {
29
setTiling(unsigned int targetTileSizeInBytes)30 int Task::setTiling(unsigned int targetTileSizeInBytes) {
31 // Empirically, values smaller than 1000 are unlikely to give good performance.
32 targetTileSizeInBytes = std::max(1000u, targetTileSizeInBytes);
33 const size_t cellSizeInBytes =
34 mVectorSize; // If we add float support, vectorSize * 4 for that.
35 const size_t targetCellsPerTile = targetTileSizeInBytes / cellSizeInBytes;
36 assert(targetCellsPerTile > 0);
37
38 size_t cellsToProcessY;
39 size_t cellsToProcessX;
40 if (mRestriction == nullptr) {
41 cellsToProcessX = mSizeX;
42 cellsToProcessY = mSizeY;
43 } else {
44 assert(mRestriction->endX > mRestriction->startX);
45 assert(mRestriction->endY > mRestriction->startY);
46 cellsToProcessX = mRestriction->endX - mRestriction->startX;
47 cellsToProcessY = mRestriction->endY - mRestriction->startY;
48 }
49
50 // We want rows as large as possible, as the SIMD code we have is more efficient with
51 // large rows.
52 mTilesPerRow = divideRoundingUp(cellsToProcessX, targetCellsPerTile);
53 // Once we know the number of tiles per row, we divide that row evenly. We round up to make
54 // sure all cells are included in the last tile of the row.
55 mCellsPerTileX = divideRoundingUp(cellsToProcessX, mTilesPerRow);
56
57 // We do the same thing for the Y direction.
58 size_t targetRowsPerTile = divideRoundingUp(targetCellsPerTile, mCellsPerTileX);
59 mTilesPerColumn = divideRoundingUp(cellsToProcessY, targetRowsPerTile);
60 mCellsPerTileY = divideRoundingUp(cellsToProcessY, mTilesPerColumn);
61
62 return mTilesPerRow * mTilesPerColumn;
63 }
64
processTile(unsigned int threadIndex,size_t tileIndex)65 void Task::processTile(unsigned int threadIndex, size_t tileIndex) {
66 // Figure out the overall boundaries.
67 size_t startWorkX;
68 size_t startWorkY;
69 size_t endWorkX;
70 size_t endWorkY;
71 if (mRestriction == nullptr) {
72 startWorkX = 0;
73 startWorkY = 0;
74 endWorkX = mSizeX;
75 endWorkY = mSizeY;
76 } else {
77 startWorkX = mRestriction->startX;
78 startWorkY = mRestriction->startY;
79 endWorkX = mRestriction->endX;
80 endWorkY = mRestriction->endY;
81 }
82 // Figure out the rectangle for this tileIndex. All our tiles form a 2D grid. Identify
83 // first the X, Y coordinate of our tile in that grid.
84 size_t tileIndexY = tileIndex / mTilesPerRow;
85 size_t tileIndexX = tileIndex % mTilesPerRow;
86 // Calculate the starting and ending point of that tile.
87 size_t startCellX = startWorkX + tileIndexX * mCellsPerTileX;
88 size_t startCellY = startWorkY + tileIndexY * mCellsPerTileY;
89 size_t endCellX = std::min(startCellX + mCellsPerTileX, endWorkX);
90 size_t endCellY = std::min(startCellY + mCellsPerTileY, endWorkY);
91
92 // Call the derived class to do the specific work.
93 if (mPrefersDataAsOneRow && startCellX == 0 && endCellX == mSizeX) {
94 // When the tile covers entire rows, we can take advantage that some ops are not 2D.
95 processData(threadIndex, 0, startCellY, mSizeX * (endCellY - startCellY), startCellY + 1);
96 } else {
97 processData(threadIndex, startCellX, startCellY, endCellX, endCellY);
98 }
99 }
100
TaskProcessor(unsigned int numThreads)101 TaskProcessor::TaskProcessor(unsigned int numThreads)
102 : mUsesSimd{cpuSupportsSimd()},
103 /* If the requested number of threads is 0, we'll decide based on the number of cores.
104 * Through empirical testing, we've found that using more than 6 threads does not help.
105 * There may be more optimal choices to make depending on the SoC but we'll stick to
106 * this simple heuristic for now.
107 *
108 * We'll re-use the thread that calls the processor doTask method, so we'll spawn one less
109 * worker pool thread than the total number of threads.
110 */
111 mNumberOfPoolThreads{numThreads ? numThreads - 1
112 : std::min(6u, std::thread::hardware_concurrency() - 1)} {
113 for (size_t i = 0; i < mNumberOfPoolThreads; i++) {
114 mPoolThreads.emplace_back(
115 std::bind(&TaskProcessor::processTilesOfWork, this, i + 1, false));
116 }
117 }
118
~TaskProcessor()119 TaskProcessor::~TaskProcessor() {
120 {
121 std::lock_guard<std::mutex> lock(mQueueMutex);
122 mStopThreads = true;
123 mWorkAvailableOrStop.notify_all();
124 }
125
126 for (auto& thread : mPoolThreads) {
127 thread.join();
128 }
129 }
130
processTilesOfWork(int threadIndex,bool returnWhenNoWork)131 void TaskProcessor::processTilesOfWork(int threadIndex, bool returnWhenNoWork) {
132 if (threadIndex != 0) {
133 // Set the name of the thread, except for thread 0, which is not part of the pool.
134 // PR_SET_NAME takes a maximum of 16 characters, including the terminating null.
135 char name[16]{"RenderScToolkit"};
136 prctl(PR_SET_NAME, name, 0, 0, 0);
137 // ALOGI("Starting thread%d", threadIndex);
138 }
139
140 std::unique_lock<std::mutex> lock(mQueueMutex);
141 while (true) {
142 mWorkAvailableOrStop.wait(lock, [this, returnWhenNoWork]() REQUIRES(mQueueMutex) {
143 return mStopThreads || (mTilesNotYetStarted > 0) ||
144 (returnWhenNoWork && (mTilesNotYetStarted == 0));
145 });
146 // ALOGI("Woke thread%d", threadIndex);
147
148 // This ScopedLockAssertion is to help the compiler when it checks thread annotations
149 // to realize that we have the lock. It's however not completely true; we don't
150 // hold the lock while processing the tile.
151 // TODO Figure out how to fix that.
152 android::base::ScopedLockAssertion lockAssert(mQueueMutex);
153 if (mStopThreads || (returnWhenNoWork && mTilesNotYetStarted == 0)) {
154 break;
155 }
156
157 while (mTilesNotYetStarted > 0 && !mStopThreads) {
158 // This picks the tiles in decreasing order but that does not matter.
159 int myTile = --mTilesNotYetStarted;
160 mTilesInProcess++;
161 lock.unlock();
162 {
163 // We won't be executing this code unless the main thread is
164 // holding the mTaskMutex lock, which guards mCurrentTask.
165 // The compiler can't figure this out.
166 android::base::ScopedLockAssertion lockAssert(mTaskMutex);
167 mCurrentTask->processTile(threadIndex, myTile);
168 }
169 lock.lock();
170 mTilesInProcess--;
171 if (mTilesInProcess == 0 && mTilesNotYetStarted == 0) {
172 mWorkIsFinished.notify_one();
173 }
174 }
175 }
176 // if (threadIndex != 0) {
177 // ALOGI("Ending thread%d", threadIndex);
178 // }
179 }
180
doTask(Task * task)181 void TaskProcessor::doTask(Task* task) {
182 std::lock_guard<std::mutex> lockGuard(mTaskMutex);
183 task->setUsesSimd(mUsesSimd);
184 mCurrentTask = task;
185 // Notify the thread pool of available work.
186 startWork(task);
187 // Start processing some of the tiles on the calling thread.
188 processTilesOfWork(0, true);
189 // Wait for all the pool workers to complete.
190 waitForPoolWorkersToComplete();
191 mCurrentTask = nullptr;
192 }
193
startWork(Task * task)194 void TaskProcessor::startWork(Task* task) {
195 /**
196 * The size in bytes that we're hoping each tile will be. If this value is too small,
197 * we'll spend too much time in synchronization. If it's too large, some cores may be
198 * idle while others still have a lot of work to do. Ideally, it would depend on the
199 * device we're running. 16k is the same value used by RenderScript and seems reasonable
200 * from ad-hoc tests.
201 */
202 const size_t targetTileSize = 16 * 1024;
203
204 std::lock_guard<std::mutex> lock(mQueueMutex);
205 assert(mTilesInProcess == 0);
206 mTilesNotYetStarted = task->setTiling(targetTileSize);
207 mWorkAvailableOrStop.notify_all();
208 }
209
waitForPoolWorkersToComplete()210 void TaskProcessor::waitForPoolWorkersToComplete() {
211 std::unique_lock<std::mutex> lock(mQueueMutex);
212 // The predicate, i.e. the lambda, will make sure that
213 // we terminate even if the main thread calls this after
214 // mWorkIsFinished is signaled.
215 mWorkIsFinished.wait(lock, [this]() REQUIRES(mQueueMutex) {
216 return mTilesNotYetStarted == 0 && mTilesInProcess == 0;
217 });
218 }
219
220 } // namespace renderscript
221 } // namespace android
222