1 //
2 // Copyright © 2017 Arm Ltd. All rights reserved.
3 // SPDX-License-Identifier: MIT
4 //
5 #include "Profiling.hpp"
6
7 #include <armnn/BackendId.hpp>
8 #include <armnn/utility/Assert.hpp>
9 #include <armnn/utility/IgnoreUnused.hpp>
10
11 #include "JsonPrinter.hpp"
12
13 #if ARMNN_STREAMLINE_ENABLED
14 #include <streamline_annotate.h>
15 #endif
16
17 #include <algorithm>
18 #include <iomanip>
19 #include <iostream>
20 #include <fstream>
21 #include <map>
22 #include <stack>
23
24 namespace armnn
25 {
26
27 // Controls the amount of memory initially allocated to store profiling events.
28 // If chosen carefully, the profiling system will not make any additional allocations, thus minimizing its impact on
29 // measured times.
30 constexpr std::size_t g_ProfilingEventCountHint = 1024;
31
32 // Whether profiling reports should include the sequence of events together with their timings.
33 constexpr bool g_WriteProfilingEventSequence = true;
34
35 // Whether profiling reports should also report detailed information on events grouped by inference.
36 // This can spam the output stream, so use carefully (or adapt the code to just output information
37 // of interest).
38 constexpr bool g_AggregateProfilingEventsByInference = true;
39
40 // Whether a call to Profiler::AnalyzeEventsAndWriteResults() will be made when the Profiler is destroyed.
41 // It can be convenient for local tests.
42 constexpr bool g_WriteReportToStdOutOnProfilerDestruction = false;
43
FindMeasurement(const std::string & name,const Event * event)44 Measurement FindMeasurement(const std::string& name, const Event* event)
45 {
46
47 ARMNN_ASSERT(event != nullptr);
48
49 // Search though the measurements.
50 for (const auto& measurement : event->GetMeasurements())
51 {
52 if (measurement.m_Name == name)
53 {
54 // Measurement found.
55 return measurement;
56 }
57 }
58
59 // Measurement not found.
60 return Measurement{ "", 0.f, Measurement::Unit::TIME_MS };
61 }
62
FindKernelMeasurements(const Event * event)63 std::vector<Measurement> FindKernelMeasurements(const Event* event)
64 {
65 ARMNN_ASSERT(event != nullptr);
66
67 std::vector<Measurement> measurements;
68
69 // Search through the measurements.
70 for (const auto& measurement : event->GetMeasurements())
71 {
72 if (measurement.m_Name.rfind("OpenClKernelTimer", 0) == 0
73 || measurement.m_Name.rfind("NeonKernelTimer", 0) == 0)
74 {
75 // Measurement found.
76 measurements.push_back(measurement);
77 }
78 }
79
80 return measurements;
81 }
82
CalculateProfilingEventStats() const83 std::map<std::string, Profiler::ProfilingEventStats> Profiler::CalculateProfilingEventStats() const
84 {
85 std::map<std::string, ProfilingEventStats> nameToStatsMap;
86
87 for (const auto& event : m_EventSequence)
88 {
89 Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, event.get());
90
91 double durationMs = measurement.m_Value;
92 auto it = nameToStatsMap.find(event->GetName());
93 if (it != nameToStatsMap.end())
94 {
95 ProfilingEventStats& stats = it->second;
96 stats.m_TotalMs += durationMs;
97 stats.m_MinMs = std::min(stats.m_MinMs, durationMs);
98 stats.m_MaxMs = std::max(stats.m_MaxMs, durationMs);
99 ++stats.m_Count;
100 }
101 else
102 {
103 nameToStatsMap.emplace(event->GetName(), ProfilingEventStats{ durationMs, durationMs, durationMs, 1 });
104 }
105 }
106
107 return nameToStatsMap;
108 }
109
GetEventPtr(const Event * ptr)110 const Event* GetEventPtr(const Event* ptr) { return ptr;}
GetEventPtr(const std::unique_ptr<Event> & ptr)111 const Event* GetEventPtr(const std::unique_ptr<Event>& ptr) {return ptr.get(); }
112
113 template<typename ItertType>
AnalyzeEventSequenceAndWriteResults(ItertType first,ItertType last,std::ostream & outStream) const114 void Profiler::AnalyzeEventSequenceAndWriteResults(ItertType first, ItertType last, std::ostream& outStream) const
115 {
116 // Outputs event sequence, if needed.
117 if (g_WriteProfilingEventSequence)
118 {
119 // Makes sure timestamps are output with 6 decimals, and save old settings.
120 std::streamsize oldPrecision = outStream.precision();
121 outStream.precision(6);
122 std::ios_base::fmtflags oldFlags = outStream.flags();
123 outStream.setf(std::ios::fixed);
124 // Outputs fields.
125 outStream << "Event Sequence - Name | Duration (ms) | Start (ms) | Stop (ms) | Device" << std::endl;
126 for (auto event = first; event != last; ++event)
127 {
128 const Event* eventPtr = GetEventPtr((*event));
129 double startTimeMs = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME_START, eventPtr).m_Value;
130 double stopTimeMs = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME_STOP, eventPtr).m_Value;
131
132 // Find the WallClock measurement if there is one.
133 double durationMs = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, eventPtr).m_Value;
134 outStream << std::setw(50) << eventPtr->GetName() << " "
135 << std::setw(20) << durationMs
136 << std::setw(20) << startTimeMs
137 << std::setw(20) << stopTimeMs
138 << std::setw(20) << eventPtr->GetBackendId().Get()
139 << std::endl;
140 }
141 outStream << std::endl;
142 // Restores previous precision settings.
143 outStream.flags(oldFlags);
144 outStream.precision(oldPrecision);
145 }
146
147 // Aggregates results per event name.
148 std::map<std::string, ProfilingEventStats> nameToStatsMap = CalculateProfilingEventStats();
149
150 // Outputs aggregated stats.
151 outStream << "Event Stats - Name | Avg (ms) | Min (ms) | Max (ms) | Total (ms) | Count" << std::endl;
152 for (const auto& pair : nameToStatsMap)
153 {
154 const std::string& eventLabel = pair.first;
155 const ProfilingEventStats& eventStats = pair.second;
156 const double avgMs = eventStats.m_TotalMs / double(eventStats.m_Count);
157
158 outStream << "\t" << std::setw(50) << eventLabel << " " << std::setw(9) << avgMs << " "
159 << std::setw(9) << eventStats.m_MinMs << " " << std::setw(9) << eventStats.m_MaxMs << " "
160 << std::setw(9) << eventStats.m_TotalMs << " " << std::setw(9) << eventStats.m_Count << std::endl;
161 }
162 outStream << std::endl;
163 }
164
Profiler()165 Profiler::Profiler()
166 : m_ProfilingEnabled(false)
167 {
168 m_EventSequence.reserve(g_ProfilingEventCountHint);
169
170 #if ARMNN_STREAMLINE_ENABLED
171 // Initialises streamline annotations.
172 ANNOTATE_SETUP;
173 #endif
174 }
175
~Profiler()176 Profiler::~Profiler()
177 {
178 if (m_ProfilingEnabled)
179 {
180 if (g_WriteReportToStdOutOnProfilerDestruction)
181 {
182 Print(std::cout);
183 }
184 }
185
186 // Un-register this profiler from the current thread.
187 ProfilerManager::GetInstance().RegisterProfiler(nullptr);
188 }
189
IsProfilingEnabled()190 bool Profiler::IsProfilingEnabled()
191 {
192 return m_ProfilingEnabled;
193 }
194
EnableProfiling(bool enableProfiling)195 void Profiler::EnableProfiling(bool enableProfiling)
196 {
197 m_ProfilingEnabled = enableProfiling;
198 }
199
BeginEvent(const BackendId & backendId,const std::string & label,std::vector<InstrumentPtr> && instruments)200 Event* Profiler::BeginEvent(const BackendId& backendId,
201 const std::string& label,
202 std::vector<InstrumentPtr>&& instruments)
203 {
204 Event* parent = m_Parents.empty() ? nullptr : m_Parents.top();
205 m_EventSequence.push_back(std::make_unique<Event>(label, this, parent, backendId, std::move(instruments)));
206 Event* event = m_EventSequence.back().get();
207 event->Start();
208
209 #if ARMNN_STREAMLINE_ENABLED
210 ANNOTATE_CHANNEL_COLOR(uint32_t(m_Parents.size()), GetEventColor(backendId), label.c_str());
211 #endif
212
213 m_Parents.push(event);
214 return event;
215 }
216
EndEvent(Event * event)217 void Profiler::EndEvent(Event* event)
218 {
219 event->Stop();
220
221 ARMNN_ASSERT(!m_Parents.empty());
222 ARMNN_ASSERT(event == m_Parents.top());
223 m_Parents.pop();
224
225 Event* parent = m_Parents.empty() ? nullptr : m_Parents.top();
226 IgnoreUnused(parent);
227 ARMNN_ASSERT(event->GetParentEvent() == parent);
228
229 #if ARMNN_STREAMLINE_ENABLED
230 ANNOTATE_CHANNEL_END(uint32_t(m_Parents.size()));
231 #endif
232 }
233
CalcLevel(const Event * eventPtr)234 int CalcLevel(const Event* eventPtr)
235 {
236 int level=0;
237 while (eventPtr != nullptr)
238 {
239 eventPtr = eventPtr->GetParentEvent();
240 level++;
241 }
242 return level;
243 }
244
PopulateInferences(std::vector<const Event * > & outInferences,int & outBaseLevel) const245 void Profiler::PopulateInferences(std::vector<const Event*>& outInferences, int& outBaseLevel) const
246 {
247 outInferences.reserve(m_EventSequence.size());
248 for (const auto& event : m_EventSequence)
249 {
250 const Event* eventPtrRaw = event.get();
251 if (eventPtrRaw->GetName() == "EnqueueWorkload")
252 {
253 outBaseLevel = (outBaseLevel == -1) ? CalcLevel(eventPtrRaw) : outBaseLevel;
254 outInferences.push_back(eventPtrRaw);
255 }
256 }
257 }
258
PopulateDescendants(std::map<const Event *,std::vector<const Event * >> & outDescendantsMap) const259 void Profiler::PopulateDescendants(std::map<const Event*, std::vector<const Event*>>& outDescendantsMap) const
260 {
261 for (const auto& event : m_EventSequence)
262 {
263 const Event* eventPtrRaw = event.get();
264 const Event* parent = eventPtrRaw->GetParentEvent();
265
266 if (!parent)
267 {
268 continue;
269 }
270
271 auto it = outDescendantsMap.find(parent);
272 if (it == outDescendantsMap.end())
273 {
274 outDescendantsMap.emplace(parent, std::vector<const Event*>({eventPtrRaw}));
275 }
276 else
277 {
278 it->second.push_back(eventPtrRaw);
279 }
280 }
281 }
282
283
ExtractJsonObjects(unsigned int inferenceIndex,const Event * parentEvent,JsonChildObject & parentObject,std::map<const Event *,std::vector<const Event * >> descendantsMap)284 void ExtractJsonObjects(unsigned int inferenceIndex,
285 const Event* parentEvent,
286 JsonChildObject& parentObject,
287 std::map<const Event*, std::vector<const Event*>> descendantsMap)
288 {
289 ARMNN_ASSERT(parentEvent);
290 std::vector<Measurement> instrumentMeasurements = parentEvent->GetMeasurements();
291 unsigned int childIdx=0;
292 for(size_t measurementIndex = 0; measurementIndex < instrumentMeasurements.size(); ++measurementIndex, ++childIdx)
293 {
294 if (inferenceIndex == 0)
295 {
296 // Only add kernel measurement once, in case of multiple inferences
297 JsonChildObject measurementObject{instrumentMeasurements[measurementIndex].m_Name};
298 measurementObject.SetUnit(instrumentMeasurements[measurementIndex].m_Unit);
299 measurementObject.SetType(JsonObjectType::Measurement);
300
301 ARMNN_ASSERT(parentObject.NumChildren() == childIdx);
302 parentObject.AddChild(measurementObject);
303 }
304
305 parentObject.GetChild(childIdx).AddMeasurement(instrumentMeasurements[measurementIndex].m_Value);
306 }
307
308
309 auto childEventsIt = descendantsMap.find(parentEvent);
310 if (childEventsIt != descendantsMap.end())
311 {
312 for (auto childEvent : childEventsIt->second)
313 {
314 if (inferenceIndex == 0)
315 {
316 // Only add second level once, in case of multiple inferences
317 JsonChildObject childObject{childEvent->GetName()};
318 childObject.SetType(JsonObjectType::Event);
319 parentObject.AddChild(childObject);
320 }
321
322 // Recursively process children. In reality this won't be very deep recursion. ~4-6 levels deep.
323 ExtractJsonObjects(inferenceIndex, childEvent, parentObject.GetChild(childIdx), descendantsMap);
324
325 childIdx++;
326 }
327 }
328 }
329
Print(std::ostream & outStream) const330 void Profiler::Print(std::ostream& outStream) const
331 {
332 // Makes sure timestamps are output with 6 decimals, and save old settings.
333 std::streamsize oldPrecision = outStream.precision();
334 outStream.precision(6);
335 std::ios_base::fmtflags oldFlags = outStream.flags();
336 outStream.setf(std::ios::fixed);
337 JsonPrinter printer(outStream);
338
339 // First find all the "inference" Events and print out duration measurements.
340 int baseLevel = -1;
341 std::vector<const Event*> inferences;
342 PopulateInferences(inferences, baseLevel);
343
344 // Second map out descendants hierarchy
345 std::map<const Event*, std::vector<const Event*>> descendantsMap;
346 PopulateDescendants(descendantsMap);
347
348 JsonChildObject inferenceObject{"inference_measurements"};
349 JsonChildObject layerObject{"layer_measurements"};
350 std::vector<JsonChildObject> workloadObjects;
351 std::map<unsigned int, std::vector<JsonChildObject>> workloadToKernelObjects;
352
353 for (unsigned int inferenceIndex = 0; inferenceIndex < inferences.size(); ++inferenceIndex)
354 {
355 auto inference = inferences[inferenceIndex];
356 ExtractJsonObjects(inferenceIndex, inference, inferenceObject, descendantsMap);
357 }
358
359 printer.PrintHeader();
360 printer.PrintArmNNHeader();
361
362 // print inference object, also prints child layer and kernel measurements
363 size_t id=0;
364 printer.PrintJsonChildObject(inferenceObject, id);
365
366 // end of ArmNN
367 printer.PrintNewLine();
368 printer.PrintFooter();
369
370 // end of main JSON object
371 printer.PrintNewLine();
372 printer.PrintFooter();
373 printer.PrintNewLine();
374
375 // Restores previous precision settings.
376 outStream.flags(oldFlags);
377 outStream.precision(oldPrecision);
378 }
379
AnalyzeEventsAndWriteResults(std::ostream & outStream) const380 void Profiler::AnalyzeEventsAndWriteResults(std::ostream& outStream) const
381 {
382 // Stack should be empty now.
383 const bool saneMarkerSequence = m_Parents.empty();
384
385 // Abort if the sequence of markers was found to have incorrect information:
386 // The stats cannot be trusted.
387 if (!saneMarkerSequence)
388 {
389 outStream << "Cannot write profiling stats. "
390 "Unexpected errors were found when analyzing the sequence of logged events, which may lead to plainly "
391 "wrong stats. The profiling system may contain implementation issues or could have been used in an "
392 "unsafe manner." << std::endl;
393 return;
394 }
395
396 // Analyzes the full sequence of events.
397 AnalyzeEventSequenceAndWriteResults(m_EventSequence.cbegin(),
398 m_EventSequence.cend(),
399 outStream);
400
401 // Aggregates events by tag if requested (spams the output stream if done for all tags).
402 if (g_AggregateProfilingEventsByInference)
403 {
404 outStream << std::endl;
405 outStream << "***" << std::endl;
406 outStream << "*** Per Inference Stats" << std::endl;
407 outStream << "***" << std::endl;
408 outStream << std::endl;
409
410 int baseLevel = -1;
411 std::vector<const Event*> inferences;
412 PopulateInferences(inferences, baseLevel);
413
414 // Second map out descendants hierarchy
415 std::map<const Event*, std::vector<const Event*>> descendantsMap;
416 PopulateDescendants(descendantsMap);
417
418 std::function<void (const Event*, std::vector<const Event*>&)>
419 FindDescendantEvents = [&](const Event* eventPtr,
420 std::vector<const Event*>& sequence)
421 {
422 sequence.push_back(eventPtr);
423
424 if (CalcLevel(eventPtr) > baseLevel+2) //We only care about levels as deep as workload executions.
425 {
426 return;
427 }
428
429 auto children = descendantsMap.find(eventPtr);
430 if (children == descendantsMap.end())
431 {
432 return;
433 }
434
435 for (const Event* child : children->second)
436 {
437 return FindDescendantEvents(child, sequence);
438 }
439 };
440
441 // Third, find events belonging to each inference
442 int inferenceIdx = 0;
443 for (auto inference : inferences)
444 {
445 std::vector<const Event*> sequence;
446
447 //build sequence, depth first
448 FindDescendantEvents(inference, sequence);
449
450 outStream << "> Begin Inference: " << inferenceIdx << std::endl;
451 outStream << std::endl;
452 AnalyzeEventSequenceAndWriteResults(sequence.cbegin(),
453 sequence.cend(),
454 outStream);
455 outStream << std::endl;
456 outStream << "> End Inference: " << inferenceIdx << std::endl;
457
458 inferenceIdx++;
459 }
460 }
461 }
462
GetEventColor(const BackendId & backendId) const463 std::uint32_t Profiler::GetEventColor(const BackendId& backendId) const
464 {
465 static BackendId cpuRef("CpuRef");
466 static BackendId cpuAcc("CpuAcc");
467 static BackendId gpuAcc("GpuAcc");
468 if (backendId == cpuRef) {
469 // Cyan
470 return 0xffff001b;
471 } else if (backendId == cpuAcc) {
472 // Green
473 return 0x00ff001b;
474 } else if (backendId == gpuAcc) {
475 // Purple
476 return 0xff007f1b;
477 } else {
478 // Dark gray
479 return 0x5555551b;
480 }
481 }
482
483 // The thread_local pointer to the profiler instance.
484 thread_local Profiler* tl_Profiler = nullptr;
485
GetInstance()486 ProfilerManager& ProfilerManager::GetInstance()
487 {
488 // Global reference to the single ProfileManager instance allowed.
489 static ProfilerManager s_ProfilerManager;
490 return s_ProfilerManager;
491 }
492
RegisterProfiler(Profiler * profiler)493 void ProfilerManager::RegisterProfiler(Profiler* profiler)
494 {
495 tl_Profiler = profiler;
496 }
497
GetProfiler()498 Profiler* ProfilerManager::GetProfiler()
499 {
500 return tl_Profiler;
501 }
502
503 } // namespace armnn
504