/* * Copyright (C) 2020 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ // See /docs/design-docs/protozero.md for rationale and results. #include #include #include #include #include "perfetto/base/compiler.h" #include "perfetto/protozero/static_buffer.h" // Autogenerated headers in out/*/gen/ #include "src/protozero/test/example_proto/library.pbzero.h" #include "src/protozero/test/example_proto/test_messages.pb.h" #include "src/protozero/test/example_proto/test_messages.pbzero.h" // Generated by the protozero plugin. namespace pbzero = protozero::test::protos::pbzero; // Generated by the official protobuf compiler. namespace pblite = protozero::test::protos; namespace { // This needs to be > the max size written by each iteration. constexpr size_t kBufPerIteration = 512; // Write cyclically on a 64 MB buffer set to simulate a realistic tracing // scenario. constexpr size_t kTotalWorkingSetSize = 64 * 1024 * 1024; alignas(uint64_t) char g_out_buffer[kTotalWorkingSetSize]; char* g_cur = g_out_buffer; uint64_t g_fake_input_simple[] = {0x12345678, 0x90ABCDEF, 0x11111111, 0xFFFFFFFF, 0x6666666666666666ULL, 0x6666666666666666ULL, 0x6666666666666666ULL, 0x0066666666666666ULL}; // Speed-of-light serializer. Aa very simple C++ class that just appends data // into a linear buffer making all sorts of favourable assumptions. It does not // use any binary-stable encoding, it does not perform bound checking, // all writes are 64-bit aligned, it doesn't deal with any thread-safety. // The speed-of-light serializer serves as a reference for how fast a serializer // could be if argument marshalling and bound checking were zero cost. struct SOLMsg { template void Append(T x) { // The reinterpret_cast is to give favorable alignment guarantees. // The memcpy will be elided by the compiler, which will emit just a // 64-bit aligned mov instruction. memcpy(reinterpret_cast(ptr_), &x, sizeof(x)); ptr_ += sizeof(uint64_t); } void set_field_int32(int32_t x) { Append(x); } void set_field_uint32(uint32_t x) { Append(x); } void set_field_int64(int64_t x) { Append(x); } void set_field_uint64(uint64_t x) { Append(x); } void set_field_string(const char* str) { ptr_ = strcpy(ptr_, str); } SOLMsg* add_field_nested() { return new (this + 1) SOLMsg(); } alignas(uint64_t) char storage_[sizeof(g_fake_input_simple) + 8]; char* ptr_ = &storage_[0]; }; template PERFETTO_ALWAYS_INLINE void FillMessage_Simple(T* msg) { benchmark::DoNotOptimize(g_fake_input_simple); msg->set_field_int32(static_cast(g_fake_input_simple[0])); msg->set_field_uint32(static_cast(g_fake_input_simple[1])); msg->set_field_int64(static_cast(g_fake_input_simple[2])); msg->set_field_uint64(static_cast(g_fake_input_simple[3])); msg->set_field_string(reinterpret_cast(&g_fake_input_simple[4])); } template PERFETTO_ALWAYS_INLINE void FillMessage_Nested(T* msg, int depth = 0) { benchmark::DoNotOptimize(g_fake_input_simple); FillMessage_Simple(msg); if (depth < 3) { auto* child = msg->add_field_nested(); FillMessage_Nested(child, depth + 1); } } PERFETTO_ALWAYS_INLINE void Clobber(benchmark::State& state) { uint64_t* buf = reinterpret_cast(g_cur); // Read-back the data written to have a realistic evaluation of the // speed-of-light scenario. This is to deal with architecture of modern CPUs. // If we write a bunch of memory bytes, never read-back from them, and then // just over-write them, the CPU can just throw away the whole stream of // instructions that produced them, if that's still in flight and tracked in // the out-of-order units. // The buf[i-1] ^= buf forces the CPU to consume the result of the writes. buf[0] = reinterpret_cast(&state); for (size_t i = 1; i < kBufPerIteration / sizeof(uint64_t); i++) buf[i] ^= buf[i - 1]; if (buf[(kBufPerIteration / sizeof(uint64_t)) - 1] == 42) PERFETTO_LOG("."); benchmark::DoNotOptimize(buf); constexpr size_t kWrap = kTotalWorkingSetSize / kBufPerIteration; g_cur = &g_out_buffer[(state.iterations() % kWrap) * kBufPerIteration]; benchmark::ClobberMemory(); } } // namespace static void BM_Protozero_Simple_Libprotobuf(benchmark::State& state) { while (state.KeepRunning()) { { // The nested block is to account for RAII finalizers. pblite::EveryField msg; FillMessage_Simple(&msg); msg.SerializeToArray(g_cur, kBufPerIteration); } Clobber(state); } } static void BM_Protozero_Simple_Protozero(benchmark::State& state) { while (state.KeepRunning()) { { protozero::StaticBuffered msg(g_cur, kBufPerIteration); FillMessage_Simple(msg.get()); } Clobber(state); } } static void BM_Protozero_Simple_SpeedOfLight(benchmark::State& state) { while (state.KeepRunning()) { SOLMsg* msg = new (g_cur) SOLMsg(); FillMessage_Simple(msg); Clobber(state); } } static void BM_Protozero_Nested_Libprotobuf(benchmark::State& state) { while (state.KeepRunning()) { { pblite::EveryField msg; FillMessage_Nested(&msg); msg.SerializeToArray(g_cur, kBufPerIteration); } Clobber(state); } } static void BM_Protozero_Nested_Protozero(benchmark::State& state) { while (state.KeepRunning()) { { protozero::StaticBuffered msg(g_cur, kBufPerIteration); FillMessage_Nested(msg.get()); } Clobber(state); } } static void BM_Protozero_Nested_SpeedOfLight(benchmark::State& state) { while (state.KeepRunning()) { SOLMsg* msg = new (g_cur) SOLMsg(); FillMessage_Nested(msg); Clobber(state); } } BENCHMARK(BM_Protozero_Simple_Libprotobuf); BENCHMARK(BM_Protozero_Simple_Protozero); BENCHMARK(BM_Protozero_Simple_SpeedOfLight); BENCHMARK(BM_Protozero_Nested_Libprotobuf); BENCHMARK(BM_Protozero_Nested_Protozero); BENCHMARK(BM_Protozero_Nested_SpeedOfLight);