/*
 * Copyright (C) 2019 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/profiling/perf/perf_producer.h"

#include <random>
#include <utility>

#include <unistd.h>

#include <unwindstack/Error.h>
#include <unwindstack/Unwinder.h>

#include "perfetto/base/logging.h"
#include "perfetto/base/task_runner.h"
#include "perfetto/ext/base/metatrace.h"
#include "perfetto/ext/base/utils.h"
#include "perfetto/ext/base/weak_ptr.h"
#include "perfetto/ext/tracing/core/basic_types.h"
#include "perfetto/ext/tracing/core/producer.h"
#include "perfetto/ext/tracing/core/tracing_service.h"
#include "perfetto/ext/tracing/ipc/producer_ipc_client.h"
#include "perfetto/tracing/core/data_source_config.h"
#include "perfetto/tracing/core/data_source_descriptor.h"
#include "src/profiling/common/callstack_trie.h"
#include "src/profiling/common/proc_cmdline.h"
#include "src/profiling/common/producer_support.h"
#include "src/profiling/common/profiler_guardrails.h"
#include "src/profiling/common/unwind_support.h"
#include "src/profiling/perf/common_types.h"
#include "src/profiling/perf/event_reader.h"

#include "protos/perfetto/common/builtin_clock.pbzero.h"
#include "protos/perfetto/common/perf_events.gen.h"
#include "protos/perfetto/common/perf_events.pbzero.h"
#include "protos/perfetto/config/profiling/perf_event_config.gen.h"
#include "protos/perfetto/trace/profiling/profile_packet.pbzero.h"
#include "protos/perfetto/trace/trace_packet.pbzero.h"
#include "protos/perfetto/trace/trace_packet_defaults.pbzero.h"

namespace perfetto {
namespace profiling {
namespace {

// TODO(b/151835887): on Android, when using signals, there exists a vulnerable
// window between a process image being replaced by execve, and the new
// libc instance reinstalling the proper signal handlers. During this window,
// the signal disposition is defaulted to terminating the process.
// This is a best-effort mitigation from the daemon's side, using a heuristic
// that most execve calls follow a fork. So if we get a sample for a very fresh
// process, the grace period will give it a chance to get to a properly
// initialised state prior to getting signalled. This doesn't help cases when a
// mature process calls execve, or when the target gets descheduled (since this
// is a naive walltime wait).
// The proper fix is in the platform, see bug for progress.
constexpr uint32_t kProcDescriptorsAndroidDelayMs = 50;

constexpr uint32_t kMemoryLimitCheckPeriodMs = 1000;

constexpr uint32_t kInitialConnectionBackoffMs = 100;
constexpr uint32_t kMaxConnectionBackoffMs = 30 * 1000;

constexpr char kProducerName[] = "perfetto.traced_perf";
constexpr char kDataSourceName[] = "linux.perf";

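// Note: _SC_NPROCESSORS_CONF counts configured cpus, including those currently
// offline, presumably so that sampling also covers cpus that come online
// mid-trace.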
size_t NumberOfCpus() {
  return static_cast<size_t>(sysconf(_SC_NPROCESSORS_CONF));
}

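// Maps the sampling clock chosen for the perf session
// (perf_event_attr.clockid) onto the corresponding perfetto builtin clock,
// used as the default timestamp domain for the sample packets.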
int32_t ToBuiltinClock(int32_t clockid) {
  switch (clockid) {
    case CLOCK_REALTIME:
      return protos::pbzero::BUILTIN_CLOCK_REALTIME;
    case CLOCK_MONOTONIC:
      return protos::pbzero::BUILTIN_CLOCK_MONOTONIC;
    case CLOCK_MONOTONIC_RAW:
      return protos::pbzero::BUILTIN_CLOCK_MONOTONIC_RAW;
    case CLOCK_BOOTTIME:
      return protos::pbzero::BUILTIN_CLOCK_BOOTTIME;
    // Should never get invalid input here as otherwise the syscall itself
    // would've failed earlier.
    default:
      return protos::pbzero::BUILTIN_CLOCK_UNKNOWN;
  }
}

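// Starts a packet that is marked as depending on the sequence's incremental
// state (e.g. interned callstacks), letting trace consumers skip it if that
// state was lost.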
TraceWriter::TracePacketHandle StartTracePacket(TraceWriter* trace_writer) {
  auto packet = trace_writer->NewTracePacket();
  packet->set_sequence_flags(
      protos::pbzero::TracePacket::SEQ_NEEDS_INCREMENTAL_STATE);
  return packet;
}

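// Emits a packet that begins a new incremental state generation and records
// the trace packet defaults for this sequence: the timestamp clock, and a
// description of the timebase event (frequency/period and identity) against
// which subsequent PerfSample packets are interpreted.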
void WritePerfEventDefaultsPacket(const EventConfig& event_config,
                                  TraceWriter* trace_writer) {
  auto packet = trace_writer->NewTracePacket();
  packet->set_timestamp(static_cast<uint64_t>(base::GetBootTimeNs().count()));
  packet->set_timestamp_clock_id(protos::pbzero::BUILTIN_CLOCK_BOOTTIME);

  // start new incremental state generation:
  packet->set_sequence_flags(
      protos::pbzero::TracePacket::SEQ_INCREMENTAL_STATE_CLEARED);

  // default packet timestamp clock for the samples:
  perf_event_attr* perf_attr = event_config.perf_attr();
  auto* defaults = packet->set_trace_packet_defaults();
  int32_t builtin_clock = ToBuiltinClock(perf_attr->clockid);
  defaults->set_timestamp_clock_id(static_cast<uint32_t>(builtin_clock));

  auto* perf_defaults = defaults->set_perf_sample_defaults();
  auto* timebase_pb = perf_defaults->set_timebase();

  // frequency/period:
  if (perf_attr->freq) {
    timebase_pb->set_frequency(perf_attr->sample_freq);
  } else {
    timebase_pb->set_period(perf_attr->sample_period);
  }

  // event:
  const PerfCounter& timebase = event_config.timebase_event();
  switch (timebase.event_type()) {
    case PerfCounter::Type::kBuiltinCounter: {
      timebase_pb->set_counter(
          static_cast<protos::pbzero::PerfEvents::Counter>(timebase.counter));
      break;
    }
    case PerfCounter::Type::kTracepoint: {
      auto* tracepoint_pb = timebase_pb->set_tracepoint();
      tracepoint_pb->set_name(timebase.tracepoint_name);
      tracepoint_pb->set_filter(timebase.tracepoint_filter);
      break;
    }
    case PerfCounter::Type::kRawEvent: {
      auto* raw_pb = timebase_pb->set_raw_event();
      raw_pb->set_type(timebase.attr_type);
      raw_pb->set_config(timebase.attr_config);
      raw_pb->set_config1(timebase.attr_config1);
      raw_pb->set_config2(timebase.attr_config2);
      break;
    }
  }

  // optional name to identify the counter during parsing:
  if (!timebase.name.empty()) {
    timebase_pb->set_name(timebase.name);
  }

  // Not setting timebase.timestamp_clock since the field that matters during
  // parsing is the root timestamp_clock_id set above.
}

uint32_t TimeToNextReadTickMs(DataSourceInstanceID ds_id, uint32_t period_ms) {
  // Normally, we'd schedule the next tick at the next |period_ms|
  // boundary of the boot clock. However, to avoid aligning the read tasks of
  // all concurrent data sources, we select a deterministic offset based on the
  // data source id.
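  // Worked example (illustrative numbers): with period_ms = 500 and a
  // ds_id-derived offset of 137, ticks are scheduled at times ...637, 1137,
  // 1637 ms. A call at now_ms = 1500 therefore returns 1637 - 1500 = 137.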
  std::minstd_rand prng(static_cast<std::minstd_rand::result_type>(ds_id));
  std::uniform_int_distribution<uint32_t> dist(0, period_ms - 1);
  uint32_t ds_period_offset = dist(prng);

  uint64_t now_ms = static_cast<uint64_t>(base::GetWallTimeMs().count());
  return period_ms - ((now_ms - ds_period_offset) % period_ms);
}

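// Decides whether samples for |pid| should be dropped. Exclusion rules
// (cmdline globs and pids) are evaluated first, then the allow-lists. If all
// allow-lists are empty and no additional-cmdline budget is configured,
// everything not explicitly excluded is kept. Otherwise a nonzero
// |additional_cmdline_count| additionally admits the first N distinct
// cmdlines seen.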
bool ShouldRejectDueToFilter(pid_t pid,
                             base::FlatSet<std::string>* additional_cmdlines,
                             const TargetFilter& filter) {
  PERFETTO_CHECK(additional_cmdlines);

  std::string cmdline;
  bool have_cmdline = glob_aware::ReadProcCmdlineForPID(pid, &cmdline);

  const char* binname = "";
  if (have_cmdline) {
    binname = glob_aware::FindBinaryName(cmdline.c_str(), cmdline.size());
  }

  auto has_matching_pattern = [](const std::vector<std::string>& patterns,
                                 const char* cmd, const char* name) {
    for (const std::string& pattern : patterns) {
      if (glob_aware::MatchGlobPattern(pattern.c_str(), cmd, name)) {
        return true;
      }
    }
    return false;
  };

  if (have_cmdline &&
      has_matching_pattern(filter.exclude_cmdlines, cmdline.c_str(), binname)) {
    PERFETTO_DLOG("Explicitly rejecting samples for pid [%d] due to cmdline",
                  static_cast<int>(pid));
    return true;
  }
  if (filter.exclude_pids.count(pid)) {
    PERFETTO_DLOG("Explicitly rejecting samples for pid [%d] due to pid",
                  static_cast<int>(pid));
    return true;
  }

  if (have_cmdline &&
      has_matching_pattern(filter.cmdlines, cmdline.c_str(), binname)) {
    return false;
  }
  if (filter.pids.count(pid)) {
    return false;
  }

  // Empty allow filter means keep everything that isn't explicitly excluded.
  if (filter.cmdlines.empty() && filter.pids.empty() &&
      !filter.additional_cmdline_count) {
    return false;
  }

  // Config option that allows profiling just the first N seen cmdlines.
  if (have_cmdline) {
    if (additional_cmdlines->count(cmdline)) {
      return false;
    }
    if (additional_cmdlines->size() < filter.additional_cmdline_count) {
      additional_cmdlines->insert(cmdline);
      return false;
    }
  }

  PERFETTO_DLOG("Rejecting samples for pid [%d]", static_cast<int>(pid));
  return true;
}

protos::pbzero::Profiling::CpuMode ToCpuModeEnum(uint16_t perf_cpu_mode) {
  using Profiling = protos::pbzero::Profiling;
  switch (perf_cpu_mode) {
    case PERF_RECORD_MISC_KERNEL:
      return Profiling::MODE_KERNEL;
    case PERF_RECORD_MISC_USER:
      return Profiling::MODE_USER;
    case PERF_RECORD_MISC_HYPERVISOR:
      return Profiling::MODE_HYPERVISOR;
    case PERF_RECORD_MISC_GUEST_KERNEL:
      return Profiling::MODE_GUEST_KERNEL;
    case PERF_RECORD_MISC_GUEST_USER:
      return Profiling::MODE_GUEST_USER;
    default:
      return Profiling::MODE_UNKNOWN;
  }
}

protos::pbzero::Profiling::StackUnwindError ToProtoEnum(
    unwindstack::ErrorCode error_code) {
  using Profiling = protos::pbzero::Profiling;
  switch (error_code) {
    case unwindstack::ERROR_NONE:
      return Profiling::UNWIND_ERROR_NONE;
    case unwindstack::ERROR_MEMORY_INVALID:
      return Profiling::UNWIND_ERROR_MEMORY_INVALID;
    case unwindstack::ERROR_UNWIND_INFO:
      return Profiling::UNWIND_ERROR_UNWIND_INFO;
    case unwindstack::ERROR_UNSUPPORTED:
      return Profiling::UNWIND_ERROR_UNSUPPORTED;
    case unwindstack::ERROR_INVALID_MAP:
      return Profiling::UNWIND_ERROR_INVALID_MAP;
    case unwindstack::ERROR_MAX_FRAMES_EXCEEDED:
      return Profiling::UNWIND_ERROR_MAX_FRAMES_EXCEEDED;
    case unwindstack::ERROR_REPEATED_FRAME:
      return Profiling::UNWIND_ERROR_REPEATED_FRAME;
    case unwindstack::ERROR_INVALID_ELF:
      return Profiling::UNWIND_ERROR_INVALID_ELF;
    case unwindstack::ERROR_SYSTEM_CALL:
      return Profiling::UNWIND_ERROR_SYSTEM_CALL;
    case unwindstack::ERROR_THREAD_TIMEOUT:
      return Profiling::UNWIND_ERROR_THREAD_TIMEOUT;
    case unwindstack::ERROR_THREAD_DOES_NOT_EXIST:
      return Profiling::UNWIND_ERROR_THREAD_DOES_NOT_EXIST;
    case unwindstack::ERROR_BAD_ARCH:
      return Profiling::UNWIND_ERROR_BAD_ARCH;
    case unwindstack::ERROR_MAPS_PARSE:
      return Profiling::UNWIND_ERROR_MAPS_PARSE;
    case unwindstack::ERROR_INVALID_PARAMETER:
      return Profiling::UNWIND_ERROR_INVALID_PARAMETER;
  }
  return Profiling::UNWIND_ERROR_UNKNOWN;
}

}  // namespace

PerfProducer::PerfProducer(ProcDescriptorGetter* proc_fd_getter,
                           base::TaskRunner* task_runner)
    : task_runner_(task_runner),
      proc_fd_getter_(proc_fd_getter),
      unwinding_worker_(this),
      weak_factory_(this) {
  proc_fd_getter->SetDelegate(this);
}

void PerfProducer::SetupDataSource(DataSourceInstanceID,
                                   const DataSourceConfig&) {}

void PerfProducer::StartDataSource(DataSourceInstanceID ds_id,
                                   const DataSourceConfig& config) {
  PERFETTO_LOG("StartDataSource(%zu, %s)", static_cast<size_t>(ds_id),
               config.name().c_str());

  if (config.name() == MetatraceWriter::kDataSourceName) {
    StartMetatraceSource(ds_id, static_cast<BufferID>(config.target_buffer()));
    return;
  }

  // linux.perf data source
  if (config.name() != kDataSourceName)
    return;

  // Tracepoint name -> id lookup in case the config asks for tracepoints:
  auto tracepoint_id_lookup = [this](const std::string& group,
                                     const std::string& name) {
    if (!tracefs_)  // lazy init or retry
      tracefs_ = FtraceProcfs::CreateGuessingMountPoint();
    if (!tracefs_)  // still didn't find an accessible tracefs
      return 0u;
    return tracefs_->ReadEventId(group, name);
  };

  protos::gen::PerfEventConfig event_config_pb;
  if (!event_config_pb.ParseFromString(config.perf_event_config_raw())) {
    PERFETTO_ELOG("PerfEventConfig could not be parsed.");
    return;
  }
  base::Optional<EventConfig> event_config =
      EventConfig::Create(event_config_pb, config, tracepoint_id_lookup);
  if (!event_config.has_value()) {
    PERFETTO_ELOG("PerfEventConfig rejected.");
    return;
  }

  size_t num_cpus = NumberOfCpus();
  std::vector<EventReader> per_cpu_readers;
  for (uint32_t cpu = 0; cpu < num_cpus; cpu++) {
    base::Optional<EventReader> event_reader =
        EventReader::ConfigureEvents(cpu, event_config.value());
    if (!event_reader.has_value()) {
      PERFETTO_ELOG("Failed to set up perf events for cpu%" PRIu32
                    ", discarding data source.",
                    cpu);
      return;
    }
    per_cpu_readers.emplace_back(std::move(event_reader.value()));
  }

  auto buffer_id = static_cast<BufferID>(config.target_buffer());
  auto writer = endpoint_->CreateTraceWriter(buffer_id);

  // Construct the data source instance.
  std::map<DataSourceInstanceID, DataSourceState>::iterator ds_it;
  bool inserted;
  std::tie(ds_it, inserted) = data_sources_.emplace(
      std::piecewise_construct, std::forward_as_tuple(ds_id),
      std::forward_as_tuple(event_config.value(), std::move(writer),
                            std::move(per_cpu_readers)));
  PERFETTO_CHECK(inserted);
  DataSourceState& ds = ds_it->second;

  // Start the configured events.
  for (auto& per_cpu_reader : ds.per_cpu_readers) {
    per_cpu_reader.EnableEvents();
  }

  WritePerfEventDefaultsPacket(ds.event_config, ds.trace_writer.get());

  InterningOutputTracker::WriteFixedInterningsPacket(
      ds_it->second.trace_writer.get(),
      protos::pbzero::TracePacket::SEQ_NEEDS_INCREMENTAL_STATE);

  // Inform unwinder of the new data source instance, and optionally start a
  // periodic task to clear its cached state.
  unwinding_worker_->PostStartDataSource(ds_id,
                                         ds.event_config.kernel_frames());
  if (ds.event_config.unwind_state_clear_period_ms()) {
    unwinding_worker_->PostClearCachedStatePeriodic(
        ds_id, ds.event_config.unwind_state_clear_period_ms());
  }

  // Kick off periodic read task.
  auto tick_period_ms = ds.event_config.read_tick_period_ms();
  auto weak_this = weak_factory_.GetWeakPtr();
  task_runner_->PostDelayedTask(
      [weak_this, ds_id] {
        if (weak_this)
          weak_this->TickDataSourceRead(ds_id);
      },
      TimeToNextReadTickMs(ds_id, tick_period_ms));

  // Optionally kick off periodic memory footprint limit check.
  uint32_t max_daemon_memory_kb = event_config_pb.max_daemon_memory_kb();
  if (max_daemon_memory_kb > 0) {
    task_runner_->PostDelayedTask(
        [weak_this, ds_id, max_daemon_memory_kb] {
          if (weak_this)
            weak_this->CheckMemoryFootprintPeriodic(ds_id,
                                                    max_daemon_memory_kb);
        },
        kMemoryLimitCheckPeriodMs);
  }
}

void PerfProducer::CheckMemoryFootprintPeriodic(DataSourceInstanceID ds_id,
                                                uint32_t max_daemon_memory_kb) {
  auto ds_it = data_sources_.find(ds_id);
  if (ds_it == data_sources_.end())
    return;  // stop recurring

  GuardrailConfig gconfig = {};
  gconfig.memory_guardrail_kb = max_daemon_memory_kb;

  ProfilerMemoryGuardrails footprint_snapshot;
  if (footprint_snapshot.IsOverMemoryThreshold(gconfig)) {
    PurgeDataSource(ds_id);
    return;  // stop recurring
  }

  // repost
  auto weak_this = weak_factory_.GetWeakPtr();
  task_runner_->PostDelayedTask(
      [weak_this, ds_id, max_daemon_memory_kb] {
        if (weak_this)
          weak_this->CheckMemoryFootprintPeriodic(ds_id, max_daemon_memory_kb);
      },
      kMemoryLimitCheckPeriodMs);
}

void PerfProducer::StopDataSource(DataSourceInstanceID ds_id) {
  PERFETTO_LOG("StopDataSource(%zu)", static_cast<size_t>(ds_id));

  // Metatrace: stop immediately (will miss the events from the
  // asynchronous shutdown of the primary data source).
  auto meta_it = metatrace_writers_.find(ds_id);
  if (meta_it != metatrace_writers_.end()) {
    meta_it->second.WriteAllAndFlushTraceWriter([] {});
    metatrace_writers_.erase(meta_it);
    return;
  }

  auto ds_it = data_sources_.find(ds_id);
  if (ds_it == data_sources_.end()) {
    // Most likely, the source is missing due to an abrupt stop (via
    // |PurgeDataSource|). Tell the service that we've stopped the source now,
    // so that it doesn't wait for the ack until the timeout.
    endpoint_->NotifyDataSourceStopped(ds_id);
    return;
  }

  // Start shutting down the reading frontend, which will propagate the stop
  // further as the intermediate buffers are cleared.
  DataSourceState& ds = ds_it->second;
  InitiateReaderStop(&ds);
}

// The perf data sources ignore flush requests, as flushing would be
// unnecessarily complicated given out-of-order unwinding and proc-fd timeouts.
// Instead of responding to explicit flushes, we can ensure that we're otherwise
// well-behaved (do not reorder packets too much), and let the service scrape
// the SMB.
void PerfProducer::Flush(FlushRequestID flush_id,
                         const DataSourceInstanceID* data_source_ids,
                         size_t num_data_sources) {
  // Flush metatracing if requested.
  for (size_t i = 0; i < num_data_sources; i++) {
    auto ds_id = data_source_ids[i];
    PERFETTO_DLOG("Flush(%zu)", static_cast<size_t>(ds_id));

    auto meta_it = metatrace_writers_.find(ds_id);
    if (meta_it != metatrace_writers_.end()) {
      meta_it->second.WriteAllAndFlushTraceWriter([] {});
    }
  }

  endpoint_->NotifyFlushComplete(flush_id);
}

void PerfProducer::ClearIncrementalState(
    const DataSourceInstanceID* data_source_ids,
    size_t num_data_sources) {
  for (size_t i = 0; i < num_data_sources; i++) {
    auto ds_id = data_source_ids[i];
    PERFETTO_DLOG("ClearIncrementalState(%zu)", static_cast<size_t>(ds_id));

    if (metatrace_writers_.find(ds_id) != metatrace_writers_.end())
      continue;

    auto ds_it = data_sources_.find(ds_id);
    if (ds_it == data_sources_.end()) {
      PERFETTO_DLOG("ClearIncrementalState(%zu): did not find matching entry",
                    static_cast<size_t>(ds_id));
      continue;
    }
    DataSourceState& ds = ds_it->second;

    WritePerfEventDefaultsPacket(ds.event_config, ds.trace_writer.get());

    // Forget which incremental state we've emitted before.
    ds.interning_output.ClearHistory();
    InterningOutputTracker::WriteFixedInterningsPacket(
        ds.trace_writer.get(),
        protos::pbzero::TracePacket::SEQ_NEEDS_INCREMENTAL_STATE);

    // Drop the cross-datasource callstack interning trie. This is not
    // necessary for correctness (the preceding step is sufficient). However,
    // incremental clearing is likely to be used in ring buffer traces, where
    // it makes sense to reset the trie's size periodically, and this is a
    // reasonable point to do so. The trie keeps the monotonic interning IDs,
    // so there is no confusion for other concurrent data sources. We do not
    // bother with clearing concurrent sources' interning output trackers as
    // their footprint should be trivial.
    callstack_trie_.ClearTrie();
  }
}

void PerfProducer::TickDataSourceRead(DataSourceInstanceID ds_id) {
  auto it = data_sources_.find(ds_id);
  if (it == data_sources_.end()) {
    PERFETTO_DLOG("TickDataSourceRead(%zu): source gone",
                  static_cast<size_t>(ds_id));
    return;
  }
  DataSourceState& ds = it->second;

  PERFETTO_METATRACE_SCOPED(TAG_PRODUCER, PROFILER_READ_TICK);

  // Make a pass over all per-cpu readers.
  uint64_t max_samples = ds.event_config.samples_per_tick_limit();
  bool more_records_available = false;
  for (EventReader& reader : ds.per_cpu_readers) {
    if (ReadAndParsePerCpuBuffer(&reader, max_samples, ds_id, &ds)) {
      more_records_available = true;
    }
  }

  // Wake up the unwinder as we've (likely) pushed samples into its queue.
  unwinding_worker_->PostProcessQueue();

  if (PERFETTO_UNLIKELY(ds.status == DataSourceState::Status::kShuttingDown) &&
      !more_records_available) {
    unwinding_worker_->PostInitiateDataSourceStop(ds_id);
  } else {
    // otherwise, keep reading
    auto tick_period_ms = it->second.event_config.read_tick_period_ms();
    auto weak_this = weak_factory_.GetWeakPtr();
    task_runner_->PostDelayedTask(
        [weak_this, ds_id] {
          if (weak_this)
            weak_this->TickDataSourceRead(ds_id);
        },
        TimeToNextReadTickMs(ds_id, tick_period_ms));
  }
}

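// Drains up to |max_samples| records from a single per-cpu ring buffer.
// Counter-only samples are emitted immediately, while samples with stacks are
// enqueued for asynchronous unwinding (or recorded as skipped). Returns true
// if the reader most likely has further records pending, false once it has
// caught up to the kernel's writer.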
bool PerfProducer::ReadAndParsePerCpuBuffer(EventReader* reader,
                                            uint64_t max_samples,
                                            DataSourceInstanceID ds_id,
                                            DataSourceState* ds) {
  PERFETTO_METATRACE_SCOPED(TAG_PRODUCER, PROFILER_READ_CPU);

  // If the kernel ring buffer dropped data, record it in the trace.
  size_t cpu = reader->cpu();
  auto records_lost_callback = [this, ds_id, cpu](uint64_t records_lost) {
    auto weak_this = weak_factory_.GetWeakPtr();
    task_runner_->PostTask([weak_this, ds_id, cpu, records_lost] {
      if (weak_this)
        weak_this->EmitRingBufferLoss(ds_id, cpu, records_lost);
    });
  };

  for (uint64_t i = 0; i < max_samples; i++) {
    base::Optional<ParsedSample> sample =
        reader->ReadUntilSample(records_lost_callback);
    if (!sample) {
      return false;  // caught up to the writer
    }

    // Counter-only mode: skip the unwinding stage, enqueue the sample for
    // output immediately.
    if (!ds->event_config.sample_callstacks()) {
      CompletedSample output;
      output.common = sample->common;
      EmitSample(ds_id, std::move(output));
      continue;
    }

    // If sampling callstacks, we're not interested in kernel threads/workers.
    if (!sample->regs) {
      continue;
    }

    // Request proc-fds for the process if this is the first time we see it.
    pid_t pid = sample->common.pid;
    auto& process_state = ds->process_states[pid];  // insert if new

    if (process_state == ProcessTrackingStatus::kExpired) {
      PERFETTO_DLOG("Skipping sample for previously expired pid [%d]",
                    static_cast<int>(pid));
      EmitSkippedSample(ds_id, std::move(sample.value()),
                        SampleSkipReason::kReadStage);
      continue;
    }

    // Previously failed the target filter check.
    if (process_state == ProcessTrackingStatus::kRejected) {
      PERFETTO_DLOG("Skipping sample for pid [%d] due to target filter",
                    static_cast<int>(pid));
      continue;
    }

    // Seeing pid for the first time.
    if (process_state == ProcessTrackingStatus::kInitial) {
      PERFETTO_DLOG("New pid: [%d]", static_cast<int>(pid));

      // Check whether samples for this new process should be
      // dropped due to the target filtering.
      const TargetFilter& filter = ds->event_config.filter();
      if (ShouldRejectDueToFilter(pid, &ds->additional_cmdlines, filter)) {
        process_state = ProcessTrackingStatus::kRejected;
        continue;
      }

      // At this point, sampled process is known to be of interest, so start
      // resolving the proc-fds. Response is async.
      process_state = ProcessTrackingStatus::kResolving;
      InitiateDescriptorLookup(ds_id, pid,
                               ds->event_config.remote_descriptor_timeout_ms());
    }

    PERFETTO_CHECK(process_state == ProcessTrackingStatus::kResolved ||
                   process_state == ProcessTrackingStatus::kResolving);

    // Optionally: drop sample if above a given threshold of sampled stacks
    // that are waiting in the unwinding queue.
    uint64_t max_footprint_bytes =
        ds->event_config.max_enqueued_footprint_bytes();
    uint64_t sample_stack_size = sample->stack.size();
    if (max_footprint_bytes) {
      uint64_t footprint_bytes = unwinding_worker_->GetEnqueuedFootprint();
      if (footprint_bytes + sample_stack_size >= max_footprint_bytes) {
        PERFETTO_DLOG("Skipping sample enqueueing due to footprint limit.");
        EmitSkippedSample(ds_id, std::move(sample.value()),
                          SampleSkipReason::kUnwindEnqueue);
        continue;
      }
    }

    // Push the sample into the unwinding queue if there is room.
    auto& queue = unwinding_worker_->unwind_queue();
    WriteView write_view = queue.BeginWrite();
    if (write_view.valid) {
      queue.at(write_view.write_pos) =
          UnwindEntry{ds_id, std::move(sample.value())};
      queue.CommitWrite();
      unwinding_worker_->IncrementEnqueuedFootprint(sample_stack_size);
    } else {
      PERFETTO_DLOG("Unwinder queue full, skipping sample");
      EmitSkippedSample(ds_id, std::move(sample.value()),
                        SampleSkipReason::kUnwindEnqueue);
    }
  }

  // Most likely more events in the kernel buffer. Though we might be exactly on
  // the boundary due to |max_samples|.
  return true;
}

// Note: first-fit makes descriptor request fulfillment not true FIFO. But the
// edge-cases where it matters are very unlikely.
void PerfProducer::OnProcDescriptors(pid_t pid,
                                     uid_t uid,
                                     base::ScopedFile maps_fd,
                                     base::ScopedFile mem_fd) {
  // Find first-fit data source that requested descriptors for the process.
  for (auto& it : data_sources_) {
    DataSourceState& ds = it.second;
    auto proc_status_it = ds.process_states.find(pid);
    if (proc_status_it == ds.process_states.end())
      continue;

    if (!CanProfile(ds.event_config.raw_ds_config(), uid,
                    ds.event_config.target_installed_by())) {
      PERFETTO_DLOG("Not profileable: pid [%d], uid [%d] for DS [%zu]",
                    static_cast<int>(pid), static_cast<int>(uid),
                    static_cast<size_t>(it.first));
      continue;
    }

    // Match against either resolving, or expired state. In the latter
    // case, it means that the async response was slow enough that we've marked
    // the lookup as expired (but can now recover for future samples).
    auto proc_status = proc_status_it->second;
    if (proc_status == ProcessTrackingStatus::kResolving ||
        proc_status == ProcessTrackingStatus::kExpired) {
      PERFETTO_DLOG("Handing off proc-fds for pid [%d] to DS [%zu]",
                    static_cast<int>(pid), static_cast<size_t>(it.first));

      proc_status_it->second = ProcessTrackingStatus::kResolved;
      unwinding_worker_->PostAdoptProcDescriptors(
          it.first, pid, std::move(maps_fd), std::move(mem_fd));
      return;  // done
    }
  }
  PERFETTO_DLOG(
      "Discarding proc-fds for pid [%d] as found no outstanding requests.",
      static_cast<int>(pid));
}

void PerfProducer::InitiateDescriptorLookup(DataSourceInstanceID ds_id,
                                            pid_t pid,
                                            uint32_t timeout_ms) {
  if (!proc_fd_getter_->RequiresDelayedRequest()) {
    StartDescriptorLookup(ds_id, pid, timeout_ms);
    return;
  }

  // Delay lookups on Android. See comment on |kProcDescriptorsAndroidDelayMs|.
  auto weak_this = weak_factory_.GetWeakPtr();
  task_runner_->PostDelayedTask(
      [weak_this, ds_id, pid, timeout_ms] {
        if (weak_this)
          weak_this->StartDescriptorLookup(ds_id, pid, timeout_ms);
      },
      kProcDescriptorsAndroidDelayMs);
}

void PerfProducer::StartDescriptorLookup(DataSourceInstanceID ds_id,
                                         pid_t pid,
                                         uint32_t timeout_ms) {
  proc_fd_getter_->GetDescriptorsForPid(pid);

  auto weak_this = weak_factory_.GetWeakPtr();
  task_runner_->PostDelayedTask(
      [weak_this, ds_id, pid] {
        if (weak_this)
          weak_this->EvaluateDescriptorLookupTimeout(ds_id, pid);
      },
      timeout_ms);
}

void PerfProducer::EvaluateDescriptorLookupTimeout(DataSourceInstanceID ds_id,
                                                   pid_t pid) {
  auto ds_it = data_sources_.find(ds_id);
  if (ds_it == data_sources_.end())
    return;

  DataSourceState& ds = ds_it->second;
  auto proc_status_it = ds.process_states.find(pid);
  if (proc_status_it == ds.process_states.end())
    return;

  // If the request is still outstanding, mark the process as expired (causing
  // outstanding and future samples to be discarded).
  auto proc_status = proc_status_it->second;
  if (proc_status == ProcessTrackingStatus::kResolving) {
    PERFETTO_DLOG("Descriptor lookup timeout of pid [%d] for DS [%zu]",
                  static_cast<int>(pid), static_cast<size_t>(ds_it->first));

    proc_status_it->second = ProcessTrackingStatus::kExpired;
    // Also inform the unwinder of the state change (so that it can discard any
    // of the already-enqueued samples).
    unwinding_worker_->PostRecordTimedOutProcDescriptors(ds_id, pid);
  }
}

void PerfProducer::PostEmitSample(DataSourceInstanceID ds_id,
                                  CompletedSample sample) {
  // Hack: C++11 lambdas can't capture by move, so stash the sample on the
  // heap.
  CompletedSample* raw_sample = new CompletedSample(std::move(sample));
  auto weak_this = weak_factory_.GetWeakPtr();
  task_runner_->PostTask([weak_this, ds_id, raw_sample] {
    if (weak_this)
      weak_this->EmitSample(ds_id, std::move(*raw_sample));
    delete raw_sample;
  });
}

void PerfProducer::EmitSample(DataSourceInstanceID ds_id,
                              CompletedSample sample) {
  auto ds_it = data_sources_.find(ds_id);
  if (ds_it == data_sources_.end()) {
    PERFETTO_DLOG("EmitSample(ds: %zu): source gone",
                  static_cast<size_t>(ds_id));
    return;
  }
  DataSourceState& ds = ds_it->second;

  // intern callsite
  GlobalCallstackTrie::Node* callstack_root =
      callstack_trie_.CreateCallsite(sample.frames, sample.build_ids);
  uint64_t callstack_iid = callstack_root->id();

  // start packet, timestamp domain defaults to monotonic_raw
  auto packet = StartTracePacket(ds.trace_writer.get());
  packet->set_timestamp(sample.common.timestamp);

  // write new interning data (if any)
  protos::pbzero::InternedData* interned_out = packet->set_interned_data();
  ds.interning_output.WriteCallstack(callstack_root, &callstack_trie_,
                                     interned_out);

  // write the sample itself
  auto* perf_sample = packet->set_perf_sample();
  perf_sample->set_cpu(sample.common.cpu);
  perf_sample->set_pid(static_cast<uint32_t>(sample.common.pid));
  perf_sample->set_tid(static_cast<uint32_t>(sample.common.tid));
  perf_sample->set_cpu_mode(ToCpuModeEnum(sample.common.cpu_mode));
  perf_sample->set_timebase_count(sample.common.timebase_count);
  perf_sample->set_callstack_iid(callstack_iid);
  if (sample.unwind_error != unwindstack::ERROR_NONE) {
    perf_sample->set_unwind_error(ToProtoEnum(sample.unwind_error));
  }
}

void PerfProducer::EmitRingBufferLoss(DataSourceInstanceID ds_id,
                                      size_t cpu,
                                      uint64_t records_lost) {
  auto ds_it = data_sources_.find(ds_id);
  if (ds_it == data_sources_.end())
    return;
  DataSourceState& ds = ds_it->second;
  PERFETTO_DLOG("DataSource(%zu): cpu%zu lost [%" PRIu64 "] records",
                static_cast<size_t>(ds_id), cpu, records_lost);

  // The data loss record relates to a single ring buffer, and indicates loss
  // since the last successfully-written record in that buffer. Therefore the
  // data loss record itself has no timestamp.
  // We timestamp the packet with the boot clock for packet ordering purposes,
  // but it no longer has a (precise) interpretation relative to the sample
  // stream from that per-cpu buffer. See the proto comments for more details.
  auto packet = StartTracePacket(ds.trace_writer.get());
  packet->set_timestamp(static_cast<uint64_t>(base::GetBootTimeNs().count()));
  packet->set_timestamp_clock_id(
      protos::pbzero::BuiltinClock::BUILTIN_CLOCK_BOOTTIME);

  auto* perf_sample = packet->set_perf_sample();
  perf_sample->set_cpu(static_cast<uint32_t>(cpu));
  perf_sample->set_kernel_records_lost(records_lost);
}

void PerfProducer::PostEmitUnwinderSkippedSample(DataSourceInstanceID ds_id,
                                                 ParsedSample sample) {
  PostEmitSkippedSample(ds_id, std::move(sample),
                        SampleSkipReason::kUnwindStage);
}

void PerfProducer::PostEmitSkippedSample(DataSourceInstanceID ds_id,
                                         ParsedSample sample,
                                         SampleSkipReason reason) {
  // Hack: C++11 lambdas can't capture by move, so stash the sample on the
  // heap.
  ParsedSample* raw_sample = new ParsedSample(std::move(sample));
  auto weak_this = weak_factory_.GetWeakPtr();
  task_runner_->PostTask([weak_this, ds_id, raw_sample, reason] {
    if (weak_this)
      weak_this->EmitSkippedSample(ds_id, std::move(*raw_sample), reason);
    delete raw_sample;
  });
}

void PerfProducer::EmitSkippedSample(DataSourceInstanceID ds_id,
                                     ParsedSample sample,
                                     SampleSkipReason reason) {
  auto ds_it = data_sources_.find(ds_id);
  if (ds_it == data_sources_.end())
    return;
  DataSourceState& ds = ds_it->second;

  // Note: timestamp defaults to the monotonic_raw domain.
  auto packet = StartTracePacket(ds.trace_writer.get());
  packet->set_timestamp(sample.common.timestamp);
  auto* perf_sample = packet->set_perf_sample();
  perf_sample->set_cpu(sample.common.cpu);
  perf_sample->set_pid(static_cast<uint32_t>(sample.common.pid));
  perf_sample->set_tid(static_cast<uint32_t>(sample.common.tid));
  perf_sample->set_cpu_mode(ToCpuModeEnum(sample.common.cpu_mode));
  perf_sample->set_timebase_count(sample.common.timebase_count);

  using PerfSample = protos::pbzero::PerfSample;
  switch (reason) {
    case SampleSkipReason::kReadStage:
      perf_sample->set_sample_skipped_reason(
          PerfSample::PROFILER_SKIP_READ_STAGE);
      break;
    case SampleSkipReason::kUnwindEnqueue:
      perf_sample->set_sample_skipped_reason(
          PerfSample::PROFILER_SKIP_UNWIND_ENQUEUE);
      break;
    case SampleSkipReason::kUnwindStage:
      perf_sample->set_sample_skipped_reason(
          PerfSample::PROFILER_SKIP_UNWIND_STAGE);
      break;
  }
}

void PerfProducer::InitiateReaderStop(DataSourceState* ds) {
  PERFETTO_DLOG("InitiateReaderStop");
  PERFETTO_CHECK(ds->status != DataSourceState::Status::kShuttingDown);

  ds->status = DataSourceState::Status::kShuttingDown;
  for (auto& event_reader : ds->per_cpu_readers) {
    event_reader.DisableEvents();
  }
}

void PerfProducer::PostFinishDataSourceStop(DataSourceInstanceID ds_id) {
  auto weak_producer = weak_factory_.GetWeakPtr();
  task_runner_->PostTask([weak_producer, ds_id] {
    if (weak_producer)
      weak_producer->FinishDataSourceStop(ds_id);
  });
}

void PerfProducer::FinishDataSourceStop(DataSourceInstanceID ds_id) {
  PERFETTO_LOG("FinishDataSourceStop(%zu)", static_cast<size_t>(ds_id));
  auto ds_it = data_sources_.find(ds_id);
  if (ds_it == data_sources_.end()) {
    PERFETTO_DLOG("FinishDataSourceStop(%zu): source gone",
                  static_cast<size_t>(ds_id));
    return;
  }
  DataSourceState& ds = ds_it->second;
  PERFETTO_CHECK(ds.status == DataSourceState::Status::kShuttingDown);

  ds.trace_writer->Flush();
  data_sources_.erase(ds_it);

  endpoint_->NotifyDataSourceStopped(ds_id);

  // Clean up resources if there are no more active sources.
  if (data_sources_.empty()) {
    callstack_trie_.ClearTrie();  // purge internings
    base::MaybeReleaseAllocatorMemToOS();
  }
}

// TODO(rsavitski): maybe make the tracing service respect premature
// producer-driven stops, and then issue a NotifyDataSourceStopped here.
// Alternatively (and at the expense of higher complexity) introduce a new data
// source status of "tombstoned", and propagate it until the source is stopped
// by the service (this would technically allow for stricter lifetime checking
// of data sources, and help with discarding periodic flushes).
// TODO(rsavitski): Purging while stopping will currently leave the stop
// unacknowledged. Consider checking whether the DS is stopping here, and if so,
// notifying immediately after erasing.
void PerfProducer::PurgeDataSource(DataSourceInstanceID ds_id) {
  auto ds_it = data_sources_.find(ds_id);
  if (ds_it == data_sources_.end())
    return;
  DataSourceState& ds = ds_it->second;

  PERFETTO_LOG("Stopping DataSource(%zu) prematurely",
               static_cast<size_t>(ds_id));

  unwinding_worker_->PostPurgeDataSource(ds_id);

  // Write a packet indicating the abrupt stop.
  {
    auto packet = StartTracePacket(ds.trace_writer.get());
    packet->set_timestamp(static_cast<uint64_t>(base::GetBootTimeNs().count()));
    packet->set_timestamp_clock_id(
        protos::pbzero::BuiltinClock::BUILTIN_CLOCK_BOOTTIME);
    auto* perf_sample = packet->set_perf_sample();
    auto* producer_event = perf_sample->set_producer_event();
    producer_event->set_source_stop_reason(
        protos::pbzero::PerfSample::ProducerEvent::PROFILER_STOP_GUARDRAIL);
  }

  ds.trace_writer->Flush();
  data_sources_.erase(ds_it);

  // Clean up resources if there are no more active sources.
  if (data_sources_.empty()) {
    callstack_trie_.ClearTrie();  // purge internings
    base::MaybeReleaseAllocatorMemToOS();
  }
}

void PerfProducer::StartMetatraceSource(DataSourceInstanceID ds_id,
                                        BufferID target_buffer) {
  auto writer = endpoint_->CreateTraceWriter(target_buffer);

  auto it_and_inserted = metatrace_writers_.emplace(
      std::piecewise_construct, std::make_tuple(ds_id), std::make_tuple());
  PERFETTO_DCHECK(it_and_inserted.second);
  // Note: only the first concurrent writer will actually be active.
  metatrace_writers_[ds_id].Enable(task_runner_, std::move(writer),
                                   metatrace::TAG_ANY);
}

void PerfProducer::ConnectWithRetries(const char* socket_name) {
  PERFETTO_DCHECK(state_ == kNotStarted);
  state_ = kNotConnected;

  ResetConnectionBackoff();
  producer_socket_name_ = socket_name;
  ConnectService();
}

void PerfProducer::ConnectService() {
  PERFETTO_DCHECK(state_ == kNotConnected);
  state_ = kConnecting;
  endpoint_ = ProducerIPCClient::Connect(
      producer_socket_name_, this, kProducerName, task_runner_,
      TracingService::ProducerSMBScrapingMode::kEnabled);
}

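// Exponential backoff for reconnection attempts: the delay doubles on each
// failed attempt (100 ms, 200 ms, 400 ms, ...), capped at 30 seconds.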
void PerfProducer::IncreaseConnectionBackoff() {
  connection_backoff_ms_ *= 2;
  if (connection_backoff_ms_ > kMaxConnectionBackoffMs)
    connection_backoff_ms_ = kMaxConnectionBackoffMs;
}

void PerfProducer::ResetConnectionBackoff() {
  connection_backoff_ms_ = kInitialConnectionBackoffMs;
}

void PerfProducer::OnConnect() {
  PERFETTO_DCHECK(state_ == kConnecting);
  state_ = kConnected;
  ResetConnectionBackoff();
  PERFETTO_LOG("Connected to the service");

  {
    // linux.perf
    DataSourceDescriptor desc;
    desc.set_name(kDataSourceName);
    desc.set_handles_incremental_state_clear(true);
    desc.set_will_notify_on_stop(true);
    endpoint_->RegisterDataSource(desc);
  }
  {
    // metatrace
    DataSourceDescriptor desc;
    desc.set_name(MetatraceWriter::kDataSourceName);
    endpoint_->RegisterDataSource(desc);
  }
}

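// A disconnect after a fully established connection tears the producer down
// and reconnects from scratch (see |Restart|), while a failed connection
// attempt is retried with exponential backoff.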
void PerfProducer::OnDisconnect() {
  PERFETTO_DCHECK(state_ == kConnected || state_ == kConnecting);
  PERFETTO_LOG("Disconnected from tracing service");

  auto weak_producer = weak_factory_.GetWeakPtr();
  if (state_ == kConnected)
    return task_runner_->PostTask([weak_producer] {
      if (weak_producer)
        weak_producer->Restart();
    });

  state_ = kNotConnected;
  IncreaseConnectionBackoff();
  task_runner_->PostDelayedTask(
      [weak_producer] {
        if (weak_producer)
          weak_producer->ConnectService();
      },
      connection_backoff_ms_);
}

void PerfProducer::Restart() {
  // We lost the connection with the tracing service. At this point we need
  // to reset all the data sources. Trying to handle that manually is going to
  // be error prone. What we do here is simply destroy the instance and
  // recreate it.
  base::TaskRunner* task_runner = task_runner_;
  const char* socket_name = producer_socket_name_;
  ProcDescriptorGetter* proc_fd_getter = proc_fd_getter_;

  // Invoke destructor and then the constructor again.
  this->~PerfProducer();
  new (this) PerfProducer(proc_fd_getter, task_runner);

  ConnectWithRetries(socket_name);
}

}  // namespace profiling
}  // namespace perfetto