automemcpy/lib/ResultAnalyzer.cpp

//===-- Analyze benchmark JSON files --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This code analyzes the json file produced by the `automemcpy` binary.
//
// As a reminder, `automemcpy` will benchmark each autogenerated memory
// function against one of the predefined distributions available in the
// `libc/benchmarks/distributions` folder.
//
// It works as follows:
// - Reads one or more json files.
// - If there are several runs for the same function and distribution, picks the
//   median throughput (aka `BytesPerSecond`).
// - Aggregates the throughput per distributions and scores them from worst (0)
//   to best (1).
// - Each distribution categorizes each function into one of the following
//   categories: EXCELLENT, VERY_GOOD, GOOD, PASSABLE, INADEQUATE, MEDIOCRE,
//   BAD.
// - A process similar to the Majority Judgment voting system is used to `elect`
//   the best function. The histogram of grades is returned so we can
//   distinguish between functions with the same final grade. In the following
//   example both functions grade EXCELLENT but we may prefer the second one.
//
//   |            | EXCELLENT | VERY_GOOD | GOOD | PASSABLE | ...
//   |------------|-----------|-----------|------|----------| ...
//   | Function_1 |     7     |     1     |   2  |          | ...
//   | Function_2 |     6     |     4     |      |          | ...

#include "automemcpy/ResultAnalyzer.h"
#include "llvm/ADT/StringRef.h"
#include <numeric>
#include <unordered_map>

namespace llvm {

namespace automemcpy {

// Returns the upper-case name of the grade `GE`.
//
// `ARRAY_SIZE` is not a grade: passing it, or any value that matches none of
// the cases below, terminates the program through `report_fatal_error`.
StringRef Grade::getString(const GradeEnum &GE) {
  // No `default` label on purpose so the compiler can still warn when a new
  // enumerator is added without a matching case.
  switch (GE) {
  case EXCELLENT:
    return "EXCELLENT";
  case VERY_GOOD:
    return "VERY_GOOD";
  case GOOD:
    return "GOOD";
  case PASSABLE:
    return "PASSABLE";
  case INADEQUATE:
    return "INADEQUATE";
  case MEDIOCRE:
    return "MEDIOCRE";
  case BAD:
    return "BAD";
  case ARRAY_SIZE:
    break;
  }
  // Reached for `ARRAY_SIZE` and for values outside the enumerators. Keeping
  // the fatal error after the switch guarantees that control never flows off
  // the end of this non-void function.
  report_fatal_error("logic error");
}

// Converts a score into a grade using seven bands whose lower bounds are the
// multiples of 1/7.
//
// The bands are probed from the highest lower bound down and the first one
// whose bound is not greater than `Score` wins. Anything below 1/7 grades
// `BAD`; so does NaN, since every comparison against it is false.
Grade::GradeEnum Grade::judge(double Score) {
  struct Band {
    double LowerBound;
    GradeEnum Result;
  };
  // Ordered from the most demanding band to the least demanding one.
  const Band Bands[] = {
      {6. / 7, EXCELLENT}, {5. / 7, VERY_GOOD},  {4. / 7, GOOD},
      {3. / 7, PASSABLE},  {2. / 7, INADEQUATE}, {1. / 7, MEDIOCRE},
  };
  for (const Band &Candidate : Bands) {
    if (Score >= Candidate.LowerBound)
      return Candidate.Result;
  }
  return BAD;
}

// Returns the unbiased sample variance of `Samples`, i.e. the sum of squared
// deviations from `SampleMean` divided by `N - 1`.
//
// `Samples` must not be empty (asserted). A single sample has no spread and
// yields 0. `SampleMean` is taken as given, it is not recomputed here.
static double computeUnbiasedSampleVariance(const std::vector<double> &Samples,
                                            const double SampleMean) {
  assert(!Samples.empty());
  const size_t Count = Samples.size();
  if (Count == 1)
    return 0;
  // Left-to-right fold of the squared deviations, starting from zero.
  const double SquaredDeviations =
      std::accumulate(Samples.begin(), Samples.end(), 0.0,
                      [SampleMean](double Acc, double Value) {
                        const double Deviation = Value - SampleMean;
                        return Acc + Deviation * Deviation;
                      });
  return SquaredDeviations / (Count - 1);
}

// Fills the mean, unbiased variance and median of `Data` from its
// `BytesPerSecondSamples`, which must not be empty (asserted).
//
// The samples are partially reordered in place by the median selection.
static void processPerDistributionData(PerDistributionData &Data) {
  auto &Throughputs = Data.BytesPerSecondSamples;
  assert(!Throughputs.empty());
  const size_t Count = Throughputs.size();

  // Arithmetic mean of all samples.
  const double Total =
      std::accumulate(Throughputs.begin(), Throughputs.end(), 0.0);
  Data.BytesPerSecondMean = Total / Count;

  // The variance is computed before the selection below shuffles the samples.
  Data.BytesPerSecondVariance =
      computeUnbiasedSampleVariance(Throughputs, Data.BytesPerSecondMean);

  // Median: the element of rank `Count / 2`. For an even number of samples
  // this is the upper of the two middle values.
  const auto MedianIt = Throughputs.begin() + Count / 2;
  std::nth_element(Throughputs.begin(), MedianIt, Throughputs.end());
  Data.BytesPerSecondMedian = *MedianIt;
}

// Groups the benchmark samples per function and per distribution, then
// computes the throughput statistics (mean, variance, median) of each group.
//
// Only samples whose type is `SampleType::ITERATION` contribute; every other
// sample is ignored wherever it appears in `Samples`.
//
// Returns one `FunctionData` per distinct function found in `Samples`. The
// order of the returned elements follows the iteration order of an
// `std::unordered_map` and is therefore unspecified.
std::vector<FunctionData> getThroughputs(ArrayRef<Sample> Samples) {
  std::unordered_map<FunctionId, FunctionData, FunctionId::Hasher> Functions;
  for (const auto &S : Samples) {
    // Skip non-iteration samples instead of stopping at the first one:
    // iteration samples may follow them (e.g. when several result files are
    // concatenated) and must not be silently dropped.
    if (S.Type != SampleType::ITERATION)
      continue;
    auto &Function = Functions[S.Id.Function];
    auto &Data = Function.PerDistributionData[S.Id.Distribution.Name];
    Data.BytesPerSecondSamples.push_back(S.BytesPerSecond);
  }

  std::vector<FunctionData> Output;
  Output.reserve(Functions.size());
  for (auto &[Id, Function] : Functions) {
    // The map key is the only place where the function identity is stored so
    // far; copy it into the value before moving the value out.
    Function.Id = Id;
    for (auto &Pair : Function.PerDistributionData)
      processPerDistributionData(Pair.second);
    Output.push_back(std::move(Function));
  }
  return Output;
}

// Computes the `Score` of every function for every distribution from its
// median throughput (`BytesPerSecondMedian`).
//
// Functions are bucketed by (function type, distribution name). Within a
// bucket the lowest median throughput maps to 0, the highest maps to 1 and
// the values in between are interpolated linearly.
//
// NOTE(review): when all the throughputs of a bucket are equal (in particular
// when the bucket holds a single function) `Max == Min` and the normalization
// evaluates 0 / 0, so the stored score is NaN. Confirm whether callers expect
// a defined value in that case.
void fillScores(MutableArrayRef<FunctionData> Functions) {
  // A key to bucket throughput per function type and distribution.
  struct Key {
    FunctionType Type;
    StringRef Distribution;

    COMPARABLE_AND_HASHABLE(Key, Type, Distribution)
  };

  // Tracks minimum and maximum values.
  struct MinMax {
    double Min = std::numeric_limits<double>::max();
    // `lowest()` is the most negative finite value. `min()` would be the
    // smallest *positive* normal value and would give a wrong maximum for a
    // bucket whose values are all zero or negative.
    double Max = std::numeric_limits<double>::lowest();
    void update(double Value) {
      if (Value < Min)
        Min = Value;
      if (Value > Max)
        Max = Value;
    }
    // Linearly maps [Min, Max] onto [0, 1].
    double normalize(double Value) const { return (Value - Min) / (Max - Min); }
  };

  // First pass: gather the throughput range of every bucket.
  std::unordered_map<Key, MinMax, Key::Hasher> ThroughputMinMax;
  for (const auto &Function : Functions) {
    const FunctionType Type = Function.Id.Type;
    for (const auto &Pair : Function.PerDistributionData) {
      const auto &Distribution = Pair.getKey();
      const double Throughput = Pair.getValue().BytesPerSecondMedian;
      const Key K{Type, Distribution};
      ThroughputMinMax[K].update(Throughput);
    }
  }

  // Second pass: score each function relative to the range of its bucket.
  for (auto &Function : Functions) {
    const FunctionType Type = Function.Id.Type;
    for (const auto &Pair : Function.PerDistributionData) {
      const auto &Distribution = Pair.getKey();
      const double Throughput = Pair.getValue().BytesPerSecondMedian;
      const Key K{Type, Distribution};
      Function.PerDistributionData[Distribution].Score =
          ThroughputMinMax[K].normalize(Throughput);
    }
  }
}

// Grades every function for every distribution and elects its final grade.
//
// For each function:
// - every per-distribution `Score` is turned into a grade with
//   `Grade::judge`; the grade is stored next to the score and counted in
//   `GradeHisto`,
// - `ScoresGeoMean` is reset to 1 and multiplied by every score. Note that it
//   ends up holding the plain product of the scores, no root is taken here,
// - `FinalGrade` is the median grade of the histogram; it stays `Grade::BAD`
//   when the histogram holds no vote.
//
// `GradeHisto` is only incremented, never cleared, by this function.
void castVotes(MutableArrayRef<FunctionData> Functions) {
  for (FunctionData &Function : Functions) {
    Function.ScoresGeoMean = 1.0;
    for (const auto &Pair : Function.PerDistributionData) {
      const StringRef Distribution = Pair.getKey();
      const double Score = Pair.getValue().Score;
      Function.ScoresGeoMean *= Score;
      const auto G = Grade::judge(Score);
      ++(Function.GradeHisto[G]);
      Function.PerDistributionData[Distribution].Grade = G;
    }
  }

  for (FunctionData &Function : Functions) {
    const auto &GradeHisto = Function.GradeHisto;
    // The initial value fixes the accumulator type: use `size_t` so the sum
    // is not folded through a narrower `unsigned int`.
    const size_t Votes =
        std::accumulate(GradeHisto.begin(), GradeHisto.end(), size_t{0});
    const size_t MedianVote = Votes / 2;
    size_t CountedVotes = 0;
    Grade::GradeEnum MedianGrade = Grade::BAD;
    // Walk the histogram from index 0 (presumably the best grade — confirm
    // against the `GradeEnum` declaration) and stop at the first grade where
    // the cumulated votes exceed half of the total.
    for (size_t I = 0; I < GradeHisto.size(); ++I) {
      CountedVotes += GradeHisto[I];
      if (CountedVotes > MedianVote) {
        MedianGrade = Grade::GradeEnum(I);
        break;
      }
    }
    Function.FinalGrade = MedianGrade;
  }
}

} // namespace automemcpy
} // namespace llvm