use scopedperf

0e6651e8 · Nickolai Zeldovich · 343ef749 · 0e6651e8 · 0e6651e8 · 0e6651e8
--- a/user/intelctr.hh
+++ b/user/intelctr.hh
+namespace intelctr {
+
+using scopedperf::tsc_ctr;
+using scopedperf::pmc_setup;
+
+static tsc_ctr tsc;
+
+static pmc_setup<48> l2_ld_hit(0x00410124, "l2 ld hit");
+static pmc_setup<48> l2_ld_miss(0x00410224, "l2 ld miss");
+
+// rfo: request for ownership (~write)
+static pmc_setup<48> l2_rfo_hit(0x00410424, "l2 rfo hit");
+static pmc_setup<48> l2_rfo_miss(0x00410824, "l2 rfo miss");
+
+static pmc_setup<48> l2_i_hit(0x00411024, "l2 i hit");
+static pmc_setup<48> l2_i_miss(0x00412024, "l2 i miss");
+
+static pmc_setup<48> l2_prefetch_hit(0x00414024, "l2 pref hit");
+static pmc_setup<48> l2_prefetch_miss(0x00418024, "l2 pref miss");
+static pmc_setup<48> l2_prefetch(0x0041c024, "l2 prefetch");  // ~zero
+
+static pmc_setup<48> l2_miss(0x0041aa24, "l2 all miss");
+static pmc_setup<48> l2_refs(0x0041ff24, "l2 all refs");
+
+// ---
+static pmc_setup<48> l2_ld_demand(0x00410f26, "l2 demand ld");
+static pmc_setup<48> l2_ld_demand_i(0x00410126, "l2 dem ld i");  // ~l2_ld_miss
+static pmc_setup<48> l2_ld_demand_s(0x00410226, "l2 dem ld s");  // ~l2_rfo_miss
+static pmc_setup<48> l2_ld_demand_e(0x00410426, "l2 dem ld e");
+static pmc_setup<48> l2_ld_demand_m(0x00410826, "l2 dem ld m");
+static pmc_setup<48> l2_ld_prefetch(0x0041f026, "l2 prefetch ld");  // ~zero
+
+// ---
+static pmc_setup<48> l2_wr_i(0x00410127, "l2 write i");
+static pmc_setup<48> l2_wr_s(0x00410227, "l2 write s");
+static pmc_setup<48> l2_wr_m(0x00410827, "l2 write m");
+static pmc_setup<48> l2_wr_sem(0x00410e27, "l2 write sem");
+static pmc_setup<48> l2_wr(0x00410f27, "l2 write");  // l2_wr_i + l2_wr_sem
+static pmc_setup<48> l2_wrlk(0x0041f027, "l2 wrlk");  // ??
+
+// ---
+// where do loads come from?  interesting, but maybe inaccurate?
+//   doesn't add up to other l2 counters..
+static pmc_setup<48> ld_l1hit(0x004101cb, "ld l1 hit");
+static pmc_setup<48> ld_l2hit(0x004102cb, "ld l2 hit");
+static pmc_setup<48> ld_l3hit_unsh(0x004104cb, "ld l3 unsh");
+static pmc_setup<48> ld_l2other(0x004108cb, "ld l2 other");
+static pmc_setup<48> ld_offdie(0x004110cb, "ld offdie");
+static pmc_setup<48> ld_lfb(0x004140cb, "ld lfb");
+static pmc_setup<48> ld_dtlbmiss(0x004180cb, "ld dtlb-miss");
+
+// ---
+static pmc_setup<48> uops(0x0041010e, "uops_issued");
+static pmc_setup<48> mem_loads(0x0041010b, "mem load ins");
+static pmc_setup<48> mem_stores(0x0041020b, "mem store ins");
+static pmc_setup<48> dtlb_miss(0x00410149, "dtlb miss");
+static pmc_setup<48> itlb_miss(0x00410185, "itlb miss");
+
+}
--- a/user/scopedperf.hh
+++ b/user/scopedperf.hh
+#pragma once
+
+/*
+ * Canonical location:
+ *   git+ssh://amsterdam.csail.mit.edu/home/am1/prof/proftools.git
+ *   under spmc/lib/scopedperf.hh
+ */
+
+#include <string>
+#include <iostream>
+#include <iomanip>
+#include <sstream>
+#include <vector>
+#include <algorithm>
+#include <assert.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/time.h>
+
+namespace scopedperf {
+
+/*
+ * statically enable/disable most of the generated code for profiling.
+ */
+class default_enabler {
+ public:
+  bool enabled() const { return true; }
+};
+
+class always_enabled {
+ public:
+  bool enabled() const { return true; }
+};
+
+class always_disabled {
+ public:
+  bool enabled() const { return false; }
+};
+
+
+/*
+ * spinlock: mostly to avoid pthread mutex sleeping.
+ */
+class spinlock {
+ public:
+  spinlock() : x(0) {}
+
+  void acquire() {
+    while (!__sync_bool_compare_and_swap(&x, 0, 1))
+      ;
+  }
+
+  void release() {
+    x = 0;
+  }
+
+ private:
+  volatile uint x;
+};
+
+class scoped_spinlock {
+ public:
+  scoped_spinlock(spinlock *larg) : l(larg) {
+    l->acquire();
+    held = true;
+  }
+
+  void release() {
+    if (held)
+      l->release();
+    held = false;
+  }
+
+  ~scoped_spinlock() { release(); }
+
+ private:
+  spinlock *const l;
+  bool held;
+};
+
+
+/*
+ * ctrgroup: a group of performance counters.
+ */
+
+template<typename... Counters>
+class ctrgroup_chain;
+
+template<>
+class ctrgroup_chain<> {
+ public:
+  ctrgroup_chain() {}
+  static const uint nctr = 0;
+  void get_samples(uint64_t *v) const {}
+  void get_delta(uint64_t *delta, uint64_t *prev) const {}
+  std::vector<std::string> get_names() const { return {}; }
+};
+
+template<typename One, typename... Others>
+class ctrgroup_chain<One, Others...> : ctrgroup_chain<Others...> {
+ public:
+  ctrgroup_chain(One *x, Others*... y)
+    : ctrgroup_chain<Others...>(y...), ctr(x)
+  {
+    x->setup();
+  }
+
+  static const uint nctr = 1 + ctrgroup_chain<Others...>::nctr;
+
+  void get_samples(uint64_t *v) const {
+    v[0] = ctr->sample();
+    ctrgroup_chain<Others...>::get_samples(v+1);
+  }
+
+  void get_delta(uint64_t *delta, uint64_t *prev) const {
+    uint64_t x = ctr->sample();
+    *delta = (x - *prev) & ctr->mask;
+    *prev = x;
+    ctrgroup_chain<Others...>::get_delta(delta+1, prev+1);
+  }
+
+  std::vector<std::string> get_names() const {
+    std::vector<std::string> v = ctrgroup_chain<Others...>::get_names();
+    v.insert(v.begin(), ctr->name);
+    return v;
+  }
+
+ private:
+  const One *const ctr;
+};
+
+template<typename... Counters>
+ctrgroup_chain<Counters...>
+ctrgroup(Counters*... args)
+{
+  return ctrgroup_chain<Counters...>(args...);
+}
+
+
+/*
+ * perfsum: aggregating counter deltas across multiple CPUs.
+ */
+class perfsum_base {
+ public:
+  enum display_opt { show, hide };
+
+  perfsum_base(const std::string &n, display_opt d) : name(n), disp(d) {
+    scoped_spinlock x(get_sums_lock());
+    get_sums()->push_back(this);
+  }
+
+  static void printall(int w0 = 17, int w = 13) {
+    scoped_spinlock x(get_sums_lock());
+    auto sums = get_sums();
+    std::sort(sums->begin(), sums->end(),
+	      [](perfsum_base *a, perfsum_base *b) { return a->name < b->name; });
+    for (perfsum_base *ps: *sums) {
+      if (ps->disp == hide || !ps->get_enabled())
+	continue;
+      auto p = ps->get_stats();
+      print_row(ps->name, ps->get_names(), w0, w, [](const std::string &name)
+		{ return name; });
+      print_row("  avg",   p, w0, w, [](const std::pair<uint64_t, uint64_t> &e)
+	        { return ((double) e.second) / (double) e.first; });
+      print_row("  total", p, w0, w, [](const std::pair<uint64_t, uint64_t> &e)
+		{ return e.second; });
+      print_row("  count", p, w0, w, [](const std::pair<uint64_t, uint64_t> &e)
+		{ return e.first; });
+    }
+  }
+
+  static void resetall() {
+    scoped_spinlock x(get_sums_lock());
+    for (perfsum_base *ps: *get_sums())
+      ps->reset();
+  }
+
+  virtual std::vector<std::pair<uint64_t, uint64_t> > get_stats() const = 0;
+  virtual std::vector<std::string> get_names() const = 0;
+  virtual bool get_enabled() const = 0;
+  virtual void reset() = 0;
+
+ private:
+  template<class Row, class Callback>
+  static void print_row(const std::string &rowname, const Row &r,
+			int w0, int w, Callback f)
+  {
+    std::cout << std::left << std::setw(w0) << rowname;
+    for (const auto &elem: r)
+      std::cout << std::left << std::setw(w) << f(elem) << " ";
+    std::cout << std::endl;
+  }
+
+  static std::vector<perfsum_base*> *get_sums() {
+    static std::vector<perfsum_base*> v;
+    return &v;
+  }
+
+  static spinlock *get_sums_lock() {
+    static spinlock l;
+    return &l;
+  }
+
+  const std::string name;
+  const display_opt disp;
+};
+
+static inline void
+compiler_barrier()
+{
+  /* Avoid compile-time reordering across performance counter reads */
+  __asm __volatile("" ::: "memory");
+}
+
+template<typename Enabler, typename... Counters>
+class perfsum_ctr : public perfsum_base, public Enabler {
+ public:
+  perfsum_ctr(const ctrgroup_chain<Counters...> *c,
+	      const std::string &n, display_opt d)
+    : perfsum_base(n, d), cg(c), base(0)
+  {
+    reset();
+  }
+
+  perfsum_ctr(const std::string &n,
+	      const perfsum_ctr<Enabler, Counters...> *basesum, display_opt d)
+    : perfsum_base(n, d), cg(basesum->cg), base(basesum)
+  {
+    reset();
+  }
+
+  void get_samples(uint64_t *s) const {
+    compiler_barrier();
+    cg->get_samples(s);
+    compiler_barrier();
+  }
+
+  void record(uint cpuid, uint64_t *s) {
+    uint64_t delta[cg->nctr];
+
+    compiler_barrier();
+    cg->get_delta(delta, s);
+    compiler_barrier();
+
+    for (uint i = 0; i < cg->nctr; i++)
+      stat[cpuid].sum[i] += delta[i];
+    stat[cpuid].count++;
+  }
+
+  std::vector<std::pair<uint64_t, uint64_t> > get_stats() const /* override */ {
+    std::vector<std::pair<uint64_t, uint64_t> > v;
+    for (uint i = 0; i < cg->nctr; i++) {
+      uint64_t b =
+	base ? base->addcpus([i](const stats *s) { return s->sum[i]; })
+	     : addcpus([](const stats *s) { return s->count; });
+      v.push_back(std::make_pair(b,
+	addcpus([i](const stats *s) { return s->sum[i]; })));
+    }
+    return v;
+  }
+
+  std::vector<std::string> get_names() const /* override */ {
+    return cg->get_names();
+  }
+
+  bool get_enabled() const /* override */ {
+    return Enabler::enabled();
+  }
+
+  void reset() /* override */ {
+    memset(stat, 0, sizeof(stat));
+  }
+
+ private:
+  enum { maxcpu = 256 };
+
+  struct stats {
+    uint64_t count;
+    uint64_t sum[ctrgroup_chain<Counters...>::nctr];
+  } __attribute__((aligned (128)));
+
+  struct stats stat[maxcpu];
+  const struct ctrgroup_chain<Counters...> *const cg;
+  const struct perfsum_ctr<Enabler, Counters...> *const base;
+
+  template<class T>
+  uint64_t addcpus(T f) const {
+    uint64_t tot = 0;
+    for (uint i = 0; i < maxcpu; i++)
+      tot += f(&stat[i]);
+    return tot;
+  }
+};
+
+template<typename Enabler, typename... Counters>
+class perfsum_ctr_inlinegroup :
+  public ctrgroup_chain<Counters...>,
+  public perfsum_ctr<Enabler, Counters...>
+{
+ public:
+  perfsum_ctr_inlinegroup(const std::string &n, perfsum_base::display_opt d,
+                          Counters*... ctrs)
+    : ctrgroup_chain<Counters...>(ctrs...),
+      perfsum_ctr<Enabler, Counters...>(this, n, d) {}
+};
+
+template<typename Enabler = default_enabler, typename... Counters>
+perfsum_ctr<Enabler, Counters...>
+perfsum(const std::string &name, const ctrgroup_chain<Counters...> *c,
+	const perfsum_base::display_opt d = perfsum_base::show)
+{
+  return perfsum_ctr<Enabler, Counters...>(c, name, d);
+}
+
+template<typename Enabler = default_enabler, typename... Counters>
+perfsum_ctr_inlinegroup<Enabler, Counters...>
+perfsum_group(const std::string &name, Counters*... c)
+{
+  return perfsum_ctr_inlinegroup<Enabler, Counters...>(name, perfsum_base::show, c...);
+}
+
+template<typename Enabler, typename... Counters>
+perfsum_ctr<Enabler, Counters...>
+perfsum_frac(const std::string &name,
+	     const perfsum_ctr<Enabler, Counters...> *base)
+{
+  return perfsum_ctr<Enabler, Counters...>(name, base, perfsum_base::show);
+}
+
+
+/*
+ * namedctr &c: actual counter implementations.
+ */
+template<uint64_t CounterWidth>
+class namedctr {
+ public:
+  namedctr(const std::string &n) : name(n) {}
+  void setup() {}
+  const std::string name;
+  static const uint64_t mask =
+    ((1ULL << (CounterWidth - 1)) - 1) << 1 | 1;
+};
+
+class tsc_ctr : public namedctr<64> {
+ public:
+  tsc_ctr() : namedctr("tsc") {}
+  static uint64_t sample() {
+    uint64_t a, d;
+    __asm __volatile("rdtsc" : "=a" (a), "=d" (d));
+    return a | (d << 32);
+  }
+};
+
+class tscp_ctr : public namedctr<64> {
+ public:
+  tscp_ctr() : namedctr("tscp") {}
+  static uint64_t sample() {
+    uint64_t a, d, c;
+    __asm __volatile("rdtscp" : "=a" (a), "=d" (d), "=c" (c));
+    return a | (d << 32);
+  }
+};
+
+template<uint64_t CounterWidth>
+class pmc_ctr : public namedctr<CounterWidth> {
+ public:
+  pmc_ctr(int n) : namedctr<CounterWidth>(mkname(n)), cn(n) {}
+  pmc_ctr(const std::string &nm) : namedctr<CounterWidth>(nm), cn(-1) {}
+
+  uint64_t sample() const {
+    uint64_t a, d;
+    __asm __volatile("rdpmc" : "=a" (a), "=d" (d) : "c" (cn));
+    return a | (d << 32);
+  }
+
+  int cn;
+
+ private:
+  static std::string mkname(int n) {
+    std::stringstream ss;
+    ss << "pmc" << n;
+    return ss.str();
+  }
+};
+
+template<uint64_t CounterWidth = 64>
+class pmc_setup : public pmc_ctr<CounterWidth> {
+ public:
+  pmc_setup(uint64_t v, const std::string &nm)
+    : pmc_ctr<CounterWidth>(nm), pmc_v(v) {}
+
+  void setup() {
+    if (pmc_ctr<CounterWidth>::cn >= 0)
+      return;
+
+    /*
+     * XXX detect how many counters the hardware has
+     */
+    static bool pmcuse[4];
+    static spinlock pmcuselock;
+
+    int n = 0;
+    scoped_spinlock x(&pmcuselock);
+    while (n < 4 && pmcuse[n])
+      n++;
+    assert(n < 4);
+    pmcuse[n] = true;
+    x.release();
+
+    // ugly but effective
+    std::stringstream ss;
+    ss << "for f in /sys/kernel/spmc/cpu*/" << n << "; do "
+       << "echo " << std::hex << pmc_v << " > $f; done";
+    assert(0 == system(ss.str().c_str()));
+
+    pmc_ctr<CounterWidth>::cn = n;
+  }
+
+ private:
+  uint64_t pmc_v;
+};
+
+class tod_ctr : public namedctr<64> {
+ public:
+  tod_ctr() : namedctr("tod-usec") {}
+  uint64_t sample() const {
+    struct timeval tv;
+    gettimeofday(&tv, 0);
+    return ((uint64_t) tv.tv_usec) + ((uint64_t) tv.tv_sec) * 1000000;
+  }
+};
+
+class zero_ctr : public namedctr<64> {
+ public:
+  zero_ctr() : namedctr("zero") {}
+  uint64_t sample() const { return 0; }
+};
+
+
+/*
+ * scoped performance-counting regions, which record samples into a perfsum.
+ */
+template<typename Enabler, typename... Counters>
+class base_perf_region {
+ public:
+  base_perf_region(perfsum_ctr<Enabler, Counters...> *psarg)
+    : ps(psarg), enabled(ps->enabled()), cpuid(enabled ? sched_getcpu() : 0)
+  {
+    if (enabled)
+      ps->get_samples(s);
+  }
+
+  // invoke lap multiple times to precisely measure iterations
+  // (use same measurement for end of one & start of next round)
+  void lap() {
+    if (enabled)
+      ps->record(cpuid, s);
+  }
+
+ private:
+  perfsum_ctr<Enabler, Counters...> *const ps;
+  const bool enabled;
+  const uint cpuid;
+  uint64_t s[ctrgroup_chain<Counters...>::nctr];
+};
+
+template<typename Enabler, typename... Counters>
+class scoped_perf_region : public base_perf_region<Enabler, Counters...> {
+ public:
+  scoped_perf_region(perfsum_ctr<Enabler, Counters...> *psarg)
+    : base_perf_region<Enabler, Counters...>(psarg) {}
+  ~scoped_perf_region() { base_perf_region<Enabler, Counters...>::lap(); }
+};
+
+template<typename Enabler, typename... Counters>
+class killable_perf_region : public base_perf_region<Enabler, Counters...> {
+ public:
+  killable_perf_region(perfsum_ctr<Enabler, Counters...> *psarg)
+    : base_perf_region<Enabler, Counters...>(psarg), active(true) {}
+  ~killable_perf_region() { stop(); }
+
+  // perform a final measurement, if needed before destructor
+  void stop() {
+    if (active)
+      base_perf_region<Enabler, Counters...>::lap();
+    active = false;
+  }
+
+  // prevent destructor from performing a measurement
+  void kill() { active = false; }
+
+ private:
+  bool active;
+};
+
+template<typename Enabler, typename... Counters>
+scoped_perf_region<Enabler, Counters...>
+perf_region(perfsum_ctr<Enabler, Counters...> *ps)
+{
+  return scoped_perf_region<Enabler, Counters...>(ps);
+}
+
+template<typename Enabler, typename... Counters>
+killable_perf_region<Enabler, Counters...>
+killable_region(perfsum_ctr<Enabler, Counters...> *ps)
+{
+  return killable_perf_region<Enabler, Counters...>(ps);
+}
+
+
+/*
+ * macros for the common case of putting in a scoped perf-counting region.
+ */
+#define __PERF_CONCAT2(a, b)  a ## b
+#define __PERF_CONCAT(a, b)   __PERF_CONCAT2(a, b)
+#define __PERF_ANON	      __PERF_CONCAT(__anon_id_, __COUNTER__)
+
+#define __PERF_REGION(region_var, sum_var, region_type, text, group)	       \
+  static auto __PERF_CONCAT(sum_var, _sum) = scopedperf::perfsum(text, group); \
+  auto region_var = region_type(&__PERF_CONCAT(sum_var, _sum));
+
+#define ANON_REGION(text, group) \
+  __PERF_REGION(__PERF_ANON, __PERF_ANON, scopedperf::perf_region, text, group)
+#define PERF_REGION(var, text, group) \
+  __PERF_REGION(var, __PERF_ANON, scopedperf::perf_region, text, group)
+#define KILLABLE_REGION(var, text, group) \
+  __PERF_REGION(var, __PERF_ANON, scopedperf::killable_region, text, group)
+
+} /* namespace scopedperf */
--- a/user/umain.cc
+++ b/user/umain.cc
@@ -8,6 +8,8 @@
 #include "atomic_util.hh"
 #include "ns.hh"
 #include "rnd.hh"
+#include "scopedperf.hh"
+#include "intelctr.hh"

 u64
 proc_hash(const u32 &pid)
@@ -57,21 +59,30 @@ threadpin(void (*fn)(void*), void *arg, const char *name, int cpu)

 static pthread_barrier_t worker_b, populate_b;

-enum { iter_total = 10000000 };
+enum { iter_total = 1000000 };
 enum { crange_items = 1024 };

+static auto perfgroup = ctrgroup(&intelctr::tsc
+                                // ,&intelctr::l2_refs
+                                // ,&intelctr::l2_miss
+                                );
+
 static void
 worker(void *arg)
 {
  crange *cr = (crange*) arg;

  for (u32 i = 0; i < iter_total / ncpu; i++) {
+    ANON_REGION("worker op", &perfgroup);
    u64 k = 1 + rnd() % (crange_items * 2);
    auto span = cr->search_lock(k, 1);
-    if (rnd() & 1)
+    if (rnd() & 1) {
+      ANON_REGION("worker del", &perfgroup);
      span.replace(0);
-    else
+    } else {
+      ANON_REGION("worker add", &perfgroup);
      span.replace(new range(cr, k, 1));
+    }
  }

  pthread_barrier_wait(&worker_b);
@@ -153,5 +164,5 @@ main(int ac, char **av)
  }
  pthread_barrier_wait(&worker_b);

-  printf("exiting\n");
+  scopedperf::perfsum_base::printall();
 }