[PATCH v6 06/10] libperf: Add support for user space counter access

Tue May 4 19:12:19 PDT 2021

On Tue, May 4, 2021 at 4:41 PM Ian Rogers <irogers at google.com> wrote:
>
> On Wed, Mar 10, 2021 at 4:08 PM Rob Herring <robh at kernel.org> wrote:
> >
> > x86 and arm64 can both support direct access of event counters in
> > userspace. The access sequence is less than trivial and currently exists
> > in perf test code (tools/perf/arch/x86/tests/rdpmc.c) with copies in
> > projects such as PAPI and libpfm4.
> >
> > > In order to support userspace access, an event must be mmapped first
> > with perf_evsel__mmap(). Then subsequent calls to perf_evsel__read()
> > will use the fast path (assuming the arch supports it).
> >
> > Signed-off-by: Rob Herring <robh at kernel.org>
> > ---
> > v6:
> >  - Adapt to mmap changes adding MMAP NULL check
> > v5:
> >  - Make raw count s64 instead of u64 so that counter width shifting
> >    works
> >  - Adapt to mmap changes
> > v4:
> >  - Update perf_evsel__mmap size to pages
> > v3:
> >  - Split out perf_evsel__mmap() to separate patch
> > ---
> >  tools/lib/perf/evsel.c                 |  4 ++
> >  tools/lib/perf/include/internal/mmap.h |  3 +
> >  tools/lib/perf/mmap.c                  | 88 ++++++++++++++++++++++++++
> >  tools/lib/perf/tests/test-evsel.c      | 65 +++++++++++++++++++
> >  4 files changed, 160 insertions(+)
> >
> > diff --git a/tools/lib/perf/evsel.c b/tools/lib/perf/evsel.c
> > index 1057e9b15528..4d67343d36c9 100644
> > --- a/tools/lib/perf/evsel.c
> > +++ b/tools/lib/perf/evsel.c
> > @@ -242,6 +242,10 @@ int perf_evsel__read(struct perf_evsel *evsel, int cpu, int thread,
> >         if (FD(evsel, cpu, thread) < 0)
> >                 return -EINVAL;
> >
> > +       if (MMAP(evsel, cpu, thread) &&
> > +           !perf_mmap__read_self(MMAP(evsel, cpu, thread), count))
> > +               return 0;
> > +
> >         if (readn(FD(evsel, cpu, thread), count->values, size) <= 0)
> >                 return -errno;
> >
> > diff --git a/tools/lib/perf/include/internal/mmap.h b/tools/lib/perf/include/internal/mmap.h
> > index be7556e0a2b2..5e3422f40ed5 100644
> > --- a/tools/lib/perf/include/internal/mmap.h
> > +++ b/tools/lib/perf/include/internal/mmap.h
> > @@ -11,6 +11,7 @@
> >  #define PERF_SAMPLE_MAX_SIZE (1 << 16)
> >
> >  struct perf_mmap;
> > +struct perf_counts_values;
> >
> >  typedef void (*libperf_unmap_cb_t)(struct perf_mmap *map);
> >
> > @@ -52,4 +53,6 @@ void perf_mmap__put(struct perf_mmap *map);
> >
> >  u64 perf_mmap__read_head(struct perf_mmap *map);
> >
> > +int perf_mmap__read_self(struct perf_mmap *map, struct perf_counts_values *count);
> > +
> >  #endif /* __LIBPERF_INTERNAL_MMAP_H */
> > diff --git a/tools/lib/perf/mmap.c b/tools/lib/perf/mmap.c
> > index 79d5ed6c38cc..915469f00cf4 100644
> > --- a/tools/lib/perf/mmap.c
> > +++ b/tools/lib/perf/mmap.c
> > @@ -8,9 +8,11 @@
> >  #include <linux/perf_event.h>
> >  #include <perf/mmap.h>
> >  #include <perf/event.h>
> > +#include <perf/evsel.h>
> >  #include <internal/mmap.h>
> >  #include <internal/lib.h>
> >  #include <linux/kernel.h>
> > +#include <linux/math64.h>
> >  #include "internal.h"
> >
> >  void perf_mmap__init(struct perf_mmap *map, struct perf_mmap *prev,
> > @@ -273,3 +275,89 @@ union perf_event *perf_mmap__read_event(struct perf_mmap *map)
> >
> >         return event;
> >  }
> > +
> > +#if defined(__i386__) || defined(__x86_64__)
> > +static u64 read_perf_counter(unsigned int counter)
> > +{
> > +       unsigned int low, high;
> > +
> > +       asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));
> > +
> > +       return low | ((u64)high) << 32;
> > +}
> > +
> > +static u64 read_timestamp(void)
> > +{
> > +       unsigned int low, high;
> > +
> > +       asm volatile("rdtsc" : "=a" (low), "=d" (high));
> > +
> > +       return low | ((u64)high) << 32;
> > +}
> > +#else
> > +static u64 read_perf_counter(unsigned int counter) { return 0; }
> > +static u64 read_timestamp(void) { return 0; }
> > +#endif
> > +
> > +int perf_mmap__read_self(struct perf_mmap *map, struct perf_counts_values *count)
> > +{
> > +       struct perf_event_mmap_page *pc = map->base;
> > +       u32 seq, idx, time_mult = 0, time_shift = 0;
> > +       u64 cnt, cyc = 0, time_offset = 0, time_cycles = 0, time_mask = ~0ULL;
> > +
> > +       if (!pc || !pc->cap_user_rdpmc)
> > +               return -1;
> > +
> > +       do {
> > +               seq = READ_ONCE(pc->lock);
> > +               barrier();
> > +
> > +               count->ena = READ_ONCE(pc->time_enabled);
> > +               count->run = READ_ONCE(pc->time_running);
> > +
> > +               if (pc->cap_user_time && count->ena != count->run) {
> > +                       cyc = read_timestamp();
> > +                       time_mult = READ_ONCE(pc->time_mult);
> > +                       time_shift = READ_ONCE(pc->time_shift);
> > +                       time_offset = READ_ONCE(pc->time_offset);
> > +
> > +                       if (pc->cap_user_time_short) {
> > +                               time_cycles = READ_ONCE(pc->time_cycles);
> > +                               time_mask = READ_ONCE(pc->time_mask);
> > +                       }
>
> Nit, this is now out of sync with the comment code in perf_event.h.

IMO, we should just delete that version. One less slightly wrong version...

> > +               }
> > +
> > +               idx = READ_ONCE(pc->index);
> > +               cnt = READ_ONCE(pc->offset);
> > +               if (pc->cap_user_rdpmc && idx) {
> > +                       s64 evcnt = read_perf_counter(idx - 1);
> > +                       u16 width = READ_ONCE(pc->pmc_width);
> > +
> > +                       evcnt <<= 64 - width;
> > +                       evcnt >>= 64 - width;
> > +                       cnt += evcnt;
> > +               } else
> > +                       return -1;
> > +
> > +               barrier();
> > +       } while (READ_ONCE(pc->lock) != seq);
> > +
> > +       if (count->ena != count->run) {
> > +               u64 delta;
> > +
> > +               /* Adjust for cap_usr_time_short, a nop if not */
> > +               cyc = time_cycles + ((cyc - time_cycles) & time_mask);
> > +
> > +               delta = time_offset + mul_u64_u32_shr(cyc, time_mult, time_shift);
> > +
> > +               count->ena += delta;
> > +               if (idx)
> > +                       count->run += delta;
> > +
> > +               cnt = mul_u64_u64_div64(cnt, count->ena, count->run);
>
> Does this still suffer the divide by zero if multiplexing hasn't run
> the counter? If so, we still need to add something like:
> https://lore.kernel.org/lkml/CAP-5=fVRdqvswtyQMg5cB+ntTGda+SAYskjTQednEH-AeZo13g@mail.gmail.com/

I don't think so, because if we don't have a valid counter index (idx == 0),
we return -1 from inside the loop and never reach this 'if'.

>
> > +       }
> > +
> > +       count->val = cnt;
> > +
> > +       return 0;
> > +}
> > diff --git a/tools/lib/perf/tests/test-evsel.c b/tools/lib/perf/tests/test-evsel.c
> > index 0ad82d7a2a51..54fb4809b9ee 100644
> > --- a/tools/lib/perf/tests/test-evsel.c
> > +++ b/tools/lib/perf/tests/test-evsel.c
> > @@ -120,6 +120,69 @@ static int test_stat_thread_enable(void)
> >         return 0;
> >  }
> >
> > +static int test_stat_user_read(int event)
> > +{
> > +       struct perf_counts_values counts = { .val = 0 };
> > +       struct perf_thread_map *threads;
> > +       struct perf_evsel *evsel;
> > +       struct perf_event_mmap_page *pc;
> > +       struct perf_event_attr attr = {
> > +               .type   = PERF_TYPE_HARDWARE,
> > +               .config = event,
> > +       };
>
> A nit, previously test-evsel was able to run and pass on a hypervisor.
> As now there is a reliance on hardware events the evsel open fails on
> a hypervisor. It'd be nice if we could detect running on a hypervisor
> and test software events in that case.

I suppose we can just exit if open fails on this test.

Rob