[PATCH] perf cs-etm: Add support for coresight trace for any range of CPUs

Ganapatrao Kulkarni gankulkarni at os.amperecomputing.com
Thu Apr 20 07:30:34 PDT 2023


Hi James,

On 20-04-2023 06:07 pm, Ganapatrao Kulkarni wrote:
> 
> 
> On 20-04-2023 06:00 pm, James Clark wrote:
>>
>>
>> On 20/04/2023 12:47, Ganapatrao Kulkarni wrote:
>>>
>>> Hi James,
>>>
>>> On 20-04-2023 03:13 pm, James Clark wrote:
>>>>
>>>>
>>>> On 19/04/2023 18:21, Ganapatrao Kulkarni wrote:
>>>>> The current implementation supports coresight trace for a range of
>>>>> CPUs, if the first CPU is CPU0.
>>>>>
>>>>> Adding changes to enable coresight trace for any range of CPUs by
>>>>> decoding the first CPU also from the header.
>>>>> Later, first CPU id is used instead of CPU0 across the decoder
>>>>> functions.
>>>>>
>>>>> Signed-off-by: Ganapatrao Kulkarni 
>>>>> <gankulkarni at os.amperecomputing.com>
>>>>> ---
>>>>>    .../perf/util/cs-etm-decoder/cs-etm-decoder.c |  4 +-
>>>>>    .../perf/util/cs-etm-decoder/cs-etm-decoder.h |  3 +-
>>>>>    tools/perf/util/cs-etm.c                      | 62 
>>>>> ++++++++++++-------
>>>>>    3 files changed, 42 insertions(+), 27 deletions(-)
>>>>>
>>>>> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
>>>>> b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
>>>>> index 82a27ab90c8b..41ab299b643b 100644
>>>>> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
>>>>> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
>>>>> @@ -724,7 +724,7 @@ cs_etm_decoder__create_etm_decoder(struct
>>>>> cs_etm_decoder_params *d_params,
>>>>>    }
>>>>>      struct cs_etm_decoder *
>>>>> -cs_etm_decoder__new(int decoders, struct cs_etm_decoder_params
>>>>> *d_params,
>>>>> +cs_etm_decoder__new(int first_decoder, int decoders, struct
>>>>> cs_etm_decoder_params *d_params,
>>>>>                struct cs_etm_trace_params t_params[])
>>>>>    {
>>>>>        struct cs_etm_decoder *decoder;
>>>>> @@ -769,7 +769,7 @@ cs_etm_decoder__new(int decoders, struct
>>>>> cs_etm_decoder_params *d_params,
>>>>>        /* init raw frame logging if required */
>>>>>        cs_etm_decoder__init_raw_frame_logging(d_params, decoder);
>>>>>    -    for (i = 0; i < decoders; i++) {
>>>>> +    for (i = first_decoder; i < decoders; i++) {
>>>>>            ret = cs_etm_decoder__create_etm_decoder(d_params,
>>>>>                                 &t_params[i],
>>>>>                                 decoder);
>>>>> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h
>>>>> b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h
>>>>> index 92a855fbe5b8..b06193fc75b4 100644
>>>>> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h
>>>>> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h
>>>>> @@ -90,7 +90,8 @@ int cs_etm_decoder__process_data_block(struct
>>>>> cs_etm_decoder *decoder,
>>>>>                           size_t len, size_t *consumed);
>>>>>      struct cs_etm_decoder *
>>>>> -cs_etm_decoder__new(int num_cpu,
>>>>> +cs_etm_decoder__new(int first_decoder,
>>>>> +            int decoders,
>>>>>                struct cs_etm_decoder_params *d_params,
>>>>>                struct cs_etm_trace_params t_params[]);
>>>>>    diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
>>>>> index 94e2d02009eb..2619513ae088 100644
>>>>> --- a/tools/perf/util/cs-etm.c
>>>>> +++ b/tools/perf/util/cs-etm.c
>>>>> @@ -55,6 +55,8 @@ struct cs_etm_auxtrace {
>>>>>        u8 has_virtual_ts; /* Virtual/Kernel timestamps in the 
>>>>> trace. */
>>>>>          int num_cpu;
>>>>> +    int first_cpu;
>>>>> +    int last_cpu;
>>>>>        u64 latest_kernel_timestamp;
>>>>>        u32 auxtrace_type;
>>>>>        u64 branches_sample_type;
>>>>> @@ -638,14 +640,13 @@ static void cs_etm__set_trace_param_ete(struct
>>>>> cs_etm_trace_params *t_params,
>>>>>    }
>>>>>      static int cs_etm__init_trace_params(struct cs_etm_trace_params
>>>>> *t_params,
>>>>> -                     struct cs_etm_auxtrace *etm,
>>>>> -                     int decoders)
>>>>> +                     struct cs_etm_auxtrace *etm)
>>>>>    {
>>>>>        int i;
>>>>>        u32 etmidr;
>>>>>        u64 architecture;
>>>>>    -    for (i = 0; i < decoders; i++) {
>>>>> +    for (i = etm->first_cpu; i < etm->last_cpu; i++) {
>>>>>            architecture = etm->metadata[i][CS_ETM_MAGIC];
>>>>>              switch (architecture) {
>>>>> @@ -817,7 +818,7 @@ static void cs_etm__free(struct perf_session
>>>>> *session)
>>>>>        /* Then the RB tree itself */
>>>>>        intlist__delete(traceid_list);
>>>>>    -    for (i = 0; i < aux->num_cpu; i++)
>>>>> +    for (i = aux->first_cpu; i < aux->last_cpu; i++)
>>>>>            zfree(&aux->metadata[i]);
>>>>>          thread__zput(aux->unknown_thread);
>>>>> @@ -921,7 +922,8 @@ static struct cs_etm_queue
>>>>> *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm,
>>>>>         * Each queue can only contain data from one CPU when
>>>>> unformatted, so only one decoder is
>>>>>         * needed.
>>>>>         */
>>>>> -    int decoders = formatted ? etm->num_cpu : 1;
>>>>> +    int first_decoder = formatted ? etm->first_cpu : 0;
>>>>> +    int decoders = first_decoder + (formatted ? etm->num_cpu : 1);
>>>>>          etmq = zalloc(sizeof(*etmq));
>>>>>        if (!etmq)
>>>>> @@ -937,7 +939,7 @@ static struct cs_etm_queue
>>>>> *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm,
>>>>>        if (!t_params)
>>>>>            goto out_free;
>>>>>    -    if (cs_etm__init_trace_params(t_params, etm, decoders))
>>>>> +    if (cs_etm__init_trace_params(t_params, etm))
>>>>>            goto out_free;
>>>>>          /* Set decoder parameters to decode trace packets */
>>>>> @@ -947,8 +949,7 @@ static struct cs_etm_queue
>>>>> *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm,
>>>>>                        formatted))
>>>>>            goto out_free;
>>>>>    -    etmq->decoder = cs_etm_decoder__new(decoders, &d_params,
>>>>> -                        t_params);
>>>>> +    etmq->decoder = cs_etm_decoder__new(first_decoder, decoders,
>>>>> &d_params, t_params);
>>>>>          if (!etmq->decoder)
>>>>>            goto out_free;
>>>>> @@ -2959,11 +2960,11 @@ static int cs_etm__queue_aux_records(struct
>>>>> perf_session *session)
>>>>>     * Loop through the ETMs and complain if we find at least one where
>>>>> ts_source != 1 (virtual
>>>>>     * timestamps).
>>>>>     */
>>>>> -static bool cs_etm__has_virtual_ts(u64 **metadata, int num_cpu)
>>>>> +static bool cs_etm__has_virtual_ts(u64 **metadata, struct
>>>>> cs_etm_auxtrace *etm)
>>>>>    {
>>>>>        int j;
>>>>>    -    for (j = 0; j < num_cpu; j++) {
>>>>> +    for (j = etm->first_cpu; j < etm->last_cpu; j++) {
>>>>>            switch (metadata[j][CS_ETM_MAGIC]) {
>>>>>            case __perf_cs_etmv4_magic:
>>>>>                if (HAS_PARAM(j, ETMV4, TS_SOURCE) ||
>>>>> metadata[j][CS_ETMV4_TS_SOURCE] != 1)
>>>>> @@ -2982,13 +2983,14 @@ static bool cs_etm__has_virtual_ts(u64
>>>>> **metadata, int num_cpu)
>>>>>    }
>>>>>      /* map trace ids to correct metadata block, from information in
>>>>> metadata */
>>>>> -static int cs_etm__map_trace_ids_metadata(int num_cpu, u64 
>>>>> **metadata)
>>>>> +static int cs_etm__map_trace_ids_metadata(struct cs_etm_auxtrace 
>>>>> *etm)
>>>>>    {
>>>>>        u64 cs_etm_magic;
>>>>> +    u64 **metadata = etm->metadata;
>>>>>        u8 trace_chan_id;
>>>>>        int i, err;
>>>>>    -    for (i = 0; i < num_cpu; i++) {
>>>>> +    for (i = etm->first_cpu; i < etm->last_cpu; i++) {
>>>>>            cs_etm_magic = metadata[i][CS_ETM_MAGIC];
>>>>>            switch (cs_etm_magic) {
>>>>>            case __perf_cs_etmv3_magic:
>>>>> @@ -3015,12 +3017,13 @@ static int cs_etm__map_trace_ids_metadata(int
>>>>> num_cpu, u64 **metadata)
>>>>>     * If we found AUX_HW_ID packets, then set any metadata marked as
>>>>> unused to the
>>>>>     * unused value to reduce the number of unneeded decoders created.
>>>>>     */
>>>>> -static int cs_etm__clear_unused_trace_ids_metadata(int num_cpu, u64
>>>>> **metadata)
>>>>> +static int cs_etm__clear_unused_trace_ids_metadata(struct
>>>>> cs_etm_auxtrace *etm)
>>>>>    {
>>>>>        u64 cs_etm_magic;
>>>>> +    u64 **metadata = etm->metadata;
>>>>>        int i;
>>>>>    -    for (i = 0; i < num_cpu; i++) {
>>>>> +    for (i = etm->first_cpu; i < etm->last_cpu; i++) {
>>>>>            cs_etm_magic = metadata[i][CS_ETM_MAGIC];
>>>>>            switch (cs_etm_magic) {
>>>>>            case __perf_cs_etmv3_magic:
>>>>> @@ -3049,7 +3052,7 @@ int cs_etm__process_auxtrace_info_full(union
>>>>> perf_event *event,
>>>>>        int event_header_size = sizeof(struct perf_event_header);
>>>>>        int total_size = auxtrace_info->header.size;
>>>>>        int priv_size = 0;
>>>>> -    int num_cpu;
>>>>> +    int num_cpu, first_cpu = 0, last_cpu;
>>>>>        int err = 0;
>>>>>        int aux_hw_id_found;
>>>>>        int i, j;
>>>>> @@ -3068,22 +3071,31 @@ int cs_etm__process_auxtrace_info_full(union
>>>>> perf_event *event,
>>>>>        /* First the global part */
>>>>>        ptr = (u64 *) auxtrace_info->priv;
>>>>>        num_cpu = ptr[CS_PMU_TYPE_CPUS] & 0xffffffff;
>>>>> -    metadata = zalloc(sizeof(*metadata) * num_cpu);
>>>>> +
>>>>> +    /* Start parsing after the common part of the header */
>>>>> +    i = CS_HEADER_VERSION_MAX;
>>>>> +
>>>>> +    /*Get CPU id of first event */
>>>>> +    first_cpu = ptr[i + CS_ETM_CPU];
>>>>> +    last_cpu = first_cpu + num_cpu;
>>>>> +
>>>>> +    if (first_cpu > cpu__max_cpu().cpu ||
>>>>> +            last_cpu > cpu__max_cpu().cpu)
>>>>> +        return -EINVAL;
>>>>> +
>>>>> +    metadata = zalloc(sizeof(*metadata) * last_cpu);
>>>>
>>>> Hi Ganapatrao,
>>>>
>>>> I think I see what the problem is, but I'm wondering if a better fix
>>>> would be to further decouple the CPU ID from the index in the array.
>>>>
>>>> With your change it's not clear what happens with sparse recordings, 
>>>> for
>>>> example 'perf record -e cs_etm// -C 1,3,5'. And it seems like there is
>>>
>>> This patch fixes for any range of CPUs.
>>> Record with sparse list of CPUs will not work with current code still.
>>>
>>
>> Is there a major issue that means sparse can't be done? I'm thinking it
>> would be best to fix both issues with one change while we understand
>> this part rather than a half fix that might have do be completely
>> re-understood and re-done later anyway. Unless there is some big blocker
>> but I can't see it?
>>
>>>> some wastage in the zalloc here for example if only CPU 256 is traced
>>>> then we'd still make 256 decoders but 255 of them would be unused?
>>>>
>>>> I tried to test sparse recordings, but your change doesn't apply to the
>>>> latest coresight/next branch. I did notice that 'perf report -D' 
>>>> doesn't
>>>> work with them on coresight/next (it just quits), but I couldn't see if
>>>> that's fixed with your change.
>>>
>>> My patch is rebased on 6.3-RC7 codebase with Mike's 3 perf patches
>>> related to dynamic id [1] support(queued for 6.4).
>>>
>>> "perf report -D" works for me.
>>
>> I was referring to sparse CPU lists, which I think you mentioned above
>> doesn't work even with this patch.
>>
>>>
>>> [1] https://www.spinics.net/lists/linux-perf-users/msg27452.html
>>>
>>
>> It should be based on the next branch here:
>> git://git.kernel.org/pub/scm/linux/kernel/git/coresight/linux.git
> 
> OK.
>>
>>>>
>>>> Would a better fix not be to keep the metadata loops from 0-N and
>>>> instead save the CPU ID in cs_etm_decoder_params or the decoder. That
>>>> way it would support both sparse and not starting from 0 cases? I think
>>>
>>> Yep, I though this initially, it got complicated due to for more
>>> for-loops. I will try again and post V2.
>>>
>>
>> I can't imagine it would need any extra for loops off the top of my
>> head. Just saving the CPU ID in a few structs and using it wherever it's
>> needed instead of the loop index. I imagine most of the loops would
>> actually stay the same rather than be changed like you have in V1.
> 
> Working on V2, I will post it at the earliest.

My codebase is "6.3-RC7 + Mike's patche[1] + Anshuman/Suzuki patches[3]" 
and below diff works for any CPU range and sparse list as well.

I will send the V2 ASAP.

[root at B0-213 perf]# git diff
diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 94e2d02009eb..bfed0bb8be98 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -274,6 +274,22 @@ static int cs_etm__metadata_set_trace_id(u8 
trace_chan_id, u64 *cpu_metadata)
                 (typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask)); \
         })

+
+static u64 *get_cpu_data(struct cs_etm_auxtrace *etm, int cpu)
+{
+       int i;
+       u64 *metadata = NULL;
+
+       for (i = 0; i < etm->num_cpu; i++) {
+               if (etm->metadata[i][CS_ETM_CPU] == (u64)cpu) {
+                       metadata = etm->metadata[i];
+                       break;
+               }
+       }
+
+       return metadata;
+}
+
  /*
   * Handle the PERF_RECORD_AUX_OUTPUT_HW_ID event.
   *
@@ -344,7 +360,7 @@ static int cs_etm__process_aux_output_hw_id(struct 
perf_session *session,
         }

         /* not one we've seen before - lets map it */
-       cpu_data = etm->metadata[cpu];
+       cpu_data = get_cpu_data(etm, cpu);
         err = cs_etm__map_trace_id(trace_chan_id, cpu_data);
         if (err)
                 return err;


I tried below commands and works well with this diff.

timeout 8s ./perf record -e cs_etm// -C 0 dd if=/dev/zero of=/dev/null
timeout 8s ./perf record -e cs_etm// -C 1 dd if=/dev/zero of=/dev/null
timeout 8s ./perf record -e cs_etm// -C 1,3,5 dd if=/dev/zero of=/dev/null
timeout 8s ./perf record -e cs_etm// -C 10,3,12 dd if=/dev/zero of=/dev/null
timeout 8s ./perf record -e cs_etm//u -C 0-10 dd if=/dev/zero of=/dev/null
timeout 8s ./perf record -e cs_etm//u -C 10-20 dd if=/dev/zero of=/dev/nu

However I could not be able to apply to next branch of coresight repo
git://git.kernel.org/pub/scm/linux/kernel/git/coresight/linux.git

Then I looked at the code, it does not have Mike's dynamic ID patches[1] 
which are merged to perf/next.
It seems the code is not the latest or it is next for coresight driver only.

I think either patch should be rebased to most-recent kernel 
RC1/RCx(which I did) or perf-tools-next of [2] for perf tool.

[1] https://www.spinics.net/lists/linux-perf-users/msg27452.html
[2] git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git
[3] https://www.spinics.net/lists/arm-kernel/msg1059012.html

>>
>>>> the code would be better if it's worded like "i < recorded_cpus" rather
>>>> than "i < cpu" to make it clear that i isn't actually the CPU ID it's
>>>> just an index.
>>>
>>> Yes, makes sense to call it "recorded_cpus".
>>>
>>>>
>>>> Also a new test for this scenario would probably be a good idea.
>>>>
>>>> Thanks
>>>> James
>>>>
>>> Thanks,
>>> Ganapat
> 
> Thanks,
> Ganapat

Thanks,
Ganapat



More information about the linux-arm-kernel mailing list