[PATCH] perf cs-etm: Add support for coresight trace for any range of CPUs

Ganapatrao Kulkarni gankulkarni at os.amperecomputing.com
Thu Apr 20 05:37:20 PDT 2023



On 20-04-2023 06:00 pm, James Clark wrote:
> 
> 
> On 20/04/2023 12:47, Ganapatrao Kulkarni wrote:
>>
>> Hi James,
>>
>> On 20-04-2023 03:13 pm, James Clark wrote:
>>>
>>>
>>> On 19/04/2023 18:21, Ganapatrao Kulkarni wrote:
>>>> The current implementation supports coresight trace for a range of
>>>> CPUs, if the first CPU is CPU0.
>>>>
>>>> Adding changes to enable coresight trace for any range of CPUs by
>>>> decoding the first CPU also from the header.
>>>> Later, first CPU id is used instead of CPU0 across the decoder
>>>> functions.
>>>>
>>>> Signed-off-by: Ganapatrao Kulkarni <gankulkarni at os.amperecomputing.com>
>>>> ---
>>>>    .../perf/util/cs-etm-decoder/cs-etm-decoder.c |  4 +-
>>>>    .../perf/util/cs-etm-decoder/cs-etm-decoder.h |  3 +-
>>>>    tools/perf/util/cs-etm.c                      | 62 ++++++++++++-------
>>>>    3 files changed, 42 insertions(+), 27 deletions(-)
>>>>
>>>> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
>>>> b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
>>>> index 82a27ab90c8b..41ab299b643b 100644
>>>> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
>>>> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
>>>> @@ -724,7 +724,7 @@ cs_etm_decoder__create_etm_decoder(struct
>>>> cs_etm_decoder_params *d_params,
>>>>    }
>>>>      struct cs_etm_decoder *
>>>> -cs_etm_decoder__new(int decoders, struct cs_etm_decoder_params
>>>> *d_params,
>>>> +cs_etm_decoder__new(int first_decoder, int decoders, struct
>>>> cs_etm_decoder_params *d_params,
>>>>                struct cs_etm_trace_params t_params[])
>>>>    {
>>>>        struct cs_etm_decoder *decoder;
>>>> @@ -769,7 +769,7 @@ cs_etm_decoder__new(int decoders, struct
>>>> cs_etm_decoder_params *d_params,
>>>>        /* init raw frame logging if required */
>>>>        cs_etm_decoder__init_raw_frame_logging(d_params, decoder);
>>>>    -    for (i = 0; i < decoders; i++) {
>>>> +    for (i = first_decoder; i < decoders; i++) {
>>>>            ret = cs_etm_decoder__create_etm_decoder(d_params,
>>>>                                 &t_params[i],
>>>>                                 decoder);
>>>> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h
>>>> b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h
>>>> index 92a855fbe5b8..b06193fc75b4 100644
>>>> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h
>>>> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h
>>>> @@ -90,7 +90,8 @@ int cs_etm_decoder__process_data_block(struct
>>>> cs_etm_decoder *decoder,
>>>>                           size_t len, size_t *consumed);
>>>>      struct cs_etm_decoder *
>>>> -cs_etm_decoder__new(int num_cpu,
>>>> +cs_etm_decoder__new(int first_decoder,
>>>> +            int decoders,
>>>>                struct cs_etm_decoder_params *d_params,
>>>>                struct cs_etm_trace_params t_params[]);
>>>>    diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
>>>> index 94e2d02009eb..2619513ae088 100644
>>>> --- a/tools/perf/util/cs-etm.c
>>>> +++ b/tools/perf/util/cs-etm.c
>>>> @@ -55,6 +55,8 @@ struct cs_etm_auxtrace {
>>>>        u8 has_virtual_ts; /* Virtual/Kernel timestamps in the trace. */
>>>>          int num_cpu;
>>>> +    int first_cpu;
>>>> +    int last_cpu;
>>>>        u64 latest_kernel_timestamp;
>>>>        u32 auxtrace_type;
>>>>        u64 branches_sample_type;
>>>> @@ -638,14 +640,13 @@ static void cs_etm__set_trace_param_ete(struct
>>>> cs_etm_trace_params *t_params,
>>>>    }
>>>>      static int cs_etm__init_trace_params(struct cs_etm_trace_params
>>>> *t_params,
>>>> -                     struct cs_etm_auxtrace *etm,
>>>> -                     int decoders)
>>>> +                     struct cs_etm_auxtrace *etm)
>>>>    {
>>>>        int i;
>>>>        u32 etmidr;
>>>>        u64 architecture;
>>>>    -    for (i = 0; i < decoders; i++) {
>>>> +    for (i = etm->first_cpu; i < etm->last_cpu; i++) {
>>>>            architecture = etm->metadata[i][CS_ETM_MAGIC];
>>>>              switch (architecture) {
>>>> @@ -817,7 +818,7 @@ static void cs_etm__free(struct perf_session
>>>> *session)
>>>>        /* Then the RB tree itself */
>>>>        intlist__delete(traceid_list);
>>>>    -    for (i = 0; i < aux->num_cpu; i++)
>>>> +    for (i = aux->first_cpu; i < aux->last_cpu; i++)
>>>>            zfree(&aux->metadata[i]);
>>>>          thread__zput(aux->unknown_thread);
>>>> @@ -921,7 +922,8 @@ static struct cs_etm_queue
>>>> *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm,
>>>>         * Each queue can only contain data from one CPU when
>>>> unformatted, so only one decoder is
>>>>         * needed.
>>>>         */
>>>> -    int decoders = formatted ? etm->num_cpu : 1;
>>>> +    int first_decoder = formatted ? etm->first_cpu : 0;
>>>> +    int decoders = first_decoder + (formatted ? etm->num_cpu : 1);
>>>>          etmq = zalloc(sizeof(*etmq));
>>>>        if (!etmq)
>>>> @@ -937,7 +939,7 @@ static struct cs_etm_queue
>>>> *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm,
>>>>        if (!t_params)
>>>>            goto out_free;
>>>>    -    if (cs_etm__init_trace_params(t_params, etm, decoders))
>>>> +    if (cs_etm__init_trace_params(t_params, etm))
>>>>            goto out_free;
>>>>          /* Set decoder parameters to decode trace packets */
>>>> @@ -947,8 +949,7 @@ static struct cs_etm_queue
>>>> *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm,
>>>>                        formatted))
>>>>            goto out_free;
>>>>    -    etmq->decoder = cs_etm_decoder__new(decoders, &d_params,
>>>> -                        t_params);
>>>> +    etmq->decoder = cs_etm_decoder__new(first_decoder, decoders,
>>>> &d_params, t_params);
>>>>          if (!etmq->decoder)
>>>>            goto out_free;
>>>> @@ -2959,11 +2960,11 @@ static int cs_etm__queue_aux_records(struct
>>>> perf_session *session)
>>>>     * Loop through the ETMs and complain if we find at least one where
>>>> ts_source != 1 (virtual
>>>>     * timestamps).
>>>>     */
>>>> -static bool cs_etm__has_virtual_ts(u64 **metadata, int num_cpu)
>>>> +static bool cs_etm__has_virtual_ts(u64 **metadata, struct
>>>> cs_etm_auxtrace *etm)
>>>>    {
>>>>        int j;
>>>>    -    for (j = 0; j < num_cpu; j++) {
>>>> +    for (j = etm->first_cpu; j < etm->last_cpu; j++) {
>>>>            switch (metadata[j][CS_ETM_MAGIC]) {
>>>>            case __perf_cs_etmv4_magic:
>>>>                if (HAS_PARAM(j, ETMV4, TS_SOURCE) ||
>>>> metadata[j][CS_ETMV4_TS_SOURCE] != 1)
>>>> @@ -2982,13 +2983,14 @@ static bool cs_etm__has_virtual_ts(u64
>>>> **metadata, int num_cpu)
>>>>    }
>>>>      /* map trace ids to correct metadata block, from information in
>>>> metadata */
>>>> -static int cs_etm__map_trace_ids_metadata(int num_cpu, u64 **metadata)
>>>> +static int cs_etm__map_trace_ids_metadata(struct cs_etm_auxtrace *etm)
>>>>    {
>>>>        u64 cs_etm_magic;
>>>> +    u64 **metadata = etm->metadata;
>>>>        u8 trace_chan_id;
>>>>        int i, err;
>>>>    -    for (i = 0; i < num_cpu; i++) {
>>>> +    for (i = etm->first_cpu; i < etm->last_cpu; i++) {
>>>>            cs_etm_magic = metadata[i][CS_ETM_MAGIC];
>>>>            switch (cs_etm_magic) {
>>>>            case __perf_cs_etmv3_magic:
>>>> @@ -3015,12 +3017,13 @@ static int cs_etm__map_trace_ids_metadata(int
>>>> num_cpu, u64 **metadata)
>>>>     * If we found AUX_HW_ID packets, then set any metadata marked as
>>>> unused to the
>>>>     * unused value to reduce the number of unneeded decoders created.
>>>>     */
>>>> -static int cs_etm__clear_unused_trace_ids_metadata(int num_cpu, u64
>>>> **metadata)
>>>> +static int cs_etm__clear_unused_trace_ids_metadata(struct
>>>> cs_etm_auxtrace *etm)
>>>>    {
>>>>        u64 cs_etm_magic;
>>>> +    u64 **metadata = etm->metadata;
>>>>        int i;
>>>>    -    for (i = 0; i < num_cpu; i++) {
>>>> +    for (i = etm->first_cpu; i < etm->last_cpu; i++) {
>>>>            cs_etm_magic = metadata[i][CS_ETM_MAGIC];
>>>>            switch (cs_etm_magic) {
>>>>            case __perf_cs_etmv3_magic:
>>>> @@ -3049,7 +3052,7 @@ int cs_etm__process_auxtrace_info_full(union
>>>> perf_event *event,
>>>>        int event_header_size = sizeof(struct perf_event_header);
>>>>        int total_size = auxtrace_info->header.size;
>>>>        int priv_size = 0;
>>>> -    int num_cpu;
>>>> +    int num_cpu, first_cpu = 0, last_cpu;
>>>>        int err = 0;
>>>>        int aux_hw_id_found;
>>>>        int i, j;
>>>> @@ -3068,22 +3071,31 @@ int cs_etm__process_auxtrace_info_full(union
>>>> perf_event *event,
>>>>        /* First the global part */
>>>>        ptr = (u64 *) auxtrace_info->priv;
>>>>        num_cpu = ptr[CS_PMU_TYPE_CPUS] & 0xffffffff;
>>>> -    metadata = zalloc(sizeof(*metadata) * num_cpu);
>>>> +
>>>> +    /* Start parsing after the common part of the header */
>>>> +    i = CS_HEADER_VERSION_MAX;
>>>> +
>>>> +    /*Get CPU id of first event */
>>>> +    first_cpu = ptr[i + CS_ETM_CPU];
>>>> +    last_cpu = first_cpu + num_cpu;
>>>> +
>>>> +    if (first_cpu > cpu__max_cpu().cpu ||
>>>> +            last_cpu > cpu__max_cpu().cpu)
>>>> +        return -EINVAL;
>>>> +
>>>> +    metadata = zalloc(sizeof(*metadata) * last_cpu);
>>>
>>> Hi Ganapatrao,
>>>
>>> I think I see what the problem is, but I'm wondering if a better fix
>>> would be to further decouple the CPU ID from the index in the array.
>>>
>>> With your change it's not clear what happens with sparse recordings, for
>>> example 'perf record -e cs_etm// -C 1,3,5'. And it seems like there is
>>
>> This patch fixes for any range of CPUs.
>> Record with sparse list of CPUs will not work with current code still.
>>
> 
> Is there a major issue that means sparse can't be done? I'm thinking it
> would be best to fix both issues with one change while we understand
> this part rather than a half fix that might have do be completely
> re-understood and re-done later anyway. Unless there is some big blocker
> but I can't see it?
> 
>>> some wastage in the zalloc here for example if only CPU 256 is traced
>>> then we'd still make 256 decoders but 255 of them would be unused?
>>>
>>> I tried to test sparse recordings, but your change doesn't apply to the
>>> latest coresight/next branch. I did notice that 'perf report -D' doesn't
>>> work with them on coresight/next (it just quits), but I couldn't see if
>>> that's fixed with your change.
>>
>> My patch is rebased on 6.3-RC7 codebase with Mike's 3 perf patches
>> related to dynamic id [1] support(queued for 6.4).
>>
>> "perf report -D" works for me.
> 
> I was referring to sparse CPU lists, which I think you mentioned above
> doesn't work even with this patch.
> 
>>
>> [1] https://www.spinics.net/lists/linux-perf-users/msg27452.html
>>
> 
> It should be based on the next branch here:
> git://git.kernel.org/pub/scm/linux/kernel/git/coresight/linux.git

OK.
> 
>>>
>>> Would a better fix not be to keep the metadata loops from 0-N and
>>> instead save the CPU ID in cs_etm_decoder_params or the decoder. That
>>> way it would support both sparse and not starting from 0 cases? I think
>>
>> Yep, I though this initially, it got complicated due to for more
>> for-loops. I will try again and post V2.
>>
> 
> I can't imagine it would need any extra for loops off the top of my
> head. Just saving the CPU ID in a few structs and using it wherever it's
> needed instead of the loop index. I imagine most of the loops would
> actually stay the same rather than be changed like you have in V1.

Working on V2, I will post it at the earliest.
> 
>>> the code would be better if it's worded like "i < recorded_cpus" rather
>>> than "i < cpu" to make it clear that i isn't actually the CPU ID it's
>>> just an index.
>>
>> Yes, makes sense to call it "recorded_cpus".
>>
>>>
>>> Also a new test for this scenario would probably be a good idea.
>>>
>>> Thanks
>>> James
>>>
>> Thanks,
>> Ganapat

Thanks,
Ganapat



More information about the linux-arm-kernel mailing list