[PATCH v2] perf stat: Enable iostat mode for HiSilicon PCIe PMU

Yicong Yang yangyicong at huawei.com
Wed Feb 21 00:13:50 PST 2024


Hi Arnaldo,

Thanks for the comments.

On 2024/2/9 21:47, Arnaldo Carvalho de Melo wrote:
> On Thu, Feb 08, 2024 at 11:25:18AM +0800, Yicong Yang wrote:
>> From: Yicong Yang <yangyicong at hisilicon.com>
>>
>> Some HiSilicon platforms provide PCIe PMU devices for monitoring the
>> throughput and latency of PCIe traffic. With the support of PCIe PMU
>> we can enable the perf iostat mode.
[...]
>> +static void hisi_pcie_root_ports_add(u16 sicl_id, u16 core_id, u8 target_bus)
>> +{
>> +	const char *sysfs = sysfs__mountpoint();
>> +	struct hisi_pcie_root_port *rp;
>> +	struct dirent *dent;
>> +	char path[PATH_MAX];
>> +	u8 bus, dev, fn;
>> +	u32 domain;
>> +	DIR *dir;
>> +	int ret;
>> +
>> +	snprintf(path, PATH_MAX, "%s/%s", sysfs, PCI_ROOT_BUS_DEVICES_PATH);
>> +	dir = opendir(path);
>> +	if (!dir)
>> +		return;
>> +
>> +	/* Scan the PCI root bus to find the match root port on @target_bus */
>> +	while ((dent = readdir(dir))) {
>> +		ret = sscanf(dent->d_name, PCI_DEVICE_NAME_PATTERN,
>> +			     &domain, &bus, &dev, &fn);
>> +		if (ret != 4 || bus != target_bus)
>> +			continue;
>> +
>> +		rp = zalloc(sizeof(*rp));
>> +		if (!rp)
>> +			continue;
> 
> shouldn't it emit some pr_debug message? Like you did with that
> pr_debug3() info message below.
> 

ok will complete the debug messages here and below. I didn't add it here may because
I didn't met the issue here when debugging..

>> +
>> +		rp->selected = false;
>> +		rp->sicl_id = sicl_id;
>> +		rp->core_id = core_id;
>> +		rp->domain = domain;
>> +		rp->bus = bus;
>> +		rp->dev = dev;
>> +		rp->fn = fn;
>> +
>> +		hisi_pcie_init_root_port_mask(rp);
>> +
>> +		list_add(&rp->list, &hisi_pcie_root_ports_list);
>> +		hisi_pcie_root_ports_num++;
>> +
>> +		pr_debug3("Found root port %s\n", dent->d_name);
>> +	}
>> +
>> +	closedir(dir);
>> +}
>> +
>> +/* Scan the PMUs and build the mapping of the Root Ports to the PMU */
>> +static int hisi_pcie_root_ports_init(void)
>> +{
>> +	char event_source[PATH_MAX], bus_path[PATH_MAX];
>> +	unsigned long long bus;
>> +	u16 sicl_id, core_id;
>> +	struct dirent *dent;
>> +	DIR *dir;
>> +
>> +	perf_pmu__event_source_devices_scnprintf(event_source, sizeof(event_source));
>> +	dir = opendir(event_source);
>> +	if (!dir)
>> +		return -ENOENT;
> 
> Is this the only possibility? I.e. if a non root user tries this and the
> file is there, shouldn't we inform that root permission is needed? Or
> any other access mechanism that may be used.
> 
> For instance, in perf we have things like kernel.perf_event_paranoid,
> capabilities, etc.
> 

Thanks for the information. Will have a check on it, didn't consider about the
permission things.

> Take a look at the evsel__open_strerror() function, to see how it tries
> to translate errors like the above to an informative message to help the
> user setup its system.
> 

Thanks for the hint. Will have a reference on that.

>> +
>> +	while ((dent = readdir(dir))) {
>> +		/*
>> +		 * This HiSilicon PCIe PMU will be named as:
>> +		 *   hisi_pcie<sicl_id>_core<core_id>
>> +		 */
>> +		if ((sscanf(dent->d_name, "hisi_pcie%hu_core%hu", &sicl_id, &core_id)) != 2)
>> +			continue;
>> +
>> +		/*
>> +		 * Driver will export the root port it can monitor through
>> +		 * the "bus" sysfs attribute.
>> +		 */
>> +		scnprintf(bus_path, sizeof(bus_path), "%s/hisi_pcie%hu_core%hu/bus",
>> +			  event_source, sicl_id, core_id);
>> +
>> +		/*
>> +		 * Per PCIe spec the bus should be 8bit, use unsigned long long
>> +		 * for the convience of the library function.
>> +		 */
>> +		if (filename__read_ull(bus_path, &bus))
> 
> Shouldn't we have a pr_debug3() here as well?
> 

will add.

>> +			continue;
> 
>> +
>> +		pr_debug3("Found pmu %s bus 0x%llx\n", dent->d_name, bus);
>> +
>> +		hisi_pcie_root_ports_add(sicl_id, core_id, (u8)bus);
>> +	}
>> +
>> +	closedir(dir);
>> +	return hisi_pcie_root_ports_num > 0 ? 0 : -ENOENT;
>> +}
>> +
>> +static void hisi_pcie_root_ports_free(void)
>> +{
>> +	struct hisi_pcie_root_port *rp, *tmp;
>> +
>> +	if (hisi_pcie_root_ports_num == 0)
>> +		return;
>> +
>> +	list_for_each_entry_safe(rp, tmp, &hisi_pcie_root_ports_list, list) {
>> +		list_del(&rp->list);
> 
> list_del_init()
> 

will use it.

>> +		zfree(&rp);
> 
> No need to use zfree here, its a local variable, not something that is
> in a data structure that may have some dangling connection somewhere and
> thus we better initialize to zero instead of reading random deleted
> memory.
> 

got it. Will use free() instead here.

>> +		hisi_pcie_root_ports_num--;
>> +	}
>> +}
>> +
>> +static int hisi_iostat_add_events(struct evlist *evl)
>> +{
>> +	struct hisi_pcie_root_port *rp;
>> +	struct evsel *evsel;
>> +	unsigned int i, j;
>> +	char *iostat_cmd;
>> +	int pos = 0;
>> +	int ret;
>> +
>> +	if (!hisi_pcie_root_ports_num)
>> +		return -ENOENT;
>> +
>> +	iostat_cmd = zalloc(PATH_MAX);
>> +	if (!iostat_cmd)
>> +		return -ENOMEM;
>> +
>> +	list_for_each_entry(rp, &hisi_pcie_root_ports_list, list) {
>> +		if (!rp->selected)
>> +			continue;
>> +
>> +		iostat_cmd[pos++] = '{';
>> +		for (j = 0; j < METRIC_TYPE_MAX; j++) {
>> +			pos += snprintf(iostat_cmd + pos, ARG_MAX - pos - 1,
>> +					hisi_iostat_cmd_template[j],
>> +					rp->sicl_id, rp->core_id, rp->mask);
>> +
>> +			if (j == METRIC_TYPE_MAX - 1)
>> +				iostat_cmd[pos++] = '}';
>> +			else
>> +				iostat_cmd[pos++] = ',';
>> +		}
>> +
>> +		ret = parse_event(evl, iostat_cmd);
>> +		if (ret)
> 
> pr_debug3()?
ok.

>> +			break;
>> +
>> +		i = 0;
>> +		evlist__for_each_entry_reverse(evl, evsel) {
>> +			if (i == METRIC_TYPE_MAX)
>> +				break;
>> +
>> +			evsel->priv = rp;
>> +			i++;
>> +		}
>> +
>> +		memset(iostat_cmd, 0, PATH_MAX);
>> +		pos = 0;
>> +	}
>> +
>> +	zfree(&iostat_cmd);
> 
> no need for zfree here, again its a local variable.
> 
>> +	return ret;
>> +}
>> +
>> +int iostat_prepare(struct evlist *evlist,
>> +		   struct perf_stat_config *config)
>> +{
>> +	if (evlist->core.nr_entries > 0) {
>> +		pr_warning("The -e and -M options are not supported."
>> +			   "All chosen events/metrics will be dropped\n");
>> +		evlist__delete(evlist);
>> +		evlist = evlist__new();
>> +		if (!evlist)
>> +			return -ENOMEM;
> 
> Can you explain how this works? Shouldn't we just bail out completely
> instead of creating an empty evlist?
> 

Here I just keep consistence with x86's implementation. For iostat only
iostat related evsels should be added and organized in certain order.

Warn the user and bail out completely with out continue may be another
choice.

Do you suggest to bail out here and also for x86 iostat for consistence?

>> +	}
>> +
>> +	config->metric_only = true;
>> +	config->aggr_mode = AGGR_GLOBAL;
>> +
>> +	return hisi_iostat_add_events(evlist);
>> +}
>> +
>> +static int hisi_pcie_root_ports_list_filter(const char *str)
>> +{
>> +	char *tok, *tmp, *copy = NULL;
>> +	u8 bus, dev, fn;
>> +	u32 domain;
>> +	int ret;
>> +
>> +	copy = strdup(str);
>> +	if (!copy)
>> +		return -ENOMEM;
>> +
>> +	for (tok = strtok_r(copy, ",", &tmp); tok; tok = strtok_r(NULL, ",", &tmp)) {
>> +		ret = sscanf(tok, PCI_DEVICE_NAME_PATTERN, &domain, &bus, &dev, &fn);
>> +		if (ret != 4) {
>> +			ret = -EINVAL;
>> +			break;
>> +		}
>> +
>> +		ret = hisi_pcie_root_ports_select_one(domain, bus, dev, fn);
>> +		if (ret)
>> +			break;
>> +	}
>> +
>> +	zfree(&copy);
> 
> No need for zfree()
> 

ok.

>> +	return ret;
>> +}
>> +
>> +int iostat_parse(const struct option *opt, const char *str, int unset __maybe_unused)
>> +{
>> +	struct perf_stat_config *config = (struct perf_stat_config *)opt->data;
>> +	int ret;
>> +
>> +	ret = hisi_pcie_root_ports_init();
>> +	if (!ret) {
>> +		config->iostat_run = true;
>> +
>> +		if (!str) {
>> +			iostat_mode = IOSTAT_RUN;
>> +			hisi_pcie_root_ports_select_all();
>> +		} else if (!strcmp(str, "list")) {
>> +			iostat_mode = IOSTAT_LIST;
>> +			hisi_pcie_root_ports_select_all();
>> +		} else {
>> +			iostat_mode = IOSTAT_RUN;
>> +			ret = hisi_pcie_root_ports_list_filter(str);
>> +		}
>> +	}
>> +
>> +	return ret;
>> +}
>> +
>> +static void hisi_pcie_root_port_show(FILE *output,
>> +				     const struct hisi_pcie_root_port * const rp)
>> +{
>> +	if (output && rp)
>> +		fprintf(output, "hisi_pcie%hu_core%hu<" PCI_DEVICE_NAME_PATTERN ">\n",
>> +			rp->sicl_id, rp->core_id, rp->domain, rp->bus, rp->dev, rp->fn);
> 
> Multi line if blocks should be enclosed in {}
> 

got it. will enclose it with {}.

>> +}
>> +
>> +void iostat_list(struct evlist *evlist __maybe_unused, struct perf_stat_config *config)
>> +{
>> +	struct hisi_pcie_root_port *rp = NULL;
>> +	struct evsel *evsel;
>> +
>> +	evlist__for_each_entry(evlist, evsel) {
> 
> Can you add a comment explaining this evsel->priv check? Probably we
> have multiple ports and some are shared with multiple evsels and we
> don't have a ports list, so we reuse the evsel list, etc.
> 
> Having a comment with the above info may help future reviewers?
> 

sure. will add comment here.

>> +		if (rp != evsel->priv) {
>> +			hisi_pcie_root_port_show(config->output, evsel->priv);
>> +			rp = evsel->priv;
>> +		}
>> +	}
>> +}
>> +
>> +void iostat_release(struct evlist *evlist)
>> +{
>> +	struct evsel *evsel;
>> +
>> +	evlist__for_each_entry(evlist, evsel)
>> +		evsel->priv = NULL;
>> +
> 
> But then here you zero out all the privs and still can iterate the list
> of ports in the following function:
> 
>> +	hisi_pcie_root_ports_free();
> 
> Why not iterate on hisi_pcie_root_ports_list in iostat_list()?
> 

I think you're right. For listing mode (`perf iostat list`) we don't need
to counting and adding evsels, so iterate hisi_pcie_root_ports_list will be
enough. Since no evsels added evlist will be empty.

>> +}
>> +
>> +void iostat_print_header_prefix(struct perf_stat_config *config)
>> +{
>> +	if (config->csv_output)
>> +		fputs("port,", config->output);
>> +	else if (config->interval)
>> +		fprintf(config->output, "#          time    port         ");
>> +	else
>> +		fprintf(config->output, "   port         ");
>> +}
>> +
>> +void iostat_print_metric(struct perf_stat_config *config, struct evsel *evsel,
>> +			 struct perf_stat_output_ctx *out)
>> +{
>> +	struct perf_counts_values *count;
>> +	const char *iostat_metric;
>> +	double iostat_value;
>> +
>> +	iostat_metric = hisi_iostat_metrics[evsel->core.idx % METRIC_TYPE_MAX];
>> +
>> +	/* We're using AGGR_GLOBAL so there's only one aggr counts aggr[0]. */
>> +	count = &evsel->stats->aggr[0].counts;
>> +
>> +	/* The counts has been scaled, we can use it directly. */
>> +	iostat_value = (double)count->val;
>> +
>> +	/*
>> +	 * Calculate the bandwidth in MiB since the PMU counts in DWord.
>> +	 * Display two digits after decimal point for better accuracy if the
>> +	 * value is non-zero.
>> +	 */
>> +	out->print_metric(config, out->ctx, NULL,
>> +			  iostat_value > 0 ? "%8.2f" : "%8.0f", iostat_metric,
>> +			  iostat_value * sizeof(uint32_t) / (1024 * 1024));
>> +}
>> +
>> +void iostat_prefix(struct evlist *evlist, struct perf_stat_config *config,
>> +		   char *prefix, struct timespec *ts)
>> +{
>> +	struct hisi_pcie_root_port *rp = evlist->selected->priv;
>> +
>> +	if (rp) {
>> +		if (ts)
>> +			sprintf(prefix, "%6lu.%09lu%s" PCI_DEVICE_NAME_PATTERN "%s",
> 
> Can you use scnprintf? and pass the size of prefix to iostat_prefix()?
> 

sure. will use scnprintf().

Thanks,
Yicong



More information about the linux-arm-kernel mailing list