[PATCH 1/4] PM: EM: convert power field to micro-Watts precision and align drivers

Tue Jul 5 02:09:09 PDT 2022

On 22/06/2022 16:57, Lukasz Luba wrote:
> The milli-Watts precision causes rounding errors while calculating
> efficiency cost for each OPP. This is especially visible in the 'simple'
> Energy Model (EM), where the power for each OPP is provided from OPP
> framework. This can cause some OPPs to be marked inefficient, while
> using micro-Watts precision that might not happen.
> 
> Update all EM users which access 'power' field and assume the value is
> in milli-Watts.
> 
> Solve also an issue with potential overflow in calculation of energy
> estimation on 32bit machine. It's needed now since the power value
> (thus the 'cost' as well) are higher.
> 
> Example calculation which shows the rounding error and impact:
> 
> power = 'dyn-power-coeff' * volt_mV * volt_mV * freq_MHz
> 
> power_a_uW = (100 * 600mW * 600mW * 500MHz) / 10^6 = 18000
> power_a_mW = (100 * 600mW * 600mW * 500MHz) / 10^9 = 18
> 
> power_b_uW = (100 * 605mW * 605mW * 600MHz) / 10^6 = 21961
> power_b_mW = (100 * 605mW * 605mW * 600MHz) / 10^9 = 21
> 
> max_freq = 2000MHz
> 
> cost_a_mW = 18 * 2000MHz/500MHz = 72
> cost_a_uW = 18000 * 2000MHz/500MHz = 72000
> 
> cost_b_mW = 21 * 2000MHz/600MHz = 70 // <- artificially better
> cost_b_uW = 21961 * 2000MHz/600MHz = 73203
> 
> The 'cost_b_mW' (which is based on old milli-Watts) is misleadingly
> better that the 'cost_b_uW' (this patch uses micro-Watts) and such
> would have impact on the 'inefficient OPPs' information in the Cpufreq
> framework. This patch set removes the rounding issue.

Thanks for this detailed description, it really helps to understand why 
this change is needed.

Perhaps it would make sense to add a power_uw in the EM structure and 
keeping the old one with the milli-watts in order to reduce the impact 
of the change.

It is a suggestion if you find it more convenient. Otherwise I'm fine 
with this approach too.

A few comments below.

> Signed-off-by: Lukasz Luba <lukasz.luba at arm.com>
> ---
>   drivers/cpufreq/mediatek-cpufreq-hw.c |  7 +--
>   drivers/cpufreq/scmi-cpufreq.c        |  6 +++
>   drivers/opp/of.c                      | 15 ++++---
>   drivers/powercap/dtpm_cpu.c           |  5 +--
>   drivers/thermal/cpufreq_cooling.c     | 13 +++++-
>   drivers/thermal/devfreq_cooling.c     | 19 ++++++--
>   include/linux/energy_model.h          | 63 ++++++++++++++++++++-------
>   kernel/power/energy_model.c           | 31 ++++++++-----
>   8 files changed, 114 insertions(+), 45 deletions(-)
> 
> diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c
> index 813cccbfe934..f0e0a35c7f21 100644
> --- a/drivers/cpufreq/mediatek-cpufreq-hw.c
> +++ b/drivers/cpufreq/mediatek-cpufreq-hw.c
> @@ -51,7 +51,7 @@ static const u16 cpufreq_mtk_offsets[REG_ARRAY_SIZE] = {
>   };
>   

[ ... ]

> diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
> index b8151d95a806..dc19e7c80751 100644
> --- a/drivers/thermal/cpufreq_cooling.c
> +++ b/drivers/thermal/cpufreq_cooling.c
> @@ -21,6 +21,7 @@
>   #include <linux/pm_qos.h>
>   #include <linux/slab.h>
>   #include <linux/thermal.h>
> +#include <linux/units.h>
>   
>   #include <trace/events/thermal.h>
>   
> @@ -101,6 +102,7 @@ static unsigned long get_level(struct cpufreq_cooling_device *cpufreq_cdev,
>   static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_cdev,
>   			     u32 freq)
>   {
> +	unsigned long power_mw;
>   	int i;
>   
>   	for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) {
> @@ -108,16 +110,23 @@ static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_cdev,
>   			break;
>   	}
>   
> -	return cpufreq_cdev->em->table[i + 1].power;
> +	power_mw = cpufreq_cdev->em->table[i + 1].power;
> +	power_mw /= MICROWATT_PER_MILLIWATT;

Won't this fail with an unresolved symbols on some archs ? I mean may be 
do_div should be used instead ?

> +
> +	return power_mw;
>   }

[ ... ]

>   #ifdef CONFIG_64BIT
> -#define em_scale_power(p) ((p) * 1000)
> +#define em_estimate_energy(cost, sum_util, scale_cpu) \
> +	(((cost) * (sum_util)) / (scale_cpu))
>   #else
> -#define em_scale_power(p) (p)
> +#define em_estimate_energy(cost, sum_util, scale_cpu) \
> +	(((cost) / (scale_cpu)) * (sum_util))
>   #endif
>   
>   struct em_data_callback {
> @@ -112,7 +143,7 @@ struct em_data_callback {
>   	 * and frequency.
>   	 *
>   	 * In case of CPUs, the power is the one of a single CPU in the domain,
> -	 * expressed in milli-Watts or an abstract scale. It is expected to
> +	 * expressed in micro-Watts or an abstract scale. It is expected to
>   	 * fit in the [0, EM_MAX_POWER] range.
>   	 *
>   	 * Return 0 on success.
> @@ -148,7 +179,7 @@ struct em_perf_domain *em_cpu_get(int cpu);
>   struct em_perf_domain *em_pd_get(struct device *dev);
>   int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
>   				struct em_data_callback *cb, cpumask_t *span,
> -				bool milliwatts);
> +				bool microwatts);
>   void em_dev_unregister_perf_domain(struct device *dev);
>   
>   /**
> @@ -273,7 +304,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
>   	 *   pd_nrg = ------------------------                       (4)
>   	 *                  scale_cpu
>   	 */
> -	return ps->cost * sum_util / scale_cpu;
> +	return em_estimate_energy(ps->cost, sum_util, scale_cpu);
>   }
>   
>   /**
> @@ -297,7 +328,7 @@ struct em_data_callback {};
>   static inline
>   int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
>   				struct em_data_callback *cb, cpumask_t *span,
> -				bool milliwatts)
> +				bool microwatts)
>   {
>   	return -EINVAL;
>   }
> diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
> index 6c373f2960e7..910668ec8838 100644
> --- a/kernel/power/energy_model.c
> +++ b/kernel/power/energy_model.c
> @@ -108,10 +108,11 @@ static void em_debug_remove_pd(struct device *dev) {}
>   
>   static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
>   				int nr_states, struct em_data_callback *cb,
> -				unsigned long flags)
> +				unsigned long flags, int num_devs)
>   {
>   	unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX;
>   	struct em_perf_state *table;
> +	unsigned long max_cost = 0;
>   	int i, ret;
>   	u64 fmax;
>   
> @@ -145,7 +146,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
>   
>   		/*
>   		 * The power returned by active_state() is expected to be
> -		 * positive and to fit into 16 bits.
> +		 * positive and be in range.
>   		 */
>   		if (!power || power > EM_MAX_POWER) {
>   			dev_err(dev, "EM: invalid power: %lu\n",
> @@ -170,7 +171,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
>   				goto free_ps_table;
>   			}
>   		} else {
> -			power_res = em_scale_power(table[i].power);
> +			power_res = table[i].power;
>   			cost = div64_u64(fmax * power_res, table[i].frequency);
>   		}
>   
> @@ -183,6 +184,15 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
>   		} else {
>   			prev_cost = table[i].cost;
>   		}
> +
> +		if (max_cost < table[i].cost)
> +			max_cost = table[i].cost;
> +	}
> +
> +	/* Check if it won't overflow during energy estimation. */
> +	if (em_validate_cost(max_cost, num_devs)) {

I'm not finding the em_validate_cost() function

> +		dev_err(dev, "EM: too big 'cost' value: %lu\n",	max_cost);
> +		goto free_ps_table;
>   	}
>   
>   	pd->table = table;
> @@ -199,9 +209,9 @@ static int em_create_pd(struct device *dev, int nr_states,
>   			struct em_data_callback *cb, cpumask_t *cpus,
>   			unsigned long flags)
>   {
> +	int cpu, ret, num_devs = 1;
>   	struct em_perf_domain *pd;
>   	struct device *cpu_dev;
> -	int cpu, ret;
>   
>   	if (_is_cpu_device(dev)) {
>   		pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
> @@ -209,13 +219,14 @@ static int em_create_pd(struct device *dev, int nr_states,
>   			return -ENOMEM;
>   
>   		cpumask_copy(em_span_cpus(pd), cpus);
> +		num_devs = cpumask_weight(cpus);

Why is this change needed ? What is the connection with the uW unit change ?

>   	} else {
>   		pd = kzalloc(sizeof(*pd), GFP_KERNEL);
>   		if (!pd)
>   			return -ENOMEM;
>   	}
>   
> -	ret = em_create_perf_table(dev, pd, nr_states, cb, flags);
> +	ret = em_create_perf_table(dev, pd, nr_states, cb, flags, num_devs);
>   	if (ret) {
>   		kfree(pd);
>   		return ret;
> @@ -314,13 +325,13 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
>    * @cpus	: Pointer to cpumask_t, which in case of a CPU device is
>    *		obligatory. It can be taken from i.e. 'policy->cpus'. For other
>    *		type of devices this should be set to NULL.
> - * @milliwatts	: Flag indicating that the power values are in milliWatts or
> + * @microwatts	: Flag indicating that the power values are in micro-Watts or
>    *		in some other scale. It must be set properly.
>    *
>    * Create Energy Model tables for a performance domain using the callbacks
>    * defined in cb.
>    *
> - * The @milliwatts is important to set with correct value. Some kernel
> + * The @microwatts is important to set with correct value. Some kernel
>    * sub-systems might rely on this flag and check if all devices in the EM are
>    * using the same scale.
>    *
> @@ -331,7 +342,7 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
>    */
>   int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
>   				struct em_data_callback *cb, cpumask_t *cpus,
> -				bool milliwatts)
> +				bool microwatts)
>   {
>   	unsigned long cap, prev_cap = 0;
>   	unsigned long flags = 0;
> @@ -381,8 +392,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
>   		}
>   	}
>   
> -	if (milliwatts)
> -		flags |= EM_PERF_DOMAIN_MILLIWATTS;
> +	if (microwatts)
> +		flags |= EM_PERF_DOMAIN_MICROWATTS;
>   	else if (cb->get_cost)
>   		flags |= EM_PERF_DOMAIN_ARTIFICIAL;
>   

-- 
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs

Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog