[PATCH] nvme-pci: do not set the NUMA node of device if it has none

Pratyush Yadav ptyadav at amazon.de
Tue Aug 8 08:51:01 PDT 2023


On Fri, Aug 04 2023, Keith Busch wrote:

> On Fri, Aug 04, 2023 at 04:50:16PM +0200, Pratyush Yadav wrote:
>> With this patch, I get the below affinities:
>
> Something still seems off without effective_affinity set. That attribute
> should always reflect one CPU from the smp_affinity_list.
>
> At least with your patch, the smp_affinity_list looks as expected: every
> CPU is accounted for, and no vector appears to share the resource among
> CPUs in different nodes.
>
>>     $   for i in $(cat /proc/interrupts | grep nvme0 | sed "s/^ *//g" | cut -d":" -f 1); do \
>>     >     cat /proc/irq/$i/{smp,effective}_affinity_list; \
>>     >   done
>>     8
>>     8
>>     16-17,48,65,67,69
>>
>>     18-19,50,71,73,75
>>
>>     20,52,77,79
>>
>>     21,53,81,83
>>
>>     22,54,85,87
>>
>>     23,55,89,91
>>
>>     24,56,93,95
>>
>>     25,57,97,99
>>
>>     26,58,101,103
>>
>>     27,59,105,107
>>
>>     28,60,109,111
>>
>>     29,61,113,115
>>
>>     30,62,117,119
>>
>>     31,63,121,123
>>
>>     49,51,125,127
>>
>>     0,32,64,66
>>
>>     1,33,68,70
>>
>>     2,34,72,74
>>
>>     3,35,76,78
>>
>>     4,36,80,82
>>
>>     5,37,84,86
>>
>>     6,38,88,90
>>
>>     7,39,92,94
>>
>>     8,40,96,98
>>
>>     9,41,100,102
>>
>>     10,42,104,106
>>
>>     11,43,108,110
>>
>>     12,44,112,114
>>
>>     13,45,116,118
>>
>>     14,46,120,122
>>
>>     15,47,124,126
>>
>> The blank lines are because effective_affinity_list is blank for all but the first interrupt.
>>
>> The problem is, even with this I still get the same performance
>> difference when running on Node 1 vs Node 0. I am not sure why. Any
>> pointers?
>
> I suspect effective_affinity isn't getting set and interrupts are
> triggering on unexpected CPUs. If you check /proc/interrupts, can you
> confirm if the interrupts are firing on CPUs within the
> smp_affinity_list or some other CPU?

All interrupts are hitting CPU0.
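
(For anyone who wants to check the same thing: the per-CPU counts in
/proc/interrupts make this easy to spot. A rough sketch, assuming CPU0's
count is the second column of each nvme line there:)

    # prints each nvme0 IRQ number and its CPU0 interrupt count
    $ awk '/nvme0/ { printf "%s CPU0=%s\n", $1, $2 }' /proc/interrupts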

I did some more digging. Xen sets the effective affinity via
set_affinity_irq(), but only if info->evtchn is valid. The problem is that
info->evtchn is only set by startup_pirq(), which is called _after_
set_affinity_irq().

This used to work because irqbalance came along after all of this and set the
affinity again, causing set_affinity_irq() to be called a second time. By that
point startup_pirq() had already run and info->evtchn was valid.

This whole thing needs some refactoring to work properly. I wrote up a
hacky patch (on top of my previous hacky patch) that fixes it. I will
write up a proper patch when I find some more time next week.

With this, I am now seeing good performance on node 1. It is still slightly
slower than node 0, but that might be due to other reasons and I do not want
to dive into it for now.
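
(For context, the node 0 vs node 1 comparison is roughly of the following
shape; the fio parameters below are placeholders, not my exact job:)

    # placeholder 4k randread job, pinned to node 0 and node 1 respectively
    $ numactl --cpunodebind=0 --membind=0 fio --name=node0 --filename=/dev/nvme0n1 \
          --direct=1 --rw=randread --bs=4k --iodepth=32 --numjobs=8 \
          --runtime=30 --time_based --group_reporting
    $ numactl --cpunodebind=1 --membind=1 fio --name=node1 --filename=/dev/nvme0n1 \
          --direct=1 --rw=randread --bs=4k --iodepth=32 --numjobs=8 \
          --runtime=30 --time_based --group_reporting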

Thanks for your help with this :-)

BTW, do you think I should re-send the patch with updated reasoning, since
Christoph thinks we should do this regardless of the performance
reasons [0]?

[0] https://lore.kernel.org/linux-nvme/20230726131408.GA15909@lst.de/

----- 8< -----
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index c7715f8bd4522..ed33d33a2f1c9 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -108,6 +108,7 @@ struct irq_info {
 	unsigned irq;
 	evtchn_port_t evtchn;   /* event channel */
 	unsigned short cpu;     /* cpu bound */
+	unsigned short suggested_cpu;
 	unsigned short eoi_cpu; /* EOI must happen on this cpu-1 */
 	unsigned int irq_epoch; /* If eoi_cpu valid: irq_epoch of event */
 	u64 eoi_time;           /* Time in jiffies when to EOI. */
@@ -871,6 +873,8 @@ static void mask_ack_pirq(struct irq_data *data)
 	eoi_pirq(data);
 }
 
+static int xen_rebind_evtchn_to_cpu(struct irq_info *info, unsigned int tcpu);
+
 static unsigned int __startup_pirq(unsigned int irq)
 {
 	struct evtchn_bind_pirq bind_pirq;
@@ -903,6 +907,14 @@ static unsigned int __startup_pirq(unsigned int irq)
 	info->evtchn = evtchn;
 	bind_evtchn_to_cpu(evtchn, 0, false);
 
+	if (info->suggested_cpu) {
+		int ret;
+		ret = xen_rebind_evtchn_to_cpu(info, info->suggested_cpu);
+		if (!ret)
+			irq_data_update_effective_affinity(irq_get_irq_data(irq),
+							cpumask_of(info->suggested_cpu));
+	}
+
 	rc = xen_evtchn_port_setup(evtchn);
 	if (rc)
 		goto err;
@@ -1856,12 +1872,17 @@ static unsigned int select_target_cpu(const struct cpumask *dest)
 static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest,
 			    bool force)
 {
+	struct irq_info *info = info_for_irq(data->irq);
 	unsigned int tcpu = select_target_cpu(dest);
 	int ret;
 
-	ret = xen_rebind_evtchn_to_cpu(info_for_irq(data->irq), tcpu);
-	if (!ret)
+	ret = xen_rebind_evtchn_to_cpu(info, tcpu);
+	if (!ret) {
 		irq_data_update_effective_affinity(data, cpumask_of(tcpu));
+	} else {
+		if (info)
+			info->suggested_cpu = tcpu;
+	}
 
 	return ret;
 }


-- 
Regards,
Pratyush Yadav



Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879
