[PATCH v8 2/2] sched/fair: Scan cluster before scanning LLC in wake-up path
Yicong Yang
yangyicong at huawei.com
Tue Jun 13 01:09:17 PDT 2023
On 2023/6/13 15:36, Yicong Yang wrote:
> On 2023/6/9 18:50, Chen Yu wrote:
>> On 2023-06-08 at 14:45:54 +0800, Yicong Yang wrote:
>>> On 2023/6/8 11:26, Chen Yu wrote:
>>>> On 2023-05-31 at 16:21:00 +0800, Yicong Yang wrote:
>>>>> On 2023/5/30 22:39, Yicong Yang wrote:
>>>>>> On 2023/5/30 19:55, Peter Zijlstra wrote:
>>>>>>> On Tue, May 30, 2023 at 03:02:53PM +0800, Yicong Yang wrote:
>>>>>>>
>>>>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>>>>> index 373ff5f55884..b8c129ed8b47 100644
>>>>>>>> --- a/kernel/sched/fair.c
>>>>>>>> +++ b/kernel/sched/fair.c
>>>>>>>> @@ -6994,6 +6994,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>>>>>>>> }
>>>>>>>> }
>>>>>>>>
>>>>>>>> + if (static_branch_unlikely(&sched_cluster_active)) {
>>>>>>>> + struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target));
>>>>>>>> +
>>>>>>>> + if (sdc) {
>>>>>>>> + for_each_cpu_wrap(cpu, sched_domain_span(sdc), target + 1) {
>>>>>>>> + if (!cpumask_test_cpu(cpu, cpus))
>>>>>>>> + continue;
>>>>>>>> +
>>>>>>>> + if (has_idle_core) {
>>>>>>>> + i = select_idle_core(p, cpu, cpus, &idle_cpu);
>>>>>>>> + if ((unsigned int)i < nr_cpumask_bits)
>>>>>>>> + return i;
>>>>>>>> + } else {
>>>>>>>> + if (--nr <= 0)
>>>>>>>> + return -1;
>>>>>>>> + idle_cpu = __select_idle_cpu(cpu, p);
>>>>>>>> + if ((unsigned int)idle_cpu < nr_cpumask_bits)
>>>>>>>> + return idle_cpu;
>>>>>>>> + }
>>>>>>>> + }
>>>>>>>> + cpumask_andnot(cpus, cpus, sched_domain_span(sdc));
>>>>>>>> + }
>>>>>>>> + }
>>>>>>>
>>>>>>> Would not this:
>>>>>>>
>>>>>>> --- a/kernel/sched/fair.c
>>>>>>> +++ b/kernel/sched/fair.c
>>>>>>> @@ -6994,6 +6994,29 @@ static int select_idle_cpu(struct task_s
>>>>>>> }
>>>>>>> }
>>>>>>>
>>>>>>> + if (static_branch_unlikely(&sched_cluster_active)) {
>>>>>>> + struct sched_group *sg = sd->groups;
>>>>>>> + if (sg->flags & SD_CLUSTER) {
>>>>>>> + for_each_cpu_wrap(cpu, sched_group_span(sg), target+1) {
>>>>>>> + if (!cpumask_test_cpu(cpu, cpus))
>>>>>>> + continue;
>>>>>>> +
>>>>>>> + if (has_idle_core) {
>>>>>>> + i = select_idle_core(p, cpu, cpus, &idle_cpu);
>>>>>>> + if ((unsigned)i < nr_cpumask_bits)
>>>>>>> + return i;
>>>>>>> + } else {
>>>>>>> + if (--nr <= 0)
>>>>>>> + return -1;
>>>>>>> + idle_cpu = __select_idle_cpu(cpu, p);
>>>>>>> + if ((unsigned)idle_cpu < nr_cpumask_bits)
>>>>>>> + return idle_cpu;
>>>>>>> + }
>>>>>>> + }
>>>>>>> + cpumask_andnot(cpus, cpus, sched_group_span(sg));
>>>>>>> + }
>>>>>>> + }
>>>>>>> +
>>>>>>> for_each_cpu_wrap(cpu, cpus, target + 1) {
>>>>>>> if (has_idle_core) {
>>>>>>> i = select_idle_core(p, cpu, cpus, &idle_cpu);
>>>>>>>
>>>>>>> also work? Then we can avoid the extra sd_cluster per-cpu variable.
>>>>>>>
>>>>>>
>>>>>> I thought it would be fine since sg->flags is derived from the child domain, but in practice it doesn't hold.
>>>>>> I tested on a 2P Skylake server with no clusters, adding some debug messages to see how sg->flags appears:
>>>>>>
>>>>>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>>>>>> index 69968ed9ffb9..5c443b74abf5 100644
>>>>>> --- a/kernel/sched/topology.c
>>>>>> +++ b/kernel/sched/topology.c
>>>>>> @@ -90,8 +90,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
>>>>>>
>>>>>> cpumask_or(groupmask, groupmask, sched_group_span(group));
>>>>>>
>>>>>> - printk(KERN_CONT " %d:{ span=%*pbl",
>>>>>> - group->sgc->id,
>>>>>> + printk(KERN_CONT " %d:{ cluster: %s span=%*pbl",
>>>>>> + group->sgc->id, group->flags & SD_CLUSTER ? "true" : "false",
>>>>>> cpumask_pr_args(sched_group_span(group)));
>>>>>>
>>>>>> if ((sd->flags & SD_OVERLAP) &&
>>>>>>
>>>>>> Unfortunately the result doesn't match what I expected: the MC domain's sg->flags is still marked
>>>>>> as cluster:
>>>>>>
>>>>>> [ 8.886099] CPU0 attaching sched-domain(s):
>>>>>> [ 8.889539] domain-0: span=0,40 level=SMT
>>>>>> [ 8.893538] groups: 0:{ cluster: false span=0 }, 40:{ cluster: false span=40 }
>>>>>> [ 8.897538] domain-1: span=0-19,40-59 level=MC
>>>>>> [ 8.901538] groups: 0:{ cluster: true span=0,40 cap=2048 }, 1:{ cluster: true span=1,41 cap=2048 }, 2:{ cluster: true span=2,42 cap=2048 }, 3:{ cluster: true span=3,43 cap=2048 }, 4:{ cluster: true span=4,44 cap=2048 }, 5:{ cluster: true span=5,45 cap=2048 }, 6:{ cluster: true span=6,46 cap=2048 }, 7:{ cluster: true span=7,47 cap=2048 }, 8:{ cluster: true span=8,48 cap=2048 }, 9:{ cluster: true span=9,49 cap=2048 }, 10:{ cluster: true span=10,50 cap=2048 }, 11:{ cluster: true span=11,51 cap=2048 }, 12:{ cluster: true span=12,52 cap=2048 }, 13:{ cluster: true span=13,53 cap=2048 }, 14:{ cluster: true span=14,54 cap=2048 }, 15:{ cluster: true span=15,55 cap=2048 }, 16:{ cluster: true span=16,56 cap=2048 }, 17:{ cluster: true span=17,57 cap=2048 }, 18:{ cluster: true span=18,58 cap=2048 }, 19:{ cluster: true span=19,59 cap=2048 }
>>>>>> [ 8.905538] domain-2: span=0-79 level=NUMA
>>>>>> [ 8.909538] groups: 0:{ cluster: false span=0-19,40-59 cap=40960 }, 20:{ cluster: false span=20-39,60-79 cap=40960 }
>>>>>>
>>>>>> I assume we didn't handle sg->flags correctly on domain degeneration. A quick check of the code
>>>>>> suggests we already set sg->flags = 0 on degeneration, so I need to find where it goes wrong.
>>>>>>
>>>>>
>>>>> Currently we only update the groups' flags to 0 for the final lowest domain, in [1]. The upper
>>>>> domains' groups won't be updated if degeneration happens. So we cannot use the suggested approach
>>>>> for cluster scanning, and the sd_cluster per-cpu variable is still needed.
>>>>>
>>>>> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/sched/topology.c?h=v6.4-rc4#n749
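>>>>>
>>>>> For reference, the clearing at [1] only happens when the final base domain itself degenerates,
>>>>> roughly like this (a paraphrase of that code, not the verbatim source):
>>>>>
>>>>> 	if (sd && sd_degenerate(sd)) {
>>>>> 		tmp = sd;
>>>>> 		sd = sd->parent;
>>>>> 		destroy_sched_domain(tmp);
>>>>> 		if (sd) {
>>>>> 			struct sched_group *sg = sd->groups;
>>>>>
>>>>> 			/* groups mirror the destroyed child's flags, so clear them */
>>>>> 			do {
>>>>> 				sg->flags = 0;
>>>>> 				sg = sg->next;
>>>>> 			} while (sg != sd->groups);
>>>>>
>>>>> 			sd->child = NULL;
>>>>> 		}
>>>>> 	}
>>>>>
>>>>> When an intermediate parent degenerates we only splice the domain pointers, so the grandparent's
>>>>> groups keep the stale flags, which is what the MC dump above shows.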
>>>>>
>>>>> Thanks.
>>>>>
>>>>>
>>>> Is this an issue? Suppose sched domain A's parent domain
>>>> is B, and B's parent sched domain is C. When B degenerates, C's child domain
>>>> pointer is adjusted to A. However, the code currently does not adjust C's
>>>> sched groups' flags. Should we adjust C's sched groups' flags to be the same
>>>> as A's to keep them consistent?
>>>
>>> It depends on whether we're going to use it. Currently only asym_smt_can_pull_tasks() uses
>>> it, and only for the SMT level, so I think updating the lowest domain's group flags works. For
>>> correctness, every domain's group flags should derive from its real child. I tried to solve
>>> this at group-building time but it seems hard to do: at that point we don't know whether a
>>> domain is going to degenerate or not, since the groups are not built yet.
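>>>
>>> (For context: the group flags are copied from the child domain when the groups are built,
>>> roughly like this abridged excerpt from get_group() in kernel/sched/topology.c:)
>>>
>>> 	if (child) {
>>> 		cpumask_copy(sched_group_span(sg), sched_domain_span(child));
>>> 		cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
>>> 		sg->flags = child->flags;	/* group mirrors its child domain */
>>> 	}
>>>
>>> So once that child degenerates, the flags cached in the group go stale.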
>>>
>>>>
>>>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>>>> index 6198fa135176..fe3fd70f2313 100644
>>>> --- a/kernel/sched/topology.c
>>>> +++ b/kernel/sched/topology.c
>>>> @@ -713,14 +713,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
>>>>
>>>> /* Remove the sched domains which do not contribute to scheduling. */
>>>> for (tmp = sd; tmp; ) {
>>>> - struct sched_domain *parent = tmp->parent;
>>>> + struct sched_domain *parent = tmp->parent, *pparent;
>>>> if (!parent)
>>>> break;
>>>>
>>>> if (sd_parent_degenerate(tmp, parent)) {
>>>> - tmp->parent = parent->parent;
>>>> - if (parent->parent)
>>>> - parent->parent->child = tmp;
>>>> + pparent = parent->parent;
>>>> + tmp->parent = pparent;
>>>> /*
>>>> * Transfer SD_PREFER_SIBLING down in case of a
>>>> * degenerate parent; the spans match for this
>>>> @@ -728,6 +727,18 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
>>>> */
>>>> if (parent->flags & SD_PREFER_SIBLING)
>>>> tmp->flags |= SD_PREFER_SIBLING;
>>>> +
>>>> + if (pparent) {
>>>> + struct sched_group *sg = pparent->groups;
>>>> +
>>>> + do {
>>>> + sg->flags = tmp->flags;
>>>
>>> This may need testing on some heterogeneous platforms. Does it always hold that the child domain of a CPU
>>> from a remote group has the same flags as @tmp?
>>>
>> Good question. For heterogeneous platforms, sched groups within the same domain might not always
>> have the same flags: if group1 and group2 sit in big-core and small-core child domains respectively,
>> they could have different balance flags in theory. Maybe updating only the local group's flags is accurate.
>>
>> I found Tim has proposed a fix for a similar scenario; it is for SD_SHARE_CPUCAPACITY and
>> should be in tip:
>> https://lore.kernel.org/lkml/168372654916.404.6677242284447941021.tip-bot2@tip-bot2/
>> We could adjust SD_CLUSTER based on his change, or we could
>> replace groups->flags with tmp->flags directly, in case we have other flags to
>> adjust in the future.
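>>
>> Roughly, the latter would look like this (a sketch based on Tim's change, from memory,
>> not a verbatim diff):
>>
>> 	if (sd_parent_degenerate(tmp, parent)) {
>> 		tmp->parent = parent->parent;
>>
>> 		if (parent->parent) {
>> 			parent->parent->child = tmp;
>> 			/*
>> 			 * Copy all of the surviving child's flags into the
>> 			 * grandparent's local group rather than just
>> 			 * SD_SHARE_CPUCAPACITY, so SD_CLUSTER (and any future
>> 			 * flag) is covered as well.
>> 			 */
>> 			parent->parent->groups->flags = tmp->flags;
>> 		}
>> 		/* ... rest unchanged ... */
>> 	}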
>>
>
> Thanks for the reference. I think we can handle SD_CLUSTER in the same way, and updating only the
> local group's flags should satisfy our needs. I also tried using the correct child domains to build the
> sched groups so that all the groups get the correct flags, but it's a bit more complex.
>
Something like the below: detect the sched domain degeneration first and rebuild the groups if
necessary. It works on a non-cluster machine emulated with QEMU:
qemu-system-aarch64 \
-kernel ${Image} \
-smp cores=16,threads=2 \
-cpu host --enable-kvm \
-m 32G \
-object memory-backend-ram,id=node0,size=8G \
-object memory-backend-ram,id=node1,size=8G \
-object memory-backend-ram,id=node2,size=8G \
-object memory-backend-ram,id=node3,size=8G \
-numa node,memdev=node0,cpus=0-7,nodeid=0 \
-numa node,memdev=node1,cpus=8-15,nodeid=1 \
-numa node,memdev=node2,cpus=16-23,nodeid=2 \
-numa node,memdev=node3,cpus=24-31,nodeid=3 \
-numa dist,src=0,dst=1,val=12 \
-numa dist,src=0,dst=2,val=20 \
-numa dist,src=0,dst=3,val=22 \
-numa dist,src=1,dst=2,val=22 \
-numa dist,src=1,dst=3,val=24 \
-numa dist,src=2,dst=3,val=12 \
-machine virt,iommu=smmuv3 \
-net none -initrd ${Rootfs} \
-nographic -bios $(pwd)/QEMU_EFI.fd \
-append "rdinit=/init console=ttyAMA0 earlycon=pl011,0x9000000 sched_verbose schedstats=enable loglevel=8"
The flags look correct:
[ 4.379910] CPU0 attaching sched-domain(s):
[ 4.380540] domain-0: span=0-1 level=SMT
[ 4.381130] groups: 0:{ cluster: false span=0 cap=1023 }, 1:{ cluster: false span=1 }
[ 4.382353] domain-1: span=0-7 level=MC
[ 4.382950] groups: 0:{ cluster: false span=0-1 cap=2047 }, 2:{ cluster: false span=2-3 cap=2048 }, 4:{ cluster: false span=4-5 cap=2048 }, 6:{ cluster: false span=6-7 cap=2048 }
[ 4.385378] domain-2: span=0-15 level=NUMA
[ 4.386047] groups: 0:{ cluster: false span=0-7 cap=8191 }, 8:{ cluster: false span=8-15 cap=8192 }
[ 4.387542] domain-3: span=0-23 level=NUMA
[ 4.388197] groups: 0:{ cluster: false span=0-15 cap=16383 }, 16:{ cluster: false span=16-23 cap=8192 }
[ 4.389661] domain-4: span=0-31 level=NUMA
[ 4.390358] groups: 0:{ cluster: false span=0-23 mask=0-7 cap=24575 }, 24:{ cluster: false span=16-31 mask=24-31 cap=16384 }
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index c0d21667ddf3..d3bf5a1c85bc 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -99,6 +99,7 @@ struct sched_domain {
int nohz_idle; /* NOHZ IDLE status */
int flags; /* See SD_* */
int level;
+ int need_degeneration;
/* Runtime fields. */
unsigned long last_balance; /* init to jiffies. units in jiffies */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 69968ed9ffb9..561649ddcfe7 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -90,8 +90,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_or(groupmask, groupmask, sched_group_span(group));
- printk(KERN_CONT " %d:{ span=%*pbl",
- group->sgc->id,
+ printk(KERN_CONT " %d:{ cluster: %s span=%*pbl",
+ group->sgc->id, group->flags & SD_CLUSTER ? "true" : "false",
cpumask_pr_args(sched_group_span(group)));
if ((sd->flags & SD_OVERLAP) &&
@@ -623,6 +623,32 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
} while (sg != first);
}
+/*
+ * Reset the sched groups for a rebuild.
+ */
+static void reset_sched_domain_groups(struct sched_domain *sd)
+{
+ struct sched_group *sg, *first, *next;
+
+ if (!sd->groups)
+ return;
+
+ sg = first = sd->groups;
+ do {
+ next = sg->next;
+
+ if (sd->flags & SD_OVERLAP) {
+ if (atomic_dec_and_test(&sg->ref))
+ kfree(sg);
+ } else {
+ atomic_dec(&sg->ref);
+ }
+ atomic_dec(&sg->sgc->ref);
+
+ sg = next;
+ } while (sg != first);
+}
+
static void destroy_sched_domain(struct sched_domain *sd)
{
/*
@@ -717,6 +743,42 @@ static void update_top_cache_domain(int cpu)
rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
}
+static bool probe_sched_domain_degeneration(struct sched_domain *sd, int cpu)
+{
+ bool has_degeneration = false;
+ struct sched_domain *tmp = sd, *parent;
+
+ if (tmp) {
+ parent = tmp->parent;
+ while (parent) {
+ if (sd_parent_degenerate(tmp, parent)) {
+ parent->need_degeneration = 1;
+ has_degeneration = true;
+ parent = parent->parent;
+ } else {
+ tmp = parent;
+ parent = tmp->parent;
+ }
+ }
+ }
+
+ if (sd && sd_degenerate(sd)) {
+ sd->need_degeneration = 1;
+ has_degeneration = true;
+ }
+
+ /*
+ * On degeneration, reset the sched groups' refcounts since we need to
+ * rebuild them.
+ */
+ if (has_degeneration) {
+ for (tmp = sd; tmp; tmp = tmp->parent)
+ reset_sched_domain_groups(tmp);
+ }
+
+ return has_degeneration;
+}
+
/*
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
* hold the hotplug lock.
@@ -1008,9 +1070,9 @@ find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling)
* The proper descendant would be the one whose child won't span out
* of sd
*/
- while (sibling->child &&
+ while (sibling->child && (sibling->need_degeneration ||
!cpumask_subset(sched_domain_span(sibling->child),
- sched_domain_span(sd)))
+ sched_domain_span(sd))))
sibling = sibling->child;
/*
@@ -1018,9 +1080,9 @@ find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling)
* to go down to skip those sched_domains which don't contribute to
* scheduling because they will be degenerated in cpu_attach_domain
*/
- while (sibling->child &&
+ while (sibling->child && (sibling->need_degeneration ||
cpumask_equal(sched_domain_span(sibling->child),
- sched_domain_span(sibling)))
+ sched_domain_span(sibling))))
sibling = sibling->child;
return sibling;
@@ -1199,6 +1261,9 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
struct sched_group *sg;
bool already_visited;
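+ /* Skip domains that will degenerate so the group derives from the real child domain */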
+ while (child && child->need_degeneration)
+ child = child->child;
+
if (child)
cpu = cpumask_first(sched_domain_span(child));
@@ -2366,6 +2431,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
int i, ret = -ENOMEM;
bool has_asym = false;
bool has_cluster = false;
+ bool rebuild = true, need_rebuild = false;
if (WARN_ON(cpumask_empty(cpu_map)))
goto error;
@@ -2398,6 +2464,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
}
}
+rebuild_groups:
/* Build the groups for the domains */
for_each_cpu(i, cpu_map) {
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
@@ -2412,6 +2479,25 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
}
}
+ /*
+ * If some sched domains are going to be degenerated, the groups may not
+ * be built from the real child domains and thus may have incorrect flags or
+ * spans. Detect the degeneration here and rebuild the sched groups if
+ * necessary.
+ */
+ if (rebuild) {
+ rebuild = false;
+
+ for_each_cpu(i, cpu_map) {
+ sd = *per_cpu_ptr(d.sd, i);
+
+ need_rebuild |= probe_sched_domain_degeneration(sd, i);
+ }
+
+ if (need_rebuild)
+ goto rebuild_groups;
+ }
+
/*
* Calculate an allowed NUMA imbalance such that LLCs do not get
* imbalanced.