[PATCH RFC v2 2/2] pidfs: use maple tree
Marek Szyprowski
m.szyprowski at samsung.com
Fri Dec 13 02:35:48 PST 2024
On 09.12.2024 14:46, Christian Brauner wrote:
> So far we've been using an idr to track pidfs inodes. For some time now
> each struct pid has a unique 64bit value that is used as the inode
> number on 64 bit. That unique inode couldn't be used for looking up a
> specific struct pid though.
>
> Now that we support file handles we need this ability while avoiding to
> leak actual pid identifiers into userspace which can be problematic in
> containers.
>
> So far I had used an idr-based mechanism where the idr is used to
> generate a 32 bit number and each time it wraps we increment an upper
> bit value and generate a unique 64 bit value. The lower 32 bits are used
> to lookup the pid.
>
> I've been looking at the maple tree because it now has
> mas_alloc_cyclic(). Since it uses unsigned long it would simplify the
> 64bit implementation and its dense node mode supposedly also helps to
> mitigate fragmentation.
>
> Signed-off-by: Christian Brauner <brauner at kernel.org>
This patch landed in today's linux-next as commit a2c8e88a30f7 ("pidfs:
use maple tree"). In my tests I found that it triggers the following
lockdep warning, what probably means that something has not been
properly initialized:
================================
WARNING: inconsistent lock state
6.13.0-rc1-00015-ga2c8e88a30f7 #15489 Not tainted
--------------------------------
inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
swapper/0/0 [HC1[1]:SC0[0]:HE0:SE1] takes:
c25d4d10 (&sighand->siglock){?.+.}-{3:3}, at: __lock_task_sighand+0x80/0x1bc
{HARDIRQ-ON-W} state was registered at:
lockdep_hardirqs_on_prepare+0x1a4/0x2c0
trace_hardirqs_on+0x94/0xa0
_raw_spin_unlock_irq+0x20/0x50
mtree_erase+0x88/0xbc
free_pid+0xc/0xd4
release_task+0x630/0x7a8
do_exit+0x6b8/0xa64
call_usermodehelper_exec_async+0x120/0x144
ret_from_fork+0x14/0x28
irq event stamp: 1017892
hardirqs last enabled at (1017891): [<c0c8e510>]
default_idle_call+0x18/0x13c
hardirqs last disabled at (1017892): [<c0100b94>] __irq_svc+0x54/0xd0
softirqs last enabled at (1017868): [<c013b410>]
handle_softirqs+0x328/0x520
softirqs last disabled at (1017835): [<c013b7b4>] __irq_exit_rcu+0x144/0x1f0
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0
----
lock(&sighand->siglock);
<Interrupt>
lock(&sighand->siglock);
*** DEADLOCK ***
2 locks held by swapper/0/0:
#0: c137b4d0 (rcu_read_lock){....}-{1:3}, at:
kill_pid_info_type+0x2c/0x168
#1: c137b4d0 (rcu_read_lock){....}-{1:3}, at:
__lock_task_sighand+0x0/0x1bc
stack backtrace:
CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted
6.13.0-rc1-00015-ga2c8e88a30f7 #15489
Hardware name: Samsung Exynos (Flattened Device Tree)
Call trace:
unwind_backtrace from show_stack+0x10/0x14
show_stack from dump_stack_lvl+0x68/0x88
dump_stack_lvl from print_usage_bug.part.0+0x24c/0x270
print_usage_bug.part.0 from mark_lock.part.0+0xc20/0x129c
mark_lock.part.0 from __lock_acquire+0xae8/0x2a00
__lock_acquire from lock_acquire+0x134/0x384
lock_acquire from _raw_spin_lock_irqsave+0x4c/0x68
_raw_spin_lock_irqsave from __lock_task_sighand+0x80/0x1bc
__lock_task_sighand from group_send_sig_info+0x120/0x1b4
group_send_sig_info from kill_pid_info_type+0x8c/0x168
kill_pid_info_type from it_real_fn+0x5c/0x120
it_real_fn from __hrtimer_run_queues+0xcc/0x538
__hrtimer_run_queues from hrtimer_interrupt+0x128/0x2c4
hrtimer_interrupt from exynos4_mct_tick_isr+0x44/0x7c
exynos4_mct_tick_isr from handle_percpu_devid_irq+0x84/0x158
handle_percpu_devid_irq from generic_handle_domain_irq+0x24/0x34
generic_handle_domain_irq from gic_handle_irq+0x88/0xa8
gic_handle_irq from generic_handle_arch_irq+0x34/0x44
generic_handle_arch_irq from __irq_svc+0x8c/0xd0
Exception stack(0xc1301f20 to 0xc1301f68)
...
__irq_svc from default_idle_call+0x1c/0x13c
default_idle_call from do_idle+0x23c/0x2ac
do_idle from cpu_startup_entry+0x28/0x2c
cpu_startup_entry from kernel_init+0x0/0x12c
--->8---
> ---
> fs/pidfs.c | 52 +++++++++++++++++++++++++++++++---------------------
> kernel/pid.c | 34 +++++++++++++++++-----------------
> 2 files changed, 48 insertions(+), 38 deletions(-)
>
> diff --git a/fs/pidfs.c b/fs/pidfs.c
> index 7a1d606b09d7b315e780c81fc7341f4ec82f2639..4a622f906fc210d5e81ba2ac856cfe0ca930f219 100644
> --- a/fs/pidfs.c
> +++ b/fs/pidfs.c
> @@ -19,14 +19,15 @@
> #include <linux/ipc_namespace.h>
> #include <linux/time_namespace.h>
> #include <linux/utsname.h>
> +#include <linux/maple_tree.h>
> #include <net/net_namespace.h>
>
> #include "internal.h"
> #include "mount.h"
>
> -static DEFINE_IDR(pidfs_ino_idr);
> -
> -static u32 pidfs_ino_upper_32_bits = 0;
> +static struct maple_tree pidfs_ino_mtree = MTREE_INIT(pidfs_ino_mtree,
> + MT_FLAGS_ALLOC_RANGE |
> + MT_FLAGS_LOCK_IRQ);
>
> #if BITS_PER_LONG == 32
> /*
> @@ -34,8 +35,6 @@ static u32 pidfs_ino_upper_32_bits = 0;
> * the higher 32 bits are the generation number. The starting
> * value for the inode number and the generation number is one.
> */
> -static u32 pidfs_ino_lower_32_bits = 1;
> -
> static inline unsigned long pidfs_ino(u64 ino)
> {
> return lower_32_bits(ino);
> @@ -49,8 +48,6 @@ static inline u32 pidfs_gen(u64 ino)
>
> #else
>
> -static u32 pidfs_ino_lower_32_bits = 0;
> -
> /* On 64 bit simply return ino. */
> static inline unsigned long pidfs_ino(u64 ino)
> {
> @@ -71,30 +68,43 @@ static inline u32 pidfs_gen(u64 ino)
> */
> int pidfs_add_pid(struct pid *pid)
> {
> - u32 upper;
> - int lower;
> + static unsigned long lower_next = 0;
> + static u32 pidfs_ino_upper_32_bits = 0;
> + unsigned long lower;
> + int ret;
> + MA_STATE(mas, &pidfs_ino_mtree, 0, 0);
>
> /*
> * Inode numbering for pidfs start at 2. This avoids collisions
> * with the root inode which is 1 for pseudo filesystems.
> */
> - lower = idr_alloc_cyclic(&pidfs_ino_idr, pid, 2, 0, GFP_ATOMIC);
> - if (lower >= 0 && lower < pidfs_ino_lower_32_bits)
> - pidfs_ino_upper_32_bits++;
> - upper = pidfs_ino_upper_32_bits;
> - pidfs_ino_lower_32_bits = lower;
> - if (lower < 0)
> - return lower;
> -
> - pid->ino = ((u64)upper << 32) | lower;
> + mtree_lock(&pidfs_ino_mtree);
> + ret = mas_alloc_cyclic(&mas, &lower, pid, 2, ULONG_MAX, &lower_next,
> + GFP_KERNEL);
> + if (ret < 0)
> + goto out_unlock;
> +
> +#if BITS_PER_LONG == 32
> + if (ret == 1) {
> + u32 higher;
> +
> + if (check_add_overflow(pidfs_ino_upper_32_bits, 1, &higher))
> + goto out_unlock;
> + pidfs_ino_upper_32_bits = higher;
> + }
> +#endif
> + pid->ino = ((u64)pidfs_ino_upper_32_bits << 32) | lower;
> pid->stashed = NULL;
> - return 0;
> +
> +out_unlock:
> + mtree_unlock(&pidfs_ino_mtree);
> + return ret;
> }
>
> /* The idr number to remove is the lower 32 bits of the inode. */
> void pidfs_remove_pid(struct pid *pid)
> {
> - idr_remove(&pidfs_ino_idr, lower_32_bits(pid->ino));
> + mtree_erase(&pidfs_ino_mtree, pidfs_ino(pid->ino));
> }
>
> #ifdef CONFIG_PROC_FS
> @@ -522,7 +532,7 @@ static struct pid *pidfs_ino_get_pid(u64 ino)
>
> guard(rcu)();
>
> - pid = idr_find(&pidfs_ino_idr, lower_32_bits(pid_ino));
> + pid = mtree_load(&pidfs_ino_mtree, pid_ino);
> if (!pid)
> return NULL;
>
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 6131543e7c090c164a2bac014f8eeee61926b13d..28100bbac4c130e192abf65d88cca9d330971c5c 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -131,6 +131,8 @@ void free_pid(struct pid *pid)
> int i;
> unsigned long flags;
>
> + pidfs_remove_pid(pid);
> +
> spin_lock_irqsave(&pidmap_lock, flags);
> for (i = 0; i <= pid->level; i++) {
> struct upid *upid = pid->numbers + i;
> @@ -152,7 +154,6 @@ void free_pid(struct pid *pid)
> }
>
> idr_remove(&ns->idr, upid->nr);
> - pidfs_remove_pid(pid);
> }
> spin_unlock_irqrestore(&pidmap_lock, flags);
>
> @@ -249,16 +250,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
> tmp = tmp->parent;
> }
>
> - /*
> - * ENOMEM is not the most obvious choice especially for the case
> - * where the child subreaper has already exited and the pid
> - * namespace denies the creation of any new processes. But ENOMEM
> - * is what we have exposed to userspace for a long time and it is
> - * documented behavior for pid namespaces. So we can't easily
> - * change it even if there were an error code better suited.
> - */
> - retval = -ENOMEM;
> -
> get_pid_ns(ns);
> refcount_set(&pid->count, 1);
> spin_lock_init(&pid->lock);
> @@ -269,12 +260,23 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
> INIT_HLIST_HEAD(&pid->inodes);
>
> upid = pid->numbers + ns->level;
> - idr_preload(GFP_KERNEL);
> - spin_lock_irq(&pidmap_lock);
> - if (!(ns->pid_allocated & PIDNS_ADDING))
> - goto out_unlock;
> +
> retval = pidfs_add_pid(pid);
> if (retval)
> + goto out_free;
> +
> + /*
> + * ENOMEM is not the most obvious choice especially for the case
> + * where the child subreaper has already exited and the pid
> + * namespace denies the creation of any new processes. But ENOMEM
> + * is what we have exposed to userspace for a long time and it is
> + * documented behavior for pid namespaces. So we can't easily
> + * change it even if there were an error code better suited.
> + */
> + retval = -ENOMEM;
> +
> + spin_lock_irq(&pidmap_lock);
> + if (!(ns->pid_allocated & PIDNS_ADDING))
> goto out_unlock;
> for ( ; upid >= pid->numbers; --upid) {
> /* Make the PID visible to find_pid_ns. */
> @@ -282,13 +284,11 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
> upid->ns->pid_allocated++;
> }
> spin_unlock_irq(&pidmap_lock);
> - idr_preload_end();
>
> return pid;
>
> out_unlock:
> spin_unlock_irq(&pidmap_lock);
> - idr_preload_end();
> put_pid_ns(ns);
>
> out_free:
>
Best regards
--
Marek Szyprowski, PhD
Samsung R&D Institute Poland
More information about the maple-tree
mailing list