[PATCH 1/2] mm: introduce pagetable_alloc_nolock()
Yeoreum Yun
yeoreum.yun at arm.com
Fri Dec 12 08:18:31 PST 2025
Some architectures invoke pagetable_alloc() with preemption disabled
(e.g., arm64’s linear_map_split_to_ptes()).
Under PREEMPT_RT, calling pagetable_alloc() with
preemption disabled is not allowed, because it may acquire
a spin lock that becomes sleepable on RT, potentially
causing a sleep during page allocation.
To address this, introduce a pagetable_alloc_nolock() API and
permit two additional GFP flags for alloc_pages_nolock() — __GFP_HIGH and __GFP_ZERO.
Signed-off-by: Yeoreum Yun <yeoreum.yun at arm.com>
---
include/linux/mm.h | 18 ++++++++++++++++++
kernel/bpf/stream.c | 2 +-
kernel/bpf/syscall.c | 2 +-
mm/page_alloc.c | 10 +++-------
4 files changed, 23 insertions(+), 9 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7c79b3369b82..11a27f60838b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2990,6 +2990,24 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde
}
#define pagetable_alloc(...) alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__))
+/**
+ * pagetable_alloc_nolock - opportunistic reentrant page table allocation
+ * from any context
+ * @gfp: GFP flags. Only __GFP_ZERO, __GFP_HIGH, __GFP_ACCOUNT allowed.
+ * @order: desired pagetable order
+ *
+ * opportunistic reentrant version of pagetable_alloc().
+ *
+ * Return: The ptdesc describing the allocated page tables.
+ */
+static inline struct ptdesc *pagetable_alloc_nolock_noprof(gfp_t gfp, unsigned int order)
+{
+ struct page *page = alloc_pages_nolock_noprof(gfp, NUMA_NO_NODE, order);
+
+ return page_ptdesc(page);
+}
+#define pagetable_alloc_nolock(...) alloc_hooks(pagetable_alloc_nolock_noprof(__VA_ARGS__))
+
/**
* pagetable_free - Free pagetables
* @pt: The page table descriptor
diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c
index ff16c631951b..3c80c8007d91 100644
--- a/kernel/bpf/stream.c
+++ b/kernel/bpf/stream.c
@@ -83,7 +83,7 @@ static struct bpf_stream_page *bpf_stream_page_replace(void)
struct bpf_stream_page *stream_page, *old_stream_page;
struct page *page;
- page = alloc_pages_nolock(/* Don't account */ 0, NUMA_NO_NODE, 0);
+ page = alloc_pages_nolock(/* Don't account */ __GFP_ZERO, NUMA_NO_NODE, 0);
if (!page)
return NULL;
stream_page = page_address(page);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8a129746bd6c..cbc0f8d0c18b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -598,7 +598,7 @@ static bool can_alloc_pages(void)
static struct page *__bpf_alloc_page(int nid)
{
if (!can_alloc_pages())
- return alloc_pages_nolock(__GFP_ACCOUNT, nid, 0);
+ return alloc_pages_nolock(__GFP_ZERO | __GFP_ACCOUNT, nid, 0);
return alloc_pages_node(nid,
GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ed82ee55e66a..88a920dc1e9a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7542,21 +7542,17 @@ struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned
* various contexts. We cannot use printk_deferred_enter() to mitigate,
* since the running context is unknown.
*
- * Specify __GFP_ZERO to make sure that call to kmsan_alloc_page() below
- * is safe in any context. Also zeroing the page is mandatory for
- * BPF use cases.
- *
* Though __GFP_NOMEMALLOC is not checked in the code path below,
* specify it here to highlight that alloc_pages_nolock()
* doesn't want to deplete reserves.
*/
- gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC | __GFP_COMP
+ gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | __GFP_COMP
| gfp_flags;
unsigned int alloc_flags = ALLOC_TRYLOCK;
struct alloc_context ac = { };
struct page *page;
- VM_WARN_ON_ONCE(gfp_flags & ~__GFP_ACCOUNT);
+ VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_HIGH | __GFP_ZERO | __GFP_ACCOUNT));
/*
* In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is
* unsafe in NMI. If spin_trylock() is called from hard IRQ the current
@@ -7602,7 +7598,7 @@ struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned
}
/**
* alloc_pages_nolock - opportunistic reentrant allocation from any context
- * @gfp_flags: GFP flags. Only __GFP_ACCOUNT allowed.
+ * @gfp_flags: GFP flags. Only __GFP_ZERO, __GFP_HIGH, __GFP_ACCOUNT allowed.
* @nid: node to allocate from
* @order: allocation order size
*
--
LEVI:{C3F47F37-75D8-414A-A8BA-3980EC8A46D7}
More information about the linux-arm-kernel
mailing list