[PATCH v7 2/3] kho: fix deferred init of kho scratch
Mike Rapoport
rppt at kernel.org
Thu Mar 19 00:54:05 PDT 2026
Hi,
On Wed, Mar 18, 2026 at 01:36:07PM -0400, Zi Yan wrote:
> On 18 Mar 2026, at 13:19, Michał Cłapiński wrote:
> > On Wed, Mar 18, 2026 at 6:08 PM Zi Yan <ziy at nvidia.com> wrote:
> >>
> >> ## Call site analysis
> >>
> >> init_pageblock_migratetype() has nine call sites. The init call ordering
> >> relevant to scratch is:
> >>
> >> ```
> >> setup_arch()
> >> zone_sizes_init() -> free_area_init() -> memmap_init_range() [1]
Hmm, this is slightly outdated, but largely correct :)
> >>
> >> mm_init_free_all() / start_kernel():
> >> kho_memory_init() -> kho_release_scratch() [2]
> >> memblock_free_all()
> >> free_low_memory_core_early()
> >> memmap_init_reserved_pages()
> >> reserve_bootmem_region() -> __init_deferred_page()
> >> -> __init_page_from_nid() [3]
> >> deferred init kthreads -> __init_page_from_nid() [4]
And this is wrong: deferred init does not call __init_page_from_nid(); only
reserve_bootmem_region() does.
And there's a case Claude missed:
hugetlb_bootmem_free_invalid_page() -> __init_page_from_nid() that
shouldn't check for KHO. Well, at least until we have support for hugetlb
persistence and most probably even afterwards.
I don't think we should modify reserve_bootmem_region(). If there are
reserved pages in a pageblock, it does not matter if it's initialized to
MIGRATE_CMA. It only becomes important if the reserved pages are freed, so we
can update the pageblock migrate type in free_reserved_area().
When we boot with KHO, all memblock allocations come from scratch, so
anything freed in free_reserved_area() should become CMA again.
> >> ```
> >
> > I don't understand this. deferred_free_pages() doesn't call
> > __init_page_from_nid(). So I would clearly need to modify both
> > deferred_free_pages and __init_page_from_nid.
For deferred_free_pages() we don't need kho_scratch_overlap(); we already
have the memblock_region (almost) at hand, and it's enough to check whether
it's MEMBLOCK_KHO_SCRATCH.
Something along these lines (compile tested only) should do the trick:
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 3e217414e12d..b9b1e0991ec8 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -275,6 +275,8 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type,
__for_each_mem_range(i, &memblock.reserved, NULL, NUMA_NO_NODE, \
MEMBLOCK_NONE, p_start, p_end, NULL)
+struct memblock_region *memblock_region_from_iter(u64 iterator);
+
static inline bool memblock_is_hotpluggable(struct memblock_region *m)
{
return m->flags & MEMBLOCK_HOTPLUG;
diff --git a/mm/memblock.c b/mm/memblock.c
index ae6a5af46bd7..9cf99f32279f 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1359,6 +1359,16 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
*idx = ULLONG_MAX;
}
+__init_memblock struct memblock_region *memblock_region_from_iter(u64 iterator)
+{
+ int index = iterator & 0xffffffff;
+
+ if (index < 0 || index >= memblock.memory.cnt)
+ return NULL;
+
+ return &memblock.memory.regions[index];
+}
+
/*
* Common iterator interface used to define for_each_mem_pfn_range().
*/
diff --git a/mm/mm_init.c b/mm/mm_init.c
index cec7bb758bdd..96b25895ffbe 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1996,7 +1996,7 @@ unsigned long __init node_map_pfn_alignment(void)
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static void __init deferred_free_pages(unsigned long pfn,
- unsigned long nr_pages)
+ unsigned long nr_pages, enum migratetype mt)
{
struct page *page;
unsigned long i;
@@ -2009,8 +2009,7 @@ static void __init deferred_free_pages(unsigned long pfn,
/* Free a large naturally-aligned chunk if possible */
if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) {
for (i = 0; i < nr_pages; i += pageblock_nr_pages)
- init_pageblock_migratetype(page + i, MIGRATE_MOVABLE,
- false);
+ init_pageblock_migratetype(page + i, mt, false);
__free_pages_core(page, MAX_PAGE_ORDER, MEMINIT_EARLY);
return;
}
@@ -2020,8 +2019,7 @@ static void __init deferred_free_pages(unsigned long pfn,
for (i = 0; i < nr_pages; i++, page++, pfn++) {
if (pageblock_aligned(pfn))
- init_pageblock_migratetype(page, MIGRATE_MOVABLE,
- false);
+ init_pageblock_migratetype(page, mt, false);
__free_pages_core(page, 0, MEMINIT_EARLY);
}
}
@@ -2077,6 +2075,8 @@ deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
u64 i = 0;
for_each_free_mem_range(i, nid, 0, &start, &end, NULL) {
+ struct memblock_region *region = memblock_region_from_iter(i);
+ enum migratetype mt = MIGRATE_MOVABLE;
unsigned long spfn = PFN_UP(start);
unsigned long epfn = PFN_DOWN(end);
@@ -2086,12 +2086,15 @@ deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
spfn = max(spfn, start_pfn);
epfn = min(epfn, end_pfn);
+ if (memblock_is_kho_scratch(region))
+ mt = MIGRATE_CMA;
+
while (spfn < epfn) {
unsigned long mo_pfn = ALIGN(spfn + 1, MAX_ORDER_NR_PAGES);
unsigned long chunk_end = min(mo_pfn, epfn);
nr_pages += deferred_init_pages(zone, spfn, chunk_end);
- deferred_free_pages(spfn, chunk_end - spfn);
+ deferred_free_pages(spfn, chunk_end - spfn, mt);
spfn = chunk_end;
--
Sincerely yours,
Mike.
More information about the kexec
mailing list