dax alignment problem on arm64 (and other architectures)

Joao Martins joao.m.martins at oracle.com
Fri Jan 29 09:50:34 EST 2021


Hey Pavel,

On 1/29/21 1:50 PM, Pavel Tatashin wrote:
>> Since we last talked about this the enabling for EFI "Special Purpose"
>> / Soft Reserved Memory has gone upstream and instantiates device-dax
>> instances for address ranges marked with EFI_MEMORY_SP attribute.
>> Critically this way of declaring device-dax removes the consideration
>> of it as persistent memory and as such no metadata reservation. So, if
>> you are willing to maintain the metadata external to the device (which
>> seems reasonable for your environment) and have your platform firmware
>> / kernel command line mark it as EFI_CONVENTIONAL_MEMORY +
>> EFI_MEMORY_SP, then these reserve-free dax-devices will surface.
> 
> Hi Dan,
> 
> This is cool. Does it allow conversion between devdax and fsdax so DAX
> aware filesystem can be installed and data can be put there to be
> preserved across the reboot?
> 

FWIW, wrt the 'preserved across kexec' part, you are going to need
something conceptually similar to the snippet below the scissors mark.
Alternatively, we could fix kexec userspace to add conventional memory
ranges (without the SP attribute part) when it sees a Soft-Reserved region.
But can't tell which one is the right thing to do.

At the moment, HMAT ranges (or those defined with efi_fake_mem=) aren't
preserved not because of anything special with HMAT, but simply because
the EFI memmap conventional ram ranges are not preserved (only runtime
services). And HMAT/efi_fake_mem expect these to be based on the EFI memmap.

---------------->8------------------

From: Joao Martins <joao.m.martins at oracle.com>
Subject: x86/efi: add Conventional Memory ranges to runtime-map

Through EFI/HMAT certain ranges are marked with Specific Purpose
EFI attribute (EFI_MEMORY_SP). These ranges are usually
specified in a memory descriptor of type Conventional Memory.

We only ever expose regions to the runtime-map that were marked
with efi_mem_reserve(). Currently these comprise the Runtime
Data/Code and Boot data. Everything else gets lost, so on a kexec
boot, if we had an HMAT (or efi_fake_mem= marked regions), the second
(kexec'ed) kernel will lose this information and expose this memory
as regular RAM.

To address that, let's add the Conventional Memory ranges from the
firmware EFI memory map to the runtime. kexec then picks these up
on kexec load. Specifically, we save the fw memmap first, and append its
Conventional Memory ranges when we enter EFI virtual mode, which on x86
is the last point where we filter the EFI memmap to construct one with
only runtime services.

Signed-off-by: Joao Martins <joao.m.martins at oracle.com>
---
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 8a26e705cb06..c244da8b185d 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -663,6 +663,53 @@ static bool should_map_region(efi_memory_desc_t *md)
 	return false;
 }

+/*
+ * Append the EFI_CONVENTIONAL_MEMORY descriptors saved in efi_fw_memmap to
+ * the memmap at *map, growing it with realloc_pages() when @left bytes run
+ * out. *map, *count and *pg_shift are updated in place; *map is written back
+ * after every reallocation (NULL if one fails) so the caller never keeps a
+ * stale buffer pointer.
+ */
+static void __init efi_fw_memmap_restore(void **map, int left,
+					 int *count, int *pg_shift)
+{
+	struct efi_memory_map_data *data = &efi_fw_memmap;
+	unsigned long desc_size = data->desc_size;
+	efi_memory_desc_t *md;
+	void *fw_memmap;
+	int i, nr_map;
+
+	if (!data->phys_map)
+		return;
+
+	/* Map the saved firmware memmap so its descriptors can be read. */
+	fw_memmap = early_memremap(data->phys_map, data->size);
+	if (!fw_memmap)
+		return;
+
+	nr_map = data->size / desc_size;
+	for (i = 0; i < nr_map; i++) {
+		md = efi_early_memdesc_ptr(fw_memmap, desc_size, i);
+		if (md->type != EFI_CONVENTIONAL_MEMORY)
+			continue;
+
+		if (left < desc_size) {
+			*map = realloc_pages(*map, *pg_shift);
+			if (!*map)
+				break;
+			left += PAGE_SIZE << *pg_shift;
+			(*pg_shift)++;
+		}
+
+		memcpy(*map + (*count * desc_size), md, desc_size);
+		left -= desc_size;
+		(*count)++;
+	}
+
+	early_memunmap(fw_memmap, data->size);
+}
+
+
 /*
  * Map the efi memory ranges of the runtime services and update new_mmap with
  * virtual addresses.
@@ -700,6 +747,8 @@ static void * __init efi_map_regions(int *count, int *pg_shift)
 		(*count)++;
 	}

+	efi_fw_memmap_restore(&new_memmap, left, count, pg_shift);
+
 	return new_memmap;
 }

diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c
index 6e0f34a38171..5fd075503764 100644
--- a/drivers/firmware/efi/fake_mem.c
+++ b/drivers/firmware/efi/fake_mem.c
@@ -19,9 +19,30 @@
 #include <linux/sort.h>
 #include "fake_mem.h"

+struct efi_memory_map_data efi_fw_memmap;
 struct efi_mem_range efi_fake_mems[EFI_MAX_FAKEMEM];
 int nr_fake_mem;

+/* Stash a memmap copy (via efi_runtime_map_copy()) in efi_fw_memmap. */
+static void __init efi_fw_memmap_save(void)
+{
+	struct efi_memory_map_data *data = &efi_fw_memmap;
+	void *new_memmap;
+
+	if (efi_memmap_alloc(efi.memmap.nr_map, data) != 0)
+		return;
+
+	new_memmap = early_memremap(data->phys_map, data->size);
+	if (!new_memmap) {
+		__efi_memmap_free(data->phys_map, data->size, data->flags);
+		/* Forget the freed buffer so efi_fw_memmap_restore() skips it. */
+		data->phys_map = 0;
+		return;
+	}
+	efi_runtime_map_copy(new_memmap, data->size);
+	early_memunmap(new_memmap, data->size);
+}
+
 static int __init cmp_fake_mem(const void *x1, const void *x2)
 {
 	const struct efi_mem_range *m1 = x1;
@@ -68,7 +89,12 @@ void __init efi_fake_memmap(void)
 {
 	int i;

-	if (!efi_enabled(EFI_MEMMAP) || !nr_fake_mem)
+	if (!efi_enabled(EFI_MEMMAP))
+		return;
+
+	efi_fw_memmap_save();
+
+	if (!nr_fake_mem)
 		return;

 	for (i = 0; i < nr_fake_mem; i++)
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 8710f5710c1d..72803b1a7a39 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1280,4 +1280,6 @@ static inline struct efi_mokvar_table_entry *efi_mokvar_entry_find(
 }
 #endif

+extern struct efi_memory_map_data efi_fw_memmap;
+
 #endif /* _LINUX_EFI_H */



More information about the linux-arm-kernel mailing list