RFC Block Layer Extensions to Support NV-DIMMs

Rob Gittins rob.gittins at linux.intel.com
Wed Sep 4 17:54:49 EDT 2013

Non-volatile DIMMs have started to become available.  A NVDIMMs is a
DIMM that does not lose data across power interruptions.  Some of the
NVDIMMs act like memory, while others are more like a block device
on the memory bus. Application uses vary from being used to cache
critical data, to being a boot device.

There are two access classes of NVDIMMs,  block mode and
“load/store” mode DIMMs which are referred to as Direct Memory

The block mode is where the DIMM provides IO ports for read or write
of data.  These DIMMs reside on the memory bus but do not appear in the
application address space.  Block mode DIMMs do not require any changes
to the current infrastructure, since they provide IO type of interface.

Direct Memory Mappable DIMMs (DMMD) appear in the system address space
and are accessed via load and store instructions.  These NVDIMMs
are part of the system physical address space (SPA) as memory with
the attribute that data survives a power interruption.  As such this
memory is managed by the kernel which can  assign virtual addresses and
mapped into application’s address space as well as being accessible
by the kernel.  The area mapped into the system address space is
being referred to as persistent memory (PMEM).

PMEM introduces the need for new operations in the
block_device_operations to support the specific characteristics of
the media.

First data may not propagate all the way through the memory pipeline
when store instructions are executed.  Data may stay in the CPU cache
or in other buffers in the processor and memory complex.  In order to
ensure the durability of data there needs to be a driver entry point
to force a byte range out to media.  The methods of doing this are
specific to the PMEM technology and need to be handled by the driver
that is supporting the DMMDs.  To provide a way to ensure that data is
durable adding a commit function to the block_device_operations vector.

   void (*commitpmem)(struct block_device *bdev, void *addr);

Another area requiring extension is the need to be able to clear PMEM
errors.  When data is fetched from errored PMEM it is marked with the
poison attribute.  If the CPU attempts to access the data it causes a
machine check.  How errors are cleared is hardware dependent and needs
to be handled by the specific device driver.  The following function
in the block_device_operations vector would clear the correct range
of PMEM and put new data there.  If the argument data is null or the
size is zero the driver is free to put any data in PMEM it wishes.

	void (*clearerrorpmem)(struct block_device *bdev, void *addr,
			size_t len, void *data);

Different applications, filesystem and drivers may wish to share
ranges of PMEM.  This is analogous to partitioning a disk that is
using multiple and different filesystems.   Since PMEM is addressed
on a byte basis rather than a block basis the existing partitioning
model does not fit well.  As a result there needs to be a way to
describe PMEM ranges.

	struct pmem_layout *(*getpmem)(struct block_device *bdev);

Proposed patch.

Documentation/filesystems/Locking |    6 ++++
fs/block_dev.c                    |   42
include/linux/blkdev.h            |    4 +++
include/linux/pmem.h              |   47
4 files changed, 99 insertions(+), 0 deletions(-)
create mode 100644 include/linux/pmem.h

diff --git a/Documentation/filesystems/Locking
index 0706d32..78910f4 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -386,6 +386,9 @@ prototypes:
	int (*revalidate_disk) (struct gendisk *);
	int (*getgeo)(struct block_device *, struct hd_geometry *);
	void (*swap_slot_free_notify) (struct block_device *, unsigned long);
+	struct pmem_layout *(*getpmem)(struct block_device *);
+	void (*commitpmem)(struct block_device *, void *);
+	void (*clearerrorpmem)(struct block_device *, void *, size_t, void *);
locking rules:
@@ -399,6 +402,9 @@ unlock_native_capacity:	no
revalidate_disk:	no
getgeo:			no
swap_slot_free_notify:	no	(see below)
+getpmem:		no
+commitpmem:		no
+clearerrorpmem:	no
media_changed, unlock_native_capacity and revalidate_disk are called
diff --git a/fs/block_dev.c b/fs/block_dev.c
index aae187a..a57863c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -27,6 +27,7 @@
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
+#include <linux/pmem.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -1716,3 +1717,44 @@ void iterate_bdevs(void (*func)(struct
*, void *), void *arg)
+ * get_pmemgeo() - Return persistent memory geometry information
+ * @bdev:  device to interrogate
+ *
+ * Provides the memory layout for a persistent memory volume which
+ * is made up of CPU-addressable persistent memory.  If the
+ * device does not support CPU-addressable persistent memory then
+ * is returned.
+ *
+ * Return: a pointer to a pmem_layout structure or ERR_PTR
+ */
+struct pmem_layout *get_pmemgeo(struct block_device *bdev)
+	struct gendisk *bd_disk = bdev->bd_disk;
+	if (!bd_disk || !bd_disk->fops->getpmem)
+		return ERR_PTR(-ENOTTY);
+	return bd_disk->fops->getpmem(bdev);
+void commit_pmem(struct block_device *bdev, void *addr)
+	struct gendisk *bd_disk = bdev->bd_disk;
+	if (bd_disk && bd_disk->fops->commitpmem)
+		bd_disk->fops->commitpmem(bdev, addr);
+void clear_pmem_error(struct block_device *bdev, void *addr, size_t
+				void *data)
+	struct gendisk *bd_disk = bdev->bd_disk;
+	if (bd_disk && bd_disk->fops->clearerrorpmem)
+		bd_disk->fops->clearerrorpmem(bdev, addr, len, data);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 78feda9..ba2c1f5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1498,6 +1498,10 @@ struct block_device_operations {
	int (*getgeo)(struct block_device *, struct hd_geometry *);
	/* this callback is with swap_lock and sometimes page table lock held
	void (*swap_slot_free_notify) (struct block_device *, unsigned long);
+	/* persistent memory operations */
+	struct pmem_layout * (*getpmem)(struct block_device *);
+	void (*commitpmem)(struct block_device *, void *);
+	void (*clearerrorpmem)(struct block_device *, void *, size_t, void *);
	struct module *owner;
diff --git a/include/linux/pmem.h b/include/linux/pmem.h
new file mode 100644
index 0000000..f907307
--- /dev/null
+++ b/include/linux/pmem.h
@@ -0,0 +1,47 @@
+ * Definitions for the Persistent Memory interface
+ * Copyright (c) 2013, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef _LINUX_PMEM_H
+#define _LINUX_PMEM_H
+#include <linux/types.h>
+struct persistent_memory_extent {
+	phys_addr_t pme_spa;
+	u64         pme_len;
+	int         pme_numa_node;
+struct pmem_layout {
+	u64         pml_flags;
+	u64         pml_total_size;
+	u32         pml_extent_count;
+	u32			pml_interleave;	/* interleave bytes */
+	struct persistent_memory_extent pml_extents[];
+ * Flags values
+ */
+#define PMEM_ENABLED 0x0000000000000001 /* can be used for Persistent
+#define PMEM_ERRORED 0x0000000000000002 /* in an error state */
+#define PMEM_COMMIT  0x0000000000000004 /* commit function available */
+#define PMEM_CLEAR_ERROR  0x0000000000000008 /* clear error function
provided */

More information about the Linux-pmfs mailing list