[PATCH v2 00/16] block atomic writes

John Garry john.g.garry at oracle.com
Tue Dec 19 04:41:37 PST 2023


On 19/12/2023 05:21, Christoph Hellwig wrote:
> On Mon, Dec 18, 2023 at 09:14:56PM -0800, Darrick J. Wong wrote:
>> /me stumbles back in with plenty of covidbrain to go around.
>>
>> So ... Christoph, you're asking for a common API for
>> sysadmins/applications to signal to the filesystem that they want all
>> data allocations aligned to a given value, right?
>>
>> e.g. if a nvme device advertises a capability for untorn writes between
>> $lbasize and 64k, then we need a way to set up each untorn-file with
>> some alignment between $lbasize and 64k?
>>
>> or if cxl storage becomes not ung-dly expensive, then we'd want a way to
>> set up files with an alignment of 2M (x86) or 512M (arm64lol) to take
>> advantage of PMD mmap mappings?
> 
> The most important point is to not mix these up.
> 
> If we want to use a file for atomic writes I should tell the fs about
> it, and preferably in a way that does not require me to know about weird
> internal implementation details of every file system.  I really just
> want to use atomic writes.  Preferably I'd just start using them after
> asking for the limits.  But that's obviously not going to work for
> file systems that use the hardware offload and don't otherwise align
> to the limit (which would suck for very small files anyway :))
> 
> So as a compromise I tell the file system before writing or otherwise
> adding any data [1] to file that I want to use atomic writes so that
> the fs can take the right decisions.
> 
> [1] reflinking data into a such marked file will be ... interesting.
> 

How about something based on fcntl, like below? We will prob also 
require some per-FS flag for enabling atomic writes without HW support. 
That flag might be also useful for XFS for differentiating forcealign 
for atomic writes with just forcealign.

---8<----

diff --git a/fs/fcntl.c b/fs/fcntl.c
index c80a6acad742..d4a50655565a 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -313,6 +313,15 @@ static long fcntl_rw_hint(struct file *file, 
unsigned int cmd,
  	}
  }

+static long fcntl_atomic_write(struct file *file, unsigned int cmd,
+			  unsigned long arg)
+{
+	if (file->f_op->enable_atomic_writes)
+		return file->f_op->enable_atomic_writes(file, arg);
+
+	return -EOPNOTSUPP;
+}
+
  static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
  		struct file *filp)
  {
@@ -419,6 +428,9 @@ static long do_fcntl(int fd, unsigned int cmd, 
unsigned long arg,
  	case F_SET_RW_HINT:
  		err = fcntl_rw_hint(filp, cmd, arg);
  		break;
+	case F_SET_ATOMIC_WRITE_SIZE:
+		err = fcntl_atomic_write(filp, cmd, arg);
+		break;
  	default:
  		break;
  	}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e33e5e13b95f..dba206cbe1ab 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1478,6 +1478,46 @@ xfs_file_mmap(
  	return 0;
  }

+STATIC int
+xfs_enable_atomic_writes(
+	struct file	*file,
+	unsigned int unit_max)
+{
+	struct xfs_inode *ip = XFS_I(file_inode(file));
+	struct mnt_idmap *idmap = file_mnt_idmap(file);
+	struct dentry *dentry = file->f_path.dentry;
+	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
+	struct block_device	*bdev = target->bt_bdev;
+	struct request_queue	*q = bdev->bd_queue;
+	struct inode		*inode = &ip->i_vnode;
+
+	if (queue_atomic_write_unit_max_bytes(q) == 0) {
+		if (unit_max != 0) {
+			/* We expect unbounded CoW size if no HW support */
+			return -ENOTBLK;
+		}
+		/* Do something for CoW support ... */
+
+		return 0;
+	}
+
+	if (!unit_max)
+		return -EINVAL;
+
+	/* bodge alert */
+	if (inode->i_op->fileattr_set) {
+		struct fileattr fa = {
+			.fsx_extsize = unit_max,
+			.fsx_xflags = FS_XFLAG_EXTSIZE | FS_XFLAG_FORCEALIGN,
+			.fsx_valid = 1,
+		};
+
+		return inode->i_op->fileattr_set(idmap, dentry, &fa);
+	}
+
+	return -EOPNOTSUPP;
+}
+
  const struct file_operations xfs_file_operations = {
  	.llseek		= xfs_file_llseek,
  	.read_iter	= xfs_file_read_iter,
@@ -1498,6 +1538,7 @@ const struct file_operations xfs_file_operations = {
  	.fallocate	= xfs_file_fallocate,
  	.fadvise	= xfs_file_fadvise,
  	.remap_file_range = xfs_file_remap_range,
+	.enable_atomic_writes = xfs_enable_atomic_writes,
  };

  const struct file_operations xfs_dir_file_operations = {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..a715f98057b8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1956,6 +1956,7 @@ struct file_operations {
  	int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
  	int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *,
  				unsigned int poll_flags);
+	int (*enable_atomic_writes)(struct file *, unsigned int unit_max);
  } __randomize_layout;

  /* Wrap a directory iterator that needs exclusive inode access */
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 6c80f96049bd..69780c49622e 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -56,6 +56,8 @@
  #define F_GET_FILE_RW_HINT	(F_LINUX_SPECIFIC_BASE + 13)
  #define F_SET_FILE_RW_HINT	(F_LINUX_SPECIFIC_BASE + 14)

+#define F_SET_ATOMIC_WRITE_SIZE  (F_LINUX_SPECIFIC_BASE + 15)
+
  /*
   * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
   * used to clear any hints previously set.
-- 


--->8----



More information about the Linux-nvme mailing list