[PATCH RFC 12/16] xfs: Add support for fallocate2

John Garry john.g.garry at oracle.com
Wed May 3 11:38:17 PDT 2023


From: Allison Henderson <allison.henderson at oracle.com>

Add support for fallocate2 ioctl, which is xfs' own version of fallocate.

Struct xfs_fallocate2 is passed in the ioctl, and xfs_fallocate2.alignment
allows the user to specify required extent alignment. This is key for
atomic write support, as we expect extents to be aligned on
atomic_write_unit_max boundaries.

The alignment flag is not sticky, so further extent mutation will not
obey this original alignment request. In addition, extent lengths should
always be a multiple of atomic_write_unit_max, which they are not yet. So
this really just works for scenarios when we were lucky enough to get a
single extent.

The following is sample usage and C code:

mkfs.xfs -f /dev/sda
mount /dev/sda mnt
xfs_fallocate2 mnt/test_file1.img 0 20971520 262144
filefrag -v mnt/test_file1.img

xfs_fallocate2.c

#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>

/*
 * Argument block that the sample program passes to the XFS_IOC_FALLOCATE2
 * ioctl.  Field order and sizes define the layout and must not be changed.
 */
struct xfs_fallocate2 {
	/* start of the range, in bytes */
	int64_t offset;

	/* size of the range, in bytes */
	int64_t length;

	/* request flags (the sample sets XFS_FALLOC2_ALIGNED) */
	uint64_t flags;

	/* requested extent alignment, in bytes */
	uint32_t alignment;

	/* not used by the sample; presumably reserved -- keep zeroed */
	uint32_t padding[9];
};

/*
 * Parse the whole of @str as a base-10 signed integer into @out.
 * Returns 0 on success, -1 if @str is empty, has trailing characters,
 * or does not fit in a long long.
 */
static int parse_ll(const char *str, long long *out)
{
	char *end;
	long long val;

	errno = 0;
	val = strtoll(str, &end, 10);
	if (end == str || *end != '\0' || errno == ERANGE)
		return -1;

	*out = val;
	return 0;
}

/*
 * Usage: xfs_fallocate2 <file> <offset> <length> <alignment>
 *
 * Opens (creating if necessary) <file> and issues XFS_IOC_FALLOCATE2 for
 * the byte range [offset, offset + length).  A non-zero alignment sets
 * XFS_FALLOC2_ALIGNED so the request carries the alignment requirement.
 *
 * Returns EXIT_SUCCESS when the ioctl succeeds, EXIT_FAILURE on bad
 * arguments, open() failure or ioctl() failure.
 */
int main(int argc, char **argv) {
	const char *file;
	long long offset, length, alignment;
	int fd, ret;
	struct xfs_fallocate2 fa = {0};

	/* argv[0] plus four operands */
	if (argc != 5) {
		fprintf(stderr,
			"usage: %s <file> <offset> <length> <alignment>\n",
			argc > 0 ? argv[0] : "xfs_fallocate2");
		return EXIT_FAILURE;
	}

	file = argv[1];

	/*
	 * Offsets and lengths are 64-bit byte counts, so parse them at full
	 * width; atoi() would silently truncate anything above INT_MAX.
	 */
	if (parse_ll(argv[2], &offset) || parse_ll(argv[3], &length) ||
	    parse_ll(argv[4], &alignment) ||
	    alignment < 0 || alignment > UINT32_MAX) {
		fprintf(stderr, "invalid offset, length or alignment\n");
		return EXIT_FAILURE;
	}

	fa.offset = offset;
	fa.length = length;
	fa.alignment = (uint32_t)alignment;

	if (fa.alignment)
		fa.flags = XFS_FALLOC2_ALIGNED;

	fd = open(file, O_RDWR | O_CREAT, 0600);
	if (fd < 0) {
		perror(file);
		return EXIT_FAILURE;
	}

	ret = ioctl(fd, XFS_IOC_FALLOCATE2, &fa);
	/* report before close() so errno still belongs to the ioctl */
	if (ret < 0)
		perror("XFS_IOC_FALLOCATE2");
	close(fd);

	return ret < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
}

Signed-off-by: Allison Henderson <allison.henderson at oracle.com>
Signed-off-by: Catherine Hoang <catherine.hoang at oracle.com>
Signed-off-by: John Garry <john.g.garry at oracle.com>
---
 fs/xfs/Makefile                 |  1 +
 fs/xfs/libxfs/xfs_attr_remote.c |  2 +-
 fs/xfs/libxfs/xfs_bmap.c        |  9 ++-
 fs/xfs/libxfs/xfs_bmap.h        |  4 +-
 fs/xfs/libxfs/xfs_da_btree.c    |  4 +-
 fs/xfs/libxfs/xfs_fs.h          |  1 +
 fs/xfs/xfs_bmap_util.c          |  7 ++-
 fs/xfs/xfs_bmap_util.h          |  2 +-
 fs/xfs/xfs_dquot.c              |  2 +-
 fs/xfs/xfs_file.c               | 19 +++++--
 fs/xfs/xfs_fs_staging.c         | 99 +++++++++++++++++++++++++++++++++
 fs/xfs/xfs_fs_staging.h         | 21 +++++++
 fs/xfs/xfs_ioctl.c              |  4 ++
 fs/xfs/xfs_iomap.c              |  4 +-
 fs/xfs/xfs_reflink.c            |  4 +-
 fs/xfs/xfs_rtalloc.c            |  2 +-
 fs/xfs/xfs_symlink.c            |  2 +-
 security/security.c             |  1 +
 18 files changed, 168 insertions(+), 20 deletions(-)
 create mode 100644 fs/xfs/xfs_fs_staging.c
 create mode 100644 fs/xfs/xfs_fs_staging.h

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 92d88dc3c9f7..9b413544d358 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -93,6 +93,7 @@ xfs-y				+= xfs_aops.o \
 				   xfs_sysfs.o \
 				   xfs_trans.o \
 				   xfs_xattr.o \
+				   xfs_fs_staging.o \
 				   kmem.o
 
 # low-level transaction/log code
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index d440393b40eb..c5f190fef1b5 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -615,7 +615,7 @@ xfs_attr_rmtval_set_blk(
 	error = xfs_bmapi_write(args->trans, dp,
 			(xfs_fileoff_t)attr->xattri_lblkno,
 			attr->xattri_blkcnt, XFS_BMAPI_ATTRFORK, args->total,
-			map, &nmap);
+			map, &nmap, 0);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 34de6e6898c4..52a6e2b61228 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3275,7 +3275,9 @@ xfs_bmap_compute_alignments(
 	struct xfs_alloc_arg	*args)
 {
 	struct xfs_mount	*mp = args->mp;
-	xfs_extlen_t		align = 0; /* minimum allocation alignment */
+
+	/* minimum allocation alignment */
+	xfs_extlen_t		align = args->alignment;
 	int			stripe_align = 0;
 
 	/* stripe alignment for allocation is determined by mount parameters */
@@ -3652,6 +3654,7 @@ xfs_bmap_btalloc(
 		.datatype	= ap->datatype,
 		.alignment	= 1,
 		.minalignslop	= 0,
+		.alignment	= ap->align,
 	};
 	xfs_fileoff_t		orig_offset;
 	xfs_extlen_t		orig_length;
@@ -4279,12 +4282,14 @@ xfs_bmapi_write(
 	uint32_t		flags,		/* XFS_BMAPI_... */
 	xfs_extlen_t		total,		/* total blocks needed */
 	struct xfs_bmbt_irec	*mval,		/* output: map values */
-	int			*nmap)		/* i/o: mval size/count */
+	int			*nmap,
+	xfs_extlen_t		align)		/* i/o: mval size/count */
 {
 	struct xfs_bmalloca	bma = {
 		.tp		= tp,
 		.ip		= ip,
 		.total		= total,
+		.align		= align,
 	};
 	struct xfs_mount	*mp = ip->i_mount;
 	int			whichfork = xfs_bmapi_whichfork(flags);
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index dd08361ca5a6..0573dfc5fa6b 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -26,6 +26,7 @@ struct xfs_bmalloca {
 	xfs_fileoff_t		offset;	/* offset in file filling in */
 	xfs_extlen_t		length;	/* i/o length asked/allocated */
 	xfs_fsblock_t		blkno;	/* starting block of new extent */
+	xfs_extlen_t		align;
 
 	struct xfs_btree_cur	*cur;	/* btree cursor */
 	struct xfs_iext_cursor	icur;	/* incore extent cursor */
@@ -189,7 +190,8 @@ int	xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
 		int *nmap, uint32_t flags);
 int	xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
 		xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
-		xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap);
+		xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap,
+		xfs_extlen_t align);
 int	__xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
 		xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags,
 		xfs_extnum_t nexts);
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index e576560b46e9..e6581254092f 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -2174,7 +2174,7 @@ xfs_da_grow_inode_int(
 	nmap = 1;
 	error = xfs_bmapi_write(tp, dp, *bno, count,
 			xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
-			args->total, &map, &nmap);
+			args->total, &map, &nmap, 0);
 	if (error)
 		return error;
 
@@ -2196,7 +2196,7 @@ xfs_da_grow_inode_int(
 			nmap = min(XFS_BMAP_MAX_NMAP, c);
 			error = xfs_bmapi_write(tp, dp, b, c,
 					xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
-					args->total, &mapp[mapi], &nmap);
+					args->total, &mapp[mapi], &nmap, 0);
 			if (error)
 				goto out_free_map;
 			if (nmap < 1)
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 1cfd5bc6520a..829316ca01ea 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -831,6 +831,7 @@ struct xfs_scrub_metadata {
 #define XFS_IOC_FSGEOMETRY	     _IOR ('X', 126, struct xfs_fsop_geom)
 #define XFS_IOC_BULKSTAT	     _IOR ('X', 127, struct xfs_bulkstat_req)
 #define XFS_IOC_INUMBERS	     _IOR ('X', 128, struct xfs_inumbers_req)
+#define XFS_IOC_FALLOCATE2	     _IOR ('X', 129, struct xfs_fallocate2)
 /*	XFS_IOC_GETFSUUID ---------- deprecated 140	 */
 
 
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index a09dd2606479..a0c55af6f051 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -776,10 +776,12 @@ int
 xfs_alloc_file_space(
 	struct xfs_inode	*ip,
 	xfs_off_t		offset,
-	xfs_off_t		len)
+	xfs_off_t		len,
+	xfs_off_t		align)
 {
 	xfs_mount_t		*mp = ip->i_mount;
 	xfs_off_t		count;
+	xfs_filblks_t		align_fsb;
 	xfs_filblks_t		allocated_fsb;
 	xfs_filblks_t		allocatesize_fsb;
 	xfs_extlen_t		extsz, temp;
@@ -811,6 +813,7 @@ xfs_alloc_file_space(
 	nimaps = 1;
 	startoffset_fsb	= XFS_B_TO_FSBT(mp, offset);
 	endoffset_fsb = XFS_B_TO_FSB(mp, offset + count);
+	align_fsb = XFS_B_TO_FSB(mp, align);
 	allocatesize_fsb = endoffset_fsb - startoffset_fsb;
 
 	/*
@@ -872,7 +875,7 @@ xfs_alloc_file_space(
 
 		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
 				allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
-				&nimaps);
+				&nimaps, align_fsb);
 		if (error)
 			goto error;
 
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 6888078f5c31..476f610ad617 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -54,7 +54,7 @@ int	xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 
 /* preallocation and hole punch interface */
 int	xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
-			     xfs_off_t len);
+			     xfs_off_t len, xfs_off_t align);
 int	xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
 			    xfs_off_t len);
 int	xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 8fb90da89787..475e1a56d1b0 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -328,7 +328,7 @@ xfs_dquot_disk_alloc(
 	/* Create the block mapping. */
 	error = xfs_bmapi_write(tp, quotip, dqp->q_fileoffset,
 			XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, &map,
-			&nmaps);
+			&nmaps, 0);
 	if (error)
 		goto err_cancel;
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 705250f9f90a..9b1db42a8d33 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -883,12 +883,13 @@ static inline bool xfs_file_sync_writes(struct file *filp)
 		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
 		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
 
-STATIC long
-xfs_file_fallocate(
+long
+_xfs_file_fallocate(
 	struct file		*file,
 	int			mode,
 	loff_t			offset,
-	loff_t			len)
+	loff_t			len,
+	loff_t 			alignment)
 {
 	struct inode		*inode = file_inode(file);
 	struct xfs_inode	*ip = XFS_I(inode);
@@ -1035,7 +1036,7 @@ xfs_file_fallocate(
 		}
 
 		if (!xfs_is_always_cow_inode(ip)) {
-			error = xfs_alloc_file_space(ip, offset, len);
+			error = xfs_alloc_file_space(ip, offset, len, alignment);
 			if (error)
 				goto out_unlock;
 		}
@@ -1073,6 +1074,16 @@ xfs_file_fallocate(
 	return error;
 }
 
+STATIC long
+xfs_file_fallocate(
+	struct file		*file,
+	int			mode,
+	loff_t			offset,
+	loff_t			len)
+{
+	return _xfs_file_fallocate(file, mode, offset, len, 0);
+}
+
 STATIC int
 xfs_file_fadvise(
 	struct file	*file,
diff --git a/fs/xfs/xfs_fs_staging.c b/fs/xfs/xfs_fs_staging.c
new file mode 100644
index 000000000000..1d635c0a9f49
--- /dev/null
+++ b/fs/xfs/xfs_fs_staging.c
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2023 Oracle.  All Rights Reserved.
+ */
+
+#include "xfs.h"
+#include "xfs_fs_staging.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+
+#include "linux/security.h"
+#include "linux/fsnotify.h"
+
+extern long _xfs_file_fallocate(
+	struct file		*file,
+	int			mode,
+	loff_t			offset,
+	loff_t			len,
+	loff_t 			alignment);
+
+int xfs_fallocate2(	struct file		*filp,
+	void			__user *arg)
+{
+	struct inode		*inode = file_inode(filp);
+	//struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_fallocate2 fallocate2;
+	int ret;
+
+	if (copy_from_user(&fallocate2, arg, sizeof(fallocate2)))
+		return -EFAULT;
+
+	if (fallocate2.flags & XFS_FALLOC2_ALIGNED) {
+		if (!fallocate2.alignment || !is_power_of_2(fallocate2.alignment))
+			return -EINVAL;
+
+		if (fallocate2.offset % fallocate2.alignment)
+			return -EINVAL;
+
+		if (fallocate2.length % fallocate2.alignment)
+			return -EINVAL;
+	} else if (fallocate2.alignment) {
+		return -EINVAL;
+	}
+
+	/* These are all just copied from vfs_fallocate() */
+	if (fallocate2.offset < 0 || fallocate2.length <= 0)
+		return -EINVAL;
+
+	if (!(filp->f_mode & FMODE_WRITE))
+		return -EBADF;
+
+	if (IS_IMMUTABLE(inode))
+		return -EPERM;
+
+	/*
+	 * We cannot allow any fallocate operation on an active swapfile
+	 */
+	if (IS_SWAPFILE(inode))
+		return -ETXTBSY;
+
+	/*
+	 * Revalidate the write permissions, in case security policy has
+	 * changed since the files were opened.
+	 */
+	ret = security_file_permission(filp, MAY_WRITE);
+	if (ret)
+		return ret;
+
+	if (S_ISFIFO(inode->i_mode))
+		return -ESPIPE;
+
+	if (S_ISDIR(inode->i_mode))
+		return -EISDIR;
+
+	if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
+		return -ENODEV;
+
+	/* Check for wrap through zero too */
+	if (((fallocate2.offset + fallocate2.length) > inode->i_sb->s_maxbytes) ||
+		((fallocate2.offset + fallocate2.length) < 0))
+		return -EFBIG;
+
+	if (!filp->f_op->fallocate)
+		return -EOPNOTSUPP;
+
+	file_start_write(filp);
+	ret = _xfs_file_fallocate(filp, 0, fallocate2.offset, fallocate2.length, fallocate2.alignment);
+
+	if (ret == 0)
+		fsnotify_modify(filp);
+
+	file_end_write(filp);
+
+	return ret;
+}
diff --git a/fs/xfs/xfs_fs_staging.h b/fs/xfs/xfs_fs_staging.h
new file mode 100644
index 000000000000..a82e61063dba
--- /dev/null
+++ b/fs/xfs/xfs_fs_staging.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2023 Oracle.  All Rights Reserved.
+ */
+#ifndef __XFS_FS_STAGING_H__
+#define __XFS_FS_STAGING_H__
+
+struct xfs_fallocate2 {
+	s64 offset;	/* bytes */
+	s64 length;	/* bytes */
+	u64 flags;
+	u32 alignment;	/* bytes */
+	u32 padding[8];
+};
+
+#define XFS_FALLOC2_ALIGNED (1U << 0)
+
+int xfs_fallocate2(	struct file		*filp,
+	void			__user *arg);
+
+#endif	/* __XFS_FS_STAGING_H__ */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 55bb01173cde..6e60fce44068 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -4,6 +4,7 @@
  * All Rights Reserved.
  */
 #include "xfs.h"
+#include "xfs_fs_staging.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
 #include "xfs_format.h"
@@ -2149,6 +2150,9 @@ xfs_file_ioctl(
 		return error;
 	}
 
+	case XFS_IOC_FALLOCATE2:
+		return xfs_fallocate2(filp, arg);
+
 	default:
 		return -ENOTTY;
 	}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 285885c308bd..a4389a0c4bf2 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -306,7 +306,7 @@ xfs_iomap_write_direct(
 	 */
 	nimaps = 1;
 	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, 0,
-				imap, &nimaps);
+				imap, &nimaps, 0);
 	if (error)
 		goto out_trans_cancel;
 
@@ -614,7 +614,7 @@ xfs_iomap_write_unwritten(
 		nimaps = 1;
 		error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
 					XFS_BMAPI_CONVERT, resblks, &imap,
-					&nimaps);
+					&nimaps, 0);
 		if (error)
 			goto error_on_bmapi_transaction;
 
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index f5dc46ce9803..a2e5ba6cf7f3 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -420,7 +420,7 @@ xfs_reflink_fill_cow_hole(
 	nimaps = 1;
 	error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
 			XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap,
-			&nimaps);
+			&nimaps, 0);
 	if (error)
 		goto out_trans_cancel;
 
@@ -490,7 +490,7 @@ xfs_reflink_fill_delalloc(
 		error = xfs_bmapi_write(tp, ip, cmap->br_startoff,
 				cmap->br_blockcount,
 				XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0,
-				cmap, &nimaps);
+				cmap, &nimaps, 0);
 		if (error)
 			goto out_trans_cancel;
 
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 16534e9873f6..a57a8a4d8294 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -817,7 +817,7 @@ xfs_growfs_rt_alloc(
 		 */
 		nmap = 1;
 		error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
-					XFS_BMAPI_METADATA, 0, &map, &nmap);
+					XFS_BMAPI_METADATA, 0, &map, &nmap, 0);
 		if (!error && nmap < 1)
 			error = -ENOSPC;
 		if (error)
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 85e433df6a3f..2a4524bf34a5 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -269,7 +269,7 @@ xfs_symlink(
 		nmaps = XFS_SYMLINK_MAPS;
 
 		error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
-				  XFS_BMAPI_METADATA, resblks, mval, &nmaps);
+				  XFS_BMAPI_METADATA, resblks, mval, &nmaps, 0);
 		if (error)
 			goto out_trans_cancel;
 
diff --git a/security/security.c b/security/security.c
index cf6cc576736f..d53b1b6c2d59 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1593,6 +1593,7 @@ int security_file_permission(struct file *file, int mask)
 
 	return fsnotify_perm(file, mask);
 }
+EXPORT_SYMBOL(security_file_permission);
 
 int security_file_alloc(struct file *file)
 {
-- 
2.31.1




More information about the Linux-nvme mailing list