[RFC 12/18] pkernfs: Add IOMMU domain pgtables file
James Gowans
jgowans at amazon.com
Mon Feb 5 04:01:57 PST 2024
Similar to the IOMMU root pgtables file which was added in a previous
commit, now support a file type for IOMMU domain pgtables in the IOMMU
directory. These domain pgtable files only need to be useable after the
system has booted up, for example by QEMU creating one of these files
and using it to back the IOMMU pgtables for a persistent VM. As such the
filesystem abstraction can be better maintained here as the kernel code
doesn't need to reach "behind" the filesystem abstraction like it does
for the root pgtables.
A new inode type is created for domain pgtable files, and the IOMMU
directory gets inode_operation callbacks to support creating and
deleting these files in it.
Note: there is a use-after-free risk here too: if the domain pgtable
file is truncated while it is in use for IOMMU pgtables, then freed memory
could still be mapped into the IOMMU. To mitigate this there should be a
mechanism to "freeze" the files once they've been given to the IOMMU.
---
fs/pkernfs/inode.c | 9 +++++--
fs/pkernfs/iommu.c | 55 +++++++++++++++++++++++++++++++++++++++--
fs/pkernfs/pkernfs.h | 4 +++
include/linux/pkernfs.h | 1 +
4 files changed, 65 insertions(+), 4 deletions(-)
diff --git a/fs/pkernfs/inode.c b/fs/pkernfs/inode.c
index 1d712e0a82a1..35842cd61002 100644
--- a/fs/pkernfs/inode.c
+++ b/fs/pkernfs/inode.c
@@ -35,7 +35,11 @@ struct inode *pkernfs_inode_get(struct super_block *sb, unsigned long ino)
inode->i_op = &pkernfs_iommu_dir_inode_operations;
inode->i_fop = &pkernfs_dir_fops;
inode->i_mode = S_IFDIR;
- } else if (pkernfs_inode->flags | PKERNFS_INODE_FLAG_IOMMU_ROOT_PGTABLES) {
+ } else if (pkernfs_inode->flags & PKERNFS_INODE_FLAG_IOMMU_ROOT_PGTABLES) {
+ inode->i_fop = &pkernfs_file_fops;
+ inode->i_mode = S_IFREG;
+ } else if (pkernfs_inode->flags & PKERNFS_INODE_FLAG_IOMMU_DOMAIN_PGTABLES) {
+ inode->i_fop = &pkernfs_file_fops;
inode->i_mode = S_IFREG;
}
@@ -175,6 +179,7 @@ const struct inode_operations pkernfs_dir_inode_operations = {
};
const struct inode_operations pkernfs_iommu_dir_inode_operations = {
+ .create = pkernfs_create_iommu_pgtables, /* files created in this directory are flagged as IOMMU domain pgtables */
.lookup = pkernfs_lookup,
+ .unlink = pkernfs_unlink, /* per the commit message: allows deleting files from the IOMMU directory */
};
-
diff --git a/fs/pkernfs/iommu.c b/fs/pkernfs/iommu.c
index 5bce8146d7bb..f14e76013e85 100644
--- a/fs/pkernfs/iommu.c
+++ b/fs/pkernfs/iommu.c
@@ -4,6 +4,27 @@
#include <linux/io.h>
+/*
+ * Describe the block backing a domain pgtables file as a pkernfs_region.
+ *
+ * @ppts: open pkernfs file; its inode number selects the persisted inode
+ * whose mappings_block is handed out.
+ * @pkernfs_region: output; ->vaddr and ->paddr are set to the kernel
+ * virtual and physical address of that block.
+ *
+ * Bit 0 at the start of the block is set before returning, exactly as
+ * pkernfs_alloc_iommu_root_pgtables() does for the root pgtables block.
+ *
+ * NOTE(review): ->bytes is not written here, yet
+ * pkernfs_region_paddr_to_vaddr() bounds-checks against it; confirm that
+ * the caller initialises it.
+ * NOTE(review): @ppts is dereferenced unconditionally and is assumed to be
+ * a pkernfs file; callers should check pkernfs_is_iommu_domain_pgtables().
+ */
+void pkernfs_alloc_iommu_domain_pgtables(struct file *ppts, struct pkernfs_region *pkernfs_region)
+{
+ struct pkernfs_inode *pkernfs_inode;
+ unsigned long *mappings_block_vaddr;
+ unsigned long inode_idx;
+
+ /*
+ * For a pkernfs region block, the "mappings_block" field is still
+ * just a block index, but that block doesn't actually contain
+ * mappings; it contains the pkernfs_region data.
+ */
+
+ inode_idx = ppts->f_inode->i_ino;
+ pkernfs_inode = pkernfs_get_persisted_inode(NULL, inode_idx);
+
+ mappings_block_vaddr = (unsigned long *)pkernfs_addr_for_block(NULL,
+ pkernfs_inode->mappings_block);
+ set_bit(0, mappings_block_vaddr);
+ pkernfs_region->vaddr = mappings_block_vaddr;
+ /*
+ * Scale by PMD_SIZE, as the root pgtables path does, rather than the
+ * int literal (2 << 20): the multiplication is then done in
+ * unsigned long and cannot overflow int for large block indexes.
+ */
+ pkernfs_region->paddr = pkernfs_base + (pkernfs_inode->mappings_block * PMD_SIZE);
+}
void pkernfs_alloc_iommu_root_pgtables(struct pkernfs_region *pkernfs_region)
{
unsigned long *mappings_block_vaddr;
@@ -63,9 +84,8 @@ void pkernfs_alloc_iommu_root_pgtables(struct pkernfs_region *pkernfs_region)
* just a block index, but that block doesn't actually contain mappings
* it contains the pkernfs_region data
*/
-
mappings_block_vaddr = (unsigned long *)pkernfs_addr_for_block(NULL,
- iommu_pgtables->mappings_block);
+ iommu_pgtables->mappings_block);
set_bit(0, mappings_block_vaddr);
pkernfs_region->vaddr = mappings_block_vaddr;
pkernfs_region->paddr = pkernfs_base + (iommu_pgtables->mappings_block * PMD_SIZE);
@@ -88,6 +108,29 @@ void pkernfs_alloc_iommu_root_pgtables(struct pkernfs_region *pkernfs_region)
(iommu_pgtables->mappings_block * PMD_SIZE);
}
+/*
+ * ->create() handler for the IOMMU directory: add a new domain pgtables
+ * file named after @dentry.
+ *
+ * A persisted inode is allocated, linked at the head of @dir's child list,
+ * flagged PKERNFS_INODE_FLAG_IOMMU_DOMAIN_PGTABLES and given one freshly
+ * zeroed block. @id, @mode and @excl are not consulted.
+ *
+ * Return: 0 on success, -ENOMEM if no inode could be allocated.
+ */
+int pkernfs_create_iommu_pgtables(struct mnt_idmap *id, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
+{
+ struct pkernfs_inode *dir_inode, *new_inode;
+ unsigned long free_inode;
+ struct inode *vfs_inode;
+
+ free_inode = pkernfs_allocate_inode(dir->i_sb);
+ /* free_inode is unsigned, so 0 is the only value "<= 0" could match. */
+ if (!free_inode)
+ return -ENOMEM;
+
+ new_inode = pkernfs_get_persisted_inode(dir->i_sb, free_inode);
+ dir_inode = pkernfs_get_persisted_inode(dir->i_sb, dir->i_ino);
+
+ /* Insert the new file at the head of the directory's child list. */
+ new_inode->sibling_ino = dir_inode->child_ino;
+ dir_inode->child_ino = free_inode;
+
+ strscpy(new_inode->filename, dentry->d_name.name, PKERNFS_FILENAME_LEN);
+ new_inode->flags = PKERNFS_INODE_FLAG_IOMMU_DOMAIN_PGTABLES;
+
+ /*
+ * NOTE(review): the result of pkernfs_alloc_block() is used unchecked;
+ * confirm how it reports an exhausted allocator and unwind the inode
+ * linkage above on failure.
+ */
+ new_inode->mappings_block = pkernfs_alloc_block(dir->i_sb);
+ /* One block is PMD_SIZE bytes, matching the root pgtables code in this file. */
+ memset(pkernfs_addr_for_block(dir->i_sb, new_inode->mappings_block), 0, PMD_SIZE);
+
+ /*
+ * NOTE(review): pkernfs_inode_get()'s result is passed straight to
+ * d_add(); verify its failure convention (NULL vs ERR_PTR).
+ */
+ vfs_inode = pkernfs_inode_get(dir->i_sb, free_inode);
+ d_add(dentry, vfs_inode);
+ return 0;
+}
+
void *pkernfs_region_paddr_to_vaddr(struct pkernfs_region *region, unsigned long paddr)
{
if (WARN_ON(paddr >= region->paddr + region->bytes))
@@ -96,3 +139,11 @@ void *pkernfs_region_paddr_to_vaddr(struct pkernfs_region *region, unsigned long
return NULL;
return region->vaddr + (paddr - region->paddr);
}
+
+/*
+ * Report whether @f is a pkernfs IOMMU domain pgtables file.
+ *
+ * @f: file to test; may be NULL and may belong to any filesystem.
+ *
+ * Return: true only if @f is non-NULL, its inode uses pkernfs_file_fops
+ * and the persisted pkernfs inode carries
+ * PKERNFS_INODE_FLAG_IOMMU_DOMAIN_PGTABLES.
+ */
+bool pkernfs_is_iommu_domain_pgtables(struct file *f)
+{
+ struct pkernfs_inode *pkernfs_inode;
+
+ if (!f)
+ return false;
+
+ /*
+ * This helper is exported to code outside pkernfs, so @f can be a
+ * file from another filesystem. Its inode number must not be used to
+ * index the pkernfs inode table unless the inode really is one of
+ * ours; pkernfs_inode_get() gives domain pgtables inodes
+ * pkernfs_file_fops, so anything else is rejected up front.
+ */
+ if (f->f_inode->i_fop != &pkernfs_file_fops)
+ return false;
+
+ pkernfs_inode = pkernfs_get_persisted_inode(f->f_inode->i_sb, f->f_inode->i_ino);
+ return !!(pkernfs_inode->flags & PKERNFS_INODE_FLAG_IOMMU_DOMAIN_PGTABLES);
+}
+
diff --git a/fs/pkernfs/pkernfs.h b/fs/pkernfs/pkernfs.h
index e1b7ae3fe7f1..9bea827f8b40 100644
--- a/fs/pkernfs/pkernfs.h
+++ b/fs/pkernfs/pkernfs.h
@@ -21,6 +21,7 @@ struct pkernfs_sb {
#define PKERNFS_INODE_FLAG_DIR (1 << 1)
#define PKERNFS_INODE_FLAG_IOMMU_DIR (1 << 2)
#define PKERNFS_INODE_FLAG_IOMMU_ROOT_PGTABLES (1 << 3)
+#define PKERNFS_INODE_FLAG_IOMMU_DOMAIN_PGTABLES (1 << 4)
struct pkernfs_inode {
int flags;
/*
@@ -50,8 +51,11 @@ void *pkernfs_addr_for_block(struct super_block *sb, int block_idx);
unsigned long pkernfs_allocate_inode(struct super_block *sb);
struct pkernfs_inode *pkernfs_get_persisted_inode(struct super_block *sb, int ino);
+int pkernfs_create_iommu_pgtables(struct mnt_idmap *id, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl);
extern const struct file_operations pkernfs_dir_fops;
extern const struct file_operations pkernfs_file_fops;
extern const struct inode_operations pkernfs_file_inode_operations;
extern const struct inode_operations pkernfs_iommu_dir_inode_operations;
+extern const struct inode_operations pkernfs_iommu_domain_pgtables_inode_operations;
diff --git a/include/linux/pkernfs.h b/include/linux/pkernfs.h
index 0110e4784109..4ca923ee0d82 100644
--- a/include/linux/pkernfs.h
+++ b/include/linux/pkernfs.h
@@ -33,4 +33,5 @@ void pkernfs_alloc_page_from_region(struct pkernfs_region *pkernfs_region,
void **vaddr, unsigned long *paddr);
void *pkernfs_region_paddr_to_vaddr(struct pkernfs_region *region, unsigned long paddr);
+bool pkernfs_is_iommu_domain_pgtables(struct file *f);
#endif /* _LINUX_PKERNFS_H */
--
2.40.1
More information about the kexec
mailing list