[PATCH v2] nvme: remove unprivileged passthrough support
Kanchan Joshi
joshi.k at samsung.com
Sun Oct 15 23:05:19 PDT 2023
Passthrough has got a hole that can be exploited to cause kernel memory
corruption. This is about making the device do larger DMA into
short meta/data buffer owned by kernel [1].
As a stopgap measure, disable the support of unprivileged passthrough.
This patch brings back coarse-granular CAP_SYS_ADMIN checks by reverting
following patches:
- 7d9d7d59d44 ("nvme: replace the fmode_t argument to the nvme ioctl handlers with a simple bool")
- 313c08c72ee ("nvme: don't allow unprivileged passthrough on partitions")
- 6f99ac04c46 ("nvme: consult the CSE log page for unprivileged passthrough")
- ea43fceea41 ("nvme: allow unprivileged passthrough of Identify Controller")
- e4fbcf32c86 ("nvme: identify-namespace without CAP_SYS_ADMIN")
- 855b7717f44 ("nvme: fine-granular CAP_SYS_ADMIN for nvme io commands")
[1] https://lore.kernel.org/linux-nvme/20231013051458.39987-1-joshi.k@samsung.com/
CC: stable at vger.kernel.org # 6.2
Fixes: 855b7717f44b1 ("nvme: fine-granular CAP_SYS_ADMIN for nvme io commands")
Suggested-by: Christoph Hellwig <hch at lst.de>
Signed-off-by: Kanchan Joshi <joshi.k at samsung.com>
Reviewed-by: Christoph Hellwig <hch at lst.de>
---
Changes since v1:
- Fix the way "Fixes:" was written before (Greg)
drivers/nvme/host/ioctl.c | 159 ++++++++------------------------------
include/linux/nvme.h | 2 -
2 files changed, 34 insertions(+), 127 deletions(-)
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index d8ff796fd5f2..788b36e7915a 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -10,80 +10,8 @@
enum {
NVME_IOCTL_VEC = (1 << 0),
- NVME_IOCTL_PARTITION = (1 << 1),
};
-static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c,
- unsigned int flags, bool open_for_write)
-{
- u32 effects;
-
- if (capable(CAP_SYS_ADMIN))
- return true;
-
- /*
- * Do not allow unprivileged passthrough on partitions, as that allows an
- * escape from the containment of the partition.
- */
- if (flags & NVME_IOCTL_PARTITION)
- return false;
-
- /*
- * Do not allow unprivileged processes to send vendor specific or fabrics
- * commands as we can't be sure about their effects.
- */
- if (c->common.opcode >= nvme_cmd_vendor_start ||
- c->common.opcode == nvme_fabrics_command)
- return false;
-
- /*
- * Do not allow unprivileged passthrough of admin commands except
- * for a subset of identify commands that contain information required
- * to form proper I/O commands in userspace and do not expose any
- * potentially sensitive information.
- */
- if (!ns) {
- if (c->common.opcode == nvme_admin_identify) {
- switch (c->identify.cns) {
- case NVME_ID_CNS_NS:
- case NVME_ID_CNS_CS_NS:
- case NVME_ID_CNS_NS_CS_INDEP:
- case NVME_ID_CNS_CS_CTRL:
- case NVME_ID_CNS_CTRL:
- return true;
- }
- }
- return false;
- }
-
- /*
- * Check if the controller provides a Commands Supported and Effects log
- * and marks this command as supported. If not reject unprivileged
- * passthrough.
- */
- effects = nvme_command_effects(ns->ctrl, ns, c->common.opcode);
- if (!(effects & NVME_CMD_EFFECTS_CSUPP))
- return false;
-
- /*
- * Don't allow passthrough for command that have intrusive (or unknown)
- * effects.
- */
- if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
- NVME_CMD_EFFECTS_UUID_SEL |
- NVME_CMD_EFFECTS_SCOPE_MASK))
- return false;
-
- /*
- * Only allow I/O commands that transfer data to the controller or that
- * change the logical block contents if the file descriptor is open for
- * writing.
- */
- if (nvme_is_write(c) || (effects & NVME_CMD_EFFECTS_LBCC))
- return open_for_write;
- return true;
-}
-
/*
* Convert integer values from ioctl structures to user pointers, silently
* ignoring the upper bits in the compat case to match behaviour of 32-bit
@@ -335,8 +263,7 @@ static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl,
}
static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
- struct nvme_passthru_cmd __user *ucmd, unsigned int flags,
- bool open_for_write)
+ struct nvme_passthru_cmd __user *ucmd)
{
struct nvme_passthru_cmd cmd;
struct nvme_command c;
@@ -344,6 +271,8 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
u64 result;
int status;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
return -EFAULT;
if (cmd.flags)
@@ -364,9 +293,6 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
c.common.cdw14 = cpu_to_le32(cmd.cdw14);
c.common.cdw15 = cpu_to_le32(cmd.cdw15);
- if (!nvme_cmd_allowed(ns, &c, 0, open_for_write))
- return -EACCES;
-
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
@@ -383,14 +309,16 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
}
static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
- struct nvme_passthru_cmd64 __user *ucmd, unsigned int flags,
- bool open_for_write)
+ struct nvme_passthru_cmd64 __user *ucmd,
+ unsigned int flags)
{
struct nvme_passthru_cmd64 cmd;
struct nvme_command c;
unsigned timeout = 0;
int status;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
return -EFAULT;
if (cmd.flags)
@@ -411,9 +339,6 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
c.common.cdw14 = cpu_to_le32(cmd.cdw14);
c.common.cdw15 = cpu_to_le32(cmd.cdw15);
- if (!nvme_cmd_allowed(ns, &c, flags, open_for_write))
- return -EACCES;
-
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
@@ -563,6 +488,9 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
void *meta = NULL;
int ret;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
c.common.opcode = READ_ONCE(cmd->opcode);
c.common.flags = READ_ONCE(cmd->flags);
if (c.common.flags)
@@ -584,9 +512,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14));
c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15));
- if (!nvme_cmd_allowed(ns, &c, 0, ioucmd->file->f_mode & FMODE_WRITE))
- return -EACCES;
-
d.metadata = READ_ONCE(cmd->metadata);
d.addr = READ_ONCE(cmd->addr);
d.data_len = READ_ONCE(cmd->data_len);
@@ -643,13 +568,13 @@ static bool is_ctrl_ioctl(unsigned int cmd)
}
static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd,
- void __user *argp, bool open_for_write)
+ void __user *argp)
{
switch (cmd) {
case NVME_IOCTL_ADMIN_CMD:
- return nvme_user_cmd(ctrl, NULL, argp, 0, open_for_write);
+ return nvme_user_cmd(ctrl, NULL, argp);
case NVME_IOCTL_ADMIN64_CMD:
- return nvme_user_cmd64(ctrl, NULL, argp, 0, open_for_write);
+ return nvme_user_cmd64(ctrl, NULL, argp, 0);
default:
return sed_ioctl(ctrl->opal_dev, cmd, argp);
}
@@ -674,14 +599,16 @@ struct nvme_user_io32 {
#endif /* COMPAT_FOR_U64_ALIGNMENT */
static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd,
- void __user *argp, unsigned int flags, bool open_for_write)
+ void __user *argp)
{
+ unsigned int flags = 0;
+
switch (cmd) {
case NVME_IOCTL_ID:
force_successful_syscall_return();
return ns->head->ns_id;
case NVME_IOCTL_IO_CMD:
- return nvme_user_cmd(ns->ctrl, ns, argp, flags, open_for_write);
+ return nvme_user_cmd(ns->ctrl, ns, argp);
/*
* struct nvme_user_io can have different padding on some 32-bit ABIs.
* Just accept the compat version as all fields that are used are the
@@ -696,39 +623,32 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd,
flags |= NVME_IOCTL_VEC;
fallthrough;
case NVME_IOCTL_IO64_CMD:
- return nvme_user_cmd64(ns->ctrl, ns, argp, flags,
- open_for_write);
+ return nvme_user_cmd64(ns->ctrl, ns, argp, flags);
default:
return -ENOTTY;
}
}
-int nvme_ioctl(struct block_device *bdev, blk_mode_t mode,
+int nvme_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
struct nvme_ns *ns = bdev->bd_disk->private_data;
- bool open_for_write = mode & BLK_OPEN_WRITE;
void __user *argp = (void __user *)arg;
- unsigned int flags = 0;
-
- if (bdev_is_partition(bdev))
- flags |= NVME_IOCTL_PARTITION;
if (is_ctrl_ioctl(cmd))
- return nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write);
- return nvme_ns_ioctl(ns, cmd, argp, flags, open_for_write);
+ return nvme_ctrl_ioctl(ns->ctrl, cmd, argp);
+ return nvme_ns_ioctl(ns, cmd, argp);
}
long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct nvme_ns *ns =
container_of(file_inode(file)->i_cdev, struct nvme_ns, cdev);
- bool open_for_write = file->f_mode & FMODE_WRITE;
void __user *argp = (void __user *)arg;
if (is_ctrl_ioctl(cmd))
- return nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write);
- return nvme_ns_ioctl(ns, cmd, argp, 0, open_for_write);
+ return nvme_ctrl_ioctl(ns->ctrl, cmd, argp);
+ return nvme_ns_ioctl(ns, cmd, argp);
}
static int nvme_uring_cmd_checks(unsigned int issue_flags)
@@ -792,8 +712,7 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
}
#ifdef CONFIG_NVME_MULTIPATH
static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
- void __user *argp, struct nvme_ns_head *head, int srcu_idx,
- bool open_for_write)
+ void __user *argp, struct nvme_ns_head *head, int srcu_idx)
__releases(&head->srcu)
{
struct nvme_ctrl *ctrl = ns->ctrl;
@@ -801,7 +720,7 @@ static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
nvme_get_ctrl(ns->ctrl);
srcu_read_unlock(&head->srcu, srcu_idx);
- ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write);
+ ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp);
nvme_put_ctrl(ctrl);
return ret;
@@ -811,14 +730,9 @@ int nvme_ns_head_ioctl(struct block_device *bdev, blk_mode_t mode,
unsigned int cmd, unsigned long arg)
{
struct nvme_ns_head *head = bdev->bd_disk->private_data;
- bool open_for_write = mode & BLK_OPEN_WRITE;
void __user *argp = (void __user *)arg;
struct nvme_ns *ns;
int srcu_idx, ret = -EWOULDBLOCK;
- unsigned int flags = 0;
-
- if (bdev_is_partition(bdev))
- flags |= NVME_IOCTL_PARTITION;
srcu_idx = srcu_read_lock(&head->srcu);
ns = nvme_find_path(head);
@@ -831,10 +745,9 @@ int nvme_ns_head_ioctl(struct block_device *bdev, blk_mode_t mode,
* deadlock when deleting namespaces using the passthrough interface.
*/
if (is_ctrl_ioctl(cmd))
- return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx,
- open_for_write);
+ return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);
- ret = nvme_ns_ioctl(ns, cmd, argp, flags, open_for_write);
+ ret = nvme_ns_ioctl(ns, cmd, argp);
out_unlock:
srcu_read_unlock(&head->srcu, srcu_idx);
return ret;
@@ -843,7 +756,6 @@ int nvme_ns_head_ioctl(struct block_device *bdev, blk_mode_t mode,
long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
- bool open_for_write = file->f_mode & FMODE_WRITE;
struct cdev *cdev = file_inode(file)->i_cdev;
struct nvme_ns_head *head =
container_of(cdev, struct nvme_ns_head, cdev);
@@ -857,10 +769,9 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
goto out_unlock;
if (is_ctrl_ioctl(cmd))
- return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx,
- open_for_write);
+ return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);
- ret = nvme_ns_ioctl(ns, cmd, argp, 0, open_for_write);
+ ret = nvme_ns_ioctl(ns, cmd, argp);
out_unlock:
srcu_read_unlock(&head->srcu, srcu_idx);
return ret;
@@ -909,8 +820,7 @@ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
return ret;
}
-static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
- bool open_for_write)
+static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
{
struct nvme_ns *ns;
int ret;
@@ -934,7 +844,7 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
kref_get(&ns->kref);
up_read(&ctrl->namespaces_rwsem);
- ret = nvme_user_cmd(ctrl, ns, argp, 0, open_for_write);
+ ret = nvme_user_cmd(ctrl, ns, argp);
nvme_put_ns(ns);
return ret;
@@ -946,17 +856,16 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
long nvme_dev_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
- bool open_for_write = file->f_mode & FMODE_WRITE;
struct nvme_ctrl *ctrl = file->private_data;
void __user *argp = (void __user *)arg;
switch (cmd) {
case NVME_IOCTL_ADMIN_CMD:
- return nvme_user_cmd(ctrl, NULL, argp, 0, open_for_write);
+ return nvme_user_cmd(ctrl, NULL, argp);
case NVME_IOCTL_ADMIN64_CMD:
- return nvme_user_cmd64(ctrl, NULL, argp, 0, open_for_write);
+ return nvme_user_cmd64(ctrl, NULL, argp, 0);
case NVME_IOCTL_IO_CMD:
- return nvme_dev_user_cmd(ctrl, argp, open_for_write);
+ return nvme_dev_user_cmd(ctrl, argp);
case NVME_IOCTL_RESET:
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 26dd3f859d9d..df3e2c619448 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -642,7 +642,6 @@ enum {
NVME_CMD_EFFECTS_CCC = 1 << 4,
NVME_CMD_EFFECTS_CSE_MASK = GENMASK(18, 16),
NVME_CMD_EFFECTS_UUID_SEL = 1 << 19,
- NVME_CMD_EFFECTS_SCOPE_MASK = GENMASK(31, 20),
};
struct nvme_effects_log {
@@ -834,7 +833,6 @@ enum nvme_opcode {
nvme_cmd_zone_mgmt_send = 0x79,
nvme_cmd_zone_mgmt_recv = 0x7a,
nvme_cmd_zone_append = 0x7d,
- nvme_cmd_vendor_start = 0x80,
};
#define nvme_opcode_name(opcode) { opcode, #opcode }
--
2.25.1
More information about the Linux-nvme
mailing list