[PATCH v3] nvme: fix memory corruption for passthrough metadata

Kanchan Joshi joshi.k at samsung.com
Fri Oct 6 06:47:06 PDT 2023


User can specify a smaller meta buffer than what the device is
wired to update/access. Kernel makes a copy of the meta buffer into
which the device does DMA.
As a result, the device overwrites the unrelated kernel memory, causing
random kernel crashes.

Same issue is possible for extended-lba case also. When user specifies a
short unaligned buffer, the kernel makes a copy and uses that for DMA.

Detect these situations for sync/uring passthrough, and bail out.

Fixes: 456cba386e94 ("nvme: wire-up uring-cmd support for io-passthru on char-device")
Cc: stable at vger.kernel.org

Reported-by: Vincent Fu <vincent.fu at samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k at samsung.com>
---

Changes since v2:
- Handle extended-lba case: short unaligned buffer IO
- Reduce the scope of check to only well-known commands
- Do not check early. Move it deeper so that check gets executed less
  often
- Combine two patches into one.

Changes since v1:
- Revise the check to exclude PRACT=1 case

Random crash example:

[ 6815.014478] general protection fault, probably for non-canonical address 0x70e3cdbe9133b7a6: 0000 [#1] PREEMPT SMP PTI
[ 6815.014505] CPU: 1 PID: 434 Comm: systemd-timesyn Tainted: G           OE      6.4.0-rc3+ #5
[ 6815.014516] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014
[ 6815.014522] RIP: 0010:__kmem_cache_alloc_node+0x100/0x440
[ 6815.014551] Code: 48 85 c0 0f 84 fb 02 00 00 41 83 ff ff 74 10 48 8b 00 48 c1 e8 36 41 39 c7 0f 85 e5 02 00 00 41 8b 45 28 49 8b 7d 00 4c 01 e0 <48> 8b 18 48 89 c1 49 33 9d b8 00 00 00 4c 89 e0 48 0f c9 48 31 cb
[ 6815.014559] RSP: 0018:ffffb510c0577d18 EFLAGS: 00010216
[ 6815.014569] RAX: 70e3cdbe9133b7a6 RBX: ffff8a9ec1042300 RCX: 0000000000000010
[ 6815.014575] RDX: 00000000048b0001 RSI: 0000000000000dc0 RDI: 0000000000037060
[ 6815.014581] RBP: ffffb510c0577d58 R08: ffffffffb9ffa280 R09: 0000000000000000
[ 6815.014586] R10: ffff8a9ecbcab1f0 R11: 0000000000000000 R12: 70e3cdbe9133b79e
[ 6815.014591] R13: ffff8a9ec1042300 R14: 0000000000000dc0 R15: 00000000ffffffff
[ 6815.014597] FS:  00007fce590d6940(0000) GS:ffff8a9f3dd00000(0000) knlGS:0000000000000000
[ 6815.014604] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 6815.014609] CR2: 00005579abbb6498 CR3: 000000000d9b0000 CR4: 00000000000006e0
[ 6815.014622] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 6815.014627] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 6815.014632] Call Trace:
[ 6815.014650]  <TASK>
[ 6815.014655]  ? apparmor_sk_alloc_security+0x40/0x80
[ 6815.014673]  kmalloc_trace+0x2a/0xa0
[ 6815.014684]  apparmor_sk_alloc_security+0x40/0x80
[ 6815.014694]  security_sk_alloc+0x3f/0x60
[ 6815.014703]  sk_prot_alloc+0x75/0x110
[ 6815.014712]  sk_alloc+0x31/0x200
[ 6815.014721]  inet_create+0xd8/0x3a0
[ 6815.014734]  __sock_create+0x11b/0x220
[ 6815.014749]  __sys_socket_create.part.0+0x49/0x70
[ 6815.014756]  ? __secure_computing+0x94/0xf0
[ 6815.014768]  __sys_socket+0x3c/0xc0
[ 6815.014776]  __x64_sys_socket+0x1a/0x30
[ 6815.014783]  do_syscall_64+0x3b/0x90
[ 6815.014794]  entry_SYSCALL_64_after_hwframe+0x72/0xdc
[ 6815.014804] RIP: 0033:0x7fce59aa795b

 drivers/nvme/host/ioctl.c | 76 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index d8ff796fd5f2..8147750beff4 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -158,6 +158,67 @@ static struct request *nvme_alloc_user_request(struct request_queue *q,
 	return req;
 }
 
+static inline bool nvme_nlb_in_cdw12(u8 opcode)
+{
+	switch (opcode) {
+	case nvme_cmd_read:
+	case nvme_cmd_write:
+	case nvme_cmd_compare:
+	case nvme_cmd_zone_append:
+		return true;
+	}
+	return false;
+}
+
+static bool nvme_validate_passthru_meta(struct nvme_ns *ns,
+					struct nvme_command *c,
+					__u32 meta_len,
+					unsigned data_len)
+{
+	/*
+	 * User may specify smaller meta-buffer with a larger data-buffer.
+	 * Driver allocated meta buffer will also be small.
+	 * Device can do larger dma into that, overwriting unrelated kernel
+	 * memory.
+	 * For extended lba case, if user provides a short unaligned buffer,
+	 * the device will corrupt kernel memory.
+	 * Avoid running into corruption in both situations.
+	 */
+	bool ext_lba = ns->features & NVME_NS_EXT_LBAS;
+	u16 nlb, control;
+	unsigned dlen, mlen;
+
+	/* Exclude commands that do not have nlb in cdw12 */
+	if (!nvme_nlb_in_cdw12(c->common.opcode))
+		return true;
+
+	control = upper_16_bits(le32_to_cpu(c->common.cdw12));
+	/* Exclude when meta transfer from/to host is not done */
+	if (control & NVME_RW_PRINFO_PRACT && ns->ms == ns->pi_size)
+		return true;
+
+	nlb = lower_16_bits(le32_to_cpu(c->common.cdw12));
+	mlen = (nlb + 1) * ns->ms;
+
+	/* sanity for interleaved buffer */
+	if (ext_lba) {
+		dlen = (nlb + 1) << ns->lba_shift;
+		if (data_len < (dlen + mlen))
+			goto out_false;
+		return true;
+	}
+	/* sanity for separate meta buffer */
+	if (meta_len < mlen)
+		goto out_false;
+
+	return true;
+
+out_false:
+	dev_err(ns->ctrl->device,
+		"%s: metadata length is small!\n", current->comm);
+	return false;
+}
+
 static int nvme_map_user_request(struct request *req, u64 ubuffer,
 		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
 		u32 meta_seed, void **metap, struct io_uring_cmd *ioucmd,
@@ -194,6 +255,12 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
 		bio_set_dev(bio, bdev);
 
 	if (bdev && meta_buffer && meta_len) {
+		if (!nvme_validate_passthru_meta(ns, nvme_req(req)->cmd,
+					meta_len, bufflen)) {
+			ret = -EINVAL;
+			goto out_unmap;
+		}
+
 		meta = nvme_add_user_metadata(req, meta_buffer, meta_len,
 				meta_seed);
 		if (IS_ERR(meta)) {
@@ -203,6 +270,15 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
 		*metap = meta;
 	}
 
+	/* guard another case when kernel memory is being used */
+	if (bio->bi_private && ns && ns->features & NVME_NS_EXT_LBAS) {
+		if (!nvme_validate_passthru_meta(ns, nvme_req(req)->cmd,
+					meta_len, bufflen)) {
+			ret = -EINVAL;
+			goto out_unmap;
+		}
+	}
+
 	return ret;
 
 out_unmap:
-- 
2.25.1





More information about the Linux-nvme mailing list