[PATCH v3] nvme: fix memory corruption for passthrough metadata
Kanchan Joshi
joshi.k at samsung.com
Fri Oct 6 06:47:06 PDT 2023
User can specify a smaller meta buffer than what the device is
wired to update/access. Kernel makes a copy of the meta buffer into
which the device does DMA.
As a result, the device overwrites the unrelated kernel memory, causing
random kernel crashes.
Same issue is possible for extended-lba case also. When user specifies a
short unaligned buffer, the kernel makes a copy and uses that for DMA.
Detect these situations for sync/uring passthrough, and bail out.
Fixes: 456cba386e94 ("nvme: wire-up uring-cmd support for io-passthru on char-device")
Cc: stable at vger.kernel.org
Reported-by: Vincent Fu <vincent.fu at samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k at samsung.com>
---
Changes since v2:
- Handle extended-lba case: short unaligned buffer IO
- Reduce the scope of check to only well-known commands
- Do not check early. Move it deeper so that check gets executed less
often
- Combine two patches into one.
Changes since v1:
- Revise the check to exclude PRACT=1 case
Random crash example:
[ 6815.014478] general protection fault, probably for non-canonical address 0x70e3cdbe9133b7a6: 0000 [#1] PREEMPT SMP PTI
[ 6815.014505] CPU: 1 PID: 434 Comm: systemd-timesyn Tainted: G OE 6.4.0-rc3+ #5
[ 6815.014516] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014
[ 6815.014522] RIP: 0010:__kmem_cache_alloc_node+0x100/0x440
[ 6815.014551] Code: 48 85 c0 0f 84 fb 02 00 00 41 83 ff ff 74 10 48 8b 00 48 c1 e8 36 41 39 c7 0f 85 e5 02 00 00 41 8b 45 28 49 8b 7d 00 4c 01 e0 <48> 8b 18 48 89 c1 49 33 9d b8 00 00 00 4c 89 e0 48 0f c9 48 31 cb
[ 6815.014559] RSP: 0018:ffffb510c0577d18 EFLAGS: 00010216
[ 6815.014569] RAX: 70e3cdbe9133b7a6 RBX: ffff8a9ec1042300 RCX: 0000000000000010
[ 6815.014575] RDX: 00000000048b0001 RSI: 0000000000000dc0 RDI: 0000000000037060
[ 6815.014581] RBP: ffffb510c0577d58 R08: ffffffffb9ffa280 R09: 0000000000000000
[ 6815.014586] R10: ffff8a9ecbcab1f0 R11: 0000000000000000 R12: 70e3cdbe9133b79e
[ 6815.014591] R13: ffff8a9ec1042300 R14: 0000000000000dc0 R15: 00000000ffffffff
[ 6815.014597] FS: 00007fce590d6940(0000) GS:ffff8a9f3dd00000(0000) knlGS:0000000000000000
[ 6815.014604] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 6815.014609] CR2: 00005579abbb6498 CR3: 000000000d9b0000 CR4: 00000000000006e0
[ 6815.014622] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 6815.014627] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 6815.014632] Call Trace:
[ 6815.014650] <TASK>
[ 6815.014655] ? apparmor_sk_alloc_security+0x40/0x80
[ 6815.014673] kmalloc_trace+0x2a/0xa0
[ 6815.014684] apparmor_sk_alloc_security+0x40/0x80
[ 6815.014694] security_sk_alloc+0x3f/0x60
[ 6815.014703] sk_prot_alloc+0x75/0x110
[ 6815.014712] sk_alloc+0x31/0x200
[ 6815.014721] inet_create+0xd8/0x3a0
[ 6815.014734] __sock_create+0x11b/0x220
[ 6815.014749] __sys_socket_create.part.0+0x49/0x70
[ 6815.014756] ? __secure_computing+0x94/0xf0
[ 6815.014768] __sys_socket+0x3c/0xc0
[ 6815.014776] __x64_sys_socket+0x1a/0x30
[ 6815.014783] do_syscall_64+0x3b/0x90
[ 6815.014794] entry_SYSCALL_64_after_hwframe+0x72/0xdc
[ 6815.014804] RIP: 0033:0x7fce59aa795b
drivers/nvme/host/ioctl.c | 76 +++++++++++++++++++++++++++++++++++++++
1 file changed, 76 insertions(+)
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index d8ff796fd5f2..8147750beff4 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -158,6 +158,67 @@ static struct request *nvme_alloc_user_request(struct request_queue *q,
return req;
}
+static inline bool nvme_nlb_in_cdw12(u8 opcode)
+{
+ switch (opcode) {
+ case nvme_cmd_read:
+ case nvme_cmd_write:
+ case nvme_cmd_compare:
+ case nvme_cmd_zone_append:
+ return true;
+ }
+ return false;
+}
+
+static bool nvme_validate_passthru_meta(struct nvme_ns *ns,
+ struct nvme_command *c,
+ __u32 meta_len,
+ unsigned data_len)
+{
+ /*
+ * User may specify smaller meta-buffer with a larger data-buffer.
+ * Driver allocated meta buffer will also be small.
+ * Device can do larger dma into that, overwriting unrelated kernel
+ * memory.
+ * For extended lba case, if user provides a short unaligned buffer,
+ * the device will corrupt kernel memory.
+ * Avoid running into corruption in both situations.
+ */
+ bool ext_lba = ns->features & NVME_NS_EXT_LBAS;
+ u16 nlb, control;
+ unsigned dlen, mlen;
+
+ /* Exclude commands that do not have nlb in cdw12 */
+ if (!nvme_nlb_in_cdw12(c->common.opcode))
+ return true;
+
+ control = upper_16_bits(le32_to_cpu(c->common.cdw12));
+ /* Exclude when meta transfer from/to host is not done */
+ if (control & NVME_RW_PRINFO_PRACT && ns->ms == ns->pi_size)
+ return true;
+
+ nlb = lower_16_bits(le32_to_cpu(c->common.cdw12));
+ mlen = (nlb + 1) * ns->ms;
+
+ /* sanity for interleaved buffer */
+ if (ext_lba) {
+ dlen = (nlb + 1) << ns->lba_shift;
+ if (data_len < (dlen + mlen))
+ goto out_false;
+ return true;
+ }
+ /* sanity for separate meta buffer */
+ if (meta_len < mlen)
+ goto out_false;
+
+ return true;
+
+out_false:
+ dev_err(ns->ctrl->device,
+ "%s: metadata length is small!\n", current->comm);
+ return false;
+}
+
static int nvme_map_user_request(struct request *req, u64 ubuffer,
unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
u32 meta_seed, void **metap, struct io_uring_cmd *ioucmd,
@@ -194,6 +255,12 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
bio_set_dev(bio, bdev);
if (bdev && meta_buffer && meta_len) {
+ if (!nvme_validate_passthru_meta(ns, nvme_req(req)->cmd,
+ meta_len, bufflen)) {
+ ret = -EINVAL;
+ goto out_unmap;
+ }
+
meta = nvme_add_user_metadata(req, meta_buffer, meta_len,
meta_seed);
if (IS_ERR(meta)) {
@@ -203,6 +270,15 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
*metap = meta;
}
+ /* guard another case when kernel memory is being used */
+ if (bio->bi_private && ns && ns->features & NVME_NS_EXT_LBAS) {
+ if (!nvme_validate_passthru_meta(ns, nvme_req(req)->cmd,
+ meta_len, bufflen)) {
+ ret = -EINVAL;
+ goto out_unmap;
+ }
+ }
+
return ret;
out_unmap:
--
2.25.1
More information about the Linux-nvme
mailing list