[PATCH] nvmet: allow associating port to a cgroup via configfs
Sagi Grimberg
sagi at grimberg.me
Tue Jun 27 14:13:18 PDT 2023
Hey Ofir,
> Currently there is no way to throttle nvme targets with cgroup v2.
How do you do it with v1?
> The IOs that the nvme target submits lack associating to a cgroup,
> which makes them act as root cgroup. The root cgroup can't be throttled
> with the cgroup v2 mechanism.
What happens with file or passthru backends? You only addressed
bdev. I don't see how this is sanely supported with files. It's
possible if you convert nvmet to use its own dedicated kthreads and
infer the cgroup from the kthread, but that presents a whole other set
of issues.
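Something along these lines is what I mean (rough, untested sketch;
nvmet_io_worker() is made up for illustration, reusing the port->blkcg
css from your patch):

#include <linux/kthread.h>
#include <linux/cgroup.h>

static int nvmet_io_worker(void *data)
{
	struct nvmet_port *port = data;

	/*
	 * From here on, everything this kthread submits (including
	 * the file backend's buffered I/O path) is charged to
	 * port->blkcg instead of the root cgroup.
	 */
	kthread_associate_blkcg(port->blkcg);

	while (!kthread_should_stop()) {
		/* dequeue and execute pending nvmet requests */
	}

	kthread_associate_blkcg(NULL);
	return 0;
}

But then nvmet needs per-port worker pools, queueing, lifetime
handling etc.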
Maybe the cleanest way to do something like this is to implement a
full-blown nvmet cgroup controller that you can apply a whole set of
resources to, in addition to I/O.
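To give a flavor of what I mean (completely hypothetical, every name
below is invented):

#include <linux/cgroup.h>
#include <linux/slab.h>

struct nvmet_cg {
	struct cgroup_subsys_state css;
	u64 io_max;	/* per-cgroup nvmet I/O limit */
	/* other nvmet resources: queues, inflight commands, ... */
};

static struct cftype nvmet_cg_files[] = {
	/* "nvmet.io.max" and friends would go here */
	{ }	/* terminate */
};

static struct cgroup_subsys_state *
nvmet_cg_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct nvmet_cg *cg = kzalloc(sizeof(*cg), GFP_KERNEL);

	return cg ? &cg->css : ERR_PTR(-ENOMEM);
}

static void nvmet_cg_css_free(struct cgroup_subsys_state *css)
{
	kfree(container_of(css, struct nvmet_cg, css));
}

struct cgroup_subsys nvmet_cgrp_subsys = {
	.css_alloc	= nvmet_cg_css_alloc,
	.css_free	= nvmet_cg_css_free,
	.dfl_cftypes	= nvmet_cg_files,
};

(plus a SUBSYS(nvmet) entry in include/linux/cgroup_subsys.h).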
> Signed-off-by: Ofir Gal <ofir.gal at volumez.com>
> ---
> drivers/nvme/target/configfs.c | 77 +++++++++++++++++++++++++++++++
> drivers/nvme/target/core.c | 3 ++
> drivers/nvme/target/io-cmd-bdev.c | 13 ++++++
> drivers/nvme/target/nvmet.h | 3 ++
> include/linux/cgroup.h | 5 ++
> kernel/cgroup/cgroup-internal.h | 5 --
Don't mix cgroup and nvmet changes in the same patch.
> 6 files changed, 101 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
> index 907143870da5..2e8f93a07498 100644
> --- a/drivers/nvme/target/configfs.c
> +++ b/drivers/nvme/target/configfs.c
> @@ -12,6 +12,7 @@
> #include <linux/ctype.h>
> #include <linux/pci.h>
> #include <linux/pci-p2pdma.h>
> +#include <linux/cgroup.h>
> #ifdef CONFIG_NVME_TARGET_AUTH
> #include <linux/nvme-auth.h>
> #endif
> @@ -281,6 +282,81 @@ static ssize_t nvmet_param_pi_enable_store(struct config_item *item,
> CONFIGFS_ATTR(nvmet_, param_pi_enable);
> #endif
>
> +static ssize_t nvmet_param_associated_cgroup_show(struct config_item *item,
> + char *page)
> +{
> + struct nvmet_port *port = to_nvmet_port(item);
> + ssize_t len = 0;
> + ssize_t retval;
> + char *suffix;
> +
> + /* No cgroup set means the IOs are associated with the root cgroup */
> + if (!port->cgrp)
> + goto root_cgroup;
> +
> + retval = cgroup_path_ns(port->cgrp, page, PAGE_SIZE,
> + current->nsproxy->cgroup_ns);
> + if (retval >= PATH_MAX || retval >= PAGE_SIZE)
> + return -ENAMETOOLONG;
> +
> + /* No cgroup found means the IOs are associated with the root cgroup */
> + if (retval < 0)
> + goto root_cgroup;
> +
> + len += retval;
> +
> + suffix = cgroup_is_dead(port->cgrp) ? " (deleted)\n" : "\n";
> + len += snprintf(page + len, PAGE_SIZE - len, "%s", suffix);
> +
> + return len;
> +
> +root_cgroup:
> + return snprintf(page, PAGE_SIZE, "/\n");
> +}
> +
> +static ssize_t nvmet_param_associated_cgroup_store(struct config_item *item,
> + const char *page, size_t count)
> +{
> + struct nvmet_port *port = to_nvmet_port(item);
> + struct cgroup_subsys_state *blkcg;
> + ssize_t retval = -EINVAL;
> + struct cgroup *cgrp;
> + char *path;
> + int len;
> +
> + len = strcspn(page, "\n");
> + if (!len)
> + return -EINVAL;
> +
> + path = kmemdup_nul(page, len, GFP_KERNEL);
> + if (!path)
> + return -ENOMEM;
> +
> + cgrp = cgroup_get_from_path(path);
> + kfree(path);
> + if (IS_ERR(cgrp))
> + return -ENOENT;
> +
> + blkcg = cgroup_get_e_css(cgrp, &io_cgrp_subsys);
> + if (!blkcg)
> + goto out_put_cgroup;
> +
> + /* Put old cgroup */
> + if (port->cgrp)
> + cgroup_put(port->cgrp);
> +
> + port->cgrp = cgrp;
> + port->blkcg = blkcg;
> +
> + return count;
> +
> +out_put_cgroup:
> + cgroup_put(cgrp);
> + return retval;
> +}
I'm not at all convinced that nvmet rate limiting does not
require a dedicated cgroup controller... Regardless, this doesn't
look like a port attribute, it's a subsystem attribute.
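i.e. roughly (illustrative only):

static struct configfs_attribute *nvmet_subsys_attrs[] = {
	/* existing subsystem attrs ... */
	&nvmet_attr_param_associated_cgroup,
	NULL,
};

with the cgroup reference living in struct nvmet_subsys instead of
struct nvmet_port.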
> +
> +CONFIGFS_ATTR(nvmet_, param_associated_cgroup);
> +
> static ssize_t nvmet_addr_trtype_show(struct config_item *item,
> char *page)
> {
> @@ -1742,6 +1818,7 @@ static struct configfs_attribute *nvmet_port_attrs[] = {
> &nvmet_attr_addr_trsvcid,
> &nvmet_attr_addr_trtype,
> &nvmet_attr_param_inline_data_size,
> + &nvmet_attr_param_associated_cgroup,
> #ifdef CONFIG_BLK_DEV_INTEGRITY
> &nvmet_attr_param_pi_enable,
> #endif
> diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
> index 3935165048e7..b63996b61c6d 100644
> --- a/drivers/nvme/target/core.c
> +++ b/drivers/nvme/target/core.c
> @@ -376,6 +376,9 @@ void nvmet_disable_port(struct nvmet_port *port)
> port->enabled = false;
> port->tr_ops = NULL;
>
> + if (port->cgrp)
> + cgroup_put(port->cgrp);
> +
> ops = nvmet_transports[port->disc_addr.trtype];
> ops->remove_port(port);
> module_put(ops->owner);
> diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
> index c2d6cea0236b..eb63a071131d 100644
> --- a/drivers/nvme/target/io-cmd-bdev.c
> +++ b/drivers/nvme/target/io-cmd-bdev.c
> @@ -8,6 +8,8 @@
> #include <linux/blk-integrity.h>
> #include <linux/memremap.h>
> #include <linux/module.h>
> +#include <linux/cgroup.h>
> +#include <linux/blk-cgroup.h>
> #include "nvmet.h"
>
> void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id)
> @@ -285,6 +287,8 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
> bio->bi_iter.bi_sector = sector;
> bio->bi_private = req;
> bio->bi_end_io = nvmet_bio_done;
> + if (req->port->blkcg)
> + bio_associate_blkg_from_css(bio, req->port->blkcg);
>
> blk_start_plug(&plug);
> if (req->metadata_len)
> @@ -308,6 +312,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
> bio = bio_alloc(req->ns->bdev, bio_max_segs(sg_cnt),
> opf, GFP_KERNEL);
> bio->bi_iter.bi_sector = sector;
> + bio_clone_blkg_association(bio, prev);
>
> bio_chain(bio, prev);
> submit_bio(prev);
> @@ -345,6 +350,8 @@ static void nvmet_bdev_execute_flush(struct nvmet_req *req)
> ARRAY_SIZE(req->inline_bvec), REQ_OP_WRITE | REQ_PREFLUSH);
> bio->bi_private = req;
> bio->bi_end_io = nvmet_bio_done;
> + if (req->port->blkcg)
> + bio_associate_blkg_from_css(bio, req->port->blkcg);
>
> submit_bio(bio);
> }
> @@ -397,6 +404,9 @@ static void nvmet_bdev_execute_discard(struct nvmet_req *req)
> if (bio) {
> bio->bi_private = req;
> bio->bi_end_io = nvmet_bio_done;
> + if (req->port->blkcg)
> + bio_associate_blkg_from_css(bio, req->port->blkcg);
> +
> if (status)
> bio_io_error(bio);
> else
> @@ -444,6 +454,9 @@ static void nvmet_bdev_execute_write_zeroes(struct nvmet_req *req)
> if (bio) {
> bio->bi_private = req;
> bio->bi_end_io = nvmet_bio_done;
> + if (req->port->blkcg)
> + bio_associate_blkg_from_css(bio, req->port->blkcg);
> +
> submit_bio(bio);
> } else {
> nvmet_req_complete(req, errno_to_nvme_status(req, ret));
> diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
> index dc60a22646f7..3e5c9737d07e 100644
> --- a/drivers/nvme/target/nvmet.h
> +++ b/drivers/nvme/target/nvmet.h
> @@ -20,6 +20,7 @@
> #include <linux/blkdev.h>
> #include <linux/radix-tree.h>
> #include <linux/t10-pi.h>
> +#include <linux/cgroup.h>
>
> #define NVMET_DEFAULT_VS NVME_VS(1, 3, 0)
>
> @@ -163,6 +164,8 @@ struct nvmet_port {
> int inline_data_size;
> const struct nvmet_fabrics_ops *tr_ops;
> bool pi_enable;
> + struct cgroup *cgrp;
> + struct cgroup_subsys_state *blkcg;
> };
>
> static inline struct nvmet_port *to_nvmet_port(struct config_item *item)
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 885f5395fcd0..47e2a7cdc31e 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -562,6 +562,11 @@ static inline bool cgroup_is_populated(struct cgroup *cgrp)
> cgrp->nr_populated_threaded_children;
> }
>
> +static inline bool cgroup_is_dead(const struct cgroup *cgrp)
> +{
> + return !(cgrp->self.flags & CSS_ONLINE);
> +}
> +
> /* returns ino associated with a cgroup */
> static inline ino_t cgroup_ino(struct cgroup *cgrp)
> {
> diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
> index 367b0a42ada9..8c5c83e9edd7 100644
> --- a/kernel/cgroup/cgroup-internal.h
> +++ b/kernel/cgroup/cgroup-internal.h
> @@ -181,11 +181,6 @@ extern struct list_head cgroup_roots;
> for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
> (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
>
> -static inline bool cgroup_is_dead(const struct cgroup *cgrp)
> -{
> - return !(cgrp->self.flags & CSS_ONLINE);
> -}
> -
> static inline bool notify_on_release(const struct cgroup *cgrp)
> {
> return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);