[PATCH v9 1/1] nvmet: support reservation feature

Dmitry Bogdanov d.bogdanov at yadro.com
Mon Sep 23 09:32:28 PDT 2024


> This patch implements the reservation feature, including:
> 1. reservation register(register, unregister and replace).
> 2. reservation acquire(acquire, preempt, preempt and abort).
> 3. reservation release(release and clear).
> 4. reservation report.
> 5. set feature and get feature of reservation notify mask.
> 6. get log page of reservation event.
> 
> Also make the reservation feature configurable: one can set a ns to support
> reservation before enabling the ns. The default of resv_enable is false.
> 
> Signed-off-by: Guixin Liu <kanie at linux.alibaba.com>
> ---
>  drivers/nvme/target/Makefile    |    2 +-
>  drivers/nvme/target/admin-cmd.c |   24 +-
>  drivers/nvme/target/configfs.c  |   27 +
>  drivers/nvme/target/core.c      |   58 +-
>  drivers/nvme/target/nvmet.h     |   49 ++
>  drivers/nvme/target/pr.c        | 1214 +++++++++++++++++++++++++++++++
>  include/linux/nvme.h            |   54 ++
>  7 files changed, 1419 insertions(+), 9 deletions(-)
>  create mode 100644 drivers/nvme/target/pr.c
...
> +
> +static void nvmet_pr_unregister_one(struct nvmet_pr *pr,
> +                                   struct nvmet_pr_registrant *reg)
> +{
> +       struct nvmet_pr_registrant *first_reg;
> +       struct nvmet_pr_registrant *holder;
> +       u8 original_rtype;
> +
> +       lockdep_assert_held(&pr->pr_lock);
> +       list_del_rcu(&reg->entry);
> +
> +       holder = rcu_dereference_protected(pr->holder,
> +                       lockdep_is_held(&pr->pr_lock));
> +       if (reg != holder)
> +               goto out;
> +
> +       original_rtype = holder->rtype;
> +       if (original_rtype == NVME_PR_WRITE_EXCLUSIVE_ALL_REGS ||
> +           original_rtype == NVME_PR_EXCLUSIVE_ACCESS_ALL_REGS) {
> +               first_reg = list_first_or_null_rcu(&pr->registrant_list,
> +                               struct nvmet_pr_registrant, entry);
> +               if (first_reg)
> +                       first_reg->rtype = original_rtype;
> +               rcu_assign_pointer(pr->holder, first_reg);
> +       } else {
> +               rcu_assign_pointer(pr->holder, NULL);
> +
> +               if (original_rtype == NVME_PR_WRITE_EXCLUSIVE_REG_ONLY ||
> +                   original_rtype == NVME_PR_WRITE_EXCLUSIVE_REG_ONLY)

Copy&paste error? Both sides of the || compare against
NVME_PR_WRITE_EXCLUSIVE_REG_ONLY; the second check should presumably be
against NVME_PR_EXCLUSIVE_ACCESS_REG_ONLY.
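If that is indeed the intent, the condition would look something like this
(sketch only, assuming no other rtypes need the released notification here):

	if (original_rtype == NVME_PR_WRITE_EXCLUSIVE_REG_ONLY ||
	    original_rtype == NVME_PR_EXCLUSIVE_ACCESS_REG_ONLY)
		nvmet_pr_resv_released(pr, &reg->hostid);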

> +                       nvmet_pr_resv_released(pr, &reg->hostid);
> +       }
> +out:
> +       synchronize_rcu();
> +       kfree(reg);
> +}
> +
> +static u16 nvmet_pr_unregister(struct nvmet_req *req,
> +                              struct nvmet_pr_register_data *d,
> +                              bool ignore_key)
> +{
> +       u16 status = NVME_SC_RESERVATION_CONFLICT | NVME_SC_DNR;
> +       struct nvmet_ctrl *ctrl = req->sq->ctrl;
> +       struct nvmet_pr *pr = &req->ns->pr;
> +       struct nvmet_pr_registrant *reg;
> +
> +       mutex_lock(&pr->pr_lock);
> +       list_for_each_entry_rcu(reg, &pr->registrant_list, entry,
> +                               lockdep_is_held(&pr->pr_lock)) {
> +               if (uuid_equal(&reg->hostid, &ctrl->hostid)) {
> +                       if (ignore_key || reg->rkey == le64_to_cpu(d->crkey)) {
> +                               status = NVME_SC_SUCCESS;
> +                               nvmet_pr_unregister_one(pr, reg);
> +                       }
> +                       break;
> +               }
> +       }
> +       mutex_unlock(&pr->pr_lock);
> +
> +       return status;
> +}
> +
> +static void nvmet_pr_update_reg_rkey(struct nvmet_pr_registrant *reg,
> +                                    void *attr)
> +{
> +       reg->rkey = *(u64 *)attr;
> +}
> +
> +static u16 nvmet_pr_update_reg_attr(struct nvmet_pr *pr,
> +                                   struct nvmet_pr_registrant *reg,
> +                                   void (*change_attr)(struct nvmet_pr_registrant *reg,
> +                                       void *attr),
> +                                   void *attr)
> +{
> +       struct nvmet_pr_registrant *holder;
> +       struct nvmet_pr_registrant *new;
> +
> +       lockdep_assert_held(&pr->pr_lock);
> +       holder = rcu_dereference_protected(pr->holder,
> +                       lockdep_is_held(&pr->pr_lock));
> +       if (reg != holder) {
> +               change_attr(reg, attr);
> +               return NVME_SC_SUCCESS;
> +       }
> +
> +       new = kmalloc(sizeof(*new), GFP_ATOMIC);
> +       if (!new)
> +               return NVME_SC_INTERNAL;
> +
> +       new->rkey = holder->rkey;
> +       new->rtype = holder->rtype;
> +       uuid_copy(&new->hostid, &holder->hostid);
> +       new->cntlid = holder->cntlid;
> +       INIT_LIST_HEAD(&new->entry);
> +
> +       change_attr(new, attr);
> +       list_replace_rcu(&holder->entry, &new->entry);
> +       rcu_assign_pointer(pr->holder, new);
> +       synchronize_rcu();
> +       kfree(holder);
> +
> +       return NVME_SC_SUCCESS;
> +}
> +
> +static u16 nvmet_pr_replace(struct nvmet_req *req,
> +                           struct nvmet_pr_register_data *d,
> +                           bool ignore_key)
> +{
> +       u16 status = NVME_SC_RESERVATION_CONFLICT | NVME_SC_DNR;
> +       struct nvmet_ctrl *ctrl = req->sq->ctrl;
> +       struct nvmet_pr *pr = &req->ns->pr;
> +       struct nvmet_pr_registrant *reg;
> +       u64 nrkey = le64_to_cpu(d->nrkey);
> +
> +       mutex_lock(&pr->pr_lock);
> +       list_for_each_entry_rcu(reg, &pr->registrant_list, entry,
> +                               lockdep_is_held(&pr->pr_lock)) {
> +               if (uuid_equal(&reg->hostid, &ctrl->hostid)) {
> +                       if (ignore_key || reg->rkey == le64_to_cpu(d->crkey))
> +                               status = nvmet_pr_update_reg_attr(pr, reg,
> +                                               nvmet_pr_update_reg_rkey,
> +                                               &nrkey);
> +                       break;
> +               }
> +       }
> +       mutex_unlock(&pr->pr_lock);
> +       return status;
> +}
> +
> +static void nvmet_execute_pr_register(struct nvmet_req *req)
> +{
> +       u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
> +       bool ignore_key = (bool)((cdw10 >> 3) & 1); /* Ignore existing key, bit 03 */
> +       struct nvmet_pr_register_data *d;
> +       u8 reg_act = cdw10 & 0x07; /* Reservation Register Action, bit 02:00 */
> +       u16 status;
> +
> +       d = kmalloc(sizeof(*d), GFP_KERNEL);
> +       if (!d) {
> +               status = NVME_SC_INTERNAL;
> +               goto out;
> +       }
> +
> +       status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d));
> +       if (status)
> +               goto free_data;
> +
> +       switch (reg_act) {
> +       case NVME_PR_REGISTER_ACT_REG:
> +               status = nvmet_pr_register(req, d);
> +               break;
> +       case NVME_PR_REGISTER_ACT_UNREG:
> +               status = nvmet_pr_unregister(req, d, ignore_key);
> +               break;
> +       case NVME_PR_REGISTER_ACT_REPLACE:
> +               status = nvmet_pr_replace(req, d, ignore_key);
> +               break;
> +       default:
> +               req->error_loc = offsetof(struct nvme_common_command, cdw10);
> +               status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
> +               break;
> +       }
> +free_data:
> +       kfree(d);
> +out:
> +       if (!status)
> +               atomic_inc(&req->ns->pr.generation);
> +       nvmet_req_complete(req, status);
> +}
> +
> +static u16 nvmet_pr_acquire(struct nvmet_req *req,
> +                           struct nvmet_pr_registrant *reg,
> +                           u8 rtype)
> +{
> +       struct nvmet_pr *pr = &req->ns->pr;
> +       struct nvmet_pr_registrant *holder;
> +
> +       lockdep_assert_held(&pr->pr_lock);
> +       holder = rcu_dereference_protected(pr->holder,
> +                       lockdep_is_held(&pr->pr_lock));
> +       if (holder && reg != holder)
> +               return  NVME_SC_RESERVATION_CONFLICT | NVME_SC_DNR;
> +       if (holder && reg == holder) {
> +               if (holder->rtype == rtype)
> +                       return NVME_SC_SUCCESS;
> +               return NVME_SC_RESERVATION_CONFLICT | NVME_SC_DNR;
> +       }
> +
> +       nvmet_pr_set_new_holder(pr, rtype, reg);
> +       return NVME_SC_SUCCESS;
> +}
> +
> +static void nvmet_pr_set_ctrl_to_abort(struct nvmet_req *req, u16 cntlid)
> +{
> +       struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
> +       struct nvmet_ctrl *ctrl = NULL;
> +
> +       mutex_lock(&subsys->lock);
> +       list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
> +               if (ctrl->cntlid == cntlid) {
> +                       if (!kref_get_unless_zero(&ctrl->ref))
> +                               break;
> +                       ctrl->pr_abort = true;
> +                       break;
> +               }
> +       }
> +       mutex_unlock(&subsys->lock);
> +}
> +
> +static u16 nvmet_pr_unreg_all_host_by_prkey(struct nvmet_req *req, u64 prkey,
> +                                           uuid_t *send_hostid,
> +                                           bool abort)
> +{
> +       u16 status = NVME_SC_RESERVATION_CONFLICT | NVME_SC_DNR;
> +       struct nvmet_pr_registrant *reg, *tmp;
> +       struct nvmet_pr *pr = &req->ns->pr;
> +       uuid_t hostid;
> +
> +       lockdep_assert_held(&pr->pr_lock);
> +
> +       list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) {
> +               if (reg->rkey == prkey) {
> +                       status = NVME_SC_SUCCESS;
> +                       uuid_copy(&hostid, &reg->hostid);
> +                       if (abort)
> +                               nvmet_pr_set_ctrl_to_abort(req, reg->cntlid);
> +                       nvmet_pr_unregister_one(pr, reg);
> +                       if (!uuid_equal(&hostid, send_hostid))
> +                               nvmet_pr_registration_preempted(pr, &hostid);
> +               }
> +       }
> +       return status;
> +}
> +
> +static u16 nvmet_pr_unreg_all_others_by_prkey(struct nvmet_req *req,
> +                                             u64 prkey,
> +                                             uuid_t *send_hostid,
> +                                             bool abort)
> +{
> +       u16 status = NVME_SC_RESERVATION_CONFLICT | NVME_SC_DNR;
> +       struct nvmet_pr_registrant *reg, *tmp;
> +       struct nvmet_pr *pr = &req->ns->pr;
> +       uuid_t hostid;
> +
> +       lockdep_assert_held(&pr->pr_lock);
> +
> +       list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) {
> +               if (reg->rkey == prkey &&
> +                   !uuid_equal(&reg->hostid, send_hostid)) {
> +                       status = NVME_SC_SUCCESS;
> +                       uuid_copy(&hostid, &reg->hostid);
> +                       if (abort)
> +                               nvmet_pr_set_ctrl_to_abort(req, reg->cntlid);
> +                       nvmet_pr_unregister_one(pr, reg);
> +                       nvmet_pr_registration_preempted(pr, &hostid);
> +               }
> +       }
> +       return status;
> +}
> +
> +static void nvmet_pr_unreg_all_others(struct nvmet_req *req,
> +                                     uuid_t *send_hostid,
> +                                     bool abort)
> +{
> +       struct nvmet_pr_registrant *reg, *tmp;
> +       struct nvmet_pr *pr = &req->ns->pr;
> +       uuid_t hostid;
> +
> +       lockdep_assert_held(&pr->pr_lock);
> +
> +       list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) {
> +               if (!uuid_equal(&reg->hostid, send_hostid)) {
> +                       uuid_copy(&hostid, &reg->hostid);
> +                       if (abort)
> +                               nvmet_pr_set_ctrl_to_abort(req, reg->cntlid);
> +                       nvmet_pr_unregister_one(pr, reg);
> +                       nvmet_pr_registration_preempted(pr, &hostid);
> +               }
> +       }
> +}
> +
> +static void nvmet_pr_update_holder_rtype(struct nvmet_pr_registrant *reg,
> +                                        void *attr)
> +{
> +       u8 new_rtype = *(u8 *)attr;
> +
> +       reg->rtype = new_rtype;
> +}
> +
> +static u16 nvmet_pr_preempt(struct nvmet_req *req,
> +                           struct nvmet_pr_registrant *reg,
> +                           u8 rtype,
> +                           struct nvmet_pr_acquire_data *d,
> +                           bool abort)
> +{
> +       struct nvmet_ctrl *ctrl = req->sq->ctrl;
> +       struct nvmet_pr *pr = &req->ns->pr;
> +       struct nvmet_pr_registrant *holder;
> +       enum nvme_pr_type original_rtype;
> +       u64 prkey = le64_to_cpu(d->prkey);
> +       u16 status;
> +
> +       lockdep_assert_held(&pr->pr_lock);
> +       holder = rcu_dereference_protected(pr->holder,
> +                       lockdep_is_held(&pr->pr_lock));
> +       if (!holder)
> +               return nvmet_pr_unreg_all_host_by_prkey(req, prkey,
> +                                       &ctrl->hostid, abort);
> +
> +       original_rtype = holder->rtype;
> +       if (original_rtype == NVME_PR_WRITE_EXCLUSIVE_ALL_REGS ||
> +           original_rtype == NVME_PR_EXCLUSIVE_ACCESS_ALL_REGS) {
> +               if (!prkey) {
> +                       nvmet_pr_unreg_all_others(req, &ctrl->hostid, abort);
> +                       nvmet_pr_set_new_holder(pr, rtype, reg);

You did not address the reservation atomicity here. You still remove the old
holder first and only set the new holder some time later. In that time gap an
incoming WRITE command from a non-holder will reach the media.
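One way to close that window (just a sketch, it would need checking against
how nvmet_pr_unregister_one re-derives the holder) could be to publish the
new holder before tearing down the preempted registrants, so RCU readers in
the I/O path never observe a moment without a holder:

	/* Sketch: install the new holder first, then drop the others. */
	nvmet_pr_set_new_holder(pr, rtype, reg);
	nvmet_pr_unreg_all_others(req, &ctrl->hostid, abort);
	return NVME_SC_SUCCESS;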

> +                       return NVME_SC_SUCCESS;
> +               }
> +               return nvmet_pr_unreg_all_host_by_prkey(req, prkey,
> +                               &ctrl->hostid, abort);
> +       }
> +
> +       if (holder == reg) {
> +               status = nvmet_pr_update_reg_attr(pr, holder,
> +                               nvmet_pr_update_holder_rtype, &rtype);
> +               if (!status && original_rtype != rtype)
> +                       nvmet_pr_resv_released(pr, &reg->hostid);
> +               return status;
> +       }
> +
> +       if (prkey == holder->rkey) {
> +               status = nvmet_pr_unreg_all_others_by_prkey(req, prkey,
> +                                       &ctrl->hostid, abort);

And here, too, the same time gap exists while the reservation is released.

> +               if (status)
> +                       return status;
> +
> +               nvmet_pr_set_new_holder(pr, rtype, reg);
> +               if (original_rtype != rtype)
> +                       nvmet_pr_resv_released(pr, &reg->hostid);

This function may already have been invoked from nvmet_pr_unregister_one,
so the reservation-released notification could be sent twice here.

> +               return status;
> +       }
> +
> +       if (prkey)
> +               return nvmet_pr_unreg_all_host_by_prkey(req, prkey,
> +                                       &ctrl->hostid, abort);
> +       return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
> +}
> +
> +static void nvmet_pr_confirm_ns_pc_ref(struct percpu_ref *ref)
> +{
> +       struct nvmet_pr_per_ctrl_ref *pc_ref =
> +               container_of(ref, struct nvmet_pr_per_ctrl_ref, ref);
> +
> +       complete(&pc_ref->confirm_done);
> +}
> +
> +static void nvmet_pr_do_abort(struct nvmet_req *req)
> +{
> +       struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
> +       struct nvmet_pr_per_ctrl_ref *pc_ref;
> +       struct nvmet_ctrl *ctrl = NULL;
> +
> +find_next:
> +       mutex_lock(&subsys->lock);
> +       list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
> +               if (ctrl->pr_abort)
> +                       goto do_abort;
> +       }
> +       mutex_unlock(&subsys->lock);
> +       return;
> +
> +do_abort:
> +       mutex_unlock(&subsys->lock);
> +       /*
> +        * The target dose not support abort, just wait ref to 0.
typo "dose"->"does"

> +        */
> +       pc_ref = xa_load(&req->ns->pr_per_ctrl_refs, ctrl->cntlid);
> +       percpu_ref_kill_and_confirm(&pc_ref->ref, nvmet_pr_confirm_ns_pc_ref);
> +       wait_for_completion(&pc_ref->confirm_done);
> +       wait_for_completion(&pc_ref->free_done);
> +       reinit_completion(&pc_ref->confirm_done);
> +       reinit_completion(&pc_ref->free_done);
> +       percpu_ref_resurrect(&pc_ref->ref);
> +
> +       ctrl->pr_abort = false;

The pr_abort flag is per controller, but this function is per namespace.
The use case of preempt-and-abort is to preempt the reservation of a "zombie"
host for all namespaces. That means a preempt-and-abort command will be sent
for each namespace, and there will be several parallel abort processes
touching the same controllers; you should not mix them up.

Maybe doing a ref kill instead of setting ctrl->pr_abort = true would set the
barrier for incoming IO more accurately. You could use the mark feature of
XArray to mark the pc_refs whose completion must be waited for, and then walk
them with xa_for_each_marked().
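A rough sketch of that idea (untested; XA_MARK_1 is an arbitrary choice, and
the marking would have to happen where the registrant is preempted, under
pr_lock):

	struct nvmet_pr_per_ctrl_ref *pc_ref;
	unsigned long idx;

	/* While preempting: remember which controllers need an abort. */
	xa_set_mark(&req->ns->pr_per_ctrl_refs, reg->cntlid, XA_MARK_1);

	/* Later, walk only the marked entries instead of scanning
	 * subsys->ctrls for a per-controller pr_abort flag. */
	xa_for_each_marked(&req->ns->pr_per_ctrl_refs, idx, pc_ref, XA_MARK_1) {
		percpu_ref_kill_and_confirm(&pc_ref->ref,
					    nvmet_pr_confirm_ns_pc_ref);
		wait_for_completion(&pc_ref->confirm_done);
		wait_for_completion(&pc_ref->free_done);
		reinit_completion(&pc_ref->confirm_done);
		reinit_completion(&pc_ref->free_done);
		percpu_ref_resurrect(&pc_ref->ref);
		xa_clear_mark(&req->ns->pr_per_ctrl_refs, idx, XA_MARK_1);
	}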

> +       nvmet_ctrl_put(ctrl);
> +       goto find_next;
> +}
> +
> +static u16 __nvmet_execute_pr_acquire(struct nvmet_req *req,
> +                                     struct nvmet_pr_registrant *reg,
> +                                     u8 acquire_act,
> +                                     u8 rtype,
> +                                     struct nvmet_pr_acquire_data *d)
> +{
> +       u16 status;
> +
> +       switch (acquire_act) {
> +       case NVME_PR_ACQUIRE_ACT_ACQUIRE:
> +               status = nvmet_pr_acquire(req, reg, rtype);
> +               goto out;
> +       case NVME_PR_ACQUIRE_ACT_PREEMPT:
> +               status = nvmet_pr_preempt(req, reg, rtype, d, false);
> +               goto inc_gen;
> +       case NVME_PR_ACQUIRE_ACT_PREEMPT_AND_ABORT:
> +               status = nvmet_pr_preempt(req, reg, rtype, d, true);
> +               if (!status)
> +                       nvmet_pr_do_abort(req);
> +               goto inc_gen;
> +       default:
> +               req->error_loc = offsetof(struct nvme_common_command, cdw10);
> +               status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
> +               goto out;
> +       }
> +inc_gen:
> +       if (!status)
> +               atomic_inc(&req->ns->pr.generation);
> +out:
> +       return status;
> +}
> +


