[PATCH v8 1/1] nvmet: support reservation feature

Guixin Liu kanie at linux.alibaba.com
Wed Sep 18 01:26:10 PDT 2024


Hi Dmitry,

     Thanks for your advice, I will get back to working on this soon.


On 2024/9/17 16:43, Dmitry Bogdanov wrote:
> Hi,
>
> After waiting half a year for a final solution, we have taken this patch as is.
> Here are my comments on it. I hope this brings the patch back to life.
>
> On Thu, Feb 29, 2024 at 11:12:41AM +0800, Guixin Liu wrote:
>> This patch implements the reservation feature, including:
>> 1. reservation register (register, unregister and replace).
>> 2. reservation acquire (acquire, preempt, preempt and abort).
>> 3. reservation release (release and clear).
>> 4. reservation report.
>>
>> It also makes reservations configurable: one can set a namespace to
>> support reservations before enabling the namespace. The default of
>> resv_enable is false.
>>
>> Signed-off-by: Guixin Liu <kanie at linux.alibaba.com>
>> ---
>> @@ -550,7 +556,13 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
>>           */
>>          id->nmic = NVME_NS_NMIC_SHARED;
>>          id->anagrpid = cpu_to_le32(req->ns->anagrpid);
>> -
>> +       id->rescap = NVME_PR_SUPPORT_WRITE_EXCLUSIVE |
>> +                    NVME_PR_SUPPORT_EXCLUSIVE_ACCESS |
>> +                    NVME_PR_SUPPORT_WRITE_EXCLUSIVE_REG_ONLY |
>> +                    NVME_PR_SUPPORT_EXCLUSIVE_ACCESS_REG_ONLY |
>> +                    NVME_PR_SUPPORT_WRITE_EXCLUSIVE_ALL_REGS |
>> +                    NVME_PR_SUPPORT_EXCLUSIVE_ACCESS_ALL_REGS |
>> +                    NVME_PR_SUPPORT_IEKEY_VER_1_3_DEF;
> You have a flag enabling PR on a namespace, so report rescap only if it
> is enabled.
Sure, I will add an "if" statement.
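
Something along these lines (just a sketch; I am assuming the per-namespace
flag ends up reachable as req->ns->pr.enable, the real field name may differ):

	/* Report reservation capabilities only when PR is enabled on the ns. */
	if (req->ns->pr.enable)
		id->rescap = NVME_PR_SUPPORT_WRITE_EXCLUSIVE |
			     NVME_PR_SUPPORT_EXCLUSIVE_ACCESS |
			     NVME_PR_SUPPORT_WRITE_EXCLUSIVE_REG_ONLY |
			     NVME_PR_SUPPORT_EXCLUSIVE_ACCESS_REG_ONLY |
			     NVME_PR_SUPPORT_WRITE_EXCLUSIVE_ALL_REGS |
			     NVME_PR_SUPPORT_EXCLUSIVE_ACCESS_ALL_REGS |
			     NVME_PR_SUPPORT_IEKEY_VER_1_3_DEF;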
>
> Also, I did not find an ONCS update; is it not necessary?
Oh, I missed this, thanks. I will update ONCS as well.
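
For ONCS I have something like this in mind for nvmet_execute_identify_ctrl()
(a sketch; NVME_CTRL_ONCS_RESERVATIONS is the existing ONCS bit in
include/linux/nvme.h, whether to report it conditionally is still open):

	/* Advertise Reservations support in ONCS. */
	id->oncs = cpu_to_le16(le16_to_cpu(id->oncs) |
			       NVME_CTRL_ONCS_RESERVATIONS);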
>> +void nvmet_execute_get_log_page_resv(struct nvmet_req *req)
>> +{
>> +       struct nvmet_pr_log_mgr *log_mgr = &req->sq->ctrl->pr_log_mgr;
>> +       struct nvme_pr_log next_log = {0};
>> +       struct nvme_pr_log log = {0};
>> +       u16 status = NVME_SC_SUCCESS;
>> +       u64 lost_count;
>> +       u64 cur_count;
>> +       u64 next_count;
>> +
>> +       mutex_lock(&log_mgr->lock);
>> +       if (!kfifo_get(&log_mgr->log_queue, &log))
>> +               goto out;
>> +
>> +       /*
>> +        * We can't get the last in kfifo.
>> +        * Utilize the current count and the count from the next log to
>> +        * calculate the number of lost logs, while also addressing cases
>> +        * of overflow. If there is no subsequent log, the number of lost
>> +        * logs is equal to the lost_count within the nvmet_pr_log_mgr.
>> +        */
>> +       cur_count = le64_to_cpu(log.count);
>> +       if (kfifo_peek(&log_mgr->log_queue, &next_log)) {
>> +               next_count = le64_to_cpu(next_log.count);
>> +               if (next_count > cur_count)
>> +                       lost_count = next_count - cur_count - 1;
>> +               else
>> +                       lost_count = U64_MAX - cur_count + next_count - 1;
>> +       } else {
>> +               lost_count = log_mgr->lost_count;
>> +       }
>> +
>> +       log.count = cpu_to_le64(cur_count + lost_count);
> This value should wrap around U64_MAX too, but in the reverse direction
> (count is next_count - 1): if it becomes 0, set it to -1.
>
The next_count will never be 0: as you can see in nvmet_pr_add_resv_log below,
when log_mgr->counter wraps to 0, I set it back to 1.

>> +static void nvmet_pr_add_resv_log(struct nvmet_ctrl *ctrl, u8 log_type, u32 nsid)
>> +{
>> +       struct nvmet_pr_log_mgr *log_mgr = &ctrl->pr_log_mgr;
>> +       struct nvme_pr_log log = {0};
>> +
>> +       mutex_lock(&log_mgr->lock);
>> +       log_mgr->counter++;
>> +       if (log_mgr->counter == 0)
>> +               log_mgr->counter = 1;
>> +
>> +       log.count = cpu_to_le64(log_mgr->counter);
>> +       log.type = log_type;
>> +       log.nsid = cpu_to_le32(nsid);
>> +
>> +       if (!kfifo_put(&log_mgr->log_queue, log)) {
>> +               pr_warn("a reservation log lost, cntlid:%d, log_type:%d, nsid:%d\n",
>> +                       ctrl->cntlid, log_type, nsid);
> I think this log should be Info level, because it is not an error
> situation. The size of this queue is fixed and it is obvious that it can
> overflow.
Agree.
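
I.e. simply:

	/* Overflow of the fixed-size log queue is expected, not an error. */
	pr_info("a reservation log lost, cntlid:%d, log_type:%d, nsid:%d\n",
		ctrl->cntlid, log_type, nsid);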
>> +static u16 nvmet_pr_create_new_reservation(struct nvmet_pr *pr, u8 new_rtype,
>> +                                          struct nvmet_pr_registrant *reg)
>> +{
>> +       u16 status;
>> +
>> +       status = nvmet_pr_validate_rtype(new_rtype);
> You shall check the validity of the command in the very beginning of
> command handling, not at the very end.

In some situations we do not care about the value of rtype, for example when
preempt is used only to unregister another host, so I only check it where it
is actually needed.

>> +       if (!status) {
>> +               reg->rtype = new_rtype;
>> +               rcu_assign_pointer(pr->holder, reg);
>> +       }
>> +       return status;
>> +}
>> +
> ...
>
>> +static u16 nvmet_pr_unreg_by_prkey(struct nvmet_pr *pr, u64 prkey,
>> +                                  uuid_t *send_hostid)
>> +{
>> +       u16 status = NVME_SC_RESERVATION_CONFLICT | NVME_SC_DNR;
>> +       struct nvmet_pr_registrant *reg, *tmp;
>> +       uuid_t hostid;
>> +
>> +       lockdep_assert_held(&pr->pr_lock);
>> +
>> +       list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) {
>> +               if (reg->rkey == prkey) {
>> +                       status = NVME_SC_SUCCESS;
>> +                       uuid_copy(&hostid, &reg->hostid);
>> +                       nvmet_pr_unregister_one(pr, reg);
>> +                       if (!uuid_equal(&reg->hostid, send_hostid))
> reg is already freed here; just use the hostid copy, as on the line below:
Yeah sure, thanks a lot.
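
For reference, the fixed loop would look roughly like this (the hostid is
copied before nvmet_pr_unregister_one() frees reg, and only the copy is used
afterwards):

	list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) {
		if (reg->rkey == prkey) {
			status = NVME_SC_SUCCESS;
			uuid_copy(&hostid, &reg->hostid);
			nvmet_pr_unregister_one(pr, reg);
			/* reg may already be freed here, use the copy */
			if (!uuid_equal(&hostid, send_hostid))
				nvmet_pr_registration_preempted(pr, &hostid);
		}
	}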
>> +                               nvmet_pr_registration_preempted(pr, &hostid);
>> +               }
>> +       }
>> +       return status;
>> +}
>> +
> ...
>
>> +static u16 nvmet_pr_preempt(struct nvmet_req *req,
>> +                           struct nvmet_pr_registrant *reg,
>> +                           u8 rtype,
>> +                           struct nvmet_pr_acquire_data *d)
>> +{
>> +       struct nvmet_ctrl *ctrl = req->sq->ctrl;
>> +       struct nvmet_pr *pr = &req->ns->pr;
>> +       struct nvmet_pr_registrant *holder;
>> +       enum nvme_pr_type original_rtype;
>> +       u16 status;
>> +
>> +       lockdep_assert_held(&pr->pr_lock);
>> +       holder = rcu_dereference_protected(pr->holder,
>> +                       lockdep_is_held(&pr->pr_lock));
>> +       if (!holder)
>> +               return nvmet_pr_unreg_by_prkey(pr, le64_to_cpu(d->prkey),
>> +                               &ctrl->hostid);
>> +
>> +       original_rtype = holder->rtype;
>> +       if (original_rtype == NVME_PR_WRITE_EXCLUSIVE_ALL_REGS ||
>> +           original_rtype == NVME_PR_EXCLUSIVE_ACCESS_ALL_REGS) {
>> +               if (!le64_to_cpu(d->prkey)) {
>> +                       nvmet_pr_unreg_except_hostid(pr, &ctrl->hostid);
>> +                       return nvmet_pr_create_new_reservation(pr,
>> +                                       rtype, reg);
> The reservation change here is not atomic. Swapping these two lines with
> each other should fix that.
>
You are right, we need to create the new reservation first.
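
So that branch becomes something like (a sketch; the new holder is installed
before the other registrants are dropped, so there is never a window without
a reservation):

	if (!le64_to_cpu(d->prkey)) {
		/* Install the new reservation first, then drop the others. */
		status = nvmet_pr_create_new_reservation(pr, rtype, reg);
		if (!status)
			nvmet_pr_unreg_except_hostid(pr, &ctrl->hostid);
		return status;
	}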
>> +               }
>> +               return nvmet_pr_unreg_by_prkey(pr, le64_to_cpu(d->prkey),
>> +                               &ctrl->hostid);
>> +       }
>> +
>> +       if (holder == reg) {
>> +               status = nvmet_pr_update_reg_attr(pr, holder,
>> +                               nvmet_pr_update_holder_rtype, &rtype);
>> +               if (!status && original_rtype != rtype)
>> +                       nvmet_pr_resv_released(pr, &reg->hostid);
>> +               return status;
>> +       }
>> +
>> +       if (le64_to_cpu(d->prkey) == holder->rkey) {
>> +               status = nvmet_pr_unreg_by_prkey_except_hostid(pr,
>> +                               le64_to_cpu(d->prkey), &ctrl->hostid);
>> +               if (status)
>> +                       return status;
>> +               status = nvmet_pr_create_new_reservation(pr, rtype, reg);
> The same time gap exists here too.
Changed too.
>
>> +               if (!status && original_rtype != rtype)
>> +                       nvmet_pr_resv_released(pr, &reg->hostid);
>> +               return status;
>> +       }
>> +
>> +       if (le64_to_cpu(d->prkey))
>> +               return nvmet_pr_unreg_by_prkey(pr, le64_to_cpu(d->prkey),
>> +                               &ctrl->hostid);
>> +       return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
>> +}
>> +
> ...
>
>> +void nvmet_ctrl_destroy_pr(struct nvmet_ctrl *ctrl)
>> +{
>> +       struct nvmet_pr_registrant *reg;
>> +       struct nvmet_ns *ns;
>> +       unsigned long idx;
>> +
>> +       kfifo_free(&ctrl->pr_log_mgr.log_queue);
>> +       mutex_destroy(&ctrl->pr_log_mgr.lock);
>> +
> From here until the end of the function, this goes against the standard.
>
>> +       if (nvmet_is_host_connected(&ctrl->hostid, ctrl->subsys))
>> +               return;
>> +
>> +       xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
>> +               mutex_lock(&ns->pr.pr_lock);
>> +               list_for_each_entry_rcu(reg, &ns->pr.registrant_list, entry,
>> +                                       lockdep_is_held(&ns->pr.pr_lock)) {
>> +                       if (uuid_equal(&reg->hostid, &ctrl->hostid)) {
>> +                               nvmet_pr_unregister_one(&ns->pr, reg);
>> +                               break;
>> +                       }
>> +               }
>> +               mutex_unlock(&ns->pr.pr_lock);
>> +       }
> You shall not modify the reservation based on host connectivity.

You are right, we should not unregister the host on disconnect; we should
keep the registration.
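
With that, nvmet_ctrl_destroy_pr() would shrink to just tearing down the
per-controller log state, roughly:

	void nvmet_ctrl_destroy_pr(struct nvmet_ctrl *ctrl)
	{
		/* Registrations survive a host disconnect, only free log state. */
		kfifo_free(&ctrl->pr_log_mgr.log_queue);
		mutex_destroy(&ctrl->pr_log_mgr.lock);
	}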

>
>> +}
>> +
>> +void nvmet_pr_exit_ns(struct nvmet_ns *ns)
>> +{
>> +       struct nvmet_pr_registrant *reg, *tmp;
>> +       struct nvmet_pr *pr = &ns->pr;
>> +       LIST_HEAD(free_list);
>> +
>> +       mutex_lock(&pr->pr_lock);
>> +       rcu_assign_pointer(pr->holder, NULL);
>> +       list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) {
>> +               list_del_rcu(&reg->entry);
>> +               list_add(&reg->entry, &free_list);
>> +       }
>> +       mutex_unlock(&pr->pr_lock);
>> +       synchronize_rcu();
> IMHO, there is no need for two lists and synchronize_rcu here. At this
> point there should be no other users of the ns.
Yeah, we can simply free all registrants here.
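
Something like this (a sketch; no other users of the ns exist at this point,
so the registrants can be freed directly):

	void nvmet_pr_exit_ns(struct nvmet_ns *ns)
	{
		struct nvmet_pr_registrant *reg, *tmp;
		struct nvmet_pr *pr = &ns->pr;

		list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) {
			list_del(&reg->entry);
			kfree(reg);
		}
		mutex_destroy(&pr->pr_lock);
	}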
>
>> +       list_for_each_entry_safe(reg, tmp, &free_list, entry) {
>> +               kfree(reg);
>> +       }
>> +
>> +       mutex_destroy(&pr->pr_lock);
>> +}
>
> BR,
>   Dmitry
>   

Best Regards,

Guixin Liu



