[PATCH v10 08/26] gunyah: rsc_mgr: Add resource manager RPC core

Fri Feb 24 14:39:32 PST 2023

On 2/23/2023 3:28 PM, Alex Elder wrote:
> On 2/14/23 3:23 PM, Elliot Berman wrote:
>>
<snip>
>> +}
>> +
>> +static int gh_rm_init_connection_payload(struct gh_rm_connection 
>> *connection, void *msg,
>> +                    size_t hdr_size, size_t msg_size)
> 
> The value of hdr_size is *always* sizeof(*hdr), so you can
> do without passing it as an argument.
> 

hdr_size is different when receiving reply (1 extra word) vs notification.

>> +{
>> +    size_t max_buf_size, payload_size;
>> +    struct gh_rm_rpc_hdr *hdr = msg;
>> +
> 
> It probably sounds dumb, but I'd reverse the values
> compared below (and the operator).
> 
>> +    if (hdr_size > msg_size)
>> +        return -EINVAL;
>> +
>> +    payload_size = msg_size - hdr_size;
>> +
>> +    connection->num_fragments = FIELD_GET(RM_RPC_FRAGMENTS_MASK, 
>> hdr->type);
>> +    connection->fragments_received = 0;
>> +
>> +    /* There's not going to be any payload, no need to allocate 
>> buffer. */
>> +    if (!payload_size && !connection->num_fragments)
> 
> The payload size is the same across all messages in the
> "connection" right?  As is the number of fragments?
> It's not even possible/valid to have a zero payload size
> and non-zero number of fragments.  I think the second
> half of the above test can be dropped.
> 

The RM RPC specification doesn't require that the first message have 
payload. (It makes sense to do it and it does, but that's implementation 
detail)

>> +        return 0;
>> +
>> +    if (connection->num_fragments > GH_RM_MAX_NUM_FRAGMENTS)
>> +        return -EINVAL;
>> +
>> +    max_buf_size = payload_size + (connection->num_fragments * 
>> GH_RM_MAX_MSG_SIZE);
>> +
>> +    connection->payload = kzalloc(max_buf_size, GFP_KERNEL);
>> +    if (!connection->payload)
>> +        return -ENOMEM;
>> +
>> +    memcpy(connection->payload, msg + hdr_size, payload_size);
> 
> I think I suggested (hdr + 1) rather than (msg + size) elsewhere
> and you took that suggestion.  I'd say do it one way or the other,
> consistently, everywhere.
> 

hdr_size != sizeof(*hdr) when we receive a reply message.

>> +    connection->size = payload_size;
>> +    return 0;
>> +}
>> +
>> +static void gh_rm_notif_work(struct work_struct *work)
>> +{
>> +    struct gh_rm_connection *connection = container_of(work, struct 
>> gh_rm_connection,
>> +                                notification.work);
>> +    struct gh_rm *rm = connection->notification.rm;
>> +
>> +    blocking_notifier_call_chain(&rm->nh, connection->msg_id, 
>> connection->payload);
>> +
>> +    put_gh_rm(rm);
>> +    kfree(connection->payload);
>> +    kfree(connection);
>> +}
>> +
>> +static struct gh_rm_connection *gh_rm_process_notif(struct gh_rm *rm, 
>> void *msg, size_t msg_size)
> 
> I think it might be better if you do some of what the caller
> does here.  I.e., verify the current connection is null (and
> abort if not and make it NULL), then assign it to the new
> connection before you return success.  And return an errno.
> 

Since you and Srini both suggest to do it, I'll cave. :-)

>> +{
>> +    struct gh_rm_connection *connection;
>> +    struct gh_rm_rpc_hdr *hdr = msg;
>> +    int ret;
>> +
>> +    connection = gh_rm_alloc_connection(hdr->msg_id, RM_RPC_TYPE_NOTIF);
>> +    if (IS_ERR(connection)) {
>> +        dev_err(rm->dev, "Failed to alloc connection for 
>> notification: %ld, dropping.\n",
>> +            PTR_ERR(connection));
>> +        return NULL;
>> +    }
>> +
>> +    get_gh_rm(rm);
>> +    connection->notification.rm = rm;
>> +    INIT_WORK(&connection->notification.work, gh_rm_notif_work);
>> +
>> +    ret = gh_rm_init_connection_payload(connection, msg, 
>> sizeof(*hdr), msg_size);
>> +    if (ret) {
>> +        dev_err(rm->dev, "Failed to initialize connection buffer for 
>> notification: %d\n",
>> +            ret);
>> +        kfree(connection);
>> +        return NULL;
>> +    }
>> +
>> +    return connection;
>> +}
>> +
>> +static struct gh_rm_connection *gh_rm_process_rply(struct gh_rm *rm, 
>> void *msg, size_t msg_size)
>> +{
> 
> Here too, make sure there is no active connection and then
> set it within this function if the errno returned is 0.
> 
>> +    struct gh_rm_rpc_reply_hdr *reply_hdr = msg;
>> +    struct gh_rm_connection *connection;
>> +    u16 seq_id = le16_to_cpu(reply_hdr->hdr.seq);
>> +
>> +    mutex_lock(&rm->call_idr_lock);
>> +    connection = idr_find(&rm->call_idr, seq_id);
>> +    mutex_unlock(&rm->call_idr_lock);
>> +
>> +    if (!connection || connection->msg_id != reply_hdr->hdr.msg_id)
>> +        return NULL;
>> +
>> +    if (gh_rm_init_connection_payload(connection, msg, 
>> sizeof(*reply_hdr), msg_size)) {
>> +        dev_err(rm->dev, "Failed to alloc connection buffer for 
>> sequence %d\n", seq_id);
>> +        /* Send connection complete and error the client. */
>> +        connection->reply.ret = -ENOMEM;
>> +        complete(&connection->reply.seq_done);
>> +        return NULL;
>> +    }
>> +
>> +    connection->reply.rm_error = le32_to_cpu(reply_hdr->err_code);
>> +    return connection;
>> +}
>> +
>> +static int gh_rm_process_cont(struct gh_rm *rm, struct 
>> gh_rm_connection *connection,
>> +                void *msg, size_t msg_size)
> 
> Similar comment here.  Have this function verify there is
> a non-null active connection.  Then process the message
> and abort if there's an error (and null the active connection
> pointer).
> 
>> +{
>> +    struct gh_rm_rpc_hdr *hdr = msg;
>> +    size_t payload_size = msg_size - sizeof(*hdr);
>> +
>> +    /*
>> +     * hdr->fragments and hdr->msg_id preserves the value from first 
>> reply
>> +     * or notif message. To detect mishandling, check it's still intact.
>> +     */
>> +    if (connection->msg_id != hdr->msg_id ||
>> +        connection->num_fragments != FIELD_GET(RM_RPC_FRAGMENTS_MASK, 
>> hdr->type))
>> +        return -EINVAL;
> 
> Maybe -EBADMSG?
> 
>> +
>> +    memcpy(connection->payload + connection->size, msg + 
>> sizeof(*hdr), payload_size);
>> +    connection->size += payload_size;
>> +    connection->fragments_received++;
>> +    return 0;
>> +}
>> +
>> +static void gh_rm_abort_connection(struct gh_rm_connection *connection)
>> +{
>> +    switch (connection->type) {
>> +    case RM_RPC_TYPE_REPLY:
>> +        connection->reply.ret = -EIO;
>> +        complete(&connection->reply.seq_done);
>> +        break;
>> +    case RM_RPC_TYPE_NOTIF:
>> +        fallthrough;
>> +    default:
>> +        kfree(connection->payload);
>> +        kfree(connection);
>> +    }
>> +}
>> +
>> +static bool gh_rm_complete_connection(struct gh_rm *rm, struct 
>> gh_rm_connection *connection)
> 
> The only caller of this function passes rm->active_rx_connection
> as the second argument.  It is available to you here, so you
> can get rid of that argument.
> 
>> +{
>> +    if (!connection || connection->fragments_received != 
>> connection->num_fragments)
>> +        return false;
>> +
>> +    switch (connection->type) {
>> +    case RM_RPC_TYPE_REPLY:
>> +        complete(&connection->reply.seq_done);
>> +        break;
>> +    case RM_RPC_TYPE_NOTIF:
>> +        schedule_work(&connection->notification.work);
>> +        break;
>> +    default:
>> +        dev_err(rm->dev, "Invalid message type (%d) received\n", 
>> connection->type);
>> +        gh_rm_abort_connection(connection);
>> +        break;
>> +    }
>> +
>> +    return true;
>> +}
>> +
>> +static void gh_rm_msgq_rx_data(struct mbox_client *cl, void *mssg)
>> +{
>> +    struct gh_rm *rm = container_of(cl, struct gh_rm, msgq_client);
>> +    struct gh_msgq_rx_data *rx_data = mssg;
>> +    size_t msg_size = rx_data->length;
>> +    void *msg = rx_data->data;
>> +    struct gh_rm_rpc_hdr *hdr;
>> +
> 
> Is it required that at least one byte (past the header) will
> be received?  I.e., should the "<=" below just be "<"?
> 
>> +    if (msg_size <= sizeof(*hdr) || msg_size > GH_MSGQ_MAX_MSG_SIZE)
>> +        return;
> 
> You previously reported a message here.  These seem like
> errors, which if they occur, maybe should be reported.
> They seem like "never happen" issues, but it's defensive
> to make these checks (which is good).
> 
>> +
>> +    hdr = msg;
>> +    if (hdr->api != RM_RPC_API) {
> 
> If this ever happens, is the hardware failing?  It seems
> like once Gunyah is initialized and you've checked the
> API version once, there should be no need to check it
> repeatedly.

I'd need to check the API version for the first message. On subsequent 
messages, I'd need to check if I already checked. Might as well just 
check the version every time?

<done for all the comments snipped>

>> +
>> +void get_gh_rm(struct gh_rm *rm)
> 
> It is often pretty handy to return the argument in
> functions like this.  It simultaneously takes the
> reference and assigns the pointer the reference
> represents.
> 
> 

I've updated so that gh_rm_get() returns a struct device * (the 
miscdev's device). Is this too unusual?

>> +{
>> +    get_device(rm->dev);
>> +}
>> +EXPORT_SYMBOL_GPL(get_gh_rm);
>> +
>> +void put_gh_rm(struct gh_rm *rm)
>> +{
>> +    put_device(rm->dev);
>> +}
>> +EXPORT_SYMBOL_GPL(put_gh_rm);
>> +
>> +static int gh_msgq_platform_probe_direction(struct platform_device 
>> *pdev,
>> +                    bool tx, int idx, struct gunyah_resource *ghrsc)
>> +{
>> +    struct device_node *node = pdev->dev.of_node;
>> +    int ret;
> 
> I think you should declare idx as a local variable.
> 
>      int idx = tx ? 1 : 0;
> >> +
>> +    ghrsc->type = tx ? GUNYAH_RESOURCE_TYPE_MSGQ_TX : 
>> GUNYAH_RESOURCE_TYPE_MSGQ_RX;
>> +
>> +    ghrsc->irq = platform_get_irq(pdev, idx);
> 
> Do you suppose you could do platform_get_irq_byname(), and then
> specify the names of the IRQs ("rm_tx_irq" and "rm_rx_irq" maybe)?
> 
>> +    if (ghrsc->irq < 0) {
>> +        dev_err(&pdev->dev, "Failed to get irq%d: %d\n", idx, 
>> ghrsc->irq);
> 
> Maybe:    "Failed to get %cX IRQ: %d\n", tx ? 'T' : 'R', ghrsc->irq);
> 
>> +        return ghrsc->irq;
>> +    }
>> +
>> +    ret = of_property_read_u64_index(node, "reg", idx, &ghrsc->capid);
> 
> Is a capability ID a simple (but large) number?
> 
> The *resource manager* (which is a very special VM) has to
> have both TX and RX message queue capability IDs.  Is there
> 'any chance that these specific capability IDs have values
> that are fixed by the design?  Like, 0 and 1?  I don't know
> what they are, but it seems like it *could* be something
> fixed by the design, and if that were the case, there would
> be no need to specify the "reg" property to get the "capid"
> values.
> 

They aren't fixed by the design in a production version of Gunyah.

>> +    if (ret) {
>> +        dev_err(&pdev->dev, "Failed to get capid%d: %d\n", idx, ret);
>> +        return ret;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static int gh_rm_drv_probe(struct platform_device *pdev)
>> +{
>> +    struct gh_msgq_tx_data *msg;
>> +    struct gh_rm *rm;
>> +    int ret;
>> +
>> +    rm = devm_kzalloc(&pdev->dev, sizeof(*rm), GFP_KERNEL);
>> +    if (!rm)
>> +        return -ENOMEM;
>> +
>> +    platform_set_drvdata(pdev, rm);
>> +    rm->dev = &pdev->dev;
>> +
>> +    mutex_init(&rm->call_idr_lock);
>> +    idr_init(&rm->call_idr);
>> +    rm->cache = kmem_cache_create("gh_rm", struct_size(msg, data, 
>> GH_MSGQ_MAX_MSG_SIZE), 0,
>> +        SLAB_HWCACHE_ALIGN, NULL);
>> +    if (!rm->cache)
>> +        return -ENOMEM;
> 
> If you abstracted the allocation interface for these messages,
> you could actually survive without the slab cache here.  But
> if this fails, maybe you won't get far anyway.
> 
>> +    mutex_init(&rm->send_lock);
>> +    BLOCKING_INIT_NOTIFIER_HEAD(&rm->nh);
>> +
>> +    ret = gh_msgq_platform_probe_direction(pdev, true, 0, 
>> &rm->tx_ghrsc);
>> +    if (ret)
>> +        goto err_cache;
>> +
>> +    ret = gh_msgq_platform_probe_direction(pdev, false, 1, 
>> &rm->rx_ghrsc);
>> +    if (ret)
>> +        goto err_cache;
>> +
>> +    rm->msgq_client.dev = &pdev->dev;
>> +    rm->msgq_client.tx_block = true;
>> +    rm->msgq_client.rx_callback = gh_rm_msgq_rx_data;
>> +    rm->msgq_client.tx_done = gh_rm_msgq_tx_done;
>> +
>> +    return gh_msgq_init(&pdev->dev, &rm->msgq, &rm->msgq_client, 
>> &rm->tx_ghrsc, &rm->rx_ghrsc);
>> +err_cache:
>> +    kmem_cache_destroy(rm->cache);
>> +    return ret;
>> +}
>> +
>> +static int gh_rm_drv_remove(struct platform_device *pdev)
>> +{
>> +    struct gh_rm *rm = platform_get_drvdata(pdev);
>> +
>> +    mbox_free_channel(gh_msgq_chan(&rm->msgq));
>> +    gh_msgq_remove(&rm->msgq);
>> +    kmem_cache_destroy(rm->cache);
>> +
>> +    return 0;
>> +}
>> +
>> +static const struct of_device_id gh_rm_of_match[] = {
>> +    { .compatible = "gunyah-resource-manager" },
>> +    {}
>> +};
>> +MODULE_DEVICE_TABLE(of, gh_rm_of_match);
>> +
>> +static struct platform_driver gh_rm_driver = {
>> +    .probe = gh_rm_drv_probe,
>> +    .remove = gh_rm_drv_remove,
>> +    .driver = {
>> +        .name = "gh_rsc_mgr",
>> +        .of_match_table = gh_rm_of_match,
>> +    },
>> +};
>> +module_platform_driver(gh_rm_driver);
>> +
>> +MODULE_LICENSE("GPL");
>> +MODULE_DESCRIPTION("Gunyah Resource Manager Driver");
>> diff --git a/drivers/virt/gunyah/rsc_mgr.h 
>> b/drivers/virt/gunyah/rsc_mgr.h
>> new file mode 100644
>> index 000000000000..d4e799a7526f
>> --- /dev/null
>> +++ b/drivers/virt/gunyah/rsc_mgr.h
>> @@ -0,0 +1,77 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +/*
>> + * Copyright (c) 2022-2023 Qualcomm Innovation Center, Inc. All 
>> rights reserved.
>> + */
>> +#ifndef __GH_RSC_MGR_PRIV_H
>> +#define __GH_RSC_MGR_PRIV_H
>> +
>> +#include <linux/gunyah.h>
>> +#include <linux/gunyah_rsc_mgr.h>
>> +#include <linux/types.h>
>> +
>> +/* RM Error codes */
>> +enum gh_rm_error {
>> +    GH_RM_ERROR_OK            = 0x0,
>> +    GH_RM_ERROR_UNIMPLEMENTED    = 0xFFFFFFFF,
>> +    GH_RM_ERROR_NOMEM        = 0x1,
>> +    GH_RM_ERROR_NORESOURCE        = 0x2,
>> +    GH_RM_ERROR_DENIED        = 0x3,
>> +    GH_RM_ERROR_INVALID        = 0x4,
>> +    GH_RM_ERROR_BUSY        = 0x5,
>> +    GH_RM_ERROR_ARGUMENT_INVALID    = 0x6,
>> +    GH_RM_ERROR_HANDLE_INVALID    = 0x7,
>> +    GH_RM_ERROR_VALIDATE_FAILED    = 0x8,
>> +    GH_RM_ERROR_MAP_FAILED        = 0x9,
>> +    GH_RM_ERROR_MEM_INVALID        = 0xA,
>> +    GH_RM_ERROR_MEM_INUSE        = 0xB,
>> +    GH_RM_ERROR_MEM_RELEASED    = 0xC,
>> +    GH_RM_ERROR_VMID_INVALID    = 0xD,
>> +    GH_RM_ERROR_LOOKUP_FAILED    = 0xE,
>> +    GH_RM_ERROR_IRQ_INVALID        = 0xF,
>> +    GH_RM_ERROR_IRQ_INUSE        = 0x10,
>> +    GH_RM_ERROR_IRQ_RELEASED    = 0x11,
>> +};
>> +
>> +/**
>> + * gh_rm_remap_error() - Remap Gunyah resource manager errors into a 
>> Linux error code
>> + * @gh_error: "Standard" return value from Gunyah resource manager
>> + */
>> +static inline int gh_rm_remap_error(enum gh_rm_error rm_error)
>> +{
>> +    switch (rm_error) {
>> +    case GH_RM_ERROR_OK:
>> +        return 0;
>> +    case GH_RM_ERROR_UNIMPLEMENTED:
>> +        return -EOPNOTSUPP;
>> +    case GH_RM_ERROR_NOMEM:
>> +        return -ENOMEM;
>> +    case GH_RM_ERROR_NORESOURCE:
>> +        return -ENODEV;
>> +    case GH_RM_ERROR_DENIED:
>> +        return -EPERM;
>> +    case GH_RM_ERROR_BUSY:
>> +        return -EBUSY;
>> +    case GH_RM_ERROR_INVALID:
>> +    case GH_RM_ERROR_ARGUMENT_INVALID:
>> +    case GH_RM_ERROR_HANDLE_INVALID:
>> +    case GH_RM_ERROR_VALIDATE_FAILED:
>> +    case GH_RM_ERROR_MAP_FAILED:
>> +    case GH_RM_ERROR_MEM_INVALID:
>> +    case GH_RM_ERROR_MEM_INUSE:
>> +    case GH_RM_ERROR_MEM_RELEASED:
>> +    case GH_RM_ERROR_VMID_INVALID:
>> +    case GH_RM_ERROR_LOOKUP_FAILED:
>> +    case GH_RM_ERROR_IRQ_INVALID:
>> +    case GH_RM_ERROR_IRQ_INUSE:
>> +    case GH_RM_ERROR_IRQ_RELEASED:
>> +        return -EINVAL;
>> +    default:
>> +        return -EBADMSG;
>> +    }
>> +}
>> +
>> +struct gh_rm;
> 
> This might just be my preference, but I like to see declarations
> like the one above grouped at the top of the file, under includes.
> 
>> +int gh_rm_call(struct gh_rm *rsc_mgr, u32 message_id, void *req_buff, 
>> size_t req_buff_size,
>> +        void **resp_buf, size_t *resp_buff_size);
>> +
>> +#endif
>> diff --git a/include/linux/gunyah_rsc_mgr.h 
>> b/include/linux/gunyah_rsc_mgr.h
>> new file mode 100644
>> index 000000000000..c992b3188c8d
>> --- /dev/null
>> +++ b/include/linux/gunyah_rsc_mgr.h
>> @@ -0,0 +1,24 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +/*
>> + * Copyright (c) 2022-2023 Qualcomm Innovation Center, Inc. All 
>> rights reserved.
>> + */
>> +
>> +#ifndef _GUNYAH_RSC_MGR_H
>> +#define _GUNYAH_RSC_MGR_H
>> +
>> +#include <linux/list.h>
>> +#include <linux/notifier.h>
>> +#include <linux/gunyah.h>
>> +
>> +#define GH_VMID_INVAL    U16_MAX
>> +
>> +/* Gunyah recognizes VMID0 as an alias to the current VM's ID */
>> +#define GH_VMID_SELF            0
> 
> I haven't really checked very well, bur you should *use this*
> definition where a VMID is being examined. I.e., if you're
> going to define this, then never just compare a VMID against 0.
> 

I realize now the only place I *could* use GH_VMID_SELF is the one 
exception to usage of VMID -- in gh_rm_vmid_alloc. There, vmid of 0 
means "use dynamic allocation". Since there aren't any users of the 
GH_VMID_SELF, I'll drop it.

Thanks,
Elliot

>                      -Alex
> 
>> +
>> +struct gh_rm;
>> +int gh_rm_notifier_register(struct gh_rm *rm, struct notifier_block 
>> *nb);
>> +int gh_rm_notifier_unregister(struct gh_rm *rm, struct notifier_block 
>> *nb);
>> +void get_gh_rm(struct gh_rm *rm);
>> +void put_gh_rm(struct gh_rm *rm);
>> +
>> +#endif
>