[PATCH v2 1/4] MicroSemi Switchtec management interface driver

Bjorn Helgaas helgaas at kernel.org
Thu Feb 23 16:35:59 PST 2017


[+cc Peter, Ingo, Arnaldo, Alexander, Christoph]

On Thu, Feb 02, 2017 at 11:06:00AM -0700, Logan Gunthorpe wrote:
> Microsemi's "Switchtec" line of PCI switch devices is already well
> supported by the kernel with standard PCI switch drivers. However, the
> Switchtec device advertises a special management endpoint with a separate
> PCI function address and class code. This endpoint enables some additional
> functionality which includes:
> 
>  * Packet and Byte Counters
>  * Switch Firmware Upgrades
>  * Event and Error logs
>  * Querying port link status
>  * Custom user firmware commands

Would it make any sense to integrate this with the perf tool?  It
looks like switchtec-user measures bandwidth and latency, which sounds
sort of perf-ish.

> This patch introduces the switchtec kernel module which provides
> PCI driver that exposes a char device. The char device provides
> userspace access to this interface through read, write and (optionally)
> poll calls.
> 
> A userspace tool and library which utilizes this interface is available
> at [1]. This tool takes inspiration (and borrows some code) from
> nvme-cli [2]. The tool is largely complete at this time but additional
> features may be added in the future.
> 
> [1] https://github.com/sbates130272/switchtec-user
> [2] https://github.com/linux-nvme/nvme-cli
> 
> Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
> Signed-off-by: Stephen Bates <stephen.bates at microsemi.com>
> ---
>  MAINTAINERS                    |    8 +
>  drivers/pci/Kconfig            |    1 +
>  drivers/pci/Makefile           |    1 +
>  drivers/pci/switch/Kconfig     |   13 +
>  drivers/pci/switch/Makefile    |    1 +
>  drivers/pci/switch/switchtec.c | 1028 ++++++++++++++++++++++++++++++++++++++++
>  6 files changed, 1052 insertions(+)
>  create mode 100644 drivers/pci/switch/Kconfig
>  create mode 100644 drivers/pci/switch/Makefile
>  create mode 100644 drivers/pci/switch/switchtec.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 5f10c28..9c35198 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -9507,6 +9507,14 @@ S:	Maintained
>  F:	Documentation/devicetree/bindings/pci/aardvark-pci.txt
>  F:	drivers/pci/host/pci-aardvark.c
>  
> +PCI DRIVER FOR MICROSEMI SWITCHTEC
> +M:	Kurt Schwemmer <kurt.schwemmer at microsemi.com>
> +M:	Stephen Bates <stephen.bates at microsemi.com>
> +M:	Logan Gunthorpe <logang at deltatee.com>
> +L:	linux-pci at vger.kernel.org
> +S:	Maintained
> +F:	drivers/pci/switch/switchtec*
> +
>  PCI DRIVER FOR NVIDIA TEGRA
>  M:	Thierry Reding <thierry.reding at gmail.com>
>  L:	linux-tegra at vger.kernel.org
> diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
> index 6555eb7..f72e8c5 100644
> --- a/drivers/pci/Kconfig
> +++ b/drivers/pci/Kconfig
> @@ -133,3 +133,4 @@ config PCI_HYPERV
>  
>  source "drivers/pci/hotplug/Kconfig"
>  source "drivers/pci/host/Kconfig"
> +source "drivers/pci/switch/Kconfig"
> diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
> index 8db5079..15b46dd 100644
> --- a/drivers/pci/Makefile
> +++ b/drivers/pci/Makefile
> @@ -68,3 +68,4 @@ ccflags-$(CONFIG_PCI_DEBUG) := -DDEBUG
>  
>  # PCI host controller drivers
>  obj-y += host/
> +obj-y += switch/
> diff --git a/drivers/pci/switch/Kconfig b/drivers/pci/switch/Kconfig
> new file mode 100644
> index 0000000..4c49648
> --- /dev/null
> +++ b/drivers/pci/switch/Kconfig
> @@ -0,0 +1,13 @@
> +menu "PCI switch controller drivers"
> +	depends on PCI
> +
> +config PCI_SW_SWITCHTEC
> +	tristate "MicroSemi Switchtec PCIe Switch Management Driver"
> +	help
> +	 Enables support for the management interface for the MicroSemi
> +	 Switchtec series of PCIe switches. Supports userspace access
> +	 to submit MRPC commands to the switch via /dev/switchtecX
> +	 devices. See <file:Documentation/switchtec.txt> for more
> +	 information.
> +
> +endmenu
> diff --git a/drivers/pci/switch/Makefile b/drivers/pci/switch/Makefile
> new file mode 100644
> index 0000000..37d8cfb
> --- /dev/null
> +++ b/drivers/pci/switch/Makefile
> @@ -0,0 +1 @@
> +obj-$(CONFIG_PCI_SW_SWITCHTEC) += switchtec.o
> diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
> new file mode 100644
> index 0000000..e4a4294
> --- /dev/null
> +++ b/drivers/pci/switch/switchtec.c
> @@ -0,0 +1,1028 @@
> +/*
> + * Microsemi Switchtec(tm) PCIe Management Driver
> + * Copyright (c) 2017, Microsemi Corporation
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + */
> +
> +#include <linux/interrupt.h>
> +#include <linux/module.h>
> +#include <linux/fs.h>
> +#include <linux/uaccess.h>
> +#include <linux/poll.h>
> +#include <linux/pci.h>
> +#include <linux/cdev.h>
> +#include <linux/wait.h>
> +
> +MODULE_DESCRIPTION("Microsemi Switchtec(tm) PCI-E Management Driver");

Nit: s/PCI-E/PCIe/

> +MODULE_VERSION("0.1");
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Microsemi Corporation");
> +
> +static int max_devices = 16;

This static initialization is slightly misleading because
switchtec_init() immediately sets max_devices to at least 256.

> +module_param(max_devices, int, 0644);
> +MODULE_PARM_DESC(max_devices, "max number of switchtec device instances");
> +
> +static dev_t switchtec_devt;
> +static struct class *switchtec_class;
> +static DEFINE_IDA(switchtec_minor_ida);
> +
> +#define MICROSEMI_VENDOR_ID         0x11f8
> +#define MICROSEMI_NTB_CLASSCODE     0x068000
> +#define MICROSEMI_MGMT_CLASSCODE    0x058000
> +
> +#define SWITCHTEC_MRPC_PAYLOAD_SIZE 1024
> +#define SWITCHTEC_MAX_PFF_CSR 48
> +
> +#define SWITCHTEC_EVENT_OCCURRED BIT(0)
> +#define SWITCHTEC_EVENT_CLEAR    BIT(0)
> +#define SWITCHTEC_EVENT_EN_LOG   BIT(1)
> +#define SWITCHTEC_EVENT_EN_CLI   BIT(2)
> +#define SWITCHTEC_EVENT_EN_IRQ   BIT(3)
> +#define SWITCHTEC_EVENT_FATAL    BIT(4)
> +
> +enum {
> +	SWITCHTEC_GAS_MRPC_OFFSET       = 0x0000,
> +	SWITCHTEC_GAS_TOP_CFG_OFFSET    = 0x1000,
> +	SWITCHTEC_GAS_SW_EVENT_OFFSET   = 0x1800,
> +	SWITCHTEC_GAS_SYS_INFO_OFFSET   = 0x2000,
> +	SWITCHTEC_GAS_FLASH_INFO_OFFSET = 0x2200,
> +	SWITCHTEC_GAS_PART_CFG_OFFSET   = 0x4000,
> +	SWITCHTEC_GAS_NTB_OFFSET        = 0x10000,
> +	SWITCHTEC_GAS_PFF_CSR_OFFSET    = 0x134000,
> +};
> +
> +struct mrpc_regs {
> +	u8 input_data[SWITCHTEC_MRPC_PAYLOAD_SIZE];
> +	u8 output_data[SWITCHTEC_MRPC_PAYLOAD_SIZE];
> +	u32 cmd;
> +	u32 status;
> +	u32 ret_value;
> +} __packed;
> +
> +enum mrpc_status {
> +	SWITCHTEC_MRPC_STATUS_INPROGRESS = 1,
> +	SWITCHTEC_MRPC_STATUS_DONE = 2,
> +	SWITCHTEC_MRPC_STATUS_ERROR = 0xFF,
> +	SWITCHTEC_MRPC_STATUS_INTERRUPTED = 0x100,
> +};
> +
> +struct sw_event_regs {
> +	u64 event_report_ctrl;
> +	u64 reserved1;
> +	u64 part_event_bitmap;
> +	u64 reserved2;
> +	u32 global_summary;
> +	u32 reserved3[3];
> +	u32 stack_error_event_hdr;
> +	u32 stack_error_event_data;
> +	u32 reserved4[4];
> +	u32 ppu_error_event_hdr;
> +	u32 ppu_error_event_data;
> +	u32 reserved5[4];
> +	u32 isp_error_event_hdr;
> +	u32 isp_error_event_data;
> +	u32 reserved6[4];
> +	u32 sys_reset_event_hdr;
> +	u32 reserved7[5];
> +	u32 fw_exception_hdr;
> +	u32 reserved8[5];
> +	u32 fw_nmi_hdr;
> +	u32 reserved9[5];
> +	u32 fw_non_fatal_hdr;
> +	u32 reserved10[5];
> +	u32 fw_fatal_hdr;
> +	u32 reserved11[5];
> +	u32 twi_mrpc_comp_hdr;
> +	u32 twi_mrpc_comp_data;
> +	u32 reserved12[4];
> +	u32 twi_mrpc_comp_async_hdr;
> +	u32 twi_mrpc_comp_async_data;
> +	u32 reserved13[4];
> +	u32 cli_mrpc_comp_hdr;
> +	u32 cli_mrpc_comp_data;
> +	u32 reserved14[4];
> +	u32 cli_mrpc_comp_async_hdr;
> +	u32 cli_mrpc_comp_async_data;
> +	u32 reserved15[4];
> +	u32 gpio_interrupt_hdr;
> +	u32 gpio_interrupt_data;
> +	u32 reserved16[4];
> +} __packed;
> +
> +struct sys_info_regs {
> +	u32 device_id;
> +	u32 device_version;
> +	u32 firmware_version;
> +	u32 reserved1;
> +	u32 vendor_table_revision;
> +	u32 table_format_version;
> +	u32 partition_id;
> +	u32 cfg_file_fmt_version;
> +	u32 reserved2[58];
> +	char vendor_id[8];
> +	char product_id[16];
> +	char product_revision[4];
> +	char component_vendor[8];
> +	u16 component_id;
> +	u8 component_revision;
> +} __packed;
> +
> +struct flash_info_regs {
> +	u32 flash_part_map_upd_idx;
> +
> +	struct active_partition_info {
> +		u32 address;
> +		u32 build_version;
> +		u32 build_string;
> +	} active_img;
> +
> +	struct active_partition_info active_cfg;
> +	struct active_partition_info inactive_img;
> +	struct active_partition_info inactive_cfg;
> +
> +	u32 flash_length;
> +
> +	struct partition_info {
> +		u32 address;
> +		u32 length;
> +	} cfg0;
> +
> +	struct partition_info cfg1;
> +	struct partition_info img0;
> +	struct partition_info img1;
> +	struct partition_info nvlog;
> +	struct partition_info vendor[8];
> +};
> +
> +struct ntb_info_regs {
> +	u8  partition_count;
> +	u8  partition_id;
> +	u16 reserved1;
> +	u64 ep_map;
> +	u16 requester_id;
> +} __packed;
> +
> +struct part_cfg_regs {
> +	u32 status;
> +	u32 state;
> +	u32 port_cnt;
> +	u32 usp_port_mode;
> +	u32 usp_pff_inst_id;
> +	u32 vep_pff_inst_id;
> +	u32 dsp_pff_inst_id[47];
> +	u32 reserved1[11];
> +	u16 vep_vector_number;
> +	u16 usp_vector_number;
> +	u32 port_event_bitmap;
> +	u32 reserved2[3];
> +	u32 part_event_summary;
> +	u32 reserved3[3];
> +	u32 part_reset_hdr;
> +	u32 part_reset_data[5];
> +	u32 mrpc_comp_hdr;
> +	u32 mrpc_comp_data[5];
> +	u32 mrpc_comp_async_hdr;
> +	u32 mrpc_comp_async_data[5];
> +	u32 dyn_binding_hdr;
> +	u32 dyn_binding_data[5];
> +	u32 reserved4[159];
> +} __packed;
> +
> +enum {
> +	SWITCHTEC_PART_CFG_EVENT_RESET = 1 << 0,
> +	SWITCHTEC_PART_CFG_EVENT_MRPC_CMP = 1 << 1,
> +	SWITCHTEC_PART_CFG_EVENT_MRPC_ASYNC_CMP = 1 << 2,
> +	SWITCHTEC_PART_CFG_EVENT_DYN_PART_CMP = 1 << 3,
> +};
> +
> +struct pff_csr_regs {
> +	u16 vendor_id;
> +	u16 device_id;
> +	u32 pci_cfg_header[15];
> +	u32 pci_cap_region[48];
> +	u32 pcie_cap_region[448];
> +	u32 indirect_gas_window[128];
> +	u32 indirect_gas_window_off;
> +	u32 reserved[127];
> +	u32 pff_event_summary;
> +	u32 reserved2[3];
> +	u32 aer_in_p2p_hdr;
> +	u32 aer_in_p2p_data[5];
> +	u32 aer_in_vep_hdr;
> +	u32 aer_in_vep_data[5];
> +	u32 dpc_hdr;
> +	u32 dpc_data[5];
> +	u32 cts_hdr;
> +	u32 cts_data[5];
> +	u32 reserved3[6];
> +	u32 hotplug_hdr;
> +	u32 hotplug_data[5];
> +	u32 ier_hdr;
> +	u32 ier_data[5];
> +	u32 threshold_hdr;
> +	u32 threshold_data[5];
> +	u32 power_mgmt_hdr;
> +	u32 power_mgmt_data[5];
> +	u32 tlp_throttling_hdr;
> +	u32 tlp_throttling_data[5];
> +	u32 force_speed_hdr;
> +	u32 force_speed_data[5];
> +	u32 credit_timeout_hdr;
> +	u32 credit_timeout_data[5];
> +	u32 link_state_hdr;
> +	u32 link_state_data[5];
> +	u32 reserved4[174];
> +} __packed;
> +
> +struct switchtec_dev {
> +	struct pci_dev *pdev;
> +	struct msix_entry msix[4];
> +	struct device dev;
> +	struct cdev cdev;
> +	unsigned int event_irq;
> +
> +	int partition;
> +	int partition_count;
> +	int pff_csr_count;
> +	char pff_local[SWITCHTEC_MAX_PFF_CSR];
> +
> +	void __iomem *mmio;
> +	struct mrpc_regs __iomem *mmio_mrpc;
> +	struct sw_event_regs __iomem *mmio_sw_event;
> +	struct sys_info_regs __iomem *mmio_sys_info;
> +	struct flash_info_regs __iomem *mmio_flash_info;
> +	struct ntb_info_regs __iomem *mmio_ntb;
> +	struct part_cfg_regs __iomem *mmio_part_cfg;
> +	struct part_cfg_regs __iomem *mmio_part_cfg_all;
> +	struct pff_csr_regs __iomem *mmio_pff_csr;
> +
> +	/*
> +	 * The mrpc mutex must be held when accessing the other
> +	 * mrpc fields
> +	 */
> +	struct mutex mrpc_mutex;
> +	struct list_head mrpc_queue;
> +	int mrpc_busy;
> +	struct work_struct mrpc_work;
> +	struct delayed_work mrpc_timeout;
> +	wait_queue_head_t event_wq;
> +	atomic_t event_cnt;

Why is this atomic_t?  You only do an atomic_set() (in stdev_create())
and atomic_reads() -- no writes other than the initial one.  So I'm
not sure what value atomic_t brings.

> +};
> +
> +static struct switchtec_dev *to_stdev(struct device *dev)
> +{
> +	return container_of(dev, struct switchtec_dev, dev);
> +}
> +
> +struct switchtec_user {
> +	struct switchtec_dev *stdev;
> +
> +	enum mrpc_state {
> +		MRPC_IDLE = 0,
> +		MRPC_QUEUED,
> +		MRPC_RUNNING,
> +		MRPC_DONE,
> +	} state;
> +
> +	struct completion comp;
> +	struct kref kref;
> +	struct list_head list;
> +
> +	u32 cmd;
> +	u32 status;
> +	u32 return_code;
> +	size_t data_len;
> +	unsigned char data[SWITCHTEC_MRPC_PAYLOAD_SIZE];
> +	int event_cnt;
> +};
> +
> +static struct switchtec_user *stuser_create(struct switchtec_dev *stdev)
> +{
> +	struct switchtec_user *stuser;
> +
> +	stuser = kzalloc(sizeof(*stuser), GFP_KERNEL);
> +	if (!stuser)
> +		return ERR_PTR(-ENOMEM);
> +
> +	get_device(&stdev->dev);
> +	stuser->stdev = stdev;
> +	kref_init(&stuser->kref);
> +	INIT_LIST_HEAD(&stuser->list);
> +	init_completion(&stuser->comp);
> +	stuser->event_cnt = atomic_read(&stdev->event_cnt);
> +
> +	dev_dbg(&stdev->dev, "%s: %p\n", __func__, stuser);
> +
> +	return stuser;
> +}
> +
> +static void stuser_free(struct kref *kref)
> +{
> +	struct switchtec_user *stuser;
> +
> +	stuser = container_of(kref, struct switchtec_user, kref);
> +
> +	dev_dbg(&stuser->stdev->dev, "%s: %p\n", __func__, stuser);
> +
> +	put_device(&stuser->stdev->dev);
> +	kfree(stuser);
> +}
> +
> +static void stuser_put(struct switchtec_user *stuser)
> +{
> +	kref_put(&stuser->kref, stuser_free);
> +}
> +
> +static void stuser_set_state(struct switchtec_user *stuser,
> +			     enum mrpc_state state)
> +{
> +	const char * const state_names[] = {
> +		[MRPC_IDLE] = "IDLE",
> +		[MRPC_QUEUED] = "QUEUED",
> +		[MRPC_RUNNING] = "RUNNING",
> +		[MRPC_DONE] = "DONE",
> +	};
> +
> +	stuser->state = state;
> +
> +	dev_dbg(&stuser->stdev->dev, "stuser state %p -> %s",
> +		stuser, state_names[state]);
> +}
> +
> +static int stdev_is_alive(struct switchtec_dev *stdev)
> +{
> +	return stdev->mmio != NULL;
> +}
> +
> +static void mrpc_complete_cmd(struct switchtec_dev *stdev);
> +
> +static void mrpc_cmd_submit(struct switchtec_dev *stdev)
> +{
> +	/* requires the mrpc_mutex to already be held when called */
> +
> +	struct switchtec_user *stuser;
> +
> +	if (stdev->mrpc_busy)
> +		return;
> +
> +	if (list_empty(&stdev->mrpc_queue))
> +		return;
> +
> +	stuser = list_entry(stdev->mrpc_queue.next, struct switchtec_user,
> +			    list);
> +
> +	stuser_set_state(stuser, MRPC_RUNNING);
> +	stdev->mrpc_busy = 1;
> +	memcpy_toio(&stdev->mmio_mrpc->input_data,
> +		    stuser->data, stuser->data_len);
> +	iowrite32(stuser->cmd, &stdev->mmio_mrpc->cmd);
> +
> +	stuser->status = ioread32(&stdev->mmio_mrpc->status);
> +	if (stuser->status != SWITCHTEC_MRPC_STATUS_INPROGRESS)
> +		mrpc_complete_cmd(stdev);
> +
> +	schedule_delayed_work(&stdev->mrpc_timeout,
> +			      msecs_to_jiffies(500));
> +}
> +
> +static void mrpc_queue_cmd(struct switchtec_user *stuser)
> +{
> +	/* requires the mrpc_mutex to already be held when called */
> +
> +	struct switchtec_dev *stdev = stuser->stdev;
> +
> +	kref_get(&stuser->kref);
> +	stuser_set_state(stuser, MRPC_QUEUED);
> +	init_completion(&stuser->comp);
> +	list_add_tail(&stuser->list, &stdev->mrpc_queue);
> +
> +	mrpc_cmd_submit(stdev);
> +}
> +
> +static void mrpc_complete_cmd(struct switchtec_dev *stdev)
> +{
> +	/* requires the mrpc_mutex to already be held when called */
> +	struct switchtec_user *stuser;
> +
> +	if (list_empty(&stdev->mrpc_queue))
> +		return;
> +
> +	stuser = list_entry(stdev->mrpc_queue.next, struct switchtec_user,
> +			    list);
> +
> +	stuser->status = ioread32(&stdev->mmio_mrpc->status);
> +	if (stuser->status == SWITCHTEC_MRPC_STATUS_INPROGRESS)
> +		return;
> +
> +	stuser_set_state(stuser, MRPC_DONE);
> +	stuser->return_code = 0;
> +
> +	if (stuser->status != SWITCHTEC_MRPC_STATUS_DONE)
> +		goto out;
> +
> +	stuser->return_code = ioread32(&stdev->mmio_mrpc->ret_value);
> +	if (stuser->return_code != 0)
> +		goto out;
> +
> +	memcpy_fromio(stuser->data, &stdev->mmio_mrpc->output_data,
> +		      sizeof(stuser->data));

Apparently you always get 1K of data back, no matter what?

> +
> +out:
> +	complete_all(&stuser->comp);
> +	list_del_init(&stuser->list);
> +	stuser_put(stuser);
> +	stdev->mrpc_busy = 0;
> +
> +	mrpc_cmd_submit(stdev);
> +}
> +
> +static void mrpc_event_work(struct work_struct *work)
> +{
> +	struct switchtec_dev *stdev;
> +
> +	stdev = container_of(work, struct switchtec_dev, mrpc_work);
> +
> +	dev_dbg(&stdev->dev, "%s\n", __func__);
> +
> +	mutex_lock(&stdev->mrpc_mutex);
> +	cancel_delayed_work(&stdev->mrpc_timeout);
> +	mrpc_complete_cmd(stdev);
> +	mutex_unlock(&stdev->mrpc_mutex);
> +}
> +
> +static void mrpc_timeout_work(struct work_struct *work)
> +{
> +	struct switchtec_dev *stdev;
> +	u32 status;
> +
> +	stdev = container_of(work, struct switchtec_dev, mrpc_timeout.work);
> +
> +	dev_dbg(&stdev->dev, "%s\n", __func__);
> +
> +	mutex_lock(&stdev->mrpc_mutex);
> +
> +	if (stdev_is_alive(stdev)) {
> +		status = ioread32(&stdev->mmio_mrpc->status);
> +		if (status == SWITCHTEC_MRPC_STATUS_INPROGRESS) {
> +			schedule_delayed_work(&stdev->mrpc_timeout,
> +					      msecs_to_jiffies(500));
> +			goto out;
> +		}
> +	}
> +
> +	mrpc_complete_cmd(stdev);
> +
> +out:
> +	mutex_unlock(&stdev->mrpc_mutex);
> +}
> +
> +static int switchtec_dev_open(struct inode *inode, struct file *filp)
> +{
> +	struct switchtec_dev *stdev;
> +	struct switchtec_user *stuser;
> +
> +	stdev = container_of(inode->i_cdev, struct switchtec_dev, cdev);
> +
> +	stuser = stuser_create(stdev);
> +	if (!stuser)
> +		return PTR_ERR(stuser);
> +
> +	filp->private_data = stuser;
> +	nonseekable_open(inode, filp);
> +
> +	dev_dbg(&stdev->dev, "%s: %p\n", __func__, stuser);
> +
> +	return 0;
> +}
> +
> +static int switchtec_dev_release(struct inode *inode, struct file *filp)
> +{
> +	struct switchtec_user *stuser = filp->private_data;
> +
> +	stuser_put(stuser);
> +
> +	return 0;
> +}
> +
> +static ssize_t switchtec_dev_write(struct file *filp, const char __user *data,
> +				   size_t size, loff_t *off)
> +{
> +	struct switchtec_user *stuser = filp->private_data;
> +	struct switchtec_dev *stdev = stuser->stdev;
> +	int rc;
> +
> +	if (!stdev_is_alive(stdev))
> +		return -ENXIO;

What exactly does this protect against?  Is it OK if stdev_is_alive()
becomes false immediately after we check?

> +
> +	if (size < sizeof(stuser->cmd) ||
> +	    size > sizeof(stuser->cmd) + SWITCHTEC_MRPC_PAYLOAD_SIZE)

I think I would use "sizeof(stuser->data)" instead of
SWITCHTEC_MRPC_PAYLOAD_SIZE so it matches what mrpc_complete_cmd()
does.  Same below in switchtec_dev_read().

mrpc_mutex doesn't protect the stuser things, does it?  If not, you
could do this without the gotos.  And I think it's a little easier to
be confident there are no buffer overruns:

  if (stuser->state != MRPC_IDLE)
    return -EBADE;

  if (size < sizeof(stuser->cmd))
    return -EINVAL;

  rc = copy_from_user(&stuser->cmd, data, sizeof(stuser->cmd));
  if (rc)
    return -EFAULT;

  size -= sizeof(stuser->cmd);
  data += sizeof(stuser->cmd);
  if (size > sizeof(stuser->data))
    return -EINVAL;

  rc = copy_from_user(&stuser->data, data, size);
  if (rc)
    return -EFAULT;

  stuser->data_len = size;

  if (mutex_lock_interruptible(&stdev->mrpc_mutex))
    return -EINTR;
  mrpc_queue_cmd(stuser);
  mutex_unlock(&stdev->mrpc_mutex);

  return size;

> +		return -EINVAL;
> +
> +	if (mutex_lock_interruptible(&stdev->mrpc_mutex))
> +		return -EINTR;
> +
> +	if (stuser->state != MRPC_IDLE) {
> +		rc = -EBADE;
> +		goto out;
> +	}
> +
> +	rc = copy_from_user(&stuser->cmd, data, sizeof(stuser->cmd));
> +	if (rc) {
> +		rc = -EFAULT;
> +		goto out;
> +	}
> +
> +	data += sizeof(stuser->cmd);
> +	rc = copy_from_user(&stuser->data, data, size - sizeof(stuser->cmd));
> +	if (rc) {
> +		rc = -EFAULT;
> +		goto out;
> +	}
> +
> +	stuser->data_len = size - sizeof(stuser->cmd);
> +
> +	mrpc_queue_cmd(stuser);
> +
> +	rc = size;
> +
> +out:
> +	mutex_unlock(&stdev->mrpc_mutex);
> +
> +	return rc;
> +}
> +
> +static ssize_t switchtec_dev_read(struct file *filp, char __user *data,
> +				  size_t size, loff_t *off)
> +{
> +	struct switchtec_user *stuser = filp->private_data;
> +	struct switchtec_dev *stdev = stuser->stdev;
> +	int rc;
> +
> +	if (!stdev_is_alive(stdev))
> +		return -ENXIO;
> +
> +	if (size < sizeof(stuser->cmd) ||
> +	    size > sizeof(stuser->cmd) + SWITCHTEC_MRPC_PAYLOAD_SIZE)
> +		return -EINVAL;
> +
> +	if (stuser->state == MRPC_IDLE)
> +		return -EBADE;
> +
> +	if (filp->f_flags & O_NONBLOCK) {
> +		if (!try_wait_for_completion(&stuser->comp))
> +			return -EAGAIN;
> +	} else {
> +		rc = wait_for_completion_interruptible(&stuser->comp);
> +		if (rc < 0)
> +			return rc;
> +	}
> +
> +	if (mutex_lock_interruptible(&stdev->mrpc_mutex))
> +		return -EINTR;
> +
> +	if (stuser->state != MRPC_DONE)
> +		return -EBADE;
> +
> +	rc = copy_to_user(data, &stuser->return_code,
> +			  sizeof(stuser->return_code));
> +	if (rc) {
> +		rc = -EFAULT;
> +		goto out;
> +	}
> +
> +	data += sizeof(stuser->return_code);
> +	rc = copy_to_user(data, &stuser->data,
> +			  size - sizeof(stuser->return_code));
> +	if (rc) {
> +		rc = -EFAULT;
> +		goto out;
> +	}
> +
> +	stuser_set_state(stuser, MRPC_IDLE);
> +
> +	if (stuser->status == SWITCHTEC_MRPC_STATUS_DONE)
> +		rc = size;
> +	else if (stuser->status == SWITCHTEC_MRPC_STATUS_INTERRUPTED)
> +		rc = -ENXIO;
> +	else
> +		rc = -EBADMSG;
> +
> +out:
> +	mutex_unlock(&stdev->mrpc_mutex);
> +
> +	return rc;
> +}
> +
> +static unsigned int switchtec_dev_poll(struct file *filp, poll_table *wait)
> +{
> +	struct switchtec_user *stuser = filp->private_data;
> +	struct switchtec_dev *stdev = stuser->stdev;
> +	int ret = 0;
> +
> +	poll_wait(filp, &stuser->comp.wait, wait);
> +	poll_wait(filp, &stdev->event_wq, wait);
> +
> +	if (!stdev_is_alive(stdev))
> +		return POLLERR;
> +
> +	if (try_wait_for_completion(&stuser->comp))
> +		ret |= POLLIN | POLLRDNORM;
> +
> +	if (stuser->event_cnt != atomic_read(&stdev->event_cnt))
> +		ret |= POLLPRI | POLLRDBAND;
> +
> +	return ret;
> +}
> +
> +static const struct file_operations switchtec_fops = {
> +	.owner = THIS_MODULE,
> +	.open = switchtec_dev_open,
> +	.release = switchtec_dev_release,
> +	.write = switchtec_dev_write,
> +	.read = switchtec_dev_read,
> +	.poll = switchtec_dev_poll,
> +};
> +
> +static void stdev_release(struct device *dev)
> +{
> +	struct switchtec_dev *stdev = to_stdev(dev);
> +
> +	ida_simple_remove(&switchtec_minor_ida,
> +			  MINOR(dev->devt));
> +	kfree(stdev);
> +}
> +
> +static void stdev_unregister(struct switchtec_dev *stdev)
> +{
> +	cdev_del(&stdev->cdev);
> +	device_unregister(&stdev->dev);
> +}
> +
> +static struct switchtec_dev *stdev_create(struct pci_dev *pdev)
> +{
> +	struct switchtec_dev *stdev;
> +	int minor;
> +	struct device *dev;
> +	struct cdev *cdev;
> +	int rc;
> +
> +	stdev = kzalloc_node(sizeof(*stdev), GFP_KERNEL,
> +			     dev_to_node(&pdev->dev));
> +	if (!stdev)
> +		return ERR_PTR(-ENOMEM);
> +
> +	stdev->pdev = pdev;
> +	INIT_LIST_HEAD(&stdev->mrpc_queue);
> +	mutex_init(&stdev->mrpc_mutex);
> +	stdev->mrpc_busy = 0;
> +	INIT_WORK(&stdev->mrpc_work, mrpc_event_work);
> +	INIT_DELAYED_WORK(&stdev->mrpc_timeout, mrpc_timeout_work);
> +	init_waitqueue_head(&stdev->event_wq);
> +	atomic_set(&stdev->event_cnt, 0);
> +
> +	minor = ida_simple_get(&switchtec_minor_ida, 0, 0,
> +			       GFP_KERNEL);
> +	if (minor < 0)
> +		return ERR_PTR(minor);
> +
> +	dev = &stdev->dev;
> +	device_initialize(dev);
> +	dev->devt = MKDEV(MAJOR(switchtec_devt), minor);
> +	dev->class = switchtec_class;
> +	dev->parent = &pdev->dev;
> +	dev->release = stdev_release;
> +	dev_set_name(dev, "switchtec%d", minor);
> +
> +	cdev = &stdev->cdev;
> +	cdev_init(cdev, &switchtec_fops);
> +	cdev->owner = THIS_MODULE;
> +	cdev->kobj.parent = &dev->kobj;
> +
> +	rc = cdev_add(&stdev->cdev, dev->devt, 1);
> +	if (rc)
> +		goto err_cdev;
> +
> +	rc = device_add(dev);
> +	if (rc) {
> +		cdev_del(&stdev->cdev);
> +		put_device(dev);
> +		return ERR_PTR(rc);
> +	}
> +
> +	return stdev;
> +
> +err_cdev:
> +	ida_simple_remove(&switchtec_minor_ida, minor);
> +
> +	return ERR_PTR(rc);
> +}
> +
> +static irqreturn_t switchtec_event_isr(int irq, void *dev)
> +{
> +	struct switchtec_dev *stdev = dev;
> +	u32 reg;
> +	irqreturn_t ret = IRQ_NONE;
> +
> +	reg = ioread32(&stdev->mmio_part_cfg->mrpc_comp_hdr);
> +	if (reg & SWITCHTEC_EVENT_OCCURRED) {
> +		dev_dbg(&stdev->dev, "%s: mrpc comp\n", __func__);
> +		ret = IRQ_HANDLED;
> +		schedule_work(&stdev->mrpc_work);
> +		iowrite32(reg, &stdev->mmio_part_cfg->mrpc_comp_hdr);
> +	}
> +
> +	return ret;
> +}
> +
> +static int switchtec_init_msix_isr(struct switchtec_dev *stdev)
> +{
> +	struct pci_dev *pdev = stdev->pdev;
> +	int rc, i, msix_count, node;
> +
> +	node = dev_to_node(&pdev->dev);

Unused?

> +	for (i = 0; i < ARRAY_SIZE(stdev->msix); ++i)
> +		stdev->msix[i].entry = i;
> +
> +	msix_count = pci_enable_msix_range(pdev, stdev->msix, 1,
> +					   ARRAY_SIZE(stdev->msix));
> +	if (msix_count < 0)
> +		return msix_count;

Maybe this could be simplified by using pci_alloc_irq_vectors()?

> +	stdev->event_irq = ioread32(&stdev->mmio_part_cfg->vep_vector_number);
> +	if (stdev->event_irq < 0 || stdev->event_irq >= msix_count) {
> +		rc = -EFAULT;
> +		goto err_msix_request;
> +	}

Not sure what this is doing, but you immediately overwrite
stdev->event_irq below.  If you're using it as just a temporary above
for doing some validation, I would just use a local variable to avoid
the connection with stdev->event_irq.

> +	stdev->event_irq = stdev->msix[stdev->event_irq].vector;

Oh, I guess you allocate several MSI-X vectors, but you only actually
use one of them?  Why is that?  I was confused about why you allocate
several vectors, but there's only one devm_request_irq() call below.

> +	dev_dbg(&stdev->dev, "Using msix interrupts: event_irq=%d\n",
> +		stdev->event_irq);
> +
> +	return 0;
> +
> +err_msix_request:
> +	pci_disable_msix(pdev);
> +	return rc;
> +}
> +
> +static int switchtec_init_msi_isr(struct switchtec_dev *stdev)
> +{
> +	int rc;
> +	struct pci_dev *pdev = stdev->pdev;
> +
> +	/* Try to set up msi irq */
> +	rc = pci_enable_msi_range(pdev, 1, 4);
> +	if (rc < 0)
> +		return rc;
> +
> +	stdev->event_irq = ioread32(&stdev->mmio_part_cfg->vep_vector_number);
> +	if (stdev->event_irq < 0 || stdev->event_irq >= 4) {
> +		rc = -EFAULT;
> +		goto err_msi_request;
> +	}
> +
> +	stdev->event_irq = pdev->irq + stdev->event_irq;
> +	dev_dbg(&stdev->dev, "Using msi interrupts: event_irq=%d\n",
> +		stdev->event_irq);
> +
> +	return 0;
> +
> +err_msi_request:
> +	pci_disable_msi(pdev);
> +	return rc;
> +}
> +
> +static int switchtec_init_isr(struct switchtec_dev *stdev)
> +{
> +	int rc;
> +
> +	rc = switchtec_init_msix_isr(stdev);
> +	if (rc)
> +		rc = switchtec_init_msi_isr(stdev);
> +
> +	if (rc)
> +		return rc;
> +
> +	rc = devm_request_irq(&stdev->pdev->dev, stdev->event_irq,
> +			      switchtec_event_isr, 0, KBUILD_MODNAME, stdev);
> +
> +	return rc;
> +}
> +
> +static void switchtec_deinit_isr(struct switchtec_dev *stdev)
> +{
> +	devm_free_irq(&stdev->pdev->dev, stdev->event_irq, stdev);
> +	pci_disable_msix(stdev->pdev);
> +	pci_disable_msi(stdev->pdev);
> +}
> +
> +static void init_pff(struct switchtec_dev *stdev)
> +{
> +	int i;
> +	u32 reg;
> +	struct part_cfg_regs *pcfg = stdev->mmio_part_cfg;
> +
> +	for (i = 0; i < SWITCHTEC_MAX_PFF_CSR; i++) {
> +		reg = ioread16(&stdev->mmio_pff_csr[i].vendor_id);
> +		if (reg != MICROSEMI_VENDOR_ID)
> +			break;
> +	}
> +
> +	stdev->pff_csr_count = i;
> +
> +	reg = ioread32(&pcfg->usp_pff_inst_id);
> +	if (reg < SWITCHTEC_MAX_PFF_CSR)
> +		stdev->pff_local[reg] = 1;
> +
> +	reg = ioread32(&pcfg->vep_pff_inst_id);
> +	if (reg < SWITCHTEC_MAX_PFF_CSR)
> +		stdev->pff_local[reg] = 1;
> +
> +	for (i = 0; i < ARRAY_SIZE(pcfg->dsp_pff_inst_id); i++) {
> +		reg = ioread32(&pcfg->dsp_pff_inst_id[i]);
> +		if (reg < SWITCHTEC_MAX_PFF_CSR)
> +			stdev->pff_local[reg] = 1;
> +	}
> +}
> +
> +static int switchtec_init_pci(struct switchtec_dev *stdev,
> +			      struct pci_dev *pdev)
> +{
> +	int rc;
> +
> +	rc = pcim_enable_device(pdev);
> +	if (rc)
> +		return rc;
> +
> +	rc = pcim_iomap_regions(pdev, 0x1, KBUILD_MODNAME);
> +	if (rc)
> +		return rc;
> +
> +	pci_set_master(pdev);
> +
> +	stdev->mmio = pcim_iomap_table(pdev)[0];
> +	stdev->mmio_mrpc = stdev->mmio + SWITCHTEC_GAS_MRPC_OFFSET;
> +	stdev->mmio_sw_event = stdev->mmio + SWITCHTEC_GAS_SW_EVENT_OFFSET;
> +	stdev->mmio_sys_info = stdev->mmio + SWITCHTEC_GAS_SYS_INFO_OFFSET;
> +	stdev->mmio_flash_info = stdev->mmio + SWITCHTEC_GAS_FLASH_INFO_OFFSET;
> +	stdev->mmio_ntb = stdev->mmio + SWITCHTEC_GAS_NTB_OFFSET;
> +	stdev->partition = ioread8(&stdev->mmio_ntb->partition_id);
> +	stdev->partition_count = ioread8(&stdev->mmio_ntb->partition_count);
> +	stdev->mmio_part_cfg_all = stdev->mmio + SWITCHTEC_GAS_PART_CFG_OFFSET;
> +	stdev->mmio_part_cfg = &stdev->mmio_part_cfg_all[stdev->partition];
> +	stdev->mmio_pff_csr = stdev->mmio + SWITCHTEC_GAS_PFF_CSR_OFFSET;
> +
> +	init_pff(stdev);
> +
> +	iowrite32(SWITCHTEC_EVENT_CLEAR |
> +		  SWITCHTEC_EVENT_EN_IRQ,

Does this enable the device to generate IRQs?  You haven't connected
the ISR yet (but I guess you probably also haven't told the device to
do anything that would actually generate an IRQ).

> +		  &stdev->mmio_part_cfg->mrpc_comp_hdr);
> +
> +	pci_set_drvdata(pdev, stdev);
> +
> +	return 0;
> +}
> +
> +static void switchtec_deinit_pci(struct switchtec_dev *stdev)
> +{
> +	struct pci_dev *pdev = stdev->pdev;
> +
> +	stdev->mmio = NULL;
> +	pci_clear_master(pdev);
> +	pci_set_drvdata(pdev, NULL);
> +}
> +
> +static int switchtec_pci_probe(struct pci_dev *pdev,
> +			       const struct pci_device_id *id)
> +{
> +	struct switchtec_dev *stdev;
> +	int rc;
> +
> +	stdev = stdev_create(pdev);
> +	if (!stdev)
> +		return PTR_ERR(stdev);
> +
> +	rc = switchtec_init_pci(stdev, pdev);
> +	if (rc)
> +		goto err_init_pci;
> +
> +	rc = switchtec_init_isr(stdev);
> +	if (rc) {
> +		dev_err(&stdev->dev, "failed to init isr.\n");
> +		goto err_init_isr;
> +	}
> +
> +	dev_info(&stdev->dev, "Management device registered.\n");
> +
> +	return 0;
> +
> +err_init_isr:
> +	switchtec_deinit_pci(stdev);
> +err_init_pci:
> +	stdev_unregister(stdev);
> +	return rc;
> +}
> +
> +static void switchtec_pci_remove(struct pci_dev *pdev)
> +{
> +	struct switchtec_dev *stdev = pci_get_drvdata(pdev);
> +
> +	switchtec_deinit_isr(stdev);
> +	switchtec_deinit_pci(stdev);
> +	stdev_unregister(stdev);
> +}
> +
> +#define SWITCHTEC_PCI_DEVICE(device_id) \
> +	{ \
> +		.vendor     = MICROSEMI_VENDOR_ID, \
> +		.device     = device_id, \
> +		.subvendor  = PCI_ANY_ID, \
> +		.subdevice  = PCI_ANY_ID, \
> +		.class      = MICROSEMI_MGMT_CLASSCODE, \
> +		.class_mask = 0xFFFFFFFF, \
> +	}, \
> +	{ \
> +		.vendor     = MICROSEMI_VENDOR_ID, \
> +		.device     = device_id, \
> +		.subvendor  = PCI_ANY_ID, \
> +		.subdevice  = PCI_ANY_ID, \
> +		.class      = MICROSEMI_NTB_CLASSCODE, \
> +		.class_mask = 0xFFFFFFFF, \
> +	}
> +
> +static const struct pci_device_id switchtec_pci_tbl[] = {
> +	SWITCHTEC_PCI_DEVICE(0x8531),  //PFX 24xG3
> +	SWITCHTEC_PCI_DEVICE(0x8532),  //PFX 32xG3
> +	SWITCHTEC_PCI_DEVICE(0x8533),  //PFX 48xG3
> +	SWITCHTEC_PCI_DEVICE(0x8534),  //PFX 64xG3
> +	SWITCHTEC_PCI_DEVICE(0x8535),  //PFX 80xG3
> +	SWITCHTEC_PCI_DEVICE(0x8536),  //PFX 96xG3
> +	SWITCHTEC_PCI_DEVICE(0x8543),  //PSX 48xG3
> +	SWITCHTEC_PCI_DEVICE(0x8544),  //PSX 64xG3
> +	SWITCHTEC_PCI_DEVICE(0x8545),  //PSX 80xG3
> +	SWITCHTEC_PCI_DEVICE(0x8546),  //PSX 96xG3
> +	{0}
> +};
> +MODULE_DEVICE_TABLE(pci, switchtec_pci_tbl);
> +
> +static struct pci_driver switchtec_pci_driver = {
> +	.name		= KBUILD_MODNAME,
> +	.id_table	= switchtec_pci_tbl,
> +	.probe		= switchtec_pci_probe,
> +	.remove		= switchtec_pci_remove,
> +};
> +
> +static int __init switchtec_init(void)
> +{
> +	int rc;
> +
> +	max_devices = max(max_devices, 256);
> +	rc = alloc_chrdev_region(&switchtec_devt, 0, max_devices,
> +				 "switchtec");
> +	if (rc)
> +		return rc;
> +
> +	switchtec_class = class_create(THIS_MODULE, "switchtec");
> +	if (IS_ERR(switchtec_class)) {
> +		rc = PTR_ERR(switchtec_class);
> +		goto err_create_class;
> +	}
> +
> +	rc = pci_register_driver(&switchtec_pci_driver);
> +	if (rc)
> +		goto err_pci_register;
> +
> +	pr_info(KBUILD_MODNAME ": loaded.\n");
> +
> +	return 0;
> +
> +err_pci_register:
> +	class_destroy(switchtec_class);
> +
> +err_create_class:
> +	unregister_chrdev_region(switchtec_devt, max_devices);
> +
> +	return rc;
> +}
> +module_init(switchtec_init);
> +
> +static void __exit switchtec_exit(void)
> +{
> +	pci_unregister_driver(&switchtec_pci_driver);
> +	class_destroy(switchtec_class);
> +	unregister_chrdev_region(switchtec_devt, max_devices);
> +	ida_destroy(&switchtec_minor_ida);
> +
> +	pr_info(KBUILD_MODNAME ": unloaded.\n");
> +}
> +module_exit(switchtec_exit);
> -- 
> 2.1.4
 



More information about the Linux-nvme mailing list