[RFC 3/3] CMDQ: Mediatek CMDQ driver

Horng-Shyang Liao hs.liao at mediatek.com
Thu Jan 28 23:39:10 PST 2016


Hi Dan,

Many thanks for your comments and time.
I've replied with my plan inline.


On Thu, 2016-01-28 at 12:49 +0800, Daniel Kurtz wrote:
> Hi HS,
> 
> Sorry for the delay.  It is hard to find time to review a >3700 line
> driver :-o in detail....
> 
> Some review comments inline, although I still do not completely
> understand all that this driver does and how it works.
> I'll try to find time to go through this driver in detail again next
> time you post it for review.
> 
> On Tue, Jan 19, 2016 at 9:14 PM,  <hs.liao at mediatek.com> wrote:
> > From: HS Liao <hs.liao at mediatek.com>
> >
> > This patch is the first version of the Mediatek Command Queue (CMDQ)
> > driver. The CMDQ is used to help read/write registers under critical
> > timing constraints, such as updating the display configuration during
> > vblank. It controls the Global Command Engine (GCE) hardware to achieve
> > this. Currently, CMDQ only supports display-related hardware, but we
> > expect it can be extended to other hardware for future requirements.
> >
> > Signed-off-by: HS Liao <hs.liao at mediatek.com>
> 
> [snip]
> 
> > diff --git a/drivers/soc/mediatek/mtk-cmdq.c b/drivers/soc/mediatek/mtk-cmdq.c
> > new file mode 100644
> > index 0000000..7570f00
> > --- /dev/null
> > +++ b/drivers/soc/mediatek/mtk-cmdq.c
> 
> [snip]
> 
> > +/*
> > + * Maximum prefetch buffer size.
> > + * Unit is instructions.
> > + */
> > +#define CMDQ_MAX_PREFETCH_INSTUCTION   240
> 
> INSTRUCTION

will fix

> [snip]
> 
> > +
> > +/* get lsb for subsys encoding in arg_a (range: 0 - 31) */
> > +
> > +enum cmdq_eng {
> > +       CMDQ_ENG_DISP_UFOE = 0,
> > +       CMDQ_ENG_DISP_AAL,
> > +       CMDQ_ENG_DISP_COLOR0,
> > +       CMDQ_ENG_DISP_COLOR1,
> > +       CMDQ_ENG_DISP_RDMA0,
> > +       CMDQ_ENG_DISP_RDMA1,
> > +       CMDQ_ENG_DISP_RDMA2,
> > +       CMDQ_ENG_DISP_WDMA0,
> > +       CMDQ_ENG_DISP_WDMA1,
> > +       CMDQ_ENG_DISP_OVL0,
> > +       CMDQ_ENG_DISP_OVL1,
> > +       CMDQ_ENG_DISP_GAMMA,
> > +       CMDQ_ENG_DISP_DSI0_CMD,
> > +       CMDQ_ENG_DISP_DSI1_CMD,
> 
> Why do these last two have "_CMD" at the end?

will remove

> > +       CMDQ_MAX_ENGINE_COUNT   /* ALWAYS keep at the end */
> > +};
> > +
> > +struct cmdq_command {
> > +       struct cmdq             *cqctx;
> > +       u32                     scenario;
> > +       /* task priority (NOT HW thread priority) */
> > +       u32                     priority;
> > +       /* bit flag of used engines */
> > +       u64                     engine_flag;
> > +       /*
> > +        * pointer of instruction buffer
> > +        * This must point to an 64-bit aligned u32 array
> > +        */
> > +       u32                     *va_base;
> 
> All of your "va" and "va_base" should be void *, not u32 *.

will do

> > +       /* size of instruction buffer, in bytes. */
> > +       u32                     block_size;
> 
> Better to use size_t for "size in bytes".

will fix

> > +};
> > +
> > +enum cmdq_code {
> > +       /* These are actual HW op code. */
> > +       CMDQ_CODE_MOVE = 0x02,
> > +       CMDQ_CODE_WRITE = 0x04,
> > +       CMDQ_CODE_JUMP = 0x10,
> > +       CMDQ_CODE_WFE = 0x20,   /* wait for event (and clear) */
> > +       CMDQ_CODE_CLEAR_EVENT = 0x21,   /* clear event */
> > +       CMDQ_CODE_EOC = 0x40,   /* end of command */
> > +};
> > +
> > +enum cmdq_task_state {
> > +       TASK_STATE_IDLE,        /* free task */
> > +       TASK_STATE_BUSY,        /* task running on a thread */
> > +       TASK_STATE_KILLED,      /* task process being killed */
> > +       TASK_STATE_ERROR,       /* task execution error */
> > +       TASK_STATE_DONE,        /* task finished */
> > +       TASK_STATE_WAITING,     /* allocated but waiting for available thread */
> > +};
> > +
> > +struct cmdq_cmd_buf {
> > +       atomic_t                used;
> > +       void                    *va;
> > +       dma_addr_t              pa;
> > +};
> > +
> > +struct cmdq_task_cb {
> > +       /* called by isr */
> > +       cmdq_async_flush_cb     isr_cb;
> > +       void                    *isr_data;
> > +       /* called by releasing task */
> > +       cmdq_async_flush_cb     done_cb;
> > +       void                    *done_data;
> > +};
> > +
> > +struct cmdq_task {
> > +       struct cmdq             *cqctx;
> > +       struct list_head        list_entry;
> > +
> > +       /* state for task life cycle */
> > +       enum cmdq_task_state    task_state;
> > +       /* virtual address of command buffer */
> > +       u32                     *va_base;
> > +       /* physical address of command buffer */
> > +       dma_addr_t              mva_base;
> > +       /* size of allocated command buffer */
> > +       u32                     buf_size;
> 
> size_t

will fix

> > +       /* It points to a cmdq_cmd_buf if this task use command buffer pool. */
> > +       struct cmdq_cmd_buf     *cmd_buf;
> > +
> > +       int                     scenario;
> > +       int                     priority;
> > +       u64                     engine_flag;
> > +       u32                     command_size;
> > +       u32                     num_cmd;
> > +       int                     reorder;
> > +       /* HW thread ID; CMDQ_INVALID_THREAD if not running */
> > +       int                     thread;
> 
> I think this driver will be a lot more clear if you do this:
> 
> struct cmdq_thread *thread;
> 
> And then use "NULL" for "invalid thread" instead of CMDQ_INVALID_THREAD.

I will try to use struct cmdq_thread * instead of a thread id where
possible. If CMDQ_INVALID_THREAD is no longer needed, I will remove it.
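
A rough sketch of the planned change (field and helper names are
tentative, not final):

struct cmdq_task {
	struct cmdq		*cqctx;
	/* associated HW thread; NULL when the task is not running */
	struct cmdq_thread	*thread;
	/* ... other fields unchanged ... */
};

static bool cmdq_task_is_running(const struct cmdq_task *task)
{
	return task->thread != NULL;
}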

> > +       /* flag of IRQ received */
> > +       int                     irq_flag;
> > +       /* callback functions */
> > +       struct cmdq_task_cb     cb;
> > +       /* work item when auto release is used */
> > +       struct work_struct      auto_release_work;
> > +
> > +       unsigned long long      submit; /* submit time */
> > +
> > +       pid_t                   caller_pid;
> > +       char                    caller_name[TASK_COMM_LEN];
> > +};
> > +
> > +struct cmdq_thread {
> > +       u32                     task_count;
> > +       u32                     wait_cookie;
> > +       u32                     next_cookie;
> > +       struct cmdq_task        *cur_task[CMDQ_MAX_TASK_IN_THREAD];
> > +};
> > +
> > +struct cmdq {
> > +       struct device           *dev;
> > +       struct notifier_block   pm_notifier;
> > +
> > +       void __iomem            *base_va;
> > +       unsigned long           base_pa;
> 
> I think we can remove base_pa (it is only used in a debug print), and just
> rename "base_va" to "base".

will do

> > +       u32                     irq;
> > +
> > +       /*
> > +        * task information
> > +        * task_cache: struct cmdq_task object cache
> > +        * task_free_list: unused free tasks
> > +        * task_active_list: active tasks
> > +        * task_consume_wait_queue_item: task consumption work item
> > +        * task_auto_release_wq: auto-release workqueue
> > +        * task_consume_wq: task consumption workqueue (for queued tasks)
> > +        */
> > +       struct kmem_cache       *task_cache;
> > +       struct list_head        task_free_list;
> > +       struct list_head        task_active_list;
> > +       struct list_head        task_wait_list;
> > +       struct work_struct      task_consume_wait_queue_item;
> > +       struct workqueue_struct *task_auto_release_wq;
> > +       struct workqueue_struct *task_consume_wq;
> > +       u16                     task_count[CMDQ_MAX_THREAD_COUNT];
> 
> AFAICT, this task_count is not used?

will remove

> > +
> > +       struct cmdq_thread      thread[CMDQ_MAX_THREAD_COUNT];
> > +
> > +       /* mutex, spinlock, flag */
> > +       struct mutex            task_mutex;     /* for task list */
> > +       struct mutex            clock_mutex;    /* for clock operation */
> > +       spinlock_t              thread_lock;    /* for cmdq hardware thread */
> > +       int                     thread_usage;
> > +       spinlock_t              exec_lock;      /* for exec task */
> > +
> > +       /* suspend */
> > +       bool                    suspended;
> > +
> > +       /* command buffer pool */
> > +       struct cmdq_cmd_buf     cmd_buf_pool[CMDQ_CMD_BUF_POOL_BUF_NUM];
> > +
> > +       /*
> > +        * notification
> > +        * wait_queue: for task done
> > +        * thread_dispatch_queue: for thread acquiring
> > +        */
> > +       wait_queue_head_t       wait_queue[CMDQ_MAX_THREAD_COUNT];
> > +       wait_queue_head_t       thread_dispatch_queue;
> > +
> > +       /* ccf */
> > +       struct clk              *clock;
> > +};
> > +
> > +struct cmdq_event_name {
> > +       enum cmdq_event event;
> > +       char            *name;
> 
> const char *

will do

> > +};
> > +
> > +struct cmdq_subsys {
> > +       u32     base_addr;
> > +       int     id;
> > +       char    *grp_name;
> 
> const char *

will do

> > +};
> > +
> > +static const struct cmdq_event_name g_event_name[] = {
> > +       /* Display start of frame(SOF) events */
> > +       {CMDQ_EVENT_DISP_OVL0_SOF, "CMDQ_EVENT_DISP_OVL0_SOF",},
> 
> You can drop the "," inside "}".

will do

> > +       {CMDQ_EVENT_DISP_OVL1_SOF, "CMDQ_EVENT_DISP_OVL1_SOF",},
> > +       {CMDQ_EVENT_DISP_RDMA0_SOF, "CMDQ_EVENT_DISP_RDMA0_SOF",},
> > +       {CMDQ_EVENT_DISP_RDMA1_SOF, "CMDQ_EVENT_DISP_RDMA1_SOF",},
> > +       {CMDQ_EVENT_DISP_RDMA2_SOF, "CMDQ_EVENT_DISP_RDMA2_SOF",},
> > +       {CMDQ_EVENT_DISP_WDMA0_SOF, "CMDQ_EVENT_DISP_WDMA0_SOF",},
> > +       {CMDQ_EVENT_DISP_WDMA1_SOF, "CMDQ_EVENT_DISP_WDMA1_SOF",},
> > +       /* Display end of frame(EOF) events */
> > +       {CMDQ_EVENT_DISP_OVL0_EOF, "CMDQ_EVENT_DISP_OVL0_EOF",},
> > +       {CMDQ_EVENT_DISP_OVL1_EOF, "CMDQ_EVENT_DISP_OVL1_EOF",},
> > +       {CMDQ_EVENT_DISP_RDMA0_EOF, "CMDQ_EVENT_DISP_RDMA0_EOF",},
> > +       {CMDQ_EVENT_DISP_RDMA1_EOF, "CMDQ_EVENT_DISP_RDMA1_EOF",},
> > +       {CMDQ_EVENT_DISP_RDMA2_EOF, "CMDQ_EVENT_DISP_RDMA2_EOF",},
> > +       {CMDQ_EVENT_DISP_WDMA0_EOF, "CMDQ_EVENT_DISP_WDMA0_EOF",},
> > +       {CMDQ_EVENT_DISP_WDMA1_EOF, "CMDQ_EVENT_DISP_WDMA1_EOF",},
> > +       /* Mutex end of frame(EOF) events */
> > +       {CMDQ_EVENT_MUTEX0_STREAM_EOF, "CMDQ_EVENT_MUTEX0_STREAM_EOF",},
> > +       {CMDQ_EVENT_MUTEX1_STREAM_EOF, "CMDQ_EVENT_MUTEX1_STREAM_EOF",},
> > +       {CMDQ_EVENT_MUTEX2_STREAM_EOF, "CMDQ_EVENT_MUTEX2_STREAM_EOF",},
> > +       {CMDQ_EVENT_MUTEX3_STREAM_EOF, "CMDQ_EVENT_MUTEX3_STREAM_EOF",},
> > +       {CMDQ_EVENT_MUTEX4_STREAM_EOF, "CMDQ_EVENT_MUTEX4_STREAM_EOF",},
> > +       /* Display underrun events */
> > +       {CMDQ_EVENT_DISP_RDMA0_UNDERRUN, "CMDQ_EVENT_DISP_RDMA0_UNDERRUN",},
> > +       {CMDQ_EVENT_DISP_RDMA1_UNDERRUN, "CMDQ_EVENT_DISP_RDMA1_UNDERRUN",},
> > +       {CMDQ_EVENT_DISP_RDMA2_UNDERRUN, "CMDQ_EVENT_DISP_RDMA2_UNDERRUN",},
> > +       /* Keep this at the end of HW events */
> > +       {CMDQ_MAX_HW_EVENT_COUNT, "CMDQ_MAX_HW_EVENT_COUNT",},
> > +       /* GPR events */
> 
> What are "GPR" events?

They are events for the General Purpose Registers (GPRs) of the GCE.
I will remove them if the driver doesn't need to handle them.

> > +       {CMDQ_SYNC_TOKEN_GPR_SET_0, "CMDQ_SYNC_TOKEN_GPR_SET_0",},
> > +       {CMDQ_SYNC_TOKEN_GPR_SET_1, "CMDQ_SYNC_TOKEN_GPR_SET_1",},
> > +       {CMDQ_SYNC_TOKEN_GPR_SET_2, "CMDQ_SYNC_TOKEN_GPR_SET_2",},
> > +       {CMDQ_SYNC_TOKEN_GPR_SET_3, "CMDQ_SYNC_TOKEN_GPR_SET_3",},
> > +       {CMDQ_SYNC_TOKEN_GPR_SET_4, "CMDQ_SYNC_TOKEN_GPR_SET_4",},
> > +       /* This is max event and also can be used as mask. */
> > +       {CMDQ_SYNC_TOKEN_MAX, "CMDQ_SYNC_TOKEN_MAX",},
> > +       /* Invalid event */
> > +       {CMDQ_SYNC_TOKEN_INVALID, "CMDQ_SYNC_TOKEN_INVALID",},
> > +};
> > +
> > +static const struct cmdq_subsys g_subsys[] = {
> > +       {0x1400, 1, "MMSYS"},
> > +       {0x1401, 2, "DISP"},
> > +       {0x1402, 3, "DISP"},
> 
> This isn't going to scale.  These addresses could be different on
> different chips.
> Instead of a static table like this, we probably need specify to the
> connection between gce and other devices via devicetree phandles, and
> then use the phandles to lookup the corresponding device address
> range.

I will define them in the device tree, e.g.:

cmdq {
	reg_domain = <0x14000000 0x14010000 0x14020000>;
};
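
A rough sketch of how the driver could parse such a property (the
property name and helper here are only placeholders):

static int cmdq_subsys_init(struct cmdq *cqctx, struct device_node *np)
{
	int i, count;

	count = of_property_count_u32_elems(np, "reg_domain");
	if (count <= 0)
		return -EINVAL;

	for (i = 0; i < count; i++) {
		u32 base;

		if (of_property_read_u32_index(np, "reg_domain", i, &base))
			return -EINVAL;
		/* record that subsys id i starts at "base" */
	}

	return 0;
}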

> > +};
> > +
> > +static const char *cmdq_event_get_name(enum cmdq_event event)
> > +{
> > +       int i;
> > +
> > +       for (i = 0; i < ARRAY_SIZE(g_event_name); i++)
> > +               if (g_event_name[i].event == event)
> > +                       return g_event_name[i].name;
> > +
> > +       return "CMDQ_EVENT_UNKNOWN";
> > +}
> > +
> > +static void cmdq_event_set(void __iomem *gce_base_va, enum cmdq_event event)
> > +{
> > +       writel(CMDQ_SYNC_TOKEN_SET | event,
> > +              gce_base_va + CMDQ_SYNC_TOKEN_UPD_OFFSET);
> > +}
> > +
> 
> For any cmdq helper like this, just pass cmdq, and extract base in the
> helper, eg:
> 
> static void cmdq_event_set(struct cmdq *cmdq, enum cmdq_event event)
> {
> 	writel(CMDQ_SYNC_TOKEN_SET | event,
> 	       cmdq->base + CMDQ_SYNC_TOKEN_UPD_OFFSET);
> }

will do

> [snip]
> 
> > +
> > +static bool cmdq_scenario_is_prefetch(enum cmdq_scenario scenario)
> > +{
> > +       switch (scenario) {
> > +       case CMDQ_SCENARIO_PRIMARY_DISP:
> > +               /*
> > +                * Primary DISP HW should enable prefetch.
> > +                * Since thread 0/1 shares one prefetch buffer,
> > +                * we allow only PRIMARY path to use prefetch.
> > +                */
> 
> I don't do think we want to hard code this policy in the cmdq driver.
> There are systems that will only use the "sub" display channel, these systems
> should be able to do prefetch for the "SUB" display if it is the only
> one in use.

After discussing with the Mediatek display owner, we think the prefetch
function can be removed. If we really need it in the future, I will add
it back and provide controllable interfaces.

> > +               return true;
> > +       default:
> > +               return false;
> > +       }
> > +
> > +       return false;
> > +}
> 
> [snip]
> 
> > +static u64 cmdq_scenario_get_flag(struct cmdq *cqctx,
> > +                                 enum cmdq_scenario scenario)
> > +{
> > +       struct device *dev = cqctx->dev;
> > +       u64 flag;
> > +
> > +       switch (scenario) {
> > +       case CMDQ_SCENARIO_PRIMARY_DISP:
> > +               flag = ((1LL << CMDQ_ENG_DISP_OVL0) |
> > +                       (1LL << CMDQ_ENG_DISP_COLOR0) |
> > +                       (1LL << CMDQ_ENG_DISP_AAL) |
> > +                       (1LL << CMDQ_ENG_DISP_RDMA0) |
> > +                       (1LL << CMDQ_ENG_DISP_UFOE) |
> > +                       (1LL << CMDQ_ENG_DISP_DSI0_CMD));
> > +               break;
> > +       case CMDQ_SCENARIO_SUB_DISP:
> > +               flag = ((1LL << CMDQ_ENG_DISP_OVL1) |
> > +                       (1LL << CMDQ_ENG_DISP_COLOR1) |
> > +                       (1LL << CMDQ_ENG_DISP_GAMMA) |
> > +                       (1LL << CMDQ_ENG_DISP_RDMA1) |
> > +                       (1LL << CMDQ_ENG_DISP_DSI1_CMD));
> > +               break;
> 
> Instead of encoding policy (these arbitrary "scenarios"), perhaps the
> cmdq driver should just provide these flag bits, and the display
> subsystem can use them to build the "flag" parameter itself.
> 
> After all, the exact configuration of mmsys components is somewhat flexible.

I will remove the scenarios and provide flags for the display driver.
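
e.g., a rough sketch of the caller side; the display driver would
compose the engine flag itself:

	u64 engine_flag = BIT_ULL(CMDQ_ENG_DISP_OVL0) |
			  BIT_ULL(CMDQ_ENG_DISP_RDMA0) |
			  BIT_ULL(CMDQ_ENG_DISP_COLOR0) |
			  BIT_ULL(CMDQ_ENG_DISP_DSI0);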

> > +       default:
> > +               dev_err(dev, "unknown scenario type %d\n", scenario);
> > +               flag = 0LL;
> > +               break;
> > +       }
> > +
> > +       return flag;
> > +}
> 
> [snip]
> 
> > +static void cmdq_task_release_unlocked(struct cmdq_task *task)
> > +{
> > +       struct cmdq *cqctx = task->cqctx;
> > +
> > +       /* This func should be inside cqctx->task_mutex mutex */
> > +       lockdep_assert_held(&cqctx->task_mutex);
> > +
> > +       task->task_state = TASK_STATE_IDLE;
> > +       task->thread = CMDQ_INVALID_THREAD;
> > +
> > +       cmdq_task_free_command_buffer(task);
> > +
> > +       /* remove from active/waiting list */
> > +       list_del_init(&task->list_entry);
> > +       /* insert into free list. Currently we don't shrink free list. */
> > +       list_add_tail(&task->list_entry, &cqctx->task_free_list);
> 
> list_move_tail()

will do

> > +}
> 
> [snip]
> 
> > +static struct cmdq_task *cmdq_core_acquire_task(struct cmdq_command *cmd_desc,
> > +                                               struct cmdq_task_cb *cb)
> > +{
> > +       struct cmdq *cqctx = cmd_desc->cqctx;
> > +       struct device *dev = cqctx->dev;
> > +       int status;
> > +       struct cmdq_task *task;
> > +
> > +       task = cmdq_core_find_free_task(cqctx);
> > +       if (!task) {
> > +               dev_err(dev, "can't acquire task info\n");
> > +               return NULL;
> > +       }
> > +
> > +       /* initialize field values */
> > +       task->scenario = cmd_desc->scenario;
> > +       task->priority = cmd_desc->priority;
> > +       task->engine_flag = cmd_desc->engine_flag;
> > +       task->task_state = TASK_STATE_WAITING;
> > +       task->reorder = 0;
> > +       task->thread = CMDQ_INVALID_THREAD;
> > +       task->irq_flag = 0x0;
> > +       memcpy(&task->cb, cb, sizeof(*cb));
> 
> task->cb = *cb;

will do

> > +       task->command_size = cmd_desc->block_size;
> > +
> > +       /* store caller info for debug */
> > +       if (current) {
> > +               task->caller_pid = current->pid;
> > +               memcpy(task->caller_name, current->comm, sizeof(current->comm));
> > +       }
> > +
> > +       status = cmdq_core_sync_command(task, cmd_desc);
> > +       if (status < 0) {
> > +               dev_err(dev, "fail to sync command\n");
> > +               cmdq_task_release_internal(task);
> > +               return NULL;
> > +       }
> > +
> > +       /* insert into waiting list to process */
> > +       mutex_lock(&cqctx->task_mutex);
> > +       if (task) {
> > +               struct list_head *in_item = &cqctx->task_wait_list;
> > +               struct cmdq_task *wait_task = NULL;
> > +               struct list_head *p = NULL;
> > +
> > +               task->submit = sched_clock();
> > +
> > +               /*
> > +                * add to waiting list, keep it sorted by priority
> > +                * so that we add high-priority tasks first.
> > +                */
> > +               list_for_each(p, &cqctx->task_wait_list) {
> > +                       wait_task = list_entry(p, struct cmdq_task, list_entry);
> > +                       /*
> > +                        * keep the list sorted.
> > +                        * higher priority tasks are inserted
> > +                        * in front of the queue
> > +                        */
> > +                       if (wait_task->priority < task->priority)
> > +                               break;
> > +
> > +                       in_item = p;
> > +               }
> > +
> > +               list_add(&task->list_entry, in_item);
> > +       }
> > +       mutex_unlock(&cqctx->task_mutex);
> > +
> > +       return task;
> > +}
> > +
> > +static int cmdq_clk_enable(struct cmdq *cqctx)
> > +{
> > +       struct device *dev = cqctx->dev;
> > +       int ret = 0;
> > +
> > +       if (!cqctx->thread_usage) {
> > +               ret = clk_prepare_enable(cqctx->clock);
> > +               if (ret) {
> > +                       dev_err(dev, "prepare and enable clk:%s fail\n",
> > +                               CMDQ_CLK_NAME);
> > +                       return ret;
> > +               }
> > +               cmdq_event_reset(cqctx);
> 
> AFAICT, we only need to reset the device when it power cycles, not
> whenever we re-enable its clock.
> I think cmdq_event_reset() here should really be a pm_runtime resume handler,
> and "cmdq_clk_enable()" should just do a clk_prepare_enable() and
> pm_runtime_get().
> 
> Then you won't need your own custom refcounting (thread_usage).
> You won't need clock_mutex, either.

I will remove it and let the display driver control event clearing.
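
If a clock helper is still needed, a minimal sketch along the lines Dan
suggested (untested, assuming runtime PM is wired up) could be:

static int cmdq_clk_enable(struct cmdq *cqctx)
{
	int ret;

	ret = pm_runtime_get_sync(cqctx->dev);
	if (ret < 0)
		return ret;

	ret = clk_prepare_enable(cqctx->clock);
	if (ret)
		pm_runtime_put(cqctx->dev);

	return ret;
}

static void cmdq_clk_disable(struct cmdq *cqctx)
{
	clk_disable_unprepare(cqctx->clock);
	pm_runtime_put(cqctx->dev);
}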

> > +       }
> > +       cqctx->thread_usage++;
> > +
> > +       return ret;
> > +}
> > +
> > +static void cmdq_clk_disable(struct cmdq *cqctx)
> > +{
> > +       cqctx->thread_usage--;
> > +       if (cqctx->thread_usage <= 0)
> > +               clk_disable_unprepare(cqctx->clock);
> > +}
> > +
> > +static int cmdq_core_find_free_thread(struct cmdq *cqctx, u32 scenario)
> > +{
> > +       struct device *dev = cqctx->dev;
> > +       struct cmdq_thread *thread;
> > +       int tid;
> > +       u32 next_cookie;
> > +
> > +       thread = cqctx->thread;
> > +       tid = cmdq_scenario_get_thread(scenario);
> 
> cmdq_core_acquire_thread() and cmdq_scenario_get_thread() should take
> cmdq, and return a struct cmdq_thread * instead of tid.
> 
> On error, they should either return a standard linux error
> (-EBUSY/-EINVAL, etc) as ERR_PTR(), or if fine-grained error detection
> is not required, they can just return NULL.

will do
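
e.g., roughly (cmdq_find_idle_thread() is a made-up placeholder):

static struct cmdq_thread *cmdq_core_acquire_thread(struct cmdq *cqctx)
{
	struct cmdq_thread *thread = cmdq_find_idle_thread(cqctx);

	if (!thread)
		return ERR_PTR(-EBUSY);

	return thread;
}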

> > +
> > +       /*
> > +        * Currently, we only support disp,
> > +        * so it's error if tid is CMDQ_INVALID_THREAD.
> > +        */
> > +       if (tid == CMDQ_INVALID_THREAD) {
> > +               dev_err(dev, "got CMDQ_INVALID_THREAD!!!\n");
> > +               return tid;
> > +       }
> > +
> > +       /*
> > +        * make sure the found thread has enough space for the task;
> > +        * cmdq_thread->cur_task has size limitation.
> > +        */
> > +       if (thread[tid].task_count >= CMDQ_MAX_TASK_IN_THREAD)
> > +               tid = CMDQ_INVALID_THREAD;
> > +
> > +       next_cookie = thread[tid].next_cookie % CMDQ_MAX_TASK_IN_THREAD;
> > +       if (thread[tid].cur_task[next_cookie])
> > +               tid = CMDQ_INVALID_THREAD;
> > +
> > +       return tid;
> > +}
> 
> [snip]
> 
> > +static int cmdq_thread_suspend(struct cmdq *cqctx, int tid)
> > +{
> > +       struct device *dev = cqctx->dev;
> > +       void __iomem *gce_base_va = cqctx->base_va;
> > +       u32 enabled;
> > +       u32 status;
> > +
> > +       /* write suspend bit */
> > +       writel(CMDQ_THR_SUSPEND,
> > +              gce_base_va + CMDQ_THR_SUSPEND_TASK_OFFSET +
> > +              CMDQ_THR_SHIFT * tid);
> 
> All of these per-thread register accesses would be cleaner if we just
> they were in helper functions on the struct cmdq_thread *:
> 
> 
> First, when initializing cmdq->thread, save the thread's base:
> 
> thread->base = cmdq->base + cmdq_thread_to_tid(cmdq, thread) * CMDQ_THR_SHIFT;
> 
> Then, use these simple helper functions:
> 
> static void cmdq_thread_writel(struct cmdq_thread *thread, u32 value,
> 			       unsigned offset)
> {
> 	writel(value, thread->base + offset);
> }
> 
> static u32 cmdq_thread_readl(struct cmdq_thread *thread, unsigned offset)
> {
> 	return readl(thread->base + offset);
> }
> 
> cmdq_thread_writel(thread, CMDQ_THR_SUSPEND, CMDQ_THR_SUSPEND_TASK_OFFSET);
> enable = cmdq_thread_readl(thread, CMDQ_THR_ENABLE_TASK_OFFSET);

will do

> > +
> > +       /* If already disabled, treat as suspended successful. */
> > +       enabled = readl(gce_base_va + CMDQ_THR_ENABLE_TASK_OFFSET +
> > +                       CMDQ_THR_SHIFT * tid);
> > +       if (!(enabled & CMDQ_THR_ENABLED))
> > +               return 0;
> > +
> > +       /* poll suspended status */
> > +       if (readl_poll_timeout_atomic(gce_base_va +
> > +                                     CMDQ_THR_CURR_STATUS_OFFSET +
> > +                                     CMDQ_THR_SHIFT * tid,
> > +                                     status,
> > +                                     status & CMDQ_THR_STATUS_SUSPENDED,
> > +                                     0, 10)) {
> > +               dev_err(dev, "Suspend HW thread %d failed\n", tid);
> > +               return -EFAULT;
> > +       }
> > +
> > +       return 0;
> > +}
> 
> [snip]
> 
> > +static void cmdq_core_parse_error(struct cmdq_task *task, int tid,
> > +                                 const char **module_name, int *flag,
> > +                                 u32 *inst_a, u32 *inst_b)
> 
> Hmm. Instead of back calculating details of the failing task, I think it might
> be more natural to record these details when creating the command

Because CMDQ needs to deal with time-critical situations, we only parse
commands into a human-readable log on failure.

>  shouldn't you be
> able to use the task's cookie to look it up on the list of tasks to
> get its details?

We already know the task here, so we don't need to look it up from the
cookie.

> > +{
> > +       int irq_flag = task->irq_flag;
> > +       u32 insts[2] = { 0 };
> > +       const char *module;
> > +
> > +       /*
> > +        * other cases, use instruction to judge
> > +        * because scenario / HW flag are not sufficient
> > +        */
> > +       if (cmdq_task_get_pc_and_inst(task, tid, insts)) {
> > +               u32 op, arg_a, arg_b;
> > +
> > +               op = insts[1] >> CMDQ_OP_CODE_SHIFT;
> > +               arg_a = insts[1] & CMDQ_ARG_A_MASK;
> > +               arg_b = insts[0];
> > +
> > +               switch (op) {
> > +               case CMDQ_CODE_WRITE:
> > +                       module = cmdq_core_parse_module_from_subsys(arg_a);
> > +                       break;
> > +               case CMDQ_CODE_WFE:
> > +                       /* arg_a is the event id */
> > +                       module = cmdq_event_get_module((enum cmdq_event)arg_a);
> > +                       break;
> > +               case CMDQ_CODE_MOVE:
> > +               case CMDQ_CODE_JUMP:
> > +               case CMDQ_CODE_EOC:
> > +               default:
> > +                       module = "CMDQ";
> > +                       break;
> > +               }
> > +       } else {
> > +               module = "CMDQ";
> > +       }
> > +
> > +       /* fill output parameter */
> > +       *module_name = module;
> > +       *flag = irq_flag;
> > +       *inst_a = insts[1];
> > +       *inst_b = insts[0];
> > +}
> 
> [snip]
> 
> > +static int cmdq_task_insert_into_thread(struct cmdq_task *last,
> > +                                       struct cmdq_task *task,
> > +                                       int tid, int loop)
> > +{
> > +       struct cmdq *cqctx = task->cqctx;
> > +       struct device *dev = cqctx->dev;
> > +       void __iomem *gce_base_va = cqctx->base_va;
> > +       int status = 0;
> > +       struct cmdq_thread *thread;
> > +       struct cmdq_task *prev_task;
> > +       int index;
> > +       int prev;
> > +       int cookie;
> > +
> > +       thread = &cqctx->thread[tid];
> > +       cookie = thread->next_cookie;
> > +
> > +       /*
> > +        * Traverse forward to adjust tasks' order
> > +        * according to their priorities.
> > +        */
> 
> AFAICT, this driver does not actually use priorities, so this can all
> be simplified.
> If we ever implement priorities, we can add this back, since there
> will then be a way
> to test it.

will remove

> > +       for (prev = (cookie % CMDQ_MAX_TASK_IN_THREAD); loop > 0; loop--) {
> > +               index = prev;
> > +               if (index < 0)
> > +                       index = CMDQ_MAX_TASK_IN_THREAD - 1;
> > +
> > +               prev = index - 1;
> > +               if (prev < 0)
> > +                       prev = CMDQ_MAX_TASK_IN_THREAD - 1;
> > +
> > +               prev_task = thread->cur_task[prev];
> > +
> > +               /* maybe the job is killed, search a new one */
> > +               for (; !prev_task && loop > 1; loop--) {
> > +                       dev_err(dev,
> > +                               "prev_task is NULL, prev:%d, loop:%d, index:%d\n",
> > +                               prev, loop, index);
> > +
> > +                       prev--;
> > +                       if (prev < 0)
> > +                               prev = CMDQ_MAX_TASK_IN_THREAD - 1;
> > +
> > +                       prev_task = thread->cur_task[prev];
> > +               }
> > +
> > +               if (!prev_task) {
> > +                       dev_err(dev,
> > +                               "invalid task state for reorder %d %d\n",
> > +                               index, loop);
> > +                       status = -EFAULT;
> > +                       break;
> > +               }
> > +
> > +               /* insert this task */
> > +               if (loop <= 1) {
> > +                       thread->cur_task[index] = task;
> > +                       /* Jump: Absolute */
> > +                       prev_task->va_base[prev_task->num_cmd - 1] =
> > +                                       CMDQ_JUMP_BY_PA;
> > +                       /* Jump to here */
> > +                       prev_task->va_base[prev_task->num_cmd - 2] =
> > +                                       task->mva_base;
> > +
> > +                       /* re-fetch command buffer again. */
> > +                       cmdq_core_invalidate_hw_fetched_buffer(
> > +                                       gce_base_va, tid);
> > +
> > +                       break;
> > +               }
> > +
> > +               if (prev_task->priority < task->priority) {
> > +                       /* new task has higher priority */
> > +
> > +                       thread->cur_task[index] = prev_task;
> > +                       prev_task->va_base[prev_task->num_cmd - 1] =
> > +                                       task->va_base[task->num_cmd - 1];
> > +                       prev_task->va_base[prev_task->num_cmd - 2] =
> > +                                       task->va_base[task->num_cmd - 2];
> > +
> > +                       /* Boot priority for the task */
> > +                       prev_task->priority += CMDQ_MIN_AGE_VALUE;
> > +                       prev_task->reorder++;
> > +
> > +                       thread->cur_task[prev] = task;
> > +                       /* Jump: Absolute */
> > +                       task->va_base[task->num_cmd - 1] = CMDQ_JUMP_BY_PA;
> > +                       /* Jump to here */
> > +                       task->va_base[task->num_cmd - 2] = prev_task->mva_base;
> > +
> > +                       /* re-fetch command buffer again. */
> > +                       cmdq_core_invalidate_hw_fetched_buffer(
> > +                                       gce_base_va, tid);
> > +
> > +                       if (last == task)
> > +                               last = prev_task;
> > +               } else {
> > +                       /* previous task has higher priority */
> > +
> > +                       thread->cur_task[index] = task;
> > +                       /* Jump: Absolute */
> > +                       prev_task->va_base[prev_task->num_cmd - 1] =
> > +                                       CMDQ_JUMP_BY_PA;
> > +                       /* Jump to here */
> > +                       prev_task->va_base[prev_task->num_cmd - 2] =
> > +                                       task->mva_base;
> > +
> > +                       /* re-fetch command buffer again. */
> > +                       cmdq_core_invalidate_hw_fetched_buffer(
> > +                                       gce_base_va, tid);
> > +
> > +                       break;
> > +               }
> > +       }
> > +
> > +       return status;
> > +}
> > +
> > +static int cmdq_task_exec_async_impl(struct cmdq_task *task, int tid)
> > +{
> > +       struct cmdq *cqctx = task->cqctx;
> > +       struct device *dev = cqctx->dev;
> > +       void __iomem *gce_base_va = cqctx->base_va;
> > +       int status;
> > +       struct cmdq_thread *thread;
> > +       struct cmdq_task *last_task;
> > +       unsigned long flags;
> > +       int loop;
> > +       int minimum;
> > +       int cookie;
> > +       int thread_prio;
> > +
> > +       status = 0;
> > +       thread = &cqctx->thread[tid];
> > +
> > +       spin_lock_irqsave(&cqctx->exec_lock, flags);
> > +
> > +       /* update task's thread info */
> > +       task->thread = tid;
> > +       task->irq_flag = 0;
> > +       task->task_state = TASK_STATE_BUSY;
> > +
> > +       if (thread->task_count <= 0) {
> > +               bool is_prefetch;
> > +
> > +               if (cmdq_thread_reset(cqctx, tid) < 0) {
> 
> Do we really have to reset with the spin lock held and irqs disabled?
> This could take a while, right?

Regarding the exec_lock spinlock, I will rewrite it, but I need more
time to make sure the new method is correct.

> > +                       spin_unlock_irqrestore(&cqctx->exec_lock, flags);
> > +                       return -EFAULT;
> > +               }
> > +
> > +               writel(CMDQ_THR_NO_TIMEOUT,
> > +                      gce_base_va + CMDQ_THR_INST_CYCLES_OFFSET +
> > +                      CMDQ_THR_SHIFT * tid);
> > +
> > +               is_prefetch = cmdq_scenario_is_prefetch(task->scenario);
> > +               if (is_prefetch) {
> > +                       writel(CMDQ_THR_PREFETCH_EN,
> > +                              gce_base_va + CMDQ_THR_PREFETCH_OFFSET +
> > +                              CMDQ_THR_SHIFT * tid);
> > +               }
> > +
> > +               thread_prio = CMDQ_THR_PRIO_DISPLAY_CONFIG;
> > +               writel(task->mva_base,
> > +                      gce_base_va + CMDQ_THR_CURR_ADDR_OFFSET +
> > +                      CMDQ_THR_SHIFT * tid);
> > +               writel(task->mva_base + task->command_size,
> > +                      gce_base_va + CMDQ_THR_END_ADDR_OFFSET +
> > +                      CMDQ_THR_SHIFT * tid);
> > +               writel(thread_prio & CMDQ_THR_PRIORITY_MASK,
> > +                      gce_base_va + CMDQ_THR_CFG_OFFSET +
> > +                      CMDQ_THR_SHIFT * tid);
> > +
> > +               writel(CMDQ_THR_IRQ_EN,
> > +                      gce_base_va + CMDQ_THR_IRQ_ENABLE_OFFSET +
> > +                      CMDQ_THR_SHIFT * tid);
> > +
> > +               minimum = cmdq_thread_get_cookie(gce_base_va, tid);
> > +               cmdq_thread_insert_task_by_cookie(
> > +                               thread, task, (minimum + 1), true);
> > +
> > +               /* verify that we don't corrupt EOC + JUMP pattern */
> > +               cmdq_core_verfiy_task_command_end(task);
> > +
> > +               /* enable HW thread */
> > +               writel(CMDQ_THR_ENABLED,
> > +                      gce_base_va + CMDQ_THR_ENABLE_TASK_OFFSET +
> > +                      CMDQ_THR_SHIFT * tid);
> > +       } else {
> > +               unsigned long curr_pa, end_pa;
> > +
> > +               status = cmdq_thread_suspend(cqctx, tid);
> > +               if (status < 0) {
> > +                       spin_unlock_irqrestore(&cqctx->exec_lock, flags);
> > +                       return status;
> > +               }
> > +
> > +               writel(CMDQ_THR_NO_TIMEOUT,
> > +                      gce_base_va + CMDQ_THR_INST_CYCLES_OFFSET +
> > +                      CMDQ_THR_SHIFT * tid);
> > +
> > +               cookie = thread->next_cookie;
> > +
> > +               /*
> > +                * Boundary case tested: EOC have been executed,
> > +                *                       but JUMP is not executed
> > +                * Thread PC: 0x9edc0dd8, End: 0x9edc0de0,
> > +                * Curr Cookie: 1, Next Cookie: 2
> > +                * PC = END - 8, EOC is executed
> > +                * PC = END - 0, All CMDs are executed
> > +                */
> > +
> > +               curr_pa = (unsigned long)readl(gce_base_va +
> > +                                              CMDQ_THR_CURR_ADDR_OFFSET +
> > +                                              CMDQ_THR_SHIFT * tid);
> > +               end_pa = (unsigned long)readl(gce_base_va +
> > +                                             CMDQ_THR_END_ADDR_OFFSET +
> > +                                             CMDQ_THR_SHIFT * tid);
> > +               if ((curr_pa == end_pa - 8) || (curr_pa == end_pa - 0)) {
> > +                       /* set to task directly */
> > +                       writel(task->mva_base,
> > +                              gce_base_va + CMDQ_THR_CURR_ADDR_OFFSET +
> > +                              CMDQ_THR_SHIFT * tid);
> > +                       writel(task->mva_base + task->command_size,
> > +                              gce_base_va + CMDQ_THR_END_ADDR_OFFSET +
> > +                              CMDQ_THR_SHIFT * tid);
> > +                       thread->cur_task[cookie % CMDQ_MAX_TASK_IN_THREAD] = task;
> > +                       thread->task_count++;
> > +               } else {
> > +                       /* Current task that should be processed */
> > +                       minimum = cmdq_thread_get_cookie(gce_base_va, tid) + 1;
> > +                       if (minimum > CMDQ_MAX_COOKIE_VALUE)
> > +                               minimum = 0;
> > +
> > +                       /* Calculate loop count to adjust the tasks' order */
> > +                       if (minimum <= cookie)
> > +                               loop = cookie - minimum;
> > +                       else
> > +                               /* Counter wrapped */
> > +                               loop = (CMDQ_MAX_COOKIE_VALUE - minimum + 1) +
> > +                                      cookie;
> > +
> > +                       if (loop < 0) {
> > +                               dev_err(dev, "reorder fail:\n");
> > +                               dev_err(dev, "  task count=%d\n", loop);
> > +                               dev_err(dev, "  thread=%d\n", tid);
> > +                               dev_err(dev, "  next cookie=%d\n",
> > +                                       thread->next_cookie);
> > +                               dev_err(dev, "  (HW) next cookie=%d\n",
> > +                                       minimum);
> > +                               dev_err(dev, "  task=0x%p\n", task);
> > +
> > +                               spin_unlock_irqrestore(&cqctx->exec_lock,
> > +                                                      flags);
> > +                               return -EFAULT;
> > +                       }
> > +
> > +                       if (loop > CMDQ_MAX_TASK_IN_THREAD)
> > +                               loop %= CMDQ_MAX_TASK_IN_THREAD;
> > +
> > +                       /*
> > +                        * By default, task is the last task,
> > +                        * and insert [cookie % CMDQ_MAX_TASK_IN_THREAD]
> > +                        */
> > +                       last_task = task;       /* Default last task */
> > +
> > +                       status = cmdq_task_insert_into_thread(
> > +                                       last_task, task, tid, loop);
> > +                       if (status < 0) {
> > +                               spin_unlock_irqrestore(
> > +                                               &cqctx->exec_lock, flags);
> > +                               dev_err(dev,
> > +                                       "invalid task state for reorder.\n");
> > +                               return status;
> > +                       }
> > +
> > +                       smp_mb(); /* modify jump before enable thread */
> > +
> > +                       writel(task->mva_base + task->command_size,
> > +                              gce_base_va + CMDQ_THR_END_ADDR_OFFSET +
> > +                              CMDQ_THR_SHIFT * tid);
> > +                       thread->task_count++;
> > +               }
> > +
> > +               thread->next_cookie += 1;
> > +               if (thread->next_cookie > CMDQ_MAX_COOKIE_VALUE)
> > +                       thread->next_cookie = 0;
> > +
> > +               /* verify that we don't corrupt EOC + JUMP pattern */
> > +               cmdq_core_verfiy_task_command_end(task);
> > +
> > +               /* resume HW thread */
> > +               cmdq_thread_resume(cqctx, tid);
> > +       }
> > +
> > +       spin_unlock_irqrestore(&cqctx->exec_lock, flags);
> > +
> > +       return status;
> > +}
> 
> [snip]
> 
> > +static int cmdq_core_resumed_notifier(struct cmdq *cqctx)
> > +{
> > +       /*
> > +        * Note:
> > +        * delay resume timing until process-unfreeze done in order to
> > +        * ensure M4U driver had restore M4U port setting
> > +        */
> > +
> > +       unsigned long flags = 0L;
> > +
> > +       spin_lock_irqsave(&cqctx->thread_lock, flags);
> > +       cqctx->suspended = false;
> > +
> > +       /*
> > +        * during suspending, there may be queued tasks.
> > +        * we should process them if any.
> > +        */
> > +       if (!work_pending(&cqctx->task_consume_wait_queue_item))
> > +               /* we use system global work queue (kernel thread kworker/n) */
> > +               queue_work(cqctx->task_consume_wq,
> > +                          &cqctx->task_consume_wait_queue_item);
> > +
> > +       spin_unlock_irqrestore(&cqctx->thread_lock, flags);
> > +
> > +       return 0;
> > +}
> > +
> > +static int cmdq_task_exec_async_with_retry(struct cmdq_task *task, int tid)
> 
> Why do we want to automatically retry cmds that failed?

will remove

> > +{
> > +       struct device *dev = task->cqctx->dev;
> > +       int retry;
> > +       int status;
> > +
> > +       retry = 0;
> > +       status = -EFAULT;
> > +       do {
> > +               cmdq_core_verfiy_task_command_end(task);
> > +
> > +               status = cmdq_task_exec_async_impl(task, tid);
> > +
> > +               if (status >= 0)
> > +                       break;
> > +
> > +               if ((task->task_state == TASK_STATE_KILLED) ||
> > +                   (task->task_state == TASK_STATE_ERROR)) {
> > +                       dev_err(dev, "cmdq_task_exec_async_impl fail\n");
> > +                       status = -EFAULT;
> > +                       break;
> > +               }
> > +
> > +               retry++;
> > +       } while (retry < CMDQ_MAX_RETRY_COUNT);
> > +
> > +       return status;
> > +}
> > +
> > +static int cmdq_core_get_time_in_ms(unsigned long long start,
> > +                                   unsigned long long end)
> > +{
> > +       unsigned long long _duration = end - start;
> > +
> > +       do_div(_duration, 1000000);
> > +       return (int)_duration;
> > +}
> > +
> > +static void cmdq_core_consume_waiting_list(struct work_struct *work)
> > +{
> > +       struct list_head *p, *n = NULL;
> > +       bool thread_acquired;
> > +       unsigned long long consume_time;
> > +       int waiting_time_ms;
> > +       bool need_log;
> > +       struct cmdq *cqctx;
> > +       struct device *dev;
> > +
> > +       cqctx = container_of(work, struct cmdq,
> > +                            task_consume_wait_queue_item);
> > +       dev = cqctx->dev;
> > +
> > +       /*
> > +        * when we're suspending,
> > +        * do not execute any tasks. delay & hold them.
> > +        */
> > +       if (cqctx->suspended)
> > +               return;
> > +
> > +       consume_time = sched_clock();
> > +
> > +       mutex_lock(&cqctx->task_mutex);
> > +
> > +       thread_acquired = false;
> > +
> > +       /* scan and remove (if executed) waiting tasks */
> > +       list_for_each_safe(p, n, &cqctx->task_wait_list) {
> > +               struct cmdq_task *task;
> > +               struct cmdq_thread *thread = NULL;
> > +               int tid;
> > +               int status;
> > +               enum cmdq_hw_thread_priority thread_prio;
> > +
> > +               task = list_entry(p, struct cmdq_task, list_entry);
> > +
> > +               thread_prio = CMDQ_THR_PRIO_DISPLAY_CONFIG;
> > +
> > +               waiting_time_ms = cmdq_core_get_time_in_ms(
> > +                               task->submit, consume_time);
> > +               need_log = waiting_time_ms >= CMDQ_PREALARM_TIMEOUT_MS;
> > +               /* allocate hw thread */
> > +               tid = cmdq_core_acquire_thread(cqctx, task->scenario);
> > +               if (tid != CMDQ_INVALID_THREAD)
> > +                       thread = &cqctx->thread[tid];
> > +
> > +               if (tid == CMDQ_INVALID_THREAD || !thread) {
> 
> Move computation of consume_time, waiting_time_ms & need_log here, when
> printing a warning message, since this is the only place you need them.
> Also, why bother converting to ms?  Just leave time in unsigned long long.

will remove the conversion

> BTW, why sched_clock() instead of ktime?

will use ktime
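
e.g., a small sketch, assuming task->submit becomes a ktime_t:

	/* at submit time */
	task->submit = ktime_get();

	/* later, when deciding whether to warn */
	s64 waiting_ms = ktime_ms_delta(ktime_get(), task->submit);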

> > +                       /* have to wait, remain in wait list */
> > +                       dev_warn(dev, "acquire thread fail, need to wait\n");
> > +                       if (need_log) /* task wait too long */
> > +                               dev_warn(dev, "waiting:%dms, task:0x%p\n",
> > +                                        waiting_time_ms, task);
> > +                       continue;
> > +               }
> > +
> > +               /* some task is ready to run */
> > +               thread_acquired = true;
> > +
> > +               /*
> > +                * start execution
> > +                * remove from wait list and put into active list
> > +                */
> > +               list_del_init(&task->list_entry);
> > +               list_add_tail(&task->list_entry,
> > +                             &cqctx->task_active_list);
> 
> list_move_tail(&task->list_entry, &cqctx->task_active_list);

will do

> > +
> > +               /* run task on thread */
> > +               status = cmdq_task_exec_async_with_retry(task, tid);
> > +               if (status < 0) {
> > +                       dev_err(dev, "%s fail, release task 0x%p\n",
> > +                               __func__, task);
> > +                       cmdq_task_remove_thread(task);
> > +                       cmdq_task_release_unlocked(task);
> > +                       task = NULL;
> > +               }
> > +       }
> > +
> > +       if (thread_acquired) {
> > +               /*
> > +                * notify some task's sw thread to change their waiting state.
> > +                * (if they have already called cmdq_task_wait_and_release())
> > +                */
> > +               wake_up_all(&cqctx->thread_dispatch_queue);
> > +       }
> > +
> > +       mutex_unlock(&cqctx->task_mutex);
> > +}
> 
> [snip]
> 
> 
> > +static int cmdq_task_wait_result(struct cmdq_task *task, int tid, int wait_q)
> > +{
> > +       struct cmdq *cqctx = task->cqctx;
> > +       struct cmdq_thread *thread = &cqctx->thread[tid];
> > +       int status = 0;
> > +       unsigned long flags;
> > +       struct cmdq_task_error_report error_report = {
> > +               .throw_err = false,
> > +               .module = NULL,
> > +               .inst_a = 0,
> > +               .inst_b = 0,
> > +               .irq_flag = 0,
> > +       };
> 
> This should be sufficient:
>  struct cmdq_task_error_report error_report = { 0 };

will do

> > +
> > +       /*
> > +        * Note that although we disable IRQ, HW continues to execute
> > +        * so it's possible to have pending IRQ
> > +        */
> > +       spin_lock_irqsave(&cqctx->exec_lock, flags);
> > +
> > +       if (task->task_state != TASK_STATE_DONE)
> > +               status = cmdq_task_handle_error_result(
> > +                               task, tid, wait_q, &error_report);
> 
> This error handling does too much with the spin lock held and irqs disabled.

Regarding the exec_lock spinlock, I will rewrite it.

> > +
> > +       if (thread->task_count <= 0)
> > +               cmdq_thread_disable(cqctx, tid);
> > +       else
> > +               cmdq_thread_resume(cqctx, tid);
> > +
> > +       spin_unlock_irqrestore(&cqctx->exec_lock, flags);
> > +
> > +       if (error_report.throw_err) {
> > +               u32 op = error_report.inst_a >> CMDQ_OP_CODE_SHIFT;
> > +
> > +               switch (op) {
> > +               case CMDQ_CODE_WFE:
> > +                       dev_err(cqctx->dev,
> > +                               "%s in CMDQ IRQ:0x%02x, INST:(0x%08x, 0x%08x), OP:WAIT EVENT:%s\n",
> > +                               error_report.module, error_report.irq_flag,
> > +                               error_report.inst_a, error_report.inst_b,
> > +                               cmdq_event_get_name(error_report.inst_a &
> > +                                                   CMDQ_ARG_A_MASK));
> > +                       break;
> > +               default:
> > +                       dev_err(cqctx->dev,
> > +                               "%s in CMDQ IRQ:0x%02x, INST:(0x%08x, 0x%08x), OP:%s\n",
> > +                               error_report.module, error_report.irq_flag,
> > +                               error_report.inst_a, error_report.inst_b,
> > +                               cmdq_core_parse_op(op));
> > +                       break;
> > +               }
> > +       }
> > +
> > +       return status;
> > +}
> > +
> > +static int cmdq_task_wait_done(struct cmdq_task *task)
> > +{
> > +       struct cmdq *cqctx = task->cqctx;
> > +       struct device *dev = cqctx->dev;
> > +       int wait_q;
> > +       int tid;
> > +       unsigned long timeout = msecs_to_jiffies(
> > +                       CMDQ_ACQUIRE_THREAD_TIMEOUT_MS);
> > +
> > +       tid = task->thread;
> > +       if (tid == CMDQ_INVALID_THREAD) {
> 
> You don't need this first "if" check.
> Just do the wait_event_timeout() here.
> If tid != CMDQ_INVALID_THREAD, it will return immediately.

will do
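
i.e., something like (wait_event_timeout() evaluates the condition
before sleeping, so a task with a valid thread returns immediately):

	wait_q = wait_event_timeout(cqctx->thread_dispatch_queue,
				    task->thread != CMDQ_INVALID_THREAD,
				    timeout);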

> > +               /*
> > +                * wait for acquire thread
> > +                * (this is done by cmdq_core_consume_waiting_list);
> > +                */
> > +               wait_q = wait_event_timeout(
> > +                               cqctx->thread_dispatch_queue,
> > +                               (task->thread != CMDQ_INVALID_THREAD), timeout);
> > +
> > +               if (!wait_q || task->thread == CMDQ_INVALID_THREAD) {
> 
> Why "task->thread == CMDQ_INVALID_THREAD" check here?  It is either
> not needed or racy.

will remove

> > +                       mutex_lock(&cqctx->task_mutex);
> > +
> > +                       /*
> > +                        * it's possible that the task was just consumed now.
> > +                        * so check again.
> > +                        */
> > +                       if (task->thread == CMDQ_INVALID_THREAD) {
> > +                               /*
> > +                                * Task may have released,
> > +                                * or starved to death.
> > +                                */
> > +                               dev_err(dev,
> > +                                       "task(0x%p) timeout with invalid thread\n",
> > +                                       task);
> > +
> > +                               /*
> > +                                * remove from waiting list,
> > +                                * so that it won't be consumed in the future
> > +                                */
> > +                               list_del_init(&task->list_entry);
> > +
> > +                               mutex_unlock(&cqctx->task_mutex);
> > +                               return -EINVAL;
> > +                       }
> > +
> > +                       /* valid thread, so we keep going */
> > +                       mutex_unlock(&cqctx->task_mutex);
> > +               }
> > +       }
> > +
> > +       tid = task->thread;
> > +       if (tid < 0 || tid >= CMDQ_MAX_THREAD_COUNT) {
> > +               dev_err(dev, "invalid thread %d in %s\n", tid, __func__);
> > +               return -EINVAL;
> > +       }
> > +
> > +       /* start to wait */
> > +       wait_q = wait_event_timeout(task->cqctx->wait_queue[tid],
> > +                                   (task->task_state != TASK_STATE_BUSY &&
> > +                                    task->task_state != TASK_STATE_WAITING),
> > +                                   msecs_to_jiffies(CMDQ_DEFAULT_TIMEOUT_MS));
> > +       if (!wait_q)
> > +               dev_dbg(dev, "timeout!\n");
> > +
> > +       /* wake up and continue */
> > +       return cmdq_task_wait_result(task, tid, wait_q);
> > +}
> > +
> > +static int cmdq_task_wait_and_release(struct cmdq_task *task)
> > +{
> > +       struct cmdq *cqctx;
> > +       int status;
> > +       int tid;
> > +
> > +       if (!task) {
> > +               pr_err("%s err ptr=0x%p\n", __func__, task);
> > +               return -EFAULT;
> > +       }
> > +
> > +       if (task->task_state == TASK_STATE_IDLE) {
> > +               pr_err("%s task=0x%p is IDLE\n", __func__, task);
> > +               return -EFAULT;
> > +       }
> > +
> > +       cqctx = task->cqctx;
> > +
> > +       /* wait for task finish */
> > +       tid = task->thread;
> 
> tid is not used.

will remove

> > +       status = cmdq_task_wait_done(task);
> > +
> > +       /* release */
> > +       cmdq_task_remove_thread(task);
> > +       cmdq_task_release_internal(task);
> > +
> > +       return status;
> > +}
> > +
> > +static void cmdq_core_auto_release_work(struct work_struct *work_item)
> > +{
> > +       struct cmdq_task *task;
> > +       int status;
> > +       struct cmdq_task_cb cb;
> > +
> > +       task = container_of(work_item, struct cmdq_task, auto_release_work);
> > +       memcpy(&cb, &task->cb, sizeof(cb));
> 
> Just:
>   cb = task->cb;

will do

> But, why do you need to make a copy?
> Can't you just access task->cb directly?

This is because the task will be released before the callbacks are
called, so we take a copy first.

> > +       status = cmdq_task_wait_and_release(task);
> > +       task = NULL;
> 
> task is not used again, don't set to NULL.

will do

> > +
> > +       /* isr fail, so call isr_cb here to prevent lock */
> > +       if (status && cb.isr_cb)
> > +               cb.isr_cb(cb.isr_data);
> > +
> > +       if (cb.done_cb)
> > +               cb.done_cb(cb.done_data);
> > +}
> 
> [snip]
> 
> 
> > +static irqreturn_t cmdq_irq_handler(int irq, void *dev)
> > +{
> > +       struct cmdq *cqctx = dev;
> > +       int i;
> > +       u32 irq_status;
> > +       bool handled = false;
> > +
> > +       if (cqctx->irq == irq) {
> > +               irq_status = readl(cqctx->base_va +
> > +                                  CMDQ_CURR_IRQ_STATUS_OFFSET);
> > +               irq_status &= CMDQ_IRQ_MASK;
> > +               for (i = 0;
> > +                    (irq_status != CMDQ_IRQ_MASK) && i < CMDQ_MAX_THREAD_COUNT;
> > +                    i++) {
> > +                       /* STATUS bit set to 0 means IRQ asserted */
> > +                       if (irq_status & BIT(i))
> > +                               continue;
> > +
> > +                       /*
> > +                        * We mark irq_status to 1 to denote finished
> > +                        * processing, and we can early-exit if no more
> > +                        * threads being asserted.
> > +                        */
> > +                       irq_status |= BIT(i);
> > +
> > +                       cmdq_core_handle_irq(cqctx, i);
> > +                       handled = true;
> > +               }
> > +       }
> > +
> > +       if (handled) {
> > +               queue_work(cqctx->task_consume_wq,
> > +                          &cqctx->task_consume_wait_queue_item);
> 
> Since you need to queue work anyway, why not just do this all in a threaded
> irq and use a mutex instead of a spinlock for exec_lock.

I will address this together with the exec_lock spinlock issue.
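
For reference, a minimal sketch of the threaded-IRQ registration
(cmdq_irq_thread_fn is a placeholder name):

	/* NULL hard handler; IRQF_ONESHOT keeps the line masked
	 * until the thread function finishes */
	ret = devm_request_threaded_irq(dev, cqctx->irq, NULL,
					cmdq_irq_thread_fn, IRQF_ONESHOT,
					"mtk_cmdq", cqctx);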

> > +               return IRQ_HANDLED;
> > +       }
> > +
> > +       return IRQ_NONE;
> > +}
> > +
> > +static int cmdq_core_initialize(struct platform_device *pdev,
> > +                               struct cmdq **cqctx)
> > +{
> > +       struct cmdq *lcqctx; /* local cmdq context */
> > +       int i;
> > +       int ret = 0;
> > +
> > +       lcqctx = devm_kzalloc(&pdev->dev, sizeof(*lcqctx), GFP_KERNEL);
> > +
> > +       /* save dev */
> > +       lcqctx->dev = &pdev->dev;
> > +
> > +       /* initial cmdq device related data */
> > +       ret = cmdq_dev_init(pdev, lcqctx);
> > +       if (ret) {
> > +               dev_err(&pdev->dev, "failed to init cmdq device\n");
> > +               goto fail_dev;
> > +       }
> > +
> > +       /* initial mutex, spinlock */
> > +       mutex_init(&lcqctx->task_mutex);
> > +       mutex_init(&lcqctx->clock_mutex);
> > +       spin_lock_init(&lcqctx->thread_lock);
> > +       spin_lock_init(&lcqctx->exec_lock);
> > +
> > +       /* initial wait queue for notification */
> > +       for (i = 0; i < ARRAY_SIZE(lcqctx->wait_queue); i++)
> > +               init_waitqueue_head(&lcqctx->wait_queue[i]);
> > +       init_waitqueue_head(&lcqctx->thread_dispatch_queue);
> > +
> > +       /* create task pool */
> > +       lcqctx->task_cache = kmem_cache_create(
> > +                       CMDQ_DRIVER_DEVICE_NAME "_task",
> > +                       sizeof(struct cmdq_task),
> > +                       __alignof__(struct cmdq_task),
> > +                       SLAB_POISON | SLAB_HWCACHE_ALIGN | SLAB_RED_ZONE,
> > +                       &cmdq_task_ctor);
> > +
> > +       /* initialize task lists */
> > +       INIT_LIST_HEAD(&lcqctx->task_free_list);
> > +       INIT_LIST_HEAD(&lcqctx->task_active_list);
> > +       INIT_LIST_HEAD(&lcqctx->task_wait_list);
> > +       INIT_WORK(&lcqctx->task_consume_wait_queue_item,
> > +                 cmdq_core_consume_waiting_list);
> > +
> > +       /* initialize command buffer pool */
> > +       ret = cmdq_cmd_buf_pool_init(lcqctx);
> > +       if (ret) {
> > +               dev_err(&pdev->dev, "failed to init command buffer pool\n");
> > +               goto fail_cmd_buf_pool;
> > +       }
> > +
> > +       lcqctx->task_auto_release_wq = alloc_ordered_workqueue(
> > +                       "%s", WQ_MEM_RECLAIM | WQ_HIGHPRI, "cmdq_auto_release");
> > +       lcqctx->task_consume_wq = alloc_ordered_workqueue(
> > +                       "%s", WQ_MEM_RECLAIM | WQ_HIGHPRI, "cmdq_task");
> > +
> > +       *cqctx = lcqctx;
> > +       return ret;
> > +
> > +fail_cmd_buf_pool:
> > +       destroy_workqueue(lcqctx->task_auto_release_wq);
> > +       destroy_workqueue(lcqctx->task_consume_wq);
> > +       kmem_cache_destroy(lcqctx->task_cache);
> > +
> > +fail_dev:
> > +       return ret;
> > +}
> > +
> > +static int cmdq_rec_realloc_cmd_buffer(struct cmdq_rec *handle, u32 size)
> > +{
> > +       void *new_buf;
> > +
> > +       new_buf = krealloc(handle->buf_ptr, size, GFP_KERNEL | __GFP_ZERO);
> > +       if (!new_buf)
> > +               return -ENOMEM;
> > +       handle->buf_ptr = new_buf;
> > +       handle->buf_size = size;
> > +       return 0;
> > +}
> > +
> > +static struct cmdq *cmdq_rec_get_valid_ctx(struct cmdq_rec *handle)
> > +{
> 
> Make the callers of all cmdq_rec_*() APIs pass in a struct cmdq * as well, and
> get rid of all of this NULL checking.

We may have multiple cmdq_rec instances, but only one cmdq.
If we used cmdq as the parameter, we would still need to pass an ID or
index to identify the cmdq_rec currently in use, and we would have to
maintain an array or list of cmdq_rec inside cmdq.
So I think taking cmdq_rec is the better interface.

I will remove all of this NULL checking.
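
With the NULL checks gone, the helper reduces to a plain accessor and
can probably be dropped in favor of using handle->cqctx directly
(sketch):

        static struct cmdq *cmdq_rec_get_valid_ctx(struct cmdq_rec *handle)
        {
                return handle->cqctx;
        }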

> > +       if (!handle) {
> > +               WARN_ON(1);
> > +               return NULL;
> > +       }
> > +
> > +       WARN_ON(!handle->cqctx);
> > +       return handle->cqctx;
> > +}
> > +
> > +static int cmdq_rec_stop_running_task(struct cmdq_rec *handle)
> > +{
> > +       int status;
> > +
> > +       status = cmdq_core_release_task(handle->running_task_ptr);
> > +       handle->running_task_ptr = NULL;
> > +       return status;
> > +}
> > +
> > +int cmdq_rec_create(struct platform_device *pdev,
> 
> Use struct device *, not struct platform_device *.

will do
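
i.e. roughly this (untested sketch of the reworked entry point; the
body is kept as in the current patch):

        int cmdq_rec_create(struct device *dev, enum cmdq_scenario scenario,
                            struct cmdq_rec **handle_ptr)
        {
                struct cmdq *cqctx = dev_get_drvdata(dev);
                struct cmdq_rec *handle;
                int ret;

                if (!cqctx) {
                        dev_err(dev, "cmdq context is NULL\n");
                        return -EINVAL;
                }

                if (scenario < 0 || scenario >= CMDQ_MAX_SCENARIO_COUNT) {
                        dev_err(dev, "unknown scenario type %d\n", scenario);
                        return -EINVAL;
                }

                handle = kzalloc(sizeof(*handle), GFP_KERNEL);
                if (!handle)
                        return -ENOMEM;

                handle->cqctx = cqctx;
                handle->scenario = scenario;
                handle->engine_flag = cmdq_scenario_get_flag(cqctx, scenario);
                handle->priority = CMDQ_THR_PRIO_NORMAL;

                ret = cmdq_rec_realloc_cmd_buffer(handle,
                                                  CMDQ_INITIAL_CMD_BLOCK_SIZE);
                if (ret) {
                        kfree(handle);
                        return ret;
                }

                *handle_ptr = handle;
                return 0;
        }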

> Although, I think it would be better if this API was:
> 
> struct cmdq_rec *cmdq_rec_create(struct cmdq *cmdq, enum cmdq_scenario scenario)

Please see the previous explanation.

> > +                   enum cmdq_scenario scenario,
> > +                   struct cmdq_rec **handle_ptr)
> > +{
> > +       struct cmdq *cqctx;
> > +       struct device *dev = &pdev->dev;
> > +       struct cmdq_rec *handle;
> > +       int ret;
> > +
> > +       cqctx = platform_get_drvdata(pdev);
> > +       if (!cqctx) {
> > +               dev_err(dev, "cmdq context is NULL\n");
> > +               return -EINVAL;
> > +       }
> > +
> > +       if (scenario < 0 || scenario >= CMDQ_MAX_SCENARIO_COUNT) {
> > +               dev_err(dev, "unknown scenario type %d\n", scenario);
> > +               return -EINVAL;
> > +       }
> > +
> > +       handle = kzalloc(sizeof(*handle), GFP_KERNEL);
> > +       if (!handle)
> > +               return -ENOMEM;
> > +
> > +       handle->cqctx = platform_get_drvdata(pdev);
> > +       handle->scenario = scenario;
> > +       handle->engine_flag = cmdq_scenario_get_flag(cqctx, scenario);
> > +       handle->priority = CMDQ_THR_PRIO_NORMAL;
> > +
> > +       ret = cmdq_rec_realloc_cmd_buffer(handle, CMDQ_INITIAL_CMD_BLOCK_SIZE);
> > +       if (ret) {
> > +               kfree(handle);
> > +               return ret;
> > +       }
> > +
> > +       *handle_ptr = handle;
> > +
> > +       return 0;
> > +}
> > +EXPORT_SYMBOL(cmdq_rec_create);
> 
> [snip]
> 
> > +static int cmdq_suspend(struct device *dev)
> > +{
> > +       struct cmdq *cqctx;
> > +       void __iomem *gce_base_va;
> > +       unsigned long flags = 0L;
> > +       u32 exec_threads;
> > +       int ref_count;
> > +       bool kill_tasks = false;
> > +       struct cmdq_task *task;
> > +       struct list_head *p;
> > +       int i;
> > +
> > +       cqctx = dev_get_drvdata(dev);
> > +       gce_base_va = cqctx->base_va;
> > +       exec_threads = readl(gce_base_va + CMDQ_CURR_LOADED_THR_OFFSET);
> > +       ref_count = cqctx->thread_usage;
> > +
> > +       if ((ref_count > 0) || (exec_threads & CMDQ_THR_EXECUTING)) {
> > +               dev_err(dev,
> > +                       "[SUSPEND] other running, kill tasks. threads:0x%08x, ref:%d\n",
> > +                       exec_threads, ref_count);
> > +               kill_tasks = true;
> > +       }
> > +
> > +       /*
> > +        * We need to ensure the system is ready to suspend,
> > +        * so kill all running CMDQ tasks
> > +        * and release HW engines.
> > +        */
> > +       if (kill_tasks) {
> > +               /* remove all active task from thread */
> > +               dev_err(dev, "[SUSPEND] remove all active tasks\n");
> > +               list_for_each(p, &cqctx->task_active_list) {
> > +                       task = list_entry(p, struct cmdq_task, list_entry);
> > +                       if (task->thread != CMDQ_INVALID_THREAD) {
> > +                               spin_lock_irqsave(&cqctx->exec_lock, flags);
> > +                               cmdq_thread_force_remove_task(
> > +                                               task, task->thread);
> > +                               task->task_state = TASK_STATE_KILLED;
> > +                               spin_unlock_irqrestore(
> > +                                               &cqctx->exec_lock, flags);
> > +
> > +                               /*
> > +                                * release all thread and
> > +                                * mark all active tasks as "KILLED"
> > +                                * (so that thread won't release again)
> > +                                */
> > +                               dev_err(dev,
> > +                                       "[SUSPEND] release all threads and HW clocks\n");
> > +                               cmdq_task_remove_thread(task);
> > +                       }
> > +               }
> > +
> > +               /* disable all HW thread */
> > +               dev_err(dev, "[SUSPEND] disable all HW threads\n");
> > +               for (i = 0; i < CMDQ_MAX_THREAD_COUNT; i++)
> > +                       cmdq_thread_disable(cqctx, i);
> > +
> > +               /* reset all cmdq_thread */
> > +               memset(&cqctx->thread[0], 0, sizeof(cqctx->thread));
> 
> This is confusing, because it actually sets the tids to 0,
> which is a valid thread.  Do you really want this?:
> 
> memset(cqctx->thread, -1, sizeof(cqctx->thread));
> 
> Which would set all the threads to CMDQ_INVALID_THREAD?

These are struct cmdq_thread entries, not thread IDs, so the memset
resets the per-thread bookkeeping rather than assigning tid 0 to every
slot.
I will align the naming on cmdq_thread to prevent confusion.
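
For example, a per-element form with a comment would make the intent
explicit (equivalent sketch):

        /* reset the per-HW-thread bookkeeping state, not any thread ID */
        for (i = 0; i < CMDQ_MAX_THREAD_COUNT; i++)
                memset(&cqctx->thread[i], 0, sizeof(cqctx->thread[i]));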

> > +       }
> > +
> > +       spin_lock_irqsave(&cqctx->thread_lock, flags);
> > +       cqctx->suspended = true;
> > +       spin_unlock_irqrestore(&cqctx->thread_lock, flags);
> > +
> > +       /* ALWAYS allow suspend */
> > +       return 0;
> > +}
> 
> [snip]
> 
> > diff --git a/include/soc/mediatek/cmdq.h b/include/soc/mediatek/cmdq.h
> > new file mode 100644
> > index 0000000..2ec349f
> > --- /dev/null
> > +++ b/include/soc/mediatek/cmdq.h
> > @@ -0,0 +1,225 @@
> > +/*
> > + * Copyright (c) 2015 MediaTek Inc.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License version 2 as
> > + * published by the Free Software Foundation.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + */
> > +
> > +#ifndef __MTK_CMDQ_H__
> > +#define __MTK_CMDQ_H__
> > +
> > +#include <linux/platform_device.h>
> > +#include <linux/types.h>
> > +
> > +enum cmdq_scenario {
> > +       CMDQ_SCENARIO_PRIMARY_DISP,
> > +       CMDQ_SCENARIO_SUB_DISP,
> > +       CMDQ_MAX_SCENARIO_COUNT
> > +};
> > +
> > +enum cmdq_hw_thread_priority {
> > +       CMDQ_THR_PRIO_NORMAL = 0, /* normal (low) priority */
> > +       CMDQ_THR_PRIO_DISPLAY_CONFIG = 3, /* display config (high) priority */
> > +       CMDQ_THR_PRIO_MAX = 7, /* maximum possible priority */
> > +};
> 
> We only ever use CMDQ_THR_PRIO_DISPLAY_CONFIG.
> I suggest you add support for priorities later in a follow-up patch when/if they
> are useful.

will remove

> > +
> > +/* events for CMDQ and display */
> > +enum cmdq_event {
> > +       /* Display start of frame(SOF) events */
> > +       CMDQ_EVENT_DISP_OVL0_SOF = 11,
> > +       CMDQ_EVENT_DISP_OVL1_SOF = 12,
> > +       CMDQ_EVENT_DISP_RDMA0_SOF = 13,
> > +       CMDQ_EVENT_DISP_RDMA1_SOF = 14,
> > +       CMDQ_EVENT_DISP_RDMA2_SOF = 15,
> > +       CMDQ_EVENT_DISP_WDMA0_SOF = 16,
> > +       CMDQ_EVENT_DISP_WDMA1_SOF = 17,
> > +       /* Display end of frame(EOF) events */
> > +       CMDQ_EVENT_DISP_OVL0_EOF = 39,
> > +       CMDQ_EVENT_DISP_OVL1_EOF = 40,
> > +       CMDQ_EVENT_DISP_RDMA0_EOF = 41,
> > +       CMDQ_EVENT_DISP_RDMA1_EOF = 42,
> > +       CMDQ_EVENT_DISP_RDMA2_EOF = 43,
> > +       CMDQ_EVENT_DISP_WDMA0_EOF = 44,
> > +       CMDQ_EVENT_DISP_WDMA1_EOF = 45,
> > +       /* Mutex end of frame(EOF) events */
> > +       CMDQ_EVENT_MUTEX0_STREAM_EOF = 53,
> > +       CMDQ_EVENT_MUTEX1_STREAM_EOF = 54,
> > +       CMDQ_EVENT_MUTEX2_STREAM_EOF = 55,
> > +       CMDQ_EVENT_MUTEX3_STREAM_EOF = 56,
> > +       CMDQ_EVENT_MUTEX4_STREAM_EOF = 57,
> > +       /* Display underrun events */
> > +       CMDQ_EVENT_DISP_RDMA0_UNDERRUN = 63,
> > +       CMDQ_EVENT_DISP_RDMA1_UNDERRUN = 64,
> > +       CMDQ_EVENT_DISP_RDMA2_UNDERRUN = 65,
> > +       /* Keep this at the end of HW events */
> > +       CMDQ_MAX_HW_EVENT_COUNT = 260,
> > +       /* GPR events */
> > +       CMDQ_SYNC_TOKEN_GPR_SET_0 = 400,
> > +       CMDQ_SYNC_TOKEN_GPR_SET_1 = 401,
> > +       CMDQ_SYNC_TOKEN_GPR_SET_2 = 402,
> > +       CMDQ_SYNC_TOKEN_GPR_SET_3 = 403,
> > +       CMDQ_SYNC_TOKEN_GPR_SET_4 = 404,
> > +       /* This is max event and also can be used as mask. */
> > +       CMDQ_SYNC_TOKEN_MAX = (0x1ff),
> > +       /* Invalid event */
> > +       CMDQ_SYNC_TOKEN_INVALID = (-1),
> > +};
> > +
> > +/* called after isr done or task done */
> > +typedef int (*cmdq_async_flush_cb)(void *data);
> > +
> > +struct cmdq_task;
> > +struct cmdq;
> > +
> > +struct cmdq_rec {
> > +       struct cmdq             *cqctx;
> > +       u64                     engine_flag;
> > +       int                     scenario;
> > +       u32                     block_size; /* command size */
> > +       void                    *buf_ptr;
> > +       u32                     buf_size;
> > +       /* running task after flush */
> > +       struct cmdq_task        *running_task_ptr;
> > +       /*
> > +        * HW thread priority
> > +        * high priority implies prefetch
> > +        */
> > +       enum cmdq_hw_thread_priority    priority;
> > +       bool                    finalized;
> > +       u32                     prefetch_count;
> > +};
> > +
> > +/**
> > + * cmdq_rec_create() - create command queue recorder handle
> 
> For all of these comments, I think you mean "command queue record",
> not recorder.

will rename to "record"

> Thanks,
> -Dan

Thanks,
HS Liao



