[PATCH 7/8] drm/panthor: Add support for entering and exiting protected mode
Boris Brezillon
boris.brezillon at collabora.com
Wed May 6 01:51:45 PDT 2026
On Tue, 5 May 2026 16:05:13 +0200
Ketil Johnsen <ketil.johnsen at arm.com> wrote:
Here comes the second part of the review :-).
> diff --git a/drivers/gpu/drm/panthor/panthor_gpu.c b/drivers/gpu/drm/panthor/panthor_gpu.c
> index 2ab444ee8c710..e93042eaf3fc8 100644
> --- a/drivers/gpu/drm/panthor/panthor_gpu.c
> +++ b/drivers/gpu/drm/panthor/panthor_gpu.c
> @@ -100,8 +100,11 @@ static void panthor_gpu_irq_handler(struct panthor_device *ptdev, u32 status)
> fault_status, panthor_exception_name(ptdev, fault_status & 0xFF),
> address);
> }
> - if (status & GPU_IRQ_PROTM_FAULT)
> + if (status & GPU_IRQ_PROTM_FAULT) {
> drm_warn(&ptdev->base, "GPU Fault in protected mode\n");
> + panthor_gpu_disable_protm_fault_interrupt(ptdev);
It's only used in a single place, so I'd just inline the content of
panthor_gpu_disable_protm_fault_interrupt() here. Also, I think
panthor_gpu_disable_protm_fault_interrupt() is not taking the right
lock (see below).
> + panthor_device_schedule_reset(ptdev);
> + }
>
> spin_lock(&ptdev->gpu->reqs_lock);
> if (status & ptdev->gpu->pending_reqs) {
> @@ -367,6 +370,10 @@ int panthor_gpu_soft_reset(struct panthor_device *ptdev)
> unsigned long flags;
>
> spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags);
> +
> + /** Re-enable the protm_irq_fault when reset is complete */
> + ptdev->gpu->irq.mask |= GPU_IRQ_PROTM_FAULT;
panthor_irq::mask should only be modified with the
panthor_irq::mask_lock held. Besides, we have a helper for
that:
panthor_gpu_irq_enable_events(&ptdev->gpu->irq, GPU_IRQ_PROTM_FAULT);
> +
> if (!drm_WARN_ON(&ptdev->base,
> ptdev->gpu->pending_reqs & GPU_IRQ_RESET_COMPLETED)) {
> ptdev->gpu->pending_reqs |= GPU_IRQ_RESET_COMPLETED;
> @@ -427,3 +434,8 @@ void panthor_gpu_resume(struct panthor_device *ptdev)
> panthor_hw_l2_power_on(ptdev);
> }
>
> +void panthor_gpu_disable_protm_fault_interrupt(struct panthor_device *ptdev)
> +{
> + scoped_guard(spinlock_irqsave, &ptdev->gpu->reqs_lock)
> + ptdev->gpu->irq.mask &= ~GPU_IRQ_PROTM_FAULT;
Same problem with the wrong lock being taken to modify the mask; also,
panthor_gpu_irq_disable_events() is probably a better option here.
> +}
> diff --git a/drivers/gpu/drm/panthor/panthor_gpu.h b/drivers/gpu/drm/panthor/panthor_gpu.h
> index 12c263a399281..ca66c73f543e6 100644
> --- a/drivers/gpu/drm/panthor/panthor_gpu.h
> +++ b/drivers/gpu/drm/panthor/panthor_gpu.h
> @@ -54,4 +54,10 @@ int panthor_gpu_soft_reset(struct panthor_device *ptdev);
> void panthor_gpu_power_changed_off(struct panthor_device *ptdev);
> int panthor_gpu_power_changed_on(struct panthor_device *ptdev);
>
> +/**
> + * panthor_gpu_disable_protm_fault_interrupt() - Disable GPU_PROTECTED_FAULT interrupt
> + * @ptdev: Device.
> + */
> +void panthor_gpu_disable_protm_fault_interrupt(struct panthor_device *ptdev);
> +
> #endif
> diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
> index 07f54176ec1bf..702f537905b56 100644
> --- a/drivers/gpu/drm/panthor/panthor_mmu.c
> +++ b/drivers/gpu/drm/panthor/panthor_mmu.c
> @@ -31,6 +31,7 @@
> #include <linux/sizes.h>
>
> #include "panthor_device.h"
> +#include "panthor_fw.h"
> #include "panthor_gem.h"
> #include "panthor_gpu.h"
> #include "panthor_heap.h"
> @@ -1704,8 +1705,12 @@ static int panthor_vm_lock_region(struct panthor_vm *vm, u64 start, u64 size)
> if (drm_WARN_ON(&ptdev->base, vm->locked_region.size))
> return -EINVAL;
>
> + down_read(&ptdev->protm.lock);
> +
> mutex_lock(&ptdev->mmu->as.slots_lock);
> if (vm->as.id >= 0 && size) {
> + panthor_fw_protm_exit_sync(ptdev);
> +
> /* Lock the region that needs to be updated */
> gpu_write64(ptdev, AS_LOCKADDR(vm->as.id),
> pack_region_range(ptdev, &start, &size));
> @@ -1720,6 +1725,9 @@ static int panthor_vm_lock_region(struct panthor_vm *vm, u64 start, u64 size)
> }
> mutex_unlock(&ptdev->mmu->as.slots_lock);
>
> + if (ret)
> + up_read(&ptdev->protm.lock);
Let's try to keep the locked section local to a function: the protm.lock
should, IMHO, be taken/released in panthor_vm_exec_op(). If we go for
that, this also makes the panthor_vm_lock_region() vs
panthor_vm_expand_locked_region() distinction useless, though it's
probably fine to keep it for clarity.
> +
> return ret;
> }
>
> @@ -1805,6 +1813,8 @@ static void panthor_vm_unlock_region(struct panthor_vm *vm)
> vm->locked_region.start = 0;
> vm->locked_region.size = 0;
> mutex_unlock(&ptdev->mmu->as.slots_lock);
> +
> + up_read(&ptdev->protm.lock);
> }
>
> static void panthor_mmu_irq_handler(struct panthor_device *ptdev, u32 status)
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index 987072bd867c4..acb04250c7def 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -308,6 +308,15 @@ struct panthor_scheduler {
> */
> struct list_head stopped_groups;
> } reset;
> +
> + /** @protm: Protected mode related fields. */
> + struct {
> + /** @protected_mode: True if GPU is in protected mode. */
> + bool protected_mode;
nit: s/protected_mode/enabled/. But do we even need that boolean?
Isn't active_group != NULL providing the same info?
> +
> + /** @active_group: The active protected group. */
> + struct panthor_group *active_group;
> + } protm;
> };
>
> /**
> @@ -570,6 +579,16 @@ struct panthor_group {
> /** @fatal_queues: Bitmask reflecting the queues that hit a fatal exception. */
> u32 fatal_queues;
>
> + /**
> + * @protm_pending_queues: Bitmask reflecting the queues that are waiting
> + * on a CS_PROTM_PENDING.
> + *
> + * The GPU will set the bit associated to the queue pending protected mode
> + * when a PROT_REGION command is executing or when trying to resume previously
> + * suspended protected mode jobs.
> + */
> + u32 protm_pending_queues;
> +
> /** @tiler_oom: Mask of queues that have a tiler OOM event to process. */
> atomic_t tiler_oom;
>
> @@ -1176,6 +1195,7 @@ queue_resume_timeout(struct panthor_queue *queue)
> * @ptdev: Device.
> * @csg_id: Group slot ID.
> * @cs_id: Queue slot ID.
> + * @protm_ack: Acknowledge pending protected mode queues
> *
> * Program a queue slot with the queue information so things can start being
> * executed on this queue.
> @@ -1472,6 +1492,34 @@ csg_slot_prog_locked(struct panthor_device *ptdev, u32 csg_id, u32 priority)
> return 0;
> }
>
> +static void
> +cs_slot_process_protm_pending_event_locked(struct panthor_device *ptdev,
> + u32 csg_id, u32 cs_id)
> +{
> + struct panthor_scheduler *sched = ptdev->scheduler;
> + struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id];
> + struct panthor_group *group = csg_slot->group;
> +
> + lockdep_assert_held(&sched->lock);
> +
> + if (!group)
> + return;
> +
> + /* No protected memory heap, a user space program tried to
> + * submit a protected mode jobs resulting in the GPU raising
> + * a CS_PROTM_PENDING request.
> + *
> + * This scenario is invalid and the protected mode jobs must
> + * not be allowed to progress.
> + */
> + if (!ptdev->protm.heap)
> + return;
Should we flag the group unusable when that happens and schedule it out
as soon as possible?
if (!ptdev->protm.heap)
group->fatal_queues |= BIT(cs_id);
else
group->protm_pending_queues |= BIT(cs_id);
sched_queue_delayed_work(sched, tick, 0);
> +
> + group->protm_pending_queues |= BIT(cs_id);
> +
> + sched_queue_delayed_work(sched, tick, 0);
> +}
> +
> static void
> cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
> u32 csg_id, u32 cs_id)
> @@ -1718,6 +1766,9 @@ static bool cs_slot_process_irq_locked(struct panthor_device *ptdev,
> if (events & CS_TILER_OOM)
> cs_slot_process_tiler_oom_event_locked(ptdev, csg_id, cs_id);
>
> + if (events & CS_PROTM_PENDING)
> + cs_slot_process_protm_pending_event_locked(ptdev, csg_id, cs_id);
> +
> /* We don't acknowledge the TILER_OOM event since its handling is
> * deferred to a separate work.
> */
> @@ -1848,6 +1899,17 @@ static void sched_process_idle_event_locked(struct panthor_device *ptdev)
> sched_queue_delayed_work(ptdev->scheduler, tick, 0);
> }
>
> +static void sched_process_protm_exit_event_locked(struct panthor_device *ptdev)
> +{
> + lockdep_assert_held(&ptdev->scheduler->lock);
> +
> + /* Acknowledge the protm exit and schedule a tick. */
> + panthor_fw_protm_exit(ptdev);
Let's just inline the content of panthor_fw_protm_exit() here.
It doesn't make sense to have all these indirections, especially
since PROTM and scheduling are intertwined anyway, so I consider
it part of the scheduler's responsibility (just like the scheduler
deals with other GLB events).
The same goes for the other panthor_fw_protm_ helpers defined in
panthor_fw.c, I think panthor_sched.c would be a better fit for
those, or even inlining their content if they are only used in
a single place.
> + sched_queue_delayed_work(ptdev->scheduler, tick, 0);
> + ptdev->scheduler->protm.protected_mode = false;
> + ptdev->scheduler->protm.active_group = NULL;
> +}
> +
> /**
> * sched_process_global_irq_locked() - Process the scheduling part of a global IRQ
> * @ptdev: Device.
> @@ -1863,6 +1925,9 @@ static void sched_process_global_irq_locked(struct panthor_device *ptdev)
> ack = READ_ONCE(glb_iface->output->ack);
> evts = (req ^ ack) & GLB_EVT_MASK;
>
> + if (evts & GLB_PROTM_EXIT)
> + sched_process_protm_exit_event_locked(ptdev);
> +
> if (evts & GLB_IDLE)
> sched_process_idle_event_locked(ptdev);
> }
> @@ -1872,23 +1937,71 @@ static void process_fw_events_work(struct work_struct *work)
> struct panthor_scheduler *sched = container_of(work, struct panthor_scheduler,
> fw_events_work);
> u32 events = atomic_xchg(&sched->fw_events, 0);
> + u32 csg_events = events & ~JOB_INT_GLOBAL_IF;
> struct panthor_device *ptdev = sched->ptdev;
>
> mutex_lock(&sched->lock);
>
> + while (csg_events) {
> + u32 csg_id = ffs(csg_events) - 1;
> +
> + sched_process_csg_irq_locked(ptdev, csg_id);
> + csg_events &= ~BIT(csg_id);
> + }
I'm sure you have a good reason to re-order the processing
of CSG and GLB events, and it'd be good to have it documented
in a comment.
> +
> if (events & JOB_INT_GLOBAL_IF) {
> sched_process_global_irq_locked(ptdev);
> events &= ~JOB_INT_GLOBAL_IF;
> }
>
> - while (events) {
> - u32 csg_id = ffs(events) - 1;
> + mutex_unlock(&sched->lock);
> +}
>
> - sched_process_csg_irq_locked(ptdev, csg_id);
> - events &= ~BIT(csg_id);
> +static void handle_protm_fault(struct panthor_device *ptdev)
This is a bit of a misnomer, I think. It doesn't seem to be triggered
by a FAULT event, it's more a timeout on a PROTM section that would
lead to a reset being scheduled, and this "blocked-in-protm" situation
being detected as part of the reset.
> +{
> + struct panthor_scheduler *sched = ptdev->scheduler;
> + u32 csg_id;
> + struct panthor_group *protm_group;
> +
> + guard(mutex)(&sched->lock);
> +
> + if (!sched->protm.protected_mode)
> + return;
> +
> + protm_group = sched->protm.active_group;
> +
> + if (drm_WARN_ON(&ptdev->base, !protm_group))
> + return;
See, that's a perfect example of sched->protm.protected_mode being redundant.
Now you're left with a potential protected_mode=true,active_group=NULL
inconsistency you don't expect.
> +
> + /* Group will be terminated by the device reset */
> + protm_group->fatal_queues |= GENMASK(protm_group->queue_count - 1, 0);
> +
> + if (!panthor_fw_protm_exit_wait_event_timeout(ptdev))
> + goto cleanup_protm;
> +
> + /**
> + * GPU failed to exit protected mode. Mark all non-protected mode CSGs
/* GPU failed to exit protected mode. Mark all non-protected mode CSGs
> + * as suspended so that they are unaffected by the GPU reset.
> + */
> +
> + for (csg_id = 0; csg_id < sched->csg_slot_count; csg_id++) {
> + struct panthor_group *group = sched->csg_slots[csg_id].group;
> +
> + if (!group || group == protm_group)
> + continue;
> +
> + group->state = PANTHOR_CS_GROUP_SUSPENDED;
> +
> + group_unbind_locked(group);
> +
> + list_move(&group->run_node, group_is_idle(group) ?
> + &sched->groups.idle[group->priority] :
> + &sched->groups.runnable[group->priority]);
nit: Let's use a local struct list_head * variable to select the right list.
> }
>
> - mutex_unlock(&sched->lock);
> +cleanup_protm:
> + sched->protm.protected_mode = false;
> + sched->protm.active_group = NULL;
> }
>
> /**
> @@ -2029,6 +2142,7 @@ struct panthor_sched_tick_ctx {
> bool immediate_tick;
> bool stop_tick;
> u32 csg_upd_failed_mask;
> + struct panthor_group *protm_group;
> };
>
> static bool
> @@ -2299,6 +2413,7 @@ tick_ctx_evict_group(struct panthor_scheduler *sched,
>
> static void
> tick_ctx_reschedule_group(struct panthor_scheduler *sched,
> + struct panthor_sched_tick_ctx *ctx,
> struct panthor_csg_slots_upd_ctx *upd_ctx,
> struct panthor_group *group,
> int new_csg_prio)
> @@ -2321,6 +2436,30 @@ tick_ctx_reschedule_group(struct panthor_scheduler *sched,
> csg_iface->output->ack ^ CSG_ENDPOINT_CONFIG,
> CSG_ENDPOINT_CONFIG);
> }
> +
> + if (ctx->protm_group == group) {
> + for (u32 q = 0; q < group->queue_count; q++) {
> + struct panthor_fw_cs_iface *cs_iface;
> +
> + if (!(group->protm_pending_queues & BIT(q)))
> + continue;
> +
> + cs_iface = panthor_fw_get_cs_iface(ptdev, group->csg_id, q);
> + panthor_fw_update_reqs(cs_iface, req, cs_iface->output->ack,
> + CS_PROTM_PENDING);
> + }
> +
> + panthor_fw_toggle_reqs(csg_iface, doorbell_req, doorbell_ack,
> + group->protm_pending_queues);
> + csgs_upd_ctx_ring_doorbell(upd_ctx, group->csg_id);
> + group->protm_pending_queues = 0;
> +
> + /*
> + * We only allow one protected group to run at same time,
> + * as it makes it easier to handle faults in protected mode.
It's more something to document in the panthor_scheduler::protm::active_group
section.
> + */
> + sched->protm.active_group = group;
Would it make sense to move this logic to a tick_ctx_handle_protm_group()
helper that's called before/after tick_ctx_reschedule_group()? This way
there's no extra if (ctx->protm_group == group) conditional branch in here.
static void
tick_ctx_handle_protm_group(struct panthor_scheduler *sched,
struct panthor_sched_tick_ctx *ctx,
struct panthor_csg_slots_upd_ctx *upd_ctx)
{
struct panthor_device *ptdev = sched->ptdev;
struct panthor_group *group = ctx->protm_group;
struct panthor_fw_csg_iface *csg_iface;
if (!group || drm_WARN_ON(&ptdev->base, group->csg_id < 0))
return;
csg_iface = panthor_fw_get_csg_iface(ptdev, group->csg_id);
for (u32 q = 0; q < group->queue_count; q++) {
struct panthor_fw_cs_iface *cs_iface;
if (!(group->protm_pending_queues & BIT(q)))
continue;
cs_iface = panthor_fw_get_cs_iface(ptdev, group->csg_id, q);
panthor_fw_update_reqs(cs_iface, req, cs_iface->output->ack,
CS_PROTM_PENDING);
}
panthor_fw_toggle_reqs(csg_iface, doorbell_req, doorbell_ack,
group->protm_pending_queues);
csgs_upd_ctx_ring_doorbell(upd_ctx, group->csg_id);
group->protm_pending_queues = 0;
sched->protm.active_group = group;
}
> + }
> }
>
> static void
> @@ -2336,6 +2475,17 @@ tick_ctx_schedule_group(struct panthor_scheduler *sched,
> group_bind_locked(group, csg_id);
> csg_slot_prog_locked(ptdev, csg_id, csg_prio);
>
> + /* If the group was waiting for protected mode before suspension,
> + * and the tick context enters this mode, it should be serviced
> + * immediately because the slot reset should have set the
> + * CS_PROTM_PENDING bit to zero, and cs_prog_slot_locked() sets it to
> + * zero too.
> + * It's not clear if we will get a new CS_PROTM_PENDING event in that
> + * case, but it should be safe either way.
> + */
> + if (group->protm_pending_queues && ctx->protm_group)
> + group->protm_pending_queues = 0;
I'd move this to the path where we do the SUSPEND, or group_unbind(), even.
> +
> csgs_upd_ctx_queue_reqs(ptdev, upd_ctx, csg_id,
> group->state == PANTHOR_CS_GROUP_SUSPENDED ?
> CSG_STATE_RESUME : CSG_STATE_START,
> @@ -2365,7 +2515,7 @@ tick_ctx_apply(struct panthor_scheduler *sched, struct panthor_sched_tick_ctx *c
>
> /* Update priorities on already running groups. */
> list_for_each_entry(group, &ctx->groups[prio], run_node) {
> - tick_ctx_reschedule_group(sched, &upd_ctx, group, new_csg_prio--);
> + tick_ctx_reschedule_group(sched, ctx, &upd_ctx, group, new_csg_prio--);
> }
> }
>
> @@ -2457,6 +2607,15 @@ tick_ctx_apply(struct panthor_scheduler *sched, struct panthor_sched_tick_ctx *c
>
> sched->used_csg_slot_count = ctx->group_count;
> sched->might_have_idle_groups = ctx->idle_group_count > 0;
> +
> + if (ctx->protm_group) {
> + ret = panthor_fw_protm_enter(ptdev);
> + if (ret) {
> + panthor_device_schedule_reset(ptdev);
> + ctx->csg_upd_failed_mask = U32_MAX;
It's weird to flag it as all CSGs update failed. Should we instead
have
/* If we failed to enter PROTM, consider the group who
* requested it as failed.
*/
ctx->csg_upd_failed_mask |= BIT(ctx->protm_group->csg_id);
> + }
> + sched->protm.protected_mode = true;
I'd move that to a tick_ctx_service_protm_req() helper that has the
panthor_fw_protm_enter() inlined, because again, it doesn't make
sense to have this defined in panthor_fw.c if the only user lives
in panthor_sched.c
> + }
> }
>
> static u64
> @@ -2490,7 +2649,7 @@ static void tick_work(struct work_struct *work)
> u64 resched_target = sched->resched_target;
> u64 remaining_jiffies = 0, resched_delay;
> u64 now = get_jiffies_64();
> - int prio, ret, cookie;
> + int prio, protm_prio, ret, cookie;
> bool full_tick;
>
> if (!drm_dev_enter(&ptdev->base, &cookie))
> @@ -2564,14 +2723,49 @@ static void tick_work(struct work_struct *work)
> }
> }
>
> + /* Check if the highest priority group want to switch to protected mode */
> + for (protm_prio = PANTHOR_CSG_PRIORITY_COUNT - 1; protm_prio >= 0; protm_prio--) {
> + struct panthor_group *group;
> +
> + group = list_first_entry_or_null(&ctx.groups[protm_prio],
> + struct panthor_group,
> + run_node);
> + if (group) {
> + ctx.protm_group = group;
> + break;
> + }
Should this be
if (group) {
if (group->protm_pending_queues)
ctx.protm_group = group;
break;
}
?
> + }
> +
> /* If we have free CSG slots left, pick idle groups */
> - for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1;
> - prio >= 0 && !tick_ctx_is_full(sched, &ctx);
> - prio--) {
How about we keep it a single indentation level and skip higher prios if
PROTM is requested:
/* Pick only idle groups with equal or lower priority than the
* group triggering protected mode. Do not bother picking
* unscheduled idle groups.
*/
if (ctx.protm_group && prio < protm_prio)
continue;
This saves us an indentation level and limits the code duplication.
> - /* Check the old_group queue first to avoid reprogramming the slots */
> - tick_ctx_pick_groups_from_list(sched, &ctx, &ctx.old_groups[prio], false, true);
> - tick_ctx_pick_groups_from_list(sched, &ctx, &sched->groups.idle[prio],
> - false, false);
> + if (ctx.protm_group) {
> + /* Pick only idle groups with equal or lower priority than the
> + * group triggering protected mode. Do not bother picking
> + * unscheduled idle groups.
> + */
> + for (prio = protm_prio;
> + prio >= 0 && !tick_ctx_is_full(sched, &ctx);
> + prio--)
> + tick_ctx_pick_groups_from_list(sched, &ctx,
> + &ctx.old_groups[prio],
> + false, true);
> + } else {
> + /* No switch to protected, just pick any idle group according
> + * to priority
> + */
> + for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1;
> + prio >= 0 && !tick_ctx_is_full(sched, &ctx);
> + prio--) {
> + /* Check the old_group queue first to avoid
> + * reprogramming the slots
> + */
> + tick_ctx_pick_groups_from_list(sched, &ctx,
> + &ctx.old_groups[prio],
> + false, true);
> + tick_ctx_pick_groups_from_list(sched, &ctx,
> + &sched->groups.idle[prio],
> + false, false);
> + }
> +
> }
>
> tick_ctx_apply(sched, &ctx);
> @@ -2993,6 +3187,8 @@ void panthor_sched_pre_reset(struct panthor_device *ptdev)
> cancel_work_sync(&sched->sync_upd_work);
> cancel_delayed_work_sync(&sched->tick_work);
>
> + handle_protm_fault(ptdev);
I actually wonder if this should be part of the panthor_sched_suspend()
logic. That is, we would automatically flag all non-protm groups as
suspended if the GPU was in PROTM mode at the time the hang happened.
> +
> panthor_sched_suspend(ptdev);
>
> /* Stop all groups that might still accept jobs, so we don't get passed
Regards,
Boris
More information about the linux-arm-kernel
mailing list