[PATCH 1/2] media: rkvdec: reduce excessive stack usage in assemble_hw_pps()

Nicolas Dufresne nicolas.dufresne at collabora.com
Mon Feb 2 05:42:41 PST 2026


Hi Arnd,

Le lundi 02 février 2026 à 10:47 +0100, Arnd Bergmann a écrit :
> From: Arnd Bergmann <arnd at arndb.de>
> 
> The rkvdec_pps had a large set of bitfields, all of which
> are misaligned. This causes clang-21 and likely other versions to
> produce absolutely awful object code and a warning about very
> large stack usage, on targets without unaligned access:
> 
> drivers/media/platform/rockchip/rkvdec/rkvdec-vp9.c:966:12: error: stack frame size (1472) exceeds limit (1280) in 'rkvdec_vp9_start' [-Werror,-Wframe-larger-than]

We had already addressed and validated that on clang-21, which indicates to me
that we are likely missing an architecture (or a config) in our CI. Can you
document which architecture, configuration and flags were affected so we can add
them on our side?

Our media pipeline, which runs before we send to Linus, and the clang build trace
are at the following links, in case it matters.

https://gitlab.freedesktop.org/linux-media/media-committers/-/pipelines/1588731
https://gitlab.freedesktop.org/linux-media/media-committers/-/jobs/91604655

> 
> Part of the problem here is how all the bitfield accesses are
> inlined into a function that already has large structures on
> the stack.

Another observation is that you had to enable ASAN to make it misbehave on
for-loop unrolling (with complex bitfield writes). All I could gather from
visiting the Link: is that it's the armv7-a architecture.

> 
> Mark set_field_order_cnt() as noinline_for_stack, and split out
> the following accesses in assemble_hw_pps() into another noinline
> function, both of which now use around 800 bytes of stack in the
> same configuration.
> 
> There is clearly still something wrong with clang here, but
> splitting it into multiple functions reduces the risk of stack
> overflow.

We've tried really hard to avoid resorting to noinline_for_stack just because
compilers are buggy. I'll have another look in case I find some ideas, but
meanwhile, with the failing architecture documented in the commit message:

Reviewed-by: Nicolas Dufresne <nicolas.dufresne at collabora.com>

> 
> Fixes: fde24907570d ("media: rkvdec: Add H264 support for the VDPU383 variant")
> Link: https://godbolt.org/z/acP1eKeq9
> Signed-off-by: Arnd Bergmann <arnd at arndb.de>
> ---
>  .../rockchip/rkvdec/rkvdec-vdpu383-h264.c     | 50 ++++++++++---------
>  1 file changed, 27 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/media/platform/rockchip/rkvdec/rkvdec-vdpu383-h264.c b/drivers/media/platform/rockchip/rkvdec/rkvdec-vdpu383-h264.c
> index 6ab3167addc8..ef69f2a36478 100644
> --- a/drivers/media/platform/rockchip/rkvdec/rkvdec-vdpu383-h264.c
> +++ b/drivers/media/platform/rockchip/rkvdec/rkvdec-vdpu383-h264.c
> @@ -130,7 +130,7 @@ struct rkvdec_h264_ctx {
>  	struct vdpu383_regs_h26x regs;
>  };
>  
> -static void set_field_order_cnt(struct rkvdec_pps *pps, const struct v4l2_h264_dpb_entry *dpb)
> +static noinline_for_stack void set_field_order_cnt(struct rkvdec_pps *pps, const struct v4l2_h264_dpb_entry *dpb)
>  {
>  	pps->top_field_order_cnt0 = dpb[0].top_field_order_cnt;
>  	pps->bot_field_order_cnt0 = dpb[0].bottom_field_order_cnt;
> @@ -166,6 +166,31 @@ static void set_field_order_cnt(struct rkvdec_pps *pps, const struct v4l2_h264_d
>  	pps->bot_field_order_cnt15 = dpb[15].bottom_field_order_cnt;
>  }
>  
> +static noinline_for_stack void set_dec_params(struct rkvdec_pps *pps, const struct v4l2_ctrl_h264_decode_params *dec_params)
> +{
> +	const struct v4l2_h264_dpb_entry *dpb = dec_params->dpb;
> +
> +	for (int i = 0; i < ARRAY_SIZE(dec_params->dpb); i++) {
> +		if (dpb[i].flags & V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM)
> +			pps->is_longterm |= (1 << i);
> +		pps->ref_field_flags |=
> +		 (!!(dpb[i].flags & V4L2_H264_DPB_ENTRY_FLAG_FIELD)) << i;
> +		pps->ref_colmv_use_flag |=
> +		 (!!(dpb[i].flags & V4L2_H264_DPB_ENTRY_FLAG_ACTIVE)) << i;
> +		pps->ref_topfield_used |=
> +		 (!!(dpb[i].fields & V4L2_H264_TOP_FIELD_REF)) << i;
> +		pps->ref_botfield_used |=
> +			(!!(dpb[i].fields & V4L2_H264_BOTTOM_FIELD_REF)) << i;
> +	}
> +	pps->pic_field_flag =
> +		!!(dec_params->flags & V4L2_H264_DECODE_PARAM_FLAG_FIELD_PIC);
> +	pps->pic_associated_flag =
> +		!!(dec_params->flags & V4L2_H264_DECODE_PARAM_FLAG_BOTTOM_FIELD);
> +
> +	pps->cur_top_field = dec_params->top_field_order_cnt;
> +	pps->cur_bot_field = dec_params->bottom_field_order_cnt;
> +}
> +
>  static void assemble_hw_pps(struct rkvdec_ctx *ctx,
>  			    struct rkvdec_h264_run *run)
>  {
> @@ -177,7 +202,6 @@ static void assemble_hw_pps(struct rkvdec_ctx *ctx,
>  	struct rkvdec_h264_priv_tbl *priv_tbl = h264_ctx->priv_tbl.cpu;
>  	struct rkvdec_sps_pps *hw_ps;
>  	u32 pic_width, pic_height;
> -	u32 i;
>  
>  	/*
>  	 * HW read the SPS/PPS information from PPS packet index by PPS id.
> @@ -261,28 +285,8 @@ static void assemble_hw_pps(struct rkvdec_ctx *ctx,
>  		!!(pps->flags & V4L2_H264_PPS_FLAG_SCALING_MATRIX_PRESENT);
>  
>  	set_field_order_cnt(&hw_ps->pps, dpb);
> +	set_dec_params(&hw_ps->pps, dec_params);
>  
> -	for (i = 0; i < ARRAY_SIZE(dec_params->dpb); i++) {
> -		if (dpb[i].flags & V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM)
> -			hw_ps->pps.is_longterm |= (1 << i);
> -
> -		hw_ps->pps.ref_field_flags |=
> -			(!!(dpb[i].flags & V4L2_H264_DPB_ENTRY_FLAG_FIELD)) << i;
> -		hw_ps->pps.ref_colmv_use_flag |=
> -			(!!(dpb[i].flags & V4L2_H264_DPB_ENTRY_FLAG_ACTIVE)) << i;
> -		hw_ps->pps.ref_topfield_used |=
> -			(!!(dpb[i].fields & V4L2_H264_TOP_FIELD_REF)) << i;
> -		hw_ps->pps.ref_botfield_used |=
> -			(!!(dpb[i].fields & V4L2_H264_BOTTOM_FIELD_REF)) << i;
> -	}
> -
> -	hw_ps->pps.pic_field_flag =
> -		!!(dec_params->flags & V4L2_H264_DECODE_PARAM_FLAG_FIELD_PIC);
> -	hw_ps->pps.pic_associated_flag =
> -		!!(dec_params->flags & V4L2_H264_DECODE_PARAM_FLAG_BOTTOM_FIELD);
> -
> -	hw_ps->pps.cur_top_field = dec_params->top_field_order_cnt;
> -	hw_ps->pps.cur_bot_field = dec_params->bottom_field_order_cnt;
>  }
>  
>  static void rkvdec_write_regs(struct rkvdec_ctx *ctx)
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 228 bytes
Desc: This is a digitally signed message part
URL: <http://lists.infradead.org/pipermail/linux-arm-kernel/attachments/20260202/2492b855/attachment.sig>


More information about the linux-arm-kernel mailing list