[PATCH v5 4/4] media: mediatek: add MT8188 AIE driver

CK Hu (胡俊光) ck.hu at mediatek.com
Wed Apr 16 23:39:43 PDT 2025


On Thu, 2025-04-03 at 15:38 +0800, bo.kong wrote:
> From: Bo Kong <Bo.Kong at mediatek.com>
> 
> Add a V4L2 sub-device driver for MT8188 AIE.

This patch is a little big, so separate this patch by fd_mode.
So these patch would be:

1. Add MT8188 AIE driver (support FDMODE only)
2. Add ATTRIBUTEMODE
3. Add FLDMODE

> 
> Signed-off-by: Bo Kong <Bo.Kong at mediatek.com>
> ---

[snip]

> +
> +enum RS_CONFIG {
> +	RS_X_Y_MAG = 1,
> +	RS_SRZ_HORI_STEP = 3,
> +	RS_SRZ_VERT_STEP,

You allocate memory for the index 5 and 6 but it's not used.
Add the comment to describe why you allocate this memory but not use it.

> +	RS_INPUT_W_H = 7,
> +	RS_OUTPUT_W_H,
> +	RS_IN_X_Y_SIZE0 = 10,
> +	RS_IN_STRIDE0,
> +	RS_IN_X_Y_SIZE1,
> +	RS_IN_STRIDE1,
> +	RS_IN_X_Y_SIZE2,
> +	RS_IN_STRIDE2,
> +	RS_OUT_X_Y_SIZE0,
> +	RS_OUT_STRIDE0,
> +	RS_OUT_X_Y_SIZE1,
> +	RS_OUT_STRIDE1,
> +	RS_OUT_X_Y_SIZE2,
> +	RS_OUT_STRIDE2,
> +	RS_IN_0,
> +	RS_IN_1,
> +	RS_IN_2,
> +	RS_OUT_0,
> +	RS_OUT_1,
> +	RS_OUT_2,
> +	RS_CON_IN_BA_MSB,
> +	RS_CON_OUT_BA_MSB,
> +};
> +
> 

[snip]

> +
> +struct aie_static_info_element {
> +	u32 fd_wdma_size[OUTPUT_WDMA_WRA_NUM];
> +	u32 out_xsize_plus_1;
> +	u32 out_height;
> +	u32 out_ysize_plus_1_stride2;
> +	u32 out_stride;
> +	u32 out_stride_stride2;
> +	u32 out_width;
> +	u32 img_width;
> +	u32 img_height;
> +	u32 stride2_out_width;
> +	u32 stride2_out_height;
> +	u32 out_xsize_plus_1_stride2;
> +	u32 input_xsize_plus_1;
> +};
> +
> +struct aie_static_info {
> +	struct aie_static_info_element inf_elm[FD_LOOP_NUM];
> +};

aie_static_info{} is redundant.
Use struct aie_static_info_element{} directly.

> +
> +enum aie_state {
> +	STATE_NA,
> +	STATE_INIT,
> +	STATE_OPEN
> +};
> +

[snip]

> +
> +struct aie_reg_cfg {
> +	u32 rs_adr;
> +	u32 yuv2rgb_adr;
> +	u32 fd_adr;
> +	u32 fd_pose_adr;

fd_pose_adr is useless, so drop it.

> +	u32 fd_mode;
> +	u32 hw_result;
> +	u32 hw_result1;
> +	u32 reserved;

reserved is useless, so drop it.

> +};
> +
> +struct aie_hw_rect {
> +	u16 width;
> +	u16 height;
> +};

[snip]

> +
> +struct aie_attr_para {
> +	u32 w_idx;
> +	u32 r_idx;

In this code, only one attribute parameter is used at one time.
So it's not necessary to queue the attribute parameter.
Drop w_idx and r_idx.

> +	u32 sel_mode[MAX_ENQUE_FRAME_NUM];
> +	u16 img_width[MAX_ENQUE_FRAME_NUM];
> +	u16 img_height[MAX_ENQUE_FRAME_NUM];
> +	u16 crop_width[MAX_ENQUE_FRAME_NUM];
> +	u16 crop_height[MAX_ENQUE_FRAME_NUM];
> +	u32 src_img_fmt[MAX_ENQUE_FRAME_NUM];
> +	u32 rotate_degree[MAX_ENQUE_FRAME_NUM];
> +	u32 src_img_addr[MAX_ENQUE_FRAME_NUM];
> +	u32 src_img_addr_uv[MAX_ENQUE_FRAME_NUM];
> +};
> +

[snip]

> +
> +struct imem_buf_info {
> +	void *va;
> +	dma_addr_t pa;
> +	unsigned int size;
> +	unsigned int reserved;

reserved is useless, so drop it.

> +};
> +

[snip]

> +
> +static int aie_config_y2r(struct mtk_aie_dev *fd, struct aie_enq_info *aie_cfg,
> +			  int mode)
> +{
> +	u32 img_addr;
> +	u32 img_addr_UV;

Lower case for variable name.

> +	u32 img_off;
> +	u32 img_off_uv;
> +	u32 *yuv2rgb_cfg;
> +	u32 srcbuf, srcbuf_UV;
> +	u16 xmag_0, ymag_0;
> +	u16 pym0_out_w;
> +	u16 pym0_out_h;
> +	u16 stride_pym0_out_w;
> +	u16 sr_crp_w;
> +	u16 sr_crp_h;
> +	u16 y1_stride;
> +
> +	if (!aie_cfg->en_roi) {
> +		img_off = 0;
> +		img_off_uv = 0;
> +	} else {
> +		if (aie_cfg->src_img_fmt == FMT_MONO || aie_cfg->src_img_fmt == FMT_YUV_2P ||
> +		    aie_cfg->src_img_fmt == FMT_YVU_2P) {
> +			y1_stride = aie_cfg->src_img_stride * aie_cfg->src_roi.y1;
> +			img_off = y1_stride + aie_cfg->src_roi.x1;
> +			img_off_uv = y1_stride + aie_cfg->src_roi.x1;
> +		} else if (aie_cfg->src_img_fmt == FMT_YUV420_2P ||
> +			   aie_cfg->src_img_fmt == FMT_YUV420_1P) {
> +			y1_stride = aie_cfg->src_img_stride * aie_cfg->src_roi.y1;
> +			img_off = y1_stride + aie_cfg->src_roi.x1;
> +			img_off_uv = y1_stride / 2 + aie_cfg->src_roi.x1;
> +		} else if (aie_cfg->src_img_fmt == FMT_YUYV ||
> +			   aie_cfg->src_img_fmt == FMT_YVYU ||
> +			   aie_cfg->src_img_fmt == FMT_UYVY ||
> +			   aie_cfg->src_img_fmt == FMT_VYUY) {
> +			y1_stride = aie_cfg->src_img_stride * aie_cfg->src_roi.y1;
> +			img_off = y1_stride + aie_cfg->src_roi.x1 * 2;
> +			img_off_uv = y1_stride + aie_cfg->src_roi.x1 * 2;
> +		} else {
> +			dev_err(fd->dev, "Unsupport input format %d", aie_cfg->src_img_fmt);
> +			return -EINVAL;

I think you should check format when setting format not here.
It's not necessary to check format everywhere you use it.

> +		}
> +	}
> +
> +	img_addr = aie_cfg->src_img_addr + img_off;
> +	img_addr_UV = aie_cfg->src_img_addr_uv + img_off_uv;
> +
> +	srcbuf = img_addr;
> +	if (aie_cfg->src_img_fmt == FMT_YUV420_2P || aie_cfg->src_img_fmt == FMT_YUV420_1P ||
> +	    aie_cfg->src_img_fmt == FMT_YUV_2P || aie_cfg->src_img_fmt == FMT_YVU_2P)
> +		srcbuf_UV = img_addr_UV;
> +	else
> +		srcbuf_UV = 0;

srcbuf = aie_cfg->src_img_addr + img_off;
srcbuf_UV = aie_cfg->src_img_addr_uv + img_off_uv or 0;
and drop img_addr and img_addr_UV.

> +
> +	if (mode == FDMODE) {
> +		sr_crp_w = fd->base_para->crop_rect.width;
> +		sr_crp_h = fd->base_para->crop_rect.height;
> +		yuv2rgb_cfg = (u32 *)fd->base_para->fd_yuv2rgb_cfg_va;
> +		pym0_out_w = fd->base_para->pyramid_rect.width;
> +	}
> +	if (mode == ATTRIBUTEMODE) {
> +		sr_crp_w = fd->attr_para->crop_width[fd->attr_para->w_idx];
> +		sr_crp_h = fd->attr_para->crop_height[fd->attr_para->w_idx];
> +		yuv2rgb_cfg = (u32 *)fd->base_para->attr_yuv2rgb_cfg_va[fd->attr_para->w_idx];
> +		pym0_out_w = ATTR_MODE_PYRAMID_WIDTH;
> +	}
> +
> +	pym0_out_h = pym0_out_w * sr_crp_h / sr_crp_w;
> +
> +	if (pym0_out_w != 0) {
> +		xmag_0 = 512 * sr_crp_w / pym0_out_w;
> +		ymag_0 = xmag_0;
> +	} else {

Why do you allow pym0_out_w is zero?
return error here.
It's better to return error when user space set wrong parameter.

> +		xmag_0 = 0;
> +		ymag_0 = 0;
> +	}
> +
> +	yuv2rgb_cfg[Y2R_SRC_DST_FORMAT] = (yuv2rgb_cfg[Y2R_SRC_DST_FORMAT] & 0xFFFFFFF8) |
> +					  ((aie_cfg->src_img_fmt) & 0x7);

You does not assign other bits of Y2R_SRC_DST_FORMAT, so you could set it as

yuv2rgb_cfg[Y2R_SRC_DST_FORMAT] = (aie_cfg->src_img_fmt) & 0x7;

> +	if (aie_cfg->src_img_fmt == FMT_YUV420_2P || aie_cfg->src_img_fmt == FMT_YUV420_1P) {
> +		/* for match patten */
> +		yuv2rgb_cfg[Y2R_SRC_DST_FORMAT] = (yuv2rgb_cfg[Y2R_SRC_DST_FORMAT] &
> +						  0xFFFFFFF8) | ((0x3) & 0x7);

You does not assign other bits of Y2R_SRC_DST_FORMAT, so you could set it as

yuv2rgb_cfg[Y2R_SRC_DST_FORMAT] = 0x3;

Maybe use a symbol to replace 0x3;

> +	}
> +	yuv2rgb_cfg[Y2R_IN_W_H] = (yuv2rgb_cfg[Y2R_IN_W_H] & 0xF800F800) |
> +				  ((sr_crp_w << 16) & 0x7FF0000) | (sr_crp_h & 0x7FF);
> +	yuv2rgb_cfg[Y2R_OUT_W_H] = (yuv2rgb_cfg[Y2R_OUT_W_H] & 0xF800F800) |
> +				   ((pym0_out_w << 16) & 0x7FF0000) | (pym0_out_h & 0x7FF);
> +
> +	if (aie_cfg->src_img_fmt == FMT_YUV_2P || aie_cfg->src_img_fmt == FMT_YVU_2P) {
> +		/* 2 plane */
> +		yuv2rgb_cfg[Y2R_RA0_RA1_EN] = (yuv2rgb_cfg[Y2R_RA0_RA1_EN] & 0xFFFFFFEE) | 0x11;
> +		if (aie_cfg->en_roi) {
> +			yuv2rgb_cfg[Y2R_IN_X_Y_SIZE0] = aie_cmb_u16(dif_x(aie_cfg), dif_y(aie_cfg));
> +			yuv2rgb_cfg[Y2R_IN_X_Y_SIZE1] = aie_cmb_u16(dif_x(aie_cfg), dif_y(aie_cfg));

I think you could define a structure and its member has different type,

struct yuv2rgb_config {
	...
	u16	in_x_size0;
	u16	in_y_size0;
	u16	in_x_size1;
	u16	in_y_size1;
	...
};

struct yuv2rgb_config *yuv2rgb_cfg;

yuv2rgb_cfg->in_x_size0 = dif_x(aie_cfg);
yuv2rgb_cfg->in_y_size0 = dif_y(aie_cfg);
yuv2rgb_cfg->in_x_size1 = dif_x(aie_cfg);
yuv2rgb_cfg->in_y_size1 = dif_y(aie_cfg);

This look more readable.

> +		} else {
> +			yuv2rgb_cfg[Y2R_IN_X_Y_SIZE0] = aie_cmb_u16(sr_crp_w - 1, sr_crp_h - 1);
> +			yuv2rgb_cfg[Y2R_IN_X_Y_SIZE1] = aie_cmb_u16(sr_crp_w - 1, sr_crp_h - 1);
> +		}
> +		yuv2rgb_cfg[Y2R_IN_STRIDE0_BUS_SIZE0] =
> +			(yuv2rgb_cfg[Y2R_IN_STRIDE0_BUS_SIZE0] & 0xFFF0) |
> +			((aie_cfg->src_img_stride << 16) & 0xFFFF0000) | 0x1;
> +		yuv2rgb_cfg[Y2R_IN_STRIDE1_BUS_SIZE1] =
> +			(yuv2rgb_cfg[Y2R_IN_STRIDE1_BUS_SIZE1] & 0xFFF0) |
> +			((aie_cfg->src_img_stride << 16) & 0xFFFF0000) | 0x1;
> +	} else if (aie_cfg->src_img_fmt == FMT_MONO) {
> +		yuv2rgb_cfg[Y2R_RA0_RA1_EN] =
> +			(yuv2rgb_cfg[Y2R_RA0_RA1_EN] & 0xFFFFFFEE) | 0x01;

You does not assign other bits of Y2R_RA0_RA1_EN, so you could set it as

yuv2rgb_cfg[Y2R_RA0_RA1_EN] = 0x1;


> +		if (aie_cfg->en_roi) {
> +			yuv2rgb_cfg[Y2R_IN_X_Y_SIZE0] = aie_cmb_u16(dif_x(aie_cfg), dif_y(aie_cfg));
> +			yuv2rgb_cfg[Y2R_IN_X_Y_SIZE1] = aie_cmb_u16(dif_x(aie_cfg), dif_y(aie_cfg));
> +		} else {
> +			yuv2rgb_cfg[Y2R_IN_X_Y_SIZE0] = aie_cmb_u16(sr_crp_w - 1, sr_crp_h - 1);
> +			yuv2rgb_cfg[Y2R_IN_X_Y_SIZE1] = aie_cmb_u16(sr_crp_w - 1, sr_crp_h - 1);
> +		}
> +		yuv2rgb_cfg[Y2R_IN_STRIDE0_BUS_SIZE0] =
> +			(yuv2rgb_cfg[Y2R_IN_STRIDE0_BUS_SIZE0] & 0xFFF0) |
> +			((aie_cfg->src_img_stride << 16) & 0xFFFF0000) | 0x0;
> +		yuv2rgb_cfg[Y2R_IN_STRIDE1_BUS_SIZE1] =
> +			(yuv2rgb_cfg[Y2R_IN_STRIDE1_BUS_SIZE1] & 0xFFF0) |
> +			((aie_cfg->src_img_stride << 16) & 0xFFFF0000) | 0x0;

I think you could define a structure and its member has different type,

struct yuv2rgb_config {
	...
	u16	in_bus_size0;
	u16	in_stride0;
	u16	in_bus_size1;
	u16	in_stride1;
	...
};

struct yuv2rgb_config *yuv2rgb_cfg;

yuv2rgb_cfg->in_bus_size0 = 0;
yuv2rgb_cfg->in_stride0 = aie_cfg->src_img_stride;
yuv2rgb_cfg->in_bus_size1 = 0;
yuv2rgb_cfg->in_stride1 = aie_cfg->src_img_stride;

This look more readable.

> +	} else if (aie_cfg->src_img_fmt == FMT_YUYV ||
> +		   aie_cfg->src_img_fmt == FMT_YVYU ||
> +		   aie_cfg->src_img_fmt == FMT_UYVY ||
> +		   aie_cfg->src_img_fmt == FMT_VYUY) {
> +		/* 1 plane */
> +		yuv2rgb_cfg[Y2R_RA0_RA1_EN] = (yuv2rgb_cfg[Y2R_RA0_RA1_EN] & 0xFFFFFFEE) | 0x1;
> +		if (aie_cfg->en_roi) {
> +			yuv2rgb_cfg[Y2R_IN_X_Y_SIZE0] = aie_cmb_u16(2 * (dif_x(aie_cfg) + 1) - 1,
> +								    dif_y(aie_cfg));
> +			yuv2rgb_cfg[Y2R_IN_X_Y_SIZE1] = aie_cmb_u16(2 * (dif_x(aie_cfg) + 1) - 1,
> +								    dif_y(aie_cfg));
> +		} else {
> +			yuv2rgb_cfg[Y2R_IN_X_Y_SIZE0] = aie_cmb_u16(2 * sr_crp_w - 1, sr_crp_h - 1);
> +			yuv2rgb_cfg[Y2R_IN_X_Y_SIZE1] = aie_cmb_u16(2 * sr_crp_w - 1, sr_crp_h - 1);
> +		}
> +		yuv2rgb_cfg[Y2R_IN_STRIDE0_BUS_SIZE0] =
> +			(yuv2rgb_cfg[Y2R_IN_STRIDE0_BUS_SIZE0] & 0xFFF0) |
> +			((aie_cfg->src_img_stride << 16) & 0xFFFF0000) | 0x3;
> +		yuv2rgb_cfg[Y2R_IN_STRIDE1_BUS_SIZE1] =
> +			(yuv2rgb_cfg[Y2R_IN_STRIDE1_BUS_SIZE1] & 0xFFF0) |
> +			((aie_cfg->src_img_stride << 16) & 0xFFFF0000) | 0x3;
> +	}
> +
> +	/* AIE3.0 */
> +	if (aie_cfg->src_img_fmt == FMT_YUV420_2P ||
> +	    aie_cfg->src_img_fmt == FMT_YUV420_1P) {
> +		yuv2rgb_cfg[Y2R_RA0_RA1_EN] =
> +			(yuv2rgb_cfg[Y2R_RA0_RA1_EN] & 0xFFFFFFEE) | 0x11;
> +		if (aie_cfg->en_roi) {
> +			yuv2rgb_cfg[Y2R_IN_X_Y_SIZE0] = aie_cmb_u16(dif_x(aie_cfg), dif_y(aie_cfg));
> +			yuv2rgb_cfg[Y2R_IN_X_Y_SIZE1] = aie_cmb_u16(dif_x(aie_cfg),
> +								    dif_y(aie_cfg) / 2);
> +		} else {
> +			yuv2rgb_cfg[Y2R_IN_X_Y_SIZE0] =
> +				aie_cmb_u16(sr_crp_w - 1, sr_crp_h - 1);
> +			yuv2rgb_cfg[Y2R_IN_X_Y_SIZE1] = aie_cmb_u16(sr_crp_w - 1,
> +								    sr_crp_h / 2 - 1);
> +		}
> +		yuv2rgb_cfg[Y2R_IN_STRIDE0_BUS_SIZE0] =
> +			(yuv2rgb_cfg[Y2R_IN_STRIDE0_BUS_SIZE0] & 0xFFF0) |
> +			((aie_cfg->src_img_stride << 16) & 0xFFFF0000) | 0x0;
> +		yuv2rgb_cfg[Y2R_IN_STRIDE1_BUS_SIZE1] =
> +			(yuv2rgb_cfg[Y2R_IN_STRIDE1_BUS_SIZE1] & 0xFFF0) |
> +			((aie_cfg->src_img_stride << 16) & 0xFFFF0000) | 0x0;
> +
> +		yuv2rgb_cfg[Y2R_CO2_FMT_MODE_EN] =
> +			(yuv2rgb_cfg[Y2R_CO2_FMT_MODE_EN] & 0xFFFFFFFE) | 0x01;
> +		if (aie_cfg->en_roi) {
> +			yuv2rgb_cfg[Y2R_CO2_CROP_X] = aie_cmb_u16(0, dif_x(aie_cfg));
> +			yuv2rgb_cfg[Y2R_CO2_CROP_Y] = aie_cmb_u16(0, dif_y(aie_cfg));
> +		} else {
> +			yuv2rgb_cfg[Y2R_CO2_CROP_X] = aie_cmb_u16(0, sr_crp_w - 1);
> +			yuv2rgb_cfg[Y2R_CO2_CROP_Y] = aie_cmb_u16(0, sr_crp_h - 1);
> +		}
> +	} else {
> +		yuv2rgb_cfg[Y2R_CO2_FMT_MODE_EN] =
> +			(yuv2rgb_cfg[Y2R_CO2_FMT_MODE_EN] & 0xFFFFFFFE);
> +
> +		if (aie_cfg->en_roi) {
> +			yuv2rgb_cfg[Y2R_CO2_CROP_X] = aie_cmb_u16(0, dif_x(aie_cfg));
> +			yuv2rgb_cfg[Y2R_CO2_CROP_Y] = aie_cmb_u16(0, dif_y(aie_cfg));
> +		} else {
> +			yuv2rgb_cfg[Y2R_CO2_CROP_X] = aie_cmb_u16(0, sr_crp_w - 1);
> +			yuv2rgb_cfg[Y2R_CO2_CROP_Y] = aie_cmb_u16(0, sr_crp_h - 1);
> +		}
> +	}
> +
> +	stride_pym0_out_w = round_up(pym0_out_w, 8);
> +
> +	yuv2rgb_cfg[Y2R_OUT_X_Y_SIZE0] =
> +		aie_cmb_u16(pym0_out_w - 1, pym0_out_h - 1);
> +	set_cmb_cfg(yuv2rgb_cfg, Y2R_OUT_STRIDE0_BUS_SIZE0, stride_pym0_out_w);
> +	yuv2rgb_cfg[Y2R_OUT_X_Y_SIZE1] =
> +		aie_cmb_u16(pym0_out_w - 1, pym0_out_h - 1);
> +	set_cmb_cfg(yuv2rgb_cfg, Y2R_OUT_STRIDE1_BUS_SIZE1, stride_pym0_out_w);
> +	yuv2rgb_cfg[Y2R_OUT_X_Y_SIZE2] =
> +		aie_cmb_u16(pym0_out_w - 1, pym0_out_h - 1);
> +	set_cmb_cfg(yuv2rgb_cfg, Y2R_OUT_STRIDE2_BUS_SIZE2, stride_pym0_out_w);
> +
> +	if (aie_cfg->en_padding) {
> +		yuv2rgb_cfg[Y2R_PADDING_EN_UP_DOWN] =
> +			1 | ((aie_cfg->src_padding.up << 4) & 0x1FF0) |
> +			((aie_cfg->src_padding.down << 16) & 0x01FF0000);
> +		yuv2rgb_cfg[Y2R_PADDING_RIGHT_LEFT] =
> +			(aie_cfg->src_padding.right & 0x01FF) |
> +			((aie_cfg->src_padding.left << 16) & 0x01FF0000);

I think you could define a structure and its member has different type,

struct yuv2rgb_config {
	...
	u16	padding_en:4;
	u16	padding_up:12;
	u16	padding_down;
	u16	padding_right;
	u16	padding_left;
	...
};

struct yuv2rgb_config *yuv2rgb_cfg;

yuv2rgb_cfg->padding_en = 1;
yuv2rgb_cfg->padding_up = aie_cfg->src_padding.up;
yuv2rgb_cfg->padding_down = aie_cfg->src_padding.down;
yuv2rgb_cfg->padding_right = aie_cfg->src_padding.right;
yuv2rgb_cfg->padding_left = aie_cfg->src_padding.left;

This look more readable.

> +	} else {
> +		yuv2rgb_cfg[Y2R_PADDING_EN_UP_DOWN] = 0;
> +		yuv2rgb_cfg[Y2R_PADDING_RIGHT_LEFT] = 0;
> +	}
> +
> +	yuv2rgb_cfg[Y2R_IN_0] = srcbuf;
> +	yuv2rgb_cfg[Y2R_IN_1] = srcbuf_UV;
> +
> +	yuv2rgb_cfg[Y2R_OUT_0] = (u32)fd->base_para->rs_pym_rst_pa[0][0];
> +	yuv2rgb_cfg[Y2R_OUT_1] = (u32)fd->base_para->rs_pym_rst_pa[0][1];
> +	yuv2rgb_cfg[Y2R_OUT_2] = (u32)fd->base_para->rs_pym_rst_pa[0][2];
> +
> +	yuv2rgb_cfg[Y2R_X_Y_MAG] = (xmag_0 & 0x3FFF) | ((ymag_0 << 16) & 0x3FFF0000);
> +
> +	if (sr_crp_w >= pym0_out_w) {
> +		/* down scale AIE1.0 by FRZ */
> +		yuv2rgb_cfg[Y2R_RS_SEL_SRZ_EN] =
> +			(yuv2rgb_cfg[Y2R_RS_SEL_SRZ_EN] & 0x00100070);
> +		yuv2rgb_cfg[Y2R_SRZ_HORI_STEP] = 0;
> +		yuv2rgb_cfg[Y2R_SRZ_VERT_STEP] = 0;
> +	} else {
> +		/* SRZ */
> +		/* 0: FDRZ for down scaling */
> +		/* 1: SRZ for up scaling */
> +		yuv2rgb_cfg[Y2R_RS_SEL_SRZ_EN] =
> +			(yuv2rgb_cfg[Y2R_RS_SEL_SRZ_EN] & 0x00100070) | SRZ_BIT;
> +		yuv2rgb_cfg[Y2R_SRZ_HORI_STEP] = ((sr_crp_w - 1) << 15) / (pym0_out_w - 1);
> +		yuv2rgb_cfg[Y2R_SRZ_VERT_STEP] = ((sr_crp_h - 1) << 15) / (pym0_out_h - 1);
> +	}
> +
> +	yuv2rgb_cfg[Y2R_CON_IN_BA_MSB] = (u32)0x02020202;
> +	yuv2rgb_cfg[Y2R_CON_OUT_BA_MSB] = (u32)0x02020202;
> +
> +	return 0;
> +}
> +
> +static int aie_config_rs(struct mtk_aie_dev *fd, struct aie_enq_info *aie_cfg)
> +{
> +	u32 *rs_cfg;
> +	u32 *rs_tbl[2];
> +	u16 xmag_0, ymag_0;
> +	u16 pym_out_w[3];
> +	u16 pym_out_h[3];
> +	u16 round_w;
> +	u16 sr_crp_w;
> +	u16 sr_crp_h;
> +	int i;
> +
> +	sr_crp_w = fd->base_para->crop_rect.width;
> +	sr_crp_h = fd->base_para->crop_rect.height;
> +
> +	rs_cfg = (u32 *)fd->base_para->fd_rs_cfg_va;
> +
> +	pym_out_w[0] = fd->base_para->pyramid_rect.width;
> +	pym_out_w[1] = pym_out_w[0] >> 1;
> +	pym_out_w[2] = pym_out_w[1] >> 1;
> +
> +	pym_out_h[0] = pym_out_w[0] * sr_crp_h / sr_crp_w;

In aie_update_table(), img_height is equal to pym_height, but here is not.

	pstv->inf_elm[PYMX_START_LOOP(0)].img_width = pym_width;
	pstv->inf_elm[PYMX_START_LOOP(0)].img_height = pym_height;

Which one is correct? If both are correct, add comment to describe why these two are different.

> +	pym_out_h[1] = pym_out_h[0] >> 1;
> +	pym_out_h[2] = pym_out_h[1] >> 1;
> +
> +	for (i = 0; i < 2; i++) {
> +		rs_tbl[i] = rs_cfg + fd->variant->rs_cfg_size * i;

Because you would not use rs_tbl[0] and rs_tbl[1] at the same time,
it's not necessary to define rs_tbl as an array. Just define it as

u32 *rs_tbl;

> +
> +		rs_tbl[i][RS_IN_0] = (u32)fd->base_para->rs_pym_rst_pa[i][0];
> +		rs_tbl[i][RS_IN_1] = (u32)fd->base_para->rs_pym_rst_pa[i][1];
> +		rs_tbl[i][RS_IN_2] = (u32)fd->base_para->rs_pym_rst_pa[i][2];
> +
> +		rs_tbl[i][RS_OUT_0] = (u32)fd->base_para->rs_pym_rst_pa[i + 1][0];
> +		rs_tbl[i][RS_OUT_1] = (u32)fd->base_para->rs_pym_rst_pa[i + 1][1];
> +		rs_tbl[i][RS_OUT_2] = (u32)fd->base_para->rs_pym_rst_pa[i + 1][2];
> +
> +		rs_tbl[i][RS_INPUT_W_H] = (rs_tbl[i][RS_INPUT_W_H] & 0xF800F800) |
> +			(pym_out_h[i] & 0x7FF) | ((pym_out_w[i] << 16) & 0x7FF0000);
> +		rs_tbl[i][RS_OUTPUT_W_H] = (rs_tbl[i][RS_OUTPUT_W_H] & 0xF800F800) |
> +			(pym_out_h[i + 1] & 0x7FF) | ((pym_out_w[i + 1] << 16) & 0x7FF0000);

I think you could define a structure and its member has different type,

struct rs_table {
	...
	u16	in_h;
	u16	in_w;
	u16	out_h;
	u16	out_w;
	...
};

struct rs_table *rs_tbl;

rs_tbl->in_h = pym_out_h[i];
rs_tbl->in_w = pym_out_w[i];
rs_tbl->out_h = pym_out_h[i + 1];
rs_tbl->out_w = pym_out_w[i + 1];

This look more readable.

> +
> +		rs_tbl[i][RS_IN_X_Y_SIZE0] = aie_cmb_u16(pym_out_w[i] - 1, pym_out_h[i] - 1);
> +		rs_tbl[i][RS_IN_X_Y_SIZE1] = aie_cmb_u16(pym_out_w[i] - 1, pym_out_h[i] - 1);
> +		rs_tbl[i][RS_IN_X_Y_SIZE2] = aie_cmb_u16(pym_out_w[i] - 1, pym_out_h[i] - 1);
> +
> +		set_cmb_cfg(rs_tbl[i], RS_IN_STRIDE0, pym_out_w[i]);
> +		set_cmb_cfg(rs_tbl[i], RS_IN_STRIDE1, pym_out_w[i]);
> +		set_cmb_cfg(rs_tbl[i], RS_IN_STRIDE2, pym_out_w[i]);
> +
> +		rs_tbl[i][RS_OUT_X_Y_SIZE0] = aie_cmb_u16(pym_out_w[i + 1] - 1,
> +							  pym_out_h[i + 1] - 1);
> +		rs_tbl[i][RS_OUT_X_Y_SIZE1] = aie_cmb_u16(pym_out_w[i + 1] - 1,
> +							  pym_out_h[i + 1] - 1);
> +		rs_tbl[i][RS_OUT_X_Y_SIZE2] = aie_cmb_u16(pym_out_w[i + 1] - 1,
> +							  pym_out_h[i + 1] - 1);
> +
> +		if (i == 0)
> +			round_w = pym_out_w[i + 1];
> +		else
> +			round_w = round_up(pym_out_w[i + 1], 8);
> +
> +		set_cmb_cfg(rs_tbl[i], RS_OUT_STRIDE0, round_w);
> +		set_cmb_cfg(rs_tbl[i], RS_OUT_STRIDE1, round_w);
> +		set_cmb_cfg(rs_tbl[i], RS_OUT_STRIDE2, round_w);
> +
> +		xmag_0 = 512 * pym_out_w[i] / pym_out_w[i + 1];
> +		ymag_0 = xmag_0;
> +
> +		rs_tbl[i][RS_X_Y_MAG] = (xmag_0 & 0x3FFF) | ((ymag_0 << 16) & 0x3FFF0000);
> +		rs_tbl[i][RS_CON_IN_BA_MSB] = (u32)0x02020202;
> +		rs_tbl[i][RS_CON_OUT_BA_MSB] = (u32)0x02020202;
> +	}
> +
> +	return 0;
> +}
> +
> +static int aie_config_network(struct mtk_aie_dev *fd,
> +			      struct aie_enq_info *aie_cfg)
> +{
> +	struct aie_static_info *pstv = &fd->st_info;
> +	u16 pyramid0_out_w, pyramid0_out_h, pyramid1_out_h, pyramid2_out_h;
> +	u16 out_ysize_minus_1, out_ysize_minus_1_stride2;
> +	u16 fd_xsize[4], size, poolsize, stridesize;
> +	u16 input_height, out_height;
> +	u16 conv_width, conv_height;
> +	u32 *fd_cur_cfg, *fd_cur_set;
> +	u32 sr_crp_w, sr_crp_h;
> +	u32 cal_x, cal_y;
> +	u8 uch, uloop;
> +	u8 i, j;
> +	void *fd_cfg;
> +
> +	sr_crp_w = fd->base_para->crop_rect.width;
> +	sr_crp_h = fd->base_para->crop_rect.height;
> +
> +	pyramid0_out_w = fd->base_para->pyramid_rect.width;
> +	pyramid0_out_h = pyramid0_out_w * sr_crp_h / sr_crp_w;
> +
> +	pyramid1_out_h = pyramid0_out_h / 2;
> +	pyramid2_out_h = pyramid1_out_h / 2;
> +
> +	fd_cfg = fd->base_para->fd_fd_cfg_va;
> +
> +	for (i = 0; i < FD_LOOP_NUM; i++) {

I think this for-loop could be changed to nest for-loop and the code would be more simple.
Below comment is based on the nest for-loop.

struct fd_config {
	...
};

fd_cur_cfg = (struct fd_config *)fd_cfg;
for (i = 0; i < RPN_NUM; i++) {
	for (j = 0; j < RPN_LOOP_NUM; j++, fd_cur_cfg++) {

> +		fd_cur_cfg = (u32 *)fd_cfg + fd->variant->fd_cfg_size * i;

Drop this.

> +		fd_cur_cfg[FD_INPUT_ROTATE] = (fd_cur_cfg[FD_INPUT_ROTATE] & 0xFFFF0FFF) |
> +				     ((aie_cfg->rotate_degree << 12) & 0x3000);

struct fd_config {
	...
	u32	input_rotate;
	...
};

fd_cur_cfg->input_rotate = aie_cfg->rotate_degree << 12;

This looks more readable.

> +
> +		if (i == 0)
> +			input_height = pyramid2_out_h;
> +		else if (i == (RPNX_LOOP_NUM(2) + 1))
> +			input_height = pyramid1_out_h;
> +		else if (i == (RPNX_LOOP_NUM(1) + 1))
> +			input_height = pyramid0_out_h;
> +		else
> +			if (fd_out_stride2_in[i] == 0)
> +				input_height = out_height;
> +			else
> +				input_height = (out_height + 1) / 2;

Define pyramid0_out_h, pyramid1_out_h, pyramid2_out_h as an array pyramid_out_h[],
and

#define FD_OUT_STRIDE2_IN(n)	((n == 5) || (n == 11) ? 1 : 0)

if (j == 0)
	input_height = pyramid_out_h[2 - i];
else if (FD_OUT_STRIDE2_IN(j) == 0)
	input_height = out_height;
else
	input_height = (out_height + 1) / 2;

This looks more simple.

> +
> +		poolsize = is_fd_maxpool(i);
> +		stridesize = get_fd_stride(i);

poolsize = (j == 1) ? 1 : 0;
stridesize = (j == 0) ? 2 : 1;

This looks more simple.

> +		if (poolsize == 1)
> +			out_height =
> +				DIV_ROUND_UP(input_height, 2 * poolsize);
> +		else
> +			out_height = DIV_ROUND_UP(input_height, stridesize + 2 * poolsize);

When j == 0, poolsize = 0, stridesize = 2, then

out_height = DIV_ROUND_UP(input_height, 2);

When j == 1, poolsize = 1, stridesize = 1, then

out_height = DIV_ROUND_UP(input_height, 2);

When j > 1, poolsize = 0, stridesize = 1, then

out_height = DIV_ROUND_UP(input_height, 1);

So these could be simplified as

out_height = DIV_ROUND_UP(input_height, j > 1 ? 1 : 2);

> +
> +		if (i == RPNX_LOOP_NUM(0) || i == RPNX_LOOP_NUM(1) || i == RPNX_LOOP_NUM(2)) {

if (j == (RPN_LOOP_NUM - 1)) {

> +			conv_width = fd->base_para->img_rect.width;
> +			conv_height = fd->base_para->img_rect.height;
> +			fd_xsize[0] = pstv->inf_elm[i].img_width * 2 * 16 * ANCHOR_EN_NUM - 1;

fd_xsize[0] = pstv->inf_elm[i * RPN_LOOP_NUM + j].img_width * 2 * 16 * ANCHOR_EN_NUM - 1;

> +			fd_xsize[3] = pstv->inf_elm[i].img_width * 2 * 32 * ANCHOR_EN_NUM - 1;
> +			fd_xsize[2] = fd_xsize[3];
> +			fd_xsize[1] = fd_xsize[2];
> +		} else {
> +			conv_width = DIV_ROUND_UP(pstv->inf_elm[i].img_width, stridesize);
> +			conv_height = DIV_ROUND_UP(input_height, stridesize);
> +
> +			fd_xsize[3] = pstv->inf_elm[i].input_xsize_plus_1 - 1;
> +			fd_xsize[2] = fd_xsize[3];
> +			fd_xsize[1] = fd_xsize[2];
> +			fd_xsize[0] = fd_xsize[1];
> +		}
> +
> +		fd_cur_cfg[FD_CONV_WIDTH_MOD6] = (fd_cur_cfg[FD_CONV_WIDTH_MOD6] & 0xFF8FFFFF) |
> +						 (((conv_width % 6) << 20) & 0x00700000);
> +		fd_cur_cfg[FD_CONV_IMG_W_H] = aie_cmb_u16(conv_height, conv_width);

struct fd_config {
	...
	u32	conv_width_mod6;
	u16	conv_height;
	u16	conv_width;
	...
};

fd_cur_cfg->conv_width_mod6 = (conv_width % 6) << 20;
fd_cur_cfg->conv_width = conv_width;
fd_cur_cfg->conv_height = conv_height;

This looks more simple.

> +
> +		fd_cur_cfg[FD_IN_IMG_W_H] = aie_cmb_u16(input_height, pstv->inf_elm[i].img_width);
> +		fd_cur_cfg[FD_OUT_IMG_W_H] = aie_cmb_u16(out_height, pstv->inf_elm[i].out_width);
> +
> +		if (fd_rdma_en[i][0][0] != -1) {

This is always true, so drop this checking.

> +			for (j = 0; j < 4; j++) {
> +				fd_cur_cfg[FD_IN_X_Y_SIZE0 + 2 * j] =
> +					aie_cmb_u16(fd_xsize[j], input_height - 1);
> +				set_cmbst_cfg(fd_cur_cfg, FD_IN_STRIDE0_BUS_SIZE0 + 2 * j,
> +					      fd_xsize[j] + 1);
> +			}

struct fd_in_config {
	u16	xsize;
	u16	height;
	u16	bus_size;
	u16	stride;
};

struct fd_config {
	...
	struct fd_in_config fd_in[INPUT_WDMA_WRA_NUM];
	...
};

for (k = 0; k < INPUT_WDMA_WRA_NUM; k++) {
	fd_cur_cfg->fd_in[i].xsize = fd_xsize[k];
	fd_cur_cfg->fd_in[i].height = input_height - 1;
	fd_cur_cfg->fd_in[i].stride = fd_xsize[j] + 1;
}

This looks more readable.

> +		}
> +
> +		out_ysize_minus_1 = out_height - 1;
> +		out_ysize_minus_1_stride2 = (out_height + 1) / 2 - 1;
> +
> +		for (j = 0; j < OUTPUT_WDMA_WRA_NUM; j++) {
> +			fd_cur_set = fd_cur_cfg + 2 * j;
> +			if (!out_stride_size[i][j])
> +				continue;
> +
> +			if (out_stride_size[i][j] == 1) {
> +				fd_cur_set[FD_OUT_X_Y_SIZE0] =
> +					aie_cmb_u16(pstv->inf_elm[i].out_xsize_plus_1 - 1,
> +						    out_ysize_minus_1);
> +				set_cmbst_cfg(fd_cur_set, FD_OUT_STRIDE0_BUS_SIZE0,
> +					      pstv->inf_elm[i].out_stride);
> +			} else if (out_stride_size[i][j] == 2) {
> +				fd_cur_set[FD_OUT_X_Y_SIZE0] =
> +					aie_cmb_u16(pstv->inf_elm[i].out_xsize_plus_1_stride2 - 1,
> +						    out_ysize_minus_1_stride2);
> +				set_cmbst_cfg(fd_cur_set, FD_OUT_STRIDE0_BUS_SIZE0,
> +					      pstv->inf_elm[i].out_stride_stride2);
> +			}
> +		}

static const unsigned int out_stride_size[RPN_LOOP_NUM][OUTPUT_WDMA_WRA_NUM] = {
	{ 1, 0, 0, 0 }, { 1, 0, 2, 0 }, { 1, 0, 2, 0 }, { 1, 0, 0, 0 },
	{ 1, 1, 2, 2 }, { 1, 1, 2, 2 }, { 1, 0, 0, 0 }, { 1, 0, 2, 0 },
	{ 1, 1, 0, 0 }, { 1, 0, 0, 0 }, { 1, 0, 2, 0 }, { 1, 1, 0, 0 },
	{ 1, 0, 0, 0 }, { 1, 0, 0, 0 }, { 1, 0, 0, 0 }, { 1, 0, 0, 0 },
	{ 1, 0, 0, 0 }, { 1, 0, 0, 0 }, { 1, 1, 0, 0 }, { 1, 1, 0, 0 },
	{ 1, 1, 0, 0 }, { 1, 0, 0, 0 }, { 1, 1, 1, 1 }, { 1, 1, 1, 1 },
	{ 1, 1, 0, 0 }, { 1, 1, 0, 0 }, { 1, 1, 0, 0 }, { 1, 0, 0, 0 },
	{ 3, 0, 0, 0 }
};

struct fd_out_config {
	u16	xsize;
	u16	ysize_minus_1;
	u16	bus_size;
	u16	stride;
};

struct fd_config {
	...
	struct fd_out_config fd_out[OUTPUT_WDMA_WRA_NUM];
	...
};

for (k = 0; k < OUTPUT_WDMA_WRA_NUM; k++) {
	if (!out_stride_size[j][k])
		continue;

	if (out_stride_size[j][k] == 1) {
		fd_cur_cfg->fd_out[j].xsize = pstv->inf_elm[i].out_xsize_plus_1 - 1;
		fd_cur_cfg->fd_out[j].ysize_minus_1 = out_ysize_minus_1;
		fd_cur_cfg->fd_out[j].stride = pstv->inf_elm[i].out_stride;
	} else if (out_stride_size[j][k] == 2) {
		fd_cur_cfg->fd_out[j].xsize = pstv->inf_elm[i].out_xsize_plus_1_stride2 - 1;
		fd_cur_cfg->fd_out[j].ysize_minus_1 = out_ysize_minus_1_stride2;
		fd_cur_cfg->fd_out[j].stride = pstv->inf_elm[i].out_stride_stride2;
	}
}

This looks more readable.

> +
> +		if (i == RPNX_LOOP_NUM(0) || i == RPNX_LOOP_NUM(1) || i == RPNX_LOOP_NUM(2))

if (j == (RPN_LOOP_NUM - 1))

> +			set_cmb_cfg(fd_cur_cfg, FD_RPN_SET, fd->base_para->rpn_anchor_thrd);
> +
> +		if (i == RPNX_LOOP_NUM(0)) {
> +			cal_x = ((sr_crp_w << 10) * 100 /
> +				 (int)fd->base_para->pyramid_rect.width) >> 10;
> +			cal_y = cal_x * 512 / 100;
> +			fd_cur_cfg[FD_IMAGE_COORD] = (fd_cur_cfg[FD_IMAGE_COORD] & 0xF) |
> +						     ((cal_y << 4) & 0x7FFF0);
> +			fd_cur_cfg[FD_IMAGE_COORD_XY_OFST] = 0;
> +			if (aie_cfg->en_roi) {
> +				fd_cur_cfg[FD_IMAGE_COORD_XY_OFST] =
> +					(aie_cfg->src_roi.x1 - aie_cfg->src_padding.left) |
> +					(aie_cfg->src_roi.y1 - aie_cfg->src_padding.up) << 16;
> +			}
> +		} else if (i == RPNX_LOOP_NUM(1)) {
> +			cal_x = ((sr_crp_w << 10) * 100 /
> +				(int)fd->base_para->pyramid_rect.width) >> 10;
> +			cal_y = cal_x * 2 * 512 / 100;
> +			fd_cur_cfg[FD_IMAGE_COORD] = (fd_cur_cfg[FD_IMAGE_COORD] & 0xF) |
> +						     ((cal_y << 4) & 0x7FFF0);
> +			fd_cur_cfg[FD_IMAGE_COORD_XY_OFST] = 0;
> +			if (aie_cfg->en_roi) {
> +				fd_cur_cfg[FD_IMAGE_COORD_XY_OFST] =
> +					(aie_cfg->src_roi.x1 - aie_cfg->src_padding.left) |
> +					(aie_cfg->src_roi.y1 - aie_cfg->src_padding.up) << 16;
> +			}
> +		} else if (i == RPNX_LOOP_NUM(2)) {
> +			cal_x = ((sr_crp_w << 10) * 100 /
> +				(int)fd->base_para->pyramid_rect.width) >> 10;
> +			cal_y = cal_x * 4 * 512 / 100;
> +			fd_cur_cfg[FD_IMAGE_COORD] = (fd_cur_cfg[FD_IMAGE_COORD] & 0xF) |
> +						     ((cal_y << 4) & 0x7FFF0);
> +			fd_cur_cfg[FD_IMAGE_COORD_XY_OFST] = 0;
> +			if (aie_cfg->en_roi) {
> +				fd_cur_cfg[FD_IMAGE_COORD_XY_OFST] =
> +					(aie_cfg->src_roi.x1 - aie_cfg->src_padding.left) |
> +					(aie_cfg->src_roi.y1 - aie_cfg->src_padding.up) << 16;
> +			}
> +		}

struct fd_config {
	...
	u8	img_coord_x:4;
	u8	img_coord_y:4;
	u32	img_coord_reserved:24;
	u16	img_coord_x_ofst;
	u16	img_coord_y_ofst;
	...
};

if (j == (RPN_LOOP_NUM - 1)) {
	cal_x = ((sr_crp_w << 10) * 100 /
		 (int)fd->base_para->pyramid_rect.width) >> 10;
	cal_y = cal_x * (1 << (2 - i)) * 512 / 100;
	fd_cur_cfg->img_coord_x = fd_cur_cfg[FD_IMAGE_COORD];
	fd_cur_cfg->img_coord_y = cal_y;
	fd_cur_cfg->img_coord_x_ofst = 0;
	fd_cur_cfg->img_coord_y_ofst = 0;

	if (aie_cfg->en_roi) {
		fd_cur_cfg->img_coord_x_ofst = aie_cfg->src_roi.x1 - aie_cfg->src_padding.left;
		fd_cur_cfg->img_coord_y_ofst = aie_cfg->src_roi.y1 - aie_cfg->src_padding.up;
	}
}

This would shrink the code size and looks readable.

> +
> +		/* IN_FM_BASE_ADR */
> +		if (i == 0) {
> +			fd_cur_cfg[FD_IN_0] = (u32)(fd->base_para->rs_pym_rst_pa[2][0]);
> +			fd_cur_cfg[FD_IN_1] = (u32)(fd->base_para->rs_pym_rst_pa[2][1]);
> +			fd_cur_cfg[FD_IN_2] = (u32)(fd->base_para->rs_pym_rst_pa[2][2]);
> +		} else if (i == (RPNX_LOOP_NUM(2) + 1)) {
> +			fd_cur_cfg[FD_IN_0] = (u32)(fd->base_para->rs_pym_rst_pa[1][0]);
> +			fd_cur_cfg[FD_IN_1] = (u32)(fd->base_para->rs_pym_rst_pa[1][1]);
> +			fd_cur_cfg[FD_IN_2] = (u32)(fd->base_para->rs_pym_rst_pa[1][2]);
> +		} else if (i == (RPNX_LOOP_NUM(1) + 1)) {
> +			fd_cur_cfg[FD_IN_0] = (u32)(fd->base_para->rs_pym_rst_pa[0][0]);
> +			fd_cur_cfg[FD_IN_1] = (u32)(fd->base_para->rs_pym_rst_pa[0][1]);
> +			fd_cur_cfg[FD_IN_2] = (u32)(fd->base_para->rs_pym_rst_pa[0][2]);
> +		} else {
> +			for (j = 0; j < INPUT_WDMA_WRA_NUM; j++) {
> +				if (fd_rdma_en[i][j][0] != -1) {
> +					uloop = fd_rdma_en[i][j][0];
> +					uch = fd_rdma_en[i][j][1];
> +					fd_cur_cfg[FD_IN_0 + j] =
> +						(u32)(fd->dma_para->fd_out_hw_pa[uloop][uch]);
> +				}
> +			}
> +		}

static const signed int fd_rdma_en[RPN_LOOP_NUM][INPUT_WDMA_WRA_NUM][2] = {
	{ { 99, 99 }, { 99, 99 }, { 99, 99 }, { -1, -1 } },
	{ { 0, 0 }, { -1, -1 }, { -1, -1 }, { -1, -1 } },
	{ { 1, 0 }, { -1, -1 }, { -1, -1 }, { -1, -1 } },
	{ { 1, 0 }, { 2, 0 }, { -1, -1 }, { -1, -1 } },
	{ { 3, 0 }, { -1, -1 }, { -1, -1 }, { -1, -1 } },
	{ { 1, 2 }, { 2, 2 }, { 4, 2 }, { 4, 3 } },
	{ { 5, 0 }, { 5, 1 }, { -1, -1 }, { -1, -1 } },
	{ { 6, 0 }, { -1, -1 }, { -1, -1 }, { -1, -1 } },
	{ { 5, 0 }, { 5, 1 }, { 7, 0 }, { -1, -1 } },
	{ { 8, 0 }, { 8, 1 }, { -1, -1 }, { -1, -1 } },
	{ { 9, 0 }, { -1, -1 }, { -1, -1 }, { -1, -1 } },
	{ { 5, 2 }, { 5, 3 }, { 7, 2 }, { 10, 2 } },
	{ { 11, 0 }, { 11, 1 }, { -1, -1 }, { -1, -1 } },
	{ { 12, 0 }, { -1, -1 }, { -1, -1 }, { -1, -1 } },
	{ { 13, 0 }, { -1, -1 }, { -1, -1 }, { -1, -1 } },
	{ { 11, 0 }, { 11, 1 }, { 14, 0 }, { -1, -1 } },
	{ { 15, 0 }, { -1, -1 }, { -1, -1 }, { -1, -1 } },
	{ { 16, 0 }, { -1, -1 }, { -1, -1 }, { -1, -1 } },
	{ { 11, 0 }, { 11, 1 }, { 14, 0 }, { 17, 0 } },
	{ { 18, 0 }, { 18, 1 }, { -1, -1 }, { -1, -1 } },
	{ { 18, 0 }, { 18, 1 }, { -1, -1 }, { -1, -1 } },
	{ { 18, 0 }, { 18, 1 }, { -1, -1 }, { -1, -1 } },
	{ { 18, 0 }, { 18, 1 }, { -1, -1 }, { -1, -1 } },
	{ { 18, 0 }, { 18, 1 }, { -1, -1 }, { -1, -1 } },
	{ { 18, 0 }, { 18, 1 }, { -1, -1 }, { -1, -1 } },
	{ { 18, 0 }, { 18, 1 }, { -1, -1 }, { -1, -1 } },
	{ { 18, 0 }, { 18, 1 }, { -1, -1 }, { -1, -1 } },
	{ { 18, 0 }, { 18, 1 }, { -1, -1 }, { -1, -1 } },
	{ { 19, 0 }, { 22, 0 }, { 22, 1 }, { 25, 0 } }
};

struct fd_config {
	...
	u32 in[INPUT_WDMA_WRA_NUM];
	...
};

if (j == 0) {
	fd_cur_cfg->in_0 = fd->base_para->rs_pym_rst_pa[2 - i][0];
	fd_cur_cfg->in_1 = fd->base_para->rs_pym_rst_pa[2 - i][1];
	fd_cur_cfg->in_2 = fd->base_para->rs_pym_rst_pa[2 - i][2];
} else {
	for (k = 0; k < INPUT_WDMA_WRA_NUM; k++) {
		if (fd_rdma_en[j][k][0] != -1) {
			uloop = fd_rdma_en[j][k][0] + i * RPN_LOOP_NUM;
			uch = fd_rdma_en[j][k][1];
			fd_cur_cfg->in[k] =
				fd->dma_para->fd_out_hw_pa[uloop][uch];
		}
	}
}

This looks more simple.

> +
> +		/* OUT_FM_BASE_ADR */
> +		for (j = 0; j < OUTPUT_WDMA_WRA_NUM; j++) {
> +			if (out_stride_size[i][j])
> +				fd_cur_cfg[FD_OUT_0 + j] =
> +					(u32)(fd->dma_para->fd_out_hw_pa[i][j]);
> +		}
> +
> +		/* KERNEL_BASE_ADR */
> +		for (j = 0; j < KERNEL_RDMA_RA_NUM; j++) {
> +			size = get_fd_ker_rdma_size(i, j);
> +			if (size)
> +				fd_cur_cfg[FD_KERNEL_0 + j] =
> +					(u32)(fd->dma_para->fd_kernel_pa[i][j]);
> +		}
> +
> +		fd_cur_cfg[FD_CON_IN_BA_MSB] = (u32)0x02020202;

It's not necessary to case to u32.

> +		fd_cur_cfg[FD_CON_OUT_BA_MSB] = (u32)0x02020202;
> +		fd_cur_cfg[FD_CON_KERNEL_BA_MSB] = (u32)0x00000202;
> +	}
> +
> +	return 0;
> +}
> +
> +static int aie_config_attr_network(struct mtk_aie_dev *fd,
> +				   struct aie_enq_info *aie_cfg)
> +{
> +	bool is_regression_loop;
> +	void *fd_cfg;
> +	u32 *fd_cur_cfg;
> +	u16 fd_input_ht, fd_output_ht;
> +	u16 fd_out_y[4];
> +	u8 i, j;
> +	u8 uloop, uch, uidx;
> +	u16 pyramid0_out_w, pyramid0_out_h;
> +	int fd_conv_ht;
> +	u16 sr_crp_w, sr_crp_h;
> +
> +	sr_crp_w = fd->attr_para->crop_width[fd->attr_para->w_idx];
> +	sr_crp_h = fd->attr_para->crop_height[fd->attr_para->w_idx];
> +
> +	pyramid0_out_w = ATTR_MODE_PYRAMID_WIDTH;
> +	pyramid0_out_h = pyramid0_out_w * sr_crp_h / sr_crp_w;
> +
> +	fd_cfg = fd->base_para->attr_fd_cfg_va[fd->attr_para->w_idx];
> +
> +	for (i = 0; i < ATTR_LOOP_NUM; i++) {
> +		fd_cur_cfg = (u32 *)fd_cfg + fd->variant->fd_cfg_size * i;
> +		fd_cur_cfg[FD_INPUT_ROTATE] =
> +			(fd_cur_cfg[FD_INPUT_ROTATE] & 0xFFFF0FFF) |
> +			((aie_cfg->rotate_degree << 12) & 0x3000);
> +		if (i == 0)
> +			fd_input_ht = pyramid0_out_h;
> +		else
> +			if (attr_out_stride2_as_in[i] == 0)
> +				fd_input_ht = fd_output_ht;
> +			else if (attr_out_stride2_as_in[i] == 1)
> +				fd_input_ht = (fd_output_ht + 1) / 2;
> +
> +		fd_output_ht = DIV_ROUND_UP(fd_input_ht, ATTR_FD_STRIDE(i) +
> +					    2 * ATTR_FD_MAXPOOL(i));
> +		fd_conv_ht = DIV_ROUND_UP(fd_input_ht, ATTR_FD_STRIDE(i));
> +
> +		fd_cur_cfg[FD_CONV_IMG_W_H] =
> +			(fd_cur_cfg[FD_CONV_IMG_W_H] & 0xFFFF0000) |
> +			(fd_conv_ht & 0xFFFF);
> +		fd_cur_cfg[FD_IN_IMG_W_H] =
> +			(fd_cur_cfg[FD_IN_IMG_W_H] & 0xFFFF0000) |
> +			(fd_input_ht & 0xFFFF);
> +		fd_cur_cfg[FD_OUT_IMG_W_H] =
> +			(fd_cur_cfg[FD_OUT_IMG_W_H] & 0xFFFF0000) |
> +			(fd_output_ht & 0xFFFF);


I think you could define a structure and its member has different type,

struct fd_cur_config {
	...
	u16	in_w;
	u16	in_h;
	u16	out_w;
	u16	out_h;
	...
};

struct fd_cur_config *fd_cur_cfg;

fd_cur_cfg->in_w = fd_input_ht;
fd_cur_cfg->out_w = fd_output_ht;

This look more readable.

> +		set_cmb_cfg(fd_cur_cfg, FD_IN_X_Y_SIZE0, fd_input_ht - 1);
> +		set_cmb_cfg(fd_cur_cfg, FD_IN_X_Y_SIZE1, fd_input_ht - 1);
> +		set_cmb_cfg(fd_cur_cfg, FD_IN_X_Y_SIZE2, fd_input_ht - 1);
> +		set_cmb_cfg(fd_cur_cfg, FD_IN_X_Y_SIZE3, fd_input_ht - 1);
> +
> +		is_regression_loop = (i == AGE_OUT_RGS || i == GENDER_OUT_RGS ||
> +					      i == INDIAN_OUT_RGS || i == ETHNICITY_OUT_RGS);
> +
> +		if (is_regression_loop) {
> +			fd_out_y[0] = 0;
> +			fd_out_y[1] = 0;
> +			fd_out_y[2] = 0;
> +			fd_out_y[3] = 0;
> +		} else {
> +			fd_out_y[0] = fd_output_ht - 1;
> +			fd_out_y[1] = fd_output_ht - 1;
> +			if (attr_out_2size[i] == 0) {
> +				fd_out_y[2] = fd_output_ht - 1;
> +				fd_out_y[3] = fd_output_ht - 1;
> +			} else {
> +				fd_out_y[2] = (fd_output_ht + 1) / 2 - 1;
> +				fd_out_y[3] = (fd_output_ht + 1) / 2 - 1;
> +			}
> +		}
> +
> +		for (j = 0; j < 4; j++)
> +			set_cmb_cfg(fd_cur_cfg, FD_OUT_X_Y_SIZE0 + 2 * j, fd_out_y[j]);
> +
> +		/* IN_FM_BASE_ADR */
> +		if (i == 0) {
> +			fd_cur_cfg[FD_IN_0] = (u32)(fd->base_para->rs_pym_rst_pa[0][0]);
> +			fd_cur_cfg[FD_IN_1] = (u32)(fd->base_para->rs_pym_rst_pa[0][1]);
> +			fd_cur_cfg[FD_IN_2] = (u32)(fd->base_para->rs_pym_rst_pa[0][2]);
> +		} else {
> +			for (j = 0; j < INPUT_WDMA_WRA_NUM; j++) {
> +				if (attr_rdma_en[i][j][0] != -1) {
> +					uloop = attr_rdma_en[i][j][0];
> +					uch = attr_rdma_en[i][j][1];
> +					fd_cur_cfg[FD_IN_0 + j] =
> +						(u32)(fd->dma_para->attr_out_hw_pa[uloop][uch]);
> +				}
> +			}
> +		}
> +
> +		/* OUT_FM_BASE_ADR */
> +		for (j = 0; j < OUTPUT_WDMA_WRA_NUM; j++) {
> +			if (attr_wdma_en[i][j]) {
> +				uidx = fd->attr_para->w_idx;
> +				if (i == AGE_OUT_RGS && j == 0)
> +					fd_cur_cfg[FD_OUT_0 + j] =
> +						(u32)(fd->dma_para->age_out_hw_pa[uidx]);
> +				else if (i == GENDER_OUT_RGS && j == 0)
> +					fd_cur_cfg[FD_OUT_0 + j] =
> +						(u32)(fd->dma_para->gender_out_hw_pa[uidx]);
> +				else if (i == INDIAN_OUT_RGS && j == 0)
> +					fd_cur_cfg[FD_OUT_0 + j] =
> +						(u32)(fd->dma_para->is_indian_out_hw_pa[uidx]);
> +				else if (i == ETHNICITY_OUT_RGS && j == 0)
> +					fd_cur_cfg[FD_OUT_0 + j] =
> +						(u32)(fd->dma_para->ethnicity_out_hw_pa[uidx]);
> +				else
> +					fd_cur_cfg[FD_OUT_0 + j] =
> +						(u32)(fd->dma_para->attr_out_hw_pa[i][j]);
> +			}
> +		}
> +
> +		/* KERNEL_BASE_ADR */
> +		for (j = 0; j < KERNEL_RDMA_RA_NUM; j++) {
> +			fd_cur_cfg[FD_KERNEL_0 + j] =
> +				(u32)(fd->dma_para->attr_kernel_pa[i][j]);
> +		}
> +
> +		fd_cur_cfg[FD_CON_IN_BA_MSB] = (u32)0x02020202;
> +		fd_cur_cfg[FD_CON_OUT_BA_MSB] = (u32)0x02020202;
> +		fd_cur_cfg[FD_CON_KERNEL_BA_MSB] = (u32)0x00000202;
> +	}
> +	return 0;
> +}
> +

[snip]

> +
> +static void aie_execute_attribute_detection(struct mtk_aie_dev *fd,
> +					    struct aie_enq_info *aie_cfg)
> +{
> +	writel(0x0, fd->fd_base + AIE_START_REG);
> +	writel(0x00000101, fd->fd_base + AIE_ENABLE_REG);
> +	writel(0x00001A00, fd->fd_base + AIE_LOOP_REG);
> +	writel(0x1, fd->fd_base + AIE_INT_EN_REG);
> +	writel(fd->reg_cfg.rs_adr, fd->fd_base + AIE_RS_CON_BASE_ADR_REG);

You does not assign fd->reg_cfg.rs_adr in attribute mode, why do you set it to register?

> +	writel(fd->reg_cfg.fd_adr, fd->fd_base + AIE_FD_CON_BASE_ADR_REG);
> +	writel(fd->reg_cfg.yuv2rgb_adr, fd->fd_base + AIE_YUV2RGB_CON_BASE_ADR_REG);
> +	writel(0x00000002, fd->fd_base + AIE_YUV2RGB_CON_BASE_ADR_MSB);
> +	writel(0x00000002, fd->fd_base + AIE_RS_CON_BASE_ADR_MSB);
> +	writel(0x00000002, fd->fd_base + AIE_FD_CON_BASE_ADR_MSB);
> +	writel(0x1, fd->fd_base + AIE_START_REG);
> +}

You could merge aie_execute_face_detection() and aie_execute_attribute_detection() into one function:

aie_execute_face_attr_detection(struct mtk_aie_dev *fd,
				u32 nr_pyramid)
{
	writel(0x0, fd->fd_base + AIE_START_REG);
	if (nr_pyramid) {
		writel(0x00000111, fd->fd_base + AIE_ENABLE_REG);
		loop_num = FD_LOOP_NUM / 3 * nr_pyramid;
		loop_reg_val = (loop_num << 8) | (nr_pyramid - 1);
		writel(loop_reg_val, fd->fd_base + AIE_LOOP_REG);
	} else {
		writel(0x00000101, fd->fd_base + AIE_ENABLE_REG);
		writel(0x00001A00, fd->fd_base + AIE_LOOP_REG);
	}
	writel(0x1, fd->fd_base + AIE_INT_EN_REG);
	writel(fd->reg_cfg.rs_adr, fd->fd_base + AIE_RS_CON_BASE_ADR_REG);
	writel(fd->reg_cfg.fd_adr, fd->fd_base + AIE_FD_CON_BASE_ADR_REG);
	writel(fd->reg_cfg.yuv2rgb_adr, fd->fd_base + AIE_YUV2RGB_CON_BASE_ADR_REG);
	writel(0x00000002, fd->fd_base + AIE_YUV2RGB_CON_BASE_ADR_MSB);
	writel(0x00000002, fd->fd_base + AIE_RS_CON_BASE_ADR_MSB);
	writel(0x00000002, fd->fd_base + AIE_FD_CON_BASE_ADR_MSB);
	writel(0x1, fd->fd_base + AIE_START_REG);
}

For attribute mode, nr_pyramid is 0.

> +
> +static void aie_execute_fld_detection(struct mtk_aie_dev *fd,
> +				      struct aie_enq_info *aie_cfg)
> +{
> +	unsigned int i;
> +
> +	writel(0x10, fd->fd_base + AIE_START_REG);
> +	writel(0x00011111, fd->fd_base + AIE_DMA_CTL_REG);
> +	writel(0x01111111, fd->fd_base + FLD_EN);
> +	writel(0x1, fd->fd_base + AIE_INT_EN_REG);
> +	for (i = 0; i < aie_cfg->fld_face_num; i++) {
> +		writel(aie_cfg->src_img_addr, fd->fd_base + FLD_BASE_ADDR_FACE_0 + i * 0x4);
> +		writel(aie_cfg->fld_input[i].fld_in_crop_x1 << 16 |
> +		       aie_cfg->fld_input[i].fld_in_crop_y1,
> +		       fd->fd_base + fld_face_info_0[i]);
> +		writel(aie_cfg->fld_input[i].fld_in_crop_x2 << 16 |
> +		       aie_cfg->fld_input[i].fld_in_crop_y2,
> +		       fd->fd_base + FLD_FACE_INFO(i, 1));
> +		writel(aie_cfg->fld_input[i].fld_in_rip << 4 |
> +		       aie_cfg->fld_input[i].fld_in_rop,
> +		       fd->fd_base + FLD_FACE_INFO(i, 2));
> +	}
> +
> +	writel(aie_cfg->fld_face_num << 28 | FLD_FOREST << 16 |
> +	       FLD_POINT, fd->fd_base + FLD_MODEL_PARA1);
> +	writel(13 << 16 | 0xfe9, fd->fd_base + FLD_MODEL_PARA14);
> +	writel(aie_cfg->src_img_width << 16 | aie_cfg->src_img_height,
> +	       fd->fd_base + FLD_SRC_WD_HT);
> +
> +	/*input settings*/
> +	writel(0x007c003f, fd->fd_base + FLD_PL_IN_SIZE_0);
> +	writel(0x0040000f, fd->fd_base + FLD_PL_IN_STRIDE_0);
> +	writel(0x007c003f, fd->fd_base + FLD_PL_IN_SIZE_1);
> +	writel(0x0040000f, fd->fd_base + FLD_PL_IN_STRIDE_1);
> +	writel(0x0016003f, fd->fd_base + FLD_PL_IN_SIZE_2_0);
> +	writel(0x0040000f, fd->fd_base + FLD_PL_IN_STRIDE_2_0);
> +	writel(0x0013003f, fd->fd_base + FLD_PL_IN_SIZE_2_1);
> +	writel(0x0040000f, fd->fd_base + FLD_PL_IN_STRIDE_2_1);
> +	writel(0x0013003f, fd->fd_base + FLD_PL_IN_SIZE_2_2);
> +	writel(0x0040000f, fd->fd_base + FLD_PL_IN_STRIDE_2_2);
> +	writel(0x00a6001f, fd->fd_base + FLD_PL_IN_SIZE_3);
> +	writel(0x0020000f, fd->fd_base + FLD_PL_IN_STRIDE_3);
> +
> +	/*output setting*/
> +	writel((2400 * aie_cfg->fld_face_num - 1) << 16 | 127,
> +	       fd->fd_base + FLD_SH_IN_SIZE_0);
> +	writel(0x0010000f, fd->fd_base + FLD_SH_IN_STRIDE_0);
> +	writel(fd->fld_para->fld_output_pa[0],
> +	       fd->fd_base + FLD_TR_OUT_BASE_ADDR_0);

fld_output_pa[] is used in this function and you just use fld_output_pa[0],
so it's not necessary to use an array to store fld_output_pa.
In aie_arrange_fld_buf(),

fd->fld_para->fld_output_pa = fd->fd_fld_out_hw.pa;

And here,

writel(fd->fld_para->fld_output_pa, fd->fd_base + FLD_TR_OUT_BASE_ADDR_0);

> +	writel((aie_cfg->fld_face_num - 1) << 16 | 0x6f,
> +	       fd->fd_base + FLD_TR_OUT_SIZE_0);
> +	writel(0x0070000f, fd->fd_base + FLD_TR_OUT_STRIDE_0);
> +	writel(fd->fld_para->fld_output_pa[0],
> +	       fd->fd_base + FLD_PP_OUT_BASE_ADDR_0);

Ditto.

> +	writel((aie_cfg->fld_face_num - 1) << 16 | 0x6f,
> +	       fd->fd_base + FLD_PP_OUT_SIZE_0);
> +	writel(0x0070000f, fd->fd_base + FLD_PP_OUT_STRIDE_0);
> +
> +	/*cv score*/
> +	writel(0x00000001, fd->fd_base + FLD_BS_BIAS);
> +	writel(0x0000b835, fd->fd_base + FLD_CV_FM_RANGE_0);
> +	writel(0xffff5cba, fd->fd_base + FLD_CV_FM_RANGE_1);
> +	writel(0x00005ed5, fd->fd_base + FLD_CV_PM_RANGE_0);
> +	writel(0xffff910d, fd->fd_base + FLD_CV_PM_RANGE_1);
> +	writel(0x0000031e, fd->fd_base + FLD_BS_RANGE_0);
> +	writel(0xfffffcae, fd->fd_base + FLD_BS_RANGE_1);
> +
> +	/* 6 steps */
> +	writel(fd->fld_para->fld_step_pa[FLD_STEP_BLINK][14],
> +	       fd->fd_base + FLD_BS_IN_BASE_ADDR_14);
> +
> +	for (i = 0; i < 15; i++) {
> +		writel(fd->fld_para->fld_step_pa[FLD_STEP_CV][i],
> +		       fd->fd_base + FLD_PL_IN_BASE_ADDR_2_(i));
> +
> +		writel(fd->fld_para->fld_step_pa[FLD_STEP_FP][i],
> +		       fd->fd_base + FLD_PL_IN_BASE_ADDR_3_(i));
> +
> +		writel(fd->fld_para->fld_step_pa[FLD_STEP_LEAF][i],
> +		       fd->fd_base + FLD_SH_IN_BASE_ADDR_(i));
> +
> +		writel(fd->fld_para->fld_step_pa[FLD_STEP_KM02][i],
> +		       fd->fd_base + FLD_PL_IN_BASE_ADDR_0_(i));
> +
> +		writel(fd->fld_para->fld_step_pa[FLD_STEP_KM13][i],
> +		       fd->fd_base + FLD_PL_IN_BASE_ADDR_1_(i));
> +	}
> +
> +	writel(0x22222222, fd->fd_base + FLD_PL_IN_BASE_ADDR_0_0_7_MSB);
> +	writel(0x02222222, fd->fd_base + FLD_PL_IN_BASE_ADDR_0_8_15_MSB);
> +
> +	writel(0x22222222, fd->fd_base + FLD_PL_IN_BASE_ADDR_1_0_7_MSB);
> +	writel(0x02222222, fd->fd_base + FLD_PL_IN_BASE_ADDR_1_8_15_MSB);
> +
> +	writel(0x22222222, fd->fd_base + FLD_PL_IN_BASE_ADDR_2_0_7_MSB);
> +	writel(0x02222222, fd->fd_base + FLD_PL_IN_BASE_ADDR_2_8_15_MSB);
> +
> +	writel(0x22222222, fd->fd_base + FLD_PL_IN_BASE_ADDR_3_0_7_MSB);
> +	writel(0x02222222, fd->fd_base + FLD_PL_IN_BASE_ADDR_3_8_15_MSB);
> +
> +	writel(0x22222222, fd->fd_base + FLD_SH_IN_BASE_ADDR_0_7_MSB);
> +	writel(0x02222222, fd->fd_base + FLD_SH_IN_BASE_ADDR_8_15_MSB);
> +
> +	writel(0x02000000, fd->fd_base + FLD_BS_IN_BASE_ADDR_8_15_MSB);
> +
> +	writel(0x22222222, fd->fd_base + FLD_BASE_ADDR_FACE_0_7_MSB);
> +	writel(0x02222222, fd->fd_base + FLD_BASE_ADDR_FACE_8_14_MSB);
> +	writel(0x00000002, fd->fd_base + FLD_TR_OUT_BASE_ADDR_0_MSB);
> +	writel(0x00000002, fd->fd_base + FLD_PP_OUT_BASE_ADDR_0_MSB);
> +
> +	/* fld mode + trigger start */
> +	writel(0x11, fd->fd_base + AIE_START_REG);
> +}
> +

[snip]

> +
> +/* return aie_cfg to user space */
> +void aie_get_fd_result(struct mtk_aie_dev *fd, struct aie_enq_info *aie_cfg)
> +{
> +	u32 fd_result_hw, fd_result_1_hw, fd_total_num;
> +	struct aie_enq_info *tmp_aie_cfg;
> +	void *fd_pym_result[PYM_NUM];
> +	u32 fd_pyramid_num[PYM_NUM];
> +	signed short landmark;
> +	unsigned int *pto12;
> +	struct fd_ret *prst;
> +	unsigned int i, j;
> +
> +	aie_cfg->sel_mode = fd->base_para->sel_mode;
> +	aie_cfg->rotate_degree = fd->base_para->rotate_degree;
> +	aie_cfg->src_img_addr = fd->base_para->src_img_addr;
> +	aie_cfg->src_img_addr_uv = fd->base_para->src_img_addr_uv;
> +	aie_cfg->src_img_width = fd->base_para->img_rect.width;
> +	aie_cfg->src_img_height = fd->base_para->img_rect.height;
> +	aie_cfg->src_img_fmt = fd->base_para->src_img_fmt;

User space know all these information, so it's not necessary to set these information to output buffer.
Drop these.

> +
> +	aie_cfg->irq_status = readl(fd->fd_base + AIE_INT_EN_REG);

User space is not necessary to know irq_status.
Drop this.

> +
> +	fd_result_hw = fd->reg_cfg.hw_result;
> +	fd_result_1_hw = fd->reg_cfg.hw_result1;
> +	fd_total_num = fd_result_hw & 0xFFF;
> +	fd_pyramid_num[0] = (fd_result_hw & 0xFFF0000) >> 16;
> +	fd_pyramid_num[1] = fd_result_1_hw & 0xFFF;
> +	fd_pyramid_num[2] = (fd_result_1_hw & 0xFFF0000) >> 16;
> +
> +	if (fd_total_num == 0)
> +		goto nothing_out;
> +
> +	tmp_aie_cfg =  aie_cfg;

Drop tmp_aie_cfg and use aie_cfg directly.

> +
> +	tmp_aie_cfg->fd_out.fd_total_num = fd_total_num;
> +	tmp_aie_cfg->fd_out.fd_pyramid0_num = fd_pyramid_num[0];
> +	tmp_aie_cfg->fd_out.fd_pyramid1_num = fd_pyramid_num[1];
> +	tmp_aie_cfg->fd_out.fd_pyramid2_num = fd_pyramid_num[2];
> +
> +	switch (tmp_aie_cfg->number_of_pyramid) {
> +	case 1:
> +		fd_pym_result[2] = fd->dma_para->fd_out_hw_va[RPNX_LOOP_NUM(0)][0];

Why use index 2?
Use index 0 is more trivial.
Add comment to describe why use such weird index.

> +		break;
> +	case 2:
> +		fd_pym_result[1] = fd->dma_para->fd_out_hw_va[RPNX_LOOP_NUM(0)][0];
> +		fd_pym_result[2] = fd->dma_para->fd_out_hw_va[RPNX_LOOP_NUM(1)][0];
> +		break;
> +	case 3:
> +		fd_pym_result[0] = fd->dma_para->fd_out_hw_va[RPNX_LOOP_NUM(0)][0];
> +		fd_pym_result[1] = fd->dma_para->fd_out_hw_va[RPNX_LOOP_NUM(1)][0];
> +		fd_pym_result[2] = fd->dma_para->fd_out_hw_va[RPNX_LOOP_NUM(2)][0];
> +		break;
> +	default:
> +		dev_err(fd->dev, "Wrong number_of_pyramid\n");
> +		goto nothing_out;
> +	}
> +
> +	for (i = 0; i < 3; i++) {
> +		for (j = 0; j < fd_pyramid_num[i]; j++) {
> +			if (i == 0)
> +				prst = &tmp_aie_cfg->fd_out.pyramid0_result;
> +			else if (i == 1)
> +				prst = &tmp_aie_cfg->fd_out.pyramid1_result;
> +			else if (i == 2)
> +				prst = &tmp_aie_cfg->fd_out.pyramid2_result;

If you define pyramid0_result to pyramid_result[0], then you could simplify this as

prst = &tmp_aie_cfg->fd_out.pyramid_result[i];

In addition, this statement is just related to variable i, not related to variable j,
so move this statement before the for-loop for j.

> +
> +			pto12 = (unsigned int *)fd_pym_result[i] + 12 * j;
> +
> +			prst->anchor_x0[j] = aie_get_lo16(*(pto12 + 0));
> +			prst->anchor_y0[j] = aie_get_hi16(*(pto12 + 0));
> +			prst->anchor_x1[j] = aie_get_lo16(*(pto12 + 1));
> +			prst->anchor_y1[j] = aie_get_hi16(*(pto12 + 1));
> +
> +			if (prst->anchor_x1[j] == 0 ||
> +			    prst->anchor_y1[j] == 0) {
> +				dev_err(fd->dev,
> +					"wrong coordinate: i=%d j=%d M:%d %d %d %d\n", i, j,
> +					prst->anchor_x0[j],
> +					prst->anchor_x1[j],
> +					prst->anchor_y0[j],
> +					prst->anchor_y1[j]
> +				);
> +				goto nothing_out;
> +			}
> +
> +			/* ROP result at 1st run */
> +			landmark = (*(pto12 + 2) & 0x3FF);
> +			prst->rop_landmark_score0[j] = aie_refine_s16_value(landmark);
> +			landmark = ((*(pto12 + 2) & 0xFFC00) >> 10);
> +			prst->rop_landmark_score1[j] = aie_refine_s16_value(landmark);
> +			landmark = ((*(pto12 + 2) & 0x3FF00000) >> 20);
> +			prst->rop_landmark_score2[j] = aie_refine_s16_value(landmark);
> +
> +			prst->anchor_score[j] = aie_refine_s16_value(*(pto12 + 9) & 0x3FF);
> +
> +			/* RIP result at 1st run */
> +			landmark = ((*(pto12 + 9) & 0xFFC00) >> 10);
> +			prst->rip_landmark_score0[j] = aie_refine_s16_value(landmark);
> +			landmark = ((*(pto12 + 9) & 0x3FF00000) >> 20);
> +			prst->rip_landmark_score1[j] = aie_refine_s16_value(landmark);
> +			landmark = ((*(pto12 + 9) & 0xC0000000) >> 30) |
> +				   ((*(pto12 + 10) & 0xFF) << 2);
> +			prst->rip_landmark_score2[j] = aie_refine_s16_value(landmark);
> +			landmark = ((*(pto12 + 10) & 0x3FF00) >> 8);
> +			prst->rip_landmark_score3[j] = aie_refine_s16_value(landmark);
> +			landmark = ((*(pto12 + 10) & 0xFFC0000) >> 18);
> +			prst->rip_landmark_score4[j] = aie_refine_s16_value(landmark);
> +			landmark = ((*(pto12 + 10) & 0xF0000000) >> 28) |
> +				   ((*(pto12 + 11) & 0x3F) << 4);
> +			prst->rip_landmark_score5[j] = aie_refine_s16_value(landmark);
> +			landmark = ((*(pto12 + 11) & 0xFFC0) >> 6);
> +			prst->rip_landmark_score6[j] = aie_refine_s16_value(landmark);
> +			prst->face_result_index[j] = ((*(pto12 + 11) & 0xFFF0000) >> 16);
> +			prst->anchor_index[j] = ((*(pto12 + 11) & 0x70000000) >> 28);
> +
> +			prst->fd_partial_result = fd_pyramid_num[i];
> +		}
> +	}
> +	return;
> +nothing_out:
> +	// Ensure that user mode does not receive an inappropriate result structure

Use c style instead of c++ style comment.
Run checkpatch before you send out patches.

> +	memset(&aie_cfg->fd_out, 0, sizeof(struct fd_result));
> +}
> +
> +void aie_get_attr_result(struct mtk_aie_dev *fd, struct aie_enq_info *aie_cfg)
> +{
> +	u32 *attr_ethnicity_result, *attr_gender_result;
> +	u32 *attr_age_result, *attr_is_indian_result;
> +
> +	aie_cfg->sel_mode = fd->attr_para->sel_mode[fd->attr_para->r_idx];
> +	aie_cfg->rotate_degree = fd->attr_para->rotate_degree[fd->attr_para->r_idx];
> +	aie_cfg->src_img_addr =
> +		fd->attr_para->src_img_addr[fd->attr_para->r_idx];
> +	aie_cfg->src_img_addr_uv =
> +		fd->attr_para->src_img_addr_uv[fd->attr_para->r_idx];
> +	aie_cfg->src_img_width = fd->attr_para->img_width[fd->attr_para->r_idx];
> +	aie_cfg->src_img_height =
> +		fd->attr_para->img_height[fd->attr_para->r_idx];
> +	aie_cfg->src_img_fmt = fd->attr_para->src_img_fmt[fd->attr_para->r_idx];

User space know all these information, so it's not necessary to set these information to output buffer.
Drop these.

> +
> +	aie_cfg->irq_status = readl(fd->fd_base + AIE_INT_EN_REG);

User space is not necessary to know irq_status.
Drop this.

> +
> +	/* 64 feature * 32 bytes */
> +	attr_age_result =
> +		(u32 *)fd->dma_para->age_out_hw_va[fd->attr_para->r_idx];
> +	attr_gender_result =
> +		(u32 *)fd->dma_para->gender_out_hw_va[fd->attr_para->r_idx];
> +	attr_is_indian_result =
> +		(u32 *)fd->dma_para->is_indian_out_hw_va[fd->attr_para->r_idx];
> +	attr_ethnicity_result =
> +		(u32 *)fd->dma_para->ethnicity_out_hw_va[fd->attr_para->r_idx];
> +
> +	aie_cfg->attr_out.merged_age_ret[0] = aie_get_lo16(*attr_age_result);
> +	aie_cfg->attr_out.merged_age_ret[1] = aie_get_hi16(*attr_age_result);

Define merged_age_ret as u32,

u32 merged_age_ret;

And this would be

aie_cfg->attr_out.merged_age_ret = *attr_age_result;

> +
> +	aie_cfg->attr_out.merged_gender_ret[0] = aie_get_lo16(*attr_gender_result);
> +	aie_cfg->attr_out.merged_gender_ret[1] = aie_get_hi16(*attr_gender_result);

u32 merged_gender_ret;

> +
> +	aie_cfg->attr_out.merged_is_indian_ret[0] = aie_get_lo16(*attr_is_indian_result);
> +	aie_cfg->attr_out.merged_is_indian_ret[1] = aie_get_hi16(*attr_is_indian_result);

u32 merged_is_indian_ret;

> +
> +	aie_cfg->attr_out.merged_ethnicity_ret[0] = aie_get_lo16(*attr_ethnicity_result);
> +	aie_cfg->attr_out.merged_ethnicity_ret[1] = aie_get_hi16(*attr_ethnicity_result);
> +	aie_cfg->attr_out.merged_ethnicity_ret[2] = aie_get_lo16(*(attr_ethnicity_result + 1));

u64 merged_ethnicity_ret;

> +
> +	fd->attr_para->r_idx = (fd->attr_para->r_idx + 1) % MAX_ENQUE_FRAME_NUM;
> +}
> +
> +void aie_get_fld_result(struct mtk_aie_dev *fd, struct aie_enq_info *aie_cfg)
> +{
> +	u8 fld_rlt[FLD_OUTPUT_X_SIZE][FLD_OUTPUT_SIZE];
> +	u16 *out_parsing;
> +	int i, j;
> +
> +	aie_cfg->irq_status = readl(fd->fd_base + AIE_INT_EN_REG);
> +
> +	memcpy(fld_rlt, fd->fld_para->fld_output_va[0], sizeof(fld_rlt));

fld_output_va[] is used in this function and you just use fld_output_va[0],
so it's not necessary to use an array to store fld_output_va.
In aie_arrange_fld_buf(),

fd->fld_para->fld_output_va = fd->fd_fld_out_hw.va;

In below for-loop,

out_parsing = fd->fld_para->fld_output_va + j * FLD_OUTPUT_SIZE;


> +
> +	for (j = 0; j < aie_cfg->fld_face_num; j++) {
> +		out_parsing = (unsigned short *)&fld_rlt[j][0];
> +		for (i = 0; i < FLD_CUR_LANDMARK; i++) {
> +			aie_cfg->fld_out[j].fld_landmark[i].x = *out_parsing;
> +			aie_cfg->fld_out[j].fld_landmark[i].y = *(out_parsing + 1);

aie_cfg->fld_out[j].fld_landmark[i].x = out_parsing[0];
aie_cfg->fld_out[j].fld_landmark[i].y = out_parsing[1];

looks better.

> +
> +			if (i % 2)
> +				out_parsing = out_parsing + 6;
> +			else
> +				out_parsing = out_parsing + 2;
> +		}
> +		out_parsing = (unsigned short *)&fld_rlt[j][0];
> +		if (FLD_CUR_LANDMARK % 2)
> +			out_parsing = out_parsing + ((FLD_CUR_LANDMARK + 1) / 2) * 8;
> +		else
> +			out_parsing = out_parsing + (FLD_CUR_LANDMARK / 2) * 8;

Even though FLD_CUR_LANDMARK % 2 is 0, you could still use below formula,

out_parsing = out_parsing + ((FLD_CUR_LANDMARK + 1) / 2) * 8;

So you could drop this if-checking and use this formula for all case.

> +
> +		aie_cfg->fld_out[j].fld_out_rop = *out_parsing;
> +		aie_cfg->fld_out[j].fld_out_rip = *(out_parsing + 1);
> +		aie_cfg->fld_out[j].confidence = *(out_parsing + 2);
> +		aie_cfg->fld_out[j].blinkscore = *(out_parsing + 3);

aie_cfg->fld_out[j].fld_out_rop = out_parsing[0];
aie_cfg->fld_out[j].fld_out_rip = out_parsing[1];
aie_cfg->fld_out[j].confidence = out_parsing[2];
aie_cfg->fld_out[j].blinkscore = out_parsing[3];

looks better.

> +	}
> +}
> diff --git a/drivers/media/platform/mediatek/aie/mtk_aie_v4l2.c b/drivers/media/platform/mediatek/aie/mtk_aie_v4l2.c
> new file mode 100644
> index 000000000000..28646d5a53aa
> --- /dev/null
> 

[snip]

> +
> +static void mtk_aie_fill_pixfmt_mp(struct v4l2_pix_format_mplane *dfmt,
> +				   const struct v4l2_pix_format_mplane *sfmt)
> +{
> +	dfmt->field = V4L2_FIELD_NONE;
> +	dfmt->colorspace = V4L2_COLORSPACE_BT2020;
> +	dfmt->num_planes = sfmt->num_planes;
> +	dfmt->ycbcr_enc = V4L2_YCBCR_ENC_DEFAULT;
> +	dfmt->quantization = V4L2_QUANTIZATION_DEFAULT;
> +	dfmt->xfer_func = V4L2_MAP_XFER_FUNC_DEFAULT(dfmt->colorspace);
> +	dfmt->pixelformat = sfmt->pixelformat;
> +
> +	/* Keep user setting as possible */
> +	dfmt->width = clamp(dfmt->width, MTK_FD_OUTPUT_MIN_WIDTH,
> +			    MTK_FD_OUTPUT_MAX_WIDTH);
> +	dfmt->height = clamp(dfmt->height, MTK_FD_OUTPUT_MIN_HEIGHT,
> +			     MTK_FD_OUTPUT_MAX_HEIGHT);
> +
> +	dfmt->plane_fmt[0].bytesperline = ALIGN(dfmt->width, 16);
> +	dfmt->plane_fmt[1].bytesperline = ALIGN(dfmt->width, 16);
> +
> +	dfmt->plane_fmt[0].sizeimage = dfmt->height * dfmt->plane_fmt[0].bytesperline;
> +	dfmt->plane_fmt[1].sizeimage = dfmt->height * dfmt->plane_fmt[1].bytesperline;
> +	if (sfmt->num_planes == 2 && sfmt->pixelformat == V4L2_PIX_FMT_NV12M) {

if (sfmt->pixelformat == V4L2_PIX_FMT_NV12M) {

V4L2_PIX_FMT_NV12M imply that num_planes is 2, so checking 'sfmt->num_planes == 2' is redundant.

> +		dfmt->plane_fmt[1].sizeimage /= 2;
> +	} else if (sfmt->pixelformat == V4L2_PIX_FMT_NV12) {
> +		dfmt->plane_fmt[0].sizeimage *= 3;
> +		dfmt->plane_fmt[0].sizeimage /= 2;
> +	}
> +}

[snip]

> +
> +static __poll_t mtk_vfd_fop_poll(struct file *file, poll_table *wait)
> +{
> +	int ret;
> +
> +	struct mtk_aie_ctx *ctx = container_of(file->private_data, struct mtk_aie_ctx, fh);
> +	struct mtk_aie_dev *fd = ctx->fd_dev;
> +
> +	if (fd->fd_state & STATE_INIT) {
> +		/* Waiting Job Finsh */
> +		ret = wait_for_completion_timeout(&fd->fd_job_finished, msecs_to_jiffies(1000));

It's not necessary to wait here because v4l2_m2m_fop_poll() would do waiting.

> +		if (!ret) {
> +			dev_err(ctx->dev, "Wait job finish timeout from poll\n");
> +			return EPOLLERR;
> +		}
> +	}
> +
> +	return v4l2_m2m_fop_poll(file, wait);
> +}
> +
> +static const struct v4l2_file_operations fd_video_fops = {
> +	.owner = THIS_MODULE,
> +	.open = mtk_vfd_open,
> +	.release = mtk_vfd_release,
> +	.poll = mtk_vfd_fop_poll,
> +	.unlocked_ioctl = video_ioctl2,
> +	.mmap = v4l2_m2m_fop_mmap,
> +};
> +
> +static int mtk_aie_job_ready(void *priv)
> +{
> +	struct vb2_v4l2_buffer *src_buf, *dst_buf;
> +	struct mtk_aie_ctx *ctx = priv;
> +	struct mtk_aie_dev *fd = ctx->fd_dev;
> +	struct fd_buffer src_img[2] = {};
> +	void *plane_vaddr;
> +
> +	if (!ctx->fh.m2m_ctx) {

When file is open, ctx->fh.m2m_ctx is valid.
And device_run() callback is called when file is opened.
So this checking is redundant.

> +		dev_err(fd->dev, "Memory-to-memory context is NULL\n");
> +		return -1;
> +	}
> +
> +	if (!(fd->fd_state & STATE_OPEN)) {

I think when file is clode, V4L2 would not call device_run() callback function.
So this checking is redundant.

> +		dev_err(fd->dev, "Job ready with device closed\n");
> +		return -1;
> +	}
> +
> +	mutex_lock(&fd->fd_lock);
> +
> +	src_buf = v4l2_m2m_next_src_buf(ctx->fh.m2m_ctx);
> +	dst_buf = v4l2_m2m_next_dst_buf(ctx->fh.m2m_ctx);
> +
> +	if (!src_buf || !dst_buf) {
> +		dev_err(fd->dev, "src or dst buf is NULL\n");
> +		mutex_unlock(&fd->fd_lock);
> +		return -1;
> +	}
> +
> +	if (!(fd->fd_state & STATE_INIT)) {
> +		dev_err(fd->dev, "%s Wrong fd state: %d\n", __func__, fd->fd_state);
> +		mutex_unlock(&fd->fd_lock);
> +		return -1;
> +	}
> +
> +	plane_vaddr = vb2_plane_vaddr(&dst_buf->vb2_buf, 0);
> +	if (!plane_vaddr) {
> +		dev_err(fd->dev, "Failed to get plane virtual address\n");
> +		mutex_unlock(&fd->fd_lock);
> +		return -1;
> +	}
> +
> +	v4l2_ctrl_request_setup(src_buf->vb2_buf.req_obj.req, &ctx->hdl);
> +
> +	fd->aie_cfg = (struct aie_enq_info *)plane_vaddr;
> +	fd->aie_cfg->fld_face_num = ctx->user_param.fld_face_num;

You clear fd->aie_cfg below, so here assign fd->aie_cfg->fld_face_num is redundant.
Drop it.

> +
> +	memset(fd->aie_cfg, 0, sizeof(struct aie_enq_info));
> +	memcpy(fd->aie_cfg, &ctx->user_param, sizeof(struct aie_enq_info));

It's not necessary to copy user space setting to output buffer.
User space know all these setting.

> +	memcpy(fd->aie_cfg->fld_input, ctx->user_param.fld_input,
> +	       FLD_MAX_FRAME * sizeof(struct v4l2_fld_crop_rip_rop));
> +
> +	src_img[0].dma_addr = vb2_dma_contig_plane_dma_addr(&src_buf->vb2_buf, 0);
> +
> +	if (ctx->src_fmt.num_planes == 2) {
> +		src_img[1].dma_addr =
> +			vb2_dma_contig_plane_dma_addr(&src_buf->vb2_buf, 1);
> +	}
> +
> +	if ((fd->aie_cfg->sel_mode == FDMODE || fd->aie_cfg->sel_mode == ATTRIBUTEMODE) &&
> +	    fd->aie_cfg->src_img_fmt == FMT_YUV420_1P) {
> +		src_img[1].dma_addr = src_img[0].dma_addr + ctx->user_param.src_img_stride *
> +			ctx->user_param.src_img_height;
> +	}
> +
> +	fd->aie_cfg->src_img_addr = src_img[0].dma_addr;
> +	fd->aie_cfg->src_img_addr_uv = src_img[1].dma_addr;

fd->aie_cfg is output buffer, so do not write src_img_addr and src_img_addr_uv to output buffer.
You should define src_img_addr and src_img_addr_uv to somewhere not output buffer.

> +
> +	aie_prepare(fd, fd->aie_cfg);
> +
> +	mutex_unlock(&fd->fd_lock);
> +
> +	if (src_buf) {
> +		/* Complete request controls if any */
> +		v4l2_ctrl_request_complete(src_buf->vb2_buf.req_obj.req, &ctx->hdl);
> +	}
> +
> +	return 0;
> +}

[snip]

> +
> +static void mtk_aie_frame_done_worker(struct work_struct *work)
> +{
> +	struct mtk_aie_req_work *req_work = (struct mtk_aie_req_work *)work;
> +	struct mtk_aie_dev *fd = (struct mtk_aie_dev *)req_work->fd_dev;
> +
> +	if (fd->reg_cfg.fd_mode == FDMODE) {
> +		fd->reg_cfg.hw_result = readl(fd->fd_base + AIE_RESULT_0_REG);
> +		fd->reg_cfg.hw_result1 = readl(fd->fd_base + AIE_RESULT_1_REG);

hw_result and hw_result1 is used only in aie_get_fd_result() which will be called in next statement.
So you could read AIE_RESULT_0_REG and AIE_RESULT_1_REG in aie_get_fd_result() and drop hw_result and hw_result1.

Regards,
CK

> +	}
> +
> +	mutex_lock(&fd->fd_lock);
> +
> +	switch (fd->aie_cfg->sel_mode) {
> +	case FDMODE:
> +		aie_get_fd_result(fd, fd->aie_cfg);
> +		break;
> +	case ATTRIBUTEMODE:
> +		aie_get_attr_result(fd, fd->aie_cfg);
> +		break;
> +	case FLDMODE:
> +		aie_get_fld_result(fd, fd->aie_cfg);
> +		break;
> +	default:
> +		dev_dbg(fd->dev, "Wrong sel_mode\n");
> +		break;
> +	}
> +
> +	mutex_unlock(&fd->fd_lock);
> +
> +	if (!cancel_delayed_work(&fd->job_timeout_work))
> +		return;
> +
> +	atomic_dec(&fd->num_composing);
> +	mtk_aie_hw_job_finish(fd, VB2_BUF_STATE_DONE);
> +	wake_up(&fd->flushing_waitq);
> +}




More information about the linux-arm-kernel mailing list