/*
 * ARM Memory Throughput Benchmark
 *
 * Written by Nicolas Pitre <nico@marvell.com>
 * Copyright (C) 2008 Marvell Semiconductors
 */

/*
 * Largest prefetch depth, counted in 32-byte lines, that the preload macro
 * emits as straight-line pld instructions.  Deeper requests make preload
 * fall back to a counted loop instead of unrolling.
 *
 * The unrolled form addresses line x at offset 32 * x + 28, so 128 lines
 * reach offset 4092.
 * NOTE(review): that sits just below the 4095 limit of the ARM pld
 * immediate offset, so raising this value presumably cannot work with the
 * unrolled form -- confirm before changing it.
 */
#define MAX_PLD_NO_LOOP	128

	.macro	preload, lines
	/*
	 * Issue pld hints for "lines" chunks of 32 bytes, starting at r0.
	 *
	 *   lines == 0 : nothing is emitted.
	 *   lines == 1 : a single pld on every expansion.
	 *   otherwise  : the batch is skipped unless r2 has none of the bits
	 *                of (lines - 1) * 32 set.  The batch itself is fully
	 *                unrolled, or, above MAX_PLD_NO_LOOP, a counted loop
	 *                that uses r3 as cursor and r4 as counter.
	 *
	 * For any value other than 0 or 1 the condition flags are modified,
	 * so this must be expanded before the flags the caller branches on
	 * are produced.  Every hint targets byte 28 of its 32-byte chunk.
	 */
	.if	\lines == 0
	.elseif	\lines == 1
	pld	[r0, #28]
	.else
	tst	r2, #((\lines - 1) * 32)
	bne	1f
	.if	\lines > MAX_PLD_NO_LOOP
	/* too deep to unroll: walk the lines with a run-time counter */
	mov	r3, r0
	mov	r4, #\lines
2:	pld	[r3, #28]
	subs	r4, r4, #1
	add	r3, r3, #32
	bne	2b
	.else
	/* x is an assembly-time line index, bumped once per repetition */
	.set	x, 0
	.rept	\lines
	pld	[r0, #(32 * x) + 28]
	.set	x, x + 1
	.endr
	.endif
1:
	.endif
	.endm

	.macro	ldrb_8
	/*
	 * Read eight consecutive bytes through r0, one per register, filling
	 * r3 up to r10 in that order.  r0 moves forward by one after each
	 * load (post-indexed), so it ends up 8 bytes further on.
	 */
	.irp	reg, r3, r4, r5, r6, r7, r8, r9, r10
	ldrb	\reg, [r0], #1
	.endr
	.endm

	.macro	strb_8
	/*
	 * Write the low byte of r3 up to r10, in that order, to eight
	 * consecutive addresses through r1.  r1 moves forward by one after
	 * each store (post-indexed), so it ends up 8 bytes further on.
	 */
	.irp	reg, r3, r4, r5, r6, r7, r8, r9, r10
	strb	\reg, [r1], #1
	.endr
	.endm

	.macro	readb_32
	/* 32 bytes of byte-wide loads: four back-to-back ldrb_8 groups. */
	.rept	4
	ldrb_8
	.endr
	.endm

	.macro	writeb_32
	/* 32 bytes of byte-wide stores: four back-to-back strb_8 groups. */
	.rept	4
	strb_8
	.endr
	.endm

	.macro	byte_copy_32
	/*
	 * Copy 32 bytes one byte at a time.  Each of the four rounds loads
	 * eight bytes with ldrb_8 and immediately stores them with strb_8.
	 */
	.rept	4
	ldrb_8
	strb_8
	.endr
	.endm

	.macro ldr_32
	/*
	 * Read 32 bytes as eight single-word loads through r0, filling r3 up
	 * to r10 in that order.  r0 is post-incremented by 4 after each load.
	 */
	.irp	reg, r3, r4, r5, r6, r7, r8, r9, r10
	ldr	\reg, [r0], #4
	.endr
	.endm

	.macro	str_32
	/*
	 * Write 32 bytes as eight single-word stores through r1, taking the
	 * data from r3 up to r10 in that order.  r1 is post-incremented by 4
	 * after each store.
	 */
	.irp	reg, r3, r4, r5, r6, r7, r8, r9, r10
	str	\reg, [r1], #4
	.endr
	.endm

	.macro	word_copy_32
	/*
	 * Copy 32 bytes with single-word accesses: eight loads through r0
	 * into r3..r10, followed by eight stores of the same registers
	 * through r1.  Both pointers are post-incremented by 4 per access.
	 */
	.irp	reg, r3, r4, r5, r6, r7, r8, r9, r10
	ldr	\reg, [r0], #4
	.endr
	.irp	reg, r3, r4, r5, r6, r7, r8, r9, r10
	str	\reg, [r1], #4
	.endr
	.endm

	.macro	ldm_32
	/*
	 * Read 32 bytes with one load-multiple: eight words starting at r0
	 * go to r3..r10, and r0 is written back past them.
	 */
	ldmia	r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
	.endm

	.macro	stm4_32
	/*
	 * Write 32 bytes as two store-multiples of four registers each
	 * (r3..r6, then r7..r10) through r1, with write-back on both.
	 */
	stmia	r1!, {r3, r4, r5, r6}
	stmia	r1!, {r7, r8, r9, r10}
	.endm

	.macro	stm8_32
	/*
	 * Write 32 bytes with one store-multiple: r3..r10 are stored from r1
	 * upward, and r1 is written back past them.
	 */
	stmia	r1!, {r3, r4, r5, r6, r7, r8, r9, r10}
	.endm

	.macro	multi_copy_32
	/*
	 * Copy 32 bytes with one eight-register load-multiple from r0 and one
	 * eight-register store-multiple to r1, both with write-back.
	 */
	ldmia	r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
	stmia	r1!, {r3, r4, r5, r6, r7, r8, r9, r10}
	.endm

	.macro	ldrd_32
	/*
	 * Read 32 bytes as four doubleword loads through r0, post-incremented
	 * by 8 each time.  Each ldrd names the even register of a pair, so
	 * the data lands in r4/r5, r6/r7, r8/r9 and r10/r11.
	 */
	.irp	reg, r4, r6, r8, r10
	ldrd	\reg, [r0], #8
	.endr
	.endm

	.macro	strd_32
	/*
	 * Write 32 bytes as four doubleword stores through r1, post-incremented
	 * by 8 each time.  Each strd names the even register of a pair, so
	 * the data comes from r4/r5, r6/r7, r8/r9 and r10/r11.
	 */
	.irp	reg, r4, r6, r8, r10
	strd	\reg, [r1], #8
	.endr
	.endm

	.macro	dword_copy_32
	/*
	 * Copy 32 bytes with doubleword accesses: four ldrd through r0 into
	 * the pairs starting at r4, r6, r8 and r10, followed by four strd of
	 * the same pairs through r1.  Both pointers advance by 8 per access.
	 */
	.irp	reg, r4, r6, r8, r10
	ldrd	\reg, [r0], #8
	.endr
	.irp	reg, r4, r6, r8, r10
	strd	\reg, [r1], #8
	.endr
	.endm

	.macro	test_func func
	/*
	 * Emit two global routines built around "func", a macro that
	 * transfers 32 bytes per expansion:
	 *
	 *   test_<func>     plain loop
	 *   test_<func>_wa  same loop, but one word is read from [r1] into ip
	 *                   before every 32-byte chunk (presumably to get the
	 *                   destination line allocated before it is written
	 *                   -- confirm against the benchmark driver)
	 *
	 * Register use inside both routines:
	 *   r0      address the load macros read through
	 *   r1      address the store macros write through
	 *   r2      bytes left; reduced by 32 per iteration
	 *   r4-r11  saved on entry and restored on return
	 *
	 * The loop leaves only when r2 reaches exactly zero, so r2 has to be
	 * a non-zero multiple of 32 on entry.  "func" must leave the flags
	 * alone, because the closing bne consumes the result of the subs
	 * that precedes it.
	 *
	 * Both symbols are typed as functions and sized, so that the linker
	 * and symbol-based tools treat them as code entry points.
	 */
	.text
	.align	5
	.global	test_\func
	.type	test_\func, %function
test_\func:
	stmfd	sp!, {r4 - r11, lr}
10:	subs	r2, r2, #32
	\func
	bne	10b
	ldmfd	sp!, {r4 - r11, pc}
	.size	test_\func, . - test_\func

	.align	5
	.global	test_\func\()_wa
	.type	test_\func\()_wa, %function
test_\func\()_wa:
	stmfd	sp!, {r4 - r11, lr}
10:	subs	r2, r2, #32
	ldr	ip, [r1]
	\func
	bne	10b
	ldmfd	sp!, {r4 - r11, pc}
	.size	test_\func\()_wa, . - test_\func\()_wa
	.endm

	.macro	test_preload, func, pld=0
	/*
	 * Emit two global routines that run "func" (a macro transferring 32
	 * bytes per expansion) with a prefetch depth of "pld" lines:
	 *
	 *   test_<func>_p<pld>     prefetch, then the transfer
	 *   test_<func>_wa_p<pld>  as above, plus one word read from [r1]
	 *                          into ip before every 32-byte chunk
	 *
	 * Register use inside both routines:
	 *   r0      address the load macros read through; also the base the
	 *           preload macro prefetches from
	 *   r1      address the store macros write through
	 *   r2      bytes left; reduced by 32 per iteration
	 *   r4-r11  saved on entry and restored on return
	 *
	 * preload is expanded ahead of the subs because it may change the
	 * flags; the closing bne must see the flags of the subs.  The loop
	 * leaves only when r2 reaches exactly zero, so r2 has to be a
	 * non-zero multiple of 32 on entry, and "func" must not touch the
	 * flags.
	 *
	 * Both symbols are typed as functions and sized, so that the linker
	 * and symbol-based tools treat them as code entry points.
	 */
	.text
	.align	5
	.global	test_\func\()_p\pld
	.type	test_\func\()_p\pld, %function
test_\func\()_p\pld:
	stmfd	sp!, {r4 - r11, lr}
10:	preload	\pld
	subs	r2, r2, #32
	\func
	bne	10b
	ldmfd	sp!, {r4 - r11, pc}
	.size	test_\func\()_p\pld, . - test_\func\()_p\pld

	.align	5
	.global	test_\func\()_wa_p\pld
	.type	test_\func\()_wa_p\pld, %function
test_\func\()_wa_p\pld:
	stmfd	sp!, {r4 - r11, lr}
10:	preload	\pld
	subs	r2, r2, #32
	ldr	ip, [r1]
	\func
	bne	10b
	ldmfd	sp!, {r4 - r11, pc}
	.size	test_\func\()_wa_p\pld, . - test_\func\()_wa_p\pld
	.endm

	.macro	gen_test func
	/*
	 * Instantiate test_preload for "func" once per prefetch depth:
	 * no prefetch at all (0), then every power of two from 1 to 128
	 * lines, in ascending order.
	 */
	.irp	depth, 0, 1, 2, 4, 8, 16, 32, 64, 128
	test_preload \func, \depth
	.endr
	.endm

/* Store-only transfers: plain and _wa entry points, without prefetch. */
.irp	fn, writeb_32, str_32, stm4_32, stm8_32, strd_32
test_func \fn
.endr

/*
 * Load-only transfers first, then the copy transfers; each is generated at
 * every prefetch depth that gen_test lists.
 */
.irp	fn, readb_32, ldr_32, ldm_32, ldrd_32, byte_copy_32, word_copy_32, multi_copy_32, dword_copy_32
gen_test \fn
.endr
