/*
 * ARM Memory Throughput Benchmark
 *
 * Written by Nicolas Pitre <nico@marvell.com>
 * Copyright (C) 2008 Marvell Semiconductors
 */

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <sched.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/times.h>

/*
 * Size of the buffer used to perform "uncached" tests on.
 * Must be large enough to continuously thrash the data cache.
 *
 * The expansion is parenthesized so that the macro behaves as a single
 * value inside any expression (e.g. "x / BUF_SIZE").
 */
#define BUF_SIZE		(64*1024)

/*
 * Size of the buffer used to perform "cached" tests on.
 * This must be small enough not to thrash the data cache, i.e. smaller
 * than half the actual data cache size but not smaller than 4096.
 * (4096 is the largest preload size)
 *
 * The expansion is parenthesized so that the macro behaves as a single
 * value inside any expression (e.g. "x / CACHED_BUF_SIZE").
 */
#define CACHED_BUF_SIZE		(4*1024)

/*
 * Common signature of every benchmark routine: a source pointer, a
 * destination pointer and a byte count.
 */
typedef void test_fn(void *src, void *dst, long size);

/* Store routines; they are only declared here, not defined in this file. */
extern test_fn test_writeb_32;
extern test_fn test_str_32;
extern test_fn test_stm4_32;
extern test_fn test_stm8_32;
extern test_fn test_strd_32;

/* "_wa" counterparts, reported under the "write alloc" column by main(). */
extern test_fn test_writeb_32_wa;
extern test_fn test_str_32_wa;
extern test_fn test_stm4_32_wa;
extern test_fn test_stm8_32_wa;
extern test_fn test_strd_32_wa;

/*
 * Table of the store benchmarks: a label for the report, the plain
 * routine, and its "_wa" counterpart.
 */
struct write_tests {
	char *name;
	test_fn *fn;
	test_fn *fn_wa;
} write_tests[] = {
	{ .name = "STRB", .fn = test_writeb_32, .fn_wa = test_writeb_32_wa },
	{ .name = "STR",  .fn = test_str_32,    .fn_wa = test_str_32_wa    },
	{ .name = "STM4", .fn = test_stm4_32,   .fn_wa = test_stm4_32_wa   },
	{ .name = "STM8", .fn = test_stm8_32,   .fn_wa = test_stm8_32_wa   },
	{ .name = "STRD", .fn = test_strd_32,   .fn_wa = test_strd_32_wa   },
};

/*
 * Expand to the comma-separated list of the nine symbols
 * test_<base>_p0, _p1, _p2, _p4, _p8, _p16, _p32, _p64 and _p128.
 * The list is usable both as a declarator list and as an array initializer.
 */
#define TEST_ALL_PRELOADS(base)		\
	test_##base##_p0,		\
	test_##base##_p1,		\
	test_##base##_p2,		\
	test_##base##_p4,		\
	test_##base##_p8,		\
	test_##base##_p16,		\
	test_##base##_p32,		\
	test_##base##_p64,		\
	test_##base##_p128

/* Load routines, one symbol per preload variant. */
extern test_fn TEST_ALL_PRELOADS(readb_32);
extern test_fn TEST_ALL_PRELOADS(ldr_32);
extern test_fn TEST_ALL_PRELOADS(ldm_32);
extern test_fn TEST_ALL_PRELOADS(ldrd_32);

/* Copy routines, one symbol per preload variant. */
extern test_fn TEST_ALL_PRELOADS(byte_copy_32);
extern test_fn TEST_ALL_PRELOADS(word_copy_32);
extern test_fn TEST_ALL_PRELOADS(multi_copy_32);
extern test_fn TEST_ALL_PRELOADS(dword_copy_32);

/* "_wa" counterparts of the copy routines. */
extern test_fn TEST_ALL_PRELOADS(byte_copy_32_wa);
extern test_fn TEST_ALL_PRELOADS(word_copy_32_wa);
extern test_fn TEST_ALL_PRELOADS(multi_copy_32_wa);
extern test_fn TEST_ALL_PRELOADS(dword_copy_32_wa);

/*
 * Table of the load and copy benchmarks: a label for the report and the
 * nine preload variants of the routine, in TEST_ALL_PRELOADS order.
 */
struct read_copy_tests {
	char *name;
	test_fn *fn[9];
} read_copy_tests[] = {
	{ .name = "LDRB",    .fn = { TEST_ALL_PRELOADS(readb_32) } },
	{ .name = "LDR",     .fn = { TEST_ALL_PRELOADS(ldr_32) } },
	{ .name = "LDM",     .fn = { TEST_ALL_PRELOADS(ldm_32) } },
	{ .name = "LDRD",    .fn = { TEST_ALL_PRELOADS(ldrd_32) } },
	{ .name = "CPY_B",   .fn = { TEST_ALL_PRELOADS(byte_copy_32) } },
	{ .name = "CPY_Bwa", .fn = { TEST_ALL_PRELOADS(byte_copy_32_wa) } },
	{ .name = "CPY_R",   .fn = { TEST_ALL_PRELOADS(word_copy_32) } },
	{ .name = "CPY_Rwa", .fn = { TEST_ALL_PRELOADS(word_copy_32_wa) } },
	{ .name = "CPY_M",   .fn = { TEST_ALL_PRELOADS(multi_copy_32) } },
	{ .name = "CPY_Mwa", .fn = { TEST_ALL_PRELOADS(multi_copy_32_wa) } },
	{ .name = "CPY_D",   .fn = { TEST_ALL_PRELOADS(dword_copy_32) } },
	{ .name = "CPY_Dwa", .fn = { TEST_ALL_PRELOADS(dword_copy_32_wa) } },
};
 
/* Number of elements in a true array (not valid on a pointer). */
#define ARRAY_SIZE(arr)	(sizeof(arr) / sizeof(*(arr)))

/*
 * Base of the working area handed to the test routines.  main() points it
 * at a 4096-byte aligned address inside the block it allocates, and
 * do_test() uses test_buf and test_buf + buf_size as its two buffers.
 */
void *test_buf;

/*
 * Current gettimeofday() time expressed as a single count of
 * microseconds.
 */
long long now(void)
{
	struct timeval stamp;
	long long usecs;

	gettimeofday(&stamp, NULL);

	usecs = stamp.tv_sec;
	usecs *= 1000000LL;
	usecs += stamp.tv_usec;
	return usecs;
}

/*
 * Measure the throughput of one test routine.
 *
 * fn:       routine to benchmark.  It is called alternately as
 *           fn(buf1, buf2, size) and fn(buf2, buf1, size), where buf1 is
 *           test_buf and buf2 starts buf_size bytes after it.
 * in_cache: non-zero to work on CACHED_BUF_SIZE bytes (after a read pass
 *           over both buffers), zero to work on BUF_SIZE bytes.
 *
 * Returns the rate in MB/s (2^20 bytes per second), counting buf_size
 * bytes for each call to fn.
 *
 * A measurement is retried until the gettimeofday() duration agrees with
 * the tick-based duration reported by times().  After 15 failed attempts
 * a message is printed on stderr and the process exits with status 1.
 */
float do_test(test_fn fn, int in_cache)
{
	clock_t ck1, ck2, ck_per_sec = sysconf(_SC_CLK_TCK);
	/*
	 * Only the return value of times() is used, but POSIX requires a
	 * valid buffer; passing NULL is accepted by Linux only.
	 */
	struct tms tms_unused;
	long long t1, t2, tod_usecs, ck_usecs;
	int i, loops, tries;
	float x;
	int buf_size = in_cache ? CACHED_BUF_SIZE : BUF_SIZE;
	void *buf1 = test_buf;
	/* arithmetic on "void *" is a GNU extension, so go through "char *" */
	void *buf2 = (char *)test_buf + buf_size;

	tries = 0;

	do {
		if (tries >= 15) {
			fprintf(stderr, "\nAborting: unable to get a coherent "
			                "time measurement after %d attempts.\n"
			                "Please make sure the system is idle "
			                "before running this test.\n", tries);
			exit(1);
		}
		tries++;

		/* give others a chance to run and collect scheduler points */
		usleep(10000);

		/* first a single pass to warm caches and page memory in */
		if (in_cache) {
			test_ldm_32_p8(buf1, NULL, CACHED_BUF_SIZE);
			test_ldm_32_p8(buf2, NULL, CACHED_BUF_SIZE);
		}
		fn(buf2, buf1, buf_size);

		/*
		 * then some passes to estimate the duration: keep doubling
		 * the batch size until at least 10 ms have elapsed
		 */
		loops = 1;
		t1 = now();
		do {
			for (i = loops; i; i--) {
				fn(buf1, buf2, buf_size);
				fn(buf2, buf1, buf_size);
			}
			t2 = now();
			loops *= 2;
		} while ((t2 - t1) < 10000);
		/* batches of 1, 2, 4, ... iterations add up to 2^k - 1 */
		loops -= 1;

		/* estimated number of loops to average one second */
		loops  = loops * 1000000LL / (t2 - t1);

		/* now the real test */
		usleep(10000);
		if (in_cache) {
			test_ldm_32_p8(buf1, NULL, CACHED_BUF_SIZE);
			test_ldm_32_p8(buf2, NULL, CACHED_BUF_SIZE);
		}
		fn(buf2, buf1, buf_size);
		i = loops;

		/* spin until the tick counter changes so we start on a tick edge */
		ck1 = times(&tms_unused);
		do {
			ck2 = ck1;
			ck1 = times(&tms_unused);
		} while (ck1 == ck2);
		t1 = now();
		while (i--) {
			fn(buf1, buf2, buf_size);
			fn(buf2, buf1, buf_size);
		}
		t2 = now();
		ck2 = times(&tms_unused);

		ck_usecs = 1000000LL * (ck2 - ck1) / ck_per_sec;
		tod_usecs = t2 - t1;

		/*
		 * Using gettimeofday() gives us a better time stamp than
		 * times() or clock(), but only if the system is dedicated
		 * to this very task.  The discrepancy must not exceed
		 * the next clock tick which is the best confidence test
		 * we can have, and should be really close if behind
		 * (less than a tenth of a tick).
		 *
		 * Also, the duration of the test should not be far from
		 * one second: the tick-based duration must be at least
		 * 0.9 s and the gettimeofday() duration at most 1.9 s.
		 * (The stricter 5% window is kept below under "#if 0".)
		 */
#if 0
		fprintf(stderr, "try=%d loops=%d tod=%lld clk=%lld\n",
		        tries, loops, tod_usecs, ck_usecs);
#endif
	} while (tod_usecs <= ck_usecs - ( 100000 / ck_per_sec) ||
	         tod_usecs >= ck_usecs + (1000000 / ck_per_sec) ||
#if 0
	         ck_usecs < 950000 || tod_usecs > 1050000);
#else
	         ck_usecs < 900000 || tod_usecs > 1900000);
#endif

	/* finally, return throughput: bytes moved per second, in MB */
	x = loops;
	x *= buf_size * 2;	/* two calls to fn per loop iteration */
	x /= tod_usecs;
	x *= 1000000;
	x /= (1024 * 1024);
	return x;
}

/*
 * Try to switch the calling process to the SCHED_FIFO policy at that
 * policy's maximum priority.  Failure is not fatal: a warning is printed
 * with perror() and the function returns.
 */
void boost_priority(void)
{
	struct sched_param param;

	if (sched_getparam(0, &param) == 0) {
		param.sched_priority = sched_get_priority_max(SCHED_FIFO);
		if (sched_setscheduler(0, SCHED_FIFO, &param) == 0)
			return;
	}

	/* either call above failed */
	perror("Warning: unable to set scheduling priority, ");
}

int main(int argc, char *argv[])
{
	void *test_buf_;
	int i, j, include_cached = 0;

	for (i = 1; i < argc; i++) {
		if (strcmp(argv[i], "-c") == 0) {
			include_cached = 1;
		} else {
			fprintf(stderr, "Usage: %s [-c]\n"
					"   -c\tinclude cached memory results\n",
					argv[0]);
			exit(1);
		}
	}

	test_buf_ = malloc(2*BUF_SIZE + CACHED_BUF_SIZE + 4096);

	/* page align */
	test_buf = (char *)((long)(test_buf_ + 4095) & ~4095L);

	/* try to get as much scheduling priority as we can */
	// disabled due to result stability problems
	//boost_priority();
	
	/* write tests */
	printf("*** Memory Write Throughput (in MB/s) ***");
	if (include_cached)
		printf("\n%-12s %12s %12s %12s %12s",
		       "method", "uncached", "write alloc", "cached", "cached + wa");
	else
		printf("\n%-12s %12s %12s", "method", "uncached", "write alloc");
	for (i = 0; i < ARRAY_SIZE(write_tests); i++) {
		printf("\n%-12s", write_tests[i].name);

		/* uncached test */
		fflush(stdout);
		printf(" %12.2f", do_test(write_tests[i].fn, 0));
		fflush(stdout);
		printf(" %12.2f", do_test(write_tests[i].fn_wa, 0));

		/* cached test */
		if (include_cached) {
			fflush(stdout);
			printf(" %12.2f", do_test(write_tests[i].fn, 1));
			fflush(stdout);
			printf(" %12.2f", do_test(write_tests[i].fn_wa, 1));
		}
	}
	printf("\n");

	/* read and copy tests (uncached) */
	printf("\n*** %sMemory Read/Copy Throughput (in MB/s) ***\n",
	       include_cached ? "Uncached " : "");
	printf("%-7s", "method");
	for (j = 0; j < ARRAY_SIZE(read_copy_tests[0].fn); j++) {
		char hdr[10];
		sprintf(hdr, "PLD=%d", (1 << j) >> 1);
		printf(" %7s", hdr);
	}
	for (i = 0; i < ARRAY_SIZE(read_copy_tests); i++) {
		printf("\n%-7s", read_copy_tests[i].name);
		for (j = 0; j < ARRAY_SIZE(read_copy_tests[0].fn); j++) {
			fflush(stdout);
			printf(" %7.2f", do_test(read_copy_tests[i].fn[j], 0));
		}
	}
	printf("\n");

	/* read and copy tests (cached) */
	if (!include_cached)
		goto skip_cached_read;
	printf("\n*** Cached Memory Read/Copy Throughput (in MB/s) ***\n");
	printf("%-7s", "method");
	for (j = 0; j < ARRAY_SIZE(read_copy_tests[0].fn); j++) {
		char hdr[10];
		sprintf(hdr, "PLD=%d", (1 << j) >> 1);
		printf(" %7s", hdr);
	}
	for (i = 0; i < ARRAY_SIZE(read_copy_tests); i++) {
		printf("\n%-7s", read_copy_tests[i].name);
		for (j = 0; j < ARRAY_SIZE(read_copy_tests[0].fn); j++) {
			fflush(stdout);
			printf(" %7.2f", do_test(read_copy_tests[i].fn[j], 1));
		}
	}
	printf("\n");
	skip_cached_read:

	free(test_buf_);
	return 0;
}
