[RESEND] Benchmark of I/O multiplexing with LZO/snappy compression
HATAYAMA Daisuke
d.hatayama at jp.fujitsu.com
Mon Jun 18 01:02:21 EDT 2012
Hello,
I evaluated I/O multiplexing together with parallel compression in
two formats: lzo and snappy.

In summary:

- With 8-way I/O multiplexing, throughput is about 5 times that of
  the 1-way case: with snappy, a 1TB copy takes about 25 minutes.
- On randomized data, snappy is as quick as raw, i.e. the
  no-compression case.
- lzo consumes more CPU time than snappy, but it could do better on
  faster CPUs and on sparser data; another kind of benchmark is
  needed to confirm that.

Any comments are appreciated.
* Environment

- PRIMEQUEST 1800E2
  - CPU: Intel Xeon E7-8870 (10 cores, 2.4GHz) x 2 sockets
  - RAM: 32GB
  - Disks
    - MBD2147RC (10025rpm) x 4
    - ETERNUS DX440: Emulex 8Gb/s fibre channel adapters x 4

(*) To get 8-way I/O multiplexing, I used the 4 local disks plus 4
    SAN LUNs, simply because I didn't have enough disks available (^^;
* How to measure what?

This benchmark measures the real time needed to copy 10GB of
on-memory data, simulating /proc/vmcore, into multiple different
disks, either uncompressed or compressed with LZO or snappy.
The data is fully randomized, so no block actually compresses and
the amount of I/O never shrinks during compression; this benchmark
therefore covers the worst case. (The sketch after this paragraph
illustrates the point.)
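As a quick illustration, the following toy program (separate from
nsplit; it only assumes libsnappy is installed, and the file name
compdemo.c is just for illustration) compresses one 4kB random block
and one 4kB zero-filled block with snappy:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <snappy-c.h>

#define BLOCK 4096

/* Compress one block with snappy and report input/output sizes. */
static void try_compress(const char *label, const char *in)
{
	size_t outlen = snappy_max_compressed_length(BLOCK);
	char *out = malloc(outlen);

	if (out && snappy_compress(in, BLOCK, out, &outlen) == SNAPPY_OK)
		printf("%s: %d -> %zu bytes\n", label, BLOCK, outlen);
	free(out);
}

int main(void)
{
	char buf[BLOCK];
	int i;

	for (i = 0; i < BLOCK; i++)	/* random block, like fakevmcore */
		buf[i] = rand();
	try_compress("random", buf);

	memset(buf, 0, BLOCK);		/* zero block: the easy case */
	try_compress("zero", buf);
	return 0;
}

$ gcc -o compdemo compdemo.c -lsnappy

The random block should come out slightly larger than its input,
which is why nsplit falls back to storing a block raw whenever
compression does not shrink it (the sizeout < iobytes check in the
attached source).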
- Parameters
  - number of writing/compressing threads (which is also the degree
    of I/O multiplexing)
    - 1 to 8
  - compression format
    - raw
    - lzo
    - snappy
  - kernel version
    - v3.4
    - RHEL6.2 (2.6.32-220)
    - RHEL5.8 (2.6.18-238)
Example:
- Let the fake vmcore be 10GB with a block size of 4kB.
- Split the I/O into two different disks: /mnt/disk{0,1}.
- The block size for compression is 4kB.
- Compress the data with LZO: -c selects LZO, -s selects snappy.
- Flush the page cache after nsplit finishes.

$ insmod ./fakevmcore.ko fakevmcore_size=$((10*1024*1024*1024)) fakevmcore_block_size=4096
$ time { nsplit -c --block=4096 /proc/fakevmcore /mnt/disk0/a /mnt/disk1/a ; \
         echo 3 > /proc/sys/vm/drop_caches; }
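An 8-way run is the same command with eight out files, one per disk
(the paths here are just for illustration):

$ time { nsplit -s --block=4096 /proc/fakevmcore /mnt/disk{0..7}/a ; \
         echo 3 > /proc/sys/vm/drop_caches; }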
To build nsplit.c on fc16, the following compression libraries are required:
- lzo-devel, lzo-minilzo, lzo
- snappy-devel, snappy
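For reference, nsplit can also be built by hand, matching the
attached Makefile (assuming the libraries above are installed in
their default paths):

$ gcc -g -O0 -W -Wall -pthread -o nsplit nsplit.c -llzo2 -lsnappy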
* Results
n: number of writing and compressing threads
- upstream v3.4 kernel
n  raw        lzo        snappy
1  1m29.617s  2m41.979s  1m9.592s
2  1m8.519s   1m26.555s  1m26.902s
3  0m48.653s  1m0.462s   0m35.172s
4  0m28.039s  0m47.248s  0m28.430s
5  0m23.491s  0m37.181s  0m23.435s
6  0m18.202s  0m28.428s  0m18.580s
7  0m15.897s  0m29.873s  0m16.678s
8  0m13.659s  0m23.180s  0m13.922s
- RHEL6.2 (2.6.32.220)
n  raw        lzo        snappy
1  0m53.119s  2m36.603s  1m33.061s
2  1m31.578s  1m28.808s  0m49.492s
3  0m31.675s  0m57.540s  0m33.795s
4  0m37.714s  0m45.035s  0m32.871s
5  0m20.363s  0m34.988s  0m21.894s
6  0m22.602s  0m31.216s  0m19.195s
7  0m18.837s  0m25.204s  0m15.906s
8  0m13.715s  0m22.228s  0m13.884s
- RHEL5.8 (2.6.18-238)
n  raw        lzo        snappy
1  0m55.144s  1m20.771s  1m4.140s
2  0m52.157s  1m8.336s   1m1.089s
3  0m50.172s  0m41.329s  0m47.859s
4  0m35.409s  0m28.764s  0m43.286s
5  0m22.974s  0m20.501s  0m20.197s
6  0m17.430s  0m18.072s  0m19.524s
7  0m14.222s  0m14.936s  0m15.603s
8  0m13.071s  0m14.755s  0m13.313s
- With 8-way I/O multiplexing, throughput improves by a factor of
  about 4-5 for raw, 5-6 for lzo, and 6-8 for snappy.
- 10GB per 15sec corresponds to 1TB per 25min 36sec
  (1024/10 x 15sec = 1536sec).
- snappy is as quick as raw. I think snappy can be used with very
  low risk even in this worst case.
- lzo is slower than raw and snappy, but parallel compression still
  scales well. Although lzo is the worst of the three in this
  benchmark, I expect it could beat the other two on faster CPUs and
  on data sparser than random.
- With LZO, RHEL5.8 does better than v3.4 and RHEL6.2. Perhaps this
  is due to differences in how the I/O workload is handled, but I
  don't know the reason precisely.
* TODO
- Retry the benchmark using disks only.
- Evaluate btrfs's transparent compression for large data; for very
  large data, compression in kernel space has an advantage over
  compression in user space. (A mount example follows this list.)
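For example, btrfs's transparent compression could be enabled at
mount time roughly as follows (the device and mount point are
hypothetical, and compress=lzo requires a kernel with btrfs lzo
support):

$ mount -t btrfs -o compress=lzo /dev/sdb1 /mnt/disk0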
Thanks.
HATAYAMA, Daisuke
-------------- next part --------------
CC = gcc
CFLAGS = -g -O0 -W -Wall -pthread
LIBS = -llzo2 -lsnappy

obj-m := fakevmcore.o

all: build

build: nsplit fakevmcore.ko

fakevmcore.ko:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules

nsplit: nsplit.o
	$(CC) $(CFLAGS) -o $@ $^ $(LIBS)

nsplit.o: nsplit.c
	$(CC) $(CFLAGS) -c $^

clean:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean
	rm -f ./nsplit
	rm -f *.o *~
-------------- next part --------------
#include <linux/init.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <asm/io.h>
#include <asm/uaccess.h>

static void *fakevmcore_data;
static unsigned long fakevmcore_size;
static unsigned long fakevmcore_block_size;
static struct proc_dir_entry *proc_fakevmcore;

module_param(fakevmcore_size, ulong, 0444);
module_param(fakevmcore_block_size, ulong, 0444);

/*
 * Serve reads by repeating the single random block over the whole
 * virtual file; no buffer of fakevmcore_size is ever allocated.
 */
static ssize_t read_fakevmcore(struct file *file, char __user *buffer,
			       size_t buflen, loff_t *fpos)
{
	ssize_t acc = 0;

	if (buflen == 0 || *fpos >= fakevmcore_size)
		return 0;
	if (buflen > fakevmcore_size - *fpos)
		buflen = fakevmcore_size - *fpos;

	while (buflen > 0) {
		size_t tsz, offset;

		offset = *fpos % fakevmcore_block_size;
		tsz = min(fakevmcore_block_size - offset, buflen);
		if (copy_to_user(buffer, fakevmcore_data + offset, tsz))
			return -EFAULT;
		buflen -= tsz;
		*fpos += tsz;
		buffer += tsz;
		acc += tsz;
	}

	return acc;
}

static const struct file_operations proc_fakevmcore_operations = {
	.read = read_fakevmcore,
	.llseek = default_llseek,
};

static int fakevmcore_init(void)
{
	if (!fakevmcore_size) {
		fakevmcore_size = PAGE_SIZE;
		printk(KERN_INFO "fakevmcore_size defaults to PAGE_SIZE\n");
	}
	if (!fakevmcore_block_size) {
		fakevmcore_block_size = PAGE_SIZE;
		printk(KERN_INFO "fakevmcore_block_size defaults to PAGE_SIZE\n");
	}

	fakevmcore_data = (void *)__get_free_pages(GFP_KERNEL,
					get_order(fakevmcore_block_size));
	if (!fakevmcore_data) {
		printk(KERN_ERR "__get_free_pages failed\n");
		return -ENOMEM;
	}
	get_random_bytes(fakevmcore_data, fakevmcore_block_size);

	proc_fakevmcore = proc_create("fakevmcore", S_IRUGO, NULL,
				      &proc_fakevmcore_operations);
	if (!proc_fakevmcore) {
		printk(KERN_ERR "proc_create failed\n");
		free_pages((unsigned long)fakevmcore_data,
			   get_order(fakevmcore_block_size));
		fakevmcore_data = NULL;
		return -ENOMEM;
	}
	proc_fakevmcore->size = fakevmcore_size;

	return 0;
}

static void fakevmcore_exit(void)
{
	/* Remove the proc entry first so no reader can race with the free. */
	if (proc_fakevmcore) {
		remove_proc_entry(proc_fakevmcore->name, proc_fakevmcore->parent);
		proc_fakevmcore = NULL;
	}
	if (fakevmcore_data) {
		free_pages((unsigned long)fakevmcore_data,
			   get_order(fakevmcore_block_size));
		fakevmcore_data = NULL;
	}
	fakevmcore_size = 0;
	fakevmcore_block_size = 0;
}

module_init(fakevmcore_init);
module_exit(fakevmcore_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("HATAYAMA Daisuke <d.hatayama at jp.fujitsu.com>");
-------------- next part --------------
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <getopt.h>
#include <lzo/lzo1x.h>
#include <snappy-c.h>

enum nsplit_constants
{
	FALSE = 0,
	TRUE = 1,
	ARGV_OUTFILE_START_POS = 2,
	BLOCK_SIZE = 4096,
	PAGE_SIZE = 4096,
};

enum nsplit_compress
{
	NS_COMPRESS_LZO = 1,
	NS_COMPRESS_SNAPPY = 2,
};

struct nsplit_data
{
	char *infile;
	int infd;
	int nr_threads;
	unsigned long nr_blocks;
	struct stat st;
	int debug;
	enum nsplit_compress compress;
	size_t blocksize;
};

struct nsplit_data nsplit_data;
struct nsplit_data *nd = &nsplit_data;
static int
nsplit_data_init(char *infile, int nr_threads)
{
	int ret = FALSE;

	if (!nd->blocksize)
		nd->blocksize = BLOCK_SIZE;
	nd->infile = infile;
	nd->nr_threads = nr_threads;
	if (nd->nr_threads < 1) {
		fprintf(stderr, "invalid number of threads: %d\n",
			nd->nr_threads);
		goto out;
	}
	/* liblzo2 requires runtime initialization before
	 * lzo1x_1_compress() may be called. */
	if (nd->compress == NS_COMPRESS_LZO && lzo_init() != LZO_E_OK) {
		fprintf(stderr, "lzo_init failed\n");
		goto out;
	}
	if (stat(nd->infile, &nd->st) < 0) {
		perror("stat");
		goto out;
	}
	nd->nr_blocks = nd->st.st_size / nd->nr_threads / nd->blocksize;
	if ((nd->infd = open(nd->infile, O_RDONLY)) < 0) {
		perror("open");
		goto out;
	}
	if (nd->debug) {
		printf("infile size: %ld\n", (long)nd->st.st_size);
		printf("number of threads: %d\n", nd->nr_threads);
		printf("block size: %zu\n", nd->blocksize);
		printf("number of blocks: %lu\n", nd->nr_blocks);
	}
	ret = TRUE;
out:
	return ret;
}

static void
nsplit_data_free(void)
{
	if (nd->infd > 0)
		close(nd->infd);
}
struct nsplit_thread_data
{
	char *outfile;
	int thread_index;
	loff_t block_start;
	loff_t block_end;
};

static struct nsplit_thread_data *
nsplit_create_thread_data(char *outfile, int thread_index)
{
	struct nsplit_thread_data *ntd;

	if (posix_memalign((void *)&ntd, PAGE_SIZE,
			   sizeof(struct nsplit_thread_data))) {
		perror("posix_memalign");
		return NULL;
	}
	ntd->outfile = outfile;
	ntd->thread_index = thread_index;
	ntd->block_start = nd->nr_blocks * nd->blocksize * ntd->thread_index;
	ntd->block_end = nd->nr_blocks * nd->blocksize * (ntd->thread_index + 1);
	/* The last thread takes care of the remainder. */
	if (thread_index == nd->nr_threads - 1
	    && nd->st.st_size > ntd->block_end)
		ntd->block_end = nd->st.st_size;
	if (nd->debug) {
		printf("thread: %d\n", thread_index);
		printf("  outfile: %s\n", ntd->outfile);
		printf("  start: %lld\n", (long long)ntd->block_start);
		printf("  end: %lld\n", (long long)ntd->block_end);
	}
	return ntd;
}
/*
 * Care for the case where only part of a whole request succeeds.
 */
static inline int preadout(int fd, unsigned char *buf, size_t size, loff_t off)
{
	ssize_t readbytes, bytes, rest;

	readbytes = 0;
	rest = size;
	while (rest > 0) {
		if ((bytes = pread(fd, buf + readbytes, rest, off)) < 0) {
			perror("pread");
			if (nd->debug) {
				fprintf(stderr, "fd: %d\n", fd);
				fprintf(stderr, "readbytes: %zd\n", readbytes);
				fprintf(stderr, "rest: %zd\n", rest);
				fprintf(stderr, "offset: %lld\n", (long long)off);
			}
			return FALSE;
		}
		if (bytes == 0)	/* unexpected EOF; avoid spinning forever */
			return FALSE;
		readbytes += bytes;
		rest -= bytes;
		off += bytes;
	}
	return TRUE;
}

/* Worst-case output size for lzo1x_1, from the LZO documentation. */
static inline unsigned long compress_bound_lzo(unsigned long size)
{
	return size + size / 16 + 64 + 3;
}
/* Per-thread worker: read this thread's range of blocks from the input
 * and write them, optionally compressed per block, to its own out file. */
static void *f(void *arg)
{
	struct nsplit_thread_data *ntd = arg;
	FILE *fp = NULL;
	loff_t block;
	unsigned char *buffer = NULL, *wrkmem = NULL, *bufout = NULL;
	size_t iobytes = nd->blocksize;

	fp = fopen(ntd->outfile, "w");
	if (!fp) {
		perror("fopen");
		goto out;
	}
	if (posix_memalign((void *)&buffer, PAGE_SIZE, iobytes)) {
		perror("posix_memalign");
		goto out;
	}
	if (nd->compress == NS_COMPRESS_LZO) {
		if (posix_memalign((void *)&wrkmem, PAGE_SIZE,
				   LZO1X_1_MEM_COMPRESS)) {
			perror("posix_memalign");
			goto out;
		}
		if (posix_memalign((void *)&bufout, PAGE_SIZE,
				   compress_bound_lzo(iobytes))) {
			perror("posix_memalign");
			goto out;
		}
	} else if (nd->compress == NS_COMPRESS_SNAPPY) {
		if (posix_memalign((void *)&bufout, PAGE_SIZE,
				   snappy_max_compressed_length(iobytes))) {
			perror("posix_memalign");
			goto out;
		}
	}
	for (block = ntd->block_start; block < ntd->block_end;
	     block += iobytes) {
		unsigned long sizeout;
		int retval;

		/* The last block of the range may be shorter. */
		if (ntd->block_end - block < (loff_t)nd->blocksize)
			iobytes = ntd->block_end - block;
		if (!preadout(nd->infd, buffer, iobytes, block))
			goto out;
		switch (nd->compress) {
		case NS_COMPRESS_LZO:
			sizeout = iobytes;
			retval = lzo1x_1_compress(buffer, iobytes, bufout,
						  &sizeout, wrkmem);
			if (retval == LZO_E_OK && sizeout < iobytes) {
				if (fwrite(bufout, sizeout, 1, fp) != 1) {
					perror("fwrite");
					goto out;
				}
			} else	/* incompressible: store the raw block */
				goto writebufout;
			break;
		case NS_COMPRESS_SNAPPY:
			sizeout = snappy_max_compressed_length(iobytes);
			retval = snappy_compress((char *)buffer, iobytes,
						 (char *)bufout, &sizeout);
			if (retval == SNAPPY_OK && sizeout < iobytes) {
				if (fwrite(bufout, sizeout, 1, fp) != 1) {
					perror("fwrite");
					goto out;
				}
			} else	/* incompressible: store the raw block */
				goto writebufout;
			break;
		default:
		writebufout:
			if (fwrite(buffer, iobytes, 1, fp) != 1) {
				perror("fwrite");
				goto out;
			}
			break;
		}
	}
out:
	if (fp)
		fclose(fp);
	free(ntd);
	if (wrkmem)
		free(wrkmem);
	if (bufout)
		free(bufout);
	if (buffer)
		free(buffer);
	return NULL;
}
static void usage(void)
{
	fprintf(stderr, "usage: nsplit [options] <in file> <out file>...\n");
	fprintf(stderr, "\n");
	fprintf(stderr, "One writing/compressing thread is created per out file.\n");
	fprintf(stderr, "\n");
	fprintf(stderr, "  [-b size|--block=size]  specify block size for compression\n");
	fprintf(stderr, "  [-c]                    compress each block with LZO\n");
	fprintf(stderr, "  [-s]                    compress each block with snappy\n");
	fprintf(stderr, "  [-d|--debug]            print debug information\n");
}

static struct option longopts[] = {
	{"block", required_argument, NULL, 'b'},
	{"debug", no_argument, NULL, 'd'},
	{0, 0, 0, 0}
};
static int nsplit_main(char *infile, char **outfiles, int nr_threads)
{
	pthread_t *threads = NULL;
	int i, err, ret = EXIT_FAILURE;

	if (!nsplit_data_init(infile, nr_threads))
		goto out;
	threads = malloc(nd->nr_threads * sizeof(pthread_t));
	if (!threads) {
		perror("malloc");
		goto out;
	}
	/* One writing/compressing thread per out file. */
	for (i = 0; i < nd->nr_threads; ++i) {
		struct nsplit_thread_data *ntd;

		ntd = nsplit_create_thread_data(outfiles[i], i);
		if (!ntd)
			goto out;
		err = pthread_create(&threads[i], NULL, f, (void *)ntd);
		if (err != 0) {
			/* pthread functions return the error number
			 * instead of setting errno. */
			fprintf(stderr, "pthread_create: %s\n", strerror(err));
			goto out;
		}
	}
	for (i = 0; i < nd->nr_threads; ++i) {
		err = pthread_join(threads[i], NULL);
		if (err != 0) {
			fprintf(stderr, "pthread_join: %s\n", strerror(err));
			goto out;
		}
	}
	ret = EXIT_SUCCESS;
out:
	if (threads)
		free(threads);
	nsplit_data_free();
	return ret;
}
int main(int argc, char **argv)
{
	int c, argerrs, option_index;

	argerrs = 0;
	/* Note: 'b' takes an argument, hence "b:" in the option string. */
	while ((c = getopt_long(argc, argv, "b:cds", longopts,
				&option_index)) != -1) {
		switch (c) {
		case 'b':
			nd->blocksize = strtoul(optarg, NULL, 10);
			break;
		case 'c':
			nd->compress = NS_COMPRESS_LZO;
			break;
		case 'd':
			nd->debug = TRUE;
			break;
		case 's':
			nd->compress = NS_COMPRESS_SNAPPY;
			break;
		case '?':
			argerrs++;
			break;
		}
	}
	/* At least an in file and a single out file must be given. */
	if (!argv[optind] || !argv[optind+1])
		argerrs++;
	if (argerrs > 0) {
		usage();
		exit(EXIT_FAILURE);
	}
	return nsplit_main(argv[optind], &argv[optind+1], argc - optind - 1);
}