[PATCH] makedumpfile: Improve performance for parallel compression with zlib.
"Zhou, Wenjian/周文?"
zhouwj-fnst at cn.fujitsu.com
Wed Oct 21 20:11:10 PDT 2015
Hello Kumagai,

I tested it, and it works well. The results are below.
In a virtual machine (2GB memory):

with empty memory:

version         | num-threads   | time(sec)
----------------+---------------+-------------
devel           |       0       |    12.76
devel           |       1       |    19.29
devel           |       2       |    11.56
+ this patch    |       0       |    12.85
+ this patch    |       1       |     5.61
+ this patch    |       2       |     2.68

with full memory:

version         | num-threads   | time(sec)
----------------+---------------+-------------
devel           |       0       |    51.18
devel           |       1       |    57.82
devel           |       2       |    41.54
+ this patch    |       0       |    49.25
+ this patch    |       1       |    44.80
+ this patch    |       2       |    33.87
In a real machine (16GB memory):

with empty memory:

version         | num-threads   | time(sec)
----------------+---------------+-------------
devel           |       0       |    86.12
devel           |       1       |   222.37
devel           |       8       |    81.50
devel           |      16       |    98.44
+ this patch    |       0       |    86.07
+ this patch    |       1       |    84.33
+ this patch    |       8       |    14.95
+ this patch    |      16       |    13.96

with full memory:

version         | num-threads   | time(sec)
----------------+---------------+-------------
devel           |       0       |   540.89
devel           |       1       |   715.25
devel           |       8       |   132.54
devel           |      16       |   112.89
+ this patch    |       0       |   542.79
+ this patch    |       1       |   538.22
+ this patch    |       8       |   108.28
+ this patch    |      16       |   107.83
--
Thanks
Zhou
On 10/14/2015 01:24 PM, Atsushi Kumagai wrote:
> Hello,
>
> I have addressed the performance issue of parallel compression
> which we faced in:
>
> http://lists.infradead.org/pipermail/kexec/2015-July/014137.html
>
> The cause of the issue is that compress2() calls malloc() and free()
> for a temporary buffer on each call, which can cause many page faults
> since makedumpfile has to call compress2() for every page.
>
> It's easy to avoid the issue: just divide compress2() into three
> functions for the initialization, compression and finalization parts.
> Then we don't need to call the initialization and finalization
> functions for each page.
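>
> For reference, here is a minimal sketch of the resulting call pattern
> (page, buf_out and the size variables are only placeholders; the
> actual implementation is in the patch below):
>
>     z_stream stream;
>
>     /* once per thread: allocate the internal deflate state */
>     stream.zalloc = Z_NULL;
>     stream.zfree  = Z_NULL;
>     stream.opaque = Z_NULL;
>     deflateInit(&stream, Z_BEST_SPEED);
>
>     /* for every page: compress without reallocating the state */
>     stream.next_in   = (Bytef *)page;
>     stream.avail_in  = page_size;
>     stream.next_out  = buf_out;
>     stream.avail_out = buf_out_size;
>     if (deflate(&stream, Z_FINISH) == Z_STREAM_END)
>             size_out = stream.total_out;
>     deflateReset(&stream);   /* make the stream reusable for the next page */
>
>     /* once per thread, at the end: free the internal state */
>     deflateEnd(&stream);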
>
> For the benchmark, I measured the execution time and the number of
> page faults with *perf stat -e page-faults* on the current devel
> branch (v1.5.8+).
>
> The result is here:
>
> CPU: Dual-Core AMD Opteron(tm) Processor 2218 @ 2.6GHz (4 cores)
> Memory: 5GB
> zlib: 1.2.3-29
> glibc: 2.12-1.132
>
> version         | num-threads   | time(sec)   | page-faults
> ----------------+---------------+-------------+------------------
> devel           |       1       |   133.96    |   21,801,120
> devel           |       3       |    87.25    |   21,801,150
> + this patch    |       1       |    47.80    |    1,036,408
> + this patch    |       3       |    39.14    |    1,036,478
>
>
> Thanks
> Atsushi Kumagai
>
>
> From: Atsushi Kumagai <ats-kumagai at wm.jp.nec.com>
> Date: Thu, 8 Oct 2015 15:06:08 +0900
> Subject: [PATCH] Improve performance for parallel compression with zlib.
>
> compress2() allocates a buffer, compresses the input data and
> deallocates the buffer on each call. makedumpfile has to call
> compress2() for each page, which can cause a big performance
> degradation due to many page faults. This issue is especially
> apparent in the case of multi-threaded compression, since a
> per-thread arena is grown and trimmed more easily than the
> main arena.
>
> Fortunately, the zlib functions called by compress2() are global,
> so it's easy to extract the allocation and deallocation parts
> from compress2().
>
> Signed-off-by: Atsushi Kumagai <ats-kumagai at wm.jp.nec.com>
> ---
> makedumpfile.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> makedumpfile.h | 4 ++++
> 2 files changed, 60 insertions(+), 1 deletion(-)
>
> diff --git a/makedumpfile.c b/makedumpfile.c
> index 06c8baf..fa0b779 100644
> --- a/makedumpfile.c
> +++ b/makedumpfile.c
> @@ -25,6 +25,7 @@
> #include <sys/time.h>
> #include <limits.h>
> #include <assert.h>
> +#include <zlib.h>
>
> struct symbol_table symbol_table;
> struct size_table size_table;
> @@ -3538,6 +3539,11 @@ initial_for_parallel()
> MMAP_CACHE_PARALLEL(i)->mmap_start_offset = 0;
> MMAP_CACHE_PARALLEL(i)->mmap_end_offset = 0;
>
> + if (initialize_zlib(&ZLIB_STREAM_PARALLEL(i), Z_BEST_SPEED) == FALSE) {
> + ERRMSG("zlib initialization failed.\n");
> + return FALSE;
> + }
> +
> #ifdef USELZO
> if ((WRKMEM_PARALLEL(i) = malloc(LZO1X_1_MEM_COMPRESS)) == NULL) {
> MSG("Can't allocate memory for the working memory. %s\n",
> @@ -3628,6 +3634,7 @@ free_for_parallel()
>
> free(MMAP_CACHE_PARALLEL(i));
> }
> + finalize_zlib(&ZLIB_STREAM_PARALLEL(i));
> #ifdef USELZO
> if (WRKMEM_PARALLEL(i) != NULL)
> free(WRKMEM_PARALLEL(i));
> @@ -7017,6 +7024,53 @@ write_kdump_page(struct cache_data *cd_header, struct cache_data *cd_page,
> return TRUE;
> }
>
> +int initialize_zlib(z_stream *stream, int level)
> +{
> + int err;
> +
> + stream->zalloc = (alloc_func)Z_NULL;
> + stream->zfree = (free_func)Z_NULL;
> + stream->opaque = (voidpf)Z_NULL;
> +
> + err = deflateInit(stream, level);
> + if (err != Z_OK) {
> + ERRMSG("deflateInit failed: %s\n", zError(err));
> + return FALSE;
> + }
> + return TRUE;
> +}
> +
> +int compress_mdf (z_stream *stream, Bytef *dest, uLongf *destLen,
> + const Bytef *source, uLong sourceLen, int level)
> +{
> + int err;
> + stream->next_in = (Bytef*)source;
> + stream->avail_in = (uInt)sourceLen;
> + stream->next_out = dest;
> + stream->avail_out = (uInt)*destLen;
> + if ((uLong)stream->avail_out != *destLen)
> + return Z_BUF_ERROR;
> +
> + err = deflate(stream, Z_FINISH);
> +
> + if (err != Z_STREAM_END) {
> + deflateReset(stream);
> + return err == Z_OK ? Z_BUF_ERROR : err;
> + }
> + *destLen = stream->total_out;
> +
> + err = deflateReset(stream);
> + return err;
> +}
> +
> +int finalize_zlib(z_stream *stream)
> +{
> + int err;
> + err = deflateEnd(stream);
> +
> + return err;
> +}
> +
> void *
> kdump_thread_function_cyclic(void *arg) {
> void *retval = PTHREAD_FAIL;
> @@ -7035,6 +7089,7 @@ kdump_thread_function_cyclic(void *arg) {
> struct mmap_cache *mmap_cache =
> MMAP_CACHE_PARALLEL(kdump_thread_args->thread_num);
> unsigned long size_out;
> + z_stream *stream = &ZLIB_STREAM_PARALLEL(kdump_thread_args->thread_num);
> #ifdef USELZO
> lzo_bytep wrkmem = WRKMEM_PARALLEL(kdump_thread_args->thread_num);
> #endif
> @@ -7135,7 +7190,7 @@ kdump_thread_function_cyclic(void *arg) {
> size_out = kdump_thread_args->len_buf_out;
> if ((info->flag_compress & DUMP_DH_COMPRESSED_ZLIB)
> && ((size_out = kdump_thread_args->len_buf_out),
> - compress2(buf_out, &size_out, buf,
> + compress_mdf(stream, buf_out, &size_out, buf,
> info->page_size,
> Z_BEST_SPEED) == Z_OK)
> && (size_out < info->page_size)) {
> diff --git a/makedumpfile.h b/makedumpfile.h
> index 0bd6425..cb8f0f3 100644
> --- a/makedumpfile.h
> +++ b/makedumpfile.h
> @@ -438,6 +438,7 @@ do { \
> #define BUF_PARALLEL(i) info->parallel_info[i].buf
> #define BUF_OUT_PARALLEL(i) info->parallel_info[i].buf_out
> #define MMAP_CACHE_PARALLEL(i) info->parallel_info[i].mmap_cache
> +#define ZLIB_STREAM_PARALLEL(i) info->parallel_info[i].zlib_stream
> #ifdef USELZO
> #define WRKMEM_PARALLEL(i) info->parallel_info[i].wrkmem
> #endif
> @@ -1050,6 +1051,7 @@ struct parallel_info {
> unsigned char *buf;
> unsigned char *buf_out;
> struct mmap_cache *mmap_cache;
> + z_stream zlib_stream;
> #ifdef USELZO
> lzo_bytep wrkmem;
> #endif
> @@ -2051,5 +2053,7 @@ int initial_xen(void);
> unsigned long long get_free_memory_size(void);
> int calculate_cyclic_buffer_size(void);
> int prepare_splitblock_table(void);
> +int initialize_zlib(z_stream *stream, int level);
> +int finalize_zlib(z_stream *stream);
>
> #endif /* MAKEDUMPFILE_H */
>