dm-multipath low performance with blk-mq
Mike Snitzer
snitzer at redhat.com
Mon Jan 25 13:40:16 PST 2016
On Tue, Jan 19 2016 at 5:45P -0500,
Mike Snitzer <snitzer at redhat.com> wrote:
> On Mon, Jan 18 2016 at 7:04am -0500,
> Sagi Grimberg <sagig at dev.mellanox.co.il> wrote:
>
> > Hi All,
> >
> > I've recently tried out dm-multipath over a "super-fast" nvme device
> > and noticed serious lock contention in dm-multipath that requires
> > some extra attention. The nvme device is a simple loopback device
> > emulation backed by a null_blk device.
> >
> > With this I've seen dm-multipath push around 470K IOPS, while
> > the native (loopback) nvme device can easily push 1500K+ IOPS.
> >
> > The perf output [1] reveals huge contention on the multipath lock,
> > which is a per-dm_target contention point that seems to defeat the
> > purpose of the blk-mq I/O path.
> >
> > The two current bottlenecks seem to be multipath_busy and
> > __multipath_map. Would it make more sense to move to a percpu_ref
> > model with freeze/unfreeze logic for updates, similar to what
> > blk-mq does?
> >
> > Thoughts?
>
> Your perf output clearly does identify the 'struct multipath' spinlock
> as a bottleneck.
>
> Is it fair to assume that, for your test, you increased
> md->tag_set.nr_hw_queues to > 1 in dm_init_request_based_blk_mq_queue()?
>
> I'd like to start by replicating your testbed, so I'll see about
> setting up the nvme loop driver you referenced in an earlier mail.
> Can you share your fio job file and fio commandline for your test?
I'd still appreciate answers to my two questions above: did you modify
md->tag_set.nr_hw_queues, and can you share your fio job?
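
As for the percpu_ref idea: the model you're describing would look
roughly like blk-mq's q_usage_counter, where the fast path takes a
per-cpu reference and updaters "freeze" (quiesce all users of) the
structure before touching shared state. A minimal sketch of the shape
it could take for dm-mpath, just to make sure we're talking about the
same thing (the mpath_* helpers and struct are hypothetical, not
existing code):

#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/percpu-refcount.h>
#include <linux/wait.h>

struct mpath_state {
	struct percpu_ref ref;
	wait_queue_head_t freeze_wq;
};

static void mpath_ref_release(struct percpu_ref *ref)
{
	struct mpath_state *s = container_of(ref, struct mpath_state, ref);

	wake_up_all(&s->freeze_wq);	/* last fast-path user is gone */
}

static int mpath_state_init(struct mpath_state *s)
{
	init_waitqueue_head(&s->freeze_wq);
	return percpu_ref_init(&s->ref, mpath_ref_release, 0, GFP_KERNEL);
}

/* fast path: per-cpu increment, no shared cacheline to contend on */
static inline bool mpath_enter(struct mpath_state *s)
{
	return percpu_ref_tryget_live(&s->ref);
}

static inline void mpath_exit(struct mpath_state *s)
{
	percpu_ref_put(&s->ref);
}

/* slow path: quiesce all fast-path users before a path/PG update */
static void mpath_freeze(struct mpath_state *s)
{
	percpu_ref_kill(&s->ref);	/* atomic mode; no new users enter */
	wait_event(s->freeze_wq, percpu_ref_is_zero(&s->ref));
}

static void mpath_unfreeze(struct mpath_state *s)
{
	percpu_ref_reinit(&s->ref);	/* back to percpu mode */
}

That would be a considerably bigger change than a straight lock
conversion, though.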
I've yet to reproduce your config (using hch's nvme loop driver) or run
any tests to verify your findings, but I did develop a patch that
switches from spinlock_t to rwlock_t. It is only compile-tested so far;
I'll try to reproduce your setup and then test the patch to see whether
it helps. Your worst offenders (multipath_busy and __multipath_map) now
take a read lock in the fast path.
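
One thing worth calling out: rwlock_t has no atomic read-to-write
upgrade, so in __multipath_map() and multipath_prepare_ioctl() the
patch drops the read lock, takes the write lock, and re-evaluates the
condition (which may have changed in between). Stripped of the
surrounding code, the pattern is (need_to_modify_state() and
modify_state() are placeholders):

	bool use_write_lock = false;

retry:
	if (!use_write_lock)
		read_lock_irq(&m->lock);
	else
		write_lock_irq(&m->lock);

	if (need_to_modify_state(m)) {
		if (!use_write_lock) {
			/* no in-place upgrade: drop, retake, re-check */
			use_write_lock = true;
			read_unlock_irq(&m->lock);
			goto retry;
		}
		modify_state(m);	/* safe: write lock held */
	}

	if (!use_write_lock)
		read_unlock_irq(&m->lock);
	else
		write_unlock_irq(&m->lock);

Note that the read-locked path can only peek at state, which is why
__multipath_map() now tests m->repeat_count == 1 rather than
decrementing it; the actual --m->repeat_count only happens once the
write lock is held.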
drivers/md/dm-mpath.c | 127 +++++++++++++++++++++++++++++++++-----------------
1 file changed, 83 insertions(+), 44 deletions(-)
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index cfa29f5..34aadb1 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -11,6 +11,7 @@
#include "dm-path-selector.h"
#include "dm-uevent.h"
+#include <linux/rwlock_types.h>
#include <linux/blkdev.h>
#include <linux/ctype.h>
#include <linux/init.h>
@@ -67,7 +68,7 @@ struct multipath {
const char *hw_handler_name;
char *hw_handler_params;
- spinlock_t lock;
+ rwlock_t lock;
unsigned nr_priority_groups;
struct list_head priority_groups;
@@ -189,7 +190,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
m = kzalloc(sizeof(*m), GFP_KERNEL);
if (m) {
INIT_LIST_HEAD(&m->priority_groups);
- spin_lock_init(&m->lock);
+ rwlock_init(&m->lock);
m->queue_io = 1;
m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
INIT_WORK(&m->trigger_event, trigger_event);
@@ -386,12 +387,24 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
struct pgpath *pgpath;
struct block_device *bdev;
struct dm_mpath_io *mpio;
+ bool use_write_lock = false;
- spin_lock_irq(&m->lock);
+retry:
+ if (!use_write_lock)
+ read_lock_irq(&m->lock);
+ else
+ write_lock_irq(&m->lock);
/* Do we need to select a new pgpath? */
- if (!m->current_pgpath ||
- (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
+ if (!use_write_lock) {
+ if (!m->current_pgpath ||
+ (!m->queue_io && (m->repeat_count && m->repeat_count == 1))) {
+ use_write_lock = true;
+ read_unlock_irq(&m->lock);
+ goto retry;
+ }
+ } else if (!m->current_pgpath ||
+ (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
__choose_pgpath(m, nr_bytes);
pgpath = m->current_pgpath;
@@ -401,13 +414,23 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
r = -EIO; /* Failed */
goto out_unlock;
} else if (m->queue_io || m->pg_init_required) {
+ if (!use_write_lock) {
+ use_write_lock = true;
+ read_unlock_irq(&m->lock);
+ goto retry;
+ }
__pg_init_all_paths(m);
goto out_unlock;
}
+ if (!use_write_lock)
+ read_unlock_irq(&m->lock);
+ else
+ write_unlock_irq(&m->lock);
+
if (set_mapinfo(m, map_context) < 0)
/* ENOMEM, requeue */
- goto out_unlock;
+ return r;
mpio = map_context->ptr;
mpio->pgpath = pgpath;
@@ -415,8 +438,6 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
bdev = pgpath->path.dev->bdev;
- spin_unlock_irq(&m->lock);
-
if (clone) {
/* Old request-based interface: allocated clone is passed in */
clone->q = bdev_get_queue(bdev);
@@ -443,7 +464,10 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
return DM_MAPIO_REMAPPED;
out_unlock:
- spin_unlock_irq(&m->lock);
+ if (!use_write_lock)
+ read_unlock_irq(&m->lock);
+ else
+ write_unlock_irq(&m->lock);
return r;
}
@@ -474,14 +498,15 @@ static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path,
{
unsigned long flags;
- spin_lock_irqsave(&m->lock, flags);
+ write_lock_irqsave(&m->lock, flags);
if (save_old_value)
m->saved_queue_if_no_path = m->queue_if_no_path;
else
m->saved_queue_if_no_path = queue_if_no_path;
m->queue_if_no_path = queue_if_no_path;
- spin_unlock_irqrestore(&m->lock, flags);
+
+ write_unlock_irqrestore(&m->lock, flags);
if (!queue_if_no_path)
dm_table_run_md_queue_async(m->ti->table);
@@ -898,12 +923,12 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m)
while (1) {
set_current_state(TASK_UNINTERRUPTIBLE);
- spin_lock_irqsave(&m->lock, flags);
+ read_lock_irqsave(&m->lock, flags);
if (!m->pg_init_in_progress) {
- spin_unlock_irqrestore(&m->lock, flags);
+ read_unlock_irqrestore(&m->lock, flags);
break;
}
- spin_unlock_irqrestore(&m->lock, flags);
+ read_unlock_irqrestore(&m->lock, flags);
io_schedule();
}
@@ -916,18 +941,18 @@ static void flush_multipath_work(struct multipath *m)
{
unsigned long flags;
- spin_lock_irqsave(&m->lock, flags);
+ write_lock_irqsave(&m->lock, flags);
m->pg_init_disabled = 1;
- spin_unlock_irqrestore(&m->lock, flags);
+ write_unlock_irqrestore(&m->lock, flags);
flush_workqueue(kmpath_handlerd);
multipath_wait_for_pg_init_completion(m);
flush_workqueue(kmultipathd);
flush_work(&m->trigger_event);
- spin_lock_irqsave(&m->lock, flags);
+ write_lock_irqsave(&m->lock, flags);
m->pg_init_disabled = 0;
- spin_unlock_irqrestore(&m->lock, flags);
+ write_unlock_irqrestore(&m->lock, flags);
}
static void multipath_dtr(struct dm_target *ti)
@@ -946,7 +971,7 @@ static int fail_path(struct pgpath *pgpath)
unsigned long flags;
struct multipath *m = pgpath->pg->m;
- spin_lock_irqsave(&m->lock, flags);
+ write_lock_irqsave(&m->lock, flags);
if (!pgpath->is_active)
goto out;
@@ -968,7 +993,7 @@ static int fail_path(struct pgpath *pgpath)
schedule_work(&m->trigger_event);
out:
- spin_unlock_irqrestore(&m->lock, flags);
+ write_unlock_irqrestore(&m->lock, flags);
return 0;
}
@@ -982,7 +1007,7 @@ static int reinstate_path(struct pgpath *pgpath)
unsigned long flags;
struct multipath *m = pgpath->pg->m;
- spin_lock_irqsave(&m->lock, flags);
+ write_lock_irqsave(&m->lock, flags);
if (pgpath->is_active)
goto out;
@@ -1014,7 +1039,7 @@ static int reinstate_path(struct pgpath *pgpath)
schedule_work(&m->trigger_event);
out:
- spin_unlock_irqrestore(&m->lock, flags);
+ write_unlock_irqrestore(&m->lock, flags);
if (run_queue)
dm_table_run_md_queue_async(m->ti->table);
@@ -1049,13 +1074,13 @@ static void bypass_pg(struct multipath *m, struct priority_group *pg,
{
unsigned long flags;
- spin_lock_irqsave(&m->lock, flags);
+ write_lock_irqsave(&m->lock, flags);
pg->bypassed = bypassed;
m->current_pgpath = NULL;
m->current_pg = NULL;
- spin_unlock_irqrestore(&m->lock, flags);
+ write_unlock_irqrestore(&m->lock, flags);
schedule_work(&m->trigger_event);
}
@@ -1076,7 +1101,7 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
return -EINVAL;
}
- spin_lock_irqsave(&m->lock, flags);
+ write_lock_irqsave(&m->lock, flags);
list_for_each_entry(pg, &m->priority_groups, list) {
pg->bypassed = 0;
if (--pgnum)
@@ -1086,7 +1111,7 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
m->current_pg = NULL;
m->next_pg = pg;
}
- spin_unlock_irqrestore(&m->lock, flags);
+ write_unlock_irqrestore(&m->lock, flags);
schedule_work(&m->trigger_event);
return 0;
@@ -1125,14 +1150,14 @@ static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
unsigned long flags;
int limit_reached = 0;
- spin_lock_irqsave(&m->lock, flags);
+ write_lock_irqsave(&m->lock, flags);
if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled)
m->pg_init_required = 1;
else
limit_reached = 1;
- spin_unlock_irqrestore(&m->lock, flags);
+ write_unlock_irqrestore(&m->lock, flags);
return limit_reached;
}
@@ -1186,7 +1211,7 @@ static void pg_init_done(void *data, int errors)
fail_path(pgpath);
}
- spin_lock_irqsave(&m->lock, flags);
+ write_lock_irqsave(&m->lock, flags);
if (errors) {
if (pgpath == m->current_pgpath) {
DMERR("Could not failover device. Error %d.", errors);
@@ -1213,7 +1238,7 @@ static void pg_init_done(void *data, int errors)
wake_up(&m->pg_init_wait);
out:
- spin_unlock_irqrestore(&m->lock, flags);
+ write_unlock_irqrestore(&m->lock, flags);
}
static void activate_path(struct work_struct *work)
@@ -1272,7 +1297,7 @@ static int do_end_io(struct multipath *m, struct request *clone,
if (mpio->pgpath)
fail_path(mpio->pgpath);
- spin_lock_irqsave(&m->lock, flags);
+ read_lock_irqsave(&m->lock, flags);
if (!m->nr_valid_paths) {
if (!m->queue_if_no_path) {
if (!__must_push_back(m))
@@ -1282,7 +1307,7 @@ static int do_end_io(struct multipath *m, struct request *clone,
r = error;
}
}
- spin_unlock_irqrestore(&m->lock, flags);
+ read_unlock_irqrestore(&m->lock, flags);
return r;
}
@@ -1340,9 +1365,9 @@ static void multipath_resume(struct dm_target *ti)
struct multipath *m = (struct multipath *) ti->private;
unsigned long flags;
- spin_lock_irqsave(&m->lock, flags);
+ write_lock_irqsave(&m->lock, flags);
m->queue_if_no_path = m->saved_queue_if_no_path;
- spin_unlock_irqrestore(&m->lock, flags);
+ write_unlock_irqrestore(&m->lock, flags);
}
/*
@@ -1372,7 +1397,7 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
unsigned pg_num;
char state;
- spin_lock_irqsave(&m->lock, flags);
+ read_lock_irqsave(&m->lock, flags);
/* Features */
if (type == STATUSTYPE_INFO)
@@ -1467,7 +1492,7 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
break;
}
- spin_unlock_irqrestore(&m->lock, flags);
+ read_unlock_irqrestore(&m->lock, flags);
}
static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
@@ -1534,16 +1559,27 @@ out:
}
static int multipath_prepare_ioctl(struct dm_target *ti,
- struct block_device **bdev, fmode_t *mode)
+ struct block_device **bdev, fmode_t *mode)
{
struct multipath *m = ti->private;
unsigned long flags;
int r;
+ bool use_write_lock = false;
- spin_lock_irqsave(&m->lock, flags);
+retry:
+ if (!use_write_lock)
+ read_lock_irqsave(&m->lock, flags);
+ else
+ write_lock_irqsave(&m->lock, flags);
- if (!m->current_pgpath)
+ if (!m->current_pgpath) {
+ if (!use_write_lock) {
+ use_write_lock = true;
+ read_unlock_irqrestore(&m->lock, flags);
+ goto retry;
+ }
__choose_pgpath(m, 0);
+ }
if (m->current_pgpath) {
if (!m->queue_io) {
@@ -1562,17 +1598,20 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
r = -EIO;
}
- spin_unlock_irqrestore(&m->lock, flags);
+ if (!use_write_lock)
+ read_unlock_irqrestore(&m->lock, flags);
+ else
+ write_unlock_irqrestore(&m->lock, flags);
if (r == -ENOTCONN) {
- spin_lock_irqsave(&m->lock, flags);
+ write_lock_irqsave(&m->lock, flags);
if (!m->current_pg) {
/* Path status changed, redo selection */
__choose_pgpath(m, 0);
}
if (m->pg_init_required)
__pg_init_all_paths(m);
- spin_unlock_irqrestore(&m->lock, flags);
+ write_unlock_irqrestore(&m->lock, flags);
dm_table_run_md_queue_async(m->ti->table);
}
@@ -1627,7 +1666,7 @@ static int multipath_busy(struct dm_target *ti)
struct pgpath *pgpath;
unsigned long flags;
- spin_lock_irqsave(&m->lock, flags);
+ read_lock_irqsave(&m->lock, flags);
/* pg_init in progress or no paths available */
if (m->pg_init_in_progress ||
@@ -1674,7 +1713,7 @@ static int multipath_busy(struct dm_target *ti)
busy = 0;
out:
- spin_unlock_irqrestore(&m->lock, flags);
+ read_unlock_irqrestore(&m->lock, flags);
return busy;
}