[PATCH v3] blk-cgroup: Replace u64 sync with spinlock for iostat update
Boy Wu (吳勃誼)
Boy.Wu at mediatek.com
Tue Jul 16 20:44:47 PDT 2024
On Wed, 2024-07-17 at 10:25 +0800, Boy.Wu wrote:
> On Tue, 2024-07-16 at 11:13 -1000, Tejun Heo wrote:
> >
> > External email : Please do not click links or open attachments
> > until
> > you have verified the sender or the content.
> > Hello, Boy.
> >
> > So, looking at the patch, I'm not sure per-blkg lock makes sense.
> >
> > On Tue, Jul 16, 2024 at 03:52:06PM +0800, boy.wu wrote:
> > > @@ -995,15 +995,13 @@ static void blkcg_iostat_update(struct
> >
> > blkcg_gq *blkg, struct blkg_iostat *cur,
> > > struct blkg_iostat *last)
> > > {
> > > struct blkg_iostat delta;
> > > -unsigned long flags;
> > >
> > > /* propagate percpu delta to global */
> > > -flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
> > > +guard(spinlock_irqsave)(&blkg->iostat.spinlock);
> > > blkg_iostat_set(&delta, cur);
> > > blkg_iostat_sub(&delta, last);
> > > blkg_iostat_add(&blkg->iostat.cur, &delta);
> > > blkg_iostat_add(last, &delta);
> > > -u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
> > > }
> >
> > This is already called with blkg_stat_lock held.
>
> In blkcg_iostat_update, it is protected by both blkg_stat_lock and
> u64
> sync. I agree that no changes are needed here.
>
> >
> > > @@ -1051,10 +1048,8 @@ static void __blkcg_rstat_flush(struct
> > > blkcg
> >
> > *blkcg, int cpu)
> > > goto propagate_up; /* propagate up to parent only */
> > >
> > > /* fetch the current per-cpu values */
> > > -do {
> > > -seq = u64_stats_fetch_begin(&bisc->sync);
> > > +scoped_guard(spinlock_irqsave, &bisc->spinlock)
> > > blkg_iostat_set(&cur, &bisc->cur);
> > > -} while (u64_stats_fetch_retry(&bisc->sync, seq));
> >
> > This is per-cpu stat and we should keep using u64_sync for them.
> >
> > > @@ -1134,9 +1128,8 @@ static void blkcg_fill_root_iostats(void)
> > > cpu_dkstats->sectors[STAT_DISCARD] << 9;
> > > }
> > >
> > > -flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
> > > +guard(spinlock_irqsave)(&blkg->iostat.spinlock);
> > > blkg_iostat_set(&blkg->iostat.cur, &tmp);
> > > -u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
> > > }
> > > }
> >
> > ...
> > > @@ -1157,16 +1149,14 @@ static void blkcg_print_one_stat(struct
> >
> > blkcg_gq *blkg, struct seq_file *s)
> > >
> > > seq_printf(s, "%s ", dname);
> > >
> > > -do {
> > > -seq = u64_stats_fetch_begin(&bis->sync);
> > > -
> > > +scoped_guard(spinlock_irqsave, &bis->spinlock) {
> > > rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
> > > wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
> > > dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
> > > rios = bis->cur.ios[BLKG_IOSTAT_READ];
> > > wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
> > > dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
> > > -} while (u64_stats_fetch_retry(&bis->sync, seq));
> > > +}
> >
> > The above two are the only places which can potentially benefit
> > from
> > per-blkg lock but these aren't hot paths. I'd just use
> > blkg_stat_lock
> > for
> > the above.
>
> In __blkcg_rstat_flush, u64 sync is used for fetching data, changing
> to
> spinlock will increase overhead both 64 bit and 32 bit systems. 64
> bit
> systems do noting in u64 sync, and 32 bit systems can read data in
> parallel in u64 sync if no one updating data. However, it is already
> protected by blkg_stat_lock, so there is no parallelism now, and
> there
> is no issue here. I think that no changes are needed here.
>
> In blkcg_fill_root_iostats, this is where the issue happens in 32 bit
> SMP systems, so spinlock needs to be added to protect it.
>
> In blkcg_print_one_stat, u64 sync is used for fetching data. Changing
> to spinlock will increase overhead like in __blkcg_rstat_flush.
> However, it is already protected by spin_lock_irq(&blkg->q-
> >queue_lock)
> in blkcg_print_stat, so there is no parallelism now, and there is no
> issue here. I think that no changes are needed here.
>
> >
> > > @@ -2152,30 +2141,29 @@ void blk_cgroup_bio_start(struct bio
> > > *bio)
> > >
> > > cpu = get_cpu();
> > > bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
> > > -flags = u64_stats_update_begin_irqsave(&bis->sync);
> > > -
> > > -/*
> > > - * If the bio is flagged with BIO_CGROUP_ACCT it means this is a
> >
> > split
> > > - * bio and we would have already accounted for the size of the
> >
> > bio.
> > > - */
> > > -if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
> > > -bio_set_flag(bio, BIO_CGROUP_ACCT);
> > > -bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
> > > -}
> > > -bis->cur.ios[rwd]++;
> > > +scoped_guard(spinlock_irqsave, &bis->spinlock) {
> > > +/*
> > > + * If the bio is flagged with BIO_CGROUP_ACCT it means this is a
> >
> > split
> > > + * bio and we would have already accounted for the size of the
> >
> > bio.
> > > + */
> > > +if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
> > > +bio_set_flag(bio, BIO_CGROUP_ACCT);
> > > +bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
> > > +}
> > > +bis->cur.ios[rwd]++;
> > >
> > > -/*
> > > - * If the iostat_cpu isn't in a lockless list, put it into the
> > > - * list to indicate that a stat update is pending.
> > > - */
> > > -if (!READ_ONCE(bis->lqueued)) {
> > > -struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);
> > > +/*
> > > + * If the iostat_cpu isn't in a lockless list, put it into the
> > > + * list to indicate that a stat update is pending.
> > > + */
> > > +if (!READ_ONCE(bis->lqueued)) {
> > > +struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);
> > >
> > > -llist_add(&bis->lnode, lhead);
> > > -WRITE_ONCE(bis->lqueued, true);
> > > +llist_add(&bis->lnode, lhead);
> > > +WRITE_ONCE(bis->lqueued, true);
> > > +}
> >
> > These are per-cpu stat updates which should keep using u64_sync. We
> > don't
> > want to incur locking overhead for stat updates in the hot issue
> > path.
> >
>
> I agree that no changes are needed in blk_cgroup_bio_start.
>
> > Thanks.
> >
> > --
> > tejun
>
> I think we can look back to the original issue, which is that on 32
> bit
> SMP systems will have concurrent problems on
> u64_stats_update_begin_irqsave and u64_stats_update_end_irqrestore in
> blkcg_fill_root_iostats. So, adding a lock only on 32 bits systems
> in blkcg_fill_root_iostats is to resolve the concurrent issue and not
> affect 64 bit systems, which do not have the issue in the first
> place,
> because u64 sync does noting in 64 bit systems and they don't need
> it.
>
> I think we can just fix it by following change.
>
> @@ -1134,9 +1128,8 @@ static void blkcg_fill_root_iostats(void)
> cpu_dkstats->sectors[STAT_DISCARD] <<
> 9;
> }
>
> +#if BITS_PER_LONG == 32
> + guard(spinlock_irqsave)(&blkg_stat_lock);
> +#endif
> flags = u64_stats_update_begin_irqsave(&blkg-
> > iostat.sync);
>
> blkg_iostat_set(&blkg->iostat.cur, &tmp);
> u64_stats_update_end_irqrestore(&blkg->iostat.sync,
> flags);
> }
> }
I think there is another way to fix it. Maybe we should address the
root cause, which is that u64_stats_update_begin_irqsave
and u64_stats_update_end_irqrestore do not protect the seqcount in 32
bit SMP systems. This can be fix by the following change.
diff --git a/include/linux/u64_stats_sync.h
b/include/linux/u64_stats_sync.h
index 46040d66334a..94dd74b4fb2c 100644
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -64,6 +64,7 @@
struct u64_stats_sync {
#if BITS_PER_LONG == 32
seqcount_t seq;
+ spinlock_t spinlock;
#endif
};
@@ -138,6 +139,7 @@ static inline void u64_stats_inc(u64_stats_t *p)
static inline void u64_stats_init(struct u64_stats_sync *syncp)
{
seqcount_init(&syncp->seq);
+ spin_lock_init(&syncp->spinlock)
}
static inline void __u64_stats_update_begin(struct u64_stats_sync
*syncp)
@@ -191,6 +193,7 @@ static inline unsigned long
u64_stats_update_begin_irqsave(struct u64_stats_sync
{
unsigned long flags = __u64_stats_irqsave();
+ spin_lock_irq(&syncp->spinlock);
__u64_stats_update_begin(syncp);
return flags;
}
@@ -199,6 +202,7 @@ static inline void
u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp,
unsigned long flags)
{
__u64_stats_update_end(syncp);
+ spin_unlock_irq(&syncp->spinlock);
__u64_stats_irqrestore(flags);
}
--
boy.wu
More information about the linux-arm-kernel
mailing list