[PATCH 13/26] netfs: Add some tools for managing bvecq chains
David Howells
dhowells at redhat.com
Thu Mar 26 03:45:28 PDT 2026
Provide a selection of tools for managing bvec queue chains. This
includes:
(1) Allocation, prepopulation, expansion, shortening and refcounting of
bvecqs and bvecq chains.
This can be used to do things like creating an encryption buffer in
cifs or a directory content buffer in afs. The memory segments will
be appropriately disposed of according to the flags on the bvecq.
(2) Management of a bvecq chain as a rolling buffer and the management of
positions within it.
(3) Loading folios, slicing chains and clearing content.
Signed-off-by: David Howells <dhowells at redhat.com>
cc: Paulo Alcantara <pc at manguebit.org>
cc: Matthew Wilcox <willy at infradead.org>
cc: Christoph Hellwig <hch at infradead.org>
cc: linux-cifs at vger.kernel.org
cc: netfs at lists.linux.dev
cc: linux-fsdevel at vger.kernel.org
---
fs/netfs/Makefile | 1 +
fs/netfs/bvecq.c | 706 +++++++++++++++++++++++++++++++++++
fs/netfs/internal.h | 1 +
fs/netfs/stats.c | 4 +-
include/linux/bvecq.h | 165 +++++++-
include/linux/iov_iter.h | 4 +-
include/linux/netfs.h | 1 +
include/trace/events/netfs.h | 24 ++
lib/iov_iter.c | 16 +-
lib/scatterlist.c | 4 +-
lib/tests/kunit_iov_iter.c | 18 +-
11 files changed, 919 insertions(+), 25 deletions(-)
create mode 100644 fs/netfs/bvecq.c
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index b43188d64bd8..e1f12ecb5abf 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -3,6 +3,7 @@
netfs-y := \
buffered_read.o \
buffered_write.o \
+ bvecq.o \
direct_read.o \
direct_write.o \
iterator.o \
diff --git a/fs/netfs/bvecq.c b/fs/netfs/bvecq.c
new file mode 100644
index 000000000000..c71646ea5243
--- /dev/null
+++ b/fs/netfs/bvecq.c
@@ -0,0 +1,706 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Buffering helpers for bvec queues
+ *
+ * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells at redhat.com)
+ */
+
+#include "internal.h"
+
+/*
+ * Dump a bvecq chain to the kernel log for debugging, one line per populated
+ * slot.  A leading run of empty slots in a node is elided and summarised as
+ * "...".
+ */
+void bvecq_dump(const struct bvecq *bq)
+{
+ int b = 0;
+
+ for (; bq; bq = bq->next, b++) {
+ int skipz = 0;
+
+ pr_notice("BQ[%u] %u/%u fp=%llx\n", b, bq->nr_slots, bq->max_slots, bq->fpos);
+ for (int s = 0; s < bq->nr_slots; s++) {
+ const struct bio_vec *bv = &bq->bv[s];
+
+ /* Elide empty slots until the first populated slot has
+ * been printed (skipz: 0 = nothing elided yet,
+ * 1 = eliding, 2 = stop eliding).
+ */
+ if (!bv->bv_page && !bv->bv_len && skipz < 2) {
+ skipz = 1;
+ continue;
+ }
+ if (skipz == 1)
+ pr_notice("BQ[%u:00-%02u] ...\n", b, s - 1);
+ skipz = 2;
+ pr_notice("BQ[%u:%02u] %10lx %04x %04x %u\n",
+ b, s,
+ bv->bv_page ? page_to_pfn(bv->bv_page) : 0,
+ bv->bv_offset, bv->bv_len,
+ bv->bv_page ? page_count(bv->bv_page) : 0);
+ }
+ }
+}
+EXPORT_SYMBOL(bvecq_dump);
+
+/**
+ * bvecq_alloc_one - Allocate a single bvecq node with unpopulated slots
+ * @nr_slots: Number of slots to allocate
+ * @gfp: The allocation constraints.
+ *
+ * Allocate a single bvecq node and initialise the header. A number of inline
+ * slots are also allocated, rounded up to fit after the header in a power-of-2
+ * slab object of up to 512 bytes (up to 29 slots on a 64-bit cpu). The slot
+ * array is not initialised.
+ *
+ * Return: The node pointer or NULL on allocation failure.
+ */
+struct bvecq *bvecq_alloc_one(size_t nr_slots, gfp_t gfp)
+{
+ struct bvecq *bq;
+ const size_t max_size = 512;
+ const size_t max_slots = (max_size - sizeof(*bq)) / sizeof(bq->__bv[0]);
+ size_t part = umin(nr_slots, max_slots);
+ size_t size = roundup_pow_of_two(struct_size(bq, __bv, part));
+
+ bq = kmalloc(size, gfp);
+ if (bq) {
+ *bq = (struct bvecq) {
+ .ref = REFCOUNT_INIT(1),
+ .bv = bq->__bv,
+ .inline_bv = true,
+ /* Make every slot that fits in the rounded-up
+ * allocation available; this may exceed @nr_slots.
+ */
+ .max_slots = (size - sizeof(*bq)) / sizeof(bq->__bv[0]),
+ };
+ netfs_stat(&netfs_n_bvecq);
+ }
+ return bq;
+}
+EXPORT_SYMBOL(bvecq_alloc_one);
+
+/**
+ * bvecq_alloc_chain - Allocate an unpopulated bvecq chain
+ * @nr_slots: Number of slots to allocate
+ * @gfp: The allocation constraints.
+ *
+ * Allocate a chain of bvecq nodes providing at least the requested cumulative
+ * number of slots.
+ *
+ * Return: The first node pointer or NULL on allocation failure.
+ */
+struct bvecq *bvecq_alloc_chain(size_t nr_slots, gfp_t gfp)
+{
+ struct bvecq *head = NULL, *tail = NULL;
+
+ _enter("%zu", nr_slots);
+
+ for (;;) {
+ struct bvecq *bq;
+
+ bq = bvecq_alloc_one(nr_slots, gfp);
+ if (!bq)
+ goto oom;
+
+ if (tail) {
+ tail->next = bq;
+ bq->prev = tail;
+ } else {
+ head = bq;
+ }
+ tail = bq;
+ /* A node may supply more slots than asked for; stop as soon as
+ * the remainder of the request is covered.
+ */
+ if (tail->max_slots >= nr_slots)
+ break;
+ nr_slots -= tail->max_slots;
+ }
+
+ return head;
+oom:
+ /* Any partially built chain is released in one go. */
+ bvecq_put(head);
+ return NULL;
+}
+EXPORT_SYMBOL(bvecq_alloc_chain);
+
+/**
+ * bvecq_alloc_buffer - Allocate a bvecq chain and populate with buffers
+ * @size: Target size of the buffer (can be 0 for an empty buffer)
+ * @pre_slots: Number of preamble slots to set aside
+ * @gfp: The allocation constraints.
+ *
+ * Allocate a chain of bvecq nodes and populate the slots with sufficient pages
+ * to provide at least the requested amount of space, leaving the first
+ * @pre_slots slots unset. The pages allocated may be compound pages larger
+ * than PAGE_SIZE and thus occupy fewer slots. The pages have their refcounts
+ * set to 1 and can be passed to MSG_SPLICE_PAGES.
+ *
+ * Return: The first node pointer or NULL on allocation failure.
+ */
+struct bvecq *bvecq_alloc_buffer(size_t size, unsigned int pre_slots, gfp_t gfp)
+{
+ struct bvecq *head = NULL, *tail = NULL, *p = NULL;
+ size_t count = DIV_ROUND_UP(size, PAGE_SIZE);
+
+ _enter("%zx,%zx,%u", size, count, pre_slots);
+
+ do {
+ struct page **pages;
+ int want, got;
+
+ /* 32 - 3 = 29, the most inline slots a 512-byte node yields
+ * (see bvecq_alloc_one()) - TODO confirm the constant tracks
+ * that sizing.
+ */
+ p = bvecq_alloc_one(umin(count, 32 - 3), gfp);
+ if (!p)
+ goto oom;
+
+ p->free = true;
+
+ if (tail) {
+ tail->next = p;
+ p->prev = tail;
+ } else {
+ head = p;
+ }
+ tail = p;
+ if (!count)
+ break;
+
+ /* Borrow the unused end of the slot array as scratch space for
+ * the bulk allocator's page-pointer array; the bio_vec slots
+ * are then filled in from the front, behind the read point.
+ */
+ pages = (struct page **)&p->bv[p->max_slots];
+ pages -= p->max_slots - pre_slots;
+ memset(pages, 0, (p->max_slots - pre_slots) * sizeof(pages[0]));
+
+ want = umin(count, p->max_slots - pre_slots);
+ got = alloc_pages_bulk(gfp, want, pages);
+ if (got < want) {
+ for (int i = 0; i < got; i++)
+ __free_page(pages[i]);
+ goto oom;
+ }
+
+ tail->nr_slots = pre_slots + got;
+ for (int i = 0; i < got; i++) {
+ int j = pre_slots + i;
+
+ set_page_count(pages[i], 1);
+ bvec_set_page(&tail->bv[j], pages[i], PAGE_SIZE, 0);
+ }
+
+ count -= got;
+ /* Preamble slots are only reserved in the first node. */
+ pre_slots = 0;
+ } while (count > 0);
+
+ return head;
+oom:
+ bvecq_put(head);
+ return NULL;
+}
+EXPORT_SYMBOL(bvecq_alloc_buffer);
+
+/*
+ * Free or unpin the page pointed to by a segment as necessary.  Nothing is
+ * done if the queue doesn't own its pages (->free is unset) or the slot is
+ * unoccupied.
+ */
+static void bvecq_free_seg(struct bvecq *bq, unsigned int seg)
+{
+ if (!bq->free ||
+ !bq->bv[seg].bv_page)
+ return;
+
+ if (bq->unpin)
+ unpin_user_page(bq->bv[seg].bv_page);
+ else
+ __free_page(bq->bv[seg].bv_page);
+}
+
+/**
+ * bvecq_put - Put a ref on a bvec queue
+ * @bq: The start of the bvec queue to put
+ *
+ * Put the ref(s) on the nodes in a bvec queue, freeing up the node and the
+ * page fragments it points to as the refcounts become zero.
+ */
+void bvecq_put(struct bvecq *bq)
+{
+ struct bvecq *next;
+
+ for (; bq; bq = next) {
+ /* Stop at the first node that's still in use; the rest of the
+ * chain is reachable from it and keeps its refs.
+ */
+ if (!refcount_dec_and_test(&bq->ref))
+ break;
+ for (int seg = 0; seg < bq->nr_slots; seg++)
+ bvecq_free_seg(bq, seg);
+ next = bq->next;
+ netfs_stat_d(&netfs_n_bvecq);
+ kfree(bq);
+ }
+}
+EXPORT_SYMBOL(bvecq_put);
+
+/**
+ * bvecq_expand_buffer - Allocate buffer space into a bvec queue
+ * @_buffer: Pointer to the bvecq chain to expand (may point to a NULL; updated).
+ * @_cur_size: Current size of the buffer (updated).
+ * @size: Target size of the buffer.
+ * @gfp: The allocation constraints.
+ *
+ * Return: 0 if successful; -ENOMEM on allocation failure.  Any expansion
+ * already performed is kept and is reflected in *@_buffer and *@_cur_size.
+ */
+int bvecq_expand_buffer(struct bvecq **_buffer, size_t *_cur_size, ssize_t size, gfp_t gfp)
+{
+ struct bvecq *tail = *_buffer;
+ const size_t max_slots = 32;
+
+ size = round_up(size, PAGE_SIZE);
+ if (*_cur_size >= size)
+ return 0;
+
+ /* Find the end of the existing chain so we append there. */
+ if (tail)
+ while (tail->next)
+ tail = tail->next;
+
+ do {
+ struct page *page;
+ int order = 0;
+
+ if (!tail || bvecq_is_full(tail)) {
+ struct bvecq *p;
+
+ p = bvecq_alloc_one(max_slots, gfp);
+ if (!p)
+ return -ENOMEM;
+ if (tail) {
+ tail->next = p;
+ p->prev = tail;
+ } else {
+ *_buffer = p;
+ }
+ tail = p;
+ }
+
+ /* Try a compound page sized for the remaining deficit, but
+ * fall back to an order-0 page if that can't be had.
+ */
+ if (size - *_cur_size > PAGE_SIZE)
+ order = umin(ilog2(size - *_cur_size) - PAGE_SHIFT,
+ MAX_PAGECACHE_ORDER);
+
+ page = alloc_pages(gfp | __GFP_COMP, order);
+ if (!page && order > 0)
+ page = alloc_pages(gfp | __GFP_COMP, 0);
+ if (!page)
+ return -ENOMEM;
+
+ bvec_set_page(&tail->bv[tail->nr_slots++], page, PAGE_SIZE << order, 0);
+ *_cur_size += PAGE_SIZE << order;
+ } while (*_cur_size < size);
+
+ return 0;
+}
+EXPORT_SYMBOL(bvecq_expand_buffer);
+
+/**
+ * bvecq_shorten_buffer - Shorten a bvec queue buffer
+ * @bq: The start of the buffer to shorten
+ * @slot: The slot to start from
+ * @size: The size to retain
+ *
+ * Shorten the content of a bvec queue down to the minimum number of segments,
+ * starting at the specified segment, to retain the specified size.
+ *
+ * Return: 0 if successful; -EMSGSIZE if there is insufficient content.
+ */
+int bvecq_shorten_buffer(struct bvecq *bq, unsigned int slot, size_t size)
+{
+ ssize_t retain = size;
+
+ /* Skip through the segments we want to keep. */
+ for (; bq; bq = bq->next) {
+ for (; slot < bq->nr_slots; slot++) {
+ retain -= bq->bv[slot].bv_len;
+ if (retain < 0)
+ goto found;
+ }
+ slot = 0;
+ }
+ if (WARN_ON_ONCE(retain > 0))
+ return -EMSGSIZE;
+ return 0;
+
+found:
+ /* Shorten the entry to be retained and clean the rest of this bvecq. */
+ /* NOTE(review): if @size exactly covers a whole number of segments, the
+ * boundary is detected in the following segment, which is then kept
+ * with a zero length and its page left allocated - confirm intended.
+ */
+ bq->bv[slot].bv_len += retain;
+ slot++;
+ for (int i = slot; i < bq->nr_slots; i++)
+ bvecq_free_seg(bq, i);
+ bq->nr_slots = slot;
+
+ /* Free the queue tail. */
+ bvecq_put(bq->next);
+ bq->next = NULL;
+ return 0;
+}
+EXPORT_SYMBOL(bvecq_shorten_buffer);
+
+/**
+ * bvecq_buffer_init - Initialise a buffer and set position
+ * @pos: The position to point at the new buffer.
+ * @gfp: The allocation constraints.
+ *
+ * Initialise a rolling buffer. We allocate an unpopulated bvecq node so that
+ * the pointers can be independently driven by the producer and the consumer.
+ *
+ * Return: 0 if successful; -ENOMEM on allocation failure.
+ */
+int bvecq_buffer_init(struct bvecq_pos *pos, gfp_t gfp)
+{
+ struct bvecq *bq;
+
+ bq = bvecq_alloc_one(13, gfp);
+ if (!bq)
+ return -ENOMEM;
+
+ pos->bvecq = bq; /* Comes with a ref. */
+ pos->slot = 0;
+ pos->offset = 0;
+ return 0;
+}
+
+/**
+ * bvecq_buffer_make_space - Start a new bvecq node in a buffer
+ * @pos: The position of the last node.
+ * @gfp: The allocation constraints.
+ *
+ * Add a new node on to the buffer chain at the specified position, either
+ * because the previous one is full or because we have a discontiguity to
+ * contend with, and update @pos to point to it.
+ *
+ * Return: 0 if successful; -ENOMEM on allocation failure.
+ */
+int bvecq_buffer_make_space(struct bvecq_pos *pos, gfp_t gfp)
+{
+ struct bvecq *bq, *head = pos->bvecq;
+
+ bq = bvecq_alloc_one(14, gfp);
+ if (!bq)
+ return -ENOMEM;
+ bq->prev = head;
+
+ /* The allocation ref is passed to the chain via head->next below;
+ * take a second ref here for @pos.
+ */
+ pos->bvecq = bvecq_get(bq);
+ pos->slot = 0;
+ pos->offset = 0;
+
+ /* Make sure the initialisation is stored before the next pointer.
+ *
+ * [!] NOTE: After we set head->next, the consumer is at liberty to
+ * immediately delete the old head.
+ */
+ smp_store_release(&head->next, bq);
+ bvecq_put(head);
+ return 0;
+}
+
+/**
+ * bvecq_pos_advance - Advance a bvecq position
+ * @pos: The position to advance.
+ * @amount: The amount of bytes to advance by.
+ *
+ * Advance the specified bvecq position by @amount bytes. @pos is updated and
+ * bvecq ref counts may have been manipulated. If the position hits the end of
+ * the queue, then it is left pointing beyond the last slot of the last bvecq
+ * so that it doesn't break the chain.
+ */
+void bvecq_pos_advance(struct bvecq_pos *pos, size_t amount)
+{
+ struct bvecq *bq = pos->bvecq;
+ unsigned int slot = pos->slot;
+ size_t offset = pos->offset;
+
+ /* The queue may have been extended since the position was parked
+ * beyond the last slot of its node.
+ */
+ if (slot >= bq->nr_slots) {
+ bq = bq->next;
+ slot = 0;
+ }
+
+ while (amount) {
+ const struct bio_vec *bv = &bq->bv[slot];
+ size_t part = umin(bv->bv_len - offset, amount);
+
+ /* If we come to rest strictly inside this slot, part == amount
+ * and we're done; otherwise we consumed up to the end of the
+ * slot, so fall through and step onwards. (Comparing against
+ * bv_len alone would stop short when offset is non-zero and
+ * leave offset == bv_len, stalling later advances.)
+ */
+ if (likely(offset + part < bv->bv_len)) {
+ offset += part;
+ break;
+ }
+ amount -= part;
+ offset = 0;
+ slot++;
+ if (slot >= bq->nr_slots) {
+ if (!bq->next)
+ break;
+ bq = bq->next;
+ slot = 0;
+ }
+ }
+
+ pos->slot = slot;
+ pos->offset = offset;
+ bvecq_pos_move(pos, bq);
+}
+
+/**
+ * bvecq_zero - Clear memory starting at the bvecq position.
+ * @pos: The position in the bvecq chain to start clearing.
+ * @amount: The number of bytes to clear.
+ *
+ * Clear memory fragments pointed to by a bvec queue. @pos is updated and
+ * bvecq ref counts may have been manipulated. If the position hits the end of
+ * the queue, then it is left pointing beyond the last slot of the last bvecq
+ * so that it doesn't break the chain.
+ *
+ * Return: The number of bytes cleared.
+ */
+ssize_t bvecq_zero(struct bvecq_pos *pos, size_t amount)
+{
+ struct bvecq *bq = pos->bvecq;
+ unsigned int slot = pos->slot;
+ ssize_t cleared = 0;
+ size_t offset = pos->offset;
+
+ if (WARN_ON_ONCE(!bq))
+ return 0;
+
+ /* The queue may have been extended since the position was parked
+ * beyond the last slot of its node.
+ */
+ if (slot >= bq->nr_slots) {
+ bq = bq->next;
+ if (WARN_ON_ONCE(!bq))
+ return 0;
+ slot = 0;
+ }
+
+ do {
+ const struct bio_vec *bv = &bq->bv[slot];
+
+ if (offset < bv->bv_len) {
+ size_t part = umin(amount - cleared, bv->bv_len - offset);
+
+ /* NOTE(review): unlike bvecq_slice(), bv_page is not
+ * checked for NULL here - assumes the region being
+ * cleared is fully populated; confirm.
+ */
+ memzero_page(bv->bv_page, bv->bv_offset + offset, part);
+
+ offset += part;
+ cleared += part;
+ }
+
+ if (offset >= bv->bv_len) {
+ offset = 0;
+ slot++;
+ if (slot >= bq->nr_slots) {
+ if (!bq->next)
+ break;
+ bq = bq->next;
+ slot = 0;
+ }
+ }
+ } while (cleared < amount);
+
+ bvecq_pos_move(pos, bq);
+ pos->slot = slot;
+ pos->offset = offset;
+ return cleared;
+}
+
+/**
+ * bvecq_slice - Find a slice of a bvecq queue
+ * @pos: The position to start at.
+ * @max_size: The maximum size of the slice (or ULONG_MAX).
+ * @max_segs: The maximum number of segments in the slice (or INT_MAX).
+ * @_nr_segs: Where to put the number of segments (updated).
+ *
+ * Determine the size and number of segments that can be obtained for the next
+ * slice of the bvec queue up to the maximum size and segment count specified.
+ * The slice is also limited if a discontiguity is found.
+ *
+ * @pos is updated to the end of the slice. If the position hits the end of
+ * the queue, then it is left pointing beyond the last slot of the last bvecq
+ * so that it doesn't break the chain.
+ *
+ * Return: The number of bytes in the slice.
+ */
+size_t bvecq_slice(struct bvecq_pos *pos, size_t max_size,
+ unsigned int max_segs, unsigned int *_nr_segs)
+{
+ struct bvecq *bq;
+ unsigned int slot = pos->slot, nsegs = 0;
+ size_t size = 0;
+ size_t offset = pos->offset;
+
+ bq = pos->bvecq;
+ for (;;) {
+ for (; slot < bq->nr_slots; slot++) {
+ const struct bio_vec *bvec = &bq->bv[slot];
+
+ /* Empty or unoccupied slots are skipped over without
+ * contributing a segment.
+ */
+ if (offset < bvec->bv_len && bvec->bv_page) {
+ size_t part = umin(bvec->bv_len - offset, max_size);
+
+ size += part;
+ offset += part;
+ max_size -= part;
+ nsegs++;
+ if (!max_size || nsegs >= max_segs)
+ goto out;
+ }
+ offset = 0;
+ }
+
+ /* pos->bvecq isn't allowed to go NULL as the queue may get
+ * extended and we would lose our place.
+ */
+ if (!bq->next)
+ break;
+ slot = 0;
+ bq = bq->next;
+ if (bq->discontig && size > 0)
+ break;
+ }
+
+out:
+ *_nr_segs = nsegs;
+ /* Step past a fully consumed node so the position doesn't linger on a
+ * slot index equal to nr_slots mid-chain.
+ */
+ if (slot == bq->nr_slots && bq->next) {
+ bq = bq->next;
+ slot = 0;
+ offset = 0;
+ }
+ bvecq_pos_move(pos, bq);
+ pos->slot = slot;
+ pos->offset = offset;
+ return size;
+}
+
+/**
+ * bvecq_extract - Extract a slice of a bvecq queue into a new bvecq queue
+ * @pos: The position to start at.
+ * @max_size: The maximum size of the slice (or ULONG_MAX).
+ * @max_segs: The maximum number of segments in the slice (or INT_MAX).
+ * @to: Where to put the extraction bvecq chain head (updated).
+ *
+ * Allocate a new bvecq and extract into it memory fragments from a slice of
+ * bvec queue, starting at @pos. The slice is also limited if a discontiguity
+ * is found. No refs are taken on the pages.
+ *
+ * @pos is updated to the end of the slice. If the position hits the end of
+ * the queue, then it is left pointing beyond the last slot of the last bvecq
+ * so that it doesn't break the chain.
+ *
+ * If successful, *@to is set to point to the head of the newly allocated chain
+ * and the caller inherits a ref to it.
+ *
+ * Return: The number of bytes extracted; -ENOMEM on allocation failure or -EIO
+ * if no segments were available to extract.
+ */
+ssize_t bvecq_extract(struct bvecq_pos *pos, size_t max_size,
+ unsigned int max_segs, struct bvecq **to)
+{
+ struct bvecq_pos tmp_pos;
+ struct bvecq *src, *dst = NULL;
+ unsigned int slot = pos->slot, nsegs;
+ ssize_t extracted = 0;
+ size_t offset = pos->offset, amount;
+
+ *to = NULL;
+ if (WARN_ON_ONCE(!max_segs))
+ max_segs = INT_MAX;
+
+ /* Dry-run on a throwaway copy of the position to size the slice. */
+ bvecq_pos_set(&tmp_pos, pos);
+ amount = bvecq_slice(&tmp_pos, max_size, max_segs, &nsegs);
+ bvecq_pos_unset(&tmp_pos);
+ if (nsegs == 0)
+ return -EIO;
+
+ dst = bvecq_alloc_chain(nsegs, GFP_KERNEL);
+ if (!dst)
+ return -ENOMEM;
+ *to = dst;
+ max_segs = nsegs;
+ nsegs = 0;
+
+ /* Transcribe the segments */
+ src = pos->bvecq;
+ for (;;) {
+ for (; slot < src->nr_slots; slot++) {
+ const struct bio_vec *sv = &src->bv[slot];
+ struct bio_vec *dv = &dst->bv[dst->nr_slots];
+
+ _debug("EXTR BQ=%x[%x] off=%zx am=%zx p=%lx",
+ src->priv, slot, offset, amount, page_to_pfn(sv->bv_page));
+
+ if (offset < sv->bv_len && sv->bv_page) {
+ size_t part = umin(sv->bv_len - offset, amount);
+
+ bvec_set_page(dv, sv->bv_page, part,
+ sv->bv_offset + offset);
+ extracted += part;
+ amount -= part;
+ offset += part;
+ trace_netfs_bv_slot(dst, dst->nr_slots);
+ dst->nr_slots++;
+ nsegs++;
+ /* The dest chain was sized for exactly nsegs
+ * slots; step to its next node as each fills.
+ */
+ if (bvecq_is_full(dst))
+ dst = dst->next;
+ if (nsegs >= max_segs)
+ goto out;
+ if (amount == 0)
+ goto out;
+ }
+ offset = 0;
+ }
+
+ /* pos->bvecq isn't allowed to go NULL as the queue may get
+ * extended and we would lose our place.
+ */
+ if (!src->next)
+ break;
+ slot = 0;
+ src = src->next;
+ if (src->discontig && extracted > 0)
+ break;
+ }
+
+out:
+ /* Step past a fully consumed source node. */
+ if (slot == src->nr_slots && src->next) {
+ src = src->next;
+ slot = 0;
+ offset = 0;
+ }
+ bvecq_pos_move(pos, src);
+ pos->slot = slot;
+ pos->offset = offset;
+ return extracted;
+}
+
+/**
+ * bvecq_load_from_ra - Allocate a bvecq chain and load from readahead
+ * @pos: Blank position object to attach the new chain to.
+ * @ractl: The readahead control context.
+ *
+ * Decant the set of folios to be read from the readahead context into a bvecq
+ * chain. Each folio occupies one bio_vec element.
+ *
+ * Return: Amount of data loaded or -ENOMEM on allocation failure.
+ */
+ssize_t bvecq_load_from_ra(struct bvecq_pos *pos, struct readahead_control *ractl)
+{
+ XA_STATE(xas, &ractl->mapping->i_pages, ractl->_index);
+ struct folio *folio;
+ struct bvecq *bq;
+ size_t loaded = 0;
+
+ /* NOTE(review): the chain is sized by _nr_folios but the walk below is
+ * bounded by _nr_pages; if the chain runs out first, remaining folios
+ * are silently dropped even though _nr_pages is zeroed - confirm.
+ */
+ bq = bvecq_alloc_chain(ractl->_nr_folios, GFP_NOFS);
+ if (!bq)
+ return -ENOMEM;
+
+ pos->bvecq = bq;
+ pos->slot = 0;
+ pos->offset = 0;
+
+ rcu_read_lock();
+
+ xas_for_each(&xas, folio, ractl->_index + ractl->_nr_pages - 1) {
+ size_t len;
+
+ if (xas_retry(&xas, folio))
+ continue;
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+ len = folio_size(folio);
+ bvec_set_folio(&bq->bv[bq->nr_slots++], folio, len, 0);
+ loaded += len;
+ trace_netfs_folio(folio, netfs_folio_trace_read);
+
+ if (bq->nr_slots >= bq->max_slots) {
+ bq = bq->next;
+ if (!bq)
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+
+ /* Consume the readahead window. */
+ ractl->_index += ractl->_nr_pages;
+ ractl->_nr_pages = 0;
+ return loaded;
+}
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 2fcf31de5f2c..ad47bcc1947b 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -168,6 +168,7 @@ extern atomic_t netfs_n_wh_retry_write_subreq;
extern atomic_t netfs_n_wb_lock_skip;
extern atomic_t netfs_n_wb_lock_wait;
extern atomic_t netfs_n_folioq;
+extern atomic_t netfs_n_bvecq;
int netfs_stats_show(struct seq_file *m, void *v);
diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c
index ab6b916addc4..84c2a4bcc762 100644
--- a/fs/netfs/stats.c
+++ b/fs/netfs/stats.c
@@ -48,6 +48,7 @@ atomic_t netfs_n_wh_retry_write_subreq;
atomic_t netfs_n_wb_lock_skip;
atomic_t netfs_n_wb_lock_wait;
atomic_t netfs_n_folioq;
+atomic_t netfs_n_bvecq;
int netfs_stats_show(struct seq_file *m, void *v)
{
@@ -90,9 +91,10 @@ int netfs_stats_show(struct seq_file *m, void *v)
atomic_read(&netfs_n_rh_retry_read_subreq),
atomic_read(&netfs_n_wh_retry_write_req),
atomic_read(&netfs_n_wh_retry_write_subreq));
- seq_printf(m, "Objs : rr=%u sr=%u foq=%u wsc=%u\n",
+ seq_printf(m, "Objs : rr=%u sr=%u bq=%u foq=%u wsc=%u\n",
atomic_read(&netfs_n_rh_rreq),
atomic_read(&netfs_n_rh_sreq),
+ atomic_read(&netfs_n_bvecq),
atomic_read(&netfs_n_folioq),
atomic_read(&netfs_n_wh_wstream_conflict));
seq_printf(m, "WbLock : skip=%u wait=%u\n",
diff --git a/include/linux/bvecq.h b/include/linux/bvecq.h
index 462125af1cc7..6c58a7fb6472 100644
--- a/include/linux/bvecq.h
+++ b/include/linux/bvecq.h
@@ -17,7 +17,7 @@
* iterated over with an ITER_BVECQ iterator. The list is non-circular; next
* and prev are NULL at the ends.
*
- * The bv pointer points to the segment array; this may be __bv if allocated
+ * The bv pointer points to the bio_vec array; this may be __bv if allocated
* together. The caller is responsible for determining whether or not this is
* the case as the array pointed to by bv may be follow on directly from the
* bvecq by accident of allocation (ie. ->bv == ->__bv is *not* sufficient to
@@ -33,8 +33,8 @@ struct bvecq {
unsigned long long fpos; /* File position */
refcount_t ref;
u32 priv; /* Private data */
- u16 nr_segs; /* Number of elements in bv[] used */
- u16 max_segs; /* Number of elements allocated in bv[] */
+ u16 nr_slots; /* Number of elements in bv[] used */
+ u16 max_slots; /* Number of elements allocated in bv[] */
bool inline_bv:1; /* T if __bv[] is being used */
bool free:1; /* T if the pages need freeing */
bool unpin:1; /* T if the pages need unpinning, not freeing */
@@ -43,4 +43,163 @@ struct bvecq {
struct bio_vec __bv[]; /* Default array (if ->inline_bv) */
};
+/*
+ * Position in a bio_vec queue.  The position holds a ref on the bvecq node it
+ * points to.
+ */
+struct bvecq_pos {
+ struct bvecq *bvecq; /* The first bvecq */
+ unsigned int offset; /* The offset within the starting slot */
+ u16 slot; /* The starting slot */
+};
+
+void bvecq_dump(const struct bvecq *bq);
+struct bvecq *bvecq_alloc_one(size_t nr_slots, gfp_t gfp);
+struct bvecq *bvecq_alloc_chain(size_t nr_slots, gfp_t gfp);
+struct bvecq *bvecq_alloc_buffer(size_t size, unsigned int pre_slots, gfp_t gfp);
+void bvecq_put(struct bvecq *bq);
+int bvecq_expand_buffer(struct bvecq **_buffer, size_t *_cur_size, ssize_t size, gfp_t gfp);
+int bvecq_shorten_buffer(struct bvecq *bq, unsigned int slot, size_t size);
+int bvecq_buffer_init(struct bvecq_pos *pos, gfp_t gfp);
+int bvecq_buffer_make_space(struct bvecq_pos *pos, gfp_t gfp);
+void bvecq_pos_advance(struct bvecq_pos *pos, size_t amount);
+ssize_t bvecq_zero(struct bvecq_pos *pos, size_t amount);
+size_t bvecq_slice(struct bvecq_pos *pos, size_t max_size,
+ unsigned int max_segs, unsigned int *_nr_segs);
+ssize_t bvecq_extract(struct bvecq_pos *pos, size_t max_size,
+ unsigned int max_segs, struct bvecq **to);
+ssize_t bvecq_load_from_ra(struct bvecq_pos *pos, struct readahead_control *ractl);
+
+/**
+ * bvecq_get - Get a ref on a bvecq
+ * @bq: The bvecq to get a ref on (must not be NULL)
+ *
+ * Return: @bq, for convenience of chaining.
+ */
+static inline struct bvecq *bvecq_get(struct bvecq *bq)
+{
+ refcount_inc(&bq->ref);
+ return bq;
+}
+
+/**
+ * bvecq_is_full - Determine if a bvecq is full
+ * @bvecq: The object to query
+ *
+ * Return: true if all the allocated slots are in use; false if not.
+ */
+static inline bool bvecq_is_full(const struct bvecq *bvecq)
+{
+ return bvecq->nr_slots >= bvecq->max_slots;
+}
+
+/**
+ * bvecq_pos_set - Set one position to be the same as another
+ * @pos: The position object to set
+ * @at: The source position.
+ *
+ * Set @pos to have the same position as @at.  This takes a ref on the bvecq
+ * pointed to, which therefore must not be NULL.
+ */
+static inline void bvecq_pos_set(struct bvecq_pos *pos, const struct bvecq_pos *at)
+{
+ *pos = *at;
+ bvecq_get(pos->bvecq);
+}
+
+/**
+ * bvecq_pos_unset - Unset a position
+ * @pos: The position object to unset
+ *
+ * Unset @pos.  This does any needed ref cleanup (a NULL @pos->bvecq is
+ * tolerated).
+ */
+static inline void bvecq_pos_unset(struct bvecq_pos *pos)
+{
+ bvecq_put(pos->bvecq);
+ pos->bvecq = NULL;
+ pos->slot = 0;
+ pos->offset = 0;
+}
+
+/**
+ * bvecq_pos_transfer - Transfer one position to another, clearing the first
+ * @pos: The position object to set
+ * @from: The source position to clear.
+ *
+ * Set @pos to have the same position as @from and then clear @from.  No
+ * refcounts are manipulated: ownership of @from's ref on the bvecq simply
+ * moves to @pos.
+ */
+static inline void bvecq_pos_transfer(struct bvecq_pos *pos, struct bvecq_pos *from)
+{
+ *pos = *from;
+ from->bvecq = NULL;
+ from->slot = 0;
+ from->offset = 0;
+}
+
+/**
+ * bvecq_pos_move - Update a position to a new bvecq
+ * @pos: The position object to update.
+ * @to: The new bvecq to point at.
+ *
+ * Update @pos to point to @to if it doesn't already do so. This may
+ * manipulate refs on the bvecqs pointed to.
+ */
+static inline void bvecq_pos_move(struct bvecq_pos *pos, struct bvecq *to)
+{
+ struct bvecq *old = pos->bvecq;
+
+ if (old != to) {
+ /* Take the new ref before dropping the old one in case @old is
+ * the only thing keeping the chain (and hence @to) alive.
+ */
+ pos->bvecq = bvecq_get(to);
+ bvecq_put(old);
+ }
+}
+
+/**
+ * bvecq_pos_step - Step a position to the next slot if possible
+ * @pos: The position object to step.
+ *
+ * Update @pos to point to the next slot in the queue if not at the end. This
+ * may manipulate refs on the bvecqs pointed to.
+ *
+ * Return: true if successful, false if was at the end.
+ */
+static inline bool bvecq_pos_step(struct bvecq_pos *pos)
+{
+ struct bvecq *bq = pos->bvecq;
+
+ pos->slot++;
+ pos->offset = 0;
+ /* NOTE(review): slot == nr_slots counts as a successful step (the
+ * "parked beyond the end" convention), and when we do move to
+ * bq->next below, pos->slot is left at its incremented value rather
+ * than being reset to 0 (bvecq_pos_move() only switches the node
+ * pointer) - confirm callers expect both behaviours.
+ */
+ if (pos->slot <= bq->nr_slots)
+ return true;
+ if (!bq->next)
+ return false;
+ bvecq_pos_move(pos, bq->next);
+ return true;
+}
+
+/**
+ * bvecq_delete_spent - Delete the bvecq at the front if possible
+ * @pos: The position object to update.
+ *
+ * Delete the used up bvecq at the front of the queue that @pos points to if it
+ * is not the last node in the queue; if it is the last node in the queue, it
+ * is kept so that the queue doesn't become detached from the other end. This
+ * may manipulate refs on the bvecqs pointed to.
+ *
+ * Return: The new front node, or NULL if the last node was kept.
+ */
+static inline struct bvecq *bvecq_delete_spent(struct bvecq_pos *pos)
+{
+ struct bvecq *spent = pos->bvecq;
+ /* Read the contents of the queue node after the pointer to it. */
+ struct bvecq *next = smp_load_acquire(&spent->next);
+
+ if (!next)
+ return NULL;
+ next->prev = NULL;
+ spent->next = NULL;
+ bvecq_put(spent);
+ pos->bvecq = next; /* We take spent's ref */
+ pos->slot = 0;
+ pos->offset = 0;
+ return next;
+}
+
#endif /* _LINUX_BVECQ_H */
diff --git a/include/linux/iov_iter.h b/include/linux/iov_iter.h
index 999607ece481..309642b3901f 100644
--- a/include/linux/iov_iter.h
+++ b/include/linux/iov_iter.h
@@ -152,7 +152,7 @@ size_t iterate_bvecq(struct iov_iter *iter, size_t len, void *priv, void *priv2,
unsigned int slot = iter->bvecq_slot;
size_t progress = 0, skip = iter->iov_offset;
- if (slot == bq->nr_segs) {
+ if (slot == bq->nr_slots) {
/* The iterator may have been extended. */
bq = bq->next;
slot = 0;
@@ -176,7 +176,7 @@ size_t iterate_bvecq(struct iov_iter *iter, size_t len, void *priv, void *priv2,
if (skip >= bvec->bv_len) {
skip = 0;
slot++;
- if (slot >= bq->nr_segs) {
+ if (slot >= bq->nr_slots) {
if (!bq->next)
break;
bq = bq->next;
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index cc56b6512769..5bc48aacf7f6 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -17,6 +17,7 @@
#include <linux/workqueue.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
+#include <linux/bvecq.h>
#include <linux/uio.h>
#include <linux/rolling_buffer.h>
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index b8236f9e940e..fbb094231659 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -779,6 +779,30 @@ TRACE_EVENT(netfs_folioq,
__print_symbolic(__entry->trace, netfs_folioq_traces))
);
+TRACE_EVENT(netfs_bv_slot,
+ TP_PROTO(const struct bvecq *bq, int slot),
+
+ TP_ARGS(bq, slot),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, pfn)
+ __field(unsigned int, offset)
+ __field(unsigned int, len)
+ __field(unsigned int, slot)
+ ),
+
+ TP_fast_assign(
+ __entry->slot = slot;
+ __entry->pfn = page_to_pfn(bq->bv[slot].bv_page);
+ __entry->offset = bq->bv[slot].bv_offset;
+ __entry->len = bq->bv[slot].bv_len;
+ ),
+
+ TP_printk("bq[%x] p=%lx %x-%x",
+ __entry->slot,
+ __entry->pfn, __entry->offset, __entry->offset + __entry->len)
+ );
+
#undef EM
#undef E_
#endif /* _TRACE_NETFS_H */
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index df8d037894b1..4f091e6d4a22 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -580,7 +580,7 @@ static void iov_iter_bvecq_advance(struct iov_iter *i, size_t by)
return;
i->count -= by;
- if (slot >= bq->nr_segs) {
+ if (slot >= bq->nr_slots) {
bq = bq->next;
slot = 0;
}
@@ -593,7 +593,7 @@ static void iov_iter_bvecq_advance(struct iov_iter *i, size_t by)
break;
by -= len;
slot++;
- if (slot >= bq->nr_segs && bq->next) {
+ if (slot >= bq->nr_slots && bq->next) {
bq = bq->next;
slot = 0;
}
@@ -662,7 +662,7 @@ static void iov_iter_bvecq_revert(struct iov_iter *i, size_t unroll)
if (slot == 0) {
bq = bq->prev;
- slot = bq->nr_segs;
+ slot = bq->nr_slots;
}
slot--;
@@ -947,7 +947,7 @@ static unsigned long iov_iter_alignment_bvecq(const struct iov_iter *iter)
return res;
for (bq = iter->bvecq; bq; bq = bq->next) {
- for (; slot < bq->nr_segs; slot++) {
+ for (; slot < bq->nr_slots; slot++) {
const struct bio_vec *bvec = &bq->bv[slot];
size_t part = umin(bvec->bv_len - skip, size);
@@ -1331,7 +1331,7 @@ static size_t iov_npages_bvecq(const struct iov_iter *iter, size_t maxpages)
size_t size = iter->count;
for (bq = iter->bvecq; bq; bq = bq->next) {
- for (; slot < bq->nr_segs; slot++) {
+ for (; slot < bq->nr_slots; slot++) {
const struct bio_vec *bvec = &bq->bv[slot];
size_t offs = (bvec->bv_offset + skip) % PAGE_SIZE;
size_t part = umin(bvec->bv_len - skip, size);
@@ -1731,7 +1731,7 @@ static ssize_t iov_iter_extract_bvecq_pages(struct iov_iter *iter,
unsigned int seg = iter->bvecq_slot, count = 0, nr = 0;
size_t extracted = 0, offset = iter->iov_offset;
- if (seg >= bvecq->nr_segs) {
+ if (seg >= bvecq->nr_slots) {
bvecq = bvecq->next;
if (WARN_ON_ONCE(!bvecq))
return 0;
@@ -1763,7 +1763,7 @@ static ssize_t iov_iter_extract_bvecq_pages(struct iov_iter *iter,
if (offset >= blen) {
offset = 0;
seg++;
- if (seg >= bvecq->nr_segs) {
+ if (seg >= bvecq->nr_slots) {
if (!bvecq->next) {
WARN_ON_ONCE(extracted < iter->count);
break;
@@ -1816,7 +1816,7 @@ static ssize_t iov_iter_extract_bvecq_pages(struct iov_iter *iter,
if (offset >= blen) {
offset = 0;
seg++;
- if (seg >= bvecq->nr_segs) {
+ if (seg >= bvecq->nr_slots) {
if (!bvecq->next) {
WARN_ON_ONCE(extracted < iter->count);
break;
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index 03e3883a1a2d..93a3d194a914 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -1345,7 +1345,7 @@ static ssize_t extract_bvecq_to_sg(struct iov_iter *iter,
ssize_t ret = 0;
size_t offset = iter->iov_offset;
- if (seg >= bvecq->nr_segs) {
+ if (seg >= bvecq->nr_slots) {
bvecq = bvecq->next;
if (WARN_ON_ONCE(!bvecq))
return 0;
@@ -1373,7 +1373,7 @@ static ssize_t extract_bvecq_to_sg(struct iov_iter *iter,
if (offset >= blen) {
offset = 0;
seg++;
- if (seg >= bvecq->nr_segs) {
+ if (seg >= bvecq->nr_slots) {
if (!bvecq->next) {
WARN_ON_ONCE(ret < iter->count);
break;
diff --git a/lib/tests/kunit_iov_iter.c b/lib/tests/kunit_iov_iter.c
index 5bc941f64343..ff0621636ff1 100644
--- a/lib/tests/kunit_iov_iter.c
+++ b/lib/tests/kunit_iov_iter.c
@@ -543,28 +543,28 @@ static void iov_kunit_destroy_bvecq(void *data)
for (bq = data; bq; bq = next) {
next = bq->next;
- for (int i = 0; i < bq->nr_segs; i++)
+ for (int i = 0; i < bq->nr_slots; i++)
if (bq->bv[i].bv_page)
put_page(bq->bv[i].bv_page);
kfree(bq);
}
}
-static struct bvecq *iov_kunit_alloc_bvecq(struct kunit *test, unsigned int max_segs)
+static struct bvecq *iov_kunit_alloc_bvecq(struct kunit *test, unsigned int max_slots)
{
struct bvecq *bq;
- bq = kzalloc(struct_size(bq, __bv, max_segs), GFP_KERNEL);
+ bq = kzalloc(struct_size(bq, __bv, max_slots), GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bq);
- bq->max_segs = max_segs;
+ bq->max_slots = max_slots;
return bq;
}
-static struct bvecq *iov_kunit_create_bvecq(struct kunit *test, unsigned int max_segs)
+static struct bvecq *iov_kunit_create_bvecq(struct kunit *test, unsigned int max_slots)
{
struct bvecq *bq;
- bq = iov_kunit_alloc_bvecq(test, max_segs);
+ bq = iov_kunit_alloc_bvecq(test, max_slots);
kunit_add_action_or_reset(test, iov_kunit_destroy_bvecq, bq);
return bq;
}
@@ -578,13 +578,13 @@ static void __init iov_kunit_load_bvecq(struct kunit *test,
size_t size = 0;
for (int i = 0; i < npages; i++) {
- if (bq->nr_segs >= bq->max_segs) {
+ if (bq->nr_slots >= bq->max_slots) {
bq->next = iov_kunit_alloc_bvecq(test, 8);
bq->next->prev = bq;
bq = bq->next;
}
- bvec_set_page(&bq->bv[bq->nr_segs], pages[i], PAGE_SIZE, 0);
- bq->nr_segs++;
+ bvec_set_page(&bq->bv[bq->nr_slots], pages[i], PAGE_SIZE, 0);
+ bq->nr_slots++;
size += PAGE_SIZE;
}
iov_iter_bvec_queue(iter, dir, bq_head, 0, 0, size);
More information about the linux-afs
mailing list