Commit 7d4e87e9 authored by Jakub Kicinski

Merge branch 'splice-net-some-miscellaneous-msg_splice_pages-changes'

David Howells says:

====================
splice, net: Some miscellaneous MSG_SPLICE_PAGES changes

Now that splice_to_socket() has been rewritten and nothing uses the
->sendpage() file op any more [1], some further changes can be made. Here
are the miscellaneous cleanups that this enables:

 (1) Remove the ->sendpage() file op.

 (2) Remove hash_sendpage*() from AF_ALG.

 (3) Make sunrpc send multiple pages in a single sendmsg() call rather
     than calling sendpage() for each page in TCP (or maybe TLS).

 (4) Make tcp_bpf_sendpage() a wrapper around tcp_bpf_sendmsg().

 (5) Make AF_KCM use sendmsg() when calling down to TCP, and then make it
     send entire fragment lists in single sendmsg() calls.

Link: https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=fd5f4d7da29218485153fd8b4c08da7fc130c79f [1]
====================

Link: https://lore.kernel.org/r/20230609100221.2620633-1-dhowells@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents ccbe64be c31a25e1
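
All of the conversions below follow the same idiom: describe the page(s) with
one or more bio_vecs, point a msghdr's iterator at them, and let the
protocol's sendmsg() splice the pages in. A minimal sketch of that idiom
follows; the helper name do_splice_one_page() is hypothetical and not part of
this series, but the calls it makes are the real kernel APIs the patches
move callers onto.

#include <linux/bvec.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/uio.h>

/* Hedged sketch: send one page with sendmsg(MSG_SPLICE_PAGES) instead of
 * the old ->sendpage() op.
 */
static int do_splice_one_page(struct socket *sock, struct page *page,
			      int offset, size_t size, int flags)
{
	struct bio_vec bvec;
	struct msghdr msg = {
		.msg_flags = flags | MSG_SPLICE_PAGES,
	};

	/* Describe the page fragment and point the iterator at it. */
	bvec_set_page(&bvec, page, size, offset);
	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);

	/* The protocol's sendmsg() splices the page in rather than copying. */
	return sock_sendmsg(sock, &msg);
}

The tcp_bpf and sunrpc hunks below are essentially this pattern applied in
place, with the sendpage flags (e.g. MSG_SENDPAGE_NOTLAST) translated into
their sendmsg equivalents (MSG_MORE).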
@@ -161,58 +161,6 @@ static int hash_sendmsg(struct socket *sock, struct msghdr *msg,
 	goto unlock;
 }
 
-static ssize_t hash_sendpage(struct socket *sock, struct page *page,
-			     int offset, size_t size, int flags)
-{
-	struct sock *sk = sock->sk;
-	struct alg_sock *ask = alg_sk(sk);
-	struct hash_ctx *ctx = ask->private;
-	int err;
-
-	if (flags & MSG_SENDPAGE_NOTLAST)
-		flags |= MSG_MORE;
-
-	lock_sock(sk);
-	sg_init_table(ctx->sgl.sgl, 1);
-	sg_set_page(ctx->sgl.sgl, page, size, offset);
-
-	if (!(flags & MSG_MORE)) {
-		err = hash_alloc_result(sk, ctx);
-		if (err)
-			goto unlock;
-	} else if (!ctx->more)
-		hash_free_result(sk, ctx);
-
-	ahash_request_set_crypt(&ctx->req, ctx->sgl.sgl, ctx->result, size);
-
-	if (!(flags & MSG_MORE)) {
-		if (ctx->more)
-			err = crypto_ahash_finup(&ctx->req);
-		else
-			err = crypto_ahash_digest(&ctx->req);
-	} else {
-		if (!ctx->more) {
-			err = crypto_ahash_init(&ctx->req);
-			err = crypto_wait_req(err, &ctx->wait);
-			if (err)
-				goto unlock;
-		}
-
-		err = crypto_ahash_update(&ctx->req);
-	}
-
-	err = crypto_wait_req(err, &ctx->wait);
-	if (err)
-		goto unlock;
-
-	ctx->more = flags & MSG_MORE;
-
-unlock:
-	release_sock(sk);
-
-	return err ?: size;
-}
-
 static int hash_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 			int flags)
 {
@@ -328,7 +276,6 @@ static struct proto_ops algif_hash_ops = {
 	.release	= af_alg_release,
 	.sendmsg	= hash_sendmsg,
-	.sendpage	= hash_sendpage,
 	.recvmsg	= hash_recvmsg,
 	.accept		= hash_accept,
 };
@@ -380,18 +327,6 @@ static int hash_sendmsg_nokey(struct socket *sock, struct msghdr *msg,
 	return hash_sendmsg(sock, msg, size);
 }
 
-static ssize_t hash_sendpage_nokey(struct socket *sock, struct page *page,
-				   int offset, size_t size, int flags)
-{
-	int err;
-
-	err = hash_check_key(sock);
-	if (err)
-		return err;
-
-	return hash_sendpage(sock, page, offset, size, flags);
-}
-
 static int hash_recvmsg_nokey(struct socket *sock, struct msghdr *msg,
 			      size_t ignored, int flags)
 {
@@ -430,7 +365,6 @@ static struct proto_ops algif_hash_ops_nokey = {
 	.release	= af_alg_release,
 	.sendmsg	= hash_sendmsg_nokey,
-	.sendpage	= hash_sendpage_nokey,
 	.recvmsg	= hash_recvmsg_nokey,
 	.accept		= hash_accept_nokey,
 };
...
@@ -1790,7 +1790,6 @@ struct file_operations {
 	int (*fsync) (struct file *, loff_t, loff_t, int datasync);
 	int (*fasync) (int, struct file *, int);
 	int (*lock) (struct file *, int, struct file_lock *);
-	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
 	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
 	int (*check_flags)(int);
 	int (*flock) (struct file *, int, struct file_lock *);
...
@@ -161,16 +161,15 @@ static inline bool svc_put_not_last(struct svc_serv *serv)
 extern u32 svc_max_payload(const struct svc_rqst *rqstp);
 
 /*
- * RPC Requsts and replies are stored in one or more pages.
+ * RPC Requests and replies are stored in one or more pages.
  * We maintain an array of pages for each server thread.
  * Requests are copied into these pages as they arrive.  Remaining
  * pages are available to write the reply into.
  *
- * Pages are sent using ->sendpage so each server thread needs to
- * allocate more to replace those used in sending.  To help keep track
- * of these pages we have a receive list where all pages initialy live,
- * and a send list where pages are moved to when there are to be part
- * of a reply.
+ * Pages are sent using ->sendmsg with MSG_SPLICE_PAGES so each server thread
+ * needs to allocate more to replace those used in sending.  To help keep track
+ * of these pages we have a receive list where all pages initially live, and a
+ * send list where pages are moved to when they are to be part of a reply.
  *
  * We use xdr_buf for holding responses as it fits well with NFS
  * read responses (that have a header, and some data pages, and possibly
...
@@ -47,9 +47,9 @@ struct kcm_stats {
 
 struct kcm_tx_msg {
 	unsigned int sent;
-	unsigned int fragidx;
 	unsigned int frag_offset;
 	unsigned int msg_flags;
+	bool started_tx;
 	struct sk_buff *frag_skb;
 	struct sk_buff *last_skb;
 };
...
@@ -568,49 +568,18 @@ static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 static int tcp_bpf_sendpage(struct sock *sk, struct page *page, int offset,
 			    size_t size, int flags)
 {
-	struct sk_msg tmp, *msg = NULL;
-	int err = 0, copied = 0;
-	struct sk_psock *psock;
-	bool enospc = false;
-
-	psock = sk_psock_get(sk);
-	if (unlikely(!psock))
-		return tcp_sendpage(sk, page, offset, size, flags);
-
-	lock_sock(sk);
-	if (psock->cork) {
-		msg = psock->cork;
-	} else {
-		msg = &tmp;
-		sk_msg_init(msg);
-	}
-
-	/* Catch case where ring is full and sendpage is stalled. */
-	if (unlikely(sk_msg_full(msg)))
-		goto out_err;
-
-	sk_msg_page_add(msg, page, size, offset);
-	sk_mem_charge(sk, size);
-	copied = size;
-	if (sk_msg_full(msg))
-		enospc = true;
-
-	if (psock->cork_bytes) {
-		if (size > psock->cork_bytes)
-			psock->cork_bytes = 0;
-		else
-			psock->cork_bytes -= size;
-
-		if (psock->cork_bytes && !enospc)
-			goto out_err;
-
-		/* All cork bytes are accounted, rerun the prog. */
-		psock->eval = __SK_NONE;
-		psock->cork_bytes = 0;
-	}
-
-	err = tcp_bpf_send_verdict(sk, psock, msg, &copied, flags);
-out_err:
-	release_sock(sk);
-	sk_psock_put(sk, psock);
-	return copied ? copied : err;
+	struct bio_vec bvec;
+	struct msghdr msg = {
+		.msg_flags = flags | MSG_SPLICE_PAGES,
+	};
+
+	bvec_set_page(&bvec, page, size, offset);
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);
+
+	if (flags & MSG_SENDPAGE_NOTLAST)
+		msg.msg_flags |= MSG_MORE;
+
+	return tcp_bpf_sendmsg(sk, &msg, size);
 }
 
 enum {
...
@@ -581,12 +581,10 @@ static void kcm_report_tx_retry(struct kcm_sock *kcm)
  */
 static int kcm_write_msgs(struct kcm_sock *kcm)
 {
+	unsigned int total_sent = 0;
 	struct sock *sk = &kcm->sk;
 	struct kcm_psock *psock;
-	struct sk_buff *skb, *head;
-	struct kcm_tx_msg *txm;
-	unsigned short fragidx, frag_offset;
-	unsigned int sent, total_sent = 0;
+	struct sk_buff *head;
 	int ret = 0;
 
 	kcm->tx_wait_more = false;
@@ -600,72 +598,57 @@ static int kcm_write_msgs(struct kcm_sock *kcm)
 		if (skb_queue_empty(&sk->sk_write_queue))
 			return 0;
 
-		kcm_tx_msg(skb_peek(&sk->sk_write_queue))->sent = 0;
-
-	} else if (skb_queue_empty(&sk->sk_write_queue)) {
-		return 0;
+		kcm_tx_msg(skb_peek(&sk->sk_write_queue))->started_tx = false;
 	}
 
-	head = skb_peek(&sk->sk_write_queue);
-	txm = kcm_tx_msg(head);
-
-	if (txm->sent) {
-		/* Send of first skbuff in queue already in progress */
-		if (WARN_ON(!psock)) {
-			ret = -EINVAL;
-			goto out;
-		}
-		sent = txm->sent;
-		frag_offset = txm->frag_offset;
-		fragidx = txm->fragidx;
-		skb = txm->frag_skb;
-
-		goto do_frag;
-	}
-
-try_again:
-	psock = reserve_psock(kcm);
-	if (!psock)
-		goto out;
-
-	do {
-		skb = head;
-		txm = kcm_tx_msg(head);
-		sent = 0;
-
-do_frag_list:
+retry:
+	while ((head = skb_peek(&sk->sk_write_queue))) {
+		struct msghdr msg = {
+			.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
+		};
+		struct kcm_tx_msg *txm = kcm_tx_msg(head);
+		struct sk_buff *skb;
+		unsigned int msize;
+		int i;
+
+		if (!txm->started_tx) {
+			psock = reserve_psock(kcm);
+			if (!psock)
+				goto out;
+			skb = head;
+			txm->frag_offset = 0;
+			txm->sent = 0;
+			txm->started_tx = true;
+		} else {
+			if (WARN_ON(!psock)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			skb = txm->frag_skb;
+		}
+
 		if (WARN_ON(!skb_shinfo(skb)->nr_frags)) {
 			ret = -EINVAL;
 			goto out;
 		}
 
-		for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags;
-		     fragidx++) {
-			skb_frag_t *frag;
-
-			frag_offset = 0;
-do_frag:
-			frag = &skb_shinfo(skb)->frags[fragidx];
-			if (WARN_ON(!skb_frag_size(frag))) {
-				ret = -EINVAL;
-				goto out;
-			}
-
-			ret = kernel_sendpage(psock->sk->sk_socket,
-					      skb_frag_page(frag),
-					      skb_frag_off(frag) + frag_offset,
-					      skb_frag_size(frag) - frag_offset,
-					      MSG_DONTWAIT);
+		msize = 0;
+		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+			msize += skb_shinfo(skb)->frags[i].bv_len;
+
+		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE,
+			      skb_shinfo(skb)->frags, skb_shinfo(skb)->nr_frags,
+			      msize);
+		iov_iter_advance(&msg.msg_iter, txm->frag_offset);
+
+		do {
+			ret = sock_sendmsg(psock->sk->sk_socket, &msg);
 			if (ret <= 0) {
 				if (ret == -EAGAIN) {
 					/* Save state to try again when there's
 					 * write space on the socket
 					 */
-					txm->sent = sent;
-					txm->frag_offset = frag_offset;
-					txm->fragidx = fragidx;
 					txm->frag_skb = skb;
-
 					ret = 0;
 					goto out;
 				}
@@ -679,39 +662,36 @@ static int kcm_write_msgs(struct kcm_sock *kcm)
 						   true);
 				unreserve_psock(kcm);
 
-				txm->sent = 0;
+				txm->started_tx = false;
 				kcm_report_tx_retry(kcm);
 				ret = 0;
-
-				goto try_again;
+				goto retry;
 			}
 
-			sent += ret;
-			frag_offset += ret;
+			txm->sent += ret;
+			txm->frag_offset += ret;
 			KCM_STATS_ADD(psock->stats.tx_bytes, ret);
-			if (frag_offset < skb_frag_size(frag)) {
-				/* Not finished with this frag */
-				goto do_frag;
-			}
-		}
+		} while (msg.msg_iter.count > 0);
 
 		if (skb == head) {
 			if (skb_has_frag_list(skb)) {
-				skb = skb_shinfo(skb)->frag_list;
-				goto do_frag_list;
+				txm->frag_skb = skb_shinfo(skb)->frag_list;
+				txm->frag_offset = 0;
+				continue;
 			}
 		} else if (skb->next) {
-			skb = skb->next;
-			goto do_frag_list;
+			txm->frag_skb = skb->next;
+			txm->frag_offset = 0;
+			continue;
 		}
 
 		/* Successfully sent the whole packet, account for it. */
+		sk->sk_wmem_queued -= txm->sent;
+		total_sent += txm->sent;
 		skb_dequeue(&sk->sk_write_queue);
 		kfree_skb(head);
-		sk->sk_wmem_queued -= sent;
-		total_sent += sent;
 		KCM_STATS_INCR(psock->stats.tx_msgs);
-	} while ((head = skb_peek(&sk->sk_write_queue)));
+	}
 
 out:
 	if (!head) {
 		/* Done with all queued messages. */
...
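
For reference, the batching idiom the kcm rewrite above converges on — one
sendmsg(MSG_SPLICE_PAGES) call covering every page fragment of an skb, with
iov_iter_advance() skipping data an earlier partial send already covered —
can be sketched in isolation from kcm's psock state machine. This is a hedged
sketch; send_skb_frags() is a hypothetical name, not a function added by this
series.

#include <linux/skbuff.h>
#include <linux/socket.h>
#include <linux/uio.h>

/* Hedged sketch: splice all of an skb's page fragments to a socket in a
 * single sendmsg() call, resuming at frag_offset after a partial send.
 */
static int send_skb_frags(struct socket *sock, struct sk_buff *skb,
			  unsigned int frag_offset)
{
	struct msghdr msg = {
		.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
	};
	unsigned int msize = 0;
	int i;

	/* Sum the payload held in the skb's page fragments. */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
		msize += skb_frag_size(&skb_shinfo(skb)->frags[i]);

	/* skb frags are bio_vecs, so they can back the iterator directly. */
	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, skb_shinfo(skb)->frags,
		      skb_shinfo(skb)->nr_frags, msize);

	/* Resume mid-message after an earlier short or -EAGAIN send. */
	iov_iter_advance(&msg.msg_iter, frag_offset);

	return sock_sendmsg(sock, &msg);
}

A caller would loop on this, advancing its saved offset by the return value
until the iterator is drained — which is what the new kcm_write_msgs() do/while
loop does with txm->frag_offset.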
@@ -1203,13 +1203,14 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec,
 			     int flags)
 {
-	return kernel_sendpage(sock, virt_to_page(vec->iov_base),
-			       offset_in_page(vec->iov_base),
-			       vec->iov_len, flags);
+	struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES | flags, };
+
+	iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, vec, 1, vec->iov_len);
+	return sock_sendmsg(sock, &msg);
 }
 
 /*
- * kernel_sendpage() is used exclusively to reduce the number of
+ * MSG_SPLICE_PAGES is used exclusively to reduce the number of
  * copy operations in this path. Therefore the caller must ensure
  * that the pages backing @xdr are unchanging.
  *
@@ -1249,28 +1250,13 @@ static int svc_tcp_sendmsg(struct socket *sock, struct xdr_buf *xdr,
 	if (ret != head->iov_len)
 		goto out;
 
-	if (xdr->page_len) {
-		unsigned int offset, len, remaining;
-		struct bio_vec *bvec;
-
-		bvec = xdr->bvec + (xdr->page_base >> PAGE_SHIFT);
-		offset = offset_in_page(xdr->page_base);
-		remaining = xdr->page_len;
-		while (remaining > 0) {
-			len = min(remaining, bvec->bv_len - offset);
-			ret = kernel_sendpage(sock, bvec->bv_page,
-					      bvec->bv_offset + offset,
-					      len, 0);
-			if (ret < 0)
-				return ret;
-			*sentp += ret;
-			if (ret != len)
-				goto out;
-			remaining -= len;
-			offset = 0;
-			bvec++;
-		}
-	}
+	msg.msg_flags = MSG_SPLICE_PAGES;
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, xdr->bvec,
+		      xdr_buf_pagecount(xdr), xdr->page_len);
+	ret = sock_sendmsg(sock, &msg);
+	if (ret < 0)
+		return ret;
+	*sentp += ret;
 
 	if (tail->iov_len) {
 		ret = svc_tcp_send_kvec(sock, tail, 0);
...