Commit 7d4e87e9 authored by Jakub Kicinski

Merge branch 'splice-net-some-miscellaneous-msg_splice_pages-changes'

David Howells says:

====================
splice, net: Some miscellaneous MSG_SPLICE_PAGES changes

Now that splice_to_socket() has been rewritten and nothing uses the
->sendpage() file op any more [1], some further changes can be made. Here
are the miscellaneous cleanups that this enables:

 (1) Remove the ->sendpage() file op.

 (2) Remove hash_sendpage*() from AF_ALG.

 (3) Make sunrpc send multiple pages in a single sendmsg() call rather
     than calling sendpage() for each page in TCP (or maybe TLS).

 (4) Make tcp_bpf_sendpage() a wrapper around tcp_bpf_sendmsg().

 (5) Make AF_KCM use sendmsg() when calling down to TCP, and then make it
     send entire fragment lists in single sendmsg() calls.

Link: https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=fd5f4d7da29218485153fd8b4c08da7fc130c79f [1]
====================

Link: https://lore.kernel.org/r/20230609100221.2620633-1-dhowells@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents ccbe64be c31a25e1
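
All of the conversions below follow the same idiom: describe the page(s) with
one or more bio_vecs, point a msghdr's iterator at them, and let the
protocol's sendmsg() splice the pages in. A minimal sketch of that idiom
follows; the helper name do_splice_one_page() is hypothetical and not part of
this series, but the calls it makes are the real kernel APIs the patches
move callers onto.

#include <linux/bvec.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/uio.h>

/* Hedged sketch: send one page with sendmsg(MSG_SPLICE_PAGES) instead of
 * the old ->sendpage() op.
 */
static int do_splice_one_page(struct socket *sock, struct page *page,
			      int offset, size_t size, int flags)
{
	struct bio_vec bvec;
	struct msghdr msg = {
		.msg_flags = flags | MSG_SPLICE_PAGES,
	};

	/* Describe the page fragment and point the iterator at it. */
	bvec_set_page(&bvec, page, size, offset);
	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);

	/* The protocol's sendmsg() splices the page in rather than copying. */
	return sock_sendmsg(sock, &msg);
}

The tcp_bpf and sunrpc hunks below are essentially this pattern applied in
place, with the sendpage flags (e.g. MSG_SENDPAGE_NOTLAST) translated into
their sendmsg equivalents (MSG_MORE).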
@@ -161,58 +161,6 @@ static int hash_sendmsg(struct socket *sock, struct msghdr *msg,
 	goto unlock;
 }
 
-static ssize_t hash_sendpage(struct socket *sock, struct page *page,
-			     int offset, size_t size, int flags)
-{
-	struct sock *sk = sock->sk;
-	struct alg_sock *ask = alg_sk(sk);
-	struct hash_ctx *ctx = ask->private;
-	int err;
-
-	if (flags & MSG_SENDPAGE_NOTLAST)
-		flags |= MSG_MORE;
-
-	lock_sock(sk);
-	sg_init_table(ctx->sgl.sgl, 1);
-	sg_set_page(ctx->sgl.sgl, page, size, offset);
-
-	if (!(flags & MSG_MORE)) {
-		err = hash_alloc_result(sk, ctx);
-		if (err)
-			goto unlock;
-	} else if (!ctx->more)
-		hash_free_result(sk, ctx);
-
-	ahash_request_set_crypt(&ctx->req, ctx->sgl.sgl, ctx->result, size);
-
-	if (!(flags & MSG_MORE)) {
-		if (ctx->more)
-			err = crypto_ahash_finup(&ctx->req);
-		else
-			err = crypto_ahash_digest(&ctx->req);
-	} else {
-		if (!ctx->more) {
-			err = crypto_ahash_init(&ctx->req);
-			err = crypto_wait_req(err, &ctx->wait);
-			if (err)
-				goto unlock;
-		}
-
-		err = crypto_ahash_update(&ctx->req);
-	}
-
-	err = crypto_wait_req(err, &ctx->wait);
-	if (err)
-		goto unlock;
-
-	ctx->more = flags & MSG_MORE;
-
-unlock:
-	release_sock(sk);
-
-	return err ?: size;
-}
-
 static int hash_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 			int flags)
 {
@@ -328,7 +276,6 @@ static struct proto_ops algif_hash_ops = {
 	.release	= af_alg_release,
 	.sendmsg	= hash_sendmsg,
-	.sendpage	= hash_sendpage,
 	.recvmsg	= hash_recvmsg,
 	.accept		= hash_accept,
 };
@@ -380,18 +327,6 @@ static int hash_sendmsg_nokey(struct socket *sock, struct msghdr *msg,
 	return hash_sendmsg(sock, msg, size);
 }
 
-static ssize_t hash_sendpage_nokey(struct socket *sock, struct page *page,
-				   int offset, size_t size, int flags)
-{
-	int err;
-
-	err = hash_check_key(sock);
-	if (err)
-		return err;
-
-	return hash_sendpage(sock, page, offset, size, flags);
-}
-
 static int hash_recvmsg_nokey(struct socket *sock, struct msghdr *msg,
 			      size_t ignored, int flags)
 {
@@ -430,7 +365,6 @@ static struct proto_ops algif_hash_ops_nokey = {
 	.release	= af_alg_release,
 	.sendmsg	= hash_sendmsg_nokey,
-	.sendpage	= hash_sendpage_nokey,
 	.recvmsg	= hash_recvmsg_nokey,
 	.accept		= hash_accept_nokey,
 };
...
@@ -1790,7 +1790,6 @@ struct file_operations {
 	int (*fsync) (struct file *, loff_t, loff_t, int datasync);
 	int (*fasync) (int, struct file *, int);
 	int (*lock) (struct file *, int, struct file_lock *);
-	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
 	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
 	int (*check_flags)(int);
 	int (*flock) (struct file *, int, struct file_lock *);
...
@@ -161,16 +161,15 @@ static inline bool svc_put_not_last(struct svc_serv *serv)
 extern u32 svc_max_payload(const struct svc_rqst *rqstp);
 
 /*
- * RPC Requsts and replies are stored in one or more pages.
+ * RPC Requests and replies are stored in one or more pages.
  * We maintain an array of pages for each server thread.
  * Requests are copied into these pages as they arrive.  Remaining
  * pages are available to write the reply into.
  *
- * Pages are sent using ->sendpage so each server thread needs to
- * allocate more to replace those used in sending.  To help keep track
- * of these pages we have a receive list where all pages initialy live,
- * and a send list where pages are moved to when there are to be part
- * of a reply.
+ * Pages are sent using ->sendmsg with MSG_SPLICE_PAGES so each server thread
+ * needs to allocate more to replace those used in sending.  To help keep track
+ * of these pages we have a receive list where all pages initially live, and a
+ * send list where pages are moved to when they are to be part of a reply.
  *
  * We use xdr_buf for holding responses as it fits well with NFS
  * read responses (that have a header, and some data pages, and possibly
...
@@ -47,9 +47,9 @@ struct kcm_stats {
 
 struct kcm_tx_msg {
 	unsigned int sent;
-	unsigned int fragidx;
 	unsigned int frag_offset;
 	unsigned int msg_flags;
+	bool started_tx;
 	struct sk_buff *frag_skb;
 	struct sk_buff *last_skb;
 };
...
@@ -568,49 +568,18 @@ static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 static int tcp_bpf_sendpage(struct sock *sk, struct page *page, int offset,
 			    size_t size, int flags)
 {
-	struct sk_msg tmp, *msg = NULL;
-	int err = 0, copied = 0;
-	struct sk_psock *psock;
-	bool enospc = false;
-
-	psock = sk_psock_get(sk);
-	if (unlikely(!psock))
-		return tcp_sendpage(sk, page, offset, size, flags);
-
-	lock_sock(sk);
-	if (psock->cork) {
-		msg = psock->cork;
-	} else {
-		msg = &tmp;
-		sk_msg_init(msg);
-	}
-
-	/* Catch case where ring is full and sendpage is stalled. */
-	if (unlikely(sk_msg_full(msg)))
-		goto out_err;
-
-	sk_msg_page_add(msg, page, size, offset);
-	sk_mem_charge(sk, size);
-	copied = size;
-	if (sk_msg_full(msg))
-		enospc = true;
-
-	if (psock->cork_bytes) {
-		if (size > psock->cork_bytes)
-			psock->cork_bytes = 0;
-		else
-			psock->cork_bytes -= size;
-
-		if (psock->cork_bytes && !enospc)
-			goto out_err;
-
-		/* All cork bytes are accounted, rerun the prog. */
-		psock->eval = __SK_NONE;
-		psock->cork_bytes = 0;
-	}
-
-	err = tcp_bpf_send_verdict(sk, psock, msg, &copied, flags);
-out_err:
-	release_sock(sk);
-	sk_psock_put(sk, psock);
-	return copied ? copied : err;
+	struct bio_vec bvec;
+	struct msghdr msg = {
+		.msg_flags = flags | MSG_SPLICE_PAGES,
+	};
+
+	bvec_set_page(&bvec, page, size, offset);
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);
+
+	if (flags & MSG_SENDPAGE_NOTLAST)
+		msg.msg_flags |= MSG_MORE;
+
+	return tcp_bpf_sendmsg(sk, &msg, size);
 }
 
 enum {
...
@@ -581,12 +581,10 @@ static void kcm_report_tx_retry(struct kcm_sock *kcm)
  */
 static int kcm_write_msgs(struct kcm_sock *kcm)
 {
+	unsigned int total_sent = 0;
 	struct sock *sk = &kcm->sk;
 	struct kcm_psock *psock;
-	struct sk_buff *skb, *head;
-	struct kcm_tx_msg *txm;
-	unsigned short fragidx, frag_offset;
-	unsigned int sent, total_sent = 0;
+	struct sk_buff *head;
 	int ret = 0;
 
 	kcm->tx_wait_more = false;
@@ -600,72 +598,57 @@ static int kcm_write_msgs(struct kcm_sock *kcm)
 		if (skb_queue_empty(&sk->sk_write_queue))
 			return 0;
 
-		kcm_tx_msg(skb_peek(&sk->sk_write_queue))->sent = 0;
-
-	} else if (skb_queue_empty(&sk->sk_write_queue)) {
-		return 0;
+		kcm_tx_msg(skb_peek(&sk->sk_write_queue))->started_tx = false;
 	}
 
-	head = skb_peek(&sk->sk_write_queue);
-	txm = kcm_tx_msg(head);
-
-	if (txm->sent) {
-		/* Send of first skbuff in queue already in progress */
-		if (WARN_ON(!psock)) {
-			ret = -EINVAL;
-			goto out;
-		}
-		sent = txm->sent;
-		frag_offset = txm->frag_offset;
-		fragidx = txm->fragidx;
-		skb = txm->frag_skb;
-
-		goto do_frag;
-	}
-
-try_again:
-	psock = reserve_psock(kcm);
-	if (!psock)
-		goto out;
-
-	do {
-		skb = head;
-		txm = kcm_tx_msg(head);
-		sent = 0;
-
-do_frag_list:
+retry:
+	while ((head = skb_peek(&sk->sk_write_queue))) {
+		struct msghdr msg = {
+			.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
+		};
+		struct kcm_tx_msg *txm = kcm_tx_msg(head);
+		struct sk_buff *skb;
+		unsigned int msize;
+		int i;
+
+		if (!txm->started_tx) {
+			psock = reserve_psock(kcm);
+			if (!psock)
+				goto out;
+			skb = head;
+			txm->frag_offset = 0;
+			txm->sent = 0;
+			txm->started_tx = true;
+		} else {
+			if (WARN_ON(!psock)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			skb = txm->frag_skb;
+		}
+
 		if (WARN_ON(!skb_shinfo(skb)->nr_frags)) {
 			ret = -EINVAL;
 			goto out;
 		}
 
-		for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags;
-		     fragidx++) {
-			skb_frag_t *frag;
-
-			frag_offset = 0;
-do_frag:
-			frag = &skb_shinfo(skb)->frags[fragidx];
-			if (WARN_ON(!skb_frag_size(frag))) {
-				ret = -EINVAL;
-				goto out;
-			}
-
-			ret = kernel_sendpage(psock->sk->sk_socket,
-					      skb_frag_page(frag),
-					      skb_frag_off(frag) + frag_offset,
-					      skb_frag_size(frag) - frag_offset,
-					      MSG_DONTWAIT);
+		msize = 0;
+		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+			msize += skb_shinfo(skb)->frags[i].bv_len;
+
+		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE,
+			      skb_shinfo(skb)->frags, skb_shinfo(skb)->nr_frags,
+			      msize);
+		iov_iter_advance(&msg.msg_iter, txm->frag_offset);
+
+		do {
+			ret = sock_sendmsg(psock->sk->sk_socket, &msg);
 			if (ret <= 0) {
 				if (ret == -EAGAIN) {
 					/* Save state to try again when there's
 					 * write space on the socket
 					 */
-					txm->sent = sent;
-					txm->frag_offset = frag_offset;
-					txm->fragidx = fragidx;
 					txm->frag_skb = skb;
-
 					ret = 0;
 					goto out;
 				}
@@ -679,39 +662,36 @@ static int kcm_write_msgs(struct kcm_sock *kcm)
 						   true);
 				unreserve_psock(kcm);
 
-				txm->sent = 0;
+				txm->started_tx = false;
 				kcm_report_tx_retry(kcm);
 				ret = 0;
-
-				goto try_again;
+				goto retry;
 			}
 
-			sent += ret;
-			frag_offset += ret;
+			txm->sent += ret;
+			txm->frag_offset += ret;
 			KCM_STATS_ADD(psock->stats.tx_bytes, ret);
-			if (frag_offset < skb_frag_size(frag)) {
-				/* Not finished with this frag */
-				goto do_frag;
-			}
-		}
+		} while (msg.msg_iter.count > 0);
 
 		if (skb == head) {
 			if (skb_has_frag_list(skb)) {
-				skb = skb_shinfo(skb)->frag_list;
-				goto do_frag_list;
+				txm->frag_skb = skb_shinfo(skb)->frag_list;
+				txm->frag_offset = 0;
+				continue;
 			}
 		} else if (skb->next) {
-			skb = skb->next;
-			goto do_frag_list;
+			txm->frag_skb = skb->next;
+			txm->frag_offset = 0;
+			continue;
 		}
 
 		/* Successfully sent the whole packet, account for it. */
+		sk->sk_wmem_queued -= txm->sent;
+		total_sent += txm->sent;
 		skb_dequeue(&sk->sk_write_queue);
 		kfree_skb(head);
-		sk->sk_wmem_queued -= sent;
-		total_sent += sent;
 		KCM_STATS_INCR(psock->stats.tx_msgs);
-	} while ((head = skb_peek(&sk->sk_write_queue)));
+	}
 
 out:
 	if (!head) {
 		/* Done with all queued messages. */
...
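
For reference, the batching idiom the kcm rewrite above converges on — one
sendmsg(MSG_SPLICE_PAGES) call covering every page fragment of an skb, with
iov_iter_advance() skipping data an earlier partial send already covered —
can be sketched in isolation from kcm's psock state machine. This is a hedged
sketch; send_skb_frags() is a hypothetical name, not a function added by this
series.

#include <linux/skbuff.h>
#include <linux/socket.h>
#include <linux/uio.h>

/* Hedged sketch: splice all of an skb's page fragments to a socket in a
 * single sendmsg() call, resuming at frag_offset after a partial send.
 */
static int send_skb_frags(struct socket *sock, struct sk_buff *skb,
			  unsigned int frag_offset)
{
	struct msghdr msg = {
		.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
	};
	unsigned int msize = 0;
	int i;

	/* Sum the payload held in the skb's page fragments. */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
		msize += skb_frag_size(&skb_shinfo(skb)->frags[i]);

	/* skb frags are bio_vecs, so they can back the iterator directly. */
	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, skb_shinfo(skb)->frags,
		      skb_shinfo(skb)->nr_frags, msize);

	/* Resume mid-message after an earlier short or -EAGAIN send. */
	iov_iter_advance(&msg.msg_iter, frag_offset);

	return sock_sendmsg(sock, &msg);
}

A caller would loop on this, advancing its saved offset by the return value
until the iterator is drained — which is what the new kcm_write_msgs() do/while
loop does with txm->frag_offset.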
@@ -1203,13 +1203,14 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec,
 			     int flags)
 {
-	return kernel_sendpage(sock, virt_to_page(vec->iov_base),
-			       offset_in_page(vec->iov_base),
-			       vec->iov_len, flags);
+	struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES | flags, };
+
+	iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, vec, 1, vec->iov_len);
+	return sock_sendmsg(sock, &msg);
 }
 
 /*
- * kernel_sendpage() is used exclusively to reduce the number of
+ * MSG_SPLICE_PAGES is used exclusively to reduce the number of
  * copy operations in this path. Therefore the caller must ensure
  * that the pages backing @xdr are unchanging.
  *
@@ -1249,28 +1250,13 @@ static int svc_tcp_sendmsg(struct socket *sock, struct xdr_buf *xdr,
 	if (ret != head->iov_len)
 		goto out;
 
-	if (xdr->page_len) {
-		unsigned int offset, len, remaining;
-		struct bio_vec *bvec;
-
-		bvec = xdr->bvec + (xdr->page_base >> PAGE_SHIFT);
-		offset = offset_in_page(xdr->page_base);
-		remaining = xdr->page_len;
-		while (remaining > 0) {
-			len = min(remaining, bvec->bv_len - offset);
-			ret = kernel_sendpage(sock, bvec->bv_page,
-					      bvec->bv_offset + offset,
-					      len, 0);
-			if (ret < 0)
-				return ret;
-			*sentp += ret;
-			if (ret != len)
-				goto out;
-			remaining -= len;
-			offset = 0;
-			bvec++;
-		}
-	}
+	msg.msg_flags = MSG_SPLICE_PAGES;
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, xdr->bvec,
+		      xdr_buf_pagecount(xdr), xdr->page_len);
+	ret = sock_sendmsg(sock, &msg);
+	if (ret < 0)
+		return ret;
+	*sentp += ret;
 
 	if (tail->iov_len) {
 		ret = svc_tcp_send_kvec(sock, tail, 0);
...