Commit 780bf05f authored by David S. Miller's avatar David S. Miller

Merge branch 'smc-improvements'

Tony Lu says:

====================
net/smc: Improvements for TCP_CORK and sendfile()

Currently, SMC use default implement for syscall sendfile() [1], which
is wildly used in nginx and big data sences. Usually, applications use
sendfile() with TCP_CORK:

fstat(20, {st_mode=S_IFREG|0644, st_size=4096, ...}) = 0
setsockopt(19, SOL_TCP, TCP_CORK, [1], 4) = 0
writev(19, [{iov_base="HTTP/1.1 200 OK\r\nServer: nginx/1"..., iov_len=240}], 1) = 240
sendfile(19, 20, [0] => [4096], 4096)   = 4096
close(20)                               = 0
setsockopt(19, SOL_TCP, TCP_CORK, [0], 4) = 0

The above is an example of Nginx, when sendfile() on, Nginx first
enables TCP_CORK, write headers, the data will not be sent. Then call
sendfile(), it reads file and write to sndbuf. When TCP_CORK is cleared,
all pending data is sent out.

The performance of the default implement of sendfile is lower than when
it is off. After investigation, it shows two parts to improve:
- unnecessary lock contention of delayed work
- less data per send than when sendfile off

Patch #1 tries to reduce lock_sock() contention in smc_tx_work().
Patch #2 removes timed work for corking, and let applications control
it. See TCP_CORK [2] MSG_MORE [3].
Patch #3 adds MSG_SENDPAGE_NOTLAST for corking more data when
sendfile().

Test environments:
- CPU Intel Xeon Platinum 8 core, mem 32 GiB, nic Mellanox CX4
- socket sndbuf / rcvbuf: 16384 / 131072 bytes
- server: smc_run nginx
- client: smc_run ./wrk -c 100 -t 2 -d 30 http://192.168.100.1:8080/4k.html
- payload: 4KB local disk file

Items                     QPS
sendfile off        272477.10
sendfile on (orig)  223622.79
sendfile on (this)  395847.21

This benchmark shows +45.28% improvement compared with sendfile off, and
+77.02% compared with original sendfile implement.

[1] https://man7.org/linux/man-pages/man2/sendfile.2.html
[2] https://linux.die.net/man/7/tcp
[3] https://man7.org/linux/man-pages/man2/send.2.html
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 01b2a995 be9a16cc
......@@ -2523,8 +2523,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
sk->sk_state != SMC_CLOSED) {
if (!val) {
SMC_STAT_INC(smc, cork_cnt);
mod_delayed_work(smc->conn.lgr->tx_wq,
&smc->conn.tx_work, 0);
smc_tx_pending(&smc->conn);
cancel_delayed_work(&smc->conn.tx_work);
}
}
break;
......@@ -2662,8 +2662,10 @@ static ssize_t smc_sendpage(struct socket *sock, struct page *page,
rc = kernel_sendpage(smc->clcsock, page, offset,
size, flags);
} else {
lock_sock(sk);
rc = smc_tx_sendpage(smc, page, offset, size, flags);
release_sock(sk);
SMC_STAT_INC(smc, sendpage_cnt);
rc = sock_no_sendpage(sock, page, offset, size, flags);
}
out:
......
......@@ -31,7 +31,6 @@
#include "smc_tracepoint.h"
#define SMC_TX_WORK_DELAY 0
#define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */
/***************************** sndbuf producer *******************************/
......@@ -236,16 +235,15 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
*/
if ((msg->msg_flags & MSG_OOB) && !send_remaining)
conn->urg_tx_pend = true;
if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) &&
(atomic_read(&conn->sndbuf_space) >
(conn->sndbuf_desc->len >> 1)))
/* for a corked socket defer the RDMA writes if there
* is still sufficient sndbuf_space available
if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc) ||
msg->msg_flags & MSG_SENDPAGE_NOTLAST) &&
(atomic_read(&conn->sndbuf_space)))
/* for a corked socket defer the RDMA writes if
* sndbuf_space is still available. The applications
* should known how/when to uncork it.
*/
queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
SMC_TX_CORK_DELAY);
else
smc_tx_sndbuf_nonempty(conn);
continue;
smc_tx_sndbuf_nonempty(conn);
trace_smc_tx_sendmsg(smc, copylen);
} /* while (msg_data_left(msg)) */
......@@ -260,6 +258,22 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
return rc;
}
int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset,
size_t size, int flags)
{
struct msghdr msg = {.msg_flags = flags};
char *kaddr = kmap(page);
struct kvec iov;
int rc;
iov.iov_base = kaddr + offset;
iov.iov_len = size;
iov_iter_kvec(&msg.msg_iter, WRITE, &iov, 1, size);
rc = smc_tx_sendmsg(smc, &msg, size);
kunmap(page);
return rc;
}
/***************************** sndbuf consumer *******************************/
/* sndbuf consumer: actual data transfer of one target chunk with ISM write */
......@@ -597,27 +611,32 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
return rc;
}
/* Wakeup sndbuf consumers from process context
* since there is more data to transmit
*/
void smc_tx_work(struct work_struct *work)
void smc_tx_pending(struct smc_connection *conn)
{
struct smc_connection *conn = container_of(to_delayed_work(work),
struct smc_connection,
tx_work);
struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
int rc;
lock_sock(&smc->sk);
if (smc->sk.sk_err)
goto out;
return;
rc = smc_tx_sndbuf_nonempty(conn);
if (!rc && conn->local_rx_ctrl.prod_flags.write_blocked &&
!atomic_read(&conn->bytes_to_rcv))
conn->local_rx_ctrl.prod_flags.write_blocked = 0;
}
/* Wakeup sndbuf consumers from process context
* since there is more data to transmit
*/
void smc_tx_work(struct work_struct *work)
{
struct smc_connection *conn = container_of(to_delayed_work(work),
struct smc_connection,
tx_work);
struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
out:
lock_sock(&smc->sk);
smc_tx_pending(conn);
release_sock(&smc->sk);
}
......
......@@ -27,9 +27,12 @@ static inline int smc_tx_prepared_sends(struct smc_connection *conn)
return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
}
void smc_tx_pending(struct smc_connection *conn);
void smc_tx_work(struct work_struct *work);
void smc_tx_init(struct smc_sock *smc);
int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset,
size_t size, int flags);
int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
void smc_tx_consumer_update(struct smc_connection *conn, bool force);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment