Commit 433b0a12 authored by Ilya Dryomov

libceph: use MSG_SENDPAGE_NOTLAST with ceph_tcp_sendpage()

Prevent do_tcp_sendpages() from calling tcp_push() (at least) once per
page.  Instead, arrange for tcp_push() to be called (at least) once per
data payload.  This results in more MSS-sized packets and fewer packets
overall (5-10% reduction in my tests with typical OSD request sizes).
See commits 2f533844 ("tcp: allow splice() to build full TSO
packets"), 35f9c09f ("tcp: tcp_sendpages() should call tcp_push()
once") and ae62ca7b ("tcp: fix MSG_SENDPAGE_NOTLAST logic") for
details.

Here is an example of a packet size histogram for 128K OSD requests
(MSS = 1448, top 5):

Before:

     SIZE    COUNT
     1448   777700
      952   127915
     1200    39238
     1219     9806
       21     5675

After:

     SIZE    COUNT
     1448   897280
       21     6201
     1019     2797
      643     2739
      376     2479

We could do slightly better by explicitly corking the socket but it's
not clear it's worth it.
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
parent 3239eb52
...@@ -560,12 +560,15 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, ...@@ -560,12 +560,15 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
return r; return r;
} }
/*
* @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST
*/
static int ceph_tcp_sendpage(struct socket *sock, struct page *page, static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
int offset, size_t size, bool more) int offset, size_t size, int more)
{ {
ssize_t (*sendpage)(struct socket *sock, struct page *page, ssize_t (*sendpage)(struct socket *sock, struct page *page,
int offset, size_t size, int flags); int offset, size_t size, int flags);
int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : 0); int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more;
int ret; int ret;
/* /*
...@@ -1552,6 +1555,7 @@ static int write_partial_message_data(struct ceph_connection *con) ...@@ -1552,6 +1555,7 @@ static int write_partial_message_data(struct ceph_connection *con)
struct ceph_msg *msg = con->out_msg; struct ceph_msg *msg = con->out_msg;
struct ceph_msg_data_cursor *cursor = &msg->cursor; struct ceph_msg_data_cursor *cursor = &msg->cursor;
bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
u32 crc; u32 crc;
dout("%s %p msg %p\n", __func__, con, msg); dout("%s %p msg %p\n", __func__, con, msg);
...@@ -1580,8 +1584,10 @@ static int write_partial_message_data(struct ceph_connection *con) ...@@ -1580,8 +1584,10 @@ static int write_partial_message_data(struct ceph_connection *con)
} }
page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
if (length == cursor->total_resid)
more = MSG_MORE;
ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, ret = ceph_tcp_sendpage(con->sock, page, page_offset, length,
true); more);
if (ret <= 0) { if (ret <= 0) {
if (do_datacrc) if (do_datacrc)
msg->footer.data_crc = cpu_to_le32(crc); msg->footer.data_crc = cpu_to_le32(crc);
...@@ -1611,13 +1617,16 @@ static int write_partial_message_data(struct ceph_connection *con) ...@@ -1611,13 +1617,16 @@ static int write_partial_message_data(struct ceph_connection *con)
*/ */
static int write_partial_skip(struct ceph_connection *con) static int write_partial_skip(struct ceph_connection *con)
{ {
int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
int ret; int ret;
dout("%s %p %d left\n", __func__, con, con->out_skip); dout("%s %p %d left\n", __func__, con, con->out_skip);
while (con->out_skip > 0) { while (con->out_skip > 0) {
size_t size = min(con->out_skip, (int) PAGE_SIZE); size_t size = min(con->out_skip, (int) PAGE_SIZE);
ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, true); if (size == con->out_skip)
more = MSG_MORE;
ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, more);
if (ret <= 0) if (ret <= 0)
goto out; goto out;
con->out_skip -= ret; con->out_skip -= ret;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment