Commit 5d659b1d authored by David S. Miller

Merge branch 'tcp-mmap-rework-zerocopy-receive'

Eric Dumazet says:

====================
tcp: mmap: rework zerocopy receive

syzbot reported a lockdep issue caused by tcp mmap() support.

I implemented Andy Lutomirski's nice suggestions to resolve the
issue and to increase scalability as well.

The first patch adds a new getsockopt() operation and changes mmap()
behavior.

The second patch changes the tcp_mmap reference program accordingly.

v4: tcp mmap() support now depends on CONFIG_MMU, as the kbuild bot told us.

v3: changed TCP_ZEROCOPY_RECEIVE to be a getsockopt() option
    instead of setsockopt(), per feedback from Ka-Cheong Poon

v2: Added a missing page alignment of zc->length in tcp_zerocopy_receive().
    Properly clear zc->recv_skip_hint in case the user request was fully completed.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 589f84fb aacb0c2e
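
To make the new flow concrete before the diff: userspace mmap()s a read-only window on the socket once, then issues one getsockopt(TCP_ZEROCOPY_RECEIVE) per batch; the kernel inserts full pages of payload into the window and reports through recv_skip_hint how many bytes must still be drained with a regular read(). Below is a minimal userspace sketch of that flow, not part of the patches themselves: it assumes a kernel carrying this series, a connected blocking TCP socket "fd", and an arbitrary page-multiple window size CHUNK; zerocopy_receive_all and copybuf are names local to the sketch, and error handling is abbreviated.

/* Minimal sketch: receive a whole stream, preferring the zerocopy path. */
#include <poll.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <netinet/in.h>         /* IPPROTO_TCP */
#include <linux/tcp.h>          /* TCP_ZEROCOPY_RECEIVE, struct tcp_zerocopy_receive */

#define CHUNK (512 * 1024)      /* assumption: any multiple of the page size */

static long zerocopy_receive_all(int fd)
{
        char copybuf[4096];
        long total = 0;
        void *addr;

        /* tcp_mmap() only reserves a read-only VMA here; pages are
         * inserted later by each TCP_ZEROCOPY_RECEIVE call.
         */
        addr = mmap(NULL, CHUNK, PROT_READ, MAP_SHARED, fd, 0);
        if (addr == MAP_FAILED)
                return -1;

        for (;;) {
                struct pollfd pfd = { .fd = fd, .events = POLLIN };
                struct tcp_zerocopy_receive zc;
                socklen_t zc_len = sizeof(zc);
                size_t want = sizeof(copybuf);
                ssize_t lu;

                poll(&pfd, 1, 10000);

                memset(&zc, 0, sizeof(zc));
                zc.address = (__u64)(unsigned long)addr;
                zc.length = CHUNK;
                if (getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE,
                               &zc, &zc_len) == 0 && zc.length) {
                        /* zc.length bytes of payload are now mapped at addr:
                         * consume them in place, no copy. The next call zaps
                         * and reuses the same window.
                         */
                        total += zc.length;
                        if (!zc.recv_skip_hint)
                                continue;
                }
                /* Bytes the kernel could not map (linear skb data, partial
                 * pages, or nothing mappable) take the regular copy path;
                 * read() returning 0 means the peer closed the connection.
                 */
                if (zc.recv_skip_hint && zc.recv_skip_hint < want)
                        want = zc.recv_skip_hint;
                lu = read(fd, copybuf, want);
                if (lu <= 0)
                        break;
                total += lu;
        }
        munmap(addr, CHUNK);
        return total;
}

A real application would likely also raise SO_RCVLOWAT to the chunk size to limit wakeups, as the tcp_mmap selftest below does.
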
@@ -122,6 +122,7 @@ enum {
 #define TCP_MD5SIG_EXT          32      /* TCP MD5 Signature with extensions */
 #define TCP_FASTOPEN_KEY        33      /* Set the key for Fast Open (cookie) */
 #define TCP_FASTOPEN_NO_COOKIE  34      /* Enable TFO without a TFO cookie */
+#define TCP_ZEROCOPY_RECEIVE    35
 
 struct tcp_repair_opt {
         __u32   opt_code;
@@ -276,4 +277,11 @@ struct tcp_diag_md5sig {
         __u8    tcpm_key[TCP_MD5SIG_MAXKEYLEN];
 };
 
+/* getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */
+
+struct tcp_zerocopy_receive {
+        __u64 address;          /* in: address of mapping */
+        __u32 length;           /* in/out: number of bytes to map/mapped */
+        __u32 recv_skip_hint;   /* out: amount of bytes to skip */
+};
 #endif /* _UAPI_LINUX_TCP_H */
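
To make the in/out convention concrete (numbers purely illustrative): with 81000 bytes queued, a caller passing length = 131072 gets length truncated to 77824 (19 full pages) before mapping. If all 19 pages can be inserted, length = 77824 comes back with recv_skip_hint = 0, and the 3176-byte tail is picked up by a later call or a regular read(). If instead the mapping stops early on a frag that is not a full, page-aligned page, length reports what was mapped and recv_skip_hint tells the caller how many bytes to drain with read() before trying again.
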
@@ -994,7 +994,9 @@ const struct proto_ops inet_stream_ops = {
         .getsockopt        = sock_common_getsockopt,
         .sendmsg           = inet_sendmsg,
         .recvmsg           = inet_recvmsg,
+#ifdef CONFIG_MMU
         .mmap              = tcp_mmap,
+#endif
         .sendpage          = inet_sendpage,
         .splice_read       = tcp_splice_read,
         .read_sock         = tcp_read_sock,
@@ -1726,118 +1726,113 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
 }
 EXPORT_SYMBOL(tcp_set_rcvlowat);
 
-/* When user wants to mmap X pages, we first need to perform the mapping
- * before freeing any skbs in receive queue, otherwise user would be unable
- * to fallback to standard recvmsg(). This happens if some data in the
- * requested block is not exactly fitting in a page.
- *
- * We only support order-0 pages for the moment.
- * mmap() on TCP is very strict, there is no point
- * trying to accommodate with pathological layouts.
- */
+#ifdef CONFIG_MMU
+static const struct vm_operations_struct tcp_vm_ops = {
+};
+
 int tcp_mmap(struct file *file, struct socket *sock,
              struct vm_area_struct *vma)
 {
-        unsigned long size = vma->vm_end - vma->vm_start;
-        unsigned int nr_pages = size >> PAGE_SHIFT;
-        struct page **pages_array = NULL;
-        u32 seq, len, offset, nr = 0;
-        struct sock *sk = sock->sk;
-        const skb_frag_t *frags;
+        if (vma->vm_flags & (VM_WRITE | VM_EXEC))
+                return -EPERM;
+        vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
+
+        /* Instruct vm_insert_page() to not down_read(mmap_sem) */
+        vma->vm_flags |= VM_MIXEDMAP;
+
+        vma->vm_ops = &tcp_vm_ops;
+        return 0;
+}
+EXPORT_SYMBOL(tcp_mmap);
+
+static int tcp_zerocopy_receive(struct sock *sk,
+                                struct tcp_zerocopy_receive *zc)
+{
+        unsigned long address = (unsigned long)zc->address;
+        const skb_frag_t *frags = NULL;
+        u32 length = 0, seq, offset;
+        struct vm_area_struct *vma;
+        struct sk_buff *skb = NULL;
         struct tcp_sock *tp;
-        struct sk_buff *skb;
         int ret;
 
-        if (vma->vm_pgoff || !nr_pages)
+        if (address & (PAGE_SIZE - 1) || address != zc->address)
                 return -EINVAL;
 
-        if (vma->vm_flags & VM_WRITE)
-                return -EPERM;
-        /* TODO: Maybe the following is not needed if pages are COW */
-        vma->vm_flags &= ~VM_MAYWRITE;
-
-        lock_sock(sk);
-
-        ret = -ENOTCONN;
         if (sk->sk_state == TCP_LISTEN)
-                goto out;
+                return -ENOTCONN;
 
         sock_rps_record_flow(sk);
 
-        if (tcp_inq(sk) < size) {
-                ret = sock_flag(sk, SOCK_DONE) ? -EIO : -EAGAIN;
+        down_read(&current->mm->mmap_sem);
+
+        ret = -EINVAL;
+        vma = find_vma(current->mm, address);
+        if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops)
                 goto out;
-        }
+        zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
+
         tp = tcp_sk(sk);
         seq = tp->copied_seq;
-        /* Abort if urgent data is in the area */
-        if (unlikely(tp->urg_data)) {
-                u32 urg_offset = tp->urg_seq - seq;
+        zc->length = min_t(u32, zc->length, tcp_inq(sk));
+        zc->length &= ~(PAGE_SIZE - 1);
 
-                ret = -EINVAL;
-                if (urg_offset < size)
-                        goto out;
-        }
-        ret = -ENOMEM;
-        pages_array = kvmalloc_array(nr_pages, sizeof(struct page *),
-                                     GFP_KERNEL);
-        if (!pages_array)
-                goto out;
-        skb = tcp_recv_skb(sk, seq, &offset);
-        ret = -EINVAL;
-skb_start:
-        /* We do not support anything not in page frags */
-        offset -= skb_headlen(skb);
-        if ((int)offset < 0)
-                goto out;
-        if (skb_has_frag_list(skb))
-                goto out;
-        len = skb->data_len - offset;
-        frags = skb_shinfo(skb)->frags;
-        while (offset) {
-                if (frags->size > offset)
-                        goto out;
-                offset -= frags->size;
-                frags++;
-        }
-        while (nr < nr_pages) {
-                if (len) {
-                        if (len < PAGE_SIZE)
-                                goto out;
-                        if (frags->size != PAGE_SIZE || frags->page_offset)
-                                goto out;
-                        pages_array[nr++] = skb_frag_page(frags);
-                        frags++;
-                        len -= PAGE_SIZE;
-                        seq += PAGE_SIZE;
-                        continue;
+        zap_page_range(vma, address, zc->length);
+
+        zc->recv_skip_hint = 0;
+        ret = 0;
+        while (length + PAGE_SIZE <= zc->length) {
+                if (zc->recv_skip_hint < PAGE_SIZE) {
+                        if (skb) {
+                                skb = skb->next;
+                                offset = seq - TCP_SKB_CB(skb)->seq;
+                        } else {
+                                skb = tcp_recv_skb(sk, seq, &offset);
+                        }
+
+                        zc->recv_skip_hint = skb->len - offset;
+                        offset -= skb_headlen(skb);
+                        if ((int)offset < 0 || skb_has_frag_list(skb))
+                                break;
+                        frags = skb_shinfo(skb)->frags;
+                        while (offset) {
+                                if (frags->size > offset)
+                                        goto out;
+                                offset -= frags->size;
+                                frags++;
+                        }
                 }
-                skb = skb->next;
-                offset = seq - TCP_SKB_CB(skb)->seq;
-                goto skb_start;
-        }
-        /* OK, we have a full set of pages ready to be inserted into vma */
-        for (nr = 0; nr < nr_pages; nr++) {
-                ret = vm_insert_page(vma, vma->vm_start + (nr << PAGE_SHIFT),
-                                     pages_array[nr]);
+                if (frags->size != PAGE_SIZE || frags->page_offset)
+                        break;
+                ret = vm_insert_page(vma, address + length,
+                                     skb_frag_page(frags));
                 if (ret)
-                        goto out;
+                        break;
+                length += PAGE_SIZE;
+                seq += PAGE_SIZE;
+                zc->recv_skip_hint -= PAGE_SIZE;
+                frags++;
         }
-        /* operation is complete, we can 'consume' all skbs */
-        tp->copied_seq = seq;
-        tcp_rcv_space_adjust(sk);
-
-        /* Clean up data we have read: This will do ACK frames. */
-        tcp_recv_skb(sk, seq, &offset);
-        tcp_cleanup_rbuf(sk, size);
-        ret = 0;
 out:
-        release_sock(sk);
-        kvfree(pages_array);
+        up_read(&current->mm->mmap_sem);
+        if (length) {
+                tp->copied_seq = seq;
+                tcp_rcv_space_adjust(sk);
+
+                /* Clean up data we have read: This will do ACK frames. */
+                tcp_recv_skb(sk, seq, &offset);
+                tcp_cleanup_rbuf(sk, length);
+                ret = 0;
+                if (length == zc->length)
+                        zc->recv_skip_hint = 0;
+        } else {
+                if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
+                        ret = -EIO;
+        }
+        zc->length = length;
+        return ret;
+}
+#endif
 
 static void tcp_update_recv_tstamps(struct sk_buff *skb,
                                     struct scm_timestamping *tss)
@@ -3472,6 +3467,25 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
                 }
                 return 0;
         }
+#ifdef CONFIG_MMU
+        case TCP_ZEROCOPY_RECEIVE: {
+                struct tcp_zerocopy_receive zc;
+                int err;
+
+                if (get_user(len, optlen))
+                        return -EFAULT;
+                if (len != sizeof(zc))
+                        return -EINVAL;
+                if (copy_from_user(&zc, optval, len))
+                        return -EFAULT;
+                lock_sock(sk);
+                err = tcp_zerocopy_receive(sk, &zc);
+                release_sock(sk);
+                if (!err && copy_to_user(optval, &zc, len))
+                        err = -EFAULT;
+                return err;
+        }
+#endif
         default:
                 return -ENOPROTOOPT;
         }
@@ -578,7 +578,9 @@ const struct proto_ops inet6_stream_ops = {
         .getsockopt        = sock_common_getsockopt,    /* ok           */
         .sendmsg           = inet_sendmsg,              /* ok           */
         .recvmsg           = inet_recvmsg,              /* ok           */
+#ifdef CONFIG_MMU
         .mmap              = tcp_mmap,
+#endif
         .sendpage          = inet_sendpage,
         .sendmsg_locked    = tcp_sendmsg_locked,
         .sendpage_locked   = tcp_sendpage_locked,
@@ -76,9 +76,10 @@
 #include <time.h>
 #include <sys/time.h>
 #include <netinet/in.h>
-#include <netinet/tcp.h>
 #include <arpa/inet.h>
 #include <poll.h>
+#include <linux/tcp.h>
+#include <assert.h>
 
 #ifndef MSG_ZEROCOPY
 #define MSG_ZEROCOPY    0x4000000
@@ -134,11 +135,12 @@ void hash_zone(void *zone, unsigned int length)
 void *child_thread(void *arg)
 {
         unsigned long total_mmap = 0, total = 0;
+        struct tcp_zerocopy_receive zc;
         unsigned long delta_usec;
         int flags = MAP_SHARED;
         struct timeval t0, t1;
         char *buffer = NULL;
-        void *oaddr = NULL;
+        void *addr = NULL;
         double throughput;
         struct rusage ru;
         int lu, fd;
@@ -153,41 +155,46 @@ void *child_thread(void *arg)
                 perror("malloc");
                 goto error;
         }
+        if (zflg) {
+                addr = mmap(NULL, chunk_size, PROT_READ, flags, fd, 0);
+                if (addr == (void *)-1)
+                        zflg = 0;
+        }
         while (1) {
                 struct pollfd pfd = { .fd = fd, .events = POLLIN, };
                 int sub;
 
                 poll(&pfd, 1, 10000);
                 if (zflg) {
-                        void *naddr;
+                        socklen_t zc_len = sizeof(zc);
+                        int res;
 
-                        naddr = mmap(oaddr, chunk_size, PROT_READ, flags, fd, 0);
-                        if (naddr == (void *)-1) {
-                                if (errno == EAGAIN) {
-                                        /* That is if SO_RCVLOWAT is buggy */
-                                        usleep(1000);
-                                        continue;
-                                }
-                                if (errno == EINVAL) {
-                                        flags = MAP_SHARED;
-                                        oaddr = NULL;
-                                        goto fallback;
-                                }
-                                if (errno != EIO)
-                                        perror("mmap()");
+                        zc.address = (__u64)addr;
+                        zc.length = chunk_size;
+                        zc.recv_skip_hint = 0;
+                        res = getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE,
+                                         &zc, &zc_len);
+                        if (res == -1)
                                 break;
+
+                        if (zc.length) {
+                                assert(zc.length <= chunk_size);
+                                total_mmap += zc.length;
+                                if (xflg)
+                                        hash_zone(addr, zc.length);
+                                total += zc.length;
                         }
-                        total_mmap += chunk_size;
-                        if (xflg)
-                                hash_zone(naddr, chunk_size);
-                        total += chunk_size;
-                        if (!keepflag) {
-                                flags |= MAP_FIXED;
-                                oaddr = naddr;
+                        if (zc.recv_skip_hint) {
+                                assert(zc.recv_skip_hint <= chunk_size);
+                                lu = read(fd, buffer, zc.recv_skip_hint);
+                                if (lu > 0) {
+                                        if (xflg)
+                                                hash_zone(buffer, lu);
+                                        total += lu;
+                                }
                         }
                         continue;
                 }
-fallback:
                 sub = 0;
                 while (sub < chunk_size) {
                         lu = read(fd, buffer + sub, chunk_size - sub);
@@ -228,6 +235,8 @@ void *child_thread(void *arg)
 error:
         free(buffer);
         close(fd);
+        if (zflg)
+                munmap(addr, chunk_size);
         pthread_exit(0);
 }
@@ -371,7 +380,8 @@ int main(int argc, char *argv[])
         setup_sockaddr(cfg_family, host, &listenaddr);
 
         if (mss &&
-            setsockopt(fdlisten, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) {
+            setsockopt(fdlisten, IPPROTO_TCP, TCP_MAXSEG,
+                       &mss, sizeof(mss)) == -1) {
                 perror("setsockopt TCP_MAXSEG");
                 exit(1);
         }
@@ -402,7 +412,7 @@ int main(int argc, char *argv[])
         setup_sockaddr(cfg_family, host, &addr);
 
         if (mss &&
-            setsockopt(fd, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) {
+            setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) {
                 perror("setsockopt TCP_MAXSEG");
                 exit(1);
         }
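
With both patches applied, the reference program can be run with zerocopy enabled on both ends, roughly as follows (flag letters read from tcp_mmap.c's option parsing; treat the exact invocation as an assumption for your tree):

./tcp_mmap -s -z &        # server: receives via mmap() + TCP_ZEROCOPY_RECEIVE
./tcp_mmap -H ::1 -z      # client: sends with MSG_ZEROCOPY

Comparing against a run without -z shows how much the copy to userspace was costing.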