Commit 2d48d67f, authored by Eliezer Tamir, committed by David S. Miller

net: poll/select low latency socket support

select/poll busy-poll support.

Split the sysctl value into two separate ones, one for read and one for poll.
Updated Documentation/sysctl/net.txt accordingly.

Add a new poll flag POLL_LL. When this flag is set, sock_poll will call
sk_poll_ll if possible. sock_poll sets this flag in its return value
to indicate to select/poll when a socket that can busy poll is found.

When poll/select have nothing to report, call the low-level
sock_poll again until we are out of time or we find something.

Once the system call finds something, it stops setting POLL_LL, so it can
return the result to the user ASAP.
Signed-off-by: Eliezer Tamir <eliezer.tamir@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent e4f2379d
...@@ -50,13 +50,27 @@ The maximum number of packets that kernel can handle on a NAPI interrupt, ...@@ -50,13 +50,27 @@ The maximum number of packets that kernel can handle on a NAPI interrupt,
it's a Per-CPU variable. it's a Per-CPU variable.
Default: 64 Default: 64
low_latency_poll low_latency_read
---------------- ----------------
Low latency busy poll timeout. (needs CONFIG_NET_LL_RX_POLL) Low latency busy poll timeout for socket reads. (needs CONFIG_NET_LL_RX_POLL)
Approximate time in us to spin waiting for packets on the device queue. Approximate time in us to spin waiting for packets on the device queue.
This sets the default value of the SO_LL socket option.
Can be set or overridden per socket by setting socket option SO_LL.
Recommended value is 50. May increase power usage. Recommended value is 50. May increase power usage.
Default: 0 (off) Default: 0 (off)
low_latency_poll
----------------
Low latency busy poll timeout for poll and select. (needs CONFIG_NET_LL_RX_POLL)
Approximate time in us to spin waiting for packets on the device queue.
Recommended value depends on the number of sockets you poll on.
For several sockets 50, for several hundreds 100.
For more than that you probably want to use epoll.
Note that only sockets with SO_LL set will be busy polled, so you want to either
selectively set SO_LL on those sockets or set the net.core.low_latency_read sysctl globally.
May increase power usage.
Default: 0 (off)
rmem_default rmem_default
------------ ------------
......
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include <linux/rcupdate.h> #include <linux/rcupdate.h>
#include <linux/hrtimer.h> #include <linux/hrtimer.h>
#include <linux/sched/rt.h> #include <linux/sched/rt.h>
#include <net/ll_poll.h>
#include <asm/uaccess.h> #include <asm/uaccess.h>
...@@ -384,9 +385,10 @@ static int max_select_fd(unsigned long n, fd_set_bits *fds) ...@@ -384,9 +385,10 @@ static int max_select_fd(unsigned long n, fd_set_bits *fds)
#define POLLEX_SET (POLLPRI) #define POLLEX_SET (POLLPRI)
static inline void wait_key_set(poll_table *wait, unsigned long in, static inline void wait_key_set(poll_table *wait, unsigned long in,
unsigned long out, unsigned long bit) unsigned long out, unsigned long bit,
unsigned int ll_flag)
{ {
wait->_key = POLLEX_SET; wait->_key = POLLEX_SET | ll_flag;
if (in & bit) if (in & bit)
wait->_key |= POLLIN_SET; wait->_key |= POLLIN_SET;
if (out & bit) if (out & bit)
...@@ -400,6 +402,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) ...@@ -400,6 +402,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
poll_table *wait; poll_table *wait;
int retval, i, timed_out = 0; int retval, i, timed_out = 0;
unsigned long slack = 0; unsigned long slack = 0;
unsigned int ll_flag = POLL_LL;
u64 ll_time = ll_end_time();
rcu_read_lock(); rcu_read_lock();
retval = max_select_fd(n, fds); retval = max_select_fd(n, fds);
...@@ -422,6 +426,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) ...@@ -422,6 +426,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
retval = 0; retval = 0;
for (;;) { for (;;) {
unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
bool can_ll = false;
inp = fds->in; outp = fds->out; exp = fds->ex; inp = fds->in; outp = fds->out; exp = fds->ex;
rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
...@@ -449,7 +454,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) ...@@ -449,7 +454,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
f_op = f.file->f_op; f_op = f.file->f_op;
mask = DEFAULT_POLLMASK; mask = DEFAULT_POLLMASK;
if (f_op && f_op->poll) { if (f_op && f_op->poll) {
wait_key_set(wait, in, out, bit); wait_key_set(wait, in, out,
bit, ll_flag);
mask = (*f_op->poll)(f.file, wait); mask = (*f_op->poll)(f.file, wait);
} }
fdput(f); fdput(f);
...@@ -468,6 +474,11 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) ...@@ -468,6 +474,11 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
retval++; retval++;
wait->_qproc = NULL; wait->_qproc = NULL;
} }
if (mask & POLL_LL)
can_ll = true;
/* got something, stop busy polling */
if (retval)
ll_flag = 0;
} }
} }
if (res_in) if (res_in)
...@@ -486,6 +497,9 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) ...@@ -486,6 +497,9 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
break; break;
} }
if (can_ll && can_poll_ll(ll_time))
continue;
/* /*
* If this is the first loop and we have a timeout * If this is the first loop and we have a timeout
* given, then we convert to ktime_t and set the to * given, then we convert to ktime_t and set the to
...@@ -717,7 +731,8 @@ struct poll_list { ...@@ -717,7 +731,8 @@ struct poll_list {
* pwait poll_table will be used by the fd-provided poll handler for waiting, * pwait poll_table will be used by the fd-provided poll handler for waiting,
* if pwait->_qproc is non-NULL. * if pwait->_qproc is non-NULL.
*/ */
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
bool *can_ll, unsigned int ll_flag)
{ {
unsigned int mask; unsigned int mask;
int fd; int fd;
...@@ -731,7 +746,10 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) ...@@ -731,7 +746,10 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
mask = DEFAULT_POLLMASK; mask = DEFAULT_POLLMASK;
if (f.file->f_op && f.file->f_op->poll) { if (f.file->f_op && f.file->f_op->poll) {
pwait->_key = pollfd->events|POLLERR|POLLHUP; pwait->_key = pollfd->events|POLLERR|POLLHUP;
pwait->_key |= ll_flag;
mask = f.file->f_op->poll(f.file, pwait); mask = f.file->f_op->poll(f.file, pwait);
if (mask & POLL_LL)
*can_ll = true;
} }
/* Mask out unneeded events. */ /* Mask out unneeded events. */
mask &= pollfd->events | POLLERR | POLLHUP; mask &= pollfd->events | POLLERR | POLLHUP;
...@@ -750,6 +768,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list, ...@@ -750,6 +768,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
ktime_t expire, *to = NULL; ktime_t expire, *to = NULL;
int timed_out = 0, count = 0; int timed_out = 0, count = 0;
unsigned long slack = 0; unsigned long slack = 0;
unsigned int ll_flag = POLL_LL;
u64 ll_time = ll_end_time();
/* Optimise the no-wait case */ /* Optimise the no-wait case */
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
...@@ -762,6 +782,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list, ...@@ -762,6 +782,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
for (;;) { for (;;) {
struct poll_list *walk; struct poll_list *walk;
bool can_ll = false;
for (walk = list; walk != NULL; walk = walk->next) { for (walk = list; walk != NULL; walk = walk->next) {
struct pollfd * pfd, * pfd_end; struct pollfd * pfd, * pfd_end;
...@@ -776,9 +797,10 @@ static int do_poll(unsigned int nfds, struct poll_list *list, ...@@ -776,9 +797,10 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
* this. They'll get immediately deregistered * this. They'll get immediately deregistered
* when we break out and return. * when we break out and return.
*/ */
if (do_pollfd(pfd, pt)) { if (do_pollfd(pfd, pt, &can_ll, ll_flag)) {
count++; count++;
pt->_qproc = NULL; pt->_qproc = NULL;
ll_flag = 0;
} }
} }
} }
...@@ -795,6 +817,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list, ...@@ -795,6 +817,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
if (count || timed_out) if (count || timed_out)
break; break;
if (can_ll && can_poll_ll(ll_time))
continue;
/* /*
* If this is the first loop and we have a timeout * If this is the first loop and we have a timeout
* given, then we convert to ktime_t and set the to * given, then we convert to ktime_t and set the to
......
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
#ifdef CONFIG_NET_LL_RX_POLL #ifdef CONFIG_NET_LL_RX_POLL
struct napi_struct; struct napi_struct;
extern unsigned int sysctl_net_ll_read __read_mostly;
extern unsigned int sysctl_net_ll_poll __read_mostly; extern unsigned int sysctl_net_ll_poll __read_mostly;
/* return values from ndo_ll_poll */ /* return values from ndo_ll_poll */
...@@ -38,17 +39,18 @@ extern unsigned int sysctl_net_ll_poll __read_mostly; ...@@ -38,17 +39,18 @@ extern unsigned int sysctl_net_ll_poll __read_mostly;
/* we can use sched_clock() because we don't care much about precision /* we can use sched_clock() because we don't care much about precision
* we only care that the average is bounded * we only care that the average is bounded
* we don't mind a ~2.5% imprecision so <<10 instead of *1000
* sk->sk_ll_usec is a u_int so this can't overflow
*/ */
static inline u64 ll_end_time(struct sock *sk) static inline u64 ll_sk_end_time(struct sock *sk)
{ {
u64 end_time = ACCESS_ONCE(sk->sk_ll_usec); return ((u64)ACCESS_ONCE(sk->sk_ll_usec) << 10) + sched_clock();
}
/* we don't mind a ~2.5% imprecision
* sk->sk_ll_usec is a u_int so this can't overflow
*/
end_time = (end_time << 10) + sched_clock();
return end_time; /* in poll/select we use the global sysctl_net_ll_poll value */
static inline u64 ll_end_time(void)
{
return ((u64)ACCESS_ONCE(sysctl_net_ll_poll) << 10) + sched_clock();
} }
static inline bool sk_valid_ll(struct sock *sk) static inline bool sk_valid_ll(struct sock *sk)
...@@ -62,10 +64,13 @@ static inline bool can_poll_ll(u64 end_time) ...@@ -62,10 +64,13 @@ static inline bool can_poll_ll(u64 end_time)
return !time_after64(sched_clock(), end_time); return !time_after64(sched_clock(), end_time);
} }
/* when used in sock_poll() nonblock is known at compile time to be true
* so the loop and end_time will be optimized out
*/
static inline bool sk_poll_ll(struct sock *sk, int nonblock) static inline bool sk_poll_ll(struct sock *sk, int nonblock)
{ {
u64 end_time = nonblock ? 0 : ll_sk_end_time(sk);
const struct net_device_ops *ops; const struct net_device_ops *ops;
u64 end_time = ll_end_time(sk);
struct napi_struct *napi; struct napi_struct *napi;
int rc = false; int rc = false;
...@@ -84,7 +89,6 @@ static inline bool sk_poll_ll(struct sock *sk, int nonblock) ...@@ -84,7 +89,6 @@ static inline bool sk_poll_ll(struct sock *sk, int nonblock)
goto out; goto out;
do { do {
rc = ops->ndo_ll_poll(napi); rc = ops->ndo_ll_poll(napi);
if (rc == LL_FLUSH_FAILED) if (rc == LL_FLUSH_FAILED)
...@@ -95,8 +99,8 @@ static inline bool sk_poll_ll(struct sock *sk, int nonblock) ...@@ -95,8 +99,8 @@ static inline bool sk_poll_ll(struct sock *sk, int nonblock)
NET_ADD_STATS_BH(sock_net(sk), NET_ADD_STATS_BH(sock_net(sk),
LINUX_MIB_LOWLATENCYRXPACKETS, rc); LINUX_MIB_LOWLATENCYRXPACKETS, rc);
} while (skb_queue_empty(&sk->sk_receive_queue) } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
&& can_poll_ll(end_time) && !nonblock); can_poll_ll(end_time));
rc = !skb_queue_empty(&sk->sk_receive_queue); rc = !skb_queue_empty(&sk->sk_receive_queue);
out: out:
...@@ -118,7 +122,12 @@ static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb) ...@@ -118,7 +122,12 @@ static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)
#else /* CONFIG_NET_LL_RX_POLL */ #else /* CONFIG_NET_LL_RX_POLL */
static inline u64 ll_end_time(struct sock *sk) static inline u64 sk_ll_end_time(struct sock *sk)
{
return 0;
}
static inline u64 ll_end_time(void)
{ {
return 0; return 0;
} }
......
...@@ -30,6 +30,8 @@ ...@@ -30,6 +30,8 @@
#define POLLFREE 0x4000 /* currently only for epoll */ #define POLLFREE 0x4000 /* currently only for epoll */
#define POLL_LL 0x8000
struct pollfd { struct pollfd {
int fd; int fd;
short events; short events;
......
...@@ -2307,7 +2307,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) ...@@ -2307,7 +2307,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
#ifdef CONFIG_NET_LL_RX_POLL #ifdef CONFIG_NET_LL_RX_POLL
sk->sk_napi_id = 0; sk->sk_napi_id = 0;
sk->sk_ll_usec = sysctl_net_ll_poll; sk->sk_ll_usec = sysctl_net_ll_read;
#endif #endif
/* /*
......
...@@ -306,6 +306,14 @@ static struct ctl_table net_core_table[] = { ...@@ -306,6 +306,14 @@ static struct ctl_table net_core_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec .proc_handler = proc_dointvec
}, },
{
.procname = "low_latency_read",
.data = &sysctl_net_ll_read,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec
},
#
#endif #endif
#endif /* CONFIG_NET */ #endif /* CONFIG_NET */
{ {
......
...@@ -107,6 +107,7 @@ ...@@ -107,6 +107,7 @@
#include <net/ll_poll.h> #include <net/ll_poll.h>
#ifdef CONFIG_NET_LL_RX_POLL #ifdef CONFIG_NET_LL_RX_POLL
unsigned int sysctl_net_ll_read __read_mostly;
unsigned int sysctl_net_ll_poll __read_mostly; unsigned int sysctl_net_ll_poll __read_mostly;
#endif #endif
...@@ -1147,13 +1148,24 @@ EXPORT_SYMBOL(sock_create_lite); ...@@ -1147,13 +1148,24 @@ EXPORT_SYMBOL(sock_create_lite);
/* No kernel lock held - perfect */ /* No kernel lock held - perfect */
static unsigned int sock_poll(struct file *file, poll_table *wait) static unsigned int sock_poll(struct file *file, poll_table *wait)
{ {
unsigned int ll_flag = 0;
struct socket *sock; struct socket *sock;
/* /*
* We can't return errors to poll, so it's either yes or no. * We can't return errors to poll, so it's either yes or no.
*/ */
sock = file->private_data; sock = file->private_data;
return sock->ops->poll(file, sock, wait);
if (sk_valid_ll(sock->sk)) {
/* this socket can poll_ll so tell the system call */
ll_flag = POLL_LL;
/* once, only if requested by syscall */
if (wait && (wait->_key & POLL_LL))
sk_poll_ll(sock->sk, 1);
}
return ll_flag | sock->ops->poll(file, sock, wait);
} }
static int sock_mmap(struct file *file, struct vm_area_struct *vma) static int sock_mmap(struct file *file, struct vm_area_struct *vma)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment