Commit ee3398c7 authored by David S. Miller

Merge branch 'net-skb-defer-freeing-polish'

Eric Dumazet says:

====================
net: polish skb defer freeing

While testing this recently added feature on a variety
of platforms/configurations, I found the following issues:

1) A race leading to concurrent calls to smp_call_function_single_async()

2) Missed opportunity to use napi_consume_skb()

3) Need to limit the max length of the per-cpu lists.

4) Process the per-cpu list more frequently, for the
   (unusual) case where net_rx_action() has multiple
   napi_poll() calls to process per round.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 3daebfbe 90987650
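
Before the diff itself, here is a minimal, self-contained userspace sketch (an editorial illustration, not part of the series) of the guard pattern used to fix issue 1): a per-target flag, called defer_ipi_scheduled in the patches below, is claimed with a compare-and-swap before a kick is issued and released by the handler after it runs, so at most one smp_call_function_single_async() call can be in flight per target. The helper names send_kick() and handle_kick() are made up for the example, and C11 atomics stand in for the kernel's cmpxchg() and smp_store_release().

/* Editorial sketch only: models the "at most one kick in flight" guard
 * with C11 atomics. Not kernel code; names are illustrative. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int defer_ipi_scheduled;	/* 0 = idle, 1 = kick pending */

/* Producer side: only send a kick if none is already pending. */
static void send_kick(void)
{
	int expected = 0;

	if (atomic_compare_exchange_strong(&defer_ipi_scheduled, &expected, 1))
		printf("kick sent\n");		/* would be smp_call_function_single_async() */
	else
		printf("kick already pending, skipped\n");
}

/* Handler side: do the deferred work, then allow the next kick. */
static void handle_kick(void)
{
	printf("flush deferred skbs\n");	/* would raise NET_RX_SOFTIRQ */
	atomic_store_explicit(&defer_ipi_scheduled, 0, memory_order_release);
}

int main(void)
{
	send_kick();	/* goes through */
	send_kick();	/* suppressed: previous kick not handled yet */
	handle_kick();	/* clears the flag */
	send_kick();	/* goes through again */
	return 0;
}

Run sequentially, the second send_kick() is suppressed while the first is still pending, which is the property that keeps the call_single_data from being queued twice concurrently.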
@@ -322,6 +322,14 @@ a leaked reference faster. A larger value may be useful to prevent false
 warnings on slow/loaded systems.
 Default value is 10, minimum 1, maximum 3600.
 
+skb_defer_max
+-------------
+
+Max size (in skbs) of the per-cpu list of skbs being freed
+by the cpu which allocated them. Used by TCP stack so far.
+
+Default: 64
+
 optmem_max
 ----------
...
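
As a quick, hypothetical illustration of the knob documented above (not part of the patch): once the series is applied, the value should be readable through the usual net.core procfs path, which follows from the sysctl table entry at the end of this diff. The program below only reads it back and assumes a standard /proc mount.

/* Hypothetical check of the new sysctl from userspace; assumes a kernel with
 * this series applied and procfs mounted at /proc. Not part of the patch. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/core/skb_defer_max", "r");
	unsigned int val;

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%u", &val) != 1) {
		fclose(f);
		return 1;
	}
	fclose(f);
	printf("skb_defer_max = %u\n", val);	/* default is 64 per this patch */
	return 0;
}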
@@ -3136,6 +3136,7 @@ struct softnet_data {
 	/* Another possibly contended cache line */
 	spinlock_t		defer_lock ____cacheline_aligned_in_smp;
 	int			defer_count;
+	int			defer_ipi_scheduled;
 	struct sk_buff		*defer_list;
 	call_single_data_t	defer_csd;
 };
...
@@ -4330,6 +4330,7 @@ int netdev_max_backlog __read_mostly = 1000;
 EXPORT_SYMBOL(netdev_max_backlog);
 
 int netdev_tstamp_prequeue __read_mostly = 1;
+unsigned int sysctl_skb_defer_max __read_mostly = 64;
 int netdev_budget __read_mostly = 300;
 
 /* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
 unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
@@ -4582,9 +4583,12 @@ static void rps_trigger_softirq(void *data)
 #endif /* CONFIG_RPS */
 
 /* Called from hardirq (IPI) context */
-static void trigger_rx_softirq(void *data __always_unused)
+static void trigger_rx_softirq(void *data)
 {
+	struct softnet_data *sd = data;
+
 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+	smp_store_release(&sd->defer_ipi_scheduled, 0);
 }
 
 /*
@@ -6630,7 +6634,7 @@ static void skb_defer_free_flush(struct softnet_data *sd)
 	while (skb != NULL) {
 		next = skb->next;
-		__kfree_skb(skb);
+		napi_consume_skb(skb, 1);
 		skb = next;
 	}
 }
@@ -6651,6 +6655,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
 	for (;;) {
 		struct napi_struct *n;
 
+		skb_defer_free_flush(sd);
+
 		if (list_empty(&list)) {
 			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
 				goto end;
@@ -6680,8 +6686,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 
 	net_rps_action_and_irq_enable(sd);
-end:
-	skb_defer_free_flush(sd);
+end:;
 }
 
 struct netdev_adjacent {
@@ -11382,7 +11387,7 @@ static int __init net_dev_init(void)
 		INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
 		sd->cpu = i;
 #endif
-		INIT_CSD(&sd->defer_csd, trigger_rx_softirq, NULL);
+		INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
 		spin_lock_init(&sd->defer_lock);
 
 		init_gro_hash(&sd->backlog);
...
@@ -39,7 +39,7 @@ void dev_addr_check(struct net_device *dev);
 /* sysctls not referred to from outside net/core/ */
 extern int		netdev_budget;
 extern unsigned int	netdev_budget_usecs;
+extern unsigned int	sysctl_skb_defer_max;
 extern int		netdev_tstamp_prequeue;
 extern int		netdev_unregister_timeout_secs;
 extern int		weight_p;
...
@@ -80,6 +80,7 @@
 #include <linux/user_namespace.h>
 #include <linux/indirect_call_wrapper.h>
 
+#include "dev.h"
 #include "sock_destructor.h"
 
 struct kmem_cache *skbuff_head_cache __ro_after_init;
@@ -6496,16 +6497,21 @@ void skb_attempt_defer_free(struct sk_buff *skb)
 	int cpu = skb->alloc_cpu;
 	struct softnet_data *sd;
 	unsigned long flags;
+	unsigned int defer_max;
 	bool kick;
 
 	if (WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
 	    !cpu_online(cpu) ||
 	    cpu == raw_smp_processor_id()) {
-		__kfree_skb(skb);
+nodefer:	__kfree_skb(skb);
 		return;
 	}
 
 	sd = &per_cpu(softnet_data, cpu);
+	defer_max = READ_ONCE(sysctl_skb_defer_max);
+	if (READ_ONCE(sd->defer_count) >= defer_max)
+		goto nodefer;
+
 	/* We do not send an IPI or any signal.
 	 * Remote cpu will eventually call skb_defer_free_flush()
 	 */
@@ -6515,18 +6521,14 @@ void skb_attempt_defer_free(struct sk_buff *skb)
 	WRITE_ONCE(sd->defer_list, skb);
 	sd->defer_count++;
 
-	/* kick every time queue length reaches 128.
-	 * This should avoid blocking in smp_call_function_single_async().
-	 * This condition should hardly be bit under normal conditions,
-	 * unless cpu suddenly stopped to receive NIC interrupts.
-	 */
-	kick = sd->defer_count == 128;
+	/* Send an IPI every time queue reaches half capacity. */
+	kick = sd->defer_count == (defer_max >> 1);
 
 	spin_unlock_irqrestore(&sd->defer_lock, flags);
 
 	/* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
 	 * if we are unlucky enough (this seems very unlikely).
 	 */
-	if (unlikely(kick))
+	if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
 		smp_call_function_single_async(cpu, &sd->defer_csd);
 }
@@ -578,6 +578,14 @@ static struct ctl_table net_core_table[] = {
 		.extra1		= SYSCTL_ONE,
 		.extra2		= &int_3600,
 	},
+	{
+		.procname	= "skb_defer_max",
+		.data		= &sysctl_skb_defer_max,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+	},
 	{ }
 };
...