Commit 1a3e4d6a authored by Paolo Abeni

Merge branch 'net-provide-smp-threads-for-backlog-napi'

Sebastian Andrzej Siewior says:

====================
net: Provide SMP threads for backlog NAPI

The RPS code and "deferred skb free" both send an IPI/function call
to a remote CPU, in which a softirq is raised. This leads to a warning on
PREEMPT_RT because raising softirqs from a function call led to undesired
behaviour in the past. I had duct tape in RT for the "deferred skb free"
and Wander Lairson Costa reported the RPS case.

This series only provides support for SMP threads for backlog NAPI. I
did not attach a patch to make it the default and remove the IPI-related
code, to avoid confusion. I can post that patch for reference if asked.

The Red Hat performance team was kind enough to provide some testing here.
The series (with the IPI code removed) has been tested and no regression
versus a kernel without the series was found. For testing, iperf3 was used
on a 25G interface provided by the mlx5, i40e or ice driver, with RPS
enabled. I can provide the individual test results if needed.
====================
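
For context, the kick path described in the first paragraph of the cover
letter looks roughly like the condensed sketch below: a per-CPU
call_single_data is queued to the remote CPU and its callback schedules
backlog NAPI there, i.e. a softirq ends up being raised from IPI/function-call
context. This is an illustration with made-up function names, not the exact
upstream code (which lives in net/core/dev.c and net/core/skbuff.c):

/* Condensed sketch of the IPI-based kick that the series replaces. */
#include <linux/netdevice.h>
#include <linux/smp.h>

static void remote_backlog_kick(void *data)
{
	struct softnet_data *sd = data;

	/* Runs on the remote CPU in function-call (IPI) context;
	 * scheduling backlog NAPI here raises NET_RX_SOFTIRQ, which
	 * is what triggers the warning on PREEMPT_RT.
	 */
	__napi_schedule_irqoff(&sd->backlog);
}

static void kick_remote_cpu(struct softnet_data *sd, int cpu)
{
	/* In this sketch, sd->defer_csd would have been initialised
	 * elsewhere with INIT_CSD(&sd->defer_csd, remote_backlog_kick, sd).
	 */
	smp_call_function_single_async(cpu, &sd->defer_csd);
}

With backlog threads enabled, the series routes these kicks to a per-CPU
"backlog_napi/N" kthread instead (see kick_defer_list_purge() and
napi_schedule_rps() in the diff), so no softirq has to be raised from
remote function-call context.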

Link: https://lore.kernel.org/r/20240325074943.289909-1-bigeasy@linutronix.de
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
parents 26f44b70 765b11f8
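
As an orientation for the diff below: on non-PREEMPT_RT kernels the backlog
threads are opt-in via the thread_backlog_napi boot parameter, which flips a
static key; on PREEMPT_RT use_backlog_threads() always returns true. A
condensed view of that gate, mirroring the net/core/dev.c hunk further down:

#include <linux/init.h>
#include <linux/jump_label.h>

#ifndef CONFIG_PREEMPT_RT
static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key);

/* Booting with "thread_backlog_napi" on the kernel command line enables
 * the per-CPU backlog threads before the networking core initialises.
 */
static int __init setup_backlog_napi_threads(char *arg)
{
	static_branch_enable(&use_backlog_threads_key);
	return 0;
}
early_param("thread_backlog_napi", setup_backlog_napi_threads);

static bool use_backlog_threads(void)
{
	return static_branch_unlikely(&use_backlog_threads_key);
}
#else
/* On PREEMPT_RT the threaded backlog is always used. */
static bool use_backlog_threads(void)
{
	return true;
}
#endif

When the gate is enabled, net_dev_init() registers a "backlog_napi/%u"
smpboot thread per CPU (smpboot_register_percpu_thread() near the end of the
dev.c changes) and marks each per-CPU backlog NAPI as threaded, so remote
kicks wake that thread instead of sending an IPI that raises NET_RX_SOFTIRQ.
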
@@ -3287,6 +3287,7 @@ static inline void dev_xmit_recursion_dec(void)
__this_cpu_dec(softnet_data.xmit.recursion);
}
void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu);
void __netif_schedule(struct Qdisc *q);
void netif_schedule_queue(struct netdev_queue *txq);
@@ -78,6 +78,7 @@
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/smpboot.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/string.h>
@@ -197,35 +198,60 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}
static inline void rps_lock_irqsave(struct softnet_data *sd,
unsigned long *flags)
#ifndef CONFIG_PREEMPT_RT
static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key);
static int __init setup_backlog_napi_threads(char *arg)
{
static_branch_enable(&use_backlog_threads_key);
return 0;
}
early_param("thread_backlog_napi", setup_backlog_napi_threads);
static bool use_backlog_threads(void)
{
return static_branch_unlikely(&use_backlog_threads_key);
}
#else
static bool use_backlog_threads(void)
{
if (IS_ENABLED(CONFIG_RPS))
return true;
}
#endif
static inline void backlog_lock_irq_save(struct softnet_data *sd,
unsigned long *flags)
{
if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
local_irq_save(*flags);
}
static inline void rps_lock_irq_disable(struct softnet_data *sd)
static inline void backlog_lock_irq_disable(struct softnet_data *sd)
{
if (IS_ENABLED(CONFIG_RPS))
if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
spin_lock_irq(&sd->input_pkt_queue.lock);
else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
local_irq_disable();
}
static inline void rps_unlock_irq_restore(struct softnet_data *sd,
unsigned long *flags)
static inline void backlog_unlock_irq_restore(struct softnet_data *sd,
unsigned long *flags)
{
if (IS_ENABLED(CONFIG_RPS))
if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
local_irq_restore(*flags);
}
static inline void rps_unlock_irq_enable(struct softnet_data *sd)
static inline void backlog_unlock_irq_enable(struct softnet_data *sd)
{
if (IS_ENABLED(CONFIG_RPS))
if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
spin_unlock_irq(&sd->input_pkt_queue.lock);
else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
local_irq_enable();
@@ -4404,6 +4430,7 @@ EXPORT_SYMBOL(__dev_direct_xmit);
/*************************************************************************
* Receiver routines
*************************************************************************/
static DEFINE_PER_CPU(struct task_struct *, backlog_napi);
unsigned int sysctl_skb_defer_max __read_mostly = 64;
int weight_p __read_mostly = 64; /* old backlog weight */
@@ -4427,18 +4454,16 @@ static inline void ____napi_schedule(struct softnet_data *sd,
*/
thread = READ_ONCE(napi->thread);
if (thread) {
/* Avoid doing set_bit() if the thread is in
* INTERRUPTIBLE state, cause napi_thread_wait()
* makes sure to proceed with napi polling
* if the thread is explicitly woken from here.
*/
if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE)
set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi))
goto use_local_napi;
set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
wake_up_process(thread);
return;
}
}
use_local_napi:
list_add_tail(&napi->poll_list, &sd->poll_list);
WRITE_ONCE(napi->list_owner, smp_processor_id());
/* If not called from net_rx_action()
@@ -4678,6 +4703,11 @@ static void napi_schedule_rps(struct softnet_data *sd)
#ifdef CONFIG_RPS
if (sd != mysd) {
if (use_backlog_threads()) {
__napi_schedule_irqoff(&sd->backlog);
return;
}
sd->rps_ipi_next = mysd->rps_ipi_list;
mysd->rps_ipi_list = sd;
@@ -4692,6 +4722,23 @@ static void napi_schedule_rps(struct softnet_data *sd)
__napi_schedule_irqoff(&mysd->backlog);
}
void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu)
{
unsigned long flags;
if (use_backlog_threads()) {
backlog_lock_irq_save(sd, &flags);
if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
__napi_schedule_irqoff(&sd->backlog);
backlog_unlock_irq_restore(sd, &flags);
} else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) {
smp_call_function_single_async(cpu, &sd->defer_csd);
}
}
#ifdef CONFIG_NET_FLOW_LIMIT
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
@@ -4747,7 +4794,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
reason = SKB_DROP_REASON_NOT_SPECIFIED;
sd = &per_cpu(softnet_data, cpu);
rps_lock_irqsave(sd, &flags);
backlog_lock_irq_save(sd, &flags);
if (!netif_running(skb->dev))
goto drop;
qlen = skb_queue_len(&sd->input_pkt_queue);
@@ -4757,7 +4804,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
enqueue:
__skb_queue_tail(&sd->input_pkt_queue, skb);
input_queue_tail_incr_save(sd, qtail);
rps_unlock_irq_restore(sd, &flags);
backlog_unlock_irq_restore(sd, &flags);
return NET_RX_SUCCESS;
}
@@ -4772,7 +4819,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
drop:
sd->dropped++;
rps_unlock_irq_restore(sd, &flags);
backlog_unlock_irq_restore(sd, &flags);
dev_core_stats_rx_dropped_inc(skb->dev);
kfree_skb_reason(skb, reason);
@@ -5838,7 +5885,7 @@ static void flush_backlog(struct work_struct *work)
local_bh_disable();
sd = this_cpu_ptr(&softnet_data);
rps_lock_irq_disable(sd);
backlog_lock_irq_disable(sd);
skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
if (skb->dev->reg_state == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->input_pkt_queue);
@@ -5846,7 +5893,7 @@ static void flush_backlog(struct work_struct *work)
input_queue_head_incr(sd);
}
}
rps_unlock_irq_enable(sd);
backlog_unlock_irq_enable(sd);
skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
if (skb->dev->reg_state == NETREG_UNREGISTERING) {
@@ -5864,14 +5911,14 @@ static bool flush_required(int cpu)
struct softnet_data *sd = &per_cpu(softnet_data, cpu);
bool do_flush;
rps_lock_irq_disable(sd);
backlog_lock_irq_disable(sd);
/* as insertion into process_queue happens with the rps lock held,
* process_queue access may race only with dequeue
*/
do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
!skb_queue_empty_lockless(&sd->process_queue);
rps_unlock_irq_enable(sd);
backlog_unlock_irq_enable(sd);
return do_flush;
#endif
@@ -5937,7 +5984,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
#ifdef CONFIG_RPS
struct softnet_data *remsd = sd->rps_ipi_list;
if (remsd) {
if (!use_backlog_threads() && remsd) {
sd->rps_ipi_list = NULL;
local_irq_enable();
@@ -5952,7 +5999,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
return sd->rps_ipi_list != NULL;
return !use_backlog_threads() && sd->rps_ipi_list;
#else
return false;
#endif
@@ -5986,7 +6033,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
}
rps_lock_irq_disable(sd);
backlog_lock_irq_disable(sd);
if (skb_queue_empty(&sd->input_pkt_queue)) {
/*
* Inline a custom version of __napi_complete().
@@ -5996,13 +6043,13 @@ static int process_backlog(struct napi_struct *napi, int quota)
* We can use a plain write instead of clear_bit(),
* and we dont need an smp_mb() memory barrier.
*/
napi->state = 0;
napi->state &= NAPIF_STATE_THREADED;
again = false;
} else {
skb_queue_splice_tail_init(&sd->input_pkt_queue,
&sd->process_queue);
}
rps_unlock_irq_enable(sd);
backlog_unlock_irq_enable(sd);
}
return work;
@@ -6710,8 +6757,6 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
static int napi_thread_wait(struct napi_struct *napi)
{
bool woken = false;
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop()) {
@@ -6720,15 +6765,13 @@ static int napi_thread_wait(struct napi_struct *napi)
* Testing SCHED bit is not enough because SCHED bit might be
* set by some other busy poll thread or by napi_disable().
*/
if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) {
WARN_ON(!list_empty(&napi->poll_list));
__set_current_state(TASK_RUNNING);
return 0;
}
schedule();
/* woken being true indicates this thread owns this napi. */
woken = true;
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
@@ -6736,43 +6779,48 @@ static int napi_thread_wait(struct napi_struct *napi)
return -1;
}
static int napi_threaded_poll(void *data)
static void napi_threaded_poll_loop(struct napi_struct *napi)
{
struct napi_struct *napi = data;
struct softnet_data *sd;
void *have;
unsigned long last_qs = jiffies;
while (!napi_thread_wait(napi)) {
unsigned long last_qs = jiffies;
for (;;) {
bool repoll = false;
for (;;) {
bool repoll = false;
void *have;
local_bh_disable();
sd = this_cpu_ptr(&softnet_data);
sd->in_napi_threaded_poll = true;
local_bh_disable();
sd = this_cpu_ptr(&softnet_data);
sd->in_napi_threaded_poll = true;
have = netpoll_poll_lock(napi);
__napi_poll(napi, &repoll);
netpoll_poll_unlock(have);
have = netpoll_poll_lock(napi);
__napi_poll(napi, &repoll);
netpoll_poll_unlock(have);
sd->in_napi_threaded_poll = false;
barrier();
sd->in_napi_threaded_poll = false;
barrier();
if (sd_has_rps_ipi_waiting(sd)) {
local_irq_disable();
net_rps_action_and_irq_enable(sd);
}
skb_defer_free_flush(sd);
local_bh_enable();
if (sd_has_rps_ipi_waiting(sd)) {
local_irq_disable();
net_rps_action_and_irq_enable(sd);
}
skb_defer_free_flush(sd);
local_bh_enable();
if (!repoll)
break;
if (!repoll)
break;
rcu_softirq_qs_periodic(last_qs);
cond_resched();
}
rcu_softirq_qs_periodic(last_qs);
cond_resched();
}
}
static int napi_threaded_poll(void *data)
{
struct napi_struct *napi = data;
while (!napi_thread_wait(napi))
napi_threaded_poll_loop(napi);
return 0;
}
@@ -11373,7 +11421,7 @@ static int dev_cpu_dead(unsigned int oldcpu)
list_del_init(&napi->poll_list);
if (napi->poll == process_backlog)
napi->state = 0;
napi->state &= NAPIF_STATE_THREADED;
else
____napi_schedule(sd, napi);
}
@@ -11381,12 +11429,14 @@ static int dev_cpu_dead(unsigned int oldcpu)
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_enable();
if (!use_backlog_threads()) {
#ifdef CONFIG_RPS
remsd = oldsd->rps_ipi_list;
oldsd->rps_ipi_list = NULL;
remsd = oldsd->rps_ipi_list;
oldsd->rps_ipi_list = NULL;
#endif
/* send out pending IPI's on offline CPU */
net_rps_send_ipi(remsd);
/* send out pending IPI's on offline CPU */
net_rps_send_ipi(remsd);
}
/* Process offline CPU's input_pkt_queue */
while ((skb = __skb_dequeue(&oldsd->process_queue))) {
@@ -11725,6 +11775,38 @@ static int net_page_pool_create(int cpuid)
return 0;
}
static int backlog_napi_should_run(unsigned int cpu)
{
struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
struct napi_struct *napi = &sd->backlog;
return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
}
static void run_backlog_napi(unsigned int cpu)
{
struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
napi_threaded_poll_loop(&sd->backlog);
}
static void backlog_napi_setup(unsigned int cpu)
{
struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
struct napi_struct *napi = &sd->backlog;
napi->thread = this_cpu_read(backlog_napi);
set_bit(NAPI_STATE_THREADED, &napi->state);
}
static struct smp_hotplug_thread backlog_threads = {
.store = &backlog_napi,
.thread_should_run = backlog_napi_should_run,
.thread_fn = run_backlog_napi,
.thread_comm = "backlog_napi/%u",
.setup = backlog_napi_setup,
};
/*
* This is called single threaded during boot, so no need
* to take the rtnl semaphore.
@@ -11776,10 +11858,13 @@ static int __init net_dev_init(void)
init_gro_hash(&sd->backlog);
sd->backlog.poll = process_backlog;
sd->backlog.weight = weight_p;
INIT_LIST_HEAD(&sd->backlog.poll_list);
if (net_page_pool_create(i))
goto out;
}
if (use_backlog_threads())
smpboot_register_percpu_thread(&backlog_threads);
dev_boot_phase = 0;
@@ -7039,8 +7039,8 @@ nodefer: __kfree_skb(skb);
/* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
* if we are unlucky enough (this seems very unlikely).
*/
if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
smp_call_function_single_async(cpu, &sd->defer_csd);
if (unlikely(kick))
kick_defer_list_purge(sd, cpu);
}
static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,