Commit 82a678e2 authored by David S. Miller

Merge branch 'bridge-mdb-events'

Tobias Waldekranz says:

====================
net: bridge: switchdev: Ensure MDB events are delivered exactly once

When a device is attached to a bridge, drivers will request a replay
of objects that were created before the device joined the bridge and
that are still of interest to the joining port. Typical examples include
FDB entries and MDB memberships on other ports ("foreign interfaces")
or on the bridge itself.

Conversely, when a device is detached, the bridge will synthesize
deletion events for all those objects that are still live, but no
longer applicable to the device in question.
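
(For context, the replay on join and the synthesized deletions on leave
are driven by the switchdev bridge port offload/unoffload calls. The
sketch below is illustrative only; the "foo_" driver names are made up
and not part of this series.)

  #include <linux/if_bridge.h>
  #include <linux/netdevice.h>
  #include <net/switchdev.h>

  struct foo_port {
  	struct net_device *dev;
  };

  /* .notifier_call handlers for SWITCHDEV_* events would be set here. */
  static struct notifier_block foo_switchdev_nb;
  static struct notifier_block foo_switchdev_blocking_nb;

  /* Joining: the bridge replays FDB/MDB/VLAN objects that predate the join. */
  static int foo_port_join_bridge(struct foo_port *port,
  				  struct net_device *brport_dev,
  				  struct netlink_ext_ack *extack)
  {
  	return switchdev_bridge_port_offload(brport_dev, port->dev, port,
  					     &foo_switchdev_nb,
  					     &foo_switchdev_blocking_nb,
  					     false, extack);
  }

  /* Leaving: the bridge synthesizes deletions for objects that are still
   * live but no longer applicable to this port.
   */
  static void foo_port_leave_bridge(struct foo_port *port,
  				    struct net_device *brport_dev)
  {
  	switchdev_bridge_port_unoffload(brport_dev, port,
  					&foo_switchdev_nb,
  					&foo_switchdev_blocking_nb);
  }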

This series eliminates two races in the syncing and unsyncing of a
bridge's MDB with a joining or leaving device, which could cause
notifications of such objects to be either delivered twice (1/2), or
not at all (2/2).
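
(The first race, concretely: an MDB entry created by IGMP/MLD snooping
just before the join may still be sitting in switchdev's deferred queue
when the replay walks br->mdb_list, so the driver would see the same
addition twice. Condensed from the br_switchdev_mdb_queue_one() hunk
further down, the guard added in 1/2 is roughly:)

  	br_switchdev_mdb_populate(&mdb, mp);

  	if (action == SWITCHDEV_PORT_OBJ_ADD &&
  	    switchdev_port_obj_act_is_deferred(dev, action, &mdb.obj)) {
  		/* An equivalent add is already pending in switchdev's
  		 * deferred queue; replaying it would deliver it twice.
  		 */
  		return 0;
  	}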

A similar race to the one solved by 1/2 still remains for the
FDB. This is much harder to solve, due to the lockless operation of
the FDB's rhashtable, and is therefore knowingly left out of this
series.

v1 -> v2:
- Squash the previously separate addition of
  switchdev_port_obj_act_is_deferred into first consumer.
- Use ether_addr_equal to compare MAC addresses.
- Document switchdev_port_obj_act_is_deferred (renamed from
  switchdev_port_obj_is_deferred in v1, to indicate that we also match
  on the action).
- Delay allocations of MDB objects until we know they're needed.
- Use non-RCU version of the hash list iterator, now that the MDB is
  not scanned while holding the RCU read lock.
- Add Fixes tag to commit message

v2 -> v3:
- Fix unlocking in error paths
- Access RCU protected port list via mlock_dereference, since MDB is
  guaranteed to remain constant for the duration of the scan.

v3 -> v4:
- Limit the search for existing deferred events in 1/2 to only apply to
  additions, since the problem does not exist in the deletion case.
- Add 2/2, to plug a related race when unoffloading an indirectly
  associated device.

v4 -> v5:
- Fix grammatical errors in kerneldoc of
  switchdev_port_obj_act_is_deferred
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents b4ea9b6a f7a70d65

--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -308,6 +308,9 @@ void switchdev_deferred_process(void);
 int switchdev_port_attr_set(struct net_device *dev,
			    const struct switchdev_attr *attr,
			    struct netlink_ext_ack *extack);
+bool switchdev_port_obj_act_is_deferred(struct net_device *dev,
+					enum switchdev_notifier_type nt,
+					const struct switchdev_obj *obj);
 int switchdev_port_obj_add(struct net_device *dev,
			   const struct switchdev_obj *obj,
			   struct netlink_ext_ack *extack);
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -595,21 +595,40 @@ br_switchdev_mdb_replay_one(struct notifier_block *nb, struct net_device *dev,
 }
 
 static int br_switchdev_mdb_queue_one(struct list_head *mdb_list,
+				      struct net_device *dev,
+				      unsigned long action,
 				      enum switchdev_obj_id id,
 				      const struct net_bridge_mdb_entry *mp,
 				      struct net_device *orig_dev)
 {
-	struct switchdev_obj_port_mdb *mdb;
+	struct switchdev_obj_port_mdb mdb = {
+		.obj = {
+			.id = id,
+			.orig_dev = orig_dev,
+		},
+	};
+	struct switchdev_obj_port_mdb *pmdb;
 
-	mdb = kzalloc(sizeof(*mdb), GFP_ATOMIC);
-	if (!mdb)
-		return -ENOMEM;
+	br_switchdev_mdb_populate(&mdb, mp);
+
+	if (action == SWITCHDEV_PORT_OBJ_ADD &&
+	    switchdev_port_obj_act_is_deferred(dev, action, &mdb.obj)) {
+		/* This event is already in the deferred queue of
+		 * events, so this replay must be elided, lest the
+		 * driver receives duplicate events for it. This can
+		 * only happen when replaying additions, since
+		 * modifications are always immediately visible in
+		 * br->mdb_list, whereas actual event delivery may be
+		 * delayed.
+		 */
+		return 0;
+	}
 
-	mdb->obj.id = id;
-	mdb->obj.orig_dev = orig_dev;
-	br_switchdev_mdb_populate(mdb, mp);
-	list_add_tail(&mdb->obj.list, mdb_list);
+	pmdb = kmemdup(&mdb, sizeof(mdb), GFP_ATOMIC);
+	if (!pmdb)
+		return -ENOMEM;
 
+	list_add_tail(&pmdb->obj.list, mdb_list);
 	return 0;
 }
 
@@ -677,51 +696,50 @@ br_switchdev_mdb_replay(struct net_device *br_dev, struct net_device *dev,
 	if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
 		return 0;
 
-	/* We cannot walk over br->mdb_list protected just by the rtnl_mutex,
-	 * because the write-side protection is br->multicast_lock. But we
-	 * need to emulate the [ blocking ] calling context of a regular
-	 * switchdev event, so since both br->multicast_lock and RCU read side
-	 * critical sections are atomic, we have no choice but to pick the RCU
-	 * read side lock, queue up all our events, leave the critical section
-	 * and notify switchdev from blocking context.
+	if (adding)
+		action = SWITCHDEV_PORT_OBJ_ADD;
+	else
+		action = SWITCHDEV_PORT_OBJ_DEL;
+
+	/* br_switchdev_mdb_queue_one() will take care to not queue a
+	 * replay of an event that is already pending in the switchdev
+	 * deferred queue. In order to safely determine that, there
+	 * must be no new deferred MDB notifications enqueued for the
+	 * duration of the MDB scan. Therefore, grab the write-side
+	 * lock to avoid racing with any concurrent IGMP/MLD snooping.
 	 */
-	rcu_read_lock();
+	spin_lock_bh(&br->multicast_lock);
 
-	hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) {
+	hlist_for_each_entry(mp, &br->mdb_list, mdb_node) {
 		struct net_bridge_port_group __rcu * const *pp;
 		const struct net_bridge_port_group *p;
 
 		if (mp->host_joined) {
-			err = br_switchdev_mdb_queue_one(&mdb_list,
+			err = br_switchdev_mdb_queue_one(&mdb_list, dev, action,
 							 SWITCHDEV_OBJ_ID_HOST_MDB,
 							 mp, br_dev);
 			if (err) {
-				rcu_read_unlock();
+				spin_unlock_bh(&br->multicast_lock);
 				goto out_free_mdb;
 			}
 		}
 
-		for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL;
+		for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL;
 		     pp = &p->next) {
 			if (p->key.port->dev != dev)
 				continue;
 
-			err = br_switchdev_mdb_queue_one(&mdb_list,
+			err = br_switchdev_mdb_queue_one(&mdb_list, dev, action,
 							 SWITCHDEV_OBJ_ID_PORT_MDB,
 							 mp, dev);
 			if (err) {
-				rcu_read_unlock();
+				spin_unlock_bh(&br->multicast_lock);
 				goto out_free_mdb;
 			}
 		}
 	}
 
-	rcu_read_unlock();
-
-	if (adding)
-		action = SWITCHDEV_PORT_OBJ_ADD;
-	else
-		action = SWITCHDEV_PORT_OBJ_DEL;
+	spin_unlock_bh(&br->multicast_lock);
 
 	list_for_each_entry(obj, &mdb_list, list) {
 		err = br_switchdev_mdb_replay_one(nb, dev,
@@ -786,6 +804,16 @@ static void nbp_switchdev_unsync_objs(struct net_bridge_port *p,
 	br_switchdev_mdb_replay(br_dev, dev, ctx, false, blocking_nb, NULL);
 
 	br_switchdev_vlan_replay(br_dev, ctx, false, blocking_nb, NULL);
+
+	/* Make sure that the device leaving this bridge has seen all
+	 * relevant events before it is disassociated. In the normal
+	 * case, when the device is directly attached to the bridge,
+	 * this is covered by del_nbp(). If the association was indirect
+	 * however, e.g. via a team or bond, and the device is leaving
+	 * that intermediate device, then the bridge port remains in
+	 * place.
+	 */
+	switchdev_deferred_process();
 }
 
 /* Let the bridge know that this port is offloaded, so that it can assign a
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -19,6 +19,35 @@
 #include <linux/rtnetlink.h>
 #include <net/switchdev.h>
 
+static bool switchdev_obj_eq(const struct switchdev_obj *a,
+			     const struct switchdev_obj *b)
+{
+	const struct switchdev_obj_port_vlan *va, *vb;
+	const struct switchdev_obj_port_mdb *ma, *mb;
+
+	if (a->id != b->id || a->orig_dev != b->orig_dev)
+		return false;
+
+	switch (a->id) {
+	case SWITCHDEV_OBJ_ID_PORT_VLAN:
+		va = SWITCHDEV_OBJ_PORT_VLAN(a);
+		vb = SWITCHDEV_OBJ_PORT_VLAN(b);
+		return va->flags == vb->flags &&
+			va->vid == vb->vid &&
+			va->changed == vb->changed;
+	case SWITCHDEV_OBJ_ID_PORT_MDB:
+	case SWITCHDEV_OBJ_ID_HOST_MDB:
+		ma = SWITCHDEV_OBJ_PORT_MDB(a);
+		mb = SWITCHDEV_OBJ_PORT_MDB(b);
+		return ma->vid == mb->vid &&
+			ether_addr_equal(ma->addr, mb->addr);
+	default:
+		break;
+	}
+
+	BUG();
+}
+
 static LIST_HEAD(deferred);
 static DEFINE_SPINLOCK(deferred_lock);
 
@@ -307,6 +336,50 @@ int switchdev_port_obj_del(struct net_device *dev,
 }
 EXPORT_SYMBOL_GPL(switchdev_port_obj_del);
 
+/**
+ * switchdev_port_obj_act_is_deferred - Is object action pending?
+ *
+ * @dev: port device
+ * @nt: type of action; add or delete
+ * @obj: object to test
+ *
+ * Returns true if a deferred item is pending, which is
+ * equivalent to the action @nt on an object @obj.
+ *
+ * rtnl_lock must be held.
+ */
+bool switchdev_port_obj_act_is_deferred(struct net_device *dev,
+					enum switchdev_notifier_type nt,
+					const struct switchdev_obj *obj)
+{
+	struct switchdev_deferred_item *dfitem;
+	bool found = false;
+
+	ASSERT_RTNL();
+
+	spin_lock_bh(&deferred_lock);
+
+	list_for_each_entry(dfitem, &deferred, list) {
+		if (dfitem->dev != dev)
+			continue;
+
+		if ((dfitem->func == switchdev_port_obj_add_deferred &&
+		     nt == SWITCHDEV_PORT_OBJ_ADD) ||
+		    (dfitem->func == switchdev_port_obj_del_deferred &&
+		     nt == SWITCHDEV_PORT_OBJ_DEL)) {
+			if (switchdev_obj_eq((const void *)dfitem->data, obj)) {
+				found = true;
+				break;
+			}
+		}
+	}
+
+	spin_unlock_bh(&deferred_lock);
+
+	return found;
+}
+EXPORT_SYMBOL_GPL(switchdev_port_obj_act_is_deferred);
+
 static ATOMIC_NOTIFIER_HEAD(switchdev_notif_chain);
 static BLOCKING_NOTIFIER_HEAD(switchdev_blocking_notif_chain);
 