Commit 4373a5e2 authored by David S. Miller

Merge branch 'packet-DDOS'

Eric Dumazet says:

====================
net/packet: better behavior under DDOS

Using tcpdump (or other af_packet user) on a busy host can lead to
catastrophic consequences, because suddenly, potentially all cpus
are spinning on a contended spinlock.

Both packet_rcv() and tpacket_rcv() grab the spinlock
to eventually find there is no room for an additional packet.

This patch series aligns packet_rcv() and tpacket_rcv() so that both
check whether the queue is full before grabbing the spinlock.

If the queue is full, they both increment a new atomic counter
placed on a separate cache line to let readers drain the queue faster.

There is still false sharing on this new atomic counter,
we might in the future make it per cpu if there is interest.
====================
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parents f30e33bc 9bb6cd65
...@@ -384,7 +384,7 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status) ...@@ -384,7 +384,7 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
smp_wmb(); smp_wmb();
} }
static int __packet_get_status(struct packet_sock *po, void *frame) static int __packet_get_status(const struct packet_sock *po, void *frame)
{ {
union tpacket_uhdr h; union tpacket_uhdr h;
...@@ -460,10 +460,10 @@ static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame, ...@@ -460,10 +460,10 @@ static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
return ts_status; return ts_status;
} }
static void *packet_lookup_frame(struct packet_sock *po, static void *packet_lookup_frame(const struct packet_sock *po,
struct packet_ring_buffer *rb, const struct packet_ring_buffer *rb,
unsigned int position, unsigned int position,
int status) int status)
{ {
unsigned int pg_vec_pos, frame_offset; unsigned int pg_vec_pos, frame_offset;
union tpacket_uhdr h; union tpacket_uhdr h;
...@@ -758,7 +758,7 @@ static void prb_close_block(struct tpacket_kbdq_core *pkc1, ...@@ -758,7 +758,7 @@ static void prb_close_block(struct tpacket_kbdq_core *pkc1,
struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
struct sock *sk = &po->sk; struct sock *sk = &po->sk;
if (po->stats.stats3.tp_drops) if (atomic_read(&po->tp_drops))
status |= TP_STATUS_LOSING; status |= TP_STATUS_LOSING;
last_pkt = (struct tpacket3_hdr *)pkc1->prev; last_pkt = (struct tpacket3_hdr *)pkc1->prev;
...@@ -1082,10 +1082,10 @@ static void *packet_current_rx_frame(struct packet_sock *po, ...@@ -1082,10 +1082,10 @@ static void *packet_current_rx_frame(struct packet_sock *po,
} }
} }
static void *prb_lookup_block(struct packet_sock *po, static void *prb_lookup_block(const struct packet_sock *po,
struct packet_ring_buffer *rb, const struct packet_ring_buffer *rb,
unsigned int idx, unsigned int idx,
int status) int status)
{ {
struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx); struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
...@@ -1198,12 +1198,12 @@ static void packet_free_pending(struct packet_sock *po) ...@@ -1198,12 +1198,12 @@ static void packet_free_pending(struct packet_sock *po)
#define ROOM_LOW 0x1 #define ROOM_LOW 0x1
#define ROOM_NORMAL 0x2 #define ROOM_NORMAL 0x2
static bool __tpacket_has_room(struct packet_sock *po, int pow_off) static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
{ {
int idx, len; int idx, len;
len = po->rx_ring.frame_max + 1; len = READ_ONCE(po->rx_ring.frame_max) + 1;
idx = po->rx_ring.head; idx = READ_ONCE(po->rx_ring.head);
if (pow_off) if (pow_off)
idx += len >> pow_off; idx += len >> pow_off;
if (idx >= len) if (idx >= len)
...@@ -1211,12 +1211,12 @@ static bool __tpacket_has_room(struct packet_sock *po, int pow_off) ...@@ -1211,12 +1211,12 @@ static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL); return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
} }
static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off) static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
{ {
int idx, len; int idx, len;
len = po->rx_ring.prb_bdqc.knum_blocks; len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
idx = po->rx_ring.prb_bdqc.kactive_blk_num; idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
if (pow_off) if (pow_off)
idx += len >> pow_off; idx += len >> pow_off;
if (idx >= len) if (idx >= len)
...@@ -1224,15 +1224,18 @@ static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off) ...@@ -1224,15 +1224,18 @@ static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL); return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
} }
static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) static int __packet_rcv_has_room(const struct packet_sock *po,
const struct sk_buff *skb)
{ {
struct sock *sk = &po->sk; const struct sock *sk = &po->sk;
int ret = ROOM_NONE; int ret = ROOM_NONE;
if (po->prot_hook.func != tpacket_rcv) { if (po->prot_hook.func != tpacket_rcv) {
int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc) int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
- (skb ? skb->truesize : 0); int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF)) - (skb ? skb->truesize : 0);
if (avail > (rcvbuf >> ROOM_POW_OFF))
return ROOM_NORMAL; return ROOM_NORMAL;
else if (avail > 0) else if (avail > 0)
return ROOM_LOW; return ROOM_LOW;
...@@ -1257,19 +1260,24 @@ static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) ...@@ -1257,19 +1260,24 @@ static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{ {
int ret; int pressure, ret;
bool has_room;
spin_lock_bh(&po->sk.sk_receive_queue.lock);
ret = __packet_rcv_has_room(po, skb); ret = __packet_rcv_has_room(po, skb);
has_room = ret == ROOM_NORMAL; pressure = ret != ROOM_NORMAL;
if (po->pressure == has_room)
po->pressure = !has_room; if (READ_ONCE(po->pressure) != pressure)
spin_unlock_bh(&po->sk.sk_receive_queue.lock); WRITE_ONCE(po->pressure, pressure);
return ret; return ret;
} }
static void packet_rcv_try_clear_pressure(struct packet_sock *po)
{
if (READ_ONCE(po->pressure) &&
__packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
WRITE_ONCE(po->pressure, 0);
}
static void packet_sock_destruct(struct sock *sk) static void packet_sock_destruct(struct sock *sk)
{ {
skb_queue_purge(&sk->sk_error_queue); skb_queue_purge(&sk->sk_error_queue);
...@@ -1350,7 +1358,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, ...@@ -1350,7 +1358,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f,
i = j = min_t(int, po->rollover->sock, num - 1); i = j = min_t(int, po->rollover->sock, num - 1);
do { do {
po_next = pkt_sk(f->arr[i]); po_next = pkt_sk(f->arr[i]);
if (po_next != po_skip && !po_next->pressure && if (po_next != po_skip && !READ_ONCE(po_next->pressure) &&
packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) { packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
if (i != j) if (i != j)
po->rollover->sock = i; po->rollover->sock = i;
...@@ -2125,10 +2133,8 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, ...@@ -2125,10 +2133,8 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
drop_n_acct: drop_n_acct:
is_drop_n_account = true; is_drop_n_account = true;
spin_lock(&sk->sk_receive_queue.lock); atomic_inc(&po->tp_drops);
po->stats.stats1.tp_drops++;
atomic_inc(&sk->sk_drops); atomic_inc(&sk->sk_drops);
spin_unlock(&sk->sk_receive_queue.lock);
drop_n_restore: drop_n_restore:
if (skb_head != skb->data && skb_shared(skb)) { if (skb_head != skb->data && skb_shared(skb)) {
...@@ -2192,6 +2198,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, ...@@ -2192,6 +2198,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
if (!res) if (!res)
goto drop_n_restore; goto drop_n_restore;
/* If we are flooded, just give up */
if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
atomic_inc(&po->tp_drops);
goto drop_n_restore;
}
if (skb->ip_summed == CHECKSUM_PARTIAL) if (skb->ip_summed == CHECKSUM_PARTIAL)
status |= TP_STATUS_CSUMNOTREADY; status |= TP_STATUS_CSUMNOTREADY;
else if (skb->pkt_type != PACKET_OUTGOING && else if (skb->pkt_type != PACKET_OUTGOING &&
...@@ -2262,7 +2274,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, ...@@ -2262,7 +2274,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
* Anyways, moving it for V1/V2 only as V3 doesn't need this * Anyways, moving it for V1/V2 only as V3 doesn't need this
* at packet level. * at packet level.
*/ */
if (po->stats.stats1.tp_drops) if (atomic_read(&po->tp_drops))
status |= TP_STATUS_LOSING; status |= TP_STATUS_LOSING;
} }
...@@ -2378,9 +2390,9 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, ...@@ -2378,9 +2390,9 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
return 0; return 0;
drop_n_account: drop_n_account:
is_drop_n_account = true;
po->stats.stats1.tp_drops++;
spin_unlock(&sk->sk_receive_queue.lock); spin_unlock(&sk->sk_receive_queue.lock);
atomic_inc(&po->tp_drops);
is_drop_n_account = true;
sk->sk_data_ready(sk); sk->sk_data_ready(sk);
kfree_skb(copy_skb); kfree_skb(copy_skb);
...@@ -3303,8 +3315,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, ...@@ -3303,8 +3315,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
if (skb == NULL) if (skb == NULL)
goto out; goto out;
if (pkt_sk(sk)->pressure) packet_rcv_try_clear_pressure(pkt_sk(sk));
packet_rcv_has_room(pkt_sk(sk), NULL);
if (pkt_sk(sk)->has_vnet_hdr) { if (pkt_sk(sk)->has_vnet_hdr) {
err = packet_rcv_vnet(msg, skb, &len); err = packet_rcv_vnet(msg, skb, &len);
...@@ -3876,6 +3887,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, ...@@ -3876,6 +3887,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
void *data = &val; void *data = &val;
union tpacket_stats_u st; union tpacket_stats_u st;
struct tpacket_rollover_stats rstats; struct tpacket_rollover_stats rstats;
int drops;
if (level != SOL_PACKET) if (level != SOL_PACKET)
return -ENOPROTOOPT; return -ENOPROTOOPT;
...@@ -3892,14 +3904,17 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, ...@@ -3892,14 +3904,17 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
memcpy(&st, &po->stats, sizeof(st)); memcpy(&st, &po->stats, sizeof(st));
memset(&po->stats, 0, sizeof(po->stats)); memset(&po->stats, 0, sizeof(po->stats));
spin_unlock_bh(&sk->sk_receive_queue.lock); spin_unlock_bh(&sk->sk_receive_queue.lock);
drops = atomic_xchg(&po->tp_drops, 0);
if (po->tp_version == TPACKET_V3) { if (po->tp_version == TPACKET_V3) {
lv = sizeof(struct tpacket_stats_v3); lv = sizeof(struct tpacket_stats_v3);
st.stats3.tp_packets += st.stats3.tp_drops; st.stats3.tp_drops = drops;
st.stats3.tp_packets += drops;
data = &st.stats3; data = &st.stats3;
} else { } else {
lv = sizeof(struct tpacket_stats); lv = sizeof(struct tpacket_stats);
st.stats1.tp_packets += st.stats1.tp_drops; st.stats1.tp_drops = drops;
st.stats1.tp_packets += drops;
data = &st.stats1; data = &st.stats1;
} }
...@@ -4118,8 +4133,7 @@ static __poll_t packet_poll(struct file *file, struct socket *sock, ...@@ -4118,8 +4133,7 @@ static __poll_t packet_poll(struct file *file, struct socket *sock,
TP_STATUS_KERNEL)) TP_STATUS_KERNEL))
mask |= EPOLLIN | EPOLLRDNORM; mask |= EPOLLIN | EPOLLRDNORM;
} }
if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) packet_rcv_try_clear_pressure(po);
po->pressure = 0;
spin_unlock_bh(&sk->sk_receive_queue.lock); spin_unlock_bh(&sk->sk_receive_queue.lock);
spin_lock_bh(&sk->sk_write_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock);
if (po->tx_ring.pg_vec) { if (po->tx_ring.pg_vec) {
......
...@@ -131,6 +131,7 @@ struct packet_sock { ...@@ -131,6 +131,7 @@ struct packet_sock {
struct net_device __rcu *cached_dev; struct net_device __rcu *cached_dev;
int (*xmit)(struct sk_buff *skb); int (*xmit)(struct sk_buff *skb);
struct packet_type prot_hook ____cacheline_aligned_in_smp; struct packet_type prot_hook ____cacheline_aligned_in_smp;
atomic_t tp_drops ____cacheline_aligned_in_smp;
}; };
static struct packet_sock *pkt_sk(struct sock *sk) static struct packet_sock *pkt_sk(struct sock *sk)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment