Commit fd258f2a authored by David S. Miller

Merge branch 'udp-pernetns-hash'

Kuniyuki Iwashima says:

====================
udp: Introduce optional per-netns hash table.

This series is the UDP version of the per-netns ehash series [0],
which were initially in the same patch set. [1]

The notable difference with TCP is the max table size is 64K and the min
size is 128.  This is because the possible hash range by udp_hashfn()
always fits in 64K within the same netns and because we want to keep a
bitmap in udp_lib_get_port() on the stack.  Also, the UDP per-netns table
isolates both 1-tuple and 2-tuple tables.

For details, please see the last patch.

  patch 1 - 4: prep for per-netns hash table
  patch     5: add per-netns hash table

[0]: https://lore.kernel.org/netdev/20220908011022.45342-1-kuniyu@amazon.com/
[1]: https://lore.kernel.org/netdev/20220826000445.46552-1-kuniyu@amazon.com/
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents e8822565 9804985b
...@@ -1177,6 +1177,33 @@ udp_rmem_min - INTEGER ...@@ -1177,6 +1177,33 @@ udp_rmem_min - INTEGER
udp_wmem_min - INTEGER udp_wmem_min - INTEGER
UDP does not have tx memory accounting and this tunable has no effect. UDP does not have tx memory accounting and this tunable has no effect.
udp_hash_entries - INTEGER
Show the number of hash buckets for UDP sockets in the current
networking namespace.
A negative value means the networking namespace does not own its
hash buckets and shares the initial networking namespace's one.
udp_child_hash_entries - INTEGER
Control the number of hash buckets for UDP sockets in the child
networking namespace, which must be set before clone() or unshare().
If the value is not 0, the kernel uses a value rounded up to 2^n
as the actual hash bucket size. 0 is a special value, meaning
the child networking namespace will share the initial networking
namespace's hash buckets.
Note that the child will use the global one in case the kernel
fails to allocate enough memory. In addition, the global hash
buckets are spread over available NUMA nodes, but the allocation
of the child hash table depends on the current process's NUMA
policy, which could result in performance differences.
Possible values: 0, 2^n (n: 7 (128) - 16 (64K))
Default: 0
RAW variables RAW variables
============= =============
......
...@@ -23,7 +23,9 @@ static inline struct udphdr *udp_hdr(const struct sk_buff *skb) ...@@ -23,7 +23,9 @@ static inline struct udphdr *udp_hdr(const struct sk_buff *skb)
return (struct udphdr *)skb_transport_header(skb); return (struct udphdr *)skb_transport_header(skb);
} }
#define UDP_HTABLE_SIZE_MIN_PERNET 128
#define UDP_HTABLE_SIZE_MIN (CONFIG_BASE_SMALL ? 128 : 256) #define UDP_HTABLE_SIZE_MIN (CONFIG_BASE_SMALL ? 128 : 256)
#define UDP_HTABLE_SIZE_MAX 65536
static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask) static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask)
{ {
......
...@@ -43,6 +43,7 @@ struct tcp_fastopen_context; ...@@ -43,6 +43,7 @@ struct tcp_fastopen_context;
struct netns_ipv4 { struct netns_ipv4 {
struct inet_timewait_death_row tcp_death_row; struct inet_timewait_death_row tcp_death_row;
struct udp_table *udp_table;
#ifdef CONFIG_SYSCTL #ifdef CONFIG_SYSCTL
struct ctl_table_header *forw_hdr; struct ctl_table_header *forw_hdr;
...@@ -207,6 +208,8 @@ struct netns_ipv4 { ...@@ -207,6 +208,8 @@ struct netns_ipv4 {
atomic_t dev_addr_genid; atomic_t dev_addr_genid;
unsigned int sysctl_udp_child_hash_entries;
#ifdef CONFIG_SYSCTL #ifdef CONFIG_SYSCTL
unsigned long *sysctl_local_reserved_ports; unsigned long *sysctl_local_reserved_ports;
int sysctl_ip_prot_sock; int sysctl_ip_prot_sock;
......
...@@ -6432,7 +6432,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, ...@@ -6432,7 +6432,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
else else
sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
dst4, tuple->ipv4.dport, dst4, tuple->ipv4.dport,
dif, sdif, &udp_table, NULL); dif, sdif, net->ipv4.udp_table, NULL);
#if IS_ENABLED(CONFIG_IPV6) #if IS_ENABLED(CONFIG_IPV6)
} else { } else {
struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
...@@ -6448,7 +6448,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, ...@@ -6448,7 +6448,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
src6, tuple->ipv6.sport, src6, tuple->ipv6.sport,
dst6, tuple->ipv6.dport, dst6, tuple->ipv6.dport,
dif, sdif, dif, sdif,
&udp_table, NULL); net->ipv4.udp_table, NULL);
#endif #endif
} }
......
...@@ -40,6 +40,7 @@ static int one_day_secs = 24 * 3600; ...@@ -40,6 +40,7 @@ static int one_day_secs = 24 * 3600;
static u32 fib_multipath_hash_fields_all_mask __maybe_unused = static u32 fib_multipath_hash_fields_all_mask __maybe_unused =
FIB_MULTIPATH_HASH_FIELD_ALL_MASK; FIB_MULTIPATH_HASH_FIELD_ALL_MASK;
static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024; static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024;
static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
static int tcp_plb_max_rounds = 31; static int tcp_plb_max_rounds = 31;
static int tcp_plb_max_cong_thresh = 256; static int tcp_plb_max_cong_thresh = 256;
...@@ -402,12 +403,36 @@ static int proc_tcp_ehash_entries(struct ctl_table *table, int write, ...@@ -402,12 +403,36 @@ static int proc_tcp_ehash_entries(struct ctl_table *table, int write,
if (!net_eq(net, &init_net) && !hinfo->pernet) if (!net_eq(net, &init_net) && !hinfo->pernet)
tcp_ehash_entries *= -1; tcp_ehash_entries *= -1;
memset(&tbl, 0, sizeof(tbl));
tbl.data = &tcp_ehash_entries; tbl.data = &tcp_ehash_entries;
tbl.maxlen = sizeof(int); tbl.maxlen = sizeof(int);
return proc_dointvec(&tbl, write, buffer, lenp, ppos); return proc_dointvec(&tbl, write, buffer, lenp, ppos);
} }
/*
 * proc handler for the read-only "udp_hash_entries" sysctl.
 *
 * The owning netns is recovered from table->data, which points at that
 * netns's ipv4.sysctl_udp_child_hash_entries field.  The value reported
 * is the number of buckets in the UDP hash table that netns uses,
 * negated when a non-initial netns is using the global udp_table.
 *
 * Returns whatever proc_dointvec() returns.
 */
static int proc_udp_hash_entries(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = container_of(table->data, struct net,
ipv4.sysctl_udp_child_hash_entries);
int udp_hash_entries;
struct ctl_table tbl;
/* mask is (number of buckets - 1), so mask + 1 is the bucket count. */
udp_hash_entries = net->ipv4.udp_table->mask + 1;
/* A negative number indicates that the child netns
* shares the global udp_table.
*/
if (!net_eq(net, &init_net) && net->ipv4.udp_table == &udp_table)
udp_hash_entries *= -1;
/* Format the computed local value through a zeroed, on-stack
 * ctl_table rather than the registered one.
 */
memset(&tbl, 0, sizeof(tbl));
tbl.data = &udp_hash_entries;
tbl.maxlen = sizeof(int);
return proc_dointvec(&tbl, write, buffer, lenp, ppos);
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH #ifdef CONFIG_IP_ROUTE_MULTIPATH
static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write,
void *buffer, size_t *lenp, void *buffer, size_t *lenp,
...@@ -1361,6 +1386,21 @@ static struct ctl_table ipv4_net_table[] = { ...@@ -1361,6 +1386,21 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = SYSCTL_ZERO, .extra1 = SYSCTL_ZERO,
.extra2 = &tcp_child_ehash_entries_max, .extra2 = &tcp_child_ehash_entries_max,
}, },
{
.procname = "udp_hash_entries",
.data = &init_net.ipv4.sysctl_udp_child_hash_entries,
.mode = 0444,
.proc_handler = proc_udp_hash_entries,
},
{
.procname = "udp_child_hash_entries",
.data = &init_net.ipv4.sysctl_udp_child_hash_entries,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_douintvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = &udp_child_hash_entries_max,
},
{ {
.procname = "udp_rmem_min", .procname = "udp_rmem_min",
.data = &init_net.ipv4.sysctl_udp_rmem_min, .data = &init_net.ipv4.sysctl_udp_rmem_min,
......
...@@ -129,7 +129,12 @@ DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc); ...@@ -129,7 +129,12 @@ DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);
EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc); EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc);
#define MAX_UDP_PORTS 65536 #define MAX_UDP_PORTS 65536
#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN) #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN_PERNET)
/*
 * Pick the hash table for a socket: the table attached to the socket's
 * protocol (sk->sk_prot->h.udp_table) when that pointer is non-NULL,
 * otherwise the table of the socket's netns.
 */
static struct udp_table *udp_get_table_prot(struct sock *sk)
{
/* "a ? : b" yields a when a is non-NULL, else b. */
return sk->sk_prot->h.udp_table ? : sock_net(sk)->ipv4.udp_table;
}
static int udp_lib_lport_inuse(struct net *net, __u16 num, static int udp_lib_lport_inuse(struct net *net, __u16 num,
const struct udp_hslot *hslot, const struct udp_hslot *hslot,
...@@ -232,16 +237,16 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) ...@@ -232,16 +237,16 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
int udp_lib_get_port(struct sock *sk, unsigned short snum, int udp_lib_get_port(struct sock *sk, unsigned short snum,
unsigned int hash2_nulladdr) unsigned int hash2_nulladdr)
{ {
struct udp_table *udptable = udp_get_table_prot(sk);
struct udp_hslot *hslot, *hslot2; struct udp_hslot *hslot, *hslot2;
struct udp_table *udptable = sk->sk_prot->h.udp_table;
int error = 1;
struct net *net = sock_net(sk); struct net *net = sock_net(sk);
int error = 1;
if (!snum) { if (!snum) {
DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
unsigned short first, last;
int low, high, remaining; int low, high, remaining;
unsigned int rand; unsigned int rand;
unsigned short first, last;
DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
inet_get_local_port_range(net, &low, &high); inet_get_local_port_range(net, &low, &high);
remaining = (high - low) + 1; remaining = (high - low) + 1;
...@@ -467,7 +472,7 @@ static struct sock *udp4_lookup_run_bpf(struct net *net, ...@@ -467,7 +472,7 @@ static struct sock *udp4_lookup_run_bpf(struct net *net,
struct sock *sk, *reuse_sk; struct sock *sk, *reuse_sk;
bool no_reuseport; bool no_reuseport;
if (udptable != &udp_table) if (udptable != net->ipv4.udp_table)
return NULL; /* only UDP is supported */ return NULL; /* only UDP is supported */
no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_UDP, saddr, sport, no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_UDP, saddr, sport,
...@@ -548,10 +553,11 @@ struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb, ...@@ -548,10 +553,11 @@ struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb,
__be16 sport, __be16 dport) __be16 sport, __be16 dport)
{ {
const struct iphdr *iph = ip_hdr(skb); const struct iphdr *iph = ip_hdr(skb);
struct net *net = dev_net(skb->dev);
return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport, return __udp4_lib_lookup(net, iph->saddr, sport,
iph->daddr, dport, inet_iif(skb), iph->daddr, dport, inet_iif(skb),
inet_sdif(skb), &udp_table, NULL); inet_sdif(skb), net->ipv4.udp_table, NULL);
} }
/* Must be called under rcu_read_lock(). /* Must be called under rcu_read_lock().
...@@ -564,7 +570,7 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, ...@@ -564,7 +570,7 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
struct sock *sk; struct sock *sk;
sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport, sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
dif, 0, &udp_table, NULL); dif, 0, net->ipv4.udp_table, NULL);
if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
sk = NULL; sk = NULL;
return sk; return sk;
...@@ -802,7 +808,7 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) ...@@ -802,7 +808,7 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
int udp_err(struct sk_buff *skb, u32 info) int udp_err(struct sk_buff *skb, u32 info)
{ {
return __udp4_lib_err(skb, info, &udp_table); return __udp4_lib_err(skb, info, dev_net(skb->dev)->ipv4.udp_table);
} }
/* /*
...@@ -1999,7 +2005,7 @@ EXPORT_SYMBOL(udp_disconnect); ...@@ -1999,7 +2005,7 @@ EXPORT_SYMBOL(udp_disconnect);
void udp_lib_unhash(struct sock *sk) void udp_lib_unhash(struct sock *sk)
{ {
if (sk_hashed(sk)) { if (sk_hashed(sk)) {
struct udp_table *udptable = sk->sk_prot->h.udp_table; struct udp_table *udptable = udp_get_table_prot(sk);
struct udp_hslot *hslot, *hslot2; struct udp_hslot *hslot, *hslot2;
hslot = udp_hashslot(udptable, sock_net(sk), hslot = udp_hashslot(udptable, sock_net(sk),
...@@ -2030,7 +2036,7 @@ EXPORT_SYMBOL(udp_lib_unhash); ...@@ -2030,7 +2036,7 @@ EXPORT_SYMBOL(udp_lib_unhash);
void udp_lib_rehash(struct sock *sk, u16 newhash) void udp_lib_rehash(struct sock *sk, u16 newhash)
{ {
if (sk_hashed(sk)) { if (sk_hashed(sk)) {
struct udp_table *udptable = sk->sk_prot->h.udp_table; struct udp_table *udptable = udp_get_table_prot(sk);
struct udp_hslot *hslot, *hslot2, *nhslot2; struct udp_hslot *hslot, *hslot2, *nhslot2;
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
...@@ -2519,10 +2525,14 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, ...@@ -2519,10 +2525,14 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
__be16 rmt_port, __be32 rmt_addr, __be16 rmt_port, __be32 rmt_addr,
int dif, int sdif) int dif, int sdif)
{ {
struct sock *sk, *result; struct udp_table *udptable = net->ipv4.udp_table;
unsigned short hnum = ntohs(loc_port); unsigned short hnum = ntohs(loc_port);
unsigned int slot = udp_hashfn(net, hnum, udp_table.mask); struct sock *sk, *result;
struct udp_hslot *hslot = &udp_table.hash[slot]; struct udp_hslot *hslot;
unsigned int slot;
slot = udp_hashfn(net, hnum, udptable->mask);
hslot = &udptable->hash[slot];
/* Do not bother scanning a too big list */ /* Do not bother scanning a too big list */
if (hslot->count > 10) if (hslot->count > 10)
...@@ -2550,14 +2560,19 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, ...@@ -2550,14 +2560,19 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
__be16 rmt_port, __be32 rmt_addr, __be16 rmt_port, __be32 rmt_addr,
int dif, int sdif) int dif, int sdif)
{ {
unsigned short hnum = ntohs(loc_port); struct udp_table *udptable = net->ipv4.udp_table;
unsigned int hash2 = ipv4_portaddr_hash(net, loc_addr, hnum);
unsigned int slot2 = hash2 & udp_table.mask;
struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); unsigned short hnum = ntohs(loc_port);
unsigned int hash2, slot2;
struct udp_hslot *hslot2;
__portpair ports;
struct sock *sk; struct sock *sk;
hash2 = ipv4_portaddr_hash(net, loc_addr, hnum);
slot2 = hash2 & udptable->mask;
hslot2 = &udptable->hash2[slot2];
ports = INET_COMBINED_PORTS(rmt_port, hnum);
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
if (inet_match(net, sk, acookie, ports, dif, sdif)) if (inet_match(net, sk, acookie, ports, dif, sdif))
return sk; return sk;
...@@ -2637,7 +2652,7 @@ int udp_v4_early_demux(struct sk_buff *skb) ...@@ -2637,7 +2652,7 @@ int udp_v4_early_demux(struct sk_buff *skb)
int udp_rcv(struct sk_buff *skb) int udp_rcv(struct sk_buff *skb)
{ {
return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP); return __udp4_lib_rcv(skb, dev_net(skb->dev)->ipv4.udp_table, IPPROTO_UDP);
} }
void udp_destroy_sock(struct sock *sk) void udp_destroy_sock(struct sock *sk)
...@@ -2960,7 +2975,7 @@ struct proto udp_prot = { ...@@ -2960,7 +2975,7 @@ struct proto udp_prot = {
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
.obj_size = sizeof(struct udp_sock), .obj_size = sizeof(struct udp_sock),
.h.udp_table = &udp_table, .h.udp_table = NULL,
.diag_destroy = udp_abort, .diag_destroy = udp_abort,
}; };
EXPORT_SYMBOL(udp_prot); EXPORT_SYMBOL(udp_prot);
...@@ -2968,21 +2983,30 @@ EXPORT_SYMBOL(udp_prot); ...@@ -2968,21 +2983,30 @@ EXPORT_SYMBOL(udp_prot);
/* ------------------------------------------------------------------------ */ /* ------------------------------------------------------------------------ */
#ifdef CONFIG_PROC_FS #ifdef CONFIG_PROC_FS
/*
 * Pick the hash table for a seq_file/iterator walk: the table set in
 * afinfo->udp_table when that pointer is non-NULL, otherwise the table
 * of the given netns.
 */
static struct udp_table *udp_get_table_afinfo(struct udp_seq_afinfo *afinfo,
struct net *net)
{
/* "a ? : b" yields a when a is non-NULL, else b. */
return afinfo->udp_table ? : net->ipv4.udp_table;
}
static struct sock *udp_get_first(struct seq_file *seq, int start) static struct sock *udp_get_first(struct seq_file *seq, int start)
{ {
struct sock *sk;
struct udp_seq_afinfo *afinfo;
struct udp_iter_state *state = seq->private; struct udp_iter_state *state = seq->private;
struct net *net = seq_file_net(seq); struct net *net = seq_file_net(seq);
struct udp_seq_afinfo *afinfo;
struct udp_table *udptable;
struct sock *sk;
if (state->bpf_seq_afinfo) if (state->bpf_seq_afinfo)
afinfo = state->bpf_seq_afinfo; afinfo = state->bpf_seq_afinfo;
else else
afinfo = pde_data(file_inode(seq->file)); afinfo = pde_data(file_inode(seq->file));
for (state->bucket = start; state->bucket <= afinfo->udp_table->mask; udptable = udp_get_table_afinfo(afinfo, net);
for (state->bucket = start; state->bucket <= udptable->mask;
++state->bucket) { ++state->bucket) {
struct udp_hslot *hslot = &afinfo->udp_table->hash[state->bucket]; struct udp_hslot *hslot = &udptable->hash[state->bucket];
if (hlist_empty(&hslot->head)) if (hlist_empty(&hslot->head))
continue; continue;
...@@ -3004,9 +3028,10 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) ...@@ -3004,9 +3028,10 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
{ {
struct udp_seq_afinfo *afinfo;
struct udp_iter_state *state = seq->private; struct udp_iter_state *state = seq->private;
struct net *net = seq_file_net(seq); struct net *net = seq_file_net(seq);
struct udp_seq_afinfo *afinfo;
struct udp_table *udptable;
if (state->bpf_seq_afinfo) if (state->bpf_seq_afinfo)
afinfo = state->bpf_seq_afinfo; afinfo = state->bpf_seq_afinfo;
...@@ -3020,8 +3045,11 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) ...@@ -3020,8 +3045,11 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
sk->sk_family != afinfo->family))); sk->sk_family != afinfo->family)));
if (!sk) { if (!sk) {
if (state->bucket <= afinfo->udp_table->mask) udptable = udp_get_table_afinfo(afinfo, net);
spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock);
if (state->bucket <= udptable->mask)
spin_unlock_bh(&udptable->hash[state->bucket].lock);
return udp_get_first(seq, state->bucket + 1); return udp_get_first(seq, state->bucket + 1);
} }
return sk; return sk;
...@@ -3062,16 +3090,19 @@ EXPORT_SYMBOL(udp_seq_next); ...@@ -3062,16 +3090,19 @@ EXPORT_SYMBOL(udp_seq_next);
void udp_seq_stop(struct seq_file *seq, void *v) void udp_seq_stop(struct seq_file *seq, void *v)
{ {
struct udp_seq_afinfo *afinfo;
struct udp_iter_state *state = seq->private; struct udp_iter_state *state = seq->private;
struct udp_seq_afinfo *afinfo;
struct udp_table *udptable;
if (state->bpf_seq_afinfo) if (state->bpf_seq_afinfo)
afinfo = state->bpf_seq_afinfo; afinfo = state->bpf_seq_afinfo;
else else
afinfo = pde_data(file_inode(seq->file)); afinfo = pde_data(file_inode(seq->file));
if (state->bucket <= afinfo->udp_table->mask) udptable = udp_get_table_afinfo(afinfo, seq_file_net(seq));
spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock);
if (state->bucket <= udptable->mask)
spin_unlock_bh(&udptable->hash[state->bucket].lock);
} }
EXPORT_SYMBOL(udp_seq_stop); EXPORT_SYMBOL(udp_seq_stop);
...@@ -3184,7 +3215,7 @@ EXPORT_SYMBOL(udp_seq_ops); ...@@ -3184,7 +3215,7 @@ EXPORT_SYMBOL(udp_seq_ops);
static struct udp_seq_afinfo udp4_seq_afinfo = { static struct udp_seq_afinfo udp4_seq_afinfo = {
.family = AF_INET, .family = AF_INET,
.udp_table = &udp_table, .udp_table = NULL,
}; };
static int __net_init udp4_proc_init_net(struct net *net) static int __net_init udp4_proc_init_net(struct net *net)
...@@ -3246,7 +3277,7 @@ void __init udp_table_init(struct udp_table *table, const char *name) ...@@ -3246,7 +3277,7 @@ void __init udp_table_init(struct udp_table *table, const char *name)
&table->log, &table->log,
&table->mask, &table->mask,
UDP_HTABLE_SIZE_MIN, UDP_HTABLE_SIZE_MIN,
64 * 1024); UDP_HTABLE_SIZE_MAX);
table->hash2 = table->hash + (table->mask + 1); table->hash2 = table->hash + (table->mask + 1);
for (i = 0; i <= table->mask; i++) { for (i = 0; i <= table->mask; i++) {
...@@ -3271,7 +3302,7 @@ u32 udp_flow_hashrnd(void) ...@@ -3271,7 +3302,7 @@ u32 udp_flow_hashrnd(void)
} }
EXPORT_SYMBOL(udp_flow_hashrnd); EXPORT_SYMBOL(udp_flow_hashrnd);
static int __net_init udp_sysctl_init(struct net *net) static void __net_init udp_sysctl_init(struct net *net)
{ {
net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE; net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE;
net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE; net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE;
...@@ -3279,12 +3310,103 @@ static int __net_init udp_sysctl_init(struct net *net) ...@@ -3279,12 +3310,103 @@ static int __net_init udp_sysctl_init(struct net *net)
#ifdef CONFIG_NET_L3_MASTER_DEV #ifdef CONFIG_NET_L3_MASTER_DEV
net->ipv4.sysctl_udp_l3mdev_accept = 0; net->ipv4.sysctl_udp_l3mdev_accept = 0;
#endif #endif
}
/*
 * Allocate and initialise a UDP hash table with hash_entries buckets in
 * each of its two slot arrays (hash and hash2).
 *
 * mask is set to hash_entries - 1, so hash_entries is expected to be a
 * power of two; the caller visible here (udp_set_table()) clamps and
 * rounds the value before calling.
 *
 * Returns the new table, or NULL if either allocation fails.
 */
static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_entries)
{
struct udp_table *udptable;
int i;
udptable = kmalloc(sizeof(*udptable), GFP_KERNEL);
if (!udptable)
goto out;
/* One allocation holds both slot arrays back to back. */
udptable->hash = vmalloc_huge(hash_entries * 2 * sizeof(struct udp_hslot),
GFP_KERNEL_ACCOUNT);
if (!udptable->hash)
goto free_table;
/* hash2 is the second half of the same allocation. */
udptable->hash2 = udptable->hash + hash_entries;
udptable->mask = hash_entries - 1;
udptable->log = ilog2(hash_entries);
/* Every slot in both arrays starts empty with its own lock. */
for (i = 0; i < hash_entries; i++) {
INIT_HLIST_HEAD(&udptable->hash[i].head);
udptable->hash[i].count = 0;
spin_lock_init(&udptable->hash[i].lock);
INIT_HLIST_HEAD(&udptable->hash2[i].head);
udptable->hash2[i].count = 0;
spin_lock_init(&udptable->hash2[i].lock);
}
return udptable;
free_table:
kfree(udptable);
out:
return NULL;
}
/*
 * Release the UDP hash table of a netns.  Does nothing when the netns
 * uses the global udp_table; otherwise frees the slot arrays and then
 * the table structure itself.
 */
static void __net_exit udp_pernet_table_free(struct net *net)
{
struct udp_table *udptable = net->ipv4.udp_table;
/* The global table is shared and must not be freed here. */
if (udptable == &udp_table)
return;
/* hash2 lives in the same allocation as hash, so one free covers both. */
kvfree(udptable->hash);
kfree(udptable);
}
/*
 * Choose the UDP hash table for a netns being initialised and store it
 * in net->ipv4.udp_table.
 *
 * The global udp_table is used when net is init_net, when the
 * udp_child_hash_entries value read from the current task's netns is 0,
 * or when allocating a dedicated table fails.  Otherwise a dedicated
 * table is allocated with the requested size, raised to at least
 * UDP_HTABLE_SIZE_MIN_PERNET or rounded up to a power of two.
 */
static void __net_init udp_set_table(struct net *net)
{
struct udp_table *udptable;
unsigned int hash_entries;
struct net *old_net;
if (net_eq(net, &init_net))
goto fallback;
/* The size comes from the netns of the task creating the new one,
 * not from the new netns itself.
 */
old_net = current->nsproxy->net_ns;
hash_entries = READ_ONCE(old_net->ipv4.sysctl_udp_child_hash_entries);
if (!hash_entries)
goto fallback;
/* Set min to keep the bitmap on stack in udp_lib_get_port() */
if (hash_entries < UDP_HTABLE_SIZE_MIN_PERNET)
hash_entries = UDP_HTABLE_SIZE_MIN_PERNET;
else
hash_entries = roundup_pow_of_two(hash_entries);
udptable = udp_pernet_table_alloc(hash_entries);
if (udptable) {
net->ipv4.udp_table = udptable;
} else {
pr_warn("Failed to allocate UDP hash table (entries: %u) "
"for a netns, fallback to the global one\n",
hash_entries);
/* The label sits inside the else branch: the gotos above land here
 * without printing the warning.
 */
fallback:
net->ipv4.udp_table = &udp_table;
}
}
static int __net_init udp_pernet_init(struct net *net)
{
udp_sysctl_init(net);
udp_set_table(net);
return 0; return 0;
} }
/* Per-netns exit hook: releases the netns's UDP hash table via
 * udp_pernet_table_free().
 */
static void __net_exit udp_pernet_exit(struct net *net)
{
udp_pernet_table_free(net);
}
static struct pernet_operations __net_initdata udp_sysctl_ops = { static struct pernet_operations __net_initdata udp_sysctl_ops = {
.init = udp_sysctl_init, .init = udp_pernet_init,
.exit = udp_pernet_exit,
}; };
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
...@@ -3302,7 +3424,7 @@ static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux) ...@@ -3302,7 +3424,7 @@ static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
return -ENOMEM; return -ENOMEM;
afinfo->family = AF_UNSPEC; afinfo->family = AF_UNSPEC;
afinfo->udp_table = &udp_table; afinfo->udp_table = NULL;
st->bpf_seq_afinfo = afinfo; st->bpf_seq_afinfo = afinfo;
ret = bpf_iter_init_seq_net(priv_data, aux); ret = bpf_iter_init_seq_net(priv_data, aux);
if (ret) if (ret)
......
...@@ -147,13 +147,13 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, ...@@ -147,13 +147,13 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r) const struct inet_diag_req_v2 *r)
{ {
udp_dump(&udp_table, skb, cb, r); udp_dump(sock_net(cb->skb->sk)->ipv4.udp_table, skb, cb, r);
} }
static int udp_diag_dump_one(struct netlink_callback *cb, static int udp_diag_dump_one(struct netlink_callback *cb,
const struct inet_diag_req_v2 *req) const struct inet_diag_req_v2 *req)
{ {
return udp_dump_one(&udp_table, cb, req); return udp_dump_one(sock_net(cb->skb->sk)->ipv4.udp_table, cb, req);
} }
static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
...@@ -225,7 +225,7 @@ static int __udp_diag_destroy(struct sk_buff *in_skb, ...@@ -225,7 +225,7 @@ static int __udp_diag_destroy(struct sk_buff *in_skb,
static int udp_diag_destroy(struct sk_buff *in_skb, static int udp_diag_destroy(struct sk_buff *in_skb,
const struct inet_diag_req_v2 *req) const struct inet_diag_req_v2 *req)
{ {
return __udp_diag_destroy(in_skb, req, &udp_table); return __udp_diag_destroy(in_skb, req, sock_net(in_skb->sk)->ipv4.udp_table);
} }
static int udplite_diag_destroy(struct sk_buff *in_skb, static int udplite_diag_destroy(struct sk_buff *in_skb,
......
...@@ -600,10 +600,11 @@ static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport, ...@@ -600,10 +600,11 @@ static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport,
__be16 dport) __be16 dport)
{ {
const struct iphdr *iph = skb_gro_network_header(skb); const struct iphdr *iph = skb_gro_network_header(skb);
struct net *net = dev_net(skb->dev);
return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport, return __udp4_lib_lookup(net, iph->saddr, sport,
iph->daddr, dport, inet_iif(skb), iph->daddr, dport, inet_iif(skb),
inet_sdif(skb), &udp_table, NULL); inet_sdif(skb), net->ipv4.udp_table, NULL);
} }
INDIRECT_CALLABLE_SCOPE INDIRECT_CALLABLE_SCOPE
......
...@@ -217,7 +217,7 @@ static inline struct sock *udp6_lookup_run_bpf(struct net *net, ...@@ -217,7 +217,7 @@ static inline struct sock *udp6_lookup_run_bpf(struct net *net,
struct sock *sk, *reuse_sk; struct sock *sk, *reuse_sk;
bool no_reuseport; bool no_reuseport;
if (udptable != &udp_table) if (udptable != net->ipv4.udp_table)
return NULL; /* only UDP is supported */ return NULL; /* only UDP is supported */
no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_UDP, saddr, sport, no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_UDP, saddr, sport,
...@@ -298,10 +298,11 @@ struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, ...@@ -298,10 +298,11 @@ struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb,
__be16 sport, __be16 dport) __be16 sport, __be16 dport)
{ {
const struct ipv6hdr *iph = ipv6_hdr(skb); const struct ipv6hdr *iph = ipv6_hdr(skb);
struct net *net = dev_net(skb->dev);
return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport, return __udp6_lib_lookup(net, &iph->saddr, sport,
&iph->daddr, dport, inet6_iif(skb), &iph->daddr, dport, inet6_iif(skb),
inet6_sdif(skb), &udp_table, NULL); inet6_sdif(skb), net->ipv4.udp_table, NULL);
} }
/* Must be called under rcu_read_lock(). /* Must be called under rcu_read_lock().
...@@ -314,7 +315,7 @@ struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be ...@@ -314,7 +315,7 @@ struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be
struct sock *sk; struct sock *sk;
sk = __udp6_lib_lookup(net, saddr, sport, daddr, dport, sk = __udp6_lib_lookup(net, saddr, sport, daddr, dport,
dif, 0, &udp_table, NULL); dif, 0, net->ipv4.udp_table, NULL);
if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
sk = NULL; sk = NULL;
return sk; return sk;
...@@ -689,7 +690,8 @@ static __inline__ int udpv6_err(struct sk_buff *skb, ...@@ -689,7 +690,8 @@ static __inline__ int udpv6_err(struct sk_buff *skb,
struct inet6_skb_parm *opt, u8 type, struct inet6_skb_parm *opt, u8 type,
u8 code, int offset, __be32 info) u8 code, int offset, __be32 info)
{ {
return __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table); return __udp6_lib_err(skb, opt, type, code, offset, info,
dev_net(skb->dev)->ipv4.udp_table);
} }
static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
...@@ -1063,13 +1065,18 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, ...@@ -1063,13 +1065,18 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net,
__be16 rmt_port, const struct in6_addr *rmt_addr, __be16 rmt_port, const struct in6_addr *rmt_addr,
int dif, int sdif) int dif, int sdif)
{ {
struct udp_table *udptable = net->ipv4.udp_table;
unsigned short hnum = ntohs(loc_port); unsigned short hnum = ntohs(loc_port);
unsigned int hash2 = ipv6_portaddr_hash(net, loc_addr, hnum); unsigned int hash2, slot2;
unsigned int slot2 = hash2 & udp_table.mask; struct udp_hslot *hslot2;
struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; __portpair ports;
const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
struct sock *sk; struct sock *sk;
hash2 = ipv6_portaddr_hash(net, loc_addr, hnum);
slot2 = hash2 & udptable->mask;
hslot2 = &udptable->hash2[slot2];
ports = INET_COMBINED_PORTS(rmt_port, hnum);
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
if (sk->sk_state == TCP_ESTABLISHED && if (sk->sk_state == TCP_ESTABLISHED &&
inet6_match(net, sk, rmt_addr, loc_addr, ports, dif, sdif)) inet6_match(net, sk, rmt_addr, loc_addr, ports, dif, sdif))
...@@ -1123,7 +1130,7 @@ void udp_v6_early_demux(struct sk_buff *skb) ...@@ -1123,7 +1130,7 @@ void udp_v6_early_demux(struct sk_buff *skb)
INDIRECT_CALLABLE_SCOPE int udpv6_rcv(struct sk_buff *skb) INDIRECT_CALLABLE_SCOPE int udpv6_rcv(struct sk_buff *skb)
{ {
return __udp6_lib_rcv(skb, &udp_table, IPPROTO_UDP); return __udp6_lib_rcv(skb, dev_net(skb->dev)->ipv4.udp_table, IPPROTO_UDP);
} }
/* /*
...@@ -1720,7 +1727,7 @@ EXPORT_SYMBOL(udp6_seq_ops); ...@@ -1720,7 +1727,7 @@ EXPORT_SYMBOL(udp6_seq_ops);
static struct udp_seq_afinfo udp6_seq_afinfo = { static struct udp_seq_afinfo udp6_seq_afinfo = {
.family = AF_INET6, .family = AF_INET6,
.udp_table = &udp_table, .udp_table = NULL,
}; };
int __net_init udp6_proc_init(struct net *net) int __net_init udp6_proc_init(struct net *net)
...@@ -1770,7 +1777,7 @@ struct proto udpv6_prot = { ...@@ -1770,7 +1777,7 @@ struct proto udpv6_prot = {
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
.obj_size = sizeof(struct udp6_sock), .obj_size = sizeof(struct udp6_sock),
.h.udp_table = &udp_table, .h.udp_table = NULL,
.diag_destroy = udp_abort, .diag_destroy = udp_abort,
}; };
......
...@@ -116,10 +116,11 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport, ...@@ -116,10 +116,11 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport,
__be16 dport) __be16 dport)
{ {
const struct ipv6hdr *iph = skb_gro_network_header(skb); const struct ipv6hdr *iph = skb_gro_network_header(skb);
struct net *net = dev_net(skb->dev);
return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport, return __udp6_lib_lookup(net, &iph->saddr, sport,
&iph->daddr, dport, inet6_iif(skb), &iph->daddr, dport, inet6_iif(skb),
inet6_sdif(skb), &udp_table, NULL); inet6_sdif(skb), net->ipv4.udp_table, NULL);
} }
INDIRECT_CALLABLE_SCOPE INDIRECT_CALLABLE_SCOPE
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment