Commit a918eb9f authored by David S. Miller

Merge branch 'rt_cong_ctrl'

Daniel Borkmann says:

====================
net: allow setting congctl via routing table

This is the second part of our work and allows setting the congestion
control algorithm via the routing table. For details, please see the
individual patches.

Since patch 1 is a bug fix, we suggest applying patch 1 to net first,
then merging net into net-next, and following up with the remaining
feature patches, which depend on it.

Joint work with Florian Westphal, suggested by Hannes Frederic Sowa.

A patch for iproute2 is available under [1], but will be reposted along
with the man-page update when this set hits net-next.

  [1] http://patchwork.ozlabs.org/patch/418149/

Thanks!

v2 -> v3:
 - Added module auto-loading as suggested by David Miller, thanks!
  - Added patch 2 for handling possible sleeps in fib6
  - While working on this, we discovered a bug, hence fix in patch 1
  - Added auto-loading to patch 4
 - Rebased, retested, rest the same.
v1 -> v2:
 - Very sorry, I noticed I had decnet disabled during testing.
   Added missing header include in decnet, rest as is.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 6cb69742 81164413
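
For context: once a route carries the new RTAX_CC_ALGO metric (set from
userspace with the iproute2 congctl patch in [1]), TCP sockets routed over
it pick the algorithm up transparently, with no per-socket setsockopt()
needed. A minimal userspace sketch for observing this via the long-standing
TCP_CONGESTION socket option follows; the destination address is only a
placeholder.

/* Print which congestion control algorithm the kernel attached to a TCP
 * socket. With RTAX_CC_ALGO set on the matching route, the name changes
 * after connect() without any setsockopt(). 192.0.2.1 is a TEST-NET-1
 * placeholder; point it at a reachable host when trying this out.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in sa = { .sin_family = AF_INET, .sin_port = htons(80) };
	char ca[16];	/* TCP_CA_NAME_MAX */
	socklen_t len = sizeof(ca);
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	inet_pton(AF_INET, "192.0.2.1", &sa.sin_addr);
	connect(fd, (struct sockaddr *)&sa, sizeof(sa)); /* route lookup happens here */
	if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, ca, &len) == 0)
		printf("congestion control: %s\n", ca);
	close(fd);
	return 0;
}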
@@ -98,7 +98,8 @@ struct inet_connection_sock {
 	const struct tcp_congestion_ops *icsk_ca_ops;
 	const struct inet_connection_sock_af_ops *icsk_af_ops;
 	unsigned int		(*icsk_sync_mss)(struct sock *sk, u32 pmtu);
-	__u8			icsk_ca_state;
+	__u8			icsk_ca_state:7,
+				icsk_ca_dst_locked:1;
 	__u8			icsk_retransmits;
 	__u8			icsk_pending;
 	__u8			icsk_backoff;
...
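
A note on the hunk above: the new icsk_ca_dst_locked flag is carved out of
the byte that previously held icsk_ca_state alone, so the struct does not
grow. A standalone sketch of the 7+1 packing (field names borrowed from the
struct, everything else illustrative):

#include <assert.h>
#include <stdio.h>

struct ca_byte {
	unsigned char ca_state:7,	/* ample room for the TCP_CA_* states */
		      ca_dst_locked:1;	/* new: algorithm pinned by the route */
};

int main(void)
{
	struct ca_byte b = { .ca_state = 3, .ca_dst_locked = 1 };

	assert(sizeof(b) == 1);	/* both fields still share a single byte */
	printf("state=%u locked=%u\n", b.ca_state, b.ca_dst_locked);
	return 0;
}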
@@ -74,6 +74,11 @@ struct fib6_node {
 #define FIB6_SUBTREE(fn)	((fn)->subtree)
 #endif
 
+struct mx6_config {
+	const u32 *mx;
+	DECLARE_BITMAP(mx_valid, RTAX_MAX);
+};
+
 /*
  *	routing information
  *
@@ -291,9 +296,8 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
 void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg),
 		    void *arg);
 
-int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
-	     struct nlattr *mx, int mx_len);
+int fib6_add(struct fib6_node *root, struct rt6_info *rt,
+	     struct nl_info *info, struct mx6_config *mxc);
 
 int fib6_del(struct rt6_info *rt, struct nl_info *info);
 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info);
...
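
The mx_valid bitmap in mx6_config records which metrics a netlink request
actually carried, so the copy step later in this series can apply exactly
those and leave every other metric untouched. A userspace sketch of the
pattern, with a plain unsigned long standing in for the kernel's
DECLARE_BITMAP()/test_bit() and an assumed RTAX_MAX small enough to fit:

#include <stdint.h>
#include <stdio.h>

#define RTAX_MAX 16	/* assumed value once RTAX_CC_ALGO joins the enum */

struct mx6_sketch {
	const uint32_t *mx;	/* metric values, indexed by RTAX_* - 1 */
	unsigned long mx_valid;	/* one bit per metric that was supplied */
};

static void copy_metrics(uint32_t *mp, const struct mx6_sketch *mxc)
{
	for (int i = 0; i < RTAX_MAX; i++)
		if (mxc->mx_valid & (1UL << i))
			mp[i] = mxc->mx[i];	/* only touch supplied slots */
}

int main(void)
{
	uint32_t in[RTAX_MAX] = { 0 }, out[RTAX_MAX] = { 0 };
	struct mx6_sketch mxc = { .mx = in };

	in[7] = 1460;		/* pretend one metric slot was sent */
	mxc.mx_valid |= 1UL << 7;
	copy_metrics(out, &mxc);
	printf("out[7]=%u out[0]=%u\n", out[7], out[0]);
	return 0;
}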
@@ -448,6 +448,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
 struct sock *tcp_create_openreq_child(struct sock *sk,
				      struct request_sock *req,
				      struct sk_buff *skb);
+void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst);
 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst);
@@ -636,6 +637,11 @@ static inline u32 tcp_rto_min_us(struct sock *sk)
 	return jiffies_to_usecs(tcp_rto_min(sk));
 }
 
+static inline bool tcp_ca_dst_locked(const struct dst_entry *dst)
+{
+	return dst_metric_locked(dst, RTAX_CC_ALGO);
+}
+
 /* Compute the actual receive window we are currently advertising.
  * Rcv_nxt can be after the window if our peer push more data
  * than the offered window.
@@ -787,6 +793,8 @@ enum tcp_ca_ack_event_flags {
 #define TCP_CA_MAX	128
 #define TCP_CA_BUF_MAX	(TCP_CA_NAME_MAX*TCP_CA_MAX)
 
+#define TCP_CA_UNSPEC	0
+
 /* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
 #define TCP_CONG_NON_RESTRICTED 0x1
 /* Requires ECN/ECT set on all packets */
@@ -794,7 +802,8 @@ enum tcp_ca_ack_event_flags {
 struct tcp_congestion_ops {
 	struct list_head	list;
-	unsigned long		flags;
+	u32			key;
+	u32			flags;
 
 	/* initialize private data (optional) */
 	void (*init)(struct sock *sk);
@@ -841,6 +850,17 @@ u32 tcp_reno_ssthresh(struct sock *sk);
 void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
 extern struct tcp_congestion_ops tcp_reno;
 
+struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
+u32 tcp_ca_get_key_by_name(const char *name);
+#ifdef CONFIG_INET
+char *tcp_ca_get_name_by_key(u32 key, char *buffer);
+#else
+static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer)
+{
+	return NULL;
+}
+#endif
+
 static inline bool tcp_ca_needs_ecn(const struct sock *sk)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
...
@@ -389,6 +389,8 @@ enum {
 #define RTAX_INITRWND RTAX_INITRWND
 	RTAX_QUICKACK,
 #define RTAX_QUICKACK RTAX_QUICKACK
+	RTAX_CC_ALGO,
+#define RTAX_CC_ALGO RTAX_CC_ALGO
 	__RTAX_MAX
 };
...
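
The self-referential #define mirrors the entries above it: enum values are
invisible to the preprocessor, so the define is what lets userspace probe
for the new metric at build time. For instance:

#include <linux/rtnetlink.h>
#include <stdio.h>

int main(void)
{
#ifdef RTAX_CC_ALGO
	printf("RTAX_CC_ALGO = %d\n", RTAX_CC_ALGO);	/* headers have it */
#else
	printf("headers predate RTAX_CC_ALGO\n");
#endif
	return 0;
}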
@@ -50,6 +50,7 @@
 #include <net/arp.h>
 #include <net/route.h>
 #include <net/udp.h>
+#include <net/tcp.h>
 #include <net/sock.h>
 #include <net/pkt_sched.h>
 #include <net/fib_rules.h>
@@ -669,9 +670,19 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
 
 	for (i = 0; i < RTAX_MAX; i++) {
 		if (metrics[i]) {
+			if (i == RTAX_CC_ALGO - 1) {
+				char tmp[TCP_CA_NAME_MAX], *name;
+
+				name = tcp_ca_get_name_by_key(metrics[i], tmp);
+				if (!name)
+					continue;
+				if (nla_put_string(skb, i + 1, name))
+					goto nla_put_failure;
+			} else {
+				if (nla_put_u32(skb, i + 1, metrics[i]))
+					goto nla_put_failure;
+			}
 			valid++;
-			if (nla_put_u32(skb, i+1, metrics[i]))
-				goto nla_put_failure;
 		}
 	}
...
@@ -298,7 +298,8 @@ struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct nlattr *att
 		int type = nla_type(attr);
 
 		if (type) {
-			if (type > RTAX_MAX || nla_len(attr) < 4)
+			if (type > RTAX_MAX || type == RTAX_CC_ALGO ||
+			    nla_len(attr) < 4)
 				goto err_inval;
 
 			fi->fib_metrics[type-1] = nla_get_u32(attr);
...
@@ -29,6 +29,7 @@
 #include <linux/route.h> /* RTF_xxx */
 #include <net/neighbour.h>
 #include <net/netlink.h>
+#include <net/tcp.h>
 #include <net/dst.h>
 #include <net/flow.h>
 #include <net/fib_rules.h>
@@ -273,7 +274,8 @@ static inline size_t dn_fib_nlmsg_size(struct dn_fib_info *fi)
 	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
			 + nla_total_size(4) /* RTA_TABLE */
			 + nla_total_size(2) /* RTA_DST */
-			 + nla_total_size(4); /* RTA_PRIORITY */
+			 + nla_total_size(4) /* RTA_PRIORITY */
+			 + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
 
 	/* space for nested metrics */
 	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
...
@@ -360,7 +360,8 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
			 + nla_total_size(4) /* RTA_TABLE */
			 + nla_total_size(4) /* RTA_DST */
			 + nla_total_size(4) /* RTA_PRIORITY */
-			 + nla_total_size(4); /* RTA_PREFSRC */
+			 + nla_total_size(4) /* RTA_PREFSRC */
+			 + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
 
 	/* space for nested metrics */
 	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
@@ -859,7 +860,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 
 				if (type > RTAX_MAX)
 					goto err_inval;
-				val = nla_get_u32(nla);
+				if (type == RTAX_CC_ALGO) {
+					char tmp[TCP_CA_NAME_MAX];
+
+					nla_strlcpy(tmp, nla, sizeof(tmp));
+					val = tcp_ca_get_key_by_name(tmp);
+					if (val == TCP_CA_UNSPEC)
+						goto err_inval;
+				} else {
+					val = nla_get_u32(nla);
+				}
 				if (type == RTAX_ADVMSS && val > 65535 - 40)
 					val = 65535 - 40;
 				if (type == RTAX_MTU && val > 65535 - 15)
...
@@ -13,6 +13,7 @@
 #include <linux/types.h>
 #include <linux/list.h>
 #include <linux/gfp.h>
+#include <linux/jhash.h>
 #include <net/tcp.h>
 
 static DEFINE_SPINLOCK(tcp_cong_list_lock);
@@ -31,6 +32,34 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name)
 	return NULL;
 }
 
+/* Must be called with rcu lock held */
+static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name)
+{
+	const struct tcp_congestion_ops *ca = tcp_ca_find(name);
+#ifdef CONFIG_MODULES
+	if (!ca && capable(CAP_NET_ADMIN)) {
+		rcu_read_unlock();
+		request_module("tcp_%s", name);
+		rcu_read_lock();
+		ca = tcp_ca_find(name);
+	}
+#endif
+	return ca;
+}
+
+/* Simple linear search, not much in here. */
+struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
+{
+	struct tcp_congestion_ops *e;
+
+	list_for_each_entry_rcu(e, &tcp_cong_list, list) {
+		if (e->key == key)
+			return e;
+	}
+
+	return NULL;
+}
+
 /*
  * Attach new congestion control algorithm to the list
  * of available options.
@@ -45,9 +74,12 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
 		return -EINVAL;
 	}
 
+	ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
+
 	spin_lock(&tcp_cong_list_lock);
-	if (tcp_ca_find(ca->name)) {
-		pr_notice("%s already registered\n", ca->name);
+	if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) {
+		pr_notice("%s already registered or non-unique key\n",
+			  ca->name);
 		ret = -EEXIST;
 	} else {
 		list_add_tail_rcu(&ca->list, &tcp_cong_list);
@@ -70,9 +102,50 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
 	spin_lock(&tcp_cong_list_lock);
 	list_del_rcu(&ca->list);
 	spin_unlock(&tcp_cong_list_lock);
+
+	/* Wait for outstanding readers to complete before the
+	 * module gets removed entirely.
+	 *
+	 * A try_module_get() should fail by now as our module is
+	 * in "going" state since no refs are held anymore and
+	 * module_exit() handler being called.
+	 */
+	synchronize_rcu();
 }
 EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
 
+u32 tcp_ca_get_key_by_name(const char *name)
+{
+	const struct tcp_congestion_ops *ca;
+	u32 key;
+
+	might_sleep();
+
+	rcu_read_lock();
+	ca = __tcp_ca_find_autoload(name);
+	key = ca ? ca->key : TCP_CA_UNSPEC;
+	rcu_read_unlock();
+
+	return key;
+}
+EXPORT_SYMBOL_GPL(tcp_ca_get_key_by_name);
+
+char *tcp_ca_get_name_by_key(u32 key, char *buffer)
+{
+	const struct tcp_congestion_ops *ca;
+	char *ret = NULL;
+
+	rcu_read_lock();
+	ca = tcp_ca_find_key(key);
+	if (ca)
+		ret = strncpy(buffer, ca->name,
+			      TCP_CA_NAME_MAX);
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key);
+
 /* Assign choice of congestion control. */
 void tcp_assign_congestion_control(struct sock *sk)
 {
@@ -107,6 +180,18 @@ void tcp_init_congestion_control(struct sock *sk)
 		icsk->icsk_ca_ops->init(sk);
 }
 
+static void tcp_reinit_congestion_control(struct sock *sk,
+					  const struct tcp_congestion_ops *ca)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	tcp_cleanup_congestion_control(sk);
+	icsk->icsk_ca_ops = ca;
+
+	if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
+		icsk->icsk_ca_ops->init(sk);
+}
+
 /* Manage refcounts on socket close. */
 void tcp_cleanup_congestion_control(struct sock *sk)
 {
@@ -241,42 +326,26 @@ int tcp_set_allowed_congestion_control(char *val)
 int tcp_set_congestion_control(struct sock *sk, const char *name)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
-	struct tcp_congestion_ops *ca;
+	const struct tcp_congestion_ops *ca;
 	int err = 0;
 
-	rcu_read_lock();
-	ca = tcp_ca_find(name);
+	if (icsk->icsk_ca_dst_locked)
+		return -EPERM;
 
-	/* no change asking for existing value */
+	rcu_read_lock();
+	ca = __tcp_ca_find_autoload(name);
+
+	/* No change asking for existing value */
 	if (ca == icsk->icsk_ca_ops)
 		goto out;
 
-#ifdef CONFIG_MODULES
-	/* not found attempt to autoload module */
-	if (!ca && capable(CAP_NET_ADMIN)) {
-		rcu_read_unlock();
-		request_module("tcp_%s", name);
-		rcu_read_lock();
-		ca = tcp_ca_find(name);
-	}
-#endif
 	if (!ca)
 		err = -ENOENT;
 	else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
		   ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))
 		err = -EPERM;
 	else if (!try_module_get(ca->owner))
 		err = -EBUSY;
-	else {
-		tcp_cleanup_congestion_control(sk);
-		icsk->icsk_ca_ops = ca;
-		if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
-			icsk->icsk_ca_ops->init(sk);
-	}
+	else
+		tcp_reinit_congestion_control(sk, ca);
 out:
 	rcu_read_unlock();
 	return err;
...
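
The key field turns an algorithm name into a fixed-size u32 handle, which is
what the fib code stores as the metric: tcp_ca_get_key_by_name() hashes on
route insertion, while tcp_ca_find_key() and tcp_ca_get_name_by_key()
resolve the handle back on socket setup and netlink dumps. A toy userspace
model of that round trip, using FNV-1a as a stand-in where the kernel uses
jhash() over the fixed-size name buffer:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TCP_CA_UNSPEC	0

struct ca { const char *name; uint32_t key; };

/* Stand-in hash; the kernel derives the key with jhash() instead and
 * simply refuses to register an algorithm whose key hashes to 0. */
static uint32_t toy_hash(const char *name)
{
	uint32_t h = 2166136261u;

	for (; *name; name++)
		h = (h ^ (uint8_t)*name) * 16777619u;
	return h ? h : 1;	/* keep 0 reserved for TCP_CA_UNSPEC */
}

static struct ca cas[] = { { "reno" }, { "cubic" }, { "dctcp" } };
#define NCAS (sizeof(cas) / sizeof(cas[0]))

static uint32_t key_by_name(const char *name)	/* route insertion path */
{
	for (size_t i = 0; i < NCAS; i++)
		if (!strcmp(cas[i].name, name))
			return cas[i].key;
	return TCP_CA_UNSPEC;
}

static const char *name_by_key(uint32_t key)	/* dump/lookup path */
{
	for (size_t i = 0; i < NCAS; i++)
		if (cas[i].key == key)
			return cas[i].name;
	return NULL;
}

int main(void)
{
	for (size_t i = 0; i < NCAS; i++)
		cas[i].key = toy_hash(cas[i].name);

	uint32_t k = key_by_name("dctcp");
	printf("dctcp -> %#x -> %s\n", k, name_by_key(k));
	return 0;
}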
@@ -1340,6 +1340,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	}
 	sk_setup_caps(newsk, dst);
 
+	tcp_ca_openreq_child(newsk, dst);
+
 	tcp_sync_mss(newsk, dst_mtu(dst));
 	newtp->advmss = dst_metric_advmss(dst);
 	if (tcp_sk(sk)->rx_opt.user_mss &&
...
@@ -399,6 +399,32 @@ static void tcp_ecn_openreq_child(struct tcp_sock *tp,
 	tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
 }
 
+void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
+	bool ca_got_dst = false;
+
+	if (ca_key != TCP_CA_UNSPEC) {
+		const struct tcp_congestion_ops *ca;
+
+		rcu_read_lock();
+		ca = tcp_ca_find_key(ca_key);
+		if (likely(ca && try_module_get(ca->owner))) {
+			icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
+			icsk->icsk_ca_ops = ca;
+			ca_got_dst = true;
+		}
+		rcu_read_unlock();
+	}
+
+	if (!ca_got_dst && !try_module_get(icsk->icsk_ca_ops->owner))
+		tcp_assign_congestion_control(sk);
+
+	tcp_set_ca_state(sk, TCP_CA_Open);
+}
+EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
+
 /* This is not only more efficient than what we used to do, it eliminates
  * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
  *
@@ -451,10 +477,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->snd_cwnd = TCP_INIT_CWND;
 		newtp->snd_cwnd_cnt = 0;
 
-		if (!try_module_get(newicsk->icsk_ca_ops->owner))
-			tcp_assign_congestion_control(newsk);
-
-		tcp_set_ca_state(newsk, TCP_CA_Open);
 		tcp_init_xmit_timers(newsk);
 		__skb_queue_head_init(&newtp->out_of_order_queue);
 		newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
...
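
tcp_ca_openreq_child() replaces the congestion-control setup that the second
hunk removes from tcp_create_openreq_child(), moving it to a point where the
destination route is known. The fallback order is: route-supplied key first,
then the listener's current algorithm if its module can still be pinned,
else the system default. A compact model of just that decision, with
stand-in types (nothing below is kernel API):

#include <stdbool.h>
#include <stdio.h>

#define TCP_CA_UNSPEC 0

struct ca  { const char *name; bool loadable; };
struct dst { unsigned int key; bool locked; struct ca *ca; };
struct sk  { struct ca *ca_ops; bool ca_dst_locked; };

static struct ca sysdefault = { "reno", true };

static void ca_openreq_child(struct sk *sk, const struct dst *dst)
{
	if (dst->key != TCP_CA_UNSPEC && dst->ca && dst->ca->loadable) {
		sk->ca_ops = dst->ca;		/* the route wins */
		sk->ca_dst_locked = dst->locked;
		return;
	}
	if (!sk->ca_ops->loadable)		/* listener's CA module gone? */
		sk->ca_ops = &sysdefault;
}

int main(void)
{
	struct ca dctcp = { "dctcp", true };
	struct dst d = { .key = 42, .locked = true, .ca = &dctcp };
	struct sk child = { .ca_ops = &sysdefault };

	ca_openreq_child(&child, &d);
	printf("child uses %s (locked=%d)\n", child.ca_ops->name,
	       (int)child.ca_dst_locked);
	return 0;
}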
@@ -2939,6 +2939,25 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 }
 EXPORT_SYMBOL(tcp_make_synack);
 
+static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_congestion_ops *ca;
+	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
+
+	if (ca_key == TCP_CA_UNSPEC)
+		return;
+
+	rcu_read_lock();
+	ca = tcp_ca_find_key(ca_key);
+	if (likely(ca && try_module_get(ca->owner))) {
+		module_put(icsk->icsk_ca_ops->owner);
+		icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
+		icsk->icsk_ca_ops = ca;
+	}
+	rcu_read_unlock();
+}
+
 /* Do all connect socket setups that can be done AF independent. */
 static void tcp_connect_init(struct sock *sk)
 {
@@ -2964,6 +2983,8 @@ static void tcp_connect_init(struct sock *sk)
 	tcp_mtup_init(sk);
 	tcp_sync_mss(sk, dst_mtu(dst));
 
+	tcp_ca_dst_init(sk, dst);
+
 	if (!tp->window_clamp)
 		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
 	tp->advmss = dst_metric_advmss(dst);
...
@@ -630,32 +630,35 @@ static bool rt6_qualify_for_ecmp(struct rt6_info *rt)
 	       RTF_GATEWAY;
 }
 
-static int fib6_commit_metrics(struct dst_entry *dst,
-			       struct nlattr *mx, int mx_len)
+static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc)
 {
-	struct nlattr *nla;
-	int remaining;
-	u32 *mp;
+	int i;
 
-	if (dst->flags & DST_HOST) {
-		mp = dst_metrics_write_ptr(dst);
-	} else {
-		mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
-		if (!mp)
-			return -ENOMEM;
-		dst_init_metrics(dst, mp, 0);
+	for (i = 0; i < RTAX_MAX; i++) {
+		if (test_bit(i, mxc->mx_valid))
+			mp[i] = mxc->mx[i];
 	}
+}
 
-	nla_for_each_attr(nla, mx, mx_len, remaining) {
-		int type = nla_type(nla);
+static int fib6_commit_metrics(struct dst_entry *dst, struct mx6_config *mxc)
+{
+	if (!mxc->mx)
+		return 0;
 
-		if (type) {
-			if (type > RTAX_MAX)
-				return -EINVAL;
+	if (dst->flags & DST_HOST) {
+		u32 *mp = dst_metrics_write_ptr(dst);
 
-			mp[type - 1] = nla_get_u32(nla);
-		}
+		if (unlikely(!mp))
+			return -ENOMEM;
+
+		fib6_copy_metrics(mp, mxc);
+	} else {
+		dst_init_metrics(dst, mxc->mx, false);
+
+		/* We've stolen mx now. */
+		mxc->mx = NULL;
 	}
 
 	return 0;
 }
@@ -664,7 +667,7 @@ static int fib6_commit_metrics(struct dst_entry *dst,
  */
 static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
-			    struct nl_info *info, struct nlattr *mx, int mx_len)
+			    struct nl_info *info, struct mx6_config *mxc)
 {
 	struct rt6_info *iter = NULL;
 	struct rt6_info **ins;
@@ -773,11 +776,10 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 		pr_warn("NLM_F_CREATE should be set when creating new route\n");
 
 add:
-	if (mx) {
-		err = fib6_commit_metrics(&rt->dst, mx, mx_len);
-		if (err)
-			return err;
-	}
+	err = fib6_commit_metrics(&rt->dst, mxc);
+	if (err)
+		return err;
+
 	rt->dst.rt6_next = iter;
 	*ins = rt;
 	rt->rt6i_node = fn;
@@ -797,11 +799,11 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 			pr_warn("NLM_F_REPLACE set, but no existing node found!\n");
 			return -ENOENT;
 		}
-		if (mx) {
-			err = fib6_commit_metrics(&rt->dst, mx, mx_len);
-			if (err)
-				return err;
-		}
+
+		err = fib6_commit_metrics(&rt->dst, mxc);
+		if (err)
+			return err;
+
 		*ins = rt;
 		rt->rt6i_node = fn;
 		rt->dst.rt6_next = iter->dst.rt6_next;
@@ -838,8 +840,8 @@ void fib6_force_start_gc(struct net *net)
  *	with source addr info in sub-trees
  */
 
-int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
-	     struct nlattr *mx, int mx_len)
+int fib6_add(struct fib6_node *root, struct rt6_info *rt,
+	     struct nl_info *info, struct mx6_config *mxc)
 {
 	struct fib6_node *fn, *pn = NULL;
 	int err = -ENOMEM;
@@ -934,7 +936,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
 	}
 #endif
 
-	err = fib6_add_rt2node(fn, rt, info, mx, mx_len);
+	err = fib6_add_rt2node(fn, rt, info, mxc);
 	if (!err) {
 		fib6_start_gc(info->nl_net, rt);
 		if (!(rt->rt6i_flags & RTF_CACHE))
...
@@ -853,14 +853,14 @@ EXPORT_SYMBOL(rt6_lookup);
  */
 
 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
-			struct nlattr *mx, int mx_len)
+			struct mx6_config *mxc)
 {
 	int err;
 	struct fib6_table *table;
 
 	table = rt->rt6i_table;
 	write_lock_bh(&table->tb6_lock);
-	err = fib6_add(&table->tb6_root, rt, info, mx, mx_len);
+	err = fib6_add(&table->tb6_root, rt, info, mxc);
 	write_unlock_bh(&table->tb6_lock);
 
 	return err;
@@ -868,10 +868,10 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
 
 int ip6_ins_rt(struct rt6_info *rt)
 {
-	struct nl_info info = {
-		.nl_net = dev_net(rt->dst.dev),
-	};
+	struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
+	struct mx6_config mxc = { .mx = NULL, };
 
-	return __ip6_ins_rt(rt, &info, NULL, 0);
+	return __ip6_ins_rt(rt, &info, &mxc);
 }
 
 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
@@ -1470,9 +1470,51 @@ static int ip6_dst_gc(struct dst_ops *ops)
 	return entries > rt_max_size;
 }
 
-/*
- *
- */
+static int ip6_convert_metrics(struct mx6_config *mxc,
+			       const struct fib6_config *cfg)
+{
+	struct nlattr *nla;
+	int remaining;
+	u32 *mp;
+
+	if (cfg->fc_mx == NULL)
+		return 0;
+
+	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
+	if (unlikely(!mp))
+		return -ENOMEM;
+
+	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
+		int type = nla_type(nla);
+
+		if (type) {
+			u32 val;
+
+			if (unlikely(type > RTAX_MAX))
+				goto err;
+			if (type == RTAX_CC_ALGO) {
+				char tmp[TCP_CA_NAME_MAX];
+
+				nla_strlcpy(tmp, nla, sizeof(tmp));
+				val = tcp_ca_get_key_by_name(tmp);
+				if (val == TCP_CA_UNSPEC)
+					goto err;
+			} else {
+				val = nla_get_u32(nla);
+			}
+
+			mp[type - 1] = val;
+			__set_bit(type - 1, mxc->mx_valid);
+		}
+	}
+
+	mxc->mx = mp;
+
+	return 0;
+err:
+	kfree(mp);
+	return -EINVAL;
+}
 
 int ip6_route_add(struct fib6_config *cfg)
 {
@@ -1482,6 +1524,7 @@ int ip6_route_add(struct fib6_config *cfg)
 	struct net_device *dev = NULL;
 	struct inet6_dev *idev = NULL;
 	struct fib6_table *table;
+	struct mx6_config mxc = { .mx = NULL, };
 	int addr_type;
 
 	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
@@ -1677,8 +1720,14 @@ int ip6_route_add(struct fib6_config *cfg)
 
 	cfg->fc_nlinfo.nl_net = dev_net(dev);
 
-	return __ip6_ins_rt(rt, &cfg->fc_nlinfo, cfg->fc_mx, cfg->fc_mx_len);
+	err = ip6_convert_metrics(&mxc, cfg);
+	if (err)
+		goto out;
 
+	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
+
+	kfree(mxc.mx);
+	return err;
 out:
 	if (dev)
 		dev_put(dev);
@@ -2534,7 +2583,8 @@ static inline size_t rt6_nlmsg_size(void)
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
-	       + nla_total_size(sizeof(struct rta_cacheinfo));
+	       + nla_total_size(sizeof(struct rta_cacheinfo))
+	       + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
 }
 
 static int rt6_fill_node(struct net *net,
...
@@ -1199,6 +1199,8 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
						     newnp->opt->opt_flen);
 
+	tcp_ca_openreq_child(newsk, dst);
+
 	tcp_sync_mss(newsk, dst_mtu(dst));
 	newtp->advmss = dst_metric_advmss(dst);
 	if (tcp_sk(sk)->rx_opt.user_mss &&
...