Commit 7bb50f30 authored by Jakub Kicinski

Merge branch 'net-fib_rules-add-dscp-selector-support'

Ido Schimmel says:

====================
net: fib_rules: Add DSCP selector support

Currently, the kernel rejects IPv4 FIB rules that try to match on the
upper three DSCP bits:

 # ip -4 rule add tos 0x1c table 100
 # ip -4 rule add tos 0x3c table 100
 Error: Invalid tos.

The reason for that is that historically users of the FIB lookup API
only populated the lower three DSCP bits in the TOS field of the IPv4
flow key ('flowi4_tos'), which fits the TOS definition from the initial
IPv4 specification (RFC 791).

This is not very useful nowadays and instead some users want to be able
to match on the six-bit DSCP field, which replaced the TOS and IP
precedence fields over 25 years ago (RFC 2474). In addition, the current
behavior differs between IPv4 and IPv6: the latter does allow users to
match on the entire DSCP field using the TOS selector.

Recent patchsets made sure that callers of the FIB lookup API now
populate the entire DSCP field in the IPv4 flow key. Therefore, it is
now possible to extend FIB rules to match on DSCP.

This is done by adding a new DSCP attribute which is implemented for
both IPv4 and IPv6 to provide user space programs a consistent behavior
between both address families.

The behavior of the old TOS selector is unchanged and IPv4 FIB rules
using it will only match on the lower three DSCP bits. The kernel will
reject rules that try to use both selectors.

Patch #1 adds the new DSCP attribute but rejects its usage.

Patches #2-#3 implement IPv4 and IPv6 support.

Patch #4 allows user space to use the new attribute.

Patches #5-#6 add selftests.
====================

Link: https://patch.msgid.link/20240911093748.3662015-1-idosch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 716425d6 2bf1259a
......@@ -67,6 +67,7 @@ enum {
FRA_IP_PROTO, /* ip proto */
FRA_SPORT_RANGE, /* sport */
FRA_DPORT_RANGE, /* dport */
FRA_DSCP, /* dscp */
__FRA_MAX
};
......
......@@ -11,6 +11,7 @@
#include <linux/list.h>
#include <linux/module.h>
#include <net/net_namespace.h>
#include <net/inet_dscp.h>
#include <net/sock.h>
#include <net/fib_rules.h>
#include <net/ip_tunnels.h>
......@@ -766,7 +767,8 @@ static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = {
[FRA_PROTOCOL] = { .type = NLA_U8 },
[FRA_IP_PROTO] = { .type = NLA_U8 },
[FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) },
[FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }
[FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) },
[FRA_DSCP] = NLA_POLICY_MAX(NLA_U8, INET_DSCP_MASK >> 2),
};
int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
......
......@@ -37,6 +37,7 @@ struct fib4_rule {
u8 dst_len;
u8 src_len;
dscp_t dscp;
u8 dscp_full:1; /* DSCP or TOS selector */
__be32 src;
__be32 srcmask;
__be32 dst;
......@@ -186,7 +187,15 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule,
((daddr ^ r->dst) & r->dstmask))
return 0;
if (r->dscp && !fib_dscp_masked_match(r->dscp, fl4))
/* When DSCP selector is used we need to match on the entire DSCP field
* in the flow information structure. When TOS selector is used we need
* to mask the upper three DSCP bits prior to matching to maintain
* legacy behavior.
*/
if (r->dscp_full && r->dscp != inet_dsfield_to_dscp(fl4->flowi4_tos))
return 0;
else if (!r->dscp_full && r->dscp &&
!fib_dscp_masked_match(r->dscp, fl4))
return 0;
if (rule->ip_proto && (rule->ip_proto != fl4->flowi4_proto))
......@@ -217,6 +226,20 @@ static struct fib_table *fib_empty_table(struct net *net)
return NULL;
}
/*
 * Apply the FRA_DSCP netlink attribute to an IPv4 FIB rule.
 *
 * @nla:    FRA_DSCP attribute; its payload is read as a u8
 * @rule4:  IPv4 rule being configured
 * @extack: extended ack that receives the error message on failure
 *
 * The caller in fib4_rule_configure() fills rule4->dscp from the legacy TOS
 * field of the rule header before invoking this helper, so a non-zero value
 * at this point is reported as a TOS/DSCP conflict.
 *
 * Returns 0 on success and -EINVAL when rule4->dscp is already set.
 */
static int fib4_nl2rule_dscp(const struct nlattr *nla, struct fib4_rule *rule4,
			     struct netlink_ext_ack *extack)
{
	if (!rule4->dscp) {
		/* The attribute value is shifted left by two bits before the
		 * conversion; the dump path shifts it right by two again.
		 */
		rule4->dscp = inet_dsfield_to_dscp(nla_get_u8(nla) << 2);
		/* Flag the rule as using the DSCP selector, not TOS. */
		rule4->dscp_full = true;
		return 0;
	}

	NL_SET_ERR_MSG(extack, "Cannot specify both TOS and DSCP");
	return -EINVAL;
}
static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct fib_rule_hdr *frh,
struct nlattr **tb,
......@@ -238,6 +261,10 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
}
rule4->dscp = inet_dsfield_to_dscp(frh->tos);
if (tb[FRA_DSCP] &&
fib4_nl2rule_dscp(tb[FRA_DSCP], rule4, extack) < 0)
goto errout;
/* split local/main if they are not already split */
err = fib_unmerge(net);
if (err)
......@@ -320,9 +347,19 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
if (frh->dst_len && (rule4->dst_len != frh->dst_len))
return 0;
if (frh->tos && inet_dscp_to_dsfield(rule4->dscp) != frh->tos)
if (frh->tos &&
(rule4->dscp_full ||
inet_dscp_to_dsfield(rule4->dscp) != frh->tos))
return 0;
if (tb[FRA_DSCP]) {
dscp_t dscp;
dscp = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP]) << 2);
if (!rule4->dscp_full || rule4->dscp != dscp)
return 0;
}
#ifdef CONFIG_IP_ROUTE_CLASSID
if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW])))
return 0;
......@@ -344,7 +381,15 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
frh->dst_len = rule4->dst_len;
frh->src_len = rule4->src_len;
frh->tos = inet_dscp_to_dsfield(rule4->dscp);
if (rule4->dscp_full) {
frh->tos = 0;
if (nla_put_u8(skb, FRA_DSCP,
inet_dscp_to_dsfield(rule4->dscp) >> 2))
goto nla_put_failure;
} else {
frh->tos = inet_dscp_to_dsfield(rule4->dscp);
}
if ((rule4->dst_len &&
nla_put_in_addr(skb, FRA_DST, rule4->dst)) ||
......@@ -366,7 +411,8 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
{
return nla_total_size(4) /* dst */
+ nla_total_size(4) /* src */
+ nla_total_size(4); /* flow */
+ nla_total_size(4) /* flow */
+ nla_total_size(1); /* dscp */
}
static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
......
......@@ -27,6 +27,7 @@ struct fib6_rule {
struct rt6key src;
struct rt6key dst;
dscp_t dscp;
u8 dscp_full:1; /* DSCP or TOS selector */
};
static bool fib6_rule_matchall(const struct fib_rule *rule)
......@@ -345,6 +346,20 @@ INDIRECT_CALLABLE_SCOPE int fib6_rule_match(struct fib_rule *rule,
return 1;
}
/*
 * Apply the FRA_DSCP netlink attribute to an IPv6 FIB rule.
 *
 * @nla:    FRA_DSCP attribute; its payload is read as a u8
 * @rule6:  IPv6 rule being configured
 * @extack: extended ack that receives the error message on failure
 *
 * The caller in fib6_rule_configure() fills rule6->dscp from the legacy TOS
 * field of the rule header before invoking this helper, so a non-zero value
 * at this point is reported as a TOS/DSCP conflict.
 *
 * Returns 0 on success and -EINVAL when rule6->dscp is already set.
 */
static int fib6_nl2rule_dscp(const struct nlattr *nla, struct fib6_rule *rule6,
			     struct netlink_ext_ack *extack)
{
	if (!rule6->dscp) {
		/* The attribute value is shifted left by two bits before the
		 * conversion; the dump path shifts it right by two again.
		 */
		rule6->dscp = inet_dsfield_to_dscp(nla_get_u8(nla) << 2);
		/* Flag the rule as using the DSCP selector, not TOS. */
		rule6->dscp_full = true;
		return 0;
	}

	NL_SET_ERR_MSG(extack, "Cannot specify both TOS and DSCP");
	return -EINVAL;
}
static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct fib_rule_hdr *frh,
struct nlattr **tb,
......@@ -361,6 +376,9 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
}
rule6->dscp = inet_dsfield_to_dscp(frh->tos);
if (tb[FRA_DSCP] && fib6_nl2rule_dscp(tb[FRA_DSCP], rule6, extack) < 0)
goto errout;
if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) {
if (rule->table == RT6_TABLE_UNSPEC) {
NL_SET_ERR_MSG(extack, "Invalid table");
......@@ -413,9 +431,19 @@ static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
if (frh->dst_len && (rule6->dst.plen != frh->dst_len))
return 0;
if (frh->tos && inet_dscp_to_dsfield(rule6->dscp) != frh->tos)
if (frh->tos &&
(rule6->dscp_full ||
inet_dscp_to_dsfield(rule6->dscp) != frh->tos))
return 0;
if (tb[FRA_DSCP]) {
dscp_t dscp;
dscp = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP]) << 2);
if (!rule6->dscp_full || rule6->dscp != dscp)
return 0;
}
if (frh->src_len &&
nla_memcmp(tb[FRA_SRC], &rule6->src.addr, sizeof(struct in6_addr)))
return 0;
......@@ -434,7 +462,15 @@ static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
frh->dst_len = rule6->dst.plen;
frh->src_len = rule6->src.plen;
frh->tos = inet_dscp_to_dsfield(rule6->dscp);
if (rule6->dscp_full) {
frh->tos = 0;
if (nla_put_u8(skb, FRA_DSCP,
inet_dscp_to_dsfield(rule6->dscp) >> 2))
goto nla_put_failure;
} else {
frh->tos = inet_dscp_to_dsfield(rule6->dscp);
}
if ((rule6->dst.plen &&
nla_put_in6_addr(skb, FRA_DST, &rule6->dst.addr)) ||
......@@ -450,7 +486,8 @@ static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule)
{
return nla_total_size(16) /* dst */
+ nla_total_size(16); /* src */
+ nla_total_size(16) /* src */
+ nla_total_size(1); /* dscp */
}
static void fib6_rule_flush_cache(struct fib_rules_ops *ops)
......
......@@ -274,6 +274,23 @@ fib_rule6_test()
"$getnomatch" "ipproto ipv6-icmp match" \
"ipproto ipv6-tcp no match"
fi
fib_check_iproute_support "dscp" "tos"
if [ $? -eq 0 ]; then
match="dscp 0x3f"
getmatch="tos 0xfc"
getnomatch="tos 0xf4"
fib_rule6_test_match_n_redirect "$match" "$getmatch" \
"$getnomatch" "dscp redirect to table" \
"dscp no redirect to table"
match="dscp 0x3f"
getmatch="from $SRC_IP6 iif $DEV tos 0xfc"
getnomatch="from $SRC_IP6 iif $DEV tos 0xf4"
fib_rule6_test_match_n_redirect "$match" "$getmatch" \
"$getnomatch" "iif dscp redirect to table" \
"iif dscp no redirect to table"
fi
}
fib_rule6_vrf_test()
......@@ -319,6 +336,34 @@ fib_rule6_connect_test()
log_test $? 1 "rule6 dsfield tcp no connect (dsfield 0x20)"
$IP -6 rule del dsfield 0x04 table $RTABLE_PEER
ip rule help 2>&1 | grep -q dscp
if [ $? -ne 0 ]; then
echo "SKIP: iproute2 iprule too old, missing dscp match"
cleanup_peer
return
fi
$IP -6 rule add dscp 0x3f table $RTABLE_PEER
nettest -q -6 -B -t 5 -N $testns -O $peerns -U -D -Q 0xfc \
-l 2001:db8::1:11 -r 2001:db8::1:11
log_test $? 0 "rule6 dscp udp connect"
nettest -q -6 -B -t 5 -N $testns -O $peerns -Q 0xfc \
-l 2001:db8::1:11 -r 2001:db8::1:11
log_test $? 0 "rule6 dscp tcp connect"
nettest -q -6 -B -t 5 -N $testns -O $peerns -U -D -Q 0xf4 \
-l 2001:db8::1:11 -r 2001:db8::1:11
log_test $? 1 "rule6 dscp udp no connect"
nettest -q -6 -B -t 5 -N $testns -O $peerns -Q 0xf4 \
-l 2001:db8::1:11 -r 2001:db8::1:11
log_test $? 1 "rule6 dscp tcp no connect"
$IP -6 rule del dscp 0x3f table $RTABLE_PEER
cleanup_peer
}
......@@ -468,6 +513,23 @@ fib_rule4_test()
"$getnomatch" "ipproto icmp match" \
"ipproto tcp no match"
fi
fib_check_iproute_support "dscp" "tos"
if [ $? -eq 0 ]; then
match="dscp 0x3f"
getmatch="tos 0xfc"
getnomatch="tos 0xf4"
fib_rule4_test_match_n_redirect "$match" "$getmatch" \
"$getnomatch" "dscp redirect to table" \
"dscp no redirect to table"
match="dscp 0x3f"
getmatch="from $SRC_IP iif $DEV tos 0xfc"
getnomatch="from $SRC_IP iif $DEV tos 0xf4"
fib_rule4_test_match_n_redirect "$match" "$getmatch" \
"$getnomatch" "iif dscp redirect to table" \
"iif dscp no redirect to table"
fi
}
fib_rule4_vrf_test()
......@@ -513,6 +575,34 @@ fib_rule4_connect_test()
log_test $? 1 "rule4 dsfield tcp no connect (dsfield 0x20)"
$IP -4 rule del dsfield 0x04 table $RTABLE_PEER
ip rule help 2>&1 | grep -q dscp
if [ $? -ne 0 ]; then
echo "SKIP: iproute2 iprule too old, missing dscp match"
cleanup_peer
return
fi
$IP -4 rule add dscp 0x3f table $RTABLE_PEER
nettest -q -B -t 5 -N $testns -O $peerns -D -U -Q 0xfc \
-l 198.51.100.11 -r 198.51.100.11
log_test $? 0 "rule4 dscp udp connect"
nettest -q -B -t 5 -N $testns -O $peerns -Q 0xfc \
-l 198.51.100.11 -r 198.51.100.11
log_test $? 0 "rule4 dscp tcp connect"
nettest -q -B -t 5 -N $testns -O $peerns -D -U -Q 0xf4 \
-l 198.51.100.11 -r 198.51.100.11
log_test $? 1 "rule4 dscp udp no connect"
nettest -q -B -t 5 -N $testns -O $peerns -Q 0xf4 \
-l 198.51.100.11 -r 198.51.100.11
log_test $? 1 "rule4 dscp tcp no connect"
$IP -4 rule del dscp 0x3f table $RTABLE_PEER
cleanup_peer
}
################################################################################
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment