Commit 8470e436 authored by David S. Miller's avatar David S. Miller

Merge branch 'net-cacheline-optimizations'

Coco Li says:

====================
Analyze and Reorganize core Networking Structs to optimize cacheline consumption

Currently, variable-heavy structs in the networking stack is organized
chronologically, logically and sometimes by cacheline access.

This patch series attempts to reorganize the core networking stack
variables to minimize cacheline consumption during the phase of data
transfer. Specifically, we looked at the TCP/IP stack and the fast
path definition in TCP.

For documentation purposes, we also added new files for each core data
structure we considered, although not all ended up being modified due
to the amount of existing cacheline they span in the fast path. In
the documentation, we recorded all variables we identified on the
fast path and the reasons. We also hope that in the future when
variables are added/modified, the document can be referred to and
updated accordingly to reflect the latest variable organization.

Tested:
Our tests were run with neper tcp_rr using tcp traffic. The tests have $cpu
number of threads and variable number of flows (see below).

Tests were run on 6.5-rc1

Efficiency is computed as cpu seconds / throughput (one tcp_rr round trip).
The following result shows efficiency delta before and after the patch
series is applied.

On AMD platforms with 100Gb/s NIC and 256Mb L3 cache:
IPv4
Flows   with patches    clean kernel      Percent reduction
30k     0.0001736538065 0.0002741191042 -36.65%
20k     0.0001583661752 0.0002712559158 -41.62%
10k     0.0001639148817 0.0002951800751 -44.47%
5k      0.0001859683866 0.0003320642536 -44.00%
1k      0.0002035190546 0.0003152056382 -35.43%

IPv6
Flows   with patches  clean kernel    Percent reduction
30k     0.000202535503  0.0003275329163 -38.16%
20k     0.0002020654777 0.0003411304786 -40.77%
10k     0.0002122427035 0.0003803674705 -44.20%
5k      0.0002348776729 0.0004030403953 -41.72%
1k      0.0002237384583 0.0002813646157 -20.48%

On Intel platforms with 200Gb/s NIC and 105Mb L3 cache:
IPv6
Flows   with patches    clean kernel    Percent reduction
30k     0.0006296537873 0.0006370427753 -1.16%
20k     0.0003451029365 0.0003628016076 -4.88%
10k     0.0003187646958 0.0003346835645 -4.76%
5k      0.0002954676348 0.000311807592  -5.24%
1k      0.0001909169342 0.0001848069709 3.31%

v8 changes:
1. Update net_device_read_txrx cache group maximum
2. Update MAINTAINERS for documentations
3. Skip __cache_group variables in scripts/kernel-doc
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 7453d7a6 18fd64d2
......@@ -75,6 +75,7 @@ Contents:
mptcp-sysctl
multiqueue
napi
net_cachelines/index
netconsole
netdev-features
netdevices
......
.. SPDX-License-Identifier: GPL-2.0
.. Copyright (C) 2023 Google LLC
===================================
Common Networking Struct Cachelines
===================================
.. toctree::
:maxdepth: 1
inet_connection_sock
inet_sock
net_device
netns_ipv4_sysctl
snmp
tcp_sock
.. SPDX-License-Identifier: GPL-2.0
.. Copyright (C) 2023 Google LLC
=====================================================
inet_connection_sock struct fast path usage breakdown
=====================================================
Type Name fastpath_tx_access fastpath_rx_access comment
..struct ..inet_connection_sock
struct_inet_sock icsk_inet read_mostly read_mostly tcp_init_buffer_space,tcp_init_transfer,tcp_finish_connect,tcp_connect,tcp_send_rcvq,tcp_send_syn_data
struct_request_sock_queue icsk_accept_queue - -
struct_inet_bind_bucket icsk_bind_hash read_mostly - tcp_set_state
struct_inet_bind2_bucket icsk_bind2_hash read_mostly - tcp_set_state,inet_put_port
unsigned_long icsk_timeout read_mostly - inet_csk_reset_xmit_timer,tcp_connect
struct_timer_list icsk_retransmit_timer read_mostly - inet_csk_reset_xmit_timer,tcp_connect
struct_timer_list icsk_delack_timer read_mostly - inet_csk_reset_xmit_timer,tcp_connect
u32 icsk_rto read_write - tcp_cwnd_validate,tcp_schedule_loss_probe,tcp_connect_init,tcp_connect,tcp_write_xmit,tcp_push_one
u32 icsk_rto_min - -
u32 icsk_delack_max - -
u32 icsk_pmtu_cookie read_write - tcp_sync_mss,tcp_current_mss,tcp_send_syn_data,tcp_connect_init,tcp_connect
struct_tcp_congestion_ops icsk_ca_ops read_write - tcp_cwnd_validate,tcp_tso_segs,tcp_ca_dst_init,tcp_connect_init,tcp_connect,tcp_write_xmit
struct_inet_connection_sock_af_ops icsk_af_ops read_mostly - tcp_finish_connect,tcp_send_syn_data,tcp_mtup_init,tcp_mtu_check_reprobe,tcp_mtu_probe,tcp_connect_init,tcp_connect,__tcp_transmit_skb
struct_tcp_ulp_ops* icsk_ulp_ops - -
void* icsk_ulp_data - -
u8:5 icsk_ca_state read_write - tcp_cwnd_application_limited,tcp_set_ca_state,tcp_enter_cwr,tcp_tso_should_defer,tcp_mtu_probe,tcp_schedule_loss_probe,tcp_write_xmit,__tcp_transmit_skb
u8:1 icsk_ca_initialized read_write - tcp_init_transfer,tcp_init_congestion_control,tcp_init_transfer,tcp_finish_connect,tcp_connect
u8:1 icsk_ca_setsockopt - -
u8:1 icsk_ca_dst_locked write_mostly - tcp_ca_dst_init,tcp_connect_init,tcp_connect
u8 icsk_retransmits write_mostly - tcp_connect_init,tcp_connect
u8 icsk_pending read_write - inet_csk_reset_xmit_timer,tcp_connect,tcp_check_probe_timer,__tcp_push_pending_frames,tcp_rearm_rto,tcp_event_new_data_sent,tcp_event_new_data_sent
u8 icsk_backoff write_mostly - tcp_write_queue_purge,tcp_connect_init
u8 icsk_syn_retries - -
u8 icsk_probes_out - -
u16 icsk_ext_hdr_len read_mostly - __tcp_mtu_to_mss,tcp_mtu_to_rss,tcp_mtu_probe,tcp_write_xmit,tcp_mtu_to_mss,
struct_icsk_ack_u8 pending read_write read_write inet_csk_ack_scheduled,__tcp_cleanup_rbuf,tcp_cleanup_rbuf,inet_csk_clear_xmit_timer,tcp_event_ack-sent,inet_csk_reset_xmit_timer
struct_icsk_ack_u8 quick read_write write_mostly tcp_dec_quickack_mode,tcp_event_ack_sent,__tcp_transmit_skb,__tcp_select_window,__tcp_cleanup_rbuf
struct_icsk_ack_u8 pingpong - -
struct_icsk_ack_u8 retry write_mostly read_write inet_csk_clear_xmit_timer,tcp_rearm_rto,tcp_event_new_data_sent,tcp_write_xmit,__tcp_send_ack,tcp_send_ack,
struct_icsk_ack_u8 ato read_mostly write_mostly tcp_dec_quickack_mode,tcp_event_ack_sent,__tcp_transmit_skb,__tcp_send_ack,tcp_send_ack
struct_icsk_ack_unsigned_long timeout read_write read_write inet_csk_reset_xmit_timer,tcp_connect
struct_icsk_ack_u32 lrcvtime read_write - tcp_finish_connect,tcp_connect,tcp_event_data_sent,__tcp_transmit_skb
struct_icsk_ack_u16 rcv_mss write_mostly read_mostly __tcp_select_window,__tcp_cleanup_rbuf,tcp_initialize_rcv_mss,tcp_connect_init
struct_icsk_mtup_int search_high read_write - tcp_mtup_init,tcp_sync_mss,tcp_connect_init,tcp_mtu_check_reprobe,tcp_write_xmit
struct_icsk_mtup_int search_low read_write - tcp_mtu_probe,tcp_mtu_check_reprobe,tcp_write_xmit,tcp_sync_mss,tcp_connect_init,tcp_mtup_init
struct_icsk_mtup_u32:31 probe_size read_write - tcp_mtup_init,tcp_connect_init,__tcp_transmit_skb
struct_icsk_mtup_u32:1 enabled read_write - tcp_mtup_init,tcp_sync_mss,tcp_connect_init,tcp_mtu_probe,tcp_write_xmit
struct_icsk_mtup_u32 probe_timestamp read_write - tcp_mtup_init,tcp_connect_init,tcp_mtu_check_reprobe,tcp_mtu_probe
u32 icsk_probes_tstamp - -
u32 icsk_user_timeout - -
u64[104/sizeof(u64)] icsk_ca_priv - -
.. SPDX-License-Identifier: GPL-2.0
.. Copyright (C) 2023 Google LLC
=====================================================
inet_connection_sock struct fast path usage breakdown
=====================================================
Type Name fastpath_tx_access fastpath_rx_access comment
..struct ..inet_sock
struct_sock sk read_mostly read_mostly tcp_init_buffer_space,tcp_init_transfer,tcp_finish_connect,tcp_connect,tcp_send_rcvq,tcp_send_syn_data
struct_ipv6_pinfo* pinet6 - -
be16 inet_sport read_mostly - __tcp_transmit_skb
be32 inet_daddr read_mostly - ip_select_ident_segs
be32 inet_rcv_saddr - -
be16 inet_dport read_mostly - __tcp_transmit_skb
u16 inet_num - -
be32 inet_saddr - -
s16 uc_ttl read_mostly - __ip_queue_xmit/ip_select_ttl
u16 cmsg_flags - -
struct_ip_options_rcu* inet_opt read_mostly - __ip_queue_xmit
u16 inet_id read_mostly - ip_select_ident_segs
u8 tos read_mostly - ip_queue_xmit
u8 min_ttl - -
u8 mc_ttl - -
u8 pmtudisc - -
u8:1 recverr - -
u8:1 is_icsk - -
u8:1 freebind - -
u8:1 hdrincl - -
u8:1 mc_loop - -
u8:1 transparent - -
u8:1 mc_all - -
u8:1 nodefrag - -
u8:1 bind_address_no_port - -
u8:1 recverr_rfc4884 - -
u8:1 defer_connect read_mostly - tcp_sendmsg_fastopen
u8 rcv_tos - -
u8 convert_csum - -
int uc_index - -
int mc_index - -
be32 mc_addr - -
struct_ip_mc_socklist* mc_list - -
struct_inet_cork_full cork read_mostly - __tcp_transmit_skb
struct local_port_range - -
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -14986,6 +14986,7 @@ Q: https://patchwork.kernel.org/project/netdevbpf/list/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git
T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git
F: Documentation/devicetree/bindings/net/
F: Documentation/networking/net_cachelines/net_device.rst
F: drivers/connector/
F: drivers/net/
F: include/dt-bindings/net/
......@@ -15041,6 +15042,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git
F: Documentation/core-api/netlink.rst
F: Documentation/netlink/
F: Documentation/networking/
F: Documentation/networking/net_cachelines/
F: Documentation/process/maintainer-netdev.rst
F: Documentation/userspace-api/netlink/
F: include/linux/in.h
......@@ -15149,6 +15151,7 @@ NETWORKING [TCP]
M: Eric Dumazet <edumazet@google.com>
L: netdev@vger.kernel.org
S: Maintained
F: Documentation/networking/net_cachelines/tcp_sock.rst
F: include/linux/tcp.h
F: include/net/tcp.h
F: include/trace/events/tcp.h
......
......@@ -85,6 +85,31 @@
#define cache_line_size() L1_CACHE_BYTES
#endif
#ifndef __cacheline_group_begin
#define __cacheline_group_begin(GROUP) \
__u8 __cacheline_group_begin__##GROUP[0]
#endif
#ifndef __cacheline_group_end
#define __cacheline_group_end(GROUP) \
__u8 __cacheline_group_end__##GROUP[0]
#endif
#ifndef CACHELINE_ASSERT_GROUP_MEMBER
#define CACHELINE_ASSERT_GROUP_MEMBER(TYPE, GROUP, MEMBER) \
BUILD_BUG_ON(!(offsetof(TYPE, MEMBER) >= \
offsetofend(TYPE, __cacheline_group_begin__##GROUP) && \
offsetofend(TYPE, MEMBER) <= \
offsetof(TYPE, __cacheline_group_end__##GROUP)))
#endif
#ifndef CACHELINE_ASSERT_GROUP_SIZE
#define CACHELINE_ASSERT_GROUP_SIZE(TYPE, GROUP, SIZE) \
BUILD_BUG_ON(offsetof(TYPE, __cacheline_group_end__##GROUP) - \
offsetofend(TYPE, __cacheline_group_begin__##GROUP) > \
SIZE)
#endif
/*
* Helper to add padding within a struct to ensure data fall into separate
* cachelines.
......
......@@ -42,6 +42,38 @@ struct inet_timewait_death_row {
struct tcp_fastopen_context;
struct netns_ipv4 {
/* Cacheline organization can be found documented in
* Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst.
* Please update the document when adding new fields.
*/
/* TX readonly hotpath cache lines */
__cacheline_group_begin(netns_ipv4_read_tx);
u8 sysctl_tcp_early_retrans;
u8 sysctl_tcp_tso_win_divisor;
u8 sysctl_tcp_tso_rtt_log;
u8 sysctl_tcp_autocorking;
int sysctl_tcp_min_snd_mss;
unsigned int sysctl_tcp_notsent_lowat;
int sysctl_tcp_limit_output_bytes;
int sysctl_tcp_min_rtt_wlen;
int sysctl_tcp_wmem[3];
u8 sysctl_ip_fwd_use_pmtu;
__cacheline_group_end(netns_ipv4_read_tx);
/* TXRX readonly hotpath cache lines */
__cacheline_group_begin(netns_ipv4_read_txrx);
u8 sysctl_tcp_moderate_rcvbuf;
__cacheline_group_end(netns_ipv4_read_txrx);
/* RX readonly hotpath cache line */
__cacheline_group_begin(netns_ipv4_read_rx);
u8 sysctl_ip_early_demux;
u8 sysctl_tcp_early_demux;
int sysctl_tcp_reordering;
int sysctl_tcp_rmem[3];
__cacheline_group_end(netns_ipv4_read_rx);
struct inet_timewait_death_row tcp_death_row;
struct udp_table *udp_table;
......@@ -96,17 +128,14 @@ struct netns_ipv4 {
u8 sysctl_ip_default_ttl;
u8 sysctl_ip_no_pmtu_disc;
u8 sysctl_ip_fwd_use_pmtu;
u8 sysctl_ip_fwd_update_priority;
u8 sysctl_ip_nonlocal_bind;
u8 sysctl_ip_autobind_reuse;
/* Shall we try to damage output packets if routing dev changes? */
u8 sysctl_ip_dynaddr;
u8 sysctl_ip_early_demux;
#ifdef CONFIG_NET_L3_MASTER_DEV
u8 sysctl_raw_l3mdev_accept;
#endif
u8 sysctl_tcp_early_demux;
u8 sysctl_udp_early_demux;
u8 sysctl_nexthop_compat_mode;
......@@ -119,7 +148,6 @@ struct netns_ipv4 {
u8 sysctl_tcp_mtu_probing;
int sysctl_tcp_mtu_probe_floor;
int sysctl_tcp_base_mss;
int sysctl_tcp_min_snd_mss;
int sysctl_tcp_probe_threshold;
u32 sysctl_tcp_probe_interval;
......@@ -135,17 +163,14 @@ struct netns_ipv4 {
u8 sysctl_tcp_backlog_ack_defer;
u8 sysctl_tcp_pingpong_thresh;
int sysctl_tcp_reordering;
u8 sysctl_tcp_retries1;
u8 sysctl_tcp_retries2;
u8 sysctl_tcp_orphan_retries;
u8 sysctl_tcp_tw_reuse;
int sysctl_tcp_fin_timeout;
unsigned int sysctl_tcp_notsent_lowat;
u8 sysctl_tcp_sack;
u8 sysctl_tcp_window_scaling;
u8 sysctl_tcp_timestamps;
u8 sysctl_tcp_early_retrans;
u8 sysctl_tcp_recovery;
u8 sysctl_tcp_thin_linear_timeouts;
u8 sysctl_tcp_slow_start_after_idle;
......@@ -161,21 +186,13 @@ struct netns_ipv4 {
u8 sysctl_tcp_frto;
u8 sysctl_tcp_nometrics_save;
u8 sysctl_tcp_no_ssthresh_metrics_save;
u8 sysctl_tcp_moderate_rcvbuf;
u8 sysctl_tcp_tso_win_divisor;
u8 sysctl_tcp_workaround_signed_windows;
int sysctl_tcp_limit_output_bytes;
int sysctl_tcp_challenge_ack_limit;
int sysctl_tcp_min_rtt_wlen;
u8 sysctl_tcp_min_tso_segs;
u8 sysctl_tcp_tso_rtt_log;
u8 sysctl_tcp_autocorking;
u8 sysctl_tcp_reflect_tos;
int sysctl_tcp_invalid_ratelimit;
int sysctl_tcp_pacing_ss_ratio;
int sysctl_tcp_pacing_ca_ratio;
int sysctl_tcp_wmem[3];
int sysctl_tcp_rmem[3];
unsigned int sysctl_tcp_child_ehash_entries;
unsigned long sysctl_tcp_comp_sack_delay_ns;
unsigned long sysctl_tcp_comp_sack_slack_ns;
......
......@@ -1099,11 +1099,56 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid,
rtnl_set_sk_err(net, RTNLGRP_NSID, err);
}
#ifdef CONFIG_NET_NS
static void __init netns_ipv4_struct_check(void)
{
/* TX readonly hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
sysctl_tcp_early_retrans);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
sysctl_tcp_tso_win_divisor);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
sysctl_tcp_tso_rtt_log);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
sysctl_tcp_autocorking);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
sysctl_tcp_min_snd_mss);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
sysctl_tcp_notsent_lowat);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
sysctl_tcp_limit_output_bytes);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
sysctl_tcp_min_rtt_wlen);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
sysctl_tcp_wmem);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
sysctl_ip_fwd_use_pmtu);
CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_tx, 33);
/* TXRX readonly hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_txrx,
sysctl_tcp_moderate_rcvbuf);
CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_txrx, 1);
/* RX readonly hotpath cache line */
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
sysctl_ip_early_demux);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
sysctl_tcp_early_demux);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
sysctl_tcp_reordering);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
sysctl_tcp_rmem);
CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 18);
}
#endif
void __init net_ns_init(void)
{
struct net_generic *ng;
#ifdef CONFIG_NET_NS
netns_ipv4_struct_check();
net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
SMP_CACHE_BYTES,
SLAB_PANIC|SLAB_ACCOUNT, NULL);
......
......@@ -1592,6 +1592,11 @@ sub push_parameter($$$$$) {
$parameterdescs{$param} = "anonymous\n";
$anon_struct_union = 1;
}
elsif ($param =~ "__cacheline_group" )
# handle cache group enforcing variables: they do not need be described in header files
{
return; # ignore __cacheline_group_begin and __cacheline_group_end
}
# warn if parameter has no description
# (but ignore ones starting with # as these are not parameters
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment