Commit faf4cf74 authored by Jakub Kicinski

Merge branch 'reorganize-remaining-patch-of-networking-struct-cachelines'

Coco Li says:

====================
Reorganize remaining patch of networking struct cachelines

Rebase the patches from https://lwn.net/Articles/951321/ onto the current
top of tree, to ensure the reported cacheline savings are still accurate.
====================

Link: https://lore.kernel.org/r/20231204201232.520025-1-lixiaoyan@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 5aa00e9e d5fed5ad
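Both structs are carved into hotpath groups by __cacheline_group_begin()/__cacheline_group_end() markers, and the check functions added further down pin each group's layout with compile-time asserts. A rough sketch of how these helpers fit together (the actual definitions live in include/linux/cache.h; this is an approximation for orientation, not the authoritative source):

    /* Zero-size array markers: they delimit a group without occupying space. */
    #define __cacheline_group_begin(GROUP) \
            __u8 __cacheline_group_begin__##GROUP[0]
    #define __cacheline_group_end(GROUP) \
            __u8 __cacheline_group_end__##GROUP[0]

    /* Build-time check that MEMBER lies between the two markers. */
    #define CACHELINE_ASSERT_GROUP_MEMBER(TYPE, GROUP, MEMBER) \
            BUILD_BUG_ON(!(offsetof(TYPE, MEMBER) >= \
                           offsetofend(TYPE, __cacheline_group_begin__##GROUP) && \
                           offsetofend(TYPE, MEMBER) <= \
                           offsetof(TYPE, __cacheline_group_end__##GROUP)))

    /* Build-time check that the whole group spans at most SIZE bytes. */
    #define CACHELINE_ASSERT_GROUP_SIZE(TYPE, GROUP, SIZE) \
            BUILD_BUG_ON(offsetof(TYPE, __cacheline_group_end__##GROUP) - \
                         offsetofend(TYPE, __cacheline_group_begin__##GROUP) > \
                         SIZE)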
include/linux/netdevice.h

@@ -2097,6 +2097,70 @@ enum netdev_stat_type {
  */
 struct net_device {
+	/* Cacheline organization can be found documented in
+	 * Documentation/networking/net_cachelines/net_device.rst.
+	 * Please update the document when adding new fields.
+	 */
+
+	/* TX read-mostly hotpath */
+	__cacheline_group_begin(net_device_read_tx);
+	unsigned long long priv_flags;
+	const struct net_device_ops *netdev_ops;
+	const struct header_ops *header_ops;
+	struct netdev_queue *_tx;
+	unsigned int real_num_tx_queues;
+	unsigned int gso_max_size;
+	unsigned int gso_ipv4_max_size;
+	u16 gso_max_segs;
+	s16 num_tc;
+	/* Note : dev->mtu is often read without holding a lock.
+	 * Writers usually hold RTNL.
+	 * It is recommended to use READ_ONCE() to annotate the reads,
+	 * and to use WRITE_ONCE() to annotate the writes.
+	 */
+	unsigned int mtu;
+	unsigned short needed_headroom;
+	struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE];
+#ifdef CONFIG_XPS
+	struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX];
+#endif
+#ifdef CONFIG_NETFILTER_EGRESS
+	struct nf_hook_entries __rcu *nf_hooks_egress;
+#endif
+#ifdef CONFIG_NET_XGRESS
+	struct bpf_mprog_entry __rcu *tcx_egress;
+#endif
+	__cacheline_group_end(net_device_read_tx);
+
+	/* TXRX read-mostly hotpath */
+	__cacheline_group_begin(net_device_read_txrx);
+	unsigned int flags;
+	unsigned short hard_header_len;
+	netdev_features_t features;
+	struct inet6_dev __rcu *ip6_ptr;
+	__cacheline_group_end(net_device_read_txrx);
+
+	/* RX read-mostly hotpath */
+	__cacheline_group_begin(net_device_read_rx);
+	struct list_head ptype_specific;
+	int ifindex;
+	unsigned int real_num_rx_queues;
+	struct netdev_rx_queue *_rx;
+	unsigned long gro_flush_timeout;
+	int napi_defer_hard_irqs;
+	unsigned int gro_max_size;
+	unsigned int gro_ipv4_max_size;
+	rx_handler_func_t __rcu *rx_handler;
+	void __rcu *rx_handler_data;
+	possible_net_t nd_net;
+#ifdef CONFIG_NETPOLL
+	struct netpoll_info __rcu *npinfo;
+#endif
+#ifdef CONFIG_NET_XGRESS
+	struct bpf_mprog_entry __rcu *tcx_ingress;
+#endif
+	__cacheline_group_end(net_device_read_rx);
+
 	char name[IFNAMSIZ];
 	struct netdev_name_node *name_node;
 	struct dev_ifalias __rcu *ifalias;
@@ -2121,7 +2185,6 @@
 	struct list_head unreg_list;
 	struct list_head close_list;
 	struct list_head ptype_all;
-	struct list_head ptype_specific;
 
 	struct {
 		struct list_head upper;
@@ -2129,26 +2192,13 @@
 	} adj_list;
 
 	/* Read-mostly cache-line for fast-path access */
-	unsigned int flags;
 	xdp_features_t xdp_features;
-	unsigned long long priv_flags;
-	const struct net_device_ops *netdev_ops;
 	const struct xdp_metadata_ops *xdp_metadata_ops;
 	const struct xsk_tx_metadata_ops *xsk_tx_metadata_ops;
-	int ifindex;
 	unsigned short gflags;
-	unsigned short hard_header_len;
-
-	/* Note : dev->mtu is often read without holding a lock.
-	 * Writers usually hold RTNL.
-	 * It is recommended to use READ_ONCE() to annotate the reads,
-	 * and to use WRITE_ONCE() to annotate the writes.
-	 */
-	unsigned int mtu;
-	unsigned short needed_headroom;
 	unsigned short needed_tailroom;
-	netdev_features_t features;
 	netdev_features_t hw_features;
 	netdev_features_t wanted_features;
 	netdev_features_t vlan_features;
@@ -2192,8 +2242,6 @@
 	const struct tlsdev_ops *tlsdev_ops;
 #endif
-	const struct header_ops *header_ops;
 	unsigned char operstate;
 	unsigned char link_mode;
@@ -2234,9 +2282,7 @@
 	/* Protocol-specific pointers */
 	struct in_device __rcu *ip_ptr;
-	struct inet6_dev __rcu *ip6_ptr;
 #if IS_ENABLED(CONFIG_VLAN_8021Q)
 	struct vlan_info __rcu *vlan_info;
 #endif
@@ -2271,26 +2317,14 @@
 	/* Interface address info used in eth_type_trans() */
 	const unsigned char *dev_addr;
-	struct netdev_rx_queue *_rx;
 	unsigned int num_rx_queues;
-	unsigned int real_num_rx_queues;
 	struct bpf_prog __rcu *xdp_prog;
-	unsigned long gro_flush_timeout;
-	int napi_defer_hard_irqs;
 #define GRO_LEGACY_MAX_SIZE 65536u
 /* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE),
  * and shinfo->gso_segs is a 16bit field.
  */
 #define GRO_MAX_SIZE (8 * 65535u)
-	unsigned int gro_max_size;
-	unsigned int gro_ipv4_max_size;
 	unsigned int xdp_zc_max_segs;
-	rx_handler_func_t __rcu *rx_handler;
-	void __rcu *rx_handler_data;
-
-#ifdef CONFIG_NET_XGRESS
-	struct bpf_mprog_entry __rcu *tcx_ingress;
-#endif
 	struct netdev_queue __rcu *ingress_queue;
 #ifdef CONFIG_NETFILTER_INGRESS
 	struct nf_hook_entries __rcu *nf_hooks_ingress;
@@ -2305,25 +2339,13 @@
 /*
  * Cache lines mostly used on transmit path
  */
-	struct netdev_queue *_tx ____cacheline_aligned_in_smp;
 	unsigned int num_tx_queues;
-	unsigned int real_num_tx_queues;
 	struct Qdisc __rcu *qdisc;
 	unsigned int tx_queue_len;
 	spinlock_t tx_global_lock;
 
 	struct xdp_dev_bulk_queue __percpu *xdp_bulkq;
-
-#ifdef CONFIG_XPS
-	struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX];
-#endif
-#ifdef CONFIG_NET_XGRESS
-	struct bpf_mprog_entry __rcu *tcx_egress;
-#endif
-#ifdef CONFIG_NETFILTER_EGRESS
-	struct nf_hook_entries __rcu *nf_hooks_egress;
-#endif
-
 #ifdef CONFIG_NET_SCHED
 	DECLARE_HASHTABLE (qdisc_hash, 4);
 #endif
@@ -2362,12 +2384,6 @@
 	bool needs_free_netdev;
 	void (*priv_destructor)(struct net_device *dev);
 
-#ifdef CONFIG_NETPOLL
-	struct netpoll_info __rcu *npinfo;
-#endif
-
-	possible_net_t nd_net;
-
 	/* mid-layer private */
 	void *ml_priv;
 	enum netdev_ml_priv_type ml_priv_type;
@@ -2402,20 +2418,15 @@
  */
 #define GSO_MAX_SIZE (8 * GSO_MAX_SEGS)
 
-	unsigned int gso_max_size;
 #define TSO_LEGACY_MAX_SIZE 65536
 #define TSO_MAX_SIZE UINT_MAX
 	unsigned int tso_max_size;
-	u16 gso_max_segs;
 #define TSO_MAX_SEGS U16_MAX
 	u16 tso_max_segs;
-	unsigned int gso_ipv4_max_size;
 
 #ifdef CONFIG_DCB
 	const struct dcbnl_rtnl_ops *dcbnl_ops;
 #endif
-	s16 num_tc;
-	struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE];
 	u8 prio_tc_map[TC_BITMASK + 1];
 
 #if IS_ENABLED(CONFIG_FCOE)
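The note kept next to mtu in the TX group above describes the usual lockless-field convention: writers update the field under RTNL while readers may run with no lock at all. A minimal sketch of the recommended annotation (the two function names are made up for illustration):

    #include <linux/netdevice.h>
    #include <linux/rtnetlink.h>

    /* Lockless reader, e.g. on a transmit path; pairs with the WRITE_ONCE() below. */
    static unsigned int example_read_mtu(const struct net_device *dev)
    {
            return READ_ONCE(dev->mtu);
    }

    /* Writer side, called with RTNL held. */
    static void example_set_mtu(struct net_device *dev, unsigned int new_mtu)
    {
            ASSERT_RTNL();
            WRITE_ONCE(dev->mtu, new_mtu);
    }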
include/linux/tcp.h

@@ -194,23 +194,121 @@ static inline bool tcp_rsk_used_ao(const struct request_sock *req)
 #define TCP_RMEM_TO_WIN_SCALE 8
 
 struct tcp_sock {
+	/* Cacheline organization can be found documented in
+	 * Documentation/networking/net_cachelines/tcp_sock.rst.
+	 * Please update the document when adding new fields.
+	 */
+
 	/* inet_connection_sock has to be the first member of tcp_sock */
 	struct inet_connection_sock inet_conn;
-	u16 tcp_header_len; /* Bytes of tcp header to send */
+
+	/* TX read-mostly hotpath cache lines */
+	__cacheline_group_begin(tcp_sock_read_tx);
+	/* timestamp of last sent data packet (for restart window) */
+	u32 max_window;    /* Maximal window ever seen from peer */
+	u32 rcv_ssthresh;  /* Current window clamp */
+	u32 reordering;    /* Packet reordering metric. */
+	u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */
 	u16 gso_segs;      /* Max number of segs per GSO packet */
+	/* from STCP, retrans queue hinting */
+	struct sk_buff *lost_skb_hint;
+	struct sk_buff *retransmit_skb_hint;
+	__cacheline_group_end(tcp_sock_read_tx);
+
+	/* TXRX read-mostly hotpath cache lines */
+	__cacheline_group_begin(tcp_sock_read_txrx);
+	u32 tsoffset;       /* timestamp offset */
+	u32 snd_wnd;        /* The window we expect to receive */
+	u32 mss_cache;      /* Cached effective mss, not including SACKS */
+	u32 snd_cwnd;       /* Sending congestion window */
+	u32 prr_out;        /* Total number of pkts sent during Recovery. */
+	u32 lost_out;       /* Lost packets */
+	u32 sacked_out;     /* SACK'd packets */
+	u16 tcp_header_len; /* Bytes of tcp header to send */
+	u8  chrono_type : 2,   /* current chronograph type */
+	    repair      : 1,
+	    is_sack_reneg:1,   /* in recovery from loss with SACK reneg? */
+	    is_cwnd_limited:1; /* forward progress limited by snd_cwnd? */
+	__cacheline_group_end(tcp_sock_read_txrx);
+
+	/* RX read-mostly hotpath cache lines */
+	__cacheline_group_begin(tcp_sock_read_rx);
+	u32 copied_seq;   /* Head of yet unread data */
+	u32 rcv_tstamp;   /* timestamp of last received ACK (for keepalives) */
+	u32 snd_wl1;      /* Sequence for window update */
+	u32 tlp_high_seq; /* snd_nxt at the time of TLP */
+	u32 rttvar_us;    /* smoothed mdev_max */
+	u32 retrans_out;  /* Retransmitted packets out */
+	u16 advmss;       /* Advertised MSS */
+	u16 urg_data;     /* Saved octet of OOB data and control flags */
+	u32 lost;         /* Total data packets lost incl. rexmits */
+	struct minmax rtt_min;
+	/* OOO segments go in this rbtree. Socket lock must be held. */
+	struct rb_root out_of_order_queue;
+	u32 snd_ssthresh; /* Slow start size threshold */
+	__cacheline_group_end(tcp_sock_read_rx);
+
+	/* TX read-write hotpath cache lines */
+	__cacheline_group_begin(tcp_sock_write_tx) ____cacheline_aligned;
+	u32 segs_out;       /* RFC4898 tcpEStatsPerfSegsOut
+			     * The total number of segments sent.
+			     */
+	u32 data_segs_out;  /* RFC4898 tcpEStatsPerfDataSegsOut
+			     * total number of data segments sent.
+			     */
+	u64 bytes_sent;     /* RFC4898 tcpEStatsPerfHCDataOctetsOut
+			     * total number of data bytes sent.
+			     */
+	u32 snd_sml;        /* Last byte of the most recently transmitted small packet */
+	u32 chrono_start;   /* Start time in jiffies of a TCP chrono */
+	u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */
+	u32 write_seq;      /* Tail(+1) of data held in tcp send buffer */
+	u32 pushed_seq;     /* Last pushed seq, required to talk to windows */
+	u32 lsndtime;
+	u32 mdev_us;        /* medium deviation */
+	u64 tcp_wstamp_ns;  /* departure time for next sent data packet */
+	u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
+	u64 tcp_mstamp;     /* most recent packet received/sent */
+	u32 rtt_seq;        /* sequence number to update rttvar */
+	struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
+	struct sk_buff *highest_sack; /* skb just after the highest
+				       * skb with SACKed bit set
+				       * (validity guaranteed only if
+				       * sacked_out > 0)
+				       */
+	u8  ecn_flags;      /* ECN status bits. */
+	__cacheline_group_end(tcp_sock_write_tx);
+
+	/* TXRX read-write hotpath cache lines */
+	__cacheline_group_begin(tcp_sock_write_txrx);
 /*
  * Header prediction flags
  * 0x5?10 << 16 + snd_wnd in net byte order
  */
 	__be32 pred_flags;
+	u32 rcv_nxt;      /* What we want to receive next */
+	u32 snd_nxt;      /* Next sequence we send */
+	u32 snd_una;      /* First byte we want an ack for */
+	u32 window_clamp; /* Maximal window to advertise */
+	u32 srtt_us;      /* smoothed round trip time << 3 in usecs */
+	u32 packets_out;  /* Packets which are "in flight" */
+	u32 snd_up;       /* Urgent pointer */
+	u32 delivered;    /* Total data packets delivered incl. rexmits */
+	u32 delivered_ce; /* Like the above but only ECE marked packets */
+	u32 app_limited;  /* limited until "delivered" reaches this val */
+	u32 rcv_wnd;      /* Current receiver window */
 /*
- * RFC793 variables by their proper names. This means you can
- * read the code and the spec side by side (and laugh ...)
- * See RFC793 and RFC1122. The RFC writes these in capitals.
+ * Options received (usually on last packet, some only on SYN packets).
  */
-	u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived
+	struct tcp_options_received rx_opt;
+	u8  nonagle : 4,        /* Disable Nagle algorithm? */
+	    rate_app_limited:1; /* rate_{delivered,interval_us} limited? */
+	__cacheline_group_end(tcp_sock_write_txrx);
+
+	/* RX read-write hotpath cache lines */
+	__cacheline_group_begin(tcp_sock_write_rx);
+	u64 bytes_received;
+			/* RFC4898 tcpEStatsAppHCThruOctetsReceived
 			 * sum(delta(rcv_nxt)), or how many bytes
 			 * were acked.
 			 */
@@ -220,45 +318,44 @@ struct tcp_sock {
 	u32 data_segs_in;  /* RFC4898 tcpEStatsPerfDataSegsIn
			    * total number of data segments in.
			    */
-	u32 rcv_nxt;       /* What we want to receive next */
-	u32 copied_seq;    /* Head of yet unread data */
 	u32 rcv_wup;       /* rcv_nxt on last window update sent */
-	u32 snd_nxt;       /* Next sequence we send */
-	u32 segs_out;      /* RFC4898 tcpEStatsPerfSegsOut
-			    * The total number of segments sent.
-			    */
-	u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut
-			    * total number of data segments sent.
-			    */
-	u64 bytes_sent;    /* RFC4898 tcpEStatsPerfHCDataOctetsOut
-			    * total number of data bytes sent.
-			    */
+	u32 max_packets_out;  /* max packets_out in last window */
+	u32 cwnd_usage_seq;   /* right edge of cwnd usage tracking flight */
+	u32 rate_delivered;   /* saved rate sample: packets delivered */
+	u32 rate_interval_us; /* saved rate sample: time elapsed */
+	u32 rcv_rtt_last_tsecr;
+	u64 first_tx_mstamp;  /* start of window send phase */
+	u64 delivered_mstamp; /* time we reached "delivered" */
 	u64 bytes_acked;   /* RFC4898 tcpEStatsAppHCThruOctetsAcked
			    * sum(delta(snd_una)), or how many bytes
			    * were acked.
			    */
+	struct {
+		u32 rtt_us;
+		u32 seq;
+		u64 time;
+	} rcv_rtt_est;
+/* Receiver queue space */
+	struct {
+		u32 space;
+		u32 seq;
+		u64 time;
+	} rcvq_space;
+	__cacheline_group_end(tcp_sock_write_rx);
+
+	/* End of Hot Path */
+
+/*
+ * RFC793 variables by their proper names. This means you can
+ * read the code and the spec side by side (and laugh ...)
+ * See RFC793 and RFC1122. The RFC writes these in capitals.
+ */
 	u32 dsack_dups;    /* RFC4898 tcpEStatsStackDSACKDups
			    * total number of DSACK blocks received
			    */
-	u32 snd_una;       /* First byte we want an ack for */
-	u32 snd_sml;       /* Last byte of the most recently transmitted small packet */
-	u32 rcv_tstamp;    /* timestamp of last received ACK (for keepalives) */
-	u32 lsndtime;      /* timestamp of last sent data packet (for restart window) */
 	u32 last_oow_ack_time; /* timestamp of last out-of-window ACK */
 	u32 compressed_ack_rcv_nxt;
-	u32 tsoffset;      /* timestamp offset */
 
 	struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
-	struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
-
-	u32 snd_wl1;       /* Sequence for window update */
-	u32 snd_wnd;       /* The window we expect to receive */
-	u32 max_window;    /* Maximal window ever seen from peer */
-	u32 mss_cache;     /* Cached effective mss, not including SACKS */
-	u32 window_clamp;  /* Maximal window to advertise */
-	u32 rcv_ssthresh;  /* Current window clamp */
 	u8  scaling_ratio; /* see tcp_win_from_space() */
 	/* Information of the most recently (s)acked skb */
 	struct tcp_rack {
@@ -272,24 +369,16 @@
 		dsack_seen:1, /* Whether DSACK seen after last adj */
 		advanced:1;   /* mstamp advanced since last lost marking */
 	} rack;
-	u16 advmss;         /* Advertised MSS */
 	u8  compressed_ack;
 	u8  dup_ack_counter:2,
	    tlp_retrans:1,  /* TLP is a retransmission */
	    tcp_usec_ts:1,  /* TSval values in usec */
	    unused:4;
-	u32 chrono_start;   /* Start time in jiffies of a TCP chrono */
-	u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */
-	u8  chrono_type:2,  /* current chronograph type */
-	    rate_app_limited:1, /* rate_{delivered,interval_us} limited? */
+	u8  thin_lto    : 1,/* Use linear timeouts for thin streams */
+	    recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */
	    fastopen_connect:1,   /* FASTOPEN_CONNECT sockopt */
	    fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
-	    is_sack_reneg:1,        /* in recovery from loss with SACK reneg? */
-	    fastopen_client_fail:2; /* reason why fastopen failed */
-	u8  nonagle     : 4,/* Disable Nagle algorithm? */
-	    thin_lto    : 1,/* Use linear timeouts for thin streams */
-	    recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */
-	    repair      : 1,
+	    fastopen_client_fail:2, /* reason why fastopen failed */
	    frto        : 1;/* F-RTO (RFC5682) activated in CA_Loss */
 	u8  repair_queue;
 	u8  save_syn:2,     /* Save headers of SYN packet */
@@ -297,45 +386,19 @@
	    syn_fastopen:1,     /* SYN includes Fast Open option */
	    syn_fastopen_exp:1, /* SYN includes Fast Open exp. option */
	    syn_fastopen_ch:1,  /* Active TFO re-enabling probe */
-	    syn_data_acked:1,   /* data in SYN is acked by SYN-ACK */
-	    is_cwnd_limited:1;  /* forward progress limited by snd_cwnd? */
+	    syn_data_acked:1;   /* data in SYN is acked by SYN-ACK */
 
-	u32 tlp_high_seq;   /* snd_nxt at the time of TLP */
 	u32 tcp_tx_delay;   /* delay (in usec) added to TX packets */
-	u64 tcp_wstamp_ns;  /* departure time for next sent data packet */
-	u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
 
 /* RTT measurement */
-	u64 tcp_mstamp;     /* most recent packet received/sent */
-	u32 srtt_us;        /* smoothed round trip time << 3 in usecs */
-	u32 mdev_us;        /* medium deviation */
 	u32 mdev_max_us;    /* maximal mdev for the last rtt period */
-	u32 rttvar_us;      /* smoothed mdev_max */
-	u32 rtt_seq;        /* sequence number to update rttvar */
-	struct minmax rtt_min;
 
-	u32 packets_out;    /* Packets which are "in flight" */
-	u32 retrans_out;    /* Retransmitted packets out */
-	u32 max_packets_out; /* max packets_out in last window */
-	u32 cwnd_usage_seq; /* right edge of cwnd usage tracking flight */
-	u16 urg_data;       /* Saved octet of OOB data and control flags */
-	u8  ecn_flags;      /* ECN status bits. */
 	u8  keepalive_probes; /* num of allowed keep alive probes */
-	u32 reordering;     /* Packet reordering metric. */
 	u32 reord_seen;     /* number of data packet reordering events */
-	u32 snd_up;         /* Urgent pointer */
-
-/*
- * Options received (usually on last packet, some only on SYN packets).
- */
-	struct tcp_options_received rx_opt;
 
 /*
  * Slow start and congestion control (see also Nagle, and Karn & Partridge)
  */
-	u32 snd_ssthresh;   /* Slow start size threshold */
-	u32 snd_cwnd;       /* Sending congestion window */
 	u32 snd_cwnd_cnt;   /* Linear increase counter */
 	u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
 	u32 snd_cwnd_used;
@@ -343,32 +406,10 @@
 	u32 prior_cwnd;    /* cwnd right before starting loss recovery */
 	u32 prr_delivered; /* Number of newly delivered packets to
			    * receiver in Recovery. */
-	u32 prr_out;       /* Total number of pkts sent during Recovery. */
-	u32 delivered;     /* Total data packets delivered incl. rexmits */
-	u32 delivered_ce;  /* Like the above but only ECE marked packets */
-	u32 lost;          /* Total data packets lost incl. rexmits */
-	u32 app_limited;   /* limited until "delivered" reaches this val */
-	u64 first_tx_mstamp;  /* start of window send phase */
-	u64 delivered_mstamp; /* time we reached "delivered" */
-	u32 rate_delivered;   /* saved rate sample: packets delivered */
-	u32 rate_interval_us; /* saved rate sample: time elapsed */
-	u32 rcv_wnd;       /* Current receiver window */
-	u32 write_seq;     /* Tail(+1) of data held in tcp send buffer */
-	u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */
-	u32 pushed_seq;    /* Last pushed seq, required to talk to windows */
-	u32 lost_out;      /* Lost packets */
-	u32 sacked_out;    /* SACK'd packets */
 
 	struct hrtimer pacing_timer;
 	struct hrtimer compressed_ack_timer;
 
-	/* from STCP, retrans queue hinting */
-	struct sk_buff *lost_skb_hint;
-	struct sk_buff *retransmit_skb_hint;
-
-	/* OOO segments go in this rbtree. Socket lock must be held. */
-	struct rb_root out_of_order_queue;
-
 	struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */
 
 	/* SACKs data, these 2 need to be together (see tcp_options_write) */
@@ -377,12 +418,6 @@
 	struct tcp_sack_block recv_sack_cache[4];
 
-	struct sk_buff *highest_sack; /* skb just after the highest
-				       * skb with SACKed bit set
-				       * (validity guaranteed only if
-				       * sacked_out > 0)
-				       */
-
 	int lost_cnt_hint;
 
 	u32 prior_ssthresh; /* ssthresh saved at recovery start */
@@ -433,21 +468,6 @@
 	u32 rcv_ooopack; /* Received out-of-order packets, for tcpinfo */
 
-/* Receiver side RTT estimation */
-	u32 rcv_rtt_last_tsecr;
-	struct {
-		u32 rtt_us;
-		u32 seq;
-		u64 time;
-	} rcv_rtt_est;
-
-/* Receiver queue space */
-	struct {
-		u32 space;
-		u32 seq;
-		u64 time;
-	} rcvq_space;
-
 /* TCP-specific MTU probe information. */
 	struct {
 		u32 probe_seq_start;
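The byte counts asserted in tcp_struct_check() below (for example, 40 bytes for tcp_sock_read_tx) follow from the member sizes plus alignment padding. A hypothetical userspace sketch of the same arithmetic, using a toy struct that mimics the marker layout (all names are illustrative; the zero-length arrays are the GNU C extension the kernel itself uses for these markers):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct toy_sock {
            unsigned char begin_read_tx[0]; /* stands in for __cacheline_group_begin() */
            uint32_t max_window;
            uint32_t rcv_ssthresh;
            uint32_t reordering;
            uint32_t notsent_lowat;
            uint16_t gso_segs;              /* 6 bytes of padding follow, for pointer alignment */
            void *lost_skb_hint;
            void *retransmit_skb_hint;
            unsigned char end_read_tx[0];   /* stands in for __cacheline_group_end() */
    };

    int main(void)
    {
            /* 4*4 + 2 + 6 (padding) + 2*8 = 40 on LP64, matching the assert below. */
            printf("tcp_sock_read_tx spans %zu bytes\n",
                   offsetof(struct toy_sock, end_read_tx) -
                   offsetof(struct toy_sock, begin_read_tx));
            return 0;
    }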
net/core/dev.c

@@ -11609,6 +11609,60 @@ static struct pernet_operations __net_initdata default_device_ops = {
 	.exit_batch = default_device_exit_batch,
 };
 
+static void __init net_dev_struct_check(void)
+{
+	/* TX read-mostly hotpath */
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq);
+#ifdef CONFIG_XPS
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps);
+#endif
+#ifdef CONFIG_NETFILTER_EGRESS
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress);
+#endif
+#ifdef CONFIG_NET_XGRESS
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress);
+#endif
+	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 152);
+
+	/* TXRX read-mostly hotpath */
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr);
+	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 30);
+
+	/* RX read-mostly hotpath */
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net);
+#ifdef CONFIG_NETPOLL
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo);
+#endif
+#ifdef CONFIG_NET_XGRESS
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
+#endif
+	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 96);
+}
+
 /*
  * Initialize the DEV module. At boot time this walks the device list and
  * unhooks any devices that fail to initialise (normally hardware not
@@ -11626,6 +11680,8 @@ static int __init net_dev_init(void)
 	BUG_ON(!dev_boot_phase);
 
+	net_dev_struct_check();
+
 	if (dev_proc_init())
 		goto out;
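Note that net_dev_struct_check() costs nothing at runtime: the asserts are BUILD_BUG_ON()-based, so a violation breaks the build rather than the boot, and the call from net_dev_init() mainly guarantees the function (and with it the asserts) is not discarded as unused. When a field joins one of the groups, the rst document referenced in the struct comment, the member asserts, and the measured group size all need updating together. Hypothetically, adding a new TX-hotpath field tx_foo (a made-up name) would extend the checks along these lines:

    /* Sketch only: tx_foo does not exist; the size constant must be re-measured. */
    CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tx_foo);
    CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160);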
net/ipv4/tcp.c

@@ -4564,6 +4564,97 @@ static void __init tcp_init_mem(void)
 	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;	/* 9.37 % */
 }
 
+static void __init tcp_struct_check(void)
+{
+	/* TX read-mostly hotpath cache lines */
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, max_window);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, rcv_ssthresh);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, reordering);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, lost_skb_hint);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint);
+	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 40);
+
+	/* TXRX read-mostly hotpath cache lines */
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_wnd);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, mss_cache);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_cwnd);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, prr_out);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, lost_out);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, sacked_out);
+	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_txrx, 31);
+
+	/* RX read-mostly hotpath cache lines */
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, copied_seq);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rcv_tstamp);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_wl1);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tlp_high_seq);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rttvar_us);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, retrans_out);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, advmss);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, urg_data);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, lost);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh);
+	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 69);
+
+	/* TX read-write hotpath cache lines */
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, data_segs_out);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, bytes_sent);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, snd_sml);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_start);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_stat);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, write_seq);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, pushed_seq);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_clock_cache);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_mstamp);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags);
+	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 113);
+
+	/* TXRX read-write hotpath cache lines */
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_nxt);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_nxt);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_una);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, window_clamp);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, srtt_us);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, packets_out);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt);
+	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 76);
+
+	/* RX read-write hotpath cache lines */
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, segs_in);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, data_segs_in);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_wup);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, max_packets_out);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, cwnd_usage_seq);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space);
+	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 99);
+}
+
 void __init tcp_init(void)
 {
 	int max_rshare, max_wshare, cnt;
 
@@ -4574,6 +4665,8 @@
 	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
 		     sizeof_field(struct sk_buff, cb));
 
+	tcp_struct_check();
+
 	percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
 
 	timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);