Commit 0a68a20c authored by David S. Miller's avatar David S. Miller

Merge branch 'dccp' of git://eden-feed.erg.abdn.ac.uk/dccp_exp

Conflicts:

	net/dccp/input.c
	net/dccp/options.c
parents 17dce5df a3cbdde8
......@@ -45,6 +45,25 @@ http://linux-net.osdl.org/index.php/DCCP_Testing#Experimental_DCCP_source_tree
Socket options
==============
DCCP_SOCKOPT_QPOLICY_ID sets the dequeuing policy for outgoing packets. It takes
a policy ID as argument and can only be set before the connection (i.e. changes
during an established connection are not supported). Currently, two policies are
defined: the "simple" policy (DCCPQ_POLICY_SIMPLE), which does nothing special,
and a priority-based variant (DCCPQ_POLICY_PRIO). The latter allows to pass an
u32 priority value as ancillary data to sendmsg(), where higher numbers indicate
a higher packet priority (similar to SO_PRIORITY). This ancillary data needs to
be formatted using a cmsg(3) message header filled in as follows:
cmsg->cmsg_level = SOL_DCCP;
cmsg->cmsg_type = DCCP_SCM_PRIORITY;
cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t)); /* or CMSG_LEN(4) */
DCCP_SOCKOPT_QPOLICY_TXQLEN sets the maximum length of the output queue. A zero
value is always interpreted as unbounded queue length. If different from zero,
the interpretation of this parameter depends on the current dequeuing policy
(see above): the "simple" policy will enforce a fixed queue size by returning
EAGAIN, whereas the "prio" policy enforces a fixed queue length by dropping the
lowest-priority packet first. The default value for this parameter is
initialised from /proc/sys/net/dccp/default/tx_qlen.
DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of
service codes (RFC 4340, sec. 8.1.2); if this socket option is not set,
......@@ -57,6 +76,24 @@ can be set before calling bind().
DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet
size (application payload size) in bytes, see RFC 4340, section 14.
DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs
supported by the endpoint (see include/linux/dccp.h for symbolic constants).
The caller needs to provide a sufficiently large (> 2) array of type uint8_t.
DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same
time, combining the operation of the next two socket options. This option is
preferrable over the latter two, since often applications will use the same
type of CCID for both directions; and mixed use of CCIDs is not currently well
understood. This socket option takes as argument at least one uint8_t value, or
an array of uint8_t values, which must match available CCIDS (see above). CCIDs
must be registered on the socket before calling connect() or listen().
DCCP_SOCKOPT_TX_CCID is read/write. It returns the current CCID (if set) or sets
the preference list for the TX CCID, using the same format as DCCP_SOCKOPT_CCID.
Please note that the getsockopt argument type here is `int', not uint8_t.
DCCP_SOCKOPT_RX_CCID is analogous to DCCP_SOCKOPT_TX_CCID, but for the RX CCID.
DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold
timewait state when closing the connection (RFC 4340, 8.3). The usual case is
that the closing server sends a CloseReq, whereupon the client holds timewait
......@@ -115,23 +152,16 @@ retries2
importance for retransmitted acknowledgments and feature negotiation,
data packets are never retransmitted. Analogue of tcp_retries2.
send_ndp = 1
Whether or not to send NDP count options (sec. 7.7.2).
send_ackvec = 1
Whether or not to send Ack Vector options (sec. 11.5).
ack_ratio = 2
The default Ack Ratio (sec. 11.3) to use.
tx_ccid = 2
Default CCID for the sender-receiver half-connection.
Default CCID for the sender-receiver half-connection. Depending on the
choice of CCID, the Send Ack Vector feature is enabled automatically.
rx_ccid = 2
Default CCID for the receiver-sender half-connection.
Default CCID for the receiver-sender half-connection; see tx_ccid.
seq_window = 100
The initial sequence window (sec. 7.5.2).
The initial sequence window (sec. 7.5.2) of the sender. This influences
the local ackno validity and the remote seqno validity windows (7.5.1).
tx_qlen = 5
The size of the transmit buffer in packets. A value of 0 corresponds
......
......@@ -165,9 +165,13 @@ enum {
DCCPO_TIMESTAMP_ECHO = 42,
DCCPO_ELAPSED_TIME = 43,
DCCPO_MAX = 45,
DCCPO_MIN_CCID_SPECIFIC = 128,
DCCPO_MAX_CCID_SPECIFIC = 255,
DCCPO_MIN_RX_CCID_SPECIFIC = 128, /* from sender to receiver */
DCCPO_MAX_RX_CCID_SPECIFIC = 191,
DCCPO_MIN_TX_CCID_SPECIFIC = 192, /* from receiver to sender */
DCCPO_MAX_TX_CCID_SPECIFIC = 255,
};
/* maximum size of a single TLV-encoded DCCP option (sans type/len bytes) */
#define DCCP_SINGLE_OPT_MAXLEN 253
/* DCCP CCIDS */
enum {
......@@ -176,27 +180,36 @@ enum {
};
/* DCCP features (RFC 4340 section 6.4) */
enum {
enum dccp_feature_numbers {
DCCPF_RESERVED = 0,
DCCPF_CCID = 1,
DCCPF_SHORT_SEQNOS = 2, /* XXX: not yet implemented */
DCCPF_SHORT_SEQNOS = 2,
DCCPF_SEQUENCE_WINDOW = 3,
DCCPF_ECN_INCAPABLE = 4, /* XXX: not yet implemented */
DCCPF_ECN_INCAPABLE = 4,
DCCPF_ACK_RATIO = 5,
DCCPF_SEND_ACK_VECTOR = 6,
DCCPF_SEND_NDP_COUNT = 7,
DCCPF_MIN_CSUM_COVER = 8,
DCCPF_DATA_CHECKSUM = 9, /* XXX: not yet implemented */
DCCPF_DATA_CHECKSUM = 9,
/* 10-127 reserved */
DCCPF_MIN_CCID_SPECIFIC = 128,
DCCPF_SEND_LEV_RATE = 192, /* RFC 4342, sec. 8.4 */
DCCPF_MAX_CCID_SPECIFIC = 255,
};
/* this structure is argument to DCCP_SOCKOPT_CHANGE_X */
struct dccp_so_feat {
__u8 dccpsf_feat;
__u8 __user *dccpsf_val;
__u8 dccpsf_len;
/* DCCP socket control message types for cmsg */
enum dccp_cmsg_type {
DCCP_SCM_PRIORITY = 1,
DCCP_SCM_QPOLICY_MAX = 0xFFFF,
/* ^-- Up to here reserved exclusively for qpolicy parameters */
DCCP_SCM_MAX
};
/* DCCP priorities for outgoing/queued packets */
enum dccp_packet_dequeueing_policy {
DCCPQ_POLICY_SIMPLE,
DCCPQ_POLICY_PRIO,
DCCPQ_POLICY_MAX
};
/* DCCP socket options */
......@@ -208,6 +221,12 @@ struct dccp_so_feat {
#define DCCP_SOCKOPT_SERVER_TIMEWAIT 6
#define DCCP_SOCKOPT_SEND_CSCOV 10
#define DCCP_SOCKOPT_RECV_CSCOV 11
#define DCCP_SOCKOPT_AVAILABLE_CCIDS 12
#define DCCP_SOCKOPT_CCID 13
#define DCCP_SOCKOPT_TX_CCID 14
#define DCCP_SOCKOPT_RX_CCID 15
#define DCCP_SOCKOPT_QPOLICY_ID 16
#define DCCP_SOCKOPT_QPOLICY_TXQLEN 17
#define DCCP_SOCKOPT_CCID_RX_INFO 128
#define DCCP_SOCKOPT_CCID_TX_INFO 192
......@@ -355,62 +374,13 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb)
return __dccp_hdr_len(dccp_hdr(skb));
}
/* initial values for each feature */
#define DCCPF_INITIAL_SEQUENCE_WINDOW 100
#define DCCPF_INITIAL_ACK_RATIO 2
#define DCCPF_INITIAL_CCID DCCPC_CCID2
#define DCCPF_INITIAL_SEND_ACK_VECTOR 1
/* FIXME: for now we're default to 1 but it should really be 0 */
#define DCCPF_INITIAL_SEND_NDP_COUNT 1
/**
* struct dccp_minisock - Minimal DCCP connection representation
*
* Will be used to pass the state from dccp_request_sock to dccp_sock.
*
* @dccpms_sequence_window - Sequence Window Feature (section 7.5.2)
* @dccpms_ccid - Congestion Control Id (CCID) (section 10)
* @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5)
* @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2)
* @dccpms_ack_ratio - Ack Ratio Feature (section 11.3)
* @dccpms_pending - List of features being negotiated
* @dccpms_conf -
*/
struct dccp_minisock {
__u64 dccpms_sequence_window;
__u8 dccpms_rx_ccid;
__u8 dccpms_tx_ccid;
__u8 dccpms_send_ack_vector;
__u8 dccpms_send_ndp_count;
__u8 dccpms_ack_ratio;
struct list_head dccpms_pending;
struct list_head dccpms_conf;
};
struct dccp_opt_conf {
__u8 *dccpoc_val;
__u8 dccpoc_len;
};
struct dccp_opt_pend {
struct list_head dccpop_node;
__u8 dccpop_type;
__u8 dccpop_feat;
__u8 *dccpop_val;
__u8 dccpop_len;
int dccpop_conf;
struct dccp_opt_conf *dccpop_sc;
};
extern void dccp_minisock_init(struct dccp_minisock *dmsk);
/**
* struct dccp_request_sock - represent DCCP-specific connection request
* @dreq_inet_rsk: structure inherited from
* @dreq_iss: initial sequence number sent on the Response (RFC 4340, 7.1)
* @dreq_isr: initial sequence number received on the Request
* @dreq_service: service code present on the Request (there is just one)
* @dreq_featneg: feature negotiation options for this connection
* The following two fields are analogous to the ones in dccp_sock:
* @dreq_timestamp_echo: last received timestamp to echo (13.1)
* @dreq_timestamp_echo: the time of receiving the last @dreq_timestamp_echo
......@@ -420,6 +390,7 @@ struct dccp_request_sock {
__u64 dreq_iss;
__u64 dreq_isr;
__be32 dreq_service;
struct list_head dreq_featneg;
__u32 dreq_timestamp_echo;
__u32 dreq_timestamp_time;
};
......@@ -491,21 +462,28 @@ struct dccp_ackvec;
* @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo
* @dccps_l_ack_ratio - feature-local Ack Ratio
* @dccps_r_ack_ratio - feature-remote Ack Ratio
* @dccps_l_seq_win - local Sequence Window (influences ack number validity)
* @dccps_r_seq_win - remote Sequence Window (influences seq number validity)
* @dccps_pcslen - sender partial checksum coverage (via sockopt)
* @dccps_pcrlen - receiver partial checksum coverage (via sockopt)
* @dccps_send_ndp_count - local Send NDP Count feature (7.7.2)
* @dccps_ndp_count - number of Non Data Packets since last data packet
* @dccps_mss_cache - current value of MSS (path MTU minus header sizes)
* @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4)
* @dccps_minisock - associated minisock (accessed via dccp_msk)
* @dccps_featneg - tracks feature-negotiation state (mostly during handshake)
* @dccps_hc_rx_ackvec - rx half connection ack vector
* @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection)
* @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection)
* @dccps_options_received - parsed set of retrieved options
* @dccps_qpolicy - TX dequeueing policy, one of %dccp_packet_dequeueing_policy
* @dccps_tx_qlen - maximum length of the TX queue
* @dccps_role - role of this sock, one of %dccp_role
* @dccps_hc_rx_insert_options - receiver wants to add options when acking
* @dccps_hc_tx_insert_options - sender wants to add options when sending
* @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3)
* @dccps_xmit_timer - timer for when CCID is not ready to send
* @dccps_sync_scheduled - flag which signals "send out-of-band message soon"
* @dccps_xmitlet - tasklet scheduled by the TX CCID to dequeue data packets
* @dccps_xmit_timer - used by the TX CCID to delay sending (rate-based pacing)
* @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs)
*/
struct dccp_sock {
......@@ -529,19 +507,26 @@ struct dccp_sock {
__u32 dccps_timestamp_time;
__u16 dccps_l_ack_ratio;
__u16 dccps_r_ack_ratio;
__u16 dccps_pcslen;
__u16 dccps_pcrlen;
__u64 dccps_l_seq_win:48;
__u64 dccps_r_seq_win:48;
__u8 dccps_pcslen:4;
__u8 dccps_pcrlen:4;
__u8 dccps_send_ndp_count:1;
__u64 dccps_ndp_count:48;
unsigned long dccps_rate_last;
struct dccp_minisock dccps_minisock;
struct list_head dccps_featneg;
struct dccp_ackvec *dccps_hc_rx_ackvec;
struct ccid *dccps_hc_rx_ccid;
struct ccid *dccps_hc_tx_ccid;
struct dccp_options_received dccps_options_received;
__u8 dccps_qpolicy;
__u32 dccps_tx_qlen;
enum dccp_role dccps_role:2;
__u8 dccps_hc_rx_insert_options:1;
__u8 dccps_hc_tx_insert_options:1;
__u8 dccps_server_timewait:1;
__u8 dccps_sync_scheduled:1;
struct tasklet_struct dccps_xmitlet;
struct timer_list dccps_xmit_timer;
};
......@@ -550,11 +535,6 @@ static inline struct dccp_sock *dccp_sk(const struct sock *sk)
return (struct dccp_sock *)sk;
}
static inline struct dccp_minisock *dccp_msk(const struct sock *sk)
{
return (struct dccp_minisock *)&dccp_sk(sk)->dccps_minisock;
}
static inline const char *dccp_role(const struct sock *sk)
{
switch (dccp_sk(sk)->dccps_role) {
......
......@@ -782,6 +782,21 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk)
/* Use define here intentionally to get WARN_ON location shown at the caller */
#define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out)
/*
* Convert RFC3390 larger initial windows into an equivalent number of packets.
*
* John Heffner states:
*
* The RFC specifies a window of no more than 4380 bytes
* unless 2*MSS > 4380. Reading the pseudocode in the RFC
* is a bit misleading because they use a clamp at 4380 bytes
* rather than a multiplier in the relevant range.
*/
static inline u32 rfc3390_bytes_to_packets(const u32 bytes)
{
return bytes <= 1095 ? 4 : (bytes > 1460 ? 2 : 3);
}
extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh);
extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst);
......
......@@ -25,9 +25,6 @@ config INET_DCCP_DIAG
def_tristate y if (IP_DCCP = y && INET_DIAG = y)
def_tristate m
config IP_DCCP_ACKVEC
bool
source "net/dccp/ccids/Kconfig"
menu "DCCP Kernel Hacking"
......
obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o
dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o
dccp-y := ccid.o feat.o input.o minisocks.o options.o \
qpolicy.o output.o proto.o timer.o ackvec.o
dccp_ipv4-y := ipv4.o
......@@ -8,8 +9,6 @@ dccp_ipv4-y := ipv4.o
obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o
dccp_ipv6-y := ipv6.o
dccp-$(CONFIG_IP_DCCP_ACKVEC) += ackvec.o
obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o
obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o
......
This diff is collapsed.
......@@ -3,156 +3,134 @@
/*
* net/dccp/ackvec.h
*
* An implementation of the DCCP protocol
* An implementation of Ack Vectors for the DCCP protocol
* Copyright (c) 2007 University of Aberdeen, Scotland, UK
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/dccp.h>
#include <linux/compiler.h>
#include <linux/ktime.h>
#include <linux/list.h>
#include <linux/types.h>
/* Read about the ECN nonce to see why it is 253 */
#define DCCP_MAX_ACKVEC_OPT_LEN 253
/* We can spread an ack vector across multiple options */
#define DCCP_MAX_ACKVEC_LEN (DCCP_MAX_ACKVEC_OPT_LEN * 2)
/*
* Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN,
* the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1
* will be sufficient for most cases of low Ack Ratios, using a value of 2 gives
* more headroom if Ack Ratio is higher or when the sender acknowledges slowly.
* The maximum value is bounded by the u16 types for indices and functions.
*/
#define DCCPAV_NUM_ACKVECS 2
#define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS)
#define DCCP_ACKVEC_STATE_RECEIVED 0
#define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6)
#define DCCP_ACKVEC_STATE_NOT_RECEIVED (3 << 6)
/* Estimated minimum average Ack Vector length - used for updating MPS */
#define DCCPAV_MIN_OPTLEN 16
#define DCCP_ACKVEC_STATE_MASK 0xC0 /* 11000000 */
#define DCCP_ACKVEC_LEN_MASK 0x3F /* 00111111 */
/* Threshold for coping with large bursts of losses */
#define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8)
/** struct dccp_ackvec - ack vector
*
* This data structure is the one defined in RFC 4340, Appendix A.
*
* @av_buf_head - circular buffer head
* @av_buf_tail - circular buffer tail
* @av_buf_ackno - ack # of the most recent packet acknowledgeable in the
* buffer (i.e. %av_buf_head)
* @av_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked
* by the buffer with State 0
*
* Additionally, the HC-Receiver must keep some information about the
* Ack Vectors it has recently sent. For each packet sent carrying an
* Ack Vector, it remembers four variables:
enum dccp_ackvec_states {
DCCPAV_RECEIVED = 0x00,
DCCPAV_ECN_MARKED = 0x40,
DCCPAV_RESERVED = 0x80,
DCCPAV_NOT_RECEIVED = 0xC0
};
#define DCCPAV_MAX_RUNLEN 0x3F
static inline u8 dccp_ackvec_runlen(const u8 *cell)
{
return *cell & DCCPAV_MAX_RUNLEN;
}
static inline u8 dccp_ackvec_state(const u8 *cell)
{
return *cell & ~DCCPAV_MAX_RUNLEN;
}
/** struct dccp_ackvec - Ack Vector main data structure
*
* @av_records - list of dccp_ackvec_record
* @av_ack_nonce - the one-bit sum of the ECN Nonces for all State 0.
* This implements a fixed-size circular buffer within an array and is largely
* based on Appendix A of RFC 4340.
*
* @av_time - the time in usecs
* @av_buf - circular buffer of acknowledgeable packets
* @av_buf: circular buffer storage area
* @av_buf_head: head index; begin of live portion in @av_buf
* @av_buf_tail: tail index; first index _after_ the live portion in @av_buf
* @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf
* @av_tail_ackno: lowest seqno of acknowledgeable packet recorded in @av_buf
* @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to
* %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf
* @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound
* @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously)
*/
struct dccp_ackvec {
u64 av_buf_ackno;
struct list_head av_records;
ktime_t av_time;
u8 av_buf[DCCPAV_MAX_ACKVEC_LEN];
u16 av_buf_head;
u16 av_vec_len;
u8 av_buf_nonce;
u8 av_ack_nonce;
u8 av_buf[DCCP_MAX_ACKVEC_LEN];
u16 av_buf_tail;
u64 av_buf_ackno:48;
u64 av_tail_ackno:48;
bool av_buf_nonce[DCCPAV_NUM_ACKVECS];
u8 av_overflow:1;
struct list_head av_records;
};
/** struct dccp_ackvec_record - ack vector record
/** struct dccp_ackvec_record - Records information about sent Ack Vectors
*
* ACK vector record as defined in Appendix A of spec.
* These list entries define the additional information which the HC-Receiver
* keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A.
*
* The list is sorted by avr_ack_seqno
* @avr_node: the list node in @av_records
* @avr_ack_seqno: sequence number of the packet the Ack Vector was sent on
* @avr_ack_ackno: the Ack number that this record/Ack Vector refers to
* @avr_ack_ptr: pointer into @av_buf where this record starts
* @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending
* @avr_ack_nonce: the sum of @av_buf_nonce's at the time this record was sent
*
* @avr_node - node in av_records
* @avr_ack_seqno - sequence number of the packet this record was sent on
* @avr_ack_ackno - sequence number being acknowledged
* @avr_ack_ptr - pointer into av_buf where this record starts
* @avr_ack_nonce - av_ack_nonce at the time this record was sent
* @avr_sent_len - lenght of the record in av_buf
* The list as a whole is sorted in descending order by @avr_ack_seqno.
*/
struct dccp_ackvec_record {
struct list_head avr_node;
u64 avr_ack_seqno;
u64 avr_ack_ackno;
u64 avr_ack_seqno:48;
u64 avr_ack_ackno:48;
u16 avr_ack_ptr;
u16 avr_sent_len;
u8 avr_ack_nonce;
u8 avr_ack_runlen;
u8 avr_ack_nonce:1;
};
struct sock;
struct sk_buff;
#ifdef CONFIG_IP_DCCP_ACKVEC
extern int dccp_ackvec_init(void);
extern void dccp_ackvec_exit(void);
extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority);
extern void dccp_ackvec_free(struct dccp_ackvec *av);
extern int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
const u64 ackno, const u8 state);
extern void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av,
struct sock *sk, const u64 ackno);
extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
u64 *ackno, const u8 opt,
const u8 *value, const u8 len);
extern void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb);
extern int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum);
extern void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno);
extern u16 dccp_ackvec_buflen(const struct dccp_ackvec *av);
extern int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb);
static inline int dccp_ackvec_pending(const struct dccp_ackvec *av)
{
return av->av_vec_len;
}
#else /* CONFIG_IP_DCCP_ACKVEC */
static inline int dccp_ackvec_init(void)
static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av)
{
return 0;
return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail;
}
static inline void dccp_ackvec_exit(void)
{
}
static inline struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
{
return NULL;
}
static inline void dccp_ackvec_free(struct dccp_ackvec *av)
{
}
static inline int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
const u64 ackno, const u8 state)
{
return -1;
}
static inline void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av,
struct sock *sk, const u64 ackno)
{
}
static inline int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
const u64 *ackno, const u8 opt,
const u8 *value, const u8 len)
{
return -1;
}
static inline int dccp_insert_option_ackvec(const struct sock *sk,
const struct sk_buff *skb)
{
return -1;
}
/**
* struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb
* @vec: start of vector (offset into skb)
* @len: length of @vec
* @nonce: whether @vec had an ECN nonce of 0 or 1
* @node: FIFO - arranged in descending order of ack_ackno
* This structure is used by CCIDs to access Ack Vectors in a received skb.
*/
struct dccp_ackvec_parsed {
u8 *vec,
len,
nonce:1;
struct list_head node;
};
static inline int dccp_ackvec_pending(const struct dccp_ackvec *av)
{
return 0;
}
#endif /* CONFIG_IP_DCCP_ACKVEC */
extern int dccp_ackvec_parsed_add(struct list_head *head,
u8 *vec, u8 len, u8 nonce);
extern void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks);
#endif /* _ACKVEC_H */
......@@ -13,6 +13,13 @@
#include "ccid.h"
static u8 builtin_ccids[] = {
DCCPC_CCID2, /* CCID2 is supported by default */
#if defined(CONFIG_IP_DCCP_CCID3) || defined(CONFIG_IP_DCCP_CCID3_MODULE)
DCCPC_CCID3,
#endif
};
static struct ccid_operations *ccids[CCID_MAX];
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
static atomic_t ccids_lockct = ATOMIC_INIT(0);
......@@ -86,6 +93,47 @@ static void ccid_kmem_cache_destroy(struct kmem_cache *slab)
}
}
/* check that up to @array_len members in @ccid_array are supported */
bool ccid_support_check(u8 const *ccid_array, u8 array_len)
{
u8 i, j, found;
for (i = 0, found = 0; i < array_len; i++, found = 0) {
for (j = 0; !found && j < ARRAY_SIZE(builtin_ccids); j++)
found = (ccid_array[i] == builtin_ccids[j]);
if (!found)
return false;
}
return true;
}
/**
* ccid_get_builtin_ccids - Provide copy of `builtin' CCID array
* @ccid_array: pointer to copy into
* @array_len: value to return length into
* This function allocates memory - caller must see that it is freed after use.
*/
int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len)
{
*ccid_array = kmemdup(builtin_ccids, sizeof(builtin_ccids), gfp_any());
if (*ccid_array == NULL)
return -ENOBUFS;
*array_len = ARRAY_SIZE(builtin_ccids);
return 0;
}
int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
char __user *optval, int __user *optlen)
{
if (len < sizeof(builtin_ccids))
return -EINVAL;
if (put_user(sizeof(builtin_ccids), optlen) ||
copy_to_user(optval, builtin_ccids, sizeof(builtin_ccids)))
return -EFAULT;
return 0;
}
int ccid_register(struct ccid_operations *ccid_ops)
{
int err = -ENOBUFS;
......@@ -148,22 +196,41 @@ int ccid_unregister(struct ccid_operations *ccid_ops)
EXPORT_SYMBOL_GPL(ccid_unregister);
struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp)
/**
* ccid_request_module - Pre-load CCID module for later use
* This should be called only from process context (e.g. during connection
* setup) and is necessary for later calls to ccid_new (typically in software
* interrupt), so that it has the modules available when they are needed.
*/
static int ccid_request_module(u8 id)
{
struct ccid_operations *ccid_ops;
struct ccid *ccid = NULL;
if (!in_atomic()) {
ccids_read_lock();
#ifdef CONFIG_KMOD
if (ccids[id] == NULL) {
/* We only try to load if in process context */
ccids_read_unlock();
if (gfp & GFP_ATOMIC)
goto out;
request_module("net-dccp-ccid-%d", id);
ccids_read_lock();
return request_module("net-dccp-ccid-%d", id);
}
ccids_read_unlock();
}
return 0;
}
int ccid_request_modules(u8 const *ccid_array, u8 array_len)
{
#ifdef CONFIG_KMOD
while (array_len--)
if (ccid_request_module(ccid_array[array_len]))
return -1;
#endif
return 0;
}
struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp)
{
struct ccid_operations *ccid_ops;
struct ccid *ccid = NULL;
ccids_read_lock();
ccid_ops = ccids[id];
if (ccid_ops == NULL)
goto out_unlock;
......@@ -205,20 +272,6 @@ struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp)
EXPORT_SYMBOL_GPL(ccid_new);
struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, gfp_t gfp)
{
return ccid_new(id, sk, 1, gfp);
}
EXPORT_SYMBOL_GPL(ccid_hc_rx_new);
struct ccid *ccid_hc_tx_new(unsigned char id,struct sock *sk, gfp_t gfp)
{
return ccid_new(id, sk, 0, gfp);
}
EXPORT_SYMBOL_GPL(ccid_hc_tx_new);
static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx)
{
struct ccid_operations *ccid_ops;
......
......@@ -60,22 +60,18 @@ struct ccid_operations {
void (*ccid_hc_tx_exit)(struct sock *sk);
void (*ccid_hc_rx_packet_recv)(struct sock *sk,
struct sk_buff *skb);
int (*ccid_hc_rx_parse_options)(struct sock *sk,
unsigned char option,
unsigned char len, u16 idx,
unsigned char* value);
int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt,
u8 opt, u8 *val, u8 len);
int (*ccid_hc_rx_insert_options)(struct sock *sk,
struct sk_buff *skb);
void (*ccid_hc_tx_packet_recv)(struct sock *sk,
struct sk_buff *skb);
int (*ccid_hc_tx_parse_options)(struct sock *sk,
unsigned char option,
unsigned char len, u16 idx,
unsigned char* value);
int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt,
u8 opt, u8 *val, u8 len);
int (*ccid_hc_tx_send_packet)(struct sock *sk,
struct sk_buff *skb);
void (*ccid_hc_tx_packet_sent)(struct sock *sk,
int more, unsigned int len);
unsigned int len);
void (*ccid_hc_rx_get_info)(struct sock *sk,
struct tcp_info *info);
void (*ccid_hc_tx_get_info)(struct sock *sk,
......@@ -103,31 +99,78 @@ static inline void *ccid_priv(const struct ccid *ccid)
return (void *)ccid->ccid_priv;
}
extern bool ccid_support_check(u8 const *ccid_array, u8 array_len);
extern int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len);
extern int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
char __user *, int __user *);
extern int ccid_request_modules(u8 const *ccid_array, u8 array_len);
extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx,
gfp_t gfp);
extern struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk,
gfp_t gfp);
extern struct ccid *ccid_hc_tx_new(unsigned char id, struct sock *sk,
gfp_t gfp);
static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp)
{
struct ccid *ccid = dp->dccps_hc_rx_ccid;
if (ccid == NULL || ccid->ccid_ops == NULL)
return -1;
return ccid->ccid_ops->ccid_id;
}
static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp)
{
struct ccid *ccid = dp->dccps_hc_tx_ccid;
if (ccid == NULL || ccid->ccid_ops == NULL)
return -1;
return ccid->ccid_ops->ccid_id;
}
extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk);
extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk);
/*
* Congestion control of queued data packets via CCID decision.
*
* The TX CCID performs its congestion-control by indicating whether and when a
* queued packet may be sent, using the return code of ccid_hc_tx_send_packet().
* The following modes are supported via the symbolic constants below:
* - timer-based pacing (CCID returns a delay value in milliseconds);
* - autonomous dequeueing (CCID internally schedules dccps_xmitlet).
*/
enum ccid_dequeueing_decision {
CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */
CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */
CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */
CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */
CCID_PACKET_ERR = 0xF0000, /* error condition */
};
static inline int ccid_packet_dequeue_eval(const int return_code)
{
if (return_code < 0)
return CCID_PACKET_ERR;
if (return_code == 0)
return CCID_PACKET_SEND_AT_ONCE;
if (return_code <= CCID_PACKET_DELAY_MAX)
return CCID_PACKET_DELAY;
return return_code;
}
static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
struct sk_buff *skb)
{
int rc = 0;
if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL)
rc = ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
return rc;
return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
return CCID_PACKET_SEND_AT_ONCE;
}
static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
int more, unsigned int len)
unsigned int len)
{
if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL)
ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, more, len);
ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len);
}
static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk,
......@@ -144,27 +187,31 @@ static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb);
}
/**
* ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver
* @pkt: type of packet that @opt appears on (RFC 4340, 5.1)
* @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3)
* @val: value of @opt
* @len: length of @val in bytes
*/
static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
unsigned char option,
unsigned char len, u16 idx,
unsigned char* value)
u8 pkt, u8 opt, u8 *val, u8 len)
{
int rc = 0;
if (ccid->ccid_ops->ccid_hc_tx_parse_options != NULL)
rc = ccid->ccid_ops->ccid_hc_tx_parse_options(sk, option, len, idx,
value);
return rc;
if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL)
return 0;
return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len);
}
/**
* ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender
* Arguments are analogous to ccid_hc_tx_parse_options()
*/
static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
unsigned char option,
unsigned char len, u16 idx,
unsigned char* value)
u8 pkt, u8 opt, u8 *val, u8 len)
{
int rc = 0;
if (ccid->ccid_ops->ccid_hc_rx_parse_options != NULL)
rc = ccid->ccid_ops->ccid_hc_rx_parse_options(sk, option, len, idx, value);
return rc;
if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL)
return 0;
return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len);
}
static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
......
menu "DCCP CCIDs Configuration (EXPERIMENTAL)"
depends on EXPERIMENTAL
config IP_DCCP_CCID2
tristate "CCID2 (TCP-Like) (EXPERIMENTAL)"
tristate "CCID2 (TCP-Like)"
def_tristate IP_DCCP
select IP_DCCP_ACKVEC
---help---
CCID 2, TCP-like Congestion Control, denotes Additive Increase,
Multiplicative Decrease (AIMD) congestion control with behavior
......@@ -36,7 +34,7 @@ config IP_DCCP_CCID2_DEBUG
If in doubt, say N.
config IP_DCCP_CCID3
tristate "CCID3 (TCP-Friendly) (EXPERIMENTAL)"
tristate "CCID3 (TCP-Friendly)"
def_tristate IP_DCCP
select IP_DCCP_TFRC_LIB
---help---
......@@ -64,9 +62,9 @@ config IP_DCCP_CCID3
If in doubt, say M.
if IP_DCCP_CCID3
config IP_DCCP_CCID3_DEBUG
bool "CCID3 debugging messages"
depends on IP_DCCP_CCID3
---help---
Enable CCID3-specific debugging messages.
......@@ -76,10 +74,29 @@ config IP_DCCP_CCID3_DEBUG
If in doubt, say N.
choice
prompt "Select method for measuring the packet size s"
default IP_DCCP_CCID3_MEASURE_S_AS_MPS
config IP_DCCP_CCID3_MEASURE_S_AS_MPS
bool "Always use MPS in place of s"
---help---
This use is recommended as it is consistent with the initialisation
of X and suggested when s varies (rfc3448bis, (1) in section 4.1).
config IP_DCCP_CCID3_MEASURE_S_AS_AVG
bool "Use moving average"
---help---
An alternative way of tracking s, also supported by rfc3448bis.
This used to be the default for CCID-3 in previous kernels.
config IP_DCCP_CCID3_MEASURE_S_AS_MAX
bool "Track the maximum payload length"
---help---
An experimental method based on tracking the maximum packet size.
endchoice
config IP_DCCP_CCID3_RTO
int "Use higher bound for nofeedback timer"
default 100
depends on IP_DCCP_CCID3 && EXPERIMENTAL
---help---
Use higher lower bound for nofeedback timer expiration.
......@@ -106,6 +123,7 @@ config IP_DCCP_CCID3_RTO
The purpose of the nofeedback timer is to slow DCCP down when there
is serious network congestion: experimenting with larger values should
therefore not be performed on WANs.
endif # IP_DCCP_CCID3
config IP_DCCP_TFRC_LIB
tristate
......
This diff is collapsed.
......@@ -42,34 +42,49 @@ struct ccid2_seq {
/** struct ccid2_hc_tx_sock - CCID2 TX half connection
*
* @ccid2hctx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
* @ccid2hctx_packets_acked - Ack counter for deriving cwnd growth (RFC 3465)
* @ccid2hctx_lastrtt -time RTT was last measured
* @ccid2hctx_rpseq - last consecutive seqno
* @ccid2hctx_rpdupack - dupacks since rpseq
*/
* @{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
* @packets_acked: Ack counter for deriving cwnd growth (RFC 3465)
* @srtt: smoothed RTT estimate, scaled by 2^3
* @mdev: smoothed RTT variation, scaled by 2^2
* @mdev_max: maximum of @mdev during one flight
* @rttvar: moving average/maximum of @mdev_max
* @rto: RTO value deriving from SRTT and RTTVAR (RFC 2988)
* @rtt_seq: to decay RTTVAR at most once per flight
* @rpseq: last consecutive seqno
* @rpdupack: dupacks since rpseq
* @av_chunks: list of Ack Vectors received on current skb
*/
struct ccid2_hc_tx_sock {
u32 ccid2hctx_cwnd;
u32 ccid2hctx_ssthresh;
u32 ccid2hctx_pipe;
u32 ccid2hctx_packets_acked;
struct ccid2_seq *ccid2hctx_seqbuf[CCID2_SEQBUF_MAX];
int ccid2hctx_seqbufc;
struct ccid2_seq *ccid2hctx_seqh;
struct ccid2_seq *ccid2hctx_seqt;
long ccid2hctx_rto;
long ccid2hctx_srtt;
long ccid2hctx_rttvar;
unsigned long ccid2hctx_lastrtt;
struct timer_list ccid2hctx_rtotimer;
u64 ccid2hctx_rpseq;
int ccid2hctx_rpdupack;
unsigned long ccid2hctx_last_cong;
u64 ccid2hctx_high_ack;
u32 cwnd;
u32 ssthresh;
u32 pipe;
u32 packets_acked;
struct ccid2_seq *seqbuf[CCID2_SEQBUF_MAX];
int seqbufc;
struct ccid2_seq *seqh;
struct ccid2_seq *seqt;
/* RTT measurement: variables/principles are the same as in TCP */
u32 srtt,
mdev,
mdev_max,
rttvar,
rto;
u64 rtt_seq:48;
struct timer_list rtotimer;
u64 rpseq;
int rpdupack;
unsigned long last_cong;
u64 high_ack;
struct list_head av_chunks;
};
static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hctx)
{
return (hctx->pipe >= hctx->cwnd);
}
struct ccid2_hc_rx_sock {
int ccid2hcrx_data;
int data;
};
static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk)
......
This diff is collapsed.
......@@ -47,11 +47,22 @@
/* Two seconds as per RFC 3448 4.2 */
#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC)
/* In usecs - half the scheduling granularity as per RFC3448 4.6 */
#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ))
/* Maximum backoff interval t_mbi (RFC 3448, 4.3) */
#define TFRC_T_MBI (64 * USEC_PER_SEC)
/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */
#define TFRC_T_MBI 64
/*
* The t_delta parameter (RFC 3448, 4.6): delays of less than %USEC_PER_MSEC are
* rounded down to 0, since sk_reset_timer() here uses millisecond granularity.
* Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse
* resolution of HZ < 500 means that the error is below one timer tick (t_gran)
* when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ).
*/
#if (HZ >= 500)
# define TFRC_T_DELTA USEC_PER_MSEC
#else
# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ))
#warning Coarse CONFIG_HZ resolution -- higher value recommended for TFRC.
#endif
enum ccid3_options {
TFRC_OPT_LOSS_EVENT_RATE = 192,
......@@ -59,62 +70,43 @@ enum ccid3_options {
TFRC_OPT_RECEIVE_RATE = 194,
};
struct ccid3_options_received {
u64 ccid3or_seqno:48,
ccid3or_loss_intervals_idx:16;
u16 ccid3or_loss_intervals_len;
u32 ccid3or_loss_event_rate;
u32 ccid3or_receive_rate;
};
/* TFRC sender states */
enum ccid3_hc_tx_states {
TFRC_SSTATE_NO_SENT = 1,
TFRC_SSTATE_NO_FBACK,
TFRC_SSTATE_FBACK,
TFRC_SSTATE_TERM,
};
/** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket
*
* @ccid3hctx_x - Current sending rate in 64 * bytes per second
* @ccid3hctx_x_recv - Receive rate in 64 * bytes per second
* @ccid3hctx_x_calc - Calculated rate in bytes per second
* @ccid3hctx_rtt - Estimate of current round trip time in usecs
* @ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000
* @ccid3hctx_s - Packet size in bytes
* @ccid3hctx_t_rto - Nofeedback Timer setting in usecs
* @ccid3hctx_t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs
* @ccid3hctx_state - Sender state, one of %ccid3_hc_tx_states
* @ccid3hctx_last_win_count - Last window counter sent
* @ccid3hctx_t_last_win_count - Timestamp of earliest packet
* with last_win_count value sent
* @ccid3hctx_no_feedback_timer - Handle to no feedback timer
* @ccid3hctx_t_ld - Time last doubled during slow start
* @ccid3hctx_t_nom - Nominal send time of next packet
* @ccid3hctx_delta - Send timer delta (RFC 3448, 4.6) in usecs
* @ccid3hctx_hist - Packet history
* @ccid3hctx_options_received - Parsed set of retrieved options
* @x - Current sending rate in 64 * bytes per second
* @x_recv - Receive rate in 64 * bytes per second
* @x_calc - Calculated rate in bytes per second
* @rtt - Estimate of current round trip time in usecs
* @r_sqmean - Estimate of long-term RTT (RFC 3448, 4.5)
* @p - Current loss event rate (0-1) scaled by 1000000
* @s - Packet size in bytes
* @t_rto - Nofeedback Timer setting in usecs
* @t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs
* @feedback - Whether feedback has been received or not
* @last_win_count - Last window counter sent
* @t_last_win_count - Timestamp of earliest packet with
* last_win_count value sent
* @no_feedback_timer - Handle to no feedback timer
* @t_ld - Time last doubled during slow start
* @t_nom - Nominal send time of next packet
* @hist - Packet history
*/
struct ccid3_hc_tx_sock {
struct tfrc_tx_info ccid3hctx_tfrc;
#define ccid3hctx_x ccid3hctx_tfrc.tfrctx_x
#define ccid3hctx_x_recv ccid3hctx_tfrc.tfrctx_x_recv
#define ccid3hctx_x_calc ccid3hctx_tfrc.tfrctx_x_calc
#define ccid3hctx_rtt ccid3hctx_tfrc.tfrctx_rtt
#define ccid3hctx_p ccid3hctx_tfrc.tfrctx_p
#define ccid3hctx_t_rto ccid3hctx_tfrc.tfrctx_rto
#define ccid3hctx_t_ipi ccid3hctx_tfrc.tfrctx_ipi
u16 ccid3hctx_s;
enum ccid3_hc_tx_states ccid3hctx_state:8;
u8 ccid3hctx_last_win_count;
ktime_t ccid3hctx_t_last_win_count;
struct timer_list ccid3hctx_no_feedback_timer;
ktime_t ccid3hctx_t_ld;
ktime_t ccid3hctx_t_nom;
u32 ccid3hctx_delta;
struct tfrc_tx_hist_entry *ccid3hctx_hist;
struct ccid3_options_received ccid3hctx_options_received;
u64 x;
u64 x_recv;
u32 x_calc;
u32 rtt;
u16 r_sqmean;
u32 p;
u32 t_rto;
u32 t_ipi;
u16 s;
bool feedback:1;
u8 last_win_count;
ktime_t t_last_win_count;
struct timer_list no_feedback_timer;
ktime_t t_ld;
ktime_t t_nom;
struct tfrc_tx_hist_entry *hist;
};
static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
......@@ -124,41 +116,32 @@ static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
return hctx;
}
/* TFRC receiver states */
enum ccid3_hc_rx_states {
TFRC_RSTATE_NO_DATA = 1,
TFRC_RSTATE_DATA,
TFRC_RSTATE_TERM = 127,
enum ccid3_fback_type {
CCID3_FBACK_NONE = 0,
CCID3_FBACK_INITIAL,
CCID3_FBACK_PERIODIC,
CCID3_FBACK_PARAM_CHANGE
};
/** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
*
* @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448 4.3)
* @ccid3hcrx_rtt - Receiver estimate of rtt (non-standard)
* @ccid3hcrx_p - Current loss event rate (RFC 3448 5.4)
* @ccid3hcrx_last_counter - Tracks window counter (RFC 4342, 8.1)
* @ccid3hcrx_state - Receiver state, one of %ccid3_hc_rx_states
* @ccid3hcrx_bytes_recv - Total sum of DCCP payload bytes
* @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3)
* @ccid3hcrx_rtt - Receiver estimate of RTT
* @ccid3hcrx_tstamp_last_feedback - Time at which last feedback was sent
* @ccid3hcrx_tstamp_last_ack - Time at which last feedback was sent
* @ccid3hcrx_hist - Packet history (loss detection + RTT sampling)
* @ccid3hcrx_li_hist - Loss Interval database
* @ccid3hcrx_s - Received packet size in bytes
* @ccid3hcrx_pinv - Inverse of Loss Event Rate (RFC 4342, sec. 8.5)
* @last_counter - Tracks window counter (RFC 4342, 8.1)
* @feedback - The type of the feedback last sent
* @x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3)
* @tstamp_last_feedback - Time at which last feedback was sent
* @hist - Packet history (loss detection + RTT sampling)
* @li_hist - Loss Interval database
* @p_inverse - Inverse of Loss Event Rate (RFC 4342, sec. 8.5)
*/
struct ccid3_hc_rx_sock {
u8 ccid3hcrx_last_counter:4;
enum ccid3_hc_rx_states ccid3hcrx_state:8;
u32 ccid3hcrx_bytes_recv;
u32 ccid3hcrx_x_recv;
u32 ccid3hcrx_rtt;
ktime_t ccid3hcrx_tstamp_last_feedback;
struct tfrc_rx_hist ccid3hcrx_hist;
struct tfrc_loss_hist ccid3hcrx_li_hist;
u16 ccid3hcrx_s;
#define ccid3hcrx_pinv ccid3hcrx_li_hist.i_mean
u8 last_counter:4;
enum ccid3_fback_type feedback:4;
u32 x_recv;
ktime_t tstamp_last_feedback;
struct tfrc_rx_hist hist;
struct tfrc_loss_hist li_hist;
#define p_inverse li_hist.i_mean
};
static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk)
......
......@@ -86,21 +86,26 @@ static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh)
/**
* tfrc_lh_update_i_mean - Update the `open' loss interval I_0
* For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev
* This updates I_mean as the sequence numbers increase. As a consequence, the
* open loss interval I_0 increases, hence p = W_tot/max(I_tot0, I_tot1)
* decreases, and thus there is no need to send renewed feedback.
*/
u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
{
struct tfrc_loss_interval *cur = tfrc_lh_peek(lh);
u32 old_i_mean = lh->i_mean;
s64 len;
if (cur == NULL) /* not initialised */
return 0;
return;
/* FIXME: should probably also count non-data packets (RFC 4342, 6.1) */
if (!dccp_data_packet(skb))
return;
len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1;
if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */
return 0;
return;
if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4)
/*
......@@ -114,14 +119,11 @@ u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
cur->li_is_closed = 1;
if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */
return 0;
return;
cur->li_length = len;
tfrc_lh_calc_i_mean(lh);
return (lh->i_mean < old_i_mean);
}
EXPORT_SYMBOL_GPL(tfrc_lh_update_i_mean);
/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */
static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
......@@ -138,18 +140,18 @@ static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
* @sk: Used by @calc_first_li in caller-specific way (subtyping)
* Updates I_mean and returns 1 if a new interval has in fact been added to @lh.
*/
int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
bool tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
u32 (*calc_first_li)(struct sock *), struct sock *sk)
{
struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new;
if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh)))
return 0;
return false;
new = tfrc_lh_demand_next(lh);
if (unlikely(new == NULL)) {
DCCP_CRIT("Cannot allocate/add loss record.");
return 0;
return false;
}
new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno;
......@@ -167,7 +169,7 @@ int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
tfrc_lh_calc_i_mean(lh);
}
return 1;
return true;
}
EXPORT_SYMBOL_GPL(tfrc_lh_interval_add);
......
......@@ -67,9 +67,9 @@ static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh)
struct tfrc_rx_hist;
extern int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *,
extern bool tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *,
u32 (*first_li)(struct sock *), struct sock *);
extern u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *);
extern void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *);
extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh);
#endif /* _DCCP_LI_HIST_ */
This diff is collapsed.
......@@ -40,12 +40,28 @@
#include <linux/slab.h>
#include "tfrc.h"
struct tfrc_tx_hist_entry;
/**
* tfrc_tx_hist_entry - Simple singly-linked TX history list
* @next: next oldest entry (LIFO order)
* @seqno: sequence number of this entry
* @stamp: send time of packet with sequence number @seqno
*/
struct tfrc_tx_hist_entry {
struct tfrc_tx_hist_entry *next;
u64 seqno;
ktime_t stamp;
};
static inline struct tfrc_tx_hist_entry *
tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
{
while (head != NULL && head->seqno != seqno)
head = head->next;
return head;
}
extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno);
extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp);
extern u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head,
const u64 seqno, const ktime_t now);
/* Subtraction a-b modulo-16, respects circular wrap-around */
#define SUB16(a, b) (((a) + 16 - (b)) & 0xF)
......@@ -75,12 +91,22 @@ struct tfrc_rx_hist_entry {
* @loss_count: Number of entries in circular history
* @loss_start: Movable index (for loss detection)
* @rtt_sample_prev: Used during RTT sampling, points to candidate entry
* @rtt_estimate: Receiver RTT estimate
* @packet_size: Packet size in bytes (as per RFC 3448, 3.1)
* @bytes_recvd: Number of bytes received since @bytes_start
* @bytes_start: Start time for counting @bytes_recvd
*/
struct tfrc_rx_hist {
struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1];
u8 loss_count:2,
loss_start:2;
/* Receiver RTT sampling */
#define rtt_sample_prev loss_start
u32 rtt_estimate;
/* Receiver sampling of application payload lengths */
u32 packet_size,
bytes_recvd;
ktime_t bytes_start;
};
/**
......@@ -124,20 +150,50 @@ static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h)
return h->loss_count > 0;
}
/*
* Accessor functions to retrieve parameters sampled by the RX history
*/
static inline u32 tfrc_rx_hist_packet_size(const struct tfrc_rx_hist *h)
{
if (h->packet_size == 0) {
DCCP_WARN("No sample for s, using fallback\n");
return TCP_MIN_RCVMSS;
}
return h->packet_size;
}
static inline u32 tfrc_rx_hist_rtt(const struct tfrc_rx_hist *h)
{
if (h->rtt_estimate == 0) {
DCCP_WARN("No RTT estimate available, using fallback RTT\n");
return DCCP_FALLBACK_RTT;
}
return h->rtt_estimate;
}
static inline void tfrc_rx_hist_restart_byte_counter(struct tfrc_rx_hist *h)
{
h->bytes_recvd = 0;
h->bytes_start = ktime_get_real();
}
extern u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv);
extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h,
const struct sk_buff *skb, const u64 ndp);
extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb);
struct tfrc_loss_hist;
extern int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
extern bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h,
struct tfrc_loss_hist *lh,
struct sk_buff *skb, const u64 ndp,
u32 (*first_li)(struct sock *sk),
struct sock *sk);
extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h,
extern void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h,
const struct sk_buff *skb);
extern int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h);
extern int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk);
extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h);
#endif /* _DCCP_PKT_HIST_ */
......@@ -47,6 +47,21 @@ static inline u32 scaled_div32(u64 a, u64 b)
return result;
}
/**
* tfrc_scaled_sqrt - Compute scaled integer sqrt(x) for 0 < x < 2^22-1
* Uses scaling to improve accuracy of the integer approximation of sqrt(). The
* scaling factor of 2^10 limits the maximum @sample to 4e6; this is okay for
* clamped RTT samples (dccp_sample_rtt).
* Should best be used for expressions of type sqrt(x)/sqrt(y), since then the
* scaling factor is neutralised. For this purpose, it avoids returning zero.
*/
static inline u16 tfrc_scaled_sqrt(const u32 sample)
{
const unsigned long non_zero_sample = sample ? : 1;
return int_sqrt(non_zero_sample << 10);
}
/**
* tfrc_ewma - Exponentially weighted moving average
* @weight: Weight to be used as damping factor, in units of 1/10
......@@ -58,6 +73,7 @@ static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight)
extern u32 tfrc_calc_x(u16 s, u32 R, u32 p);
extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
extern u32 tfrc_invert_loss_event_rate(u32 loss_event_rate);
extern int tfrc_tx_packet_history_init(void);
extern void tfrc_tx_packet_history_exit(void);
......
......@@ -632,7 +632,15 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */
if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */
DCCP_WARN("Value of p (%d) below resolution. "
/*
* In the congestion-avoidance phase p decays towards 0
* when there are no further losses, so this case is
* natural. Truncating to p_min = 0.01% means that the
* maximum achievable throughput is limited to about
* X_calc_max = 122.4 * s/RTT (see RFC 3448, 3.1); e.g.
* with s=1500 bytes, RTT=0.01 s: X_calc_max = 147 Mbps.
*/
tfrc_pr_debug("Value of p (%d) below resolution. "
"Substituting %d\n", p, TFRC_SMALLEST_P);
index = 0;
} else /* 0.0001 <= p <= 0.05 */
......@@ -658,7 +666,6 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
result = scaled_div(s, R);
return scaled_div32(result, f);
}
EXPORT_SYMBOL_GPL(tfrc_calc_x);
/**
......@@ -693,5 +700,19 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
index = tfrc_binsearch(fvalue, 0);
return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE;
}
EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup);
/**
* tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100%
* When @loss_event_rate is large, there is a chance that p is truncated to 0.
* To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0.
*/
u32 tfrc_invert_loss_event_rate(u32 loss_event_rate)
{
if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */
return 0;
if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */
return 1000000;
return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P);
}
EXPORT_SYMBOL_GPL(tfrc_invert_loss_event_rate);
......@@ -42,9 +42,11 @@
extern int dccp_debug;
#define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a)
#define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a)
#define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a)
#else
#define dccp_pr_debug(format, a...)
#define dccp_pr_debug_cat(format, a...)
#define dccp_debug(format, a...)
#endif
extern struct inet_hashinfo dccp_hashinfo;
......@@ -61,11 +63,14 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
* - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields
* Hence a safe upper bound for the maximum option length is 1020-28 = 992
*/
#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(int))
#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t))
#define DCCP_MAX_PACKET_HDR 28
#define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR)
#define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER)
/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */
#define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t))
#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT
* state, about 60 seconds */
......@@ -81,10 +86,13 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
*/
#define DCCP_RTO_MAX ((unsigned)(64 * HZ))
/* DCCP base time resolution - 10 microseconds (RFC 4340, 13.1 ... 13.3) */
#define DCCP_TIME_RESOLUTION 10
/*
* RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4
*/
#define DCCP_SANE_RTT_MIN 100
#define DCCP_SANE_RTT_MIN (10 * DCCP_TIME_RESOLUTION)
#define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5)
#define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC)
......@@ -95,12 +103,6 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
extern int sysctl_dccp_request_retries;
extern int sysctl_dccp_retries1;
extern int sysctl_dccp_retries2;
extern int sysctl_dccp_feat_sequence_window;
extern int sysctl_dccp_feat_rx_ccid;
extern int sysctl_dccp_feat_tx_ccid;
extern int sysctl_dccp_feat_ack_ratio;
extern int sysctl_dccp_feat_send_ack_vector;
extern int sysctl_dccp_feat_send_ndp_count;
extern int sysctl_dccp_tx_qlen;
extern int sysctl_dccp_sync_ratelimit;
......@@ -235,8 +237,22 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
extern void dccp_send_sync(struct sock *sk, const u64 seq,
const enum dccp_pkt_type pkt_type);
extern void dccp_write_xmit(struct sock *sk, int block);
/*
* TX Packet Dequeueing Interface
*/
extern void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb);
extern bool dccp_qpolicy_full(struct sock *sk);
extern void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb);
extern struct sk_buff *dccp_qpolicy_top(struct sock *sk);
extern struct sk_buff *dccp_qpolicy_pop(struct sock *sk);
extern bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param);
/*
* TX Packet Output and TX Timers
*/
extern void dccp_write_xmit(struct sock *sk);
extern void dccp_write_space(struct sock *sk);
extern void dccp_flush_write_queue(struct sock *sk, long *time_budget);
extern void dccp_init_xmit_timers(struct sock *sk);
static inline void dccp_clear_xmit_timers(struct sock *sk)
......@@ -252,7 +268,8 @@ extern const char *dccp_state_name(const int state);
extern void dccp_set_state(struct sock *sk, const int state);
extern void dccp_done(struct sock *sk);
extern void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb);
extern int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp,
struct sk_buff const *skb);
extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
......@@ -317,6 +334,13 @@ extern struct sk_buff *dccp_ctl_make_reset(struct sock *sk,
extern int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code);
extern void dccp_send_close(struct sock *sk, const int active);
extern int dccp_invalid_packet(struct sk_buff *skb);
static inline u32 dccp_sane_rtt(long usec_sample)
{
if (unlikely(usec_sample <= 0 || usec_sample > DCCP_SANE_RTT_MAX))
DCCP_WARN("RTT sample %ld out of bounds!\n", usec_sample);
return clamp_val(usec_sample, DCCP_SANE_RTT_MIN, DCCP_SANE_RTT_MAX);
}
extern u32 dccp_sample_rtt(struct sock *sk, long delta);
static inline int dccp_bad_service_code(const struct sock *sk,
......@@ -411,36 +435,62 @@ static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack,
static inline void dccp_update_gsr(struct sock *sk, u64 seq)
{
struct dccp_sock *dp = dccp_sk(sk);
const struct dccp_minisock *dmsk = dccp_msk(sk);
dp->dccps_gsr = seq;
dccp_set_seqno(&dp->dccps_swl,
dp->dccps_gsr + 1 - (dmsk->dccpms_sequence_window / 4));
dccp_set_seqno(&dp->dccps_swh,
dp->dccps_gsr + (3 * dmsk->dccpms_sequence_window) / 4);
/* Sequence validity window depends on remote Sequence Window (7.5.1) */
dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4);
/*
* Adjust SWL so that it is not below ISR. In contrast to RFC 4340,
* 7.5.1 we perform this check beyond the initial handshake: W/W' are
* always > 32, so for the first W/W' packets in the lifetime of a
* connection we always have to adjust SWL.
* A second reason why we are doing this is that the window depends on
* the feature-remote value of Sequence Window: nothing stops the peer
* from updating this value while we are busy adjusting SWL for the
* first W packets (we would have to count from scratch again then).
* Therefore it is safer to always make sure that the Sequence Window
* is not artificially extended by a peer who grows SWL downwards by
* continually updating the feature-remote Sequence-Window.
* If sequence numbers wrap it is bad luck. But that will take a while
* (48 bit), and this measure prevents Sequence-number attacks.
*/
if (before48(dp->dccps_swl, dp->dccps_isr))
dp->dccps_swl = dp->dccps_isr;
dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4);
}
static inline void dccp_update_gss(struct sock *sk, u64 seq)
{
struct dccp_sock *dp = dccp_sk(sk);
dp->dccps_awh = dp->dccps_gss = seq;
dccp_set_seqno(&dp->dccps_awl,
(dp->dccps_gss -
dccp_msk(sk)->dccpms_sequence_window + 1));
dp->dccps_gss = seq;
/* Ack validity window depends on local Sequence Window value (7.5.1) */
dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win);
/* Adjust AWL so that it is not below ISS - see comment above for SWL */
if (before48(dp->dccps_awl, dp->dccps_iss))
dp->dccps_awl = dp->dccps_iss;
dp->dccps_awh = dp->dccps_gss;
}
static inline int dccp_ackvec_pending(const struct sock *sk)
{
return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL &&
!dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec);
}
static inline int dccp_ack_pending(const struct sock *sk)
{
const struct dccp_sock *dp = dccp_sk(sk);
return dp->dccps_timestamp_echo != 0 ||
#ifdef CONFIG_IP_DCCP_ACKVEC
(dccp_msk(sk)->dccpms_send_ack_vector &&
dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) ||
#endif
inet_csk_ack_scheduled(sk);
return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk);
}
extern int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val);
extern int dccp_feat_finalise_settings(struct dccp_sock *dp);
extern int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq);
extern int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*,
struct sk_buff *skb);
extern int dccp_feat_activate_values(struct sock *sk, struct list_head *fn);
extern void dccp_feat_list_purge(struct list_head *fn_list);
extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb);
extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*);
extern int dccp_insert_option_elapsed_time(struct sock *sk,
......
......@@ -29,7 +29,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_backoff = icsk->icsk_backoff;
info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
if (dccp_msk(sk)->dccpms_send_ack_vector)
if (dp->dccps_hc_rx_ackvec != NULL)
info->tcpi_options |= TCPI_OPT_SACK;
ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info);
......
This diff is collapsed.
......@@ -3,38 +3,134 @@
/*
* net/dccp/feat.h
*
* An implementation of the DCCP protocol
* Feature negotiation for the DCCP protocol (RFC 4340, section 6)
* Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk>
* Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/types.h>
#include "dccp.h"
#ifdef CONFIG_IP_DCCP_DEBUG
extern const char *dccp_feat_typename(const u8 type);
extern const char *dccp_feat_name(const u8 feat);
/*
* Known limit values
*/
/* Ack Ratio takes 2-byte integer values (11.3) */
#define DCCPF_ACK_RATIO_MAX 0xFFFF
/* Wmin=32 and Wmax=2^46-1 from 7.5.2 */
#define DCCPF_SEQ_WMIN 32
#define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull
/* Maximum number of SP values that fit in a single (Confirm) option */
#define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2)
enum dccp_feat_type {
FEAT_AT_RX = 1, /* located at RX side of half-connection */
FEAT_AT_TX = 2, /* located at TX side of half-connection */
FEAT_SP = 4, /* server-priority reconciliation (6.3.1) */
FEAT_NN = 8, /* non-negotiable reconciliation (6.3.2) */
FEAT_UNKNOWN = 0xFF /* not understood or invalid feature */
};
enum dccp_feat_state {
FEAT_DEFAULT = 0, /* using default values from 6.4 */
FEAT_INITIALISING, /* feature is being initialised */
FEAT_CHANGING, /* Change sent but not confirmed yet */
FEAT_UNSTABLE, /* local modification in state CHANGING */
FEAT_STABLE /* both ends (think they) agree */
};
static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val)
/**
* dccp_feat_val - Container for SP or NN feature values
* @nn: single NN value
* @sp.vec: single SP value plus optional preference list
* @sp.len: length of @sp.vec in bytes
*/
typedef union {
u64 nn;
struct {
u8 *vec;
u8 len;
} sp;
} dccp_feat_val;
/**
* struct feat_entry - Data structure to perform feature negotiation
* @feat_num: one of %dccp_feature_numbers
* @val: feature's current value (SP features may have preference list)
* @state: feature's current state
* @needs_mandatory: whether Mandatory options should be sent
* @needs_confirm: whether to send a Confirm instead of a Change
* @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm)
* @is_local: feature location (1) or feature-remote (0)
* @node: list pointers, entries arranged in FIFO order
*/
struct dccp_feat_entry {
u8 feat_num;
dccp_feat_val val;
enum dccp_feat_state state:8;
bool needs_mandatory:1,
needs_confirm:1,
empty_confirm:1,
is_local:1;
struct list_head node;
};
static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry)
{
dccp_pr_debug("%s(%s (%d), %d)\n", dccp_feat_typename(type),
dccp_feat_name(feat), feat, val);
if (entry->needs_confirm)
return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R;
return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R;
}
#else
#define dccp_feat_debug(type, feat, val)
#endif /* CONFIG_IP_DCCP_DEBUG */
extern int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature,
u8 *val, u8 len, gfp_t gfp);
extern int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature,
u8 *val, u8 len);
extern int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
u8 *val, u8 len);
extern void dccp_feat_clean(struct dccp_minisock *dmsk);
extern int dccp_feat_clone(struct sock *oldsk, struct sock *newsk);
extern int dccp_feat_init(struct dccp_minisock *dmsk);
/**
* struct ccid_dependency - Track changes resulting from choosing a CCID
* @dependent_feat: one of %dccp_feature_numbers
* @is_local: local (1) or remote (0) @dependent_feat
* @is_mandatory: whether presence of @dependent_feat is mission-critical or not
* @val: corresponding default value for @dependent_feat (u8 is sufficient here)
*/
struct ccid_dependency {
u8 dependent_feat;
bool is_local:1,
is_mandatory:1;
u8 val;
};
/*
* Sysctls to seed defaults for feature negotiation
*/
extern unsigned long sysctl_dccp_sequence_window;
extern int sysctl_dccp_rx_ccid;
extern int sysctl_dccp_tx_ccid;
extern int dccp_feat_init(struct sock *sk);
extern void dccp_feat_initialise_sysctls(void);
extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
u8 const *list, u8 len);
extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val);
extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *,
u8 mand, u8 opt, u8 feat, u8 *val, u8 len);
extern int dccp_feat_clone_list(struct list_head const *, struct list_head *);
/*
* Encoding variable-length options and their maximum length.
*
* This affects NN options (SP options are all u8) and other variable-length
* options (see table 3 in RFC 4340). The limit is currently given the Sequence
* Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option, all other
* options consume less than 6 bytes (timestamps are 4 bytes).
* When updating this constant (e.g. due to new internet drafts / RFCs), make
* sure that you also update all code which refers to it.
*/
#define DCCP_OPTVAL_MAXLEN 6
extern void dccp_encode_value_var(const u64 value, u8 *to, const u8 len);
extern u64 dccp_decode_value_var(const u8 *bf, const u8 len);
extern int dccp_insert_option_mandatory(struct sk_buff *skb);
extern int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat,
u8 *val, u8 len, bool repeat_first);
#endif /* _DCCP_FEAT_H */
This diff is collapsed.
......@@ -545,6 +545,7 @@ static void dccp_v4_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
static void dccp_v4_reqsk_destructor(struct request_sock *req)
{
dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
kfree(inet_rsk(req)->opt);
}
......@@ -595,7 +596,8 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
if (req == NULL)
goto drop;
dccp_reqsk_init(req, skb);
if (dccp_reqsk_init(req, dccp_sk(sk), skb))
goto drop_and_free;
dreq = dccp_rsk(req);
if (dccp_parse_options(sk, dreq, skb))
......
......@@ -302,6 +302,7 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req)
static void dccp_v6_reqsk_destructor(struct request_sock *req)
{
dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
if (inet6_rsk(req)->pktopts != NULL)
kfree_skb(inet6_rsk(req)->pktopts);
}
......@@ -424,7 +425,8 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
if (req == NULL)
goto drop;
dccp_reqsk_init(req, skb);
if (dccp_reqsk_init(req, dccp_sk(sk), skb))
goto drop_and_free;
dreq = dccp_rsk(req);
if (dccp_parse_options(sk, dreq, skb))
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -811,25 +811,12 @@ void tcp_update_metrics(struct sock *sk)
}
}
/* Numbers are taken from RFC3390.
*
* John Heffner states:
*
* The RFC specifies a window of no more than 4380 bytes
* unless 2*MSS > 4380. Reading the pseudocode in the RFC
* is a bit misleading because they use a clamp at 4380 bytes
* rather than use a multiplier in the relevant range.
*/
__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
{
__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
if (!cwnd) {
if (tp->mss_cache > 1460)
cwnd = 2;
else
cwnd = (tp->mss_cache > 1095) ? 3 : 4;
}
if (!cwnd)
cwnd = rfc3390_bytes_to_packets(tp->mss_cache);
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment