Commit 5af7d0d9 authored by Linus Torvalds

Merge bk://kernel.bkbits.net/davem/sparc-2.5

into home.osdl.org:/home/torvalds/v2.5/linux
parents 86f48877 d4cef02e
...@@ -61,7 +61,11 @@ extern struct rt6_info *rt6_lookup(struct in6_addr *daddr,
struct in6_addr *saddr,
int oif, int flags);
extern struct rt6_info *ip6_dst_alloc(void);
extern struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
struct neighbour *neigh,
int (*output)(struct sk_buff *));
extern int ndisc_dst_gc(int *more);
extern void fib6_force_start_gc(void);
/*
* support functions for ND
...
/*
* IP Virtual Server
* data structure and functionality definitions
*/
#ifndef _IP_VS_H
#define _IP_VS_H
#include <asm/types.h> /* For __uXX types */
#define IP_VS_VERSION_CODE 0x010107
#define NVERSION(version) \
(version >> 16) & 0xFF, \
(version >> 8) & 0xFF, \
version & 0xFF
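/*
 * Usage sketch (illustration only, not part of the original header; the
 * helper name below is hypothetical): NVERSION() expands to three
 * comma-separated byte values (major, minor, patch), so it can be passed
 * straight to a "%d.%d.%d" format string.  With IP_VS_VERSION_CODE
 * 0x010107 that prints "1.1.7".
 */
static inline void ip_vs_example_unpack_version(unsigned int code,
						unsigned int v[3])
{
	unsigned int tmp[3] = { NVERSION(code) };	/* {major, minor, patch} */

	v[0] = tmp[0];	/* (code >> 16) & 0xFF */
	v[1] = tmp[1];	/* (code >> 8) & 0xFF */
	v[2] = tmp[2];	/* code & 0xFF */
}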
/*
* Virtual Service Flags
*/
#define IP_VS_SVC_F_PERSISTENT 0x0001 /* persistent port */
#define IP_VS_SVC_F_HASHED 0x0002 /* hashed entry */
/*
* Destination Server Flags
*/
#define IP_VS_DEST_F_AVAILABLE 0x0001 /* server is available */
#define IP_VS_DEST_F_OVERLOAD 0x0002 /* server is overloaded */
/*
* IPVS sync daemon states
*/
#define IP_VS_STATE_NONE 0x0000 /* daemon is stopped */
#define IP_VS_STATE_MASTER 0x0001 /* started as master */
#define IP_VS_STATE_BACKUP 0x0002 /* started as backup */
/*
* IPVS socket options
*/
#define IP_VS_BASE_CTL (64+1024+64) /* base */
#define IP_VS_SO_SET_NONE IP_VS_BASE_CTL /* just peek */
#define IP_VS_SO_SET_INSERT (IP_VS_BASE_CTL+1)
#define IP_VS_SO_SET_ADD (IP_VS_BASE_CTL+2)
#define IP_VS_SO_SET_EDIT (IP_VS_BASE_CTL+3)
#define IP_VS_SO_SET_DEL (IP_VS_BASE_CTL+4)
#define IP_VS_SO_SET_FLUSH (IP_VS_BASE_CTL+5)
#define IP_VS_SO_SET_LIST (IP_VS_BASE_CTL+6)
#define IP_VS_SO_SET_ADDDEST (IP_VS_BASE_CTL+7)
#define IP_VS_SO_SET_DELDEST (IP_VS_BASE_CTL+8)
#define IP_VS_SO_SET_EDITDEST (IP_VS_BASE_CTL+9)
#define IP_VS_SO_SET_TIMEOUT (IP_VS_BASE_CTL+10)
#define IP_VS_SO_SET_STARTDAEMON (IP_VS_BASE_CTL+11)
#define IP_VS_SO_SET_STOPDAEMON (IP_VS_BASE_CTL+12)
#define IP_VS_SO_SET_RESTORE (IP_VS_BASE_CTL+13)
#define IP_VS_SO_SET_SAVE (IP_VS_BASE_CTL+14)
#define IP_VS_SO_SET_ZERO (IP_VS_BASE_CTL+15)
#define IP_VS_SO_SET_MAX IP_VS_SO_SET_ZERO
#define IP_VS_SO_GET_VERSION IP_VS_BASE_CTL
#define IP_VS_SO_GET_INFO (IP_VS_BASE_CTL+1)
#define IP_VS_SO_GET_SERVICES (IP_VS_BASE_CTL+2)
#define IP_VS_SO_GET_SERVICE (IP_VS_BASE_CTL+3)
#define IP_VS_SO_GET_DESTS (IP_VS_BASE_CTL+4)
#define IP_VS_SO_GET_DEST (IP_VS_BASE_CTL+5) /* not used now */
#define IP_VS_SO_GET_TIMEOUT (IP_VS_BASE_CTL+6)
#define IP_VS_SO_GET_DAEMON (IP_VS_BASE_CTL+7)
#define IP_VS_SO_GET_MAX IP_VS_SO_GET_DAEMON
/*
* IPVS Connection Flags
*/
#define IP_VS_CONN_F_FWD_MASK 0x0007 /* mask for the fwd methods */
#define IP_VS_CONN_F_MASQ 0x0000 /* masquerading/NAT */
#define IP_VS_CONN_F_LOCALNODE 0x0001 /* local node */
#define IP_VS_CONN_F_TUNNEL 0x0002 /* tunneling */
#define IP_VS_CONN_F_DROUTE 0x0003 /* direct routing */
#define IP_VS_CONN_F_BYPASS 0x0004 /* cache bypass */
#define IP_VS_CONN_F_SYNC 0x0020 /* entry created by sync */
#define IP_VS_CONN_F_HASHED 0x0040 /* hashed entry */
#define IP_VS_CONN_F_NOOUTPUT 0x0080 /* no output packets */
#define IP_VS_CONN_F_INACTIVE 0x0100 /* not established */
#define IP_VS_CONN_F_OUT_SEQ 0x0200 /* must do output seq adjust */
#define IP_VS_CONN_F_IN_SEQ 0x0400 /* must do input seq adjust */
#define IP_VS_CONN_F_SEQ_MASK 0x0600 /* in/out sequence mask */
#define IP_VS_CONN_F_NO_CPORT 0x0800 /* no client port set yet */
/* Move it to better place one day, for now keep it unique */
#define NFC_IPVS_PROPERTY 0x10000
#define IP_VS_SCHEDNAME_MAXLEN 16
#define IP_VS_IFNAME_MAXLEN 16
/*
* The struct ip_vs_service_user and struct ip_vs_dest_user are
* used to set IPVS rules through setsockopt.
*/
struct ip_vs_service_user {
/* virtual service addresses */
u_int16_t protocol;
u_int32_t addr; /* virtual ip address */
u_int16_t port;
u_int32_t fwmark; /* firewall mark of service */
/* virtual service options */
char sched_name[IP_VS_SCHEDNAME_MAXLEN];
unsigned flags; /* virtual service flags */
unsigned timeout; /* persistent timeout in sec */
u_int32_t netmask; /* persistent netmask */
};
struct ip_vs_dest_user {
/* destination server address */
u_int32_t addr;
u_int16_t port;
/* real server options */
unsigned conn_flags; /* connection flags */
int weight; /* destination weight */
/* thresholds for active connections */
u_int32_t u_threshold; /* upper threshold */
u_int32_t l_threshold; /* lower threshold */
};
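/*
 * User-space sketch (illustration only; this would live in a separate
 * program, not in this header): roughly how an administration tool such as
 * ipvsadm hands an ip_vs_service_user to the kernel.  The AF_INET raw
 * socket, the IPPROTO_IP sockopt level and the need for CAP_NET_ADMIN are
 * assumptions about how the IPVS sockopts are reached; the function name
 * and addresses are examples.  Real servers would then be added with
 * IP_VS_SO_SET_ADDDEST and an ip_vs_dest_user in a similar way.
 */
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

static int example_add_virtual_service(void)
{
	struct ip_vs_service_user svc;
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);

	if (fd < 0)
		return -1;

	memset(&svc, 0, sizeof(svc));
	svc.protocol = IPPROTO_TCP;
	svc.addr     = inet_addr("192.168.10.1");	/* example VIP */
	svc.port     = htons(80);
	svc.netmask  = 0xffffffff;			/* persistence granularity */
	strncpy(svc.sched_name, "wlc", IP_VS_SCHEDNAME_MAXLEN);

	/* flags, timeout and fwmark stay zero in this example */
	return setsockopt(fd, IPPROTO_IP, IP_VS_SO_SET_ADD,
			  &svc, sizeof(svc));
}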
/*
* IPVS statistics object (for user space)
*/
struct ip_vs_stats_user
{
__u32 conns; /* connections scheduled */
__u32 inpkts; /* incoming packets */
__u32 outpkts; /* outgoing packets */
__u64 inbytes; /* incoming bytes */
__u64 outbytes; /* outgoing bytes */
__u32 cps; /* current connection rate */
__u32 inpps; /* current in packet rate */
__u32 outpps; /* current out packet rate */
__u32 inbps; /* current in byte rate */
__u32 outbps; /* current out byte rate */
};
/* The argument to IP_VS_SO_GET_INFO */
struct ip_vs_getinfo {
/* version number */
unsigned int version;
/* size of connection hash table */
unsigned int size;
/* number of virtual services */
unsigned int num_services;
};
/* The argument to IP_VS_SO_GET_SERVICE */
struct ip_vs_service_entry {
/* which service: user fills in these */
u_int16_t protocol;
u_int32_t addr; /* virtual address */
u_int16_t port;
u_int32_t fwmark; /* firewall mark of service */
/* service options */
char sched_name[IP_VS_SCHEDNAME_MAXLEN];
unsigned flags; /* virtual service flags */
unsigned timeout; /* persistent timeout */
u_int32_t netmask; /* persistent netmask */
/* number of real servers */
unsigned int num_dests;
/* statistics */
struct ip_vs_stats_user stats;
};
struct ip_vs_dest_entry {
u_int32_t addr; /* destination address */
u_int16_t port;
unsigned conn_flags; /* connection flags */
int weight; /* destination weight */
u_int32_t u_threshold; /* upper threshold */
u_int32_t l_threshold; /* lower threshold */
u_int32_t activeconns; /* active connections */
u_int32_t inactconns; /* inactive connections */
u_int32_t persistconns; /* persistent connections */
/* statistics */
struct ip_vs_stats_user stats;
};
/* The argument to IP_VS_SO_GET_DESTS */
struct ip_vs_get_dests {
/* which service: user fills in these */
u_int16_t protocol;
u_int32_t addr; /* virtual address */
u_int16_t port;
u_int32_t fwmark; /* firewall mark of service */
/* number of real servers */
unsigned int num_dests;
/* the real servers */
struct ip_vs_dest_entry entrytable[0];
};
/* The argument to IP_VS_SO_GET_SERVICES */
struct ip_vs_get_services {
/* number of virtual services */
unsigned int num_services;
/* service table */
struct ip_vs_service_entry entrytable[0];
};
/* The argument to IP_VS_SO_GET_TIMEOUT */
struct ip_vs_timeout_user {
int tcp_timeout;
int tcp_fin_timeout;
int udp_timeout;
};
/* The argument to IP_VS_SO_GET_DAEMON */
struct ip_vs_daemon_user {
/* sync daemon state (master/backup) */
int state;
/* multicast interface name */
char mcast_ifn[IP_VS_IFNAME_MAXLEN];
/* SyncID we belong to */
int syncid;
};
#ifdef __KERNEL__
#include <linux/config.h>
#include <linux/list.h> /* for struct list_head */
#include <linux/spinlock.h> /* for struct rwlock_t */
#include <linux/skbuff.h> /* for struct sk_buff */
#include <linux/ip.h> /* for struct iphdr */
#include <asm/atomic.h> /* for struct atomic_t */
#include <linux/netdevice.h> /* for struct neighbour */
#include <net/dst.h> /* for struct dst_entry */
#include <net/tcp.h>
#include <net/udp.h>
#include <linux/compiler.h>
#ifdef CONFIG_IP_VS_DEBUG
extern int ip_vs_get_debug_level(void);
#define IP_VS_DBG(level, msg...) \
do { \
if (level <= ip_vs_get_debug_level()) \
printk(KERN_DEBUG "IPVS: " msg); \
} while (0)
#define IP_VS_DBG_RL(msg...) \
do { \
if (net_ratelimit()) \
printk(KERN_DEBUG "IPVS: " msg); \
} while (0)
#define IP_VS_DBG_PKT(level, pp, iph, msg) \
do { \
if (level <= ip_vs_get_debug_level()) \
pp->debug_packet(pp, iph, msg); \
} while (0)
#define IP_VS_DBG_RL_PKT(level, pp, iph, msg) \
do { \
if (level <= ip_vs_get_debug_level() && \
net_ratelimit()) \
pp->debug_packet(pp, iph, msg); \
} while (0)
#else /* NO DEBUGGING at ALL */
#define IP_VS_DBG(level, msg...) do {} while (0)
#define IP_VS_DBG_RL(msg...) do {} while (0)
#define IP_VS_DBG_PKT(level, pp, iph, msg) do {} while (0)
#define IP_VS_DBG_RL_PKT(level, pp, iph, msg) do {} while (0)
#endif
#define IP_VS_BUG() BUG()
#define IP_VS_ERR(msg...) printk(KERN_ERR "IPVS: " msg)
#define IP_VS_INFO(msg...) printk(KERN_INFO "IPVS: " msg)
#define IP_VS_WARNING(msg...) \
printk(KERN_WARNING "IPVS: " msg)
#define IP_VS_ERR_RL(msg...) \
do { \
if (net_ratelimit()) \
printk(KERN_ERR "IPVS: " msg); \
} while (0)
#ifdef CONFIG_IP_VS_DEBUG
#define EnterFunction(level) \
do { \
if (level <= ip_vs_get_debug_level()) \
printk(KERN_DEBUG "Enter: %s, %s line %i\n", \
__FUNCTION__, __FILE__, __LINE__); \
} while (0)
#define LeaveFunction(level) \
do { \
if (level <= ip_vs_get_debug_level()) \
printk(KERN_DEBUG "Leave: %s, %s line %i\n", \
__FUNCTION__, __FILE__, __LINE__); \
} while (0)
#else
#define EnterFunction(level) do {} while (0)
#define LeaveFunction(level) do {} while (0)
#endif
#define IP_VS_WAIT_WHILE(expr) while (expr) { cpu_relax(); }
/*
* The port number of FTP service (in network order).
*/
#define FTPPORT __constant_htons(21)
#define FTPDATA __constant_htons(20)
/*
* IPVS sysctl variables under the /proc/sys/net/ipv4/vs/
*/
#define NET_IPV4_VS 21
enum {
NET_IPV4_VS_DEBUG_LEVEL=1,
NET_IPV4_VS_AMEMTHRESH=2,
NET_IPV4_VS_AMDROPRATE=3,
NET_IPV4_VS_DROP_ENTRY=4,
NET_IPV4_VS_DROP_PACKET=5,
NET_IPV4_VS_SECURE_TCP=6,
NET_IPV4_VS_TO_ES=7,
NET_IPV4_VS_TO_SS=8,
NET_IPV4_VS_TO_SR=9,
NET_IPV4_VS_TO_FW=10,
NET_IPV4_VS_TO_TW=11,
NET_IPV4_VS_TO_CL=12,
NET_IPV4_VS_TO_CW=13,
NET_IPV4_VS_TO_LA=14,
NET_IPV4_VS_TO_LI=15,
NET_IPV4_VS_TO_SA=16,
NET_IPV4_VS_TO_UDP=17,
NET_IPV4_VS_TO_ICMP=18,
NET_IPV4_VS_LBLC_EXPIRE=19,
NET_IPV4_VS_LBLCR_EXPIRE=20,
NET_IPV4_VS_CACHE_BYPASS=22,
NET_IPV4_VS_EXPIRE_NODEST_CONN=23,
NET_IPV4_VS_SYNC_THRESHOLD=24,
NET_IPV4_VS_NAT_ICMP_SEND=25,
NET_IPV4_VS_LAST
};
/*
* TCP State Values
*/
enum {
IP_VS_TCP_S_NONE = 0,
IP_VS_TCP_S_ESTABLISHED,
IP_VS_TCP_S_SYN_SENT,
IP_VS_TCP_S_SYN_RECV,
IP_VS_TCP_S_FIN_WAIT,
IP_VS_TCP_S_TIME_WAIT,
IP_VS_TCP_S_CLOSE,
IP_VS_TCP_S_CLOSE_WAIT,
IP_VS_TCP_S_LAST_ACK,
IP_VS_TCP_S_LISTEN,
IP_VS_TCP_S_SYNACK,
IP_VS_TCP_S_LAST
};
/*
* UDP State Values
*/
enum {
IP_VS_UDP_S_NORMAL,
IP_VS_UDP_S_LAST,
};
/*
* ICMP State Values
*/
enum {
IP_VS_ICMP_S_NORMAL,
IP_VS_ICMP_S_LAST,
};
/*
* Transport protocol header
*/
union ip_vs_tphdr {
unsigned char *raw;
struct udphdr *uh;
struct tcphdr *th;
struct icmphdr *icmph;
__u16 *portp;
};
/*
* Delta sequence info structure
* Each ip_vs_conn has 2 (output AND input seq. changes).
* Only used in the VS/NAT.
*/
struct ip_vs_seq {
__u32 init_seq; /* Add delta from this seq */
__u32 delta; /* Delta in sequence numbers */
__u32 previous_delta; /* Delta in sequence numbers
before last resized pkt */
};
/*
* IPVS statistics object
*/
struct ip_vs_stats
{
__u32 conns; /* connections scheduled */
__u32 inpkts; /* incoming packets */
__u32 outpkts; /* outgoing packets */
__u64 inbytes; /* incoming bytes */
__u64 outbytes; /* outgoing bytes */
__u32 cps; /* current connection rate */
__u32 inpps; /* current in packet rate */
__u32 outpps; /* current out packet rate */
__u32 inbps; /* current in byte rate */
__u32 outbps; /* current out byte rate */
spinlock_t lock; /* spin lock */
};
struct ip_vs_conn;
struct ip_vs_app;
struct ip_vs_protocol {
struct ip_vs_protocol *next;
char *name;
__u16 protocol;
int minhlen;
int minhlen_icmp;
int dont_defrag;
int skip_nonexisting;
int slave; /* if controlled by others */
atomic_t appcnt; /* counter of proto app incs */
int *timeout_table; /* protocol timeout table */
void (*init)(struct ip_vs_protocol *pp);
void (*exit)(struct ip_vs_protocol *pp);
int (*conn_schedule)(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h,
int *verdict, struct ip_vs_conn **cpp);
struct ip_vs_conn *
(*conn_in_get)(struct sk_buff *skb,
struct ip_vs_protocol *pp, struct iphdr *iph,
union ip_vs_tphdr h, int inverse);
struct ip_vs_conn *
(*conn_out_get)(struct sk_buff *skb,
struct ip_vs_protocol *pp, struct iphdr *iph,
union ip_vs_tphdr h, int inverse);
int (*snat_handler)(struct sk_buff *skb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
struct iphdr *iph, union ip_vs_tphdr h, int size);
int (*dnat_handler)(struct sk_buff *skb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
struct iphdr *iph, union ip_vs_tphdr h, int size);
int (*csum_check)(struct sk_buff *skb,
struct ip_vs_protocol *pp, struct iphdr *iph,
union ip_vs_tphdr h, int size);
const char *(*state_name)(int state);
int (*state_transition)(struct ip_vs_conn *cp, int direction,
struct iphdr *iph, union ip_vs_tphdr h,
struct ip_vs_protocol *pp);
int (*register_app)(struct ip_vs_app *inc);
void (*unregister_app)(struct ip_vs_app *inc);
int (*app_conn_bind)(struct ip_vs_conn *cp);
void (*debug_packet)(struct ip_vs_protocol *pp, struct iphdr *iph,
char *msg);
void (*timeout_change)(struct ip_vs_protocol *pp, int flags);
int (*set_state_timeout)(struct ip_vs_protocol *pp, char *sname, int to);
};
extern struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto);
/*
* IP_VS structure allocated for each dynamically scheduled connection
*/
struct ip_vs_conn {
struct list_head c_list; /* hashed list heads */
/* Protocol, addresses and port numbers */
__u32 caddr; /* client address */
__u32 vaddr; /* virtual address */
__u32 daddr; /* destination address */
__u16 cport;
__u16 vport;
__u16 dport;
__u16 protocol; /* Which protocol (TCP/UDP) */
/* counter and timer */
atomic_t refcnt; /* reference count */
struct timer_list timer; /* Expiration timer */
volatile unsigned long timeout; /* timeout */
/* Flags and state transition */
spinlock_t lock; /* lock for state transition */
volatile __u16 flags; /* status flags */
volatile __u16 state; /* state info */
/* Control members */
struct ip_vs_conn *control; /* Master control connection */
atomic_t n_control; /* Number of controlled ones */
struct ip_vs_dest *dest; /* real server */
atomic_t in_pkts; /* incoming packet counter */
/* packet transmitter for different forwarding methods */
int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp);
/* Note: we can group the following members into a structure,
in order to save more space, and the following members are
only used in VS/NAT anyway */
struct ip_vs_app *app; /* bound ip_vs_app object */
void *app_data; /* Application private data */
struct ip_vs_seq in_seq; /* incoming seq. struct */
struct ip_vs_seq out_seq; /* outgoing seq. struct */
};
/*
* The information about the virtual service offered to the net
* and the forwarding entries
*/
struct ip_vs_service {
struct list_head s_list; /* for normal service table */
struct list_head f_list; /* for fwmark-based service table */
atomic_t refcnt; /* reference counter */
atomic_t usecnt; /* use counter */
__u16 protocol; /* which protocol (TCP/UDP) */
__u32 addr; /* IP address for virtual service */
__u16 port; /* port number for the service */
__u32 fwmark; /* firewall mark of the service */
unsigned flags; /* service status flags */
unsigned timeout; /* persistent timeout in ticks */
__u32 netmask; /* grouping granularity */
struct list_head destinations; /* real server d-linked list */
__u32 num_dests; /* number of servers */
struct ip_vs_stats stats; /* statistics for the service */
struct ip_vs_app *inc; /* bind conns to this app inc */
/* for scheduling */
struct ip_vs_scheduler *scheduler; /* bound scheduler object */
rwlock_t sched_lock; /* lock sched_data */
void *sched_data; /* scheduler application data */
};
/*
* The real server destination forwarding entry
* with ip address, port number, and so on.
*/
struct ip_vs_dest {
struct list_head n_list; /* for the dests in the service */
struct list_head d_list; /* for table with all the dests */
__u32 addr; /* IP address of the server */
__u16 port; /* port number of the server */
volatile unsigned flags; /* dest status flags */
atomic_t conn_flags; /* flags to copy to conn */
atomic_t weight; /* server weight */
atomic_t refcnt; /* reference counter */
struct ip_vs_stats stats; /* statistics */
/* connection counters and thresholds */
atomic_t activeconns; /* active connections */
atomic_t inactconns; /* inactive connections */
atomic_t persistconns; /* persistent connections */
__u32 u_threshold; /* upper threshold */
__u32 l_threshold; /* lower threshold */
/* for destination cache */
spinlock_t dst_lock; /* lock of dst_cache */
struct dst_entry *dst_cache; /* destination cache entry */
u32 dst_rtos; /* RT_TOS(tos) for dst */
/* for virtual service */
struct ip_vs_service *svc; /* service it belongs to */
__u16 protocol; /* which protocol (TCP/UDP) */
__u32 vaddr; /* virtual IP address */
__u16 vport; /* virtual port number */
__u32 vfwmark; /* firewall mark of service */
};
/*
* The scheduler object
*/
struct ip_vs_scheduler {
struct list_head n_list; /* d-linked list head */
char *name; /* scheduler name */
atomic_t refcnt; /* reference counter */
struct module *module; /* THIS_MODULE/NULL */
/* scheduler initializing service */
int (*init_service)(struct ip_vs_service *svc);
/* scheduling service finish */
int (*done_service)(struct ip_vs_service *svc);
/* scheduler updating service */
int (*update_service)(struct ip_vs_service *svc);
/* selecting a server from the given service */
struct ip_vs_dest* (*schedule)(struct ip_vs_service *svc,
struct iphdr *iph);
};
/*
* The application module object (a.k.a. app incarnation)
*/
struct ip_vs_app
{
struct list_head a_list; /* member in app list */
int type; /* IP_VS_APP_TYPE_xxx */
char *name; /* application module name */
__u16 protocol;
struct module *module; /* THIS_MODULE/NULL */
struct list_head incs_list; /* list of incarnations */
/* members for application incarnations */
struct list_head p_list; /* member in proto app list */
struct ip_vs_app *app; /* its real application */
__u16 port; /* port number in net order */
atomic_t usecnt; /* usage counter */
/* output hook */
int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *,
struct sk_buff *);
/* input hook */
int (*pkt_in)(struct ip_vs_app *, struct ip_vs_conn *,
struct sk_buff *);
/* ip_vs_app initializer */
int (*init_conn)(struct ip_vs_app *, struct ip_vs_conn *);
/* ip_vs_app finish */
int (*done_conn)(struct ip_vs_app *, struct ip_vs_conn *);
/* not used now */
int (*bind_conn)(struct ip_vs_app *, struct ip_vs_conn *,
struct ip_vs_protocol *);
void (*unbind_conn)(struct ip_vs_app *, struct ip_vs_conn *);
int * timeout_table;
int * timeouts;
int timeouts_size;
int (*conn_schedule)(struct sk_buff *skb, struct ip_vs_app *app,
struct iphdr *iph, union ip_vs_tphdr h,
int *verdict, struct ip_vs_conn **cpp);
struct ip_vs_conn *
(*conn_in_get)(struct sk_buff *skb, struct ip_vs_app *app,
struct iphdr *iph, union ip_vs_tphdr h, int inverse);
struct ip_vs_conn *
(*conn_out_get)(struct sk_buff *skb, struct ip_vs_app *app,
struct iphdr *iph, union ip_vs_tphdr h, int inverse);
int (*state_transition)(struct ip_vs_conn *cp, int direction,
struct iphdr *iph,
union ip_vs_tphdr h, struct ip_vs_app *app);
void (*timeout_change)(struct ip_vs_app *app, int flags);
};
/*
* IPVS core functions
* (from ip_vs_core.c)
*/
extern const char *ip_vs_proto_name(unsigned proto);
extern unsigned int check_for_ip_vs_out(struct sk_buff **skb_p,
int (*okfn)(struct sk_buff *));
extern void ip_vs_init_hash_table(struct list_head *table, int rows);
#define IP_VS_INIT_HASH_TABLE(t) ip_vs_init_hash_table(t, sizeof(t)/sizeof(t[0]))
#define IP_VS_APP_TYPE_UNSPEC 0
#define IP_VS_APP_TYPE_FTP 1
/*
* ip_vs_conn handling functions
* (from ip_vs_conn.c)
*/
/*
* IPVS connection entry hash table
*/
#ifndef CONFIG_IP_VS_TAB_BITS
#define CONFIG_IP_VS_TAB_BITS 12
#endif
/* make sure that IP_VS_CONN_TAB_BITS is located in [8, 20] */
#if CONFIG_IP_VS_TAB_BITS < 8
#define IP_VS_CONN_TAB_BITS 8
#endif
#if CONFIG_IP_VS_TAB_BITS > 20
#define IP_VS_CONN_TAB_BITS 20
#endif
#if 8 <= CONFIG_IP_VS_TAB_BITS && CONFIG_IP_VS_TAB_BITS <= 20
#define IP_VS_CONN_TAB_BITS CONFIG_IP_VS_TAB_BITS
#endif
#define IP_VS_CONN_TAB_SIZE (1 << IP_VS_CONN_TAB_BITS)
#define IP_VS_CONN_TAB_MASK (IP_VS_CONN_TAB_SIZE - 1)
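/*
 * Sizing sketch (hypothetical helper, not part of the original header): the
 * connection table is an array of IP_VS_CONN_TAB_SIZE hash-chain heads and
 * every tracked connection adds one struct ip_vs_conn, so a rough memory
 * estimate for an expected connection count is:
 */
static inline unsigned long ip_vs_example_conn_mem(unsigned long nr_conns)
{
	return IP_VS_CONN_TAB_SIZE * sizeof(struct list_head) +
	       nr_conns * sizeof(struct ip_vs_conn);
}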
enum {
IP_VS_DIR_INPUT = 0,
IP_VS_DIR_OUTPUT,
IP_VS_DIR_INPUT_ONLY,
IP_VS_DIR_LAST,
};
extern struct ip_vs_conn *ip_vs_conn_in_get
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port);
extern struct ip_vs_conn *ip_vs_conn_out_get
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port);
/* put back the conn without restarting its timer */
static inline void __ip_vs_conn_put(struct ip_vs_conn *cp)
{
atomic_dec(&cp->refcnt);
}
extern void ip_vs_conn_put(struct ip_vs_conn *cp);
extern void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __u16 cport);
extern struct ip_vs_conn *
ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport,
__u32 daddr, __u16 dport, unsigned flags,
struct ip_vs_dest *dest);
extern void ip_vs_conn_expire_now(struct ip_vs_conn *cp);
extern const char * ip_vs_state_name(__u16 proto, int state);
extern void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp);
extern int ip_vs_check_template(struct ip_vs_conn *ct);
extern void ip_vs_secure_tcp_set(int on);
extern void ip_vs_random_dropentry(void);
extern int ip_vs_conn_init(void);
extern void ip_vs_conn_cleanup(void);
static inline void ip_vs_control_del(struct ip_vs_conn *cp)
{
struct ip_vs_conn *ctl_cp = cp->control;
if (!ctl_cp) {
IP_VS_ERR("request control DEL for uncontrolled: "
"%d.%d.%d.%d:%d to %d.%d.%d.%d:%d\n",
NIPQUAD(cp->caddr),ntohs(cp->cport),
NIPQUAD(cp->vaddr),ntohs(cp->vport));
return;
}
IP_VS_DBG(7, "DELeting control for: "
"cp.dst=%d.%d.%d.%d:%d ctl_cp.dst=%d.%d.%d.%d:%d\n",
NIPQUAD(cp->caddr),ntohs(cp->cport),
NIPQUAD(ctl_cp->caddr),ntohs(ctl_cp->cport));
cp->control = NULL;
if (atomic_read(&ctl_cp->n_control) == 0) {
IP_VS_ERR("BUG control DEL with n=0 : "
"%d.%d.%d.%d:%d to %d.%d.%d.%d:%d\n",
NIPQUAD(cp->caddr),ntohs(cp->cport),
NIPQUAD(cp->vaddr),ntohs(cp->vport));
return;
}
atomic_dec(&ctl_cp->n_control);
}
static inline void
ip_vs_control_add(struct ip_vs_conn *cp, struct ip_vs_conn *ctl_cp)
{
if (cp->control) {
IP_VS_ERR("request control ADD for already controlled: "
"%d.%d.%d.%d:%d to %d.%d.%d.%d:%d\n",
NIPQUAD(cp->caddr),ntohs(cp->cport),
NIPQUAD(cp->vaddr),ntohs(cp->vport));
ip_vs_control_del(cp);
}
IP_VS_DBG(7, "ADDing control for: "
"cp.dst=%d.%d.%d.%d:%d ctl_cp.dst=%d.%d.%d.%d:%d\n",
NIPQUAD(cp->caddr),ntohs(cp->cport),
NIPQUAD(ctl_cp->caddr),ntohs(ctl_cp->cport));
cp->control = ctl_cp;
atomic_inc(&ctl_cp->n_control);
}
/*
* IPVS application functions
* (from ip_vs_app.c)
*/
#define IP_VS_APP_MAX_PORTS 8
extern int register_ip_vs_app(struct ip_vs_app *app);
extern void unregister_ip_vs_app(struct ip_vs_app *app);
extern int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp);
extern void ip_vs_unbind_app(struct ip_vs_conn *cp);
extern int
register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port);
extern int ip_vs_app_inc_get(struct ip_vs_app *inc);
extern void ip_vs_app_inc_put(struct ip_vs_app *inc);
extern int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff *skb);
extern int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb);
extern int ip_vs_skb_replace(struct sk_buff *skb, int pri,
char *o_buf, int o_len, char *n_buf, int n_len);
extern int ip_vs_app_init(void);
extern void ip_vs_app_cleanup(void);
/*
* IPVS protocol functions (from ip_vs_proto.c)
*/
extern int ip_vs_protocol_init(void);
extern void ip_vs_protocol_cleanup(void);
extern void ip_vs_protocol_timeout_change(int flags);
extern int *ip_vs_create_timeout_table(int *table, int size);
extern int
ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to);
extern struct ip_vs_protocol ip_vs_protocol_tcp;
extern struct ip_vs_protocol ip_vs_protocol_udp;
extern struct ip_vs_protocol ip_vs_protocol_icmp;
extern struct ip_vs_protocol ip_vs_protocol_esp;
extern struct ip_vs_protocol ip_vs_protocol_ah;
/*
* Registering/unregistering scheduler functions
* (from ip_vs_sched.c)
*/
extern int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler);
extern int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler);
extern int ip_vs_bind_scheduler(struct ip_vs_service *svc,
struct ip_vs_scheduler *scheduler);
extern int ip_vs_unbind_scheduler(struct ip_vs_service *svc);
extern struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name);
extern void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler);
extern struct ip_vs_conn *
ip_vs_schedule(struct ip_vs_service *svc, struct iphdr *iph);
extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
struct ip_vs_protocol *pp, union ip_vs_tphdr h);
/*
* IPVS control data and functions (from ip_vs_ctl.c)
*/
extern int sysctl_ip_vs_cache_bypass;
extern int sysctl_ip_vs_expire_nodest_conn;
extern int sysctl_ip_vs_sync_threshold[2];
extern int sysctl_ip_vs_nat_icmp_send;
extern atomic_t ip_vs_dropentry;
extern struct ip_vs_stats ip_vs_stats;
extern struct ip_vs_service *
ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport);
static inline void ip_vs_service_put(struct ip_vs_service *svc)
{
atomic_dec(&svc->usecnt);
}
extern struct ip_vs_dest *
ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport);
extern int ip_vs_use_count_inc(void);
extern void ip_vs_use_count_dec(void);
extern void update_defense_level(void);
extern int ip_vs_control_init(void);
extern void ip_vs_control_cleanup(void);
/*
* IPVS sync daemon data and function prototypes
* (from ip_vs_sync.c)
*/
extern volatile int ip_vs_sync_state;
extern volatile int ip_vs_master_syncid;
extern volatile int ip_vs_backup_syncid;
extern char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
extern char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
extern int start_sync_thread(int state, char *mcast_ifn, __u8 syncid);
extern int stop_sync_thread(int state);
extern void ip_vs_sync_conn(struct ip_vs_conn *cp);
/*
* IPVS rate estimator prototypes (from ip_vs_est.c)
*/
extern int ip_vs_new_estimator(struct ip_vs_stats *stats);
extern void ip_vs_kill_estimator(struct ip_vs_stats *stats);
extern void ip_vs_zero_estimator(struct ip_vs_stats *stats);
/*
* Various IPVS packet transmitters (from ip_vs_xmit.c)
*/
extern int ip_vs_null_xmit
(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp);
extern int ip_vs_bypass_xmit
(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp);
extern int ip_vs_nat_xmit
(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp);
extern int ip_vs_tunnel_xmit
(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp);
extern int ip_vs_dr_xmit
(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp);
extern int ip_vs_icmp_xmit
(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp);
extern void ip_vs_dst_reset(struct ip_vs_dest *dest);
/*
* This is a simple mechanism to ignore packets when
* we are loaded. Just set ip_vs_drop_rate to 'n' and
* we start to drop 1/rate of the packets
*/
extern int ip_vs_drop_rate;
extern int ip_vs_drop_counter;
static __inline__ int ip_vs_todrop(void)
{
if (!ip_vs_drop_rate) return 0;
if (--ip_vs_drop_counter > 0) return 0;
ip_vs_drop_counter = ip_vs_drop_rate;
return 1;
}
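/*
 * Usage sketch (hypothetical helper, not part of the original header):
 * callers in the packet path consult ip_vs_todrop() before setting up new
 * state; once the defense logic has set ip_vs_drop_rate, roughly one in
 * "rate" requests is refused.
 */
static inline int ip_vs_example_admit_request(void)
{
	if (ip_vs_todrop())
		return 0;	/* shed load: refuse this request */
	return 1;		/* admit it and continue normal processing */
}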
/*
* ip_vs_fwd_tag returns the forwarding tag of the connection
*/
#define IP_VS_FWD_METHOD(cp) (cp->flags & IP_VS_CONN_F_FWD_MASK)
extern __inline__ char ip_vs_fwd_tag(struct ip_vs_conn *cp)
{
char fwd;
switch (IP_VS_FWD_METHOD(cp)) {
case IP_VS_CONN_F_MASQ:
fwd = 'M'; break;
case IP_VS_CONN_F_LOCALNODE:
fwd = 'L'; break;
case IP_VS_CONN_F_TUNNEL:
fwd = 'T'; break;
case IP_VS_CONN_F_DROUTE:
fwd = 'R'; break;
case IP_VS_CONN_F_BYPASS:
fwd = 'B'; break;
default:
fwd = '?'; break;
}
return fwd;
}
static inline u16 ip_vs_check_diff(u32 old, u32 new, u16 oldsum)
{
u32 diff[2] = { old, new };
return csum_fold(csum_partial((char *) diff, sizeof(diff),
oldsum ^ 0xFFFF));
}
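/*
 * Usage sketch (hypothetical helper, not part of the original header):
 * roughly the pattern the NAT handlers follow to update a TCP checksum
 * incrementally when a 32-bit address and a 16-bit port are rewritten.
 * The old values are folded in as their complements, the new values as-is,
 * all taken straight from the packet in network byte order.
 */
static inline void ip_vs_example_nat_csum(struct tcphdr *th,
					  u32 oldaddr, u32 newaddr,
					  u16 oldport, u16 newport)
{
	th->check = ip_vs_check_diff(~oldaddr, newaddr,
				     ip_vs_check_diff(oldport ^ 0xFFFF,
						      newport, th->check));
}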
#endif /* __KERNEL__ */
#endif /* _IP_VS_H */
...@@ -405,34 +405,6 @@ static int do_set_attach_filter(int fd, int level, int optname,
sizeof(struct sock_fprog));
}
static int do_set_icmpv6_filter(int fd, int level, int optname,
char *optval, int optlen)
{
struct icmp6_filter kfilter;
mm_segment_t old_fs;
int ret, i;
if (optlen < sizeof(kfilter))
return -EINVAL;
if (copy_from_user(&kfilter, optval, sizeof(kfilter)))
return -EFAULT;
for (i = 0; i < 8; i += 2) {
u32 tmp = kfilter.data[i];
kfilter.data[i] = kfilter.data[i + 1];
kfilter.data[i + 1] = tmp;
}
old_fs = get_fs();
set_fs(KERNEL_DS);
ret = sys_setsockopt(fd, level, optname,
(char *) &kfilter, sizeof(kfilter));
set_fs(old_fs);
return ret;
}
static int do_set_sock_timeout(int fd, int level, int optname, char *optval, int optlen)
{
struct compat_timeval *up = (struct compat_timeval *) optval;
...@@ -465,9 +437,6 @@ asmlinkage long compat_sys_setsockopt(int fd, int level, int optname,
optval, optlen);
if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)
return do_set_sock_timeout(fd, level, optname, optval, optlen);
if (level == SOL_ICMPV6 && optname == ICMPV6_FILTER)
return do_set_icmpv6_filter(fd, level, optname,
optval, optlen);
return sys_setsockopt(fd, level, optname, optval, optlen);
}
...
/*
* net-sysfs.c - network device class and attributes
*
* Copyright (c) 2003 Stephen Hemminger <shemminger@osdl.org>
*
*/
...
...@@ -375,4 +375,5 @@ config INET_IPCOMP
If unsure, say Y.
source "net/ipv4/netfilter/Kconfig"
source "net/ipv4/ipvs/Kconfig"
...@@ -21,5 +21,6 @@ obj-$(CONFIG_INET_ESP) += esp4.o
obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
obj-$(CONFIG_IP_PNP) += ipconfig.o
obj-$(CONFIG_NETFILTER) += netfilter/
obj-$(CONFIG_IP_VS) += ipvs/
obj-y += xfrm4_policy.o xfrm4_state.o xfrm4_input.o xfrm4_tunnel.o
#
# IP Virtual Server configuration
#
menu "IP: Virtual Server Configuration"
depends on INET && NETFILTER
config IP_VS
tristate "IP virtual server support (EXPERIMENTAL)"
depends on INET && NETFILTER
---help---
IP Virtual Server support will let you build a high-performance
virtual server based on a cluster of two or more real servers. This
option must be enabled for at least one of the clustered computers
that will take care of intercepting incoming connections to a
single IP address and scheduling them to real servers.
Three request dispatching techniques are implemented: virtual
server via NAT, virtual server via tunneling and virtual server
via direct routing. Several scheduling algorithms can be used to
choose which server a connection is directed to, so that load
balancing can be achieved among the servers. For more
information and its administration program, please visit the
following URL:
http://www.linuxvirtualserver.org/
If you want to compile it in kernel, say Y. If you want to compile
it as a module, say M here and read Documentation/modules.txt. If
unsure, say N.
config IP_VS_DEBUG
bool "IP virtual server debugging"
depends on IP_VS
---help---
Say Y here if you want to get additional messages useful in
debugging the IP virtual server code. You can change the debug
level in /proc/sys/net/ipv4/vs/debug_level
config IP_VS_TAB_BITS
int "IPVS connection table size (the Nth power of 2)"
depends on IP_VS
default "12"
---help---
The IPVS connection hash table uses the chaining scheme to handle
hash collisions. Using a big IPVS connection hash table will greatly
reduce conflicts when there are hundreds of thousands of connections
in the hash table.
Note that the table size must be a power of 2. The table size will
be 2 raised to the power of the number you enter here. The number
to choose is from 8 to 20; the default is 12, which means a table
size of 4096. Don't choose a number that is too small, otherwise
you will lose performance. You can adapt the table size yourself,
according to your virtual server application. It is good to set
the table size not far below the number of connections per second
multiplied by the average time a connection stays in the table.
For example, if your virtual server gets 200 connections per second
and a connection stays in the connection table for 200 seconds on
average, the table size should be not far below 200x200, so setting
it to 32768 (2**15) is a good choice.
Note also that each connection effectively occupies 128 bytes and
each hash entry uses 8 bytes, so you can estimate how much memory is
needed for your box.
comment "IPVS transport protocol load balancing support"
depends on IP_VS
config IP_VS_PROTO_TCP
bool "TCP load balancing support"
depends on IP_VS
---help---
This option enables support for load balancing TCP transport
protocol. Say Y if unsure.
config IP_VS_PROTO_UDP
bool "UDP load balancing support"
depends on IP_VS
---help---
This option enables support for load balancing UDP transport
protocol. Say Y if unsure.
config IP_VS_PROTO_ESP
bool "ESP load balancing support"
depends on IP_VS
---help---
This option enables support for load balancing the ESP (Encapsulating
Security Payload) transport protocol. Say Y if unsure.
config IP_VS_PROTO_AH
bool "AH load balancing support"
depends on IP_VS
---help---
This option enables support for load balancing AH (Authentication
Header) transport protocol. Say Y if unsure.
comment "IPVS scheduler"
depends on IP_VS
config IP_VS_RR
tristate "round-robin scheduling"
depends on IP_VS
---help---
The round-robin scheduling algorithm simply directs network
connections to different real servers in a round-robin manner.
If you want to compile it in kernel, say Y. If you want to compile
it as a module, say M here and read Documentation/modules.txt. If
unsure, say N.
config IP_VS_WRR
tristate "weighted round-robin scheduling"
depends on IP_VS
---help---
The weighted round-robin scheduling algorithm directs network
connections to different real servers based on server weights
in a round-robin manner. Servers with higher weights receive
new connections before those with lower weights; they also get
more connections than servers with lower weights, while servers
with equal weights get an equal share of connections.
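For example, with two real servers A and B of weight 2 and 1, A
receives two new connections for every one assigned to B.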
If you want to compile it in kernel, say Y. If you want to compile
it as a module, say M here and read Documentation/modules.txt. If
unsure, say N.
config IP_VS_LC
tristate "least-connection scheduling scheduling"
depends on IP_VS
---help---
The least-connection scheduling algorithm directs network
connections to the server with the least number of active
connections.
If you want to compile it in kernel, say Y. If you want to compile
it as a module, say M here and read Documentation/modules.txt. If
unsure, say N.
config IP_VS_WLC
tristate "weighted least-connection scheduling"
depends on IP_VS
---help---
The weighted least-connection scheduling algorithm directs network
connections to the server with the least active connections
normalized by the server weight.
If you want to compile it in kernel, say Y. If you want to compile
it as a module, say M here and read Documentation/modules.txt. If
unsure, say N.
config IP_VS_LBLC
tristate "locality-based least-connection with replication scheduling"
depends on IP_VS
---help---
The locality-based least-connection scheduling algorithm is for
destination IP load balancing. It is usually used in a cache cluster.
This algorithm usually directs packets destined for an IP address to
its server if the server is alive and under load. If the server is
overloaded (its active connection count is larger than its weight)
and there is a server running at half of its load, then the weighted
least-connection server is allocated to this IP address.
If you want to compile it in kernel, say Y. If you want to compile
it as a module, say M here and read Documentation/modules.txt. If
unsure, say N.
config IP_VS_LBLCR
tristate "locality-based least-connection with replication schedulin"
depends on IP_VS
---help---
The locality-based least-connection with replication scheduling
algorithm is also for destination IP load balancing. It is
usually used in a cache cluster. It differs from the LBLC scheduling
as follows: the load balancer maintains mappings from a target
to a set of server nodes that can serve the target. Requests for
a target are assigned to the least-connection node in the target's
server set. If all the nodes in the server set are overloaded,
it picks a least-connection node from the cluster and adds it
to the server set for the target. If the server set has not been
modified for the specified time, the most loaded node is removed
from the server set, in order to avoid a high degree of replication.
If you want to compile it in kernel, say Y. If you want to compile
it as a module, say M here and read Documentation/modules.txt. If
unsure, say N.
config IP_VS_DH
tristate "destination hashing scheduling"
depends on IP_VS
---help---
The destination hashing scheduling algorithm assigns network
connections to the servers through looking up a statically assigned
hash table by their destination IP addresses.
If you want to compile it in kernel, say Y. If you want to compile
it as a module, say M here and read Documentation/modules.txt. If
unsure, say N.
config IP_VS_SH
tristate "source hashing scheduling"
depends on IP_VS
---help---
The source hashing scheduling algorithm assigns network
connections to the servers through looking up a statically assigned
hash table by their source IP addresses.
If you want to compile it in kernel, say Y. If you want to compile
it as a module, say M here and read Documentation/modules.txt. If
unsure, say N.
config IP_VS_SED
tristate "shortest expected delay scheduling"
depends on IP_VS
---help---
The shortest expected delay scheduling algorithm assigns network
connections to the server with the shortest expected delay. The
expected delay that the job will experience is (Ci + 1) / Ui if
sent to the ith server, in which Ci is the number of connections
on the ith server and Ui is the fixed service rate (weight)
of the ith server.
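For example, with two servers of weight 1 and 3 holding 0 and 5
active connections respectively, the expected delays are
(0 + 1) / 1 = 1 and (5 + 1) / 3 = 2, so the first server is selected.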
If you want to compile it in kernel, say Y. If you want to compile
it as a module, say M here and read Documentation/modules.txt. If
unsure, say N.
config IP_VS_NQ
tristate "never queue scheduling"
depends on IP_VS
---help---
The never queue scheduling algorithm adopts a two-speed model.
When there is an idle server available, the job will be sent to
the idle server, instead of waiting for a fast one. When there
is no idle server available, the job will be sent to the server
that minimizes its expected delay (the Shortest Expected Delay
scheduling algorithm).
If you want to compile it in kernel, say Y. If you want to compile
it as a module, say M here and read Documentation/modules.txt. If
unsure, say N.
comment "IPVS application helper"
depends on IP_VS
config IP_VS_FTP
tristate "FTP protocol helper"
depends on IP_VS
---help---
FTP is a protocol that transfers IP addresses and/or port numbers in
its payload. In the virtual server via Network Address Translation,
the IP address and port number of the real servers cannot be sent to
clients in FTP connections directly, so an FTP protocol helper is
required for tracking the connection and mangling it back to that of
the virtual service.
If you want to compile it in kernel, say Y. If you want to compile
it as a module, say M here and read Documentation/modules.txt. If
unsure, say N.
endmenu
#
# Makefile for the IPVS modules on top of IPv4.
#
# Note! Dependencies are done automagically by 'make dep', which also
# removes any old dependencies. DON'T put your own dependencies here
# unless it's something special (ie not a .c file).
#
# Note 2! The CFLAGS definition is now in the main makefile...
# IPVS transport protocol load balancing support
ip_vs_proto-objs-y :=
ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_ESP) += ip_vs_proto_esp.o
ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o
ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \
ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \
ip_vs_est.o ip_vs_proto.o ip_vs_proto_icmp.o \
$(ip_vs_proto-objs-y)
# IPVS core
obj-$(CONFIG_IP_VS) += ip_vs.o
# IPVS schedulers
obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o
obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
# IPVS application helpers
obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
/*
* ip_vs_app.c: Application module support for IPVS
*
* Version: $Id: ip_vs_app.c,v 1.17 2003/03/22 06:31:21 wensong Exp $
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Most code here is taken from ip_masq_app.c in kernel 2.2. The difference
* is that ip_vs_app module handles the reverse direction (incoming requests
* and outgoing responses).
*
* IP_MASQ_APP application masquerading module
*
* Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
*
*/
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/init.h>
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <asm/system.h>
#include <linux/stat.h>
#include <linux/proc_fs.h>
#include <net/ip_vs.h>
EXPORT_SYMBOL(register_ip_vs_app);
EXPORT_SYMBOL(unregister_ip_vs_app);
EXPORT_SYMBOL(register_ip_vs_app_inc);
/* ipvs application list head */
static LIST_HEAD(ip_vs_app_list);
static DECLARE_MUTEX(__ip_vs_app_mutex);
/*
* Get an ip_vs_app object
*/
static inline int ip_vs_app_get(struct ip_vs_app *app)
{
/* test and get the module atomically */
if (app->module)
return try_module_get(app->module);
else
return 1;
}
static inline void ip_vs_app_put(struct ip_vs_app *app)
{
if (app->module)
module_put(app->module);
}
/*
* Allocate/initialize app incarnation and register it in proto apps.
*/
int
ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
{
struct ip_vs_protocol *pp;
struct ip_vs_app *inc;
int ret;
if (!(pp = ip_vs_proto_get(proto)))
return -EPROTONOSUPPORT;
if (!pp->unregister_app)
return -EOPNOTSUPP;
inc = kmalloc(sizeof(struct ip_vs_app), GFP_KERNEL);
if (!inc)
return -ENOMEM;
memcpy(inc, app, sizeof(*inc));
INIT_LIST_HEAD(&inc->p_list);
INIT_LIST_HEAD(&inc->incs_list);
inc->app = app;
inc->port = htons(port);
atomic_set(&inc->usecnt, 0);
if (app->timeouts) {
inc->timeout_table =
ip_vs_create_timeout_table(app->timeouts,
app->timeouts_size);
if (!inc->timeout_table) {
ret = -ENOMEM;
goto out;
}
}
ret = pp->register_app(inc);
if (ret)
goto out;
list_add(&inc->a_list, &app->incs_list);
IP_VS_DBG(9, "%s application %s:%u registered\n",
pp->name, inc->name, inc->port);
return 0;
out:
if (inc->timeout_table)
kfree(inc->timeout_table);
kfree(inc);
return ret;
}
/*
* Release app incarnation
*/
static void
ip_vs_app_inc_release(struct ip_vs_app *inc)
{
struct ip_vs_protocol *pp;
if (!(pp = ip_vs_proto_get(inc->protocol)))
return;
if (pp->unregister_app)
pp->unregister_app(inc);
IP_VS_DBG(9, "%s App %s:%u unregistered\n",
pp->name, inc->name, inc->port);
list_del(&inc->a_list);
if (inc->timeout_table != NULL)
kfree(inc->timeout_table);
kfree(inc);
}
/*
* Get reference to app inc (only called from softirq)
*
*/
int ip_vs_app_inc_get(struct ip_vs_app *inc)
{
int result;
atomic_inc(&inc->usecnt);
if (unlikely((result = ip_vs_app_get(inc->app)) != 1))
atomic_dec(&inc->usecnt);
return result;
}
/*
* Put the app inc (only called from timer or net softirq)
*/
void ip_vs_app_inc_put(struct ip_vs_app *inc)
{
ip_vs_app_put(inc->app);
atomic_dec(&inc->usecnt);
}
/*
* Register an application incarnation in protocol applications
*/
int
register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
{
int result;
down(&__ip_vs_app_mutex);
result = ip_vs_app_inc_new(app, proto, port);
up(&__ip_vs_app_mutex);
return result;
}
/*
* ip_vs_app registration routine
*/
int register_ip_vs_app(struct ip_vs_app *app)
{
/* increase the module use count */
ip_vs_use_count_inc();
down(&__ip_vs_app_mutex);
list_add(&app->a_list, &ip_vs_app_list);
up(&__ip_vs_app_mutex);
return 0;
}
/*
* ip_vs_app unregistration routine
* We are sure there are no app incarnations attached to services
*/
void unregister_ip_vs_app(struct ip_vs_app *app)
{
struct ip_vs_app *inc;
struct list_head *l = &app->incs_list;
down(&__ip_vs_app_mutex);
while (l->next != l) {
inc = list_entry(l->next, struct ip_vs_app, a_list);
ip_vs_app_inc_release(inc);
}
list_del(&app->a_list);
up(&__ip_vs_app_mutex);
/* decrease the module use count */
ip_vs_use_count_dec();
}
#if 0000
/*
* Get reference to app by name (called from user context)
*/
struct ip_vs_app *ip_vs_app_get_by_name(char *appname)
{
struct list_head *p;
struct ip_vs_app *app, *a = NULL;
down(&__ip_vs_app_mutex);
list_for_each (p, &ip_vs_app_list) {
app = list_entry(p, struct ip_vs_app, a_list);
if (strcmp(app->name, appname))
continue;
/* softirq may call ip_vs_app_get too, so the caller
must disable softirq on the current CPU */
if (ip_vs_app_get(app))
a = app;
break;
}
up(&__ip_vs_app_mutex);
return a;
}
#endif
/*
* Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
*/
int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp)
{
return pp->app_conn_bind(cp);
}
/*
* Unbind cp from application incarnation (called by cp destructor)
*/
void ip_vs_unbind_app(struct ip_vs_conn *cp)
{
struct ip_vs_app *inc = cp->app;
if (!inc)
return;
if (inc->unbind_conn)
inc->unbind_conn(inc, cp);
if (inc->done_conn)
inc->done_conn(inc, cp);
ip_vs_app_inc_put(inc);
cp->app = NULL;
}
/*
* Fixes th->seq based on ip_vs_seq info.
*/
static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
{
__u32 seq = ntohl(th->seq);
/*
* Adjust seq with delta-offset for all packets after
* the most recent resized pkt seq and with previous_delta offset
* for all packets before most recent resized pkt seq.
*/
if (vseq->delta || vseq->previous_delta) {
if(after(seq, vseq->init_seq)) {
th->seq = htonl(seq + vseq->delta);
IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n",
vseq->delta);
} else {
th->seq = htonl(seq + vseq->previous_delta);
IP_VS_DBG(9, "vs_fix_seq(): added previous_delta "
"(%d) to seq\n", vseq->previous_delta);
}
}
}
/*
* Fixes th->ack_seq based on ip_vs_seq info.
*/
static inline void
vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
{
__u32 ack_seq = ntohl(th->ack_seq);
/*
* Adjust ack_seq with the delta offset for
* packets AFTER the most recent resized pkt has caused a shift;
* for packets before the most recent resized pkt, use previous_delta
*/
if (vseq->delta || vseq->previous_delta) {
/* ack_seq is the sequence number of the next octet expected
to be received, so compare it with init_seq+delta */
if(after(ack_seq, vseq->init_seq+vseq->delta)) {
th->ack_seq = htonl(ack_seq - vseq->delta);
IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta "
"(%d) from ack_seq\n", vseq->delta);
} else {
th->ack_seq = htonl(ack_seq - vseq->previous_delta);
IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted "
"previous_delta (%d) from ack_seq\n",
vseq->previous_delta);
}
}
}
/*
* Updates ip_vs_seq if pkt has been resized
* Assumes already checked proto==IPPROTO_TCP and diff!=0.
*/
static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
unsigned flag, __u32 seq, int diff)
{
/* spinlock is to keep updating cp->flags atomic */
spin_lock(&cp->lock);
if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
vseq->previous_delta = vseq->delta;
vseq->delta += diff;
vseq->init_seq = seq;
cp->flags |= flag;
}
spin_unlock(&cp->lock);
}
/*
* Output pkt hook. Will call bound ip_vs_app specific function
* called by ipvs packet handler, assumes previously checked cp!=NULL
* returns (new - old) skb->len diff.
*/
int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
{
struct ip_vs_app *app;
int diff;
struct iphdr *iph;
struct tcphdr *th;
__u32 seq;
/*
* check if application module is bound to
* this ip_vs_conn.
*/
if ((app = cp->app) == NULL)
return 0;
iph = skb->nh.iph;
th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
/*
* Remember seq number in case this pkt gets resized
*/
seq = ntohl(th->seq);
/*
* Fix seq stuff if flagged as so.
*/
if (cp->protocol == IPPROTO_TCP) {
if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
vs_fix_seq(&cp->out_seq, th);
if (cp->flags & IP_VS_CONN_F_IN_SEQ)
vs_fix_ack_seq(&cp->in_seq, th);
}
/*
* Call private output hook function
*/
if (app->pkt_out == NULL)
return 0;
diff = app->pkt_out(app, cp, skb);
/*
* Update ip_vs seq stuff if len has changed.
*/
if (diff != 0 && cp->protocol == IPPROTO_TCP)
vs_seq_update(cp, &cp->out_seq,
IP_VS_CONN_F_OUT_SEQ, seq, diff);
return diff;
}
/*
* Input pkt hook. Will call bound ip_vs_app specific function
* called by ipvs packet handler, assumes previously checked cp!=NULL.
* returns (new - old) skb->len diff.
*/
int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
{
struct ip_vs_app *app;
int diff;
struct iphdr *iph;
struct tcphdr *th;
__u32 seq;
/*
* check if application module is bound to
* this ip_vs_conn.
*/
if ((app = cp->app) == NULL)
return 0;
iph = skb->nh.iph;
th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
/*
* Remember seq number in case this pkt gets resized
*/
seq = ntohl(th->seq);
/*
* Fix seq stuff if flagged as so.
*/
if (cp->protocol == IPPROTO_TCP) {
if (cp->flags & IP_VS_CONN_F_IN_SEQ)
vs_fix_seq(&cp->in_seq, th);
if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
vs_fix_ack_seq(&cp->out_seq, th);
}
/*
* Call private input hook function
*/
if (app->pkt_in == NULL)
return 0;
diff = app->pkt_in(app, cp, skb);
/*
* Update ip_vs seq stuff if len has changed.
*/
if (diff != 0 && cp->protocol == IPPROTO_TCP)
vs_seq_update(cp, &cp->in_seq,
IP_VS_CONN_F_IN_SEQ, seq, diff);
return diff;
}
/*
* /proc/net/ip_vs_app entry function
*/
static int
ip_vs_app_getinfo(char *buffer, char **start, off_t offset, int length)
{
off_t pos=0;
int len=0;
char temp[64];
struct ip_vs_app *app, *inc;
struct list_head *e, *i;
pos = 64;
if (pos > offset) {
len += sprintf(buffer+len, "%-63s\n",
"prot port usecnt name");
}
down(&__ip_vs_app_mutex);
list_for_each (e, &ip_vs_app_list) {
app = list_entry(e, struct ip_vs_app, a_list);
list_for_each (i, &app->incs_list) {
inc = list_entry(i, struct ip_vs_app, a_list);
pos += 64;
if (pos <= offset)
continue;
sprintf(temp, "%-3s %-7u %-6d %-17s",
ip_vs_proto_name(inc->protocol),
ntohs(inc->port),
atomic_read(&inc->usecnt),
inc->name);
len += sprintf(buffer+len, "%-63s\n", temp);
if (pos >= offset+length)
goto done;
}
}
done:
up(&__ip_vs_app_mutex);
*start = buffer+len-(pos-offset); /* Start of wanted data */
len = pos-offset;
if (len > length)
len = length;
if (len < 0)
len = 0;
return len;
}
/*
* Replace a segment of data with a new segment
*/
int ip_vs_skb_replace(struct sk_buff *skb, int pri,
char *o_buf, int o_len, char *n_buf, int n_len)
{
struct iphdr *iph;
int diff;
int o_offset;
int o_left;
EnterFunction(9);
diff = n_len - o_len;
o_offset = o_buf - (char *)skb->data;
/* The length of left data after o_buf+o_len in the skb data */
o_left = skb->len - (o_offset + o_len);
if (diff <= 0) {
memmove(o_buf + n_len, o_buf + o_len, o_left);
memcpy(o_buf, n_buf, n_len);
skb_trim(skb, skb->len + diff);
} else if (diff <= skb_tailroom(skb)) {
skb_put(skb, diff);
memmove(o_buf + n_len, o_buf + o_len, o_left);
memcpy(o_buf, n_buf, n_len);
} else {
if (pskb_expand_head(skb, skb_headroom(skb), diff, pri))
return -ENOMEM;
skb_put(skb, diff);
memmove(skb->data + o_offset + n_len,
skb->data + o_offset + o_len, o_left);
memcpy(skb->data + o_offset, n_buf, n_len);
}
/* must update the iph total length here */
iph = skb->nh.iph;
iph->tot_len = htons(skb->len);
LeaveFunction(9);
return 0;
}
int ip_vs_app_init(void)
{
/* we will replace it with proc_net_ipvs_create() soon */
proc_net_create("ip_vs_app", 0, ip_vs_app_getinfo);
return 0;
}
void ip_vs_app_cleanup(void)
{
proc_net_remove("ip_vs_app");
}
/*
* IPVS An implementation of the IP virtual server support for the
* LINUX operating system. IPVS is now implemented as a module
* over the Netfilter framework. IPVS can be used to build a
* high-performance and highly available server based on a
* cluster of servers.
*
* Version: $Id: ip_vs_conn.c,v 1.31 2003/04/18 09:03:16 wensong Exp $
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Peter Kese <peter.kese@ijs.si>
* Julian Anastasov <ja@ssi.bg>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
* with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
* and others. Much of the code here is taken from the IP MASQ code of kernel 2.2.
*
* Changes:
*
*/
#include <linux/config.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/compiler.h>
#include <linux/vmalloc.h>
#include <linux/proc_fs.h> /* for proc_net_* */
#include <linux/jhash.h>
#include <linux/random.h>
#include <net/ip_vs.h>
/*
* Connection hash table: for input and output packet lookups of IPVS
*/
static struct list_head *ip_vs_conn_tab;
/* SLAB cache for IPVS connections */
static kmem_cache_t *ip_vs_conn_cachep;
/* counter for current IPVS connections */
static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
/* counter for no client port connections */
static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
/* random value for IPVS connection hash */
static unsigned int ip_vs_conn_rnd;
/*
* Fine locking granularity for big connection hash table
*/
#define CT_LOCKARRAY_BITS 4
#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS)
#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1)
struct ip_vs_aligned_lock
{
rwlock_t l;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
/* lock array for conn table */
struct ip_vs_aligned_lock
__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
static inline void ct_read_lock(unsigned key)
{
read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
static inline void ct_read_unlock(unsigned key)
{
read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
static inline void ct_write_lock(unsigned key)
{
write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
static inline void ct_write_unlock(unsigned key)
{
write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
static inline void ct_read_lock_bh(unsigned key)
{
read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
static inline void ct_read_unlock_bh(unsigned key)
{
read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
static inline void ct_write_lock_bh(unsigned key)
{
write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
static inline void ct_write_unlock_bh(unsigned key)
{
write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
/*
* Returns hash value for IPVS connection entry
*/
static unsigned int ip_vs_conn_hashkey(unsigned proto, __u32 addr, __u16 port)
{
return jhash_3words(addr, port, proto, ip_vs_conn_rnd)
& IP_VS_CONN_TAB_MASK;
}
/*
* Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
* returns bool success.
*/
static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
{
unsigned hash;
if (cp->flags & IP_VS_CONN_F_HASHED) {
IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
"called from %p\n", __builtin_return_address(0));
return 0;
}
/* Hash by protocol, client address and port */
hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
ct_write_lock(hash);
list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
cp->flags |= IP_VS_CONN_F_HASHED;
atomic_inc(&cp->refcnt);
ct_write_unlock(hash);
return 1;
}
/*
* UNhashes ip_vs_conn from ip_vs_conn_tab.
* returns bool success.
*/
static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
{
unsigned hash;
if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
IP_VS_ERR("ip_vs_conn_unhash(): request for unhash flagged, "
"called from %p\n", __builtin_return_address(0));
return 0;
}
/* unhash it and decrease its reference counter */
hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
ct_write_lock(hash);
list_del(&cp->c_list);
cp->flags &= ~IP_VS_CONN_F_HASHED;
atomic_dec(&cp->refcnt);
ct_write_unlock(hash);
return 1;
}
/*
* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
* Called for pkts coming from OUTside-to-INside.
* s_addr, s_port: pkt source address (foreign host)
* d_addr, d_port: pkt dest address (load balancer)
*/
static inline struct ip_vs_conn *__ip_vs_conn_in_get
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
{
unsigned hash;
struct ip_vs_conn *cp;
struct list_head *l,*e;
hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
l = &ip_vs_conn_tab[hash];
ct_read_lock(hash);
for (e=l->next; e!=l; e=e->next) {
cp = list_entry(e, struct ip_vs_conn, c_list);
if (s_addr==cp->caddr && s_port==cp->cport &&
d_port==cp->vport && d_addr==cp->vaddr &&
protocol==cp->protocol) {
/* HIT */
atomic_inc(&cp->refcnt);
ct_read_unlock(hash);
return cp;
}
}
ct_read_unlock(hash);
return NULL;
}
struct ip_vs_conn *ip_vs_conn_in_get
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
{
struct ip_vs_conn *cp;
cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port);
if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
ip_vs_proto_name(protocol),
NIPQUAD(s_addr), ntohs(s_port),
NIPQUAD(d_addr), ntohs(d_port),
cp?"hit":"not hit");
return cp;
}
/*
* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
* Called for pkts coming from inside-to-OUTside.
* s_addr, s_port: pkt source address (inside host)
* d_addr, d_port: pkt dest address (foreign host)
*/
struct ip_vs_conn *ip_vs_conn_out_get
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
{
unsigned hash;
struct ip_vs_conn *cp, *ret=NULL;
struct list_head *l,*e;
/*
* Check for "full" addressed entries
*/
hash = ip_vs_conn_hashkey(protocol, d_addr, d_port);
l = &ip_vs_conn_tab[hash];
ct_read_lock(hash);
for (e=l->next; e!=l; e=e->next) {
cp = list_entry(e, struct ip_vs_conn, c_list);
if (d_addr == cp->caddr && d_port == cp->cport &&
s_port == cp->dport && s_addr == cp->daddr &&
protocol == cp->protocol) {
/* HIT */
atomic_inc(&cp->refcnt);
ret = cp;
break;
}
}
ct_read_unlock(hash);
IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
ip_vs_proto_name(protocol),
NIPQUAD(s_addr), ntohs(s_port),
NIPQUAD(d_addr), ntohs(d_port),
ret?"hit":"not hit");
return ret;
}
/*
* Put back the conn and restart its timer with its timeout
*/
void ip_vs_conn_put(struct ip_vs_conn *cp)
{
/* reset its expire time to its timeout */
mod_timer(&cp->timer, jiffies+cp->timeout);
__ip_vs_conn_put(cp);
}
/*
* Fill a no_client_port connection with a client port number
*/
void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __u16 cport)
{
atomic_dec(&ip_vs_conn_no_cport_cnt);
ip_vs_conn_unhash(cp);
cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
cp->cport = cport;
/* hash on the new cport */
ip_vs_conn_hash(cp);
}
/*
* Bind a connection entry with the corresponding packet_xmit.
* Called by ip_vs_conn_new.
*/
static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
{
switch (IP_VS_FWD_METHOD(cp)) {
case IP_VS_CONN_F_MASQ:
cp->packet_xmit = ip_vs_nat_xmit;
break;
case IP_VS_CONN_F_TUNNEL:
cp->packet_xmit = ip_vs_tunnel_xmit;
break;
case IP_VS_CONN_F_DROUTE:
cp->packet_xmit = ip_vs_dr_xmit;
break;
case IP_VS_CONN_F_LOCALNODE:
cp->packet_xmit = ip_vs_null_xmit;
break;
case IP_VS_CONN_F_BYPASS:
cp->packet_xmit = ip_vs_bypass_xmit;
break;
}
}
static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
{
return atomic_read(&dest->activeconns)
+ atomic_read(&dest->inactconns);
}
/*
* Bind a connection entry with a virtual service destination
* Called just after a new connection entry is created.
*/
static inline void
ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
{
/* if dest is NULL, then return directly */
if (!dest)
return;
/* Increase the refcnt counter of the dest */
atomic_inc(&dest->refcnt);
/* Bind with the destination and its corresponding transmitter */
cp->flags |= atomic_read(&dest->conn_flags);
cp->dest = dest;
IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
"d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
ip_vs_proto_name(cp->protocol),
NIPQUAD(cp->caddr), ntohs(cp->cport),
NIPQUAD(cp->vaddr), ntohs(cp->vport),
NIPQUAD(cp->daddr), ntohs(cp->dport),
ip_vs_fwd_tag(cp), cp->state,
cp->flags, atomic_read(&cp->refcnt),
atomic_read(&dest->refcnt));
/* Update the connection counters */
if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
/* It is a normal connection, so increase the inactive
connection counter because it is in TCP SYNRECV
state (inactive) or another protocol's inactive state */
atomic_inc(&dest->inactconns);
} else {
/* It is a persistent connection/template, so increase
the persistent connection counter */
atomic_inc(&dest->persistconns);
}
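/* Mark the dest overloaded once its active + inactive connections reach
   the upper threshold; a u_threshold of 0 disables this check. */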
if (dest->u_threshold != 0 &&
ip_vs_dest_totalconns(dest) >= dest->u_threshold)
dest->flags |= IP_VS_DEST_F_OVERLOAD;
}
/*
* Unbind a connection entry with its VS destination
* Called by the ip_vs_conn_expire function.
*/
static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
{
struct ip_vs_dest *dest = cp->dest;
if (!dest)
return;
IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
"d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
ip_vs_proto_name(cp->protocol),
NIPQUAD(cp->caddr), ntohs(cp->cport),
NIPQUAD(cp->vaddr), ntohs(cp->vport),
NIPQUAD(cp->daddr), ntohs(cp->dport),
ip_vs_fwd_tag(cp), cp->state,
cp->flags, atomic_read(&cp->refcnt),
atomic_read(&dest->refcnt));
/* Update the connection counters */
if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
/* It is a normal connection, so decrease the inactconns
or activeconns counter */
if (cp->flags & IP_VS_CONN_F_INACTIVE) {
atomic_dec(&dest->inactconns);
} else {
atomic_dec(&dest->activeconns);
}
} else {
/* It is a persistent connection/template, so decrease
the persistent connection counter */
atomic_dec(&dest->persistconns);
}
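/* Clear the overload flag with some hysteresis: below l_threshold if set,
   otherwise (with only u_threshold set) once the connection total drops
   under 3/4 of u_threshold, or unconditionally when no threshold is
   configured. */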
if (dest->l_threshold != 0) {
if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
} else if (dest->u_threshold != 0) {
if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
} else {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
}
/*
* Simply decrease the refcnt of the dest, because the
* dest will be either in service's destination list
* or in the trash.
*/
atomic_dec(&dest->refcnt);
}
/*
* Check whether the destination of a connection template is available.
* If available, return 1, otherwise invalidate this connection
* template and return 0.
*/
int ip_vs_check_template(struct ip_vs_conn *ct)
{
struct ip_vs_dest *dest = ct->dest;
/*
* Checking the dest server status.
*/
if ((dest == NULL) ||
!(dest->flags & IP_VS_DEST_F_AVAILABLE)) {
IP_VS_DBG(9, "check_template: dest not available for "
"protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
"-> d:%u.%u.%u.%u:%d\n",
ip_vs_proto_name(ct->protocol),
NIPQUAD(ct->caddr), ntohs(ct->cport),
NIPQUAD(ct->vaddr), ntohs(ct->vport),
NIPQUAD(ct->daddr), ntohs(ct->dport));
/*
* Invalidate the connection template
*/
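/*
 * Rehashing the template below with cport 0 and the unusable
 * vport/dport 65535 effectively guarantees that no future lookup
 * will match it, so it simply times out.
 */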
ip_vs_conn_unhash(ct);
ct->dport = 65535;
ct->vport = 65535;
ct->cport = 0;
ip_vs_conn_hash(ct);
/*
* Simply decrease the refcnt of the template,
* don't restart its timer.
*/
atomic_dec(&ct->refcnt);
return 0;
}
return 1;
}
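/*
 * Timer handler for connection expiry: if the entry still controls other
 * connections or is referenced by someone else, hash it back and re-arm
 * the timer for another 60 seconds; otherwise unbind it from its app,
 * destination and controller and free it.
 */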
static void ip_vs_conn_expire(unsigned long data)
{
struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
cp->timeout = 60*HZ;
/*
* hey, I'm using it
*/
atomic_inc(&cp->refcnt);
/*
* do I control anybody?
*/
if (atomic_read(&cp->n_control))
goto expire_later;
/*
* unhash it if it is hashed in the conn table
*/
ip_vs_conn_unhash(cp);
/*
* refcnt==1 implies I'm the only referrer
*/
if (likely(atomic_read(&cp->refcnt) == 1)) {
/* make sure that there is no timer on it now */
del_timer_sync(&cp->timer);
/* does anybody control me? */
if (cp->control)
ip_vs_control_del(cp);
if (unlikely(cp->app != NULL))
ip_vs_unbind_app(cp);
ip_vs_unbind_dest(cp);
//ip_vs_timeout_detach(cp);
if (cp->flags & IP_VS_CONN_F_NO_CPORT)
atomic_dec(&ip_vs_conn_no_cport_cnt);
atomic_dec(&ip_vs_conn_count);
kmem_cache_free(ip_vs_conn_cachep, cp);
return;
}
/* hash it back to the table */
ip_vs_conn_hash(cp);
expire_later:
IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n",
atomic_read(&cp->refcnt)-1,
atomic_read(&cp->n_control));
ip_vs_conn_put(cp);
}
void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
{
cp->timeout = 0;
mod_timer(&cp->timer, jiffies);
__ip_vs_conn_put(cp);
}
/*
* Create a new connection entry and hash it into the ip_vs_conn_tab
*/
struct ip_vs_conn *
ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport,
__u32 daddr, __u16 dport, unsigned flags,
struct ip_vs_dest *dest)
{
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
if (cp == NULL) {
IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
return NULL;
}
memset(cp, 0, sizeof(*cp));
INIT_LIST_HEAD(&cp->c_list);
init_timer(&cp->timer);
cp->timer.data = (unsigned long)cp;
cp->timer.function = ip_vs_conn_expire;
cp->protocol = proto;
cp->caddr = caddr;
cp->cport = cport;
cp->vaddr = vaddr;
cp->vport = vport;
cp->daddr = daddr;
cp->dport = dport;
cp->flags = flags;
cp->lock = SPIN_LOCK_UNLOCKED;
/*
* Mark the entry as referenced by the current thread before hashing
* it in the table, so that another thread running ip_vs_random_dropentry
* cannot drop this entry.
*/
atomic_set(&cp->refcnt, 1);
atomic_set(&cp->n_control, 0);
atomic_set(&cp->in_pkts, 0);
atomic_inc(&ip_vs_conn_count);
if (flags & IP_VS_CONN_F_NO_CPORT)
atomic_inc(&ip_vs_conn_no_cport_cnt);
/* Bind the connection with a destination server */
ip_vs_bind_dest(cp, dest);
/* Set its state and timeout */
cp->state = 0;
cp->timeout = 3*HZ;
/* Bind its packet transmitter */
ip_vs_bind_xmit(cp);
if (unlikely(pp && atomic_read(&pp->appcnt)))
ip_vs_bind_app(cp, pp);
/* Hash it in the ip_vs_conn_tab finally */
ip_vs_conn_hash(cp);
return cp;
}
/*
* /proc/net/ip_vs_conn entries
*/
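/*
 * Legacy get_info-style /proc read: the header line and every connection
 * line are padded to 128 bytes ("%-127s\n"), so the requested offset maps
 * directly onto whole records.
 */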
static int
ip_vs_conn_getinfo(char *buffer, char **start, off_t offset, int length)
{
off_t pos=0;
int idx, len=0;
char temp[70];
struct ip_vs_conn *cp;
struct list_head *l, *e;
pos = 128;
if (pos > offset) {
len += sprintf(buffer+len, "%-127s\n",
"Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires");
}
for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
/*
* The lock is actually only needed in the next loop;
* we are called from user space, so we must stop bottom halves.
*/
ct_read_lock_bh(idx);
l = &ip_vs_conn_tab[idx];
for (e=l->next; e!=l; e=e->next) {
cp = list_entry(e, struct ip_vs_conn, c_list);
pos += 128;
if (pos <= offset)
continue;
sprintf(temp,
"%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu",
ip_vs_proto_name(cp->protocol),
ntohl(cp->caddr), ntohs(cp->cport),
ntohl(cp->vaddr), ntohs(cp->vport),
ntohl(cp->daddr), ntohs(cp->dport),
ip_vs_state_name(cp->protocol, cp->state),
(cp->timer.expires-jiffies)/HZ);
len += sprintf(buffer+len, "%-127s\n", temp);
if (pos >= offset+length) {
ct_read_unlock_bh(idx);
goto done;
}
}
ct_read_unlock_bh(idx);
}
done:
*start = buffer+len-(pos-offset); /* Start of wanted data */
len = pos-offset;
if (len > length)
len = length;
if (len < 0)
len = 0;
return len;
}
/*
* Randomly drop connection entries before running out of memory
*/
static inline int todrop_entry(struct ip_vs_conn *cp)
{
/*
* The drop rate array needs tuning for real environments.
* Called from timer bh only => no locking
*/
static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
static char todrop_counter[9] = {0};
int i;
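/* An entry that has seen i packets (0 <= i <= 8) is dropped at a rate of
   roughly 1/todrop_rate[i]: the per-slot counter is decremented for each
   candidate and the entry is dropped only when it reaches zero, so
   connections with fewer packets are reclaimed more aggressively;
   a rate of 0 (i == 0) means never drop. */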
/* if the conn entry hasn't lasted for 60 seconds, don't drop it.
This will leave enough time for normal connections to get
through. */
if (cp->timeout+jiffies-cp->timer.expires < 60*HZ)
return 0;
/* Don't drop the entry if its number of incoming packets is not
in the range [0, 8] */
i = atomic_read(&cp->in_pkts);
if (i > 8 || i < 0) return 0;
if (!todrop_rate[i]) return 0;
if (--todrop_counter[i] > 0) return 0;
todrop_counter[i] = todrop_rate[i];
return 1;
}
void ip_vs_random_dropentry(void)
{
int idx;
struct ip_vs_conn *cp;
struct list_head *l,*e;
struct ip_vs_conn *ct;
/*
* Randomly scan 1/32 of the whole table every second
*/
for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) {
unsigned hash = net_random() & IP_VS_CONN_TAB_MASK;
/*
* Lock is actually needed in this loop.
*/
ct_write_lock(hash);
l = &ip_vs_conn_tab[hash];
for (e=l->next; e!=l; e=e->next) {
cp = list_entry(e, struct ip_vs_conn, c_list);
if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT))
/* connection template */
continue;
if (cp->protocol == IPPROTO_TCP) {
switch(cp->state) {
case IP_VS_TCP_S_SYN_RECV:
case IP_VS_TCP_S_SYNACK:
break;
case IP_VS_TCP_S_ESTABLISHED:
if (todrop_entry(cp))
break;
continue;
default:
continue;
}
} else {
if (!todrop_entry(cp))
continue;
}
/*
* Drop the entry, and drop its ct if not referenced
*/
atomic_inc(&cp->refcnt);
ct_write_unlock(hash);
if ((ct = cp->control))
atomic_inc(&ct->refcnt);
IP_VS_DBG(4, "del connection\n");
ip_vs_conn_expire_now(cp);
if (ct) {
IP_VS_DBG(4, "del conn template\n");
ip_vs_conn_expire_now(ct);
}
ct_write_lock(hash);
}
ct_write_unlock(hash);
}
}
/*
* Flush all the connection entries in the ip_vs_conn_tab
*/
static void ip_vs_conn_flush(void)
{
int idx;
struct ip_vs_conn *cp;
struct list_head *l,*e;
struct ip_vs_conn *ct;
flush_again:
for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
/*
* Lock is actually needed in this loop.
*/
ct_write_lock_bh(idx);
l = &ip_vs_conn_tab[idx];
for (e=l->next; e!=l; e=e->next) {
cp = list_entry(e, struct ip_vs_conn, c_list);
atomic_inc(&cp->refcnt);
ct_write_unlock(idx);
if ((ct = cp->control))
atomic_inc(&ct->refcnt);
IP_VS_DBG(4, "del connection\n");
ip_vs_conn_expire_now(cp);
if (ct) {
IP_VS_DBG(4, "del conn template\n");
ip_vs_conn_expire_now(ct);
}
ct_write_lock(idx);
}
ct_write_unlock_bh(idx);
}
/* the counter may not be zero, because some conn entries may still be
   handled by the slow timer handler, or are unhashed but still
   referenced */
if (atomic_read(&ip_vs_conn_count) != 0) {
schedule();
goto flush_again;
}
}
int ip_vs_conn_init(void)
{
int idx;
/*
* Allocate the connection hash table and initialize its list heads
*/
ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
if (!ip_vs_conn_tab)
return -ENOMEM;
/* Allocate ip_vs_conn slab cache */
ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
sizeof(struct ip_vs_conn), 0,
SLAB_HWCACHE_ALIGN, NULL, NULL);
if (!ip_vs_conn_cachep) {
vfree(ip_vs_conn_tab);
return -ENOMEM;
}
IP_VS_INFO("Connection hash table configured "
"(size=%d, memory=%ldKbytes)\n",
IP_VS_CONN_TAB_SIZE,
(long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
IP_VS_DBG(0, "Each connection entry needs %d bytes at least\n",
sizeof(struct ip_vs_conn));
for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
}
for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
__ip_vs_conntbl_lock_array[idx].l = RW_LOCK_UNLOCKED;
}
proc_net_create("ip_vs_conn", 0, ip_vs_conn_getinfo);
/* calculate the random value for connection hash */
get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
return 0;
}
void ip_vs_conn_cleanup(void)
{
/* flush all the connection entries first */
ip_vs_conn_flush();
/* Release the empty cache */
kmem_cache_destroy(ip_vs_conn_cachep);
proc_net_remove("ip_vs_conn");
vfree(ip_vs_conn_tab);
}
/*
* IPVS An implementation of the IP virtual server support for the
* LINUX operating system. IPVS is now implemented as a module
* over the Netfilter framework. IPVS can be used to build a
* high-performance and highly available server based on a
* cluster of servers.
*
* Version: $Id: ip_vs_core.c,v 1.34 2003/05/10 03:05:23 wensong Exp $
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Peter Kese <peter.kese@ijs.si>
* Julian Anastasov <ja@ssi.bg>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
* with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
* and others.
*
* Changes:
*
*/
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/compiler.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/icmp.h>
#include <net/ip.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/icmp.h> /* for icmp_send */
#include <net/route.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <net/ip_vs.h>
EXPORT_SYMBOL(register_ip_vs_scheduler);
EXPORT_SYMBOL(unregister_ip_vs_scheduler);
EXPORT_SYMBOL(ip_vs_skb_replace);
EXPORT_SYMBOL(ip_vs_proto_name);
EXPORT_SYMBOL(ip_vs_conn_new);
EXPORT_SYMBOL(ip_vs_conn_in_get);
EXPORT_SYMBOL(ip_vs_conn_out_get);
EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
EXPORT_SYMBOL(ip_vs_conn_put);
#ifdef CONFIG_IP_VS_DEBUG
EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
EXPORT_SYMBOL(check_for_ip_vs_out);
/* ID used in ICMP lookups */
#define icmp_id(icmph) ((icmph->un).echo.id)
const char *ip_vs_proto_name(unsigned proto)
{
static char buf[20];
switch (proto) {
case IPPROTO_IP:
return "IP";
case IPPROTO_UDP:
return "UDP";
case IPPROTO_TCP:
return "TCP";
case IPPROTO_ICMP:
return "ICMP";
default:
sprintf(buf, "IP_%d", proto);
return buf;
}
}
void ip_vs_init_hash_table(struct list_head *table, int rows)
{
while (--rows >= 0)
INIT_LIST_HEAD(&table[rows]);
}
static inline void
ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
{
struct ip_vs_dest *dest = cp->dest;
if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
spin_lock(&dest->stats.lock);
dest->stats.inpkts++;
dest->stats.inbytes += skb->len;
spin_unlock(&dest->stats.lock);
spin_lock(&dest->svc->stats.lock);
dest->svc->stats.inpkts++;
dest->svc->stats.inbytes += skb->len;
spin_unlock(&dest->svc->stats.lock);
spin_lock(&ip_vs_stats.lock);
ip_vs_stats.inpkts++;
ip_vs_stats.inbytes += skb->len;
spin_unlock(&ip_vs_stats.lock);
}
}
static inline void
ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
{
struct ip_vs_dest *dest = cp->dest;
if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
spin_lock(&dest->stats.lock);
dest->stats.outpkts++;
dest->stats.outbytes += skb->len;
spin_unlock(&dest->stats.lock);
spin_lock(&dest->svc->stats.lock);
dest->svc->stats.outpkts++;
dest->svc->stats.outbytes += skb->len;
spin_unlock(&dest->svc->stats.lock);
spin_lock(&ip_vs_stats.lock);
ip_vs_stats.outpkts++;
ip_vs_stats.outbytes += skb->len;
spin_unlock(&ip_vs_stats.lock);
}
}
static inline void
ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
{
spin_lock(&cp->dest->stats.lock);
cp->dest->stats.conns++;
spin_unlock(&cp->dest->stats.lock);
spin_lock(&svc->stats.lock);
svc->stats.conns++;
spin_unlock(&svc->stats.lock);
spin_lock(&ip_vs_stats.lock);
ip_vs_stats.conns++;
spin_unlock(&ip_vs_stats.lock);
}
static inline int
ip_vs_set_state(struct ip_vs_conn *cp, int direction,
struct iphdr *iph, union ip_vs_tphdr h,
struct ip_vs_protocol *pp)
{
if (unlikely(!pp->state_transition))
return 0;
return pp->state_transition(cp, direction, iph, h, pp);
}
/*
* IPVS persistent scheduling function
* It creates a connection entry according to its template if exists,
* or selects a server and creates a connection entry plus a template.
* Locking: we are svc user (svc->refcnt), so we hold all dests too
* Protocols supported: TCP, UDP
*/
static struct ip_vs_conn *
ip_vs_sched_persist(struct ip_vs_service *svc, struct iphdr *iph)
{
struct ip_vs_conn *cp = NULL;
struct ip_vs_dest *dest;
const __u16 *portp;
struct ip_vs_conn *ct;
__u16 dport; /* destination port to forward */
__u32 snet; /* source network of the client, after masking */
portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
/* Mask saddr with the netmask to adjust template granularity */
snet = iph->saddr & svc->netmask;
IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u "
"mnet %u.%u.%u.%u\n",
NIPQUAD(iph->saddr), ntohs(portp[0]),
NIPQUAD(iph->daddr), ntohs(portp[1]),
NIPQUAD(snet));
/*
* As far as we know, FTP is a rather complicated network protocol: it
* uses a control connection and separate data connections. For active
* FTP, the FTP server initiates the data connection to the client, and
* its source port is often 20. For passive FTP, the FTP server tells
* the client which port it passively listens on, and the client issues
* the data connection. In tunneling or direct routing mode, the load
* balancer only sees the client-to-server half of the connection, so
* the port number is unknown to it. So, a conn template like
* <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
* service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
* is created for other persistent services.
*/
if (portp[1] == svc->port) {
/* Check if a template already exists */
if (svc->port != FTPPORT)
ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
iph->daddr, portp[1]);
else
ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
iph->daddr, 0);
if (!ct || !ip_vs_check_template(ct)) {
/*
* No template found or the dest of the connection
* template is not available.
*/
dest = svc->scheduler->schedule(svc, iph);
if (dest == NULL) {
IP_VS_DBG(1, "p-schedule: no dest found.\n");
return NULL;
}
/*
* Create a template like <protocol,caddr,0,
* vaddr,vport,daddr,dport> for non-ftp service,
* and <protocol,caddr,0,vaddr,0,daddr,0>
* for ftp service.
*/
if (svc->port != FTPPORT)
ct = ip_vs_conn_new(iph->protocol,
snet, 0,
iph->daddr, portp[1],
dest->addr, dest->port,
0,
dest);
else
ct = ip_vs_conn_new(iph->protocol,
snet, 0,
iph->daddr, 0,
dest->addr, 0,
0,
dest);
if (ct == NULL)
return NULL;
ct->timeout = svc->timeout;
} else {
/* set destination with the found template */
dest = ct->dest;
}
dport = dest->port;
} else {
/*
* Note: persistent fwmark-based services and persistent
* port zero service are handled here.
* fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
* port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
*/
if (svc->fwmark)
ct = ip_vs_conn_in_get(IPPROTO_IP, snet, 0,
htonl(svc->fwmark), 0);
else
ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
iph->daddr, 0);
if (!ct || !ip_vs_check_template(ct)) {
/*
* If it is not persistent port zero, return NULL,
* otherwise create a connection template.
*/
if (svc->port)
return NULL;
dest = svc->scheduler->schedule(svc, iph);
if (dest == NULL) {
IP_VS_DBG(1, "p-schedule: no dest found.\n");
return NULL;
}
/*
* Create a template according to the service
*/
if (svc->fwmark)
ct = ip_vs_conn_new(IPPROTO_IP,
snet, 0,
htonl(svc->fwmark), 0,
dest->addr, 0,
0,
dest);
else
ct = ip_vs_conn_new(iph->protocol,
snet, 0,
iph->daddr, 0,
dest->addr, 0,
0,
dest);
if (ct == NULL)
return NULL;
ct->timeout = svc->timeout;
} else {
/* set destination with the found template */
dest = ct->dest;
}
dport = portp[1];
}
/*
* Create a new connection according to the template
*/
cp = ip_vs_conn_new(iph->protocol,
iph->saddr, portp[0],
iph->daddr, portp[1],
dest->addr, dport,
0,
dest);
if (cp == NULL) {
ip_vs_conn_put(ct);
return NULL;
}
/*
* Add its control
*/
ip_vs_control_add(cp, ct);
ip_vs_conn_put(ct);
ip_vs_conn_stats(cp, svc);
return cp;
}
/*
* IPVS main scheduling function
* It selects a server according to the virtual service, and
* creates a connection entry.
* Protocols supported: TCP, UDP
*/
struct ip_vs_conn *
ip_vs_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
struct ip_vs_conn *cp = NULL;
struct ip_vs_dest *dest;
const __u16 *portp;
/*
* Persistent service
*/
if (svc->flags & IP_VS_SVC_F_PERSISTENT)
return ip_vs_sched_persist(svc, iph);
/*
* Non-persistent service
*/
portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
if (!svc->fwmark && portp[1] != svc->port) {
if (!svc->port)
IP_VS_ERR("Schedule: port zero only supported "
"in persistent services, "
"check your ipvs configuration\n");
return NULL;
}
dest = svc->scheduler->schedule(svc, iph);
if (dest == NULL) {
IP_VS_DBG(1, "Schedule: no dest found.\n");
return NULL;
}
/*
* Create a connection entry.
*/
cp = ip_vs_conn_new(iph->protocol,
iph->saddr, portp[0],
iph->daddr, portp[1],
dest->addr, dest->port?dest->port:portp[1],
0,
dest);
if (cp == NULL)
return NULL;
IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
"d:%u.%u.%u.%u:%u flg:%X cnt:%d\n",
ip_vs_fwd_tag(cp),
NIPQUAD(cp->caddr), ntohs(cp->cport),
NIPQUAD(cp->vaddr), ntohs(cp->vport),
NIPQUAD(cp->daddr), ntohs(cp->dport),
cp->flags, atomic_read(&cp->refcnt));
ip_vs_conn_stats(cp, svc);
return cp;
}
/*
* Pass or drop the packet.
* Called by ip_vs_in, when the virtual service is available but
* no destination is available for a new connection.
*/
int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
struct ip_vs_protocol *pp, union ip_vs_tphdr h)
{
struct iphdr *iph = skb->nh.iph;
/* if it is a fwmark-based service, the cache_bypass sysctl is set
   and the destination is RTN_UNICAST (and not local), then create
   a cache_bypass connection entry */
if (sysctl_ip_vs_cache_bypass && svc->fwmark
&& (inet_addr_type(iph->daddr) == RTN_UNICAST)) {
int ret, cs;
struct ip_vs_conn *cp;
ip_vs_service_put(svc);
/* create a new connection entry */
IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
cp = ip_vs_conn_new(iph->protocol,
iph->saddr, h.portp[0],
iph->daddr, h.portp[1],
0, 0,
IP_VS_CONN_F_BYPASS,
NULL);
if (cp == NULL) {
kfree_skb(skb);
return NF_STOLEN;
}
/* statistics */
ip_vs_in_stats(cp, skb);
/* set state */
cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, iph, h, pp);
/* transmit the first SYN packet */
ret = cp->packet_xmit(skb, cp, pp);
atomic_inc(&cp->in_pkts);
ip_vs_conn_put(cp);
return ret;
}
/*
* When a virtual ftp service is present, packets destined
* for other services on the VIP may get here (except services
* listed in the ipvs table); pass these packets along, because it
* is not ipvs's job to decide to drop them.
*/
if ((svc->port == FTPPORT) && (h.portp[1] != FTPPORT)) {
ip_vs_service_put(svc);
return NF_ACCEPT;
}
ip_vs_service_put(svc);
/*
* Notify the client that the destination is unreachable, and
* release the socket buffer.
* Since we are in the IP layer, the TCP socket is not actually
* created and a TCP RST packet cannot be sent; instead,
* ICMP_PORT_UNREACH is sent here whether it is TCP or UDP. --WZ
*/
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
kfree_skb(skb);
return NF_STOLEN;
}
/*
* It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING
* chain, and is used for VS/NAT.
* It detects packets for VS/NAT connections and sends the packets
* immediately. This avoids iptable_nat mangling the packets
* for VS/NAT.
*/
static unsigned int ip_vs_post_routing(unsigned int hooknum,
struct sk_buff **skb_p,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
struct sk_buff *skb = *skb_p;
if (!(skb->nfcache & NFC_IPVS_PROPERTY))
return NF_ACCEPT;
/* The packet was sent from IPVS, exit this chain */
(*okfn)(skb);
return NF_STOLEN;
}
/*
* Handle ICMP messages in the inside-to-outside direction (outgoing).
* Find any that might be relevant, check against existing connections,
* forward to the right destination host if relevant.
* Currently handles error types - unreachable, quench, ttl exceeded.
* (Only used in VS/NAT)
*/
static int ip_vs_out_icmp(struct sk_buff **skb_p, int *related)
{
struct sk_buff *skb = *skb_p;
struct iphdr *iph;
struct icmphdr *icmph;
struct iphdr *ciph; /* The ip header contained within the ICMP */
unsigned short ihl;
unsigned short len;
unsigned short clen, cihl;
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp;
union ip_vs_tphdr h;
*related = 1;
/* reassemble IP fragments, but will it happen in ICMP packets?? */
if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
skb = ip_defrag(skb);
if (!skb)
return NF_STOLEN;
*skb_p = skb;
}
if (skb_is_nonlinear(skb)) {
if (skb_linearize(skb, GFP_ATOMIC) != 0)
return NF_DROP;
ip_send_check(skb->nh.iph);
}
iph = skb->nh.iph;
ihl = iph->ihl << 2;
icmph = (struct icmphdr *)((char *)iph + ihl);
len = ntohs(iph->tot_len) - ihl;
if (len < sizeof(struct icmphdr))
return NF_DROP;
IP_VS_DBG(12, "outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
icmph->type, ntohs(icmp_id(icmph)),
NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
/*
* Work through seeing if this is for us.
* These checks are supposed to be in an order that means easy
* things are checked first to speed up processing.... however
* this means that some packets will manage to get a long way
* down this stack and then be rejected, but that's life.
*/
if ((icmph->type != ICMP_DEST_UNREACH) &&
(icmph->type != ICMP_SOURCE_QUENCH) &&
(icmph->type != ICMP_TIME_EXCEEDED)) {
*related = 0;
return NF_ACCEPT;
}
/* Now find the contained IP header */
clen = len - sizeof(struct icmphdr);
if (clen < sizeof(struct iphdr))
return NF_DROP;
ciph = (struct iphdr *) (icmph + 1);
cihl = ciph->ihl << 2;
if (clen < cihl)
return NF_DROP;
pp = ip_vs_proto_get(ciph->protocol);
if (!pp)
return NF_ACCEPT;
/* Is the embedded protocol header present? */
if (unlikely(ciph->frag_off & __constant_htons(IP_OFFSET) &&
(pp->minhlen || pp->dont_defrag)))
return NF_ACCEPT;
/* We need at least TCP/UDP ports here */
if (clen < cihl + pp->minhlen_icmp)
return NF_DROP;
h.raw = (char *) ciph + cihl;
/* Ensure the checksum is correct */
if (ip_compute_csum((unsigned char *) icmph, len)) {
/* Failed checksum! */
IP_VS_DBG(1, "forward ICMP: failed checksum from %d.%d.%d.%d!\n",
NIPQUAD(iph->saddr));
return NF_DROP;
}
IP_VS_DBG_PKT(11, pp, ciph, "Handling outgoing ICMP for");
/* ciph content is actually <protocol, caddr, cport, daddr, dport> */
cp = pp->conn_out_get(skb, pp, ciph, h, 1);
if (!cp)
return NF_ACCEPT;
if (IP_VS_FWD_METHOD(cp) != 0) {
IP_VS_ERR("shouldn't reach here, because the box is on the"
"half connection in the tun/dr module.\n");
}
/* Now we do real damage to this packet...! */
/* First change the source IP address, and recalc checksum */
iph->saddr = cp->vaddr;
ip_send_check(iph);
/* Now change the *dest* address in the contained IP */
ciph->daddr = cp->vaddr;
ip_send_check(ciph);
/* the TCP/UDP dest port - cannot redo check */
if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol)
h.portp[1] = cp->vport;
/* And finally the ICMP checksum */
icmph->checksum = 0;
icmph->checksum = ip_compute_csum((unsigned char *) icmph, len);
skb->ip_summed = CHECKSUM_UNNECESSARY;
/* do the statistics and put it back */
ip_vs_out_stats(cp, skb);
__ip_vs_conn_put(cp);
IP_VS_DBG_PKT(11, pp, ciph, "Forwarding correct outgoing ICMP");
skb->nfcache |= NFC_IPVS_PROPERTY;
return NF_ACCEPT;
}
/*
* It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT.
* Check if outgoing packet belongs to the established ip_vs_conn,
* rewrite addresses of the packet and send it on its way...
*/
static unsigned int
ip_vs_out(unsigned int hooknum, struct sk_buff **skb_p,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
struct sk_buff *skb = *skb_p;
struct iphdr *iph;
struct ip_vs_protocol *pp;
union ip_vs_tphdr h;
struct ip_vs_conn *cp;
int size, ihl, firstfrag;
EnterFunction(11);
if (skb->nfcache & NFC_IPVS_PROPERTY)
return NF_ACCEPT;
iph = skb->nh.iph;
if (unlikely(iph->protocol == IPPROTO_ICMP)) {
int related, verdict = ip_vs_out_icmp(skb_p, &related);
if (related)
return verdict;
}
pp = ip_vs_proto_get(iph->protocol);
if (unlikely(!pp))
return NF_ACCEPT;
/* reassemble IP fragments */
if (unlikely(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET) &&
!pp->dont_defrag)) {
skb = ip_defrag(skb);
if (!skb)
return NF_STOLEN;
iph = skb->nh.iph;
*skb_p = skb;
}
/* make sure that protocol header is available in skb data area,
note that skb data area may be reallocated. */
ihl = iph->ihl << 2;
firstfrag = !(iph->frag_off & __constant_htons(IP_OFFSET));
/*
* WARNING: we can work with !firstfrag packets, make sure
* each protocol handler checks for firstfrag
*/
if (firstfrag &&
!pskb_may_pull(skb, ihl+pp->minhlen))
return NF_DROP;
iph = skb->nh.iph;
h.raw = (char*) iph + ihl;
/*
* Check if the packet belongs to an existing entry
*/
cp = pp->conn_out_get(skb, pp, iph, h, 0);
if (unlikely(!cp)) {
if (sysctl_ip_vs_nat_icmp_send &&
(pp->protocol == IPPROTO_TCP ||
pp->protocol == IPPROTO_UDP) &&
ip_vs_lookup_real_service(iph->protocol,
iph->saddr, h.portp[0])) {
/*
* Notify the real server that there is no existing
* entry, if this is not an RST packet or not a TCP packet.
*/
if (!h.th->rst || iph->protocol != IPPROTO_TCP) {
icmp_send(skb, ICMP_DEST_UNREACH,
ICMP_PORT_UNREACH, 0);
kfree_skb(skb);
return NF_STOLEN;
}
}
IP_VS_DBG_PKT(12, pp, iph,
"packet continues traversal as normal");
if (!pp->dont_defrag)
ip_send_check(iph);
return NF_ACCEPT;
}
/*
* If it has ip_vs_app helper, the helper may change the payload,
* so it needs full checksum checking and checksum calculation.
* If not, only the header (addr/port) is changed, so it is fast
* to do incremental checksum update, and let the destination host
* do final checksum checking.
*/
if (unlikely(cp->app && !pp->slave && skb_is_nonlinear(skb))) {
if (skb_linearize(skb, GFP_ATOMIC) != 0) {
ip_vs_conn_put(cp);
return NF_DROP;
}
iph = skb->nh.iph;
h.raw = (char*) iph + ihl;
}
size = skb->len - ihl;
IP_VS_DBG(11, "O-pkt: %s size=%d\n", pp->name, size);
/* do TCP/UDP checksum checking if it has application helper */
if (unlikely(cp->app && pp->csum_check && !pp->slave)) {
if (!pp->csum_check(skb, pp, iph, h, size)) {
ip_vs_conn_put(cp);
return NF_DROP;
}
}
IP_VS_DBG_PKT(11, pp, iph, "Outgoing packet");
/* mangle the packet */
iph->saddr = cp->vaddr;
if (pp->snat_handler) {
pp->snat_handler(skb, pp, cp, iph, h, size);
iph = skb->nh.iph;
h.raw = (char*) iph + ihl;
}
ip_send_check(iph);
IP_VS_DBG_PKT(10, pp, iph, "After SNAT");
ip_vs_out_stats(cp, skb);
ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, iph, h, pp);
ip_vs_conn_put(cp);
skb->nfcache |= NFC_IPVS_PROPERTY;
LeaveFunction(11);
return NF_ACCEPT;
}
/*
* Check if the packet is for VS/NAT connections, then send it
* immediately.
* Called by ip_fw_compact to detect packets for VS/NAT before
* they are changed by ipchains masquerading code.
*/
unsigned int
check_for_ip_vs_out(struct sk_buff **skb_p, int (*okfn)(struct sk_buff *))
{
unsigned int ret;
ret = ip_vs_out(NF_IP_FORWARD, skb_p, NULL, NULL, NULL);
if (ret != NF_ACCEPT) {
return ret;
} else {
/* send the packet immediately if it is already mangled
by ip_vs_out */
if ((*skb_p)->nfcache & NFC_IPVS_PROPERTY) {
(*okfn)(*skb_p);
return NF_STOLEN;
}
}
return NF_ACCEPT;
}
/*
* Handle ICMP messages in the outside-to-inside direction (incoming).
* Find any that might be relevant, check against existing connections,
* forward to the right destination host if relevant.
* Currently handles error types - unreachable, quench, ttl exceeded
*/
static int ip_vs_in_icmp(struct sk_buff **skb_p, int *related)
{
struct sk_buff *skb = *skb_p;
struct iphdr *iph;
struct icmphdr *icmph;
struct iphdr *ciph; /* The ip header contained within the ICMP */
unsigned short len;
unsigned short clen, cihl;
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp;
union ip_vs_tphdr h;
int rc;
*related = 1;
if (skb_is_nonlinear(skb)) {
if (skb_linearize(skb, GFP_ATOMIC) != 0)
return NF_DROP;
ip_send_check(skb->nh.iph);
}
iph = skb->nh.iph;
icmph = (struct icmphdr *)((char *)iph+(iph->ihl<<2));
len = ntohs(iph->tot_len) - (iph->ihl<<2);
if (len < sizeof(struct icmphdr))
return NF_DROP;
IP_VS_DBG(12, "icmp in (%d,%d) %u.%u.%u.%u -> %u.%u.%u.%u\n",
icmph->type, ntohs(icmp_id(icmph)),
NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
if ((icmph->type != ICMP_DEST_UNREACH) &&
(icmph->type != ICMP_SOURCE_QUENCH) &&
(icmph->type != ICMP_TIME_EXCEEDED)) {
*related = 0;
return NF_ACCEPT;
}
/*
* If we get here we have an ICMP error of one of the above 3 types
* Now find the contained IP header
*/
clen = len - sizeof(struct icmphdr);
if (clen < sizeof(struct iphdr))
return NF_DROP;
ciph = (struct iphdr *) (icmph + 1);
cihl = ciph->ihl << 2;
if (clen < cihl)
return NF_DROP;
pp = ip_vs_proto_get(ciph->protocol);
if (!pp)
return NF_ACCEPT;
/* Is the embedded protocol header present? */
if (unlikely(ciph->frag_off & __constant_htons(IP_OFFSET) &&
(pp->minhlen || pp->dont_defrag)))
return NF_ACCEPT;
/* We need at least TCP/UDP ports here */
if (clen < cihl + pp->minhlen_icmp)
return NF_DROP;
/* Ensure the checksum is correct */
if (ip_compute_csum((unsigned char *) icmph, len)) {
/* Failed checksum! */
IP_VS_ERR_RL("incoming ICMP: failed checksum from "
"%d.%d.%d.%d!\n", NIPQUAD(iph->saddr));
return NF_DROP;
}
h.raw = (char *) ciph + cihl;
IP_VS_DBG_PKT(11, pp, ciph, "Handling incoming ICMP for");
/* This is pretty much what ip_vs_conn_in_get() does,
except parameters are in the reverse order */
cp = pp->conn_in_get(skb, pp, ciph, h, 1);
if (cp == NULL)
return NF_ACCEPT;
ip_vs_in_stats(cp, skb);
rc = ip_vs_icmp_xmit(skb, cp, pp);
__ip_vs_conn_put(cp);
return rc;
}
/*
* Check if it's for virtual services, look it up,
* and send it on its way...
*/
static unsigned int
ip_vs_in(unsigned int hooknum, struct sk_buff **skb_p,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
struct sk_buff *skb = *skb_p;
struct iphdr *iph = skb->nh.iph;
struct ip_vs_protocol *pp = ip_vs_proto_get(iph->protocol);
union ip_vs_tphdr h;
struct ip_vs_conn *cp;
int ihl, ret, restart;
int firstfrag;
/*
* Big tappo: only PACKET_HOST (neither loopback nor mcasts)
* ... don't know why 1st test DOES NOT include 2nd (?)
*/
if (unlikely(skb->pkt_type != PACKET_HOST || skb->dev == &loopback_dev)) {
IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
skb->pkt_type,
iph->protocol,
NIPQUAD(iph->daddr));
return NF_ACCEPT;
}
if (unlikely(iph->protocol == IPPROTO_ICMP)) {
int related, verdict = ip_vs_in_icmp(skb_p, &related);
if (related)
return verdict;
}
/* Protocol supported? */
if (unlikely(!pp))
return NF_ACCEPT;
/* make sure that protocol header is available in skb data area,
note that skb data area may be reallocated. */
ihl = iph->ihl << 2;
#if 0
/* Enable this when not in LOCAL_IN */
firstfrag = !(iph->frag_off & __constant_htons(IP_OFFSET));
/*
* WARNING: we can work with !firstfrag packets, make sure
* each protocol handler checks for firstfrag
*/
#else
firstfrag = 1;
#endif
if (firstfrag &&
!pskb_may_pull(skb, ihl+pp->minhlen))
return NF_DROP;
iph = skb->nh.iph;
h.raw = (char*) iph + ihl;
/*
* Check if the packet belongs to an existing connection entry
*/
cp = pp->conn_in_get(skb, pp, iph, h, 0);
if (unlikely(!cp)) {
int v;
if (!pp->conn_schedule(skb, pp, iph, h, &v, &cp)) {
return v;
}
}
if (unlikely(!cp)) {
/* sorry, all this trouble for a no-hit :) */
IP_VS_DBG_PKT(12, pp, iph,
"packet continues traversal as normal");
return NF_ACCEPT;
}
IP_VS_DBG_PKT(11, pp, iph, "Incoming packet");
/* Check the server status */
if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
/* the destination server is not available */
if (sysctl_ip_vs_expire_nodest_conn) {
/* try to expire the connection immediately */
ip_vs_conn_expire_now(cp);
} else {
/* don't restart its timer, and silently
drop the packet. */
__ip_vs_conn_put(cp);
}
return NF_DROP;
}
ip_vs_in_stats(cp, skb);
restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, iph, h, pp);
if (cp->packet_xmit)
ret = cp->packet_xmit(skb, cp, pp);
else {
IP_VS_DBG_RL("warning: packet_xmit is null");
ret = NF_ACCEPT;
}
/* increase its packet counter and check if it is needed
to be synchronized */
atomic_inc(&cp->in_pkts);
if (ip_vs_sync_state == IP_VS_STATE_MASTER &&
(cp->protocol != IPPROTO_TCP ||
cp->state == IP_VS_TCP_S_ESTABLISHED) &&
(atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
== sysctl_ip_vs_sync_threshold[0]))
ip_vs_sync_conn(cp);
ip_vs_conn_put(cp);
return ret;
}
/*
* It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP
* packets destined for 0.0.0.0/0.
* When fwmark-based virtual service is used, such as transparent
* cache cluster, TCP packets can be marked and routed to ip_vs_in,
* but ICMP destined for 0.0.0.0/0 cannot be easily marked and
* sent to ip_vs_in_icmp. So, catch them at the NF_IP_FORWARD chain
* and send them to ip_vs_in_icmp.
*/
static unsigned int
ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **skb_p,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
struct sk_buff *skb = *skb_p;
struct iphdr *iph = skb->nh.iph;
int r;
if (iph->protocol != IPPROTO_ICMP)
return NF_ACCEPT;
if (iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
skb = ip_defrag(skb);
if (!skb)
return NF_STOLEN;
*skb_p = skb;
}
return ip_vs_in_icmp(skb_p, &r);
}
/* After packet filtering, forward packet through VS/DR, VS/TUN,
or VS/NAT(change destination), so that filtering rules can be
applied to IPVS. */
static struct nf_hook_ops ip_vs_in_ops = {
.hook = ip_vs_in,
.owner = THIS_MODULE,
.pf = PF_INET,
.hooknum = NF_IP_LOCAL_IN,
.priority = 100,
};
/* After packet filtering, change source only for VS/NAT */
static struct nf_hook_ops ip_vs_out_ops = {
.hook = ip_vs_out,
.owner = THIS_MODULE,
.pf = PF_INET,
.hooknum = NF_IP_FORWARD,
.priority = 100,
};
/* After packet filtering (but before ip_vs_out_icmp), catch icmp
destined for 0.0.0.0/0, which is for incoming IPVS connections */
static struct nf_hook_ops ip_vs_forward_icmp_ops = {
.hook = ip_vs_forward_icmp,
.owner = THIS_MODULE,
.pf = PF_INET,
.hooknum = NF_IP_FORWARD,
.priority = 99,
};
/* Before the netfilter connection tracking, exit from POST_ROUTING */
static struct nf_hook_ops ip_vs_post_routing_ops = {
.hook = ip_vs_post_routing,
.owner = THIS_MODULE,
.pf = PF_INET,
.hooknum = NF_IP_POST_ROUTING,
.priority = NF_IP_PRI_NAT_SRC-1,
};
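/*
 * Hook ordering: on NF_IP_FORWARD, ip_vs_forward_icmp (priority 99) runs
 * before ip_vs_out (priority 100); on NF_IP_POST_ROUTING, ip_vs_post_routing
 * runs just before source NAT (NF_IP_PRI_NAT_SRC-1), so packets already
 * handled by IPVS leave the stack unmangled.
 */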
/*
* Initialize IP Virtual Server
*/
static int __init ip_vs_init(void)
{
int ret;
ret = ip_vs_control_init();
if (ret < 0) {
IP_VS_ERR("can't setup control.\n");
goto cleanup_nothing;
}
ip_vs_protocol_init();
ret = ip_vs_app_init();
if (ret < 0) {
IP_VS_ERR("can't setup application helper.\n");
goto cleanup_protocol;
}
ret = ip_vs_conn_init();
if (ret < 0) {
IP_VS_ERR("can't setup connection table.\n");
goto cleanup_app;
}
ret = nf_register_hook(&ip_vs_in_ops);
if (ret < 0) {
IP_VS_ERR("can't register in hook.\n");
goto cleanup_conn;
}
ret = nf_register_hook(&ip_vs_out_ops);
if (ret < 0) {
IP_VS_ERR("can't register out hook.\n");
goto cleanup_inops;
}
ret = nf_register_hook(&ip_vs_post_routing_ops);
if (ret < 0) {
IP_VS_ERR("can't register post_routing hook.\n");
goto cleanup_outops;
}
ret = nf_register_hook(&ip_vs_forward_icmp_ops);
if (ret < 0) {
IP_VS_ERR("can't register forward_icmp hook.\n");
goto cleanup_postroutingops;
}
IP_VS_INFO("ipvs loaded.\n");
return ret;
cleanup_postroutingops:
nf_unregister_hook(&ip_vs_post_routing_ops);
cleanup_outops:
nf_unregister_hook(&ip_vs_out_ops);
cleanup_inops:
nf_unregister_hook(&ip_vs_in_ops);
cleanup_conn:
ip_vs_conn_cleanup();
cleanup_app:
ip_vs_app_cleanup();
cleanup_protocol:
ip_vs_protocol_cleanup();
ip_vs_control_cleanup();
cleanup_nothing:
return ret;
}
static void __exit ip_vs_cleanup(void)
{
nf_unregister_hook(&ip_vs_forward_icmp_ops);
nf_unregister_hook(&ip_vs_post_routing_ops);
nf_unregister_hook(&ip_vs_out_ops);
nf_unregister_hook(&ip_vs_in_ops);
ip_vs_conn_cleanup();
ip_vs_app_cleanup();
ip_vs_protocol_cleanup();
ip_vs_control_cleanup();
IP_VS_INFO("ipvs unloaded.\n");
}
module_init(ip_vs_init);
module_exit(ip_vs_cleanup);
MODULE_LICENSE("GPL");
/*
* IPVS An implementation of the IP virtual server support for the
* LINUX operating system. IPVS is now implemented as a module
* over the NetFilter framework. IPVS can be used to build a
* high-performance and highly available server based on a
* cluster of servers.
*
* Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Peter Kese <peter.kese@ijs.si>
* Julian Anastasov <ja@ssi.bg>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
*
*/
#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/timer.h>
#include <linux/swap.h>
#include <linux/proc_fs.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <net/ip.h>
#include <net/sock.h>
#include <asm/uaccess.h>
#include <net/ip_vs.h>
/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
static DECLARE_MUTEX(__ip_vs_mutex);
/* lock for service table */
rwlock_t __ip_vs_svc_lock = RW_LOCK_UNLOCKED;
/* lock for table with the real services */
static rwlock_t __ip_vs_rs_lock = RW_LOCK_UNLOCKED;
/* lock for state and timeout tables */
static rwlock_t __ip_vs_securetcp_lock = RW_LOCK_UNLOCKED;
/* lock for drop entry handling */
static spinlock_t __ip_vs_dropentry_lock = SPIN_LOCK_UNLOCKED;
/* lock for drop packet handling */
static spinlock_t __ip_vs_droppacket_lock = SPIN_LOCK_UNLOCKED;
/* 1/rate drop and drop-entry variables */
int ip_vs_drop_rate = 0;
int ip_vs_drop_counter = 0;
atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
/* number of virtual services */
static int ip_vs_num_services = 0;
/* sysctl variables */
static int sysctl_ip_vs_drop_entry = 0;
static int sysctl_ip_vs_drop_packet = 0;
static int sysctl_ip_vs_secure_tcp = 0;
static int sysctl_ip_vs_amemthresh = 1024;
static int sysctl_ip_vs_am_droprate = 10;
int sysctl_ip_vs_cache_bypass = 0;
int sysctl_ip_vs_expire_nodest_conn = 0;
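/* a connection is synced to the backup when, running as sync master, its
   packet count modulo sync_threshold[1] equals sync_threshold[0]
   (see ip_vs_in) */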
int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
int sysctl_ip_vs_nat_icmp_send = 0;
#ifdef CONFIG_IP_VS_DEBUG
static int sysctl_ip_vs_debug_level = 0;
int ip_vs_get_debug_level(void)
{
return sysctl_ip_vs_debug_level;
}
#endif
/*
* update_defense_level is called from timer bh and from sysctl.
*/
void update_defense_level(void)
{
struct sysinfo i;
static int old_secure_tcp = 0;
int availmem;
int nomem;
int to_change = -1;
/* we only count free and buffered memory (in pages) */
si_meminfo(&i);
availmem = i.freeram + i.bufferram;
/* however, in linux 2.5 i.bufferram is the total page cache size;
   we need to adjust it */
/* si_swapinfo(&i); */
/* availmem = availmem - (i.totalswap - i.freeswap); */
nomem = (availmem < sysctl_ip_vs_amemthresh);
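/* Each defense strategy below follows the same sysctl convention:
   0 = never, 1 = enable automatically under memory pressure (and latch
   to 2), 2 = currently auto-enabled (drops back to 1 when memory
   recovers), 3 = always on. */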
/* drop_entry */
spin_lock(&__ip_vs_dropentry_lock);
switch (sysctl_ip_vs_drop_entry) {
case 0:
atomic_set(&ip_vs_dropentry, 0);
break;
case 1:
if (nomem) {
atomic_set(&ip_vs_dropentry, 1);
sysctl_ip_vs_drop_entry = 2;
} else {
atomic_set(&ip_vs_dropentry, 0);
}
break;
case 2:
if (nomem) {
atomic_set(&ip_vs_dropentry, 1);
} else {
atomic_set(&ip_vs_dropentry, 0);
sysctl_ip_vs_drop_entry = 1;
};
break;
case 3:
atomic_set(&ip_vs_dropentry, 1);
break;
}
spin_unlock(&__ip_vs_dropentry_lock);
/* drop_packet */
spin_lock(&__ip_vs_droppacket_lock);
switch (sysctl_ip_vs_drop_packet) {
case 0:
ip_vs_drop_rate = 0;
break;
case 1:
if (nomem) {
ip_vs_drop_rate = ip_vs_drop_counter
= sysctl_ip_vs_amemthresh /
(sysctl_ip_vs_amemthresh-availmem);
sysctl_ip_vs_drop_packet = 2;
} else {
ip_vs_drop_rate = 0;
}
break;
case 2:
if (nomem) {
ip_vs_drop_rate = ip_vs_drop_counter
= sysctl_ip_vs_amemthresh /
(sysctl_ip_vs_amemthresh-availmem);
} else {
ip_vs_drop_rate = 0;
sysctl_ip_vs_drop_packet = 1;
}
break;
case 3:
ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
break;
}
spin_unlock(&__ip_vs_droppacket_lock);
/* secure_tcp */
write_lock(&__ip_vs_securetcp_lock);
switch (sysctl_ip_vs_secure_tcp) {
case 0:
if (old_secure_tcp >= 2)
to_change = 0;
break;
case 1:
if (nomem) {
if (old_secure_tcp < 2)
to_change = 1;
sysctl_ip_vs_secure_tcp = 2;
} else {
if (old_secure_tcp >= 2)
to_change = 0;
}
break;
case 2:
if (nomem) {
if (old_secure_tcp < 2)
to_change = 1;
} else {
if (old_secure_tcp >= 2)
to_change = 0;
sysctl_ip_vs_secure_tcp = 1;
}
break;
case 3:
if (old_secure_tcp < 2)
to_change = 1;
break;
}
old_secure_tcp = sysctl_ip_vs_secure_tcp;
if (to_change >= 0)
ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
write_unlock(&__ip_vs_securetcp_lock);
}
int
ip_vs_use_count_inc(void)
{
return try_module_get(THIS_MODULE);
}
void
ip_vs_use_count_dec(void)
{
module_put(THIS_MODULE);
}
/*
* Hash table: for virtual service lookups
*/
#define IP_VS_SVC_TAB_BITS 8
#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
/* the service table hashed by <protocol, addr, port> */
static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
/* the service table hashed by fwmark */
static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
/*
* Hash table: for real service lookups
*/
#define IP_VS_RTAB_BITS 4
#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
/*
* Trash for destinations
*/
static LIST_HEAD(ip_vs_dest_trash);
/*
* FTP & NULL virtual service counters
*/
static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
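/* these counters let ip_vs_service_get() fall back to an FTP (FTPPORT)
   service or to the catch-all port-zero service only when such services
   actually exist */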
/*
* Returns hash value for virtual service
*/
static __inline__ unsigned
ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
{
register unsigned porth = ntohs(port);
return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
& IP_VS_SVC_TAB_MASK;
}
/*
* Returns hash value of fwmark for virtual service lookup
*/
static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
{
return fwmark & IP_VS_SVC_TAB_MASK;
}
/*
* Hashes a service in the ip_vs_svc_table by <proto,addr,port>
* or in the ip_vs_svc_fwm_table by fwmark.
* Should be called with locked tables.
*/
static int ip_vs_svc_hash(struct ip_vs_service *svc)
{
unsigned hash;
if (svc->flags & IP_VS_SVC_F_HASHED) {
IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
"called from %p\n", __builtin_return_address(0));
return 0;
}
if (svc->fwmark == 0) {
/*
* Hash it by <protocol,addr,port> in ip_vs_svc_table
*/
hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
list_add(&svc->s_list, &ip_vs_svc_table[hash]);
} else {
/*
* Hash it by fwmark in ip_vs_svc_fwm_table
*/
hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
}
svc->flags |= IP_VS_SVC_F_HASHED;
/* increase its refcnt because it is referenced by the svc table */
atomic_inc(&svc->refcnt);
return 1;
}
/*
* Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
* Should be called with locked tables.
*/
static int ip_vs_svc_unhash(struct ip_vs_service *svc)
{
if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
"called from %p\n", __builtin_return_address(0));
return 0;
}
if (svc->fwmark == 0) {
/* Remove it from the ip_vs_svc_table table */
list_del(&svc->s_list);
} else {
/* Remove it from the ip_vs_svc_fwm_table table */
list_del(&svc->f_list);
}
svc->flags &= ~IP_VS_SVC_F_HASHED;
atomic_dec(&svc->refcnt);
return 1;
}
/*
* Get service by {proto,addr,port} in the service table.
*/
static __inline__ struct ip_vs_service *
__ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
{
unsigned hash;
struct ip_vs_service *svc;
struct list_head *l,*e;
/* Check for "full" addressed entries */
hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
l = &ip_vs_svc_table[hash];
for (e=l->next; e!=l; e=e->next) {
svc = list_entry(e, struct ip_vs_service, s_list);
if ((svc->addr == vaddr)
&& (svc->port == vport)
&& (svc->protocol == protocol)) {
/* HIT */
atomic_inc(&svc->usecnt);
return svc;
}
}
return NULL;
}
/*
* Get service by {fwmark} in the service table.
*/
static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
{
unsigned hash;
struct ip_vs_service *svc;
struct list_head *l,*e;
/* Check for fwmark addressed entries */
hash = ip_vs_svc_fwm_hashkey(fwmark);
l = &ip_vs_svc_fwm_table[hash];
for (e=l->next; e!=l; e=e->next) {
svc = list_entry(e, struct ip_vs_service, f_list);
if (svc->fwmark == fwmark) {
/* HIT */
atomic_inc(&svc->usecnt);
return svc;
}
}
return NULL;
}
struct ip_vs_service *
ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
{
struct ip_vs_service *svc;
read_lock(&__ip_vs_svc_lock);
/*
* Check the table hashed by fwmark first
*/
if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
goto out;
/*
* Check the table hashed by <protocol,addr,port>
* for "full" addressed entries
*/
svc = __ip_vs_service_get(protocol, vaddr, vport);
if (svc == NULL
&& protocol == IPPROTO_TCP
&& atomic_read(&ip_vs_ftpsvc_counter)
&& (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
/*
* Check if ftp service entry exists, the packet
* might belong to FTP data connections.
*/
svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
}
if (svc == NULL
&& atomic_read(&ip_vs_nullsvc_counter)) {
/*
* Check if the catch-all port (port zero) exists
*/
svc = __ip_vs_service_get(protocol, vaddr, 0);
}
out:
read_unlock(&__ip_vs_svc_lock);
IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
fwmark, ip_vs_proto_name(protocol),
NIPQUAD(vaddr), ntohs(vport),
svc?"hit":"not hit");
return svc;
}
static inline void
__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
atomic_inc(&svc->refcnt);
dest->svc = svc;
}
static inline void
__ip_vs_unbind_svc(struct ip_vs_dest *dest)
{
struct ip_vs_service *svc = dest->svc;
dest->svc = NULL;
if (atomic_dec_and_test(&svc->refcnt))
kfree(svc);
}
/*
* Returns hash value for real service
*/
static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
{
register unsigned porth = ntohs(port);
return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
& IP_VS_RTAB_MASK;
}
/*
* Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
* should be called with locked tables.
*/
static int ip_vs_rs_hash(struct ip_vs_dest *dest)
{
unsigned hash;
if (!list_empty(&dest->d_list)) {
return 0;
}
/*
* Hash by proto,addr,port,
* which are the parameters of the real service.
*/
hash = ip_vs_rs_hashkey(dest->addr, dest->port);
list_add(&dest->d_list, &ip_vs_rtable[hash]);
return 1;
}
/*
* UNhashes ip_vs_dest from ip_vs_rtable.
* should be called with locked tables.
*/
static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
{
/*
* Remove it from the ip_vs_rtable table.
*/
if (!list_empty(&dest->d_list)) {
list_del(&dest->d_list);
INIT_LIST_HEAD(&dest->d_list);
}
return 1;
}
/*
* Lookup real service by <proto,addr,port> in the real service table.
*/
struct ip_vs_dest *
ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
{
unsigned hash;
struct ip_vs_dest *dest;
struct list_head *l,*e;
/*
* Check for "full" addressed entries
* Return the first found entry
*/
hash = ip_vs_rs_hashkey(daddr, dport);
l = &ip_vs_rtable[hash];
read_lock(&__ip_vs_rs_lock);
for (e=l->next; e!=l; e=e->next) {
dest = list_entry(e, struct ip_vs_dest, d_list);
if ((dest->addr == daddr)
&& (dest->port == dport)
&& ((dest->protocol == protocol) ||
dest->vfwmark)) {
/* HIT */
read_unlock(&__ip_vs_rs_lock);
return dest;
}
}
read_unlock(&__ip_vs_rs_lock);
return NULL;
}
/*
* Lookup destination by {addr,port} in the given service
*/
static struct ip_vs_dest *
ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
{
struct ip_vs_dest *dest;
struct list_head *l, *e;
/*
* Find the destination for the given service
*/
l = &svc->destinations;
for (e=l->next; e!=l; e=e->next) {
dest = list_entry(e, struct ip_vs_dest, n_list);
if ((dest->addr == daddr) && (dest->port == dport)) {
/* HIT */
return dest;
}
}
return NULL;
}
/*
* Lookup dest by {svc,addr,port} in the destination trash.
* The destination trash is used to hold the destinations that are removed
* from the service table but are still referenced by some conn entries.
 * The trash exists because a dest may be temporarily taken down (either
 * by the administrator or by a monitor program); it can then be picked
 * back from the trash, the remaining connections to the dest can
 * continue, and the counters of the dest remain useful for scheduling.
*/
static struct ip_vs_dest *
ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
{
struct ip_vs_dest *dest;
struct list_head *l, *e;
/*
* Find the destination in trash
*/
l = &ip_vs_dest_trash;
for (e=l->next; e!=l; e=e->next) {
dest = list_entry(e, struct ip_vs_dest, n_list);
IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
"refcnt=%d\n",
dest->vfwmark,
NIPQUAD(dest->addr), ntohs(dest->port),
atomic_read(&dest->refcnt));
if (dest->addr == daddr &&
dest->port == dport &&
dest->vfwmark == svc->fwmark &&
dest->protocol == svc->protocol &&
(svc->fwmark ||
(dest->vaddr == svc->addr &&
dest->vport == svc->port))) {
/* HIT */
return dest;
}
/*
* Try to purge the destination from trash if not referenced
*/
if (atomic_read(&dest->refcnt) == 1) {
IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
"from trash\n",
dest->vfwmark,
NIPQUAD(dest->addr), ntohs(dest->port));
e = e->prev;
list_del(&dest->n_list);
ip_vs_dst_reset(dest);
__ip_vs_unbind_svc(dest);
kfree(dest);
}
}
return NULL;
}
/*
* Clean up all the destinations in the trash
* Called by the ip_vs_control_cleanup()
*
 * When ip_vs_control_cleanup is invoked on ipvs module exit, the
 * service tables must already have been flushed, all the connections
 * expired, and the refcnt of each destination in the trash must be 1,
 * so we simply release them here.
*/
static void ip_vs_trash_cleanup(void)
{
struct ip_vs_dest *dest;
struct list_head *l;
l = &ip_vs_dest_trash;
while (l->next != l) {
dest = list_entry(l->next, struct ip_vs_dest, n_list);
list_del(&dest->n_list);
ip_vs_dst_reset(dest);
__ip_vs_unbind_svc(dest);
kfree(dest);
}
}
/*
* Update a destination in the given service
*/
static void
__ip_vs_update_dest(struct ip_vs_service *svc,
struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
{
int conn_flags;
/* set the weight and the flags */
atomic_set(&dest->weight, udest->weight);
conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
/* check if local node and update the flags */
if (inet_addr_type(udest->addr) == RTN_LOCAL) {
conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
| IP_VS_CONN_F_LOCALNODE;
}
/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
conn_flags |= IP_VS_CONN_F_NOOUTPUT;
} else {
/*
* Put the real service in ip_vs_rtable if not present.
* For now only for NAT!
*/
write_lock_bh(&__ip_vs_rs_lock);
ip_vs_rs_hash(dest);
write_unlock_bh(&__ip_vs_rs_lock);
}
atomic_set(&dest->conn_flags, conn_flags);
/* bind the service */
if (!dest->svc) {
__ip_vs_bind_svc(dest, svc);
} else {
if (dest->svc != svc) {
__ip_vs_unbind_svc(dest);
__ip_vs_bind_svc(dest, svc);
}
}
/* set the dest status flags */
dest->flags |= IP_VS_DEST_F_AVAILABLE;
if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
dest->u_threshold = udest->u_threshold;
dest->l_threshold = udest->l_threshold;
}
/*
* Create a destination for the given service
*/
static int
ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
struct ip_vs_dest **dest_p)
{
struct ip_vs_dest *dest;
unsigned atype;
EnterFunction(2);
atype = inet_addr_type(udest->addr);
if (atype != RTN_LOCAL && atype != RTN_UNICAST)
return -EINVAL;
dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
if (dest == NULL) {
IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
return -ENOMEM;
}
memset(dest, 0, sizeof(struct ip_vs_dest));
dest->protocol = svc->protocol;
dest->vaddr = svc->addr;
dest->vport = svc->port;
dest->vfwmark = svc->fwmark;
dest->addr = udest->addr;
dest->port = udest->port;
atomic_set(&dest->activeconns, 0);
atomic_set(&dest->inactconns, 0);
atomic_set(&dest->persistconns, 0);
atomic_set(&dest->refcnt, 0);
INIT_LIST_HEAD(&dest->d_list);
dest->dst_lock = SPIN_LOCK_UNLOCKED;
dest->stats.lock = SPIN_LOCK_UNLOCKED;
__ip_vs_update_dest(svc, dest, udest);
ip_vs_new_estimator(&dest->stats);
*dest_p = dest;
LeaveFunction(2);
return 0;
}
/*
* Add a destination into an existing service
*/
static int
ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
{
struct ip_vs_dest *dest;
__u32 daddr = udest->addr;
__u16 dport = udest->port;
int ret;
EnterFunction(2);
if (udest->weight < 0) {
IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
return -ERANGE;
}
/*
* Check if the dest already exists in the list
*/
dest = ip_vs_lookup_dest(svc, daddr, dport);
if (dest != NULL) {
IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
return -EEXIST;
}
/*
* Check if the dest already exists in the trash and
* is from the same service
*/
dest = ip_vs_trash_get_dest(svc, daddr, dport);
if (dest != NULL) {
IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
"refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
NIPQUAD(daddr), ntohs(dport),
atomic_read(&dest->refcnt),
dest->vfwmark,
NIPQUAD(dest->vaddr),
ntohs(dest->vport));
__ip_vs_update_dest(svc, dest, udest);
/*
* Get the destination from the trash
*/
list_del(&dest->n_list);
ip_vs_new_estimator(&dest->stats);
write_lock_bh(&__ip_vs_svc_lock);
/*
* Wait until all other svc users go away.
*/
IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
list_add(&dest->n_list, &svc->destinations);
svc->num_dests++;
/* call the update_service function of its scheduler */
svc->scheduler->update_service(svc);
write_unlock_bh(&__ip_vs_svc_lock);
return 0;
}
/*
* Allocate and initialize the dest structure
*/
ret = ip_vs_new_dest(svc, udest, &dest);
if (ret) {
return ret;
}
/*
* Add the dest entry into the list
*/
atomic_inc(&dest->refcnt);
write_lock_bh(&__ip_vs_svc_lock);
/*
* Wait until all other svc users go away.
*/
IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
list_add(&dest->n_list, &svc->destinations);
svc->num_dests++;
/* call the update_service function of its scheduler */
svc->scheduler->update_service(svc);
write_unlock_bh(&__ip_vs_svc_lock);
LeaveFunction(2);
return 0;
}
/*
* Edit a destination in the given service
*/
static int
ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
{
struct ip_vs_dest *dest;
__u32 daddr = udest->addr;
__u16 dport = udest->port;
EnterFunction(2);
if (udest->weight < 0) {
		IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
return -ERANGE;
}
/*
* Lookup the destination list
*/
dest = ip_vs_lookup_dest(svc, daddr, dport);
if (dest == NULL) {
IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
return -ENOENT;
}
__ip_vs_update_dest(svc, dest, udest);
/* call the update_service, because server weight may be changed */
svc->scheduler->update_service(svc);
LeaveFunction(2);
return 0;
}
/*
* Delete a destination (must be already unlinked from the service)
*/
static void __ip_vs_del_dest(struct ip_vs_dest *dest)
{
ip_vs_kill_estimator(&dest->stats);
/*
	 *  Remove it from the doubly-linked list of real services.
*/
write_lock_bh(&__ip_vs_rs_lock);
ip_vs_rs_unhash(dest);
write_unlock_bh(&__ip_vs_rs_lock);
/*
* Decrease the refcnt of the dest, and free the dest
* if nobody refers to it (refcnt=0). Otherwise, throw
* the destination into the trash.
*/
if (atomic_dec_and_test(&dest->refcnt)) {
ip_vs_dst_reset(dest);
/* simply decrease svc->refcnt here, let the caller check
and release the service if nobody refers to it.
Only user context can release destination and service,
and only one user context can update virtual service at a
time, so the operation here is OK */
atomic_dec(&dest->svc->refcnt);
kfree(dest);
} else {
IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n",
NIPQUAD(dest->addr), ntohs(dest->port),
atomic_read(&dest->refcnt));
list_add(&dest->n_list, &ip_vs_dest_trash);
atomic_inc(&dest->refcnt);
}
}
/*
* Unlink a destination from the given service
*/
static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
struct ip_vs_dest *dest,
int svcupd)
{
dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
/*
	 *  Remove it from the doubly-linked destination list.
*/
list_del(&dest->n_list);
svc->num_dests--;
if (svcupd) {
/*
* Call the update_service function of its scheduler
*/
svc->scheduler->update_service(svc);
}
}
/*
* Delete a destination server in the given service
*/
static int
ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
{
struct ip_vs_dest *dest;
__u32 daddr = udest->addr;
__u16 dport = udest->port;
EnterFunction(2);
dest = ip_vs_lookup_dest(svc, daddr, dport);
if (dest == NULL) {
IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
return -ENOENT;
}
write_lock_bh(&__ip_vs_svc_lock);
/*
* Wait until all other svc users go away.
*/
IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
/*
* Unlink dest from the service
*/
__ip_vs_unlink_dest(svc, dest, 1);
write_unlock_bh(&__ip_vs_svc_lock);
/*
* Delete the destination
*/
__ip_vs_del_dest(dest);
LeaveFunction(2);
return 0;
}
/*
* Add a service into the service hash table
*/
static int
ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
{
int ret = 0;
struct ip_vs_scheduler *sched = NULL;
struct ip_vs_service *svc = NULL;
/* increase the module use count */
ip_vs_use_count_inc();
/* Lookup the scheduler by 'u->sched_name' */
sched = ip_vs_scheduler_get(u->sched_name);
if (sched == NULL) {
IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
u->sched_name);
ret = -ENOENT;
goto out_mod_dec;
}
svc = (struct ip_vs_service *)
kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
if (svc == NULL) {
IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
ret = -ENOMEM;
goto out_err;
}
memset(svc, 0, sizeof(struct ip_vs_service));
/* I'm the first user of the service */
atomic_set(&svc->usecnt, 1);
atomic_set(&svc->refcnt, 0);
svc->protocol = u->protocol;
svc->addr = u->addr;
svc->port = u->port;
svc->fwmark = u->fwmark;
svc->flags = u->flags;
svc->timeout = u->timeout * HZ;
svc->netmask = u->netmask;
INIT_LIST_HEAD(&svc->destinations);
svc->sched_lock = RW_LOCK_UNLOCKED;
svc->stats.lock = SPIN_LOCK_UNLOCKED;
/* Bind the scheduler */
ret = ip_vs_bind_scheduler(svc, sched);
if (ret)
goto out_err;
sched = NULL;
/* Update the virtual service counters */
if (svc->port == FTPPORT)
atomic_inc(&ip_vs_ftpsvc_counter);
else if (svc->port == 0)
atomic_inc(&ip_vs_nullsvc_counter);
ip_vs_new_estimator(&svc->stats);
ip_vs_num_services++;
/* Hash the service into the service table */
write_lock_bh(&__ip_vs_svc_lock);
ip_vs_svc_hash(svc);
write_unlock_bh(&__ip_vs_svc_lock);
*svc_p = svc;
return 0;
out_err:
if (svc != NULL) {
if (svc->scheduler)
ip_vs_unbind_scheduler(svc);
if (svc->inc) {
local_bh_disable();
ip_vs_app_inc_put(svc->inc);
local_bh_enable();
}
kfree(svc);
}
ip_vs_scheduler_put(sched);
out_mod_dec:
/* decrease the module use count */
ip_vs_use_count_dec();
return ret;
}
/*
* Edit a service and bind it with a new scheduler
*/
static int
ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
{
struct ip_vs_scheduler *sched, *old_sched;
int ret = 0;
/*
* Lookup the scheduler, by 'u->sched_name'
*/
sched = ip_vs_scheduler_get(u->sched_name);
if (sched == NULL) {
IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
u->sched_name);
return -ENOENT;
}
old_sched = sched;
write_lock_bh(&__ip_vs_svc_lock);
/*
* Wait until all other svc users go away.
*/
IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
/*
* Set the flags and timeout value
*/
svc->flags = u->flags | IP_VS_SVC_F_HASHED;
svc->timeout = u->timeout * HZ;
svc->netmask = u->netmask;
old_sched = svc->scheduler;
if (sched != old_sched) {
/*
* Unbind the old scheduler
*/
if ((ret = ip_vs_unbind_scheduler(svc))) {
old_sched = sched;
goto out;
}
/*
* Bind the new scheduler
*/
if ((ret = ip_vs_bind_scheduler(svc, sched))) {
/*
* If ip_vs_bind_scheduler fails, restore the old
* scheduler.
			 * The main reason for failure is lack of memory.
			 *
			 * The open question is whether the old scheduler can
			 * always be restored.  TODO: if it ever cannot be
			 * restored, we must delete the service, otherwise the
			 * system may crash.
*/
ip_vs_bind_scheduler(svc, old_sched);
old_sched = sched;
goto out;
}
}
out:
write_unlock_bh(&__ip_vs_svc_lock);
if (old_sched)
ip_vs_scheduler_put(old_sched);
return ret;
}
/*
* Delete a service from the service list
* - The service must be unlinked, unlocked and not referenced!
* - We are called under _bh lock
*/
static void __ip_vs_del_service(struct ip_vs_service *svc)
{
struct list_head *l;
struct ip_vs_dest *dest;
struct ip_vs_scheduler *old_sched;
ip_vs_num_services--;
ip_vs_kill_estimator(&svc->stats);
/* Unbind scheduler */
old_sched = svc->scheduler;
ip_vs_unbind_scheduler(svc);
if (old_sched)
ip_vs_scheduler_put(old_sched);
/* Unbind app inc */
if (svc->inc) {
ip_vs_app_inc_put(svc->inc);
svc->inc = NULL;
}
/*
* Unlink the whole destination list
*/
l = &svc->destinations;
while (l->next != l) {
dest = list_entry(l->next, struct ip_vs_dest, n_list);
__ip_vs_unlink_dest(svc, dest, 0);
__ip_vs_del_dest(dest);
}
/*
* Update the virtual service counters
*/
if (svc->port == FTPPORT)
atomic_dec(&ip_vs_ftpsvc_counter);
else if (svc->port == 0)
atomic_dec(&ip_vs_nullsvc_counter);
/*
* Free the service if nobody refers to it
*/
if (atomic_read(&svc->refcnt) == 0)
kfree(svc);
/* decrease the module use count */
ip_vs_use_count_dec();
}
/*
* Delete a service from the service list
*/
static int ip_vs_del_service(struct ip_vs_service *svc)
{
if (svc == NULL)
return -EEXIST;
/*
* Unhash it from the service table
*/
write_lock_bh(&__ip_vs_svc_lock);
ip_vs_svc_unhash(svc);
/*
* Wait until all the svc users go away.
*/
IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
__ip_vs_del_service(svc);
write_unlock_bh(&__ip_vs_svc_lock);
return 0;
}
/*
* Flush all the virtual services
*/
static int ip_vs_flush(void)
{
int idx;
struct ip_vs_service *svc;
struct list_head *l;
/*
* Flush the service table hashed by <protocol,addr,port>
*/
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
l = &ip_vs_svc_table[idx];
while (l->next != l) {
svc = list_entry(l->next,struct ip_vs_service,s_list);
write_lock_bh(&__ip_vs_svc_lock);
ip_vs_svc_unhash(svc);
/*
* Wait until all the svc users go away.
*/
IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
__ip_vs_del_service(svc);
write_unlock_bh(&__ip_vs_svc_lock);
}
}
/*
* Flush the service table hashed by fwmark
*/
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
l = &ip_vs_svc_fwm_table[idx];
while (l->next != l) {
svc = list_entry(l->next,struct ip_vs_service,f_list);
write_lock_bh(&__ip_vs_svc_lock);
ip_vs_svc_unhash(svc);
/*
* Wait until all the svc users go away.
*/
IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
__ip_vs_del_service(svc);
write_unlock_bh(&__ip_vs_svc_lock);
}
}
return 0;
}
/*
* Zero counters in a service or all services
*/
static void
ip_vs_zero_stats(struct ip_vs_stats *stats)
{
spin_lock_bh(&stats->lock);
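	/* zero only the fields that precede the embedded lock,
	 * leaving the spinlock itself untouched */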
memset(stats, 0, (char *)&stats->lock - (char *)stats);
spin_unlock_bh(&stats->lock);
ip_vs_zero_estimator(stats);
}
static int ip_vs_zero_service(struct ip_vs_service *svc)
{
struct list_head *l;
struct ip_vs_dest *dest;
write_lock_bh(&__ip_vs_svc_lock);
list_for_each (l, &svc->destinations) {
dest = list_entry(l, struct ip_vs_dest, n_list);
ip_vs_zero_stats(&dest->stats);
}
ip_vs_zero_stats(&svc->stats);
write_unlock_bh(&__ip_vs_svc_lock);
return 0;
}
static int ip_vs_zero_all(void)
{
int idx;
struct list_head *l;
struct ip_vs_service *svc;
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each (l, &ip_vs_svc_table[idx]) {
svc = list_entry(l, struct ip_vs_service, s_list);
ip_vs_zero_service(svc);
}
}
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each (l, &ip_vs_svc_fwm_table[idx]) {
svc = list_entry(l, struct ip_vs_service, f_list);
ip_vs_zero_service(svc);
}
}
ip_vs_zero_stats(&ip_vs_stats);
return 0;
}
static int
proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
void *buffer, size_t *lenp)
{
int *valp = table->data;
int val = *valp;
int rc;
rc = proc_dointvec(table, write, filp, buffer, lenp);
if (write && (*valp != val)) {
if ((*valp < 0) || (*valp > 3)) {
/* Restore the correct value */
*valp = val;
} else {
local_bh_disable();
update_defense_level();
local_bh_enable();
}
}
return rc;
}
static int
proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
void *buffer, size_t *lenp)
{
int *valp = table->data;
int val[2];
int rc;
/* backup the value first */
memcpy(val, valp, sizeof(val));
rc = proc_dointvec(table, write, filp, buffer, lenp);
if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
/* Restore the correct value */
memcpy(valp, val, sizeof(val));
}
return rc;
}
/*
 *	IPVS sysctl table (under /proc/sys/net/ipv4/vs/)
*/
struct ip_vs_sysctl_table {
struct ctl_table_header *sysctl_header;
ctl_table vs_vars[NET_IPV4_VS_LAST];
ctl_table vs_dir[2];
ctl_table ipv4_dir[2];
ctl_table root_dir[2];
};
static struct ip_vs_sysctl_table ipv4_vs_table = {
NULL,
{{NET_IPV4_VS_AMEMTHRESH, "amemthresh",
&sysctl_ip_vs_amemthresh, sizeof(int), 0644, NULL,
&proc_dointvec},
#ifdef CONFIG_IP_VS_DEBUG
{NET_IPV4_VS_DEBUG_LEVEL, "debug_level",
&sysctl_ip_vs_debug_level, sizeof(int), 0644, NULL,
&proc_dointvec},
#endif
{NET_IPV4_VS_AMDROPRATE, "am_droprate",
&sysctl_ip_vs_am_droprate, sizeof(int), 0644, NULL,
&proc_dointvec},
{NET_IPV4_VS_DROP_ENTRY, "drop_entry",
&sysctl_ip_vs_drop_entry, sizeof(int), 0644, NULL,
&proc_do_defense_mode},
{NET_IPV4_VS_DROP_PACKET, "drop_packet",
&sysctl_ip_vs_drop_packet, sizeof(int), 0644, NULL,
&proc_do_defense_mode},
{NET_IPV4_VS_SECURE_TCP, "secure_tcp",
&sysctl_ip_vs_secure_tcp, sizeof(int), 0644, NULL,
&proc_do_defense_mode},
#if 0
{NET_IPV4_VS_TO_ES, "timeout_established",
&vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
{NET_IPV4_VS_TO_SS, "timeout_synsent",
&vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
{NET_IPV4_VS_TO_SR, "timeout_synrecv",
&vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
{NET_IPV4_VS_TO_FW, "timeout_finwait",
&vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
{NET_IPV4_VS_TO_TW, "timeout_timewait",
&vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
{NET_IPV4_VS_TO_CL, "timeout_close",
&vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
{NET_IPV4_VS_TO_CW, "timeout_closewait",
&vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
{NET_IPV4_VS_TO_LA, "timeout_lastack",
&vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
{NET_IPV4_VS_TO_LI, "timeout_listen",
&vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
{NET_IPV4_VS_TO_SA, "timeout_synack",
&vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
{NET_IPV4_VS_TO_UDP, "timeout_udp",
&vs_timeout_table_dos.timeout[IP_VS_S_UDP],
sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
{NET_IPV4_VS_TO_ICMP, "timeout_icmp",
&vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
#endif
{NET_IPV4_VS_CACHE_BYPASS, "cache_bypass",
&sysctl_ip_vs_cache_bypass, sizeof(int), 0644, NULL,
&proc_dointvec},
{NET_IPV4_VS_EXPIRE_NODEST_CONN, "expire_nodest_conn",
&sysctl_ip_vs_expire_nodest_conn, sizeof(int), 0644, NULL,
&proc_dointvec},
{NET_IPV4_VS_SYNC_THRESHOLD, "sync_threshold",
&sysctl_ip_vs_sync_threshold, sizeof(sysctl_ip_vs_sync_threshold),
0644, NULL, &proc_do_sync_threshold},
{NET_IPV4_VS_NAT_ICMP_SEND, "nat_icmp_send",
&sysctl_ip_vs_nat_icmp_send, sizeof(int), 0644, NULL,
&proc_dointvec},
{0}},
{{NET_IPV4_VS, "vs", NULL, 0, 0555, ipv4_vs_table.vs_vars},
{0}},
{{NET_IPV4, "ipv4", NULL, 0, 0555, ipv4_vs_table.vs_dir},
{0}},
{{CTL_NET, "net", NULL, 0, 0555, ipv4_vs_table.ipv4_dir},
{0}}
};
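/*
 * The nested tables above are chained root_dir ("net") -> ipv4_dir
 * ("ipv4") -> vs_dir ("vs") -> vs_vars, so registering root_dir in
 * ip_vs_control_init() makes each variable show up as
 * /proc/sys/net/ipv4/vs/<name>.
 */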
/*
* Write the contents of the VS rule table to a PROCfs file.
* (It is kept just for backward compatibility)
*/
static inline char *ip_vs_fwd_name(unsigned flags)
{
char *fwd;
switch (flags & IP_VS_CONN_F_FWD_MASK) {
case IP_VS_CONN_F_LOCALNODE:
fwd = "Local";
break;
case IP_VS_CONN_F_TUNNEL:
fwd = "Tunnel";
break;
case IP_VS_CONN_F_DROUTE:
fwd = "Route";
break;
default:
fwd = "Masq";
}
return fwd;
}
static inline int sprintf_dest(char *str, struct ip_vs_dest *dest)
{
return sprintf(str, " -> %08X:%04X %-7s %-6d %-10d %-10d",
ntohl(dest->addr), ntohs(dest->port),
ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
atomic_read(&dest->weight),
atomic_read(&dest->activeconns),
atomic_read(&dest->inactconns));
}
static int ip_vs_get_info(char *buf, char **start, off_t offset, int length)
{
int len=0;
off_t pos=0;
char temp[64], temp2[32];
int idx;
struct ip_vs_service *svc;
struct ip_vs_dest *dest;
struct list_head *l, *e, *p, *q;
/*
	 * Note: since the length of the buffer is usually a multiple of
	 * 512, it is good to use a fixed record length that divides 512,
	 * so that records won't be truncated at the buffer boundary.
*/
pos = 192;
if (pos > offset) {
sprintf(temp,
"IP Virtual Server version %d.%d.%d (size=%d)",
NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
len += sprintf(buf+len, "%-63s\n", temp);
len += sprintf(buf+len, "%-63s\n",
"Prot LocalAddress:Port Scheduler Flags");
len += sprintf(buf+len, "%-63s\n",
" -> RemoteAddress:Port Forward Weight ActiveConn InActConn");
}
read_lock_bh(&__ip_vs_svc_lock);
/* print the service table hashed by <protocol,addr,port> */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
l = &ip_vs_svc_table[idx];
for (e=l->next; e!=l; e=e->next) {
svc = list_entry(e, struct ip_vs_service, s_list);
pos += 64;
if (pos > offset) {
if (svc->flags & IP_VS_SVC_F_PERSISTENT)
sprintf(temp2, "persistent %d %08X",
svc->timeout,
ntohl(svc->netmask));
else
temp2[0] = '\0';
sprintf(temp, "%s %08X:%04X %s %s",
ip_vs_proto_name(svc->protocol),
ntohl(svc->addr),
ntohs(svc->port),
svc->scheduler->name, temp2);
len += sprintf(buf+len, "%-63s\n", temp);
if (len >= length)
goto done;
}
p = &svc->destinations;
for (q=p->next; q!=p; q=q->next) {
dest = list_entry(q, struct ip_vs_dest, n_list);
pos += 64;
if (pos <= offset)
continue;
sprintf_dest(temp, dest);
len += sprintf(buf+len, "%-63s\n", temp);
if (len >= length)
goto done;
}
}
}
/* print the service table hashed by fwmark */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
l = &ip_vs_svc_fwm_table[idx];
for (e=l->next; e!=l; e=e->next) {
svc = list_entry(e, struct ip_vs_service, f_list);
pos += 64;
if (pos > offset) {
if (svc->flags & IP_VS_SVC_F_PERSISTENT)
sprintf(temp2, "persistent %d %08X",
svc->timeout,
ntohl(svc->netmask));
else
temp2[0] = '\0';
sprintf(temp, "FWM %08X %s %s",
svc->fwmark,
svc->scheduler->name, temp2);
len += sprintf(buf+len, "%-63s\n", temp);
if (len >= length)
goto done;
}
p = &svc->destinations;
for (q=p->next; q!=p; q=q->next) {
dest = list_entry(q, struct ip_vs_dest, n_list);
pos += 64;
if (pos <= offset)
continue;
sprintf_dest(temp, dest);
len += sprintf(buf+len, "%-63s\n", temp);
if (len >= length)
goto done;
}
}
}
done:
read_unlock_bh(&__ip_vs_svc_lock);
*start = buf+len-(pos-offset); /* Start of wanted data */
len = pos-offset;
if (len > length)
len = length;
if (len < 0)
len = 0;
return len;
}
struct ip_vs_stats ip_vs_stats;
static int
ip_vs_stats_get_info(char *buf, char **start, off_t offset, int length)
{
int len=0;
off_t pos=0;
char temp[64];
pos += 320;
if (pos > offset) {
len += sprintf(buf+len, "%-63s\n%-63s\n",
/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
" Total Incoming Outgoing Incoming Outgoing",
" Conns Packets Packets Bytes Bytes");
spin_lock_bh(&ip_vs_stats.lock);
sprintf(temp, "%8X %8X %8X %8X%08X %8X%08X",
ip_vs_stats.conns,
ip_vs_stats.inpkts,
ip_vs_stats.outpkts,
(__u32)(ip_vs_stats.inbytes>>32),
(__u32)ip_vs_stats.inbytes,
(__u32)(ip_vs_stats.outbytes>>32),
(__u32)ip_vs_stats.outbytes);
len += sprintf(buf+len, "%-62s\n\n", temp);
len += sprintf(buf+len, "%-63s\n",
/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
" Conns/s Pkts/s Pkts/s Bytes/s Bytes/s");
sprintf(temp, "%8X %8X %8X %16X %16X",
ip_vs_stats.cps,
ip_vs_stats.inpps,
ip_vs_stats.outpps,
ip_vs_stats.inbps,
ip_vs_stats.outbps);
len += sprintf(buf+len, "%-63s\n", temp);
spin_unlock_bh(&ip_vs_stats.lock);
}
*start = buf+len-(pos-offset); /* Start of wanted data */
len = pos-offset;
if (len > length)
len = length;
if (len < 0)
len = 0;
return len;
}
/*
* Set timeout values for tcp tcpfin udp in the timeout_table.
*/
static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
{
IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
u->tcp_timeout,
u->tcp_fin_timeout,
u->udp_timeout);
#ifdef CONFIG_IP_VS_PROTO_TCP
if (u->tcp_timeout) {
ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
= u->tcp_timeout * HZ;
}
if (u->tcp_fin_timeout) {
ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
= u->tcp_fin_timeout * HZ;
}
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
if (u->udp_timeout) {
ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
= u->udp_timeout * HZ;
}
#endif
return 0;
}
#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user))
#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \
sizeof(struct ip_vs_dest_user))
#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user))
#define MAX_ARG_LEN SVCDEST_ARG_LEN
static unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
[SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN,
[SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN,
[SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN,
[SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0,
[SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN,
[SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN,
[SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN,
[SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN,
[SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN,
[SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN,
[SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN,
};
static int
do_ip_vs_set_ctl(struct sock *sk, int cmd, void *user, unsigned int len)
{
int ret;
unsigned char arg[MAX_ARG_LEN];
struct ip_vs_service_user *usvc;
struct ip_vs_service *svc;
struct ip_vs_dest_user *udest;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
if (len != set_arglen[SET_CMDID(cmd)]) {
IP_VS_ERR("set_ctl: len %u != %u\n",
len, set_arglen[SET_CMDID(cmd)]);
return -EINVAL;
}
if (copy_from_user(arg, user, len) != 0)
return -EFAULT;
/* increase the module use count */
ip_vs_use_count_inc();
if (down_interruptible(&__ip_vs_mutex)) {
ret = -ERESTARTSYS;
goto out_dec;
}
if (cmd == IP_VS_SO_SET_FLUSH) {
		/* Flush all the virtual services */
ret = ip_vs_flush();
goto out_unlock;
} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
/* Set timeout values for (tcp tcpfin udp) */
ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
goto out_unlock;
} else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
goto out_unlock;
} else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
ret = stop_sync_thread(dm->state);
goto out_unlock;
}
usvc = (struct ip_vs_service_user *)arg;
udest = (struct ip_vs_dest_user *)(usvc + 1);
if (cmd == IP_VS_SO_SET_ZERO) {
/* if no service address is set, zero counters in all */
if (!usvc->fwmark && !usvc->addr && !usvc->port) {
ret = ip_vs_zero_all();
goto out_unlock;
}
}
/* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
IP_VS_INFO("vs_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s",
ntohs(usvc->protocol), NIPQUAD(usvc->addr),
ntohs(usvc->port), usvc->sched_name);
ret = -EFAULT;
goto out_unlock;
}
/* Lookup the exact service by <protocol, addr, port> or fwmark */
if (usvc->fwmark == 0)
svc = __ip_vs_service_get(usvc->protocol,
usvc->addr, usvc->port);
else
svc = __ip_vs_svc_fwm_get(usvc->fwmark);
if (cmd != IP_VS_SO_SET_ADD
&& (svc == NULL || svc->protocol != usvc->protocol)) {
ret = -ESRCH;
goto out_unlock;
}
switch (cmd) {
case IP_VS_SO_SET_ADD:
if (svc != NULL)
ret = -EEXIST;
else
ret = ip_vs_add_service(usvc, &svc);
break;
case IP_VS_SO_SET_EDIT:
ret = ip_vs_edit_service(svc, usvc);
break;
case IP_VS_SO_SET_DEL:
ret = ip_vs_del_service(svc);
if (!ret)
goto out_unlock;
break;
case IP_VS_SO_SET_ZERO:
ret = ip_vs_zero_service(svc);
break;
case IP_VS_SO_SET_ADDDEST:
ret = ip_vs_add_dest(svc, udest);
break;
case IP_VS_SO_SET_EDITDEST:
ret = ip_vs_edit_dest(svc, udest);
break;
case IP_VS_SO_SET_DELDEST:
ret = ip_vs_del_dest(svc, udest);
break;
default:
ret = -EINVAL;
}
if (svc)
ip_vs_service_put(svc);
out_unlock:
up(&__ip_vs_mutex);
out_dec:
/* decrease the module use count */
ip_vs_use_count_dec();
return ret;
}
static void
ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
{
spin_lock_bh(&src->lock);
memcpy(dst, src, (char*)&src->lock - (char*)src);
spin_unlock_bh(&src->lock);
}
static void
ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
{
dst->protocol = src->protocol;
dst->addr = src->addr;
dst->port = src->port;
dst->fwmark = src->fwmark;
strcpy(dst->sched_name, src->scheduler->name);
dst->flags = src->flags;
dst->timeout = src->timeout / HZ;
dst->netmask = src->netmask;
dst->num_dests = src->num_dests;
ip_vs_copy_stats(&dst->stats, &src->stats);
}
static inline int
__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
struct ip_vs_get_services *uptr)
{
int idx, count=0;
struct ip_vs_service *svc;
struct list_head *l;
struct ip_vs_service_entry entry;
int ret = 0;
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each (l, &ip_vs_svc_table[idx]) {
if (count >= get->num_services)
goto out;
svc = list_entry(l, struct ip_vs_service, s_list);
ip_vs_copy_service(&entry, svc);
if (copy_to_user(&uptr->entrytable[count],
&entry, sizeof(entry))) {
ret = -EFAULT;
goto out;
}
count++;
}
}
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each (l, &ip_vs_svc_fwm_table[idx]) {
if (count >= get->num_services)
goto out;
svc = list_entry(l, struct ip_vs_service, f_list);
ip_vs_copy_service(&entry, svc);
if (copy_to_user(&uptr->entrytable[count],
&entry, sizeof(entry))) {
ret = -EFAULT;
goto out;
}
count++;
}
}
out:
return ret;
}
static inline int
__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
struct ip_vs_get_dests *uptr)
{
struct ip_vs_service *svc;
int ret = 0;
if (get->fwmark)
svc = __ip_vs_svc_fwm_get(get->fwmark);
else
svc = __ip_vs_service_get(get->protocol,
get->addr, get->port);
if (svc) {
int count = 0;
struct ip_vs_dest *dest;
struct list_head *l, *e;
struct ip_vs_dest_entry entry;
l = &svc->destinations;
for (e=l->next; e!=l; e=e->next) {
if (count >= get->num_dests)
break;
dest = list_entry(e, struct ip_vs_dest, n_list);
entry.addr = dest->addr;
entry.port = dest->port;
entry.conn_flags = atomic_read(&dest->conn_flags);
entry.weight = atomic_read(&dest->weight);
entry.u_threshold = dest->u_threshold;
entry.l_threshold = dest->l_threshold;
entry.activeconns = atomic_read(&dest->activeconns);
entry.inactconns = atomic_read(&dest->inactconns);
entry.persistconns = atomic_read(&dest->persistconns);
ip_vs_copy_stats(&entry.stats, &dest->stats);
if (copy_to_user(&uptr->entrytable[count],
&entry, sizeof(entry))) {
ret = -EFAULT;
break;
}
count++;
}
ip_vs_service_put(svc);
} else
ret = -ESRCH;
return ret;
}
static inline void
__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
{
#ifdef CONFIG_IP_VS_PROTO_TCP
u->tcp_timeout =
ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
u->tcp_fin_timeout =
ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
u->udp_timeout =
ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
#endif
}
#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo))
#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services))
#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry))
#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests))
#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2)
static unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
[GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
[GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
[GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
[GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN,
[GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN,
[GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN,
[GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN,
};
static int
do_ip_vs_get_ctl(struct sock *sk, int cmd, void *user, int *len)
{
unsigned char arg[128];
int ret = 0;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
if (*len < get_arglen[GET_CMDID(cmd)]) {
IP_VS_ERR("get_ctl: len %u < %u\n",
*len, get_arglen[GET_CMDID(cmd)]);
return -EINVAL;
}
if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
return -EFAULT;
if (down_interruptible(&__ip_vs_mutex))
return -ERESTARTSYS;
switch (cmd) {
case IP_VS_SO_GET_VERSION:
{
char buf[64];
sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
ret = -EFAULT;
goto out;
}
*len = strlen(buf)+1;
}
break;
case IP_VS_SO_GET_INFO:
{
struct ip_vs_getinfo info;
info.version = IP_VS_VERSION_CODE;
info.size = IP_VS_CONN_TAB_SIZE;
info.num_services = ip_vs_num_services;
if (copy_to_user(user, &info, sizeof(info)) != 0)
ret = -EFAULT;
}
break;
case IP_VS_SO_GET_SERVICES:
{
struct ip_vs_get_services *get;
int size;
get = (struct ip_vs_get_services *)arg;
size = sizeof(*get) +
sizeof(struct ip_vs_service_entry) * get->num_services;
if (*len != size) {
IP_VS_ERR("length: %u != %u\n", *len, size);
ret = -EINVAL;
goto out;
}
ret = __ip_vs_get_service_entries(get, user);
}
break;
case IP_VS_SO_GET_SERVICE:
{
struct ip_vs_service_entry *entry;
struct ip_vs_service *svc;
entry = (struct ip_vs_service_entry *)arg;
if (entry->fwmark)
svc = __ip_vs_svc_fwm_get(entry->fwmark);
else
svc = __ip_vs_service_get(entry->protocol,
entry->addr, entry->port);
if (svc) {
ip_vs_copy_service(entry, svc);
if (copy_to_user(user, entry, sizeof(*entry)) != 0)
ret = -EFAULT;
ip_vs_service_put(svc);
} else
ret = -ESRCH;
}
break;
case IP_VS_SO_GET_DESTS:
{
struct ip_vs_get_dests *get;
int size;
get = (struct ip_vs_get_dests *)arg;
size = sizeof(*get) +
sizeof(struct ip_vs_dest_entry) * get->num_dests;
if (*len != size) {
IP_VS_ERR("length: %u != %u\n", *len, size);
ret = -EINVAL;
goto out;
}
ret = __ip_vs_get_dest_entries(get, user);
}
break;
case IP_VS_SO_GET_TIMEOUT:
{
struct ip_vs_timeout_user t;
__ip_vs_get_timeouts(&t);
if (copy_to_user(user, &t, sizeof(t)) != 0)
ret = -EFAULT;
}
break;
case IP_VS_SO_GET_DAEMON:
{
struct ip_vs_daemon_user d[2];
memset(&d, 0, sizeof(d));
if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
d[0].state = IP_VS_STATE_MASTER;
strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn);
d[0].syncid = ip_vs_master_syncid;
}
if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
d[1].state = IP_VS_STATE_BACKUP;
strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn);
d[1].syncid = ip_vs_backup_syncid;
}
if (copy_to_user(user, &d, sizeof(d)) != 0)
ret = -EFAULT;
}
break;
default:
ret = -EINVAL;
}
out:
up(&__ip_vs_mutex);
return ret;
}
static struct nf_sockopt_ops ip_vs_sockopts = {
{ NULL, NULL }, PF_INET,
IP_VS_BASE_CTL, IP_VS_SO_SET_MAX+1, do_ip_vs_set_ctl,
IP_VS_BASE_CTL, IP_VS_SO_GET_MAX+1, do_ip_vs_get_ctl
};
int ip_vs_control_init(void)
{
int ret;
int idx;
EnterFunction(2);
ret = nf_register_sockopt(&ip_vs_sockopts);
if (ret) {
IP_VS_ERR("cannot register sockopt.\n");
return ret;
}
proc_net_create("ip_vs", 0, ip_vs_get_info);
proc_net_create("ip_vs_stats", 0, ip_vs_stats_get_info);
ipv4_vs_table.sysctl_header =
register_sysctl_table(ipv4_vs_table.root_dir, 0);
/* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
}
for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
INIT_LIST_HEAD(&ip_vs_rtable[idx]);
}
memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
ip_vs_stats.lock = SPIN_LOCK_UNLOCKED;
ip_vs_new_estimator(&ip_vs_stats);
LeaveFunction(2);
return 0;
}
void ip_vs_control_cleanup(void)
{
EnterFunction(2);
ip_vs_trash_cleanup();
ip_vs_kill_estimator(&ip_vs_stats);
unregister_sysctl_table(ipv4_vs_table.sysctl_header);
proc_net_remove("ip_vs_stats");
proc_net_remove("ip_vs");
nf_unregister_sockopt(&ip_vs_sockopts);
LeaveFunction(2);
}
/*
* IPVS: Destination Hashing scheduling module
*
* Version: $Id: ip_vs_dh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $
*
* Authors: Wensong Zhang <wensong@gnuchina.org>
*
* Inspired by the consistent hashing scheduler patch from
* Thomas Proell <proellt@gmx.de>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
*
*/
/*
* The dh algorithm is to select server by the hash key of destination IP
* address. The pseudo code is as follows:
*
* n <- servernode[dest_ip];
* if (n is dead) OR
* (n is overloaded) OR (n.weight <= 0) then
* return NULL;
*
* return n;
*
 * Note that servernode is a 256-bucket hash table that maps the hash
 * index derived from the packet destination IP address to the current
 * server array. If the dh scheduler is used in a cache cluster, it is
 * good to combine it with the cache_bypass feature: when the statically
 * assigned server is dead or overloaded, the load balancer can bypass
 * the cache server and send requests to the original server directly.
*
*/
#include <linux/config.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <net/ip_vs.h>
/*
* IPVS DH bucket
*/
struct ip_vs_dh_bucket {
struct ip_vs_dest *dest; /* real server (cache) */
};
/*
* for IPVS DH entry hash table
*/
#ifndef CONFIG_IP_VS_DH_TAB_BITS
#define CONFIG_IP_VS_DH_TAB_BITS 8
#endif
#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS
#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS)
#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1)
/*
* Returns hash value for IPVS DH entry
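 *	(multiplicative hashing: 2654435761 is a prime close to 2^32/phi,
 *	 the golden-ratio constant suggested by Knuth)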
*/
static inline unsigned ip_vs_dh_hashkey(__u32 addr)
{
return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK;
}
/*
* Get ip_vs_dest associated with supplied parameters.
*/
static inline struct ip_vs_dest *
ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __u32 addr)
{
return (tbl[ip_vs_dh_hashkey(addr)]).dest;
}
/*
 *      Assign all the hash buckets of the specified table with
 *      destinations from the service.
*/
static int
ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
{
int i;
struct ip_vs_dh_bucket *b;
struct list_head *p;
struct ip_vs_dest *dest;
b = tbl;
p = &svc->destinations;
for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
if (list_empty(p)) {
b->dest = NULL;
} else {
if (p == &svc->destinations)
p = p->next;
dest = list_entry(p, struct ip_vs_dest, n_list);
atomic_inc(&dest->refcnt);
b->dest = dest;
p = p->next;
}
b++;
}
return 0;
}
/*
* Flush all the hash buckets of the specified table.
*/
static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
{
int i;
struct ip_vs_dh_bucket *b;
b = tbl;
for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
if (b->dest) {
atomic_dec(&b->dest->refcnt);
b->dest = NULL;
}
b++;
}
}
static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
{
struct ip_vs_dh_bucket *tbl;
/* allocate the DH table for this service */
tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE,
GFP_ATOMIC);
if (tbl == NULL) {
IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n");
return -ENOMEM;
}
svc->sched_data = tbl;
IP_VS_DBG(6, "DH hash table (memory=%dbytes) allocated for "
"current service\n",
sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
/* assign the hash buckets with the updated service */
ip_vs_dh_assign(tbl, svc);
return 0;
}
static int ip_vs_dh_done_svc(struct ip_vs_service *svc)
{
struct ip_vs_dh_bucket *tbl = svc->sched_data;
/* got to clean up hash buckets here */
ip_vs_dh_flush(tbl);
/* release the table itself */
kfree(svc->sched_data);
IP_VS_DBG(6, "DH hash table (memory=%dbytes) released\n",
sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
return 0;
}
static int ip_vs_dh_update_svc(struct ip_vs_service *svc)
{
struct ip_vs_dh_bucket *tbl = svc->sched_data;
/* got to clean up hash buckets here */
ip_vs_dh_flush(tbl);
/* assign the hash buckets with the updated service */
ip_vs_dh_assign(tbl, svc);
return 0;
}
/*
 *      If the dest flags include IP_VS_DEST_F_OVERLOAD,
 *      consider the server overloaded here.
*/
static inline int is_overloaded(struct ip_vs_dest *dest)
{
return dest->flags & IP_VS_DEST_F_OVERLOAD;
}
/*
* Destination hashing scheduling
*/
static struct ip_vs_dest *
ip_vs_dh_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
struct ip_vs_dest *dest;
struct ip_vs_dh_bucket *tbl;
IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n");
tbl = (struct ip_vs_dh_bucket *)svc->sched_data;
dest = ip_vs_dh_get(tbl, iph->daddr);
if (!dest
|| !(dest->flags & IP_VS_DEST_F_AVAILABLE)
|| atomic_read(&dest->weight) <= 0
|| is_overloaded(dest)) {
return NULL;
}
IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u "
"--> server %u.%u.%u.%u:%d\n",
NIPQUAD(iph->daddr),
NIPQUAD(dest->addr),
ntohs(dest->port));
return dest;
}
/*
* IPVS DH Scheduler structure
*/
static struct ip_vs_scheduler ip_vs_dh_scheduler =
{
.name = "dh",
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
.init_service = ip_vs_dh_init_svc,
.done_service = ip_vs_dh_done_svc,
.update_service = ip_vs_dh_update_svc,
.schedule = ip_vs_dh_schedule,
};
static int __init ip_vs_dh_init(void)
{
INIT_LIST_HEAD(&ip_vs_dh_scheduler.n_list);
return register_ip_vs_scheduler(&ip_vs_dh_scheduler);
}
static void __exit ip_vs_dh_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
}
module_init(ip_vs_dh_init);
module_exit(ip_vs_dh_cleanup);
MODULE_LICENSE("GPL");
/*
* ip_vs_est.c: simple rate estimator for IPVS
*
* Version: $Id: ip_vs_est.c,v 1.4 2002/11/30 01:50:35 wensong Exp $
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
*
*/
#include <linux/kernel.h>
#include <linux/types.h>
#include <net/ip_vs.h>
/*
  This code estimates the rate over a short interval (such as 8
  seconds) for virtual services and real servers. To measure the rate
  over a longer interval, it is easy to implement a user-level daemon
  which periodically reads those statistical counters and computes the
  rate. Currently, the measurement is driven by a slow timer handler;
  hopefully it does not introduce too much load.

  We measure the rate over the last 8 seconds, updating every 2 seconds:

                 avgrate = avgrate*(1-W) + rate*W

                 where W = 2^(-2)

  NOTES.

  * The stored value for the average bps is scaled by 2^5, so that the
    maximal rate is ~2.15Gbit/s; average pps and cps are scaled by 2^10.

  * A lot of code is taken from net/sched/estimator.c
*/
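/*
 * Illustrative sketch (not part of the original code): with W = 2^(-2),
 * the update  avgrate = avgrate*(1-W) + rate*W  is done in fixed point
 * roughly as
 *
 *	static u32 ewma_update(u32 avg_scaled, u32 rate_scaled)
 *	{
 *		return avg_scaled +
 *		       (((long)rate_scaled - (long)avg_scaled) >> 2);
 *	}
 *
 * where both values carry the 2^10 scale factor used for cps/pps.
 * For example, if the current estimate is 100 conns/s (100<<10) and
 * 300 new connections arrive during a 2-second tick (rate = 300<<9,
 * i.e. 150 conns/s scaled by 2^10), the new estimate is
 * 100 + (150-100)/4 = 112.5 conns/s, reported as 112 after the
 * (x+0x1FF)>>10 rounding in estimation_timer() below.
 */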
struct ip_vs_estimator
{
struct ip_vs_estimator *next;
struct ip_vs_stats *stats;
u32 last_conns;
u32 last_inpkts;
u32 last_outpkts;
u64 last_inbytes;
u64 last_outbytes;
u32 cps;
u32 inpps;
u32 outpps;
u32 inbps;
u32 outbps;
};
static struct ip_vs_estimator *est_list = NULL;
static rwlock_t est_lock = RW_LOCK_UNLOCKED;
static struct timer_list est_timer;
static void estimation_timer(unsigned long arg)
{
struct ip_vs_estimator *e;
struct ip_vs_stats *s;
u32 n_conns;
u32 n_inpkts, n_outpkts;
u64 n_inbytes, n_outbytes;
u32 rate;
read_lock(&est_lock);
for (e = est_list; e; e = e->next) {
s = e->stats;
n_conns = s->conns;
n_inpkts = s->inpkts;
n_outpkts = s->outpkts;
n_inbytes = s->inbytes;
n_outbytes = s->outbytes;
		/* scaled by 2^10, but divided by 2 seconds */
rate = (n_conns - e->last_conns)<<9;
e->last_conns = n_conns;
e->cps += ((long)rate - (long)e->cps)>>2;
s->cps = (e->cps+0x1FF)>>10;
rate = (n_inpkts - e->last_inpkts)<<9;
e->last_inpkts = n_inpkts;
e->inpps += ((long)rate - (long)e->inpps)>>2;
s->inpps = (e->inpps+0x1FF)>>10;
rate = (n_outpkts - e->last_outpkts)<<9;
e->last_outpkts = n_outpkts;
e->outpps += ((long)rate - (long)e->outpps)>>2;
s->outpps = (e->outpps+0x1FF)>>10;
rate = (n_inbytes - e->last_inbytes)<<4;
e->last_inbytes = n_inbytes;
e->inbps += ((long)rate - (long)e->inbps)>>2;
s->inbps = (e->inbps+0xF)>>5;
rate = (n_outbytes - e->last_outbytes)<<4;
e->last_outbytes = n_outbytes;
e->outbps += ((long)rate - (long)e->outbps)>>2;
s->outbps = (e->outbps+0xF)>>5;
}
read_unlock(&est_lock);
mod_timer(&est_timer, jiffies + 2*HZ);
}
int ip_vs_new_estimator(struct ip_vs_stats *stats)
{
struct ip_vs_estimator *est;
est = kmalloc(sizeof(*est), GFP_KERNEL);
if (est == NULL)
return -ENOMEM;
memset(est, 0, sizeof(*est));
est->stats = stats;
est->last_conns = stats->conns;
est->cps = stats->cps<<10;
est->last_inpkts = stats->inpkts;
est->inpps = stats->inpps<<10;
est->last_outpkts = stats->outpkts;
est->outpps = stats->outpps<<10;
est->last_inbytes = stats->inbytes;
est->inbps = stats->inbps<<5;
est->last_outbytes = stats->outbytes;
est->outbps = stats->outbps<<5;
write_lock_bh(&est_lock);
est->next = est_list;
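	/* first estimator on the list: start the 2-second estimation timer */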
if (est->next == NULL) {
init_timer(&est_timer);
est_timer.expires = jiffies + 2*HZ;
est_timer.function = estimation_timer;
add_timer(&est_timer);
}
est_list = est;
write_unlock_bh(&est_lock);
return 0;
}
void ip_vs_kill_estimator(struct ip_vs_stats *stats)
{
struct ip_vs_estimator *est, **pest;
int killed = 0;
write_lock_bh(&est_lock);
pest = &est_list;
while ((est=*pest) != NULL) {
if (est->stats != stats) {
pest = &est->next;
continue;
}
*pest = est->next;
kfree(est);
killed++;
}
if (killed && est_list == NULL)
del_timer_sync(&est_timer);
write_unlock_bh(&est_lock);
}
void ip_vs_zero_estimator(struct ip_vs_stats *stats)
{
struct ip_vs_estimator *e;
write_lock_bh(&est_lock);
for (e = est_list; e; e = e->next) {
if (e->stats != stats)
continue;
/* set counters zero */
e->last_conns = 0;
e->last_inpkts = 0;
e->last_outpkts = 0;
e->last_inbytes = 0;
e->last_outbytes = 0;
e->cps = 0;
e->inpps = 0;
e->outpps = 0;
e->inbps = 0;
e->outbps = 0;
}
write_unlock_bh(&est_lock);
}
/*
* ip_vs_ftp.c: IPVS ftp application module
*
* Version: $Id: ip_vs_ftp.c,v 1.13 2002/09/15 08:14:08 wensong Exp $
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
* Changes:
*
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference
* is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp.
*
* IP_MASQ_FTP ftp masquerading module
*
* Version: @(#)ip_masq_ftp.c 0.04 02/05/96
*
* Author: Wouter Gadeyne
*
*/
#include <linux/config.h>
#include <linux/module.h>
#include <asm/system.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/init.h>
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/ip_vs.h>
#define SERVER_STRING "227 Entering Passive Mode ("
#define CLIENT_STRING "PORT "
/*
 * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by the helper.
 * The first port is set to the default port.
*/
static int ports[IP_VS_APP_MAX_PORTS] = {21, 0};
/*
* Debug level
*/
#ifdef CONFIG_IP_VS_DEBUG
static int debug=0;
MODULE_PARM(debug, "i");
#endif
MODULE_PARM(ports, "1-" __MODULE_STRING(IP_VS_APP_MAX_PORTS) "i");
/* Dummy variable */
static int ip_vs_ftp_pasv;
static int
ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
{
return 0;
}
static int
ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
{
return 0;
}
/*
 * Get <addr,port> from the string "xxx,xxx,xxx,xxx,ppp,ppp", starting
 * with the "pattern" and terminated with the "term" character.
* <addr,port> is in network order.
*/
static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
const char *pattern, size_t plen, char term,
__u32 *addr, __u16 *port,
char **start, char **end)
{
unsigned char p1,p2,p3,p4,p5,p6;
while (data < data_limit) {
if (strnicmp(data, pattern, plen) != 0) {
data++;
continue;
}
*start = data+plen;
p1 = simple_strtoul(data+plen, &data, 10);
if (*data != ',')
continue;
p2 = simple_strtoul(data+1, &data, 10);
if (*data != ',')
continue;
p3 = simple_strtoul(data+1, &data, 10);
if (*data != ',')
continue;
p4 = simple_strtoul(data+1, &data, 10);
if (*data != ',')
continue;
p5 = simple_strtoul(data+1, &data, 10);
if (*data != ',')
continue;
p6 = simple_strtoul(data+1, &data, 10);
if (*data != term)
continue;
*end = data;
*addr = (p4<<24) | (p3<<16) | (p2<<8) | p1;
*port = (p6<<8) | p5;
return 1;
}
return 0;
}
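/*
 * Example (illustrative only): for the server reply
 *	"227 Entering Passive Mode (192,168,0,10,4,1)"
 * called with pattern SERVER_STRING and term ')', this returns
 * addr 192.168.0.10 and port 4*256+1 = 1025 (both in network order),
 * with *start/*end delimiting the "192,168,0,10,4,1" substring.
 */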
/*
* Look at outgoing ftp packets to catch the response to a PASV command
* from the server (inside-to-outside).
* When we see one, we build a connection entry with the client address,
* client port 0 (unknown at the moment), the server address and the
* server port. Mark the current connection entry as a control channel
 * of the new entry. All this work is just so that the data connection
 * can be scheduled to the right server later.
*
* The outgoing packet should be something like
* "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
* xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
*/
static int ip_vs_ftp_out(struct ip_vs_app *app,
struct ip_vs_conn *cp, struct sk_buff *skb)
{
struct iphdr *iph;
struct tcphdr *th;
char *data, *data_limit;
char *start, *end;
__u32 from;
__u16 port;
struct ip_vs_conn *n_cp;
	char buf[24];		/* xxx,xxx,xxx,xxx,ppp,ppp\000 */
unsigned buf_len;
int diff;
/* Only useful for established sessions */
if (cp->state != IP_VS_TCP_S_ESTABLISHED)
return 0;
if (cp->app_data == &ip_vs_ftp_pasv) {
iph = skb->nh.iph;
th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
data = (char *)th + (th->doff << 2);
data_limit = skb->tail;
if (ip_vs_ftp_get_addrport(data, data_limit,
SERVER_STRING,
sizeof(SERVER_STRING)-1, ')',
&from, &port,
&start, &end) == 0)
return 0;
IP_VS_DBG(1-debug, "PASV response (%u.%u.%u.%u:%d) -> "
"%u.%u.%u.%u:%d detected\n",
NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr), 0);
/*
		 * Now update or create a connection entry for it
*/
n_cp = ip_vs_conn_out_get(iph->protocol, from, port,
cp->caddr, 0);
if (!n_cp) {
n_cp = ip_vs_conn_new(IPPROTO_TCP,
cp->caddr, 0,
cp->vaddr, port,
from, port,
IP_VS_CONN_F_NO_CPORT,
cp->dest);
if (!n_cp)
return 0;
/* add its controller */
ip_vs_control_add(n_cp, cp);
}
/*
* Replace the old passive address with the new one
*/
from = n_cp->vaddr;
port = n_cp->vport;
sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from),
port&255, port>>8&255);
buf_len = strlen(buf);
/*
* Calculate required delta-offset to keep TCP happy
*/
diff = buf_len - (end-start);
if (diff == 0) {
/* simply replace it with new passive address */
memcpy(start, buf, buf_len);
} else {
/* fixme: return value isn't checked here */
ip_vs_skb_replace(skb, GFP_ATOMIC, start,
end-start, buf, buf_len);
}
cp->app_data = NULL;
ip_vs_tcp_conn_listen(n_cp);
ip_vs_conn_put(n_cp);
return diff;
}
return 0;
}
/*
* Look at incoming ftp packets to catch the PASV/PORT command
* (outside-to-inside).
*
* The incoming packet having the PORT command should be something like
* "PORT xxx,xxx,xxx,xxx,ppp,ppp\n".
* xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number.
* In this case, we create a connection entry using the client address and
* port, so that the active ftp data connection from the server can reach
* the client.
*/
static int ip_vs_ftp_in(struct ip_vs_app *app,
struct ip_vs_conn *cp, struct sk_buff *skb)
{
struct iphdr *iph;
struct tcphdr *th;
char *data, *data_start, *data_limit;
char *start, *end;
__u32 to;
__u16 port;
struct ip_vs_conn *n_cp;
/* Only useful for established sessions */
if (cp->state != IP_VS_TCP_S_ESTABLISHED)
return 0;
/*
* Detecting whether it is passive
*/
iph = skb->nh.iph;
th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
	/* Since there may be OPTIONS in the TCP packet and HLEN gives
	   the header length in 32-bit words, the data address is
	   correctly computed as th + HLEN*4 */
data = data_start = (char *)th + (th->doff << 2);
data_limit = skb->tail;
while (data < data_limit) {
if (strnicmp(data, "PASV\r\n", 6) == 0) {
IP_VS_DBG(1-debug, "got PASV at %d of %d\n",
data - data_start,
data_limit - data_start);
cp->app_data = &ip_vs_ftp_pasv;
return 0;
}
data++;
}
/*
	 * To support a virtual FTP server, the scenario is as follows:
	 *       FTP client ----> Load Balancer ----> FTP server
	 * First detect the port number in the application data,
	 * then create a new connection entry for the incoming data
	 * connection.
*/
data = data_start;
data_limit = skb->h.raw + skb->len - 18;
if (ip_vs_ftp_get_addrport(data, data_limit,
CLIENT_STRING, sizeof(CLIENT_STRING)-1,
'\r', &to, &port,
&start, &end) == 0)
return 0;
IP_VS_DBG(1-debug, "PORT %u.%u.%u.%u:%d detected\n",
NIPQUAD(to), ntohs(port));
/*
* Now update or create a connection entry for it
*/
IP_VS_DBG(1-debug, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n",
ip_vs_proto_name(iph->protocol),
NIPQUAD(to), ntohs(port), NIPQUAD(iph->daddr), 0);
n_cp = ip_vs_conn_in_get(iph->protocol,
to, port,
iph->daddr, htons(ntohs(cp->vport)-1));
if (!n_cp) {
n_cp = ip_vs_conn_new(IPPROTO_TCP,
to, port,
cp->vaddr, htons(ntohs(cp->vport)-1),
cp->daddr, htons(ntohs(cp->dport)-1),
0,
cp->dest);
if (!n_cp)
return 0;
/* add its controller */
ip_vs_control_add(n_cp, cp);
}
/*
	 *	Move the new connection to listen state
*/
ip_vs_tcp_conn_listen(n_cp);
ip_vs_conn_put(n_cp);
/* no diff required for incoming packets */
return 0;
}
static struct ip_vs_app ip_vs_ftp = {
.name = "ftp",
.type = IP_VS_APP_TYPE_FTP,
.protocol = IPPROTO_TCP,
.module = THIS_MODULE,
.incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list),
.init_conn = ip_vs_ftp_init_conn,
.done_conn = ip_vs_ftp_done_conn,
.bind_conn = NULL,
.unbind_conn = NULL,
.pkt_out = ip_vs_ftp_out,
.pkt_in = ip_vs_ftp_in,
};
/*
* ip_vs_ftp initialization
*/
static int __init ip_vs_ftp_init(void)
{
int i, ret;
struct ip_vs_app *app = &ip_vs_ftp;
ret = register_ip_vs_app(app);
if (ret)
return ret;
for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
if (!ports[i])
continue;
ret = register_ip_vs_app_inc(app, app->protocol, ports[i]);
if (ret)
break;
IP_VS_DBG(1-debug, "%s: loaded support on port[%d] = %d\n",
app->name, i, ports[i]);
}
if (ret)
unregister_ip_vs_app(app);
return ret;
}
/*
* ip_vs_ftp finish.
*/
static void __exit ip_vs_ftp_exit(void)
{
unregister_ip_vs_app(&ip_vs_ftp);
}
module_init(ip_vs_ftp_init);
module_exit(ip_vs_ftp_exit);
MODULE_LICENSE("GPL");
/*
* IPVS: Locality-Based Least-Connection scheduling module
*
* Version: $Id: ip_vs_lblc.c,v 1.10 2002/09/15 08:14:08 wensong Exp $
*
* Authors: Wensong Zhang <wensong@gnuchina.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
* Martin Hamilton : fixed the terrible locking bugs
* *lock(tbl->lock) ==> *lock(&tbl->lock)
 *     Wensong Zhang            :    fixed the uninitialized tbl->lock bug
* Wensong Zhang : added doing full expiration check to
* collect stale entries of 24+ hours when
* no partial expire check in a half hour
* Julian Anastasov : replaced del_timer call with del_timer_sync
* to avoid the possible race between timer
* handler and del_timer thread in SMP
*
*/
/*
* The lblc algorithm is as follows (pseudo code):
*
* if cachenode[dest_ip] is null then
* n, cachenode[dest_ip] <- {weighted least-conn node};
* else
* n <- cachenode[dest_ip];
* if (n is dead) OR
* (n.conns>n.weight AND
* there is a node m with m.conns<m.weight/2) then
* n, cachenode[dest_ip] <- {weighted least-conn node};
*
* return n;
*
* Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
* me to write this module.
*/
#include <linux/config.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
/* for sysctl */
#include <linux/fs.h>
#include <linux/sysctl.h>
#include <net/ip_vs.h>
/*
* It is for garbage collection of stale IPVS lblc entries,
* when the table is full.
*/
#define CHECK_EXPIRE_INTERVAL (60*HZ)
#define ENTRY_TIMEOUT (6*60*HZ)
/*
* It is for full expiration check.
* When there is no partial expiration check (garbage collection)
* in a half hour, do a full expiration check to collect stale
* entries that haven't been touched for a day.
*/
#define COUNT_FOR_FULL_EXPIRATION 30
int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
/*
* for IPVS lblc entry hash table
*/
#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
#define CONFIG_IP_VS_LBLC_TAB_BITS 10
#endif
#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS
#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS)
#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1)
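/*
 * With the default of 10 bits this is a 1024-bucket table; the
 * max_size set in ip_vs_lblc_init_svc() (IP_VS_LBLC_TAB_SIZE*16) then
 * allows roughly 1024*16 = 16384 cached entries before the periodic
 * timer starts trimming them.
 */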
/*
* IPVS lblc entry represents an association between destination
* IP address and its destination server
*/
struct ip_vs_lblc_entry {
struct list_head list;
__u32 addr; /* destination IP address */
struct ip_vs_dest *dest; /* real server (cache) */
unsigned long lastuse; /* last used time */
};
/*
* IPVS lblc hash table
*/
struct ip_vs_lblc_table {
rwlock_t lock; /* lock for this table */
struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
atomic_t entries; /* number of entries */
int max_size; /* maximum size of entries */
struct timer_list periodic_timer; /* collect stale entries */
int rover; /* rover for expire check */
int counter; /* counter for no expire */
};
/*
* IPVS LBLC sysctl table
*/
struct ip_vs_lblc_sysctl_table {
struct ctl_table_header *sysctl_header;
ctl_table vs_vars[2];
ctl_table vs_dir[2];
ctl_table ipv4_dir[2];
ctl_table root_dir[2];
};
static struct ip_vs_lblc_sysctl_table lblc_sysctl_table = {
NULL,
{{NET_IPV4_VS_LBLC_EXPIRE, "lblc_expiration",
&sysctl_ip_vs_lblc_expiration,
sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
{0}},
{{NET_IPV4_VS, "vs", NULL, 0, 0555, lblc_sysctl_table.vs_vars},
{0}},
{{NET_IPV4, "ipv4", NULL, 0, 0555, lblc_sysctl_table.vs_dir},
{0}},
{{CTL_NET, "net", NULL, 0, 0555, lblc_sysctl_table.ipv4_dir},
{0}}
};
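/*
 * Note: with the nesting above (net -> ipv4 -> vs -> lblc_expiration),
 * the variable is expected to appear as
 * /proc/sys/net/ipv4/vs/lblc_expiration once registered; since the
 * handler is proc_dointvec_jiffies, the value is stored in jiffies but
 * read and written from user space in seconds.
 */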
/*
 *      new/free an ip_vs_lblc_entry, which is a mapping of a destination
* IP address to a server.
*/
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_new(__u32 daddr, struct ip_vs_dest *dest)
{
struct ip_vs_lblc_entry *en;
en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC);
if (en == NULL) {
IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
return NULL;
}
INIT_LIST_HEAD(&en->list);
en->addr = daddr;
atomic_inc(&dest->refcnt);
en->dest = dest;
return en;
}
static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
{
list_del(&en->list);
/*
	 * We don't kfree dest because it is referenced either by its service
* or the trash dest list.
*/
atomic_dec(&en->dest->refcnt);
kfree(en);
}
/*
* Returns hash value for IPVS LBLC entry
*/
static inline unsigned ip_vs_lblc_hashkey(__u32 addr)
{
return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
}
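/*
 * The multiplier 2654435761 is the golden-ratio constant commonly used
 * for Knuth-style multiplicative hashing (roughly 2^32 / phi), so the
 * 32-bit product scatters nearby addresses across the table; the low
 * IP_VS_LBLC_TAB_BITS bits of that product are then kept by the mask.
 */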
/*
* Hash an entry in the ip_vs_lblc_table.
* returns bool success.
*/
static int
ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
{
unsigned hash;
if (!list_empty(&en->list)) {
IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, "
"called from %p\n", __builtin_return_address(0));
return 0;
}
/*
* Hash by destination IP address
*/
hash = ip_vs_lblc_hashkey(en->addr);
write_lock(&tbl->lock);
list_add(&en->list, &tbl->bucket[hash]);
atomic_inc(&tbl->entries);
write_unlock(&tbl->lock);
return 1;
}
#if 0000
/*
* Unhash ip_vs_lblc_entry from ip_vs_lblc_table.
* returns bool success.
*/
static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl,
struct ip_vs_lblc_entry *en)
{
if (list_empty(&en->list)) {
IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, "
"called from %p\n", __builtin_return_address(0));
return 0;
}
/*
* Remove it from the table
*/
write_lock(&tbl->lock);
list_del(&en->list);
INIT_LIST_HEAD(&en->list);
write_unlock(&tbl->lock);
return 1;
}
#endif
/*
* Get ip_vs_lblc_entry associated with supplied parameters.
*/
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __u32 addr)
{
unsigned hash;
struct ip_vs_lblc_entry *en;
struct list_head *l,*e;
hash = ip_vs_lblc_hashkey(addr);
l = &tbl->bucket[hash];
read_lock(&tbl->lock);
for (e=l->next; e!=l; e=e->next) {
en = list_entry(e, struct ip_vs_lblc_entry, list);
if (en->addr == addr) {
/* HIT */
read_unlock(&tbl->lock);
return en;
}
}
read_unlock(&tbl->lock);
return NULL;
}
/*
* Flush all the entries of the specified table.
*/
static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
{
int i;
struct list_head *l;
struct ip_vs_lblc_entry *en;
for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
write_lock(&tbl->lock);
for (l=&tbl->bucket[i]; l->next!=l; ) {
en = list_entry(l->next,
struct ip_vs_lblc_entry, list);
ip_vs_lblc_free(en);
atomic_dec(&tbl->entries);
}
write_unlock(&tbl->lock);
}
}
static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
{
unsigned long now = jiffies;
int i, j;
struct list_head *l, *e;
struct ip_vs_lblc_entry *en;
for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLC_TAB_MASK;
e = l = &tbl->bucket[j];
write_lock(&tbl->lock);
while (e->next != l) {
en = list_entry(e->next,
struct ip_vs_lblc_entry, list);
if ((now - en->lastuse) <
sysctl_ip_vs_lblc_expiration) {
e = e->next;
continue;
}
ip_vs_lblc_free(en);
atomic_dec(&tbl->entries);
}
write_unlock(&tbl->lock);
}
tbl->rover = j;
}
/*
 *      Periodic timer handler for the IPVS lblc table
* It is used to collect stale entries when the number of entries
* exceeds the maximum size of the table.
*
 *      Fixme: we probably need a more complicated algorithm to collect
* entries that have not been used for a long time even
* if the number of entries doesn't exceed the maximum size
* of the table.
* The full expiration check is for this purpose now.
*/
static void ip_vs_lblc_check_expire(unsigned long data)
{
struct ip_vs_lblc_table *tbl;
unsigned long now = jiffies;
int goal;
int i, j;
struct list_head *l, *e;
struct ip_vs_lblc_entry *en;
tbl = (struct ip_vs_lblc_table *)data;
if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
/* do full expiration check */
ip_vs_lblc_full_check(tbl);
tbl->counter = 1;
goto out;
}
if (atomic_read(&tbl->entries) <= tbl->max_size) {
tbl->counter++;
goto out;
}
goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
if (goal > tbl->max_size/2)
goal = tbl->max_size/2;
for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLC_TAB_MASK;
e = l = &tbl->bucket[j];
write_lock(&tbl->lock);
while (e->next != l) {
en = list_entry(e->next,
struct ip_vs_lblc_entry, list);
if ((now - en->lastuse) < ENTRY_TIMEOUT) {
e = e->next;
continue;
}
ip_vs_lblc_free(en);
atomic_dec(&tbl->entries);
goal--;
}
write_unlock(&tbl->lock);
if (goal <= 0)
break;
}
tbl->rover = j;
out:
mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
}
static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
{
int i;
struct ip_vs_lblc_table *tbl;
/*
* Allocate the ip_vs_lblc_table for this service
*/
tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC);
if (tbl == NULL) {
IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
return -ENOMEM;
}
svc->sched_data = tbl;
IP_VS_DBG(6, "LBLC hash table (memory=%dbytes) allocated for "
"current service\n",
		  (int)sizeof(struct ip_vs_lblc_table));
/*
* Initialize the hash buckets
*/
for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
INIT_LIST_HEAD(&tbl->bucket[i]);
}
tbl->lock = RW_LOCK_UNLOCKED;
tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
tbl->rover = 0;
tbl->counter = 1;
/*
* Hook periodic timer for garbage collection
*/
init_timer(&tbl->periodic_timer);
tbl->periodic_timer.data = (unsigned long)tbl;
tbl->periodic_timer.function = ip_vs_lblc_check_expire;
tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
add_timer(&tbl->periodic_timer);
return 0;
}
static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
{
struct ip_vs_lblc_table *tbl = svc->sched_data;
/* remove periodic timer */
del_timer_sync(&tbl->periodic_timer);
/* got to clean up table entries here */
ip_vs_lblc_flush(tbl);
/* release the table itself */
kfree(svc->sched_data);
IP_VS_DBG(6, "LBLC hash table (memory=%dbytes) released\n",
		  (int)sizeof(struct ip_vs_lblc_table));
return 0;
}
static int ip_vs_lblc_update_svc(struct ip_vs_service *svc)
{
return 0;
}
static inline struct ip_vs_dest *
__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
register struct list_head *l, *e;
struct ip_vs_dest *dest, *least;
int loh, doh;
/*
	 * We think the overhead of processing active connections is fifty
	 * times higher than that of inactive connections on average. (This
	 * factor of fifty may not be accurate; we may change it later.) We
* use the following formula to estimate the overhead:
* dest->activeconns*50 + dest->inactconns
* and the load:
* (dest overhead) / dest->weight
*
* Remember -- no floats in kernel mode!!!
* The comparison of h1*w2 > h2*w1 is equivalent to that of
* h1/w1 > h2/w2
* if every weight is larger than zero.
*
* The server with weight=0 is quiesced and will not receive any
* new connection.
*/
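	/*
	 * Illustrative numbers only: a current pick with 10 active and
	 * 5 inactive connections and weight 2 has overhead
	 * 10*50 + 5 = 505; a candidate with 4 active, 0 inactive and
	 * weight 1 has overhead 200.  The test 505*1 > 200*2 holds, which
	 * matches 505/2 > 200/1, so the candidate becomes the new least.
	 */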
l = &svc->destinations;
for (e=l->next; e!=l; e=e->next) {
least = list_entry(e, struct ip_vs_dest, n_list);
if (least->flags & IP_VS_DEST_F_OVERLOAD)
continue;
if (atomic_read(&least->weight) > 0) {
loh = atomic_read(&least->activeconns) * 50
+ atomic_read(&least->inactconns);
goto nextstage;
}
}
return NULL;
/*
* Find the destination with the least load.
*/
nextstage:
for (e=e->next; e!=l; e=e->next) {
dest = list_entry(e, struct ip_vs_dest, n_list);
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
doh = atomic_read(&dest->activeconns) * 50
+ atomic_read(&dest->inactconns);
if (loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight)) {
least = dest;
loh = doh;
}
}
IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
"activeconns %d refcnt %d weight %d overhead %d\n",
NIPQUAD(least->addr), ntohs(least->port),
atomic_read(&least->activeconns),
atomic_read(&least->refcnt),
atomic_read(&least->weight), loh);
return least;
}
/*
* If this destination server is overloaded and there is a less loaded
* server, then return true.
*/
static inline int
is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
register struct list_head *l, *e;
struct ip_vs_dest *d;
l = &svc->destinations;
for (e=l->next; e!=l; e=e->next) {
d = list_entry(e, struct ip_vs_dest, n_list);
if (atomic_read(&d->activeconns)*2
< atomic_read(&d->weight)) {
return 1;
}
}
}
return 0;
}
/*
* Locality-Based (weighted) Least-Connection scheduling
*/
static struct ip_vs_dest *
ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
struct ip_vs_dest *dest;
struct ip_vs_lblc_table *tbl;
struct ip_vs_lblc_entry *en;
IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
tbl = (struct ip_vs_lblc_table *)svc->sched_data;
en = ip_vs_lblc_get(tbl, iph->daddr);
if (en == NULL) {
dest = __ip_vs_wlc_schedule(svc, iph);
if (dest == NULL) {
IP_VS_DBG(1, "no destination available\n");
return NULL;
}
en = ip_vs_lblc_new(iph->daddr, dest);
if (en == NULL) {
return NULL;
}
ip_vs_lblc_hash(tbl, en);
} else {
dest = en->dest;
if (!(dest->flags & IP_VS_DEST_F_AVAILABLE)
|| atomic_read(&dest->weight) <= 0
|| is_overloaded(dest, svc)) {
dest = __ip_vs_wlc_schedule(svc, iph);
if (dest == NULL) {
IP_VS_DBG(1, "no destination available\n");
return NULL;
}
atomic_dec(&en->dest->refcnt);
atomic_inc(&dest->refcnt);
en->dest = dest;
}
}
en->lastuse = jiffies;
IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
"--> server %u.%u.%u.%u:%d\n",
NIPQUAD(en->addr),
NIPQUAD(dest->addr),
ntohs(dest->port));
return dest;
}
/*
* IPVS LBLC Scheduler structure
*/
static struct ip_vs_scheduler ip_vs_lblc_scheduler =
{
.name = "lblc",
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
.init_service = ip_vs_lblc_init_svc,
.done_service = ip_vs_lblc_done_svc,
.update_service = ip_vs_lblc_update_svc,
.schedule = ip_vs_lblc_schedule,
};
static int __init ip_vs_lblc_init(void)
{
INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list);
lblc_sysctl_table.sysctl_header =
register_sysctl_table(lblc_sysctl_table.root_dir, 0);
return register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
}
static void __exit ip_vs_lblc_cleanup(void)
{
unregister_sysctl_table(lblc_sysctl_table.sysctl_header);
unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
}
module_init(ip_vs_lblc_init);
module_exit(ip_vs_lblc_cleanup);
MODULE_LICENSE("GPL");
/*
* IPVS: Locality-Based Least-Connection with Replication scheduler
*
* Version: $Id: ip_vs_lblcr.c,v 1.11 2002/09/15 08:14:08 wensong Exp $
*
* Authors: Wensong Zhang <wensong@gnuchina.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
* Julian Anastasov : Added the missing (dest->weight>0)
* condition in the ip_vs_dest_set_max.
*
*/
/*
* The lblc/r algorithm is as follows (pseudo code):
*
* if serverSet[dest_ip] is null then
* n, serverSet[dest_ip] <- {weighted least-conn node};
* else
* n <- {least-conn (alive) node in serverSet[dest_ip]};
* if (n is null) OR
* (n.conns>n.weight AND
* there is a node m with m.conns<m.weight/2) then
* n <- {weighted least-conn node};
* add n to serverSet[dest_ip];
* if |serverSet[dest_ip]| > 1 AND
* now - serverSet[dest_ip].lastMod > T then
* m <- {most conn node in serverSet[dest_ip]};
* remove m from serverSet[dest_ip];
* if serverSet[dest_ip] changed then
* serverSet[dest_ip].lastMod <- now;
*
* return n;
*
*/
#include <linux/config.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
/* for sysctl */
#include <linux/fs.h>
#include <linux/sysctl.h>
/* for proc_net_create/proc_net_remove */
#include <linux/proc_fs.h>
#include <net/ip_vs.h>
/*
* It is for garbage collection of stale IPVS lblcr entries,
* when the table is full.
*/
#define CHECK_EXPIRE_INTERVAL (60*HZ)
#define ENTRY_TIMEOUT (6*60*HZ)
/*
* It is for full expiration check.
* When there is no partial expiration check (garbage collection)
* in a half hour, do a full expiration check to collect stale
* entries that haven't been touched for a day.
*/
#define COUNT_FOR_FULL_EXPIRATION 30
int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
/*
* for IPVS lblcr entry hash table
*/
#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
#define CONFIG_IP_VS_LBLCR_TAB_BITS 10
#endif
#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS
#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS)
#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1)
/*
* IPVS destination set structure and operations
*/
struct ip_vs_dest_list {
struct ip_vs_dest_list *next; /* list link */
struct ip_vs_dest *dest; /* destination server */
};
struct ip_vs_dest_set {
atomic_t size; /* set size */
unsigned long lastmod; /* last modified time */
struct ip_vs_dest_list *list; /* destination list */
rwlock_t lock; /* lock for this list */
};
static struct ip_vs_dest_list *
ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
{
struct ip_vs_dest_list *e;
for (e=set->list; e!=NULL; e=e->next) {
if (e->dest == dest)
/* already existed */
return NULL;
}
e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
if (e == NULL) {
IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
return NULL;
}
atomic_inc(&dest->refcnt);
e->dest = dest;
/* link it to the list */
write_lock(&set->lock);
e->next = set->list;
set->list = e;
atomic_inc(&set->size);
write_unlock(&set->lock);
set->lastmod = jiffies;
return e;
}
static void
ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
{
struct ip_vs_dest_list *e, **ep;
write_lock(&set->lock);
for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
if (e->dest == dest) {
/* HIT */
*ep = e->next;
atomic_dec(&set->size);
set->lastmod = jiffies;
atomic_dec(&e->dest->refcnt);
kfree(e);
break;
}
ep = &e->next;
}
write_unlock(&set->lock);
}
static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
{
struct ip_vs_dest_list *e, **ep;
write_lock(&set->lock);
for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
*ep = e->next;
/*
		 * We don't kfree dest because it is referenced either
* by its service or by the trash dest list.
*/
atomic_dec(&e->dest->refcnt);
kfree(e);
}
write_unlock(&set->lock);
}
/* get weighted least-connection node in the destination set */
static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
{
register struct ip_vs_dest_list *e;
struct ip_vs_dest *dest, *least;
int loh, doh;
if (set == NULL)
return NULL;
read_lock(&set->lock);
	/* select the first destination server whose weight > 0 */
for (e=set->list; e!=NULL; e=e->next) {
least = e->dest;
if (least->flags & IP_VS_DEST_F_OVERLOAD)
continue;
if ((atomic_read(&least->weight) > 0)
&& (least->flags & IP_VS_DEST_F_AVAILABLE)) {
loh = atomic_read(&least->activeconns) * 50
+ atomic_read(&least->inactconns);
goto nextstage;
}
}
read_unlock(&set->lock);
return NULL;
/* find the destination with the weighted least load */
nextstage:
for (e=e->next; e!=NULL; e=e->next) {
dest = e->dest;
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
doh = atomic_read(&dest->activeconns) * 50
+ atomic_read(&dest->inactconns);
if ((loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight))
&& (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
least = dest;
loh = doh;
}
}
read_unlock(&set->lock);
IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
"activeconns %d refcnt %d weight %d overhead %d\n",
NIPQUAD(least->addr), ntohs(least->port),
atomic_read(&least->activeconns),
atomic_read(&least->refcnt),
atomic_read(&least->weight), loh);
return least;
}
/* get weighted most-connection node in the destination set */
static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
{
register struct ip_vs_dest_list *e;
struct ip_vs_dest *dest, *most;
int moh, doh;
if (set == NULL)
return NULL;
read_lock(&set->lock);
	/* select the first destination server whose weight > 0 */
for (e=set->list; e!=NULL; e=e->next) {
most = e->dest;
if (atomic_read(&most->weight) > 0) {
moh = atomic_read(&most->activeconns) * 50
+ atomic_read(&most->inactconns);
goto nextstage;
}
}
read_unlock(&set->lock);
return NULL;
/* find the destination with the weighted most load */
nextstage:
for (e=e->next; e!=NULL; e=e->next) {
dest = e->dest;
doh = atomic_read(&dest->activeconns) * 50
+ atomic_read(&dest->inactconns);
/* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
if ((moh * atomic_read(&dest->weight) <
doh * atomic_read(&most->weight))
&& (atomic_read(&dest->weight) > 0)) {
most = dest;
moh = doh;
}
}
read_unlock(&set->lock);
IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
"activeconns %d refcnt %d weight %d overhead %d\n",
NIPQUAD(most->addr), ntohs(most->port),
atomic_read(&most->activeconns),
atomic_read(&most->refcnt),
atomic_read(&most->weight), moh);
return most;
}
/*
* IPVS lblcr entry represents an association between destination
* IP address and its destination server set
*/
struct ip_vs_lblcr_entry {
struct list_head list;
__u32 addr; /* destination IP address */
struct ip_vs_dest_set set; /* destination server set */
unsigned long lastuse; /* last used time */
};
/*
* IPVS lblcr hash table
*/
struct ip_vs_lblcr_table {
rwlock_t lock; /* lock for this table */
struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
atomic_t entries; /* number of entries */
int max_size; /* maximum size of entries */
struct timer_list periodic_timer; /* collect stale entries */
int rover; /* rover for expire check */
int counter; /* counter for no expire */
};
/*
* IPVS LBLCR sysctl table
*/
struct ip_vs_lblcr_sysctl_table {
struct ctl_table_header *sysctl_header;
ctl_table vs_vars[2];
ctl_table vs_dir[2];
ctl_table ipv4_dir[2];
ctl_table root_dir[2];
};
static struct ip_vs_lblcr_sysctl_table lblcr_sysctl_table = {
NULL,
{{NET_IPV4_VS_LBLCR_EXPIRE, "lblcr_expiration",
&sysctl_ip_vs_lblcr_expiration,
sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
{0}},
{{NET_IPV4_VS, "vs", NULL, 0, 0555, lblcr_sysctl_table.vs_vars},
{0}},
{{NET_IPV4, "ipv4", NULL, 0, 0555, lblcr_sysctl_table.vs_dir},
{0}},
{{CTL_NET, "net", NULL, 0, 0555, lblcr_sysctl_table.ipv4_dir},
{0}}
};
/*
 *      new/free an ip_vs_lblcr_entry, which is a mapping of a destination
* IP address to a server.
*/
static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__u32 daddr)
{
struct ip_vs_lblcr_entry *en;
en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC);
if (en == NULL) {
IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
return NULL;
}
INIT_LIST_HEAD(&en->list);
en->addr = daddr;
	/* initialize its dest set */
atomic_set(&(en->set.size), 0);
en->set.list = NULL;
en->set.lock = RW_LOCK_UNLOCKED;
return en;
}
static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
{
list_del(&en->list);
ip_vs_dest_set_eraseall(&en->set);
kfree(en);
}
/*
* Returns hash value for IPVS LBLCR entry
*/
static inline unsigned ip_vs_lblcr_hashkey(__u32 addr)
{
return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
}
/*
* Hash an entry in the ip_vs_lblcr_table.
* returns bool success.
*/
static int
ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
{
unsigned hash;
if (!list_empty(&en->list)) {
IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, "
"called from %p\n", __builtin_return_address(0));
return 0;
}
/*
* Hash by destination IP address
*/
hash = ip_vs_lblcr_hashkey(en->addr);
write_lock(&tbl->lock);
list_add(&en->list, &tbl->bucket[hash]);
atomic_inc(&tbl->entries);
write_unlock(&tbl->lock);
return 1;
}
#if 0000
/*
* Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table.
* returns bool success.
*/
static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl,
struct ip_vs_lblcr_entry *en)
{
if (list_empty(&en->list)) {
IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, "
"called from %p\n", __builtin_return_address(0));
return 0;
}
/*
* Remove it from the table
*/
write_lock(&tbl->lock);
list_del(&en->list);
INIT_LIST_HEAD(&en->list);
write_unlock(&tbl->lock);
return 1;
}
#endif
/*
* Get ip_vs_lblcr_entry associated with supplied parameters.
*/
static inline struct ip_vs_lblcr_entry *
ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __u32 addr)
{
unsigned hash;
struct ip_vs_lblcr_entry *en;
struct list_head *l,*e;
hash = ip_vs_lblcr_hashkey(addr);
l = &tbl->bucket[hash];
read_lock(&tbl->lock);
for (e=l->next; e!=l; e=e->next) {
en = list_entry(e, struct ip_vs_lblcr_entry, list);
if (en->addr == addr) {
/* HIT */
read_unlock(&tbl->lock);
return en;
}
}
read_unlock(&tbl->lock);
return NULL;
}
/*
* Flush all the entries of the specified table.
*/
static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
{
int i;
struct list_head *l;
struct ip_vs_lblcr_entry *en;
for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
write_lock(&tbl->lock);
for (l=&tbl->bucket[i]; l->next!=l; ) {
en = list_entry(l->next,
struct ip_vs_lblcr_entry, list);
ip_vs_lblcr_free(en);
atomic_dec(&tbl->entries);
}
write_unlock(&tbl->lock);
}
}
static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
{
unsigned long now = jiffies;
int i, j;
struct list_head *l, *e;
struct ip_vs_lblcr_entry *en;
for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
e = l = &tbl->bucket[j];
write_lock(&tbl->lock);
while (e->next != l) {
en = list_entry(e->next,
struct ip_vs_lblcr_entry, list);
if ((now - en->lastuse) <
sysctl_ip_vs_lblcr_expiration) {
e = e->next;
continue;
}
ip_vs_lblcr_free(en);
atomic_dec(&tbl->entries);
}
write_unlock(&tbl->lock);
}
tbl->rover = j;
}
/*
 *      Periodic timer handler for the IPVS lblcr table
* It is used to collect stale entries when the number of entries
* exceeds the maximum size of the table.
*
 *      Fixme: we probably need a more complicated algorithm to collect
* entries that have not been used for a long time even
* if the number of entries doesn't exceed the maximum size
* of the table.
* The full expiration check is for this purpose now.
*/
static void ip_vs_lblcr_check_expire(unsigned long data)
{
struct ip_vs_lblcr_table *tbl;
unsigned long now = jiffies;
int goal;
int i, j;
struct list_head *l, *e;
struct ip_vs_lblcr_entry *en;
tbl = (struct ip_vs_lblcr_table *)data;
if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
/* do full expiration check */
ip_vs_lblcr_full_check(tbl);
tbl->counter = 1;
goto out;
}
if (atomic_read(&tbl->entries) <= tbl->max_size) {
tbl->counter++;
goto out;
}
goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
if (goal > tbl->max_size/2)
goal = tbl->max_size/2;
for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
e = l = &tbl->bucket[j];
write_lock(&tbl->lock);
while (e->next != l) {
en = list_entry(e->next,
struct ip_vs_lblcr_entry, list);
if ((now - en->lastuse) < ENTRY_TIMEOUT) {
e = e->next;
continue;
}
ip_vs_lblcr_free(en);
atomic_dec(&tbl->entries);
goal--;
}
write_unlock(&tbl->lock);
if (goal <= 0)
break;
}
tbl->rover = j;
out:
mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
}
#ifdef CONFIG_IP_VS_LBLCR_DEBUG
static struct ip_vs_lblcr_table *lblcr_table_list;
/*
* /proc/net/ip_vs_lblcr to display the mappings of
* destination IP address <==> its serverSet
*/
static int
ip_vs_lblcr_getinfo(char *buffer, char **start, off_t offset, int length)
{
off_t pos=0, begin;
int len=0, size;
struct ip_vs_lblcr_table *tbl;
unsigned long now = jiffies;
int i;
struct list_head *l, *e;
struct ip_vs_lblcr_entry *en;
tbl = lblcr_table_list;
size = sprintf(buffer, "LastTime Dest IP address Server set\n");
pos += size;
len += size;
for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
l = &tbl->bucket[i];
read_lock_bh(&tbl->lock);
for (e=l->next; e!=l; e=e->next) {
char tbuf[16];
struct ip_vs_dest_list *d;
en = list_entry(e, struct ip_vs_lblcr_entry, list);
sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(en->addr));
size = sprintf(buffer+len, "%8lu %-16s ",
now-en->lastuse, tbuf);
read_lock(&en->set.lock);
for (d=en->set.list; d!=NULL; d=d->next) {
size += sprintf(buffer+len+size,
"%u.%u.%u.%u ",
NIPQUAD(d->dest->addr));
}
read_unlock(&en->set.lock);
size += sprintf(buffer+len+size, "\n");
len += size;
pos += size;
if (pos <= offset)
len=0;
if (pos >= offset+length) {
read_unlock_bh(&tbl->lock);
goto done;
}
}
read_unlock_bh(&tbl->lock);
}
done:
begin = len - (pos - offset);
*start = buffer + begin;
len -= begin;
if(len>length)
len = length;
return len;
}
#endif
static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
{
int i;
struct ip_vs_lblcr_table *tbl;
/*
* Allocate the ip_vs_lblcr_table for this service
*/
tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC);
if (tbl == NULL) {
IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
return -ENOMEM;
}
svc->sched_data = tbl;
IP_VS_DBG(6, "LBLCR hash table (memory=%dbytes) allocated for "
"current service\n",
		  (int)sizeof(struct ip_vs_lblcr_table));
/*
* Initialize the hash buckets
*/
for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
INIT_LIST_HEAD(&tbl->bucket[i]);
}
tbl->lock = RW_LOCK_UNLOCKED;
tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
tbl->rover = 0;
tbl->counter = 1;
/*
* Hook periodic timer for garbage collection
*/
init_timer(&tbl->periodic_timer);
tbl->periodic_timer.data = (unsigned long)tbl;
tbl->periodic_timer.function = ip_vs_lblcr_check_expire;
tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
add_timer(&tbl->periodic_timer);
#ifdef CONFIG_IP_VS_LBLCR_DEBUG
lblcr_table_list = tbl;
#endif
return 0;
}
static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
{
struct ip_vs_lblcr_table *tbl = svc->sched_data;
/* remove periodic timer */
del_timer_sync(&tbl->periodic_timer);
/* got to clean up table entries here */
ip_vs_lblcr_flush(tbl);
/* release the table itself */
kfree(svc->sched_data);
IP_VS_DBG(6, "LBLCR hash table (memory=%dbytes) released\n",
		  (int)sizeof(struct ip_vs_lblcr_table));
return 0;
}
static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc)
{
return 0;
}
static inline struct ip_vs_dest *
__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
register struct list_head *l, *e;
struct ip_vs_dest *dest, *least;
int loh, doh;
/*
	 * We think the overhead of processing active connections is fifty
	 * times higher than that of inactive connections on average. (This
	 * factor of fifty may not be accurate; we may change it later.) We
* use the following formula to estimate the overhead:
* dest->activeconns*50 + dest->inactconns
* and the load:
* (dest overhead) / dest->weight
*
* Remember -- no floats in kernel mode!!!
* The comparison of h1*w2 > h2*w1 is equivalent to that of
* h1/w1 > h2/w2
* if every weight is larger than zero.
*
* The server with weight=0 is quiesced and will not receive any
* new connection.
*/
l = &svc->destinations;
for (e=l->next; e!=l; e=e->next) {
least = list_entry(e, struct ip_vs_dest, n_list);
if (least->flags & IP_VS_DEST_F_OVERLOAD)
continue;
if (atomic_read(&least->weight) > 0) {
loh = atomic_read(&least->activeconns) * 50
+ atomic_read(&least->inactconns);
goto nextstage;
}
}
return NULL;
/*
* Find the destination with the least load.
*/
nextstage:
for (e=e->next; e!=l; e=e->next) {
dest = list_entry(e, struct ip_vs_dest, n_list);
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
doh = atomic_read(&dest->activeconns) * 50
+ atomic_read(&dest->inactconns);
if (loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight)) {
least = dest;
loh = doh;
}
}
IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
"activeconns %d refcnt %d weight %d overhead %d\n",
NIPQUAD(least->addr), ntohs(least->port),
atomic_read(&least->activeconns),
atomic_read(&least->refcnt),
atomic_read(&least->weight), loh);
return least;
}
/*
* If this destination server is overloaded and there is a less loaded
* server, then return true.
*/
static inline int
is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
register struct list_head *l, *e;
struct ip_vs_dest *d;
l = &svc->destinations;
for (e=l->next; e!=l; e=e->next) {
d = list_entry(e, struct ip_vs_dest, n_list);
if (atomic_read(&d->activeconns)*2
< atomic_read(&d->weight)) {
return 1;
}
}
}
return 0;
}
/*
 *    Locality-Based (weighted) Least-Connection with Replication scheduling
*/
static struct ip_vs_dest *
ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
struct ip_vs_dest *dest;
struct ip_vs_lblcr_table *tbl;
struct ip_vs_lblcr_entry *en;
IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
tbl = (struct ip_vs_lblcr_table *)svc->sched_data;
en = ip_vs_lblcr_get(tbl, iph->daddr);
if (en == NULL) {
dest = __ip_vs_wlc_schedule(svc, iph);
if (dest == NULL) {
IP_VS_DBG(1, "no destination available\n");
return NULL;
}
en = ip_vs_lblcr_new(iph->daddr);
if (en == NULL) {
return NULL;
}
ip_vs_dest_set_insert(&en->set, dest);
ip_vs_lblcr_hash(tbl, en);
} else {
dest = ip_vs_dest_set_min(&en->set);
if (!dest || is_overloaded(dest, svc)) {
dest = __ip_vs_wlc_schedule(svc, iph);
if (dest == NULL) {
IP_VS_DBG(1, "no destination available\n");
return NULL;
}
ip_vs_dest_set_insert(&en->set, dest);
}
if (atomic_read(&en->set.size) > 1 &&
jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) {
struct ip_vs_dest *m;
m = ip_vs_dest_set_max(&en->set);
if (m)
ip_vs_dest_set_erase(&en->set, m);
}
}
en->lastuse = jiffies;
IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
"--> server %u.%u.%u.%u:%d\n",
NIPQUAD(en->addr),
NIPQUAD(dest->addr),
ntohs(dest->port));
return dest;
}
/*
* IPVS LBLCR Scheduler structure
*/
static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
{
.name = "lblcr",
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
.init_service = ip_vs_lblcr_init_svc,
.done_service = ip_vs_lblcr_done_svc,
.update_service = ip_vs_lblcr_update_svc,
.schedule = ip_vs_lblcr_schedule,
};
static int __init ip_vs_lblcr_init(void)
{
INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list);
lblcr_sysctl_table.sysctl_header =
register_sysctl_table(lblcr_sysctl_table.root_dir, 0);
#ifdef CONFIG_IP_VS_LBLCR_DEBUG
proc_net_create("ip_vs_lblcr", 0, ip_vs_lblcr_getinfo);
#endif
return register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
}
static void __exit ip_vs_lblcr_cleanup(void)
{
#ifdef CONFIG_IP_VS_LBLCR_DEBUG
proc_net_remove("ip_vs_lblcr");
#endif
unregister_sysctl_table(lblcr_sysctl_table.sysctl_header);
unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
}
module_init(ip_vs_lblcr_init);
module_exit(ip_vs_lblcr_cleanup);
MODULE_LICENSE("GPL");
/*
* IPVS: Least-Connection Scheduling module
*
* Version: $Id: ip_vs_lc.c,v 1.10 2003/04/18 09:03:16 wensong Exp $
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
* Wensong Zhang : added the ip_vs_lc_update_svc
* Wensong Zhang : added any dest with weight=0 is quiesced
*
*/
#include <linux/config.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <net/ip_vs.h>
static int ip_vs_lc_init_svc(struct ip_vs_service *svc)
{
return 0;
}
static int ip_vs_lc_done_svc(struct ip_vs_service *svc)
{
return 0;
}
static int ip_vs_lc_update_svc(struct ip_vs_service *svc)
{
return 0;
}
static inline unsigned int
ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
{
/*
* We think the overhead of processing active connections is 256
	 * times higher than that of inactive connections on average. (This
	 * factor of 256 may not be accurate; we may change it later.) We
* use the following formula to estimate the overhead now:
* dest->activeconns*256 + dest->inactconns
*/
return (atomic_read(&dest->activeconns) << 8) +
atomic_read(&dest->inactconns);
}
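/*
 * Quick sanity check with made-up numbers: a destination with 3 active
 * and 10 inactive connections gets an overhead of
 * (3 << 8) + 10 = 3*256 + 10 = 778, so one extra active connection
 * outweighs up to 255 inactive ones.
 */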
/*
* Least Connection scheduling
*/
static struct ip_vs_dest *
ip_vs_lc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
struct list_head *l, *e;
struct ip_vs_dest *dest, *least;
unsigned int loh, doh;
IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n");
/*
	 * Simply select the server with the smallest
	 *           (activeconns<<8) + inactconns,
	 * excluding servers whose weight is equal to zero.
* If the weight is equal to zero, it means that the server is
* quiesced, the existing connections to the server still get
* served, but no new connection is assigned to the server.
*/
l = &svc->destinations;
for (e=l->next; e!=l; e=e->next) {
least = list_entry (e, struct ip_vs_dest, n_list);
if (least->flags & IP_VS_DEST_F_OVERLOAD)
continue;
if (atomic_read(&least->weight) > 0) {
loh = ip_vs_lc_dest_overhead(least);
goto nextstage;
}
}
return NULL;
/*
* Find the destination with the least load.
*/
nextstage:
for (e=e->next; e!=l; e=e->next) {
dest = list_entry(e, struct ip_vs_dest, n_list);
if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
atomic_read(&dest->weight) == 0)
continue;
doh = ip_vs_lc_dest_overhead(dest);
if (doh < loh) {
least = dest;
loh = doh;
}
}
IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n",
NIPQUAD(least->addr), ntohs(least->port),
atomic_read(&least->activeconns),
atomic_read(&least->inactconns));
return least;
}
static struct ip_vs_scheduler ip_vs_lc_scheduler = {
.name = "lc",
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
.init_service = ip_vs_lc_init_svc,
.done_service = ip_vs_lc_done_svc,
.update_service = ip_vs_lc_update_svc,
.schedule = ip_vs_lc_schedule,
};
static int __init ip_vs_lc_init(void)
{
INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list);
return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ;
}
static void __exit ip_vs_lc_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
}
module_init(ip_vs_lc_init);
module_exit(ip_vs_lc_cleanup);
MODULE_LICENSE("GPL");
/*
* IPVS: Never Queue scheduling module
*
* Version: $Id: ip_vs_nq.c,v 1.2 2003/06/08 09:31:19 wensong Exp $
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
*
*/
/*
* The NQ algorithm adopts a two-speed model. When there is an idle server
* available, the job will be sent to the idle server, instead of waiting
* for a fast one. When there is no idle server available, the job will be
 * sent to the server that minimizes its expected delay (the Shortest
* Expected Delay scheduling algorithm).
*
* See the following paper for more information:
* A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
* in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
* pages 986-994, 1988.
*
* Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me.
*
* The difference between NQ and SED is that NQ can improve overall
* system utilization.
*
*/
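/*
 * Rough illustration (hypothetical numbers): with servers A (3 active,
 * weight 2) and B (2 active, weight 1) and no idle server, the SED
 * costs are (3+1)/2 = 2 for A and (2+1)/1 = 3 for B, so A is chosen;
 * if B instead had 0 active connections it would be picked immediately,
 * before any cost comparison is made.
 */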
#include <linux/config.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <net/ip_vs.h>
static int
ip_vs_nq_init_svc(struct ip_vs_service *svc)
{
return 0;
}
static int
ip_vs_nq_done_svc(struct ip_vs_service *svc)
{
return 0;
}
static int
ip_vs_nq_update_svc(struct ip_vs_service *svc)
{
return 0;
}
static inline unsigned int
ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
{
/*
* We only use the active connection number in the cost
* calculation here.
*/
return atomic_read(&dest->activeconns) + 1;
}
/*
 *    Never Queue scheduling
*/
static struct ip_vs_dest *
ip_vs_nq_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
register struct list_head *l, *e;
struct ip_vs_dest *dest, *least;
unsigned int loh, doh;
IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n");
/*
* We calculate the load of each dest server as follows:
* (server expected overhead) / dest->weight
*
* Remember -- no floats in kernel mode!!!
* The comparison of h1*w2 > h2*w1 is equivalent to that of
* h1/w1 > h2/w2
* if every weight is larger than zero.
*
* The server with weight=0 is quiesced and will not receive any
* new connections.
*/
l = &svc->destinations;
for (e=l->next; e!=l; e=e->next) {
least = list_entry(e, struct ip_vs_dest, n_list);
if (!(least->flags & IP_VS_DEST_F_OVERLOAD) &&
atomic_read(&least->weight) > 0) {
loh = ip_vs_nq_dest_overhead(least);
/* return the server directly if it is idle */
if (atomic_read(&least->activeconns) == 0)
goto out;
goto nextstage;
}
}
return NULL;
/*
* Find the destination with the least load.
*/
nextstage:
for (e=e->next; e!=l; e=e->next) {
dest = list_entry(e, struct ip_vs_dest, n_list);
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
doh = ip_vs_nq_dest_overhead(dest);
/* return the server directly if it is idle */
if (atomic_read(&dest->activeconns) == 0) {
least = dest;
goto out;
}
if (loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight)) {
least = dest;
loh = doh;
}
}
out:
IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u "
"activeconns %d refcnt %d weight %d overhead %d\n",
NIPQUAD(least->addr), ntohs(least->port),
atomic_read(&least->activeconns),
atomic_read(&least->refcnt),
atomic_read(&least->weight), loh);
return least;
}
static struct ip_vs_scheduler ip_vs_nq_scheduler =
{
.name = "nq",
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
.init_service = ip_vs_nq_init_svc,
.done_service = ip_vs_nq_done_svc,
.update_service = ip_vs_nq_update_svc,
.schedule = ip_vs_nq_schedule,
};
static int __init ip_vs_nq_init(void)
{
INIT_LIST_HEAD(&ip_vs_nq_scheduler.n_list);
return register_ip_vs_scheduler(&ip_vs_nq_scheduler);
}
static void __exit ip_vs_nq_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
}
module_init(ip_vs_nq_init);
module_exit(ip_vs_nq_cleanup);
MODULE_LICENSE("GPL");
/*
* ip_vs_proto.c: transport protocol load balancing support for IPVS
*
* Version: $Id: ip_vs_proto.c,v 1.2 2003/04/18 09:03:16 wensong Exp $
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Julian Anastasov <ja@ssi.bg>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
*
*/
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/init.h>
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <asm/system.h>
#include <linux/stat.h>
#include <linux/proc_fs.h>
#include <net/ip_vs.h>
/*
* IPVS protocols can only be registered/unregistered when the ipvs
* module is loaded/unloaded, so no lock is needed in accessing the
* ipvs protocol table.
*/
#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */
#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1))
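/*
 * For example, IPPROTO_TCP (6) hashes to bucket 6 and IPPROTO_ESP (50)
 * to bucket 50 & 31 = 18; collisions are simply chained through
 * pp->next in register_ip_vs_protocol() below.
 */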
static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
/*
* register an ipvs protocol
*/
int register_ip_vs_protocol(struct ip_vs_protocol *pp)
{
unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
pp->next = ip_vs_proto_table[hash];
ip_vs_proto_table[hash] = pp;
if (pp->init != NULL)
pp->init(pp);
return 0;
}
/*
* unregister an ipvs protocol
*/
int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
{
struct ip_vs_protocol **pp_p;
unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
pp_p = &ip_vs_proto_table[hash];
for (; *pp_p; pp_p = &(*pp_p)->next) {
if (*pp_p == pp) {
*pp_p = pp->next;
if (pp->exit != NULL)
pp->exit(pp);
return 0;
}
}
return -ESRCH;
}
/*
* get ip_vs_protocol object by its proto.
*/
struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
{
struct ip_vs_protocol *pp;
unsigned hash = IP_VS_PROTO_HASH(proto);
for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) {
if (pp->protocol == proto)
return pp;
}
return NULL;
}
/*
* Propagate event for state change to all protocols
*/
void ip_vs_protocol_timeout_change(int flags)
{
struct ip_vs_protocol *pp;
int i;
for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) {
if (pp->timeout_change)
pp->timeout_change(pp, flags);
}
}
}
int *
ip_vs_create_timeout_table(int *table, int size)
{
int *t;
t = kmalloc(size, GFP_ATOMIC);
if (t == NULL)
return NULL;
memcpy(t, table, size);
return t;
}
/*
* Set timeout value for state specified by name
*/
int
ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to)
{
int i;
if (!table || !name || !to)
return -EINVAL;
for (i = 0; i < num; i++) {
if (strcmp(names[i], name))
continue;
table[i] = to * HZ;
return 0;
}
return -ENOENT;
}
const char * ip_vs_state_name(__u16 proto, int state)
{
struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
if (pp == NULL || pp->state_name == NULL)
return "ERR!";
return pp->state_name(state);
}
void
tcpudp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg)
{
char buf[128];
union ip_vs_tphdr h;
h.raw = (char *) iph + iph->ihl * 4;
if (iph->frag_off & __constant_htons(IP_OFFSET))
sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
else
sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u",
pp->name,
NIPQUAD(iph->saddr),
ntohs(h.portp[0]),
NIPQUAD(iph->daddr),
ntohs(h.portp[1]));
printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
}
int ip_vs_protocol_init(void)
{
char protocols[64];
#define REGISTER_PROTOCOL(p) \
do { \
register_ip_vs_protocol(p); \
strcat(protocols, ", "); \
strcat(protocols, (p)->name); \
} while (0)
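	/*
	 * Each REGISTER_PROTOCOL() invocation appends ", <name>" to the
	 * buffer, and the info line below prints &protocols[2] to skip
	 * that leading separator; protocols[2] is also cleared up front
	 * so the message stays empty when no protocol happens to be
	 * compiled in.
	 */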
protocols[0] = '\0';
protocols[2] = '\0';
#ifdef CONFIG_IP_VS_PROTO_TCP
REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
REGISTER_PROTOCOL(&ip_vs_protocol_udp);
#endif
#ifdef CONFIG_IP_VS_PROTO_ICMP
REGISTER_PROTOCOL(&ip_vs_protocol_icmp);
#endif
#ifdef CONFIG_IP_VS_PROTO_AH
REGISTER_PROTOCOL(&ip_vs_protocol_ah);
#endif
#ifdef CONFIG_IP_VS_PROTO_ESP
REGISTER_PROTOCOL(&ip_vs_protocol_esp);
#endif
IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]);
return 0;
}
void ip_vs_protocol_cleanup(void)
{
struct ip_vs_protocol *pp;
int i;
/* unregister all the ipvs protocols */
for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
while ((pp = ip_vs_proto_table[i]) != NULL)
unregister_ip_vs_protocol(pp);
}
}
/*
* ip_vs_proto_ah.c: AH IPSec load balancing support for IPVS
*
* Version: $Id: ip_vs_proto_ah.c,v 1.1 2003/07/04 15:04:37 wensong Exp $
*
* Authors: Julian Anastasov <ja@ssi.bg>, February 2002
* Wensong Zhang <wensong@linuxvirtualserver.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 2 as published by the Free Software Foundation;
*
*/
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/compiler.h>
#include <linux/vmalloc.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <net/ip_vs.h>
/* TODO:
struct isakmp_hdr {
__u8 icookie[8];
__u8 rcookie[8];
__u8 np;
__u8 version;
__u8 xchgtype;
__u8 flags;
__u32 msgid;
__u32 length;
};
*/
#define PORT_ISAKMP 500
static struct ip_vs_conn *
ah_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse)
{
struct ip_vs_conn *cp;
if (likely(!inverse)) {
cp = ip_vs_conn_in_get(IPPROTO_UDP,
iph->saddr,
__constant_htons(PORT_ISAKMP),
iph->daddr,
__constant_htons(PORT_ISAKMP));
} else {
cp = ip_vs_conn_in_get(IPPROTO_UDP,
iph->daddr,
__constant_htons(PORT_ISAKMP),
iph->saddr,
__constant_htons(PORT_ISAKMP));
}
if (!cp) {
/*
* We are not sure if the packet is from our
* service, so the caller should check skip_nonexisting
*/
IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
"%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
inverse?"ICMP+":"",
pp->name,
NIPQUAD(iph->saddr),
NIPQUAD(iph->daddr));
}
return cp;
}
static struct ip_vs_conn *
ah_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse)
{
struct ip_vs_conn *cp;
if (likely(!inverse)) {
cp = ip_vs_conn_out_get(IPPROTO_UDP,
iph->saddr,
__constant_htons(PORT_ISAKMP),
iph->daddr,
__constant_htons(PORT_ISAKMP));
} else {
cp = ip_vs_conn_out_get(IPPROTO_UDP,
iph->daddr,
__constant_htons(PORT_ISAKMP),
iph->saddr,
__constant_htons(PORT_ISAKMP));
}
if (!cp) {
/*
* We are not sure if the packet is from our
* service, so the caller should check skip_nonexisting
* or our conn_schedule hook should return NF_ACCEPT
*/
IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
"%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
inverse ? "ICMP+" : "",
pp->name,
NIPQUAD(iph->saddr),
NIPQUAD(iph->daddr));
}
return cp;
}
static int
ah_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h,
int *verdict, struct ip_vs_conn **cpp)
{
/*
	 * AH is handled only as related traffic, never scheduled on its
	 * own; pass the packet to the IP stack.
*/
*verdict = NF_ACCEPT;
return 0;
}
static void
ah_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg)
{
char buf[256];
sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
}
static void ah_init(struct ip_vs_protocol *pp)
{
/* nothing to do now */
}
static void ah_exit(struct ip_vs_protocol *pp)
{
/* nothing to do now */
}
struct ip_vs_protocol ip_vs_protocol_ah = {
.name = "AH",
.protocol = IPPROTO_AH,
.minhlen = 0,
.minhlen_icmp = 0,
.dont_defrag = 1,
.skip_nonexisting = 1,
.slave = 1,
.init = ah_init,
.exit = ah_exit,
.conn_schedule = ah_conn_schedule,
.conn_in_get = ah_conn_in_get,
.conn_out_get = ah_conn_out_get,
.snat_handler = NULL,
.dnat_handler = NULL,
.state_transition = NULL,
.register_app = NULL,
.unregister_app = NULL,
.app_conn_bind = NULL,
.csum_check = NULL,
.debug_packet = ah_debug_packet,
.timeout_change = NULL, /* ISAKMP */
.set_state_timeout = NULL,
};
/*
* ip_vs_proto_esp.c: ESP IPSec load balancing support for IPVS
*
* Version: $Id: ip_vs_proto_esp.c,v 1.1 2003/07/04 15:04:37 wensong Exp $
*
* Authors: Julian Anastasov <ja@ssi.bg>, February 2002
* Wensong Zhang <wensong@linuxvirtualserver.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 2 as published by the Free Software Foundation;
*
*/
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/compiler.h>
#include <linux/vmalloc.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <net/ip_vs.h>
/* TODO:
struct isakmp_hdr {
__u8 icookie[8];
__u8 rcookie[8];
__u8 np;
__u8 version;
__u8 xchgtype;
__u8 flags;
__u32 msgid;
__u32 length;
};
*/
#define PORT_ISAKMP 500
static struct ip_vs_conn *
esp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse)
{
struct ip_vs_conn *cp;
if (likely(!inverse)) {
cp = ip_vs_conn_in_get(IPPROTO_UDP,
iph->saddr,
__constant_htons(PORT_ISAKMP),
iph->daddr,
__constant_htons(PORT_ISAKMP));
} else {
cp = ip_vs_conn_in_get(IPPROTO_UDP,
iph->daddr,
__constant_htons(PORT_ISAKMP),
iph->saddr,
__constant_htons(PORT_ISAKMP));
}
if (!cp) {
/*
* We are not sure if the packet is from our
* service, so the caller should check skip_nonexisting
*/
IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
"%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
inverse ? "ICMP+" : "",
pp->name,
NIPQUAD(iph->saddr),
NIPQUAD(iph->daddr));
}
return cp;
}
static struct ip_vs_conn *
esp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse)
{
struct ip_vs_conn *cp;
if (likely(!inverse)) {
cp = ip_vs_conn_out_get(IPPROTO_UDP,
iph->saddr,
__constant_htons(PORT_ISAKMP),
iph->daddr,
__constant_htons(PORT_ISAKMP));
} else {
cp = ip_vs_conn_out_get(IPPROTO_UDP,
iph->daddr,
__constant_htons(PORT_ISAKMP),
iph->saddr,
__constant_htons(PORT_ISAKMP));
}
if (!cp) {
/*
* We are not sure if the packet is from our
* service, so the caller should check skip_nonexisting
* or our conn_schedule hook should return NF_ACCEPT
*/
IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
"%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
inverse?"ICMP+":"",
pp->name,
NIPQUAD(iph->saddr),
NIPQUAD(iph->daddr));
}
return cp;
}
static int
esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h,
int *verdict, struct ip_vs_conn **cpp)
{
/*
	 * ESP is handled only as related traffic, never scheduled on its
	 * own; pass the packet to the IP stack.
*/
*verdict = NF_ACCEPT;
return 0;
}
static void
esp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg)
{
char buf[256];
sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
}
static void esp_init(struct ip_vs_protocol *pp)
{
/* nothing to do now */
}
static void esp_exit(struct ip_vs_protocol *pp)
{
/* nothing to do now */
}
struct ip_vs_protocol ip_vs_protocol_esp = {
.name = "ESP",
.protocol = IPPROTO_ESP,
.minhlen = 0,
.minhlen_icmp = 0,
.dont_defrag = 1,
.skip_nonexisting = 1,
.slave = 1,
.init = esp_init,
.exit = esp_exit,
.conn_schedule = esp_conn_schedule,
.conn_in_get = esp_conn_in_get,
.conn_out_get = esp_conn_out_get,
.snat_handler = NULL,
.dnat_handler = NULL,
.csum_check = NULL,
.state_transition = NULL,
.register_app = NULL,
.unregister_app = NULL,
.app_conn_bind = NULL,
.debug_packet = esp_debug_packet,
.timeout_change = NULL, /* ISAKMP */
};
/*
* ip_vs_proto_icmp.c: ICMP load balancing support for IP Virtual Server
*
* Authors: Julian Anastasov <ja@ssi.bg>, March 2002
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 2 as published by the Free Software Foundation;
*
*/
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/compiler.h>
#include <linux/vmalloc.h>
#include <linux/icmp.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <net/ip_vs.h>
static int icmp_timeouts[1] = { 1*60*HZ };
static char * icmp_state_name_table[1] = { "ICMP" };
struct ip_vs_conn *
icmp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse)
{
#if 0
struct ip_vs_conn *cp;
if (likely(!inverse)) {
cp = ip_vs_conn_in_get(iph->protocol,
iph->saddr, 0,
iph->daddr, 0);
} else {
cp = ip_vs_conn_in_get(iph->protocol,
iph->daddr, 0,
iph->saddr, 0);
}
return cp;
#else
return NULL;
#endif
}
struct ip_vs_conn *
icmp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse)
{
#if 0
struct ip_vs_conn *cp;
if (likely(!inverse)) {
cp = ip_vs_conn_out_get(iph->protocol,
iph->saddr, 0,
iph->daddr, 0);
} else {
cp = ip_vs_conn_out_get(IPPROTO_UDP,
iph->daddr, 0,
iph->saddr, 0);
}
return cp;
#else
return NULL;
#endif
}
static int
icmp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h,
int *verdict, struct ip_vs_conn **cpp)
{
*verdict = NF_ACCEPT;
return 0;
}
static int
icmp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int size)
{
if (!(iph->frag_off & __constant_htons(IP_OFFSET))) {
if (ip_compute_csum(h.raw, size)) {
IP_VS_DBG_RL_PKT(0, pp, iph, "Failed checksum for");
return 0;
}
}
return 1;
}
static void
icmp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg)
{
char buf[256];
union ip_vs_tphdr h;
h.raw = (char *) iph + iph->ihl * 4;
if (iph->frag_off & __constant_htons(IP_OFFSET))
sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
else
sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u T:%d C:%d",
pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr),
h.icmph->type, h.icmph->code);
printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
}
static int
icmp_state_transition(struct ip_vs_conn *cp,
int direction, struct iphdr *iph,
union ip_vs_tphdr h, struct ip_vs_protocol *pp)
{
cp->timeout = pp->timeout_table[IP_VS_ICMP_S_NORMAL];
return 1;
}
static int
icmp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
{
int num;
char **names;
num = IP_VS_ICMP_S_LAST;
names = icmp_state_name_table;
return ip_vs_set_state_timeout(pp->timeout_table, num, names, sname, to);
}
static void icmp_init(struct ip_vs_protocol *pp)
{
pp->timeout_table = icmp_timeouts;
}
static void icmp_exit(struct ip_vs_protocol *pp)
{
}
struct ip_vs_protocol ip_vs_protocol_icmp = {
.name = "ICMP",
.protocol = IPPROTO_ICMP,
.minhlen = sizeof(struct icmphdr),
.minhlen_icmp = 8,
.dont_defrag = 0,
.skip_nonexisting = 0,
.slave = 0,
.init = icmp_init,
.exit = icmp_exit,
.conn_schedule = icmp_conn_schedule,
.conn_in_get = icmp_conn_in_get,
.conn_out_get = icmp_conn_out_get,
.snat_handler = NULL,
.dnat_handler = NULL,
.csum_check = icmp_csum_check,
.state_transition = icmp_state_transition,
.register_app = NULL,
.unregister_app = NULL,
.app_conn_bind = NULL,
.debug_packet = icmp_debug_packet,
.timeout_change = NULL,
.set_state_timeout = icmp_set_state_timeout,
};
/*
* ip_vs_proto_tcp.c: TCP load balancing support for IPVS
*
* Version: $Id: ip_vs_proto_tcp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Julian Anastasov <ja@ssi.bg>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
*
*/
#include <linux/config.h>
#include <linux/compiler.h>
#include <linux/ip.h>
#include <linux/tcp.h> /* for tcphdr */
#include <net/ip.h>
#include <net/tcp.h> /* for csum_tcpudp_magic */
#include <linux/netfilter.h>
#include <net/ip_vs.h>
static struct ip_vs_conn *
tcp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse)
{
if (likely(!inverse)) {
return ip_vs_conn_in_get(iph->protocol,
iph->saddr, h.th->source,
iph->daddr, h.th->dest);
} else {
return ip_vs_conn_in_get(iph->protocol,
iph->daddr, h.th->dest,
iph->saddr, h.th->source);
}
}
static struct ip_vs_conn *
tcp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse)
{
if (likely(!inverse)) {
return ip_vs_conn_out_get(iph->protocol,
iph->saddr, h.th->source,
iph->daddr, h.th->dest);
} else {
return ip_vs_conn_out_get(iph->protocol,
iph->daddr, h.th->dest,
iph->saddr, h.th->source);
}
}
static int
tcp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h,
int *verdict, struct ip_vs_conn **cpp)
{
struct ip_vs_service *svc;
if (h.th->syn &&
(svc = ip_vs_service_get(skb->nfmark, iph->protocol,
iph->daddr, h.portp[1]))) {
if (ip_vs_todrop()) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
*/
ip_vs_service_put(svc);
*verdict = NF_DROP;
return 0;
}
/*
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
*cpp = ip_vs_schedule(svc, iph);
if (!*cpp) {
*verdict = ip_vs_leave(svc, skb, pp, h);
return 0;
}
ip_vs_service_put(svc);
}
return 1;
}
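/*
 * Incremental checksum update: only an address and a port change in
 * the packet, so fold the old/new differences into the existing TCP
 * checksum (in the spirit of RFC 1624) instead of recomputing it over
 * the whole segment.
 */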
static inline void
tcp_fast_csum_update(union ip_vs_tphdr *h, u32 oldip, u32 newip,
u16 oldport, u16 newport)
{
h->th->check =
ip_vs_check_diff(~oldip, newip,
ip_vs_check_diff(oldport ^ 0xFFFF,
newport, h->th->check));
}
static int
tcp_snat_handler(struct sk_buff *skb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
struct iphdr *iph, union ip_vs_tphdr h, int size)
{
int ihl = (char *) h.raw - (char *) iph;
/* We are sure that we work on the first fragment */
h.th->source = cp->vport;
/* Call application helper if needed */
if (ip_vs_app_pkt_out(cp, skb) != 0) {
/* skb data has probably changed, update pointers */
iph = skb->nh.iph;
h.raw = (char*)iph + ihl;
size = skb->len - ihl;
}
/* Adjust TCP checksums */
if (!cp->app) {
/* Only port and addr are changed, do fast csum update */
tcp_fast_csum_update(&h, cp->daddr, cp->vaddr,
cp->dport, cp->vport);
if (skb->ip_summed == CHECKSUM_HW)
skb->ip_summed = CHECKSUM_NONE;
} else {
/* full checksum calculation */
h.th->check = 0;
skb->csum = csum_partial(h.raw, size, 0);
h.th->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
size, iph->protocol,
skb->csum);
IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%d)\n",
pp->name, h.th->check,
(char*)&(h.th->check) - (char*)h.raw);
}
return 1;
}
static int
tcp_dnat_handler(struct sk_buff *skb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
struct iphdr *iph, union ip_vs_tphdr h, int size)
{
int ihl = (char *) h.raw - (char *) iph;
/* We are sure that we work on the first fragment */
h.th->dest = cp->dport;
/*
* Attempt ip_vs_app call.
* It will fix ip_vs_conn and iph ack_seq stuff
*/
if (ip_vs_app_pkt_in(cp, skb) != 0) {
/* skb data has probably changed, update pointers */
iph = skb->nh.iph;
h.raw = (char*) iph + ihl;
size = skb->len - ihl;
}
/*
* Adjust TCP/UDP checksums
*/
if (!cp->app) {
/* Only port and addr are changed, do fast csum update */
tcp_fast_csum_update(&h, cp->vaddr, cp->daddr,
cp->vport, cp->dport);
if (skb->ip_summed == CHECKSUM_HW)
skb->ip_summed = CHECKSUM_NONE;
} else {
/* full checksum calculation */
h.th->check = 0;
h.th->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
size, iph->protocol,
csum_partial(h.raw, size, 0));
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
return 1;
}
static int
tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int size)
{
switch (skb->ip_summed) {
case CHECKSUM_NONE:
skb->csum = csum_partial(h.raw, size, 0);
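/* fall through and verify the checksum we just computed */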
case CHECKSUM_HW:
if (csum_tcpudp_magic(iph->saddr, iph->daddr, size,
iph->protocol, skb->csum)) {
IP_VS_DBG_RL_PKT(0, pp, iph,
"Failed checksum for");
return 0;
}
break;
default:
/* CHECKSUM_UNNECESSARY */
break;
}
return 1;
}
#define TCP_DIR_INPUT 0
#define TCP_DIR_OUTPUT 4
#define TCP_DIR_INPUT_ONLY 8
static int tcp_state_off[IP_VS_DIR_LAST] = {
[IP_VS_DIR_INPUT] = TCP_DIR_INPUT,
[IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT,
[IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY,
};
/*
* Timeout table[state]
*/
static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
[IP_VS_TCP_S_NONE] = 2*HZ,
[IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
[IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
[IP_VS_TCP_S_SYN_RECV] = 1*60*HZ,
[IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ,
[IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ,
[IP_VS_TCP_S_CLOSE] = 10*HZ,
[IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
[IP_VS_TCP_S_LAST_ACK] = 30*HZ,
[IP_VS_TCP_S_LISTEN] = 2*60*HZ,
[IP_VS_TCP_S_SYNACK] = 120*HZ,
[IP_VS_TCP_S_LAST] = 2*HZ,
};
#if 0
/* FIXME: This is going to die */
static int tcp_timeouts_dos[IP_VS_TCP_S_LAST+1] = {
[IP_VS_TCP_S_NONE] = 2*HZ,
[IP_VS_TCP_S_ESTABLISHED] = 8*60*HZ,
[IP_VS_TCP_S_SYN_SENT] = 60*HZ,
[IP_VS_TCP_S_SYN_RECV] = 10*HZ,
[IP_VS_TCP_S_FIN_WAIT] = 60*HZ,
[IP_VS_TCP_S_TIME_WAIT] = 60*HZ,
[IP_VS_TCP_S_CLOSE] = 10*HZ,
[IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
[IP_VS_TCP_S_LAST_ACK] = 30*HZ,
[IP_VS_TCP_S_LISTEN] = 2*60*HZ,
[IP_VS_TCP_S_SYNACK] = 100*HZ,
[IP_VS_TCP_S_LAST] = 2*HZ,
};
#endif
static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
[IP_VS_TCP_S_NONE] = "NONE",
[IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
[IP_VS_TCP_S_SYN_SENT] = "SYN_SENT",
[IP_VS_TCP_S_SYN_RECV] = "SYN_RECV",
[IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT",
[IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT",
[IP_VS_TCP_S_CLOSE] = "CLOSE",
[IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT",
[IP_VS_TCP_S_LAST_ACK] = "LAST_ACK",
[IP_VS_TCP_S_LISTEN] = "LISTEN",
[IP_VS_TCP_S_SYNACK] = "SYNACK",
[IP_VS_TCP_S_LAST] = "BUG!",
};
#define sNO IP_VS_TCP_S_NONE
#define sES IP_VS_TCP_S_ESTABLISHED
#define sSS IP_VS_TCP_S_SYN_SENT
#define sSR IP_VS_TCP_S_SYN_RECV
#define sFW IP_VS_TCP_S_FIN_WAIT
#define sTW IP_VS_TCP_S_TIME_WAIT
#define sCL IP_VS_TCP_S_CLOSE
#define sCW IP_VS_TCP_S_CLOSE_WAIT
#define sLA IP_VS_TCP_S_LAST_ACK
#define sLI IP_VS_TCP_S_LISTEN
#define sSA IP_VS_TCP_S_SYNACK
struct tcp_states_t {
int next_state[IP_VS_TCP_S_LAST];
};
static const char * tcp_state_name(int state)
{
if (state >= IP_VS_TCP_S_LAST)
return "ERR!";
return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
}
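/*
 * Transition tables: four rows (syn/fin/ack/rst) per direction group,
 * selected as tcp_state_table[tcp_state_off[direction] + tcp_state_idx(th)],
 * with one column per current connection state (see set_tcp_state below).
 */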
static struct tcp_states_t tcp_states [] = {
/* INPUT */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
/* OUTPUT */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
/* INPUT-ONLY */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
static struct tcp_states_t tcp_states_dos [] = {
/* INPUT */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
/* OUTPUT */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
/* INPUT-ONLY */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
static struct tcp_states_t *tcp_state_table = tcp_states;
static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
{
int on = (flags & 1); /* secure_tcp */
/*
** FIXME: change secure_tcp to an independent sysctl var,
** or make it per-service or per-app because it is valid
** for most if not all of the applications. Something
** like "capabilities" (flags) for each object.
*/
tcp_state_table = (on? tcp_states_dos : tcp_states);
}
static int
tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
{
return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
tcp_state_name_table, sname, to);
}
static inline int tcp_state_idx(struct tcphdr *th)
{
if (th->rst)
return 3;
if (th->syn)
return 0;
if (th->fin)
return 1;
if (th->ack)
return 2;
return -1;
}
static inline void
set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
int direction, union ip_vs_tphdr h)
{
int state_idx;
struct tcphdr *th = h.th;
int new_state = IP_VS_TCP_S_CLOSE;
int state_off = tcp_state_off[direction];
/*
* Update state offset to INPUT_ONLY if necessary
* or delete NO_OUTPUT flag if output packet detected
*/
if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
if (state_off == TCP_DIR_OUTPUT)
cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
else
state_off = TCP_DIR_INPUT_ONLY;
}
if ((state_idx = tcp_state_idx(th)) < 0) {
IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
goto tcp_state_out;
}
new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
tcp_state_out:
if (new_state != cp->state) {
struct ip_vs_dest *dest = cp->dest;
IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
"%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
pp->name,
(state_off==TCP_DIR_OUTPUT)?"output ":"input ",
th->syn? 'S' : '.',
th->fin? 'F' : '.',
th->ack? 'A' : '.',
th->rst? 'R' : '.',
NIPQUAD(cp->daddr), ntohs(cp->dport),
NIPQUAD(cp->caddr), ntohs(cp->cport),
tcp_state_name(cp->state),
tcp_state_name(new_state),
atomic_read(&cp->refcnt));
if (dest) {
if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
(new_state != IP_VS_TCP_S_ESTABLISHED)) {
atomic_dec(&dest->activeconns);
atomic_inc(&dest->inactconns);
cp->flags |= IP_VS_CONN_F_INACTIVE;
} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
(new_state == IP_VS_TCP_S_ESTABLISHED)) {
atomic_inc(&dest->activeconns);
atomic_dec(&dest->inactconns);
cp->flags &= ~IP_VS_CONN_F_INACTIVE;
}
}
}
cp->timeout = pp->timeout_table[cp->state = new_state];
}
/*
* Handle state transitions
*/
static int
tcp_state_transition(struct ip_vs_conn *cp,
int direction, struct iphdr *iph,
union ip_vs_tphdr h, struct ip_vs_protocol *pp)
{
spin_lock(&cp->lock);
set_tcp_state(pp, cp, direction, h);
spin_unlock(&cp->lock);
return 1;
}
/*
* Hash table for TCP application incarnations
*/
#define TCP_APP_TAB_BITS 4
#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS)
#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1)
static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
static spinlock_t tcp_app_lock = SPIN_LOCK_UNLOCKED;
static inline __u16 tcp_app_hashkey(__u16 port)
{
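/* fold the high bits of the (network-order) port into the low
   TCP_APP_TAB_BITS bits before masking */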
return ((port >> TCP_APP_TAB_BITS) ^ port) & TCP_APP_TAB_MASK;
}
static int tcp_register_app(struct ip_vs_app *inc)
{
struct ip_vs_app *i;
struct list_head *t, *p;
__u16 hash, port = inc->port;
int ret = 0;
hash = tcp_app_hashkey(port);
t = &tcp_apps[hash];
spin_lock_bh(&tcp_app_lock);
for (p = t->next; p != t; p = p->next) {
i = list_entry(p, struct ip_vs_app, p_list);
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
list_add(&inc->p_list, t);
atomic_inc(&ip_vs_protocol_tcp.appcnt);
out:
spin_unlock_bh(&tcp_app_lock);
return ret;
}
static void
tcp_unregister_app(struct ip_vs_app *inc)
{
spin_lock_bh(&tcp_app_lock);
atomic_dec(&ip_vs_protocol_tcp.appcnt);
list_del(&inc->p_list);
spin_unlock_bh(&tcp_app_lock);
}
static int
tcp_app_conn_bind(struct ip_vs_conn *cp)
{
struct list_head *t, *p;
int hash;
struct ip_vs_app *inc;
int result = 0;
/* Default binding: bind app only for NAT */
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
return 0;
/* Lookup application incarnations and bind the right one */
hash = tcp_app_hashkey(cp->vport);
t = &tcp_apps[hash];
spin_lock(&tcp_app_lock);
for (p = t->next; p != t; p = p->next) {
inc = list_entry(p, struct ip_vs_app, p_list);
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
spin_unlock(&tcp_app_lock);
IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
"%u.%u.%u.%u:%u to app %s on port %u\n",
__FUNCTION__,
NIPQUAD(cp->caddr), ntohs(cp->cport),
NIPQUAD(cp->vaddr), ntohs(cp->vport),
inc->name, ntohs(inc->port));
cp->app = inc;
if (inc->init_conn)
result = inc->init_conn(inc, cp);
goto out;
}
}
spin_unlock(&tcp_app_lock);
out:
return result;
}
/*
 * Set LISTEN timeout. (ip_vs_conn_put will set up the timer)
*/
void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
{
spin_lock(&cp->lock);
cp->state = IP_VS_TCP_S_LISTEN;
cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
spin_unlock(&cp->lock);
}
static void tcp_init(struct ip_vs_protocol *pp)
{
IP_VS_INIT_HASH_TABLE(tcp_apps);
pp->timeout_table = tcp_timeouts;
}
static void tcp_exit(struct ip_vs_protocol *pp)
{
}
extern void
tcpudp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg);
struct ip_vs_protocol ip_vs_protocol_tcp = {
.name = "TCP",
.protocol = IPPROTO_TCP,
.minhlen = sizeof(struct tcphdr),
.minhlen_icmp = 8,
.dont_defrag = 0,
.skip_nonexisting = 0,
.slave = 0,
.appcnt = ATOMIC_INIT(0),
.init = tcp_init,
.exit = tcp_exit,
.register_app = tcp_register_app,
.unregister_app = tcp_unregister_app,
.conn_schedule = tcp_conn_schedule,
.conn_in_get = tcp_conn_in_get,
.conn_out_get = tcp_conn_out_get,
.snat_handler = tcp_snat_handler,
.dnat_handler = tcp_dnat_handler,
.csum_check = tcp_csum_check,
.state_name = tcp_state_name,
.state_transition = tcp_state_transition,
.app_conn_bind = tcp_app_conn_bind,
.debug_packet = tcpudp_debug_packet,
.timeout_change = tcp_timeout_change,
.set_state_timeout = tcp_set_state_timeout,
};
/*
* ip_vs_proto_udp.c: UDP load balancing support for IPVS
*
* Version: $Id: ip_vs_proto_udp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Julian Anastasov <ja@ssi.bg>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
*
*/
#include <linux/kernel.h>
#include <linux/netfilter.h>
#include <net/ip_vs.h>
static struct ip_vs_conn *
udp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse)
{
struct ip_vs_conn *cp;
if (likely(!inverse)) {
cp = ip_vs_conn_in_get(iph->protocol,
iph->saddr, h.portp[0],
iph->daddr, h.portp[1]);
} else {
cp = ip_vs_conn_in_get(iph->protocol,
iph->daddr, h.portp[1],
iph->saddr, h.portp[0]);
}
return cp;
}
static struct ip_vs_conn *
udp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse)
{
struct ip_vs_conn *cp;
if (likely(!inverse)) {
cp = ip_vs_conn_out_get(iph->protocol,
iph->saddr, h.portp[0],
iph->daddr, h.portp[1]);
} else {
cp = ip_vs_conn_out_get(iph->protocol,
iph->daddr, h.portp[1],
iph->saddr, h.portp[0]);
}
return cp;
}
static int
udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h,
int *verdict, struct ip_vs_conn **cpp)
{
struct ip_vs_service *svc;
if ((svc = ip_vs_service_get(skb->nfmark, iph->protocol,
iph->daddr, h.portp[1]))) {
if (ip_vs_todrop()) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
*/
ip_vs_service_put(svc);
*verdict = NF_DROP;
return 0;
}
/*
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
*cpp = ip_vs_schedule(svc, iph);
if (!*cpp) {
*verdict = ip_vs_leave(svc, skb, pp, h);
return 0;
}
ip_vs_service_put(svc);
}
return 1;
}
static inline void
udp_fast_csum_update(union ip_vs_tphdr *h, u32 oldip, u32 newip,
u16 oldport, u16 newport)
{
h->uh->check =
ip_vs_check_diff(~oldip, newip,
ip_vs_check_diff(oldport ^ 0xFFFF,
newport, h->uh->check));
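/* RFC 768: a zero UDP checksum means "no checksum", so a
   computed value of zero is transmitted as all ones */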
if (!h->uh->check)
h->uh->check = 0xFFFF;
}
static int
udp_snat_handler(struct sk_buff *skb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
struct iphdr *iph, union ip_vs_tphdr h, int size)
{
int ihl = (char *) h.raw - (char *) iph;
/* We are sure that we work on the first fragment */
h.portp[0] = cp->vport;
/*
* Call application helper if needed
*/
if (ip_vs_app_pkt_out(cp, skb) != 0) {
/* skb data has probably changed, update pointers */
iph = skb->nh.iph;
h.raw = (char*)iph + ihl;
size = skb->len - ihl;
}
/*
* Adjust UDP checksums
*/
if (!cp->app && (h.uh->check != 0)) {
/* Only port and addr are changed, do fast csum update */
udp_fast_csum_update(&h, cp->daddr, cp->vaddr,
cp->dport, cp->vport);
if (skb->ip_summed == CHECKSUM_HW)
skb->ip_summed = CHECKSUM_NONE;
} else {
/* full checksum calculation */
h.uh->check = 0;
skb->csum = csum_partial(h.raw, size, 0);
h.uh->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
size, iph->protocol,
skb->csum);
if (h.uh->check == 0)
h.uh->check = 0xFFFF;
IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%d)\n",
pp->name, h.uh->check,
(char*)&(h.uh->check) - (char*)h.raw);
}
return 1;
}
static int
udp_dnat_handler(struct sk_buff *skb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
struct iphdr *iph, union ip_vs_tphdr h, int size)
{
int ihl = (char *) h.raw - (char *) iph;
/* We are sure that we work on the first fragment */
h.portp[1] = cp->dport;
/*
* Attempt ip_vs_app call.
 * It will fix ip_vs_conn and iph ack_seq stuff
*/
if (ip_vs_app_pkt_in(cp, skb) != 0) {
/* skb data has probably changed, update pointers */
iph = skb->nh.iph;
h.raw = (char*) iph + ihl;
size = skb->len - ihl;
}
/*
* Adjust UDP checksums
*/
if (!cp->app && (h.uh->check != 0)) {
/* Only port and addr are changed, do fast csum update */
udp_fast_csum_update(&h, cp->vaddr, cp->daddr,
cp->vport, cp->dport);
if (skb->ip_summed == CHECKSUM_HW)
skb->ip_summed = CHECKSUM_NONE;
} else {
/* full checksum calculation */
h.uh->check = 0;
h.uh->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
size, iph->protocol,
csum_partial(h.raw, size, 0));
if (h.uh->check == 0)
h.uh->check = 0xFFFF;
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
return 1;
}
static int
udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int size)
{
if (h.uh->check != 0) {
switch (skb->ip_summed) {
case CHECKSUM_NONE:
skb->csum = csum_partial(h.raw, size, 0);
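/* fall through and verify the checksum we just computed */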
case CHECKSUM_HW:
if (csum_tcpudp_magic(iph->saddr, iph->daddr, size,
iph->protocol, skb->csum)) {
IP_VS_DBG_RL_PKT(0, pp, iph,
"Failed checksum for");
return 0;
}
break;
default:
/* CHECKSUM_UNNECESSARY */
break;
}
}
return 1;
}
/*
* Note: the caller guarantees that only one of register_app,
* unregister_app or app_conn_bind is called each time.
*/
#define UDP_APP_TAB_BITS 4
#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS)
#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1)
static struct list_head udp_apps[UDP_APP_TAB_SIZE];
static spinlock_t udp_app_lock = SPIN_LOCK_UNLOCKED;
static inline __u16 udp_app_hashkey(__u16 port)
{
return ((port >> UDP_APP_TAB_BITS) ^ port) & UDP_APP_TAB_MASK;
}
static int udp_register_app(struct ip_vs_app *inc)
{
struct ip_vs_app *i;
struct list_head *t, *p;
__u16 hash, port = inc->port;
int ret = 0;
hash = udp_app_hashkey(port);
t = &udp_apps[hash];
spin_lock_bh(&udp_app_lock);
for (p = t->next; p != t; p = p->next) {
i = list_entry(p, struct ip_vs_app, p_list);
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
list_add(&inc->p_list, t);
atomic_inc(&ip_vs_protocol_udp.appcnt);
out:
spin_unlock_bh(&udp_app_lock);
return ret;
}
static void
udp_unregister_app(struct ip_vs_app *inc)
{
spin_lock_bh(&udp_app_lock);
atomic_dec(&ip_vs_protocol_udp.appcnt);
list_del(&inc->p_list);
spin_unlock_bh(&udp_app_lock);
}
static int udp_app_conn_bind(struct ip_vs_conn *cp)
{
struct list_head *t, *p;
int hash;
struct ip_vs_app *inc;
int result = 0;
/* Default binding: bind app only for NAT */
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
return 0;
/* Lookup application incarnations and bind the right one */
hash = udp_app_hashkey(cp->vport);
t = &udp_apps[hash];
spin_lock(&udp_app_lock);
for (p = t->next; p != t; p = p->next) {
inc = list_entry(p, struct ip_vs_app, p_list);
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
spin_unlock(&udp_app_lock);
IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
"%u.%u.%u.%u:%u to app %s on port %u\n",
__FUNCTION__,
NIPQUAD(cp->caddr), ntohs(cp->cport),
NIPQUAD(cp->vaddr), ntohs(cp->vport),
inc->name, ntohs(inc->port));
cp->app = inc;
if (inc->init_conn)
result = inc->init_conn(inc, cp);
goto out;
}
}
spin_unlock(&udp_app_lock);
out:
return result;
}
static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
[IP_VS_UDP_S_NORMAL] = 5*60*HZ,
[IP_VS_UDP_S_LAST] = 2*HZ,
};
static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
[IP_VS_UDP_S_NORMAL] = "UDP",
[IP_VS_UDP_S_LAST] = "BUG!",
};
static int
udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
{
return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
udp_state_name_table, sname, to);
}
static const char * udp_state_name(int state)
{
if (state >= IP_VS_UDP_S_LAST)
return "ERR!";
return udp_state_name_table[state] ? udp_state_name_table[state] : "?";
}
static int
udp_state_transition(struct ip_vs_conn *cp,
int direction, struct iphdr *iph,
union ip_vs_tphdr h, struct ip_vs_protocol *pp)
{
cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
return 1;
}
static void udp_init(struct ip_vs_protocol *pp)
{
IP_VS_INIT_HASH_TABLE(udp_apps);
pp->timeout_table = udp_timeouts;
}
static void udp_exit(struct ip_vs_protocol *pp)
{
}
extern void
tcpudp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg);
struct ip_vs_protocol ip_vs_protocol_udp = {
.name = "UDP",
.protocol = IPPROTO_UDP,
.minhlen = sizeof(struct udphdr),
.minhlen_icmp = 8,
.dont_defrag = 0,
.skip_nonexisting = 0,
.slave = 0,
.init = udp_init,
.exit = udp_exit,
.conn_schedule = udp_conn_schedule,
.conn_in_get = udp_conn_in_get,
.conn_out_get = udp_conn_out_get,
.snat_handler = udp_snat_handler,
.dnat_handler = udp_dnat_handler,
.csum_check = udp_csum_check,
.state_transition = udp_state_transition,
.state_name = udp_state_name,
.register_app = udp_register_app,
.unregister_app = udp_unregister_app,
.app_conn_bind = udp_app_conn_bind,
.debug_packet = tcpudp_debug_packet,
.timeout_change = NULL,
.set_state_timeout = udp_set_state_timeout,
};
/*
* IPVS: Round-Robin Scheduling module
*
* Version: $Id: ip_vs_rr.c,v 1.9 2002/09/15 08:14:08 wensong Exp $
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Peter Kese <peter.kese@ijs.si>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Fixes/Changes:
* Wensong Zhang : changed the ip_vs_rr_schedule to return dest
* Julian Anastasov : fixed the NULL pointer access bug in debugging
 * Wensong Zhang : changed some cosmetic things for debugging
* Wensong Zhang : changed for the d-linked destination list
* Wensong Zhang : added the ip_vs_rr_update_svc
* Wensong Zhang : added any dest with weight=0 is quiesced
*
*/
#include <linux/config.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <net/ip_vs.h>
static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
{
svc->sched_data = &svc->destinations;
return 0;
}
static int ip_vs_rr_done_svc(struct ip_vs_service *svc)
{
return 0;
}
static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
{
svc->sched_data = &svc->destinations;
return 0;
}
/*
* Round-Robin Scheduling
*/
static struct ip_vs_dest *
ip_vs_rr_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
register struct list_head *p, *q;
struct ip_vs_dest *dest;
IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n");
write_lock(&svc->sched_lock);
p = (struct list_head *)svc->sched_data;
p = p->next;
q = p;
do {
/* skip list head */
if (q == &svc->destinations) {
q = q->next;
continue;
}
dest = list_entry(q, struct ip_vs_dest, n_list);
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
atomic_read(&dest->weight) > 0)
/* HIT */
goto out;
q = q->next;
} while (q != p);
write_unlock(&svc->sched_lock);
return NULL;
out:
svc->sched_data = q;
write_unlock(&svc->sched_lock);
IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u "
"activeconns %d refcnt %d weight %d\n",
NIPQUAD(dest->addr), ntohs(dest->port),
atomic_read(&dest->activeconns),
atomic_read(&dest->refcnt), atomic_read(&dest->weight));
return dest;
}
static struct ip_vs_scheduler ip_vs_rr_scheduler = {
.name = "rr", /* name */
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
.init_service = ip_vs_rr_init_svc,
.done_service = ip_vs_rr_done_svc,
.update_service = ip_vs_rr_update_svc,
.schedule = ip_vs_rr_schedule,
};
static int __init ip_vs_rr_init(void)
{
INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list);
return register_ip_vs_scheduler(&ip_vs_rr_scheduler);
}
static void __exit ip_vs_rr_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
}
module_init(ip_vs_rr_init);
module_exit(ip_vs_rr_cleanup);
MODULE_LICENSE("GPL");
/*
* IPVS An implementation of the IP virtual server support for the
* LINUX operating system. IPVS is now implemented as a module
* over the Netfilter framework. IPVS can be used to build a
* high-performance and highly available server based on a
* cluster of servers.
*
* Version: $Id: ip_vs_sched.c,v 1.13 2003/05/10 03:05:23 wensong Exp $
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Peter Kese <peter.kese@ijs.si>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
*
*/
#include <linux/config.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <asm/string.h>
#include <linux/kmod.h>
#include <net/ip_vs.h>
/*
* IPVS scheduler list
*/
static LIST_HEAD(ip_vs_schedulers);
/* lock for the scheduler list */
static rwlock_t __ip_vs_sched_lock = RW_LOCK_UNLOCKED;
/*
* Bind a service with a scheduler
*/
int ip_vs_bind_scheduler(struct ip_vs_service *svc,
struct ip_vs_scheduler *scheduler)
{
int ret;
if (svc == NULL) {
IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n");
return -EINVAL;
}
if (scheduler == NULL) {
IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n");
return -EINVAL;
}
svc->scheduler = scheduler;
if (scheduler->init_service) {
ret = scheduler->init_service(svc);
if (ret) {
IP_VS_ERR("ip_vs_bind_scheduler(): init error\n");
return ret;
}
}
return 0;
}
/*
 * Unbind a service from its scheduler
*/
int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
{
struct ip_vs_scheduler *sched;
if (svc == NULL) {
IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n");
return -EINVAL;
}
sched = svc->scheduler;
if (sched == NULL) {
IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n");
return -EINVAL;
}
if (sched->done_service) {
if (sched->done_service(svc) != 0) {
IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n");
return -EINVAL;
}
}
svc->scheduler = NULL;
return 0;
}
/*
* Get scheduler in the scheduler list by name
*/
static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
{
struct ip_vs_scheduler *sched;
struct list_head *l, *e;
IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n",
sched_name);
l = &ip_vs_schedulers;
read_lock_bh(&__ip_vs_sched_lock);
for (e=l->next; e!=l; e=e->next) {
sched = list_entry(e, struct ip_vs_scheduler, n_list);
/*
 * Test and get the module reference atomically
*/
if (sched->module && !try_module_get(sched->module)) {
/*
 * This scheduler has just been deleted
*/
continue;
}
if (strcmp(sched_name, sched->name)==0) {
/* HIT */
read_unlock_bh(&__ip_vs_sched_lock);
return sched;
}
if (sched->module)
module_put(sched->module);
}
read_unlock_bh(&__ip_vs_sched_lock);
return NULL;
}
/*
* Lookup scheduler and try to load it if it doesn't exist
*/
struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
{
struct ip_vs_scheduler *sched;
/*
* Search for the scheduler by sched_name
*/
sched = ip_vs_sched_getbyname(sched_name);
/*
* If scheduler not found, load the module and search again
*/
if (sched == NULL) {
char module_name[IP_VS_SCHEDNAME_MAXLEN+8];
sprintf(module_name,"ip_vs_%s", sched_name);
request_module(module_name);
sched = ip_vs_sched_getbyname(sched_name);
}
return sched;
}
void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
{
if (scheduler->module)
module_put(scheduler->module);
}
/*
* Register a scheduler in the scheduler list
*/
int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
{
struct ip_vs_scheduler *sched;
if (!scheduler) {
IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n");
return -EINVAL;
}
if (!scheduler->name) {
IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n");
return -EINVAL;
}
/* increase the module use count */
ip_vs_use_count_inc();
/*
* Make sure that the scheduler with this name doesn't exist
* in the scheduler list.
*/
sched = ip_vs_sched_getbyname(scheduler->name);
if (sched) {
ip_vs_scheduler_put(sched);
ip_vs_use_count_dec();
IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
"already existed in the system\n", scheduler->name);
return -EINVAL;
}
write_lock_bh(&__ip_vs_sched_lock);
if (scheduler->n_list.next != &scheduler->n_list) {
write_unlock_bh(&__ip_vs_sched_lock);
ip_vs_use_count_dec();
IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
"already linked\n", scheduler->name);
return -EINVAL;
}
/*
* Add it into the d-linked scheduler list
*/
list_add(&scheduler->n_list, &ip_vs_schedulers);
write_unlock_bh(&__ip_vs_sched_lock);
IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name);
return 0;
}
/*
* Unregister a scheduler from the scheduler list
*/
int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
{
if (!scheduler) {
IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n");
return -EINVAL;
}
write_lock_bh(&__ip_vs_sched_lock);
if (scheduler->n_list.next == &scheduler->n_list) {
write_unlock_bh(&__ip_vs_sched_lock);
IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler "
"is not in the list. failed\n", scheduler->name);
return -EINVAL;
}
/*
* Remove it from the d-linked scheduler list
*/
list_del(&scheduler->n_list);
write_unlock_bh(&__ip_vs_sched_lock);
/* decrease the module use count */
ip_vs_use_count_dec();
IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name);
return 0;
}
/*
* IPVS: Shortest Expected Delay scheduling module
*
* Version: $Id: ip_vs_sed.c,v 1.1 2003/05/10 03:06:08 wensong Exp $
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
*
*/
/*
* The SED algorithm attempts to minimize each job's expected delay until
* completion. The expected delay that the job will experience is
* (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
 * jobs on the ith server and Ui is the fixed service rate (weight) of
 * the ith server. The SED algorithm adopts a greedy policy: each job does
 * what is in its own best interest, i.e. it joins the queue that would
 * minimize its expected delay of completion.
*
* See the following paper for more information:
* A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
* in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
* pages 986-994, 1988.
*
 * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED over with me.
*
* The difference between SED and WLC is that SED includes the incoming
* job in the cost function (the increment of 1). SED may outperform
 * WLC when scheduling big jobs in large heterogeneous systems
 * (where the server weights vary a lot).
*
*/
#include <linux/config.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <net/ip_vs.h>
static int
ip_vs_sed_init_svc(struct ip_vs_service *svc)
{
return 0;
}
static int
ip_vs_sed_done_svc(struct ip_vs_service *svc)
{
return 0;
}
static int
ip_vs_sed_update_svc(struct ip_vs_service *svc)
{
return 0;
}
static inline unsigned int
ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
{
/*
* We only use the active connection number in the cost
* calculation here.
*/
return atomic_read(&dest->activeconns) + 1;
}
/*
 * Shortest Expected Delay scheduling
*/
static struct ip_vs_dest *
ip_vs_sed_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
register struct list_head *l, *e;
struct ip_vs_dest *dest, *least;
unsigned int loh, doh;
IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n");
/*
* We calculate the load of each dest server as follows:
* (server expected overhead) / dest->weight
*
* Remember -- no floats in kernel mode!!!
* The comparison of h1*w2 > h2*w1 is equivalent to that of
* h1/w1 > h2/w2
* if every weight is larger than zero.
*
* The server with weight=0 is quiesced and will not receive any
* new connections.
*/
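/*
 * For example, with dest A (3 active conns, weight 2) and
 * dest B (2 active conns, weight 1): the overheads are 4 and 3,
 * and 4*1 < 3*2, so A is kept as the least loaded -- the same
 * result as comparing 4/2 against 3/1, but without division.
 */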
l = &svc->destinations;
for (e=l->next; e!=l; e=e->next) {
least = list_entry(e, struct ip_vs_dest, n_list);
if (!(least->flags & IP_VS_DEST_F_OVERLOAD) &&
atomic_read(&least->weight) > 0) {
loh = ip_vs_sed_dest_overhead(least);
goto nextstage;
}
}
return NULL;
/*
* Find the destination with the least load.
*/
nextstage:
for (e=e->next; e!=l; e=e->next) {
dest = list_entry(e, struct ip_vs_dest, n_list);
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
doh = ip_vs_sed_dest_overhead(dest);
if (loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight)) {
least = dest;
loh = doh;
}
}
IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u "
"activeconns %d refcnt %d weight %d overhead %d\n",
NIPQUAD(least->addr), ntohs(least->port),
atomic_read(&least->activeconns),
atomic_read(&least->refcnt),
atomic_read(&least->weight), loh);
return least;
}
static struct ip_vs_scheduler ip_vs_sed_scheduler =
{
.name = "sed",
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
.init_service = ip_vs_sed_init_svc,
.done_service = ip_vs_sed_done_svc,
.update_service = ip_vs_sed_update_svc,
.schedule = ip_vs_sed_schedule,
};
static int __init ip_vs_sed_init(void)
{
INIT_LIST_HEAD(&ip_vs_sed_scheduler.n_list);
return register_ip_vs_scheduler(&ip_vs_sed_scheduler);
}
static void __exit ip_vs_sed_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
}
module_init(ip_vs_sed_init);
module_exit(ip_vs_sed_cleanup);
MODULE_LICENSE("GPL");
/*
* IPVS: Source Hashing scheduling module
*
* Version: $Id: ip_vs_sh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $
*
* Authors: Wensong Zhang <wensong@gnuchina.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
*
*/
/*
* The sh algorithm is to select server by the hash key of source IP
* address. The pseudo code is as follows:
*
* n <- servernode[src_ip];
* if (n is dead) OR
* (n is overloaded) or (n.weight <= 0) then
* return NULL;
*
* return n;
*
 * Note that servernode is a 256-bucket hash table that maps the hash
 * index derived from the packet source IP address to the current server
 * array. If the sh scheduler is used in a cache cluster, it is good to
 * combine it with the cache_bypass feature. When the statically assigned
* server is dead or overloaded, the load balancer can bypass the cache
* server and send requests to the original server directly.
*
*/
#include <linux/config.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <net/ip_vs.h>
/*
* IPVS SH bucket
*/
struct ip_vs_sh_bucket {
struct ip_vs_dest *dest; /* real server (cache) */
};
/*
* for IPVS SH entry hash table
*/
#ifndef CONFIG_IP_VS_SH_TAB_BITS
#define CONFIG_IP_VS_SH_TAB_BITS 8
#endif
#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS
#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS)
#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1)
/*
* Returns hash value for IPVS SH entry
*/
static inline unsigned ip_vs_sh_hashkey(__u32 addr)
{
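/* 2654435761 is the classic multiplicative-hashing constant
   (close to 2^32 divided by the golden ratio), which spreads
   nearby addresses across the table before masking */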
return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK;
}
/*
* Get ip_vs_dest associated with supplied parameters.
*/
static inline struct ip_vs_dest *
ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __u32 addr)
{
return (tbl[ip_vs_sh_hashkey(addr)]).dest;
}
/*
 * Assign all the hash buckets of the specified table with the service's destinations.
*/
static int
ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
{
int i;
struct ip_vs_sh_bucket *b;
struct list_head *p;
struct ip_vs_dest *dest;
b = tbl;
p = &svc->destinations;
for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
if (list_empty(p)) {
b->dest = NULL;
} else {
if (p == &svc->destinations)
p = p->next;
dest = list_entry(p, struct ip_vs_dest, n_list);
atomic_inc(&dest->refcnt);
b->dest = dest;
p = p->next;
}
b++;
}
return 0;
}
/*
* Flush all the hash buckets of the specified table.
*/
static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
{
int i;
struct ip_vs_sh_bucket *b;
b = tbl;
for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
if (b->dest) {
atomic_dec(&b->dest->refcnt);
b->dest = NULL;
}
b++;
}
}
static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
{
struct ip_vs_sh_bucket *tbl;
/* allocate the SH table for this service */
tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
GFP_ATOMIC);
if (tbl == NULL) {
IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n");
return -ENOMEM;
}
svc->sched_data = tbl;
IP_VS_DBG(6, "SH hash table (memory=%dbytes) allocated for "
"current service\n",
sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
/* assign the hash buckets with the updated service */
ip_vs_sh_assign(tbl, svc);
return 0;
}
static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
{
struct ip_vs_sh_bucket *tbl = svc->sched_data;
/* got to clean up hash buckets here */
ip_vs_sh_flush(tbl);
/* release the table itself */
kfree(svc->sched_data);
IP_VS_DBG(6, "SH hash table (memory=%dbytes) released\n",
sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
return 0;
}
static int ip_vs_sh_update_svc(struct ip_vs_service *svc)
{
struct ip_vs_sh_bucket *tbl = svc->sched_data;
/* got to clean up hash buckets here */
ip_vs_sh_flush(tbl);
/* assign the hash buckets with the updated service */
ip_vs_sh_assign(tbl, svc);
return 0;
}
/*
 * If IP_VS_DEST_F_OVERLOAD is set in the dest flags, consider
 * that the server is overloaded.
*/
static inline int is_overloaded(struct ip_vs_dest *dest)
{
return dest->flags & IP_VS_DEST_F_OVERLOAD;
}
/*
* Source Hashing scheduling
*/
static struct ip_vs_dest *
ip_vs_sh_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
struct ip_vs_dest *dest;
struct ip_vs_sh_bucket *tbl;
IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
dest = ip_vs_sh_get(tbl, iph->saddr);
if (!dest
|| !(dest->flags & IP_VS_DEST_F_AVAILABLE)
|| atomic_read(&dest->weight) <= 0
|| is_overloaded(dest)) {
return NULL;
}
IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u "
"--> server %u.%u.%u.%u:%d\n",
NIPQUAD(iph->saddr),
NIPQUAD(dest->addr),
ntohs(dest->port));
return dest;
}
/*
* IPVS SH Scheduler structure
*/
static struct ip_vs_scheduler ip_vs_sh_scheduler =
{
.name = "sh",
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
.init_service = ip_vs_sh_init_svc,
.done_service = ip_vs_sh_done_svc,
.update_service = ip_vs_sh_update_svc,
.schedule = ip_vs_sh_schedule,
};
static int __init ip_vs_sh_init(void)
{
INIT_LIST_HEAD(&ip_vs_sh_scheduler.n_list);
return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
}
static void __exit ip_vs_sh_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
}
module_init(ip_vs_sh_init);
module_exit(ip_vs_sh_cleanup);
MODULE_LICENSE("GPL");
/*
* IPVS An implementation of the IP virtual server support for the
* LINUX operating system. IPVS is now implemented as a module
* over the NetFilter framework. IPVS can be used to build a
* high-performance and highly available server based on a
* cluster of servers.
*
* Version: $Id: ip_vs_sync.c,v 1.13 2003/06/08 09:31:19 wensong Exp $
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
* ip_vs_sync: sync connection info from master load balancer to backups
* through multicast
*
* Changes:
 * Alexandre Cassen : Added support for running master & backup at the same time.
 * Alexandre Cassen : Added SyncID support for filtering incoming
 * sync messages.
*/
#define __KERNEL_SYSCALLS__ /* for waitpid */
#include <linux/config.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/net.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/unistd.h>
#include <linux/completion.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/igmp.h> /* for ip_mc_join_group */
#include <net/ip.h>
#include <net/sock.h>
#include <asm/uaccess.h> /* for get_fs and set_fs */
#include <net/ip_vs.h>
#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
#define IP_VS_SYNC_PORT 8848 /* multicast port */
/*
* IPVS sync connection entry
*/
struct ip_vs_sync_conn {
__u8 reserved;
/* Protocol, addresses and port numbers */
__u8 protocol; /* Which protocol (TCP/UDP) */
__u16 cport;
__u16 vport;
__u16 dport;
__u32 caddr; /* client address */
__u32 vaddr; /* virtual address */
__u32 daddr; /* destination address */
/* Flags and state transition */
__u16 flags; /* status flags */
__u16 state; /* state info */
/* The sequence options start here */
};
struct ip_vs_sync_conn_options {
struct ip_vs_seq in_seq; /* incoming seq. struct */
struct ip_vs_seq out_seq; /* outgoing seq. struct */
};
#define IP_VS_SYNC_CONN_TIMEOUT (3*60*HZ)
#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn))
#define FULL_CONN_SIZE \
(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
/*
The master multicasts messages to the backup load balancers in the
following format.
0 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Count Conns | SyncID | Size |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| |
| IPVS Sync Connection (1) |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| . |
| . |
| . |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| |
| IPVS Sync Connection (n) |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*/
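/*
 * "Size" is the total message length in bytes, including this 4-byte
 * header; the backup discards any datagram whose length disagrees with it.
 */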
#define SYNC_MESG_HEADER_LEN 4
struct ip_vs_sync_mesg {
__u8 nr_conns;
__u8 syncid;
__u16 size;
/* ip_vs_sync_conn entries start here */
};
/* the maximum length of sync (sending/receiving) message */
static int sync_send_mesg_maxlen;
static int sync_recv_mesg_maxlen;
struct ip_vs_sync_buff {
struct list_head list;
unsigned long firstuse;
/* pointers for the message data */
struct ip_vs_sync_mesg *mesg;
unsigned char *head;
unsigned char *end;
};
/* the sync_buff list head and the lock */
static LIST_HEAD(ip_vs_sync_queue);
static spinlock_t ip_vs_sync_lock = SPIN_LOCK_UNLOCKED;
/* current sync_buff for accepting new conn entries */
static struct ip_vs_sync_buff *curr_sb = NULL;
static spinlock_t curr_sb_lock = SPIN_LOCK_UNLOCKED;
/* ipvs sync daemon state */
volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
volatile int ip_vs_master_syncid = 0;
volatile int ip_vs_backup_syncid = 0;
/* multicast interface name */
char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
/* multicast addr */
static struct sockaddr_in mcast_addr;
static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
{
spin_lock(&ip_vs_sync_lock);
list_add_tail(&sb->list, &ip_vs_sync_queue);
spin_unlock(&ip_vs_sync_lock);
}
static inline struct ip_vs_sync_buff * sb_dequeue(void)
{
struct ip_vs_sync_buff *sb;
spin_lock_bh(&ip_vs_sync_lock);
if (list_empty(&ip_vs_sync_queue)) {
sb = NULL;
} else {
sb = list_entry(ip_vs_sync_queue.next,
struct ip_vs_sync_buff,
list);
list_del(&sb->list);
}
spin_unlock_bh(&ip_vs_sync_lock);
return sb;
}
static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
{
struct ip_vs_sync_buff *sb;
if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
return NULL;
if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
kfree(sb);
return NULL;
}
sb->mesg->nr_conns = 0;
sb->mesg->syncid = ip_vs_master_syncid;
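/* start with just the 4-byte header (SYNC_MESG_HEADER_LEN) */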
sb->mesg->size = 4;
sb->head = (unsigned char *)sb->mesg + 4;
sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
sb->firstuse = jiffies;
return sb;
}
static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
{
kfree(sb->mesg);
kfree(sb);
}
/*
* Get the current sync buffer if it has been created for more
* than the specified time or the specified time is zero.
*/
static inline struct ip_vs_sync_buff *
get_curr_sync_buff(unsigned long time)
{
struct ip_vs_sync_buff *sb;
spin_lock_bh(&curr_sb_lock);
if (curr_sb &&
(jiffies - curr_sb->firstuse > time || time == 0)) {
sb = curr_sb;
curr_sb = NULL;
} else
sb = NULL;
spin_unlock_bh(&curr_sb_lock);
return sb;
}
/*
* Add an ip_vs_conn information into the current sync_buff.
* Called by ip_vs_in.
*/
void ip_vs_sync_conn(struct ip_vs_conn *cp)
{
struct ip_vs_sync_mesg *m;
struct ip_vs_sync_conn *s;
int len;
spin_lock(&curr_sb_lock);
if (!curr_sb) {
if (!(curr_sb=ip_vs_sync_buff_create())) {
spin_unlock(&curr_sb_lock);
IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
return;
}
}
len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
SIMPLE_CONN_SIZE;
m = curr_sb->mesg;
s = (struct ip_vs_sync_conn *)curr_sb->head;
/* copy members */
s->protocol = cp->protocol;
s->cport = cp->cport;
s->vport = cp->vport;
s->dport = cp->dport;
s->caddr = cp->caddr;
s->vaddr = cp->vaddr;
s->daddr = cp->daddr;
s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
s->state = htons(cp->state);
if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
struct ip_vs_sync_conn_options *opt =
(struct ip_vs_sync_conn_options *)&s[1];
memcpy(opt, &cp->in_seq, sizeof(*opt));
}
m->nr_conns++;
m->size += len;
curr_sb->head += len;
/* check if there is a space for next one */
if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
sb_queue_tail(curr_sb);
curr_sb = NULL;
}
spin_unlock(&curr_sb_lock);
/* synchronize its controller if it has one */
if (cp->control)
ip_vs_sync_conn(cp->control);
}
/*
* Process received multicast message and create the corresponding
* ip_vs_conn entries.
*/
static void ip_vs_process_message(const char *buffer, const size_t buflen)
{
struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
struct ip_vs_sync_conn *s;
struct ip_vs_sync_conn_options *opt;
struct ip_vs_conn *cp;
char *p;
int i;
if (buflen != m->size) {
IP_VS_ERR("bogus message\n");
return;
}
/* SyncID sanity check */
if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
m->syncid);
return;
}
p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
for (i=0; i<m->nr_conns; i++) {
s = (struct ip_vs_sync_conn *)p;
cp = ip_vs_conn_in_get(s->protocol,
s->caddr, s->cport,
s->vaddr, s->vport);
if (!cp) {
cp = ip_vs_conn_new(s->protocol,
s->caddr, s->cport,
s->vaddr, s->vport,
s->daddr, s->dport,
ntohs(s->flags), NULL);
if (!cp) {
IP_VS_ERR("ip_vs_conn_new failed\n");
return;
}
cp->state = ntohs(s->state);
} else if (!cp->dest) {
/* it is an entry created by the synchronization */
cp->state = ntohs(s->state);
cp->flags = ntohs(s->flags) | IP_VS_CONN_F_HASHED;
} /* Note that we don't touch its state and flags
if it is a normal entry. */
if (ntohs(s->flags) & IP_VS_CONN_F_SEQ_MASK) {
opt = (struct ip_vs_sync_conn_options *)&s[1];
memcpy(&cp->in_seq, opt, sizeof(*opt));
p += FULL_CONN_SIZE;
} else
p += SIMPLE_CONN_SIZE;
atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
cp->timeout = IP_VS_SYNC_CONN_TIMEOUT;
ip_vs_conn_put(cp);
if (p > buffer+buflen) {
IP_VS_ERR("bogus message\n");
return;
}
}
}
/*
* Setup loopback of outgoing multicasts on a sending socket
*/
static void set_mcast_loop(struct sock *sk, u_char loop)
{
struct inet_opt *inet = inet_sk(sk);
/* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
lock_sock(sk);
inet->mc_loop = loop ? 1 : 0;
release_sock(sk);
}
/*
* Specify TTL for outgoing multicasts on a sending socket
*/
static void set_mcast_ttl(struct sock *sk, u_char ttl)
{
struct inet_opt *inet = inet_sk(sk);
/* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
lock_sock(sk);
inet->mc_ttl = ttl;
release_sock(sk);
}
/*
 * Specify the default interface for outgoing multicasts
*/
static int set_mcast_if(struct sock *sk, char *ifname)
{
struct net_device *dev;
struct inet_opt *inet = inet_sk(sk);
if ((dev = __dev_get_by_name(ifname)) == NULL)
return -ENODEV;
if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
return -EINVAL;
lock_sock(sk);
inet->mc_index = dev->ifindex;
/* inet->mc_addr = 0; */
release_sock(sk);
return 0;
}
/*
* Set the maximum length of sync message according to the
* specified interface's MTU.
*/
static int set_sync_mesg_maxlen(int sync_state)
{
struct net_device *dev;
int num;
if (sync_state == IP_VS_STATE_MASTER) {
if ((dev = __dev_get_by_name(ip_vs_master_mcast_ifn)) == NULL)
return -ENODEV;
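/* connections per datagram: MTU minus the IP, UDP and sync
   headers; the extra 20 bytes appear to be a safety margin
   (e.g. for IP options) */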
num = (dev->mtu - sizeof(struct iphdr) -
sizeof(struct udphdr) -
SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
sync_send_mesg_maxlen =
SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num;
IP_VS_DBG(7, "setting the maximum length of sync sending "
"message %d.\n", sync_send_mesg_maxlen);
} else if (sync_state == IP_VS_STATE_BACKUP) {
if ((dev = __dev_get_by_name(ip_vs_backup_mcast_ifn)) == NULL)
return -ENODEV;
sync_recv_mesg_maxlen = dev->mtu -
sizeof(struct iphdr) - sizeof(struct udphdr);
IP_VS_DBG(7, "setting the maximum length of sync receiving "
"message %d.\n", sync_recv_mesg_maxlen);
}
return 0;
}
/*
* Join a multicast group.
 * The group is specified by a class D multicast address (224.0.0.0/4)
 * in the in_addr structure passed in as a parameter.
*/
static int
join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
{
struct ip_mreqn mreq;
struct net_device *dev;
int ret;
memset(&mreq, 0, sizeof(mreq));
memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
if ((dev = __dev_get_by_name(ifname)) == NULL)
return -ENODEV;
if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
return -EINVAL;
mreq.imr_ifindex = dev->ifindex;
lock_sock(sk);
ret = ip_mc_join_group(sk, &mreq);
release_sock(sk);
return ret;
}
static int bind_mcastif_addr(struct socket *sock, char *ifname)
{
struct net_device *dev;
u32 addr;
struct sockaddr_in sin;
if ((dev = __dev_get_by_name(ifname)) == NULL)
return -ENODEV;
addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
if (!addr)
IP_VS_ERR("You probably need to specify IP address on "
"multicast interface.\n");
IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n",
ifname, NIPQUAD(addr));
/* Now bind the socket with the address of multicast interface */
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = addr;
sin.sin_port = 0;
return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
}
/*
* Set up sending multicast socket over UDP
*/
static struct socket * make_send_sock(void)
{
struct socket *sock;
/* First create a socket */
if (sock_create(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
IP_VS_ERR("Error during creation of socket; terminating\n");
return NULL;
}
if (set_mcast_if(sock->sk, ip_vs_master_mcast_ifn) < 0) {
IP_VS_ERR("Error setting outbound mcast interface\n");
goto error;
}
set_mcast_loop(sock->sk, 0);
set_mcast_ttl(sock->sk, 1);
if (bind_mcastif_addr(sock, ip_vs_master_mcast_ifn) < 0) {
IP_VS_ERR("Error binding address of the mcast interface\n");
goto error;
}
if (sock->ops->connect(sock,
(struct sockaddr*)&mcast_addr,
sizeof(struct sockaddr), 0) < 0) {
IP_VS_ERR("Error connecting to the multicast addr\n");
goto error;
}
return sock;
error:
sock_release(sock);
return NULL;
}
/*
* Set up receiving multicast socket over UDP
*/
static struct socket * make_receive_sock(void)
{
struct socket *sock;
/* First create a socket */
if (sock_create(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
IP_VS_ERR("Error during creation of socket; terminating\n");
return NULL;
}
/* it is equivalent to the REUSEADDR option in user-space */
sock->sk->sk_reuse = 1;
if (sock->ops->bind(sock,
(struct sockaddr*)&mcast_addr,
sizeof(struct sockaddr)) < 0) {
IP_VS_ERR("Error binding to the multicast addr\n");
goto error;
}
/* join the multicast group */
if (join_mcast_group(sock->sk,
(struct in_addr*)&mcast_addr.sin_addr,
ip_vs_backup_mcast_ifn) < 0) {
IP_VS_ERR("Error joining to the multicast group\n");
goto error;
}
return sock;
error:
sock_release(sock);
return NULL;
}
static int
ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
{
struct msghdr msg;
mm_segment_t oldfs;
struct iovec iov;
int len;
EnterFunction(7);
iov.iov_base = (void *)buffer;
iov.iov_len = length;
msg.msg_name = 0;
msg.msg_namelen = 0;
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL;
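/* temporarily lift the user-space address limit so that
   sock_sendmsg accepts this kernel buffer */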
oldfs = get_fs(); set_fs(KERNEL_DS);
len = sock_sendmsg(sock, &msg, (size_t)(length));
set_fs(oldfs);
LeaveFunction(7);
return len;
}
static int
ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
{
struct msghdr msg;
struct iovec iov;
int len;
mm_segment_t oldfs;
EnterFunction(7);
/* Receive a packet */
iov.iov_base = buffer;
iov.iov_len = (size_t)buflen;
msg.msg_name = 0;
msg.msg_namelen = 0;
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
oldfs = get_fs(); set_fs(KERNEL_DS);
len = sock_recvmsg(sock, &msg, buflen, 0);
set_fs(oldfs);
if (len < 0)
return -1;
LeaveFunction(7);
return len;
}
static int errno;
static DECLARE_WAIT_QUEUE_HEAD(sync_wait);
static pid_t sync_master_pid = 0;
static pid_t sync_backup_pid = 0;
static DECLARE_WAIT_QUEUE_HEAD(stop_sync_wait);
static int stop_master_sync = 0;
static int stop_backup_sync = 0;
static void sync_master_loop(void)
{
struct socket *sock;
struct ip_vs_sync_buff *sb;
struct ip_vs_sync_mesg *m;
/* create the sending multicast socket */
sock = make_send_sock();
if (!sock)
return;
IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, "
"syncid = %d\n",
ip_vs_master_mcast_ifn, ip_vs_master_syncid);
for (;;) {
while ((sb=sb_dequeue())) {
m = sb->mesg;
if (ip_vs_send_async(sock, (char *)m,
m->size) != m->size)
IP_VS_ERR("ip_vs_send_async error\n");
ip_vs_sync_buff_release(sb);
}
/* check if entries stay in curr_sb for 2 seconds */
if ((sb = get_curr_sync_buff(2*HZ))) {
m = sb->mesg;
if (ip_vs_send_async(sock, (char *)m,
m->size) != m->size)
IP_VS_ERR("ip_vs_send_async error\n");
ip_vs_sync_buff_release(sb);
}
if (stop_master_sync)
break;
__set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(HZ);
__set_current_state(TASK_RUNNING);
}
/* clean up the sync_buff queue */
while ((sb=sb_dequeue())) {
ip_vs_sync_buff_release(sb);
}
/* clean up the current sync_buff */
if ((sb = get_curr_sync_buff(0))) {
ip_vs_sync_buff_release(sb);
}
/* release the sending multicast socket */
sock_release(sock);
}
static void sync_backup_loop(void)
{
struct socket *sock;
char *buf;
int len;
if (!(buf = kmalloc(sync_recv_mesg_maxlen, GFP_ATOMIC))) {
IP_VS_ERR("sync_backup_loop: kmalloc error\n");
return;
}
/* create the receiving multicast socket */
sock = make_receive_sock();
if (!sock)
goto out;
IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, "
"syncid = %d\n",
ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
for (;;) {
/* do you have data now? */
while (!skb_queue_empty(&(sock->sk->sk_receive_queue))) {
if ((len =
ip_vs_receive(sock, buf,
sync_recv_mesg_maxlen)) <= 0) {
IP_VS_ERR("receiving message error\n");
break;
}
/* disable bottom halves, because message processing accesses
   data shared with softirqs while getting/creating conns */
local_bh_disable();
ip_vs_process_message(buf, len);
local_bh_enable();
}
if (stop_backup_sync)
break;
__set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(HZ);
__set_current_state(TASK_RUNNING);
}
/* release the sending multicast socket */
sock_release(sock);
out:
kfree(buf);
}
static void set_sync_pid(int sync_state, pid_t sync_pid)
{
if (sync_state == IP_VS_STATE_MASTER)
sync_master_pid = sync_pid;
else if (sync_state == IP_VS_STATE_BACKUP)
sync_backup_pid = sync_pid;
}
static void set_stop_sync(int sync_state, int set)
{
if (sync_state == IP_VS_STATE_MASTER)
stop_master_sync = set;
else if (sync_state == IP_VS_STATE_BACKUP)
stop_backup_sync = set;
else {
stop_master_sync = set;
stop_backup_sync = set;
}
}
static int sync_thread(void *startup)
{
DECLARE_WAITQUEUE(wait, current);
mm_segment_t oldmm;
int state;
const char *name;
/* increase the module use count */
ip_vs_use_count_inc();
if (ip_vs_sync_state & IP_VS_STATE_MASTER && !sync_master_pid) {
state = IP_VS_STATE_MASTER;
name = "ipvs syncmaster";
} else if (ip_vs_sync_state & IP_VS_STATE_BACKUP && !sync_backup_pid) {
state = IP_VS_STATE_BACKUP;
name = "ipvs syncbackup";
} else {
IP_VS_BUG();
ip_vs_use_count_dec();
return -EINVAL;
}
daemonize(name);
oldmm = get_fs();
set_fs(KERNEL_DS);
/* Block all signals */
spin_lock_irq(&current->sighand->siglock);
siginitsetinv(&current->blocked, 0);
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
/* set the maximum length of sync message */
set_sync_mesg_maxlen(state);
/* set up multicast address */
mcast_addr.sin_family = AF_INET;
mcast_addr.sin_port = htons(IP_VS_SYNC_PORT);
mcast_addr.sin_addr.s_addr = htonl(IP_VS_SYNC_GROUP);
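/* register on sync_wait, record our pid and tell start_sync_thread()
   that initialization is complete */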
add_wait_queue(&sync_wait, &wait);
set_sync_pid(state, current->pid);
complete((struct completion *)startup);
/* processing master/backup loop here */
if (state == IP_VS_STATE_MASTER)
sync_master_loop();
else if (state == IP_VS_STATE_BACKUP)
sync_backup_loop();
else IP_VS_BUG();
remove_wait_queue(&sync_wait, &wait);
/* thread exits */
set_sync_pid(state, 0);
IP_VS_INFO("sync thread stopped!\n");
set_fs(oldmm);
/* decrease the module use count */
ip_vs_use_count_dec();
set_stop_sync(state, 0);
wake_up(&stop_sync_wait);
return 0;
}
static int fork_sync_thread(void *startup)
{
/* fork the sync thread here, so that the parent of the sync
   thread becomes the init process after this thread exits. */
if (kernel_thread(sync_thread, startup, 0) < 0)
IP_VS_BUG();
return 0;
}
int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
{
DECLARE_COMPLETION(startup);
pid_t pid;
int waitpid_result;
if ((state == IP_VS_STATE_MASTER && sync_master_pid) ||
(state == IP_VS_STATE_BACKUP && sync_backup_pid))
return -EEXIST;
IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
IP_VS_DBG(7, "Each ip_vs_sync_conn entry need %d bytes\n",
sizeof(struct ip_vs_sync_conn));
ip_vs_sync_state |= state;
if (state == IP_VS_STATE_MASTER) {
strcpy(ip_vs_master_mcast_ifn, mcast_ifn);
ip_vs_master_syncid = syncid;
} else {
strcpy(ip_vs_backup_mcast_ifn, mcast_ifn);
ip_vs_backup_syncid = syncid;
}
if ((pid = kernel_thread(fork_sync_thread, &startup, 0)) < 0)
IP_VS_BUG();
if ((waitpid_result = waitpid(pid, NULL, __WCLONE)) != pid) {
IP_VS_ERR("%s: waitpid(%d,...) failed, errno %d\n",
__FUNCTION__, pid, -waitpid_result);
}
wait_for_completion(&startup);
return 0;
}
int stop_sync_thread(int state)
{
DECLARE_WAITQUEUE(wait, current);
if ((state == IP_VS_STATE_MASTER && !sync_master_pid) ||
(state == IP_VS_STATE_BACKUP && !sync_backup_pid))
return -ESRCH;
IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
IP_VS_INFO("stopping sync thread %d ...\n",
(state == IP_VS_STATE_MASTER) ? sync_master_pid : sync_backup_pid);
__set_current_state(TASK_UNINTERRUPTIBLE);
add_wait_queue(&stop_sync_wait, &wait);
set_stop_sync(state, 1);
ip_vs_sync_state -= state;
wake_up(&sync_wait);
schedule();
__set_current_state(TASK_RUNNING);
remove_wait_queue(&stop_sync_wait, &wait);
/* Note: no need to reap the sync thread, because its parent
process is the init process */
if ((state == IP_VS_STATE_MASTER && stop_master_sync) ||
(state == IP_VS_STATE_BACKUP && stop_backup_sync))
IP_VS_BUG();
return 0;
}
/*
* IPVS: Weighted Least-Connection Scheduling module
*
* Version: $Id: ip_vs_wlc.c,v 1.13 2003/04/18 09:03:16 wensong Exp $
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Peter Kese <peter.kese@ijs.si>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
* Wensong Zhang : changed the ip_vs_wlc_schedule to return dest
* Wensong Zhang : changed to use the inactconns in scheduling
* Wensong Zhang : changed some cosmetic things for debugging
* Wensong Zhang : changed for the d-linked destination list
* Wensong Zhang : added the ip_vs_wlc_update_svc
* Wensong Zhang : added any dest with weight=0 is quiesced
*
*/
#include <linux/config.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <net/ip_vs.h>
static int
ip_vs_wlc_init_svc(struct ip_vs_service *svc)
{
return 0;
}
static int
ip_vs_wlc_done_svc(struct ip_vs_service *svc)
{
return 0;
}
static int
ip_vs_wlc_update_svc(struct ip_vs_service *svc)
{
return 0;
}
static inline unsigned int
ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
{
/*
* We think the overhead of processing active connections is 256
* times higher than that of inactive connections on average. (This
* factor of 256 may not be accurate; we may change it later.) We
* use the following formula to estimate the overhead for now:
* dest->activeconns*256 + dest->inactconns
*/
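/*
 * Illustrative example: a destination with 3 active and 100 inactive
 * connections gets an overhead of 3*256 + 100 = 868.
 */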
return (atomic_read(&dest->activeconns) << 8) +
atomic_read(&dest->inactconns);
}
/*
* Weighted Least Connection scheduling
*/
static struct ip_vs_dest *
ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
register struct list_head *l, *e;
struct ip_vs_dest *dest, *least;
unsigned int loh, doh;
IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
/*
* We calculate the load of each dest server as follows:
* (dest overhead) / dest->weight
*
* Remember -- no floats in kernel mode!!!
* The comparison of h1*w2 > h2*w1 is equivalent to that of
* h1/w1 > h2/w2
* if every weight is larger than zero.
*
* The server with weight=0 is quiesced and will not receive any
* new connections.
*/
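/*
 * Illustrative example: if the current candidate has overhead 868 and
 * weight 3, and another dest has overhead 512 and weight 2, then
 * 868*2 = 1736 > 512*3 = 1536, i.e. 512/2 < 868/3, so the second dest
 * carries less load per weight unit and becomes the new choice.
 */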
l = &svc->destinations;
for (e=l->next; e!=l; e=e->next) {
least = list_entry(e, struct ip_vs_dest, n_list);
if (!(least->flags & IP_VS_DEST_F_OVERLOAD) &&
atomic_read(&least->weight) > 0) {
loh = ip_vs_wlc_dest_overhead(least);
goto nextstage;
}
}
return NULL;
/*
* Find the destination with the least load.
*/
nextstage:
for (e=e->next; e!=l; e=e->next) {
dest = list_entry(e, struct ip_vs_dest, n_list);
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
doh = ip_vs_wlc_dest_overhead(dest);
if (loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight)) {
least = dest;
loh = doh;
}
}
IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u "
"activeconns %d refcnt %d weight %d overhead %d\n",
NIPQUAD(least->addr), ntohs(least->port),
atomic_read(&least->activeconns),
atomic_read(&least->refcnt),
atomic_read(&least->weight), loh);
return least;
}
static struct ip_vs_scheduler ip_vs_wlc_scheduler =
{
.name = "wlc",
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
.init_service = ip_vs_wlc_init_svc,
.done_service = ip_vs_wlc_done_svc,
.update_service = ip_vs_wlc_update_svc,
.schedule = ip_vs_wlc_schedule,
};
static int __init ip_vs_wlc_init(void)
{
INIT_LIST_HEAD(&ip_vs_wlc_scheduler.n_list);
return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
}
static void __exit ip_vs_wlc_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
}
module_init(ip_vs_wlc_init);
module_exit(ip_vs_wlc_cleanup);
MODULE_LICENSE("GPL");
/*
* IPVS: Weighted Round-Robin Scheduling module
*
* Version: $Id: ip_vs_wrr.c,v 1.12 2002/09/15 08:14:08 wensong Exp $
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
* Wensong Zhang : changed the ip_vs_wrr_schedule to return dest
* Wensong Zhang : changed some cosmetic things for debugging
* Wensong Zhang : changed for the d-linked destination list
* Wensong Zhang : added the ip_vs_wrr_update_svc
* Julian Anastasov : fixed the bug of returning destination
* with weight 0 when all weights are zero
*
*/
#include <linux/config.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <net/ip_vs.h>
/*
* current destination pointer for weighted round-robin scheduling
*/
struct ip_vs_wrr_mark {
struct list_head *cl; /* current list head */
int cw; /* current weight */
int mw; /* maximum weight */
int di; /* decreasing interval */
};
/*
* Get the gcd of server weights
*/
static int gcd(int a, int b)
{
int c;
while ((c = a % b)) {
a = b;
b = c;
}
return b;
}
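/*
 * Example: gcd(12, 8): 12 % 8 = 4, 8 % 4 = 0, so the result is 4.
 * Callers must ensure b > 0; only positive weights are passed below.
 */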
static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
{
register struct list_head *l, *e;
struct ip_vs_dest *dest;
int weight;
int g = 1;
l = &svc->destinations;
for (e=l->next; e!=l; e=e->next) {
dest = list_entry(e, struct ip_vs_dest, n_list);
weight = atomic_read(&dest->weight);
if (weight > 0) {
g = weight;
break;
}
}
if (e == l)
return g;
for (e=e->next; e!=l; e=e->next) {
dest = list_entry(e, struct ip_vs_dest, n_list);
weight = atomic_read(&dest->weight);
if (weight > 0)
g = gcd(weight, g);
}
return g;
}
/*
* Get the maximum weight of the service destinations.
*/
static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
{
register struct list_head *l, *e;
struct ip_vs_dest *dest;
int weight = 0;
l = &svc->destinations;
for (e=l->next; e!=l; e=e->next) {
dest = list_entry(e, struct ip_vs_dest, n_list);
if (atomic_read(&dest->weight) > weight)
weight = atomic_read(&dest->weight);
}
return weight;
}
static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
{
struct ip_vs_wrr_mark *mark;
/*
* Allocate the mark variable for WRR scheduling
*/
mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
if (mark == NULL) {
IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n");
return -ENOMEM;
}
mark->cl = &svc->destinations;
mark->cw = 0;
mark->mw = ip_vs_wrr_max_weight(svc);
mark->di = ip_vs_wrr_gcd_weight(svc);
svc->sched_data = mark;
return 0;
}
static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
{
/*
* Release the mark variable
*/
kfree(svc->sched_data);
return 0;
}
static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
{
struct ip_vs_wrr_mark *mark = svc->sched_data;
mark->cl = &svc->destinations;
mark->mw = ip_vs_wrr_max_weight(svc);
mark->di = ip_vs_wrr_gcd_weight(svc);
return 0;
}
/*
* Weighted Round-Robin Scheduling
*/
static struct ip_vs_dest *
ip_vs_wrr_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
struct ip_vs_dest *dest;
struct ip_vs_wrr_mark *mark = svc->sched_data;
struct list_head *p;
IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
/*
* This loop will always terminate, because mark->cw in (0, max_weight]
* and at least one server has its weight equal to max_weight.
*/
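/*
 * Illustrative example: two dests with weights A=2 and B=1 give
 * mw=2 and di=gcd(2,1)=1; successive calls then yield the cycle
 * A, A, B, A, A, B, ...
 */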
write_lock(&svc->sched_lock);
p = mark->cl;
while (1) {
if (mark->cl == &svc->destinations) {
/* it is at the head of the destination list */
if (mark->cl == mark->cl->next) {
/* no dest entry */
dest = NULL;
goto out;
}
mark->cl = svc->destinations.next;
mark->cw -= mark->di;
if (mark->cw <= 0) {
mark->cw = mark->mw;
/*
* Still zero, which means no available servers.
*/
if (mark->cw == 0) {
mark->cl = &svc->destinations;
IP_VS_INFO("ip_vs_wrr_schedule(): "
"no available servers\n");
dest = NULL;
goto out;
}
}
} else
mark->cl = mark->cl->next;
if (mark->cl != &svc->destinations) {
/* not at the head of the list */
dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
atomic_read(&dest->weight) >= mark->cw) {
/* got it */
break;
}
}
if (mark->cl == p) {
/* back to the start and no dest has been found;
   this is only possible when all dests are OVERLOADED */
dest = NULL;
goto out;
}
}
IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u "
"activeconns %d refcnt %d weight %d\n",
NIPQUAD(dest->addr), ntohs(dest->port),
atomic_read(&dest->activeconns),
atomic_read(&dest->refcnt),
atomic_read(&dest->weight));
out:
write_unlock(&svc->sched_lock);
return dest;
}
static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
.name = "wrr",
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
.init_service = ip_vs_wrr_init_svc,
.done_service = ip_vs_wrr_done_svc,
.update_service = ip_vs_wrr_update_svc,
.schedule = ip_vs_wrr_schedule,
};
static int __init ip_vs_wrr_init(void)
{
INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list);
return register_ip_vs_scheduler(&ip_vs_wrr_scheduler);
}
static void __exit ip_vs_wrr_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
}
module_init(ip_vs_wrr_init);
module_exit(ip_vs_wrr_cleanup);
MODULE_LICENSE("GPL");
/*
* ip_vs_xmit.c: various packet transmitters for IPVS
*
* Version: $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Julian Anastasov <ja@ssi.bg>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
*
*/
#include <linux/config.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/compiler.h>
#include <linux/ip.h>
#include <linux/tcp.h> /* for tcphdr */
#include <net/tcp.h> /* for csum_tcpudp_magic */
#include <net/udp.h>
#include <net/icmp.h> /* for icmp_send */
#include <net/route.h> /* for ip_route_output */
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <net/ip_vs.h>
/*
* Destination cache to speed up outgoing route lookup
*/
static inline void
__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
{
struct dst_entry *old_dst;
old_dst = dest->dst_cache;
dest->dst_cache = dst;
dest->dst_rtos = rtos;
dst_release(old_dst);
}
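/*
 * Return the cached route (with its refcount bumped) if it is still
 * valid for the requested TOS; otherwise drop the cache and return
 * NULL so the caller performs a fresh route lookup.
 */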
static inline struct dst_entry *
__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
{
struct dst_entry *dst = dest->dst_cache;
if (!dst)
return NULL;
if ((dst->obsolete || rtos != dest->dst_rtos) &&
dst->ops->check(dst, cookie) == NULL) {
dest->dst_cache = NULL;
return NULL;
}
dst_hold(dst);
return dst;
}
static inline struct rtable *
__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
{
struct rtable *rt; /* Route to the other host */
struct ip_vs_dest *dest = cp->dest;
if (dest) {
spin_lock(&dest->dst_lock);
if (!(rt = (struct rtable *)
__ip_vs_dst_check(dest, rtos, 0))) {
struct flowi fl = {
.oif = 0,
.nl_u = {
.ip4_u = {
.daddr = dest->addr,
.saddr = 0,
.tos = rtos, } },
.proto = cp->protocol,
};
if (ip_route_output_key(&rt, &fl)) {
spin_unlock(&dest->dst_lock);
IP_VS_DBG_RL("ip_route_output error, "
"dest: %u.%u.%u.%u\n",
NIPQUAD(dest->addr));
return NULL;
}
__ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
NIPQUAD(dest->addr),
atomic_read(&rt->u.dst.__refcnt), rtos);
}
spin_unlock(&dest->dst_lock);
} else {
struct flowi fl = {
.oif = 0,
.nl_u = {
.ip4_u = {
.daddr = cp->daddr,
.saddr = 0,
.tos = rtos, } },
.proto = cp->protocol,
};
if (ip_route_output_key(&rt, &fl)) {
IP_VS_DBG_RL("ip_route_output error, dest: "
"%u.%u.%u.%u\n", NIPQUAD(cp->daddr));
return NULL;
}
}
return rt;
}
/*
* Release dest->dst_cache before a dest is removed
*/
void
ip_vs_dst_reset(struct ip_vs_dest *dest)
{
struct dst_entry *old_dst;
old_dst = dest->dst_cache;
dest->dst_cache = NULL;
dst_release(old_dst);
}
static inline int
ip_vs_skb_cow(struct sk_buff *skb, unsigned int headroom,
struct iphdr **iph_p, unsigned char **t_p)
{
int delta = (headroom > 16 ? headroom : 16) - skb_headroom(skb);
if (delta < 0)
delta = 0;
if (delta || skb_cloned(skb)) {
if (pskb_expand_head(skb, (delta+15)&~15, 0, GFP_ATOMIC))
return -ENOMEM;
/* skb data changed, update pointers */
*iph_p = skb->nh.iph;
*t_p = (char*) (*iph_p) + (*iph_p)->ihl * 4;
}
return 0;
}
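/*
 * Tag the skb as handled by IPVS (NFC_IPVS_PROPERTY) and re-inject it
 * at the LOCAL_OUT netfilter hook, with dst_output() as the output
 * function for the attached route.
 */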
#define IP_VS_XMIT(skb, rt) \
do { \
skb->nfcache |= NFC_IPVS_PROPERTY; \
NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, \
rt->u.dst.dev, dst_output); \
} while (0)
/*
* NULL transmitter (do nothing except return NF_ACCEPT)
*/
int
ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp)
{
return NF_ACCEPT;
}
/*
* Bypass transmitter
* Let packets bypass the destination when the destination is not
* available; it may only be used in a transparent cache cluster.
*/
int
ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp)
{
struct rtable *rt; /* Route to the other host */
struct iphdr *iph = skb->nh.iph;
u8 tos = iph->tos;
int mtu;
struct flowi fl = {
.oif = 0,
.nl_u = {
.ip4_u = {
.daddr = iph->daddr,
.saddr = 0,
.tos = RT_TOS(tos), } },
.proto = iph->protocol,
};
EnterFunction(10);
if (ip_route_output_key(&rt, &fl)) {
IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
"dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
goto tx_error_icmp;
}
/* MTU checking */
mtu = dst_pmtu(&rt->u.dst);
if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
ip_rt_put(rt);
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
goto tx_error;
}
if (skb_is_nonlinear(skb) && skb->len <= mtu)
ip_send_check(iph);
if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) {
if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) {
ip_rt_put(rt);
IP_VS_ERR_RL("ip_vs_bypass_xmit(): no memory\n");
goto tx_error;
}
}
/* drop old route */
dst_release(skb->dst);
skb->dst = &rt->u.dst;
#ifdef CONFIG_NETFILTER_DEBUG
skb->nf_debug = 0;
#endif /* CONFIG_NETFILTER_DEBUG */
IP_VS_XMIT(skb, rt);
LeaveFunction(10);
return NF_STOLEN;
tx_error_icmp:
dst_link_failure(skb);
tx_error:
kfree_skb(skb);
return NF_STOLEN;
}
/*
* NAT transmitter (only for outside-to-inside nat forwarding)
* Not used for related ICMP
*/
int
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp)
{
struct rtable *rt; /* Route to the other host */
struct iphdr *iph;
union ip_vs_tphdr h;
int ihl;
unsigned short size;
int mtu;
EnterFunction(10);
/*
* If it has ip_vs_app helper, the helper may change the payload,
* so it needs full checksum checking and checksum calculation.
* If not, only the header (such as IP address and port number)
* will be changed, so a fast incremental checksum update is enough,
* and the destination host does the final checksum checking.
*/
if (unlikely(cp->app && !pp->slave)) {
if (skb_is_nonlinear(skb) &&
skb_linearize(skb, GFP_ATOMIC) != 0)
return NF_DROP;
}
iph = skb->nh.iph;
ihl = iph->ihl << 2;
h.raw = (char*) iph + ihl;
size = ntohs(iph->tot_len) - ihl;
/* do TCP/UDP checksum checking if it has application helper */
if (unlikely(cp->app && pp->csum_check && !pp->slave)) {
if (!pp->csum_check(skb, pp, iph, h, size))
goto tx_error;
}
/*
* Check if it is a no-client-port connection ...
*/
if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
ip_vs_conn_fill_cport(cp, h.portp[0]);
IP_VS_DBG(10, "filled cport=%d\n", ntohs(cp->dport));
}
if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
goto tx_error_icmp;
/* MTU checking */
mtu = dst_pmtu(&rt->u.dst);
if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
ip_rt_put(rt);
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL_PKT(0, pp, iph, "ip_vs_nat_xmit(): frag needed for");
goto tx_error;
}
/* drop old route */
dst_release(skb->dst);
skb->dst = &rt->u.dst;
/* copy-on-write the packet before mangling it */
if (ip_vs_skb_cow(skb, rt->u.dst.dev->hard_header_len, &iph, &h.raw))
return NF_DROP;
/* mangle the packet */
iph->daddr = cp->daddr;
if (pp->dnat_handler) {
pp->dnat_handler(skb, pp, cp, iph, h, size);
iph = skb->nh.iph;
h.raw = (char*) iph + ihl;
}
ip_send_check(iph);
IP_VS_DBG_PKT(10, pp, iph, "After DNAT");
/* FIXME: when the application helper enlarges the packet and the
   length becomes larger than the MTU of the outgoing device, there
   will still be an MTU problem. */
#ifdef CONFIG_NETFILTER_DEBUG
skb->nf_debug = 0;
#endif /* CONFIG_NETFILTER_DEBUG */
IP_VS_XMIT(skb, rt);
LeaveFunction(10);
return NF_STOLEN;
tx_error_icmp:
dst_link_failure(skb);
tx_error:
kfree_skb(skb);
return NF_STOLEN;
}
/*
* IP Tunneling transmitter
*
* This function encapsulates the packet in a new IP packet, its
* destination will be set to cp->daddr. Most code of this function
* is taken from ipip.c.
*
* It is used in VS/TUN cluster. The load balancer selects a real
* server from a cluster based on a scheduling algorithm,
* encapsulates the request packet and forwards it to the selected
* server. For example, all real servers are configured with
* "ifconfig tunl0 <Virtual IP Address> up". When the server receives
* the encapsulated packet, it will decapsulate the packet, process
* the request and return the response packets directly to the client
* without passing through the load balancer. This can greatly
* increase the scalability of the virtual server.
*
* Used for ANY protocol
*/
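/*
 * Resulting packet layout after the encapsulation below:
 *   [outer IP: load balancer -> real server, proto IPPROTO_IPIP]
 *   [original IP: client -> virtual service] [payload]
 */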
int
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp)
{
struct rtable *rt; /* Route to the other host */
struct net_device *tdev; /* Device to other host */
struct iphdr *old_iph = skb->nh.iph;
u8 tos = old_iph->tos;
u16 df = old_iph->frag_off;
struct iphdr *iph; /* Our new IP header */
int max_headroom; /* The extra header space needed */
int mtu;
EnterFunction(10);
if (skb->protocol != __constant_htons(ETH_P_IP)) {
IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
"ETH_P_IP: %d, skb protocol: %d\n",
__constant_htons(ETH_P_IP), skb->protocol);
goto tx_error;
}
if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
goto tx_error_icmp;
tdev = rt->u.dst.dev;
mtu = dst_pmtu(&rt->u.dst) - sizeof(struct iphdr);
if (mtu < 68) {
ip_rt_put(rt);
IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
goto tx_error;
}
if (skb->dst)
skb->dst->ops->update_pmtu(skb->dst, mtu);
df |= (old_iph->frag_off&__constant_htons(IP_DF));
if ((old_iph->frag_off&__constant_htons(IP_DF))
&& mtu < ntohs(old_iph->tot_len)) {
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
ip_rt_put(rt);
IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
goto tx_error;
}
if (skb_is_nonlinear(skb))
ip_send_check(old_iph);
skb->h.raw = skb->nh.raw;
/*
* Okay, now see if we can stuff it in the buffer as-is.
*/
max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr));
if (skb_headroom(skb) < max_headroom
|| skb_cloned(skb) || skb_shared(skb)) {
struct sk_buff *new_skb =
skb_realloc_headroom(skb, max_headroom);
if (!new_skb) {
ip_rt_put(rt);
kfree_skb(skb);
IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
return -EINVAL;
}
kfree_skb(skb);
skb = new_skb;
}
skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
/* drop old route */
dst_release(skb->dst);
skb->dst = &rt->u.dst;
/*
* Push down and install the IPIP header.
*/
iph = skb->nh.iph;
iph->version = 4;
iph->ihl = sizeof(struct iphdr)>>2;
iph->frag_off = df;
iph->protocol = IPPROTO_IPIP;
iph->tos = tos;
iph->daddr = rt->rt_dst;
iph->saddr = rt->rt_src;
iph->ttl = old_iph->ttl;
iph->tot_len = htons(skb->len);
ip_select_ident(iph, &rt->u.dst, NULL);
ip_send_check(iph);
skb->ip_summed = CHECKSUM_NONE;
#ifdef CONFIG_NETFILTER_DEBUG
skb->nf_debug = 0;
#endif /* CONFIG_NETFILTER_DEBUG */
IP_VS_XMIT(skb, rt);
LeaveFunction(10);
return NF_STOLEN;
tx_error_icmp:
dst_link_failure(skb);
tx_error:
kfree_skb(skb);
return NF_STOLEN;
}
/*
* Direct Routing transmitter
* Used for ANY protocol
*/
int
ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp)
{
struct rtable *rt; /* Route to the other host */
struct iphdr *iph = skb->nh.iph;
int mtu;
EnterFunction(10);
if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
goto tx_error_icmp;
/* MTU checking */
mtu = dst_pmtu(&rt->u.dst);
if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
ip_rt_put(rt);
IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
goto tx_error;
}
if (skb_is_nonlinear(skb) && skb->len <= mtu)
ip_send_check(iph);
if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) {
if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) {
ip_rt_put(rt);
IP_VS_ERR_RL("ip_vs_dr_xmit(): no memory\n");
goto tx_error;
}
}
/* drop old route */
dst_release(skb->dst);
skb->dst = &rt->u.dst;
#ifdef CONFIG_NETFILTER_DEBUG
skb->nf_debug = 0;
#endif /* CONFIG_NETFILTER_DEBUG */
IP_VS_XMIT(skb, rt);
LeaveFunction(10);
return NF_STOLEN;
tx_error_icmp:
dst_link_failure(skb);
tx_error:
kfree_skb(skb);
return NF_STOLEN;
}
/*
* ICMP packet transmitter
* called by the ip_vs_in_icmp
*/
int
ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp)
{
struct rtable *rt; /* Route to the other host */
struct iphdr *iph;
struct icmphdr *icmph;
struct iphdr *ciph; /* The ip header contained within the ICMP */
unsigned short len;
union ip_vs_tphdr h;
int mtu;
int rc;
EnterFunction(10);
/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
forwarded directly here, because there is no need to
translate address/port back */
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
if (cp->packet_xmit)
rc = cp->packet_xmit(skb, cp, pp);
else
rc = NF_ACCEPT;
atomic_inc(&cp->in_pkts);
__ip_vs_conn_put(cp);
goto out;
}
iph = skb->nh.iph;
icmph = (struct icmphdr *)((char *)iph+(iph->ihl<<2));
len = ntohs(iph->tot_len) - (iph->ihl<<2);
/*
* mangle and send the packet here (only for VS/NAT)
*/
if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
goto tx_error_icmp;
/* MTU checking */
mtu = dst_pmtu(&rt->u.dst);
if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
ip_rt_put(rt);
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
goto tx_error;
}
/* drop old route */
dst_release(skb->dst);
skb->dst = &rt->u.dst;
/* copy-on-write the packet before mangling it */
if (ip_vs_skb_cow(skb, rt->u.dst.dev->hard_header_len,
&iph, (unsigned char**)&icmph)) {
rc = NF_DROP;
goto out;
}
ciph = (struct iphdr *) (icmph + 1);
h.raw = (char *) ciph + (ciph->ihl << 2);
/* The ICMP packet for VS/NAT must be written to correct addresses
before being forwarded to the right server */
/* First change the dest IP address, and recalc checksum */
iph->daddr = cp->daddr;
ip_send_check(iph);
/* Now change the *source* address in the contained IP */
ciph->saddr = cp->daddr;
ip_send_check(ciph);
/* the TCP/UDP source port - cannot redo check */
if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol)
h.portp[0] = cp->dport;
/* And finally the ICMP checksum */
icmph->checksum = 0;
icmph->checksum = ip_compute_csum((unsigned char *) icmph, len);
skb->ip_summed = CHECKSUM_UNNECESSARY;
IP_VS_DBG_PKT(11, pp, ciph, "Forwarding incoming ICMP");
#ifdef CONFIG_NETFILTER_DEBUG
skb->nf_debug = 0;
#endif /* CONFIG_NETFILTER_DEBUG */
IP_VS_XMIT(skb, rt);
rc = NF_STOLEN;
goto out;
tx_error_icmp:
dst_link_failure(skb);
tx_error:
dev_kfree_skb(skb);
rc = NF_STOLEN;
out:
LeaveFunction(10);
return rc;
}
...@@ -19,6 +19,12 @@ struct notifier_block; ...@@ -19,6 +19,12 @@ struct notifier_block;
static struct firewall_ops *fwops; static struct firewall_ops *fwops;
#ifdef CONFIG_IP_VS
/* From ip_vs_core.c */
extern unsigned int
check_for_ip_vs_out(struct sk_buff **skb_p, int (*okfn)(struct sk_buff *));
#endif
/* They call these; we do what they want. */ /* They call these; we do what they want. */
int register_firewall(int pf, struct firewall_ops *fw) int register_firewall(int pf, struct firewall_ops *fw)
{ {
...@@ -134,8 +140,14 @@ fw_in(unsigned int hooknum, ...@@ -134,8 +140,14 @@ fw_in(unsigned int hooknum,
return NF_ACCEPT; return NF_ACCEPT;
case FW_MASQUERADE: case FW_MASQUERADE:
if (hooknum == NF_IP_FORWARD) if (hooknum == NF_IP_FORWARD) {
#ifdef CONFIG_IP_VS
/* check if it is for ip_vs */
if (check_for_ip_vs_out(pskb, okfn) == NF_STOLEN)
return NF_STOLEN;
#endif
return do_masquerade(pskb, out); return do_masquerade(pskb, out);
}
else return NF_ACCEPT; else return NF_ACCEPT;
case FW_REDIRECT: case FW_REDIRECT:
......
...@@ -496,6 +496,12 @@ static __inline__ void fib6_start_gc(struct rt6_info *rt) ...@@ -496,6 +496,12 @@ static __inline__ void fib6_start_gc(struct rt6_info *rt)
mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval); mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval);
} }
void fib6_force_start_gc(void)
{
if (ip6_fib_timer.expires == 0)
mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval);
}
/* /*
* Add routing information to the routing tree. * Add routing information to the routing tree.
* <destination addr>/<source addr> * <destination addr>/<source addr>
...@@ -1214,6 +1220,7 @@ void fib6_run_gc(unsigned long dummy) ...@@ -1214,6 +1220,7 @@ void fib6_run_gc(unsigned long dummy)
write_lock_bh(&rt6_lock); write_lock_bh(&rt6_lock);
ndisc_dst_gc(&gc_args.more);
fib6_clean_tree(&ip6_routing_table, fib6_age, 0, NULL); fib6_clean_tree(&ip6_routing_table, fib6_age, 0, NULL);
write_unlock_bh(&rt6_lock); write_unlock_bh(&rt6_lock);
......
...@@ -402,25 +402,6 @@ static int ndisc_output(struct sk_buff *skb) ...@@ -402,25 +402,6 @@ static int ndisc_output(struct sk_buff *skb)
return -EINVAL; return -EINVAL;
} }
static inline struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
struct neighbour *neigh)
{
struct rt6_info *rt = ip6_dst_alloc();
if (unlikely(rt == NULL))
goto out;
rt->rt6i_dev = dev;
rt->rt6i_nexthop = neigh;
rt->rt6i_expires = 0;
rt->rt6i_flags = RTF_LOCAL;
rt->rt6i_metric = 0;
rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
rt->u.dst.output = ndisc_output;
out:
return (struct dst_entry *)rt;
}
static inline void ndisc_flow_init(struct flowi *fl, u8 type, static inline void ndisc_flow_init(struct flowi *fl, u8 type,
struct in6_addr *saddr, struct in6_addr *daddr) struct in6_addr *saddr, struct in6_addr *daddr)
{ {
...@@ -463,13 +444,13 @@ static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, ...@@ -463,13 +444,13 @@ static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh,
ndisc_flow_init(&fl, NDISC_NEIGHBOUR_ADVERTISEMENT, src_addr, daddr); ndisc_flow_init(&fl, NDISC_NEIGHBOUR_ADVERTISEMENT, src_addr, daddr);
dst = ndisc_dst_alloc(dev, neigh); dst = ndisc_dst_alloc(dev, neigh, ndisc_output);
if (!dst) if (!dst)
return; return;
err = xfrm_lookup(&dst, &fl, NULL, 0); err = xfrm_lookup(&dst, &fl, NULL, 0);
if (err < 0) { if (err < 0) {
dst_free(dst); dst_release(dst);
return; return;
} }
...@@ -485,7 +466,7 @@ static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, ...@@ -485,7 +466,7 @@ static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh,
if (skb == NULL) { if (skb == NULL) {
ND_PRINTK1("send_na: alloc skb failed\n"); ND_PRINTK1("send_na: alloc skb failed\n");
dst_free(dst); dst_release(dst);
return; return;
} }
...@@ -515,7 +496,6 @@ static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, ...@@ -515,7 +496,6 @@ static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh,
csum_partial((__u8 *) msg, csum_partial((__u8 *) msg,
len, 0)); len, 0));
dst_clone(dst);
skb->dst = dst; skb->dst = dst;
idev = in6_dev_get(dst->dev); idev = in6_dev_get(dst->dev);
dst_output(skb); dst_output(skb);
...@@ -550,10 +530,9 @@ void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, ...@@ -550,10 +530,9 @@ void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh,
ndisc_flow_init(&fl, NDISC_NEIGHBOUR_SOLICITATION, saddr, daddr); ndisc_flow_init(&fl, NDISC_NEIGHBOUR_SOLICITATION, saddr, daddr);
dst = ndisc_dst_alloc(dev, neigh); dst = ndisc_dst_alloc(dev, neigh, ndisc_output);
if (!dst) if (!dst)
return; return;
dst_clone(dst);
err = xfrm_lookup(&dst, &fl, NULL, 0); err = xfrm_lookup(&dst, &fl, NULL, 0);
if (err < 0) { if (err < 0) {
...@@ -570,6 +549,7 @@ void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, ...@@ -570,6 +549,7 @@ void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh,
1, &err); 1, &err);
if (skb == NULL) { if (skb == NULL) {
ND_PRINTK1("send_ns: alloc skb failed\n"); ND_PRINTK1("send_ns: alloc skb failed\n");
dst_release(dst);
return; return;
} }
...@@ -595,7 +575,6 @@ void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, ...@@ -595,7 +575,6 @@ void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh,
csum_partial((__u8 *) msg, csum_partial((__u8 *) msg,
len, 0)); len, 0));
/* send it! */ /* send it! */
dst_clone(dst);
skb->dst = dst; skb->dst = dst;
idev = in6_dev_get(dst->dev); idev = in6_dev_get(dst->dev);
dst_output(skb); dst_output(skb);
...@@ -622,10 +601,9 @@ void ndisc_send_rs(struct net_device *dev, struct in6_addr *saddr, ...@@ -622,10 +601,9 @@ void ndisc_send_rs(struct net_device *dev, struct in6_addr *saddr,
ndisc_flow_init(&fl, NDISC_ROUTER_SOLICITATION, saddr, daddr); ndisc_flow_init(&fl, NDISC_ROUTER_SOLICITATION, saddr, daddr);
dst = ndisc_dst_alloc(dev, NULL); dst = ndisc_dst_alloc(dev, NULL, ndisc_output);
if (!dst) if (!dst)
return; return;
dst_clone(dst);
err = xfrm_lookup(&dst, &fl, NULL, 0); err = xfrm_lookup(&dst, &fl, NULL, 0);
if (err < 0) { if (err < 0) {
...@@ -664,7 +642,6 @@ void ndisc_send_rs(struct net_device *dev, struct in6_addr *saddr, ...@@ -664,7 +642,6 @@ void ndisc_send_rs(struct net_device *dev, struct in6_addr *saddr,
csum_partial((__u8 *) hdr, len, 0)); csum_partial((__u8 *) hdr, len, 0));
/* send it! */ /* send it! */
dst_clone(dst);
skb->dst = dst; skb->dst = dst;
idev = in6_dev_get(dst->dev); idev = in6_dev_get(dst->dev);
dst_output(skb); dst_output(skb);
...@@ -1321,7 +1298,6 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, ...@@ -1321,7 +1298,6 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
if (rt == NULL) if (rt == NULL)
return; return;
dst = &rt->u.dst; dst = &rt->u.dst;
dst_clone(dst);
err = xfrm_lookup(&dst, &fl, NULL, 0); err = xfrm_lookup(&dst, &fl, NULL, 0);
if (err) { if (err) {
...@@ -1329,16 +1305,17 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, ...@@ -1329,16 +1305,17 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
return; return;
} }
rt = (struct rt6_info *) dst;
if (rt->rt6i_flags & RTF_GATEWAY) { if (rt->rt6i_flags & RTF_GATEWAY) {
ND_PRINTK1("ndisc_send_redirect: not a neighbour\n"); ND_PRINTK1("ndisc_send_redirect: not a neighbour\n");
dst_release(&rt->u.dst); dst_release(dst);
return; return;
} }
if (!xrlim_allow(&rt->u.dst, 1*HZ)) { if (!xrlim_allow(dst, 1*HZ)) {
dst_release(&rt->u.dst); dst_release(dst);
return; return;
} }
dst_release(&rt->u.dst);
if (dev->addr_len) { if (dev->addr_len) {
if (neigh->nud_state&NUD_VALID) { if (neigh->nud_state&NUD_VALID) {
...@@ -1348,6 +1325,7 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, ...@@ -1348,6 +1325,7 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
We will make it later, when will be sure, We will make it later, when will be sure,
that it is alive. that it is alive.
*/ */
dst_release(dst);
return; return;
} }
} }
...@@ -1366,11 +1344,11 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, ...@@ -1366,11 +1344,11 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
hlen = 0; hlen = 0;
skb_reserve(skb, (dev->hard_header_len + 15) & ~15); skb_reserve(buff, (dev->hard_header_len + 15) & ~15);
ip6_nd_hdr(sk, buff, dev, &saddr_buf, &skb->nh.ipv6h->saddr, ip6_nd_hdr(sk, buff, dev, &saddr_buf, &skb->nh.ipv6h->saddr,
IPPROTO_ICMPV6, len); IPPROTO_ICMPV6, len);
skb->h.raw = (unsigned char*) icmph = (struct icmp6hdr *) skb_put(buff, len); buff->h.raw = (unsigned char*) icmph = (struct icmp6hdr *) skb_put(buff, len);
memset(icmph, 0, sizeof(struct icmp6hdr)); memset(icmph, 0, sizeof(struct icmp6hdr));
icmph->icmp6_type = NDISC_REDIRECT; icmph->icmp6_type = NDISC_REDIRECT;
...@@ -1408,9 +1386,9 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, ...@@ -1408,9 +1386,9 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
len, IPPROTO_ICMPV6, len, IPPROTO_ICMPV6,
csum_partial((u8 *) icmph, len, 0)); csum_partial((u8 *) icmph, len, 0));
skb->dst = dst; buff->dst = dst;
idev = in6_dev_get(dst->dev); idev = in6_dev_get(dst->dev);
dst_output(skb); dst_output(buff);
ICMP6_INC_STATS(idev, Icmp6OutRedirects); ICMP6_INC_STATS(idev, Icmp6OutRedirects);
ICMP6_INC_STATS(idev, Icmp6OutMsgs); ICMP6_INC_STATS(idev, Icmp6OutMsgs);
......
...@@ -131,16 +131,11 @@ rwlock_t rt6_lock = RW_LOCK_UNLOCKED; ...@@ -131,16 +131,11 @@ rwlock_t rt6_lock = RW_LOCK_UNLOCKED;
/* allocate dst with ip6_dst_ops */ /* allocate dst with ip6_dst_ops */
static __inline__ struct rt6_info *__ip6_dst_alloc(void) static __inline__ struct rt6_info *ip6_dst_alloc(void)
{ {
return dst_alloc(&ip6_dst_ops); return dst_alloc(&ip6_dst_ops);
} }
struct rt6_info *ip6_dst_alloc(void)
{
return __ip6_dst_alloc();
}
/* /*
* Route lookup. Any rt6_lock is implied. * Route lookup. Any rt6_lock is implied.
*/ */
...@@ -560,6 +555,60 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu) ...@@ -560,6 +555,60 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
} }
} }
/* Protected by rt6_lock. */
static struct dst_entry *ndisc_dst_gc_list;
struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
struct neighbour *neigh,
int (*output)(struct sk_buff *))
{
struct rt6_info *rt = ip6_dst_alloc();
if (unlikely(rt == NULL))
goto out;
rt->rt6i_dev = dev;
rt->rt6i_nexthop = neigh;
rt->rt6i_expires = 0;
rt->rt6i_flags = RTF_LOCAL;
rt->rt6i_metric = 0;
atomic_set(&rt->u.dst.__refcnt, 1);
rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
rt->u.dst.output = output;
write_lock_bh(&rt6_lock);
rt->u.dst.next = ndisc_dst_gc_list;
ndisc_dst_gc_list = &rt->u.dst;
write_unlock_bh(&rt6_lock);
fib6_force_start_gc();
out:
return (struct dst_entry *)rt;
}
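/*
 * Garbage-collect the ndisc dst entries allocated above: free those
 * whose refcount has dropped to zero and count the survivors in *more.
 * The caller (fib6_run_gc) holds rt6_lock.
 */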
int ndisc_dst_gc(int *more)
{
struct dst_entry *dst, *next, **pprev;
int freed;
next = NULL;
pprev = &ndisc_dst_gc_list;
freed = 0;
while ((dst = *pprev) != NULL) {
if (!atomic_read(&dst->__refcnt)) {
*pprev = dst->next;
dst_free(dst);
freed++;
} else {
pprev = &dst->next;
(*more)++;
}
}
return freed;
}
static int ip6_dst_gc(void) static int ip6_dst_gc(void)
{ {
static unsigned expire = 30*HZ; static unsigned expire = 30*HZ;
...@@ -655,7 +704,7 @@ int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr) ...@@ -655,7 +704,7 @@ int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr)
if (rtmsg->rtmsg_metric == 0) if (rtmsg->rtmsg_metric == 0)
rtmsg->rtmsg_metric = IP6_RT_PRIO_USER; rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
rt = __ip6_dst_alloc(); rt = ip6_dst_alloc();
if (rt == NULL) if (rt == NULL)
return -ENOMEM; return -ENOMEM;
...@@ -1066,7 +1115,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, ...@@ -1066,7 +1115,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
static struct rt6_info * ip6_rt_copy(struct rt6_info *ort) static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
{ {
struct rt6_info *rt = __ip6_dst_alloc(); struct rt6_info *rt = ip6_dst_alloc();
if (rt) { if (rt) {
rt->u.dst.input = ort->u.dst.input; rt->u.dst.input = ort->u.dst.input;
...@@ -1209,7 +1258,7 @@ int ip6_pkt_discard(struct sk_buff *skb) ...@@ -1209,7 +1258,7 @@ int ip6_pkt_discard(struct sk_buff *skb)
int ip6_rt_addr_add(struct in6_addr *addr, struct net_device *dev) int ip6_rt_addr_add(struct in6_addr *addr, struct net_device *dev)
{ {
struct rt6_info *rt = __ip6_dst_alloc(); struct rt6_info *rt = ip6_dst_alloc();
if (rt == NULL) if (rt == NULL)
return -ENOMEM; return -ENOMEM;
......
...@@ -430,6 +430,10 @@ int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock ...@@ -430,6 +430,10 @@ int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock
goto no_dst; goto no_dst;
nlk = nlk_sk(sk); nlk = nlk_sk(sk);
/* Don't bother queuing skb if kernel socket has no input function */
if (nlk->pid == 0 && !nlk->data_ready)
goto no_dst;
#ifdef NL_EMULATE_DEV #ifdef NL_EMULATE_DEV
if (nlk->handler) { if (nlk->handler) {
skb_orphan(skb); skb_orphan(skb);
......
...@@ -265,6 +265,7 @@ EXPORT_SYMBOL(inet_family_ops); ...@@ -265,6 +265,7 @@ EXPORT_SYMBOL(inet_family_ops);
EXPORT_SYMBOL(in_aton); EXPORT_SYMBOL(in_aton);
EXPORT_SYMBOL(ip_mc_inc_group); EXPORT_SYMBOL(ip_mc_inc_group);
EXPORT_SYMBOL(ip_mc_dec_group); EXPORT_SYMBOL(ip_mc_dec_group);
EXPORT_SYMBOL(ip_mc_join_group);
EXPORT_SYMBOL(ip_finish_output); EXPORT_SYMBOL(ip_finish_output);
EXPORT_SYMBOL(inet_stream_ops); EXPORT_SYMBOL(inet_stream_ops);
EXPORT_SYMBOL(inet_dgram_ops); EXPORT_SYMBOL(inet_dgram_ops);
......
...@@ -855,6 +855,7 @@ xfrm_state_ok(struct xfrm_tmpl *tmpl, struct xfrm_state *x, ...@@ -855,6 +855,7 @@ xfrm_state_ok(struct xfrm_tmpl *tmpl, struct xfrm_state *x,
{ {
return x->id.proto == tmpl->id.proto && return x->id.proto == tmpl->id.proto &&
(x->id.spi == tmpl->id.spi || !tmpl->id.spi) && (x->id.spi == tmpl->id.spi || !tmpl->id.spi) &&
(x->props.reqid == tmpl->reqid || !tmpl->reqid) &&
x->props.mode == tmpl->mode && x->props.mode == tmpl->mode &&
(tmpl->aalgos & (1<<x->props.aalgo)) && (tmpl->aalgos & (1<<x->props.aalgo)) &&
!(x->props.mode && xfrm_state_addr_cmp(tmpl, x, family)); !(x->props.mode && xfrm_state_addr_cmp(tmpl, x, family));
......