Commit a64cda04 authored by Alexey Kuznetsov, committed by David S. Miller

[NET]: Prepare for zerocopy NFS and IPSEC.

- Import va10-hwchecksum-2.5.36.patch
- Import va11-udpsendfile-2.5.36.patch
- Implement new encapsulation friendly ipv4 output path.
parent 08e3418b
......@@ -137,8 +137,24 @@ struct inet_opt {
int mc_index; /* Multicast device index */
__u32 mc_addr;
struct ip_mc_socklist *mc_list; /* Group array */
struct page *sndmsg_page; /* Cached page for sendmsg */
u32 sndmsg_off; /* Cached offset for sendmsg */
/*
* The following members retain the information needed to build
* an IP header for each IP fragment while the socket is corked.
*/
struct {
unsigned int flags;
unsigned int fragsize;
struct ip_options *opt;
struct rtable *rt;
int length; /* Total length of all frames */
u32 addr;
} cork;
};
#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */
struct ipv6_pinfo;
/* WARNING: don't change the layout of the members in inet_sock! */
......
......@@ -765,6 +765,15 @@ static inline int skb_headlen(const struct sk_buff *skb)
return skb->len - skb->data_len;
}
/*
 * skb_pagelen - bytes held in the linear head plus the frags[] array.
 *
 * Adds up the size of every entry in skb_shinfo(skb)->frags[] (walking
 * from the last entry down to the first) and then adds skb_headlen(skb).
 */
static inline int skb_pagelen(const struct sk_buff *skb)
{
	int frag_bytes = 0;
	int idx = (int)skb_shinfo(skb)->nr_frags;

	while (--idx >= 0)
		frag_bytes += skb_shinfo(skb)->frags[idx].size;

	return frag_bytes + skb_headlen(skb);
}
/* Sanity check: calls BUG() when the skb has any page fragments (nr_frags != 0). */
#define SKB_PAGE_ASSERT(skb) do { if (skb_shinfo(skb)->nr_frags) \
BUG(); } while (0)
#define SKB_FRAG_ASSERT(skb) do { if (skb_shinfo(skb)->frag_list) \
......
......@@ -285,8 +285,6 @@ struct tcp_opt {
struct tcp_func *af_specific; /* Operations which are AF_INET{4,6} specific */
struct sk_buff *send_head; /* Front of stuff to transmit */
struct page *sndmsg_page; /* Cached page for sendmsg */
u32 sndmsg_off; /* Cached offset for sendmsg */
__u32 rcv_wnd; /* Current receiver window */
__u32 rcv_wup; /* rcv_nxt on last window update sent */
......
......@@ -17,6 +17,9 @@
#ifndef _LINUX_UDP_H
#define _LINUX_UDP_H
#include <asm/byteorder.h>
#include <net/sock.h>
#include <linux/ip.h>
struct udphdr {
__u16 source;
......@@ -25,5 +28,33 @@ struct udphdr {
__u16 check;
};
/* UDP socket options */
#define UDP_CORK 1 /* Never send partially complete segments */
/*
 * UDP-specific per-socket state: the pending/cork flags plus the
 * addressing fields saved for the UDP header of the pending frames.
 */
struct udp_opt {
int pending; /* Any pending frames ? */
unsigned int corkflag; /* Cork is required */
/*
* The following members retain the information needed to create a
* UDP header when the socket is uncorked.
*/
u32 saddr; /* source address */
u32 daddr; /* destination address */
__u16 sport; /* source port */
__u16 dport; /* destination port */
__u16 len; /* total length of pending frames */
};
/* WARNING: don't change the layout of the members in udp_sock! */
/*
 * UDP socket: the generic socket, an optional IPv6 info pointer, the
 * inet-level options and the UDP-specific state, in that order.
 * NOTE(review): the layout warning presumably exists because socket
 * pointers are cast to struct udp_sock * (udp_sk() does exactly that);
 * confirm which other socket structures must stay in step with this one.
 */
struct udp_sock {
struct sock sk; /* generic socket, first member */
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
struct ipv6_pinfo *pinet6; /* present only when IPv6 is built in or modular */
#endif
struct inet_opt inet; /* inet-level per-socket options */
struct udp_opt udp; /* UDP-specific state, returned by udp_sk() */
};
/*
 * udp_sk - get the UDP-specific state of a socket.
 * @__sk: pointer to the socket; it is cast to struct udp_sock *.
 *
 * Evaluates to a pointer to the udp member of that structure. The
 * argument is parenthesized before the cast so that an expression
 * argument (e.g. base + offset) is converted as a whole instead of
 * having the cast bind to its first operand only.
 */
#define udp_sk(__sk) (&((struct udp_sock *)(__sk))->udp)
#endif /* _LINUX_UDP_H */
......@@ -29,6 +29,7 @@ struct dst_entry
struct dst_entry *next;
atomic_t __refcnt; /* client references */
int __use;
struct dst_entry *child;
struct net_device *dev;
int obsolete;
int flags;
......@@ -36,6 +37,8 @@ struct dst_entry
unsigned long lastuse;
unsigned long expires;
unsigned header_len; /* more space at head required */
unsigned mxlock;
unsigned pmtu;
unsigned window;
......@@ -108,18 +111,30 @@ void dst_release(struct dst_entry * dst)
atomic_dec(&dst->__refcnt);
}
/*
 * Destinations are stackable: each entry's child is the next step on the
 * packet's path through the Linux networking code.
 *
 * dst_pop - step from a destination entry to its child.
 * Passes dst->child through dst_clone(), then calls dst_release() on the
 * parent, and returns the value obtained from dst_clone().
 */
static inline struct dst_entry *dst_pop(struct dst_entry *dst)
{
	struct dst_entry *inner;

	/* Clone the child before letting go of the parent that points at it. */
	inner = dst_clone(dst->child);
	dst_release(dst);

	return inner;
}
extern void * dst_alloc(struct dst_ops * ops);
extern void __dst_free(struct dst_entry * dst);
extern void dst_destroy(struct dst_entry * dst);
extern struct dst_entry *dst_destroy(struct dst_entry * dst);
static inline
void dst_free(struct dst_entry * dst)
static inline void dst_free(struct dst_entry * dst)
{
if (dst->obsolete > 1)
return;
if (!atomic_read(&dst->__refcnt)) {
dst_destroy(dst);
return;
dst = dst_destroy(dst);
if (!dst)
return;
}
__dst_free(dst);
}
......@@ -155,6 +170,37 @@ static inline void dst_set_expires(struct dst_entry *dst, int timeout)
dst->expires = expires;
}
/*
 * dst_output - output a packet to the network from the transport layer.
 *
 * Calls the output handler of the skb's current dst entry. The handler
 * is called again (re-reading skb->dst each time) for as long as it
 * answers NET_XMIT_BYPASS; any other result, 0 included, is returned.
 */
static inline int dst_output(struct sk_buff *skb)
{
	int rc;

	do {
		rc = skb->dst->output(skb);
	} while (unlikely(rc != 0) && likely(rc == NET_XMIT_BYPASS));

	return rc;
}
/*
 * dst_input - input a packet from the network to the transport layer.
 *
 * Calls the input handler of the skb's current dst entry. The handler
 * is called again (re-reading skb->dst each time) for as long as it
 * answers NET_XMIT_BYPASS; any other result, 0 included, is returned.
 */
static inline int dst_input(struct sk_buff *skb)
{
	int rc;

	do {
		rc = skb->dst->input(skb);
		/* Oh, Jamal... Seems, I will not forgive you this mess. :-) */
	} while (unlikely(rc != 0) && likely(rc == NET_XMIT_BYPASS));

	return rc;
}
extern void dst_init(void);
#endif
......
......@@ -102,12 +102,26 @@ extern int ip_build_xmit(struct sock *sk,
int getfrag (const void *,
char *,
unsigned int,
unsigned int),
unsigned int,
struct sk_buff *),
const void *frag,
unsigned length,
struct ipcm_cookie *ipc,
struct rtable *rt,
int flags);
/*
 * Prototypes for appending data to a socket and for pushing or flushing
 * its pending frames. Only the declarations are visible here; the notes
 * below are inferred from the names and signatures and should be
 * confirmed against the definitions.
 */
/*
 * Append len bytes described by 'from' to the socket; getfrag is the
 * callback used to copy a piece of that data to 'to'.
 * NOTE(review): exact meaning of protolen, odd and flags -- confirm.
 */
extern int ip_append_data(struct sock *sk,
int getfrag(void *from, char *to, int offset, int len,
int odd, struct sk_buff *skb),
void *from, int len, int protolen,
struct ipcm_cookie *ipc,
struct rtable *rt,
unsigned int flags);
/* Has the same signature as the getfrag callback taken by ip_append_data(). */
extern int generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb);
/* Page-based counterpart: takes a page, an offset into it and a size. */
extern ssize_t ip_append_page(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
/* Presumably transmits the frames queued on the socket -- confirm. */
extern int ip_push_pending_frames(struct sock *sk);
/* Presumably discards the frames queued on the socket -- confirm. */
extern void ip_flush_pending_frames(struct sock *sk);
/*
* Map a multicast IP onto multicast MAC for type Token Ring.
......
......@@ -249,6 +249,8 @@ struct proto {
struct msghdr *msg,
int len, int noblock, int flags,
int *addr_len);
int (*sendpage)(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
int (*bind)(struct sock *sk,
struct sockaddr *uaddr, int addr_len);
......
......@@ -1851,7 +1851,7 @@ static inline void tcp_v4_setup_caps(struct sock *sk, struct dst_entry *dst)
{
sk->route_caps = dst->dev->features;
if (sk->route_caps & NETIF_F_TSO) {
if (sk->no_largesend)
if (sk->no_largesend || dst->header_len)
sk->route_caps &= ~NETIF_F_TSO;
}
}
......
......@@ -76,6 +76,4 @@ extern struct udp_mib udp_statistics[NR_CPUS*2];
#define UDP_INC_STATS_BH(field) SNMP_INC_STATS_BH(udp_statistics, field)
#define UDP_INC_STATS_USER(field) SNMP_INC_STATS_USER(udp_statistics, field)
#define udp_sock inet_sock
#endif /* _UDP_H */
......@@ -40,7 +40,6 @@ static void dst_run_gc(unsigned long);
static struct timer_list dst_gc_timer =
{ data: DST_GC_MIN, function: dst_run_gc };
static void dst_run_gc(unsigned long dummy)
{
int delayed = 0;
......@@ -60,7 +59,11 @@ static void dst_run_gc(unsigned long dummy)
delayed++;
continue;
}
*dstp = dst->next;
if (dst->child) {
dst->child->next = dst->next;
*dstp = dst->child;
} else
*dstp = dst->next;
dst_destroy(dst);
}
if (!dst_garbage_list) {
......@@ -141,10 +144,16 @@ void __dst_free(struct dst_entry * dst)
spin_unlock_bh(&dst_lock);
}
void dst_destroy(struct dst_entry * dst)
struct dst_entry *dst_destroy(struct dst_entry * dst)
{
struct neighbour *neigh = dst->neighbour;
struct hh_cache *hh = dst->hh;
struct dst_entry *child;
struct neighbour *neigh;
struct hh_cache *hh;
again:
neigh = dst->neighbour;
hh = dst->hh;
child = dst->child;
dst->hh = NULL;
if (hh && atomic_dec_and_test(&hh->hh_refcnt))
......@@ -165,6 +174,12 @@ void dst_destroy(struct dst_entry * dst)
atomic_dec(&dst_total);
#endif
kmem_cache_free(dst->ops->kmem_cachep, dst);
dst = child;
if (dst && !atomic_read(&dst->__refcnt))
goto again;
return dst;
}
static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
......
......@@ -774,6 +774,21 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
return sk->prot->sendmsg(iocb, sk, msg, size);
}
/*
 * inet_sendpage - sendpage entry point for inet sockets.
 *
 * If the socket has no local port number yet, inet_autobind() is tried
 * first and -EAGAIN is returned when it fails. The call is then handed
 * to the protocol's own sendpage handler when one is set, and to
 * sock_no_sendpage() otherwise.
 */
ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	struct sock *sk = sock->sk;

	/* The socket may still need to be bound. */
	if (inet_sk(sk)->num == 0) {
		if (inet_autobind(sk))
			return -EAGAIN;
	}

	/* No protocol-specific handler: use the generic fallback. */
	if (!sk->prot->sendpage)
		return sock_no_sendpage(sock, page, offset, size, flags);

	return sk->prot->sendpage(sk, page, offset, size, flags);
}
int inet_shutdown(struct socket *sock, int how)
{
struct sock *sk = sock->sk;
......@@ -977,7 +992,7 @@ struct proto_ops inet_dgram_ops = {
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
.sendpage = inet_sendpage,
};
struct net_proto_family inet_family_ops = {
......
......@@ -357,12 +357,14 @@ static void icmp_out_count(int type)
* checksum.
*/
static int icmp_glue_bits(const void *p, char *to, unsigned int offset,
unsigned int fraglen)
unsigned int fraglen, struct sk_buff *skb)
{
struct icmp_bxm *icmp_param = (struct icmp_bxm *)p;
struct icmphdr *icmph;
unsigned int csum;
skb->ip_summed = CHECKSUM_NONE;
if (offset) {
icmp_param->csum =
skb_copy_and_csum_bits(icmp_param->skb,
......
This diff is collapsed.
......@@ -437,8 +437,10 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt
(!((1<<sk->state)&(TCPF_LISTEN|TCPF_CLOSE))
&& inet->daddr != LOOPBACK4_IPV6)) {
#endif
if (inet->opt)
tp->ext_header_len -= inet->opt->optlen;
if (opt)
tp->ext_header_len = opt->optlen;
tp->ext_header_len += opt->optlen;
tcp_sync_mss(sk, tp->pmtu_cookie);
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
}
......
......@@ -259,9 +259,10 @@ struct rawfakehdr
*/
static int raw_getfrag(const void *p, char *to, unsigned int offset,
unsigned int fraglen)
unsigned int fraglen, struct sk_buff *skb)
{
struct rawfakehdr *rfh = (struct rawfakehdr *) p;
skb->ip_summed = CHECKSUM_NONE; /* Is there any good place to set it? */
return memcpy_fromiovecend(to, rfh->iov, offset, fraglen);
}
......@@ -270,10 +271,12 @@ static int raw_getfrag(const void *p, char *to, unsigned int offset,
*/
static int raw_getrawfrag(const void *p, char *to, unsigned int offset,
unsigned int fraglen)
unsigned int fraglen, struct sk_buff *skb)
{
struct rawfakehdr *rfh = (struct rawfakehdr *) p;
skb->ip_summed = CHECKSUM_NONE; /* Is there any good place to set it? */
if (memcpy_fromiovecend(to, rfh->iov, offset, fraglen))
return -EFAULT;
......
......@@ -204,6 +204,8 @@
* Andi Kleen : Make poll agree with SIGIO
* Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
* lingertime == 0 (RFC 793 ABORT Call)
* Hirokazu Takahashi : Use copy_from_user() instead of
* csum_and_copy_from_user() if possible.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
......@@ -958,8 +960,8 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
return res;
}
#define TCP_PAGE(sk) (tcp_sk(sk)->sndmsg_page)
#define TCP_OFF(sk) (tcp_sk(sk)->sndmsg_off)
#define TCP_PAGE(sk) (inet_sk(sk)->sndmsg_page)
#define TCP_OFF(sk) (inet_sk(sk)->sndmsg_off)
static inline int tcp_copy_to_page(struct sock *sk, char *from,
struct sk_buff *skb, struct page *page,
......@@ -968,18 +970,22 @@ static inline int tcp_copy_to_page(struct sock *sk, char *from,
int err = 0;
unsigned int csum;
csum = csum_and_copy_from_user(from, page_address(page) + off,
if (skb->ip_summed == CHECKSUM_NONE) {
csum = csum_and_copy_from_user(from, page_address(page) + off,
copy, 0, &err);
if (!err) {
if (skb->ip_summed == CHECKSUM_NONE)
skb->csum = csum_block_add(skb->csum, csum, skb->len);
skb->len += copy;
skb->data_len += copy;
skb->truesize += copy;
sk->wmem_queued += copy;
sk->forward_alloc -= copy;
if (err) return err;
skb->csum = csum_block_add(skb->csum, csum, skb->len);
} else {
if (copy_from_user(page_address(page) + off, from, copy))
return -EFAULT;
}
return err;
skb->len += copy;
skb->data_len += copy;
skb->truesize += copy;
sk->wmem_queued += copy;
sk->forward_alloc -= copy;
return 0;
}
static inline int skb_add_data(struct sk_buff *skb, char *from, int copy)
......@@ -988,11 +994,16 @@ static inline int skb_add_data(struct sk_buff *skb, char *from, int copy)
unsigned int csum;
int off = skb->len;
csum = csum_and_copy_from_user(from, skb_put(skb, copy),
if (skb->ip_summed == CHECKSUM_NONE) {
csum = csum_and_copy_from_user(from, skb_put(skb, copy),
copy, 0, &err);
if (!err) {
skb->csum = csum_block_add(skb->csum, csum, off);
return 0;
if (!err) {
skb->csum = csum_block_add(skb->csum, csum, off);
return 0;
}
} else {
if (!copy_from_user(skb_put(skb, copy), from, copy))
return 0;
}
__skb_trim(skb, off);
......@@ -1075,6 +1086,12 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (!skb)
goto wait_for_memory;
/*
* Check whether we can use HW checksum.
*/
if (sk->route_caps & (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM))
skb->ip_summed = CHECKSUM_HW;
skb_entail(sk, tp, skb);
copy = mss_now;
}
......
......@@ -781,6 +781,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
__sk_dst_set(sk, &rt->u.dst);
tcp_v4_setup_caps(sk, &rt->u.dst);
tp->ext_header_len += rt->u.dst.header_len;
if (!inet->opt || !inet->opt->srr)
daddr = rt->rt_dst;
......@@ -1577,6 +1578,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newtp->ext_header_len = 0;
if (newinet->opt)
newtp->ext_header_len = newinet->opt->optlen;
newtp->ext_header_len += dst->header_len;
newinet->id = newtp->write_seq ^ jiffies;
tcp_sync_mss(newsk, dst->pmtu);
......@@ -2087,8 +2089,8 @@ static int tcp_v4_destroy_sock(struct sock *sk)
tcp_put_port(sk);
/* If sendmsg cached page exists, toss it. */
if (tp->sndmsg_page)
__free_page(tp->sndmsg_page);
if (inet_sk(sk)->sndmsg_page)
__free_page(inet_sk(sk)->sndmsg_page);
atomic_dec(&tcp_sockets_allocated);
......
This diff is collapsed.
......@@ -1876,6 +1876,7 @@ static int tcp_v6_init_sock(struct sock *sk)
static int tcp_v6_destroy_sock(struct sock *sk)
{
struct tcp_opt *tp = tcp_sk(sk);
struct inet_opt *inet = inet_sk(sk);
tcp_clear_xmit_timers(sk);
......@@ -1893,8 +1894,8 @@ static int tcp_v6_destroy_sock(struct sock *sk)
tcp_put_port(sk);
/* If sendmsg cached page exists, toss it. */
if (tp->sndmsg_page != NULL)
__free_page(tp->sndmsg_page);
if (inet->sndmsg_page != NULL)
__free_page(inet->sndmsg_page);
atomic_dec(&tcp_sockets_allocated);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment