Commit a64cda04 authored by Alexey Kuznetsov, committed by David S. Miller

[NET]: Prepare for zerocopy NFS and IPSEC.

- Import va10-hwchecksum-2.5.36.patch
- Import va11-udpsendfile-2.5.36.patch
- Implement new encapsulation friendly ipv4 output path.
parent 08e3418b
......@@ -137,8 +137,24 @@ struct inet_opt {
int mc_index; /* Multicast device index */
__u32 mc_addr;
struct ip_mc_socklist *mc_list; /* Group array */
struct page *sndmsg_page; /* Cached page for sendmsg */
u32 sndmsg_off; /* Cached offset for sendmsg */
/*
* Following members are used to retain the information needed to build
* an IP header for each IP fragment while the socket is corked.
*/
struct {
unsigned int flags;
unsigned int fragsize;
struct ip_options *opt;
struct rtable *rt;
int length; /* Total length of all frames */
u32 addr;
} cork;
};
#define IPCORK_OPT 1 /* ip-options are held in cork.opt */
struct ipv6_pinfo;
/* WARNING: don't change the layout of the members in inet_sock! */
......
......@@ -765,6 +765,15 @@ static inline int skb_headlen(const struct sk_buff *skb)
return skb->len - skb->data_len;
}
static inline int skb_pagelen(const struct sk_buff *skb)
{
int i, len = 0;
for (i = (int)skb_shinfo(skb)->nr_frags - 1; i >= 0; i--)
len += skb_shinfo(skb)->frags[i].size;
return len + skb_headlen(skb);
}
#define SKB_PAGE_ASSERT(skb) do { if (skb_shinfo(skb)->nr_frags) \
BUG(); } while (0)
#define SKB_FRAG_ASSERT(skb) do { if (skb_shinfo(skb)->frag_list) \
......
......@@ -285,8 +285,6 @@ struct tcp_opt {
struct tcp_func *af_specific; /* Operations which are AF_INET{4,6} specific */
struct sk_buff *send_head; /* Front of stuff to transmit */
struct page *sndmsg_page; /* Cached page for sendmsg */
u32 sndmsg_off; /* Cached offset for sendmsg */
__u32 rcv_wnd; /* Current receiver window */
__u32 rcv_wup; /* rcv_nxt on last window update sent */
......
......@@ -17,6 +17,9 @@
#ifndef _LINUX_UDP_H
#define _LINUX_UDP_H
#include <asm/byteorder.h>
#include <net/sock.h>
#include <linux/ip.h>
struct udphdr {
__u16 source;
......@@ -25,5 +28,33 @@ struct udphdr {
__u16 check;
};
/* UDP socket options */
#define UDP_CORK 1 /* Never send partially complete segments */
struct udp_opt {
int pending; /* Any pending frames ? */
unsigned int corkflag; /* Cork is required */
/*
* Following members retain the information to create a UDP header
* when the socket is uncorked.
*/
u32 saddr; /* source address */
u32 daddr; /* destination address */
__u16 sport; /* source port */
__u16 dport; /* destination port */
__u16 len; /* total length of pending frames */
};
/* WARNING: don't change the layout of the members in udp_sock! */
struct udp_sock {
struct sock sk;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
struct ipv6_pinfo *pinet6;
#endif
struct inet_opt inet;
struct udp_opt udp;
};
#define udp_sk(__sk) (&((struct udp_sock *)__sk)->udp)
#endif /* _LINUX_UDP_H */
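A hypothetical userspace sketch of the new socket option (not part of this patch; it assumes a connected UDP socket and that UDP_CORK is exported through <netinet/udp.h>): cork the socket, write several pieces, then uncork so udp_push_pending_frames() emits them as a single datagram.
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/udp.h>
/* Illustrative helper: merge a header and a body into one UDP datagram.
 * 'fd' is assumed to be a connect()ed UDP socket. */
static int send_corked(int fd, const char *hdr, size_t hlen,
		       const char *body, size_t blen)
{
	int on = 1, off = 0;
	/* Never send partially complete segments. */
	if (setsockopt(fd, IPPROTO_UDP, UDP_CORK, &on, sizeof(on)) < 0)
		return -1;
	send(fd, hdr, hlen, 0);		/* queued on the socket */
	send(fd, body, blen, 0);	/* appended to the same datagram */
	/* Uncorking pushes the pending frames out as one datagram. */
	return setsockopt(fd, IPPROTO_UDP, UDP_CORK, &off, sizeof(off));
}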
......@@ -29,6 +29,7 @@ struct dst_entry
struct dst_entry *next;
atomic_t __refcnt; /* client references */
int __use;
struct dst_entry *child;
struct net_device *dev;
int obsolete;
int flags;
......@@ -36,6 +37,8 @@ struct dst_entry
unsigned long lastuse;
unsigned long expires;
unsigned header_len; /* more space at head required */
unsigned mxlock;
unsigned pmtu;
unsigned window;
......@@ -108,18 +111,30 @@ void dst_release(struct dst_entry * dst)
atomic_dec(&dst->__refcnt);
}
/* Children define the path of the packet through the
* Linux networking. Thus, destinations are stackable.
*/
static inline struct dst_entry *dst_pop(struct dst_entry *dst)
{
struct dst_entry *child = dst_clone(dst->child);
dst_release(dst);
return child;
}
extern void * dst_alloc(struct dst_ops * ops);
extern void __dst_free(struct dst_entry * dst);
extern void dst_destroy(struct dst_entry * dst);
extern struct dst_entry *dst_destroy(struct dst_entry * dst);
static inline
void dst_free(struct dst_entry * dst)
static inline void dst_free(struct dst_entry * dst)
{
if (dst->obsolete > 1)
return;
if (!atomic_read(&dst->__refcnt)) {
dst_destroy(dst);
return;
dst = dst_destroy(dst);
if (!dst)
return;
}
__dst_free(dst);
}
......@@ -155,6 +170,37 @@ static inline void dst_set_expires(struct dst_entry *dst, int timeout)
dst->expires = expires;
}
/* Output packet to network from transport. */
static inline int dst_output(struct sk_buff *skb)
{
int err;
for (;;) {
err = skb->dst->output(skb);
if (likely(err == 0))
return err;
if (unlikely(err != NET_XMIT_BYPASS))
return err;
}
}
/* Input packet from network to transport. */
static inline int dst_input(struct sk_buff *skb)
{
int err;
for (;;) {
err = skb->dst->input(skb);
if (likely(err == 0))
return err;
/* Oh, Jamal... Seems, I will not forgive you this mess. :-) */
if (unlikely(err != NET_XMIT_BYPASS))
return err;
}
}
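To make the stacking concrete: a hedged sketch (example_encap_output is illustrative, not code from this patch) of how one level of an encapsulation stack could consume its dst with dst_pop() and rely on the NET_XMIT_BYPASS convention of the loops above to hand the skb to the child's output function.
/* Hypothetical transform output routine; the header step is elided. */
static int example_encap_output(struct sk_buff *skb)
{
	/* ... prepend this level's encapsulation header here ... */
	/* Step down the stack: release our dst, take a reference
	 * on the child. */
	skb->dst = dst_pop(skb->dst);
	if (skb->dst == NULL) {
		kfree_skb(skb);
		return -EHOSTUNREACH;
	}
	/* Make dst_output() loop and call the child's output next. */
	return NET_XMIT_BYPASS;
}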
extern void dst_init(void);
#endif
......
......@@ -102,12 +102,26 @@ extern int ip_build_xmit(struct sock *sk,
int getfrag (const void *,
char *,
unsigned int,
unsigned int),
unsigned int,
struct sk_buff *),
const void *frag,
unsigned length,
struct ipcm_cookie *ipc,
struct rtable *rt,
int flags);
extern int ip_append_data(struct sock *sk,
int getfrag(void *from, char *to, int offset, int len,
int odd, struct sk_buff *skb),
void *from, int len, int protolen,
struct ipcm_cookie *ipc,
struct rtable *rt,
unsigned int flags);
extern int generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb);
extern ssize_t ip_append_page(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
extern int ip_push_pending_frames(struct sock *sk);
extern void ip_flush_pending_frames(struct sock *sk);
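The intended calling sequence for the new entry points, sketched after the udp_sendmsg() changes later in this patch (example_transport_xmit is illustrative, the 8-byte transport header is an arbitrary stand-in, and error handling is trimmed):
/* Sketch: append payload under the socket lock, then push one datagram. */
static int example_transport_xmit(struct sock *sk, struct msghdr *msg,
				  int len, struct ipcm_cookie *ipc,
				  struct rtable *rt)
{
	int err;
	lock_sock(sk);
	/* Reserve room for an 8-byte transport header in the first
	 * fragment and queue the user data on sk->write_queue. */
	err = ip_append_data(sk, generic_getfrag, msg->msg_iov, len, 8,
			     ipc, rt, msg->msg_flags);
	if (err)
		ip_flush_pending_frames(sk);
	else {
		/* A real transport fills its header at skb->h.raw first,
		 * as udp_push_pending_frames() does, then pushes. */
		err = ip_push_pending_frames(sk);
	}
	release_sock(sk);
	return err;
}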
/*
* Map a multicast IP onto multicast MAC for type Token Ring.
......
......@@ -249,6 +249,8 @@ struct proto {
struct msghdr *msg,
int len, int noblock, int flags,
int *addr_len);
int (*sendpage)(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
int (*bind)(struct sock *sk,
struct sockaddr *uaddr, int addr_len);
......
......@@ -1851,7 +1851,7 @@ static inline void tcp_v4_setup_caps(struct sock *sk, struct dst_entry *dst)
{
sk->route_caps = dst->dev->features;
if (sk->route_caps & NETIF_F_TSO) {
if (sk->no_largesend)
if (sk->no_largesend || dst->header_len)
sk->route_caps &= ~NETIF_F_TSO;
}
}
......
......@@ -76,6 +76,4 @@ extern struct udp_mib udp_statistics[NR_CPUS*2];
#define UDP_INC_STATS_BH(field) SNMP_INC_STATS_BH(udp_statistics, field)
#define UDP_INC_STATS_USER(field) SNMP_INC_STATS_USER(udp_statistics, field)
#define udp_sock inet_sock
#endif /* _UDP_H */
......@@ -40,7 +40,6 @@ static void dst_run_gc(unsigned long);
static struct timer_list dst_gc_timer =
{ data: DST_GC_MIN, function: dst_run_gc };
static void dst_run_gc(unsigned long dummy)
{
int delayed = 0;
......@@ -60,7 +59,11 @@ static void dst_run_gc(unsigned long dummy)
delayed++;
continue;
}
*dstp = dst->next;
if (dst->child) {
dst->child->next = dst->next;
*dstp = dst->child;
} else
*dstp = dst->next;
dst_destroy(dst);
}
if (!dst_garbage_list) {
......@@ -141,10 +144,16 @@ void __dst_free(struct dst_entry * dst)
spin_unlock_bh(&dst_lock);
}
void dst_destroy(struct dst_entry * dst)
struct dst_entry *dst_destroy(struct dst_entry * dst)
{
struct neighbour *neigh = dst->neighbour;
struct hh_cache *hh = dst->hh;
struct dst_entry *child;
struct neighbour *neigh;
struct hh_cache *hh;
again:
neigh = dst->neighbour;
hh = dst->hh;
child = dst->child;
dst->hh = NULL;
if (hh && atomic_dec_and_test(&hh->hh_refcnt))
......@@ -165,6 +174,12 @@ void dst_destroy(struct dst_entry * dst)
atomic_dec(&dst_total);
#endif
kmem_cache_free(dst->ops->kmem_cachep, dst);
dst = child;
if (dst && !atomic_read(&dst->__refcnt))
goto again;
return dst;
}
static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
......
......@@ -774,6 +774,21 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
return sk->prot->sendmsg(iocb, sk, msg, size);
}
ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
struct sock *sk = sock->sk;
/* We may need to bind the socket. */
if (!inet_sk(sk)->num && inet_autobind(sk))
return -EAGAIN;
if (sk->prot->sendpage)
return sk->prot->sendpage(sk, page, offset, size, flags);
return sock_no_sendpage(sock, page, offset, size, flags);
}
int inet_shutdown(struct socket *sock, int how)
{
struct sock *sk = sock->sk;
......@@ -977,7 +992,7 @@ struct proto_ops inet_dgram_ops = {
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
.sendpage = inet_sendpage,
};
struct net_proto_family inet_family_ops = {
......
......@@ -357,12 +357,14 @@ static void icmp_out_count(int type)
* checksum.
*/
static int icmp_glue_bits(const void *p, char *to, unsigned int offset,
unsigned int fraglen)
unsigned int fraglen, struct sk_buff *skb)
{
struct icmp_bxm *icmp_param = (struct icmp_bxm *)p;
struct icmphdr *icmph;
unsigned int csum;
skb->ip_summed = CHECKSUM_NONE;
if (offset) {
icmp_param->csum =
skb_copy_and_csum_bits(icmp_param->skb,
......
......@@ -15,6 +15,7 @@
* Stefan Becker, <stefanb@yello.ping.de>
* Jorge Cwik, <jorge@laser.satlink.net>
* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
* Hirokazu Takahashi, <taka@valinux.co.jp>
*
* See ip_input.c for original log
*
......@@ -38,6 +39,9 @@
* Marc Boucher : When call_out_firewall returns FW_QUEUE,
* silently drop skb instead of failing with -EPERM.
* Detlev Wengorz : Copy protocol for fragments.
* Hirokazu Takahashi: HW checksumming for outgoing UDP
* datagrams.
* Hirokazu Takahashi: sendfile() on UDP works now.
*/
#include <asm/uaccess.h>
......@@ -108,16 +112,9 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb)
return 0;
}
/* Don't just hand NF_HOOK skb->dst->output, in case netfilter hook
changes route */
static inline int
output_maybe_reroute(struct sk_buff *skb)
{
return skb->dst->output(skb);
}
/*
* Add an ip header to a skbuff and send it out.
*
*/
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
u32 saddr, u32 daddr, struct ip_options *opt)
......@@ -153,15 +150,34 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
}
ip_send_check(iph);
skb->priority = sk->priority;
/* Send it out. */
return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
output_maybe_reroute);
dst_output);
}
static inline int ip_finish_output2(struct sk_buff *skb)
{
struct dst_entry *dst = skb->dst;
struct hh_cache *hh = dst->hh;
struct net_device *dev = dst->dev;
/* Be paranoid, rather than too clever. */
if (unlikely(skb_headroom(skb) < dev->hard_header_len
&& dev->hard_header)) {
struct sk_buff *skb2;
skb2 = skb_realloc_headroom(skb, (dev->hard_header_len&~15) + 16);
if (skb2 == NULL) {
kfree_skb(skb);
return -ENOMEM;
}
if (skb->sk)
skb_set_owner_w(skb2, skb->sk);
kfree_skb(skb);
skb = skb2;
}
#ifdef CONFIG_NETFILTER_DEBUG
nf_debug_ip_finish_output2(skb);
......@@ -203,10 +219,6 @@ int ip_mc_output(struct sk_buff *skb)
* If the indicated interface is up and running, send the packet.
*/
IP_INC_STATS(IpOutRequests);
#ifdef CONFIG_IP_ROUTE_NAT
if (rt->rt_flags & RTCF_NAT)
ip_do_nat(skb);
#endif
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
......@@ -251,100 +263,21 @@ int ip_mc_output(struct sk_buff *skb)
newskb->dev, ip_dev_loopback_xmit);
}
return ip_finish_output(skb);
if (skb->len > dev->mtu || skb_shinfo(skb)->frag_list)
return ip_fragment(skb, ip_finish_output);
else
return ip_finish_output(skb);
}
int ip_output(struct sk_buff *skb)
{
#ifdef CONFIG_IP_ROUTE_NAT
struct rtable *rt = (struct rtable*)skb->dst;
#endif
IP_INC_STATS(IpOutRequests);
#ifdef CONFIG_IP_ROUTE_NAT
if (rt->rt_flags&RTCF_NAT)
ip_do_nat(skb);
#endif
return ip_finish_output(skb);
}
/* Queues a packet to be sent, and starts the transmitter if necessary.
* This routine also needs to put in the total length and compute the
* checksum. We used to do this in two stages, ip_build_header() then
* this, but that scheme created a mess when routes disappeared etc.
* So we do it all here, and the TCP send engine has been changed to
* match. (No more unroutable FIN disasters, etc. wheee...) This will
* most likely make other reliable transport layers above IP easier
* to implement under Linux.
*/
static inline int ip_queue_xmit2(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
struct rtable *rt = (struct rtable *)skb->dst;
struct net_device *dev;
struct iphdr *iph = skb->nh.iph;
dev = rt->u.dst.dev;
/* This can happen when the transport layer has segments queued
* with a cached route, and by the time we get here things are
* re-routed to a device with a different MTU than the original
* device. Sick, but we must cover it.
*/
if (skb_headroom(skb) < dev->hard_header_len && dev->hard_header) {
struct sk_buff *skb2;
skb2 = skb_realloc_headroom(skb, (dev->hard_header_len + 15) & ~15);
kfree_skb(skb);
if (skb2 == NULL)
return -ENOMEM;
if (sk)
skb_set_owner_w(skb2, sk);
skb = skb2;
iph = skb->nh.iph;
}
if (skb->len > rt->u.dst.pmtu) {
unsigned int hlen;
if (!(sk->route_caps&NETIF_F_TSO))
goto fragment;
/* Hack zone: all this must be done by TCP. */
hlen = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2));
skb_shinfo(skb)->tso_size = rt->u.dst.pmtu - hlen;
skb_shinfo(skb)->tso_segs =
(skb->len - hlen + skb_shinfo(skb)->tso_size - 1)/
skb_shinfo(skb)->tso_size - 1;
}
ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
/* Add an IP checksum. */
ip_send_check(iph);
skb->priority = sk->priority;
return skb->dst->output(skb);
fragment:
if (ip_dont_fragment(sk, &rt->u.dst)) {
/* Reject packet ONLY if TCP might fragment
* it itself, if we're careful enough.
*/
NETDEBUG(printk(KERN_DEBUG "sending pkt_too_big (len[%u] pmtu[%u]) to self\n",
skb->len, rt->u.dst.pmtu));
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(rt->u.dst.pmtu));
kfree_skb(skb);
return -EMSGSIZE;
}
ip_select_ident(iph, &rt->u.dst, sk);
if (skb->ip_summed == CHECKSUM_HW &&
(skb = skb_checksum_help(skb)) == NULL)
return -ENOMEM;
return ip_fragment(skb, skb->dst->output);
if ((skb->len > skb->dst->dev->mtu || skb_shinfo(skb)->frag_list) &&
!skb_shinfo(skb)->tso_size)
return ip_fragment(skb, ip_finish_output);
else
return ip_finish_output(skb);
}
int ip_queue_xmit(struct sk_buff *skb)
......@@ -415,8 +348,26 @@ int ip_queue_xmit(struct sk_buff *skb)
ip_options_build(skb, opt, inet->daddr, rt, 0);
}
if (skb->len > rt->u.dst.pmtu && (sk->route_caps&NETIF_F_TSO)) {
unsigned int hlen;
/* Hack zone: all this must be done by TCP. */
hlen = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2));
skb_shinfo(skb)->tso_size = rt->u.dst.pmtu - hlen;
skb_shinfo(skb)->tso_segs =
(skb->len - hlen + skb_shinfo(skb)->tso_size - 1)/
skb_shinfo(skb)->tso_size - 1;
}
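/* Worked example of the arithmetic above (an editor's illustration, not
 * part of the patch): with rt->u.dst.pmtu = 1500 and a 20-byte IP header
 * plus a 20-byte TCP header, hlen = 40 and tso_size = 1460.  For
 * skb->len = 5880 (40 + 4*1460), tso_segs = (5840 + 1459)/1460 - 1 = 3,
 * apparently the number of frames the NIC emits beyond the first, which
 * tells ip_select_ident_more() how many extra IP IDs to reserve. */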
ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
/* Add an IP checksum. */
ip_send_check(iph);
skb->priority = sk->priority;
return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
ip_queue_xmit2);
dst_output);
no_route:
IP_INC_STATS(IpOutNoRoutes);
......@@ -424,7 +375,8 @@ int ip_queue_xmit(struct sk_buff *skb)
return -EHOSTUNREACH;
}
/*
/* _Dead beaf_
*
* Build and send a packet, with as little as one copy
*
* Doesn't care much about ip options... option length can be
......@@ -448,7 +400,8 @@ static int ip_build_xmit_slow(struct sock *sk,
int getfrag (const void *,
char *,
unsigned int,
unsigned int),
unsigned int,
struct sk_buff *),
const void *frag,
unsigned length,
struct ipcm_cookie *ipc,
......@@ -462,10 +415,11 @@ static int ip_build_xmit_slow(struct sock *sk,
int mtu;
u16 id;
int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
int hh_len = (rt->u.dst.dev->hard_header_len&~15) + 16;
int nfrags=0;
struct ip_options *opt = ipc->opt;
int df = 0;
int csumselect = CHECKSUM_NONE;
mtu = rt->u.dst.pmtu;
if (ip_dont_fragment(sk, &rt->u.dst))
......@@ -526,6 +480,13 @@ static int ip_build_xmit_slow(struct sock *sk,
if (flags&MSG_PROBE)
goto out;
/*
* Give the upper layer a chance to decide whether to use HW
* checksumming or not.
*/
if (offset == 0 && rt->u.dst.dev->features & (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM))
csumselect = CHECKSUM_HW;
/*
* Begin outputting the bytes.
*/
......@@ -560,6 +521,7 @@ static int ip_build_xmit_slow(struct sock *sk,
skb->priority = sk->priority;
skb->dst = dst_clone(&rt->u.dst);
skb->ip_summed = csumselect;
skb_reserve(skb, hh_len);
/*
......@@ -607,18 +569,18 @@ static int ip_build_xmit_slow(struct sock *sk,
else
iph->ttl = inet->ttl;
iph->protocol = sk->protocol;
iph->check = 0;
iph->saddr = rt->rt_src;
iph->daddr = rt->rt_dst;
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
ip_send_check(iph);
data += iph->ihl*4;
skb->h.raw = data;
}
/*
* User data callback
*/
if (getfrag(frag, data, offset, fraglen-fragheaderlen)) {
if (getfrag(frag, data, offset, fraglen-fragheaderlen, skb)) {
err = -EFAULT;
kfree_skb(skb);
goto error;
......@@ -630,7 +592,7 @@ static int ip_build_xmit_slow(struct sock *sk,
nfrags++;
err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
skb->dst->dev, output_maybe_reroute);
skb->dst->dev, dst_output);
if (err) {
if (err > 0)
err = inet->recverr ? net_xmit_errno(err) : 0;
......@@ -658,7 +620,8 @@ int ip_build_xmit(struct sock *sk,
int getfrag (const void *,
char *,
unsigned int,
unsigned int),
unsigned int,
struct sk_buff *),
const void *frag,
unsigned length,
struct ipcm_cookie *ipc,
......@@ -705,7 +668,7 @@ int ip_build_xmit(struct sock *sk,
* Fast path for unfragmented frames without options.
*/
{
int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
int hh_len = (rt->u.dst.dev->hard_header_len&~15) + 16;
skb = sock_alloc_send_skb(sk, length+hh_len+15,
flags&MSG_DONTWAIT, &err);
......@@ -719,6 +682,13 @@ int ip_build_xmit(struct sock *sk,
skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);
/*
* Give the upper layer a chance to decide whether to use HW
* checksumming or not.
*/
if (rt->u.dst.dev->features & (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM))
skb->ip_summed = CHECKSUM_HW;
if (!inet->hdrincl) {
iph->version=4;
iph->ihl=5;
......@@ -732,18 +702,20 @@ int ip_build_xmit(struct sock *sk,
iph->protocol=sk->protocol;
iph->saddr=rt->rt_src;
iph->daddr=rt->rt_dst;
iph->check=0;
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
err = getfrag(frag, ((char *)iph)+iph->ihl*4,0, length-iph->ihl*4);
ip_send_check(iph);
skb->h.raw = skb->nh.raw + iph->ihl*4;
err = getfrag(frag, ((char *)iph)+iph->ihl*4,0, length-iph->ihl*4, skb);
}
else {
skb->h.raw = skb->nh.raw;
err = getfrag(frag, (void *)iph, 0, length, skb);
}
else
err = getfrag(frag, (void *)iph, 0, length);
if (err)
goto error_fault;
err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
output_maybe_reroute);
dst_output);
if (err > 0)
err = inet->recverr ? net_xmit_errno(err) : 0;
if (err)
......@@ -759,13 +731,37 @@ int ip_build_xmit(struct sock *sk,
return err;
}
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
to->pkt_type = from->pkt_type;
to->priority = from->priority;
to->protocol = from->protocol;
to->security = from->security;
to->dst = dst_clone(from->dst);
to->dev = from->dev;
/* Copy the flags to each fragment. */
IPCB(to)->flags = IPCB(from)->flags;
#ifdef CONFIG_NET_SCHED
to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
to->nfmark = from->nfmark;
/* Connection association is same as pre-frag packet */
to->nfct = from->nfct;
nf_conntrack_get(to->nfct);
#ifdef CONFIG_NETFILTER_DEBUG
to->nf_debug = from->nf_debug;
#endif
#endif
}
/*
* This IP datagram is too large to be sent in one piece. Break it up into
* smaller pieces (each of size equal to IP header plus
* a block of the data of the original IP data part) that will yet fit in a
* single device frame, and queue such a frame for sending.
*
* Yes this is inefficient, feel free to submit a quicker one.
*/
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
......@@ -789,13 +785,111 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
iph = skb->nh.iph;
if (unlikely(iph->frag_off & htons(IP_DF))) {
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(rt->u.dst.pmtu));
kfree_skb(skb);
return -EMSGSIZE;
}
/*
* Setup starting values.
*/
hlen = iph->ihl * 4;
left = skb->len - hlen; /* Space per frame */
mtu = rt->u.dst.pmtu - hlen; /* Size of data space */
/* When frag_list is given, use it. First, check its validity:
* some transformers could create a wrong frag_list or break an existing
* one; it is not prohibited. In that case, fall back to copying.
*
* LATER: this step can be merged to real generation of fragments,
* we can switch to copy when see the first bad fragment.
*/
if (skb_shinfo(skb)->frag_list) {
struct sk_buff *frag;
int first_len = skb_pagelen(skb);
if (first_len - hlen > mtu ||
((first_len - hlen) & 7) ||
(iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
skb_cloned(skb))
goto slow_path;
for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
/* Correct geometry. */
if (frag->len > mtu ||
((frag->len & 7) && frag->next) ||
skb_headroom(frag) < hlen)
goto slow_path;
/* Correct socket ownership. */
if (frag->sk == NULL)
goto slow_path;
/* Partially cloned skb? */
if (skb_shared(frag))
goto slow_path;
}
/* Everything is OK. Generate! */
err = 0;
offset = 0;
frag = skb_shinfo(skb)->frag_list;
skb_shinfo(skb)->frag_list = 0;
skb->data_len = first_len - skb_headlen(skb);
skb->len = first_len;
iph->tot_len = htons(first_len);
iph->frag_off |= htons(IP_MF);
ip_send_check(iph);
for (;;) {
/* Prepare the header of the next frame
* before the previous one goes down. */
if (frag) {
frag->h.raw = frag->data;
frag->nh.raw = __skb_push(frag, hlen);
memcpy(frag->nh.raw, iph, hlen);
iph = frag->nh.iph;
iph->tot_len = htons(frag->len);
ip_copy_metadata(frag, skb);
if (offset == 0)
ip_options_fragment(frag);
offset += skb->len - hlen;
iph->frag_off = htons(offset>>3);
if (frag->next != NULL)
iph->frag_off |= htons(IP_MF);
/* Ready, complete checksum */
ip_send_check(iph);
}
err = output(skb);
if (err || !frag)
break;
skb = frag;
frag = skb->next;
skb->next = NULL;
}
if (err == 0) {
IP_INC_STATS(IpFragOKs);
return 0;
}
while (frag) {
skb = frag->next;
kfree_skb(frag);
frag = skb;
}
IP_INC_STATS(IpFragFails);
return err;
}
slow_path:
left = skb->len - hlen; /* Space per frame */
ptr = raw + hlen; /* Where to start from */
/*
......@@ -823,7 +917,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
* Allocate buffer.
*/
if ((skb2 = alloc_skb(len+hlen+dev->hard_header_len+15,GFP_ATOMIC)) == NULL) {
if ((skb2 = alloc_skb(len+hlen+rt->u.dst.dev->hard_header_len+16,GFP_ATOMIC)) == NULL) {
NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
err = -ENOMEM;
goto fail;
......@@ -833,14 +927,11 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
* Set up data on packet
*/
skb2->pkt_type = skb->pkt_type;
skb2->priority = skb->priority;
skb_reserve(skb2, (dev->hard_header_len+15)&~15);
ip_copy_metadata(skb2, skb);
skb_reserve(skb2, (rt->u.dst.dev->hard_header_len&~15)+16);
skb_put(skb2, len + hlen);
skb2->nh.raw = skb2->data;
skb2->h.raw = skb2->data + hlen;
skb2->protocol = skb->protocol;
skb2->security = skb->security;
/*
* Charge the memory for the fragment to any owner
......@@ -849,8 +940,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
if (skb->sk)
skb_set_owner_w(skb2, skb->sk);
skb2->dst = dst_clone(skb->dst);
skb2->dev = skb->dev;
/*
* Copy the packet header into the new buffer.
......@@ -880,9 +969,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
if (offset == 0)
ip_options_fragment(skb);
/* Copy the flags to each fragment. */
IPCB(skb2)->flags = IPCB(skb)->flags;
/*
* Added AC : If we are fragmenting a fragment that's not the
* last fragment then keep MF on each bit
......@@ -892,19 +978,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
ptr += len;
offset += len;
#ifdef CONFIG_NET_SCHED
skb2->tc_index = skb->tc_index;
#endif
#ifdef CONFIG_NETFILTER
skb2->nfmark = skb->nfmark;
/* Connection association is same as pre-frag packet */
skb2->nfct = skb->nfct;
nf_conntrack_get(skb2->nfct);
#ifdef CONFIG_NETFILTER_DEBUG
skb2->nf_debug = skb->nf_debug;
#endif
#endif
/*
* Put this fragment into the sending queue.
*/
......@@ -929,11 +1002,524 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
return err;
}
int
generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
struct iovec *iov = from;
if (skb->ip_summed == CHECKSUM_HW) {
if (memcpy_fromiovecend(to, iov, offset, len) < 0)
return -EFAULT;
} else {
unsigned int csum = 0;
if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
return -EFAULT;
skb->csum = csum_block_add(skb->csum, csum, odd);
}
return 0;
}
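Any callback with this signature can feed ip_append_data(), honoring the skb's checksum mode the same way. A hypothetical variant for data sitting in a linear kernel buffer (illustrative only; csum_block_add() folds at 'odd' so pieces landing at odd offsets still combine correctly):
/* Sketch of a getfrag for kernel-space data in a linear buffer. */
static int example_kernel_getfrag(void *from, char *to, int offset,
				  int len, int odd, struct sk_buff *skb)
{
	char *buf = from;
	if (skb->ip_summed == CHECKSUM_HW) {
		/* Hardware will checksum the frame; plain copy. */
		memcpy(to, buf + offset, len);
	} else {
		/* Software checksum: copy and fold into skb->csum. */
		unsigned int csum;
		csum = csum_partial_copy_nocheck(buf + offset, to, len, 0);
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}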
static inline int
skb_can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
{
if (i) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
return page == frag->page &&
off == frag->page_offset+frag->size;
}
return 0;
}
static inline void
skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size)
{
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
frag->page = page;
frag->page_offset = off;
frag->size = size;
skb_shinfo(skb)->nr_frags = i+1;
}
static inline unsigned int
csum_page(struct page *page, int offset, int copy)
{
char *kaddr;
unsigned int csum;
kaddr = kmap(page);
csum = csum_partial(kaddr + offset, copy, 0);
kunmap(page);
return csum;
}
/*
* ip_append_data() and ip_append_page() can make one large IP datagram
* from many pieces of data. Each piece will be held on the socket
* until ip_push_pending_frames() is called. Each piece can be a page
* or non-page data.
*
* Not only UDP; other transport protocols, e.g. raw sockets, can
* potentially use this interface as well.
*
* LATER: length must be adjusted for padding at the tail, when required.
*/
int ip_append_data(struct sock *sk,
int getfrag(void *from, char *to, int offset, int len,
int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
struct ipcm_cookie *ipc, struct rtable *rt,
unsigned int flags)
{
struct inet_opt *inet = inet_sk(sk);
struct sk_buff *skb;
struct ip_options *opt = NULL;
int hh_len;
int exthdrlen;
int mtu;
int copy;
int err;
int offset = 0;
unsigned int maxfraglen, fragheaderlen;
int csummode = CHECKSUM_NONE;
if (inet->hdrincl)
return -EPERM;
if (flags&MSG_PROBE)
return 0;
if (skb_queue_empty(&sk->write_queue)) {
/*
* setup for corking.
*/
opt = ipc->opt;
if (opt) {
if (inet->cork.opt == NULL)
inet->cork.opt = kmalloc(sizeof(struct ip_options)+40, GFP_KERNEL);
memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
inet->cork.flags |= IPCORK_OPT;
inet->cork.addr = ipc->addr;
}
dst_hold(&rt->u.dst);
inet->cork.fragsize = mtu = rt->u.dst.pmtu;
inet->cork.rt = rt;
inet->cork.length = 0;
inet->sndmsg_page = NULL;
inet->sndmsg_off = 0;
if ((exthdrlen = rt->u.dst.header_len) != 0) {
length += exthdrlen;
transhdrlen += exthdrlen;
}
} else {
rt = inet->cork.rt;
if (inet->cork.flags & IPCORK_OPT)
opt = inet->cork.opt;
transhdrlen = 0;
exthdrlen = 0;
mtu = inet->cork.fragsize;
}
hh_len = (rt->u.dst.dev->hard_header_len&~15) + 16;
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu-fragheaderlen) & ~7) + fragheaderlen;
if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
return -EMSGSIZE;
}
#if 0 /* Not now */
/*
* transhdrlen > 0 means that this is the first fragment and we want
* it not to be fragmented in the future.
*/
if (transhdrlen &&
length + fragheaderlen <= maxfraglen &&
rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
!exthdrlen)
csummode = CHECKSUM_HW;
#endif
inet->cork.length += length;
if ((skb = skb_peek_tail(&sk->write_queue)) == NULL)
goto alloc_new_skb;
while (length > 0) {
if ((copy = maxfraglen - skb->len) <= 0) {
char *data;
unsigned int datalen;
unsigned int fraglen;
BUG_TRAP(copy == 0);
alloc_new_skb:
datalen = maxfraglen - fragheaderlen;
if (datalen > length)
datalen = length;
fraglen = datalen + fragheaderlen;
if (!(flags & MSG_DONTWAIT) || transhdrlen) {
skb = sock_alloc_send_skb(sk, fraglen + hh_len + 15,
(flags & MSG_DONTWAIT), &err);
} else {
skb = sock_wmalloc(sk, fraglen + hh_len + 15, 1,
sk->allocation);
if (unlikely(skb == NULL))
err = -ENOBUFS;
}
if (skb == NULL)
goto error;
/*
* Fill in the control structures
*/
skb->ip_summed = csummode;
skb->csum = 0;
skb_reserve(skb, hh_len);
/*
* Find where to start putting bytes.
*/
data = skb_put(skb, fraglen);
skb->nh.raw = __skb_pull(skb, exthdrlen);
data += fragheaderlen;
skb->h.raw = data + exthdrlen;
copy = datalen - transhdrlen;
if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, 0, skb) < 0) {
err = -EFAULT;
kfree_skb(skb);
goto error;
}
offset += copy;
length -= datalen;
transhdrlen = 0;
exthdrlen = 0;
csummode = CHECKSUM_NONE;
/*
* Put the packet on the pending queue.
*/
__skb_queue_tail(&sk->write_queue, skb);
continue;
}
if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
int off;
if (!((skb->len - fragheaderlen) & 7))
goto alloc_new_skb;
/*
* Align the start address of the next IP fragment
* on 8 byte boundary.
*/
copy = 8 - ((skb->len - fragheaderlen) & 7);
off = skb->len;
if (copy > length)
copy = length;
if (getfrag(from, skb_put(skb, copy), offset, copy, off, skb) < 0) {
__skb_trim(skb, off);
err = -EFAULT;
goto error;
}
} else {
int i = skb_shinfo(skb)->nr_frags;
skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
struct page *page = inet->sndmsg_page;
int off = inet->sndmsg_off;
unsigned int left;
if (copy > length)
copy = length;
if (page && (left = PAGE_SIZE - off) > 0) {
if (copy >= left)
copy = left;
if (page != frag->page) {
if (i == MAX_SKB_FRAGS) {
err = -EMSGSIZE;
goto error;
}
get_page(page);
skb_fill_page_desc(skb, i, page, inet->sndmsg_off, 0);
frag = &skb_shinfo(skb)->frags[i];
}
} else if (i < MAX_SKB_FRAGS) {
if (copy > PAGE_SIZE)
copy = PAGE_SIZE;
page = alloc_pages(sk->allocation, 0);
if (page == NULL) {
err = -ENOMEM;
goto error;
}
inet->sndmsg_page = page;
inet->sndmsg_off = 0;
skb_fill_page_desc(skb, i, page, 0, 0);
frag = &skb_shinfo(skb)->frags[i];
skb->truesize += PAGE_SIZE;
atomic_add(PAGE_SIZE, &sk->wmem_alloc);
} else {
err = -EMSGSIZE;
goto error;
}
if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
err = -EFAULT;
goto error;
}
inet->sndmsg_off += copy;
frag->size += copy;
skb->len += copy;
skb->data_len += copy;
}
offset += copy;
length -= copy;
}
return 0;
error:
inet->cork.length -= length;
IP_INC_STATS(IpOutDiscards);
return err;
}
ssize_t ip_append_page(struct sock *sk, struct page *page,
int offset, size_t size, int flags)
{
struct inet_opt *inet = inet_sk(sk);
struct sk_buff *skb;
struct rtable *rt;
struct ip_options *opt = NULL;
int hh_len;
int mtu;
int len;
int err;
unsigned int maxfraglen, fragheaderlen;
if (inet->hdrincl)
return -EPERM;
if (flags&MSG_PROBE)
return 0;
if (skb_queue_empty(&sk->write_queue))
return -EINVAL;
rt = inet->cork.rt;
if (inet->cork.flags & IPCORK_OPT)
opt = inet->cork.opt;
if (!(rt->u.dst.dev->features&NETIF_F_SG))
return -EOPNOTSUPP;
hh_len = (rt->u.dst.dev->hard_header_len&~15)+16;
mtu = inet->cork.fragsize;
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu-fragheaderlen) & ~7) + fragheaderlen;
if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
return -EMSGSIZE;
}
if ((skb = skb_peek_tail(&sk->write_queue)) == NULL)
return -EINVAL;
inet->cork.length += size;
while (size > 0) {
int i;
if ((len = maxfraglen - skb->len) <= 0) {
char *data;
struct iphdr *iph;
BUG_TRAP(len == 0);
skb = sock_wmalloc(sk, fragheaderlen + hh_len + 15, 1,
sk->allocation);
if (unlikely(!skb)) {
err = -ENOBUFS;
goto error;
}
/*
* Fill in the control structures
*/
skb->ip_summed = CHECKSUM_NONE;
skb->csum = 0;
skb_reserve(skb, hh_len);
/*
* Find where to start putting bytes.
*/
data = skb_put(skb, fragheaderlen);
skb->nh.iph = iph = (struct iphdr *)data;
data += fragheaderlen;
skb->h.raw = data;
/*
* Put the packet on the pending queue.
*/
__skb_queue_tail(&sk->write_queue, skb);
continue;
}
i = skb_shinfo(skb)->nr_frags;
if (len > size)
len = size;
if (skb_can_coalesce(skb, i, page, offset)) {
skb_shinfo(skb)->frags[i-1].size += len;
} else if (i < MAX_SKB_FRAGS) {
get_page(page);
skb_fill_page_desc(skb, i, page, offset, len);
} else {
err = -EMSGSIZE;
goto error;
}
if (skb->ip_summed == CHECKSUM_NONE) {
unsigned int csum;
csum = csum_page(page, offset, len);
skb->csum = csum_block_add(skb->csum, csum, skb->len);
}
skb->len += len;
skb->data_len += len;
offset += len;
size -= len;
}
return 0;
error:
inet->cork.length -= size;
IP_INC_STATS(IpOutDiscards);
return err;
}
/*
* Combine all pending IP fragments on the socket into one IP datagram
* and push them out.
*/
int ip_push_pending_frames(struct sock *sk)
{
struct sk_buff *skb, *tmp_skb;
struct sk_buff **tail_skb;
struct inet_opt *inet = inet_sk(sk);
struct ip_options *opt = NULL;
struct rtable *rt = inet->cork.rt;
struct iphdr *iph;
int df = 0;
__u8 ttl;
int err = 0;
if ((skb = __skb_dequeue(&sk->write_queue)) == NULL)
goto out;
tail_skb = &(skb_shinfo(skb)->frag_list);
while ((tmp_skb = __skb_dequeue(&sk->write_queue)) != NULL) {
__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
*tail_skb = tmp_skb;
tail_skb = &(tmp_skb->next);
skb->len += tmp_skb->len;
skb->data_len += tmp_skb->len;
#if 0 /* Logically correct, but useless work, ip_fragment() will have to undo */
skb->truesize += tmp_skb->truesize;
__sock_put(tmp_skb->sk);
tmp_skb->destructor = NULL;
tmp_skb->sk = NULL;
#endif
}
if (inet->pmtudisc == IP_PMTUDISC_DO ||
(!skb_shinfo(skb)->frag_list && ip_dont_fragment(sk, &rt->u.dst)))
df = htons(IP_DF);
if (inet->cork.flags & IPCORK_OPT)
opt = inet->cork.opt;
if (rt->rt_type == RTN_MULTICAST)
ttl = inet->mc_ttl;
else
ttl = inet->ttl;
iph = (struct iphdr *)skb->data;
iph->version = 4;
iph->ihl = 5;
if (opt) {
iph->ihl += opt->optlen>>2;
ip_options_build(skb, opt, inet->cork.addr, rt, 0);
}
iph->tos = inet->tos;
iph->tot_len = htons(skb->len);
iph->frag_off = df;
if (!df) {
__ip_select_ident(iph, &rt->u.dst, 0);
} else {
iph->id = htons(inet->id++);
}
iph->ttl = ttl;
iph->protocol = sk->protocol;
iph->saddr = rt->rt_src;
iph->daddr = rt->rt_dst;
ip_send_check(iph);
skb->priority = sk->priority;
skb->dst = dst_clone(&rt->u.dst);
/* Netfilter gets the whole, not yet fragmented skb. */
err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
skb->dst->dev, dst_output);
if (err) {
if (err > 0)
err = inet->recverr ? net_xmit_errno(err) : 0;
if (err)
goto error;
}
out:
inet->cork.flags &= ~IPCORK_OPT;
if (inet->cork.rt) {
ip_rt_put(inet->cork.rt);
inet->cork.rt = NULL;
}
return err;
error:
IP_INC_STATS(IpOutDiscards);
goto out;
}
/*
* Throw away all pending data on the socket.
*/
void ip_flush_pending_frames(struct sock *sk)
{
struct inet_opt *inet = inet_sk(sk);
struct sk_buff *skb;
while ((skb = __skb_dequeue_tail(&sk->write_queue)) != NULL)
kfree_skb(skb);
inet->cork.flags &= ~IPCORK_OPT;
if (inet->cork.opt) {
kfree(inet->cork.opt);
inet->cork.opt = NULL;
}
if (inet->cork.rt) {
ip_rt_put(inet->cork.rt);
inet->cork.rt = NULL;
}
}
/*
* Fetch data from kernel space and fill in checksum if needed.
*/
static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset,
unsigned int fraglen)
unsigned int fraglen, struct sk_buff *skb)
{
struct ip_reply_arg *dp = (struct ip_reply_arg*)dptr;
u16 *pktp = (u16 *)to;
......@@ -962,6 +1548,8 @@ static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset,
if (hdrflag && dp->csumoffset)
*(pktp + dp->csumoffset) = csum_fold(dp->csum); /* fill in checksum */
skb->ip_summed = CHECKSUM_NONE;
return 0;
}
......@@ -971,6 +1559,8 @@ static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset,
*
* Should run single threaded per socket because it uses the sock
* structure to pass arguments.
*
* LATER: switch from ip_build_xmit to ip_append_*
*/
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
unsigned int len)
......
......@@ -437,8 +437,10 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt
(!((1<<sk->state)&(TCPF_LISTEN|TCPF_CLOSE))
&& inet->daddr != LOOPBACK4_IPV6)) {
#endif
if (inet->opt)
tp->ext_header_len -= inet->opt->optlen;
if (opt)
tp->ext_header_len = opt->optlen;
tp->ext_header_len += opt->optlen;
tcp_sync_mss(sk, tp->pmtu_cookie);
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
}
......
......@@ -259,9 +259,10 @@ struct rawfakehdr
*/
static int raw_getfrag(const void *p, char *to, unsigned int offset,
unsigned int fraglen)
unsigned int fraglen, struct sk_buff *skb)
{
struct rawfakehdr *rfh = (struct rawfakehdr *) p;
skb->ip_summed = CHECKSUM_NONE; /* Is there any good place to set it? */
return memcpy_fromiovecend(to, rfh->iov, offset, fraglen);
}
......@@ -270,10 +271,12 @@ static int raw_getfrag(const void *p, char *to, unsigned int offset,
*/
static int raw_getrawfrag(const void *p, char *to, unsigned int offset,
unsigned int fraglen)
unsigned int fraglen, struct sk_buff *skb)
{
struct rawfakehdr *rfh = (struct rawfakehdr *) p;
skb->ip_summed = CHECKSUM_NONE; /* Is there any good place to set it? */
if (memcpy_fromiovecend(to, rfh->iov, offset, fraglen))
return -EFAULT;
......
......@@ -204,6 +204,8 @@
* Andi Kleen : Make poll agree with SIGIO
* Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
* lingertime == 0 (RFC 793 ABORT Call)
* Hirokazu Takahashi : Use copy_from_user() instead of
* csum_and_copy_from_user() if possible.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
......@@ -958,8 +960,8 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
return res;
}
#define TCP_PAGE(sk) (tcp_sk(sk)->sndmsg_page)
#define TCP_OFF(sk) (tcp_sk(sk)->sndmsg_off)
#define TCP_PAGE(sk) (inet_sk(sk)->sndmsg_page)
#define TCP_OFF(sk) (inet_sk(sk)->sndmsg_off)
static inline int tcp_copy_to_page(struct sock *sk, char *from,
struct sk_buff *skb, struct page *page,
......@@ -968,18 +970,22 @@ static inline int tcp_copy_to_page(struct sock *sk, char *from,
int err = 0;
unsigned int csum;
csum = csum_and_copy_from_user(from, page_address(page) + off,
if (skb->ip_summed == CHECKSUM_NONE) {
csum = csum_and_copy_from_user(from, page_address(page) + off,
copy, 0, &err);
if (!err) {
if (skb->ip_summed == CHECKSUM_NONE)
skb->csum = csum_block_add(skb->csum, csum, skb->len);
skb->len += copy;
skb->data_len += copy;
skb->truesize += copy;
sk->wmem_queued += copy;
sk->forward_alloc -= copy;
if (err) return err;
skb->csum = csum_block_add(skb->csum, csum, skb->len);
} else {
if (copy_from_user(page_address(page) + off, from, copy))
return -EFAULT;
}
return err;
skb->len += copy;
skb->data_len += copy;
skb->truesize += copy;
sk->wmem_queued += copy;
sk->forward_alloc -= copy;
return 0;
}
static inline int skb_add_data(struct sk_buff *skb, char *from, int copy)
......@@ -988,11 +994,16 @@ static inline int skb_add_data(struct sk_buff *skb, char *from, int copy)
unsigned int csum;
int off = skb->len;
csum = csum_and_copy_from_user(from, skb_put(skb, copy),
if (skb->ip_summed == CHECKSUM_NONE) {
csum = csum_and_copy_from_user(from, skb_put(skb, copy),
copy, 0, &err);
if (!err) {
skb->csum = csum_block_add(skb->csum, csum, off);
return 0;
if (!err) {
skb->csum = csum_block_add(skb->csum, csum, off);
return 0;
}
} else {
if (!copy_from_user(skb_put(skb, copy), from, copy))
return 0;
}
__skb_trim(skb, off);
......@@ -1075,6 +1086,12 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (!skb)
goto wait_for_memory;
/*
* Check whether we can use HW checksum.
*/
if (sk->route_caps & (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM))
skb->ip_summed = CHECKSUM_HW;
skb_entail(sk, tp, skb);
copy = mss_now;
}
......
......@@ -781,6 +781,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
__sk_dst_set(sk, &rt->u.dst);
tcp_v4_setup_caps(sk, &rt->u.dst);
tp->ext_header_len += rt->u.dst.header_len;
if (!inet->opt || !inet->opt->srr)
daddr = rt->rt_dst;
......@@ -1577,6 +1578,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newtp->ext_header_len = 0;
if (newinet->opt)
newtp->ext_header_len = newinet->opt->optlen;
newtp->ext_header_len += dst->header_len;
newinet->id = newtp->write_seq ^ jiffies;
tcp_sync_mss(newsk, dst->pmtu);
......@@ -2087,8 +2089,8 @@ static int tcp_v4_destroy_sock(struct sock *sk)
tcp_put_port(sk);
/* If sendmsg cached page exists, toss it. */
if (tp->sndmsg_page)
__free_page(tp->sndmsg_page);
if (inet_sk(sk)->sndmsg_page)
__free_page(inet_sk(sk)->sndmsg_page);
atomic_dec(&tcp_sockets_allocated);
......
......@@ -11,6 +11,7 @@
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
* Alan Cox, <Alan.Cox@linux.org>
* Hirokazu Takahashi, <taka@valinux.co.jp>
*
* Fixes:
* Alan Cox : verify_area() calls
......@@ -62,6 +63,9 @@
* Janos Farkas : don't deliver multi/broadcasts to a different
* bound-to-device socket
* Arnaldo C. Melo : move proc routines to ip_proc.c.
* Hirokazu Takahashi : HW checksumming for outgoing UDP
* datagrams.
* Hirokazu Takahashi : sendfile() on UDP works now.
*
*
* This program is free software; you can redistribute it and/or
......@@ -365,6 +369,95 @@ void udp_err(struct sk_buff *skb, u32 info)
sock_put(sk);
}
/*
* Throw away all pending data and cancel the corking. Socket is locked.
*/
static void udp_flush_pending_frames(struct sock *sk)
{
struct udp_opt *up = udp_sk(sk);
if (up->pending) {
up->pending = 0;
ip_flush_pending_frames(sk);
}
}
/*
* Push out all pending data as one UDP datagram. Socket is locked.
*/
static int udp_push_pending_frames(struct sock *sk, struct udp_opt *up)
{
struct sk_buff *skb;
struct udphdr *uh;
int err = 0;
/* Grab the skbuff where UDP header space exists. */
if ((skb = skb_peek(&sk->write_queue)) == NULL)
goto out;
/*
* Create a UDP header
*/
uh = skb->h.uh;
uh->source = up->sport;
uh->dest = up->dport;
uh->len = htons(up->len);
uh->check = 0;
if (sk->no_check == UDP_CSUM_NOXMIT) {
skb->ip_summed = CHECKSUM_NONE;
goto send;
}
if (skb_queue_len(&sk->write_queue) == 1) {
/*
* Only one fragment on the socket.
*/
if (skb->ip_summed == CHECKSUM_HW) {
skb->csum = offsetof(struct udphdr, check);
uh->check = ~csum_tcpudp_magic(up->saddr, up->daddr,
up->len, IPPROTO_UDP, 0);
} else {
skb->csum = csum_partial((char *)uh,
sizeof(struct udphdr), skb->csum);
uh->check = csum_tcpudp_magic(up->saddr, up->daddr,
up->len, IPPROTO_UDP, skb->csum);
if (uh->check == 0)
uh->check = -1;
}
} else {
unsigned int csum = 0;
/*
* HW checksumming won't work when there are two or more
* fragments on the socket, since the csums of all sk_buffs
* have to be combined in software.
*/
if (skb->ip_summed == CHECKSUM_HW) {
int offset = (unsigned char *)uh - skb->data;
skb->csum = skb_checksum(skb, offset, skb->len - offset, 0);
skb->ip_summed = CHECKSUM_NONE;
} else {
skb->csum = csum_partial((char *)uh,
sizeof(struct udphdr), skb->csum);
}
skb_queue_walk(&sk->write_queue, skb) {
csum = csum_add(csum, skb->csum);
}
uh->check = csum_tcpudp_magic(up->saddr, up->daddr,
up->len, IPPROTO_UDP, csum);
if (uh->check == 0)
uh->check = -1;
}
send:
err = ip_push_pending_frames(sk);
out:
up->len = 0;
up->pending = 0;
return err;
}
static unsigned short udp_check(struct udphdr *uh, int len, unsigned long saddr, unsigned long daddr, unsigned long base)
{
......@@ -384,10 +477,19 @@ struct udpfakehdr
* Copy and checksum a UDP packet from user space into a buffer.
*/
static int udp_getfrag(const void *p, char * to, unsigned int offset, unsigned int fraglen)
static int udp_getfrag(const void *p, char * to, unsigned int offset, unsigned int fraglen, struct sk_buff *skb)
{
struct udpfakehdr *ufh = (struct udpfakehdr *)p;
if (offset==0) {
if (skb->ip_summed == CHECKSUM_HW) {
skb->csum = offsetof(struct udphdr, check);
ufh->uh.check = ~csum_tcpudp_magic(ufh->saddr, ufh->daddr,
ntohs(ufh->uh.len), IPPROTO_UDP, ufh->wcheck);
memcpy(to, ufh, sizeof(struct udphdr));
return memcpy_fromiovecend(to+sizeof(struct udphdr), ufh->iov, offset,
fraglen-sizeof(struct udphdr));
}
if (csum_partial_copy_fromiovecend(to+sizeof(struct udphdr), ufh->iov, offset,
fraglen-sizeof(struct udphdr), &ufh->wcheck))
return -EFAULT;
......@@ -411,10 +513,11 @@ static int udp_getfrag(const void *p, char * to, unsigned int offset, unsigned i
* Copy a UDP packet from user space into a buffer without checksumming.
*/
static int udp_getfrag_nosum(const void *p, char * to, unsigned int offset, unsigned int fraglen)
static int udp_getfrag_nosum(const void *p, char * to, unsigned int offset, unsigned int fraglen, struct sk_buff *skb)
{
struct udpfakehdr *ufh = (struct udpfakehdr *)p;
skb->ip_summed = CHECKSUM_NONE;
if (offset==0) {
memcpy(to, ufh, sizeof(struct udphdr));
return memcpy_fromiovecend(to+sizeof(struct udphdr), ufh->iov, offset,
......@@ -428,7 +531,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
int len)
{
struct inet_opt *inet = inet_sk(sk);
int ulen = len + sizeof(struct udphdr);
struct udp_opt *up = udp_sk(sk);
int ulen = len;
struct ipcm_cookie ipc;
struct udpfakehdr ufh;
struct rtable *rt = NULL;
......@@ -437,6 +541,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
u32 daddr;
u8 tos;
int err;
int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
/* This check is ONLY to check for arithmetic overflow
on integer(!) len. Not more! Real check will be made
......@@ -459,10 +564,26 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (msg->msg_flags&MSG_OOB) /* Mirror BSD error message compatibility */
return -EOPNOTSUPP;
ipc.opt = NULL;
if (up->pending) {
/*
* There are pending frames.
* The socket lock must be held while it's corked.
*/
lock_sock(sk);
if (likely(up->pending))
goto do_append_data;
release_sock(sk);
NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 1\n"));
return -EINVAL;
}
ulen += sizeof(struct udphdr);
/*
* Get and verify the address.
*/
if (msg->msg_name) {
struct sockaddr_in * usin = (struct sockaddr_in*)msg->msg_name;
if (msg->msg_namelen < sizeof(*usin))
......@@ -489,7 +610,6 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
ipc.addr = inet->saddr;
ufh.uh.source = inet->sport;
ipc.opt = NULL;
ipc.oif = sk->bound_dev_if;
if (msg->msg_controllen) {
err = ip_cmsg_send(msg, &ipc);
......@@ -558,6 +678,29 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
ufh.iov = msg->msg_iov;
ufh.wcheck = 0;
/* 0x80000000 is a temporary hook for testing the new output path */
if (corkreq || rt->u.dst.header_len || (msg->msg_flags&0x80000000)) {
lock_sock(sk);
if (unlikely(up->pending)) {
/* The socket is already corked while preparing it. */
/* ... which is an evident application bug. --ANK */
release_sock(sk);
NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 2\n"));
err = -EINVAL;
goto out;
}
/*
* Now cork the socket to pend data.
*/
up->daddr = ufh.daddr;
up->dport = ufh.uh.dest;
up->saddr = ufh.saddr;
up->sport = ufh.uh.source;
up->pending = 1;
goto do_append_data;
}
/* RFC1122: OK. Provides the checksumming facility (MUST) as per */
/* 4.1.3.4. It's configurable by the application via setsockopt() */
/* (MAY) and it defaults to on (MUST). */
......@@ -584,6 +727,62 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
goto back_from_confirm;
err = 0;
goto out;
do_append_data:
up->len += ulen;
err = ip_append_data(sk, generic_getfrag, msg->msg_iov, ulen, sizeof(struct udphdr), &ipc, rt, msg->msg_flags);
if (err)
udp_flush_pending_frames(sk);
else if (!corkreq)
err = udp_push_pending_frames(sk, up);
release_sock(sk);
goto out;
}
ssize_t udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags)
{
struct udp_opt *up = udp_sk(sk);
int ret;
if (!up->pending) {
struct msghdr msg = { .msg_flags = flags|MSG_MORE };
/* Call udp_sendmsg to specify the destination address, which
* the sendpage interface can't pass.
* This will succeed only if the socket is connected.
*/
ret = udp_sendmsg(NULL, sk, &msg, 0);
if (ret < 0)
return ret;
}
lock_sock(sk);
if (unlikely(!up->pending)) {
release_sock(sk);
NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 3\n"));
return -EINVAL;
}
ret = ip_append_page(sk, page, offset, size, flags);
if (ret == -EOPNOTSUPP) {
release_sock(sk);
return sock_no_sendpage(sk->socket, page, offset, size, flags);
}
if (ret < 0) {
udp_flush_pending_frames(sk);
goto out;
}
up->len += size;
if (!(up->corkflag || (flags&MSG_MORE)))
ret = udp_push_pending_frames(sk, up);
if (!ret)
ret = size;
out:
release_sock(sk);
return ret;
}
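A hypothetical userspace counterpart for the "sendfile() on UDP works now" item in the changelog (descriptors are assumed, and the socket must be connected so udp_sendpage() can address the datagram):
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sendfile.h>
#include <netinet/in.h>
#include <netinet/udp.h>
/* Sketch: transmit 'count' bytes of a file as one UDP datagram.
 * 'sock' is assumed to be a connect()ed UDP socket and 'count' small
 * enough to fit a single datagram. */
static int udp_sendfile_once(int sock, int filefd, size_t count)
{
	off_t pos = 0;
	int on = 1, off = 0;
	/* Cork so the file pages accumulate in one pending datagram. */
	if (setsockopt(sock, IPPROTO_UDP, UDP_CORK, &on, sizeof(on)) < 0)
		return -1;
	if (sendfile(sock, filefd, &pos, count) < 0)
		return -1;
	/* Uncork: udp_push_pending_frames() emits the datagram. */
	return setsockopt(sock, IPPROTO_UDP, UDP_CORK, &off, sizeof(off));
}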
/*
......@@ -985,16 +1184,99 @@ int udp_rcv(struct sk_buff *skb)
return(0);
}
static int udp_destroy_sock(struct sock *sk)
{
lock_sock(sk);
udp_flush_pending_frames(sk);
release_sock(sk);
return 0;
}
/*
* Socket option code for UDP
*/
static int udp_setsockopt(struct sock *sk, int level, int optname,
char *optval, int optlen)
{
struct udp_opt *up = udp_sk(sk);
int val;
int err = 0;
if (level != SOL_UDP)
return ip_setsockopt(sk, level, optname, optval, optlen);
if(optlen<sizeof(int))
return -EINVAL;
if (get_user(val, (int *)optval))
return -EFAULT;
switch(optname) {
case UDP_CORK:
if (val != 0) {
up->corkflag = 1;
} else {
up->corkflag = 0;
lock_sock(sk);
udp_push_pending_frames(sk, up);
release_sock(sk);
}
break;
default:
err = -ENOPROTOOPT;
break;
};
return err;
}
static int udp_getsockopt(struct sock *sk, int level, int optname,
char *optval, int *optlen)
{
struct udp_opt *up = udp_sk(sk);
int val, len;
if (level != SOL_UDP)
return ip_getsockopt(sk, level, optname, optval, optlen);
if(get_user(len,optlen))
return -EFAULT;
len = min_t(unsigned int, len, sizeof(int));
if(len < 0)
return -EINVAL;
switch(optname) {
case UDP_CORK:
val = up->corkflag;
break;
default:
return -ENOPROTOOPT;
};
if(put_user(len, optlen))
return -EFAULT;
if(copy_to_user(optval, &val,len))
return -EFAULT;
return 0;
}
struct proto udp_prot = {
.name = "UDP",
.close = udp_close,
.connect = udp_connect,
.disconnect = udp_disconnect,
.ioctl = udp_ioctl,
.setsockopt = ip_setsockopt,
.getsockopt = ip_getsockopt,
.destroy = udp_destroy_sock,
.setsockopt = udp_setsockopt,
.getsockopt = udp_getsockopt,
.sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg,
.sendpage = udp_sendpage,
.backlog_rcv = udp_queue_rcv_skb,
.hash = udp_v4_hash,
.unhash = udp_v4_unhash,
......
......@@ -1876,6 +1876,7 @@ static int tcp_v6_init_sock(struct sock *sk)
static int tcp_v6_destroy_sock(struct sock *sk)
{
struct tcp_opt *tp = tcp_sk(sk);
struct inet_opt *inet = inet_sk(sk);
tcp_clear_xmit_timers(sk);
......@@ -1893,8 +1894,8 @@ static int tcp_v6_destroy_sock(struct sock *sk)
tcp_put_port(sk);
/* If sendmsg cached page exists, toss it. */
if (tp->sndmsg_page != NULL)
__free_page(tp->sndmsg_page);
if (inet->sndmsg_page != NULL)
__free_page(inet->sndmsg_page);
atomic_dec(&tcp_sockets_allocated);
......