Commit 5a5f0792 authored by David S. Miller

Merge branch 'pskb_extract'

Sowmini Varadhan says:

====================
pskb_extract() helper function.

This patchset follows up on the discussion in
 https://www.mail-archive.com/netdev@vger.kernel.org/msg105090.html

For RDS-TCP, we have to deal with the full gamut of
nonlinear sk_buffs, including all the frag_list variants.
Also, the parent skb has to remain unchanged, while the clone
is queued for Rx on the PF_RDS socket.

Patch 1 of this patchset adds a pskb_extract() function that
does all this without the redundant memcpy's in pskb_expand_head()
and __pskb_pull_tail().

v2: Marcelo Leitner review comments
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 557fc4a0 947d2756
...@@ -2986,6 +2986,8 @@ struct sk_buff *skb_vlan_untag(struct sk_buff *skb); ...@@ -2986,6 +2986,8 @@ struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
int skb_ensure_writable(struct sk_buff *skb, int write_len); int skb_ensure_writable(struct sk_buff *skb, int write_len);
int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_pop(struct sk_buff *skb);
int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy,
gfp_t gfp);
static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len) static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len)
{ {
......
...@@ -4622,3 +4622,245 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, ...@@ -4622,3 +4622,245 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
return NULL; return NULL;
} }
EXPORT_SYMBOL(alloc_skb_with_frags); EXPORT_SYMBOL(alloc_skb_with_frags);
/**
 * pskb_carve_inside_header - carve out the first @off bytes when @off < headlen
 * @skb: buffer to carve in place
 * @off: number of leading bytes to remove; must lie inside the linear area
 * @headlen: current length of the linear area of @skb
 * @gfp_mask: allocation flags used for the replacement head
 *
 * A new head is allocated, the (@headlen - @off) linear bytes that survive
 * the carve and the shared info (including the frags array) are copied into
 * it, and @skb is switched over to the new head.
 *
 * @skb->len is only adjusted once no further failure is possible, so a
 * failed call does not leave @skb with a shortened length.
 *
 * Return: 0 on success, -ENOMEM if the new head cannot be allocated or
 * skb_orphan_frags() fails.
 */
static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
				    const int headlen, gfp_t gfp_mask)
{
	int i;
	int size = skb_end_offset(skb);
	int new_hlen = headlen - off;
	u8 *data;

	size = SKB_DATA_ALIGN(size);

	if (skb_pfmemalloc(skb))
		gfp_mask |= __GFP_MEMALLOC;
	data = kmalloc_reserve(size +
			       SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
			       gfp_mask, NUMA_NO_NODE, NULL);
	if (!data)
		return -ENOMEM;

	/* Use whatever the allocator really handed out. */
	size = SKB_WITH_OVERHEAD(ksize(data));

	/* Copy real data, and all frags */
	skb_copy_from_linear_data_offset(skb, off, data, new_hlen);

	memcpy((struct skb_shared_info *)(data + size),
	       skb_shinfo(skb),
	       offsetof(struct skb_shared_info,
			frags[skb_shinfo(skb)->nr_frags]));
	if (skb_cloned(skb)) {
		/* drop the old head gracefully */
		if (skb_orphan_frags(skb, gfp_mask)) {
			kfree(data);
			return -ENOMEM;
		}
		/* The copied shared info needs its own references. */
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
			skb_frag_ref(skb, i);
		if (skb_has_frag_list(skb))
			skb_clone_fraglist(skb);
		skb_release_data(skb);
	} else {
		/* we can reuse existing refcount - all we did was
		 * relocate values
		 */
		skb_free_head(skb);
	}

	/* Nothing below can fail: account for the carved bytes now. */
	skb->len -= off;

	skb->head = data;
	skb->data = data;
	skb->head_frag = 0;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
	skb->end = size;
#else
	skb->end = skb->head + size;
#endif
	skb_set_tail_pointer(skb, skb_headlen(skb));
	skb_headers_offset_update(skb, 0);
	skb->cloned = 0;
	skb->hdr_len = 0;
	skb->nohdr = 0;
	atomic_set(&skb_shinfo(skb)->dataref, 1);

	return 0;
}
static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp);

/**
 * pskb_carve_frag_list - carve out the first @eat bytes from a frag_list
 * @skb: owner of the frag list (not referenced by this helper)
 * @shinfo: shared info whose frag_list is to be carved
 * @eat: number of leading bytes to remove from the list
 * @gfp_mask: allocation flags for clones and nested carves
 *
 * Walks @shinfo->frag_list. Fragments that lie completely inside the first
 * @eat bytes are unlinked and released. The fragment containing the split
 * point is carved through pskb_carve(), so this may recurse. A shared
 * fragment is cloned first and the clone takes its place in the list.
 *
 * @shinfo->frag_list is not modified when an error is returned.
 *
 * Return: 0 on success, -EFAULT if the list holds fewer than @eat bytes,
 * -ENOMEM if cloning fails, or the error reported by pskb_carve().
 */
static int pskb_carve_frag_list(struct sk_buff *skb,
				struct skb_shared_info *shinfo, int eat,
				gfp_t gfp_mask)
{
	struct sk_buff *list = shinfo->frag_list;
	struct sk_buff *clone = NULL;
	struct sk_buff *insp = NULL;
	int err;

	do {
		if (!list) {
			pr_err("Not enough bytes to eat. Want %d\n", eat);
			return -EFAULT;
		}
		if (list->len <= eat) {
			/* Eaten as whole. */
			eat -= list->len;
			list = list->next;
			insp = list;
		} else {
			/* Eaten partially. */
			if (skb_shared(list)) {
				clone = skb_clone(list, gfp_mask);
				if (!clone)
					return -ENOMEM;
				insp = list->next;
				list = clone;
			} else {
				/* This may be pulled without problems. */
				insp = list;
			}
			err = pskb_carve(list, eat, gfp_mask);
			if (err < 0) {
				/* clone is NULL when list was not shared */
				kfree_skb(clone);
				return err;
			}
			break;
		}
	} while (eat);

	/* Release pulled out fragments. They were consumed, not dropped. */
	while ((list = shinfo->frag_list) != insp) {
		shinfo->frag_list = list->next;
		consume_skb(list);
	}
	/* And insert new clone at head. */
	if (clone) {
		clone->next = list;
		shinfo->frag_list = clone;
	}
	return 0;
}
/**
 * pskb_carve_inside_nonlinear - carve off the first @off bytes of @skb when
 *	the split line lies in the non-linear part
 * @skb: buffer to carve in place
 * @off: number of leading bytes to remove
 * @pos: byte offset at which the page frags start (pskb_carve() passes the
 *	linear length)
 * @gfp_mask: allocation flags for the replacement head and nested carves
 *
 * A new head is allocated and receives a copy of the shared info. Page frags
 * that end at or before @off are left out, the frag straddling @off is
 * trimmed, and if no page frag survives the split line is looked up in the
 * frag list. Afterwards @skb has an empty linear area and
 * data_len == len.
 *
 * Return: 0 on success, -ENOMEM if allocation or skb_orphan_frags() fails,
 * or the error returned by pskb_carve_frag_list().
 */
static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
				       int pos, gfp_t gfp_mask)
{
	int i, k = 0;
	int size = skb_end_offset(skb);
	u8 *data;
	const int nfrags = skb_shinfo(skb)->nr_frags;
	struct skb_shared_info *shinfo;
	int err;

	size = SKB_DATA_ALIGN(size);

	if (skb_pfmemalloc(skb))
		gfp_mask |= __GFP_MEMALLOC;
	data = kmalloc_reserve(size +
			       SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
			       gfp_mask, NUMA_NO_NODE, NULL);
	if (!data)
		return -ENOMEM;

	/* Use whatever the allocator really handed out. */
	size = SKB_WITH_OVERHEAD(ksize(data));

	memcpy((struct skb_shared_info *)(data + size),
	       skb_shinfo(skb), offsetof(struct skb_shared_info,
					 frags[skb_shinfo(skb)->nr_frags]));
	if (skb_orphan_frags(skb, gfp_mask)) {
		kfree(data);
		return -ENOMEM;
	}
	shinfo = (struct skb_shared_info *)(data + size);
	for (i = 0; i < nfrags; i++) {
		int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]);

		if (pos + fsize > off) {
			shinfo->frags[k] = skb_shinfo(skb)->frags[i];

			if (pos < off) {
				/* The split line falls inside this frag.
				 * It is necessarily the first frag kept
				 * (k == 0), so trim the carved bytes off
				 * the front of frags[0].
				 */
				shinfo->frags[0].page_offset += off - pos;
				skb_frag_size_sub(&shinfo->frags[0], off - pos);
			}
			skb_frag_ref(skb, i);
			k++;
		}
		pos += fsize;
	}
	shinfo->nr_frags = k;
	if (skb_has_frag_list(skb))
		skb_clone_fraglist(skb);

	if (k == 0) {
		/* split line is in frag list */
		err = pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask);
		if (err) {
			/* No page frag references were taken (k == 0), so
			 * only the references obtained by
			 * skb_clone_fraglist() and the new head have to be
			 * given back.
			 */
			if (skb_has_frag_list(skb))
				kfree_skb_list(skb_shinfo(skb)->frag_list);
			kfree(data);
			return err;
		}
	}
	skb_release_data(skb);

	skb->head = data;
	skb->head_frag = 0;
	skb->data = data;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
	skb->end = size;
#else
	skb->end = skb->head + size;
#endif
	skb_reset_tail_pointer(skb);
	skb_headers_offset_update(skb, 0);
	skb->cloned = 0;
	skb->hdr_len = 0;
	skb->nohdr = 0;
	skb->len -= off;
	/* The linear area is now empty: everything left is paged data. */
	skb->data_len = skb->len;
	atomic_set(&skb_shinfo(skb)->dataref, 1);
	return 0;
}
/* Strip @len bytes off the front of @skb. The work is delegated to the
 * helper matching where the cut falls: inside the linear header, or at or
 * beyond its end (page frags / frag list).
 */
static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp)
{
	const int linear = skb_headlen(skb);

	if (len >= linear)
		return pskb_carve_inside_nonlinear(skb, len, linear, gfp);

	return pskb_carve_inside_header(skb, len, linear, gfp);
}
/* Build a new skb holding @to_copy bytes of @skb, beginning at offset @off.
 *
 * @skb is cloned, the first @off bytes are carved off the clone and the
 * clone is then trimmed to @to_copy bytes. Returns the clone, or NULL if
 * cloning, carving or trimming fails (the clone is freed in that case).
 */
struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
			     int to_copy, gfp_t gfp)
{
	struct sk_buff *copy;

	copy = skb_clone(skb, gfp);
	if (!copy)
		return NULL;

	if (pskb_carve(copy, off, gfp) < 0)
		goto drop;
	if (pskb_trim(copy, to_copy))
		goto drop;

	return copy;

drop:
	kfree_skb(copy);
	return NULL;
}
EXPORT_SYMBOL(pskb_extract);
...@@ -207,22 +207,14 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, ...@@ -207,22 +207,14 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
} }
if (left && tc->t_tinc_data_rem) { if (left && tc->t_tinc_data_rem) {
clone = skb_clone(skb, arg->gfp); to_copy = min(tc->t_tinc_data_rem, left);
clone = pskb_extract(skb, offset, to_copy, arg->gfp);
if (!clone) { if (!clone) {
desc->error = -ENOMEM; desc->error = -ENOMEM;
goto out; goto out;
} }
to_copy = min(tc->t_tinc_data_rem, left);
if (!pskb_pull(clone, offset) ||
pskb_trim(clone, to_copy)) {
pr_warn("rds_tcp_data_recv: pull/trim failed "
"left %zu data_rem %zu skb_len %d\n",
left, tc->t_tinc_data_rem, skb->len);
kfree_skb(clone);
desc->error = -ENOMEM;
goto out;
}
skb_queue_tail(&tinc->ti_skb_list, clone); skb_queue_tail(&tinc->ti_skb_list, clone);
rdsdebug("skb %p data %p len %d off %u to_copy %zu -> " rdsdebug("skb %p data %p len %d off %u to_copy %zu -> "
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment