Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 5640f76

Browse files
edumazet authored and
davem330 committed
net: use a per task frag allocator
We currently use a per-socket order-0 page cache for tcp_sendmsg() operations. This page is used to build fragments for skbs. It's done to increase the probability of coalescing small write() calls into single segments in skbs still in the write queue (not yet sent). But it wastes a lot of memory for applications handling many mostly idle sockets, since each socket holds one page in sk->sk_sndmsg_page. It's also quite inefficient to build TSO 64KB packets, because we need about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit the page allocator more than wanted. This patch adds a per-task frag allocator and uses bigger pages, if available. An automatic fallback is done in case of memory pressure (up to 32768 bytes per frag, that's order-3 pages on x86). This increases TCP stream performance by 20% on the loopback device, but also benefits other network devices, since 8x fewer frags are mapped on transmit and unmapped on tx completion. Alexander Duyck mentioned a probable performance win on systems with IOMMU enabled. It's possible some SG-enabled hardware can't cope with bigger fragments, but their ndo_start_xmit() should already handle this, splitting a fragment into sub-fragments, since some arches have PAGE_SIZE=65536. Successfully tested on various ethernet devices (ixgbe, igb, bnx2x, tg3, mellanox mlx4). Signed-off-by: Eric Dumazet <[email protected]> Cc: Ben Hutchings <[email protected]> Cc: Vijay Subramanian <[email protected]> Cc: Alexander Duyck <[email protected]> Tested-by: Vijay Subramanian <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent b98b8ba commit 5640f76

File tree

13 files changed

+167
-200
lines changed

13 files changed

+167
-200
lines changed

include/linux/sched.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1530,6 +1530,9 @@ struct task_struct {
15301530
* cache last used pipe for splice
15311531
*/
15321532
struct pipe_inode_info *splice_pipe;
1533+
1534+
struct page_frag task_frag;
1535+
15331536
#ifdef CONFIG_TASK_DELAY_ACCT
15341537
struct task_delay_info *delays;
15351538
#endif

include/net/inet_sock.h

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,10 +101,8 @@ struct inet_cork {
101101
__be32 addr;
102102
struct ip_options *opt;
103103
unsigned int fragsize;
104-
struct dst_entry *dst;
105104
int length; /* Total length of all frames */
106-
struct page *page;
107-
u32 off;
105+
struct dst_entry *dst;
108106
u8 tx_flags;
109107
};
110108

include/net/sock.h

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -247,8 +247,7 @@ struct cg_proto;
247247
* @sk_stamp: time stamp of last packet received
248248
* @sk_socket: Identd and reporting IO signals
249249
* @sk_user_data: RPC layer private data
250-
* @sk_sndmsg_page: cached page for sendmsg
251-
* @sk_sndmsg_off: cached offset for sendmsg
250+
* @sk_frag: cached page frag
252251
* @sk_peek_off: current peek_offset value
253252
* @sk_send_head: front of stuff to transmit
254253
* @sk_security: used by security modules
@@ -362,9 +361,8 @@ struct sock {
362361
ktime_t sk_stamp;
363362
struct socket *sk_socket;
364363
void *sk_user_data;
365-
struct page *sk_sndmsg_page;
364+
struct page_frag sk_frag;
366365
struct sk_buff *sk_send_head;
367-
__u32 sk_sndmsg_off;
368366
__s32 sk_peek_off;
369367
int sk_write_pending;
370368
#ifdef CONFIG_SECURITY
@@ -2034,18 +2032,23 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk)
20342032

20352033
struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp);
20362034

2037-
static inline struct page *sk_stream_alloc_page(struct sock *sk)
2035+
/**
2036+
* sk_page_frag - return an appropriate page_frag
2037+
* @sk: socket
2038+
*
2039+
* If socket allocation mode allows current thread to sleep, it means it's
2040+
* safe to use the per task page_frag instead of the per socket one.
2041+
*/
2042+
static inline struct page_frag *sk_page_frag(struct sock *sk)
20382043
{
2039-
struct page *page = NULL;
2044+
if (sk->sk_allocation & __GFP_WAIT)
2045+
return &current->task_frag;
20402046

2041-
page = alloc_pages(sk->sk_allocation, 0);
2042-
if (!page) {
2043-
sk_enter_memory_pressure(sk);
2044-
sk_stream_moderate_sndbuf(sk);
2045-
}
2046-
return page;
2047+
return &sk->sk_frag;
20472048
}
20482049

2050+
extern bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);
2051+
20492052
/*
20502053
* Default write policy as shown to user space via poll/select/SIGIO
20512054
*/

kernel/exit.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1046,6 +1046,9 @@ void do_exit(long code)
10461046
if (tsk->splice_pipe)
10471047
__free_pipe_info(tsk->splice_pipe);
10481048

1049+
if (tsk->task_frag.page)
1050+
put_page(tsk->task_frag.page);
1051+
10491052
validate_creds_for_do_exit(tsk);
10501053

10511054
preempt_disable();

kernel/fork.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
330330
tsk->btrace_seq = 0;
331331
#endif
332332
tsk->splice_pipe = NULL;
333+
tsk->task_frag.page = NULL;
333334

334335
account_kernel_stack(ti, 1);
335336

net/core/skbuff.c

Lines changed: 9 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1655,38 +1655,19 @@ static struct page *linear_to_page(struct page *page, unsigned int *len,
16551655
unsigned int *offset,
16561656
struct sk_buff *skb, struct sock *sk)
16571657
{
1658-
struct page *p = sk->sk_sndmsg_page;
1659-
unsigned int off;
1658+
struct page_frag *pfrag = sk_page_frag(sk);
16601659

1661-
if (!p) {
1662-
new_page:
1663-
p = sk->sk_sndmsg_page = alloc_pages(sk->sk_allocation, 0);
1664-
if (!p)
1665-
return NULL;
1666-
1667-
off = sk->sk_sndmsg_off = 0;
1668-
/* hold one ref to this page until it's full */
1669-
} else {
1670-
unsigned int mlen;
1671-
1672-
/* If we are the only user of the page, we can reset offset */
1673-
if (page_count(p) == 1)
1674-
sk->sk_sndmsg_off = 0;
1675-
off = sk->sk_sndmsg_off;
1676-
mlen = PAGE_SIZE - off;
1677-
if (mlen < 64 && mlen < *len) {
1678-
put_page(p);
1679-
goto new_page;
1680-
}
1660+
if (!sk_page_frag_refill(sk, pfrag))
1661+
return NULL;
16811662

1682-
*len = min_t(unsigned int, *len, mlen);
1683-
}
1663+
*len = min_t(unsigned int, *len, pfrag->size - pfrag->offset);
16841664

1685-
memcpy(page_address(p) + off, page_address(page) + *offset, *len);
1686-
sk->sk_sndmsg_off += *len;
1687-
*offset = off;
1665+
memcpy(page_address(pfrag->page) + pfrag->offset,
1666+
page_address(page) + *offset, *len);
1667+
*offset = pfrag->offset;
1668+
pfrag->offset += *len;
16881669

1689-
return p;
1670+
return pfrag->page;
16901671
}
16911672

16921673
static bool spd_can_coalesce(const struct splice_pipe_desc *spd,

net/core/sock.c

Lines changed: 47 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1744,6 +1744,45 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
17441744
}
17451745
EXPORT_SYMBOL(sock_alloc_send_skb);
17461746

1747+
/* On 32bit arches, an skb frag is limited to 2^15 */
1748+
#define SKB_FRAG_PAGE_ORDER get_order(32768)
1749+
1750+
bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1751+
{
1752+
int order;
1753+
1754+
if (pfrag->page) {
1755+
if (atomic_read(&pfrag->page->_count) == 1) {
1756+
pfrag->offset = 0;
1757+
return true;
1758+
}
1759+
if (pfrag->offset < pfrag->size)
1760+
return true;
1761+
put_page(pfrag->page);
1762+
}
1763+
1764+
/* We restrict high order allocations to users that can afford to wait */
1765+
order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1766+
1767+
do {
1768+
gfp_t gfp = sk->sk_allocation;
1769+
1770+
if (order)
1771+
gfp |= __GFP_COMP | __GFP_NOWARN;
1772+
pfrag->page = alloc_pages(gfp, order);
1773+
if (likely(pfrag->page)) {
1774+
pfrag->offset = 0;
1775+
pfrag->size = PAGE_SIZE << order;
1776+
return true;
1777+
}
1778+
} while (--order >= 0);
1779+
1780+
sk_enter_memory_pressure(sk);
1781+
sk_stream_moderate_sndbuf(sk);
1782+
return false;
1783+
}
1784+
EXPORT_SYMBOL(sk_page_frag_refill);
1785+
17471786
static void __lock_sock(struct sock *sk)
17481787
__releases(&sk->sk_lock.slock)
17491788
__acquires(&sk->sk_lock.slock)
@@ -2173,8 +2212,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
21732212
sk->sk_error_report = sock_def_error_report;
21742213
sk->sk_destruct = sock_def_destruct;
21752214

2176-
sk->sk_sndmsg_page = NULL;
2177-
sk->sk_sndmsg_off = 0;
2215+
sk->sk_frag.page = NULL;
2216+
sk->sk_frag.offset = 0;
21782217
sk->sk_peek_off = -1;
21792218

21802219
sk->sk_peer_pid = NULL;
@@ -2417,6 +2456,12 @@ void sk_common_release(struct sock *sk)
24172456
xfrm_sk_free_policy(sk);
24182457

24192458
sk_refcnt_debug_release(sk);
2459+
2460+
if (sk->sk_frag.page) {
2461+
put_page(sk->sk_frag.page);
2462+
sk->sk_frag.page = NULL;
2463+
}
2464+
24202465
sock_put(sk);
24212466
}
24222467
EXPORT_SYMBOL(sk_common_release);

net/ipv4/ip_output.c

Lines changed: 28 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -793,6 +793,7 @@ static int __ip_append_data(struct sock *sk,
793793
struct flowi4 *fl4,
794794
struct sk_buff_head *queue,
795795
struct inet_cork *cork,
796+
struct page_frag *pfrag,
796797
int getfrag(void *from, char *to, int offset,
797798
int len, int odd, struct sk_buff *skb),
798799
void *from, int length, int transhdrlen,
@@ -987,47 +988,30 @@ static int __ip_append_data(struct sock *sk,
987988
}
988989
} else {
989990
int i = skb_shinfo(skb)->nr_frags;
990-
skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
991-
struct page *page = cork->page;
992-
int off = cork->off;
993-
unsigned int left;
994-
995-
if (page && (left = PAGE_SIZE - off) > 0) {
996-
if (copy >= left)
997-
copy = left;
998-
if (page != skb_frag_page(frag)) {
999-
if (i == MAX_SKB_FRAGS) {
1000-
err = -EMSGSIZE;
1001-
goto error;
1002-
}
1003-
skb_fill_page_desc(skb, i, page, off, 0);
1004-
skb_frag_ref(skb, i);
1005-
frag = &skb_shinfo(skb)->frags[i];
1006-
}
1007-
} else if (i < MAX_SKB_FRAGS) {
1008-
if (copy > PAGE_SIZE)
1009-
copy = PAGE_SIZE;
1010-
page = alloc_pages(sk->sk_allocation, 0);
1011-
if (page == NULL) {
1012-
err = -ENOMEM;
1013-
goto error;
1014-
}
1015-
cork->page = page;
1016-
cork->off = 0;
1017991

1018-
skb_fill_page_desc(skb, i, page, 0, 0);
1019-
frag = &skb_shinfo(skb)->frags[i];
1020-
} else {
1021-
err = -EMSGSIZE;
1022-
goto error;
1023-
}
1024-
if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
1025-
offset, copy, skb->len, skb) < 0) {
1026-
err = -EFAULT;
992+
err = -ENOMEM;
993+
if (!sk_page_frag_refill(sk, pfrag))
1027994
goto error;
995+
996+
if (!skb_can_coalesce(skb, i, pfrag->page,
997+
pfrag->offset)) {
998+
err = -EMSGSIZE;
999+
if (i == MAX_SKB_FRAGS)
1000+
goto error;
1001+
1002+
__skb_fill_page_desc(skb, i, pfrag->page,
1003+
pfrag->offset, 0);
1004+
skb_shinfo(skb)->nr_frags = ++i;
1005+
get_page(pfrag->page);
10281006
}
1029-
cork->off += copy;
1030-
skb_frag_size_add(frag, copy);
1007+
copy = min_t(int, copy, pfrag->size - pfrag->offset);
1008+
if (getfrag(from,
1009+
page_address(pfrag->page) + pfrag->offset,
1010+
offset, copy, skb->len, skb) < 0)
1011+
goto error_efault;
1012+
1013+
pfrag->offset += copy;
1014+
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
10311015
skb->len += copy;
10321016
skb->data_len += copy;
10331017
skb->truesize += copy;
@@ -1039,6 +1023,8 @@ static int __ip_append_data(struct sock *sk,
10391023

10401024
return 0;
10411025

1026+
error_efault:
1027+
err = -EFAULT;
10421028
error:
10431029
cork->length -= length;
10441030
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
@@ -1079,8 +1065,6 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
10791065
cork->dst = &rt->dst;
10801066
cork->length = 0;
10811067
cork->tx_flags = ipc->tx_flags;
1082-
cork->page = NULL;
1083-
cork->off = 0;
10841068

10851069
return 0;
10861070
}
@@ -1117,7 +1101,8 @@ int ip_append_data(struct sock *sk, struct flowi4 *fl4,
11171101
transhdrlen = 0;
11181102
}
11191103

1120-
return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1104+
return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
1105+
sk_page_frag(sk), getfrag,
11211106
from, length, transhdrlen, flags);
11221107
}
11231108

@@ -1439,7 +1424,8 @@ struct sk_buff *ip_make_skb(struct sock *sk,
14391424
if (err)
14401425
return ERR_PTR(err);
14411426

1442-
err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1427+
err = __ip_append_data(sk, fl4, &queue, &cork,
1428+
&current->task_frag, getfrag,
14431429
from, length, transhdrlen, flags);
14441430
if (err) {
14451431
__ip_flush_pending_frames(sk, &queue, &cork);

net/ipv4/raw.c

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -131,18 +131,23 @@ static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
131131
* 0 - deliver
132132
* 1 - block
133133
*/
134-
static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
134+
static int icmp_filter(const struct sock *sk, const struct sk_buff *skb)
135135
{
136-
int type;
137-
138-
if (!pskb_may_pull(skb, sizeof(struct icmphdr)))
136+
struct icmphdr _hdr;
137+
const struct icmphdr *hdr;
138+
139+
pr_err("icmp_filter skb_transport_offset %d data-head %ld len %d/%d\n",
140+
skb_transport_offset(skb), skb->data - skb->head, skb->len, skb->data_len);
141+
hdr = skb_header_pointer(skb, skb_transport_offset(skb),
142+
sizeof(_hdr), &_hdr);
143+
pr_err("head %p data %p hdr %p type %d\n", skb->head, skb->data, hdr, hdr ? hdr->type : -1);
144+
if (!hdr)
139145
return 1;
140146

141-
type = icmp_hdr(skb)->type;
142-
if (type < 32) {
147+
if (hdr->type < 32) {
143148
__u32 data = raw_sk(sk)->filter.data;
144149

145-
return ((1 << type) & data) != 0;
150+
return ((1U << hdr->type) & data) != 0;
146151
}
147152

148153
/* Do not block unknown ICMP types */

0 commit comments

Comments
 (0)