Skip to content

Commit 6277d9b

Browse files
q2ven authored and Kernel Patches Daemon committed
net-memcg: Allow decoupling memcg from global protocol memory accounting.
Some protocols (e.g., TCP, UDP) implement memory accounting for socket buffers and charge memory to per-protocol global counters pointed to by sk->sk_proto->memory_allocated. When running under a non-root cgroup, this memory is also charged to the memcg as "sock" in memory.stat. Even when a memcg controls memory usage, sockets of such protocols are still subject to global limits (e.g., /proc/sys/net/ipv4/tcp_mem). This makes it difficult to accurately estimate and configure appropriate global limits, especially in multi-tenant environments. If all workloads were guaranteed to be controlled under memcg, the issue could be worked around by setting tcp_mem[0~2] to UINT_MAX. In reality, this assumption does not always hold, and processes not controlled by memcg lose the seatbelt and can consume memory up to the global limit, becoming noisy neighbour. Let's decouple sockets in memcg from the global per-protocol memory accounting if sockets have SK_BPF_MEMCG_SOCK_ISOLATED in sk->sk_memcg. This simplifies memcg configuration while keeping the global limits within a reasonable range. If mem_cgroup_sk_isolated(sk) returns true, the per-protocol memory accounting is skipped. In __inet_accept(), we need to reclaim counts that are already charged for child sockets because we do not allocate sk->sk_memcg until accept(). Note that trace_sock_exceed_buf_limit() will always show 0 as accounted for the isolated sockets, but this can be obtained via memory.stat. Tested with a script that creates local socket pairs and send()s a bunch of data without recv()ing. 
Setup: # mkdir /sys/fs/cgroup/test # echo $$ >> /sys/fs/cgroup/test/cgroup.procs # sysctl -q net.ipv4.tcp_mem="1000 1000 1000" Without bpf prog: # prlimit -n=524288:524288 bash -c "python3 pressure.py" & # cat /sys/fs/cgroup/test/memory.stat | grep sock sock 22642688 # cat /proc/net/sockstat| grep TCP TCP: inuse 2006 orphan 0 tw 0 alloc 2008 mem 5376 # ss -tn | head -n 5 State Recv-Q Send-Q Local Address:Port Peer Address:Port ESTAB 2000 0 127.0.0.1:34479 127.0.0.1:53188 ESTAB 2000 0 127.0.0.1:34479 127.0.0.1:49972 ESTAB 2000 0 127.0.0.1:34479 127.0.0.1:53868 ESTAB 2000 0 127.0.0.1:34479 127.0.0.1:53554 # nstat | grep Pressure || echo no pressure TcpExtTCPMemoryPressures 1 0.0 With bpf prog in the next patch: # bpftool prog load sk_memcg.bpf.o /sys/fs/bpf/sk_memcg type cgroup/sock_create # bpftool cgroup attach /sys/fs/cgroup/test cgroup_inet_sock_create pinned /sys/fs/bpf/sk_memcg # prlimit -n=524288:524288 bash -c "python3 pressure.py" & # cat /sys/fs/cgroup/test/memory.stat | grep sock sock 2757468160 # cat /proc/net/sockstat | grep TCP TCP: inuse 2006 orphan 0 tw 0 alloc 2008 mem 0 # ss -tn | head -n 5 State Recv-Q Send-Q Local Address:Port Peer Address:Port ESTAB 111000 0 127.0.0.1:36019 127.0.0.1:49026 ESTAB 110000 0 127.0.0.1:36019 127.0.0.1:45630 ESTAB 110000 0 127.0.0.1:36019 127.0.0.1:44870 ESTAB 111000 0 127.0.0.1:36019 127.0.0.1:45274 # nstat | grep Pressure || echo no pressure no pressure Signed-off-by: Kuniyuki Iwashima <[email protected]>
1 parent a74566a commit 6277d9b

File tree

9 files changed

+90
-32
lines changed

9 files changed

+90
-32
lines changed

include/net/proto_memory.h

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,22 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
3131
if (!sk->sk_prot->memory_pressure)
3232
return false;
3333

34-
if (mem_cgroup_sk_enabled(sk) &&
35-
mem_cgroup_sk_under_memory_pressure(sk))
36-
return true;
34+
if (mem_cgroup_sk_enabled(sk)) {
35+
if (mem_cgroup_sk_under_memory_pressure(sk))
36+
return true;
37+
38+
if (mem_cgroup_sk_isolated(sk))
39+
return false;
40+
}
3741

3842
return !!READ_ONCE(*sk->sk_prot->memory_pressure);
3943
}
4044

45+
static inline bool sk_should_enter_memory_pressure(struct sock *sk)
46+
{
47+
return !mem_cgroup_sk_enabled(sk) || !mem_cgroup_sk_isolated(sk);
48+
}
49+
4150
static inline long
4251
proto_memory_allocated(const struct proto *prot)
4352
{

include/net/tcp.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -275,9 +275,13 @@ extern unsigned long tcp_memory_pressure;
275275
/* optimized version of sk_under_memory_pressure() for TCP sockets */
276276
static inline bool tcp_under_memory_pressure(const struct sock *sk)
277277
{
278-
if (mem_cgroup_sk_enabled(sk) &&
279-
mem_cgroup_sk_under_memory_pressure(sk))
280-
return true;
278+
if (mem_cgroup_sk_enabled(sk)) {
279+
if (mem_cgroup_sk_under_memory_pressure(sk))
280+
return true;
281+
282+
if (mem_cgroup_sk_isolated(sk))
283+
return false;
284+
}
281285

282286
return READ_ONCE(tcp_memory_pressure);
283287
}

net/core/sock.c

Lines changed: 44 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1046,17 +1046,21 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
10461046
if (!charged)
10471047
return -ENOMEM;
10481048

1049-
/* pre-charge to forward_alloc */
1050-
sk_memory_allocated_add(sk, pages);
1051-
allocated = sk_memory_allocated(sk);
1052-
/* If the system goes into memory pressure with this
1053-
* precharge, give up and return error.
1054-
*/
1055-
if (allocated > sk_prot_mem_limits(sk, 1)) {
1056-
sk_memory_allocated_sub(sk, pages);
1057-
mem_cgroup_sk_uncharge(sk, pages);
1058-
return -ENOMEM;
1049+
if (!mem_cgroup_sk_isolated(sk)) {
1050+
/* pre-charge to forward_alloc */
1051+
sk_memory_allocated_add(sk, pages);
1052+
allocated = sk_memory_allocated(sk);
1053+
1054+
/* If the system goes into memory pressure with this
1055+
* precharge, give up and return error.
1056+
*/
1057+
if (allocated > sk_prot_mem_limits(sk, 1)) {
1058+
sk_memory_allocated_sub(sk, pages);
1059+
mem_cgroup_sk_uncharge(sk, pages);
1060+
return -ENOMEM;
1061+
}
10591062
}
1063+
10601064
sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
10611065

10621066
WRITE_ONCE(sk->sk_reserved_mem,
@@ -3154,8 +3158,11 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
31543158
if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
31553159
return true;
31563160

3157-
sk_enter_memory_pressure(sk);
3161+
if (sk_should_enter_memory_pressure(sk))
3162+
sk_enter_memory_pressure(sk);
3163+
31583164
sk_stream_moderate_sndbuf(sk);
3165+
31593166
return false;
31603167
}
31613168
EXPORT_SYMBOL(sk_page_frag_refill);
@@ -3268,18 +3275,30 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
32683275
{
32693276
bool memcg_enabled = false, charged = false;
32703277
struct proto *prot = sk->sk_prot;
3271-
long allocated;
3272-
3273-
sk_memory_allocated_add(sk, amt);
3274-
allocated = sk_memory_allocated(sk);
3278+
long allocated = 0;
32753279

32763280
if (mem_cgroup_sk_enabled(sk)) {
3281+
bool isolated = mem_cgroup_sk_isolated(sk);
3282+
32773283
memcg_enabled = true;
32783284
charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge());
3279-
if (!charged)
3285+
3286+
if (isolated && charged)
3287+
return 1;
3288+
3289+
if (!charged) {
3290+
if (!isolated) {
3291+
sk_memory_allocated_add(sk, amt);
3292+
allocated = sk_memory_allocated(sk);
3293+
}
3294+
32803295
goto suppress_allocation;
3296+
}
32813297
}
32823298

3299+
sk_memory_allocated_add(sk, amt);
3300+
allocated = sk_memory_allocated(sk);
3301+
32833302
/* Under limit. */
32843303
if (allocated <= sk_prot_mem_limits(sk, 0)) {
32853304
sk_leave_memory_pressure(sk);
@@ -3358,7 +3377,8 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
33583377

33593378
trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
33603379

3361-
sk_memory_allocated_sub(sk, amt);
3380+
if (allocated)
3381+
sk_memory_allocated_sub(sk, amt);
33623382

33633383
if (charged)
33643384
mem_cgroup_sk_uncharge(sk, amt);
@@ -3397,11 +3417,15 @@ EXPORT_SYMBOL(__sk_mem_schedule);
33973417
*/
33983418
void __sk_mem_reduce_allocated(struct sock *sk, int amount)
33993419
{
3400-
sk_memory_allocated_sub(sk, amount);
3401-
3402-
if (mem_cgroup_sk_enabled(sk))
3420+
if (mem_cgroup_sk_enabled(sk)) {
34033421
mem_cgroup_sk_uncharge(sk, amount);
34043422

3423+
if (mem_cgroup_sk_isolated(sk))
3424+
return;
3425+
}
3426+
3427+
sk_memory_allocated_sub(sk, amount);
3428+
34053429
if (sk_under_global_memory_pressure(sk) &&
34063430
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
34073431
sk_leave_memory_pressure(sk);

net/ipv4/af_inet.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@
9595
#include <net/checksum.h>
9696
#include <net/ip.h>
9797
#include <net/protocol.h>
98+
#include <net/proto_memory.h>
9899
#include <net/arp.h>
99100
#include <net/route.h>
100101
#include <net/ip_fib.h>
@@ -773,8 +774,17 @@ void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *new
773774
*/
774775
amt = sk_mem_pages(newsk->sk_forward_alloc +
775776
atomic_read(&newsk->sk_rmem_alloc));
776-
if (amt)
777+
if (amt) {
778+
/* This amt is already charged globally to
779+
* sk_prot->memory_allocated due to lack of
780+
* sk_memcg until accept(), thus we need to
781+
* reclaim it here if newsk is isolated.
782+
*/
783+
if (mem_cgroup_sk_isolated(newsk))
784+
sk_memory_allocated_sub(newsk, amt);
785+
777786
mem_cgroup_sk_charge(newsk, amt, gfp);
787+
}
778788
}
779789

780790
kmem_cache_charge(newsk, gfp);

net/ipv4/inet_connection_sock.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <net/tcp.h>
2323
#include <net/sock_reuseport.h>
2424
#include <net/addrconf.h>
25+
#include <net/proto_memory.h>
2526

2627
#if IS_ENABLED(CONFIG_IPV6)
2728
/* match_sk*_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses

net/ipv4/tcp.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -908,7 +908,8 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
908908
}
909909
__kfree_skb(skb);
910910
} else {
911-
sk->sk_prot->enter_memory_pressure(sk);
911+
if (sk_should_enter_memory_pressure(sk))
912+
tcp_enter_memory_pressure(sk);
912913
sk_stream_moderate_sndbuf(sk);
913914
}
914915
return NULL;

net/ipv4/tcp_output.c

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3574,12 +3574,18 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
35743574
delta = size - sk->sk_forward_alloc;
35753575
if (delta <= 0)
35763576
return;
3577+
35773578
amt = sk_mem_pages(delta);
35783579
sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3579-
sk_memory_allocated_add(sk, amt);
35803580

3581-
if (mem_cgroup_sk_enabled(sk))
3581+
if (mem_cgroup_sk_enabled(sk)) {
35823582
mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
3583+
3584+
if (mem_cgroup_sk_isolated(sk))
3585+
return;
3586+
}
3587+
3588+
sk_memory_allocated_add(sk, amt);
35833589
}
35843590

35853591
/* Send a FIN. The caller locks the socket for us.

net/mptcp/protocol.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <net/inet_common.h>
1717
#include <net/inet_hashtables.h>
1818
#include <net/protocol.h>
19+
#include <net/proto_memory.h>
1920
#include <net/tcp_states.h>
2021
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
2122
#include <net/transp_v6.h>
@@ -1016,7 +1017,7 @@ static void mptcp_enter_memory_pressure(struct sock *sk)
10161017
mptcp_for_each_subflow(msk, subflow) {
10171018
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
10181019

1019-
if (first)
1020+
if (first && sk_should_enter_memory_pressure(ssk))
10201021
tcp_enter_memory_pressure(ssk);
10211022
sk_stream_moderate_sndbuf(ssk);
10221023

net/tls/tls_device.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#include <linux/netdevice.h>
3636
#include <net/dst.h>
3737
#include <net/inet_connection_sock.h>
38+
#include <net/proto_memory.h>
3839
#include <net/tcp.h>
3940
#include <net/tls.h>
4041
#include <linux/skbuff_ref.h>
@@ -371,7 +372,8 @@ static int tls_do_allocation(struct sock *sk,
371372
if (!offload_ctx->open_record) {
372373
if (unlikely(!skb_page_frag_refill(prepend_size, pfrag,
373374
sk->sk_allocation))) {
374-
READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
375+
if (sk_should_enter_memory_pressure(sk))
376+
READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
375377
sk_stream_moderate_sndbuf(sk);
376378
return -ENOMEM;
377379
}

0 commit comments

Comments
 (0)