Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit b611b77

Browse files
author
Paolo Abeni
committedFeb 29, 2024
Merge tag 'nf-24-02-29' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf
Pablo Neira Ayuso says: ==================== Netfilter fixes for net Patch #1 restores NFPROTO_INET with nft_compat, from Ignat Korchagin. Patch #2 fixes an issue with bridge netfilter and broadcast/multicast packets. There is a day 0 bug in br_netfilter when used with connection tracking. Conntrack assumes that an nf_conn structure that is not yet added to hash table ("unconfirmed"), is only visible by the current cpu that is processing the sk_buff. For bridge this isn't true, sk_buff can get cloned in between, and clones can be processed in parallel on different cpu. This patch disables NAT and conntrack helpers for multicast packets. Patch #3 adds a selftest to cover for the br_netfilter bug. netfilter pull request 24-02-29 * tag 'nf-24-02-29' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf: selftests: netfilter: add bridge conntrack + multicast test case netfilter: bridge: confirm multicast packets before passing them up the stack netfilter: nf_tables: allow NFPROTO_INET in nft_(match/target)_validate() ==================== Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Paolo Abeni <[email protected]>
2 parents 51dd4ee + 6523cf5 commit b611b77

File tree

7 files changed

+338
-1
lines changed

7 files changed

+338
-1
lines changed
 

‎include/linux/netfilter.h

+1
Original file line numberDiff line numberDiff line change
@@ -474,6 +474,7 @@ struct nf_ct_hook {
474474
const struct sk_buff *);
475475
void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb);
476476
void (*set_closing)(struct nf_conntrack *nfct);
477+
int (*confirm)(struct sk_buff *skb);
477478
};
478479
extern const struct nf_ct_hook __rcu *nf_ct_hook;
479480

‎net/bridge/br_netfilter_hooks.c

+96
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,10 @@
4343
#include <linux/sysctl.h>
4444
#endif
4545

46+
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
47+
#include <net/netfilter/nf_conntrack_core.h>
48+
#endif
49+
4650
static unsigned int brnf_net_id __read_mostly;
4751

4852
struct brnf_net {
@@ -553,6 +557,90 @@ static unsigned int br_nf_pre_routing(void *priv,
553557
return NF_STOLEN;
554558
}
555559

560+
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
561+
/* conntracks' nf_confirm logic cannot handle cloned skbs referencing
562+
* the same nf_conn entry, which will happen for multicast (broadcast)
563+
* Frames on bridges.
564+
*
565+
* Example:
566+
* macvlan0
567+
* br0
568+
* ethX ethY
569+
*
570+
* ethX (or Y) receives multicast or broadcast packet containing
571+
* an IP packet, not yet in conntrack table.
572+
*
573+
* 1. skb passes through bridge and fake-ip (br_netfilter)Prerouting.
574+
* -> skb->_nfct now references a unconfirmed entry
575+
* 2. skb is broad/mcast packet. bridge now passes clones out on each bridge
576+
* interface.
577+
* 3. skb gets passed up the stack.
578+
* 4. In macvlan case, macvlan driver retains clone(s) of the mcast skb
579+
* and schedules a work queue to send them out on the lower devices.
580+
*
581+
* The clone skb->_nfct is not a copy, it is the same entry as the
582+
* original skb. The macvlan rx handler then returns RX_HANDLER_PASS.
583+
* 5. Normal conntrack hooks (in NF_INET_LOCAL_IN) confirm the orig skb.
584+
*
585+
* The Macvlan broadcast worker and normal confirm path will race.
586+
*
587+
* This race will not happen if step 2 already confirmed a clone. In that
588+
* case later steps perform skb_clone() with skb->_nfct already confirmed (in
589+
* hash table). This works fine.
590+
*
591+
* But such confirmation won't happen when eb/ip/nftables rules dropped the
592+
* packets before they reached the nf_confirm step in postrouting.
593+
*
594+
* Work around this problem by explicit confirmation of the entry at
595+
* LOCAL_IN time, before upper layer has a chance to clone the unconfirmed
596+
* entry.
597+
*
598+
*/
599+
static unsigned int br_nf_local_in(void *priv,
600+
struct sk_buff *skb,
601+
const struct nf_hook_state *state)
602+
{
603+
struct nf_conntrack *nfct = skb_nfct(skb);
604+
const struct nf_ct_hook *ct_hook;
605+
struct nf_conn *ct;
606+
int ret;
607+
608+
if (!nfct || skb->pkt_type == PACKET_HOST)
609+
return NF_ACCEPT;
610+
611+
ct = container_of(nfct, struct nf_conn, ct_general);
612+
if (likely(nf_ct_is_confirmed(ct)))
613+
return NF_ACCEPT;
614+
615+
WARN_ON_ONCE(skb_shared(skb));
616+
WARN_ON_ONCE(refcount_read(&nfct->use) != 1);
617+
618+
/* We can't call nf_confirm here, it would create a dependency
619+
* on nf_conntrack module.
620+
*/
621+
ct_hook = rcu_dereference(nf_ct_hook);
622+
if (!ct_hook) {
623+
skb->_nfct = 0ul;
624+
nf_conntrack_put(nfct);
625+
return NF_ACCEPT;
626+
}
627+
628+
nf_bridge_pull_encap_header(skb);
629+
ret = ct_hook->confirm(skb);
630+
switch (ret & NF_VERDICT_MASK) {
631+
case NF_STOLEN:
632+
return NF_STOLEN;
633+
default:
634+
nf_bridge_push_encap_header(skb);
635+
break;
636+
}
637+
638+
ct = container_of(nfct, struct nf_conn, ct_general);
639+
WARN_ON_ONCE(!nf_ct_is_confirmed(ct));
640+
641+
return ret;
642+
}
643+
#endif
556644

557645
/* PF_BRIDGE/FORWARD *************************************************/
558646
static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
@@ -964,6 +1052,14 @@ static const struct nf_hook_ops br_nf_ops[] = {
9641052
.hooknum = NF_BR_PRE_ROUTING,
9651053
.priority = NF_BR_PRI_BRNF,
9661054
},
1055+
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
1056+
{
1057+
.hook = br_nf_local_in,
1058+
.pf = NFPROTO_BRIDGE,
1059+
.hooknum = NF_BR_LOCAL_IN,
1060+
.priority = NF_BR_PRI_LAST,
1061+
},
1062+
#endif
9671063
{
9681064
.hook = br_nf_forward,
9691065
.pf = NFPROTO_BRIDGE,

‎net/bridge/netfilter/nf_conntrack_bridge.c

+30
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,30 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb,
291291
return nf_conntrack_in(skb, &bridge_state);
292292
}
293293

294+
static unsigned int nf_ct_bridge_in(void *priv, struct sk_buff *skb,
295+
const struct nf_hook_state *state)
296+
{
297+
enum ip_conntrack_info ctinfo;
298+
struct nf_conn *ct;
299+
300+
if (skb->pkt_type == PACKET_HOST)
301+
return NF_ACCEPT;
302+
303+
/* nf_conntrack_confirm() cannot handle concurrent clones,
304+
* this happens for broad/multicast frames with e.g. macvlan on top
305+
* of the bridge device.
306+
*/
307+
ct = nf_ct_get(skb, &ctinfo);
308+
if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct))
309+
return NF_ACCEPT;
310+
311+
/* let inet prerouting call conntrack again */
312+
skb->_nfct = 0;
313+
nf_ct_put(ct);
314+
315+
return NF_ACCEPT;
316+
}
317+
294318
static void nf_ct_bridge_frag_save(struct sk_buff *skb,
295319
struct nf_bridge_frag_data *data)
296320
{
@@ -385,6 +409,12 @@ static struct nf_hook_ops nf_ct_bridge_hook_ops[] __read_mostly = {
385409
.hooknum = NF_BR_PRE_ROUTING,
386410
.priority = NF_IP_PRI_CONNTRACK,
387411
},
412+
{
413+
.hook = nf_ct_bridge_in,
414+
.pf = NFPROTO_BRIDGE,
415+
.hooknum = NF_BR_LOCAL_IN,
416+
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
417+
},
388418
{
389419
.hook = nf_ct_bridge_post,
390420
.pf = NFPROTO_BRIDGE,

‎net/netfilter/nf_conntrack_core.c

+1
Original file line numberDiff line numberDiff line change
@@ -2756,6 +2756,7 @@ static const struct nf_ct_hook nf_conntrack_hook = {
27562756
.get_tuple_skb = nf_conntrack_get_tuple_skb,
27572757
.attach = nf_conntrack_attach,
27582758
.set_closing = nf_conntrack_set_closing,
2759+
.confirm = __nf_conntrack_confirm,
27592760
};
27602761

27612762
void nf_conntrack_init_end(void)

‎net/netfilter/nft_compat.c

+20
Original file line numberDiff line numberDiff line change
@@ -359,10 +359,20 @@ static int nft_target_validate(const struct nft_ctx *ctx,
359359

360360
if (ctx->family != NFPROTO_IPV4 &&
361361
ctx->family != NFPROTO_IPV6 &&
362+
ctx->family != NFPROTO_INET &&
362363
ctx->family != NFPROTO_BRIDGE &&
363364
ctx->family != NFPROTO_ARP)
364365
return -EOPNOTSUPP;
365366

367+
ret = nft_chain_validate_hooks(ctx->chain,
368+
(1 << NF_INET_PRE_ROUTING) |
369+
(1 << NF_INET_LOCAL_IN) |
370+
(1 << NF_INET_FORWARD) |
371+
(1 << NF_INET_LOCAL_OUT) |
372+
(1 << NF_INET_POST_ROUTING));
373+
if (ret)
374+
return ret;
375+
366376
if (nft_is_base_chain(ctx->chain)) {
367377
const struct nft_base_chain *basechain =
368378
nft_base_chain(ctx->chain);
@@ -610,10 +620,20 @@ static int nft_match_validate(const struct nft_ctx *ctx,
610620

611621
if (ctx->family != NFPROTO_IPV4 &&
612622
ctx->family != NFPROTO_IPV6 &&
623+
ctx->family != NFPROTO_INET &&
613624
ctx->family != NFPROTO_BRIDGE &&
614625
ctx->family != NFPROTO_ARP)
615626
return -EOPNOTSUPP;
616627

628+
ret = nft_chain_validate_hooks(ctx->chain,
629+
(1 << NF_INET_PRE_ROUTING) |
630+
(1 << NF_INET_LOCAL_IN) |
631+
(1 << NF_INET_FORWARD) |
632+
(1 << NF_INET_LOCAL_OUT) |
633+
(1 << NF_INET_POST_ROUTING));
634+
if (ret)
635+
return ret;
636+
617637
if (nft_is_base_chain(ctx->chain)) {
618638
const struct nft_base_chain *basechain =
619639
nft_base_chain(ctx->chain);

‎tools/testing/selftests/netfilter/Makefile

+2-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@ TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \
77
nft_queue.sh nft_meta.sh nf_nat_edemux.sh \
88
ipip-conntrack-mtu.sh conntrack_tcp_unreplied.sh \
99
conntrack_vrf.sh nft_synproxy.sh rpath.sh nft_audit.sh \
10-
conntrack_sctp_collision.sh xt_string.sh
10+
conntrack_sctp_collision.sh xt_string.sh \
11+
bridge_netfilter.sh
1112

1213
HOSTPKG_CONFIG := pkg-config
1314

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
#!/bin/bash
2+
# SPDX-License-Identifier: GPL-2.0
3+
#
4+
# Test bridge netfilter + conntrack, a combination that doesn't really work,
5+
# with multicast/broadcast packets racing for hash table insertion.
6+
7+
# eth0 br0 eth0
8+
# setup is: ns1 <->,ns0 <-> ns3
9+
# ns2 <-' `'-> ns4
10+
11+
# Kselftest framework requirement - SKIP code is 4.
12+
ksft_skip=4
13+
ret=0
14+
15+
sfx=$(mktemp -u "XXXXXXXX")
16+
ns0="ns0-$sfx"
17+
ns1="ns1-$sfx"
18+
ns2="ns2-$sfx"
19+
ns3="ns3-$sfx"
20+
ns4="ns4-$sfx"
21+
22+
ebtables -V > /dev/null 2>&1
23+
if [ $? -ne 0 ];then
24+
echo "SKIP: Could not run test without ebtables"
25+
exit $ksft_skip
26+
fi
27+
28+
ip -Version > /dev/null 2>&1
29+
if [ $? -ne 0 ];then
30+
echo "SKIP: Could not run test without ip tool"
31+
exit $ksft_skip
32+
fi
33+
34+
for i in $(seq 0 4); do
35+
eval ip netns add \$ns$i
36+
done
37+
38+
cleanup() {
39+
for i in $(seq 0 4); do eval ip netns del \$ns$i;done
40+
}
41+
42+
trap cleanup EXIT
43+
44+
do_ping()
45+
{
46+
fromns="$1"
47+
dstip="$2"
48+
49+
ip netns exec $fromns ping -c 1 -q $dstip > /dev/null
50+
if [ $? -ne 0 ]; then
51+
echo "ERROR: ping from $fromns to $dstip"
52+
ip netns exec ${ns0} nft list ruleset
53+
ret=1
54+
fi
55+
}
56+
57+
bcast_ping()
58+
{
59+
fromns="$1"
60+
dstip="$2"
61+
62+
for i in $(seq 1 1000); do
63+
ip netns exec $fromns ping -q -f -b -c 1 -q $dstip > /dev/null 2>&1
64+
if [ $? -ne 0 ]; then
65+
echo "ERROR: ping -b from $fromns to $dstip"
66+
ip netns exec ${ns0} nft list ruleset
67+
fi
68+
done
69+
}
70+
71+
ip link add veth1 netns ${ns0} type veth peer name eth0 netns ${ns1}
72+
if [ $? -ne 0 ]; then
73+
echo "SKIP: Can't create veth device"
74+
exit $ksft_skip
75+
fi
76+
77+
ip link add veth2 netns ${ns0} type veth peer name eth0 netns $ns2
78+
ip link add veth3 netns ${ns0} type veth peer name eth0 netns $ns3
79+
ip link add veth4 netns ${ns0} type veth peer name eth0 netns $ns4
80+
81+
ip -net ${ns0} link set lo up
82+
83+
for i in $(seq 1 4); do
84+
ip -net ${ns0} link set veth$i up
85+
done
86+
87+
ip -net ${ns0} link add br0 type bridge stp_state 0 forward_delay 0 nf_call_iptables 1 nf_call_ip6tables 1 nf_call_arptables 1
88+
if [ $? -ne 0 ]; then
89+
echo "SKIP: Can't create bridge br0"
90+
exit $ksft_skip
91+
fi
92+
93+
# make veth0,1,2 part of bridge.
94+
for i in $(seq 1 3); do
95+
ip -net ${ns0} link set veth$i master br0
96+
done
97+
98+
# add a macvlan on top of the bridge.
99+
MACVLAN_ADDR=ba:f3:13:37:42:23
100+
ip -net ${ns0} link add link br0 name macvlan0 type macvlan mode private
101+
ip -net ${ns0} link set macvlan0 address ${MACVLAN_ADDR}
102+
ip -net ${ns0} link set macvlan0 up
103+
ip -net ${ns0} addr add 10.23.0.1/24 dev macvlan0
104+
105+
# add a macvlan on top of veth4.
106+
MACVLAN_ADDR=ba:f3:13:37:42:24
107+
ip -net ${ns0} link add link veth4 name macvlan4 type macvlan mode vepa
108+
ip -net ${ns0} link set macvlan4 address ${MACVLAN_ADDR}
109+
ip -net ${ns0} link set macvlan4 up
110+
111+
# make the macvlan part of the bridge.
112+
# veth4 is not a bridge port, only the macvlan on top of it.
113+
ip -net ${ns0} link set macvlan4 master br0
114+
115+
ip -net ${ns0} link set br0 up
116+
ip -net ${ns0} addr add 10.0.0.1/24 dev br0
117+
ip netns exec ${ns0} sysctl -q net.bridge.bridge-nf-call-iptables=1
118+
ret=$?
119+
if [ $ret -ne 0 ] ; then
120+
echo "SKIP: bridge netfilter not available"
121+
ret=$ksft_skip
122+
fi
123+
124+
# for testing, so namespaces will reply to ping -b probes.
125+
ip netns exec ${ns0} sysctl -q net.ipv4.icmp_echo_ignore_broadcasts=0
126+
127+
# enable conntrack in ns0 and drop broadcast packets in forward to
128+
# avoid them from getting confirmed in the postrouting hook before
129+
# the cloned skb is passed up the stack.
130+
ip netns exec ${ns0} nft -f - <<EOF
131+
table ip filter {
132+
chain input {
133+
type filter hook input priority 1; policy accept
134+
iifname br0 counter
135+
ct state new accept
136+
}
137+
}
138+
139+
table bridge filter {
140+
chain forward {
141+
type filter hook forward priority 0; policy accept
142+
meta pkttype broadcast ip protocol icmp counter drop
143+
}
144+
}
145+
EOF
146+
147+
# place 1, 2 & 3 in same subnet, connected via ns0:br0.
148+
# ns4 is placed in same subnet as well, but its not
149+
# part of the bridge: the corresponding veth4 is not
150+
# part of the bridge, only its macvlan interface.
151+
for i in $(seq 1 4); do
152+
eval ip -net \$ns$i link set lo up
153+
eval ip -net \$ns$i link set eth0 up
154+
done
155+
for i in $(seq 1 2); do
156+
eval ip -net \$ns$i addr add 10.0.0.1$i/24 dev eth0
157+
done
158+
159+
ip -net ${ns3} addr add 10.23.0.13/24 dev eth0
160+
ip -net ${ns4} addr add 10.23.0.14/24 dev eth0
161+
162+
# test basic connectivity
163+
do_ping ${ns1} 10.0.0.12
164+
do_ping ${ns3} 10.23.0.1
165+
do_ping ${ns4} 10.23.0.1
166+
167+
if [ $ret -eq 0 ];then
168+
echo "PASS: netns connectivity: ns1 can reach ns2, ns3 and ns4 can reach ns0"
169+
fi
170+
171+
bcast_ping ${ns1} 10.0.0.255
172+
173+
# This should deliver broadcast to macvlan0, which is on top of ns0:br0.
174+
bcast_ping ${ns3} 10.23.0.255
175+
176+
# same, this time via veth4:macvlan4.
177+
bcast_ping ${ns4} 10.23.0.255
178+
179+
read t < /proc/sys/kernel/tainted
180+
181+
if [ $t -eq 0 ];then
182+
echo PASS: kernel not tainted
183+
else
184+
echo ERROR: kernel is tainted
185+
ret=1
186+
fi
187+
188+
exit $ret

0 commit comments

Comments
 (0)
Please sign in to comment.