| From e9edc188fc76499b0b9bd60364084037f6d03773 Mon Sep 17 00:00:00 2001 |
| From: Eric Dumazet <edumazet@google.com> |
| Date: Fri, 17 Sep 2021 15:15:56 -0700 |
| Subject: netfilter: conntrack: serialize hash resizes and cleanups |
| |
| From: Eric Dumazet <edumazet@google.com> |
| |
| commit e9edc188fc76499b0b9bd60364084037f6d03773 upstream. |
| |
| Syzbot was able to trigger the following warning [1] |
| |
| No repro found by syzbot yet but I was able to trigger a similar issue |
| by having 2 scripts running in parallel, changing conntrack hash sizes, |
| and: |
| |
| for j in `seq 1 1000` ; do unshare -n /bin/true >/dev/null ; done |
| |
| It would take more than 5 minutes for net_namespace structures |
| to be cleaned up. |
| |
| This is because nf_ct_iterate_cleanup() has to restart every time |
| a resize happened. |
| |
| By adding a mutex, we can serialize hash resizes and cleanups |
| and also make get_next_corpse() faster by skipping over empty |
| buckets. |
| |
| Even without resizes in the picture, this patch considerably |
| speeds up network namespace dismantles. |
| |
| [1] |
| INFO: task syz-executor.0:8312 can't die for more than 144 seconds. |
| task:syz-executor.0 state:R running task stack:25672 pid: 8312 ppid: 6573 flags:0x00004006 |
| Call Trace: |
| context_switch kernel/sched/core.c:4955 [inline] |
| __schedule+0x940/0x26f0 kernel/sched/core.c:6236 |
| preempt_schedule_common+0x45/0xc0 kernel/sched/core.c:6408 |
| preempt_schedule_thunk+0x16/0x18 arch/x86/entry/thunk_64.S:35 |
| __local_bh_enable_ip+0x109/0x120 kernel/softirq.c:390 |
| local_bh_enable include/linux/bottom_half.h:32 [inline] |
| get_next_corpse net/netfilter/nf_conntrack_core.c:2252 [inline] |
| nf_ct_iterate_cleanup+0x15a/0x450 net/netfilter/nf_conntrack_core.c:2275 |
| nf_conntrack_cleanup_net_list+0x14c/0x4f0 net/netfilter/nf_conntrack_core.c:2469 |
| ops_exit_list+0x10d/0x160 net/core/net_namespace.c:171 |
| setup_net+0x639/0xa30 net/core/net_namespace.c:349 |
| copy_net_ns+0x319/0x760 net/core/net_namespace.c:470 |
| create_new_namespaces+0x3f6/0xb20 kernel/nsproxy.c:110 |
| unshare_nsproxy_namespaces+0xc1/0x1f0 kernel/nsproxy.c:226 |
| ksys_unshare+0x445/0x920 kernel/fork.c:3128 |
| __do_sys_unshare kernel/fork.c:3202 [inline] |
| __se_sys_unshare kernel/fork.c:3200 [inline] |
| __x64_sys_unshare+0x2d/0x40 kernel/fork.c:3200 |
| do_syscall_x64 arch/x86/entry/common.c:50 [inline] |
| do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 |
| entry_SYSCALL_64_after_hwframe+0x44/0xae |
| RIP: 0033:0x7f63da68e739 |
| RSP: 002b:00007f63d7c05188 EFLAGS: 00000246 ORIG_RAX: 0000000000000110 |
| RAX: ffffffffffffffda RBX: 00007f63da792f80 RCX: 00007f63da68e739 |
| RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000040000000 |
| RBP: 00007f63da6e8cc4 R08: 0000000000000000 R09: 0000000000000000 |
| R10: 0000000000000000 R11: 0000000000000246 R12: 00007f63da792f80 |
| R13: 00007fff50b75d3f R14: 00007f63d7c05300 R15: 0000000000022000 |
| |
| Showing all locks held in the system: |
| 1 lock held by khungtaskd/27: |
| #0: ffffffff8b980020 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x53/0x260 kernel/locking/lockdep.c:6446 |
| 2 locks held by kworker/u4:2/153: |
| #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: arch_atomic64_set arch/x86/include/asm/atomic64_64.h:34 [inline] |
| #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: arch_atomic_long_set include/linux/atomic/atomic-long.h:41 [inline] |
| #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: atomic_long_set include/linux/atomic/atomic-instrumented.h:1198 [inline] |
| #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: set_work_data kernel/workqueue.c:634 [inline] |
| #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: set_work_pool_and_clear_pending kernel/workqueue.c:661 [inline] |
| #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x896/0x1690 kernel/workqueue.c:2268 |
| #1: ffffc9000140fdb0 ((kfence_timer).work){+.+.}-{0:0}, at: process_one_work+0x8ca/0x1690 kernel/workqueue.c:2272 |
| 1 lock held by systemd-udevd/2970: |
| 1 lock held by in:imklog/6258: |
| #0: ffff88807f970ff0 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0xe9/0x100 fs/file.c:990 |
| 3 locks held by kworker/1:6/8158: |
| 1 lock held by syz-executor.0/8312: |
| 2 locks held by kworker/u4:13/9320: |
| 1 lock held by syz-executor.5/10178: |
| 1 lock held by syz-executor.4/10217: |
| |
| Signed-off-by: Eric Dumazet <edumazet@google.com> |
| Reported-by: syzbot <syzkaller@googlegroups.com> |
| Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| net/netfilter/nf_conntrack_core.c | 70 ++++++++++++++++++++------------------ |
| 1 file changed, 37 insertions(+), 33 deletions(-) |
| |
| --- a/net/netfilter/nf_conntrack_core.c |
| +++ b/net/netfilter/nf_conntrack_core.c |
| @@ -75,6 +75,9 @@ static __read_mostly struct kmem_cache * |
| static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); |
| static __read_mostly bool nf_conntrack_locks_all; |
| |
| +/* serialize hash resizes and nf_ct_iterate_cleanup */ |
| +static DEFINE_MUTEX(nf_conntrack_mutex); |
| + |
| #define GC_SCAN_INTERVAL (120u * HZ) |
| #define GC_SCAN_MAX_DURATION msecs_to_jiffies(10) |
| |
| @@ -2173,28 +2176,31 @@ get_next_corpse(int (*iter)(struct nf_co |
| spinlock_t *lockp; |
| |
| for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { |
| + struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket]; |
| + |
| + if (hlist_nulls_empty(hslot)) |
| + continue; |
| + |
| lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; |
| local_bh_disable(); |
| nf_conntrack_lock(lockp); |
| - if (*bucket < nf_conntrack_htable_size) { |
| - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) { |
| - if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY) |
| - continue; |
| - /* All nf_conn objects are added to hash table twice, one |
| - * for original direction tuple, once for the reply tuple. |
| - * |
| - * Exception: In the IPS_NAT_CLASH case, only the reply |
| - * tuple is added (the original tuple already existed for |
| - * a different object). |
| - * |
| - * We only need to call the iterator once for each |
| - * conntrack, so we just use the 'reply' direction |
| - * tuple while iterating. |
| - */ |
| - ct = nf_ct_tuplehash_to_ctrack(h); |
| - if (iter(ct, data)) |
| - goto found; |
| - } |
| + hlist_nulls_for_each_entry(h, n, hslot, hnnode) { |
| + if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY) |
| + continue; |
| + /* All nf_conn objects are added to hash table twice, one |
| + * for original direction tuple, once for the reply tuple. |
| + * |
| + * Exception: In the IPS_NAT_CLASH case, only the reply |
| + * tuple is added (the original tuple already existed for |
| + * a different object). |
| + * |
| + * We only need to call the iterator once for each |
| + * conntrack, so we just use the 'reply' direction |
| + * tuple while iterating. |
| + */ |
| + ct = nf_ct_tuplehash_to_ctrack(h); |
| + if (iter(ct, data)) |
| + goto found; |
| } |
| spin_unlock(lockp); |
| local_bh_enable(); |
| @@ -2212,26 +2218,20 @@ found: |
| static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), |
| void *data, u32 portid, int report) |
| { |
| - unsigned int bucket = 0, sequence; |
| + unsigned int bucket = 0; |
| struct nf_conn *ct; |
| |
| might_sleep(); |
| |
| - for (;;) { |
| - sequence = read_seqcount_begin(&nf_conntrack_generation); |
| - |
| - while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { |
| - /* Time to push up daises... */ |
| + mutex_lock(&nf_conntrack_mutex); |
| + while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { |
| + /* Time to push up daises... */ |
| |
| - nf_ct_delete(ct, portid, report); |
| - nf_ct_put(ct); |
| - cond_resched(); |
| - } |
| - |
| - if (!read_seqcount_retry(&nf_conntrack_generation, sequence)) |
| - break; |
| - bucket = 0; |
| + nf_ct_delete(ct, portid, report); |
| + nf_ct_put(ct); |
| + cond_resched(); |
| } |
| + mutex_unlock(&nf_conntrack_mutex); |
| } |
| |
| struct iter_data { |
| @@ -2461,8 +2461,10 @@ int nf_conntrack_hash_resize(unsigned in |
| if (!hash) |
| return -ENOMEM; |
| |
| + mutex_lock(&nf_conntrack_mutex); |
| old_size = nf_conntrack_htable_size; |
| if (old_size == hashsize) { |
| + mutex_unlock(&nf_conntrack_mutex); |
| kvfree(hash); |
| return 0; |
| } |
| @@ -2498,6 +2500,8 @@ int nf_conntrack_hash_resize(unsigned in |
| nf_conntrack_all_unlock(); |
| local_bh_enable(); |
| |
| + mutex_unlock(&nf_conntrack_mutex); |
| + |
| synchronize_net(); |
| kvfree(old_hash); |
| return 0; |