releases/5.4.183/memfd-fix-f_seal_write-after-shmem-huge-page-allocated.patch - pub/scm/linux/kernel/git/stable/stable-queue - Git at Google

 From f2b277c4d1c63a85127e8aa2588e9cc3bd21cb99 Mon Sep 17 00:00:00 2001
 From: Hugh Dickins <hughd@google.com>
 Date: Fri, 4 Mar 2022 20:29:01 -0800
 Subject: memfd: fix F_SEAL_WRITE after shmem huge page allocated

 From: Hugh Dickins <hughd@google.com>

 commit f2b277c4d1c63a85127e8aa2588e9cc3bd21cb99 upstream.

 Wangyong reports: after enabling tmpfs filesystem to support transparent
 hugepage with the following command:

   echo always > /sys/kernel/mm/transparent_hugepage/shmem_enabled

 the docker program tries to add F_SEAL_WRITE through the following
 command, but it fails unexpectedly with errno EBUSY:

   fcntl(5, F_ADD_SEALS, F_SEAL_WRITE) = -1.

 That is because memfd_tag_pins() and memfd_wait_for_pins() were never
 updated for shmem huge pages: checking page_mapcount() against
 page_count() is hopeless on THP subpages - they need to check
 total_mapcount() against page_count() on THP heads only.

 Make memfd_tag_pins() (compared > 1) as strict as memfd_wait_for_pins()
 (compared != 1): either can be justified, but given the non-atomic
 total_mapcount() calculation, it is better now to be strict.  Bear in
 mind that total_mapcount() itself scans all of the THP subpages, when
 choosing to take an XA_CHECK_SCHED latency break.

 Also fix the unlikely xa_is_value() case in memfd_wait_for_pins(): if a
 page has been swapped out since memfd_tag_pins(), then its refcount must
 have fallen, and so it can safely be untagged.

 Link: https://lkml.kernel.org/r/a4f79248-df75-2c8c-3df-ba3317ccb5da@google.com
 Signed-off-by: Hugh Dickins <hughd@google.com>
 Reported-by: Zeal Robot <zealci@zte.com.cn>
 Reported-by: wangyong <wang.yong12@zte.com.cn>
 Cc: Mike Kravetz <mike.kravetz@oracle.com>
 Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
 Cc: CGEL ZTE <cgel.zte@gmail.com>
 Cc: Kirill A. Shutemov <kirill@shutemov.name>
 Cc: Song Liu <songliubraving@fb.com>
 Cc: Yang Yang <yang.yang29@zte.com.cn>
 Cc: <stable@vger.kernel.org>
 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
 ---
  mm/memfd.c |   40 ++++++++++++++++++++++++++++------------
  1 file changed, 28 insertions(+), 12 deletions(-)

 --- a/mm/memfd.c
 +++ b/mm/memfd.c
 @@ -31,20 +31,28 @@
  static void memfd_tag_pins(struct xa_state *xas)
  {
  	struct page *page;
 -	unsigned int tagged = 0;
 +	int latency = 0;
 +	int cache_count;

  	lru_add_drain();

  	xas_lock_irq(xas);
  	xas_for_each(xas, page, ULONG_MAX) {
 -		if (xa_is_value(page))
 -			continue;
 -		page = find_subpage(page, xas->xa_index);
 -		if (page_count(page) - page_mapcount(page) > 1)
 +		cache_count = 1;
 +		if (!xa_is_value(page) &&
 +		    PageTransHuge(page) && !PageHuge(page))
 +			cache_count = HPAGE_PMD_NR;
 +
 +		if (!xa_is_value(page) &&
 +		    page_count(page) - total_mapcount(page) != cache_count)
  			xas_set_mark(xas, MEMFD_TAG_PINNED);
 +		if (cache_count != 1)
 +			xas_set(xas, page->index + cache_count);

 -		if (++tagged % XA_CHECK_SCHED)
 +		latency += cache_count;
 +		if (latency < XA_CHECK_SCHED)
  			continue;
 +		latency = 0;

  		xas_pause(xas);
  		xas_unlock_irq(xas);
 @@ -73,7 +81,8 @@ static int memfd_wait_for_pins(struct ad

  	error = 0;
  	for (scan = 0; scan <= LAST_SCAN; scan++) {
 -		unsigned int tagged = 0;
 +		int latency = 0;
 +		int cache_count;

  		if (!xas_marked(&xas, MEMFD_TAG_PINNED))
  			break;
 @@ -87,10 +96,14 @@ static int memfd_wait_for_pins(struct ad
  		xas_lock_irq(&xas);
  		xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
  			bool clear = true;
 -			if (xa_is_value(page))
 -				continue;
 -			page = find_subpage(page, xas.xa_index);
 -			if (page_count(page) - page_mapcount(page) != 1) {
 +
 +			cache_count = 1;
 +			if (!xa_is_value(page) &&
 +			    PageTransHuge(page) && !PageHuge(page))
 +				cache_count = HPAGE_PMD_NR;
 +
 +			if (!xa_is_value(page) && cache_count !=
 +			    page_count(page) - total_mapcount(page)) {
  				/*
  				 * On the last scan, we clean up all those tags
  				 * we inserted; but make a note that we still
 @@ -103,8 +116,11 @@ static int memfd_wait_for_pins(struct ad
  			}
  			if (clear)
  				xas_clear_mark(&xas, MEMFD_TAG_PINNED);
 -			if (++tagged % XA_CHECK_SCHED)
 +
 +			latency += cache_count;
 +			if (latency < XA_CHECK_SCHED)
  				continue;
 +			latency = 0;

  			xas_pause(&xas);
  			xas_unlock_irq(&xas);
	From f2b277c4d1c63a85127e8aa2588e9cc3bd21cb99 Mon Sep 17 00:00:00 2001
	From: Hugh Dickins <hughd@google.com>
	Date: Fri, 4 Mar 2022 20:29:01 -0800
	Subject: memfd: fix F_SEAL_WRITE after shmem huge page allocated

	From: Hugh Dickins <hughd@google.com>

	commit f2b277c4d1c63a85127e8aa2588e9cc3bd21cb99 upstream.

	Wangyong reports: after enabling tmpfs filesystem to support transparent
	hugepage with the following command:

	echo always > /sys/kernel/mm/transparent_hugepage/shmem_enabled

	the docker program tries to add F_SEAL_WRITE through the following
	command, but it fails unexpectedly with errno EBUSY:

	fcntl(5, F_ADD_SEALS, F_SEAL_WRITE) = -1.

	That is because memfd_tag_pins() and memfd_wait_for_pins() were never
	updated for shmem huge pages: checking page_mapcount() against
	page_count() is hopeless on THP subpages - they need to check
	total_mapcount() against page_count() on THP heads only.

	Make memfd_tag_pins() (compared > 1) as strict as memfd_wait_for_pins()
	(compared != 1): either can be justified, but given the non-atomic
	total_mapcount() calculation, it is better now to be strict. Bear in
	mind that total_mapcount() itself scans all of the THP subpages, when
	choosing to take an XA_CHECK_SCHED latency break.

	Also fix the unlikely xa_is_value() case in memfd_wait_for_pins(): if a
	page has been swapped out since memfd_tag_pins(), then its refcount must
	have fallen, and so it can safely be untagged.

	Link: https://lkml.kernel.org/r/a4f79248-df75-2c8c-3df-ba3317ccb5da@google.com
	Signed-off-by: Hugh Dickins <hughd@google.com>
	Reported-by: Zeal Robot <zealci@zte.com.cn>
	Reported-by: wangyong <wang.yong12@zte.com.cn>
	Cc: Mike Kravetz <mike.kravetz@oracle.com>
	Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
	Cc: CGEL ZTE <cgel.zte@gmail.com>
	Cc: Kirill A. Shutemov <kirill@shutemov.name>
	Cc: Song Liu <songliubraving@fb.com>
	Cc: Yang Yang <yang.yang29@zte.com.cn>
	Cc: <stable@vger.kernel.org>
	Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
	Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
	Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	---
	mm/memfd.c | 40 ++++++++++++++++++++++++++++------------
	1 file changed, 28 insertions(+), 12 deletions(-)

	--- a/mm/memfd.c
	+++ b/mm/memfd.c
	@@ -31,20 +31,28 @@
	static void memfd_tag_pins(struct xa_state *xas)
	{
	struct page *page;
	- unsigned int tagged = 0;
	+ int latency = 0;
	+ int cache_count;

	lru_add_drain();

	xas_lock_irq(xas);
	xas_for_each(xas, page, ULONG_MAX) {
	- if (xa_is_value(page))
	- continue;
	- page = find_subpage(page, xas->xa_index);
	- if (page_count(page) - page_mapcount(page) > 1)
	+ cache_count = 1;
	+ if (!xa_is_value(page) &&
	+ PageTransHuge(page) && !PageHuge(page))
	+ cache_count = HPAGE_PMD_NR;
	+
	+ if (!xa_is_value(page) &&
	+ page_count(page) - total_mapcount(page) != cache_count)
	xas_set_mark(xas, MEMFD_TAG_PINNED);
	+ if (cache_count != 1)
	+ xas_set(xas, page->index + cache_count);

	- if (++tagged % XA_CHECK_SCHED)
	+ latency += cache_count;
	+ if (latency < XA_CHECK_SCHED)
	continue;
	+ latency = 0;

	xas_pause(xas);
	xas_unlock_irq(xas);
	@@ -73,7 +81,8 @@ static int memfd_wait_for_pins(struct ad

	error = 0;
	for (scan = 0; scan <= LAST_SCAN; scan++) {
	- unsigned int tagged = 0;
	+ int latency = 0;
	+ int cache_count;

	if (!xas_marked(&xas, MEMFD_TAG_PINNED))
	break;
	@@ -87,10 +96,14 @@ static int memfd_wait_for_pins(struct ad
	xas_lock_irq(&xas);
	xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
	bool clear = true;
	- if (xa_is_value(page))
	- continue;
	- page = find_subpage(page, xas.xa_index);
	- if (page_count(page) - page_mapcount(page) != 1) {
	+
	+ cache_count = 1;
	+ if (!xa_is_value(page) &&
	+ PageTransHuge(page) && !PageHuge(page))
	+ cache_count = HPAGE_PMD_NR;
	+
	+ if (!xa_is_value(page) && cache_count !=
	+ page_count(page) - total_mapcount(page)) {
	/*
	* On the last scan, we clean up all those tags
	* we inserted; but make a note that we still
	@@ -103,8 +116,11 @@ static int memfd_wait_for_pins(struct ad
	}
	if (clear)
	xas_clear_mark(&xas, MEMFD_TAG_PINNED);
	- if (++tagged % XA_CHECK_SCHED)
	+
	+ latency += cache_count;
	+ if (latency < XA_CHECK_SCHED)
	continue;
	+ latency = 0;

	xas_pause(&xas);
	xas_unlock_irq(&xas);