mm/memfd_luo.c - pub/scm/linux/kernel/git/stable/linux-stable.git - Git at Google

 // SPDX-License-Identifier: GPL-2.0

 /*
  * Copyright (c) 2025, Google LLC.
  * Pasha Tatashin <pasha.tatashin@soleen.com>
  *
  * Copyright (C) 2025 Amazon.com Inc. or its affiliates.
  * Pratyush Yadav <ptyadav@amazon.de>
  */

 /**
  * DOC: Memfd Preservation via LUO
  *
  * Overview
  * ========
  *
  * Memory file descriptors (memfd) can be preserved over a kexec using the Live
  * Update Orchestrator (LUO) file preservation. This allows userspace to
  * transfer its memory contents to the next kernel after a kexec.
  *
  * The preservation is not intended to be transparent. Only select properties of
  * the file are preserved. All others are reset to default. The preserved
  * properties are described below.
  *
  * .. note::
  *    The LUO API is not stabilized yet, so the preserved properties of a memfd
  *    are also not stable and are subject to backwards incompatible changes.
  *
  * .. note::
  *    Currently a memfd backed by Hugetlb is not supported. Memfds created
  *    with ``MFD_HUGETLB`` will be rejected.
  *
  * Preserved Properties
  * ====================
  *
  * The following properties of the memfd are preserved across kexec:
  *
  * File Contents
  *   All data stored in the file is preserved.
  *
  * File Size
  *   The size of the file is preserved. Holes in the file are filled by
  *   allocating pages for them during preservation.
  *
  * File Position
  *   The current file position is preserved, allowing applications to continue
  *   reading/writing from their last position.
  *
  * File Status Flags
  *   memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property
  *   is maintained.
  *
  * Non-Preserved Properties
  * ========================
  *
  * All properties which are not preserved must be assumed to be reset to
  * default. This section describes some of those properties which may be more of
  * note.
  *
  * ``FD_CLOEXEC`` flag
  *   A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the
  *   ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set
  *   again after restore via ``fcntl()``.
  *
  * Seals
  *   File seals are not preserved. The file is unsealed on restore and if
  *   needed, must be sealed again via ``fcntl()``.
  */

 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

 #include <linux/bits.h>
 #include <linux/err.h>
 #include <linux/file.h>
 #include <linux/io.h>
 #include <linux/kexec_handover.h>
 #include <linux/kho/abi/memfd.h>
 #include <linux/liveupdate.h>
 #include <linux/shmem_fs.h>
 #include <linux/vmalloc.h>
 #include "internal.h"

 /*
  * memfd_luo_preserve_folios() - pin and preserve all folios backing a memfd.
  * @file:           the memfd whose contents are being preserved.
  * @kho_vmalloc:    filled in by kho_preserve_vmalloc() to describe the
  *                  serialized folio array; zeroed when the file is empty.
  * @out_folios_ser: on success, the vcalloc()'ed array holding one record per
  *                  pinned folio, or NULL when the file is empty.
  * @nr_foliosp:     on success, the number of records in @out_folios_ser.
  *
  * On success the folios remain pinned and preserved and the record array is
  * handed to the caller. On failure every step taken here is undone.
  *
  * Return: 0 on success, a negative errno on failure. -EFBIG is returned when
  * the file spans more pages than can be pinned in a single call.
  */
 static int memfd_luo_preserve_folios(struct file *file,
 				     struct kho_vmalloc *kho_vmalloc,
 				     struct memfd_luo_folio_ser **out_folios_ser,
 				     u64 *nr_foliosp)
 {
 	struct inode *inode = file_inode(file);
 	struct memfd_luo_folio_ser *folios_ser;
 	unsigned int max_folios;
 	struct folio **folios;
 	long i, nr_pinned;
 	int err = -EINVAL;
 	pgoff_t offset;
 	u64 nr_folios;
 	u64 nr_pages;
 	loff_t size;

 	/* Keep the full 64-bit size; a 'long' would truncate it on 32-bit. */
 	size = i_size_read(inode);
 	/*
 	 * If the file has zero size, then the folios and nr_folios properties
 	 * are not set.
 	 */
 	if (!size) {
 		*nr_foliosp = 0;
 		*out_folios_ser = NULL;
 		memset(kho_vmalloc, 0, sizeof(*kho_vmalloc));
 		return 0;
 	}

 	/*
 	 * Guess the number of folios based on inode size. Real number might end
 	 * up being smaller if there are higher order folios.
 	 *
 	 * The count is handed to memfd_pin_folios() as an unsigned int, so do
 	 * the math in 64 bits and refuse files whose page count does not fit.
 	 * Silently truncating it would pin, and hence preserve, only a part of
 	 * the file.
 	 */
 	nr_pages = ((u64)size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	if (nr_pages > UINT_MAX)
 		return -EFBIG;
 	max_folios = nr_pages;

 	folios = kvmalloc_array(max_folios, sizeof(*folios), GFP_KERNEL);
 	if (!folios)
 		return -ENOMEM;

 	/*
 	 * Pin the folios so they don't move around behind our back. This also
 	 * ensures none of the folios are in CMA -- which ensures they don't
 	 * fall in KHO scratch memory. It also moves swapped out folios back to
 	 * memory.
 	 *
 	 * A side effect of doing this is that it allocates a folio for all
 	 * indices in the file. This might waste memory on sparse memfds. If
 	 * that is really a problem in the future, we can have a
 	 * memfd_pin_folios() variant that does not allocate a page on empty
 	 * slots.
 	 */
 	nr_pinned = memfd_pin_folios(file, 0, size - 1, folios, max_folios,
 				     &offset);
 	if (nr_pinned < 0) {
 		err = nr_pinned;
 		pr_err("failed to pin folios: %d\n", err);
 		goto err_free_folios;
 	}
 	nr_folios = nr_pinned;

 	folios_ser = vcalloc(nr_folios, sizeof(*folios_ser));
 	if (!folios_ser) {
 		err = -ENOMEM;
 		goto err_unpin;
 	}

 	for (i = 0; i < nr_folios; i++) {
 		struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
 		struct folio *folio = folios[i];
 		unsigned int flags = 0;

 		err = kho_preserve_folio(folio);
 		if (err)
 			goto err_unpreserve;

 		if (folio_test_dirty(folio))
 			flags |= MEMFD_LUO_FOLIO_DIRTY;
 		if (folio_test_uptodate(folio))
 			flags |= MEMFD_LUO_FOLIO_UPTODATE;

 		pfolio->pfn = folio_pfn(folio);
 		pfolio->flags = flags;
 		pfolio->index = folio->index;
 	}

 	err = kho_preserve_vmalloc(folios_ser, kho_vmalloc);
 	if (err)
 		goto err_unpreserve;

 	kvfree(folios);
 	*nr_foliosp = nr_folios;
 	*out_folios_ser = folios_ser;

 	/*
 	 * Note: folios_ser is purposely not freed here. It is preserved
 	 * memory (via KHO). In the 'unpreserve' path, we use the vmap pointer
 	 * that is passed via private_data.
 	 */
 	return 0;

 err_unpreserve:
 	/*
 	 * 'i' is the index of the folio that failed (or nr_folios when the
 	 * array preservation failed), so only folios [0, i) were preserved.
 	 */
 	for (i = i - 1; i >= 0; i--)
 		kho_unpreserve_folio(folios[i]);
 	vfree(folios_ser);
 err_unpin:
 	unpin_folios(folios, nr_folios);
 err_free_folios:
 	kvfree(folios);

 	return err;
 }

 /*
  * Undo memfd_luo_preserve_folios(): drop the KHO preservation of the record
  * array, then unpreserve and unpin every folio it describes, and finally free
  * the array. Does nothing when @nr_folios is zero.
  */
 static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc,
 					struct memfd_luo_folio_ser *folios_ser,
 					u64 nr_folios)
 {
 	const struct memfd_luo_folio_ser *entry;
 	long idx;

 	if (nr_folios == 0)
 		return;

 	kho_unpreserve_vmalloc(kho_vmalloc);

 	for (idx = 0; idx < nr_folios; idx++) {
 		entry = &folios_ser[idx];

 		/* Records without a pfn describe no folio. */
 		if (entry->pfn) {
 			struct folio *folio = pfn_folio(entry->pfn);

 			kho_unpreserve_folio(folio);
 			unpin_folio(folio);
 		}
 	}

 	vfree(folios_ser);
 }

 static int memfd_luo_preserve(struct liveupdate_file_op_args *args)
 {
 	struct inode *inode = file_inode(args->file);
 	struct memfd_luo_folio_ser *folios_ser;
 	struct memfd_luo_ser *ser;
 	u64 nr_folios;
 	int err = 0;

 	inode_lock(inode);
 	shmem_freeze(inode, true);

 	/* Allocate the main serialization structure in preserved memory */
 	ser = kho_alloc_preserve(sizeof(*ser));
 	if (IS_ERR(ser)) {
 		err = PTR_ERR(ser);
 		goto err_unlock;
 	}

 	ser->pos = args->file->f_pos;
 	ser->size = i_size_read(inode);

 	err = memfd_luo_preserve_folios(args->file, &ser->folios,
 					&folios_ser, &nr_folios);
 	if (err)
 		goto err_free_ser;

 	ser->nr_folios = nr_folios;
 	inode_unlock(inode);

 	args->private_data = folios_ser;
 	args->serialized_data = virt_to_phys(ser);

 	return 0;

 err_free_ser:
 	kho_unpreserve_free(ser);
 err_unlock:
 	shmem_freeze(inode, false);
 	inode_unlock(inode);
 	return err;
 }

 static int memfd_luo_freeze(struct liveupdate_file_op_args *args)
 {
 	struct memfd_luo_ser *ser;

 	if (WARN_ON_ONCE(!args->serialized_data))
 		return -EINVAL;

 	ser = phys_to_virt(args->serialized_data);

 	/*
 	 * The pos might have changed since prepare. Everything else stays the
 	 * same.
 	 */
 	ser->pos = args->file->f_pos;

 	return 0;
 }

 /*
  * LUO 'unpreserve' callback: unfreeze the memfd and release everything that
  * memfd_luo_preserve() set up. Warns once and bails out when there is no
  * serialized data to undo.
  */
 static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args)
 {
 	struct inode *inode;
 	struct memfd_luo_ser *ser;

 	inode = file_inode(args->file);

 	if (WARN_ON_ONCE(!args->serialized_data))
 		return;

 	inode_lock(inode);
 	shmem_freeze(inode, false);

 	ser = phys_to_virt(args->serialized_data);

 	/* private_data carries the folio record array set at preserve time. */
 	memfd_luo_unpreserve_folios(&ser->folios, args->private_data,
 				    ser->nr_folios);
 	kho_unpreserve_free(ser);

 	inode_unlock(inode);
 }

 /*
  * Restore every folio described by @folios_ser and immediately drop the
  * reference, so that memory of a memfd nobody retrieved is released. Folios
  * that cannot be restored are reported (rate limited) and skipped.
  */
 static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser,
 				     u64 nr_folios)
 {
 	u64 idx;

 	for (idx = 0; idx < nr_folios; idx++) {
 		const struct memfd_luo_folio_ser *entry = &folios_ser[idx];
 		struct folio *folio;
 		phys_addr_t phys;

 		/* Records without a pfn describe no folio. */
 		if (!entry->pfn)
 			continue;

 		phys = PFN_PHYS(entry->pfn);
 		folio = kho_restore_folio(phys);
 		if (folio)
 			folio_put(folio);
 		else
 			pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n",
 					    phys);
 	}
 }

 /*
  * LUO 'finish' callback: release the preserved state of a memfd that was
  * never retrieved.
  *
  * Nothing is done when the file was retrieved or when no serialized data
  * exists. Otherwise the folio record array is restored, every folio it
  * describes is restored and released, and the serialized structure is freed.
  */
 static void memfd_luo_finish(struct liveupdate_file_op_args *args)
 {
 	struct memfd_luo_folio_ser *folios_ser;
 	struct memfd_luo_ser *ser;

 	if (args->retrieved)
 		return;

 	/*
 	 * Check the physical address before converting it. The pointer
 	 * returned by phys_to_virt() for address 0 is generally not NULL, so
 	 * testing the converted pointer cannot detect missing data.
 	 */
 	if (!args->serialized_data)
 		return;

 	ser = phys_to_virt(args->serialized_data);

 	if (ser->nr_folios) {
 		folios_ser = kho_restore_vmalloc(&ser->folios);
 		if (!folios_ser)
 			goto out;

 		memfd_luo_discard_folios(folios_ser, ser->nr_folios);
 		vfree(folios_ser);
 	}

 out:
 	kho_restore_free(ser);
 }

 /*
  * memfd_luo_retrieve_folios() - re-insert preserved folios into a new memfd.
  * @file:       the freshly created shmem file that receives the folios.
  * @folios_ser: array of serialized folio records.
  * @nr_folios:  number of records in @folios_ser.
  *
  * Each record with a non-zero pfn is restored through KHO, charged to the
  * memory cgroup, added to the page cache of @file at its recorded index and
  * accounted to the inode. The uptodate and dirty flags are re-applied from
  * the record.
  *
  * On failure, folios already inserted stay with the file (they are released
  * together with it), while all folios not yet processed are restored and
  * released here.
  *
  * Return: 0 on success, a negative errno on failure.
  */
 static int memfd_luo_retrieve_folios(struct file *file,
 				     struct memfd_luo_folio_ser *folios_ser,
 				     u64 nr_folios)
 {
 	struct inode *inode = file_inode(file);
 	struct address_space *mapping = inode->i_mapping;
 	struct folio *folio;
 	int err = -EIO;
 	long i;

 	for (i = 0; i < nr_folios; i++) {
 		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
 		phys_addr_t phys;
 		long nr_pages;
 		u64 index;
 		int flags;

 		if (!pfolio->pfn)
 			continue;

 		phys = PFN_PHYS(pfolio->pfn);
 		folio = kho_restore_folio(phys);
 		if (!folio) {
 			pr_err("Unable to restore folio at physical address: %llx\n",
 			       phys);
 			/*
 			 * A previous iteration left err at 0; set it again so
 			 * this failure is not reported as success.
 			 */
 			err = -EIO;
 			goto put_folios;
 		}
 		index = pfolio->index;
 		flags = pfolio->flags;

 		/* Set up the folio for insertion. */
 		__folio_set_locked(folio);
 		__folio_set_swapbacked(folio);

 		err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
 		if (err) {
 			pr_err("shmem: failed to charge folio index %ld: %d\n",
 			       i, err);
 			goto unlock_folio;
 		}

 		err = shmem_add_to_page_cache(folio, mapping, index, NULL,
 					      mapping_gfp_mask(mapping));
 		if (err) {
 			pr_err("shmem: failed to add to page cache folio index %ld: %d\n",
 			       i, err);
 			goto unlock_folio;
 		}

 		if (flags & MEMFD_LUO_FOLIO_UPTODATE)
 			folio_mark_uptodate(folio);
 		if (flags & MEMFD_LUO_FOLIO_DIRTY)
 			folio_mark_dirty(folio);

 		/*
 		 * A restored folio may be of higher order, so account all of
 		 * its pages rather than a single one.
 		 */
 		nr_pages = folio_nr_pages(folio);
 		err = shmem_inode_acct_blocks(inode, nr_pages);
 		if (err) {
 			pr_err("shmem: failed to account folio index %ld: %d\n",
 			       i, err);
 			/*
 			 * Do not leave an unaccounted folio in the page cache:
 			 * take it out again (it is still locked) so that the
 			 * final folio_put() below releases it.
 			 */
 			filemap_remove_folio(folio);
 			goto unlock_folio;
 		}

 		shmem_recalc_inode(inode, nr_pages, 0);
 		folio_add_lru(folio);
 		folio_unlock(folio);
 		folio_put(folio);
 	}

 	return 0;

 unlock_folio:
 	folio_unlock(folio);
 	folio_put(folio);
 put_folios:
 	/*
 	 * Note: don't free the folios already added to the file. They will be
 	 * freed when the file is freed. Free the ones not added yet here.
 	 */
 	for (long j = i + 1; j < nr_folios; j++) {
 		const struct memfd_luo_folio_ser *pfolio = &folios_ser[j];

 		/* Same rule as the main loop: no pfn means no folio. */
 		if (!pfolio->pfn)
 			continue;

 		/* kho_restore_folio() takes a physical address, not a pfn. */
 		folio = kho_restore_folio(PFN_PHYS(pfolio->pfn));
 		if (folio)
 			folio_put(folio);
 	}

 	return err;
 }

 /*
  * LUO 'retrieve' callback: rebuild a memfd from its serialized state.
  *
  * A new shmem file is created, its position and size are set from the
  * serialized data and all preserved folios are inserted into it. On success
  * the file is returned through args->file and the serialized structure is
  * freed. On failure the new file is dropped; the serialized structure is not
  * freed on that path.
  *
  * Return: 0 on success, -EINVAL when there is no serialized data or the folio
  * record array cannot be restored, or another negative errno on failure.
  */
 static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
 {
 	struct memfd_luo_folio_ser *folios_ser;
 	struct memfd_luo_ser *ser;
 	struct file *file;
 	int err;

 	/*
 	 * Check the physical address before converting it. The pointer
 	 * returned by phys_to_virt() for address 0 is generally not NULL, so
 	 * testing the converted pointer cannot detect missing data.
 	 */
 	if (!args->serialized_data)
 		return -EINVAL;

 	ser = phys_to_virt(args->serialized_data);

 	file = shmem_file_setup("", 0, VM_NORESERVE);

 	if (IS_ERR(file)) {
 		pr_err("failed to setup file: %pe\n", file);
 		return PTR_ERR(file);
 	}

 	vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE);
 	/* Update i_size through the accessor instead of a raw store. */
 	i_size_write(file_inode(file), ser->size);

 	if (ser->nr_folios) {
 		folios_ser = kho_restore_vmalloc(&ser->folios);
 		if (!folios_ser) {
 			err = -EINVAL;
 			goto put_file;
 		}

 		err = memfd_luo_retrieve_folios(file, folios_ser, ser->nr_folios);
 		/* The record array is only needed while inserting folios. */
 		vfree(folios_ser);
 		if (err)
 			goto put_file;
 	}

 	args->file = file;
 	kho_restore_free(ser);

 	return 0;

 put_file:
 	fput(file);

 	return err;
 }

 static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
 				   struct file *file)
 {
 	struct inode *inode = file_inode(file);

 	return shmem_file(file) && !inode->i_nlink;
 }

 /* Callbacks through which LUO drives preservation and restore of a memfd. */
 static const struct liveupdate_file_ops memfd_luo_file_ops = {
 	.owner = THIS_MODULE,
 	/* Outgoing kernel: check, preserve, update before kexec, or undo. */
 	.can_preserve = memfd_luo_can_preserve,
 	.preserve = memfd_luo_preserve,
 	.freeze = memfd_luo_freeze,
 	.unpreserve = memfd_luo_unpreserve,
 	/* Incoming kernel: rebuild the file, then release leftovers. */
 	.retrieve = memfd_luo_retrieve,
 	.finish = memfd_luo_finish,
 };

 /* File handler registered with LUO; matched via its compatible string. */
 static struct liveupdate_file_handler memfd_luo_handler = {
 	.compatible = MEMFD_LUO_FH_COMPATIBLE,
 	.ops = &memfd_luo_file_ops,
 };

 /*
  * Register the memfd file handler with LUO at late init time. A result of
  * -EOPNOTSUPP from the registration is treated as success.
  *
  * Return: 0 on success, a negative errno on any other registration failure.
  */
 static int __init memfd_luo_init(void)
 {
 	int ret;

 	ret = liveupdate_register_file_handler(&memfd_luo_handler);
 	if (!ret || ret == -EOPNOTSUPP)
 		return 0;

 	pr_err("Could not register luo filesystem handler: %pe\n",
 	       ERR_PTR(ret));

 	return ret;
 }
 late_initcall(memfd_luo_init);
	// SPDX-License-Identifier: GPL-2.0

	/*
	* Copyright (c) 2025, Google LLC.
	* Pasha Tatashin <pasha.tatashin@soleen.com>
	*
	* Copyright (C) 2025 Amazon.com Inc. or its affiliates.
	* Pratyush Yadav <ptyadav@amazon.de>
	*/

	/**
	* DOC: Memfd Preservation via LUO
	*
	* Overview
	* ========
	*
	* Memory file descriptors (memfd) can be preserved over a kexec using the Live
	* Update Orchestrator (LUO) file preservation. This allows userspace to
	* transfer its memory contents to the next kernel after a kexec.
	*
	* The preservation is not intended to be transparent. Only select properties of
	* the file are preserved. All others are reset to default. The preserved
	* properties are described below.
	*
	* .. note::
	*    The LUO API is not stabilized yet, so the preserved properties of a memfd
	*    are also not stable and are subject to backwards incompatible changes.
	*
	* .. note::
	*    Currently a memfd backed by Hugetlb is not supported. Memfds created
	*    with ``MFD_HUGETLB`` will be rejected.
	*
	* Preserved Properties
	* ====================
	*
	* The following properties of the memfd are preserved across kexec:
	*
	* File Contents
	* All data stored in the file is preserved.
	*
	* File Size
	* The size of the file is preserved. Holes in the file are filled by
	* allocating pages for them during preservation.
	*
	* File Position
	* The current file position is preserved, allowing applications to continue
	* reading/writing from their last position.
	*
	* File Status Flags
	* memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property
	* is maintained.
	*
	* Non-Preserved Properties
	* ========================
	*
	* All properties which are not preserved must be assumed to be reset to
	* default. This section describes some of those properties which may be more of
	* note.
	*
	* ``FD_CLOEXEC`` flag
	* A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the
	* ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set
	* again after restore via ``fcntl()``.
	*
	* Seals
	* File seals are not preserved. The file is unsealed on restore and if
	* needed, must be sealed again via ``fcntl()``.
	*/

	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

	#include <linux/bits.h>
	#include <linux/err.h>
	#include <linux/file.h>
	#include <linux/io.h>
	#include <linux/kexec_handover.h>
	#include <linux/kho/abi/memfd.h>
	#include <linux/liveupdate.h>
	#include <linux/shmem_fs.h>
	#include <linux/vmalloc.h>
	#include "internal.h"

	/*
	 * memfd_luo_preserve_folios() - pin and preserve all folios backing a memfd.
	 * @file:           the memfd whose contents are being preserved.
	 * @kho_vmalloc:    filled in by kho_preserve_vmalloc() to describe the
	 *                  serialized folio array; zeroed when the file is empty.
	 * @out_folios_ser: on success, the vcalloc()'ed array holding one record per
	 *                  pinned folio, or NULL when the file is empty.
	 * @nr_foliosp:     on success, the number of records in @out_folios_ser.
	 *
	 * On success the folios remain pinned and preserved and the record array is
	 * handed to the caller. On failure every step taken here is undone.
	 *
	 * Return: 0 on success, a negative errno on failure. -EFBIG is returned when
	 * the file spans more pages than can be pinned in a single call.
	 */
	static int memfd_luo_preserve_folios(struct file *file,
					     struct kho_vmalloc *kho_vmalloc,
					     struct memfd_luo_folio_ser **out_folios_ser,
					     u64 *nr_foliosp)
	{
		struct inode *inode = file_inode(file);
		struct memfd_luo_folio_ser *folios_ser;
		unsigned int max_folios;
		struct folio **folios;
		long i, nr_pinned;
		int err = -EINVAL;
		pgoff_t offset;
		u64 nr_folios;
		u64 nr_pages;
		loff_t size;

		/* Keep the full 64-bit size; a 'long' would truncate it on 32-bit. */
		size = i_size_read(inode);
		/*
		 * If the file has zero size, then the folios and nr_folios properties
		 * are not set.
		 */
		if (!size) {
			*nr_foliosp = 0;
			*out_folios_ser = NULL;
			memset(kho_vmalloc, 0, sizeof(*kho_vmalloc));
			return 0;
		}

		/*
		 * Guess the number of folios based on inode size. Real number might end
		 * up being smaller if there are higher order folios.
		 *
		 * The count is handed to memfd_pin_folios() as an unsigned int, so do
		 * the math in 64 bits and refuse files whose page count does not fit.
		 * Silently truncating it would pin, and hence preserve, only a part of
		 * the file.
		 */
		nr_pages = ((u64)size + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (nr_pages > UINT_MAX)
			return -EFBIG;
		max_folios = nr_pages;

		folios = kvmalloc_array(max_folios, sizeof(*folios), GFP_KERNEL);
		if (!folios)
			return -ENOMEM;

		/*
		 * Pin the folios so they don't move around behind our back. This also
		 * ensures none of the folios are in CMA -- which ensures they don't
		 * fall in KHO scratch memory. It also moves swapped out folios back to
		 * memory.
		 *
		 * A side effect of doing this is that it allocates a folio for all
		 * indices in the file. This might waste memory on sparse memfds. If
		 * that is really a problem in the future, we can have a
		 * memfd_pin_folios() variant that does not allocate a page on empty
		 * slots.
		 */
		nr_pinned = memfd_pin_folios(file, 0, size - 1, folios, max_folios,
					     &offset);
		if (nr_pinned < 0) {
			err = nr_pinned;
			pr_err("failed to pin folios: %d\n", err);
			goto err_free_folios;
		}
		nr_folios = nr_pinned;

		folios_ser = vcalloc(nr_folios, sizeof(*folios_ser));
		if (!folios_ser) {
			err = -ENOMEM;
			goto err_unpin;
		}

		for (i = 0; i < nr_folios; i++) {
			struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
			struct folio *folio = folios[i];
			unsigned int flags = 0;

			err = kho_preserve_folio(folio);
			if (err)
				goto err_unpreserve;

			if (folio_test_dirty(folio))
				flags |= MEMFD_LUO_FOLIO_DIRTY;
			if (folio_test_uptodate(folio))
				flags |= MEMFD_LUO_FOLIO_UPTODATE;

			pfolio->pfn = folio_pfn(folio);
			pfolio->flags = flags;
			pfolio->index = folio->index;
		}

		err = kho_preserve_vmalloc(folios_ser, kho_vmalloc);
		if (err)
			goto err_unpreserve;

		kvfree(folios);
		*nr_foliosp = nr_folios;
		*out_folios_ser = folios_ser;

		/*
		 * Note: folios_ser is purposely not freed here. It is preserved
		 * memory (via KHO). In the 'unpreserve' path, we use the vmap pointer
		 * that is passed via private_data.
		 */
		return 0;

	err_unpreserve:
		/*
		 * 'i' is the index of the folio that failed (or nr_folios when the
		 * array preservation failed), so only folios [0, i) were preserved.
		 */
		for (i = i - 1; i >= 0; i--)
			kho_unpreserve_folio(folios[i]);
		vfree(folios_ser);
	err_unpin:
		unpin_folios(folios, nr_folios);
	err_free_folios:
		kvfree(folios);

		return err;
	}

	/*
	 * Undo memfd_luo_preserve_folios(): drop the KHO preservation of the record
	 * array, then unpreserve and unpin every folio it describes, and finally free
	 * the array. Does nothing when @nr_folios is zero.
	 */
	static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc,
						struct memfd_luo_folio_ser *folios_ser,
						u64 nr_folios)
	{
		const struct memfd_luo_folio_ser *entry;
		long idx;

		if (nr_folios == 0)
			return;

		kho_unpreserve_vmalloc(kho_vmalloc);

		for (idx = 0; idx < nr_folios; idx++) {
			entry = &folios_ser[idx];

			/* Records without a pfn describe no folio. */
			if (entry->pfn) {
				struct folio *folio = pfn_folio(entry->pfn);

				kho_unpreserve_folio(folio);
				unpin_folio(folio);
			}
		}

		vfree(folios_ser);
	}

	static int memfd_luo_preserve(struct liveupdate_file_op_args *args)
	{
	struct inode *inode = file_inode(args->file);
	struct memfd_luo_folio_ser *folios_ser;
	struct memfd_luo_ser *ser;
	u64 nr_folios;
	int err = 0;

	inode_lock(inode);
	shmem_freeze(inode, true);

	/* Allocate the main serialization structure in preserved memory */
	ser = kho_alloc_preserve(sizeof(*ser));
	if (IS_ERR(ser)) {
	err = PTR_ERR(ser);
	goto err_unlock;
	}

	ser->pos = args->file->f_pos;
	ser->size = i_size_read(inode);

	err = memfd_luo_preserve_folios(args->file, &ser->folios,
	&folios_ser, &nr_folios);
	if (err)
	goto err_free_ser;

	ser->nr_folios = nr_folios;
	inode_unlock(inode);

	args->private_data = folios_ser;
	args->serialized_data = virt_to_phys(ser);

	return 0;

	err_free_ser:
	kho_unpreserve_free(ser);
	err_unlock:
	shmem_freeze(inode, false);
	inode_unlock(inode);
	return err;
	}

	static int memfd_luo_freeze(struct liveupdate_file_op_args *args)
	{
	struct memfd_luo_ser *ser;

	if (WARN_ON_ONCE(!args->serialized_data))
	return -EINVAL;

	ser = phys_to_virt(args->serialized_data);

	/*
	* The pos might have changed since prepare. Everything else stays the
	* same.
	*/
	ser->pos = args->file->f_pos;

	return 0;
	}

	/*
	 * LUO 'unpreserve' callback: unfreeze the memfd and release everything that
	 * memfd_luo_preserve() set up. Warns once and bails out when there is no
	 * serialized data to undo.
	 */
	static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args)
	{
		struct inode *inode;
		struct memfd_luo_ser *ser;

		inode = file_inode(args->file);

		if (WARN_ON_ONCE(!args->serialized_data))
			return;

		inode_lock(inode);
		shmem_freeze(inode, false);

		ser = phys_to_virt(args->serialized_data);

		/* private_data carries the folio record array set at preserve time. */
		memfd_luo_unpreserve_folios(&ser->folios, args->private_data,
					    ser->nr_folios);
		kho_unpreserve_free(ser);

		inode_unlock(inode);
	}

	/*
	 * Restore every folio described by @folios_ser and immediately drop the
	 * reference, so that memory of a memfd nobody retrieved is released. Folios
	 * that cannot be restored are reported (rate limited) and skipped.
	 */
	static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser,
					     u64 nr_folios)
	{
		u64 idx;

		for (idx = 0; idx < nr_folios; idx++) {
			const struct memfd_luo_folio_ser *entry = &folios_ser[idx];
			struct folio *folio;
			phys_addr_t phys;

			/* Records without a pfn describe no folio. */
			if (!entry->pfn)
				continue;

			phys = PFN_PHYS(entry->pfn);
			folio = kho_restore_folio(phys);
			if (folio)
				folio_put(folio);
			else
				pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n",
						    phys);
		}
	}

	/*
	 * LUO 'finish' callback: release the preserved state of a memfd that was
	 * never retrieved.
	 *
	 * Nothing is done when the file was retrieved or when no serialized data
	 * exists. Otherwise the folio record array is restored, every folio it
	 * describes is restored and released, and the serialized structure is freed.
	 */
	static void memfd_luo_finish(struct liveupdate_file_op_args *args)
	{
		struct memfd_luo_folio_ser *folios_ser;
		struct memfd_luo_ser *ser;

		if (args->retrieved)
			return;

		/*
		 * Check the physical address before converting it. The pointer
		 * returned by phys_to_virt() for address 0 is generally not NULL, so
		 * testing the converted pointer cannot detect missing data.
		 */
		if (!args->serialized_data)
			return;

		ser = phys_to_virt(args->serialized_data);

		if (ser->nr_folios) {
			folios_ser = kho_restore_vmalloc(&ser->folios);
			if (!folios_ser)
				goto out;

			memfd_luo_discard_folios(folios_ser, ser->nr_folios);
			vfree(folios_ser);
		}

	out:
		kho_restore_free(ser);
	}

	/*
	 * memfd_luo_retrieve_folios() - re-insert preserved folios into a new memfd.
	 * @file:       the freshly created shmem file that receives the folios.
	 * @folios_ser: array of serialized folio records.
	 * @nr_folios:  number of records in @folios_ser.
	 *
	 * Each record with a non-zero pfn is restored through KHO, charged to the
	 * memory cgroup, added to the page cache of @file at its recorded index and
	 * accounted to the inode. The uptodate and dirty flags are re-applied from
	 * the record.
	 *
	 * On failure, folios already inserted stay with the file (they are released
	 * together with it), while all folios not yet processed are restored and
	 * released here.
	 *
	 * Return: 0 on success, a negative errno on failure.
	 */
	static int memfd_luo_retrieve_folios(struct file *file,
					     struct memfd_luo_folio_ser *folios_ser,
					     u64 nr_folios)
	{
		struct inode *inode = file_inode(file);
		struct address_space *mapping = inode->i_mapping;
		struct folio *folio;
		int err = -EIO;
		long i;

		for (i = 0; i < nr_folios; i++) {
			const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
			phys_addr_t phys;
			long nr_pages;
			u64 index;
			int flags;

			if (!pfolio->pfn)
				continue;

			phys = PFN_PHYS(pfolio->pfn);
			folio = kho_restore_folio(phys);
			if (!folio) {
				pr_err("Unable to restore folio at physical address: %llx\n",
				       phys);
				/*
				 * A previous iteration left err at 0; set it again so
				 * this failure is not reported as success.
				 */
				err = -EIO;
				goto put_folios;
			}
			index = pfolio->index;
			flags = pfolio->flags;

			/* Set up the folio for insertion. */
			__folio_set_locked(folio);
			__folio_set_swapbacked(folio);

			err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
			if (err) {
				pr_err("shmem: failed to charge folio index %ld: %d\n",
				       i, err);
				goto unlock_folio;
			}

			err = shmem_add_to_page_cache(folio, mapping, index, NULL,
						      mapping_gfp_mask(mapping));
			if (err) {
				pr_err("shmem: failed to add to page cache folio index %ld: %d\n",
				       i, err);
				goto unlock_folio;
			}

			if (flags & MEMFD_LUO_FOLIO_UPTODATE)
				folio_mark_uptodate(folio);
			if (flags & MEMFD_LUO_FOLIO_DIRTY)
				folio_mark_dirty(folio);

			/*
			 * A restored folio may be of higher order, so account all of
			 * its pages rather than a single one.
			 */
			nr_pages = folio_nr_pages(folio);
			err = shmem_inode_acct_blocks(inode, nr_pages);
			if (err) {
				pr_err("shmem: failed to account folio index %ld: %d\n",
				       i, err);
				/*
				 * Do not leave an unaccounted folio in the page cache:
				 * take it out again (it is still locked) so that the
				 * final folio_put() below releases it.
				 */
				filemap_remove_folio(folio);
				goto unlock_folio;
			}

			shmem_recalc_inode(inode, nr_pages, 0);
			folio_add_lru(folio);
			folio_unlock(folio);
			folio_put(folio);
		}

		return 0;

	unlock_folio:
		folio_unlock(folio);
		folio_put(folio);
	put_folios:
		/*
		 * Note: don't free the folios already added to the file. They will be
		 * freed when the file is freed. Free the ones not added yet here.
		 */
		for (long j = i + 1; j < nr_folios; j++) {
			const struct memfd_luo_folio_ser *pfolio = &folios_ser[j];

			/* Same rule as the main loop: no pfn means no folio. */
			if (!pfolio->pfn)
				continue;

			/* kho_restore_folio() takes a physical address, not a pfn. */
			folio = kho_restore_folio(PFN_PHYS(pfolio->pfn));
			if (folio)
				folio_put(folio);
		}

		return err;
	}

	/*
	 * LUO 'retrieve' callback: rebuild a memfd from its serialized state.
	 *
	 * A new shmem file is created, its position and size are set from the
	 * serialized data and all preserved folios are inserted into it. On success
	 * the file is returned through args->file and the serialized structure is
	 * freed. On failure the new file is dropped; the serialized structure is not
	 * freed on that path.
	 *
	 * Return: 0 on success, -EINVAL when there is no serialized data or the folio
	 * record array cannot be restored, or another negative errno on failure.
	 */
	static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
	{
		struct memfd_luo_folio_ser *folios_ser;
		struct memfd_luo_ser *ser;
		struct file *file;
		int err;

		/*
		 * Check the physical address before converting it. The pointer
		 * returned by phys_to_virt() for address 0 is generally not NULL, so
		 * testing the converted pointer cannot detect missing data.
		 */
		if (!args->serialized_data)
			return -EINVAL;

		ser = phys_to_virt(args->serialized_data);

		file = shmem_file_setup("", 0, VM_NORESERVE);

		if (IS_ERR(file)) {
			pr_err("failed to setup file: %pe\n", file);
			return PTR_ERR(file);
		}

		vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE);
		/* Update i_size through the accessor instead of a raw store. */
		i_size_write(file_inode(file), ser->size);

		if (ser->nr_folios) {
			folios_ser = kho_restore_vmalloc(&ser->folios);
			if (!folios_ser) {
				err = -EINVAL;
				goto put_file;
			}

			err = memfd_luo_retrieve_folios(file, folios_ser, ser->nr_folios);
			/* The record array is only needed while inserting folios. */
			vfree(folios_ser);
			if (err)
				goto put_file;
		}

		args->file = file;
		kho_restore_free(ser);

		return 0;

	put_file:
		fput(file);

		return err;
	}

	static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
	struct file *file)
	{
	struct inode *inode = file_inode(file);

	return shmem_file(file) && !inode->i_nlink;
	}

	/* Callbacks through which LUO drives preservation and restore of a memfd. */
	static const struct liveupdate_file_ops memfd_luo_file_ops = {
		.owner = THIS_MODULE,
		/* Outgoing kernel: check, preserve, update before kexec, or undo. */
		.can_preserve = memfd_luo_can_preserve,
		.preserve = memfd_luo_preserve,
		.freeze = memfd_luo_freeze,
		.unpreserve = memfd_luo_unpreserve,
		/* Incoming kernel: rebuild the file, then release leftovers. */
		.retrieve = memfd_luo_retrieve,
		.finish = memfd_luo_finish,
	};

	/* File handler registered with LUO; matched via its compatible string. */
	static struct liveupdate_file_handler memfd_luo_handler = {
		.compatible = MEMFD_LUO_FH_COMPATIBLE,
		.ops = &memfd_luo_file_ops,
	};

	/*
	 * Register the memfd file handler with LUO at late init time. A result of
	 * -EOPNOTSUPP from the registration is treated as success.
	 *
	 * Return: 0 on success, a negative errno on any other registration failure.
	 */
	static int __init memfd_luo_init(void)
	{
		int ret;

		ret = liveupdate_register_file_handler(&memfd_luo_handler);
		if (!ret || ret == -EOPNOTSUPP)
			return 0;

		pr_err("Could not register luo filesystem handler: %pe\n",
		       ERR_PTR(ret));

		return ret;
	}
	late_initcall(memfd_luo_init);