| // SPDX-License-Identifier: GPL-2.0 |
| /* |
| * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. |
| * This program is free software; you can redistribute it and/or modify |
| * it under the terms of the GNU General Public License version 2 and |
| * only version 2 as published by the Free Software Foundation. |
| * |
| * This program is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| * GNU General Public License for more details. |
| */ |
| |
| #include <linux/percpu.h> |
| #include <linux/slab.h> |
| #include <linux/fs.h> |
| #include <linux/kthread.h> |
| #include <linux/list.h> |
| #include <linux/ratelimit.h> |
| #include <linux/writeback.h> |
| #include "euler.h" |
| #include "dep.h" |
| #include "lock.h" |
| #include "dax.h" |
| #include "dht.h" |
| |
| static void do_dep_diradd_oneshot(struct inode *dir_inode, struct dep_node *dep, |
| u64 *bitset); |
| |
| struct flush_list_head { |
| int count; |
| struct llist_head head; |
| }; |
| |
| DEFINE_PER_CPU(struct flush_list_head, flush_list_percpu); |
| |
| #define IFMT_HAS_ROOT(ifmt) \ |
| ((ifmt) == S_IFREG || (ifmt) == S_IFDIR || (ifmt) == S_IFLNK) |
| |
| #define INODE_COND_TRYLOCK(inode, tag, enter_cond, exit_cond, exit_expr) \ |
| do { \ |
| tag: \ |
| if (enter_cond) { \ |
| if (likely(inode_trylock(inode))) { \ |
			/* got the lock, nothing more to do */     \
| } else { \ |
| if (exit_cond) { \ |
| exit_expr; \ |
| } else { \ |
| cond_resched(); \ |
| goto tag; \ |
| } \ |
| } \ |
| } \ |
| } while (0) |
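
/*
 * Usage sketch (hypothetical; the open-coded retry loop in
 * do_dep_diradd_oneshot() below follows the same pattern):
 *
 *	INODE_COND_TRYLOCK(inode, again, true,
 *			   eufs_inode_mark_lock_transferring(inode),
 *			   lock_transferred = true);
 *
 * While enter_cond holds, keep trying the lock; on a failed trylock,
 * either exit_cond grants an alternative way forward (exit_expr runs
 * and we stop), or we reschedule and retry at the tag.
 */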
| |
| static inline void fsync_dir_oneshot(struct inode *dir) |
| { |
| eufs_dir_fsync_oneshot(dir); |
| } |
| |
| static void do_dep_dirrem(struct inode *inode, struct dep_node *dep, |
| u64 *bitset) |
| { |
| struct nv_dict_entry *prevde = dep->prevde; |
| struct nv_dict_entry *de = dep->de; |
| int idx; |
| |
| eufs_dbg("!! %s !!", __func__); |
| NV_ASSERT(de); |
| NV_ASSERT(de->inode); |
| NV_ASSERT(de->name); |
| |
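	/*
	 * Mark the dentry's bucket in the dirty bitset (8 u64 words track
	 * up to 512 buckets) so that eufs_sync_buckets() later flushes
	 * only the touched buckets.
	 */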
| idx = INDEX(de->hv); |
	bitset[idx / 64] |= 0x1ull << (idx & 63);
| eufs_dbg("bitset-add: dict=%llx, %d %llx\n", |
| eufs_iread_dict(EUFS_PI(inode)), idx, bitset[idx / 64]); |
| |
| /* |
| * This is a removal of a newly created dentry, nothing to do, |
| * the prevde is already manipulated in dht.c |
| */ |
| if (de->volatile_next == EUFS_DIR_DELNEW) |
| return; |
| |
| /* |
| * If dentries immediately following the deleted dentry are |
| * also deleted, prevde->volatile_next will be modified again. |
| * So if we assign prevde->volatile_next to prevde->next, |
| * these deletion will be persisted prematurely. |
| */ |
| if (prevde && !eufs_dentry_is_not_persist(prevde)) { |
| prevde->next = de->next; |
| persist_dentry(prevde); |
| } |
| } |
| |
| static void do_dep_dirrem_reclaim(struct super_block *sb, struct dep_node *dep) |
| { |
| struct nv_dict_entry *de = dep->de; |
| struct eufs_inode __maybe_unused *pi; |
| struct inode *child; |
| |
| pi = s2p(sb, de->inode); |
	child = dep->inode;
	BUG_ON(!child);
	NV_ASSERT(EUFS_PI(child) == pi);
	eufs_dbg("dirrem: child_inode=%px\n", child);
| eufs_free_name(sb, de); |
| nv_free(sb, de); |
| } |
| |
| #define EUFS_PRINT_BITSET(lvl, bitset) \ |
| eufs_##lvl("bitsets: %llx %llx %llx %llx %llx %llx %llx %llx\n", \ |
| bitset[0], bitset[1], bitset[2], bitset[3], bitset[4], \ |
| bitset[5], bitset[6], bitset[7]) |
| |
| static void eufs_sync_buckets(struct eufs_inode_info *vi, u64 bitset[8]) |
| { |
| struct inode *inode = &vi->vfs_inode; |
| struct super_block *sb = inode->i_sb; |
| struct eufs_inode *pi = EUFS_FRESH_PI(EUFS_PI(inode)); |
| struct nv_dict *dict; |
| int i; |
| |
| /* Volatile buckets */ |
| if (!vi->i_volatile_dict) |
| return; |
| |
| EUFS_PRINT_BITSET(dbg, bitset); |
| |
| BUG_ON(!inode_is_header_locked(inode)); |
| dict = o2p(sb, eufs_iread_dict(pi)); |
| for (i = 0; i < 8; ++i) { |
| int j; |
| bool dirty; |
| int idx; |
| |
| if (!bitset[i]) |
| continue; |
| dirty = false; |
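		/*
		 * Eight 8-byte bucket pointers share one 64-byte cacheline,
		 * so flush once per group of eight slots; the extra j == 64
		 * iteration flushes the final dirty cacheline of this word.
		 */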
| for (j = 0; j <= 64; ++j) { |
| if (j % 8 == 0 && dirty) { |
| dirty = false; |
| eufs_flush_cacheline(&dict->table[idx]); |
| } |
| if (j == 64) |
| break; |
| if (!(bitset[i] & (0x1ull << j))) |
| continue; |
| idx = i * 64 + j; |
| eufs_dbg_dir("handle index %d (i %d, j %d) of inode=%px\n", |
| idx, i, j, inode); |
| |
| eufs_dbg_dir(" idx=%d dict[idx]=%px vdict[idx]=%px\n", |
| idx, dict->table[idx], |
| vi->i_volatile_dict->table[idx]); |
| |
| if (unlikely(vi->i_volatile_dict->table[idx] == |
| EUFS_DIR_EOC_PTR)) |
| dict->table[idx] = NULL_VAL; |
| else if (vi->i_volatile_dict->table[idx] != NULL) |
| dict->table[idx] = COMPOSE_DICT_HEAD_le64( |
| sb, vi->i_volatile_dict->table[idx]); |
| vi->i_volatile_dict->table[idx] = NULL; |
| dirty = true; |
| } |
| } |
| } |
| |
| /* |
| * Some ideas on fast fsync (of dir): |
| * |
| * 1. Batch and coalescence. The newly inserted dentry should be marked and |
| * during its removal, it should be marked again so that unnecessary dep_diradd |
| * an be prevented. |
| * |
| * 2. Split! The lock (only when there is one lock needed) can be temporarily |
| * given up so between handling two deps. This requires that the dentry pointed |
| * by dir_pi should not be reclaimed (like in RCU). Well, actually, combined |
| * with the following one idea, this is quite acceptable. |
| * |
| * 3. Delayed free. The removal operations can be delayed until the locks are |
| * released. |
| * |
| * |
| * Parallel fsync for a vi is not throughly considered though. |
| * |
| * 4. Detach only if the list is empty? |
| */ |
| static void fsync_rename_inode(struct inode *dir) |
| { |
| struct eufs_inode_info *vi = EUFS_I(dir); |
| |
| if (!vi->i_is_dirty) |
| return; |
| |
	/* We hold the inode lock, so the dirty state is stable here. */
| fsync_dir_oneshot(dir); |
| } |
| |
| void fsync_rename_inodes(struct inode *old_dir, struct inode *new_dir, |
| struct inode **locked_inodes) |
| { |
| int i; |
| struct inode *inode; |
| |
| /* |
| * The two parent dirs, might have parent-child relations sometime |
| * before. So we need to transfer these two dirs too. |
| */ |
| for (i = 0; i < EUFS_INODE_CNT_IN_RENAME; i++) { |
| inode = locked_inodes[i]; |
| if (inode) |
| eufs_inode_mark_lock_transferable(inode); |
| } |
| |
| if (old_dir == new_dir) { |
| fsync_rename_inode(old_dir); |
| } else { |
| fsync_rename_inode(old_dir); |
| fsync_rename_inode(new_dir); |
| } |
| |
| for (i = 0; i < EUFS_INODE_CNT_IN_RENAME; i++) { |
| inode = locked_inodes[i]; |
| if (inode) |
| eufs_inode_wait_lock_transfer_done(inode); |
| } |
| } |
| |
| static void eufs_update_persisted_seq(struct eufs_inode_info *vi, |
| struct list_head *head) |
| { |
| if (!list_empty(head)) { |
| struct dep_node *dep = |
| list_last_entry(head, struct dep_node, node); |
| |
| vi->i_persisted_dep_seq = dep->seq; |
| } |
| } |
| |
| static int fsync_dir_bg(struct inode *dir) |
| { |
| struct dep_node *dep, *next; |
| LIST_HEAD(detached_list); |
| LIST_HEAD(dump_list); |
| int i; |
| #define FSYNC_DIR_VI_LOOP_NUM (20) |
| |
| struct eufs_inode_info *vi = EUFS_I(dir); |
| struct super_block *sb = dir->i_sb; |
| struct eufs_sb_info *sbi = EUFS_SB(sb); |
| struct eufs_inode *pi = EUFS_PI(dir); |
| u64 bitset[8] = { 0 }; |
| int dep_count = 0; |
| |
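	/*
	 * Phase 1 drains the dep list round by round, taking the header
	 * lock only in the last round; phase 2 publishes the volatile
	 * buckets; phase 3 persists the inode itself, after which the
	 * handled deps are reclaimed.
	 */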
| retry: |
| inode_urgent_lock(dir); |
| |
| /* Phase 1 */ |
| for (i = FSYNC_DIR_VI_LOOP_NUM; i >= 0; --i) { |
| /* Get all deps round by round */ |
| if (i == 0) { |
| /* Last round */ |
| inode_header_lock(dir); |
| } |
| inode_dep_lock(dir); |
| |
| if (list_empty(&vi->i_dep_list) && i > 0) { |
| /* Skip to last round */ |
| i = 1; |
| } |
| list_cut_position(&detached_list, &vi->i_dep_list, |
| vi->i_dep_list.prev); |
| |
| if (i > 0) |
| inode_dep_unlock(dir); |
| |
		/* Handle the deps one by one. */
| list_for_each_entry_safe(dep, next, &detached_list, node) { |
| if (dep->type == DEP_DIRADD) { |
| /* |
| * FIXME: the lockset might be different since |
| * we might have released the inode lock. |
| */ |
| do_dep_diradd_oneshot(dir, dep, bitset); |
| |
| } else if (dep->type == DEP_DIRREM) { |
| do_dep_dirrem(dir, dep, bitset); |
| |
| } else |
| BUG(); |
| } |
| |
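		/* Defer reclamation until all locks are dropped (delayed free). */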
| list_splice_tail_init(&detached_list, &dump_list); |
| |
| if (i == 0) { |
| eufs_pbarrier(); |
| |
| if (!list_empty(&dump_list)) |
| /* Phase 2 */ |
| eufs_sync_buckets(vi, bitset); |
| |
| inode_dep_unlock(dir); |
| inode_header_unlock(dir); |
| break; |
| } |
| } |
| |
| inode_urgent_unlock(dir); |
| |
| /* Phase 3 */ |
| inode_lock(dir); |
| |
| if (!list_empty(&vi->i_dep_list)) { |
| inode_unlock(dir); |
| /* To handle new deps between phase 2 & 3 */ |
| /* FIXME: Live lock possible! */ |
| goto retry; |
| } |
| |
| if (dir->i_nlink) |
| eufs_sync_pinode(dir, pi, false); |
| |
| eufs_update_persisted_seq(vi, &dump_list); |
| |
| vi->i_is_persisting = false; |
| vi->i_is_dirty = false; |
| |
| if (dir->i_nlink) |
| persist_pinode(pi); |
| |
| inode_unlock(dir); |
| |
| eufs_pbarrier(); |
| |
| /* Reclaim memory and clear the list */ |
| list_for_each_entry_safe(dep, next, &dump_list, node) { |
| struct inode *child_inode = dep->inode; |
| struct eufs_inode_info *child_vi = EUFS_I(child_inode); |
| |
| if (dep->type == DEP_DIRREM) |
| do_dep_dirrem_reclaim(sb, dep); |
| |
| /* remove from owner list */ |
| spin_lock(&child_vi->i_owner_lock); |
| list_del_init(&dep->owner_node); |
| spin_unlock(&child_vi->i_owner_lock); |
| |
| iput(child_inode); |
| |
| list_del(&dep->node); |
| |
| eufs_free_dep_node(dep); |
| dep_count++; |
| } |
| atomic_sub(dep_count, &sbi->s_nr_dep_nodes); |
| eufs_dbg("@cpu=%d !! fsync dir vi done: inode=%px\n", |
| smp_processor_id(), &vi->vfs_inode); |
| return 0; |
| } |
| |
| static int fsync_nondir_oneshot(struct inode *inode) |
| { |
| struct eufs_inode_info *vi = EUFS_I(inode); |
| struct eufs_inode *pi; |
| |
| /* For files other than dir */ |
| WARN(S_ISDIR(inode->i_mode), "%s on a dir!", __func__); |
| |
	/* The inode is being removed; nothing to persist. */
| if (!inode->i_nlink) { |
| vi->i_is_dirty = false; |
| return 0; |
| } |
| |
| pi = EUFS_PI(inode); |
| |
| eufs_sync_pinode(inode, pi, false); |
| |
| persist_pinode(pi); |
| |
| vi->i_is_dirty = false; |
| |
| return 0; |
| } |
| |
| static int fsync_nondir_bg(struct inode *inode) |
| { |
| struct eufs_inode_info *vi = EUFS_I(inode); |
| int r; |
| |
| inode_lock(inode); |
| r = fsync_nondir_oneshot(inode); |
| vi->i_is_persisting = false; |
| inode_unlock(inode); |
| |
| return r; |
| } |
| |
| static void fsync_bg(struct inode *inode) |
| { |
| struct eufs_sb_info *sbi = EUFS_SB(inode->i_sb); |
| |
| wait_on_inode(inode); |
| |
| /* Reading i_mode may need no protection */ |
| if (S_ISDIR(inode->i_mode)) |
| fsync_dir_bg(inode); |
| else |
| fsync_nondir_bg(inode); |
| |
	/* Drop the inode reference held for background persistence. */
| iput(inode); |
| |
| if (atomic_dec_and_test(&sbi->s_nr_dirty_inodes) && sbi->s_draining) { |
| /* end of draining */ |
| sbi->s_draining = false; |
| } |
| } |
| |
| void fsync_oneshot(struct inode *inode) |
| { |
| /* Reading i_mode may need no protection */ |
| if (S_ISDIR(inode->i_mode)) |
| fsync_dir_oneshot(inode); |
| else |
| fsync_nondir_oneshot(inode); |
| } |
| |
| static void do_dep_diradd_oneshot(struct inode *dir_inode, struct dep_node *dep, |
| u64 *bitset) |
| { |
| struct super_block *sb = dir_inode->i_sb; |
| struct nv_dict_entry *de = dep->de; |
| struct inode *inode = dep->inode; |
| struct eufs_inode_info *dir_vi = EUFS_I(dir_inode); |
| struct eufs_inode *pi; |
| struct eufs_inode *fresh_pi; |
| int idx; |
| void *buffer[16]; |
| struct alloc_batch ab; |
| bool lock_transferred = false; |
| |
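	/* Mark the target bucket dirty, as in do_dep_dirrem(). */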
| idx = INDEX(de->hv); |
	bitset[idx / 64] |= 0x1ull << (idx & 63);
| |
| if (de->volatile_next == EUFS_DIR_DELNEW) { |
| /* |
| * The de is already invisible from both the latest view and |
| * the consistent view. |
| * Will be handled in the corresponding dirrem. |
| */ |
| return; |
| } |
| |
	/* The not-persist flag on the dentry is the signature of a pending diradd */
| WARN(!eufs_dentry_is_not_persist(de), "diradd wrong sign"); |
| |
| pi = s2p(sb, de->inode); |
| |
| wait_on_inode(inode); |
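	/*
	 * Take the child's inode lock, or borrow it: if the current holder
	 * has marked the lock transferable (see fsync_on_draining() and
	 * fsync_rename_inodes()), proceed under the transferred lock and
	 * signal completion afterwards.
	 */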
| retry: |
| if (likely(inode_trylock(inode))) { |
| /* Got the lock */ |
| } else { |
| if (eufs_inode_mark_lock_transferring(inode)) { |
| lock_transferred = true; |
| } else { |
| cond_resched(); |
| goto retry; |
| } |
| } |
| |
| eufs_sync_pinode(inode, pi, false); |
| fresh_pi = EUFS_FRESH_PI(pi); |
| |
| if (!lock_transferred) |
| inode_unlock(inode); |
| else |
| eufs_inode_lock_transfer_done(inode); |
| |
| ab.n_used = 0; |
| ab.size = 16; |
| ab.batch = buffer; |
| |
| eufs_alloc_batch_add(sb, &ab, de); |
| /* |
| * force to persist the allocation without checking. |
| * TODO: we should differentiate the link and create syscall to agree |
| * with checking |
| */ |
| eufs_alloc_persist(sb, pi, true); |
| |
| if (S_ISLNK(fresh_pi->i_mode)) { |
| void *root = o2p(sb, eufs_iread_root(fresh_pi)); |
| |
		/* A regular file's root is handled in the btree code. */
		/* For a hard link, we must force the allocation persistence. */
| eufs_alloc_persist(sb, root, true); |
| persist_symlink(root); |
| } else if (S_ISDIR(fresh_pi->i_mode)) { |
| void *root = o2p(sb, eufs_iread_root(fresh_pi)); |
| |
| eufs_alloc_persist(sb, root, false); |
| persist_page(root); |
| } |
| |
| persist_name(sb, de, &ab); |
| |
| eufs_alloc_batch_persist_reset(sb, &ab); |
| |
| persist_pinode(pi); |
| |
| spin_lock(&dir_vi->i_dentry_persist_lock); |
| eufs_dentry_clr_not_persist_flag(de); |
| spin_unlock(&dir_vi->i_dentry_persist_lock); |
| |
| persist_dentry(de); |
| } |
| |
| void eufs_dir_fsync_oneshot(struct inode *dir) |
| { |
| struct dep_node *dep; |
| struct dep_node *next; |
| struct super_block *sb = dir->i_sb; |
| struct eufs_sb_info *sbi = EUFS_SB(sb); |
| struct eufs_inode_info *vi = EUFS_I(dir); |
| LIST_HEAD(detached_list); |
| u64 bitset[8] = { 0 }; |
| int dep_count = 0; |
| |
| BUG_ON(!inode_is_locked(dir)); |
| |
| inode_urgent_lock(dir); |
| |
| /* get all deps */ |
| inode_header_lock(dir); |
| inode_dep_lock(dir); |
| |
| if (list_empty(&vi->i_dep_list)) |
| goto unlock_sync_pinode; |
| |
| list_for_each_entry(dep, &vi->i_dep_list, node) { |
| if (dep->type == DEP_DIRADD) |
| do_dep_diradd_oneshot(dir, dep, bitset); |
| else if (dep->type == DEP_DIRREM) |
| do_dep_dirrem(dir, dep, bitset); |
| else |
| BUG(); |
| } |
| |
| list_splice_init(&vi->i_dep_list, &detached_list); |
| |
| /* sync buckets */ |
| eufs_pbarrier(); |
| eufs_sync_buckets(vi, bitset); |
| |
| unlock_sync_pinode: |
| inode_dep_unlock(dir); |
| inode_header_unlock(dir); |
| |
| /* sync pinode */ |
| if (dir->i_nlink) |
| eufs_sync_pinode(dir, EUFS_PI(dir), false); |
| |
| eufs_pbarrier(); |
| |
| eufs_update_persisted_seq(vi, &detached_list); |
| |
| vi->i_is_dirty = false; |
| |
| /* Reclaim memory and clear the list */ |
| list_for_each_entry_safe(dep, next, &detached_list, node) { |
| struct inode *child_inode = dep->inode; |
| struct eufs_inode_info *child_vinode = EUFS_I(child_inode); |
| |
| spin_lock(&child_vinode->i_owner_lock); |
| list_del_init(&dep->owner_node); |
| spin_unlock(&child_vinode->i_owner_lock); |
| |
		if (dep->type == DEP_DIRREM)
			do_dep_dirrem_reclaim(sb, dep);
		iput(child_inode);
| list_del(&dep->node); |
| eufs_free_dep_node(dep); |
| dep_count++; |
| } |
| atomic_sub(dep_count, &sbi->s_nr_dep_nodes); |
| |
| inode_urgent_unlock(dir); |
| } |
| |
| void fsync_on_draining(struct inode *dir, struct inode *inode) |
| { |
| BUG_ON(!dir); |
| BUG_ON(!inode_is_locked(dir)); |
| BUG_ON(inode && !inode_is_locked(inode)); |
| |
| /* for link/unlink/rmdir */ |
| if (inode) |
| eufs_inode_mark_lock_transferable(inode); |
| |
| fsync_dir_oneshot(dir); |
| |
| if (inode) |
| eufs_inode_wait_lock_transfer_done(inode); |
| } |
| |
| #define NR_FLUSH_EACH_ROUND (16) |
| #define FLUSH_START_THRESHOLD (64) |
| |
static __always_inline int
handle_persistees_for_each_cpu(struct super_block *sb,
			       const struct cpumask *mask, int idx)
{
| struct eufs_sb_info *sbi = EUFS_SB(sb); |
| struct llist_node *list; |
| struct llist_head *head; |
| struct eufs_inode_info *vi; |
| struct eufs_inode_info *next; |
| int n_active_list; |
| int cpu; |
| bool need; |
| |
| retry: |
| need = sbi->need_sync[idx]; |
| n_active_list = 0; |
| for_each_cpu(cpu, mask) { |
| head = per_cpu_ptr(sbi->persistee_list, cpu); |
| |
| if (unlikely(llist_empty(head))) |
| continue; |
| |
| n_active_list++; |
| |
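		/* Atomically detach this cpu's whole lock-free list. */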
| list = llist_del_all(head); |
| |
| eufs_dbg("persister get list %px for cpu%d\n", list, cpu); |
| |
| /* reverse the ordering for better locality? */ |
| llist_for_each_entry_safe(vi, next, list, i_persistee_node) |
| fsync_bg(&vi->vfs_inode); |
| eufs_dbg("persister handled list %px\n", list); |
| } |
| /** |
| * We need a complete round of run for fssync. If |
| * need != sbi->need_sync[idx], need_sync was modified during our last |
| * round. We need to retry to ensure a complete round of run. |
| * It's okay if dirty inodes of a cpu is still being processed by |
| * another persister, since we will wait for all persisters to finish |
| * for fssync. |
| */ |
| if (need != READ_ONCE(sbi->need_sync[idx])) |
| goto retry; |
| if (need) { |
| sbi->need_sync[idx] = false; |
| wake_up(&sbi->sync_wq); |
| } |
| if (READ_ONCE(sbi->need_sync[idx])) |
| goto retry; |
| |
| return n_active_list; |
| } |
| |
| static int persister(void *data) |
| { |
| struct super_block *sb = data; |
| struct eufs_sb_info *sbi = EUFS_SB(sb); |
| const struct cpumask *mask = cpumask_of_node(numa_node_id()); |
| const int period = |
| (persist_period == 0) ? /* default */ (HZ / 4) : |
| /* less than a second */ |
| ((persist_period < 0) ? (HZ / (-persist_period)) : |
| /* more than a second */ |
| (HZ * persist_period)); |
| int idx = 0; |
| int num_persisters = num_sockets * persisters_per_socket; |
| |
| eufs_info("sb=%px cpu=%d cpumask=%*pbl period=%d\n", data, |
| smp_processor_id(), cpumask_pr_args(mask), period); |
| |
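	/* Locate this kthread's slot in sbi->persisters[] to derive its index. */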
| while (idx < num_persisters && sbi->persisters[idx] != current) |
| idx++; |
| BUG_ON(idx >= num_persisters); |
| |
| while (!kthread_should_stop()) { |
| set_current_state(TASK_INTERRUPTIBLE); |
| schedule_timeout(period); |
| handle_persistees_for_each_cpu(sb, mask, idx); |
| } |
| |
| while (handle_persistees_for_each_cpu(sb, mask, idx)) |
| cpu_relax(); |
| |
| eufs_info("finalizing on %d\n", smp_processor_id()); |
| |
| return 0; |
| } |
| |
| int dep_init(struct super_block *sb) |
| { |
| struct eufs_sb_info *sbi = EUFS_SB(sb); |
| int cpu; |
| int i, j; |
| char name[BDEVNAME_SIZE]; |
| int err; |
| |
| sbi->persistee_list = alloc_percpu(struct llist_head); |
| if (!sbi->persistee_list) { |
| err = -ENOMEM; |
| goto cleanup; |
| } |
| |
| /* init each llist */ |
| for_each_possible_cpu(cpu) |
| init_llist_head(per_cpu_ptr(sbi->persistee_list, cpu)); |
| |
	sbi->persisters = kcalloc(persisters_per_socket * num_sockets,
				  sizeof(struct task_struct *), GFP_KERNEL);
| if (!sbi->persisters) { |
| err = -ENOMEM; |
| goto cleanup; |
| } |
| |
	sbi->need_sync = kcalloc(persisters_per_socket * num_sockets,
				 sizeof(bool), GFP_KERNEL);
| if (!sbi->need_sync) { |
| err = -ENOMEM; |
| goto cleanup; |
| } |
| |
| init_waitqueue_head(&sbi->sync_wq); |
| |
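	/* One group of persisters per socket, each pinned to that socket's CPUs. */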
| bdevname(sb->s_bdev, name); |
| for (i = 0; i < num_sockets; ++i) { |
| for (j = 0; j < persisters_per_socket; ++j) { |
| int idx = i * persisters_per_socket + j; |
| |
| sbi->persisters[idx] = kthread_create_on_node( |
| persister, sb, i, "hmfs/%s-%d.%d", name, i, j); |
| |
| if (IS_ERR(sbi->persisters[idx])) { |
| err = PTR_ERR(sbi->persisters[idx]); |
| pr_err("create persister %s-%d.%d error %d", |
| name, i, j, err); |
| sbi->persisters[idx] = NULL; |
| goto cleanup; |
| } |
| |
| set_cpus_allowed_ptr(sbi->persisters[idx], |
| cpumask_of_node(i)); |
| |
| wake_up_process(sbi->persisters[idx]); |
| } |
| } |
| |
| return 0; |
| |
| cleanup: |
| dep_fini(sb); |
| return err; |
| } |
| |
| void dep_fini(struct super_block *sb) |
| { |
| struct eufs_sb_info *sbi = EUFS_SB(sb); |
| |
| if (sbi->persisters) { |
| int i; |
| |
| for (i = 0; i < persisters_per_socket * num_sockets; ++i) { |
| if (sbi->persisters[i]) { |
| kthread_stop(sbi->persisters[i]); |
| sbi->persisters[i] = NULL; |
| } |
| } |
| |
| kfree(sbi->persisters); |
| sbi->persisters = NULL; |
| } |
| |
| kfree(sbi->need_sync); |
| sbi->need_sync = NULL; |
| |
| free_percpu(sbi->persistee_list); |
| sbi->persistee_list = NULL; |
| } |