blob: c3db0750b66f3c4c44cc847554de4f0c95fd1b14 [file] [log] [blame]
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved.
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
* only version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include <linux/fs.h>
#include <linux/aio.h>
#include <linux/sched.h>
#include <linux/highuid.h>
#include <linux/module.h>
#include <linux/mpage.h>
#include <linux/backing-dev.h>
#include <linux/types.h>
#include <linux/ratelimit.h>
#include <linux/dax.h>
#include "euler.h"
#include "dax.h"
#include "dht.h"
#include "dep.h"
/*
 * eufs_read_pinode - fill a VFS inode from its persistent inode.
 * @inode: in-memory inode being initialised
 * @pi:    persistent inode; the copy chosen by EUFS_FRESH_PI() is the one read
 *
 * Copies mode, ownership, timestamps, link count, size and the per-type
 * root information into @inode and its eufs_inode_info, then installs the
 * inode/file operations matching the file type.
 *
 * Returns 0 on success. If the persistent link count is zero the inode is
 * marked bad and -ESTALE is returned.
 */
static int eufs_read_pinode(struct inode *inode, struct eufs_inode *pi)
{
	struct super_block *sb = inode->i_sb;
	struct eufs_inode_info *vi = EUFS_I(inode);
	umode_t mode;
	u64 encoded_root;
	u64 blocks = 0;

	eufs_dbg("%s: inode=%px pi=%px, pi->i_mode=%x\n", __func__, inode,
		 pi, eufs_iread_mode(pi));

	/* From here on only the copy picked by EUFS_FRESH_PI() is read. */
	pi = EUFS_FRESH_PI(pi);

	eufs_set_inode_flags(inode, eufs_iread_flags(pi));
	mode = eufs_iread_mode(pi);
	inode->i_mode = mode;
	vi->i_version = eufs_iread_version(pi);

	/* Ownership and identity. */
	i_uid_write(inode, eufs_iread_uid(pi));
	i_gid_write(inode, eufs_iread_gid(pi));
	vi->i_dotdot = eufs_iread_dotdot(pi);
	vi->i_ext = eufs_iread_ext(pi);
	inode->i_generation = eufs_iread_generation(pi);
	set_nlink(inode, eufs_iread_nlink(pi));

	/* Timestamps. */
	inode->i_ctime.tv_sec = eufs_iread_ctime(pi);
	inode->i_ctime.tv_nsec = eufs_iread_ctime_nsec(pi);
	inode->i_mtime.tv_sec = eufs_iread_mtime(pi);
	inode->i_mtime.tv_nsec = eufs_iread_mtime_nsec(pi);
	inode->i_atime.tv_sec = eufs_iread_atime(pi);
	inode->i_atime.tv_nsec = eufs_iread_atime_nsec(pi);

	inode->i_size = eufs_iread_size(pi);

	/* Per-type volatile state and block accounting. */
	if (S_ISDIR(mode)) {
		vi->i_dotdot = eufs_iread_dotdot(pi);
		vi->i_volatile_root = NULL;
		vi->i_volatile_height = 0;
		blocks = 1;
	} else if (S_ISREG(mode) || S_ISLNK(mode)) {
		if (S_ISREG(mode)) {
			vi->i_volatile_tree_blocks = eufs_iread_tree_blocks(pi);
			eufs_alloc_batch_init(&vi->page_batch, 2);
		}
		encoded_root = eufs_iread_root(pi);
		vi->i_volatile_root = o2p(sb, root_ptr(encoded_root));
		vi->i_volatile_height = root_height(encoded_root);
		/* For regular files the block count includes holes as well. */
		blocks = 1;
		if (S_ISREG(mode))
			blocks = vi->i_volatile_tree_blocks;
	} else if (S_ISCHR(mode) || S_ISBLK(mode)) {
		inode->i_rdev = eufs_iread_rdev(pi);
	}

	/* A zero link count means the inode has been deleted. */
	if (inode->i_nlink == 0) {
		make_bad_inode(inode);
		return -ESTALE;
	}

	inode->i_blocks = blocks << (inode->i_blkbits - 9);
	inode->i_mapping->a_ops = &eufs_aops;

	/* Install the operation tables for this file type. */
	if (S_ISREG(mode)) {
		inode->i_op = &eufs_file_inode_operations;
		inode->i_fop = &eufs_file_operations;
	} else if (S_ISDIR(mode)) {
		inode->i_op = &eufs_dir_inode_operations;
		inode->i_fop = &eufs_dir_operations;
	} else if (S_ISLNK(mode)) {
		inode->i_op = &eufs_symlink_inode_operations;
	} else {
		inode->i_size = 0;
		inode->i_op = &eufs_special_inode_operations;
		init_special_inode(inode, inode->i_mode, eufs_iread_rdev(pi));
	}

	return 0;
}
/*
 * eufs_sync_pinode - write the in-memory inode state to its persistent inode.
 * @inode: VFS inode whose state is written out
 * @pi:    one of the two persistent copies; the other is EUFS_TWIN_PI(pi)
 * @evict: true when called from inode eviction; in that case the inode lock
 *         is not required to be held
 *
 * The two copies are ordered by their i_fresh counters. All fields are
 * written to the copy that is NOT selected as the freshest, that copy is
 * flushed, and only then is its i_fresh raised above the other copy's.
 *
 * Does nothing when the inode's link count is zero. BUGs when @pi or @inode
 * is NULL, or when !@evict and the inode lock is not held.
 */
void eufs_sync_pinode(struct inode *inode, struct eufs_inode *pi, bool evict)
{
struct eufs_inode_info *vi = EUFS_I(inode);
struct super_block *sb = inode->i_sb;
u64 pi_root_o;
u64 pi_tree_blocks;
struct eufs_inode __pmem *twin_pi = EUFS_TWIN_PI(pi);
bool new = false;
BUG_ON(!pi);
BUG_ON(!inode);
BUG_ON(!evict && !inode_is_locked(inode));
/* Unlinked inodes are not written back. */
if (!inode->i_nlink)
return;
/* let pi be the latest pinode */
/* An i_fresh of 0 on either copy marks the inode as new. */
if (!pi->i_fresh || !twin_pi->i_fresh)
new = true;
/*
 * Swap when the twin is fresher, or, for a new inode, when pi sits at the
 * higher address of the pair.
 */
if (pi->i_fresh < twin_pi->i_fresh || (new && (pi > twin_pi))) {
struct eufs_inode *t = pi;
pi = twin_pi;
twin_pi = t;
}
/* Start from the values currently stored in the fresher copy. */
pi_root_o = eufs_iread_root(pi);
pi_tree_blocks = eufs_iread_tree_blocks(pi);
switch (inode->i_mode & S_IFMT) {
case S_IFDIR:
break;
case S_IFREG:
BUG_ON(!evict && !inode_is_locked(inode));
if (vi->i_volatile_tree_blocks > pi_tree_blocks) {
/* For a newly created pi, this is always true */
void __pmem *root = vi->i_volatile_root;
int height = vi->i_volatile_height;
BUG_ON(root_height(pi_root_o) > vi->i_volatile_height);
eufs_alloc_batch_persist_reset(sb, &vi->page_batch);
/*
 * The byte range passed runs from the persisted tree size to the
 * volatile tree size.
 */
eufs_persist_btree(
sb, root, height, pi_tree_blocks * PAGE_SIZE,
vi->i_volatile_tree_blocks * PAGE_SIZE);
} else {
eufs_alloc_batch_persist_reset(sb, &vi->page_batch);
}
/* The volatile root/height and block count become the values to store. */
pi_root_o = encode_root(p2o(sb, vi->i_volatile_root),
vi->i_volatile_height);
pi_tree_blocks = vi->i_volatile_tree_blocks;
break;
case S_IFLNK:
/* Never change */
break;
case S_IFCHR:
case S_IFBLK:
/* The root field carries i_rdev in both its upper and lower 32 bits. */
pi_root_o = ((u64)inode->i_rdev << 32) | inode->i_rdev;
break;
}
if (!evict && !inode_is_locked(inode)) {
eufs_info("! inode=%px\n", inode);
BUG();
}
BUG_ON(!evict && !inode_is_locked(inode));
/* update to new data */
eufs_iwrite_flags(twin_pi, eufs_get_inode_flags(inode, pi));
eufs_iwrite_mode(twin_pi, inode->i_mode);
/* NOTE(review): a constant 1 is stored here, not vi->i_version - confirm intent. */
eufs_iwrite_version(twin_pi, 1);
eufs_iwrite_ctime(twin_pi, inode->i_ctime.tv_sec);
eufs_iwrite_ctime_nsec(twin_pi, inode->i_ctime.tv_nsec);
eufs_iwrite_uid(twin_pi, i_uid_read(inode));
eufs_iwrite_gid(twin_pi, i_gid_read(inode));
eufs_iwrite_dotdot(twin_pi, vi->i_dotdot);
eufs_iwrite_ext(twin_pi, vi->i_ext); /* no ext here */
eufs_iwrite_generation(twin_pi, inode->i_generation);
eufs_iwrite_nlink(twin_pi, inode->i_nlink);
eufs_iwrite_mtime(twin_pi, inode->i_mtime.tv_sec);
eufs_iwrite_atime(twin_pi, inode->i_atime.tv_sec);
eufs_iwrite_mtime_nsec(twin_pi, inode->i_mtime.tv_nsec);
eufs_iwrite_atime_nsec(twin_pi, inode->i_atime.tv_nsec);
eufs_iwrite_root(twin_pi, pi_root_o);
eufs_iwrite_size(twin_pi, inode->i_size);
eufs_iwrite_tree_blocks(twin_pi, pi_tree_blocks);
/* Flush the new contents before the freshness counter is changed. */
eufs_flush_cacheline(twin_pi);
if (new) {
/* Handle new */
/* pi gets 1 and is flushed first; the just-written twin then gets 2. */
pi->i_fresh = 1;
eufs_flush_cacheline(&pi->i_fresh);
twin_pi->i_fresh = 2;
} else if (unlikely(pi->i_fresh == U16_MAX)) {
/* Handle overflow */
/* Invariant: pi should always be the freshest */
/* freshness 0 is reserved for new inodes */
/*
 * Renumber in steps, flushing after each: twin=1, pi=2, twin=3, so that
 * the just-written twin ends up with the larger counter.
 */
twin_pi->i_fresh = 1;
eufs_flush_cacheline(&twin_pi->i_fresh);
pi->i_fresh = 2;
eufs_flush_cacheline(&pi->i_fresh);
twin_pi->i_fresh = 3;
} else {
/* Normal case */
twin_pi->i_fresh = pi->i_fresh + 1;
}
/* This flush also flushes the bottom half of the twin_pi */
eufs_flush_cacheline(&twin_pi->i_fresh);
}
/*
 * eufs_iget - obtain the VFS inode for a persistent inode.
 * @sb: super block the inode belongs to
 * @pi: persistent inode; a warning is raised if EUFS_IS_HEAD_PI() is false
 *
 * Looks the inode up through iget_locked(). An inode that is not I_NEW is
 * returned as is; a new one is filled in by eufs_read_pinode() and unlocked.
 *
 * Returns the inode, ERR_PTR(-ENOMEM) if iget_locked() fails, or the
 * ERR_PTR-encoded error from eufs_read_pinode().
 */
struct inode *eufs_iget(struct super_block *sb, struct eufs_inode *pi)
{
	struct inode *inode;
	int err;

	WARN_ON(!EUFS_IS_HEAD_PI(pi));

	inode = iget_locked(sb, eufs_pi2ino(sb, pi));
	if (unlikely(!inode))
		return ERR_PTR(-ENOMEM);

	if (inode->i_state & I_NEW) {
		/* Freshly allocated: populate it from the persistent copy. */
		err = eufs_read_pinode(inode, pi);
		if (unlikely(err)) {
			iget_failed(inode);
			return ERR_PTR(err);
		}
		unlock_new_inode(inode);
	}

	return inode;
}
/*
 * eufs_evict_inode - release an inode that is leaving the inode cache.
 * @inode: inode being evicted
 *
 * For a good inode with a zero link count, the per-type persistent storage
 * and the persistent inode itself are freed. For a good inode that is still
 * linked, the in-memory state is written back with eufs_sync_pinode().
 * Bad inodes skip both steps. The page cache is truncated and the inode
 * cleared in every case.
 */
void eufs_evict_inode(struct inode *inode)
{
	struct eufs_inode_info *vi = EUFS_I(inode);
	struct eufs_inode *pi = EUFS_PI(inode);
	struct super_block *sb = inode->i_sb;

	eufs_dbg(
		"Evicting: inode=%px, pi=%px i_nlink=%u inode->i_size=%lld blocks=%lld\n",
		inode, pi, inode->i_nlink, inode->i_size,
		vi->i_volatile_tree_blocks);

	if (!is_bad_inode(inode)) {
		if (inode->i_nlink) {
			/* Still linked: just persist the current state. */
			eufs_sync_pinode(inode, pi, true);
		} else {
			/* Free the inode */
			struct eufs_inode *fresh_pi = EUFS_FRESH_PI(pi);

			switch (inode->i_mode & S_IFMT) {
			case S_IFREG:
				/* Traverse the B-tree! */
				eufs_free_btree(sb, vi->i_volatile_root,
						vi->i_volatile_height,
						vi->i_volatile_tree_blocks);
				break;
			case S_IFDIR:
				/* Directory can be removed only if the dict is empty */
				NV_ASSERT(!vi->i_volatile_root);
				nv_free(sb, o2p(sb, eufs_iread_dict(fresh_pi)));
				break;
			case S_IFLNK:
				NV_ASSERT(!vi->i_volatile_root);
				nv_free(sb, o2p(sb, eufs_iread_root(fresh_pi)));
				break;
			default:
				break;
			}

			/* Clear link count and mode in the persistent copy, then free it. */
			eufs_iwrite_nlink(fresh_pi, 0);
			eufs_iwrite_mode(fresh_pi, 0);
			eufs_flush_cacheline(fresh_pi);
			WARN_ON(!EUFS_IS_HEAD_PI(pi));
			nv_free(sb, pi);
		}

		if (vi->i_volatile_dict) {
			eufs_free_page(vi->i_volatile_dict);
			vi->i_volatile_dict = NULL;
		}
	}

	truncate_inode_pages_final(&inode->i_data);
	clear_inode(inode);
}
/*
 * eufs_write_inode - write an inode's in-memory state to its persistent inode.
 * @inode: inode to write back
 * @wbc:   writeback control; not used by this function
 *
 * Takes the inode lock around eufs_sync_pinode(). Always returns 0.
 */
int eufs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
	struct eufs_inode *pi;

	inode_lock(inode);
	pi = EUFS_PI(inode);
	eufs_sync_pinode(inode, pi, false);
	inode_unlock(inode);

	return 0;
}
/*
 * eufs_notify_change - apply an attribute change to an inode.
 * @dentry: dentry of the inode being changed
 * @attr:   requested attributes
 *
 * After setattr_prepare() accepts the request, a size change is handled
 * first under vi->mmap_rwsem: the B-tree is extended and the new range
 * zeroed when growing, the size is set, the tree is shrunk when shrinking,
 * and the tail of the last page beyond the new EOF is zeroed. The remaining
 * attributes are then copied and persistence is requested.
 *
 * Returns 0 on success, -EACCES if the inode has no persistent inode, or
 * the error from setattr_prepare() / eufs_extend_btree().
 */
int eufs_notify_change(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	struct eufs_inode *pi = EUFS_PI(inode);
	unsigned int ia_valid = attr->ia_valid;
	struct eufs_inode_info *vi;
	bool shrink;
	int ret;

	if (!pi)
		return -EACCES;

	ret = setattr_prepare(dentry, attr);
	if (ret)
		return ret;

	/* Nothing size-related to do unless the size is really changing. */
	if (!(ia_valid & ATTR_SIZE) || attr->ia_size == inode->i_size)
		goto copy_attrs;

	vi = EUFS_I(inode);

	eufs_dbg(
		"notify change (size): vi=%px inode=%px, pi=%px (%lld), %lld to %lld\n",
		vi, inode, pi, eufs_iread_size(pi), inode->i_size,
		attr->ia_size);

	down_write(&vi->mmap_rwsem);

	shrink = attr->ia_size < inode->i_size;

	if (attr->ia_size > inode->i_size) {
		unsigned long num_blocks =
			DIV_ROUND_UP(attr->ia_size, PAGE_SIZE);

		/* make sure the file has enough pages allocated */
		ret = eufs_extend_btree(inode, num_blocks);
		if (ret < 0) {
			up_write(&vi->mmap_rwsem);
			return ret;
		}

		/* zeroing the extended range [i_size, ia_size) */
		eufs_inode_zero_range(inode, inode->i_size, attr->ia_size);
	}

	truncate_setsize(inode, attr->ia_size);
	/* A size change also forces ctime and mtime to be updated. */
	attr->ia_valid = ia_valid | (ATTR_CTIME | ATTR_MTIME);

	if (shrink)
		eufs_shrink_btree(inode);

	/* zeroing the part beyond the new EOF [ia_size, PAGE_ALIGN(ia_size)) */
	eufs_inode_zero_range(inode, attr->ia_size,
			      PAGE_ALIGN(attr->ia_size));

	up_write(&vi->mmap_rwsem);

copy_attrs:
	eufs_dbg("notify change: inode=%px, pi=%px, imode=%x to imode=%x\n",
		 inode, pi, inode->i_mode, attr->ia_mode);

	setattr_copy(inode, attr);
	request_persistence(inode);

	return 0;
}
/*
 * eufs_file_getattr - fill in stat information for an inode.
 * @path:         path whose dentry identifies the inode
 * @stat:         output structure
 * @request_mask: not used by this function
 * @query_flags:  not used by this function
 *
 * Reports the append-only and immutable attributes derived from the inode
 * flags, advertises both in attributes_mask, and lets generic_fillattr()
 * fill in the rest. Always returns 0.
 */
int eufs_file_getattr(const struct path *path, struct kstat *stat,
		      u32 request_mask, unsigned int query_flags)
{
	struct inode *inode = d_inode(path->dentry);
	struct eufs_inode *pi = EUFS_FRESH_PI(EUFS_PI(inode));
	unsigned int visible;

	/* Only user-visible flag bits are considered. */
	visible = eufs_get_inode_flags(inode, pi);
	visible &= FS_FL_USER_VISIBLE;

	if (visible & FS_APPEND_FL)
		stat->attributes |= STATX_ATTR_APPEND;
	if (visible & FS_IMMUTABLE_FL)
		stat->attributes |= STATX_ATTR_IMMUTABLE;

	stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE);

	generic_fillattr(inode, stat);

	return 0;
}
/*
 * Translate FS_*_FL bits in @flags into the matching S_* bits of
 * inode->i_flags. The five translated S_* bits are cleared first, so bits
 * absent from @flags end up cleared. S_DAX is always set.
 */
void eufs_set_inode_flags(struct inode *inode, unsigned int flags)
{
	unsigned int s_bits = S_DAX;

	s_bits |= (flags & FS_SYNC_FL) ? S_SYNC : 0;
	s_bits |= (flags & FS_APPEND_FL) ? S_APPEND : 0;
	s_bits |= (flags & FS_IMMUTABLE_FL) ? S_IMMUTABLE : 0;
	s_bits |= (flags & FS_NOATIME_FL) ? S_NOATIME : 0;
	s_bits |= (flags & FS_DIRSYNC_FL) ? S_DIRSYNC : 0;

	inode->i_flags &=
		~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC);
	inode->i_flags |= s_bits;
}
/*
 * Build the FS_*_FL flag word for @inode: start from the flags stored in
 * the copy of @pi chosen by EUFS_FRESH_PI(), drop the five bits that mirror
 * S_* state, and re-derive those five from inode->i_flags.
 */
unsigned int eufs_get_inode_flags(struct inode *inode, struct eufs_inode *pi)
{
	const unsigned int mirrored = FS_SYNC_FL | FS_APPEND_FL |
				      FS_IMMUTABLE_FL | FS_NOATIME_FL |
				      FS_DIRSYNC_FL;
	unsigned int s_flags = inode->i_flags;
	unsigned int result = eufs_iread_flags(EUFS_FRESH_PI(pi));

	result &= ~mirrored;

	result |= (s_flags & S_SYNC) ? FS_SYNC_FL : 0;
	result |= (s_flags & S_APPEND) ? FS_APPEND_FL : 0;
	result |= (s_flags & S_IMMUTABLE) ? FS_IMMUTABLE_FL : 0;
	result |= (s_flags & S_NOATIME) ? FS_NOATIME_FL : 0;
	result |= (s_flags & S_DIRSYNC) ? FS_DIRSYNC_FL : 0;

	return result;
}
/*
 * eufs_writepages - ->writepages for EulerFS mappings.
 *
 * Hands the mapping to dax_writeback_mapping_range() using the super
 * block's DAX device. Returns -EIO for anything that is not a regular
 * file, otherwise the result of dax_writeback_mapping_range().
 */
static int eufs_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	struct inode *host = mapping->host;

	/* Only for regular file */
	if (!S_ISREG(host->i_mode))
		return -EIO;

	return dax_writeback_mapping_range(mapping,
					   EUFS_SB(host->i_sb)->s_dax_dev,
					   wbc);
}
/*
 * Address-space operations for EulerFS inodes; eufs_read_pinode() installs
 * this table on the inode's mapping. Only ->writepages is provided.
 */
const struct address_space_operations eufs_aops = {
.writepages = eufs_writepages,
};
/*
 * pre_inodes_get - allocate and initialise a new inode under @dir.
 * @dentry:  not used by this function
 * @dir:     parent directory; supplies the super block, ownership defaults
 *           (via inode_init_owner()) and the inherited inode flags
 * @mode:    file type and permission bits of the new inode
 * @special: when true, init_special_inode() is called with @rdev
 * @rdev:    device number for special inodes
 *
 * Allocates the VFS inode and a persistent inode and, for regular files,
 * symlinks and directories, one pre-allocated page (file data page,
 * symlink page or hash table respectively). The persistent inode receives
 * its root/dict/rdev, mode, size 0 and the parent's flags, and the inode is
 * inserted with insert_inode_locked().
 *
 * Returns the new inode on success. On failure returns ERR_PTR(-ENOMEM) if
 * the VFS inode cannot be allocated, ERR_PTR(-ENOSPC) if a persistent
 * allocation fails, or the ERR_PTR-encoded insert_inode_locked() error;
 * everything allocated here is released before returning.
 */
struct inode *pre_inodes_get(struct dentry *dentry, struct inode *dir,
			     umode_t mode, bool special, dev_t rdev)
{
	struct inode *inode = NULL;
	struct eufs_inode __pmem *pi = NULL;
	struct super_block *sb = dir->i_sb;
	struct eufs_sb_info *sbi = EUFS_SB(sb);
	struct eufs_inode __pmem *dir_pi = EUFS_FRESH_PI(EUFS_PI(dir));
	struct eufs_inode_info *vi;
	void *pre_page = NULL;
	int err;
	u64 blocks;

	NV_ASSERT(dir_pi);

	/*
	 * new_inode() reports failure with NULL, never with an ERR_PTR, so
	 * an IS_ERR() test would let a NULL inode through to EUFS_I() below.
	 */
	inode = new_inode(sb);
	if (!inode)
		return ERR_PTR(-ENOMEM);

	vi = EUFS_I(inode);
	vi->i_volatile_dict = NULL;
	BUG_ON(inode->i_nlink != 1);
	inode->i_size = 0;
	vi->i_ext = 0;
	vi->i_dotdot = 0;
	vi->i_version = 1;

	pi = eufs_malloc_pinode(sb);
	if (!pi)
		goto no_space_err;

	/* Freshness 0 on both copies marks the persistent inode as new. */
	pi->i_fresh = 0;
	EUFS_TWIN_PI(pi)->i_fresh = 0;

	/* Pre-allocate the single page each of these file types starts with. */
	blocks = 0;
	if (S_ISREG(mode)) {
		pre_page = eufs_malloc_file_data(sb);
		if (!pre_page)
			goto no_space_err;
		blocks = 1;
	} else if (S_ISLNK(mode)) {
		pre_page = eufs_zalloc_symlink(sb);
		if (!pre_page)
			goto no_space_err;
		blocks = 1;
	} else if (S_ISDIR(mode)) {
		pre_page = eufs_zalloc_htable(sb);
		if (!pre_page)
			goto no_space_err;
		blocks = 1;
	}
	inode->i_blocks = blocks << (inode->i_blkbits - 9);

	eufs_dbg("bind inode(%px) ->pi(%px)->i_ino=0x%lx, vi->trans=%d\n",
		 inode, pi, eufs_pi2ino(sb, pi), vi->i_lock_transferred);

	inode->i_ino = eufs_pi2ino(sb, pi);
	inode_init_owner(inode, dir, mode);
	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
	inode->i_generation = atomic_add_return(1, &sbi->next_generation);

	if (special)
		init_special_inode(inode, mode, rdev);

	/* Start from a poison root; the per-type code below overwrites it. */
	eufs_iwrite_root(pi, EUFS_POISON_VALUE);
	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
		eufs_iwrite_rdev(pi, inode->i_rdev);
	} else {
		vi->i_volatile_height = 0;
		if (S_ISREG(inode->i_mode)) {
			vi->i_volatile_tree_blocks = 1;
			eufs_iwrite_tree_blocks(pi, 0);
			vi->i_volatile_root = pre_page;
			/* 0th block is treated as a hole until allocated. */
			vi->hole_at_sta = true;
			eufs_iwrite_root(
				pi, encode_root(p2o(sb, vi->i_volatile_root),
						vi->i_volatile_height));
			eufs_alloc_batch_init(&vi->page_batch, 2);
			eufs_alloc_batch_add(sb, &vi->page_batch,
					     vi->i_volatile_root);
		} else if (S_ISDIR(inode->i_mode)) {
			vi->i_volatile_dict = NULL;
			eufs_iwrite_dict(pi, p2o(sb, pre_page));
			/* allocation persisted in do_dep_diradd */
		} else if (S_ISLNK(inode->i_mode)) {
			eufs_iwrite_root(pi, p2o(sb, pre_page));
			/* allocation persisted in do_dep_diradd */
		}
	}

	eufs_iwrite_mode(pi, inode->i_mode);
	eufs_iwrite_size(pi, 0);

	eufs_dbg(
		"alloc inode=%px pi=%px pi->root=0x%llx pi->i_mode=0%o on cpu %d\n",
		inode, pi, eufs_iread_root(pi), eufs_iread_mode(pi),
		smp_processor_id());

	/*
	 * Inherit the parent's flags.
	 * NOTE(review): dir_pi->i_flags is read directly here while the rest
	 * of the file goes through eufs_iread_flags() - confirm the raw read
	 * is intended.
	 */
	eufs_iwrite_flags(pi, dir_pi->i_flags);
	eufs_set_inode_flags(inode, eufs_iread_flags(pi));

	err = insert_inode_locked(inode);
	if (err) {
		eufs_err(sb, "eufs_new_inode failed ino 0x%lx err %d\n",
			 inode->i_ino, err);
		goto out;
	}

	return inode;

no_space_err:
	err = -ENOSPC;
out:
	if (pre_page)
		nv_free(sb, pre_page);
	if (pi)
		nv_free(sb, pi);
	if (inode) {
		/* Mark the inode bad before dropping the last reference. */
		make_bad_inode(inode);
		inode->i_ino = 0;
		iput(inode);
	}
	return ERR_PTR(err);
}
/*
 * eufs_inode_size_write - set the in-memory file size.
 * @inode:    inode whose size is updated
 * @new_size: new size in bytes
 *
 * Stores @new_size with i_size_write() and then calls request_persistence()
 * for the inode.
 * NOTE(review): request_persistence() is defined elsewhere; presumably it
 * queues the inode for write-back to persistent memory - confirm.
 */
void eufs_inode_size_write(struct inode *inode, loff_t new_size)
{
i_size_write(inode, new_size);
request_persistence(inode);
}