openEuler/eulerfs, dep.c
Yu Kuai, 2021-11-30: "eulerfs: initial import"
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved.
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
* only version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/ratelimit.h>
#include <linux/writeback.h>
#include "euler.h"
#include "dep.h"
#include "lock.h"
#include "dax.h"
#include "dht.h"
static void do_dep_diradd_oneshot(struct inode *dir_inode, struct dep_node *dep,
u64 *bitset);
struct flush_list_head {
int count;
struct llist_head head;
};
DEFINE_PER_CPU(struct flush_list_head, flush_list_percpu);
#define IFMT_HAS_ROOT(ifmt) \
((ifmt) == S_IFREG || (ifmt) == S_IFDIR || (ifmt) == S_IFLNK)
#define INODE_COND_TRYLOCK(inode, tag, enter_cond, exit_cond, exit_expr) \
do { \
tag: \
if (enter_cond) { \
if (likely(inode_trylock(inode))) { \
/* get the lock, okay */ \
} else { \
if (exit_cond) { \
exit_expr; \
} else { \
cond_resched(); \
goto tag; \
} \
} \
} \
} while (0)
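/*
* Note: INODE_COND_TRYLOCK is not used elsewhere in this file. A
* hypothetical use (the condition names below are made up purely for
* illustration) might look like:
*
*	INODE_COND_TRYLOCK(inode, again, need_lock, must_bail, goto out);
*
* i.e. while need_lock holds, keep retrying inode_trylock() with a
* cond_resched() between attempts, and leave through the exit
* expression (here a goto) once must_bail becomes true.
*/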
static inline void fsync_dir_oneshot(struct inode *dir)
{
eufs_dir_fsync_oneshot(dir);
}
static void do_dep_dirrem(struct inode *inode, struct dep_node *dep,
u64 *bitset)
{
struct nv_dict_entry *prevde = dep->prevde;
struct nv_dict_entry *de = dep->de;
int idx;
eufs_dbg("!! %s !!", __func__);
NV_ASSERT(de);
NV_ASSERT(de->inode);
NV_ASSERT(de->name);
idx = INDEX(de->hv);
bitset[idx / 64] = bitset[idx / 64] | (0x1ull << (idx & 63));
eufs_dbg("bitset-add: dict=%llx, %d %llx\n",
eufs_iread_dict(EUFS_PI(inode)), idx, bitset[idx / 64]);
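/*
* Worked example: a dentry hashing to bucket idx = 200 sets bit
* 200 & 63 = 8 of word 200 / 64 = 3, i.e. bitset[3] |= 1ULL << 8.
* The 8 * 64 = 512 bits cover the whole bucket table, matching the
* 8 x 64 iteration in eufs_sync_buckets().
*/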
/*
* This is the removal of a newly created dentry: nothing to do here,
* since prevde has already been adjusted in dht.c.
*/
if (de->volatile_next == EUFS_DIR_DELNEW)
return;
/*
* If the dentries immediately following the deleted dentry are also
* deleted, prevde->volatile_next will be modified again. So if we
* assigned prevde->volatile_next to prevde->next, those deletions
* would be persisted prematurely.
*/
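/*
* Worked example: with a persisted chain A -> B -> C in one bucket,
* removing B gives prevde = A and de = B; A->next is pointed at C and
* A is flushed, so the persistent chain skips B. If A itself still
* carries the not-persist flag, it was created in the same batch and
* will presumably be written out with the correct next pointer by its
* own diradd handling, so nothing needs to be fixed up here.
*/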
if (prevde && !eufs_dentry_is_not_persist(prevde)) {
prevde->next = de->next;
persist_dentry(prevde);
}
}
static void do_dep_dirrem_reclaim(struct super_block *sb, struct dep_node *dep)
{
struct nv_dict_entry *de = dep->de;
struct eufs_inode __maybe_unused *pi;
struct inode *child;
pi = s2p(sb, de->inode);
child = dep->inode;
NV_ASSERT(EUFS_PI(child) == pi);
eufs_dbg("dirrem: child_inode=%px\n", child);
BUG_ON(!child);
eufs_free_name(sb, de);
nv_free(sb, de);
}
#define EUFS_PRINT_BITSET(lvl, bitset) \
eufs_##lvl("bitsets: %llx %llx %llx %llx %llx %llx %llx %llx\n", \
bitset[0], bitset[1], bitset[2], bitset[3], bitset[4], \
bitset[5], bitset[6], bitset[7])
static void eufs_sync_buckets(struct eufs_inode_info *vi, u64 bitset[8])
{
struct inode *inode = &vi->vfs_inode;
struct super_block *sb = inode->i_sb;
struct eufs_inode *pi = EUFS_FRESH_PI(EUFS_PI(inode));
struct nv_dict *dict;
int i;
/* Volatile buckets */
if (!vi->i_volatile_dict)
return;
EUFS_PRINT_BITSET(dbg, bitset);
BUG_ON(!inode_is_header_locked(inode));
dict = o2p(sb, eufs_iread_dict(pi));
for (i = 0; i < 8; ++i) {
int j;
bool dirty;
int idx;
if (!bitset[i])
continue;
dirty = false;
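/*
* Scan the 64 buckets covered by word i. Assuming 8-byte table slots
* and 64-byte cachelines, eight adjacent slots share one cacheline,
* so the flush in the loop below fires at most once per group of
* eight: whenever j crosses a multiple of 8 and the previous group
* dirtied something. The extra iteration at j == 64 flushes the
* trailing group.
*/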
for (j = 0; j <= 64; ++j) {
if (j % 8 == 0 && dirty) {
dirty = false;
eufs_flush_cacheline(&dict->table[idx]);
}
if (j == 64)
break;
if (!(bitset[i] & (0x1ull << j)))
continue;
idx = i * 64 + j;
eufs_dbg_dir("handle index %d (i %d, j %d) of inode=%px\n",
idx, i, j, inode);
eufs_dbg_dir(" idx=%d dict[idx]=%px vdict[idx]=%px\n",
idx, dict->table[idx],
vi->i_volatile_dict->table[idx]);
if (unlikely(vi->i_volatile_dict->table[idx] ==
EUFS_DIR_EOC_PTR))
dict->table[idx] = NULL_VAL;
else if (vi->i_volatile_dict->table[idx] != NULL)
dict->table[idx] = COMPOSE_DICT_HEAD_le64(
sb, vi->i_volatile_dict->table[idx]);
vi->i_volatile_dict->table[idx] = NULL;
dirty = true;
}
}
}
/*
* Some ideas on fast fsync (of dir):
*
* 1. Batching and coalescing. A newly inserted dentry should be marked,
* and on its removal it should be marked again, so that an unnecessary
* dep_diradd can be avoided.
*
* 2. Split! The lock (only when a single lock is needed) can be
* temporarily given up between handling two deps. This requires that the
* dentry pointed to by dir_pi not be reclaimed in the meantime (as in
* RCU). Actually, combined with the following idea, this is quite
* acceptable.
*
* 3. Delayed free. The removal operations can be delayed until the locks
* are released.
*
*
* Parallel fsync for a single vi has not been thoroughly considered,
* though.
*
* 4. Detach only if the list is empty?
*/
static void fsync_rename_inode(struct inode *dir)
{
struct eufs_inode_info *vi = EUFS_I(dir);
if (!vi->i_is_dirty)
return;
/* I'm holding the lock, so if it's dirty, it's dirty. */
fsync_dir_oneshot(dir);
}
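/*
* Lock-transfer handshake: the rename caller holds the inode locks of
* up to EUFS_INODE_CNT_IN_RENAME inodes. Marking an inode
* lock-transferable lets a concurrent do_dep_diradd_oneshot() take the
* lock over via eufs_inode_mark_lock_transferring() when its trylock
* fails, instead of spinning; waiting for the transfer to finish below
* ensures no such takeover is still in flight when we return. (The
* primitives are presumably provided by lock.h.)
*/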
void fsync_rename_inodes(struct inode *old_dir, struct inode *new_dir,
struct inode **locked_inodes)
{
int i;
struct inode *inode;
/*
* The two parent dirs might have had a parent-child relationship at
* some point before, so we need to make these two dirs transferable
* as well.
*/
for (i = 0; i < EUFS_INODE_CNT_IN_RENAME; i++) {
inode = locked_inodes[i];
if (inode)
eufs_inode_mark_lock_transferable(inode);
}
if (old_dir == new_dir) {
fsync_rename_inode(old_dir);
} else {
fsync_rename_inode(old_dir);
fsync_rename_inode(new_dir);
}
for (i = 0; i < EUFS_INODE_CNT_IN_RENAME; i++) {
inode = locked_inodes[i];
if (inode)
eufs_inode_wait_lock_transfer_done(inode);
}
}
static void eufs_update_persisted_seq(struct eufs_inode_info *vi,
struct list_head *head)
{
if (!list_empty(head)) {
struct dep_node *dep =
list_last_entry(head, struct dep_node, node);
vi->i_persisted_dep_seq = dep->seq;
}
}
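/*
* Background fsync of a directory in three phases:
* Phase 1: repeatedly cut the dep list and apply each dep (diradd or
* dirrem) to the persistent dentries, taking the header lock only on
* the last round.
* Phase 2: write the dirtied hash buckets back via eufs_sync_buckets().
* Phase 3: persist the inode itself, retrying from the top if new deps
* showed up in the meantime.
* Freed dentries and dep nodes are reclaimed after all locks are
* dropped.
*/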
static int fsync_dir_bg(struct inode *dir)
{
struct dep_node *dep, *next;
LIST_HEAD(detached_list);
LIST_HEAD(dump_list);
int i;
#define FSYNC_DIR_VI_LOOP_NUM (20)
struct eufs_inode_info *vi = EUFS_I(dir);
struct super_block *sb = dir->i_sb;
struct eufs_sb_info *sbi = EUFS_SB(sb);
struct eufs_inode *pi = EUFS_PI(dir);
u64 bitset[8] = { 0 };
int dep_count = 0;
retry:
inode_urgent_lock(dir);
/* Phase 1 */
for (i = FSYNC_DIR_VI_LOOP_NUM; i >= 0; --i) {
/* Get all deps round by round */
if (i == 0) {
/* Last round */
inode_header_lock(dir);
}
inode_dep_lock(dir);
if (list_empty(&vi->i_dep_list) && i > 0) {
/* Skip to last round */
i = 1;
}
list_cut_position(&detached_list, &vi->i_dep_list,
vi->i_dep_list.prev);
if (i > 0)
inode_dep_unlock(dir);
/* Handle the deps one by one. */
list_for_each_entry_safe(dep, next, &detached_list, node) {
if (dep->type == DEP_DIRADD) {
/*
* FIXME: the lockset might be different since
* we might have released the inode lock.
*/
do_dep_diradd_oneshot(dir, dep, bitset);
} else if (dep->type == DEP_DIRREM) {
do_dep_dirrem(dir, dep, bitset);
} else
BUG();
}
list_splice_tail_init(&detached_list, &dump_list);
if (i == 0) {
eufs_pbarrier();
if (!list_empty(&dump_list))
/* Phase 2 */
eufs_sync_buckets(vi, bitset);
inode_dep_unlock(dir);
inode_header_unlock(dir);
break;
}
}
inode_urgent_unlock(dir);
/* Phase 3 */
inode_lock(dir);
if (!list_empty(&vi->i_dep_list)) {
inode_unlock(dir);
/* Handle new deps that arrived between phases 2 and 3 */
/* FIXME: livelock possible! */
goto retry;
}
if (dir->i_nlink)
eufs_sync_pinode(dir, pi, false);
eufs_update_persisted_seq(vi, &dump_list);
vi->i_is_persisting = false;
vi->i_is_dirty = false;
if (dir->i_nlink)
persist_pinode(pi);
inode_unlock(dir);
eufs_pbarrier();
/* Reclaim memory and clear the list */
list_for_each_entry_safe(dep, next, &dump_list, node) {
struct inode *child_inode = dep->inode;
struct eufs_inode_info *child_vi = EUFS_I(child_inode);
if (dep->type == DEP_DIRREM)
do_dep_dirrem_reclaim(sb, dep);
/* remove from owner list */
spin_lock(&child_vi->i_owner_lock);
list_del_init(&dep->owner_node);
spin_unlock(&child_vi->i_owner_lock);
iput(child_inode);
list_del(&dep->node);
eufs_free_dep_node(dep);
dep_count++;
}
atomic_sub(dep_count, &sbi->s_nr_dep_nodes);
eufs_dbg("@cpu=%d !! fsync dir vi done: inode=%px\n",
smp_processor_id(), &vi->vfs_inode);
return 0;
}
static int fsync_nondir_oneshot(struct inode *inode)
{
struct eufs_inode_info *vi = EUFS_I(inode);
struct eufs_inode *pi;
/* For files other than dir */
WARN(S_ISDIR(inode->i_mode), "%s on a dir!", __func__);
/* The inode is being removed (nlink == 0); nothing to do. */
if (!inode->i_nlink) {
vi->i_is_dirty = false;
return 0;
}
pi = EUFS_PI(inode);
eufs_sync_pinode(inode, pi, false);
persist_pinode(pi);
vi->i_is_dirty = false;
return 0;
}
static int fsync_nondir_bg(struct inode *inode)
{
struct eufs_inode_info *vi = EUFS_I(inode);
int r;
inode_lock(inode);
r = fsync_nondir_oneshot(inode);
vi->i_is_persisting = false;
inode_unlock(inode);
return r;
}
static void fsync_bg(struct inode *inode)
{
struct eufs_sb_info *sbi = EUFS_SB(inode->i_sb);
wait_on_inode(inode);
/* Reading i_mode should need no locking here */
if (S_ISDIR(inode->i_mode))
fsync_dir_bg(inode);
else
fsync_nondir_bg(inode);
/* Drop the inode reference and decrease the dirty-inode count */
iput(inode);
if (atomic_dec_and_test(&sbi->s_nr_dirty_inodes) && sbi->s_draining) {
/* end of draining */
sbi->s_draining = false;
}
}
void fsync_oneshot(struct inode *inode)
{
/* Reading i_mode should need no locking here */
if (S_ISDIR(inode->i_mode))
fsync_dir_oneshot(inode);
else
fsync_nondir_oneshot(inode);
}
static void do_dep_diradd_oneshot(struct inode *dir_inode, struct dep_node *dep,
u64 *bitset)
{
struct super_block *sb = dir_inode->i_sb;
struct nv_dict_entry *de = dep->de;
struct inode *inode = dep->inode;
struct eufs_inode_info *dir_vi = EUFS_I(dir_inode);
struct eufs_inode *pi;
struct eufs_inode *fresh_pi;
int idx;
void *buffer[16];
struct alloc_batch ab;
bool lock_transferred = false;
idx = INDEX(de->hv);
bitset[idx / 64] = bitset[idx / 64] | (0x1ull << (idx & 63));
if (de->volatile_next == EUFS_DIR_DELNEW) {
/*
* The de is already invisible from both the latest view and
* the consistent view.
* Will be handled in the corresponding dirrem.
*/
return;
}
/* Meow? This equality is the sign of diradd */
WARN(!eufs_dentry_is_not_persist(de), "diradd wrong sign");
pi = s2p(sb, de->inode);
wait_on_inode(inode);
retry:
if (likely(inode_trylock(inode))) {
/* Got the lock */
} else {
if (eufs_inode_mark_lock_transferring(inode)) {
lock_transferred = true;
} else {
cond_resched();
goto retry;
}
}
eufs_sync_pinode(inode, pi, false);
fresh_pi = EUFS_FRESH_PI(pi);
if (!lock_transferred)
inode_unlock(inode);
else
eufs_inode_lock_transfer_done(inode);
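/*
* Record the dentry (and, via persist_name(), presumably its name
* blocks) in a small on-stack allocation batch of at most 16 entries,
* so that their allocator metadata can be persisted in one go by
* eufs_alloc_batch_persist_reset() below.
*/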
ab.n_used = 0;
ab.size = 16;
ab.batch = buffer;
eufs_alloc_batch_add(sb, &ab, de);
/*
* Force the allocation to be persisted without checking.
* TODO: we should differentiate the link and create syscalls to agree
* with the check.
*/
eufs_alloc_persist(sb, pi, true);
if (S_ISLNK(fresh_pi->i_mode)) {
void *root = o2p(sb, eufs_iread_root(fresh_pi));
/* A regular file's root is handled in the btree code */
/* In the case of a hard link, we must force persistence of the allocation */
eufs_alloc_persist(sb, root, true);
persist_symlink(root);
} else if (S_ISDIR(fresh_pi->i_mode)) {
void *root = o2p(sb, eufs_iread_root(fresh_pi));
eufs_alloc_persist(sb, root, false);
persist_page(root);
}
persist_name(sb, de, &ab);
eufs_alloc_batch_persist_reset(sb, &ab);
persist_pinode(pi);
spin_lock(&dir_vi->i_dentry_persist_lock);
eufs_dentry_clr_not_persist_flag(de);
spin_unlock(&dir_vi->i_dentry_persist_lock);
persist_dentry(de);
}
void eufs_dir_fsync_oneshot(struct inode *dir)
{
struct dep_node *dep;
struct dep_node *next;
struct super_block *sb = dir->i_sb;
struct eufs_sb_info *sbi = EUFS_SB(sb);
struct eufs_inode_info *vi = EUFS_I(dir);
LIST_HEAD(detached_list);
u64 bitset[8] = { 0 };
int dep_count = 0;
BUG_ON(!inode_is_locked(dir));
inode_urgent_lock(dir);
/* get all deps */
inode_header_lock(dir);
inode_dep_lock(dir);
if (list_empty(&vi->i_dep_list))
goto unlock_sync_pinode;
list_for_each_entry(dep, &vi->i_dep_list, node) {
if (dep->type == DEP_DIRADD)
do_dep_diradd_oneshot(dir, dep, bitset);
else if (dep->type == DEP_DIRREM)
do_dep_dirrem(dir, dep, bitset);
else
BUG();
}
list_splice_init(&vi->i_dep_list, &detached_list);
/* sync buckets */
eufs_pbarrier();
eufs_sync_buckets(vi, bitset);
unlock_sync_pinode:
inode_dep_unlock(dir);
inode_header_unlock(dir);
/* sync pinode */
if (dir->i_nlink)
eufs_sync_pinode(dir, EUFS_PI(dir), false);
eufs_pbarrier();
eufs_update_persisted_seq(vi, &detached_list);
vi->i_is_dirty = false;
/* Reclaim memory and clear the list */
list_for_each_entry_safe(dep, next, &detached_list, node) {
struct inode *child_inode = dep->inode;
struct eufs_inode_info *child_vinode = EUFS_I(child_inode);
spin_lock(&child_vinode->i_owner_lock);
list_del_init(&dep->owner_node);
spin_unlock(&child_vinode->i_owner_lock);
if (dep->type == DEP_DIRREM) {
do_dep_dirrem_reclaim(sb, dep);
iput(dep->inode);
} else if (dep->type == DEP_DIRADD) {
iput(dep->inode);
}
list_del(&dep->node);
eufs_free_dep_node(dep);
dep_count++;
}
atomic_sub(dep_count, &sbi->s_nr_dep_nodes);
inode_urgent_unlock(dir);
}
void fsync_on_draining(struct inode *dir, struct inode *inode)
{
BUG_ON(!dir);
BUG_ON(!inode_is_locked(dir));
BUG_ON(inode && !inode_is_locked(inode));
/* for link/unlink/rmdir */
if (inode)
eufs_inode_mark_lock_transferable(inode);
fsync_dir_oneshot(dir);
if (inode)
eufs_inode_wait_lock_transfer_done(inode);
}
#define NR_FLUSH_EACH_ROUND (16)
#define FLUSH_START_THRESHOLD (64)
static __always_inline int handle_persistees_for_each_cpu(
struct super_block *sb, const struct cpumask *mask, int idx) {
struct eufs_sb_info *sbi = EUFS_SB(sb);
struct llist_node *list;
struct llist_head *head;
struct eufs_inode_info *vi;
struct eufs_inode_info *next;
int n_active_list;
int cpu;
bool need;
retry:
need = sbi->need_sync[idx];
n_active_list = 0;
for_each_cpu(cpu, mask) {
head = per_cpu_ptr(sbi->persistee_list, cpu);
if (unlikely(llist_empty(head)))
continue;
n_active_list++;
list = llist_del_all(head);
eufs_dbg("persister get list %px for cpu%d\n", list, cpu);
/* reverse the ordering for better locality? */
llist_for_each_entry_safe(vi, next, list, i_persistee_node)
fsync_bg(&vi->vfs_inode);
eufs_dbg("persister handled list %px\n", list);
}
/*
* We need one complete round of processing for fssync. If
* need != sbi->need_sync[idx], need_sync was modified during our last
* round, so we retry to ensure a complete round of processing.
* It is okay if a CPU's dirty inodes are still being processed by
* another persister, since fssync waits for all persisters to finish.
*/
if (need != READ_ONCE(sbi->need_sync[idx]))
goto retry;
if (need) {
sbi->need_sync[idx] = false;
wake_up(&sbi->sync_wq);
}
if (READ_ONCE(sbi->need_sync[idx]))
goto retry;
return n_active_list;
}
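/*
* The fssync side (not in this file) presumably sets need_sync[idx]
* for every persister and then sleeps on sbi->sync_wq until all the
* flags are cleared again, roughly:
*
*	for (i = 0; i < nr_persisters; i++)
*		sbi->need_sync[i] = true;
*	wait_event(sbi->sync_wq, all_need_sync_cleared(sbi));
*
* where nr_persisters and all_need_sync_cleared() are hypothetical
* names used only for illustration.
*/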
static int persister(void *data)
{
struct super_block *sb = data;
struct eufs_sb_info *sbi = EUFS_SB(sb);
const struct cpumask *mask = cpumask_of_node(numa_node_id());
const int period =
(persist_period == 0) ? /* default */ (HZ / 4) :
/* less than a second */
((persist_period < 0) ? (HZ / (-persist_period)) :
/* more than a second */
(HZ * persist_period));
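/*
* Examples (assuming HZ = 1000): persist_period == 0 gives the default
* HZ / 4 = 250 ms; persist_period == -10 gives HZ / 10 = 100 ms;
* persist_period == 2 gives 2 * HZ = 2 s.
*/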
int idx = 0;
int num_persisters = num_sockets * persisters_per_socket;
eufs_info("sb=%px cpu=%d cpumask=%*pbl period=%d\n", data,
smp_processor_id(), cpumask_pr_args(mask), period);
while (idx < num_persisters && sbi->persisters[idx] != current)
idx++;
BUG_ON(idx >= num_persisters);
while (!kthread_should_stop()) {
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(period);
handle_persistees_for_each_cpu(sb, mask, idx);
}
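/*
* Final drain after kthread_should_stop(): keep flushing until every
* per-CPU persistee list on this node is empty, so that no dirty inode
* is left behind when the persister exits.
*/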
while (handle_persistees_for_each_cpu(sb, mask, idx))
cpu_relax();
eufs_info("finalizing on %d\n", smp_processor_id());
return 0;
}
int dep_init(struct super_block *sb)
{
struct eufs_sb_info *sbi = EUFS_SB(sb);
int cpu;
int i, j;
char name[BDEVNAME_SIZE];
int err;
sbi->persistee_list = alloc_percpu(struct llist_head);
if (!sbi->persistee_list) {
err = -ENOMEM;
goto cleanup;
}
/* init each llist */
for_each_possible_cpu(cpu)
init_llist_head(per_cpu_ptr(sbi->persistee_list, cpu));
sbi->persisters = kmalloc(sizeof(struct task_struct *) *
persisters_per_socket * num_sockets,
GFP_KERNEL);
if (!sbi->persisters) {
err = -ENOMEM;
goto cleanup;
}
sbi->need_sync = kzalloc(
sizeof(bool) * persisters_per_socket * num_sockets, GFP_KERNEL);
if (!sbi->need_sync) {
err = -ENOMEM;
goto cleanup;
}
init_waitqueue_head(&sbi->sync_wq);
bdevname(sb->s_bdev, name);
for (i = 0; i < num_sockets; ++i) {
for (j = 0; j < persisters_per_socket; ++j) {
int idx = i * persisters_per_socket + j;
sbi->persisters[idx] = kthread_create_on_node(
persister, sb, i, "hmfs/%s-%d.%d", name, i, j);
if (IS_ERR(sbi->persisters[idx])) {
err = PTR_ERR(sbi->persisters[idx]);
pr_err("create persister %s-%d.%d error %d",
name, i, j, err);
sbi->persisters[idx] = NULL;
goto cleanup;
}
set_cpus_allowed_ptr(sbi->persisters[idx],
cpumask_of_node(i));
wake_up_process(sbi->persisters[idx]);
}
}
return 0;
cleanup:
dep_fini(sb);
return err;
}
void dep_fini(struct super_block *sb)
{
struct eufs_sb_info *sbi = EUFS_SB(sb);
if (sbi->persisters) {
int i;
for (i = 0; i < persisters_per_socket * num_sockets; ++i) {
if (sbi->persisters[i]) {
kthread_stop(sbi->persisters[i]);
sbi->persisters[i] = NULL;
}
}
kfree(sbi->persisters);
sbi->persisters = NULL;
}
kfree(sbi->need_sync);
sbi->need_sync = NULL;
free_percpu(sbi->persistee_list);
sbi->persistee_list = NULL;
}