openEuler/eulerfs, dep.c
Yu Kuai, 2021-11-30: "eulerfs: initial import"
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved.
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
* only version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/ratelimit.h>
#include <linux/writeback.h>
#include "euler.h"
#include "dep.h"
#include "lock.h"
#include "dax.h"
#include "dht.h"
static void do_dep_diradd_oneshot(struct inode *dir_inode, struct dep_node *dep,
u64 *bitset);
struct flush_list_head {
int count;
struct llist_head head;
};
DEFINE_PER_CPU(struct flush_list_head, flush_list_percpu);
#define IFMT_HAS_ROOT(ifmt) \
((ifmt) == S_IFREG || (ifmt) == S_IFDIR || (ifmt) == S_IFLNK)
#define INODE_COND_TRYLOCK(inode, tag, enter_cond, exit_cond, exit_expr) \
do { \
tag: \
if (enter_cond) { \
if (likely(inode_trylock(inode))) { \
/* get the lock, okay */ \
} else { \
if (exit_cond) { \
exit_expr; \
} else { \
cond_resched(); \
goto tag; \
} \
} \
} \
} while (0)
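/*
* Note: INODE_COND_TRYLOCK is not used elsewhere in this file. A
* hypothetical use (the condition names below are made up purely for
* illustration) might look like:
*
*	INODE_COND_TRYLOCK(inode, again, need_lock, must_bail, goto out);
*
* i.e. while need_lock holds, keep retrying inode_trylock() with a
* cond_resched() between attempts, and leave through the exit
* expression (here a goto) once must_bail becomes true.
*/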
static inline void fsync_dir_oneshot(struct inode *dir)
{
eufs_dir_fsync_oneshot(dir);
}
static void do_dep_dirrem(struct inode *inode, struct dep_node *dep,
u64 *bitset)
{
struct nv_dict_entry *prevde = dep->prevde;
struct nv_dict_entry *de = dep->de;
int idx;
eufs_dbg("!! %s !!", __func__);
NV_ASSERT(de);
NV_ASSERT(de->inode);
NV_ASSERT(de->name);
idx = INDEX(de->hv);
bitset[idx / 64] = bitset[idx / 64] | (0x1ull << (idx & 63));
eufs_dbg("bitset-add: dict=%llx, %d %llx\n",
eufs_iread_dict(EUFS_PI(inode)), idx, bitset[idx / 64]);
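/*
* Worked example: a dentry hashing to bucket idx = 200 sets bit
* 200 & 63 = 8 of word 200 / 64 = 3, i.e. bitset[3] |= 1ULL << 8.
* The 8 * 64 = 512 bits cover the whole bucket table, matching the
* 8 x 64 iteration in eufs_sync_buckets().
*/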
/*
* This is the removal of a newly created dentry: nothing to do here,
* since prevde has already been adjusted in dht.c.
*/
if (de->volatile_next == EUFS_DIR_DELNEW)
return;
/*
* If the dentries immediately following the deleted dentry are also
* deleted, prevde->volatile_next will be modified again. So if we
* assigned prevde->volatile_next to prevde->next, those deletions
* would be persisted prematurely.
*/
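/*
* Worked example: with a persisted chain A -> B -> C in one bucket,
* removing B gives prevde = A and de = B; A->next is pointed at C and
* A is flushed, so the persistent chain skips B. If A itself still
* carries the not-persist flag, it was created in the same batch and
* will presumably be written out with the correct next pointer by its
* own diradd handling, so nothing needs to be fixed up here.
*/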
if (prevde && !eufs_dentry_is_not_persist(prevde)) {
prevde->next = de->next;
persist_dentry(prevde);
}
}
static void do_dep_dirrem_reclaim(struct super_block *sb, struct dep_node *dep)
{
struct nv_dict_entry *de = dep->de;
struct eufs_inode __maybe_unused *pi;
struct inode *child;
pi = s2p(sb, de->inode);
child = dep->inode;
NV_ASSERT(EUFS_PI(child) == pi);
eufs_dbg("dirrem: child_inode=%px\n", child);
BUG_ON(!child);
eufs_free_name(sb, de);
nv_free(sb, de);
}
#define EUFS_PRINT_BITSET(lvl, bitset) \
eufs_##lvl("bitsets: %llx %llx %llx %llx %llx %llx %llx %llx\n", \
bitset[0], bitset[1], bitset[2], bitset[3], bitset[4], \
bitset[5], bitset[6], bitset[7])
static void eufs_sync_buckets(struct eufs_inode_info *vi, u64 bitset[8])
{
struct inode *inode = &vi->vfs_inode;
struct super_block *sb = inode->i_sb;
struct eufs_inode *pi = EUFS_FRESH_PI(EUFS_PI(inode));
struct nv_dict *dict;
int i;
/* Volatile buckets */
if (!vi->i_volatile_dict)
return;
EUFS_PRINT_BITSET(dbg, bitset);
BUG_ON(!inode_is_header_locked(inode));
dict = o2p(sb, eufs_iread_dict(pi));
for (i = 0; i < 8; ++i) {
int j;
bool dirty;
int idx;
if (!bitset[i])
continue;
dirty = false;
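/*
* Scan the 64 buckets covered by word i. Assuming 8-byte table slots
* and 64-byte cachelines, eight adjacent slots share one cacheline,
* so the flush in the loop below fires at most once per group of
* eight: whenever j crosses a multiple of 8 and the previous group
* dirtied something. The extra iteration at j == 64 flushes the
* trailing group.
*/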
for (j = 0; j <= 64; ++j) {
if (j % 8 == 0 && dirty) {
dirty = false;
eufs_flush_cacheline(&dict->table[idx]);
}
if (j == 64)
break;
if (!(bitset[i] & (0x1ull << j)))
continue;
idx = i * 64 + j;
eufs_dbg_dir("handle index %d (i %d, j %d) of inode=%px\n",
idx, i, j, inode);
eufs_dbg_dir(" idx=%d dict[idx]=%px vdict[idx]=%px\n",
idx, dict->table[idx],
vi->i_volatile_dict->table[idx]);
if (unlikely(vi->i_volatile_dict->table[idx] ==
EUFS_DIR_EOC_PTR))
dict->table[idx] = NULL_VAL;
else if (vi->i_volatile_dict->table[idx] != NULL)
dict->table[idx] = COMPOSE_DICT_HEAD_le64(
sb, vi->i_volatile_dict->table[idx]);
vi->i_volatile_dict->table[idx] = NULL;
dirty = true;
}
}
}
/*
* Some ideas on fast fsync (of dir):
*
* 1. Batching and coalescing. A newly inserted dentry should be marked,
* and on its removal it should be marked again, so that an unnecessary
* dep_diradd can be avoided.
*
* 2. Split! The lock (only when a single lock is needed) can be
* temporarily given up between handling two deps. This requires that the
* dentry pointed to by dir_pi not be reclaimed in the meantime (as in
* RCU). Actually, combined with the following idea, this is quite
* acceptable.
*
* 3. Delayed free. The removal operations can be delayed until the locks
* are released.
*
*
* Parallel fsync for a single vi has not been thoroughly considered,
* though.
*
* 4. Detach only if the list is empty?
*/
static void fsync_rename_inode(struct inode *dir)
{
struct eufs_inode_info *vi = EUFS_I(dir);
if (!vi->i_is_dirty)
return;
/* I'm holding the lock, so if it's dirty, it's dirty. */
fsync_dir_oneshot(dir);
}
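/*
* Lock-transfer handshake: the rename caller holds the inode locks of
* up to EUFS_INODE_CNT_IN_RENAME inodes. Marking an inode
* lock-transferable lets a concurrent do_dep_diradd_oneshot() take the
* lock over via eufs_inode_mark_lock_transferring() when its trylock
* fails, instead of spinning; waiting for the transfer to finish below
* ensures no such takeover is still in flight when we return. (The
* primitives are presumably provided by lock.h.)
*/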
void fsync_rename_inodes(struct inode *old_dir, struct inode *new_dir,
struct inode **locked_inodes)
{
int i;
struct inode *inode;
/*
* The two parent dirs might have had a parent-child relationship at
* some point before, so we need to make these two dirs transferable
* as well.
*/
for (i = 0; i < EUFS_INODE_CNT_IN_RENAME; i++) {
inode = locked_inodes[i];
if (inode)
eufs_inode_mark_lock_transferable(inode);
}
if (old_dir == new_dir) {
fsync_rename_inode(old_dir);
} else {
fsync_rename_inode(old_dir);
fsync_rename_inode(new_dir);
}
for (i = 0; i < EUFS_INODE_CNT_IN_RENAME; i++) {
inode = locked_inodes[i];
if (inode)
eufs_inode_wait_lock_transfer_done(inode);
}
}
static void eufs_update_persisted_seq(struct eufs_inode_info *vi,
struct list_head *head)
{
if (!list_empty(head)) {
struct dep_node *dep =
list_last_entry(head, struct dep_node, node);
vi->i_persisted_dep_seq = dep->seq;
}
}
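/*
* Background fsync of a directory in three phases:
* Phase 1: repeatedly cut the dep list and apply each dep (diradd or
* dirrem) to the persistent dentries, taking the header lock only on
* the last round.
* Phase 2: write the dirtied hash buckets back via eufs_sync_buckets().
* Phase 3: persist the inode itself, retrying from the top if new deps
* showed up in the meantime.
* Freed dentries and dep nodes are reclaimed after all locks are
* dropped.
*/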
static int fsync_dir_bg(struct inode *dir)
{
struct dep_node *dep, *next;
LIST_HEAD(detached_list);
LIST_HEAD(dump_list);
int i;
#define FSYNC_DIR_VI_LOOP_NUM (20)
struct eufs_inode_info *vi = EUFS_I(dir);
struct super_block *sb = dir->i_sb;
struct eufs_sb_info *sbi = EUFS_SB(sb);
struct eufs_inode *pi = EUFS_PI(dir);
u64 bitset[8] = { 0 };
int dep_count = 0;
retry:
inode_urgent_lock(dir);
/* Phase 1 */
for (i = FSYNC_DIR_VI_LOOP_NUM; i >= 0; --i) {
/* Get all deps round by round */
if (i == 0) {
/* Last round */
inode_header_lock(dir);
}
inode_dep_lock(dir);
if (list_empty(&vi->i_dep_list) && i > 0) {
/* Skip to last round */
i = 1;
}
list_cut_position(&detached_list, &vi->i_dep_list,
vi->i_dep_list.prev);
if (i > 0)
inode_dep_unlock(dir);
/* Handle the deps one by one. */
list_for_each_entry_safe(dep, next, &detached_list, node) {
if (dep->type == DEP_DIRADD) {
/*
* FIXME: the lockset might be different since
* we might have released the inode lock.
*/
do_dep_diradd_oneshot(dir, dep, bitset);
} else if (dep->type == DEP_DIRREM) {
do_dep_dirrem(dir, dep, bitset);
} else
BUG();
}
list_splice_tail_init(&detached_list, &dump_list);
if (i == 0) {
eufs_pbarrier();
if (!list_empty(&dump_list))
/* Phase 2 */
eufs_sync_buckets(vi, bitset);
inode_dep_unlock(dir);
inode_header_unlock(dir);
break;
}
}
inode_urgent_unlock(dir);
/* Phase 3 */
inode_lock(dir);
if (!list_empty(&vi->i_dep_list)) {
inode_unlock(dir);
/* Handle new deps that arrived between phases 2 and 3 */
/* FIXME: livelock possible! */
goto retry;
}
if (dir->i_nlink)
eufs_sync_pinode(dir, pi, false);
eufs_update_persisted_seq(vi, &dump_list);
vi->i_is_persisting = false;
vi->i_is_dirty = false;
if (dir->i_nlink)
persist_pinode(pi);
inode_unlock(dir);
eufs_pbarrier();
/* Reclaim memory and clear the list */
list_for_each_entry_safe(dep, next, &dump_list, node) {
struct inode *child_inode = dep->inode;
struct eufs_inode_info *child_vi = EUFS_I(child_inode);
if (dep->type == DEP_DIRREM)
do_dep_dirrem_reclaim(sb, dep);
/* remove from owner list */
spin_lock(&child_vi->i_owner_lock);
list_del_init(&dep->owner_node);
spin_unlock(&child_vi->i_owner_lock);
iput(child_inode);
list_del(&dep->node);
eufs_free_dep_node(dep);
dep_count++;
}
atomic_sub(dep_count, &sbi->s_nr_dep_nodes);
eufs_dbg("@cpu=%d !! fsync dir vi done: inode=%px\n",
smp_processor_id(), &vi->vfs_inode);
return 0;
}
static int fsync_nondir_oneshot(struct inode *inode)
{
struct eufs_inode_info *vi = EUFS_I(inode);
struct eufs_inode *pi;
/* For files other than dir */
WARN(S_ISDIR(inode->i_mode), "%s on a dir!", __func__);
/* The inode is being removed (nlink == 0); nothing to do. */
if (!inode->i_nlink) {
vi->i_is_dirty = false;
return 0;
}
pi = EUFS_PI(inode);
eufs_sync_pinode(inode, pi, false);
persist_pinode(pi);
vi->i_is_dirty = false;
return 0;
}
static int fsync_nondir_bg(struct inode *inode)
{
struct eufs_inode_info *vi = EUFS_I(inode);
int r;
inode_lock(inode);
r = fsync_nondir_oneshot(inode);
vi->i_is_persisting = false;
inode_unlock(inode);
return r;
}
static void fsync_bg(struct inode *inode)
{
struct eufs_sb_info *sbi = EUFS_SB(inode->i_sb);
wait_on_inode(inode);
/* Reading i_mode should need no locking here */
if (S_ISDIR(inode->i_mode))
fsync_dir_bg(inode);
else
fsync_nondir_bg(inode);
/* Drop the inode reference and decrease the dirty-inode count */
iput(inode);
if (atomic_dec_and_test(&sbi->s_nr_dirty_inodes) && sbi->s_draining) {
/* end of draining */
sbi->s_draining = false;
}
}
void fsync_oneshot(struct inode *inode)
{
/* Reading i_mode should need no locking here */
if (S_ISDIR(inode->i_mode))
fsync_dir_oneshot(inode);
else
fsync_nondir_oneshot(inode);
}
static void do_dep_diradd_oneshot(struct inode *dir_inode, struct dep_node *dep,
u64 *bitset)
{
struct super_block *sb = dir_inode->i_sb;
struct nv_dict_entry *de = dep->de;
struct inode *inode = dep->inode;
struct eufs_inode_info *dir_vi = EUFS_I(dir_inode);
struct eufs_inode *pi;
struct eufs_inode *fresh_pi;
int idx;
void *buffer[16];
struct alloc_batch ab;
bool lock_transferred = false;
idx = INDEX(de->hv);
bitset[idx / 64] = bitset[idx / 64] | (0x1ull << (idx & 63));
if (de->volatile_next == EUFS_DIR_DELNEW) {
/*
* The de is already invisible from both the latest view and
* the consistent view.
* Will be handled in the corresponding dirrem.
*/
return;
}
/* Meow? This equality is the sign of diradd */
WARN(!eufs_dentry_is_not_persist(de), "diradd wrong sign");
pi = s2p(sb, de->inode);
wait_on_inode(inode);
retry:
if (likely(inode_trylock(inode))) {
/* Got the lock */
} else {
if (eufs_inode_mark_lock_transferring(inode)) {
lock_transferred = true;
} else {
cond_resched();
goto retry;
}
}
eufs_sync_pinode(inode, pi, false);
fresh_pi = EUFS_FRESH_PI(pi);
if (!lock_transferred)
inode_unlock(inode);
else
eufs_inode_lock_transfer_done(inode);
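/*
* Record the dentry (and, via persist_name(), presumably its name
* blocks) in a small on-stack allocation batch of at most 16 entries,
* so that their allocator metadata can be persisted in one go by
* eufs_alloc_batch_persist_reset() below.
*/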
ab.n_used = 0;
ab.size = 16;
ab.batch = buffer;
eufs_alloc_batch_add(sb, &ab, de);
/*
* Force the allocation to be persisted without checking.
* TODO: we should differentiate the link and create syscalls to agree
* with the check.
*/
eufs_alloc_persist(sb, pi, true);
if (S_ISLNK(fresh_pi->i_mode)) {
void *root = o2p(sb, eufs_iread_root(fresh_pi));
/* A regular file's root is handled in the btree code */
/* In the case of a hard link, we must force persistence of the allocation */
eufs_alloc_persist(sb, root, true);
persist_symlink(root);
} else if (S_ISDIR(fresh_pi->i_mode)) {
void *root = o2p(sb, eufs_iread_root(fresh_pi));
eufs_alloc_persist(sb, root, false);
persist_page(root);
}
persist_name(sb, de, &ab);
eufs_alloc_batch_persist_reset(sb, &ab);
persist_pinode(pi);
spin_lock(&dir_vi->i_dentry_persist_lock);
eufs_dentry_clr_not_persist_flag(de);
spin_unlock(&dir_vi->i_dentry_persist_lock);
persist_dentry(de);
}
void eufs_dir_fsync_oneshot(struct inode *dir)
{
struct dep_node *dep;
struct dep_node *next;
struct super_block *sb = dir->i_sb;
struct eufs_sb_info *sbi = EUFS_SB(sb);
struct eufs_inode_info *vi = EUFS_I(dir);
LIST_HEAD(detached_list);
u64 bitset[8] = { 0 };
int dep_count = 0;
BUG_ON(!inode_is_locked(dir));
inode_urgent_lock(dir);
/* get all deps */
inode_header_lock(dir);
inode_dep_lock(dir);
if (list_empty(&vi->i_dep_list))
goto unlock_sync_pinode;
list_for_each_entry(dep, &vi->i_dep_list, node) {
if (dep->type == DEP_DIRADD)
do_dep_diradd_oneshot(dir, dep, bitset);
else if (dep->type == DEP_DIRREM)
do_dep_dirrem(dir, dep, bitset);
else
BUG();
}
list_splice_init(&vi->i_dep_list, &detached_list);
/* sync buckets */
eufs_pbarrier();
eufs_sync_buckets(vi, bitset);
unlock_sync_pinode:
inode_dep_unlock(dir);
inode_header_unlock(dir);
/* sync pinode */
if (dir->i_nlink)
eufs_sync_pinode(dir, EUFS_PI(dir), false);
eufs_pbarrier();
eufs_update_persisted_seq(vi, &detached_list);
vi->i_is_dirty = false;
/* Reclaim memory and clear the list */
list_for_each_entry_safe(dep, next, &detached_list, node) {
struct inode *child_inode = dep->inode;
struct eufs_inode_info *child_vinode = EUFS_I(child_inode);
spin_lock(&child_vinode->i_owner_lock);
list_del_init(&dep->owner_node);
spin_unlock(&child_vinode->i_owner_lock);
if (dep->type == DEP_DIRREM) {
do_dep_dirrem_reclaim(sb, dep);
iput(dep->inode);
} else if (dep->type == DEP_DIRADD) {
iput(dep->inode);
}
list_del(&dep->node);
eufs_free_dep_node(dep);
dep_count++;
}
atomic_sub(dep_count, &sbi->s_nr_dep_nodes);
inode_urgent_unlock(dir);
}
void fsync_on_draining(struct inode *dir, struct inode *inode)
{
BUG_ON(!dir);
BUG_ON(!inode_is_locked(dir));
BUG_ON(inode && !inode_is_locked(inode));
/* for link/unlink/rmdir */
if (inode)
eufs_inode_mark_lock_transferable(inode);
fsync_dir_oneshot(dir);
if (inode)
eufs_inode_wait_lock_transfer_done(inode);
}
#define NR_FLUSH_EACH_ROUND (16)
#define FLUSH_START_THRESHOLD (64)
static __always_inline int handle_persistees_for_each_cpu(
struct super_block *sb, const struct cpumask *mask, int idx) {
struct eufs_sb_info *sbi = EUFS_SB(sb);
struct llist_node *list;
struct llist_head *head;
struct eufs_inode_info *vi;
struct eufs_inode_info *next;
int n_active_list;
int cpu;
bool need;
retry:
need = sbi->need_sync[idx];
n_active_list = 0;
for_each_cpu(cpu, mask) {
head = per_cpu_ptr(sbi->persistee_list, cpu);
if (unlikely(llist_empty(head)))
continue;
n_active_list++;
list = llist_del_all(head);
eufs_dbg("persister get list %px for cpu%d\n", list, cpu);
/* reverse the ordering for better locality? */
llist_for_each_entry_safe(vi, next, list, i_persistee_node)
fsync_bg(&vi->vfs_inode);
eufs_dbg("persister handled list %px\n", list);
}
/*
* We need one complete round of processing for fssync. If
* need != sbi->need_sync[idx], need_sync was modified during our last
* round, so we retry to ensure a complete round of processing.
* It is okay if a CPU's dirty inodes are still being processed by
* another persister, since fssync waits for all persisters to finish.
*/
if (need != READ_ONCE(sbi->need_sync[idx]))
goto retry;
if (need) {
sbi->need_sync[idx] = false;
wake_up(&sbi->sync_wq);
}
if (READ_ONCE(sbi->need_sync[idx]))
goto retry;
return n_active_list;
}
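/*
* The fssync side (not in this file) presumably sets need_sync[idx]
* for every persister and then sleeps on sbi->sync_wq until all the
* flags are cleared again, roughly:
*
*	for (i = 0; i < nr_persisters; i++)
*		sbi->need_sync[i] = true;
*	wait_event(sbi->sync_wq, all_need_sync_cleared(sbi));
*
* where nr_persisters and all_need_sync_cleared() are hypothetical
* names used only for illustration.
*/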
static int persister(void *data)
{
struct super_block *sb = data;
struct eufs_sb_info *sbi = EUFS_SB(sb);
const struct cpumask *mask = cpumask_of_node(numa_node_id());
const int period =
(persist_period == 0) ? /* default */ (HZ / 4) :
/* less than a second */
((persist_period < 0) ? (HZ / (-persist_period)) :
/* more than a second */
(HZ * persist_period));
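/*
* Examples (assuming HZ = 1000): persist_period == 0 gives the default
* HZ / 4 = 250 ms; persist_period == -10 gives HZ / 10 = 100 ms;
* persist_period == 2 gives 2 * HZ = 2 s.
*/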
int idx = 0;
int num_persisters = num_sockets * persisters_per_socket;
eufs_info("sb=%px cpu=%d cpumask=%*pbl period=%d\n", data,
smp_processor_id(), cpumask_pr_args(mask), period);
while (idx < num_persisters && sbi->persisters[idx] != current)
idx++;
BUG_ON(idx >= num_persisters);
while (!kthread_should_stop()) {
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(period);
handle_persistees_for_each_cpu(sb, mask, idx);
}
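/*
* Final drain after kthread_should_stop(): keep flushing until every
* per-CPU persistee list on this node is empty, so that no dirty inode
* is left behind when the persister exits.
*/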
while (handle_persistees_for_each_cpu(sb, mask, idx))
cpu_relax();
eufs_info("finalizing on %d\n", smp_processor_id());
return 0;
}
int dep_init(struct super_block *sb)
{
struct eufs_sb_info *sbi = EUFS_SB(sb);
int cpu;
int i, j;
char name[BDEVNAME_SIZE];
int err;
sbi->persistee_list = alloc_percpu(struct llist_head);
if (!sbi->persistee_list) {
err = -ENOMEM;
goto cleanup;
}
/* init each llist */
for_each_possible_cpu(cpu)
init_llist_head(per_cpu_ptr(sbi->persistee_list, cpu));
sbi->persisters = kmalloc(sizeof(struct task_struct *) *
persisters_per_socket * num_sockets,
GFP_KERNEL);
if (!sbi->persisters) {
err = -ENOMEM;
goto cleanup;
}
sbi->need_sync = kzalloc(
sizeof(bool) * persisters_per_socket * num_sockets, GFP_KERNEL);
if (!sbi->need_sync) {
err = -ENOMEM;
goto cleanup;
}
init_waitqueue_head(&sbi->sync_wq);
bdevname(sb->s_bdev, name);
for (i = 0; i < num_sockets; ++i) {
for (j = 0; j < persisters_per_socket; ++j) {
int idx = i * persisters_per_socket + j;
sbi->persisters[idx] = kthread_create_on_node(
persister, sb, i, "hmfs/%s-%d.%d", name, i, j);
if (IS_ERR(sbi->persisters[idx])) {
err = PTR_ERR(sbi->persisters[idx]);
pr_err("create persister %s-%d.%d error %d",
name, i, j, err);
sbi->persisters[idx] = NULL;
goto cleanup;
}
set_cpus_allowed_ptr(sbi->persisters[idx],
cpumask_of_node(i));
wake_up_process(sbi->persisters[idx]);
}
}
return 0;
cleanup:
dep_fini(sb);
return err;
}
void dep_fini(struct super_block *sb)
{
struct eufs_sb_info *sbi = EUFS_SB(sb);
if (sbi->persisters) {
int i;
for (i = 0; i < persisters_per_socket * num_sockets; ++i) {
if (sbi->persisters[i]) {
kthread_stop(sbi->persisters[i]);
sbi->persisters[i] = NULL;
}
}
kfree(sbi->persisters);
sbi->persisters = NULL;
}
kfree(sbi->need_sync);
sbi->need_sync = NULL;
free_percpu(sbi->persistee_list);
sbi->persistee_list = NULL;
}