// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved.
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
* only version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/pfn_t.h>
#include <linux/buffer_head.h>
#include <linux/iomap.h>
#include <linux/dax.h>
#include <linux/cpufeature.h>
#include <linux/pgtable.h>
#include "euler.h"
#include "dax.h"
#include "dep.h"
#include "wear.h"
#include "alloc_interface.h"
int eufs_persist_btree_node(void *root, int sta, int len);
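/*
 * Zero a pmem range and flush it. (Descriptive note added here: callers
 * still need a persistence barrier, e.g. eufs_pbarrier(), before they may
 * depend on the zeroes being durable.)
 */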
static __always_inline void eufs_clear_pmem(void *addr, size_t size)
{
memset(addr, 0, size);
eufs_flush_range(addr, size);
}
static __always_inline void *eufs_find_data_block_btree(struct inode *inode,
unsigned long blocknr,
__le64 **parent)
{
__le64 *bp;
u32 height, bit_shift;
unsigned int idx;
struct eufs_inode_info *vi = EUFS_I(inode);
/* inode must be a regular file */
height = vi->i_volatile_height;
bp = vi->i_volatile_root;
NV_ASSERT(blocknr < (1UL << (height * EUFS_FILE_TREE_DEGREE_SHIFT)));
if (height == 0) {
BUG_ON(blocknr != 0);
if (parent)
*parent = NULL;
return (void *)bp;
}
if (height == 1) {
BUG_ON(bp[blocknr] == NULL_VAL);
if (parent)
*parent = bp;
return s2p(inode->i_sb, bp[blocknr]);
}
while (height > 0) {
bit_shift = (height - 1) * EUFS_FILE_TREE_DEGREE_SHIFT;
idx = blocknr >> bit_shift;
if (parent)
*parent = bp;
bp = s2p(inode->i_sb, bp[idx]);
if (!bp)
return NULL;
blocknr = blocknr & ((1 << bit_shift) - 1);
height--;
}
return bp;
}
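/*
 * Worked example for the descent above (a sketch, assuming
 * EUFS_FILE_TREE_DEGREE_SHIFT == 9, i.e. 512 pointers per index node):
 * with height == 2 and blocknr == 1000, the first iteration takes slot
 * 1000 >> 9 == 1 of the root and leaves blocknr = 1000 & 511 = 488; the
 * second iteration takes slot 488 of that level-1 node, which points to
 * the data block.
 */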
static int eufs_extend_btree_recursive_blind(struct inode *inode,
int level_left, __le64 *parent,
int sta_index,
int end_index, /* inclusive */
struct alloc_batch *ab)
{
struct super_block *sb = inode->i_sb;
void *p;
long r;
int i;
for (i = sta_index; i <= end_index; ++i) {
if (!level_left) {
parent[i] = NULL_ADDR;
continue;
}
/* level_left */
p = eufs_alloc_batch_allocate_file_index(inode->i_sb, ab);
if (!p)
return -ENOSPC;
parent[i] = p2s(sb, p);
/* recur */
r = eufs_extend_btree_recursive_blind(inode, level_left - 1, p,
0,
EUFS_FILE_TREE_DEGREE - 1,
ab);
if (IS_ERR_VALUE(r))
return r;
}
return 0;
}
/*
* Allocate blocks from top to bottom.
*
* Only the interior blocks that will have a leaf or interior child
* block are allocated, and the unused child pointers are NOT zeroed.
*
* New leaf blocks are not allocated; their slots are set to NULL_ADDR.
*/
static int eufs_extend_btree_recursive(struct inode *inode, int level_left,
__le64 *parent, unsigned long origin,
unsigned long num_blocks,
struct alloc_batch *ab, bool blind)
{
struct super_block *sb = inode->i_sb;
const unsigned long nblocks_per_slot =
1 << (level_left * EUFS_FILE_TREE_DEGREE_SHIFT);
unsigned long off;
int sta_index, end_index;
int i;
long r;
void *p;
if (blind) {
return eufs_extend_btree_recursive_blind(
inode, level_left, parent, 0, EUFS_FILE_TREE_DEGREE - 1,
ab);
}
if (origin == 0) {
/* end_index could be zero */
end_index = (num_blocks - 1) / nblocks_per_slot;
r = eufs_extend_btree_recursive_blind(inode, level_left, parent,
0, end_index - 1, ab);
if (IS_ERR_VALUE(r))
return r;
if (!level_left) {
parent[end_index] = NULL_ADDR;
} else {
p = eufs_alloc_batch_allocate_file_index(inode->i_sb,
ab);
if (!p)
return -ENOSPC;
parent[end_index] = p2s(sb, p);
off = nblocks_per_slot * end_index;
r = eufs_extend_btree_recursive(inode, level_left - 1,
p, 0, num_blocks - off,
ab, false);
if (IS_ERR_VALUE(r))
return r;
}
return 0;
}
sta_index = (origin - 1) / nblocks_per_slot;
end_index = (num_blocks - 1) / nblocks_per_slot;
/*
* No need to create a new sub-tree, so descend to the sub-tree
* rooted in parent[sta_index]
*/
if (sta_index == end_index) {
if (!level_left)
return 0;
/* calculate the needed block count in the sub-tree */
off = sta_index * nblocks_per_slot;
r = eufs_extend_btree_recursive(inode, level_left - 1,
s2p(sb, parent[sta_index]),
origin - off, num_blocks - off,
ab, false);
if (IS_ERR_VALUE(r))
return r;
return 0;
}
if (!level_left) {
for (i = sta_index + 1; i <= end_index; ++i)
parent[i] = NULL_ADDR;
return 0;
}
/* extend the sub-tree shared with existing blocks to its maximum size */
off = sta_index * nblocks_per_slot;
r = eufs_extend_btree_recursive(inode, level_left - 1,
s2p(sb, parent[sta_index]),
origin - off, nblocks_per_slot, ab,
false);
if (IS_ERR_VALUE(r))
return r;
/* new sub-trees which will be fully initialized */
r = eufs_extend_btree_recursive_blind(inode, level_left, parent,
sta_index + 1, end_index - 1, ab);
if (IS_ERR_VALUE(r))
return r;
/* the last new sub-tree, which may need only partial initialization */
p = eufs_alloc_batch_allocate_file_index(inode->i_sb, ab);
if (!p)
return -ENOSPC;
parent[end_index] = p2s(sb, p);
off = end_index * nblocks_per_slot;
r = eufs_extend_btree_recursive(inode, level_left - 1, p, 0,
num_blocks - off, ab, false);
if (IS_ERR_VALUE(r))
return r;
return 0;
}
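/*
 * Worked example for the split above (a sketch, assuming degree 512 and
 * level_left == 1, i.e. nblocks_per_slot == 512): growing from
 * origin == 600 to num_blocks == 2000 gives sta_index = 599 / 512 = 1
 * and end_index = 1999 / 512 = 3. Slot 1 is extended in place to its
 * full 512 blocks, slot 2 is built blindly, and slot 3 gets a fresh node
 * covering the remaining 2000 - 3 * 512 = 464 blocks.
 */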
static unsigned long eufs_count_pages(unsigned long leaf_blocks)
{
unsigned long tot = leaf_blocks;
while (leaf_blocks > 1) {
leaf_blocks = DIV_ROUND_UP(leaf_blocks, EUFS_FILE_TREE_DEGREE);
tot += leaf_blocks;
}
return tot;
}
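/*
 * Worked example (a sketch, assuming EUFS_FILE_TREE_DEGREE == 512, i.e.
 * PAGE_SIZE / sizeof(__le64)): for leaf_blocks == 1000 this returns
 * 1000 + DIV_ROUND_UP(1000, 512) + 1 = 1003 pages: the leaves, two
 * level-1 index nodes, and the root.
 */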
/* Ensure that page[0..num_blocks-1] exist */
int eufs_extend_btree(struct inode *inode, unsigned long num_blocks)
{
struct eufs_inode_info *vi = EUFS_I(inode);
unsigned long full_size;
unsigned long need_blocks;
__le64 *new_root;
long r = 0;
struct alloc_batch *ab = &vi->page_batch;
if (!num_blocks)
return 0;
if (vi->i_volatile_tree_blocks >= num_blocks)
/* already allocated */
return 0;
if (num_blocks > inode->i_sb->s_maxbytes >> EUFS_BLOCK_SIZE_BITS)
return -EFBIG;
/* Grow from vi->i_volatile_tree_blocks to num_blocks */
need_blocks = eufs_count_pages(num_blocks) -
eufs_count_pages(vi->i_volatile_tree_blocks);
/* Set NULL_ADDR for extended data blocks */
need_blocks -= (num_blocks - vi->i_volatile_tree_blocks);
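/*
 * Worked example (a sketch, assuming degree 512): growing from 10 to
 * 1000 leaf blocks, eufs_count_pages(1000) - eufs_count_pages(10) =
 * 1003 - 11 = 992 pages; excluding the 990 new leaves (which are set to
 * NULL_ADDR instead) leaves need_blocks = 2 index nodes.
 */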
r = eufs_alloc_batch_pre_allocate_begin(inode->i_sb, ab, need_blocks);
if (IS_ERR_VALUE(r))
return r;
BUG_ON(!vi->i_volatile_root);
if (!vi->i_volatile_root) {
vi->i_volatile_root =
eufs_alloc_batch_allocate_file_data(inode->i_sb, ab);
BUG_ON(!vi->i_volatile_root);
vi->i_volatile_height = 0;
}
if (num_blocks == 1) {
/* Already allocated */
goto out;
}
full_size = 1UL
<< (vi->i_volatile_height * EUFS_FILE_TREE_DEGREE_SHIFT);
while (full_size < num_blocks) {
new_root =
eufs_alloc_batch_allocate_file_index(inode->i_sb, ab);
new_root[0] = p2s(inode->i_sb, vi->i_volatile_root);
vi->i_volatile_root = new_root;
vi->i_volatile_height++;
full_size <<= EUFS_FILE_TREE_DEGREE_SHIFT;
}
BUG_ON(vi->i_volatile_height < 1);
r = eufs_extend_btree_recursive(inode, vi->i_volatile_height - 1,
vi->i_volatile_root,
vi->i_volatile_tree_blocks, num_blocks,
ab, false);
out:
eufs_alloc_batch_pre_allocate_end(inode->i_sb, ab);
vi->i_volatile_tree_blocks = num_blocks;
num_blocks <<= (inode->i_blkbits - 9);
if (num_blocks > inode->i_blocks)
inode->i_blocks = num_blocks;
return r;
}
int eufs_alloc_blocks_btree(struct inode *inode, unsigned long start_block,
unsigned long num_blocks, int zero)
{
long r;
unsigned long blocknr, need_blocks = 0,
end_block = start_block + num_blocks;
long pi_tree_blocks =
eufs_iread_tree_blocks(EUFS_FRESH_PI(EUFS_PI(inode)));
struct eufs_inode_info *vi = EUFS_I(inode);
struct alloc_batch *ab = &vi->page_batch;
__le64 *parent;
unsigned int ofs;
void *xmem;
int last_ofs_line = -1;
r = eufs_extend_btree(inode, start_block + num_blocks);
if (r)
return r;
if (start_block == 0)
vi->hole_at_sta = false;
/* The 0th data block is always allocated. */
blocknr = start_block ? start_block : 1;
/* TODO: need optimization. */
while (blocknr < end_block) {
eufs_find_data_block_btree(inode, blocknr, &parent);
BUG_ON(!parent);
ofs = blocknr & (EUFS_FILE_TREE_DEGREE - 1);
while (ofs < EUFS_FILE_TREE_DEGREE && blocknr < end_block) {
if (parent[ofs] == NULL_ADDR) {
/*
* The leaf block was persisted as a hole without being
* allocated, e.g. via truncate() + fsync()
*/
if (blocknr < pi_tree_blocks) {
xmem = eufs_zalloc_file_data(
inode->i_sb);
if (!xmem)
return -ENOSPC;
eufs_alloc_persist(inode->i_sb, xmem,
false);
eufs_flush_page(xmem);
parent[ofs] = p2s(inode->i_sb, xmem);
eufs_flush_cacheline(&parent[ofs]);
invalidate_inode_pages2_range(
inode->i_mapping, blocknr,
blocknr);
} else
need_blocks++;
}
ofs++;
blocknr++;
}
}
if (!need_blocks)
return 0;
/* TODO: need optimization. */
r = eufs_alloc_batch_pre_allocate_begin(inode->i_sb, ab, need_blocks);
if (IS_ERR_VALUE(r))
return r;
blocknr = start_block ? start_block : 1;
while (blocknr < end_block) {
eufs_find_data_block_btree(inode, blocknr, &parent);
BUG_ON(!parent);
last_ofs_line = -1;
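/*
 * last_ofs_line batches flushes (a descriptive note, assuming
 * EUFS_PTR_CNT_SHIFT_PER_CACHELINE == 3, i.e. eight __le64 slots per
 * 64-byte line): pointer updates landing in the same cacheline of the
 * index node are flushed together once the loop crosses a line
 * boundary, rather than once per pointer.
 */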
ofs = blocknr & (EUFS_FILE_TREE_DEGREE - 1);
while (ofs < EUFS_FILE_TREE_DEGREE && blocknr < end_block) {
if (parent[ofs] == NULL_ADDR) {
xmem = eufs_alloc_batch_allocate_file_data(
inode->i_sb, ab);
if (zero == EUFS_ALLOC_BLOCKS_ZERO_ALL ||
((zero == EUFS_ALLOC_BLOCKS_ZERO_EDGE) &&
(blocknr == start_block ||
blocknr == end_block - 1)))
eufs_clear_pmem(xmem, PAGE_SIZE);
parent[ofs] = p2s(inode->i_sb, xmem);
invalidate_inode_pages2_range(inode->i_mapping,
blocknr, blocknr);
if (last_ofs_line == -1)
last_ofs_line =
(ofs >>
EUFS_PTR_CNT_SHIFT_PER_CACHELINE);
}
ofs++;
if (last_ofs_line != -1 &&
(ofs >> EUFS_PTR_CNT_SHIFT_PER_CACHELINE) !=
last_ofs_line) {
eufs_flush_cacheline(&parent[ofs - 1]);
last_ofs_line = -1;
}
blocknr++;
}
if (last_ofs_line != -1)
eufs_flush_cacheline(&parent[ofs - 1]);
}
eufs_alloc_batch_pre_allocate_end(inode->i_sb, ab);
return r;
}
static int eufs_alloc_blocks_btree_for_write(struct inode *inode, loff_t pos,
int len)
{
long r;
unsigned long blocknr, need_blocks = 0;
long pi_tree_blocks =
eufs_iread_tree_blocks(EUFS_FRESH_PI(EUFS_PI(inode)));
size_t file_size_block = PAGE_DIV_ROUND_UP(inode->i_size);
struct eufs_inode_info *vi = EUFS_I(inode);
struct alloc_batch *ab = &vi->page_batch;
__le64 *parent;
void *xmem;
/* The page the first byte resides in */
unsigned long first_page = PAGE_DIV_ROUND_DOWN(pos);
/* The page the last byte resides in */
unsigned long last_page = PAGE_DIV_ROUND_DOWN(pos + len - 1);
unsigned long pending_flush_bits;
int start_offset;
int end_offset;
int ofs;
r = eufs_extend_btree(inode, last_page + 1);
if (r)
return r;
/* hole_at_sta is used by SEEK_HOLE. */
/* FIXME: We need a durable way to present hole_at_sta. */
if (first_page == 0)
vi->hole_at_sta = false;
/* The 0th data block is always allocated. */
blocknr = first_page ? first_page : 1;
/*
* Can be optimized by saving the top-down parent pointers
* in a cursor and advancing by moving the cursor
*/
while (blocknr <= last_page) {
eufs_find_data_block_btree(inode, blocknr, &parent);
BUG_ON(!parent);
/* One ofs, one block */
for (ofs = blocknr & (EUFS_FILE_TREE_DEGREE - 1);
ofs < EUFS_FILE_TREE_DEGREE && blocknr <= last_page;
++ofs, ++blocknr) {
/* Not a hole */
if (parent[ofs] != NULL_ADDR)
continue;
/* Hole */
if (blocknr < pi_tree_blocks) {
/*
* TODO: as an optimization, instead of writing
* zeros here, we could write the actual data
* directly.
*/
xmem = eufs_zalloc_file_data(inode->i_sb);
if (!xmem)
return -ENOSPC;
eufs_alloc_persist(inode->i_sb, xmem, false);
eufs_flush_page(xmem);
parent[ofs] = p2s(inode->i_sb, xmem);
eufs_flush_cacheline(&parent[ofs]);
invalidate_inode_pages2_range(inode->i_mapping,
blocknr, blocknr);
} else
need_blocks++;
}
}
if (!need_blocks)
return 0;
/* FIXME: This requires a rewrite */
r = eufs_alloc_batch_pre_allocate_begin(inode->i_sb, ab, need_blocks);
if (IS_ERR_VALUE(r))
return r;
start_offset = pos & (PAGE_SIZE - 1);
end_offset = (pos + len) & (PAGE_SIZE - 1);
blocknr = first_page ? first_page : 1;
while (blocknr <= last_page) {
unsigned long bit;
eufs_find_data_block_btree(inode, blocknr, &parent);
BUG_ON(!parent);
/* No cacheline is pending to be flushed for this index block */
pending_flush_bits = 0;
for (ofs = blocknr & (EUFS_FILE_TREE_DEGREE - 1);
ofs < EUFS_FILE_TREE_DEGREE && blocknr <= last_page;
++ofs, ++blocknr) {
/* Not a hole */
if (parent[ofs] != NULL_ADDR)
continue;
xmem = eufs_alloc_batch_allocate_file_data(inode->i_sb,
ab);
if (unlikely(blocknr == first_page &&
(start_offset != 0)))
eufs_clear_pmem(xmem, start_offset);
/* Only clear the tail of the last block when it lies before the EOF block */
if (unlikely(blocknr == last_page &&
(end_offset != 0) &&
blocknr < file_size_block))
eufs_clear_pmem((char *)xmem + end_offset,
PAGE_SIZE - end_offset);
parent[ofs] = p2s(inode->i_sb, xmem);
invalidate_inode_pages2_range(inode->i_mapping, blocknr,
blocknr);
set_bit(ofs >> EUFS_PTR_CNT_SHIFT_PER_CACHELINE,
&pending_flush_bits);
}
for (bit = find_first_bit(&pending_flush_bits, 64); bit < 64;
bit = find_next_bit(&pending_flush_bits, 64, bit + 1)) {
ofs = bit << EUFS_PTR_CNT_SHIFT_PER_CACHELINE;
eufs_flush_cacheline(&parent[ofs]);
}
}
eufs_alloc_batch_pre_allocate_end(inode->i_sb, ab);
return r;
}
static void eufs_free_recursive_btree_blind(struct super_block *sb,
__le64 *root, int level_left)
{
int i;
BUG_ON(!root);
if (level_left == -1) {
if (root != NULL_ADDR_PTR)
nv_zfree(sb, root);
return;
}
/* level_left */
BUG_ON(root == NULL_ADDR_PTR);
for (i = 0; i < EUFS_FILE_TREE_DEGREE; ++i) {
eufs_free_recursive_btree_blind(sb, s2p(sb, root[i]),
level_left - 1);
}
nv_zfree(sb, root);
}
static void eufs_free_recursive_btree(struct super_block *sb, __le64 *root,
int level_left, u64 blocks_left)
{
u64 nblocks_per_slot;
int i;
BUG_ON(!root);
BUG_ON(!blocks_left);
if (level_left == -1) {
if (root != NULL_ADDR_PTR)
nv_zfree(sb, root);
return;
}
/* level_left */
BUG_ON(root == NULL_ADDR_PTR);
nblocks_per_slot = 1 << (level_left * EUFS_FILE_TREE_DEGREE_SHIFT);
for (i = 0; i < EUFS_FILE_TREE_DEGREE; ++i) {
if (blocks_left >= nblocks_per_slot) {
/* the whole sub-tree needs to be freed */
eufs_free_recursive_btree_blind(sb, s2p(sb, root[i]),
level_left - 1);
blocks_left -= nblocks_per_slot;
if (blocks_left == 0)
break;
} else {
eufs_free_recursive_btree(sb, s2p(sb, root[i]),
level_left - 1, blocks_left);
break;
}
}
nv_zfree(sb, root);
}
int eufs_shrink_btree(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
struct eufs_inode_info *vi = EUFS_I(inode);
struct eufs_inode *pi = EUFS_PI(inode);
void *root;
u64 capacity;
__le64 *new_root;
__le64 *old_root;
int new_height;
u64 size;
u64 blocks;
u64 count;
u64 blocks_left;
u64 __maybe_unused pi_root_o;
BUG_ON(!inode_is_locked(inode));
BUG_ON(vi->i_volatile_height > EUFS_MAX_FILE_TREE_HEIGHT);
pi_root_o = eufs_iread_root(pi);
eufs_dbg("shrink btree stage 1: pi=%px vi->{s=%lld b=%lld h=%d r=%px} pi->{s=%lld b=%lld h=%d r=0x%llx}\n",
pi, inode->i_size, vi->i_volatile_tree_blocks,
vi->i_volatile_height, vi->i_volatile_root,
eufs_iread_size(pi), eufs_iread_tree_blocks(pi),
root_height(pi_root_o), root_ptr(pi_root_o));
eufs_sync_pinode(inode, pi, false);
capacity = PAGE_SIZE
<< (EUFS_FILE_TREE_DEGREE_SHIFT * vi->i_volatile_height);
new_root = vi->i_volatile_root;
old_root = vi->i_volatile_root;
new_height = vi->i_volatile_height;
size = inode->i_size == 0 ? 1 : inode->i_size;
/* old block count */
blocks = vi->i_volatile_tree_blocks;
/* Check whether the height should be reduced */
for (;;) {
capacity >>= EUFS_FILE_TREE_DEGREE_SHIFT;
if (capacity < size || capacity < PAGE_SIZE)
break;
new_root = s2p(sb, new_root[0]);
new_height--;
}
vi->i_volatile_root = new_root;
vi->i_volatile_height = new_height;
vi->i_volatile_tree_blocks = DIV_ROUND_UP(size, PAGE_SIZE);
eufs_sync_pinode(inode, pi, false);
eufs_alloc_batch_persist_reset(sb, &vi->page_batch);
/* new block count; it is greater than 0 */
count = blocks_left = vi->i_volatile_tree_blocks;
/* shrink from old_root/_height to new_root/_height */
root = old_root;
if (blocks_left == blocks)
goto out;
if (blocks == 1)
goto out;
if (blocks <= EUFS_FILE_BCNT_WITH_HEIGHT(1)) {
int i;
__le64 *proot = root;
for (i = count; i < blocks; ++i) {
nv_free(sb, s2p(sb, proot[i]));
proot[i] = NULL_VAL;
}
goto out;
}
if (blocks <= EUFS_FILE_BCNT_WITH_HEIGHT(2)) {
int i;
__le64 *proot = root;
for (i = EUFS_H2_INDEX_IN_L0(count); i < EUFS_FILE_TREE_DEGREE;
++i) {
__le64 *pages = s2p(sb, proot[i]);
int j = EUFS_H2_INDEX_IN_L1(count);
for (; j < EUFS_FILE_TREE_DEGREE; ++j) {
nv_free(sb, s2p(sb, pages[j]));
pages[j] = NULL_VAL;
count++;
if (count >= blocks)
break;
}
if (EUFS_H2_IS_FREE_L1_SUBTREE(i, blocks_left)) {
nv_free(sb, pages);
proot[i] = NULL_VAL;
}
if (count >= blocks)
break;
}
goto out;
}
if (blocks <= EUFS_FILE_BCNT_WITH_HEIGHT(3)) {
int i, j, k;
__le64 *pproot = root;
for (i = EUFS_H3_INDEX_IN_L0(count); i < EUFS_FILE_TREE_DEGREE;
++i) {
__le64 *proot = s2p(sb, pproot[i]);
j = EUFS_H3_INDEX_IN_L1(i, count);
for (; j < EUFS_FILE_TREE_DEGREE; ++j) {
__le64 *pages = s2p(sb, proot[j]);
k = EUFS_H3_INDEX_IN_L2(count);
for (; k < EUFS_FILE_TREE_DEGREE; ++k) {
nv_free(sb, s2p(sb, pages[k]));
pages[k] = NULL_VAL;
count++;
if (count >= blocks)
break;
}
if (EUFS_H3_IS_FREE_L2_SUBTREE(i, j,
blocks_left)) {
nv_free(sb, pages);
proot[j] = NULL_VAL;
}
if (count >= blocks)
break;
}
if (EUFS_H3_IS_FREE_L1_SUBTREE(i, blocks_left)) {
nv_free(sb, proot);
pproot[i] = NULL_VAL;
}
if (count >= blocks)
break;
}
}
out:
while (old_root != new_root) {
__le64 *r = old_root;
BUG_ON(!r);
old_root = s2p(sb, r[0]);
nv_free(sb, r);
}
return 0;
}
int eufs_free_btree(struct super_block *sb, void *root, int height, u64 blocks)
{
NV_ASSERT(!(height < 0 || height > EUFS_MAX_FILE_TREE_HEIGHT));
eufs_dbg("nvfree tree root: %px\n", root);
if (blocks == 0)
return 0;
if (blocks == 1) {
/* height == 0 */
nv_free(sb, root);
return 0;
}
eufs_free_recursive_btree(sb, (__le64 *)root, height - 1, blocks);
return 0;
}
int eufs_persist_btree_node(void *root, int sta, int len)
{
BUG_ON(len > EUFS_FILE_TREE_DEGREE);
BUG_ON(len < 0);
BUG_ON(sta + len > EUFS_FILE_TREE_DEGREE);
BUG_ON(sta + len < 0);
if (len == 0)
return 0;
eufs_ptr_fast_check(root);
eufs_flush_range(((void **)root) + sta, len * sizeof(void *));
return 0;
}
static void eufs_persist_btree_h2_subtree(struct super_block *sb, void *root,
int start0, int idx0, int idx1)
{
__le64 *proot = root;
int i;
void *p;
for (i = start0; i < idx0; ++i) {
BUG_ON(proot[i] == 0);
p = s2p(sb, proot[i]);
eufs_ptr_fast_check(p);
eufs_persist_btree_node(p, 0, EUFS_FILE_TREE_DEGREE);
}
/*
* According to the WARN_ON in eufs_persist_new_btree_h2,
* idx0 < EUFS_FILE_TREE_DEGREE if idx1 != 0. So the following code
* is safe.
*/
if (idx1 != 0) {
p = s2p(sb, proot[idx0]);
eufs_persist_btree_node(p, 0, idx1);
}
}
static void eufs_persist_btree_h2_root(void *root, int start0, int idx0,
int idx1)
{
int cnt = idx0 - start0;
/*
* idx1 is the L1 index of the next block, so when it is not equal
* to 0, node[idx0] also needs persistence.
*/
if (idx1 != 0)
cnt++;
eufs_persist_btree_node(root, start0, cnt);
}
static void eufs_persist_new_btree_h2_by_idx(struct super_block *sb, void *root,
int start0, int idx0, int idx1)
{
eufs_persist_btree_h2_subtree(sb, root, start0, idx0, idx1);
/* It's a new btree, so persist the whole root node */
eufs_persist_btree_h2_root(root, 0, idx0, idx1);
}
static void eufs_persist_new_btree_h2(struct super_block *sb, void *root,
int start0, unsigned long bcnt)
{
/* the L0/L1 index of the next block in new tree with height 2 */
int idx0 = EUFS_H2_INDEX_IN_L0(bcnt);
int idx1 = EUFS_H2_INDEX_IN_L1(bcnt);
/*
* Notice a corner case: bcnt == EUFS_FILE_BCNT_WITH_HEIGHT(2), in
* which (idx0 == EUFS_FILE_TREE_DEGREE && idx1 == 0)
*/
WARN_ON(idx0 == EUFS_FILE_TREE_DEGREE && idx1);
eufs_persist_new_btree_h2_by_idx(sb, root, start0, idx0, idx1);
}
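/*
 * Worked example (a sketch, assuming the H2 index macros split bcnt into
 * 9-bit fields for a degree of 512): bcnt == 1500 gives idx0 == 2 and
 * idx1 == 476, so subtrees 0 and 1 are persisted in full and subtree 2
 * only up to slot 475. The corner case bcnt == 512 * 512 yields
 * idx0 == 512 with idx1 == 0, which the WARN_ON above tolerates.
 */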
static void eufs_persist_inc_btree_h2_by_idx(struct super_block *sb, void *root,
int old_idx0, int old_idx1,
int new_idx0, int new_idx1)
{
__le64 *proot = root;
void *p;
int start;
p = s2p(sb, proot[old_idx0]);
if (old_idx0 == new_idx0) {
if (old_idx0 == EUFS_FILE_TREE_DEGREE)
return;
eufs_persist_btree_node(p, old_idx1, new_idx1 - old_idx1);
/* node[old_idx0] needs persistence */
if (!old_idx1)
eufs_persist_btree_node(root, old_idx0, 1);
return;
}
eufs_persist_btree_node(p, old_idx1, EUFS_FILE_TREE_DEGREE - old_idx1);
eufs_persist_btree_h2_subtree(sb, root, old_idx0 + 1, new_idx0,
new_idx1);
start = old_idx0;
/* if old_idx1 is not 0, root[start] has already been persisted */
if (old_idx1)
start++;
eufs_persist_btree_h2_root(root, start, new_idx0, new_idx1);
}
static void eufs_persist_inc_btree_h2(struct super_block *sb, void *root,
unsigned long old_bcnt,
unsigned long new_bcnt)
{
/* the L0/L1 index of the next block in tree */
int old_idx0 = EUFS_H2_INDEX_IN_L0(old_bcnt);
int old_idx1 = EUFS_H2_INDEX_IN_L1(old_bcnt);
int new_idx0 = EUFS_H2_INDEX_IN_L0(new_bcnt);
int new_idx1 = EUFS_H2_INDEX_IN_L1(new_bcnt);
/*
* Notice a corner case: bcnt == EUFS_FILE_BCNT_WITH_HEIGHT(2), in
* which (idx0 == EUFS_FILE_TREE_DEGREE && idx1 == 0)
*/
WARN_ON(old_idx0 == EUFS_FILE_TREE_DEGREE && old_idx1);
WARN_ON(new_idx0 == EUFS_FILE_TREE_DEGREE && new_idx1);
eufs_persist_inc_btree_h2_by_idx(sb, root, old_idx0, old_idx1, new_idx0,
new_idx1);
}
static void eufs_persist_new_btree_h3(struct super_block *sb, void *root,
int start0, unsigned long bcnt_left)
{
int i;
unsigned long left = bcnt_left;
__le64 *pproot = root;
for (i = start0; i < EUFS_FILE_TREE_DEGREE; ++i) {
__le64 *proot = s2p(sb, pproot[i]);
int j;
for (j = 0; j < EUFS_FILE_TREE_DEGREE; ++j) {
void *p = s2p(sb, proot[j]);
if (left >= EUFS_FILE_TREE_DEGREE) {
eufs_persist_btree_node(p, 0,
EUFS_FILE_TREE_DEGREE);
left -= EUFS_FILE_TREE_DEGREE;
} else {
eufs_persist_btree_node(p, 0, left);
left = 0;
j++;
break;
}
}
eufs_persist_btree_node(proot, 0, j);
if (!left) {
i++;
break;
}
}
eufs_persist_btree_node(root, 0, i);
}
static void eufs_persist_inc_btree_h3(struct super_block *sb, void *root,
unsigned long old_bcnt,
unsigned long new_bcnt)
{
/* The L0/L1/L2 position of the next block in tree */
int o_idx0 = EUFS_H3_INDEX_IN_L0(old_bcnt);
int o_idx1 = EUFS_H3_INDEX_IN_L1(o_idx0, old_bcnt);
int o_idx2 = EUFS_H3_INDEX_IN_L2(old_bcnt);
int n_idx0 = EUFS_H3_INDEX_IN_L0(new_bcnt);
int n_idx1 = EUFS_H3_INDEX_IN_L1(n_idx0, new_bcnt);
int n_idx2 = EUFS_H3_INDEX_IN_L2(new_bcnt);
__le64 *pproot = root;
__le64 *proot;
void *p;
int i;
if (o_idx0 == n_idx0 && o_idx1 == n_idx1) {
/* persist from the bottom up */
proot = s2p(sb, pproot[o_idx0]);
p = s2p(sb, proot[o_idx1]);
eufs_persist_btree_node(p, o_idx2, n_idx2 - o_idx2);
/* node[o_idx1] needs persistence */
if (!o_idx2) {
eufs_persist_btree_node(proot, o_idx1, 1);
/* node[o_idx0] needs persistence */
if (!o_idx1)
eufs_persist_btree_node(root, o_idx0, 1);
}
return;
}
if (o_idx0 == n_idx0) {
proot = s2p(sb, pproot[o_idx0]);
eufs_persist_inc_btree_h2_by_idx(sb, proot, o_idx1, o_idx2,
n_idx1, n_idx2);
/* node[o_idx0] needs persistence */
if (!o_idx1 && !o_idx2)
eufs_persist_btree_node(root, o_idx0, 1);
return;
}
/*
* A corner case: o_idx1 == EUFS_FILE_TREE_DEGREE && o_idx2 == 0. This
* can be handled in the function eufs_persist_inc_btree_h2_by_idx, but
* we still check it here for efficiency.
*/
if (o_idx1 < EUFS_FILE_TREE_DEGREE) {
proot = s2p(sb, pproot[o_idx0]);
eufs_persist_inc_btree_h2_by_idx(sb, proot, o_idx1, o_idx2,
EUFS_FILE_TREE_DEGREE, 0);
} else {
WARN_ON(o_idx2 != 0);
}
for (i = o_idx0 + 1; i < n_idx0; ++i) {
proot = s2p(sb, pproot[i]);
eufs_persist_new_btree_h2_by_idx(sb, proot, 0,
EUFS_FILE_TREE_DEGREE, 0);
}
if (n_idx1 || n_idx2) {
proot = s2p(sb, pproot[n_idx0]);
eufs_persist_new_btree_h2_by_idx(sb, proot, 0, n_idx1, n_idx2);
/* root[n_idx0] needs to be persisted */
n_idx0++;
}
/* root[o_idx0] has been persisted */
if (o_idx1 || o_idx2)
o_idx0++;
eufs_persist_btree_node(root, o_idx0, n_idx0 - o_idx0);
}
/* Only structural persistence is needed */
int eufs_persist_btree(struct super_block *sb, void *root, int height,
u64 old_size, u64 new_size)
{
unsigned long old_nblocks, new_nblocks;
__le64 *proot;
__le64 *pproot;
if (old_size == 0)
old_size = 1; /* at least one block */
NV_ASSERT(!(height < 0 || height > EUFS_MAX_FILE_TREE_HEIGHT));
if (!root)
return 0;
/* persisting a shrink is not supported */
if (old_size > new_size)
return 0;
old_nblocks = DIV_ROUND_UP(old_size, PAGE_SIZE);
new_nblocks = DIV_ROUND_UP(new_size, PAGE_SIZE);
if (old_nblocks == new_nblocks)
return 0;
proot = root;
if (old_nblocks == 1) {
/* the data itself needs no flush; with one block there is no structure */
if (new_nblocks == 1)
return 0;
if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(1)) {
eufs_persist_btree_node(root, 0, new_nblocks);
return 0;
}
if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(2)) {
eufs_persist_new_btree_h2(sb, root, 0, new_nblocks);
return 0;
}
if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(3)) {
eufs_persist_new_btree_h3(sb, root, 0, new_nblocks);
return 0;
}
} else if (old_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(1)) {
if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(1)) {
eufs_persist_btree_node(root, old_nblocks,
new_nblocks - old_nblocks);
return 0;
}
if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(2)) {
__le64 *p = s2p(sb, proot[0]);
eufs_persist_btree_node(p, old_nblocks,
EUFS_FILE_TREE_DEGREE -
old_nblocks);
eufs_persist_new_btree_h2(sb, proot, 1, new_nblocks);
return 0;
}
if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(3)) {
void *p;
pproot = root;
proot = s2p(sb, pproot[0]);
p = s2p(sb, proot[0]);
eufs_persist_btree_node(p, old_nblocks,
EUFS_FILE_TREE_DEGREE -
old_nblocks);
eufs_persist_new_btree_h2(
sb, proot, 1, EUFS_FILE_BCNT_WITH_HEIGHT(2));
eufs_persist_new_btree_h3(
sb, root, 1,
new_nblocks - EUFS_FILE_BCNT_WITH_HEIGHT(2));
return 0;
}
} else if (old_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(2)) {
if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(2)) {
eufs_persist_inc_btree_h2(sb, root, old_nblocks,
new_nblocks);
return 0;
}
if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(3)) {
pproot = root;
proot = s2p(sb, pproot[0]);
eufs_persist_inc_btree_h2(
sb, proot, old_nblocks,
EUFS_FILE_BCNT_WITH_HEIGHT(2));
eufs_persist_new_btree_h3(
sb, root, 1,
new_nblocks - EUFS_FILE_BCNT_WITH_HEIGHT(2));
return 0;
}
} else if (old_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(3)) {
if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(3)) {
eufs_persist_inc_btree_h3(sb, root, old_nblocks,
new_nblocks);
return 0;
}
}
BUG();
return 0;
}
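/*
 * Worked example for the dispatch above (a sketch, assuming 4 KiB blocks
 * and degree 512, so EUFS_FILE_BCNT_WITH_HEIGHT(1) == 512 and
 * EUFS_FILE_BCNT_WITH_HEIGHT(2) == 512 * 512): extending a fresh file
 * (old_size == 0 is treated as one block) to 3 MiB gives
 * new_nblocks == 768, which takes the eufs_persist_new_btree_h2 path.
 */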
static ssize_t do_mapping_read(struct address_space *mapping,
struct file_ra_state *_ra, struct file *filp,
char __user *buf, size_t len, loff_t *ppos)
{
struct inode *inode = mapping->host;
pgoff_t index, end_index;
unsigned long offset;
loff_t isize, pos;
size_t copied = 0;
ssize_t error = 0;
pos = *ppos;
index = pos >> PAGE_SHIFT;
offset = pos & ~PAGE_MASK;
isize = i_size_read(inode);
if (!isize)
goto out;
end_index = (isize - 1) >> PAGE_SHIFT;
do {
unsigned long nr, left;
void *xmem;
/* nr is the maximum number of bytes to copy from this page */
nr = PAGE_SIZE;
if (index >= end_index) {
if (index > end_index)
goto out;
nr = ((isize - 1) & ~PAGE_MASK) + 1;
if (nr <= offset)
goto out;
}
nr = nr - offset;
if (nr > len - copied)
nr = len - copied;
xmem = eufs_find_data_block_btree(inode, index, NULL);
BUG_ON(!eufs_access_ok(inode->i_sb, xmem, PAGE_SIZE));
BUG_ON(!xmem);
/*
* Ok, we have the mem, so now we can copy it to user space...
*
* The actor routine returns how many bytes were actually used..
* NOTE! This may not be the same as how much of a user buffer
* we filled up (we may be padding etc), so we can only update
* "pos" here (the actor routine has to update the user buffer
* pointers and the remaining count).
*/
if (xmem != NULL_ADDR_PTR)
left = __copy_to_user(buf + copied, xmem + offset, nr);
else
left = __clear_user(buf + copied, nr);
if (left) {
error = -EFAULT;
goto out;
}
copied += (nr - left);
offset += (nr - left);
index += offset >> PAGE_SHIFT;
offset &= ~PAGE_MASK;
} while (copied < len);
out:
*ppos = pos + copied;
if (filp)
file_accessed(filp);
return copied ? copied : error;
}
/*
* Wrappers. We take the inode lock in shared mode to avoid a
* concurrent truncate operation. Writes are no problem because they
* hold the inode lock exclusively.
*/
ssize_t eufs_file_read(struct file *filp, char __user *buf, size_t len,
loff_t *ppos)
{
ssize_t res;
inode_lock_shared(file_inode(filp));
if (!access_ok(buf, len))
res = -EFAULT;
else
res = do_mapping_read(filp->f_mapping, &filp->f_ra, filp, buf,
len, ppos);
inode_unlock_shared(file_inode(filp));
return res;
}
static __always_inline size_t memcpy_to_nvmm(char *kmem, loff_t offset,
const char __user *buf,
size_t bytes)
{
size_t copied;
if (support_clwb && !force_nocache_write) {
copied = bytes - __copy_from_user(kmem + offset, buf, bytes);
eufs_flush_buffer(kmem + offset, copied, 0);
} else {
copied = bytes - __copy_from_user_inatomic_nocache(
kmem + offset, buf, bytes);
}
return copied;
}
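/*
 * Design note (an inference from the branches above): with clwb
 * available, a cached copy plus an explicit flush is used; otherwise the
 * nocache (non-temporal) copy bypasses the cache, so no flush is needed,
 * only the caller's later store barrier.
 */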
ssize_t __eufs_file_write_inode(struct inode *inode, const char __user *buf,
size_t count, loff_t pos, loff_t *ppos,
bool zero, bool keep)
{
long status = 0;
size_t bytes;
ssize_t written = 0;
if (!count)
return 0;
eufs_dbg("file write: inode=%px count=%lx pos=%llx, zero=%d keep=%d\n",
inode, count, pos, zero, keep);
do {
unsigned long index;
unsigned long offset;
size_t copied;
__le64 *parent;
void __pmem *xmem;
void __pmem *xmem_new = NULL;
offset = (pos & (PAGE_SIZE - 1)); /* Within page */
index = pos >> PAGE_SHIFT;
bytes = PAGE_SIZE - offset;
if (bytes > count)
bytes = count;
xmem = eufs_find_data_block_btree(inode, index, &parent);
if (!eufs_access_ok(inode->i_sb, xmem, PAGE_SIZE)) {
dump_stack();
BUG();
}
/* do not do wear leveling for 0-level btrees */
if (xmem != NULL_ADDR_PTR && parent && !zero) {
/* wear threshold reached: migrate to a fresh block */
if (!wear_inc(inode->i_sb, xmem))
xmem_new = eufs_malloc_file_data(inode->i_sb);
}
if (zero) {
copied = bytes;
if (xmem != NULL_ADDR_PTR)
eufs_clear_pmem((char *)xmem + offset, bytes);
} else {
BUG_ON(xmem == NULL_ADDR_PTR);
copied = memcpy_to_nvmm((char *)xmem, offset, buf,
bytes);
}
if (xmem_new) {
struct eufs_inode_info *vi = EUFS_I(inode);
eufs_dbg(
"inode=%px pos=%llx xmem:[%px -> %px] weared\n",
inode, pos, xmem, xmem_new);
eufs_alloc_persist(inode->i_sb, xmem_new, true);
WARN_ON(xmem !=
s2p(inode->i_sb,
parent[index % EUFS_FILE_TREE_DEGREE]));
/*
* disable page faults, clear all related PTEs, and remove the
* dax entry from the radix tree before replacing the old block
*/
down_write(&vi->mmap_rwsem);
invalidate_inode_pages2_range(inode->i_mapping,
pos / PAGE_SIZE,
pos / PAGE_SIZE);
memcpy_to_nvmm(xmem_new, 0, xmem, PAGE_SIZE);
parent[index % EUFS_FILE_TREE_DEGREE] =
p2s(inode->i_sb, xmem_new);
up_write(&vi->mmap_rwsem);
eufs_flush_cacheline(
&parent[index % EUFS_FILE_TREE_DEGREE]);
eufs_pbarrier();
/*
* It is important to persist all previous allocations
* here. Otherwise, xmem might be freed before its
* information is handled in the page_batch, which would
* cause xmem to be marked as allocated (page_batch does
* this) while it is actually in the free list.
* xfstests/generic/299 can trigger this.
*/
eufs_alloc_batch_persist_reset(
inode->i_sb, &EUFS_I(inode)->page_batch);
nv_free_rest(inode->i_sb, xmem);
}
eufs_dbg(
"! file writing to pos=%ld xmem=%px, offset=%ld, buf=%px, bytes=%ld index=%ld, copied=%ld\n",
(long)pos, xmem, (long)offset, buf, (long)bytes,
(long)index, (long)copied);
if (likely(copied > 0)) {
written += copied;
count -= copied;
pos += copied;
buf += copied;
}
if (unlikely(copied != bytes)) {
status = -EFAULT;
break;
}
} while (count);
if (ppos)
*ppos = pos;
eufs_dbg("pos: %d inode->i_size: %d written: %d\n", (int)pos,
(int)inode->i_size, (int)written);
/*
* No need to use i_size_read() here, the i_size
* cannot change under us because we hold i_mutex.
*/
if (!keep && pos > inode->i_size)
eufs_inode_size_write(inode, pos);
return written ? written : status;
}
ssize_t __eufs_file_write(struct address_space *mapping,
const char __user *buf, size_t count, loff_t pos,
loff_t *ppos, bool zero, bool keep)
{
return __eufs_file_write_inode(mapping->host, buf, count, pos, ppos,
zero, keep);
}
ssize_t eufs_file_write(struct file *filp, const char __user *buf,
size_t len, loff_t *ppos)
{
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
struct eufs_inode_info *vi = EUFS_I(inode);
struct super_block *sb = inode->i_sb;
ssize_t written = 0;
loff_t pos;
size_t count, ret;
bool osync = false;
inode_lock(inode);
if (!access_ok(buf, len)) {
ret = -EFAULT;
goto out;
}
if (filp->f_flags & O_APPEND)
pos = inode->i_size;
else
pos = *ppos;
if (filp->f_flags & __O_SYNC)
osync = true;
count = len;
if (count == 0) {
ret = 0;
goto out;
}
NV_ASSERT(sb->s_blocksize == PAGE_SIZE);
ret = file_remove_privs(filp);
if (ret)
goto out;
inode->i_ctime = inode->i_mtime = current_time(inode);
/*
* It's a little tricky here. We should use mmap_rwsem to protect
* both the block allocation and the i_size update, but mmap_rwsem
* cannot be held during block writing because that would lead to a
* deadlock. We only use mmap_rwsem to protect the block allocation,
* and there are two reasons we can do that:
* 1. mmap fault takes mmap_rwsem before reading i_size, so it
* cannot observe the updated i_size before the allocation is done.
* 2. write only extends the block tree; it never removes or
* modifies existing block mappings.
*/
down_write(&vi->mmap_rwsem);
/*
* Possible cases for writing [pos~pos+len)
*
* Definitions
* EOF: the byte after last valid byte
* EOF-page: page contains EOF
* first: the page pos belongs to
* last: the page pos+len belongs to
* Macro EOP(p): the last byte of p's page
*
* IMPORTANT NOTICE: we do not guarantee that [EOF~EOP(EOF)] are
* zeroed! When we mmap a file, we will erase that (in DRAM) in the
* mmap syscall. This can concurrently happen with a write syscall
* which may cause consistency problems (especially when it's an
* append). Concurrent mmap-access and read-/write-access should be
* protected by the application.
*
* 1) EOF-page | first | last
* area-to-zero: [EOF~EOP(EOF)]
* 2) EOF-page=first| last
* area-to-zero: [EOF~pos) if EOF<pos
* 3) first | EOF-page | last
* area-to-zero: none
* 4) first | EOF-page=last
* area-to-zero: none
* 5) first | last | EOF-page
* area-to-zero: none
* And for ALL cases, if first/last page is a hole, we need to zero
* the part that will not be written in this write.
*/
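/*
 * Worked example for case 2 above (an illustration): with i_size == 100
 * (EOF at byte 100, so EOF-page == first) and a write at pos == 300,
 * bytes [100, 300) of the first page must be zeroed so that no stale
 * bytes are exposed between the old EOF and pos.
 */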
/* don't zero-out the allocated blocks */
ret = eufs_alloc_blocks_btree_for_write(inode, pos, count);
if (IS_ERR_VALUE(ret)) {
up_write(&vi->mmap_rwsem);
goto out;
}
/* If we decide to guarantee a zeroed file tail, we may use this snippet. */
/* zeroing part of the last block goes beyond the new EOF */
if (PAGE_ALIGN(pos + count) > PAGE_ALIGN(inode->i_size))
eufs_inode_zero_range(inode, pos + count,
PAGE_ALIGN(pos + count));
/*
* Zero the hole created by the write. The part of the hole that
* falls within the last page beyond EOF has already been zeroed,
* so only zero the remaining part.
*/
if (pos > inode->i_size) {
loff_t offset = inode->i_size & (PAGE_SIZE - 1);
if (offset || !inode->i_size) {
/*
* Zero EOF~EOP(EOF).
* This also satisfies case 2), since [EOP(EOF)+1~pos]
* are holes.
*/
eufs_inode_zero_range_len(inode, inode->i_size,
PAGE_SIZE - offset);
}
}
up_write(&vi->mmap_rwsem);
written = __eufs_file_write(mapping, buf, count, pos, ppos, false,
false);
if (written < 0 || written != count) {
eufs_dbg("write incomplete/failed: written %ld len %ld pos %llx\n",
written, count, pos);
}
if (osync) {
eufs_alloc_batch_persist_reset(sb, &EUFS_I(inode)->page_batch);
eufs_sync_pinode(inode, EUFS_PI(inode), false);
} else {
request_persistence(inode);
}
ret = written;
out:
inode_unlock(inode);
return ret;
}
static int eufs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned int flags, struct iomap *iomap,
struct iomap *src)
{
struct super_block *sb = inode->i_sb;
struct eufs_sb_info *sbi = sb->s_fs_info;
unsigned int blkbits = inode->i_blkbits;
unsigned long first_block = offset >> blkbits;
bool new = false;
void __pmem *xmem;
__le64 __pmem *parent;
eufs_dbg("fault: inode=%px addr=0x%llx rw=%d length=%lld\n", inode,
offset, flags & IOMAP_WRITE, length);
inode_leaf_lock(inode);
xmem = eufs_find_data_block_btree(inode, first_block, &parent);
/* allocate a new block for write */
if (xmem == NULL_ADDR_PTR && (flags & IOMAP_WRITE)) {
int ofs = first_block & (EUFS_FILE_TREE_DEGREE - 1);
/*
* We cannot use the normal allocation here because it can send
* IPIs to gather pages and blocks. So we need to use the
* non-blocking version, which uses reserved pages instead of
* gathering pages via IPI.
*/
xmem = eufs_zalloc_file_data(inode->i_sb);
if (!xmem) {
inode_leaf_unlock(inode);
return -ENOSPC;
}
eufs_alloc_persist(inode->i_sb, xmem, false);
/*
* the first block is preallocated during inode initialization,
* so parent should not be NULL when xmem is NULL_ADDR
*/
BUG_ON(!parent);
eufs_flush_page(xmem);
parent[ofs] = p2s(sb, xmem);
eufs_flush_cacheline(&parent[ofs]);
new = true;
}
inode_leaf_unlock(inode);
iomap->flags = 0;
iomap->bdev = inode->i_sb->s_bdev;
iomap->offset = (u64)first_block << blkbits;
iomap->dax_dev = sbi->s_dax_dev;
iomap->length = 1 << blkbits;
if (xmem == NULL_ADDR_PTR) {
iomap->type = IOMAP_HOLE;
iomap->addr = IOMAP_NULL_ADDR;
} else {
iomap->type = IOMAP_MAPPED;
iomap->addr = (xmem - sbi->virt_addr);
}
if (new)
iomap->flags |= IOMAP_F_NEW;
return 0;
}
static int eufs_iomap_end(struct inode *inode, loff_t offset, loff_t length,
ssize_t written, unsigned int flags,
struct iomap *iomap)
{
return 0;
}
const struct iomap_ops eufs_iomap_ops = {
.iomap_begin = eufs_iomap_begin,
.iomap_end = eufs_iomap_end,
};
static unsigned int eufs_dax_fault(struct vm_fault *vmf)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
struct eufs_inode_info *vi = EUFS_I(inode);
int ret;
if (vmf->flags & FAULT_FLAG_WRITE) {
sb_start_pagefault(inode->i_sb);
file_update_time(vmf->vma->vm_file);
}
/*
* i_size and the block tree must be consistent during mmap fault,
* else eulerfs may map to a freed block or a hole instead of an
* allocated block.
*
* Now i_rwsem is used to protect updates of i_size and the block
* tree, but it cannot be used in the mmap fault path, because a
* fault may be triggered in the middle of a write or read operation
* when the dst or src buffer is a mapped range of the same file,
* and that would deadlock on a second acquisition of the same
* lock (i_rwsem).
*
* So mmap_rwsem is provided. The read-lock will be used in mmap
* fault path, and the write-lock will be used in truncate &
* fallocate & write paths.
*/
down_read(&vi->mmap_rwsem);
ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &ret, &eufs_iomap_ops);
up_read(&vi->mmap_rwsem);
if (vmf->flags & FAULT_FLAG_WRITE)
sb_end_pagefault(inode->i_sb);
return ret;
}
const struct vm_operations_struct eufs_file_vm_ops = {
.fault = eufs_dax_fault,
.page_mkwrite = eufs_dax_fault,
.pfn_mkwrite = eufs_dax_fault,
};
int eufs_dax_file_mmap(struct file *file, struct vm_area_struct *vma)
{
file_accessed(file);
vma->vm_flags |= VM_MIXEDMAP;
vma->vm_ops = &eufs_file_vm_ops;
eufs_dbg("dax file mmaped!\n");
return 0;
}
static loff_t eufs_seek_block(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file_inode(file);
loff_t maxbytes = inode->i_sb->s_maxbytes;
pgoff_t pgofs;
loff_t data_ofs = offset, isize;
__le64 *parent;
void *addr;
unsigned int ofs;
inode_lock(inode);
isize = i_size_read(inode);
if (offset >= isize)
goto fail;
pgofs = (pgoff_t)(offset >> PAGE_SHIFT);
if (EUFS_I(inode)->hole_at_sta && pgofs == 0) {
if (whence == SEEK_HOLE)
goto found;
pgofs++;
data_ofs = (loff_t)pgofs << PAGE_SHIFT;
}
while (data_ofs < isize) {
addr = eufs_find_data_block_btree(inode, pgofs, &parent);
ofs = pgofs & (EUFS_FILE_TREE_DEGREE - 1);
while (ofs < EUFS_FILE_TREE_DEGREE && data_ofs < isize) {
if (parent)
addr = s2p(inode->i_sb, parent[ofs]);
if (addr == NULL_ADDR_PTR && whence == SEEK_HOLE)
goto found;
if (addr && addr != NULL_ADDR_PTR &&
whence == SEEK_DATA)
goto found;
ofs++;
pgofs++;
data_ofs = (loff_t)pgofs << PAGE_SHIFT;
}
}
if (whence == SEEK_DATA)
goto fail;
found:
if (whence == SEEK_HOLE && data_ofs > isize)
data_ofs = isize;
inode_unlock(inode);
return vfs_setpos(file, data_ofs, maxbytes);
fail:
inode_unlock(inode);
return -ENXIO;
}
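/*
 * Usage sketch from userspace (an illustration, not part of this file):
 * lseek(fd, 0, SEEK_DATA) lands on the first allocated block at or after
 * offset 0, while lseek(fd, 0, SEEK_HOLE) returns 0 when hole_at_sta is
 * set, i.e. when block 0 has never been written.
 */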
loff_t eufs_file_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file_inode(file);
loff_t maxbytes = inode->i_sb->s_maxbytes;
switch (whence) {
case SEEK_SET:
case SEEK_CUR:
case SEEK_END:
return generic_file_llseek_size(file, offset, whence, maxbytes,
i_size_read(inode));
case SEEK_DATA:
case SEEK_HOLE:
if (offset < 0)
return -ENXIO;
return eufs_seek_block(file, offset, whence);
}
return -EINVAL;
}