Merge tag 'bcachefs-2025-04-24' of git://evilpiepirate.org/bcachefs

Pull bcachefs fixes from Kent Overstreet:

 - Case insensitive directories now work

 - Fiemap now correctly reports unwritten pagecache data (see the
   userspace sketch after the commit list below)

 - bcachefs-tools 1.25.1 was incorrectly picking unaligned bucket sizes;
   fix the journal and write path bugs this uncovered

And assorted smaller fixes...

* tag 'bcachefs-2025-04-24' of git://evilpiepirate.org/bcachefs: (24 commits)
  bcachefs: Rework fiemap transaction restart handling
  bcachefs: add fiemap delalloc extent detection
  bcachefs: refactor fiemap processing into extent helper and struct
  bcachefs: track current fiemap offset in start variable
  bcachefs: drop duplicate fiemap sync flag
  bcachefs: Fix btree_iter_peek_prev() at end of inode
  bcachefs: Make btree_iter_peek_prev() assert more precise
  bcachefs: Unit test fixes
  bcachefs: Print mount opts earlier
  bcachefs: unlink: casefold d_invalidate
  bcachefs: Fix casefold lookups
  bcachefs: Casefold is now a regular opts.h option
  bcachefs: Implement fileattr_(get|set)
  bcachefs: Allocator now copes with unaligned buckets
  bcachefs: Start copygc, rebalance threads earlier
  bcachefs: Refactor bch2_run_recovery_passes()
  bcachefs: bch2_copygc_wakeup()
  bcachefs: Fix ref leak in write_super()
  bcachefs: Change __journal_entry_close() assert to ERO
  bcachefs: Ensure journal space is block size aligned
  ...
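
As a quick way to exercise the fiemap change above, here is a minimal
userspace sketch (illustrative only: the mount point and file name are
hypothetical; FS_IOC_FIEMAP and FIEMAP_EXTENT_DELALLOC are standard uapi):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(void)
{
	int fd = open("/mnt/bcachefs/testfile", O_RDWR | O_CREAT, 0644);
	if (fd < 0) { perror("open"); return 1; }

	/* dirty some pagecache without syncing it to disk: */
	if (write(fd, "hello", 5) != 5) { perror("write"); return 1; }

	struct fiemap *fm = calloc(1, sizeof(*fm) + sizeof(struct fiemap_extent));
	fm->fm_length = ~0ULL;
	fm->fm_extent_count = 1;	/* room for one extent in fm_extents[] */

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) { perror("fiemap"); return 1; }

	/* with this merge, dirty-but-unwritten data should show up: */
	if (fm->fm_mapped_extents &&
	    (fm->fm_extents[0].fe_flags & FIEMAP_EXTENT_DELALLOC))
		printf("extent 0 reported as delalloc\n");

	free(fm);
	close(fd);
	return 0;
}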
Linus Torvalds committed 2025-04-25 09:06:14 -07:00
32 changed files with 737 additions and 587 deletions

View File

@@ -1425,6 +1425,8 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
open_bucket_for_each(c, &wp->ptrs, ob, i)
wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
wp->sectors_free = rounddown(wp->sectors_free, block_sectors(c));
BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
return 0;
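
The hunk above rounds the write point's free space down to the block size;
the same alignment theme shows up again in the journal hunks below. A
standalone sketch (illustrative numbers, simplified rounddown()) of why the
unaligned buckets picked by bcachefs-tools 1.25.1 were a problem:

#include <stdio.h>

/* simplified integer rounddown, same result as the kernel macro: */
#define rounddown(x, y) (((x) / (y)) * (y))

int main(void)
{
	unsigned block_sectors = 8;	/* 4096-byte blocks, 512-byte sectors */
	unsigned sectors_free  = 1021;	/* tail of an unaligned bucket */

	/* writes and journal entries are sized in whole blocks, so only
	 * the block-aligned portion of the bucket is actually usable: */
	printf("usable %u of %u sectors\n",
	       rounddown(sectors_free, block_sectors), sectors_free);
	/* prints: usable 1016 of 1021 sectors */
	return 0;
}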

View File

@@ -110,7 +110,9 @@ static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct writ
unsigned i;
open_bucket_for_each(c, &wp->ptrs, ob, i)
ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob);
ob_push(c, ob->sectors_free < block_sectors(c)
? &ptrs
: &keep, ob);
wp->ptrs = keep;
mutex_unlock(&wp->lock);

View File

@@ -366,6 +366,10 @@ static inline void bkey_init(struct bkey *k)
#define __BKEY_PADDED(key, pad) \
struct bkey_i key; __u64 key ## _pad[pad]
enum bch_bkey_type_flags {
BKEY_TYPE_strict_btree_checks = BIT(0),
};
/*
* - DELETED keys are used internally to mark keys that should be ignored but
* override keys in composition order. Their version number is ignored.
@@ -383,46 +387,46 @@ static inline void bkey_init(struct bkey *k)
*
* - WHITEOUT: for hash table btrees
*/
#define BCH_BKEY_TYPES() \
x(deleted, 0) \
x(whiteout, 1) \
x(error, 2) \
x(cookie, 3) \
x(hash_whiteout, 4) \
x(btree_ptr, 5) \
x(extent, 6) \
x(reservation, 7) \
x(inode, 8) \
x(inode_generation, 9) \
x(dirent, 10) \
x(xattr, 11) \
x(alloc, 12) \
x(quota, 13) \
x(stripe, 14) \
x(reflink_p, 15) \
x(reflink_v, 16) \
x(inline_data, 17) \
x(btree_ptr_v2, 18) \
x(indirect_inline_data, 19) \
x(alloc_v2, 20) \
x(subvolume, 21) \
x(snapshot, 22) \
x(inode_v2, 23) \
x(alloc_v3, 24) \
x(set, 25) \
x(lru, 26) \
x(alloc_v4, 27) \
x(backpointer, 28) \
x(inode_v3, 29) \
x(bucket_gens, 30) \
x(snapshot_tree, 31) \
x(logged_op_truncate, 32) \
x(logged_op_finsert, 33) \
x(accounting, 34) \
x(inode_alloc_cursor, 35)
#define BCH_BKEY_TYPES() \
x(deleted, 0, 0) \
x(whiteout, 1, 0) \
x(error, 2, 0) \
x(cookie, 3, 0) \
x(hash_whiteout, 4, BKEY_TYPE_strict_btree_checks) \
x(btree_ptr, 5, BKEY_TYPE_strict_btree_checks) \
x(extent, 6, BKEY_TYPE_strict_btree_checks) \
x(reservation, 7, BKEY_TYPE_strict_btree_checks) \
x(inode, 8, BKEY_TYPE_strict_btree_checks) \
x(inode_generation, 9, BKEY_TYPE_strict_btree_checks) \
x(dirent, 10, BKEY_TYPE_strict_btree_checks) \
x(xattr, 11, BKEY_TYPE_strict_btree_checks) \
x(alloc, 12, BKEY_TYPE_strict_btree_checks) \
x(quota, 13, BKEY_TYPE_strict_btree_checks) \
x(stripe, 14, BKEY_TYPE_strict_btree_checks) \
x(reflink_p, 15, BKEY_TYPE_strict_btree_checks) \
x(reflink_v, 16, BKEY_TYPE_strict_btree_checks) \
x(inline_data, 17, BKEY_TYPE_strict_btree_checks) \
x(btree_ptr_v2, 18, BKEY_TYPE_strict_btree_checks) \
x(indirect_inline_data, 19, BKEY_TYPE_strict_btree_checks) \
x(alloc_v2, 20, BKEY_TYPE_strict_btree_checks) \
x(subvolume, 21, BKEY_TYPE_strict_btree_checks) \
x(snapshot, 22, BKEY_TYPE_strict_btree_checks) \
x(inode_v2, 23, BKEY_TYPE_strict_btree_checks) \
x(alloc_v3, 24, BKEY_TYPE_strict_btree_checks) \
x(set, 25, 0) \
x(lru, 26, BKEY_TYPE_strict_btree_checks) \
x(alloc_v4, 27, BKEY_TYPE_strict_btree_checks) \
x(backpointer, 28, BKEY_TYPE_strict_btree_checks) \
x(inode_v3, 29, BKEY_TYPE_strict_btree_checks) \
x(bucket_gens, 30, BKEY_TYPE_strict_btree_checks) \
x(snapshot_tree, 31, BKEY_TYPE_strict_btree_checks) \
x(logged_op_truncate, 32, BKEY_TYPE_strict_btree_checks) \
x(logged_op_finsert, 33, BKEY_TYPE_strict_btree_checks) \
x(accounting, 34, BKEY_TYPE_strict_btree_checks) \
x(inode_alloc_cursor, 35, BKEY_TYPE_strict_btree_checks)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
#define x(name, nr, ...) KEY_TYPE_##name = nr,
BCH_BKEY_TYPES()
#undef x
KEY_TYPE_MAX,
@@ -863,6 +867,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4);
LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14);
LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20);
LE64_BITMASK(BCH_SB_CASEFOLD, struct bch_sb, flags[6], 22, 23);
static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{

View File

@@ -21,7 +21,7 @@
#include "xattr.h"
const char * const bch2_bkey_types[] = {
#define x(name, nr) #name,
#define x(name, nr, ...) #name,
BCH_BKEY_TYPES()
#undef x
NULL
@@ -115,7 +115,7 @@ static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_
})
const struct bkey_ops bch2_bkey_ops[] = {
#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
#define x(name, nr, ...) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
BCH_BKEY_TYPES()
#undef x
};
@@ -155,6 +155,12 @@ static u64 bch2_key_types_allowed[] = {
#undef x
};
static const enum bch_bkey_type_flags bch2_bkey_type_flags[] = {
#define x(name, nr, flags) [KEY_TYPE_##name] = flags,
BCH_BKEY_TYPES()
#undef x
};
const char *bch2_btree_node_type_str(enum btree_node_type type)
{
return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1);
@@ -177,8 +183,18 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
if (type >= BKEY_TYPE_NR)
return 0;
bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX &&
(type == BKEY_TYPE_btree || (from.flags & BCH_VALIDATE_commit)) &&
enum bch_bkey_type_flags bkey_flags = k.k->type < KEY_TYPE_MAX
? bch2_bkey_type_flags[k.k->type]
: 0;
bool strict_key_type_allowed =
(from.flags & BCH_VALIDATE_commit) ||
type == BKEY_TYPE_btree ||
(from.btree < BTREE_ID_NR &&
(bkey_flags & BKEY_TYPE_strict_btree_checks));
bkey_fsck_err_on(strict_key_type_allowed &&
k.k->type < KEY_TYPE_MAX &&
!(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)),
c, bkey_invalid_type_for_btree,
"invalid key type for btree %s (%s)",

View File

@@ -2577,7 +2577,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct
struct bpos end)
{
if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) &&
!bkey_eq(iter->pos, POS_MAX)) {
!bkey_eq(iter->pos, POS_MAX) &&
!((iter->flags & BTREE_ITER_is_extents) &&
iter->pos.offset == U64_MAX)) {
/*
* bkey_start_pos(), for extents, is not monotonically
* increasing until after filtering for snapshots:
@@ -2602,7 +2605,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct
bch2_trans_verify_not_unlocked_or_in_restart(trans);
bch2_btree_iter_verify_entry_exit(iter);
EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN));
EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && iter->pos.inode != end.inode);
int ret = trans_maybe_inject_restart(trans, _RET_IP_);
if (unlikely(ret)) {

View File

@@ -13,8 +13,8 @@
#include <linux/dcache.h>
static int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info,
const struct qstr *str, struct qstr *out_cf)
int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info,
const struct qstr *str, struct qstr *out_cf)
{
*out_cf = (struct qstr) QSTR_INIT(NULL, 0);
@@ -35,18 +35,6 @@ static int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *
#endif
}
static inline int bch2_maybe_casefold(struct btree_trans *trans,
const struct bch_hash_info *info,
const struct qstr *str, struct qstr *out_cf)
{
if (likely(!info->cf_encoding)) {
*out_cf = *str;
return 0;
} else {
return bch2_casefold(trans, info, str, out_cf);
}
}
static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
{
if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name))

View File

@@ -23,6 +23,21 @@ struct bch_fs;
struct bch_hash_info;
struct bch_inode_info;
int bch2_casefold(struct btree_trans *, const struct bch_hash_info *,
const struct qstr *, struct qstr *);
static inline int bch2_maybe_casefold(struct btree_trans *trans,
const struct bch_hash_info *info,
const struct qstr *str, struct qstr *out_cf)
{
if (likely(!info->cf_encoding)) {
*out_cf = *str;
return 0;
} else {
return bch2_casefold(trans, info, str, out_cf);
}
}
struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d);
static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len)

View File

@@ -272,9 +272,6 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c,
{
struct fsck_err_state *s;
if (!test_bit(BCH_FS_fsck_running, &c->flags))
return NULL;
list_for_each_entry(s, &c->fsck_error_msgs, list)
if (s->id == id) {
/*
@@ -639,14 +636,14 @@ int __bch2_bkey_fsck_err(struct bch_fs *c,
return ret;
}
void bch2_flush_fsck_errs(struct bch_fs *c)
static void __bch2_flush_fsck_errs(struct bch_fs *c, bool print)
{
struct fsck_err_state *s, *n;
mutex_lock(&c->fsck_error_msgs_lock);
list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) {
if (s->ratelimited && s->last_msg)
if (print && s->ratelimited && s->last_msg)
bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg);
list_del(&s->list);
@@ -657,6 +654,16 @@ void bch2_flush_fsck_errs(struct bch_fs *c)
mutex_unlock(&c->fsck_error_msgs_lock);
}
void bch2_flush_fsck_errs(struct bch_fs *c)
{
__bch2_flush_fsck_errs(c, true);
}
void bch2_free_fsck_errs(struct bch_fs *c)
{
__bch2_flush_fsck_errs(c, false);
}
int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
subvol_inum inum, u64 offset)
{

View File

@@ -93,6 +93,7 @@ int __bch2_fsck_err(struct bch_fs *, struct btree_trans *,
_flags, BCH_FSCK_ERR_##_err_type, __VA_ARGS__)
void bch2_flush_fsck_errs(struct bch_fs *);
void bch2_free_fsck_errs(struct bch_fs *);
#define fsck_err_wrap(_do) \
({ \

View File

@@ -21,206 +21,6 @@
#define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
#define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
struct flags_set {
unsigned mask;
unsigned flags;
unsigned projid;
bool set_projinherit;
bool projinherit;
};
static int bch2_inode_flags_set(struct btree_trans *trans,
struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
void *p)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
/*
* We're relying on btree locking here for exclusion with other ioctl
* calls - use the flags in the btree (@bi), not inode->i_flags:
*/
struct flags_set *s = p;
unsigned newflags = s->flags;
unsigned oldflags = bi->bi_flags & s->mask;
if (((newflags ^ oldflags) & (BCH_INODE_append|BCH_INODE_immutable)) &&
!capable(CAP_LINUX_IMMUTABLE))
return -EPERM;
if (!S_ISREG(bi->bi_mode) &&
!S_ISDIR(bi->bi_mode) &&
(newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags)
return -EINVAL;
if ((newflags ^ oldflags) & BCH_INODE_casefolded) {
#ifdef CONFIG_UNICODE
int ret = 0;
/* Not supported on individual files. */
if (!S_ISDIR(bi->bi_mode))
return -EOPNOTSUPP;
/*
* Make sure the dir is empty, as otherwise we'd need to
* rehash everything and update the dirent keys.
*/
ret = bch2_empty_dir_trans(trans, inode_inum(inode));
if (ret < 0)
return ret;
ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding);
if (ret)
return ret;
bch2_check_set_feature(c, BCH_FEATURE_casefolding);
#else
printk(KERN_ERR "Cannot use casefolding on a kernel without CONFIG_UNICODE\n");
return -EOPNOTSUPP;
#endif
}
if (s->set_projinherit) {
bi->bi_fields_set &= ~(1 << Inode_opt_project);
bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project);
}
bi->bi_flags &= ~s->mask;
bi->bi_flags |= newflags;
bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v));
return 0;
}
static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg)
{
unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags);
return put_user(flags, arg);
}
static int bch2_ioc_setflags(struct bch_fs *c,
struct file *file,
struct bch_inode_info *inode,
void __user *arg)
{
struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) };
unsigned uflags;
int ret;
if (get_user(uflags, (int __user *) arg))
return -EFAULT;
s.flags = map_flags_rev(bch_flags_to_uflags, uflags);
if (uflags)
return -EOPNOTSUPP;
ret = mnt_want_write_file(file);
if (ret)
return ret;
inode_lock(&inode->v);
if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
ret = -EACCES;
goto setflags_out;
}
mutex_lock(&inode->ei_update_lock);
ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
bch2_write_inode(c, inode, bch2_inode_flags_set, &s,
ATTR_CTIME);
mutex_unlock(&inode->ei_update_lock);
setflags_out:
inode_unlock(&inode->v);
mnt_drop_write_file(file);
return ret;
}
static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
struct fsxattr __user *arg)
{
struct fsxattr fa = { 0 };
fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project))
fa.fsx_xflags |= FS_XFLAG_PROJINHERIT;
fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];
if (copy_to_user(arg, &fa, sizeof(fa)))
return -EFAULT;
return 0;
}
static int fssetxattr_inode_update_fn(struct btree_trans *trans,
struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
void *p)
{
struct flags_set *s = p;
if (s->projid != bi->bi_project) {
bi->bi_fields_set |= 1U << Inode_opt_project;
bi->bi_project = s->projid;
}
return bch2_inode_flags_set(trans, inode, bi, p);
}
static int bch2_ioc_fssetxattr(struct bch_fs *c,
struct file *file,
struct bch_inode_info *inode,
struct fsxattr __user *arg)
{
struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) };
struct fsxattr fa;
int ret;
if (copy_from_user(&fa, arg, sizeof(fa)))
return -EFAULT;
s.set_projinherit = true;
s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0;
fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT;
s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags);
if (fa.fsx_xflags)
return -EOPNOTSUPP;
if (fa.fsx_projid >= U32_MAX)
return -EINVAL;
/*
* inode fields accessible via the xattr interface are stored with a +1
* bias, so that 0 means unset:
*/
s.projid = fa.fsx_projid + 1;
ret = mnt_want_write_file(file);
if (ret)
return ret;
inode_lock(&inode->v);
if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
ret = -EACCES;
goto err;
}
mutex_lock(&inode->ei_update_lock);
ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
bch2_set_projid(c, inode, fa.fsx_projid) ?:
bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
ATTR_CTIME);
mutex_unlock(&inode->ei_update_lock);
err:
inode_unlock(&inode->v);
mnt_drop_write_file(file);
return ret;
}
static int bch2_reinherit_attrs_fn(struct btree_trans *trans,
struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
@@ -558,23 +358,6 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
long ret;
switch (cmd) {
case FS_IOC_GETFLAGS:
ret = bch2_ioc_getflags(inode, (int __user *) arg);
break;
case FS_IOC_SETFLAGS:
ret = bch2_ioc_setflags(c, file, inode, (int __user *) arg);
break;
case FS_IOC_FSGETXATTR:
ret = bch2_ioc_fsgetxattr(inode, (void __user *) arg);
break;
case FS_IOC_FSSETXATTR:
ret = bch2_ioc_fssetxattr(c, file, inode,
(void __user *) arg);
break;
case BCHFS_IOC_REINHERIT_ATTRS:
ret = bch2_ioc_reinherit_attrs(c, file, inode,
(void __user *) arg);

View File

@@ -2,81 +2,6 @@
#ifndef _BCACHEFS_FS_IOCTL_H
#define _BCACHEFS_FS_IOCTL_H
/* Inode flags: */
/* bcachefs inode flags -> vfs inode flags: */
static const __maybe_unused unsigned bch_flags_to_vfs[] = {
[__BCH_INODE_sync] = S_SYNC,
[__BCH_INODE_immutable] = S_IMMUTABLE,
[__BCH_INODE_append] = S_APPEND,
[__BCH_INODE_noatime] = S_NOATIME,
[__BCH_INODE_casefolded] = S_CASEFOLD,
};
/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
static const __maybe_unused unsigned bch_flags_to_uflags[] = {
[__BCH_INODE_sync] = FS_SYNC_FL,
[__BCH_INODE_immutable] = FS_IMMUTABLE_FL,
[__BCH_INODE_append] = FS_APPEND_FL,
[__BCH_INODE_nodump] = FS_NODUMP_FL,
[__BCH_INODE_noatime] = FS_NOATIME_FL,
[__BCH_INODE_casefolded] = FS_CASEFOLD_FL,
};
/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
static const __maybe_unused unsigned bch_flags_to_xflags[] = {
[__BCH_INODE_sync] = FS_XFLAG_SYNC,
[__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE,
[__BCH_INODE_append] = FS_XFLAG_APPEND,
[__BCH_INODE_nodump] = FS_XFLAG_NODUMP,
[__BCH_INODE_noatime] = FS_XFLAG_NOATIME,
//[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT;
};
#define set_flags(_map, _in, _out) \
do { \
unsigned _i; \
\
for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
if ((_in) & (1 << _i)) \
(_out) |= _map[_i]; \
else \
(_out) &= ~_map[_i]; \
} while (0)
#define map_flags(_map, _in) \
({ \
unsigned _out = 0; \
\
set_flags(_map, _in, _out); \
_out; \
})
#define map_flags_rev(_map, _in) \
({ \
unsigned _i, _out = 0; \
\
for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
if ((_in) & _map[_i]) { \
(_out) |= 1 << _i; \
(_in) &= ~_map[_i]; \
} \
(_out); \
})
#define map_defined(_map) \
({ \
unsigned _in = ~0; \
\
map_flags_rev(_map, _in); \
})
/* Set VFS inode flags from bcachefs inode: */
static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
{
set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
}
long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long);
long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long);

View File

@@ -33,6 +33,7 @@
#include <linux/backing-dev.h>
#include <linux/exportfs.h>
#include <linux/fiemap.h>
#include <linux/fileattr.h>
#include <linux/fs_context.h>
#include <linux/module.h>
#include <linux/pagemap.h>
@@ -51,6 +52,22 @@ static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *,
struct bch_subvolume *);
/* Set VFS inode flags from bcachefs inode: */
static inline void bch2_inode_flags_to_vfs(struct bch_fs *c, struct bch_inode_info *inode)
{
static const __maybe_unused unsigned bch_flags_to_vfs[] = {
[__BCH_INODE_sync] = S_SYNC,
[__BCH_INODE_immutable] = S_IMMUTABLE,
[__BCH_INODE_append] = S_APPEND,
[__BCH_INODE_noatime] = S_NOATIME,
};
set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
if (bch2_inode_casefold(c, &inode->ei_inode))
inode->v.i_flags |= S_CASEFOLD;
}
void bch2_inode_update_after_write(struct btree_trans *trans,
struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
@@ -79,7 +96,7 @@ void bch2_inode_update_after_write(struct btree_trans *trans,
inode->ei_inode = *bi;
bch2_inode_flags_to_vfs(inode);
bch2_inode_flags_to_vfs(c, inode);
}
int __must_check bch2_write_inode(struct bch_fs *c,
@@ -631,13 +648,18 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
const struct qstr *name)
{
struct bch_fs *c = trans->c;
struct btree_iter dirent_iter = {};
subvol_inum inum = {};
struct printbuf buf = PRINTBUF;
struct qstr lookup_name;
int ret = bch2_maybe_casefold(trans, dir_hash_info, name, &lookup_name);
if (ret)
return ERR_PTR(ret);
struct btree_iter dirent_iter = {};
struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
dir_hash_info, dir, name, 0);
int ret = bkey_err(k);
dir_hash_info, dir, &lookup_name, 0);
ret = bkey_err(k);
if (ret)
return ERR_PTR(ret);
@@ -825,6 +847,11 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
*/
set_nlink(&inode->v, 0);
}
if (IS_CASEFOLDED(vdir)) {
d_invalidate(dentry);
d_prune_aliases(&inode->v);
}
err:
bch2_trans_put(trans);
bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
@@ -1235,10 +1262,20 @@ static int bch2_tmpfile(struct mnt_idmap *idmap,
return finish_open_simple(file, 0);
}
struct bch_fiemap_extent {
struct bkey_buf kbuf;
unsigned flags;
};
static int bch2_fill_extent(struct bch_fs *c,
struct fiemap_extent_info *info,
struct bkey_s_c k, unsigned flags)
struct bch_fiemap_extent *fe)
{
struct bkey_s_c k = bkey_i_to_s_c(fe->kbuf.k);
unsigned flags = fe->flags;
BUG_ON(!k.k->size);
if (bkey_extent_is_direct_data(k.k)) {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
@@ -1291,110 +1328,223 @@ static int bch2_fill_extent(struct bch_fs *c,
}
}
/*
* Scan a range of an inode for data in pagecache.
*
* Intended to be retryable, so don't modify the output params until success is
* imminent.
*/
static int
bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end,
bool nonblock)
{
loff_t dstart, dend;
dstart = bch2_seek_pagecache_data(vinode, *start, *end, 0, nonblock);
if (dstart < 0)
return dstart;
if (dstart == *end) {
*start = dstart;
return 0;
}
dend = bch2_seek_pagecache_hole(vinode, dstart, *end, 0, nonblock);
if (dend < 0)
return dend;
/* race */
BUG_ON(dstart == dend);
*start = dstart;
*end = dend;
return 0;
}
/*
* Scan a range of pagecache that corresponds to a file mapping hole in the
* extent btree. If data is found, fake up an extent key so it looks like a
* delalloc extent to the rest of the fiemap processing code.
*/
static int
bch2_next_fiemap_pagecache_extent(struct btree_trans *trans, struct bch_inode_info *inode,
u64 start, u64 end, struct bch_fiemap_extent *cur)
{
struct bch_fs *c = trans->c;
struct bkey_i_extent *delextent;
struct bch_extent_ptr ptr = {};
loff_t dstart = start << 9, dend = end << 9;
int ret;
/*
* We hold btree locks here so we cannot block on folio locks without
* dropping trans locks first. Run a nonblocking scan for the common
* case of no folios over holes and fall back on failure.
*
* Note that dropping locks like this is technically racy against
* writeback inserting to the extent tree, but a non-sync fiemap scan is
* fundamentally racy with writeback anyways. Therefore, just report the
* range as delalloc regardless of whether we have to cycle trans locks.
*/
ret = bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, true);
if (ret == -EAGAIN)
ret = drop_locks_do(trans,
bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, false));
if (ret < 0)
return ret;
/*
* Create a fake extent key in the buffer. We have to add a dummy extent
* pointer for the fill code to add an extent entry. It's explicitly
* zeroed to reflect delayed allocation (i.e. phys offset 0).
*/
bch2_bkey_buf_realloc(&cur->kbuf, c, sizeof(*delextent) / sizeof(u64));
delextent = bkey_extent_init(cur->kbuf.k);
delextent->k.p = POS(inode->ei_inum.inum, dend >> 9);
delextent->k.size = (dend - dstart) >> 9;
bch2_bkey_append_ptr(&delextent->k_i, ptr);
cur->flags = FIEMAP_EXTENT_DELALLOC;
return 0;
}
static int bch2_next_fiemap_extent(struct btree_trans *trans,
struct bch_inode_info *inode,
u64 start, u64 end,
struct bch_fiemap_extent *cur)
{
u32 snapshot;
int ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot);
if (ret)
return ret;
struct btree_iter iter;
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
SPOS(inode->ei_inum.inum, start, snapshot), 0);
struct bkey_s_c k =
bch2_btree_iter_peek_max(trans, &iter, POS(inode->ei_inum.inum, end));
ret = bkey_err(k);
if (ret)
goto err;
ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, end, cur);
if (ret)
goto err;
struct bpos pagecache_start = bkey_start_pos(&cur->kbuf.k->k);
/*
* Does the pagecache or the btree take precedence?
*
* It _should_ be the pagecache, so that we correctly report delalloc
* extents when dirty in the pagecache (we're COW, after all).
*
* But we'd have to add per-sector writeback tracking to
* bch_folio_state, otherwise we report delalloc extents for clean
* cached data in the pagecache.
*
* We should do this, but even then fiemap won't report stable mappings:
* on bcachefs data moves around in the background (copygc, rebalance)
* and we don't provide a way for userspace to lock that out.
*/
if (k.k &&
bkey_le(bpos_max(iter.pos, bkey_start_pos(k.k)),
pagecache_start)) {
bch2_bkey_buf_reassemble(&cur->kbuf, trans->c, k);
bch2_cut_front(iter.pos, cur->kbuf.k);
bch2_cut_back(POS(inode->ei_inum.inum, end), cur->kbuf.k);
cur->flags = 0;
} else if (k.k) {
bch2_cut_back(bkey_start_pos(k.k), cur->kbuf.k);
}
if (cur->kbuf.k->k.type == KEY_TYPE_reflink_p) {
unsigned sectors = cur->kbuf.k->k.size;
s64 offset_into_extent = 0;
enum btree_id data_btree = BTREE_ID_extents;
int ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent,
&cur->kbuf);
if (ret)
goto err;
struct bkey_i *k = cur->kbuf.k;
sectors = min_t(unsigned, sectors, k->k.size - offset_into_extent);
bch2_cut_front(POS(k->k.p.inode,
bkey_start_offset(&k->k) + offset_into_extent),
k);
bch2_key_resize(&k->k, sectors);
k->k.p = iter.pos;
k->k.p.offset += k->k.size;
}
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
u64 start, u64 len)
{
struct bch_fs *c = vinode->i_sb->s_fs_info;
struct bch_inode_info *ei = to_bch_ei(vinode);
struct btree_trans *trans;
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_buf cur, prev;
bool have_extent = false;
struct bch_fiemap_extent cur, prev;
int ret = 0;
ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
ret = fiemap_prep(&ei->v, info, start, &len, 0);
if (ret)
return ret;
struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
if (start + len < start)
return -EINVAL;
start >>= 9;
u64 end = (start + len) >> 9;
bch2_bkey_buf_init(&cur.kbuf);
bch2_bkey_buf_init(&prev.kbuf);
bkey_init(&prev.kbuf.k->k);
bch2_bkey_buf_init(&cur);
bch2_bkey_buf_init(&prev);
trans = bch2_trans_get(c);
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
POS(ei->v.i_ino, start), 0);
while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
enum btree_id data_btree = BTREE_ID_extents;
bch2_trans_begin(trans);
u32 snapshot;
ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
while (start < end) {
ret = lockrestart_do(trans,
bch2_next_fiemap_extent(trans, ei, start, end, &cur));
if (ret)
continue;
goto err;
bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
BUG_ON(bkey_start_offset(&cur.kbuf.k->k) < start);
BUG_ON(cur.kbuf.k->k.p.offset > end);
k = bch2_btree_iter_peek_max(trans, &iter, end);
ret = bkey_err(k);
if (ret)
continue;
if (!k.k)
if (bkey_start_offset(&cur.kbuf.k->k) == end)
break;
if (!bkey_extent_is_data(k.k) &&
k.k->type != KEY_TYPE_reservation) {
bch2_btree_iter_advance(trans, &iter);
continue;
}
start = cur.kbuf.k->k.p.offset;
s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
unsigned sectors = k.k->size - offset_into_extent;
bch2_bkey_buf_reassemble(&cur, c, k);
ret = bch2_read_indirect_extent(trans, &data_btree,
&offset_into_extent, &cur);
if (ret)
continue;
k = bkey_i_to_s_c(cur.k);
bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
bch2_cut_front(POS(k.k->p.inode,
bkey_start_offset(k.k) +
offset_into_extent),
cur.k);
bch2_key_resize(&cur.k->k, sectors);
cur.k->k.p = iter.pos;
cur.k->k.p.offset += cur.k->k.size;
if (have_extent) {
if (!bkey_deleted(&prev.kbuf.k->k)) {
bch2_trans_unlock(trans);
ret = bch2_fill_extent(c, info,
bkey_i_to_s_c(prev.k), 0);
ret = bch2_fill_extent(c, info, &prev);
if (ret)
break;
goto err;
}
bkey_copy(prev.k, cur.k);
have_extent = true;
bch2_btree_iter_set_pos(trans, &iter,
POS(iter.pos.inode, iter.pos.offset + sectors));
bch2_bkey_buf_copy(&prev.kbuf, c, cur.kbuf.k);
prev.flags = cur.flags;
}
bch2_trans_iter_exit(trans, &iter);
if (!ret && have_extent) {
if (!bkey_deleted(&prev.kbuf.k->k)) {
bch2_trans_unlock(trans);
ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
FIEMAP_EXTENT_LAST);
prev.flags |= FIEMAP_EXTENT_LAST;
ret = bch2_fill_extent(c, info, &prev);
}
err:
bch2_trans_put(trans);
bch2_bkey_buf_exit(&cur, c);
bch2_bkey_buf_exit(&prev, c);
return ret < 0 ? ret : 0;
bch2_bkey_buf_exit(&cur.kbuf, c);
bch2_bkey_buf_exit(&prev.kbuf, c);
return bch2_err_class(ret < 0 ? ret : 0);
}
static const struct vm_operations_struct bch_vm_ops = {
@@ -1449,6 +1599,165 @@ static int bch2_open(struct inode *vinode, struct file *file)
return generic_file_open(vinode, file);
}
/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
static const __maybe_unused unsigned bch_flags_to_uflags[] = {
[__BCH_INODE_sync] = FS_SYNC_FL,
[__BCH_INODE_immutable] = FS_IMMUTABLE_FL,
[__BCH_INODE_append] = FS_APPEND_FL,
[__BCH_INODE_nodump] = FS_NODUMP_FL,
[__BCH_INODE_noatime] = FS_NOATIME_FL,
};
/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
static const __maybe_unused unsigned bch_flags_to_xflags[] = {
[__BCH_INODE_sync] = FS_XFLAG_SYNC,
[__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE,
[__BCH_INODE_append] = FS_XFLAG_APPEND,
[__BCH_INODE_nodump] = FS_XFLAG_NODUMP,
[__BCH_INODE_noatime] = FS_XFLAG_NOATIME,
};
static int bch2_fileattr_get(struct dentry *dentry,
struct fileattr *fa)
{
struct bch_inode_info *inode = to_bch_ei(d_inode(dentry));
struct bch_fs *c = inode->v.i_sb->s_fs_info;
fileattr_fill_xflags(fa, map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags));
if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project))
fa->fsx_xflags |= FS_XFLAG_PROJINHERIT;
if (bch2_inode_casefold(c, &inode->ei_inode))
fa->flags |= FS_CASEFOLD_FL;
fa->fsx_projid = inode->ei_qid.q[QTYP_PRJ];
return 0;
}
struct flags_set {
unsigned mask;
unsigned flags;
unsigned projid;
bool set_project;
bool set_casefold;
bool casefold;
};
static int fssetxattr_inode_update_fn(struct btree_trans *trans,
struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
void *p)
{
struct bch_fs *c = trans->c;
struct flags_set *s = p;
/*
* We're relying on btree locking here for exclusion with other ioctl
* calls - use the flags in the btree (@bi), not inode->i_flags:
*/
if (!S_ISREG(bi->bi_mode) &&
!S_ISDIR(bi->bi_mode) &&
(s->flags & (BCH_INODE_nodump|BCH_INODE_noatime)) != s->flags)
return -EINVAL;
if (s->casefold != bch2_inode_casefold(c, bi)) {
#ifdef CONFIG_UNICODE
int ret = 0;
/* Not supported on individual files. */
if (!S_ISDIR(bi->bi_mode))
return -EOPNOTSUPP;
/*
* Make sure the dir is empty, as otherwise we'd need to
* rehash everything and update the dirent keys.
*/
ret = bch2_empty_dir_trans(trans, inode_inum(inode));
if (ret < 0)
return ret;
ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding);
if (ret)
return ret;
bch2_check_set_feature(c, BCH_FEATURE_casefolding);
bi->bi_casefold = s->casefold + 1;
bi->bi_fields_set |= BIT(Inode_opt_casefold);
#else
printk(KERN_ERR "Cannot use casefolding on a kernel without CONFIG_UNICODE\n");
return -EOPNOTSUPP;
#endif
}
if (s->set_project) {
bi->bi_project = s->projid;
bi->bi_fields_set |= BIT(Inode_opt_project);
}
bi->bi_flags &= ~s->mask;
bi->bi_flags |= s->flags;
bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v));
return 0;
}
static int bch2_fileattr_set(struct mnt_idmap *idmap,
struct dentry *dentry,
struct fileattr *fa)
{
struct bch_inode_info *inode = to_bch_ei(d_inode(dentry));
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct flags_set s = {};
int ret;
if (fa->fsx_valid) {
fa->fsx_xflags &= ~FS_XFLAG_PROJINHERIT;
s.mask = map_defined(bch_flags_to_xflags);
s.flags |= map_flags_rev(bch_flags_to_xflags, fa->fsx_xflags);
if (fa->fsx_xflags)
return -EOPNOTSUPP;
if (fa->fsx_projid >= U32_MAX)
return -EINVAL;
/*
* inode fields accessible via the xattr interface are stored with a +1
* bias, so that 0 means unset:
*/
if ((inode->ei_inode.bi_project ||
fa->fsx_projid) &&
inode->ei_inode.bi_project != fa->fsx_projid + 1) {
s.projid = fa->fsx_projid + 1;
s.set_project = true;
}
}
if (fa->flags_valid) {
s.mask = map_defined(bch_flags_to_uflags);
s.set_casefold = true;
s.casefold = (fa->flags & FS_CASEFOLD_FL) != 0;
fa->flags &= ~FS_CASEFOLD_FL;
s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags);
if (fa->flags)
return -EOPNOTSUPP;
}
mutex_lock(&inode->ei_update_lock);
ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
(s.set_project
? bch2_set_projid(c, inode, fa->fsx_projid)
: 0) ?:
bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
ATTR_CTIME);
mutex_unlock(&inode->ei_update_lock);
return ret;
}
static const struct file_operations bch_file_operations = {
.open = bch2_open,
.llseek = bch2_llseek,
@@ -1476,6 +1785,8 @@ static const struct inode_operations bch_file_inode_operations = {
.get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
.fileattr_get = bch2_fileattr_get,
.fileattr_set = bch2_fileattr_set,
};
static const struct inode_operations bch_dir_inode_operations = {
@@ -1496,6 +1807,8 @@ static const struct inode_operations bch_dir_inode_operations = {
.get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
.fileattr_get = bch2_fileattr_get,
.fileattr_set = bch2_fileattr_set,
};
static const struct file_operations bch_dir_file_operations = {
@@ -1518,6 +1831,8 @@ static const struct inode_operations bch_symlink_inode_operations = {
.get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
.fileattr_get = bch2_fileattr_get,
.fileattr_set = bch2_fileattr_set,
};
static const struct inode_operations bch_special_inode_operations = {
@@ -1528,6 +1843,8 @@ static const struct inode_operations bch_special_inode_operations = {
.get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
.fileattr_get = bch2_fileattr_get,
.fileattr_set = bch2_fileattr_set,
};
static const struct address_space_operations bch_address_space_operations = {

View File

@@ -243,6 +243,14 @@ static inline unsigned bkey_inode_mode(struct bkey_s_c k)
}
}
static inline bool bch2_inode_casefold(struct bch_fs *c, const struct bch_inode_unpacked *bi)
{
/* inode opts are stored with a +1 bias: 0 means "unset, use fs opt" */
return bi->bi_casefold
? bi->bi_casefold - 1
: c->opts.casefold;
}
/* i_nlink: */
static inline unsigned nlink_bias(umode_t mode)

View File

@@ -103,7 +103,8 @@ struct bch_inode_generation {
x(bi_parent_subvol, 32) \
x(bi_nocow, 8) \
x(bi_depth, 32) \
x(bi_inodes_32bit, 8)
x(bi_inodes_32bit, 8) \
x(bi_casefold, 8)
/* subset of BCH_INODE_FIELDS */
#define BCH_INODE_OPTS() \
@@ -117,7 +118,8 @@ struct bch_inode_generation {
x(background_target, 16) \
x(erasure_code, 16) \
x(nocow, 8) \
x(inodes_32bit, 8)
x(inodes_32bit, 8) \
x(casefold, 8)
enum inode_opt_id {
#define x(name, ...) \
@@ -137,8 +139,7 @@ enum inode_opt_id {
x(i_sectors_dirty, 6) \
x(unlinked, 7) \
x(backptr_untrusted, 8) \
x(has_child_snapshot, 9) \
x(casefolded, 10)
x(has_child_snapshot, 9)
/* bits 20+ reserved for packed fields below: */
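
These x-macro lists (BCH_INODE_FIELDS and BCH_INODE_OPTS here,
BCH_BKEY_TYPES earlier) gain new columns by redefining x() at each
expansion site. A minimal sketch of the pattern, using a hypothetical
two-entry list:

#define DEMO_TYPES()		\
	x(alpha, 0, 0)		\
	x(beta,  1, 1)

enum demo_type {
#define x(name, nr, flags) DEMO_##name = nr,
	DEMO_TYPES()
#undef x
	DEMO_MAX,
};

static const char * const demo_type_names[] = {
#define x(name, nr, flags) #name,
	DEMO_TYPES()
#undef x
	NULL
};

static const unsigned demo_type_flags[] = {
#define x(name, nr, flags) [DEMO_##name] = flags,
	DEMO_TYPES()
#undef x
};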

View File

@@ -281,7 +281,24 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
sectors = vstruct_blocks_plus(buf->data, c->block_bits,
buf->u64s_reserved) << c->block_bits;
BUG_ON(sectors > buf->sectors);
if (unlikely(sectors > buf->sectors)) {
struct printbuf err = PRINTBUF;
err.atomic++;
prt_printf(&err, "journal entry overran reserved space: %u > %u\n",
sectors, buf->sectors);
prt_printf(&err, "buf u64s %u u64s reserved %u cur_entry_u64s %u block_bits %u\n",
le32_to_cpu(buf->data->u64s), buf->u64s_reserved,
j->cur_entry_u64s,
c->block_bits);
prt_printf(&err, "fatal error - emergency read only");
bch2_journal_halt_locked(j);
bch_err(c, "%s", err.buf);
printbuf_exit(&err);
return;
}
buf->sectors = sectors;
/*
@@ -1462,8 +1479,6 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
j->last_empty_seq = cur_seq - 1; /* to match j->seq */
spin_lock(&j->lock);
set_bit(JOURNAL_running, &j->flags);
j->last_flush_write = jiffies;
j->reservations.idx = journal_cur_seq(j);
@@ -1474,6 +1489,21 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
return 0;
}
void bch2_journal_set_replay_done(struct journal *j)
{
/*
* journal_space_available must happen before setting JOURNAL_running
* JOURNAL_running must happen before JOURNAL_replay_done
*/
spin_lock(&j->lock);
bch2_journal_space_available(j);
set_bit(JOURNAL_need_flush_write, &j->flags);
set_bit(JOURNAL_running, &j->flags);
set_bit(JOURNAL_replay_done, &j->flags);
spin_unlock(&j->lock);
}
/* init/exit: */
void bch2_dev_journal_exit(struct bch_dev *ca)

View File

@@ -437,12 +437,6 @@ static inline int bch2_journal_error(struct journal *j)
struct bch_dev;
static inline void bch2_journal_set_replay_done(struct journal *j)
{
BUG_ON(!test_bit(JOURNAL_running, &j->flags));
set_bit(JOURNAL_replay_done, &j->flags);
}
void bch2_journal_unblock(struct journal *);
void bch2_journal_block(struct journal *);
struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *);
@@ -459,6 +453,7 @@ void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
void bch2_fs_journal_stop(struct journal *);
int bch2_fs_journal_start(struct journal *, u64);
void bch2_journal_set_replay_done(struct journal *);
void bch2_dev_journal_exit(struct bch_dev *);
int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);

View File

@@ -252,7 +252,10 @@ void bch2_journal_space_available(struct journal *j)
bch2_journal_set_watermark(j);
out:
j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
j->cur_entry_sectors = !ret
? round_down(j->space[journal_space_discarded].next_entry,
block_sectors(c))
: 0;
j->cur_entry_error = ret;
if (!ret)

View File

@@ -356,6 +356,13 @@ static int bch2_copygc_thread(void *arg)
set_freezable();
/*
* Data move operations can't run until after check_snapshots has
* completed, and bch2_snapshot_is_ancestor() is available.
*/
kthread_wait_freezable(c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots ||
kthread_should_stop());
bch2_move_stats_init(&move_stats, "copygc");
bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
writepoint_ptr(&c->copygc_write_point),

View File

@@ -5,6 +5,15 @@
unsigned long bch2_copygc_wait_amount(struct bch_fs *);
void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *);
static inline void bch2_copygc_wakeup(struct bch_fs *c)
{
rcu_read_lock();
struct task_struct *p = rcu_dereference(c->copygc_thread);
if (p)
wake_up_process(p);
rcu_read_unlock();
}
void bch2_copygc_stop(struct bch_fs *);
int bch2_copygc_start(struct bch_fs *);
void bch2_fs_copygc_init(struct bch_fs *);
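
bch2_copygc_wakeup() above reads the thread pointer under RCU so it can be
called from anywhere without taking locks; the teardown side then has to
clear the pointer and synchronize_rcu() before kthread_stop() (which is
what the "for synchronizing with bch2_rebalance_wakeup()" comment in the
rebalance hunk is about). A kernel-side sketch of both halves, with
simplified names:

#include <linux/kthread.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

struct demo_fs {
	struct task_struct __rcu *worker_thread;
};

static void demo_wakeup(struct demo_fs *c)
{
	rcu_read_lock();
	struct task_struct *p = rcu_dereference(c->worker_thread);
	if (p)
		wake_up_process(p);
	rcu_read_unlock();
}

static void demo_stop(struct demo_fs *c)
{
	struct task_struct *p = rcu_dereference_protected(c->worker_thread, true);

	rcu_assign_pointer(c->worker_thread, NULL);
	if (p) {
		/* wait for concurrent demo_wakeup() calls to finish with
		 * the old pointer before the task goes away: */
		synchronize_rcu();
		kthread_stop(p);
	}
}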

View File

@@ -47,10 +47,6 @@ int bch2_create_trans(struct btree_trans *trans,
if (ret)
goto err;
/* Inherit casefold state from parent. */
if (S_ISDIR(mode))
new_inode->bi_flags |= dir_u->bi_flags & BCH_INODE_casefolded;
if (!(flags & BCH_CREATE_SNAPSHOT)) {
/* Normal create path - allocate a new inode: */
bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);

View File

@@ -228,6 +228,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH_SB_ERASURE_CODE, false, \
NULL, "Enable erasure coding (DO NOT USE YET)") \
x(casefold, u8, \
OPT_FS|OPT_INODE|OPT_FORMAT, \
OPT_BOOL(), \
BCH_SB_CASEFOLD, false, \
NULL, "Dirent lookups are casefolded") \
x(inodes_32bit, u8, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \

View File

@@ -262,7 +262,7 @@ int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
int ret = bch2_trans_commit_do(c, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc,
bch2_set_rebalance_needs_scan_trans(trans, inum));
rebalance_wakeup(c);
bch2_rebalance_wakeup(c);
return ret;
}
@@ -581,6 +581,13 @@ static int bch2_rebalance_thread(void *arg)
set_freezable();
/*
* Data move operations can't run until after check_snapshots has
* completed, and bch2_snapshot_is_ancestor() is available.
*/
kthread_wait_freezable(c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots ||
kthread_should_stop());
bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
writepoint_ptr(&c->rebalance_write_point),
true);
@@ -664,7 +671,7 @@ void bch2_rebalance_stop(struct bch_fs *c)
c->rebalance.thread = NULL;
if (p) {
/* for synchronizing with rebalance_wakeup() */
/* for synchronizing with bch2_rebalance_wakeup() */
synchronize_rcu();
kthread_stop(p);

View File

@@ -37,7 +37,7 @@ int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64);
int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
int bch2_set_fs_needs_rebalance(struct bch_fs *);
static inline void rebalance_wakeup(struct bch_fs *c)
static inline void bch2_rebalance_wakeup(struct bch_fs *c)
{
struct task_struct *p;

View File

@@ -18,6 +18,7 @@
#include "journal_seq_blacklist.h"
#include "logged_ops.h"
#include "move.h"
#include "movinggc.h"
#include "namei.h"
#include "quota.h"
#include "rebalance.h"
@@ -1129,13 +1130,13 @@ int bch2_fs_initialize(struct bch_fs *c)
if (ret)
goto err;
set_bit(BCH_FS_accounting_replay_done, &c->flags);
bch2_journal_set_replay_done(&c->journal);
ret = bch2_fs_read_write_early(c);
if (ret)
goto err;
set_bit(BCH_FS_accounting_replay_done, &c->flags);
bch2_journal_set_replay_done(&c->journal);
for_each_member_device(c, ca) {
ret = bch2_dev_usage_init(ca, false);
if (ret) {
@@ -1194,6 +1195,9 @@ int bch2_fs_initialize(struct bch_fs *c)
c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1;
bch2_copygc_wakeup(c);
bch2_rebalance_wakeup(c);
if (enabled_qtypes(c)) {
ret = bch2_fs_quota_read(c);
if (ret)

View File

@@ -12,6 +12,7 @@
#include "journal.h"
#include "lru.h"
#include "logged_ops.h"
#include "movinggc.h"
#include "rebalance.h"
#include "recovery.h"
#include "recovery_passes.h"
@@ -262,49 +263,52 @@ int bch2_run_recovery_passes(struct bch_fs *c)
*/
c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw;
while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) {
c->next_recovery_pass = c->curr_recovery_pass + 1;
spin_lock_irq(&c->recovery_pass_lock);
spin_lock_irq(&c->recovery_pass_lock);
while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) {
unsigned prev_done = c->recovery_pass_done;
unsigned pass = c->curr_recovery_pass;
c->next_recovery_pass = pass + 1;
if (c->opts.recovery_pass_last &&
c->curr_recovery_pass > c->opts.recovery_pass_last) {
spin_unlock_irq(&c->recovery_pass_lock);
c->curr_recovery_pass > c->opts.recovery_pass_last)
break;
}
if (!should_run_recovery_pass(c, pass)) {
c->curr_recovery_pass++;
c->recovery_pass_done = max(c->recovery_pass_done, pass);
if (should_run_recovery_pass(c, pass)) {
spin_unlock_irq(&c->recovery_pass_lock);
continue;
ret = bch2_run_recovery_pass(c, pass) ?:
bch2_journal_flush(&c->journal);
if (!ret && !test_bit(BCH_FS_error, &c->flags))
bch2_clear_recovery_pass_required(c, pass);
spin_lock_irq(&c->recovery_pass_lock);
if (c->next_recovery_pass < c->curr_recovery_pass) {
/*
* bch2_run_explicit_recovery_pass() was called: we
* can't always catch -BCH_ERR_restart_recovery because
* it may have been called from another thread (btree
* node read completion)
*/
ret = 0;
c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass);
} else {
c->recovery_passes_complete |= BIT_ULL(pass);
c->recovery_pass_done = max(c->recovery_pass_done, pass);
}
}
spin_unlock_irq(&c->recovery_pass_lock);
ret = bch2_run_recovery_pass(c, pass) ?:
bch2_journal_flush(&c->journal);
if (!ret && !test_bit(BCH_FS_error, &c->flags))
bch2_clear_recovery_pass_required(c, pass);
spin_lock_irq(&c->recovery_pass_lock);
if (c->next_recovery_pass < c->curr_recovery_pass) {
/*
* bch2_run_explicit_recovery_pass() was called: we
* can't always catch -BCH_ERR_restart_recovery because
* it may have been called from another thread (btree
* node read completion)
*/
ret = 0;
c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass);
} else {
c->recovery_passes_complete |= BIT_ULL(pass);
c->recovery_pass_done = max(c->recovery_pass_done, pass);
}
c->curr_recovery_pass = c->next_recovery_pass;
spin_unlock_irq(&c->recovery_pass_lock);
if (prev_done <= BCH_RECOVERY_PASS_check_snapshots &&
c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots) {
bch2_copygc_wakeup(c);
bch2_rebalance_wakeup(c);
}
}
spin_unlock_irq(&c->recovery_pass_lock);
return ret;
}

View File

@@ -396,7 +396,7 @@ u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
u32 subvol = 0, s;
rcu_read_lock();
while (id) {
while (id && bch2_snapshot_exists(c, id)) {
s = snapshot_t(c, id)->subvol;
if (s && (!subvol || s < subvol))

View File

@@ -33,7 +33,7 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
struct bch_hash_info {
u8 type;
struct unicode_map *cf_encoding;
struct unicode_map *cf_encoding;
/*
* For crc32 or crc64 string hashes the first key value of
* the siphash_key (k0) is used as the key.
@@ -44,11 +44,10 @@ struct bch_hash_info {
static inline struct bch_hash_info
bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
{
/* XXX ick */
struct bch_hash_info info = {
.type = INODE_STR_HASH(bi),
#ifdef CONFIG_UNICODE
.cf_encoding = !!(bi->bi_flags & BCH_INODE_casefolded) ? c->cf_encoding : NULL,
.cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL,
#endif
.siphash_key = { .k0 = bi->bi_hash_seed }
};

View File

@@ -1102,7 +1102,8 @@ int bch2_write_super(struct bch_fs *c)
prt_str(&buf, ")");
bch2_fs_fatal_error(c, ": %s", buf.buf);
printbuf_exit(&buf);
return -BCH_ERR_sb_not_downgraded;
ret = -BCH_ERR_sb_not_downgraded;
goto out;
}
darray_for_each(online_devices, ca) {

View File

@@ -418,32 +418,6 @@ bool bch2_fs_emergency_read_only_locked(struct bch_fs *c)
return ret;
}
static int bch2_fs_read_write_late(struct bch_fs *c)
{
int ret;
/*
* Data move operations can't run until after check_snapshots has
* completed, and bch2_snapshot_is_ancestor() is available.
*
* Ideally we'd start copygc/rebalance earlier instead of waiting for
* all of recovery/fsck to complete:
*/
ret = bch2_copygc_start(c);
if (ret) {
bch_err(c, "error starting copygc thread");
return ret;
}
ret = bch2_rebalance_start(c);
if (ret) {
bch_err(c, "error starting rebalance thread");
return ret;
}
return 0;
}
static int __bch2_fs_read_write(struct bch_fs *c, bool early)
{
int ret;
@@ -466,29 +440,28 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
clear_bit(BCH_FS_clean_shutdown, &c->flags);
/*
* First journal write must be a flush write: after a clean shutdown we
* don't read the journal, so the first journal write may end up
* overwriting whatever was there previously, and there must always be
* at least one non-flush write in the journal or recovery will fail:
*/
set_bit(JOURNAL_need_flush_write, &c->journal.flags);
set_bit(JOURNAL_running, &c->journal.flags);
__for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) {
bch2_dev_allocator_add(c, ca);
percpu_ref_reinit(&ca->io_ref[WRITE]);
}
bch2_recalc_capacity(c);
/*
* First journal write must be a flush write: after a clean shutdown we
* don't read the journal, so the first journal write may end up
* overwriting whatever was there previously, and there must always be
* at least one non-flush write in the journal or recovery will fail:
*/
spin_lock(&c->journal.lock);
set_bit(JOURNAL_need_flush_write, &c->journal.flags);
set_bit(JOURNAL_running, &c->journal.flags);
bch2_journal_space_available(&c->journal);
spin_unlock(&c->journal.lock);
ret = bch2_fs_mark_dirty(c);
if (ret)
goto err;
spin_lock(&c->journal.lock);
bch2_journal_space_available(&c->journal);
spin_unlock(&c->journal.lock);
ret = bch2_journal_reclaim_start(&c->journal);
if (ret)
goto err;
@@ -504,10 +477,17 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
atomic_long_inc(&c->writes[i]);
}
#endif
if (!early) {
ret = bch2_fs_read_write_late(c);
if (ret)
goto err;
ret = bch2_copygc_start(c);
if (ret) {
bch_err_msg(c, ret, "error starting copygc thread");
goto err;
}
ret = bch2_rebalance_start(c);
if (ret) {
bch_err_msg(c, ret, "error starting rebalance thread");
goto err;
}
bch2_do_discards(c);
@@ -553,6 +533,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_find_btree_nodes_exit(&c->found_btree_nodes);
bch2_free_pending_node_rewrites(c);
bch2_free_fsck_errs(c);
bch2_fs_accounting_exit(c);
bch2_fs_sb_errors_exit(c);
bch2_fs_counters_exit(c);
@@ -1023,6 +1004,40 @@ static void print_mount_opts(struct bch_fs *c)
printbuf_exit(&p);
}
static bool bch2_fs_may_start(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i, flags = 0;
if (c->opts.very_degraded)
flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
if (c->opts.degraded)
flags |= BCH_FORCE_IF_DEGRADED;
if (!c->opts.degraded &&
!c->opts.very_degraded) {
mutex_lock(&c->sb_lock);
for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
if (!bch2_member_exists(c->disk_sb.sb, i))
continue;
ca = bch2_dev_locked(c, i);
if (!bch2_dev_is_online(ca) &&
(ca->mi.state == BCH_MEMBER_STATE_rw ||
ca->mi.state == BCH_MEMBER_STATE_ro)) {
mutex_unlock(&c->sb_lock);
return false;
}
}
mutex_unlock(&c->sb_lock);
}
return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true);
}
int bch2_fs_start(struct bch_fs *c)
{
time64_t now = ktime_get_real_seconds();
@@ -1030,6 +1045,9 @@ int bch2_fs_start(struct bch_fs *c)
print_mount_opts(c);
if (!bch2_fs_may_start(c))
return -BCH_ERR_insufficient_devices_to_start;
down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
@@ -1082,13 +1100,10 @@ int bch2_fs_start(struct bch_fs *c)
wake_up(&c->ro_ref_wait);
down_write(&c->state_lock);
if (c->opts.read_only) {
if (c->opts.read_only)
bch2_fs_read_only(c);
} else {
ret = !test_bit(BCH_FS_rw, &c->flags)
? bch2_fs_read_write(c)
: bch2_fs_read_write_late(c);
}
else if (!test_bit(BCH_FS_rw, &c->flags))
ret = bch2_fs_read_write(c);
up_write(&c->state_lock);
err:
@@ -1500,7 +1515,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
printbuf_exit(&name);
rebalance_wakeup(c);
bch2_rebalance_wakeup(c);
return 0;
}
@@ -1559,40 +1574,6 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
}
}
static bool bch2_fs_may_start(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i, flags = 0;
if (c->opts.very_degraded)
flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
if (c->opts.degraded)
flags |= BCH_FORCE_IF_DEGRADED;
if (!c->opts.degraded &&
!c->opts.very_degraded) {
mutex_lock(&c->sb_lock);
for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
if (!bch2_member_exists(c->disk_sb.sb, i))
continue;
ca = bch2_dev_locked(c, i);
if (!bch2_dev_is_online(ca) &&
(ca->mi.state == BCH_MEMBER_STATE_rw ||
ca->mi.state == BCH_MEMBER_STATE_ro)) {
mutex_unlock(&c->sb_lock);
return false;
}
}
mutex_unlock(&c->sb_lock);
}
return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true);
}
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
{
bch2_dev_io_ref_stop(ca, WRITE);
@@ -1646,7 +1627,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
if (new_state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca);
rebalance_wakeup(c);
bch2_rebalance_wakeup(c);
return ret;
}
@@ -2228,11 +2209,6 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
}
up_write(&c->state_lock);
if (!bch2_fs_may_start(c)) {
ret = -BCH_ERR_insufficient_devices_to_start;
goto err_print;
}
if (!c->opts.nostart) {
ret = bch2_fs_start(c);
if (ret)

View File

@@ -654,11 +654,10 @@ static ssize_t sysfs_opt_store(struct bch_fs *c,
bch2_set_rebalance_needs_scan(c, 0);
if (v && id == Opt_rebalance_enabled)
rebalance_wakeup(c);
bch2_rebalance_wakeup(c);
if (v && id == Opt_copygc_enabled &&
c->copygc_thread)
wake_up_process(c->copygc_thread);
if (v && id == Opt_copygc_enabled)
bch2_copygc_wakeup(c);
if (id == Opt_discard && !ca) {
mutex_lock(&c->sb_lock);

View File

@@ -342,6 +342,8 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
*/
static int test_peek_end(struct bch_fs *c, u64 nr)
{
delete_test_keys(c);
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
@@ -362,6 +364,8 @@ static int test_peek_end(struct bch_fs *c, u64 nr)
static int test_peek_end_extents(struct bch_fs *c, u64 nr)
{
delete_test_keys(c);
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;

View File

@@ -739,4 +739,42 @@ static inline void memcpy_swab(void *_dst, void *_src, size_t len)
*--dst = *src++;
}
#define set_flags(_map, _in, _out) \
do { \
unsigned _i; \
\
for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
if ((_in) & (1 << _i)) \
(_out) |= _map[_i]; \
else \
(_out) &= ~_map[_i]; \
} while (0)
#define map_flags(_map, _in) \
({ \
unsigned _out = 0; \
\
set_flags(_map, _in, _out); \
_out; \
})
#define map_flags_rev(_map, _in) \
({ \
unsigned _i, _out = 0; \
\
for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
if ((_in) & _map[_i]) { \
(_out) |= 1 << _i; \
(_in) &= ~_map[_i]; \
} \
(_out); \
})
#define map_defined(_map) \
({ \
unsigned _in = ~0; \
\
map_flags_rev(_map, _in); \
})
#endif /* _BCACHEFS_UTIL_H */
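
A usage sketch for the flag-mapping macros above (assumes the macros and
ARRAY_SIZE() are in scope; the demo mapping table and values are
hypothetical):

static const unsigned demo_map[] = {
	[0] = 0x10,		/* internal bit 0 <-> external flag 0x10 */
	[1] = 0x40,		/* internal bit 1 <-> external flag 0x40 */
};

static int demo(unsigned ext)
{
	unsigned internal = map_flags_rev(demo_map, ext);

	/* map_flags_rev() clears the bits it recognizes from its input,
	 * so anything left over is an unsupported flag; callers reject
	 * those (cf. the fa->fsx_xflags check in bch2_fileattr_set()): */
	if (ext)
		return -1;	/* -EOPNOTSUPP in the real code */

	/* round-trips back to the external representation: */
	return map_flags(demo_map, internal);
}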