From 4c63c2454eff996c5e27991221106eb511f7db38 Mon Sep 17 00:00:00 2001 From: Luke Dashjr Date: Thu, 29 Oct 2015 08:22:21 +0000 Subject: [PATCH 01/37] btrfs: bugfix: handle FS_IOC32_{GETFLAGS,SETFLAGS,GETVERSION} in btrfs_ioctl 32-bit ioctls use these rather than the regular FS_IOC_* versions. They can be handled in btrfs using the same code. Without this, 32-bit {ch,ls}attr fail. Signed-off-by: Luke Dashjr Cc: stable@vger.kernel.org Reviewed-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/file.c | 2 +- fs/btrfs/inode.c | 2 +- fs/btrfs/ioctl.c | 21 +++++++++++++++++++++ 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 84a6a5b3384a..208d19938fdf 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -4122,6 +4122,7 @@ void btrfs_test_inode_set_ops(struct inode *inode); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_update_iflags(struct inode *inode); void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8d7b5a45c005..751daacd268d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2956,7 +2956,7 @@ const struct file_operations btrfs_file_operations = { .fallocate = btrfs_fallocate, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT - .compat_ioctl = btrfs_ioctl, + .compat_ioctl = btrfs_compat_ioctl, #endif .copy_file_range = btrfs_copy_file_range, .clone_file_range = btrfs_clone_file_range, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2aaba58b4856..167fc3d49450 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10184,7 +10184,7 @@ static const struct file_operations btrfs_dir_file_operations = { .iterate = btrfs_real_readdir, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT - .compat_ioctl = btrfs_ioctl, + .compat_ioctl = btrfs_compat_ioctl, #endif .release = btrfs_release_file, .fsync = btrfs_sync_file, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5a23806ae418..f545f81f642d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -5552,3 +5552,24 @@ long btrfs_ioctl(struct file *file, unsigned int return -ENOTTY; } + +#ifdef CONFIG_COMPAT +long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + case FS_IOC32_GETVERSION: + cmd = FS_IOC_GETVERSION; + break; + default: + return -ENOIOCTLCMD; + } + + return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif From a91326679f2a0a4c239cd643674fdcda28ee86be Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 7 Mar 2016 16:56:21 -0800 Subject: [PATCH 02/37] Btrfs: make mapping->writeback_index point to the last written page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a sequential writer is writing in the middle of a page, it redirties the last written page and continues from it. In that case, writeback can end up seeking back to that first redirtied page after writing all the pages at the end of the file, because btrfs updates mapping->writeback_index to one past the current page. For non-cow filesystems the cost is only an extra seek, while for cow filesystems such as btrfs it means unnecessary fragments.
To avoid it, we just need to continue writeback from the last written page. This also updates btrfs to behave like write_cache_pages() does, i.e. bail out immediately if there is an error in writepage(). Reported-by: Holger Hoffstätte Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d247fc0eea19..fd41a730d239 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3200,14 +3200,10 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, return ret; } -static noinline void update_nr_written(struct page *page, - struct writeback_control *wbc, - unsigned long nr_written) +static void update_nr_written(struct page *page, struct writeback_control *wbc, + unsigned long nr_written) { wbc->nr_to_write -= nr_written; - if (wbc->range_cyclic || (wbc->nr_to_write > 0 && - wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) - page->mapping->writeback_index = page->index + nr_written; } /* @@ -3926,6 +3922,8 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, int nr_pages; pgoff_t index; pgoff_t end; /* Inclusive */ + pgoff_t done_index; + int range_whole = 0; int scanned = 0; int tag; @@ -3948,6 +3946,8 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; scanned = 1; } if (wbc->sync_mode == WB_SYNC_ALL) @@ -3957,6 +3957,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, retry: if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); + done_index = index; while (!done && !nr_to_write_done && (index <= end) && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { @@ -3966,6 +3967,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + done_index = page->index; /* * At this point we hold neither mapping->tree_lock nor * lock on the page itself: the page may be truncated or @@ -4009,6 +4011,20 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, } if (!err && ret < 0) err = ret; + if (ret < 0) { + /* + * done_index is set past this page, + * so media errors will not choke + * background writeout for the entire + * file. This has consequences for + * range_cyclic semantics (ie. it may + * not be suitable for data integrity + * writeout). + */ + done_index = page->index + 1; + done = 1; + break; + } /* * the filesystem may choose to bump up nr_to_write. @@ -4029,6 +4045,10 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, index = 0; goto retry; } + + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) + mapping->writeback_index = done_index; + btrfs_add_delayed_iput(inode); return err; } From 894b36e35ae01186b77b083f3f67569a349062a6 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 7 Mar 2016 16:56:22 -0800 Subject: [PATCH 03/37] Btrfs: cleanup error handling in extent_write_cache_pages Now that we bail out immediately if ->writepage() returns an error, we don't need the extra 'err' variable to retain the error code.
Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fd41a730d239..b67d6d24440b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3916,7 +3916,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, struct inode *inode = mapping->host; int ret = 0; int done = 0; - int err = 0; int nr_to_write_done = 0; struct pagevec pvec; int nr_pages; @@ -4009,8 +4008,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, unlock_page(page); ret = 0; } - if (!err && ret < 0) - err = ret; if (ret < 0) { /* * done_index is set past this page, @@ -4036,7 +4033,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, pagevec_release(&pvec); cond_resched(); } - if (!scanned && !done && !err) { + if (!scanned && !done) { /* * We hit the last page and there is more work to be done: wrap * back to the start of the file @@ -4050,7 +4047,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, mapping->writeback_index = done_index; btrfs_add_delayed_iput(inode); - return err; + return ret; } static void flush_epd_write_bio(struct extent_page_data *epd) From a2af23b7d7cb0de89570e97da84f6fb642e990a4 Mon Sep 17 00:00:00 2001 From: Chandan Rajendra Date: Mon, 4 Apr 2016 02:53:06 +0530 Subject: [PATCH 04/37] Btrfs: __btrfs_buffered_write: Pass valid file offset when releasing delalloc space The delalloc reserved space is calculated in terms of the number of bytes used by an integral number of blocks. This is done by rounding down the value of 'pos' to the nearest multiple of sectorsize. The file offset value held by the 'pos' variable may not be aligned to the sectorsize, hence when passing it as an argument to btrfs_delalloc_release_space() we may end up releasing more delalloc space than we originally reserved. Signed-off-by: Chandan Rajendra Signed-off-by: David Sterba --- fs/btrfs/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 751daacd268d..af059c44684d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1696,7 +1696,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, btrfs_end_write_no_snapshoting(root); btrfs_delalloc_release_metadata(inode, release_bytes); } else { - btrfs_delalloc_release_space(inode, pos, release_bytes); + btrfs_delalloc_release_space(inode, + round_down(pos, root->sectorsize), + release_bytes); } } From cf25ce518e8ef9d59b292e51193bed2b023a32da Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 14 Dec 2015 18:29:32 -0800 Subject: [PATCH 05/37] Btrfs: do not create empty block group if we have allocated data Currently we force creation of an empty block group to keep the data profile alive; however, in the example below we eventually end up with an empty block group while trying to get more space for the other types (metadata/system): - Before, block group "A": size=2G, used=1.2G block group "B": size=2G, used=512M - After "btrfs balance start -dusage=50 mount_point", block group "A": size=2G, used=(1.2+0.5)G block group "C": size=2G, used=0 Since there is no data in block group C, it won't be deleted automatically, and the unused 2G stays tied up until the next mount. Balance itself just moves data and doesn't remove data, so it's safe not to create such an empty block group if we already have data allocated in other block groups.
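Condensed, the change amounts to the following (a sketch of the logic, not the literal hunk below): sample the data space-info counters under their lock and skip pre-creating an empty data chunk whenever any data is already allocated:

	spin_lock(&fs_info->data_sinfo->lock);
	bytes_used = fs_info->data_sinfo->bytes_used;
	spin_unlock(&fs_info->data_sinfo->lock);

	/* pre-create an empty data chunk only if no data is allocated at all */
	if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && !chunk_reserved && !bytes_used) {
		/* start a transaction and reserve the data chunk */
	}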
Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bd0f45fb38c4..745a619241c7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3402,6 +3402,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) u32 count_meta = 0; u32 count_sys = 0; int chunk_reserved = 0; + u64 bytes_used = 0; /* step one make some room on all the devices */ devices = &fs_info->fs_devices->devices; @@ -3540,7 +3541,13 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) goto loop; } - if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && !chunk_reserved) { + ASSERT(fs_info->data_sinfo); + spin_lock(&fs_info->data_sinfo->lock); + bytes_used = fs_info->data_sinfo->bytes_used; + spin_unlock(&fs_info->data_sinfo->lock); + + if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && + !chunk_reserved && !bytes_used) { trans = btrfs_start_transaction(chunk_root, 0); if (IS_ERR(trans)) { mutex_unlock(&fs_info->delete_unused_bgs_mutex); From 88be159c905a2b4f6d59afa352bef410afb6af02 Mon Sep 17 00:00:00 2001 From: "Austin S. Hemmelgarn" Date: Wed, 23 Mar 2016 14:22:59 -0400 Subject: [PATCH 06/37] btrfs: allow balancing to dup with multi-device Currently, we don't allow the user to try to rebalance to a dup profile on a multi-device filesystem. In most cases this is a perfectly sensible restriction, as raid1 uses the same amount of space and provides better protection. However, reshaping a multi-device filesystem down to a single-device filesystem requires the user to convert the metadata and system chunks to the single profile before deleting devices, and then convert again to dup, which leaves a period of time where metadata integrity is reduced. This patch removes the single-device-only restriction from converting to the dup profile, removing this potential data integrity reduction. Signed-off-by: Austin S. Hemmelgarn Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 745a619241c7..83dfed667637 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3700,10 +3700,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, num_devices--; } btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); - allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; - if (num_devices == 1) - allowed |= BTRFS_BLOCK_GROUP_DUP; - else if (num_devices > 1) + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; + if (num_devices > 1) allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); if (num_devices > 2) allowed |= BTRFS_BLOCK_GROUP_RAID5; From ae02d1bd070767e109f4a6f1bb1f466e9698a355 Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Wed, 30 Mar 2016 21:53:38 +0100 Subject: [PATCH 07/37] btrfs: fix mixed block count of available space Metadata in mixed block groups is already accounted for in the data totals and should not be counted as part of the free metadata space.
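In code form, the accounting decision added below boils down to this (a sketch; a mixed block group carries both the DATA and METADATA flags):

	if (found->flags & BTRFS_BLOCK_GROUP_METADATA) {
		if (found->flags & BTRFS_BLOCK_GROUP_DATA)
			mixed = 1;	/* already counted in the data totals */
		else
			total_free_meta += found->disk_total - found->disk_used;
	}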
Signed-off-by: Luis de Bethencourt Link: https://bugzilla.kernel.org/show_bug.cgi?id=114281 Signed-off-by: David Sterba --- fs/btrfs/super.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 00b8f37cc306..bdca79ce45f1 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2051,6 +2051,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; int ret; u64 thresh = 0; + int mixed = 0; /* * holding chunk_muext to avoid allocating new chunks, holding @@ -2076,8 +2077,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) } } } - if (found->flags & BTRFS_BLOCK_GROUP_METADATA) - total_free_meta += found->disk_total - found->disk_used; + + /* + * Metadata in mixed block goup profiles are accounted in data + */ + if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) { + if (found->flags & BTRFS_BLOCK_GROUP_DATA) + mixed = 1; + else + total_free_meta += found->disk_total - + found->disk_used; + } total_used += found->disk_used; } @@ -2115,7 +2125,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) */ thresh = 4 * 1024 * 1024; - if (total_free_meta - thresh < block_rsv->size) + if (!mixed && total_free_meta - thresh < block_rsv->size) buf->f_bavail = 0; buf->f_type = BTRFS_SUPER_MAGIC; From 41b34accb265e3a20211a7a8ef3625678f1c6ec7 Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Wed, 30 Mar 2016 23:18:14 +0100 Subject: [PATCH 08/37] btrfs: avoid overflowing f_bfree Since mixed block group accounting isn't byte-accurate and f_bfree is an unsigned integer, it could overflow. Avoid this. Signed-off-by: Luis de Bethencourt Suggested-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index bdca79ce45f1..fe03efb5bec0 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2100,7 +2100,11 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) /* Account global block reserve as used, it's in logical size already */ spin_lock(&block_rsv->lock); - buf->f_bfree -= block_rsv->size >> bits; + /* Mixed block groups accounting is not byte-accurate, avoid overflow */ + if (buf->f_bfree >= block_rsv->size >> bits) + buf->f_bfree -= block_rsv->size >> bits; + else + buf->f_bfree = 0; spin_unlock(&block_rsv->lock); buf->f_bavail = div_u64(total_free_data, factor); From f5ecec3ce21f706e9e7a330b2e8e5a2941927b46 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 13 Apr 2016 09:40:59 +0300 Subject: [PATCH 09/37] btrfs: send: silence an integer overflow warning The "sizeof(*arg->clone_sources) * arg->clone_sources_count" expression can overflow. It causes several static checker warnings. It's all under CAP_SYS_ADMIN so it's not that serious, but let's silence the warnings.
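The guard follows the usual idiom for multiplication overflow checks: validate the element count against the largest representable value before forming the product (sketch of the check added below):

	if (arg->clone_sources_count >
	    ULLONG_MAX / sizeof(*arg->clone_sources)) {
		ret = -EINVAL;
		goto out;
	}
	/* from here on, count * sizeof(element) cannot wrap a u64 */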
Signed-off-by: Dan Carpenter Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 8d358c547c59..ec433795fa71 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5978,6 +5978,12 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } + if (arg->clone_sources_count > + ULLONG_MAX / sizeof(*arg->clone_sources)) { + ret = -EINVAL; + goto out; + } + if (!access_ok(VERIFY_READ, arg->clone_sources, sizeof(*arg->clone_sources) * arg->clone_sources_count)) { From 779bf3fefa835cb52a07457c8acac6f2f66f2493 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 18 Apr 2016 16:51:23 +0800 Subject: [PATCH 10/37] btrfs: fix lock dep warning, move scratch dev out of device_list_mutex and uuid_mutex When the replace target fails, the target device will be taken out of fs device list, scratch + update_dev_time and freed. However we could do the scratch + update_dev_time and free part after the device has been taken out of device list, so that we don't have to hold the device_list_mutex and uuid_mutex locks. Reported issue: [ 5375.718845] ====================================================== [ 5375.718846] [ INFO: possible circular locking dependency detected ] [ 5375.718849] 4.4.5-scst31x-debug-11+ #40 Not tainted [ 5375.718849] ------------------------------------------------------- [ 5375.718851] btrfs-health/4662 is trying to acquire lock: [ 5375.718861] (sb_writers){.+.+.+}, at: [] __sb_start_write+0xb7/0xf0 [ 5375.718862] [ 5375.718862] but task is already holding lock: [ 5375.718907] (&fs_devs->device_list_mutex){+.+.+.}, at: [] btrfs_destroy_dev_replace_tgtdev+0x3c/0x150 [btrfs] [ 5375.718907] [ 5375.718907] which lock already depends on the new lock. 
[ 5375.718907] [ 5375.718908] [ 5375.718908] the existing dependency chain (in reverse order) is: [ 5375.718911] [ 5375.718911] -> #3 (&fs_devs->device_list_mutex){+.+.+.}: [ 5375.718917] [] lock_acquire+0xce/0x1e0 [ 5375.718921] [] mutex_lock_nested+0x69/0x3c0 [ 5375.718940] [] btrfs_show_devname+0x36/0x210 [btrfs] [ 5375.718945] [] show_vfsmnt+0x49/0x150 [ 5375.718948] [] m_show+0x17/0x20 [ 5375.718951] [] seq_read+0x2d8/0x3b0 [ 5375.718955] [] __vfs_read+0x28/0xd0 [ 5375.718959] [] vfs_read+0x86/0x130 [ 5375.718962] [] SyS_read+0x49/0xa0 [ 5375.718966] [] entry_SYSCALL_64_fastpath+0x16/0x7a [ 5375.718968] [ 5375.718968] -> #2 (namespace_sem){+++++.}: [ 5375.718971] [] lock_acquire+0xce/0x1e0 [ 5375.718974] [] down_write+0x49/0x80 [ 5375.718977] [] lock_mount+0x43/0x1c0 [ 5375.718979] [] do_add_mount+0x23/0xd0 [ 5375.718982] [] do_mount+0x27b/0xe30 [ 5375.718985] [] SyS_mount+0x8c/0xd0 [ 5375.718988] [] entry_SYSCALL_64_fastpath+0x16/0x7a [ 5375.718991] [ 5375.718991] -> #1 (&sb->s_type->i_mutex_key#5){+.+.+.}: [ 5375.718994] [] lock_acquire+0xce/0x1e0 [ 5375.718996] [] mutex_lock_nested+0x69/0x3c0 [ 5375.719001] [] path_openat+0x468/0x1360 [ 5375.719004] [] do_filp_open+0x7e/0xe0 [ 5375.719007] [] do_sys_open+0x12b/0x210 [ 5375.719010] [] SyS_open+0x1e/0x20 [ 5375.719013] [] entry_SYSCALL_64_fastpath+0x16/0x7a [ 5375.719015] [ 5375.719015] -> #0 (sb_writers){.+.+.+}: [ 5375.719018] [] __lock_acquire+0x17ba/0x1ae0 [ 5375.719021] [] lock_acquire+0xce/0x1e0 [ 5375.719026] [] percpu_down_read+0x4f/0xa0 [ 5375.719028] [] __sb_start_write+0xb7/0xf0 [ 5375.719031] [] mnt_want_write+0x24/0x50 [ 5375.719035] [] path_openat+0xd32/0x1360 [ 5375.719037] [] do_filp_open+0x7e/0xe0 [ 5375.719040] [] file_open_name+0xe4/0x130 [ 5375.719043] [] filp_open+0x33/0x60 [ 5375.719073] [] update_dev_time+0x16/0x40 [btrfs] [ 5375.719099] [] btrfs_scratch_superblocks+0x4e/0x90 [btrfs] [ 5375.719123] [] btrfs_destroy_dev_replace_tgtdev+0x65/0x150 [btrfs] [ 5375.719150] [] btrfs_dev_replace_finishing+0x6b0/0x990 [btrfs] [ 5375.719175] [] btrfs_dev_replace_start+0x33e/0x540 [btrfs] [ 5375.719199] [] btrfs_auto_replace_start+0xf8/0x140 [btrfs] [ 5375.719222] [] health_kthread+0x246/0x490 [btrfs] [ 5375.719225] [] kthread+0xef/0x110 [ 5375.719229] [] ret_from_fork+0x3f/0x70 [ 5375.719230] [ 5375.719230] other info that might help us debug this: [ 5375.719230] [ 5375.719233] Chain exists of: [ 5375.719233] sb_writers --> namespace_sem --> &fs_devs->device_list_mutex [ 5375.719233] [ 5375.719234] Possible unsafe locking scenario: [ 5375.719234] [ 5375.719234] CPU0 CPU1 [ 5375.719235] ---- ---- [ 5375.719236] lock(&fs_devs->device_list_mutex); [ 5375.719238] lock(namespace_sem); [ 5375.719239] lock(&fs_devs->device_list_mutex); [ 5375.719241] lock(sb_writers); [ 5375.719241] [ 5375.719241] *** DEADLOCK *** [ 5375.719241] [ 5375.719243] 4 locks held by btrfs-health/4662: [ 5375.719266] #0: (&fs_info->health_mutex){+.+.+.}, at: [] health_kthread+0x63/0x490 [btrfs] [ 5375.719293] #1: (&fs_info->dev_replace.lock_finishing_cancel_unmount){+.+.+.}, at: [] btrfs_dev_replace_finishing+0x41/0x990 [btrfs] [ 5375.719319] #2: (uuid_mutex){+.+.+.}, at: [] btrfs_destroy_dev_replace_tgtdev+0x20/0x150 [btrfs] [ 5375.719343] #3: (&fs_devs->device_list_mutex){+.+.+.}, at: [] btrfs_destroy_dev_replace_tgtdev+0x3c/0x150 [btrfs] [ 5375.719343] [ 5375.719343] stack backtrace: [ 5375.719347] CPU: 2 PID: 4662 Comm: btrfs-health Not tainted 4.4.5-scst31x-debug-11+ #40 [ 5375.719348] Hardware name: Supermicro SYS-6018R-WTRT/X10DRW-iT, BIOS 1.0c 
01/07/2015 [ 5375.719352] 0000000000000000 ffff880856f73880 ffffffff813529e3 ffffffff826182a0 [ 5375.719354] ffffffff8260c090 ffff880856f738c0 ffffffff810d667c ffff880856f73930 [ 5375.719357] ffff880861f32b40 ffff880861f32b68 0000000000000003 0000000000000004 [ 5375.719357] Call Trace: [ 5375.719363] [] dump_stack+0x85/0xc2 [ 5375.719366] [] print_circular_bug+0x1ec/0x260 [ 5375.719369] [] __lock_acquire+0x17ba/0x1ae0 [ 5375.719373] [] ? debug_lockdep_rcu_enabled+0x1d/0x20 [ 5375.719376] [] lock_acquire+0xce/0x1e0 [ 5375.719378] [] ? __sb_start_write+0xb7/0xf0 [ 5375.719383] [] percpu_down_read+0x4f/0xa0 [ 5375.719385] [] ? __sb_start_write+0xb7/0xf0 [ 5375.719387] [] __sb_start_write+0xb7/0xf0 [ 5375.719389] [] mnt_want_write+0x24/0x50 [ 5375.719393] [] path_openat+0xd32/0x1360 [ 5375.719415] [] ? btrfs_congested_fn+0x180/0x180 [btrfs] [ 5375.719418] [] ? debug_lockdep_rcu_enabled+0x1d/0x20 [ 5375.719420] [] do_filp_open+0x7e/0xe0 [ 5375.719423] [] ? rcu_read_lock_sched_held+0x6d/0x80 [ 5375.719426] [] ? kmem_cache_alloc+0x26b/0x5d0 [ 5375.719430] [] ? getname_kernel+0x34/0x120 [ 5375.719433] [] file_open_name+0xe4/0x130 [ 5375.719436] [] filp_open+0x33/0x60 [ 5375.719462] [] update_dev_time+0x16/0x40 [btrfs] [ 5375.719485] [] btrfs_scratch_superblocks+0x4e/0x90 [btrfs] [ 5375.719506] [] btrfs_destroy_dev_replace_tgtdev+0x65/0x150 [btrfs] [ 5375.719530] [] btrfs_dev_replace_finishing+0x6b0/0x990 [btrfs] [ 5375.719554] [] ? btrfs_dev_replace_finishing+0x553/0x990 [btrfs] [ 5375.719576] [] btrfs_dev_replace_start+0x33e/0x540 [btrfs] [ 5375.719598] [] btrfs_auto_replace_start+0xf8/0x140 [btrfs] [ 5375.719621] [] health_kthread+0x246/0x490 [btrfs] [ 5375.719641] [] ? health_kthread+0x138/0x490 [btrfs] [ 5375.719661] [] ? btrfs_congested_fn+0x180/0x180 [btrfs] [ 5375.719663] [] kthread+0xef/0x110 [ 5375.719666] [] ? kthread_create_on_node+0x200/0x200 [ 5375.719669] [] ret_from_fork+0x3f/0x70 [ 5375.719672] [] ? kthread_create_on_node+0x200/0x200 [ 5375.719697] ------------[ cut here ]------------ Signed-off-by: Anand Jain Reported-by: Yauhen Kharuzhy Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 83dfed667637..7bd553d42e0d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2024,10 +2024,9 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev); - if (tgtdev->bdev) { - btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); + if (tgtdev->bdev) fs_info->fs_devices->open_devices--; - } + fs_info->fs_devices->num_devices--; next_device = list_entry(fs_info->fs_devices->devices.next, @@ -2038,10 +2037,18 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, fs_info->fs_devices->latest_bdev = next_device->bdev; list_del_rcu(&tgtdev->dev_list); - call_rcu(&tgtdev->rcu, free_device); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); mutex_unlock(&uuid_mutex); + + /* + * The update_dev_time() with in btrfs_scratch_superblocks() + * may lead to a call to btrfs_show_devname() which will try + * to hold device_list_mutex. And here this device + * is already out of device list, so we don't have to hold + * the device_list_mutex lock. 
+ */ + btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); + call_rcu(&tgtdev->rcu, free_device); } static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, From 6ff48ce06b07255a6459cd8b816a110971a81f00 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 11 Apr 2016 18:40:08 +0200 Subject: [PATCH 11/37] btrfs: send: use vmalloc only as fallback for send_buf Signed-off-by: David Sterba --- fs/btrfs/send.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index ec433795fa71..b6e2c6ec4ee5 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6028,10 +6028,13 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sctx->clone_roots_cnt = arg->clone_sources_count; sctx->send_max_size = BTRFS_SEND_BUF_SIZE; - sctx->send_buf = vmalloc(sctx->send_max_size); + sctx->send_buf = kmalloc(sctx->send_max_size, GFP_KERNEL | __GFP_NOWARN); if (!sctx->send_buf) { - ret = -ENOMEM; - goto out; + sctx->send_buf = vmalloc(sctx->send_max_size); + if (!sctx->send_buf) { + ret = -ENOMEM; + goto out; + } } sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE); @@ -6220,7 +6223,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) fput(sctx->send_filp); vfree(sctx->clone_roots); - vfree(sctx->send_buf); + kvfree(sctx->send_buf); vfree(sctx->read_buf); name_cache_free(sctx); From eb5b75fe2e61a9ba907785b70318736112b0cf93 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 11 Apr 2016 18:40:08 +0200 Subject: [PATCH 12/37] btrfs: send: use vmalloc only as fallback for read_buf Signed-off-by: David Sterba --- fs/btrfs/send.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index b6e2c6ec4ee5..4e950ebbef53 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6037,10 +6037,13 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) } } - sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE); + sctx->read_buf = kmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL | __GFP_NOWARN); if (!sctx->read_buf) { - ret = -ENOMEM; - goto out; + sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE); + if (!sctx->read_buf) { + ret = -ENOMEM; + goto out; + } } sctx->pending_dir_moves = RB_ROOT; @@ -6224,7 +6227,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) vfree(sctx->clone_roots); kvfree(sctx->send_buf); - vfree(sctx->read_buf); + kvfree(sctx->read_buf); name_cache_free(sctx); From e55d1153dbf48485a74eb4bf4eefeaedcf1486a9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 11 Apr 2016 18:52:02 +0200 Subject: [PATCH 13/37] btrfs: send: use temporary variable to store allocation size We're going to use the argument multiple times later. 
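For context, patches 11-16 all switch to the same kmalloc-first, vmalloc-fallback shape (a sketch; kvfree() is the matching free that accepts memory from either allocator):

	buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);
	if (!buf) {
		/* physically contiguous memory not available, fall back */
		buf = vmalloc(size);
		if (!buf)
			return -ENOMEM;
	}
	/* ... use buf ... */
	kvfree(buf);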
Signed-off-by: David Sterba --- fs/btrfs/send.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4e950ebbef53..4f85a47c2f55 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5939,6 +5939,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) u32 i; u64 *clone_sources_tmp = NULL; int clone_sources_to_rollback = 0; + unsigned alloc_size; int sort_clone_roots = 0; int index; @@ -6050,24 +6051,25 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sctx->waiting_dir_moves = RB_ROOT; sctx->orphan_dirs = RB_ROOT; - sctx->clone_roots = vzalloc(sizeof(struct clone_root) * - (arg->clone_sources_count + 1)); + alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1); + + sctx->clone_roots = vzalloc(alloc_size); if (!sctx->clone_roots) { ret = -ENOMEM; goto out; } + alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources); + if (arg->clone_sources_count) { - clone_sources_tmp = vmalloc(arg->clone_sources_count * - sizeof(*arg->clone_sources)); + clone_sources_tmp = vmalloc(alloc_size); if (!clone_sources_tmp) { ret = -ENOMEM; goto out; } ret = copy_from_user(clone_sources_tmp, arg->clone_sources, - arg->clone_sources_count * - sizeof(*arg->clone_sources)); + alloc_size); if (ret) { ret = -EFAULT; goto out; From c03d01f3404282712b9fd280297f133860c91c93 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 11 Apr 2016 18:40:08 +0200 Subject: [PATCH 14/37] btrfs: send: use vmalloc only as fallback for clone_roots Signed-off-by: David Sterba --- fs/btrfs/send.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4f85a47c2f55..5a5d37b37150 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6053,10 +6053,13 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1); - sctx->clone_roots = vzalloc(alloc_size); + sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN); if (!sctx->clone_roots) { - ret = -ENOMEM; - goto out; + sctx->clone_roots = vzalloc(alloc_size); + if (!sctx->clone_roots) { + ret = -ENOMEM; + goto out; + } } alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources); @@ -6227,7 +6230,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) if (sctx->send_filp) fput(sctx->send_filp); - vfree(sctx->clone_roots); + kvfree(sctx->clone_roots); kvfree(sctx->send_buf); kvfree(sctx->read_buf); From 2f91306a37809907474a06c1defdb1ff50be06f0 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 11 Apr 2016 18:40:08 +0200 Subject: [PATCH 15/37] btrfs: send: use vmalloc only as fallback for clone_sources_tmp Signed-off-by: David Sterba --- fs/btrfs/send.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 5a5d37b37150..6a8c86074aa4 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6065,10 +6065,13 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources); if (arg->clone_sources_count) { - clone_sources_tmp = vmalloc(alloc_size); + clone_sources_tmp = kmalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN); if (!clone_sources_tmp) { - ret = -ENOMEM; - goto out; + clone_sources_tmp = vmalloc(alloc_size); + if (!clone_sources_tmp) { + ret = -ENOMEM; + goto out; + } } ret = copy_from_user(clone_sources_tmp, arg->clone_sources, @@ -6106,7 +6109,7 @@ long 
btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sctx->clone_roots[i].root = clone_root; clone_sources_to_rollback = i + 1; } - vfree(clone_sources_tmp); + kvfree(clone_sources_tmp); clone_sources_tmp = NULL; } @@ -6224,7 +6227,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) btrfs_root_dec_send_in_progress(sctx->parent_root); kfree(arg); - vfree(clone_sources_tmp); + kvfree(clone_sources_tmp); if (sctx) { if (sctx->send_filp) fput(sctx->send_filp); kvfree(sctx->clone_roots); kvfree(sctx->send_buf); kvfree(sctx->read_buf); name_cache_free(sctx); From 153519559a39725c5a45269256fec0efb81bcd1f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 11 Apr 2016 18:40:08 +0200 Subject: [PATCH 16/37] btrfs: clone: use vmalloc only as fallback for nodesize buffer Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f545f81f642d..cfa7d47b1ba0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3468,13 +3468,16 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u64 last_dest_end = destoff; ret = -ENOMEM; - buf = vmalloc(root->nodesize); - if (!buf) - return ret; + buf = kmalloc(root->nodesize, GFP_KERNEL | __GFP_NOWARN); + if (!buf) { + buf = vmalloc(root->nodesize); + if (!buf) + return ret; + } path = btrfs_alloc_path(); if (!path) { - vfree(buf); + kvfree(buf); return ret; } @@ -3775,7 +3778,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, out: btrfs_free_path(path); - vfree(buf); + kvfree(buf); return ret; } From 49a3c4d9b64d2e5d6fd268d53153945d04cb5559 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 24 Mar 2016 17:49:22 +0100 Subject: [PATCH 17/37] btrfs: use dynamic allocation for root item in create_subvol The size of the root item is more than 400 bytes, which is quite a lot of stack space. As we do IO from inside the subvolume ioctls, we should keep the stack usage low in case the filesystem is on top of other layers (NFS, device mapper, iscsi, etc). Reviewed-by: Tsutomu Itoh Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 65 +++++++++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index cfa7d47b1ba0..798f58e7338e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -439,7 +439,7 @@ static noinline int create_subvol(struct inode *dir, { struct btrfs_trans_handle *trans; struct btrfs_key key; - struct btrfs_root_item root_item; + struct btrfs_root_item *root_item; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; struct btrfs_root *root = BTRFS_I(dir)->root; @@ -455,16 +455,22 @@ static noinline int create_subvol(struct inode *dir, u64 qgroup_reserved; uuid_le new_uuid; + root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); + if (!root_item) + return -ENOMEM; + ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); if (ret) - return ret; + goto fail_free; /* * Don't create subvolume whose level is not zero. Or qgroup will be * screwed up since it assume subvolme qgroup's level to be 0.
*/ - if (btrfs_qgroup_level(objectid)) - return -ENOSPC; + if (btrfs_qgroup_level(objectid)) { + ret = -ENOSPC; + goto fail_free; + } btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* @@ -474,14 +480,14 @@ static noinline int create_subvol(struct inode *dir, ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, &qgroup_reserved, false); if (ret) - return ret; + goto fail_free; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); - return ret; + goto fail_free; } trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; @@ -509,47 +515,45 @@ static noinline int create_subvol(struct inode *dir, BTRFS_UUID_SIZE); btrfs_mark_buffer_dirty(leaf); - memset(&root_item, 0, sizeof(root_item)); - - inode_item = &root_item.inode; + inode_item = &root_item->inode; btrfs_set_stack_inode_generation(inode_item, 1); btrfs_set_stack_inode_size(inode_item, 3); btrfs_set_stack_inode_nlink(inode_item, 1); btrfs_set_stack_inode_nbytes(inode_item, root->nodesize); btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); - btrfs_set_root_flags(&root_item, 0); - btrfs_set_root_limit(&root_item, 0); + btrfs_set_root_flags(root_item, 0); + btrfs_set_root_limit(root_item, 0); btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT); - btrfs_set_root_bytenr(&root_item, leaf->start); - btrfs_set_root_generation(&root_item, trans->transid); - btrfs_set_root_level(&root_item, 0); - btrfs_set_root_refs(&root_item, 1); - btrfs_set_root_used(&root_item, leaf->len); - btrfs_set_root_last_snapshot(&root_item, 0); + btrfs_set_root_bytenr(root_item, leaf->start); + btrfs_set_root_generation(root_item, trans->transid); + btrfs_set_root_level(root_item, 0); + btrfs_set_root_refs(root_item, 1); + btrfs_set_root_used(root_item, leaf->len); + btrfs_set_root_last_snapshot(root_item, 0); - btrfs_set_root_generation_v2(&root_item, - btrfs_root_generation(&root_item)); + btrfs_set_root_generation_v2(root_item, + btrfs_root_generation(root_item)); uuid_le_gen(&new_uuid); - memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE); - btrfs_set_stack_timespec_sec(&root_item.otime, cur_time.tv_sec); - btrfs_set_stack_timespec_nsec(&root_item.otime, cur_time.tv_nsec); - root_item.ctime = root_item.otime; - btrfs_set_root_ctransid(&root_item, trans->transid); - btrfs_set_root_otransid(&root_item, trans->transid); + memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE); + btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec); + btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec); + root_item->ctime = root_item->otime; + btrfs_set_root_ctransid(root_item, trans->transid); + btrfs_set_root_otransid(root_item, trans->transid); btrfs_tree_unlock(leaf); free_extent_buffer(leaf); leaf = NULL; - btrfs_set_root_dirid(&root_item, new_dirid); + btrfs_set_root_dirid(root_item, new_dirid); key.objectid = objectid; key.offset = 0; key.type = BTRFS_ROOT_ITEM_KEY; ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, - &root_item); + root_item); if (ret) goto fail; @@ -601,12 +605,13 @@ static noinline int create_subvol(struct inode *dir, BUG_ON(ret); ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root, - root_item.uuid, BTRFS_UUID_KEY_SUBVOL, + root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) btrfs_abort_transaction(trans, root, ret); fail: + kfree(root_item); trans->block_rsv = NULL; trans->bytes_reserved = 0; btrfs_subvolume_release_metadata(root, &block_rsv, 
qgroup_reserved); @@ -629,6 +634,10 @@ static noinline int create_subvol(struct inode *dir, d_instantiate(dentry, inode); } return ret; + +fail_free: + kfree(root_item); + return ret; } static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root) From e6c11f9a462e9ef4876d5e1539a6c06eded4c793 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 24 Mar 2016 18:00:53 +0100 Subject: [PATCH 18/37] btrfs: reuse existing variable in scrub_stripe, reduce stack usage The key variable occupies 17 bytes and key_start is used only once, so we can simply reuse the existing 'key' for that purpose. As the key is not a simple type, the compiler does not do this on its own. Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 4678f03e878e..2ff2876656ba 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3070,7 +3070,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, int slot; u64 nstripes; struct extent_buffer *l; - struct btrfs_key key; u64 physical; u64 logical; u64 logic_end; @@ -3079,7 +3078,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, int mirror_num; struct reada_control *reada1; struct reada_control *reada2; - struct btrfs_key key_start; + struct btrfs_key key; struct btrfs_key key_end; u64 increment = map->stripe_len; u64 offset; @@ -3158,21 +3157,21 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, scrub_blocked_if_needed(fs_info); /* FIXME it might be better to start readahead at commit root */ - key_start.objectid = logical; - key_start.type = BTRFS_EXTENT_ITEM_KEY; - key_start.offset = (u64)0; + key.objectid = logical; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)0; key_end.objectid = logic_end; key_end.type = BTRFS_METADATA_ITEM_KEY; key_end.offset = (u64)-1; - reada1 = btrfs_reada_add(root, &key_start, &key_end); + reada1 = btrfs_reada_add(root, &key, &key_end); - key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key_start.type = BTRFS_EXTENT_CSUM_KEY; - key_start.offset = logical; + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = logical; key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; key_end.type = BTRFS_EXTENT_CSUM_KEY; key_end.offset = logic_end; - reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); + reada2 = btrfs_reada_add(csum_root, &key, &key_end); if (!IS_ERR(reada1)) btrfs_reada_wait(reada1); From ee6111386a1b304f8bf589d36810d53e3b27ee20 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 23 Jan 2015 18:43:31 +0100 Subject: [PATCH 19/37] btrfs: add read-only check to sysfs handler of features We don't want to trigger the change on a read-only filesystem, similar to what the label handler does.
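With the check in place, an attempt to toggle a feature on a read-only filesystem fails up front instead of reaching the transaction code (illustration only; the attribute name is an example):

	# mount -o remount,ro /mnt
	# echo 1 > /sys/fs/btrfs/<fsid>/features/<feature>
	-bash: echo: write error: Read-only file system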
Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 539e7b5e3f86..6a6bb600b1ff 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -120,6 +120,9 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, if (!fs_info) return -EPERM; + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + ret = kstrtoul(skip_spaces(buf), 0, &val); if (ret) return ret; From 66ac9fe7bacf9fa76c472efc7a7aaa590c7bce6a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 26 Apr 2016 16:03:57 +0200 Subject: [PATCH 20/37] btrfs: add check to sysfs handler of label Add a sanity check for the fs_info as we will dereference it, similar to what the 'store features' handler does. Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 6a6bb600b1ff..3d14618ce54b 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -377,6 +377,9 @@ static ssize_t btrfs_label_store(struct kobject *kobj, struct btrfs_fs_info *fs_info = to_fs_info(kobj); size_t p_len; + if (!fs_info) + return -EPERM; + if (fs_info->sb->s_flags & MS_RDONLY) return -EROFS; From ee17fc8005287d2d6ca7cab6e814e5043d773735 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 26 Apr 2016 16:22:06 +0200 Subject: [PATCH 21/37] btrfs: sysfs: protect reading label by lock If the label-setting ioctl races with the sysfs label handler, we could get a mixed result in the output, part old and part new. We should get either the old or the new label. The chances of hitting this race are low. Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 3d14618ce54b..4879656bda3c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -367,7 +367,13 @@ static ssize_t btrfs_label_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); char *label = fs_info->super_copy->label; - return snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label); + ssize_t ret; + + spin_lock(&fs_info->super_lock); + ret = snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label); + spin_unlock(&fs_info->super_lock); + + return ret; } static ssize_t btrfs_label_store(struct kobject *kobj, From 3d8da67817606380fdadfa483d4dba5c3a5446c6 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 26 Apr 2016 17:53:31 -0700 Subject: [PATCH 22/37] Btrfs: fix divide error upon chunk's stripe_len The struct 'map_lookup' uses type int for @stripe_len, while btrfs_chunk_stripe_len() can return a u64 value, so @stripe_len may end up holding an undefined value, which can lead to a 'divide error' in __btrfs_map_block(). This changes 'map_lookup' to use type u64 for stripe_len. Right now we only ever use BTRFS_STRIPE_LEN for stripe_len, so this also adds a validity check against BTRFS_STRIPE_LEN.
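Note the folded fix: with stripe_len widened to u64, the open-coded division in scrub_raid56_parity() would break 32-bit builds (gcc turns a u64 division into a call to a libgcc helper the kernel does not provide), hence the switch to the division helper:

	/* before: nsectors = map->stripe_len / root->sectorsize; */
	nsectors = div_u64(map->stripe_len, root->sectorsize);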
Reported-by: Vegard Nossum Reported-by: Quentin Casasnovas Signed-off-by: Liu Bo Reviewed-by: David Sterba [ folded division fix to scrub_raid56_parity ] Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 2 +- fs/btrfs/volumes.c | 2 +- fs/btrfs/volumes.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2ff2876656ba..96d2a0de35a8 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2860,7 +2860,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, int extent_mirror_num; int stop_loop = 0; - nsectors = map->stripe_len / root->sectorsize; + nsectors = div_u64(map->stripe_len, root->sectorsize); bitmap_len = scrub_calc_parity_bitmap_len(nsectors); sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len, GFP_NOFS); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7bd553d42e0d..f13e2bcc1398 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6254,7 +6254,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, "invalid chunk length %llu", length); return -EIO; } - if (!is_power_of_2(stripe_len)) { + if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { btrfs_err(root->fs_info, "invalid chunk stripe length: %llu", stripe_len); return -EIO; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 1939ebde63df..7507be74f7da 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -347,7 +347,7 @@ struct map_lookup { u64 type; int io_align; int io_width; - int stripe_len; + u64 stripe_len; int sector_size; int num_stripes; int sub_stripes; From e042d1ec4417981dfe9331e47b76f17929bc2ffe Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 12 Apr 2016 12:54:40 -0400 Subject: [PATCH 23/37] Btrfs: remove BUG_ON()'s in btrfs_map_block btrfs_map_block can go horribly wrong in the face of fs corruption; let's agree not to be assholes and panic at any possible chance things are all fucked up.
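The conversion pattern, condensed from the hunks below: report, clean up and return an error instead of panicking (a sketch):

	/* was: BUG_ON(offset < stripe_offset); */
	if (offset < stripe_offset) {
		btrfs_crit(fs_info, "stripe math has gone wrong, ...");
		free_extent_map(em);
		return -EINVAL;
	}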
Signed-off-by: Josef Bacik [ removed type casts ] Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f13e2bcc1398..8d3ceb0fd239 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5290,7 +5290,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_nr = div64_u64(stripe_nr, stripe_len); stripe_offset = stripe_nr * stripe_len; - BUG_ON(offset < stripe_offset); + if (offset < stripe_offset) { + btrfs_crit(fs_info, "stripe math has gone wrong, " + "stripe_offset=%llu, offset=%llu, start=%llu, " + "logical=%llu, stripe_len=%llu", + stripe_offset, offset, em->start, logical, + stripe_len); + free_extent_map(em); + return -EINVAL; + } /* stripe_offset is the offset of this block in its stripe*/ stripe_offset = offset - stripe_offset; @@ -5531,7 +5539,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, &stripe_index); mirror_num = stripe_index + 1; } - BUG_ON(stripe_index >= map->num_stripes); + if (stripe_index >= map->num_stripes) { + btrfs_crit(fs_info, "stripe index math went horribly wrong, " + "got stripe_index=%u, num_stripes=%u", + stripe_index, map->num_stripes); + ret = -EINVAL; + goto out; + } num_alloc_stripes = num_stripes; if (dev_replace_is_ongoing) { From 2473114981a36b4f0f57cf6e9548037d547a71b7 Mon Sep 17 00:00:00 2001 From: Ashish Samant Date: Fri, 29 Apr 2016 18:33:59 -0700 Subject: [PATCH 24/37] btrfs: Fix BUG_ON condition in scrub_setup_recheck_block() The pagev array in scrub_block{} is of size SCRUB_MAX_PAGES_PER_BLOCK, so page_index should be checked against that limit in the BUG_ON(). Signed-off-by: Ashish Samant Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 96d2a0de35a8..d270c700ed31 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1350,7 +1350,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, recover->bbio = bbio; recover->map_length = mapped_length; - BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); + BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK); nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS); From 48b3b9d401ec86899a52003b37331190a35a81a6 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 12 Apr 2016 21:36:16 +0800 Subject: [PATCH 25/37] btrfs: fix lock dep warning, move scratch super outside of chunk_mutex Move scratching the super outside of the chunk lock to avoid the lockdep warning below. A better place to scratch the super is in btrfs_rm_dev_replace_free_srcdev(), just before free_device, which is outside of the chunk lock as well.
To reproduce: (fresh boot) mkfs.btrfs -f -draid5 -mraid5 /dev/sdc /dev/sdd /dev/sde mount /dev/sdc /btrfs dd if=/dev/zero of=/btrfs/tf1 bs=4096 count=100 (get devmgt from https://github.com/asj/devmgt.git) devmgt detach /dev/sde dd if=/dev/zero of=/btrfs/tf1 bs=4096 count=100 sync btrfs replace start -Brf 3 /dev/sdf /btrfs <-- devmgt attach host7 ====================================================== [ INFO: possible circular locking dependency detected ] 4.6.0-rc2asj+ #1 Not tainted --------------------------------------------------- btrfs/2174 is trying to acquire lock: (sb_writers){.+.+.+}, at: [] __sb_start_write+0xb4/0xf0 but task is already holding lock: (&fs_info->chunk_mutex){+.+.+.}, at: [] btrfs_dev_replace_finishing+0x145/0x980 [btrfs] which lock already depends on the new lock. Chain exists of: sb_writers --> &fs_devs->device_list_mutex --> &fs_info->chunk_mutex Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&fs_info->chunk_mutex); lock(&fs_devs->device_list_mutex); lock(&fs_info->chunk_mutex); lock(sb_writers); *** DEADLOCK *** -> #0 (sb_writers){.+.+.+}: [] __lock_acquire+0x1bc5/0x1ee0 [] lock_acquire+0xbe/0x210 [] percpu_down_read+0x4a/0xa0 [] __sb_start_write+0xb4/0xf0 [] mnt_want_write+0x24/0x50 [] path_openat+0x952/0x1190 [] do_filp_open+0x91/0x100 [] file_open_name+0xfc/0x140 [] filp_open+0x33/0x60 [] update_dev_time+0x16/0x40 [btrfs] [] btrfs_scratch_superblocks+0x5d/0xb0 [btrfs] [] btrfs_rm_dev_replace_remove_srcdev+0xae/0xd0 [btrfs] [] btrfs_dev_replace_finishing+0x4b5/0x980 [btrfs] [] btrfs_dev_replace_start+0x358/0x530 [btrfs] Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8d3ceb0fd239..c5e0d2d431dc 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1972,11 +1972,8 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, if (srcdev->missing) fs_devices->missing_devices--; - if (srcdev->writeable) { + if (srcdev->writeable) fs_devices->rw_devices--; - /* zero out the old super if it is writable */ - btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); - } if (srcdev->bdev) fs_devices->open_devices--; @@ -1987,6 +1984,10 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; + if (srcdev->writeable) { + /* zero out the old super if it is writable */ + btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); + } call_rcu(&srcdev->rcu, free_device); /* From 7ab19625a911f7568ec85302e3aa7a64186006c8 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 4 May 2016 11:32:00 +0200 Subject: [PATCH 26/37] btrfs: add write protection to SET_FEATURES ioctl Perform the want_write check if we get far enough to do any writes. 
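The resulting shape of the handler (a sketch): take the write access before starting the transaction, and release it on every exit path:

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_drop_write;
	}
	/* ... update the super flags ... */
	ret = btrfs_commit_transaction(trans, root);
out_drop_write:
	mnt_drop_write_file(file);
	return ret;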
Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 798f58e7338e..1e8ce5247a81 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -5406,9 +5406,15 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg) if (ret) return ret; + ret = mnt_want_write_file(file); + if (ret) + return ret; + trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_drop_write; + } spin_lock(&root->fs_info->super_lock); newflags = btrfs_super_compat_flags(super_block); @@ -5427,7 +5433,11 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg) btrfs_set_super_incompat_flags(super_block, newflags); spin_unlock(&root->fs_info->super_lock); - return btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans, root); +out_drop_write: + mnt_drop_write_file(file); + + return ret; } long btrfs_ioctl(struct file *file, unsigned int From 58d7bbf81f54667e36940d5f4b5609606efa597b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 4 May 2016 14:10:47 +0200 Subject: [PATCH 27/37] btrfs: ioctl: reorder exclusive op check in RM_DEV Move the op exclusivity check before the other code (same as in ADD_DEV). Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1e8ce5247a81..c81d6daefe74 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2689,32 +2689,31 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (ret) return ret; - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) { - ret = PTR_ERR(vol_args); - goto err_drop; - } - - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + goto out_drop_write; + } + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); goto out; } + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; mutex_lock(&root->fs_info->volume_mutex); ret = btrfs_rm_device(root, vol_args->name); mutex_unlock(&root->fs_info->volume_mutex); - atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); if (!ret) btrfs_info(root->fs_info, "disk deleted %s",vol_args->name); - -out: kfree(vol_args); -err_drop: +out: + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); +out_drop_write: mnt_drop_write_file(file); + return ret; } From 2f3165ecf103599f82bf0ea254039db335fb5005 Mon Sep 17 00:00:00 2001 From: Zygo Blaxell Date: Thu, 5 May 2016 00:23:49 -0400 Subject: [PATCH 28/37] btrfs: don't force mounts to wait for cleaner_kthread to delete one or more subvolumes During a mount, we start the cleaner kthread first because the transaction kthread wants to wake up the cleaner kthread. We start the transaction kthread next because everything in btrfs wants transactions. We do reloc recovery in the thread that was doing the original mount call once the transaction kthread is running. This means that the cleaner kthread could already be running when reloc recovery happens (e.g. if a snapshot delete was started before a crash). 
Relocation does not play well with the cleaner kthread, so a mutex was added in commit 5f3164813b90f7dbcb5c3ab9006906222ce471b7 "Btrfs: fix race between balance recovery and root deletion" to prevent both from being active at the same time. If the cleaner kthread is already holding the mutex by the time we get to btrfs_recover_relocation, the mount will be blocked until at least one deleted subvolume is cleaned (possibly more if the mount process doesn't get the lock right away). During this time (which could be an arbitrarily long time on a large/slow filesystem), the mount process is stuck and the filesystem is unnecessarily inaccessible. Fix this by locking cleaner_mutex before we start cleaner_kthread, and unlocking the mutex after mount no longer requires it. This ensures that the mounting process will not be blocked by the cleaner kthread. The cleaner kthread is already prepared for mutex contention and will just go to sleep until the mutex is available. Signed-off-by: Zygo Blaxell Reviewed-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4e47849d7427..070c1dad42bd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2517,6 +2517,7 @@ int open_ctree(struct super_block *sb, int num_backups_tried = 0; int backup_index = 0; int max_active; + bool cleaner_mutex_locked = false; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL); @@ -2997,6 +2998,13 @@ int open_ctree(struct super_block *sb, goto fail_sysfs; } + /* + * Hold the cleaner_mutex thread here so that we don't block + * for a long time on btrfs_recover_relocation. cleaner_kthread + * will wait for us to finish mounting the filesystem. + */ + mutex_lock(&fs_info->cleaner_mutex); + cleaner_mutex_locked = true; fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, "btrfs-cleaner"); if (IS_ERR(fs_info->cleaner_kthread)) @@ -3056,10 +3064,8 @@ int open_ctree(struct super_block *sb, ret = btrfs_cleanup_fs_roots(fs_info); if (ret) goto fail_qgroup; - - mutex_lock(&fs_info->cleaner_mutex); + /* We locked cleaner_mutex before creating cleaner_kthread. */ ret = btrfs_recover_relocation(tree_root); - mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { printk(KERN_WARNING "BTRFS: failed to recover relocation\n"); @@ -3067,6 +3073,8 @@ int open_ctree(struct super_block *sb, goto fail_qgroup; } } + mutex_unlock(&fs_info->cleaner_mutex); + cleaner_mutex_locked = false; location.objectid = BTRFS_FS_TREE_OBJECTID; location.type = BTRFS_ROOT_ITEM_KEY; @@ -3180,6 +3188,10 @@ int open_ctree(struct super_block *sb, filemap_write_and_wait(fs_info->btree_inode->i_mapping); fail_sysfs: + if (cleaner_mutex_locked) { + mutex_unlock(&fs_info->cleaner_mutex); + cleaner_mutex_locked = false; + } btrfs_sysfs_remove_mounted(fs_info); fail_fsdev_sysfs: From 8eb0dfdbda3f56bf7d248ed87fcc383df114ecbb Mon Sep 17 00:00:00 2001 From: Adam Borowski Date: Sun, 8 May 2016 15:08:00 +0200 Subject: [PATCH 29/37] btrfs: fix int32 overflow in shrink_delalloc(). UBSAN: Undefined behaviour in fs/btrfs/extent-tree.c:4623:21 signed integer overflow: 10808 * 262144 cannot be represented in type 'int [8]' If 8192<=items<16384, we request a writeback of an insane number of pages which is benign (everything will be written). But if items>=16384, the space reservation won't be enough. 
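The arithmetic, spelled out: EXTENT_SIZE_PER_ITEM is 256K (2^18), so with 32-bit int arithmetic the product wraps as soon as items reaches 2^13 = 8192:

	items = 10808;				/* value from the UBSAN report */
	items * EXTENT_SIZE_PER_ITEM;		/* 10808 * 262144 = 2833252352 > INT_MAX: undefined */
	(u64)items * EXTENT_SIZE_PER_ITEM;	/* promoted to 64 bits, no overflow */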
Signed-off-by: Adam Borowski
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 84e060eb0de8..391f576789e9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4620,7 +4620,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 
 	/* Calc the number of the pages we need flush for space reservation */
 	items = calc_reclaim_items_nr(root, to_reclaim);
-	to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+	to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM;
 
 	trans = (struct btrfs_trans_handle *)current->journal_info;
 	block_rsv = &root->fs_info->delalloc_block_rsv;

From 6ac10a6ac2b11ada24580cc76dcd0c182061c576 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Wed, 27 Apr 2016 02:15:15 +0200
Subject: [PATCH 30/37] btrfs: rename and document compression workspace
 members

The names are confusing, pick more fitting names and add comments.

Signed-off-by: David Sterba
---
 fs/btrfs/compression.c | 35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ff61a41ac90b..4d5cd9624bb3 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -743,8 +743,11 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 static struct {
 	struct list_head idle_ws;
 	spinlock_t ws_lock;
-	int num_ws;
-	atomic_t alloc_ws;
+	/* Number of free workspaces */
+	int free_ws;
+	/* Total number of allocated workspaces */
+	atomic_t total_ws;
+	/* Waiters for a free workspace */
 	wait_queue_head_t ws_wait;
 } btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
 
@@ -760,7 +763,7 @@ void __init btrfs_init_compress(void)
 	for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
 		INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
 		spin_lock_init(&btrfs_comp_ws[i].ws_lock);
-		atomic_set(&btrfs_comp_ws[i].alloc_ws, 0);
+		atomic_set(&btrfs_comp_ws[i].total_ws, 0);
 		init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
 	}
 }
@@ -777,35 +780,35 @@ static struct list_head *find_workspace(int type)
 	struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws;
 	spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock;
-	atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws;
+	atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws;
 	wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait;
-	int *num_ws = &btrfs_comp_ws[idx].num_ws;
+	int *free_ws = &btrfs_comp_ws[idx].free_ws;
again:
 	spin_lock(ws_lock);
 	if (!list_empty(idle_ws)) {
 		workspace = idle_ws->next;
 		list_del(workspace);
-		(*num_ws)--;
+		(*free_ws)--;
 		spin_unlock(ws_lock);
 		return workspace;
 
 	}
-	if (atomic_read(alloc_ws) > cpus) {
+	if (atomic_read(total_ws) > cpus) {
 		DEFINE_WAIT(wait);
 
 		spin_unlock(ws_lock);
 		prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
-		if (atomic_read(alloc_ws) > cpus && !*num_ws)
+		if (atomic_read(total_ws) > cpus && !*free_ws)
 			schedule();
 		finish_wait(ws_wait, &wait);
 		goto again;
 	}
-	atomic_inc(alloc_ws);
+	atomic_inc(total_ws);
 	spin_unlock(ws_lock);
 
 	workspace = btrfs_compress_op[idx]->alloc_workspace();
 	if (IS_ERR(workspace)) {
-		atomic_dec(alloc_ws);
+		atomic_dec(total_ws);
 		wake_up(ws_wait);
 	}
 	return workspace;
@@ -820,21 +823,21 @@ static void free_workspace(int type, struct list_head *workspace)
 	int idx = type - 1;
 	struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws;
 	spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock;
-	atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws;
+	atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws;
 	wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait;
-	int *num_ws = &btrfs_comp_ws[idx].num_ws;
+	int *free_ws = &btrfs_comp_ws[idx].free_ws;
 
 	spin_lock(ws_lock);
-	if (*num_ws < num_online_cpus()) {
+	if (*free_ws < num_online_cpus()) {
 		list_add(workspace, idle_ws);
-		(*num_ws)++;
+		(*free_ws)++;
 		spin_unlock(ws_lock);
 		goto wake;
 	}
 	spin_unlock(ws_lock);
 
 	btrfs_compress_op[idx]->free_workspace(workspace);
-	atomic_dec(alloc_ws);
+	atomic_dec(total_ws);
wake:
 	/*
 	 * Make sure counter is updated before we wake up waiters.
@@ -857,7 +860,7 @@ static void free_workspaces(void)
 			workspace = btrfs_comp_ws[i].idle_ws.next;
 			list_del(workspace);
 			btrfs_compress_op[i]->free_workspace(workspace);
-			atomic_dec(&btrfs_comp_ws[i].alloc_ws);
+			atomic_dec(&btrfs_comp_ws[i].total_ws);
 		}
 	}
 }

From f77dd0d6b2f0f2cf290cacbd48f5eee18586e52b Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Wed, 27 Apr 2016 02:55:15 +0200
Subject: [PATCH 31/37] btrfs: preallocate compression workspaces

Preallocate one workspace for each compression type so we can guarantee
forward progress in the worst case. A failure cannot be a hard error as
we might not use compression at all on the filesystem. If we can't
allocate the workspaces later when we need them, it might actually
deadlock, but in such a situation the system effectively does not have
enough memory to operate properly.

Signed-off-by: David Sterba
---
 fs/btrfs/compression.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 4d5cd9624bb3..38c058bcf359 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -761,10 +761,26 @@ void __init btrfs_init_compress(void)
 	int i;
 
 	for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+		struct list_head *workspace;
+
 		INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
 		spin_lock_init(&btrfs_comp_ws[i].ws_lock);
 		atomic_set(&btrfs_comp_ws[i].total_ws, 0);
 		init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
+
+		/*
+		 * Preallocate one workspace for each compression type so
+		 * we can guarantee forward progress in the worst case
+		 */
+		workspace = btrfs_compress_op[i]->alloc_workspace();
+		if (IS_ERR(workspace)) {
+			printk(KERN_WARNING
+	"BTRFS: cannot preallocate compression workspace, will try later");
+		} else {
+			atomic_set(&btrfs_comp_ws[i].total_ws, 1);
+			btrfs_comp_ws[i].free_ws = 1;
+			list_add(workspace, &btrfs_comp_ws[i].idle_ws);
+		}
 	}
 }

From e721e49dd1681d45d71919f0561f5e978a34153c Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Wed, 27 Apr 2016 02:41:17 +0200
Subject: [PATCH 32/37] btrfs: make find_workspace always succeed

With just one preallocated workspace we can guarantee forward progress
even if there's no memory available for new workspaces. The cost is
more waiting, but we also get rid of several error paths.

On average, there will be several idle workspaces, so the waiting
penalty won't be so bad. In the worst case, all cpus will compete for
one workspace until there's some memory. Attempts to allocate a new one
are done each time the waiters are woken up.

Signed-off-by: David Sterba
---
 fs/btrfs/compression.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 38c058bcf359..c70625560265 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -785,8 +785,10 @@ void __init btrfs_init_compress(void)
 }
 
 /*
- * this finds an available workspace or allocates a new one
- * ERR_PTR is returned if things go bad.
+ * This finds an available workspace or allocates a new one.
+ * If it's not possible to allocate a new one, it waits until there is one.
+ * Preallocation makes a forward progress guarantee and we do not return
+ * errors.
  */
 static struct list_head *find_workspace(int type)
 {
@@ -826,6 +828,14 @@ static struct list_head *find_workspace(int type)
 	if (IS_ERR(workspace)) {
 		atomic_dec(total_ws);
 		wake_up(ws_wait);
+
+		/*
+		 * Do not return the error but go back to waiting. There's a
+		 * workspace preallocated for each type and the compression
+		 * time is bounded so we get to a workspace eventually. This
+		 * makes our caller's life easier.
+		 */
+		goto again;
 	}
 	return workspace;
 }
@@ -913,8 +923,6 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
 	int ret;
 
 	workspace = find_workspace(type);
-	if (IS_ERR(workspace))
-		return PTR_ERR(workspace);
 
 	ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
 						      start, len, pages,
@@ -949,8 +957,6 @@ static int btrfs_decompress_biovec(int type, struct page **pages_in,
 	int ret;
 
 	workspace = find_workspace(type);
-	if (IS_ERR(workspace))
-		return PTR_ERR(workspace);
 
 	ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
 							 disk_start,
@@ -971,8 +977,6 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
 	int ret;
 
 	workspace = find_workspace(type);
-	if (IS_ERR(workspace))
-		return PTR_ERR(workspace);
 
 	ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
 						  dest_page, start_byte,

From 523567168da04bae0f88802ddef49d00072c5d58 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Wed, 27 Apr 2016 03:07:39 +0200
Subject: [PATCH 33/37] btrfs: make find_workspace warn if there are no
 workspaces

Be verbose if there are no workspaces at all, i.e. the module init time
preallocation failed.

Signed-off-by: David Sterba
---
 fs/btrfs/compression.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index c70625560265..658c39b70fba 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -834,7 +834,21 @@ static struct list_head *find_workspace(int type)
 		 * workspace preallocated for each type and the compression
 		 * time is bounded so we get to a workspace eventually. This
 		 * makes our caller's life easier.
+		 *
+		 * To prevent silent and low-probability deadlocks (when the
+		 * initial preallocation fails), check if there are any
+		 * workspaces at all.
 		 */
+		if (atomic_read(total_ws) == 0) {
+			static DEFINE_RATELIMIT_STATE(_rs,
+					/* once per minute */ 60 * HZ,
+					/* no burst */ 1);
+
+			if (__ratelimit(&_rs)) {
+				printk(KERN_WARNING
+			    "no compression workspaces, low memory, retrying");
+			}
+		}
 		goto again;
 	}
 	return workspace;

From 72928f2476d08c79f132b4f44a17c9a011dd98e3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vincent=20Stehl=C3=A9?=
Date: Tue, 10 May 2016 14:56:20 +0200
Subject: [PATCH 34/37] Btrfs: fix fspath error deallocation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make sure to deallocate fspath with vfree() in case of error in
init_ipath().

fspath is allocated with vmalloc() in init_data_container() since
commit 425d17a290c0 ("Btrfs: use larger limit for translation of
logical to inode").
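The rule being applied: memory obtained from vmalloc() must be returned
with vfree(); handing it to kfree() is invalid. A minimal sketch of the
corrected error path, simplified and paraphrased from init_ipath()
rather than quoted verbatim:

	fspath = init_data_container(total_bytes);	/* vmalloc()-backed */
	if (IS_ERR(fspath))
		return (void *)fspath;

	ifp = kmalloc(sizeof(*ifp), GFP_NOFS);		/* kmalloc()-backed */
	if (!ifp) {
		vfree(fspath);	/* must match vmalloc(); kfree() here was the bug */
		return ERR_PTR(-ENOMEM);
	}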
Signed-off-by: Vincent Stehlé
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/backref.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 80e8472d618b..d3090187fd76 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1991,7 +1991,7 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
 
 	ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
 	if (!ifp) {
-		kfree(fspath);
+		vfree(fspath);
 		return ERR_PTR(-ENOMEM);
 	}
 

From 6426c7ad697df4bddf1d222685e6802e7616feaa Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Wed, 11 May 2016 12:53:52 -0700
Subject: [PATCH 35/37] btrfs: qgroup: Fix qgroup accounting when creating
 snapshot

The current btrfs qgroup design implies a requirement that after
calling btrfs_qgroup_account_extents() there must be a commit root
switch.

Normally this is OK, as btrfs_qgroup_account_extents() is only called
inside btrfs_commit_transaction(), just before commit_cowonly_roots().

However there is an exception at create_pending_snapshot(), which will
call btrfs_qgroup_account_extents() but without any commit root switch.

In the case of creating a snapshot whose parent root is itself (creating
a snapshot of the fs tree), it will corrupt the qgroup numbers, as shown
by the following trace (unrelated data skipped):

======
btrfs_qgroup_account_extent: bytenr = 29786112, num_bytes = 16384, nr_old_roots = 0, nr_new_roots = 1
qgroup_update_counters: qgid = 5, cur_old_count = 0, cur_new_count = 1, rfer = 0, excl = 0
qgroup_update_counters: qgid = 5, cur_old_count = 0, cur_new_count = 1, rfer = 16384, excl = 16384
btrfs_qgroup_account_extent: bytenr = 29786112, num_bytes = 16384, nr_old_roots = 0, nr_new_roots = 0
======

The problem here is that in the first qgroup_account_extent(), the
nr_new_roots of the extent is 1, which means its reference got
increased, and qgroup increased its rfer and excl.

But in the second qgroup_account_extent(), its reference got decreased,
yet between these two calls there was no commit root switch. This leads
to the same nr_old_roots, so the extent just gets ignored by qgroup,
which means this extent is wrongly accounted.

Fix it by calling commit_cowonly_roots() after qgroup_account_extent()
in create_pending_snapshot(), with the needed preparation.

Mark: I added a check at the top of qgroup_account_snapshot() to skip
this code if qgroups are turned off. xfstest btrfs/122 exposes this
problem.
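In outline, the fix gives snapshot creation a miniature commit of its
own, so the commit roots are switched between the two accounting
passes. A condensed sketch of the call sequence the patch introduces
(error handling omitted; the full version is in the diff below):

	/* sketch: simplified flow of the new qgroup_account_snapshot() */
	ret = commit_fs_roots(trans, src);
	ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
	ret = btrfs_qgroup_account_extents(trans, fs_info);
	ret = btrfs_qgroup_inherit(trans, fs_info, src->root_key.objectid,
				   dst_objectid, inherit);
	ret = commit_cowonly_roots(trans, src);
	switch_commit_roots(trans->transaction, fs_info);	/* the missing root switch */
	ret = btrfs_write_and_wait_transaction(trans, src);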
Signed-off-by: Qu Wenruo
Reviewed-by: Josef Bacik
Signed-off-by: Mark Fasheh
Signed-off-by: David Sterba
---
 fs/btrfs/transaction.c | 129 +++++++++++++++++++++++++++++++++--------
 1 file changed, 105 insertions(+), 24 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 43885e51b882..d7172d7ced5f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -311,10 +311,11 @@ static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
  * when the transaction commits
  */
 static int record_root_in_trans(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root)
+			       struct btrfs_root *root,
+			       int force)
 {
-	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
-	    root->last_trans < trans->transid) {
+	if ((test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+	    root->last_trans < trans->transid) || force) {
 		WARN_ON(root == root->fs_info->extent_root);
 		WARN_ON(root->commit_root != root->node);
 
@@ -331,7 +332,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
 		smp_wmb();
 
 		spin_lock(&root->fs_info->fs_roots_radix_lock);
-		if (root->last_trans == trans->transid) {
+		if (root->last_trans == trans->transid && !force) {
 			spin_unlock(&root->fs_info->fs_roots_radix_lock);
 			return 0;
 		}
@@ -402,7 +403,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
 		return 0;
 
 	mutex_lock(&root->fs_info->reloc_mutex);
-	record_root_in_trans(trans, root);
+	record_root_in_trans(trans, root, 0);
 	mutex_unlock(&root->fs_info->reloc_mutex);
 
 	return 0;
@@ -1310,6 +1311,92 @@ int btrfs_defrag_root(struct btrfs_root *root)
 	return ret;
 }
 
+/*
+ * Do all the special snapshot-related qgroup dirty hacks.
+ *
+ * Do all the needed qgroup inherit and dirty hacks, like switching
+ * commit roots inside one transaction and writing all the btrees to
+ * disk, to make qgroup work.
+ */
+static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *src,
+				   struct btrfs_root *parent,
+				   struct btrfs_qgroup_inherit *inherit,
+				   u64 dst_objectid)
+{
+	struct btrfs_fs_info *fs_info = src->fs_info;
+	int ret;
+
+	/*
+	 * Save some performance in the case that qgroups are not
+	 * enabled. If this check races with the ioctl, rescan will
+	 * kick in anyway.
+	 */
+	mutex_lock(&fs_info->qgroup_ioctl_lock);
+	if (!fs_info->quota_enabled) {
+		mutex_unlock(&fs_info->qgroup_ioctl_lock);
+		return 0;
+	}
+	mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
+	/*
+	 * We are going to commit the transaction; see the comment in
+	 * btrfs_commit_transaction() for why we lock tree_log_mutex.
+	 */
+	mutex_lock(&fs_info->tree_log_mutex);
+
+	ret = commit_fs_roots(trans, src);
+	if (ret)
+		goto out;
+	ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
+	if (ret < 0)
+		goto out;
+	ret = btrfs_qgroup_account_extents(trans, fs_info);
+	if (ret < 0)
+		goto out;
+
+	/* Now qgroups are all updated, we can inherit them to the new qgroups */
+	ret = btrfs_qgroup_inherit(trans, fs_info,
+				   src->root_key.objectid, dst_objectid,
+				   inherit);
+	if (ret < 0)
+		goto out;
+
+	/*
+	 * Now we do a simplified commit transaction, which will:
+	 * 1) commit all subvolume and extent trees,
+	 *    to ensure all subvolume and extent trees have a valid
+	 *    commit_root for accounting the later insert_dir_item()
+	 * 2) write all btree blocks onto disk,
+	 *    to make sure later btree modifications will be COWed;
+	 *    otherwise commit_root could be populated and cause wrong qgroup numbers
+	 * In this simplified commit, we don't really care about other trees
+	 * like the chunk and root trees, as they won't affect qgroups.
+	 * And we don't write the super block, to avoid a half-committed state.
+	 */
+	ret = commit_cowonly_roots(trans, src);
+	if (ret)
+		goto out;
+	switch_commit_roots(trans->transaction, fs_info);
+	ret = btrfs_write_and_wait_transaction(trans, src);
+	if (ret)
+		btrfs_std_error(fs_info, ret,
+				"Error while writing out transaction for qgroup");
+
+out:
+	mutex_unlock(&fs_info->tree_log_mutex);
+
+	/*
+	 * Force the parent root to be updated, as we recorded it before and
+	 * so its last_trans == cur_transid; otherwise it won't be committed
+	 * again onto disk after the later insert_dir_item().
+	 */
+	if (!ret)
+		record_root_in_trans(trans, parent, 1);
+	return ret;
+}
+
 /*
  * new snapshots need to be created at a very specific time in the
  * transaction commit. This does the actual creation.
@@ -1383,7 +1470,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	dentry = pending->dentry;
 	parent_inode = pending->dir;
 	parent_root = BTRFS_I(parent_inode)->root;
-	record_root_in_trans(trans, parent_root);
+	record_root_in_trans(trans, parent_root, 0);
 
 	cur_time = current_fs_time(parent_inode->i_sb);
 
@@ -1420,7 +1507,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 		goto fail;
 	}
 
-	record_root_in_trans(trans, root);
+	record_root_in_trans(trans, root, 0);
 	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
 	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
 	btrfs_check_and_init_root_item(new_root_item);
@@ -1516,6 +1603,17 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 		goto fail;
 	}
 
+	/*
+	 * Do special qgroup accounting for the snapshot, as we do some
+	 * qgroup snapshot hacks to make fast snapshots work.
+	 * To cooperate with that hack we do the hack again here; otherwise
+	 * the snapshot would be greatly slowed down by a subtree qgroup rescan.
+	 */
+	ret = qgroup_account_snapshot(trans, root, parent_root,
+				      pending->inherit, objectid);
+	if (ret < 0)
+		goto fail;
+
 	ret = btrfs_insert_dir_item(trans, parent_root,
 				    dentry->d_name.name, dentry->d_name.len,
 				    parent_inode, &key,
@@ -1559,23 +1657,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 		goto fail;
 	}
 
-	/*
-	 * account qgroup counters before qgroup_inherit()
-	 */
-	ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
-	if (ret)
-		goto fail;
-	ret = btrfs_qgroup_account_extents(trans, fs_info);
-	if (ret)
-		goto fail;
-	ret = btrfs_qgroup_inherit(trans, fs_info,
-				   root->root_key.objectid,
-				   objectid, pending->inherit);
-	if (ret) {
-		btrfs_abort_transaction(trans, root, ret);
-		goto fail;
-	}
-
 fail:
 	pending->error = ret;
 dir_item_existed:

From 2c1984f244838477aab4e5882f4479491ae1084a Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Thu, 12 May 2016 11:05:03 +0200
Subject: [PATCH 36/37] btrfs: build fixup for qgroup_account_snapshot

The macro btrfs_std_error got renamed to btrfs_handle_fs_error in an
independent branch for the same merge target (4.7). To make the code
compilable for bisectability reasons, add a temporary stub.

Signed-off-by: David Sterba
---
 fs/btrfs/transaction.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index d7172d7ced5f..530081388d77 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1311,6 +1311,11 @@ int btrfs_defrag_root(struct btrfs_root *root)
 	return ret;
 }
 
+/* Bisectability fixup, remove in 4.8 */
+#ifndef btrfs_std_error
+#define btrfs_std_error btrfs_handle_fs_error
+#endif
+
 /*
  * Do all the special snapshot-related qgroup dirty hacks.
  *

From 4673272f43ae790ab9ec04e38a7542f82bb8f020 Mon Sep 17 00:00:00 2001
From: Scott Talbert
Date: Mon, 9 May 2016 09:14:28 -0400
Subject: [PATCH 37/37] btrfs: fix memory leak during RAID 5/6 device
 replacement

A 'struct bio' is allocated in scrub_missing_raid56_pages(), but it was
never freed anywhere.

Signed-off-by: Scott Talbert
Signed-off-by: David Sterba
---
 fs/btrfs/scrub.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index d270c700ed31..fa35cdc46494 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2127,6 +2127,8 @@ static void scrub_missing_raid56_end_io(struct bio *bio)
 	if (bio->bi_error)
 		sblock->no_io_error_seen = 0;
 
+	bio_put(bio);
+
 	btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
 }
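As a closing note on the pattern: the reference taken when the bio was
allocated must be dropped once the completion handler is finished with
it, which is exactly what the added bio_put() does. A minimal sketch of
an end_io handler following that rule; the context structure and work
item here are hypothetical illustrations, not the scrub code:

	static void example_end_io(struct bio *bio)
	{
		struct example_ctx *ctx = bio->bi_private;	/* hypothetical */

		if (bio->bi_error)
			ctx->saw_error = 1;

		/* drop the reference from allocation; the bio may be freed here */
		bio_put(bio);

		/* continue processing outside bio context */
		queue_work(ctx->wq, &ctx->work);
	}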