From 55180498dfd5f3c7e2d2c0e470f7cede1acee248 Mon Sep 17 00:00:00 2001 From: Zhiqiang Liu Date: Sat, 7 Dec 2019 11:00:08 +0800 Subject: [PATCH 01/15] md-bitmap: small cleanups In md_bitmap_unplug, bitmap->storage.filemap is double checked. In md_bitmap_daemon_work, bitmap->storage.filemap should be checked before reference. Signed-off-by: Zhiqiang Liu Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 3ad18246fcb3..9860062bdc1e 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1019,8 +1019,6 @@ void md_bitmap_unplug(struct bitmap *bitmap) /* look at each page to see if there are any set bits that need to be * flushed out to disk */ for (i = 0; i < bitmap->storage.file_pages; i++) { - if (!bitmap->storage.filemap) - return; dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); need_write = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); @@ -1338,7 +1336,8 @@ void md_bitmap_daemon_work(struct mddev *mddev) BITMAP_PAGE_DIRTY)) /* bitmap_unplug will handle the rest */ break; - if (test_and_clear_page_attr(bitmap, j, + if (bitmap->storage.filemap && + test_and_clear_page_attr(bitmap, j, BITMAP_PAGE_NEEDWRITE)) { write_page(bitmap, bitmap->storage.filemap[j], 0); } From 6b8651aac1dca6140dd7fb4c9fec2736ed3f6223 Mon Sep 17 00:00:00 2001 From: Zhengyuan Liu Date: Fri, 20 Dec 2019 10:21:26 +0800 Subject: [PATCH 02/15] raid6/test: fix a compilation error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The compilation error is redeclaration showed as following: In file included from ../../../include/linux/limits.h:6, from /usr/include/x86_64-linux-gnu/bits/local_lim.h:38, from /usr/include/x86_64-linux-gnu/bits/posix1_lim.h:161, from /usr/include/limits.h:183, from /usr/lib/gcc/x86_64-linux-gnu/8/include-fixed/limits.h:194, from /usr/lib/gcc/x86_64-linux-gnu/8/include-fixed/syslimits.h:7, from /usr/lib/gcc/x86_64-linux-gnu/8/include-fixed/limits.h:34, from ../../../include/linux/raid/pq.h:30, from algos.c:14: ../../../include/linux/types.h:114:15: error: conflicting types for ‘int64_t’ typedef s64 int64_t; ^~~~~~~ In file included from /usr/include/stdint.h:34, from /usr/lib/gcc/x86_64-linux-gnu/8/include/stdint.h:9, from /usr/include/inttypes.h:27, from ../../../include/linux/raid/pq.h:29, from algos.c:14: /usr/include/x86_64-linux-gnu/bits/stdint-intn.h:27:19: note: previous \ declaration of ‘int64_t’ was here typedef __int64_t int64_t; Fixes: 54d50897d544 ("linux/kernel.h: split *_MAX and *_MIN macros into ") Signed-off-by: Zhengyuan Liu Signed-off-by: Song Liu --- include/linux/raid/pq.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 0832c9b66852..0b6e7ad9cd2a 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -27,7 +27,6 @@ extern const char raid6_empty_zero_page[PAGE_SIZE]; #include #include -#include #include #include #include From 5e5ac01c2b8802921fee680518a986011cb59820 Mon Sep 17 00:00:00 2001 From: Zhengyuan Liu Date: Fri, 20 Dec 2019 10:21:27 +0800 Subject: [PATCH 03/15] raid6/test: fix a compilation warning The compilation warning is redefination showed as following: In file included from tables.c:2: ../../../include/linux/export.h:180: warning: "EXPORT_SYMBOL" redefined #define EXPORT_SYMBOL(sym) __EXPORT_SYMBOL(sym, "") In file included from tables.c:1: ../../../include/linux/raid/pq.h:61: note: this is the 
location of the previous definition #define EXPORT_SYMBOL(sym) Fixes: 69a94abb82ee ("export.h, genksyms: do not make genksyms calculate CRC of trimmed symbols") Signed-off-by: Zhengyuan Liu Signed-off-by: Song Liu --- include/linux/raid/pq.h | 2 ++ lib/raid6/mktables.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 0b6e7ad9cd2a..e0ddb47f4402 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -58,7 +58,9 @@ extern const char raid6_empty_zero_page[PAGE_SIZE]; #define enable_kernel_altivec() #define disable_kernel_altivec() +#undef EXPORT_SYMBOL #define EXPORT_SYMBOL(sym) +#undef EXPORT_SYMBOL_GPL #define EXPORT_SYMBOL_GPL(sym) #define MODULE_LICENSE(licence) #define MODULE_DESCRIPTION(desc) diff --git a/lib/raid6/mktables.c b/lib/raid6/mktables.c index 9c485df1308f..f02e10fa6238 100644 --- a/lib/raid6/mktables.c +++ b/lib/raid6/mktables.c @@ -56,8 +56,8 @@ int main(int argc, char *argv[]) uint8_t v; uint8_t exptbl[256], invtbl[256]; - printf("#include \n"); printf("#include \n"); + printf("#include \n"); /* Compute multiplication table */ printf("\nconst u8 __attribute__((aligned(256)))\n" From f591df3cc6d60cadf8ceff5d44af73ea6ba0a39a Mon Sep 17 00:00:00 2001 From: Zhengyuan Liu Date: Fri, 20 Dec 2019 10:21:28 +0800 Subject: [PATCH 04/15] md/raid6: fix algorithm choice under larger PAGE_SIZE There are several algorithms available for raid6 to generate xor and syndrome parity, including basic int1, int2 ... int32 and SIMD optimized implementation like sse and neon. To test and choose the best algorithms at the initial stage, we need provide enough disk data to feed the algorithms. However, the disk number we provided depends on page size and gfmul table, seeing bellow: const int disks = (65536/PAGE_SIZE) + 2; So when come to 64K PAGE_SIZE, there is only one data disk plus 2 parity disk, as a result the chosed algorithm is not reliable. For example, on my arm64 machine with 64K page enabled, it will choose intx32 as the best one, although the NEON implementation is better. This patch tries to fix the problem by defining a constant raid6 disk number to supporting arbitrary page size. Suggested-by: H. 
Peter Anvin Signed-off-by: Zhengyuan Liu Signed-off-by: Song Liu --- include/linux/raid/pq.h | 4 +++ lib/raid6/algos.c | 63 ++++++++++++++++++++++++++--------------- 2 files changed, 44 insertions(+), 23 deletions(-) diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index e0ddb47f4402..154e954b711d 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -28,6 +28,7 @@ extern const char raid6_empty_zero_page[PAGE_SIZE]; #include #include #include +#include #include #include #include @@ -43,6 +44,9 @@ typedef uint64_t u64; #ifndef PAGE_SIZE # define PAGE_SIZE 4096 #endif +#ifndef PAGE_SHIFT +# define PAGE_SHIFT 12 +#endif extern const char raid6_empty_zero_page[PAGE_SIZE]; #define __init diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 17417eee0866..bf1b4765c8f6 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -124,6 +124,9 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = { #define time_before(x, y) ((x) < (y)) #endif +#define RAID6_TEST_DISKS 8 +#define RAID6_TEST_DISKS_ORDER 3 + static inline const struct raid6_recov_calls *raid6_choose_recov(void) { const struct raid6_recov_calls *const *algo; @@ -146,7 +149,7 @@ static inline const struct raid6_recov_calls *raid6_choose_recov(void) } static inline const struct raid6_calls *raid6_choose_gen( - void *(*const dptrs)[(65536/PAGE_SIZE)+2], const int disks) + void *(*const dptrs)[RAID6_TEST_DISKS], const int disks) { unsigned long perf, bestgenperf, bestxorperf, j0, j1; int start = (disks>>1)-1, stop = disks-3; /* work on the second half of the disks */ @@ -181,7 +184,8 @@ static inline const struct raid6_calls *raid6_choose_gen( best = *algo; } pr_info("raid6: %-8s gen() %5ld MB/s\n", (*algo)->name, - (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); + (perf * HZ * (disks-2)) >> + (20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2)); if (!(*algo)->xor_syndrome) continue; @@ -204,17 +208,24 @@ static inline const struct raid6_calls *raid6_choose_gen( bestxorperf = perf; pr_info("raid6: %-8s xor() %5ld MB/s\n", (*algo)->name, - (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2+1)); + (perf * HZ * (disks-2)) >> + (20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2 + 1)); } } if (best) { - pr_info("raid6: using algorithm %s gen() %ld MB/s\n", - best->name, - (bestgenperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); - if (best->xor_syndrome) - pr_info("raid6: .... xor() %ld MB/s, rmw enabled\n", - (bestxorperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2+1)); + if (IS_ENABLED(CONFIG_RAID6_PQ_BENCHMARK)) { + pr_info("raid6: using algorithm %s gen() %ld MB/s\n", + best->name, + (bestgenperf * HZ * (disks-2)) >> + (20 - PAGE_SHIFT+RAID6_TIME_JIFFIES_LG2)); + if (best->xor_syndrome) + pr_info("raid6: .... xor() %ld MB/s, rmw enabled\n", + (bestxorperf * HZ * (disks-2)) >> + (20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2 + 1)); + } else + pr_info("raid6: skip pq benchmark and using algorithm %s\n", + best->name); raid6_call = *best; } else pr_err("raid6: Yikes! 
No algorithm found!\n"); @@ -228,27 +239,33 @@ static inline const struct raid6_calls *raid6_choose_gen( int __init raid6_select_algo(void) { - const int disks = (65536/PAGE_SIZE)+2; + const int disks = RAID6_TEST_DISKS; const struct raid6_calls *gen_best; const struct raid6_recov_calls *rec_best; - char *syndromes; - void *dptrs[(65536/PAGE_SIZE)+2]; - int i; + char *disk_ptr, *p; + void *dptrs[RAID6_TEST_DISKS]; + int i, cycle; - for (i = 0; i < disks-2; i++) - dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i; - - /* Normal code - use a 2-page allocation to avoid D$ conflict */ - syndromes = (void *) __get_free_pages(GFP_KERNEL, 1); - - if (!syndromes) { + /* prepare the buffer and fill it circularly with gfmul table */ + disk_ptr = (char *)__get_free_pages(GFP_KERNEL, RAID6_TEST_DISKS_ORDER); + if (!disk_ptr) { pr_err("raid6: Yikes! No memory available.\n"); return -ENOMEM; } - dptrs[disks-2] = syndromes; - dptrs[disks-1] = syndromes + PAGE_SIZE; + p = disk_ptr; + for (i = 0; i < disks; i++) + dptrs[i] = p + PAGE_SIZE * i; + + cycle = ((disks - 2) * PAGE_SIZE) / 65536; + for (i = 0; i < cycle; i++) { + memcpy(p, raid6_gfmul, 65536); + p += 65536; + } + + if ((disks - 2) * PAGE_SIZE % 65536) + memcpy(p, raid6_gfmul, (disks - 2) * PAGE_SIZE % 65536); /* select raid gen_syndrome function */ gen_best = raid6_choose_gen(&dptrs, disks); @@ -256,7 +273,7 @@ int __init raid6_select_algo(void) /* select raid recover functions */ rec_best = raid6_choose_recov(); - free_pages((unsigned long)syndromes, 1); + free_pages((unsigned long)disk_ptr, RAID6_TEST_DISKS_ORDER); return gen_best && rec_best ? 0 : -EINVAL; } From d2c9ad41249ac862d3a3a4d5d56e6b1cd79d8a17 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 20 Dec 2019 15:46:29 +0100 Subject: [PATCH 05/15] raid5: remove worker_cnt_per_group argument from alloc_thread_groups We can use "cnt" directly to update conf->worker_cnt_per_group if alloc_thread_groups returns 0. 
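
For illustration only, the call sites then collapse to a pattern like the
following sketch (locking omitted, mirroring the hunks below), where the
caller passes "cnt" and records it itself on success:

        int group_cnt;
        struct r5worker_group *new_groups;

        err = alloc_thread_groups(conf, new, &group_cnt, &new_groups);
        if (!err) {
                conf->group_cnt = group_cnt;
                conf->worker_cnt_per_group = new;   /* i.e. "cnt" */
                conf->worker_groups = new_groups;
        }
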
Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid5.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d4d3b67ffbba..ba00e9877f02 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6598,7 +6598,6 @@ raid5_show_group_thread_cnt(struct mddev *mddev, char *page) static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt, - int *worker_cnt_per_group, struct r5worker_group **worker_groups); static ssize_t raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) @@ -6607,7 +6606,7 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) unsigned int new; int err; struct r5worker_group *new_groups, *old_groups; - int group_cnt, worker_cnt_per_group; + int group_cnt; if (len >= PAGE_SIZE) return -EINVAL; @@ -6630,13 +6629,11 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) if (old_groups) flush_workqueue(raid5_wq); - err = alloc_thread_groups(conf, new, - &group_cnt, &worker_cnt_per_group, - &new_groups); + err = alloc_thread_groups(conf, new, &group_cnt, &new_groups); if (!err) { spin_lock_irq(&conf->device_lock); conf->group_cnt = group_cnt; - conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_cnt_per_group = new; conf->worker_groups = new_groups; spin_unlock_irq(&conf->device_lock); @@ -6672,16 +6669,13 @@ static struct attribute_group raid5_attrs_group = { .attrs = raid5_attrs, }; -static int alloc_thread_groups(struct r5conf *conf, int cnt, - int *group_cnt, - int *worker_cnt_per_group, +static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt, struct r5worker_group **worker_groups) { int i, j, k; ssize_t size; struct r5worker *workers; - *worker_cnt_per_group = cnt; if (cnt == 0) { *group_cnt = 0; *worker_groups = NULL; @@ -6882,7 +6876,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) struct disk_info *disk; char pers_name[6]; int i; - int group_cnt, worker_cnt_per_group; + int group_cnt; struct r5worker_group *new_group; int ret; @@ -6928,10 +6922,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) for (i = 0; i < PENDING_IO_MAX; i++) list_add(&conf->pending_data[i].sibling, &conf->free_list); /* Don't enable multi-threading by default*/ - if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, - &new_group)) { + if (!alloc_thread_groups(conf, 0, &group_cnt, &new_group)) { conf->group_cnt = group_cnt; - conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_cnt_per_group = 0; conf->worker_groups = new_group; } else goto abort; From 404659cf1e2570dad3cd117fa3bd71f06ecfd142 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:48:53 +0100 Subject: [PATCH 06/15] md: rename wb stuffs Previously, wb_info_pool and wb_list stuffs are introduced to address potential data inconsistence issue for write behind device. Now rename them to serial related name, since the same mechanism will be used to address reorder overlap write issue for raid1. 
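
For reference, the rename is mechanical and maps the old write-behind
names onto the new serialization names, roughly:

        wb_info / NR_WB_INFOS    -> serial_info / NR_SERIAL_INFOS
        wb_info_pool             -> serial_info_pool
        wb_list / wb_list_lock   -> serial_list / serial_list_lock
        wb_io_wait               -> serial_io_wait
        WBCollisionCheck         -> CollisionCheck
        mddev_create_wb_pool     -> mddev_create_serial_pool
        mddev_destroy_wb_pool    -> mddev_destroy_serial_pool
        rdev_init_wb             -> rdev_init_serial
        check_and_add_wb         -> check_and_add_serial
        remove_wb                -> remove_serial
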
Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 20 ++++++------ drivers/md/md.c | 70 ++++++++++++++++++++++-------------------- drivers/md/md.h | 24 +++++++-------- drivers/md/raid1.c | 43 +++++++++++++------------- 4 files changed, 80 insertions(+), 77 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 9860062bdc1e..212e75dfebb7 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1789,8 +1789,8 @@ void md_bitmap_destroy(struct mddev *mddev) return; md_bitmap_wait_behind_writes(mddev); - mempool_destroy(mddev->wb_info_pool); - mddev->wb_info_pool = NULL; + mempool_destroy(mddev->serial_info_pool); + mddev->serial_info_pool = NULL; mutex_lock(&mddev->bitmap_info.mutex); spin_lock(&mddev->lock); @@ -1907,7 +1907,7 @@ int md_bitmap_load(struct mddev *mddev) goto out; rdev_for_each(rdev, mddev) - mddev_create_wb_pool(mddev, rdev, true); + mddev_create_serial_pool(mddev, rdev, true); if (mddev_is_clustered(mddev)) md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); @@ -2474,16 +2474,16 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) if (backlog > COUNTER_MAX) return -EINVAL; mddev->bitmap_info.max_write_behind = backlog; - if (!backlog && mddev->wb_info_pool) { - /* wb_info_pool is not needed if backlog is zero */ - mempool_destroy(mddev->wb_info_pool); - mddev->wb_info_pool = NULL; - } else if (backlog && !mddev->wb_info_pool) { - /* wb_info_pool is needed since backlog is not zero */ + if (!backlog && mddev->serial_info_pool) { + /* serial_info_pool is not needed if backlog is zero */ + mempool_destroy(mddev->serial_info_pool); + mddev->serial_info_pool = NULL; + } else if (backlog && !mddev->serial_info_pool) { + /* serial_info_pool is needed since backlog is not zero */ struct md_rdev *rdev; rdev_for_each(rdev, mddev) - mddev_create_wb_pool(mddev, rdev, false); + mddev_create_serial_pool(mddev, rdev, false); } if (old_mwb != backlog) md_bitmap_update_sb(mddev->bitmap); diff --git a/drivers/md/md.c b/drivers/md/md.c index 4e7c9f398bc6..ea37bfacb6fb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -125,72 +125,74 @@ static inline int speed_max(struct mddev *mddev) mddev->sync_speed_max : sysctl_speed_limit_max; } -static int rdev_init_wb(struct md_rdev *rdev) +static int rdev_init_serial(struct md_rdev *rdev) { if (rdev->bdev->bd_queue->nr_hw_queues == 1) return 0; - spin_lock_init(&rdev->wb_list_lock); - INIT_LIST_HEAD(&rdev->wb_list); - init_waitqueue_head(&rdev->wb_io_wait); - set_bit(WBCollisionCheck, &rdev->flags); + spin_lock_init(&rdev->serial_list_lock); + INIT_LIST_HEAD(&rdev->serial_list); + init_waitqueue_head(&rdev->serial_io_wait); + set_bit(CollisionCheck, &rdev->flags); return 1; } /* - * Create wb_info_pool if rdev is the first multi-queue device flaged + * Create serial_info_pool if rdev is the first multi-queue device flaged * with writemostly, also write-behind mode is enabled. 
*/ -void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev, +void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, bool is_suspend) { if (mddev->bitmap_info.max_write_behind == 0) return; - if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_wb(rdev)) + if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_serial(rdev)) return; - if (mddev->wb_info_pool == NULL) { + if (mddev->serial_info_pool == NULL) { unsigned int noio_flag; if (!is_suspend) mddev_suspend(mddev); noio_flag = memalloc_noio_save(); - mddev->wb_info_pool = mempool_create_kmalloc_pool(NR_WB_INFOS, - sizeof(struct wb_info)); + mddev->serial_info_pool = + mempool_create_kmalloc_pool(NR_SERIAL_INFOS, + sizeof(struct serial_info)); memalloc_noio_restore(noio_flag); - if (!mddev->wb_info_pool) - pr_err("can't alloc memory pool for writemostly\n"); + if (!mddev->serial_info_pool) + pr_err("can't alloc memory pool for serialization\n"); if (!is_suspend) mddev_resume(mddev); } } -EXPORT_SYMBOL_GPL(mddev_create_wb_pool); +EXPORT_SYMBOL_GPL(mddev_create_serial_pool); /* - * destroy wb_info_pool if rdev is the last device flaged with WBCollisionCheck. + * Destroy serial_info_pool if rdev is the last device flaged with + * CollisionCheck. */ -static void mddev_destroy_wb_pool(struct mddev *mddev, struct md_rdev *rdev) +static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) { - if (!test_and_clear_bit(WBCollisionCheck, &rdev->flags)) + if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) return; - if (mddev->wb_info_pool) { + if (mddev->serial_info_pool) { struct md_rdev *temp; int num = 0; /* - * Check if other rdevs need wb_info_pool. + * Check if other rdevs need serial_info_pool. */ rdev_for_each(temp, mddev) if (temp != rdev && - test_bit(WBCollisionCheck, &temp->flags)) + test_bit(CollisionCheck, &temp->flags)) num++; if (!num) { mddev_suspend(rdev->mddev); - mempool_destroy(mddev->wb_info_pool); - mddev->wb_info_pool = NULL; + mempool_destroy(mddev->serial_info_pool); + mddev->serial_info_pool = NULL; mddev_resume(rdev->mddev); } } @@ -2337,7 +2339,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) pr_debug("md: bind<%s>\n", b); if (mddev->raid_disks) - mddev_create_wb_pool(mddev, rdev, false); + mddev_create_serial_pool(mddev, rdev, false); if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; @@ -2375,7 +2377,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev) bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b)); - mddev_destroy_wb_pool(rdev->mddev, rdev); + mddev_destroy_serial_pool(rdev->mddev, rdev); rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); @@ -2888,10 +2890,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) } } else if (cmd_match(buf, "writemostly")) { set_bit(WriteMostly, &rdev->flags); - mddev_create_wb_pool(rdev->mddev, rdev, false); + mddev_create_serial_pool(rdev->mddev, rdev, false); err = 0; } else if (cmd_match(buf, "-writemostly")) { - mddev_destroy_wb_pool(rdev->mddev, rdev); + mddev_destroy_serial_pool(rdev->mddev, rdev); clear_bit(WriteMostly, &rdev->flags); err = 0; } else if (cmd_match(buf, "blocked")) { @@ -5773,14 +5775,14 @@ int md_run(struct mddev *mddev) rdev_for_each(rdev, mddev) { if (test_bit(WriteMostly, &rdev->flags) && - rdev_init_wb(rdev)) + rdev_init_serial(rdev)) creat_pool = true; } - if (creat_pool && mddev->wb_info_pool 
== NULL) { - mddev->wb_info_pool = - mempool_create_kmalloc_pool(NR_WB_INFOS, - sizeof(struct wb_info)); - if (!mddev->wb_info_pool) { + if (creat_pool && mddev->serial_info_pool == NULL) { + mddev->serial_info_pool = + mempool_create_kmalloc_pool(NR_SERIAL_INFOS, + sizeof(struct serial_info)); + if (!mddev->serial_info_pool) { err = -ENOMEM; goto bitmap_abort; } @@ -6025,8 +6027,8 @@ static void __md_stop_writes(struct mddev *mddev) mddev->in_sync = 1; md_update_sb(mddev, 1); } - mempool_destroy(mddev->wb_info_pool); - mddev->wb_info_pool = NULL; + mempool_destroy(mddev->serial_info_pool); + mddev->serial_info_pool = NULL; } void md_stop_writes(struct mddev *mddev) diff --git a/drivers/md/md.h b/drivers/md/md.h index 5f86f8adb0a4..7b811645cec7 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -111,11 +111,11 @@ struct md_rdev { */ /* - * The members for check collision of write behind IOs. + * The members for check collision of write IOs. */ - struct list_head wb_list; - spinlock_t wb_list_lock; - wait_queue_head_t wb_io_wait; + struct list_head serial_list; + spinlock_t serial_list_lock; + wait_queue_head_t serial_io_wait; struct work_struct del_work; /* used for delayed sysfs removal */ @@ -201,9 +201,9 @@ enum flag_bits { * it didn't fail, so don't use FailFast * any more for metadata */ - WBCollisionCheck, /* - * multiqueue device should check if there - * is collision between write behind bios. + CollisionCheck, /* + * check if there is collision between raid1 + * serial bios. */ }; @@ -263,9 +263,9 @@ enum mddev_sb_flags { MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */ }; -#define NR_WB_INFOS 8 -/* record current range of write behind IOs */ -struct wb_info { +#define NR_SERIAL_INFOS 8 +/* record current range of serialize IOs */ +struct serial_info { sector_t lo; sector_t hi; struct list_head list; @@ -487,7 +487,7 @@ struct mddev { */ struct work_struct flush_work; struct work_struct event_work; /* used by dm to report failure event */ - mempool_t *wb_info_pool; + mempool_t *serial_info_pool; void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_cluster_info *cluster_info; unsigned int good_device_nr; /* good device num within cluster raid */ @@ -737,7 +737,7 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void md_kick_rdev_from_array(struct md_rdev * rdev); -extern void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev, +extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, bool is_suspend); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 201fd8aec59a..0439f674ab14 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -50,17 +50,17 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr); #include "raid1-10.c" -static int check_and_add_wb(struct md_rdev *rdev, sector_t lo, sector_t hi) +static int check_and_add_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) { - struct wb_info *wi, *temp_wi; + struct serial_info *wi, *temp_wi; unsigned long flags; int ret = 0; struct mddev *mddev = rdev->mddev; - wi = mempool_alloc(mddev->wb_info_pool, GFP_NOIO); + wi = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); - spin_lock_irqsave(&rdev->wb_list_lock, flags); - list_for_each_entry(temp_wi, 
&rdev->wb_list, list) { + spin_lock_irqsave(&rdev->serial_list_lock, flags); + list_for_each_entry(temp_wi, &rdev->serial_list, list) { /* collision happened */ if (hi > temp_wi->lo && lo < temp_wi->hi) { ret = -EBUSY; @@ -71,34 +71,34 @@ static int check_and_add_wb(struct md_rdev *rdev, sector_t lo, sector_t hi) if (!ret) { wi->lo = lo; wi->hi = hi; - list_add(&wi->list, &rdev->wb_list); + list_add(&wi->list, &rdev->serial_list); } else - mempool_free(wi, mddev->wb_info_pool); - spin_unlock_irqrestore(&rdev->wb_list_lock, flags); + mempool_free(wi, mddev->serial_info_pool); + spin_unlock_irqrestore(&rdev->serial_list_lock, flags); return ret; } -static void remove_wb(struct md_rdev *rdev, sector_t lo, sector_t hi) +static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) { - struct wb_info *wi; + struct serial_info *wi; unsigned long flags; int found = 0; struct mddev *mddev = rdev->mddev; - spin_lock_irqsave(&rdev->wb_list_lock, flags); - list_for_each_entry(wi, &rdev->wb_list, list) + spin_lock_irqsave(&rdev->serial_list_lock, flags); + list_for_each_entry(wi, &rdev->serial_list, list) if (hi == wi->hi && lo == wi->lo) { list_del(&wi->list); - mempool_free(wi, mddev->wb_info_pool); + mempool_free(wi, mddev->serial_info_pool); found = 1; break; } if (!found) - WARN(1, "The write behind IO is not recorded\n"); - spin_unlock_irqrestore(&rdev->wb_list_lock, flags); - wake_up(&rdev->wb_io_wait); + WARN(1, "The write IO is not recorded for serialization\n"); + spin_unlock_irqrestore(&rdev->serial_list_lock, flags); + wake_up(&rdev->serial_io_wait); } /* @@ -499,11 +499,11 @@ static void raid1_end_write_request(struct bio *bio) } if (behind) { - if (test_bit(WBCollisionCheck, &rdev->flags)) { + if (test_bit(CollisionCheck, &rdev->flags)) { sector_t lo = r1_bio->sector; sector_t hi = r1_bio->sector + r1_bio->sectors; - remove_wb(rdev, lo, hi); + remove_serial(rdev, lo, hi); } if (test_bit(WriteMostly, &rdev->flags)) atomic_dec(&r1_bio->behind_remaining); @@ -1508,12 +1508,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (r1_bio->behind_master_bio) { struct md_rdev *rdev = conf->mirrors[i].rdev; - if (test_bit(WBCollisionCheck, &rdev->flags)) { + if (test_bit(CollisionCheck, &rdev->flags)) { sector_t lo = r1_bio->sector; sector_t hi = r1_bio->sector + r1_bio->sectors; - wait_event(rdev->wb_io_wait, - check_and_add_wb(rdev, lo, hi) == 0); + wait_event(rdev->serial_io_wait, + check_and_add_serial(rdev, lo, hi) + == 0); } if (test_bit(WriteMostly, &rdev->flags)) atomic_inc(&r1_bio->behind_remaining); From 3e173ab55b990d2b4ceb90bf55a88a96eb88598e Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:48:54 +0100 Subject: [PATCH 07/15] md: fix a typo s/creat/create It actually means create here, so fix the typo. 
Reported-by: Song Liu Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index ea37bfacb6fb..8f5def0cb60a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5771,14 +5771,14 @@ int md_run(struct mddev *mddev) goto bitmap_abort; if (mddev->bitmap_info.max_write_behind > 0) { - bool creat_pool = false; + bool create_pool = false; rdev_for_each(rdev, mddev) { if (test_bit(WriteMostly, &rdev->flags) && rdev_init_serial(rdev)) - creat_pool = true; + create_pool = true; } - if (creat_pool && mddev->serial_info_pool == NULL) { + if (create_pool && mddev->serial_info_pool == NULL) { mddev->serial_info_pool = mempool_create_kmalloc_pool(NR_SERIAL_INFOS, sizeof(struct serial_info)); From 11d3a9f65018c9fb3d4f2032aec76af2ba98431c Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:48:55 +0100 Subject: [PATCH 08/15] md: prepare for enable raid1 io serialization 1. The related resources (spin_lock, list and waitqueue) are needed for address raid1 reorder overlap issue too, in this case, rdev is set to NULL for mddev_create/destroy_serial_pool which implies all rdevs need to handle these resources. And also add "is_suspend" to mddev_destroy_serial_pool since it will be called under suspended situation, which also makes both create and destroy pool have same arguments. 2. Introduce rdevs_init_serial which is called if raid1 io serialization is enabled since all rdevs need to init related stuffs. 3. rdev_init_serial and clear_bit(CollisionCheck, &rdev->flags) should be called between suspend and resume. No need to export mddev_create_serial_pool since it is only called in md-mod module. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 65 ++++++++++++++++++++++++++++++++++--------------- drivers/md/md.h | 2 +- 2 files changed, 46 insertions(+), 21 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 8f5def0cb60a..b9b041b7e196 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -127,9 +127,6 @@ static inline int speed_max(struct mddev *mddev) static int rdev_init_serial(struct md_rdev *rdev) { - if (rdev->bdev->bd_queue->nr_hw_queues == 1) - return 0; - spin_lock_init(&rdev->serial_list_lock); INIT_LIST_HEAD(&rdev->serial_list); init_waitqueue_head(&rdev->serial_io_wait); @@ -138,17 +135,29 @@ static int rdev_init_serial(struct md_rdev *rdev) return 1; } +static void rdevs_init_serial(struct mddev *mddev) +{ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) { + if (test_bit(CollisionCheck, &rdev->flags)) + continue; + rdev_init_serial(rdev); + } +} + /* - * Create serial_info_pool if rdev is the first multi-queue device flaged - * with writemostly, also write-behind mode is enabled. + * Create serial_info_pool for raid1 under conditions: + * 1. rdev is the first multi-queue device flaged with writemostly, + * also write-behind mode is enabled. + * 2. rdev is NULL, means want to enable serialization for all rdevs. 
*/ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend) + bool is_suspend) { - if (mddev->bitmap_info.max_write_behind == 0) - return; - - if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_serial(rdev)) + if (rdev && (mddev->bitmap_info.max_write_behind == 0 || + rdev->bdev->bd_queue->nr_hw_queues == 1 || + !test_bit(WriteMostly, &rdev->flags))) return; if (mddev->serial_info_pool == NULL) { @@ -156,6 +165,10 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, if (!is_suspend) mddev_suspend(mddev); + if (!rdev) + rdevs_init_serial(mddev); + else + rdev_init_serial(rdev); noio_flag = memalloc_noio_save(); mddev->serial_info_pool = mempool_create_kmalloc_pool(NR_SERIAL_INFOS, @@ -167,15 +180,16 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, mddev_resume(mddev); } } -EXPORT_SYMBOL_GPL(mddev_create_serial_pool); /* * Destroy serial_info_pool if rdev is the last device flaged with - * CollisionCheck. + * CollisionCheck, or rdev is NULL when we disable serialization + * for normal raid1. */ -static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) +static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend) { - if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) + if (rdev && !test_bit(CollisionCheck, &rdev->flags)) return; if (mddev->serial_info_pool) { @@ -185,16 +199,27 @@ static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) /* * Check if other rdevs need serial_info_pool. */ - rdev_for_each(temp, mddev) + if (!is_suspend) + mddev_suspend(mddev); + rdev_for_each(temp, mddev) { + if (!rdev) { + clear_bit(CollisionCheck, &temp->flags); + continue; + } + if (temp != rdev && test_bit(CollisionCheck, &temp->flags)) num++; - if (!num) { - mddev_suspend(rdev->mddev); + } + + if (rdev) + clear_bit(CollisionCheck, &rdev->flags); + if (!rdev || !num) { mempool_destroy(mddev->serial_info_pool); mddev->serial_info_pool = NULL; - mddev_resume(rdev->mddev); } + if (!is_suspend) + mddev_resume(mddev); } } @@ -2377,7 +2402,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev) bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b)); - mddev_destroy_serial_pool(rdev->mddev, rdev); + mddev_destroy_serial_pool(rdev->mddev, rdev, false); rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); @@ -2893,7 +2918,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) mddev_create_serial_pool(rdev->mddev, rdev, false); err = 0; } else if (cmd_match(buf, "-writemostly")) { - mddev_destroy_serial_pool(rdev->mddev, rdev); + mddev_destroy_serial_pool(rdev->mddev, rdev, false); clear_bit(WriteMostly, &rdev->flags); err = 0; } else if (cmd_match(buf, "blocked")) { diff --git a/drivers/md/md.h b/drivers/md/md.h index 7b811645cec7..de04a8d3a67a 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -738,7 +738,7 @@ extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void md_kick_rdev_from_array(struct md_rdev * rdev); extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend); + bool is_suspend); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); From 3938f5fb82aedbf39792ffee448c61c819e6ab38 Mon Sep 17 
00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:48:56 +0100 Subject: [PATCH 09/15] md: add serialize_policy sysfs node for raid1 With the new sysfs node, we can use it to control if raid1 array wants io serialization or not. So mddev_create_serial_pool and mddev_destroy_serial_pool are called in serialize_policy_store to enable or disable the serialization. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++ drivers/md/md.h | 1 + 2 files changed, 53 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index b9b041b7e196..796cf70e1c9f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5304,6 +5304,57 @@ static struct md_sysfs_entry md_fail_last_dev = __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, fail_last_dev_store); +static ssize_t serialize_policy_show(struct mddev *mddev, char *page) +{ + if (mddev->pers == NULL || (mddev->pers->level != 1)) + return sprintf(page, "n/a\n"); + else + return sprintf(page, "%d\n", mddev->serialize_policy); +} + +/* + * Setting serialize_policy to true to enforce write IO is not reordered + * for raid1. + */ +static ssize_t +serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) +{ + int err; + bool value; + + err = kstrtobool(buf, &value); + if (err) + return err; + + if (value == mddev->serialize_policy) + return len; + + err = mddev_lock(mddev); + if (err) + return err; + if (mddev->pers == NULL || (mddev->pers->level != 1)) { + pr_err("md: serialize_policy is only effective for raid1\n"); + err = -EINVAL; + goto unlock; + } + + mddev_suspend(mddev); + if (value) + mddev_create_serial_pool(mddev, NULL, true); + else + mddev_destroy_serial_pool(mddev, NULL, true); + mddev->serialize_policy = value; + mddev_resume(mddev); +unlock: + mddev_unlock(mddev); + return err ?: len; +} + +static struct md_sysfs_entry md_serialize_policy = +__ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, + serialize_policy_store); + + static struct attribute *md_default_attrs[] = { &md_level.attr, &md_layout.attr, @@ -5321,6 +5372,7 @@ static struct attribute *md_default_attrs[] = { &max_corr_read_errors.attr, &md_consistency_policy.attr, &md_fail_last_dev.attr, + &md_serialize_policy.attr, NULL, }; diff --git a/drivers/md/md.h b/drivers/md/md.h index de04a8d3a67a..f51a3afaee1b 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -494,6 +494,7 @@ struct mddev { bool has_superblocks:1; bool fail_last_dev:1; + bool serialize_policy:1; }; enum recovery_flags { From de31ee949739aba9ce7dbb8b10e72c6fce0e76c7 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:48:57 +0100 Subject: [PATCH 10/15] md: reorgnize mddev_create/destroy_serial_pool So far, IO serialization is used for two scenarios: 1. raid1 which enables write-behind mode, and there is rdev in the array which is multi-queue device and flaged with writemostly. 2. IO serialization is enabled or disabled by change serialize_policy. So introduce rdev_need_serial to check the first scenario. And for 1, IO serialization is enabled automatically while 2 is controlled manually. And it is possible that both scenarios are true, so for create serial pool, rdev/rdevs_init_serial should be separate from check if the pool existed or not. Then for destroy pool, we need to check if the pool is needed by other rdevs due to the first scenario. 
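
As a usage sketch (assuming a raid1 array assembled as /dev/md0), the
second scenario is driven from user space through the serialize_policy
node added earlier in this series:

        # enable IO serialization for every rdev in the array
        echo 1 > /sys/block/md0/md/serialize_policy

        # disable it again; the pool is freed only when no rdev still
        # needs it because of the write-behind/writemostly case
        echo 0 > /sys/block/md0/md/serialize_policy
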
Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 71 +++++++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 29 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 796cf70e1c9f..788559f42d43 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -147,28 +147,40 @@ static void rdevs_init_serial(struct mddev *mddev) } /* - * Create serial_info_pool for raid1 under conditions: - * 1. rdev is the first multi-queue device flaged with writemostly, - * also write-behind mode is enabled. - * 2. rdev is NULL, means want to enable serialization for all rdevs. + * rdev needs to enable serial stuffs if it meets the conditions: + * 1. it is multi-queue device flaged with writemostly. + * 2. the write-behind mode is enabled. + */ +static int rdev_need_serial(struct md_rdev *rdev) +{ + return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 && + rdev->bdev->bd_queue->nr_hw_queues != 1 && + test_bit(WriteMostly, &rdev->flags)); +} + +/* + * Init resource for rdev(s), then create serial_info_pool if: + * 1. rdev is the first device which return true from rdev_enable_serial. + * 2. rdev is NULL, means we want to enable serialization for all rdevs. */ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, bool is_suspend) { - if (rdev && (mddev->bitmap_info.max_write_behind == 0 || - rdev->bdev->bd_queue->nr_hw_queues == 1 || - !test_bit(WriteMostly, &rdev->flags))) + if (rdev && !rdev_need_serial(rdev) && + !test_bit(CollisionCheck, &rdev->flags)) return; + if (!is_suspend) + mddev_suspend(mddev); + + if (!rdev) + rdevs_init_serial(mddev); + else + rdev_init_serial(rdev); + if (mddev->serial_info_pool == NULL) { unsigned int noio_flag; - if (!is_suspend) - mddev_suspend(mddev); - if (!rdev) - rdevs_init_serial(mddev); - else - rdev_init_serial(rdev); noio_flag = memalloc_noio_save(); mddev->serial_info_pool = mempool_create_kmalloc_pool(NR_SERIAL_INFOS, @@ -176,15 +188,16 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, memalloc_noio_restore(noio_flag); if (!mddev->serial_info_pool) pr_err("can't alloc memory pool for serialization\n"); - if (!is_suspend) - mddev_resume(mddev); } + if (!is_suspend) + mddev_resume(mddev); } /* - * Destroy serial_info_pool if rdev is the last device flaged with - * CollisionCheck, or rdev is NULL when we disable serialization - * for normal raid1. + * Free resource from rdev(s), and destroy serial_info_pool under conditions: + * 1. rdev is the last device flaged with CollisionCheck. + * 2. when bitmap is destroyed while policy is not enabled. + * 3. for disable policy, the pool is destroyed only when no rdev needs it. */ static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, bool is_suspend) @@ -194,27 +207,27 @@ static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, if (mddev->serial_info_pool) { struct md_rdev *temp; - int num = 0; + int num = 0; /* used to track if other rdevs need the pool */ - /* - * Check if other rdevs need serial_info_pool. 
- */ if (!is_suspend) mddev_suspend(mddev); rdev_for_each(temp, mddev) { if (!rdev) { - clear_bit(CollisionCheck, &temp->flags); - continue; - } - - if (temp != rdev && - test_bit(CollisionCheck, &temp->flags)) + if (!rdev_need_serial(temp)) + clear_bit(CollisionCheck, &temp->flags); + else + num++; + } else if (temp != rdev && + test_bit(CollisionCheck, &temp->flags)) num++; } if (rdev) clear_bit(CollisionCheck, &rdev->flags); - if (!rdev || !num) { + + if (num) + pr_info("The mempool could be used by other devices\n"); + else { mempool_destroy(mddev->serial_info_pool); mddev->serial_info_pool = NULL; } From 69df9cfc70421fb7949e8f7a19bfc36600b5522b Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:48:58 +0100 Subject: [PATCH 11/15] raid1: serialize the overlap write Before dispatch write bio, raid1 array which enables serialize_policy need to check if overlap exists between this bio and previous on-flying bios. If there is overlap, then it has to wait until the collision is disappeared. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid1.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 0439f674ab14..3ad2f5a59d08 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -430,6 +430,8 @@ static void raid1_end_write_request(struct bio *bio) int mirror = find_bio_disk(r1_bio, bio); struct md_rdev *rdev = conf->mirrors[mirror].rdev; bool discard_error; + sector_t lo = r1_bio->sector; + sector_t hi = r1_bio->sector + r1_bio->sectors; discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; @@ -499,12 +501,8 @@ static void raid1_end_write_request(struct bio *bio) } if (behind) { - if (test_bit(CollisionCheck, &rdev->flags)) { - sector_t lo = r1_bio->sector; - sector_t hi = r1_bio->sector + r1_bio->sectors; - + if (test_bit(CollisionCheck, &rdev->flags)) remove_serial(rdev, lo, hi); - } if (test_bit(WriteMostly, &rdev->flags)) atomic_dec(&r1_bio->behind_remaining); @@ -527,7 +525,8 @@ static void raid1_end_write_request(struct bio *bio) call_bio_endio(r1_bio); } } - } + } else if (rdev->mddev->serialize_policy) + remove_serial(rdev, lo, hi); if (r1_bio->bios[mirror] == NULL) rdev_dec_pending(rdev, conf->mddev); @@ -1337,6 +1336,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, struct raid1_plug_cb *plug = NULL; int first_clone; int max_sectors; + sector_t lo, hi; if (mddev_is_clustered(mddev) && md_cluster_ops->area_resyncing(mddev, WRITE, @@ -1364,6 +1364,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio = alloc_r1bio(mddev, bio); r1_bio->sectors = max_write_sectors; + lo = r1_bio->sector; + hi = r1_bio->sector + r1_bio->sectors; if (conf->pending_count >= max_queued_requests) { md_wakeup_thread(mddev->thread); @@ -1479,6 +1481,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, for (i = 0; i < disks; i++) { struct bio *mbio = NULL; + struct md_rdev *rdev = conf->mirrors[i].rdev; if (!r1_bio->bios[i]) continue; @@ -1506,19 +1509,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); if (r1_bio->behind_master_bio) { - struct md_rdev *rdev = conf->mirrors[i].rdev; - - if (test_bit(CollisionCheck, &rdev->flags)) { - sector_t lo = r1_bio->sector; - sector_t hi = r1_bio->sector + r1_bio->sectors; - + if (test_bit(CollisionCheck, &rdev->flags)) wait_event(rdev->serial_io_wait, check_and_add_serial(rdev, 
lo, hi) == 0); - } if (test_bit(WriteMostly, &rdev->flags)) atomic_inc(&r1_bio->behind_remaining); - } + } else if (mddev->serialize_policy) + wait_event(rdev->serial_io_wait, + check_and_add_serial(rdev, lo, hi) == 0); r1_bio->bios[i] = mbio; From 4d26d32fe4dafd29e168addb7c11949a36e7e5f8 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:48:59 +0100 Subject: [PATCH 12/15] md: don't destroy serial_info_pool if serialize_policy is true The serial_info_pool is needed if array sets serialize_policy to true, so don't destroy it. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 212e75dfebb7..92f0d45946e8 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1789,8 +1789,10 @@ void md_bitmap_destroy(struct mddev *mddev) return; md_bitmap_wait_behind_writes(mddev); - mempool_destroy(mddev->serial_info_pool); - mddev->serial_info_pool = NULL; + if (!mddev->serialize_policy) { + mempool_destroy(mddev->serial_info_pool); + mddev->serial_info_pool = NULL; + } mutex_lock(&mddev->bitmap_info.mutex); spin_lock(&mddev->lock); @@ -2476,8 +2478,10 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) mddev->bitmap_info.max_write_behind = backlog; if (!backlog && mddev->serial_info_pool) { /* serial_info_pool is not needed if backlog is zero */ - mempool_destroy(mddev->serial_info_pool); - mddev->serial_info_pool = NULL; + if (!mddev->serialize_policy) { + mempool_destroy(mddev->serial_info_pool); + mddev->serial_info_pool = NULL; + } } else if (backlog && !mddev->serial_info_pool) { /* serial_info_pool is needed since backlog is not zero */ struct md_rdev *rdev; From 69b00b5bb23552d43e8bbed73ef6624604bb94a2 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:49:00 +0100 Subject: [PATCH 13/15] md: introduce a new struct for IO serialization Obviously, IO serialization could cause the degradation of performance a lot. In order to reduce the degradation, so a rb interval tree is added in raid1 to speed up the check of collision. So, a rb root is needed in md_rdev, then abstract all the serialize related members to a new struct (serial_in_rdev), embed it into md_rdev. Of course, we need to free the struct if it is not needed anymore, so rdev/rdevs_uninit_serial are added accordingly. And they should be called when destroty memory pool or can't alloc memory. And we need to consider to call mddev_destroy_serial_pool in case serialize_policy/write-behind is disabled, bitmap is destroyed or in __md_stop_writes. 
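
Conceptually, the collision check then becomes an interval-tree lookup
plus insert under serial_lock; a simplified sketch of the raid1 side
(matching the hunks below) looks like:

        spin_lock_irqsave(&serial->serial_lock, flags);
        /* any overlap with an in-flight write in [lo, hi]? */
        if (raid1_rb_iter_first(&serial->serial_rb, lo, hi))
                ret = -EBUSY;
        else {
                si->start = lo;
                si->last = hi;
                raid1_rb_insert(si, &serial->serial_rb);
        }
        spin_unlock_irqrestore(&serial->serial_lock, flags);
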
Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 12 ++---- drivers/md/md.c | 84 ++++++++++++++++++++++++++++++++---------- drivers/md/md.h | 26 ++++++++----- drivers/md/raid1.c | 59 +++++++++++++++-------------- 4 files changed, 117 insertions(+), 64 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 92f0d45946e8..e230052c2107 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1789,10 +1789,8 @@ void md_bitmap_destroy(struct mddev *mddev) return; md_bitmap_wait_behind_writes(mddev); - if (!mddev->serialize_policy) { - mempool_destroy(mddev->serial_info_pool); - mddev->serial_info_pool = NULL; - } + if (!mddev->serialize_policy) + mddev_destroy_serial_pool(mddev, NULL, true); mutex_lock(&mddev->bitmap_info.mutex); spin_lock(&mddev->lock); @@ -2478,10 +2476,8 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) mddev->bitmap_info.max_write_behind = backlog; if (!backlog && mddev->serial_info_pool) { /* serial_info_pool is not needed if backlog is zero */ - if (!mddev->serialize_policy) { - mempool_destroy(mddev->serial_info_pool); - mddev->serial_info_pool = NULL; - } + if (!mddev->serialize_policy) + mddev_destroy_serial_pool(mddev, NULL, false); } else if (backlog && !mddev->serial_info_pool) { /* serial_info_pool is needed since backlog is not zero */ struct md_rdev *rdev; diff --git a/drivers/md/md.c b/drivers/md/md.c index 788559f42d43..9c4e61c988ac 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -125,25 +125,59 @@ static inline int speed_max(struct mddev *mddev) mddev->sync_speed_max : sysctl_speed_limit_max; } -static int rdev_init_serial(struct md_rdev *rdev) +static void rdev_uninit_serial(struct md_rdev *rdev) { - spin_lock_init(&rdev->serial_list_lock); - INIT_LIST_HEAD(&rdev->serial_list); - init_waitqueue_head(&rdev->serial_io_wait); - set_bit(CollisionCheck, &rdev->flags); + if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) + return; - return 1; + kfree(rdev->serial); + rdev->serial = NULL; } -static void rdevs_init_serial(struct mddev *mddev) +static void rdevs_uninit_serial(struct mddev *mddev) { struct md_rdev *rdev; + rdev_for_each(rdev, mddev) + rdev_uninit_serial(rdev); +} + +static int rdev_init_serial(struct md_rdev *rdev) +{ + struct serial_in_rdev *serial = NULL; + + if (test_bit(CollisionCheck, &rdev->flags)) + return 0; + + serial = kmalloc(sizeof(struct serial_in_rdev), GFP_KERNEL); + if (!serial) + return -ENOMEM; + + spin_lock_init(&serial->serial_lock); + serial->serial_rb = RB_ROOT_CACHED; + init_waitqueue_head(&serial->serial_io_wait); + rdev->serial = serial; + set_bit(CollisionCheck, &rdev->flags); + + return 0; +} + +static int rdevs_init_serial(struct mddev *mddev) +{ + struct md_rdev *rdev; + int ret = 0; + rdev_for_each(rdev, mddev) { - if (test_bit(CollisionCheck, &rdev->flags)) - continue; - rdev_init_serial(rdev); + ret = rdev_init_serial(rdev); + if (ret) + break; } + + /* Free all resources if pool is not existed */ + if (ret && !mddev->serial_info_pool) + rdevs_uninit_serial(mddev); + + return ret; } /* @@ -166,6 +200,8 @@ static int rdev_need_serial(struct md_rdev *rdev) void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, bool is_suspend) { + int ret = 0; + if (rdev && !rdev_need_serial(rdev) && !test_bit(CollisionCheck, &rdev->flags)) return; @@ -174,9 +210,11 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, mddev_suspend(mddev); if (!rdev) - rdevs_init_serial(mddev); + ret = 
rdevs_init_serial(mddev); else - rdev_init_serial(rdev); + ret = rdev_init_serial(rdev); + if (ret) + goto abort; if (mddev->serial_info_pool == NULL) { unsigned int noio_flag; @@ -186,9 +224,13 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, mempool_create_kmalloc_pool(NR_SERIAL_INFOS, sizeof(struct serial_info)); memalloc_noio_restore(noio_flag); - if (!mddev->serial_info_pool) + if (!mddev->serial_info_pool) { + rdevs_uninit_serial(mddev); pr_err("can't alloc memory pool for serialization\n"); + } } + +abort: if (!is_suspend) mddev_resume(mddev); } @@ -199,8 +241,8 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, * 2. when bitmap is destroyed while policy is not enabled. * 3. for disable policy, the pool is destroyed only when no rdev needs it. */ -static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend) +void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend) { if (rdev && !test_bit(CollisionCheck, &rdev->flags)) return; @@ -213,8 +255,9 @@ static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, mddev_suspend(mddev); rdev_for_each(temp, mddev) { if (!rdev) { - if (!rdev_need_serial(temp)) - clear_bit(CollisionCheck, &temp->flags); + if (!mddev->serialize_policy || + !rdev_need_serial(temp)) + rdev_uninit_serial(temp); else num++; } else if (temp != rdev && @@ -223,7 +266,7 @@ static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, } if (rdev) - clear_bit(CollisionCheck, &rdev->flags); + rdev_uninit_serial(rdev); if (num) pr_info("The mempool could be used by other devices\n"); @@ -6117,8 +6160,9 @@ static void __md_stop_writes(struct mddev *mddev) mddev->in_sync = 1; md_update_sb(mddev, 1); } - mempool_destroy(mddev->serial_info_pool); - mddev->serial_info_pool = NULL; + /* disable policy to guarantee rdevs free resources for serialization */ + mddev->serialize_policy = 0; + mddev_destroy_serial_pool(mddev, NULL, true); } void md_stop_writes(struct mddev *mddev) diff --git a/drivers/md/md.h b/drivers/md/md.h index f51a3afaee1b..acd681939112 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -32,6 +32,16 @@ * be retried. */ #define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT) + +/* + * The struct embedded in rdev is used to serialize IO. + */ +struct serial_in_rdev { + struct rb_root_cached serial_rb; + spinlock_t serial_lock; + wait_queue_head_t serial_io_wait; +}; + /* * MD's 'extended' device */ @@ -110,12 +120,7 @@ struct md_rdev { * in superblock. */ - /* - * The members for check collision of write IOs. 
- */ - struct list_head serial_list; - spinlock_t serial_list_lock; - wait_queue_head_t serial_io_wait; + struct serial_in_rdev *serial; /* used for raid1 io serialization */ struct work_struct del_work; /* used for delayed sysfs removal */ @@ -266,9 +271,10 @@ enum mddev_sb_flags { #define NR_SERIAL_INFOS 8 /* record current range of serialize IOs */ struct serial_info { - sector_t lo; - sector_t hi; - struct list_head list; + struct rb_node node; + sector_t start; /* start sector of rb node */ + sector_t last; /* end sector of rb node */ + sector_t _subtree_last; /* highest sector in subtree of rb node */ }; struct mddev { @@ -740,6 +746,8 @@ extern void md_update_sb(struct mddev *mddev, int force); extern void md_kick_rdev_from_array(struct md_rdev * rdev); extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, bool is_suspend); +extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 3ad2f5a59d08..5c6a03747448 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -50,55 +51,58 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr); #include "raid1-10.c" +#define START(node) ((node)->start) +#define LAST(node) ((node)->last) +INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last, + START, LAST, static inline, raid1_rb); + static int check_and_add_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) { - struct serial_info *wi, *temp_wi; + struct serial_info *si; unsigned long flags; int ret = 0; struct mddev *mddev = rdev->mddev; + struct serial_in_rdev *serial = rdev->serial; - wi = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); - - spin_lock_irqsave(&rdev->serial_list_lock, flags); - list_for_each_entry(temp_wi, &rdev->serial_list, list) { - /* collision happened */ - if (hi > temp_wi->lo && lo < temp_wi->hi) { - ret = -EBUSY; - break; - } - } + si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); + spin_lock_irqsave(&serial->serial_lock, flags); + /* collision happened */ + if (raid1_rb_iter_first(&serial->serial_rb, lo, hi)) + ret = -EBUSY; if (!ret) { - wi->lo = lo; - wi->hi = hi; - list_add(&wi->list, &rdev->serial_list); + si->start = lo; + si->last = hi; + raid1_rb_insert(si, &serial->serial_rb); } else - mempool_free(wi, mddev->serial_info_pool); - spin_unlock_irqrestore(&rdev->serial_list_lock, flags); + mempool_free(si, mddev->serial_info_pool); + spin_unlock_irqrestore(&serial->serial_lock, flags); return ret; } static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) { - struct serial_info *wi; + struct serial_info *si; unsigned long flags; int found = 0; struct mddev *mddev = rdev->mddev; + struct serial_in_rdev *serial = rdev->serial; - spin_lock_irqsave(&rdev->serial_list_lock, flags); - list_for_each_entry(wi, &rdev->serial_list, list) - if (hi == wi->hi && lo == wi->lo) { - list_del(&wi->list); - mempool_free(wi, mddev->serial_info_pool); + spin_lock_irqsave(&serial->serial_lock, flags); + for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi); + si; si = raid1_rb_iter_next(si, lo, hi)) { + if (si->start == lo && si->last == hi) { + raid1_rb_remove(si, &serial->serial_rb); + mempool_free(si, mddev->serial_info_pool); found = 1; break; } - + } if (!found) WARN(1, "The write IO is not 
recorded for serialization\n"); - spin_unlock_irqrestore(&rdev->serial_list_lock, flags); - wake_up(&rdev->serial_io_wait); + spin_unlock_irqrestore(&serial->serial_lock, flags); + wake_up(&serial->serial_io_wait); } /* @@ -1482,6 +1486,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, for (i = 0; i < disks; i++) { struct bio *mbio = NULL; struct md_rdev *rdev = conf->mirrors[i].rdev; + struct serial_in_rdev *serial = rdev->serial; if (!r1_bio->bios[i]) continue; @@ -1510,13 +1515,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (r1_bio->behind_master_bio) { if (test_bit(CollisionCheck, &rdev->flags)) - wait_event(rdev->serial_io_wait, + wait_event(serial->serial_io_wait, check_and_add_serial(rdev, lo, hi) == 0); if (test_bit(WriteMostly, &rdev->flags)) atomic_inc(&r1_bio->behind_remaining); } else if (mddev->serialize_policy) - wait_event(rdev->serial_io_wait, + wait_event(serial->serial_io_wait, check_and_add_serial(rdev, lo, hi) == 0); r1_bio->bios[i] = mbio; From 025471f9f50fede6527c70336484becbcb2aff28 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:49:01 +0100 Subject: [PATCH 14/15] md/raid1: use bucket based mechanism for IO serialization Since raid1 had already used bucket based mechanism to reduce the conflict between write IO and resync IO, it is possible to speed up performance for io serialization with refer to the same mechanism. To align with the barrier bucket mechanism, we created arrays (with the same number of BARRIER_BUCKETS_NR) for spinlock, rb tree and waitqueue. Then we can reduce lock competition with multiple spinlocks, boost search performance with multiple rb trees and also reduce thundering herd problem with multiple waitqueues. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 18 +++++++++++++----- drivers/md/raid1.c | 9 ++++++--- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 9c4e61c988ac..4824d50526fa 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -130,7 +130,7 @@ static void rdev_uninit_serial(struct md_rdev *rdev) if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) return; - kfree(rdev->serial); + kvfree(rdev->serial); rdev->serial = NULL; } @@ -144,18 +144,26 @@ static void rdevs_uninit_serial(struct mddev *mddev) static int rdev_init_serial(struct md_rdev *rdev) { + /* serial_nums equals with BARRIER_BUCKETS_NR */ + int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t)))); struct serial_in_rdev *serial = NULL; if (test_bit(CollisionCheck, &rdev->flags)) return 0; - serial = kmalloc(sizeof(struct serial_in_rdev), GFP_KERNEL); + serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums, + GFP_KERNEL); if (!serial) return -ENOMEM; - spin_lock_init(&serial->serial_lock); - serial->serial_rb = RB_ROOT_CACHED; - init_waitqueue_head(&serial->serial_io_wait); + for (i = 0; i < serial_nums; i++) { + struct serial_in_rdev *serial_tmp = &serial[i]; + + spin_lock_init(&serial_tmp->serial_lock); + serial_tmp->serial_rb = RB_ROOT_CACHED; + init_waitqueue_head(&serial_tmp->serial_io_wait); + } + rdev->serial = serial; set_bit(CollisionCheck, &rdev->flags); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 5c6a03747448..48d553d7989a 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -62,7 +62,8 @@ static int check_and_add_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) unsigned long flags; int ret = 0; struct mddev *mddev = rdev->mddev; - struct serial_in_rdev *serial 
From d0d2d8ba0494655a01b97542c083e51b29cf8637 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang
Date: Mon, 23 Dec 2019 10:49:02 +0100
Subject: [PATCH 15/15] md/raid1: introduce wait_for_serialization

Previously, we called check_and_add_serial when serialization was
enabled for write IO, but it could allocate and free memory back and
forth.

Now, let's just get an element from the memory pool with the new
function, then insert the node into the rb tree if no collision
happens.

Signed-off-by: Guoqing Jiang
Signed-off-by: Song Liu
---
 drivers/md/raid1.c | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 48d553d7989a..cd810e195086 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -56,32 +56,43 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
 INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last,
 		     START, LAST, static inline, raid1_rb);
 
-static int check_and_add_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
+static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio,
+				struct serial_info *si, int idx)
 {
-	struct serial_info *si;
 	unsigned long flags;
 	int ret = 0;
-	struct mddev *mddev = rdev->mddev;
-	int idx = sector_to_idx(lo);
+	sector_t lo = r1_bio->sector;
+	sector_t hi = lo + r1_bio->sectors;
 	struct serial_in_rdev *serial = &rdev->serial[idx];
 
-	si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO);
-
 	spin_lock_irqsave(&serial->serial_lock, flags);
 	/* collision happened */
 	if (raid1_rb_iter_first(&serial->serial_rb, lo, hi))
 		ret = -EBUSY;
-	if (!ret) {
+	else {
 		si->start = lo;
 		si->last = hi;
 		raid1_rb_insert(si, &serial->serial_rb);
-	} else
-		mempool_free(si, mddev->serial_info_pool);
+	}
 	spin_unlock_irqrestore(&serial->serial_lock, flags);
 
 	return ret;
 }
 
+static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio)
+{
+	struct mddev *mddev = rdev->mddev;
+	struct serial_info *si;
+	int idx = sector_to_idx(r1_bio->sector);
+	struct serial_in_rdev *serial = &rdev->serial[idx];
+
+	if (WARN_ON(!mddev->serial_info_pool))
+		return;
+	si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO);
+	wait_event(serial->serial_io_wait,
+		   check_and_add_serial(rdev, r1_bio, si, idx) == 0);
+}
+
 static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
 {
 	struct serial_info *si;
@@ -1342,7 +1353,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	struct raid1_plug_cb *plug = NULL;
 	int first_clone;
 	int max_sectors;
-	sector_t lo, hi;
 
 	if (mddev_is_clustered(mddev) &&
 	     md_cluster_ops->area_resyncing(mddev, WRITE,
@@ -1370,8 +1380,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 
 	r1_bio = alloc_r1bio(mddev, bio);
 	r1_bio->sectors = max_write_sectors;
-	lo = r1_bio->sector;
-	hi = r1_bio->sector + r1_bio->sectors;
 
 	if (conf->pending_count >= max_queued_requests) {
 		md_wakeup_thread(mddev->thread);
@@ -1488,8 +1496,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	for (i = 0; i < disks; i++) {
 		struct bio *mbio = NULL;
 		struct md_rdev *rdev = conf->mirrors[i].rdev;
-		int idx = sector_to_idx(lo);
-		struct serial_in_rdev *serial = &rdev->serial[idx];
 
 		if (!r1_bio->bios[i])
 			continue;
@@ -1518,14 +1524,11 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 
 		if (r1_bio->behind_master_bio) {
 			if (test_bit(CollisionCheck, &rdev->flags))
-				wait_event(serial->serial_io_wait,
-					   check_and_add_serial(rdev, lo, hi)
-					   == 0);
+				wait_for_serialization(rdev, r1_bio);
 			if (test_bit(WriteMostly, &rdev->flags))
 				atomic_inc(&r1_bio->behind_remaining);
 		} else if (mddev->serialize_policy)
-			wait_event(serial->serial_io_wait,
-				   check_and_add_serial(rdev, lo, hi) == 0);
+			wait_for_serialization(rdev, r1_bio);
 
 		r1_bio->bios[i] = mbio;
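The control flow patch 15 ends up with is: allocate a serial_info from
the mempool once, then sleep on the bucket's waitqueue until
check_and_add_serial() manages to insert it, i.e. until no overlapping
write is pending in that bucket. The following user-space sketch
models only the overlap test, using a plain array instead of the
kernel's cached interval rb tree and with no locking or sleeping; the
names, the array size and the inclusive range convention are invented
for the example.

/* serial_demo.c - toy model of the per-bucket collision check; not kernel code. */
#include <stdio.h>
#include <stdint.h>

struct demo_serial_info {
	uint64_t start;	/* first sector of a pending write      */
	uint64_t last;	/* last sector (inclusive end of range) */
};

#define DEMO_MAX_PENDING 16

static struct demo_serial_info pending[DEMO_MAX_PENDING];
static int npending;

/* Return -1 on collision (the caller would sleep and retry), 0 when recorded. */
static int demo_check_and_add_serial(uint64_t lo, uint64_t hi)
{
	int i;

	if (npending >= DEMO_MAX_PENDING)
		return -1;	/* demo-only bound on pending writes */

	for (i = 0; i < npending; i++)
		if (pending[i].start <= hi && lo <= pending[i].last)
			return -1;	/* overlaps a pending write */

	pending[npending].start = lo;
	pending[npending].last = hi;
	npending++;
	return 0;
}

int main(void)
{
	printf("add [0,7]  -> %d\n", demo_check_and_add_serial(0, 7));	/* 0: recorded  */
	printf("add [4,11] -> %d\n", demo_check_and_add_serial(4, 11));	/* -1: collides */
	printf("add [8,15] -> %d\n", demo_check_and_add_serial(8, 15));	/* 0: recorded  */
	return 0;
}

In the kernel the failed path returns -EBUSY instead of -1 and
wait_event() puts the writer to sleep until remove_serial() wakes the
bucket's waitqueue; the mempool element is allocated once before
wait_event(), which removes the allocate-and-free cycle on every
collision that the commit message describes.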