From 3e148a3209792e04f63ec99701235c960765fc9a Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 19 Jun 2019 17:30:46 +0800 Subject: [PATCH 1/5] md/raid1: fix potential data inconsistency issue with write behind device For write-behind mode, we think write IO is complete once it has reached all the non-writemostly devices. It works fine for single queue devices. But for multiqueue device, if there are lots of IOs come from upper layer, then the write-behind device could issue those IOs to different queues, depends on the each queue's delay, so there is no guarantee that those IOs can arrive in order. To address the issue, we need to check the collision among write behind IOs, we can only continue without collision, otherwise wait for the completion of previous collisioned IO. And WBCollision is introduced for multiqueue device which is worked under write-behind mode. But this patch doesn't handle below cases which could have the data inconsistency issue as well, these cases will be handled in later patches. 1. modify max_write_behind by write backlog node. 2. add or remove array's bitmap dynamically. 3. the change of member disk. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 41 ++++++++++++++++++++++++++++ drivers/md/md.h | 21 ++++++++++++++ drivers/md/raid1.c | 68 +++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 129 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 1f37a1adc926..9a9762c83cc8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -124,6 +124,19 @@ static inline int speed_max(struct mddev *mddev) mddev->sync_speed_max : sysctl_speed_limit_max; } +static int rdev_init_wb(struct md_rdev *rdev) +{ + if (rdev->bdev->bd_queue->nr_hw_queues == 1) + return 0; + + spin_lock_init(&rdev->wb_list_lock); + INIT_LIST_HEAD(&rdev->wb_list); + init_waitqueue_head(&rdev->wb_io_wait); + set_bit(WBCollisionCheck, &rdev->flags); + + return 1; +} + static struct ctl_table_header *raid_table_header; static struct ctl_table raid_table[] = { @@ -5597,6 +5610,32 @@ int md_run(struct mddev *mddev) md_bitmap_destroy(mddev); goto abort; } + + if (mddev->bitmap_info.max_write_behind > 0) { + bool creat_pool = false; + + rdev_for_each(rdev, mddev) { + if (test_bit(WriteMostly, &rdev->flags) && + rdev_init_wb(rdev)) + creat_pool = true; + } + if (creat_pool && mddev->wb_info_pool == NULL) { + mddev->wb_info_pool = + mempool_create_kmalloc_pool(NR_WB_INFOS, + sizeof(struct wb_info)); + if (!mddev->wb_info_pool) { + err = -ENOMEM; + mddev_detach(mddev); + if (mddev->private) + pers->free(mddev, mddev->private); + mddev->private = NULL; + module_put(pers->owner); + md_bitmap_destroy(mddev); + goto abort; + } + } + } + if (mddev->queue) { bool nonrot = true; @@ -5825,6 +5864,8 @@ static void __md_stop_writes(struct mddev *mddev) mddev->in_sync = 1; md_update_sb(mddev, 1); } + mempool_destroy(mddev->wb_info_pool); + mddev->wb_info_pool = NULL; } void md_stop_writes(struct mddev *mddev) diff --git a/drivers/md/md.h b/drivers/md/md.h index 7c930c091193..d449d514cff9 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -109,6 +109,14 @@ struct md_rdev { * for reporting to userspace and storing * in superblock. */ + + /* + * The members for check collision of write behind IOs. + */ + struct list_head wb_list; + spinlock_t wb_list_lock; + wait_queue_head_t wb_io_wait; + struct work_struct del_work; /* used for delayed sysfs removal */ struct kernfs_node *sysfs_state; /* handle for 'state' @@ -193,6 +201,10 @@ enum flag_bits { * it didn't fail, so don't use FailFast * any more for metadata */ + WBCollisionCheck, /* + * multiqueue device should check if there + * is collision between write behind bios. + */ }; static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, @@ -245,6 +257,14 @@ enum mddev_sb_flags { MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */ }; +#define NR_WB_INFOS 8 +/* record current range of write behind IOs */ +struct wb_info { + sector_t lo; + sector_t hi; + struct list_head list; +}; + struct mddev { void *private; struct md_personality *pers; @@ -461,6 +481,7 @@ struct mddev { */ struct work_struct flush_work; struct work_struct event_work; /* used by dm to report failure event */ + mempool_t *wb_info_pool; void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_cluster_info *cluster_info; unsigned int good_device_nr; /* good device num within cluster raid */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a7860b5f33f2..3d44da663797 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -50,6 +50,57 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr); #include "raid1-10.c" +static int check_and_add_wb(struct md_rdev *rdev, sector_t lo, sector_t hi) +{ + struct wb_info *wi, *temp_wi; + unsigned long flags; + int ret = 0; + struct mddev *mddev = rdev->mddev; + + wi = mempool_alloc(mddev->wb_info_pool, GFP_NOIO); + + spin_lock_irqsave(&rdev->wb_list_lock, flags); + list_for_each_entry(temp_wi, &rdev->wb_list, list) { + /* collision happened */ + if (hi > temp_wi->lo && lo < temp_wi->hi) { + ret = -EBUSY; + break; + } + } + + if (!ret) { + wi->lo = lo; + wi->hi = hi; + list_add(&wi->list, &rdev->wb_list); + } else + mempool_free(wi, mddev->wb_info_pool); + spin_unlock_irqrestore(&rdev->wb_list_lock, flags); + + return ret; +} + +static void remove_wb(struct md_rdev *rdev, sector_t lo, sector_t hi) +{ + struct wb_info *wi; + unsigned long flags; + int found = 0; + struct mddev *mddev = rdev->mddev; + + spin_lock_irqsave(&rdev->wb_list_lock, flags); + list_for_each_entry(wi, &rdev->wb_list, list) + if (hi == wi->hi && lo == wi->lo) { + list_del(&wi->list); + mempool_free(wi, mddev->wb_info_pool); + found = 1; + break; + } + + if (!found) + WARN_ON("The write behind IO is not recorded\n"); + spin_unlock_irqrestore(&rdev->wb_list_lock, flags); + wake_up(&rdev->wb_io_wait); +} + /* * for resync bio, r1bio pointer can be retrieved from the per-bio * 'struct resync_pages'. @@ -446,6 +497,12 @@ static void raid1_end_write_request(struct bio *bio) } if (behind) { + if (test_bit(WBCollisionCheck, &rdev->flags)) { + sector_t lo = r1_bio->sector; + sector_t hi = r1_bio->sector + r1_bio->sectors; + + remove_wb(rdev, lo, hi); + } if (test_bit(WriteMostly, &rdev->flags)) atomic_dec(&r1_bio->behind_remaining); @@ -1443,7 +1500,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); if (r1_bio->behind_master_bio) { - if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) + struct md_rdev *rdev = conf->mirrors[i].rdev; + + if (test_bit(WBCollisionCheck, &rdev->flags)) { + sector_t lo = r1_bio->sector; + sector_t hi = r1_bio->sector + r1_bio->sectors; + + wait_event(rdev->wb_io_wait, + check_and_add_wb(rdev, lo, hi) == 0); + } + if (test_bit(WriteMostly, &rdev->flags)) atomic_inc(&r1_bio->behind_remaining); } From 963c555e75b033202dd76cf6325a7b7c83d08d5f Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 14 Jun 2019 17:10:36 +0800 Subject: [PATCH 2/5] md: introduce mddev_create/destroy_wb_pool for the change of member device Previously, we called rdev_init_wb to avoid potential data inconsistency when array is created. Now, we need to call the function and create mempool if a device is added or just be flaged as "writemostly". So mddev_create_wb_pool is introduced and called accordingly. And for safety reason, we mark implicit GFP_NOIO allocation scope for create mempool during mddev_suspend/mddev_resume. And mempool should be removed conversely after remove a member device or its's "writemostly" flag, which is done by call mddev_destroy_wb_pool. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++ drivers/md/md.h | 2 ++ 2 files changed, 67 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 9a9762c83cc8..b43207ab00b2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -37,6 +37,7 @@ */ +#include #include #include #include @@ -137,6 +138,64 @@ static int rdev_init_wb(struct md_rdev *rdev) return 1; } +/* + * Create wb_info_pool if rdev is the first multi-queue device flaged + * with writemostly, also write-behind mode is enabled. + */ +void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend) +{ + if (mddev->bitmap_info.max_write_behind == 0) + return; + + if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_wb(rdev)) + return; + + if (mddev->wb_info_pool == NULL) { + unsigned int noio_flag; + + if (!is_suspend) + mddev_suspend(mddev); + noio_flag = memalloc_noio_save(); + mddev->wb_info_pool = mempool_create_kmalloc_pool(NR_WB_INFOS, + sizeof(struct wb_info)); + memalloc_noio_restore(noio_flag); + if (!mddev->wb_info_pool) + pr_err("can't alloc memory pool for writemostly\n"); + if (!is_suspend) + mddev_resume(mddev); + } +} +EXPORT_SYMBOL_GPL(mddev_create_wb_pool); + +/* + * destroy wb_info_pool if rdev is the last device flaged with WBCollisionCheck. + */ +static void mddev_destroy_wb_pool(struct mddev *mddev, struct md_rdev *rdev) +{ + if (!test_and_clear_bit(WBCollisionCheck, &rdev->flags)) + return; + + if (mddev->wb_info_pool) { + struct md_rdev *temp; + int num = 0; + + /* + * Check if other rdevs need wb_info_pool. + */ + rdev_for_each(temp, mddev) + if (temp != rdev && + test_bit(WBCollisionCheck, &temp->flags)) + num++; + if (!num) { + mddev_suspend(rdev->mddev); + mempool_destroy(mddev->wb_info_pool); + mddev->wb_info_pool = NULL; + mddev_resume(rdev->mddev); + } + } +} + static struct ctl_table_header *raid_table_header; static struct ctl_table raid_table[] = { @@ -2223,6 +2282,9 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) rdev->mddev = mddev; pr_debug("md: bind<%s>\n", b); + if (mddev->raid_disks) + mddev_create_wb_pool(mddev, rdev, false); + if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; @@ -2259,6 +2321,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev) bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b)); + mddev_destroy_wb_pool(rdev->mddev, rdev); rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); @@ -2771,8 +2834,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) } } else if (cmd_match(buf, "writemostly")) { set_bit(WriteMostly, &rdev->flags); + mddev_create_wb_pool(rdev->mddev, rdev, false); err = 0; } else if (cmd_match(buf, "-writemostly")) { + mddev_destroy_wb_pool(rdev->mddev, rdev); clear_bit(WriteMostly, &rdev->flags); err = 0; } else if (cmd_match(buf, "blocked")) { diff --git a/drivers/md/md.h b/drivers/md/md.h index d449d514cff9..10f98200e2f8 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -730,6 +730,8 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void md_kick_rdev_from_array(struct md_rdev * rdev); +extern void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); From 10c92fca636e40dcb15d85ffe06b1b6843cd28fc Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 14 Jun 2019 17:10:37 +0800 Subject: [PATCH 3/5] md-bitmap: create and destroy wb_info_pool with the change of backlog Since we can enable write-behind mode by write backlog node, so create wb_info_pool if the mode is just enabled, also call call md_bitmap_update_sb to make user aware the write-behind mode is enabled. Conversely, wb_info_pool should be destroyed when write-behind mode is disabled. Beside above, it is better to update bitmap sb if we change the number of max_write_behind. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index c01d41198f5e..15dd817fe83b 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2462,12 +2462,26 @@ static ssize_t backlog_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long backlog; + unsigned long old_mwb = mddev->bitmap_info.max_write_behind; int rv = kstrtoul(buf, 10, &backlog); if (rv) return rv; if (backlog > COUNTER_MAX) return -EINVAL; mddev->bitmap_info.max_write_behind = backlog; + if (!backlog && mddev->wb_info_pool) { + /* wb_info_pool is not needed if backlog is zero */ + mempool_destroy(mddev->wb_info_pool); + mddev->wb_info_pool = NULL; + } else if (backlog && !mddev->wb_info_pool) { + /* wb_info_pool is needed since backlog is not zero */ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) + mddev_create_wb_pool(mddev, rdev, false); + } + if (old_mwb != backlog) + md_bitmap_update_sb(mddev->bitmap); return len; } From 617b194a13c0f3b0a6d14fc6227c222877c23b4e Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 14 Jun 2019 17:10:38 +0800 Subject: [PATCH 4/5] md-bitmap: create and destroy wb_info_pool with the change of bitmap The write-behind attribute is part of bitmap, since bitmap can be added/removed dynamically with the following. 1. mdadm --grow /dev/md0 --bitmap=none 2. mdadm --grow /dev/md0 --bitmap=internal --write-behind So we need to destroy wb_info_pool in md_bitmap_destroy, and create the pool before load bitmap. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 15dd817fe83b..b092c7b5282f 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1790,6 +1790,8 @@ void md_bitmap_destroy(struct mddev *mddev) return; md_bitmap_wait_behind_writes(mddev); + mempool_destroy(mddev->wb_info_pool); + mddev->wb_info_pool = NULL; mutex_lock(&mddev->bitmap_info.mutex); spin_lock(&mddev->lock); @@ -1900,10 +1902,14 @@ int md_bitmap_load(struct mddev *mddev) sector_t start = 0; sector_t sector = 0; struct bitmap *bitmap = mddev->bitmap; + struct md_rdev *rdev; if (!bitmap) goto out; + rdev_for_each(rdev, mddev) + mddev_create_wb_pool(mddev, rdev, true); + if (mddev_is_clustered(mddev)) md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); From d494549ac8852ec42854d1491dd17bb9350a0abc Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 14 Jun 2019 17:10:39 +0800 Subject: [PATCH 5/5] md: add bitmap_abort label in md_run Now, there are two places need to consider about the failure of destroy bitmap, so move the common part between bitmap_abort and abort label. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index b43207ab00b2..692fc365e73c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5666,15 +5666,8 @@ int md_run(struct mddev *mddev) mddev->bitmap = bitmap; } - if (err) { - mddev_detach(mddev); - if (mddev->private) - pers->free(mddev, mddev->private); - mddev->private = NULL; - module_put(pers->owner); - md_bitmap_destroy(mddev); - goto abort; - } + if (err) + goto bitmap_abort; if (mddev->bitmap_info.max_write_behind > 0) { bool creat_pool = false; @@ -5690,13 +5683,7 @@ int md_run(struct mddev *mddev) sizeof(struct wb_info)); if (!mddev->wb_info_pool) { err = -ENOMEM; - mddev_detach(mddev); - if (mddev->private) - pers->free(mddev, mddev->private); - mddev->private = NULL; - module_put(pers->owner); - md_bitmap_destroy(mddev); - goto abort; + goto bitmap_abort; } } } @@ -5761,6 +5748,13 @@ int md_run(struct mddev *mddev) sysfs_notify(&mddev->kobj, NULL, "degraded"); return 0; +bitmap_abort: + mddev_detach(mddev); + if (mddev->private) + pers->free(mddev, mddev->private); + mddev->private = NULL; + module_put(pers->owner); + md_bitmap_destroy(mddev); abort: bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set);