From a865b96c513bcaeec49669010d67c40aa8e58619 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Mon, 29 May 2023 21:20:32 +0800
Subject: [PATCH 01/36] Revert "md: unlock mddev before reap sync_thread in
 action_store"

This reverts commit 9dfbdafda3b34e262e43e786077bab8e476a89d1.

Because it will introduce a defect that sync_thread can be running while
MD_RECOVERY_RUNNING is cleared, which will cause some unexpected problems,
for example:

list_add corruption. prev->next should be next (ffff0001ac1daba0), but was ffff0000ce1a02a0. (prev=ffff0000ce1a02a0).
Call trace:
 __list_add_valid+0xfc/0x140
 insert_work+0x78/0x1a0
 __queue_work+0x500/0xcf4
 queue_work_on+0xe8/0x12c
 md_check_recovery+0xa34/0xf30
 raid10d+0xb8/0x900 [raid10]
 md_thread+0x16c/0x2cc
 kthread+0x1a4/0x1ec
 ret_from_fork+0x10/0x18

This is because work is requeued while it's still inside workqueue:

t1:			t2:
action_store
 mddev_lock
  if (mddev->sync_thread)
   mddev_unlock
   md_unregister_thread
   // first sync_thread is done
			md_check_recovery
			 mddev_try_lock
			 /*
			  * once MD_RECOVERY_DONE is set, new sync_thread
			  * can start.
			  */
			 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery)
			 INIT_WORK(&mddev->del_work, md_start_sync)
			 queue_work(md_misc_wq, &mddev->del_work)
			  test_and_set_bit(WORK_STRUCT_PENDING_BIT, ...)
			  // set pending bit
			  insert_work
			   list_add_tail
			 mddev_unlock
   mddev_lock_nointr
   md_reap_sync_thread
   // MD_RECOVERY_RUNNING is cleared
 mddev_unlock

t3:

// before queued work started from t2
md_check_recovery
 // MD_RECOVERY_RUNNING is not set, a new sync_thread can be started
 INIT_WORK(&mddev->del_work, md_start_sync)
  work->data = 0
  // work pending bit is cleared
 queue_work(md_misc_wq, &mddev->del_work)
  insert_work
   list_add_tail
   // list is corrupted

The above commit is reverted to fix the problem, the deadlock this
commit tries to fix will be fixed in following patches.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230529132037.2124527-2-yukuai1@huaweicloud.com
---
 drivers/md/dm-raid.c |  1 -
 drivers/md/md.c      | 19 ++-----------------
 2 files changed, 2 insertions(+), 18 deletions(-)

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 8846bf510a35..1f22bef27841 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3725,7 +3725,6 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv,
 	if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) {
 		if (mddev->sync_thread) {
 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-			md_unregister_thread(&mddev->sync_thread);
 			md_reap_sync_thread(mddev);
 		}
 	} else if (decipher_sync_action(mddev, mddev->recovery) != st_idle)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2e38ef421d69..d445d5fb7a01 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4764,19 +4764,6 @@ action_store(struct mddev *mddev, const char *page, size_t len)
 			if (work_pending(&mddev->del_work))
 				flush_workqueue(md_misc_wq);
 			if (mddev->sync_thread) {
-				sector_t save_rp = mddev->reshape_position;
-
-				mddev_unlock(mddev);
-				set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-				md_unregister_thread(&mddev->sync_thread);
-				mddev_lock_nointr(mddev);
-				/*
-				 * set RECOVERY_INTR again and restore reshape
-				 * position in case others changed them after
-				 * got lock, eg, reshape_position_store and
-				 * md_check_recovery.
-				 */
-				mddev->reshape_position = save_rp;
 				set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 				md_reap_sync_thread(mddev);
 			}
@@ -6176,7 +6163,6 @@ static void __md_stop_writes(struct mddev *mddev)
 		flush_workqueue(md_misc_wq);
 	if (mddev->sync_thread) {
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-		md_unregister_thread(&mddev->sync_thread);
 		md_reap_sync_thread(mddev);
 	}
 
@@ -9327,7 +9313,6 @@ void md_check_recovery(struct mddev *mddev)
 			 * ->spare_active and clear saved_raid_disk
 			 */
 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-			md_unregister_thread(&mddev->sync_thread);
 			md_reap_sync_thread(mddev);
 			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -9363,7 +9348,6 @@ void md_check_recovery(struct mddev *mddev)
 			goto unlock;
 		}
 		if (mddev->sync_thread) {
-			md_unregister_thread(&mddev->sync_thread);
 			md_reap_sync_thread(mddev);
 			goto unlock;
 		}
@@ -9443,7 +9427,8 @@ void md_reap_sync_thread(struct mddev *mddev)
 	sector_t old_dev_sectors = mddev->dev_sectors;
 	bool is_reshaped = false;
 
-	/* sync_thread should be unregistered, collect result */
+	/* resync has finished, collect result */
+	md_unregister_thread(&mddev->sync_thread);
 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
 	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
 	    mddev->degraded != mddev->raid_disks) {

From 64e5e09afc14f8cc9058b0ed5c9cc4c8cd126b85 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Mon, 29 May 2023 21:20:33 +0800
Subject: [PATCH 02/36] md: refactor action_store() for 'idle' and 'frozen'

Prepare to handle 'idle' and 'frozen' differently to fix a deadlock, there
are no functional changes except that MD_RECOVERY_RUNNING is checked
again after 'reconfig_mutex' is held.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230529132037.2124527-3-yukuai1@huaweicloud.com
---
 drivers/md/md.c | 61 ++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 45 insertions(+), 16 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index d445d5fb7a01..7fa91f0e5620 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4747,6 +4747,46 @@ action_show(struct mddev *mddev, char *page)
 	return sprintf(page, "%s\n", type);
 }
 
+static void stop_sync_thread(struct mddev *mddev)
+{
+	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+		return;
+
+	if (mddev_lock(mddev))
+		return;
+
+	/*
+	 * Check again in case MD_RECOVERY_RUNNING is cleared before lock is
+	 * held.
+	 */
+	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
+		mddev_unlock(mddev);
+		return;
+	}
+
+	if (work_pending(&mddev->del_work))
+		flush_workqueue(md_misc_wq);
+
+	if (mddev->sync_thread) {
+		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+		md_reap_sync_thread(mddev);
+	}
+
+	mddev_unlock(mddev);
+}
+
+static void idle_sync_thread(struct mddev *mddev)
+{
+	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+	stop_sync_thread(mddev);
+}
+
+static void frozen_sync_thread(struct mddev *mddev)
+{
+	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+	stop_sync_thread(mddev);
+}
+
 static ssize_t
 action_store(struct mddev *mddev, const char *page, size_t len)
 {
@@ -4754,22 +4794,11 @@ action_store(struct mddev *mddev, const char *page, size_t len)
 		return -EINVAL;
 
 
-	if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
-		if (cmd_match(page, "frozen"))
-			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
-		else
-			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
-		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
-		    mddev_lock(mddev) == 0) {
-			if (work_pending(&mddev->del_work))
-				flush_workqueue(md_misc_wq);
-			if (mddev->sync_thread) {
-				set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-				md_reap_sync_thread(mddev);
-			}
-			mddev_unlock(mddev);
-		}
-	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+	if (cmd_match(page, "idle"))
+		idle_sync_thread(mddev);
+	else if (cmd_match(page, "frozen"))
+		frozen_sync_thread(mddev);
+	else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 		return -EBUSY;
 	else if (cmd_match(page, "resync"))
 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);

From 6f56f0c4f1241f1694a6a9438dd4f78d4513a917 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Mon, 29 May 2023 21:20:34 +0800
Subject: [PATCH 03/36] md: add a mutex to synchronize idle and frozen in
 action_store()

Currently, for idle and frozen, action_store will hold 'reconfig_mutex'
and call md_reap_sync_thread() to stop sync thread, however, this will
cause deadlock (explained in the next patch). In order to fix the
problem, following patch will release 'reconfig_mutex' and wait on
'resync_wait', like md_set_readonly() and do_md_stop() does.

Consider that action_store() will set/clear 'MD_RECOVERY_FROZEN'
unconditionally, which might cause unexpected problems, for example,
frozen just set 'MD_RECOVERY_FROZEN' and is still in progress, while
'idle' clear 'MD_RECOVERY_FROZEN' and new sync thread is started, which
might starve in progress frozen. A mutex is added to synchronize idle
and frozen from action_store().

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230529132037.2124527-4-yukuai1@huaweicloud.com
---
 drivers/md/md.c | 5 +++++
 drivers/md/md.h | 3 +++
 2 files changed, 8 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7fa91f0e5620..3d7e87cab8ad 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -643,6 +643,7 @@ void mddev_init(struct mddev *mddev)
 {
 	mutex_init(&mddev->open_mutex);
 	mutex_init(&mddev->reconfig_mutex);
+	mutex_init(&mddev->sync_mutex);
 	mutex_init(&mddev->bitmap_info.mutex);
 	INIT_LIST_HEAD(&mddev->disks);
 	INIT_LIST_HEAD(&mddev->all_mddevs);
@@ -4777,14 +4778,18 @@ static void stop_sync_thread(struct mddev *mddev)
 
 static void idle_sync_thread(struct mddev *mddev)
 {
+	mutex_lock(&mddev->sync_mutex);
 	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 	stop_sync_thread(mddev);
+	mutex_unlock(&mddev->sync_mutex);
 }
 
 static void frozen_sync_thread(struct mddev *mddev)
 {
+	mutex_lock(&mddev->sync_mutex);
 	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 	stop_sync_thread(mddev);
+	mutex_unlock(&mddev->sync_mutex);
 }
 
 static ssize_t
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1aef86bf3fc3..18c168bf5fab 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -535,6 +535,9 @@ struct mddev {
 	 */
 	struct list_head		deleting;
 
+	/* Used to synchronize idle and frozen for action_store() */
+	struct mutex			sync_mutex;
+
 	bool	has_superblocks:1;
 	bool	fail_last_dev:1;
 	bool	serialize_policy:1;

From 130443d60b1b8c7a609a2af3384dd8e60df97181 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Mon, 29 May 2023 21:20:35 +0800
Subject: [PATCH 04/36] md: refactor idle/frozen_sync_thread() to fix deadlock

Our test found a following deadlock in raid10:

1) Issue a normal write, and such write failed:

  raid10_end_write_request
   set_bit(R10BIO_WriteError, &r10_bio->state)
   one_write_done
    reschedule_retry

  // later from md thread
  raid10d
   handle_write_completed
    list_add(&r10_bio->retry_list, &conf->bio_end_io_list)

  // later from md thread
  raid10d
   if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
    list_move(conf->bio_end_io_list.prev, &tmp)
    r10_bio = list_first_entry(&tmp, struct r10bio, retry_list)
    raid_end_bio_io(r10_bio)

Dependency chain 1: normal io is waiting for updating superblock

2) Trigger a recovery:

  raid10_sync_request
   raise_barrier

Dependency chain 2: sync thread is waiting for normal io

3) echo idle/frozen to sync_action:

  action_store
   mddev_lock
    md_unregister_thread
     kthread_stop

Dependency chain 3: drop 'reconfig_mutex' is waiting for sync thread

4) md thread can't update superblock:

  raid10d
   md_check_recovery
    if (mddev_trylock(mddev))
     md_update_sb

Dependency chain 4: update superblock is waiting for 'reconfig_mutex'

Hence cyclic dependency exist, in order to fix the problem, we must
break one of them. Dependency 1 and 2 can't be broken because they are
foundation design. Dependency 4 may be possible if it can be guaranteed
that no io can be inflight, however, this requires a new mechanism which
seems complex. Dependency 3 is a good choice, because idle/frozen only
requires sync thread to finish, which can be done asynchronously that is
already implemented, and 'reconfig_mutex' is not needed anymore.

This patch switch 'idle' and 'frozen' to wait sync thread to be done
asynchronously, and this patch also add a sequence counter to record how
many times sync thread is done, so that 'idle' won't keep waiting on new
started sync thread.

Noted that raid456 has similiar deadlock([1]), and it's verified[2] this
deadlock can be fixed by this patch as well.

[1] https://lore.kernel.org/linux-raid/5ed54ffc-ce82-bf66-4eff-390cb23bc1ac@molgen.mpg.de/T/#t
[2] https://lore.kernel.org/linux-raid/e9067438-d713-f5f3-0d3d-9e6b0e9efa0e@huaweicloud.com/

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230529132037.2124527-5-yukuai1@huaweicloud.com
---
 drivers/md/md.c | 23 +++++++++++++++++++----
 drivers/md/md.h |  2 ++
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3d7e87cab8ad..920701ab9505 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -651,6 +651,7 @@ void mddev_init(struct mddev *mddev)
 	timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
 	atomic_set(&mddev->active, 1);
 	atomic_set(&mddev->openers, 0);
+	atomic_set(&mddev->sync_seq, 0);
 	spin_lock_init(&mddev->lock);
 	atomic_set(&mddev->flush_pending, 0);
 	init_waitqueue_head(&mddev->sb_wait);
@@ -4768,19 +4769,27 @@ static void stop_sync_thread(struct mddev *mddev)
 	if (work_pending(&mddev->del_work))
 		flush_workqueue(md_misc_wq);
 
-	if (mddev->sync_thread) {
-		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-		md_reap_sync_thread(mddev);
-	}
+	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+	/*
+	 * Thread might be blocked waiting for metadata update which will now
+	 * never happen
+	 */
+	md_wakeup_thread_directly(mddev->sync_thread);
 
 	mddev_unlock(mddev);
 }
 
 static void idle_sync_thread(struct mddev *mddev)
 {
+	int sync_seq = atomic_read(&mddev->sync_seq);
+
 	mutex_lock(&mddev->sync_mutex);
 	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 	stop_sync_thread(mddev);
+
+	wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) ||
+			!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));
+
 	mutex_unlock(&mddev->sync_mutex);
 }
 
@@ -4789,6 +4798,10 @@ static void frozen_sync_thread(struct mddev *mddev)
 	mutex_lock(&mddev->sync_mutex);
 	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 	stop_sync_thread(mddev);
+
+	wait_event(resync_wait, mddev->sync_thread == NULL &&
+			!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));
+
 	mutex_unlock(&mddev->sync_mutex);
 }
 
@@ -9463,6 +9476,8 @@ void md_reap_sync_thread(struct mddev *mddev)
 
 	/* resync has finished, collect result */
 	md_unregister_thread(&mddev->sync_thread);
+	atomic_inc(&mddev->sync_seq);
+
 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
 	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
 	    mddev->degraded != mddev->raid_disks) {
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 18c168bf5fab..914e6ece9af2 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -537,6 +537,8 @@ struct mddev {
 
 	/* Used to synchronize idle and frozen for action_store() */
 	struct mutex			sync_mutex;
+	/* The sequence number for sync thread */
+	atomic_t sync_seq;
 
 	bool	has_superblocks:1;
 	bool	fail_last_dev:1;

From 753260ed0b46d2ba0d3d6f68a6a49187bff443e4 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Mon, 29 May 2023 21:20:36 +0800
Subject: [PATCH 05/36] md: wake up 'resync_wait' at last in
 md_reap_sync_thread()

md_reap_sync_thread() is just replaced with wait_event(resync_wait, ...)
from action_store(), just make sure action_store() will still wait for
everything to be done in md_reap_sync_thread().

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewd-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230529132037.2124527-6-yukuai1@huaweicloud.com
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 920701ab9505..e0d8e751a782 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -9522,7 +9522,6 @@ void md_reap_sync_thread(struct mddev *mddev)
 	if (mddev_is_clustered(mddev) && is_reshaped
 				      && !test_bit(MD_CLOSING, &mddev->flags))
 		md_cluster_ops->update_size(mddev, old_dev_sectors);
-	wake_up(&resync_wait);
 	/* flag recovery needed just to double check */
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	sysfs_notify_dirent_safe(mddev->sysfs_completed);
@@ -9530,6 +9529,7 @@ void md_reap_sync_thread(struct mddev *mddev)
 	md_new_event();
 	if (mddev->event_work.func)
 		queue_work(md_misc_wq, &mddev->event_work);
+	wake_up(&resync_wait);
 }
 EXPORT_SYMBOL(md_reap_sync_thread);
 

From f71209b1f21c838a973d858d9f6f76cd39227733 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Mon, 29 May 2023 21:20:37 +0800
Subject: [PATCH 06/36] md: enhance checking in md_check_recovery()

For md_check_recovery():

1) if 'MD_RECOVERY_RUNING' is not set, register new sync_thread.
2) if 'MD_RECOVERY_RUNING' is set:
 a) if 'MD_RECOVERY_DONE' is not set, don't do anything, wait for
   md_do_sync() to be done.
 b) if 'MD_RECOVERY_DONE' is set, unregister sync_thread. Current code
   expects that sync_thread is not NULL, otherwise new sync_thread will
   be registered, which will corrupt the array.

Make sure md_check_recovery() won't register new sync_thread if
'MD_RECOVERY_RUNING' is still set, and a new WARN_ON_ONCE() is added for
the above corruption,

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230529132037.2124527-7-yukuai1@huaweicloud.com
---
 drivers/md/md.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index e0d8e751a782..320d71537359 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -9388,16 +9388,24 @@ void md_check_recovery(struct mddev *mddev)
 		if (mddev->sb_flags)
 			md_update_sb(mddev, 0);
 
-		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
-		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
-			/* resync/recovery still happening */
-			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-			goto unlock;
-		}
-		if (mddev->sync_thread) {
+		/*
+		 * Never start a new sync thread if MD_RECOVERY_RUNNING is
+		 * still set.
+		 */
+		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
+			if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
+				/* resync/recovery still happening */
+				clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+				goto unlock;
+			}
+
+			if (WARN_ON_ONCE(!mddev->sync_thread))
+				goto unlock;
+
 			md_reap_sync_thread(mddev);
 			goto unlock;
 		}
+
 		/* Set RUNNING before clearing NEEDED to avoid
 		 * any transients in the value of "sync_action".
 		 */

From 59cefee75bda5d4cc14f4a1ca861b69091e22c3e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 15 Jun 2023 08:48:30 +0200
Subject: [PATCH 07/36] md-bitmap: set BITMAP_WRITE_ERROR in write_sb_page

Set BITMAP_WRITE_ERROR directly in write_sb_page instead of propagating
the error to the caller and setting it there.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230615064840.629492-2-hch@lst.de
---
 drivers/md/md-bitmap.c | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 1ff712889a3b..d8469720fac2 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -279,22 +279,20 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
 	return 0;
 }
 
-static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
+static void write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 {
-	struct md_rdev *rdev;
 	struct mddev *mddev = bitmap->mddev;
-	int ret;
 
 	do {
-		rdev = NULL;
+		struct md_rdev *rdev = NULL;
+
 		while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
-			ret = __write_sb_page(rdev, bitmap, page);
-			if (ret)
-				return ret;
+			if (__write_sb_page(rdev, bitmap, page) < 0) {
+				set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
+				return;
+			}
 		}
 	} while (wait && md_super_wait(mddev) < 0);
-
-	return 0;
 }
 
 static void md_bitmap_file_kick(struct bitmap *bitmap);
@@ -306,10 +304,7 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait)
 	struct buffer_head *bh;
 
 	if (bitmap->storage.file == NULL) {
-		switch (write_sb_page(bitmap, page, wait)) {
-		case -EINVAL:
-			set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
-		}
+		write_sb_page(bitmap, page, wait);
 	} else {
 
 		bh = page_buffers(page);

From 546ac0b2e2b15d0af7e6d10506558dded1d9d54a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 15 Jun 2023 08:48:31 +0200
Subject: [PATCH 08/36] md-bitmap: initialize variables at declaration time in
 md_bitmap_file_unmap

Just a small tidyup to prepare for bigger changes.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230615064840.629492-3-hch@lst.de
---
 drivers/md/md-bitmap.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index d8469720fac2..0b2d8933cbc7 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -842,14 +842,10 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store,
 
 static void md_bitmap_file_unmap(struct bitmap_storage *store)
 {
-	struct page **map, *sb_page;
-	int pages;
-	struct file *file;
-
-	file = store->file;
-	map = store->filemap;
-	pages = store->file_pages;
-	sb_page = store->sb_page;
+	struct file *file = store->file;
+	struct page *sb_page = store->sb_page;
+	struct page **map = store->filemap;
+	int pages = store->file_pages;
 
 	while (pages--)
 		if (map[pages] != sb_page) /* 0 is sb_page, release it below */

From 92348518f23f4fc81caa1a0f7f587566db67b52f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 15 Jun 2023 08:48:32 +0200
Subject: [PATCH 09/36] md-bitmap: use %pD to print the file name in
 md_bitmap_file_kick

Don't bother allocating an extra buffer in the I/O failure handler and
instead use the printk built-in format to print the last 4 path name
components.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230615064840.629492-4-hch@lst.de
---
 drivers/md/md-bitmap.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 0b2d8933cbc7..e4b466522d4e 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -870,21 +870,13 @@ static void md_bitmap_file_unmap(struct bitmap_storage *store)
  */
 static void md_bitmap_file_kick(struct bitmap *bitmap)
 {
-	char *path, *ptr = NULL;
-
 	if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) {
 		md_bitmap_update_sb(bitmap);
 
 		if (bitmap->storage.file) {
-			path = kmalloc(PAGE_SIZE, GFP_KERNEL);
-			if (path)
-				ptr = file_path(bitmap->storage.file,
-					     path, PAGE_SIZE);
+			pr_warn("%s: kicking failed bitmap file %pD4 from array!\n",
+				bmname(bitmap), bitmap->storage.file);
 
-			pr_warn("%s: kicking failed bitmap file %s from array!\n",
-				bmname(bitmap), IS_ERR(ptr) ? "" : ptr);
-
-			kfree(path);
 		} else
 			pr_warn("%s: disabling internal bitmap due to errors\n",
 				bmname(bitmap));

From 5339178e5303084da7655874d1aa69a0572c9b79 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 15 Jun 2023 08:48:33 +0200
Subject: [PATCH 10/36] md-bitmap: split file writes into a separate helper

Split the file write code out of write_page into a separate helper.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230615064840.629492-5-hch@lst.de
---
 drivers/md/md-bitmap.c | 48 +++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index e4b466522d4e..46fbcfc9d1fc 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -296,33 +296,22 @@ static void write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 }
 
 static void md_bitmap_file_kick(struct bitmap *bitmap);
-/*
- * write out a page to a file
- */
-static void write_page(struct bitmap *bitmap, struct page *page, int wait)
+
+static void write_file_page(struct bitmap *bitmap, struct page *page, int wait)
 {
-	struct buffer_head *bh;
+	struct buffer_head *bh = page_buffers(page);
 
-	if (bitmap->storage.file == NULL) {
-		write_sb_page(bitmap, page, wait);
-	} else {
-
-		bh = page_buffers(page);
-
-		while (bh && bh->b_blocknr) {
-			atomic_inc(&bitmap->pending_writes);
-			set_buffer_locked(bh);
-			set_buffer_mapped(bh);
-			submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
-			bh = bh->b_this_page;
-		}
-
-		if (wait)
-			wait_event(bitmap->write_wait,
-				   atomic_read(&bitmap->pending_writes)==0);
+	while (bh && bh->b_blocknr) {
+		atomic_inc(&bitmap->pending_writes);
+		set_buffer_locked(bh);
+		set_buffer_mapped(bh);
+		submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
+		bh = bh->b_this_page;
 	}
-	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
-		md_bitmap_file_kick(bitmap);
+
+	if (wait)
+		wait_event(bitmap->write_wait,
+			   atomic_read(&bitmap->pending_writes) == 0);
 }
 
 static void end_bitmap_write(struct buffer_head *bh, int uptodate)
@@ -429,6 +418,17 @@ static int read_page(struct file *file, unsigned long index,
  * bitmap file superblock operations
  */
 
+/*
+ * write out a page to a file
+ */
+static void write_page(struct bitmap *bitmap, struct page *page, int wait)
+{
+	if (bitmap->storage.file)
+		write_file_page(bitmap, page, wait);
+	else
+		write_sb_page(bitmap, page, wait);
+}
+
 /*
  * md_bitmap_wait_writes() should be called before writing any bitmap
  * blocks, to ensure previous writes, particularly from

From d681054c2f67cfc45042c2de25845b06bb89c148 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 15 Jun 2023 08:48:34 +0200
Subject: [PATCH 11/36] md-bitmap: rename read_page to read_file_page

Make the difference to read_sb_page clear.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230615064840.629492-6-hch@lst.de
---
 drivers/md/md-bitmap.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 46fbcfc9d1fc..fa0f6ca7b61b 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -348,10 +348,8 @@ static void free_buffers(struct page *page)
  * This usage is similar to how swap files are handled, and allows us
  * to write to a file with no concerns of memory allocation failing.
  */
-static int read_page(struct file *file, unsigned long index,
-		     struct bitmap *bitmap,
-		     unsigned long count,
-		     struct page *page)
+static int read_file_page(struct file *file, unsigned long index,
+		struct bitmap *bitmap, unsigned long count, struct page *page)
 {
 	int ret = 0;
 	struct inode *inode = file_inode(file);
@@ -632,7 +630,7 @@ static int md_bitmap_read_sb(struct bitmap *bitmap)
 		loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host);
 		int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize;
 
-		err = read_page(bitmap->storage.file, 0,
+		err = read_file_page(bitmap->storage.file, 0,
 				bitmap, bytes, sb_page);
 	} else {
 		err = read_sb_page(bitmap->mddev,
@@ -1141,7 +1139,7 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 				count = PAGE_SIZE;
 			page = store->filemap[index];
 			if (file)
-				ret = read_page(file, index, bitmap,
+				ret = read_file_page(file, index, bitmap,
 						count, page);
 			else
 				ret = read_sb_page(

From 844dc6691ad5f53a624f4b07bf84037abbb8fce2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 15 Jun 2023 08:48:35 +0200
Subject: [PATCH 12/36] md-bitmap: refactor md_bitmap_init_from_disk

Split the confusing loop in md_bitmap_init_from_disk that iterates over
all chunks but also needs to read and map the pages into three separate
loops: one that iterates over the pages to read them, a second optional
one to iterate over the pages to mark them invalid if the bitmaps are
out of date, and a final one that actually iterates over the chunks.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202306160552.smw0qbmb-lkp@intel.com/
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230615064840.629492-7-hch@lst.de
---
 drivers/md/md-bitmap.c | 141 ++++++++++++++++++++---------------------
 1 file changed, 70 insertions(+), 71 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index fa0f6ca7b61b..db5725beaefb 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1065,33 +1065,31 @@ void md_bitmap_unplug_async(struct bitmap *bitmap)
 EXPORT_SYMBOL(md_bitmap_unplug_async);
 
 static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed);
-/* * bitmap_init_from_disk -- called at bitmap_create time to initialize
- * the in-memory bitmap from the on-disk bitmap -- also, sets up the
- * memory mapping of the bitmap file
- * Special cases:
- *   if there's no bitmap file, or if the bitmap file had been
- *   previously kicked from the array, we mark all the bits as
- *   1's in order to cause a full resync.
+
+/*
+ * Initialize the in-memory bitmap from the on-disk bitmap and set up the memory
+ * mapping of the bitmap file.
+ *
+ * Special case: If there's no bitmap file, or if the bitmap file had been
+ * previously kicked from the array, we mark all the bits as 1's in order to
+ * cause a full resync.
  *
  * We ignore all bits for sectors that end earlier than 'start'.
- * This is used when reading an out-of-date bitmap...
+ * This is used when reading an out-of-date bitmap.
  */
 static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 {
-	unsigned long i, chunks, index, oldindex, bit, node_offset = 0;
-	struct page *page = NULL;
-	unsigned long bit_cnt = 0;
-	struct file *file;
-	unsigned long offset;
-	int outofdate;
-	int ret = -ENOSPC;
-	void *paddr;
+	bool outofdate = test_bit(BITMAP_STALE, &bitmap->flags);
+	struct mddev *mddev = bitmap->mddev;
+	unsigned long chunks = bitmap->counts.chunks;
 	struct bitmap_storage *store = &bitmap->storage;
+	struct file *file = store->file;
+	unsigned long node_offset = 0;
+	unsigned long bit_cnt = 0;
+	unsigned long i;
+	int ret;
 
-	chunks = bitmap->counts.chunks;
-	file = store->file;
-
-	if (!file && !bitmap->mddev->bitmap_info.offset) {
+	if (!file && !mddev->bitmap_info.offset) {
 		/* No permanent bitmap - fill with '1s'. */
 		store->filemap = NULL;
 		store->file_pages = 0;
@@ -1106,77 +1104,79 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 		return 0;
 	}
 
-	outofdate = test_bit(BITMAP_STALE, &bitmap->flags);
-	if (outofdate)
-		pr_warn("%s: bitmap file is out of date, doing full recovery\n", bmname(bitmap));
-
 	if (file && i_size_read(file->f_mapping->host) < store->bytes) {
 		pr_warn("%s: bitmap file too short %lu < %lu\n",
 			bmname(bitmap),
 			(unsigned long) i_size_read(file->f_mapping->host),
 			store->bytes);
+		ret = -ENOSPC;
 		goto err;
 	}
 
-	oldindex = ~0L;
-	offset = 0;
-	if (!bitmap->mddev->bitmap_info.external)
-		offset = sizeof(bitmap_super_t);
-
-	if (mddev_is_clustered(bitmap->mddev))
+	if (mddev_is_clustered(mddev))
 		node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE));
 
-	for (i = 0; i < chunks; i++) {
-		int b;
-		index = file_page_index(&bitmap->storage, i);
-		bit = file_page_offset(&bitmap->storage, i);
-		if (index != oldindex) { /* this is a new page, read it in */
-			int count;
-			/* unmap the old page, we're done with it */
-			if (index == store->file_pages-1)
-				count = store->bytes - index * PAGE_SIZE;
-			else
-				count = PAGE_SIZE;
-			page = store->filemap[index];
-			if (file)
-				ret = read_file_page(file, index, bitmap,
-						count, page);
-			else
-				ret = read_sb_page(
-					bitmap->mddev,
-					bitmap->mddev->bitmap_info.offset,
-					page,
-					index + node_offset, count);
+	for (i = 0; i < store->file_pages; i++) {
+		struct page *page = store->filemap[i];
+		int count;
 
-			if (ret)
-				goto err;
+		/* unmap the old page, we're done with it */
+		if (i == store->file_pages - 1)
+			count = store->bytes - i * PAGE_SIZE;
+		else
+			count = PAGE_SIZE;
 
-			oldindex = index;
+		if (file)
+			ret = read_file_page(file, i, bitmap, count, page);
+		else
+			ret = read_sb_page(mddev, mddev->bitmap_info.offset,
+					   page, i + node_offset, count);
+		if (ret)
+			goto err;
+	}
 
-			if (outofdate) {
-				/*
-				 * if bitmap is out of date, dirty the
-				 * whole page and write it out
-				 */
-				paddr = kmap_atomic(page);
-				memset(paddr + offset, 0xff,
-				       PAGE_SIZE - offset);
-				kunmap_atomic(paddr);
-				write_page(bitmap, page, 1);
+	if (outofdate) {
+		pr_warn("%s: bitmap file is out of date, doing full recovery\n",
+			bmname(bitmap));
 
+		for (i = 0; i < store->file_pages; i++) {
+			struct page *page = store->filemap[i];
+			unsigned long offset = 0;
+			void *paddr;
+
+			if (i == 0 && !mddev->bitmap_info.external)
+				offset = sizeof(bitmap_super_t);
+
+			/*
+			 * If the bitmap is out of date, dirty the whole page
+			 * and write it out
+			 */
+			paddr = kmap_atomic(page);
+			memset(paddr + offset, 0xff, PAGE_SIZE - offset);
+			kunmap_atomic(paddr);
+
+			write_page(bitmap, page, 1);
+			if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) {
 				ret = -EIO;
-				if (test_bit(BITMAP_WRITE_ERROR,
-					     &bitmap->flags))
-					goto err;
+				goto err;
 			}
 		}
+	}
+
+	for (i = 0; i < chunks; i++) {
+		struct page *page = filemap_get_page(&bitmap->storage, i);
+		unsigned long bit = file_page_offset(&bitmap->storage, i);
+		void *paddr;
+		bool was_set;
+
 		paddr = kmap_atomic(page);
 		if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
-			b = test_bit(bit, paddr);
+			was_set = test_bit(bit, paddr);
 		else
-			b = test_bit_le(bit, paddr);
+			was_set = test_bit_le(bit, paddr);
 		kunmap_atomic(paddr);
-		if (b) {
+
+		if (was_set) {
 			/* if the disk bit is set, set the memory bit */
 			int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift
 				      >= start);
@@ -1185,7 +1185,6 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 						  needed);
 			bit_cnt++;
 		}
-		offset = 0;
 	}
 
 	pr_debug("%s: bitmap initialized from disk: read %lu pages, set %lu of %lu bits\n",

From 0c3ea5cc8fbdc3515cfb0c47f5a284882f5e4d80 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 15 Jun 2023 08:48:36 +0200
Subject: [PATCH 13/36] md-bitmap: cleanup read_sb_page

Convert read_sb_page to the normal kernel coding style, calculate the
target sector only once, and add a local iosize variable to make the call
to sync_page_io more readable.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230615064840.629492-8-hch@lst.de
---
 drivers/md/md-bitmap.c | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index db5725beaefb..c6dd1fa5a0be 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -139,26 +139,25 @@ static void md_bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page
  */
 
 /* IO operations when bitmap is stored near all superblocks */
-static int read_sb_page(struct mddev *mddev, loff_t offset,
-			struct page *page,
-			unsigned long index, int size)
-{
-	/* choose a good rdev and read the page from there */
 
+/* choose a good rdev and read the page from there */
+static int read_sb_page(struct mddev *mddev, loff_t offset,
+		struct page *page, unsigned long index, int size)
+{
+
+	sector_t sector = offset + index * (PAGE_SIZE / SECTOR_SIZE);
 	struct md_rdev *rdev;
-	sector_t target;
 
 	rdev_for_each(rdev, mddev) {
-		if (! test_bit(In_sync, &rdev->flags)
-		    || test_bit(Faulty, &rdev->flags)
-		    || test_bit(Bitmap_sync, &rdev->flags))
+		u32 iosize = roundup(size, bdev_logical_block_size(rdev->bdev));
+
+		if (!test_bit(In_sync, &rdev->flags) ||
+		    test_bit(Faulty, &rdev->flags) ||
+		    test_bit(Bitmap_sync, &rdev->flags))
 			continue;
 
-		target = offset + index * (PAGE_SIZE/512);
-
-		if (sync_page_io(rdev, target,
-				 roundup(size, bdev_logical_block_size(rdev->bdev)),
-				 page, REQ_OP_READ, true)) {
+		if (sync_page_io(rdev, sector, iosize, page, REQ_OP_READ,
+				true)) {
 			page->index = index;
 			return 0;
 		}

From f5f2d5ac9f6e807e080311ec36bdf3d6c45b40d4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 15 Jun 2023 08:48:37 +0200
Subject: [PATCH 14/36] md-bitmap: account for mddev->bitmap_info.offset in
 read_sb_page

Diretly apply mddev->bitmap_info.offset to the sector number to read
instead of doing that in both callers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230615064840.629492-9-hch@lst.de
---
 drivers/md/md-bitmap.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index c6dd1fa5a0be..ae1c6f47b965 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -145,7 +145,8 @@ static int read_sb_page(struct mddev *mddev, loff_t offset,
 		struct page *page, unsigned long index, int size)
 {
 
-	sector_t sector = offset + index * (PAGE_SIZE / SECTOR_SIZE);
+	sector_t sector = mddev->bitmap_info.offset + offset +
+		index * (PAGE_SIZE / SECTOR_SIZE);
 	struct md_rdev *rdev;
 
 	rdev_for_each(rdev, mddev) {
@@ -593,7 +594,7 @@ static int md_bitmap_read_sb(struct bitmap *bitmap)
 	unsigned long sectors_reserved = 0;
 	int err = -EINVAL;
 	struct page *sb_page;
-	loff_t offset = bitmap->mddev->bitmap_info.offset;
+	loff_t offset = 0;
 
 	if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) {
 		chunksize = 128 * 1024 * 1024;
@@ -620,7 +621,7 @@ static int md_bitmap_read_sb(struct bitmap *bitmap)
 		bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t);
 		/* to 4k blocks */
 		bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
-		offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3));
+		offset = bitmap->cluster_slot * (bm_blocks << 3);
 		pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
 			bitmap->cluster_slot, offset);
 	}
@@ -632,10 +633,8 @@ static int md_bitmap_read_sb(struct bitmap *bitmap)
 		err = read_file_page(bitmap->storage.file, 0,
 				bitmap, bytes, sb_page);
 	} else {
-		err = read_sb_page(bitmap->mddev,
-				   offset,
-				   sb_page,
-				   0, sizeof(bitmap_super_t));
+		err = read_sb_page(bitmap->mddev, offset, sb_page, 0,
+				   sizeof(bitmap_super_t));
 	}
 	if (err)
 		return err;
@@ -1128,8 +1127,8 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 		if (file)
 			ret = read_file_page(file, i, bitmap, count, page);
 		else
-			ret = read_sb_page(mddev, mddev->bitmap_info.offset,
-					   page, i + node_offset, count);
+			ret = read_sb_page(mddev, 0, page, i + node_offset,
+					   count);
 		if (ret)
 			goto err;
 	}

From d7038f951828da19fa9aafddfa087b69032c9687 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 15 Jun 2023 08:48:38 +0200
Subject: [PATCH 15/36] md-bitmap: don't use ->index for pages backing the
 bitmap file

The md driver allocates pages for storing the bitmap file data, which
are not page cache pages, and then stores the page granularity file
offset in page->index, which is a field that isn't really valid except
for page cache pages.

Use a separate index for the superblock, and use the scheme used at
read size to recalculate the index for the bitmap pages instead.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230615064840.629492-10-hch@lst.de
---
 drivers/md/md-bitmap.c | 65 ++++++++++++++++++++++++------------------
 drivers/md/md-bitmap.h |  1 +
 2 files changed, 39 insertions(+), 27 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index ae1c6f47b965..a280bfd29f65 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -157,11 +157,8 @@ static int read_sb_page(struct mddev *mddev, loff_t offset,
 		    test_bit(Bitmap_sync, &rdev->flags))
 			continue;
 
-		if (sync_page_io(rdev, sector, iosize, page, REQ_OP_READ,
-				true)) {
-			page->index = index;
+		if (sync_page_io(rdev, sector, iosize, page, REQ_OP_READ, true))
 			return 0;
-		}
 	}
 	return -EIO;
 }
@@ -225,18 +222,19 @@ static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size,
 }
 
 static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
-			   struct page *page)
+			   unsigned long pg_index, struct page *page)
 {
 	struct block_device *bdev;
 	struct mddev *mddev = bitmap->mddev;
 	struct bitmap_storage *store = &bitmap->storage;
 	loff_t sboff, offset = mddev->bitmap_info.offset;
-	sector_t ps, doff;
+	sector_t ps = pg_index * PAGE_SIZE / SECTOR_SIZE;
 	unsigned int size = PAGE_SIZE;
 	unsigned int opt_size = PAGE_SIZE;
+	sector_t doff;
 
 	bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
-	if (page->index == store->file_pages - 1) {
+	if (pg_index == store->file_pages - 1) {
 		unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1);
 
 		if (last_page_size == 0)
@@ -245,7 +243,6 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
 		opt_size = optimal_io_size(bdev, last_page_size, size);
 	}
 
-	ps = page->index * PAGE_SIZE / SECTOR_SIZE;
 	sboff = rdev->sb_start + offset;
 	doff = rdev->data_offset;
 
@@ -279,7 +276,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
 	return 0;
 }
 
-static void write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
+static void write_sb_page(struct bitmap *bitmap, unsigned long pg_index,
+			  struct page *page, bool wait)
 {
 	struct mddev *mddev = bitmap->mddev;
 
@@ -287,7 +285,7 @@ static void write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 		struct md_rdev *rdev = NULL;
 
 		while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
-			if (__write_sb_page(rdev, bitmap, page) < 0) {
+			if (__write_sb_page(rdev, bitmap, pg_index, page) < 0) {
 				set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
 				return;
 			}
@@ -397,7 +395,6 @@ static int read_file_page(struct file *file, unsigned long index,
 		blk_cur++;
 		bh = bh->b_this_page;
 	}
-	page->index = index;
 
 	wait_event(bitmap->write_wait,
 		   atomic_read(&bitmap->pending_writes)==0);
@@ -419,12 +416,21 @@ static int read_file_page(struct file *file, unsigned long index,
 /*
  * write out a page to a file
  */
-static void write_page(struct bitmap *bitmap, struct page *page, int wait)
+static void filemap_write_page(struct bitmap *bitmap, unsigned long pg_index,
+			       bool wait)
 {
-	if (bitmap->storage.file)
+	struct bitmap_storage *store = &bitmap->storage;
+	struct page *page = store->filemap[pg_index];
+
+	if (mddev_is_clustered(bitmap->mddev)) {
+		pg_index += bitmap->cluster_slot *
+			DIV_ROUND_UP(store->bytes, PAGE_SIZE);
+	}
+
+	if (store->file)
 		write_file_page(bitmap, page, wait);
 	else
-		write_sb_page(bitmap, page, wait);
+		write_sb_page(bitmap, pg_index, page, wait);
 }
 
 /*
@@ -481,7 +487,12 @@ void md_bitmap_update_sb(struct bitmap *bitmap)
 	sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
 					   bitmap_info.space);
 	kunmap_atomic(sb);
-	write_page(bitmap, bitmap->storage.sb_page, 1);
+
+	if (bitmap->storage.file)
+		write_file_page(bitmap, bitmap->storage.sb_page, 1);
+	else
+		write_sb_page(bitmap, bitmap->storage.sb_index,
+			      bitmap->storage.sb_page, 1);
 }
 EXPORT_SYMBOL(md_bitmap_update_sb);
 
@@ -533,7 +544,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap)
 	bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 	if (bitmap->storage.sb_page == NULL)
 		return -ENOMEM;
-	bitmap->storage.sb_page->index = 0;
+	bitmap->storage.sb_index = 0;
 
 	sb = kmap_atomic(bitmap->storage.sb_page);
 
@@ -810,7 +821,7 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store,
 	if (store->sb_page) {
 		store->filemap[0] = store->sb_page;
 		pnum = 1;
-		store->sb_page->index = offset;
+		store->sb_index = offset;
 	}
 
 	for ( ; pnum < num_pages; pnum++) {
@@ -819,7 +830,6 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store,
 			store->file_pages = pnum;
 			return -ENOMEM;
 		}
-		store->filemap[pnum]->index = pnum + offset;
 	}
 	store->file_pages = pnum;
 
@@ -924,6 +934,7 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 	void *kaddr;
 	unsigned long chunk = block >> bitmap->counts.chunkshift;
 	struct bitmap_storage *store = &bitmap->storage;
+	unsigned long index = file_page_index(store, chunk);
 	unsigned long node_offset = 0;
 
 	if (mddev_is_clustered(bitmap->mddev))
@@ -941,9 +952,9 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 	else
 		set_bit_le(bit, kaddr);
 	kunmap_atomic(kaddr);
-	pr_debug("set file bit %lu page %lu\n", bit, page->index);
+	pr_debug("set file bit %lu page %lu\n", bit, index);
 	/* record page number so it gets flushed to disk when unplug occurs */
-	set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_DIRTY);
+	set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_DIRTY);
 }
 
 static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
@@ -953,6 +964,7 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
 	void *paddr;
 	unsigned long chunk = block >> bitmap->counts.chunkshift;
 	struct bitmap_storage *store = &bitmap->storage;
+	unsigned long index = file_page_index(store, chunk);
 	unsigned long node_offset = 0;
 
 	if (mddev_is_clustered(bitmap->mddev))
@@ -968,8 +980,8 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
 	else
 		clear_bit_le(bit, paddr);
 	kunmap_atomic(paddr);
-	if (!test_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_NEEDWRITE)) {
-		set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_PENDING);
+	if (!test_page_attr(bitmap, index - node_offset, BITMAP_PAGE_NEEDWRITE)) {
+		set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_PENDING);
 		bitmap->allclean = 0;
 	}
 }
@@ -1021,7 +1033,7 @@ void md_bitmap_unplug(struct bitmap *bitmap)
 							  "md bitmap_unplug");
 			}
 			clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
-			write_page(bitmap, bitmap->storage.filemap[i], 0);
+			filemap_write_page(bitmap, i, false);
 			writing = 1;
 		}
 	}
@@ -1153,7 +1165,7 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 			memset(paddr + offset, 0xff, PAGE_SIZE - offset);
 			kunmap_atomic(paddr);
 
-			write_page(bitmap, page, 1);
+			filemap_write_page(bitmap, i, true);
 			if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) {
 				ret = -EIO;
 				goto err;
@@ -1374,9 +1386,8 @@ void md_bitmap_daemon_work(struct mddev *mddev)
 			break;
 		if (bitmap->storage.filemap &&
 		    test_and_clear_page_attr(bitmap, j,
-					     BITMAP_PAGE_NEEDWRITE)) {
-			write_page(bitmap, bitmap->storage.filemap[j], 0);
-		}
+					     BITMAP_PAGE_NEEDWRITE))
+			filemap_write_page(bitmap, j, false);
 	}
 
  done:
diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
index 8a3788c9bfef..bb9eb418780a 100644
--- a/drivers/md/md-bitmap.h
+++ b/drivers/md/md-bitmap.h
@@ -201,6 +201,7 @@ struct bitmap {
 		struct file *file;		/* backing disk file */
 		struct page *sb_page;		/* cached copy of the bitmap
 						 * file superblock */
+		unsigned long sb_index;
 		struct page **filemap;		/* list of cache pages for
 						 * the file */
 		unsigned long *filemap_attr;	/* attributes associated

From a34d4ef82c3c4bd8bda817e9fb53ef37c5595ddd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 15 Jun 2023 08:48:39 +0200
Subject: [PATCH 16/36] md: make bitmap file support optional

The support for write intent bitmaps in files on an external files in md
is a hot mess that abuses ->bmap to map file offsets into physical device
objects, and also abuses buffer_heads in a creative way.

Make this code optional so that MD can be built into future kernels
without buffer_head support, and so that we can eventually deprecate it.

Note this does not affect the internal bitmap support, which has none of
the problems.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230615064840.629492-11-hch@lst.de
---
 drivers/md/Kconfig     | 10 ++++++++++
 drivers/md/md-bitmap.c | 15 +++++++++++++++
 drivers/md/md.c        |  7 +++++++
 3 files changed, 32 insertions(+)

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index b0a22e99bade..9712ab9bcba5 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -50,6 +50,16 @@ config MD_AUTODETECT
 
 	  If unsure, say Y.
 
+config MD_BITMAP_FILE
+	bool "MD bitmap file support"
+	default y
+	help
+	  If you say Y here, support for write intent bitmaps in files on an
+	  external file system is enabled.  This is an alternative to the internal
+	  bitmaps near the MD superblock, and very problematic code that abuses
+	  various kernel APIs and can only work with files on a file system not
+	  actually sitting on the MD device.
+
 config MD_LINEAR
 	tristate "Linear (append) mode (deprecated)"
 	depends on BLK_DEV_MD
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index a280bfd29f65..a58a4c30265e 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -295,6 +295,7 @@ static void write_sb_page(struct bitmap *bitmap, unsigned long pg_index,
 
 static void md_bitmap_file_kick(struct bitmap *bitmap);
 
+#ifdef CONFIG_MD_BITMAP_FILE
 static void write_file_page(struct bitmap *bitmap, struct page *page, int wait)
 {
 	struct buffer_head *bh = page_buffers(page);
@@ -408,6 +409,20 @@ static int read_file_page(struct file *file, unsigned long index,
 		       ret);
 	return ret;
 }
+#else /* CONFIG_MD_BITMAP_FILE */
+static void write_file_page(struct bitmap *bitmap, struct page *page, int wait)
+{
+}
+static int read_file_page(struct file *file, unsigned long index,
+		struct bitmap *bitmap, unsigned long count, struct page *page)
+{
+	return -EIO;
+}
+static void free_buffers(struct page *page)
+{
+	put_page(page);
+}
+#endif /* CONFIG_MD_BITMAP_FILE */
 
 /*
  * bitmap file superblock operations
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 320d71537359..f46996a95b0c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7043,6 +7043,13 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
 
 		if (mddev->bitmap || mddev->bitmap_info.file)
 			return -EEXIST; /* cannot add when bitmap is present */
+
+		if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) {
+			pr_warn("%s: bitmap files not supported by this kernel\n",
+				mdname(mddev));
+			return -EINVAL;
+		}
+
 		f = fget(fd);
 
 		if (f == NULL) {

From 0ae1c9d38426737c39085f919b9b27d2eab3802e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 15 Jun 2023 08:48:40 +0200
Subject: [PATCH 17/36] md: deprecate bitmap file support

The support for bitmaps on files is a very bad idea abusing various kernel
APIs, and fundamentally requires the file to not be on the actual array
without a way to check that this is actually the case.  Add a deprecation
warning to see if we might be able to eventually drop it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230615064840.629492-12-hch@lst.de
---
 drivers/md/Kconfig | 2 +-
 drivers/md/md.c    | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 9712ab9bcba5..444517d1a233 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -51,7 +51,7 @@ config MD_AUTODETECT
 	  If unsure, say Y.
 
 config MD_BITMAP_FILE
-	bool "MD bitmap file support"
+	bool "MD bitmap file support (deprecated)"
 	default y
 	help
 	  If you say Y here, support for write intent bitmaps in files on an
diff --git a/drivers/md/md.c b/drivers/md/md.c
index f46996a95b0c..f8774b1ef0aa 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7049,6 +7049,8 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
 				mdname(mddev));
 			return -EINVAL;
 		}
+		pr_warn("%s: using deprecated bitmap file support\n",
+			mdname(mddev));
 
 		f = fget(fd);
 

From c567c86b90d4715081adfe5eb812141a5b6b4883 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Thu, 22 Jun 2023 00:51:03 +0800
Subject: [PATCH 18/36] md: move initialization and destruction of
 'io_acct_set' to md.c

'io_acct_set' is only used for raid0 and raid456, prepare to use it for
raid1 and raid10, so that io accounting from different levels can be
consistent.

By the way, follow up patches will also use this io clone mechanism to
make sure 'active_io' represents in flight io, not io that is dispatching,
so that mddev_suspend will wait for io to be done as designed.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230621165110.1498313-2-yukuai1@huaweicloud.com
---
 drivers/md/md.c    | 27 ++++++++++-----------------
 drivers/md/md.h    |  2 --
 drivers/md/raid0.c | 16 ++--------------
 drivers/md/raid5.c | 41 +++++++++++------------------------------
 4 files changed, 23 insertions(+), 63 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index f8774b1ef0aa..1a0844250b9b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5876,6 +5876,13 @@ int md_run(struct mddev *mddev)
 			goto exit_bio_set;
 	}
 
+	if (!bioset_initialized(&mddev->io_acct_set)) {
+		err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
+				  offsetof(struct md_io_acct, bio_clone), 0);
+		if (err)
+			goto exit_sync_set;
+	}
+
 	spin_lock(&pers_lock);
 	pers = find_pers(mddev->level, mddev->clevel);
 	if (!pers || !try_module_get(pers->owner)) {
@@ -6053,6 +6060,8 @@ int md_run(struct mddev *mddev)
 	module_put(pers->owner);
 	md_bitmap_destroy(mddev);
 abort:
+	bioset_exit(&mddev->io_acct_set);
+exit_sync_set:
 	bioset_exit(&mddev->sync_set);
 exit_bio_set:
 	bioset_exit(&mddev->bio_set);
@@ -6276,6 +6285,7 @@ static void __md_stop(struct mddev *mddev)
 	percpu_ref_exit(&mddev->active_io);
 	bioset_exit(&mddev->bio_set);
 	bioset_exit(&mddev->sync_set);
+	bioset_exit(&mddev->io_acct_set);
 }
 
 void md_stop(struct mddev *mddev)
@@ -8641,23 +8651,6 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
 }
 EXPORT_SYMBOL_GPL(md_submit_discard_bio);
 
-int acct_bioset_init(struct mddev *mddev)
-{
-	int err = 0;
-
-	if (!bioset_initialized(&mddev->io_acct_set))
-		err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
-			offsetof(struct md_io_acct, bio_clone), 0);
-	return err;
-}
-EXPORT_SYMBOL_GPL(acct_bioset_init);
-
-void acct_bioset_exit(struct mddev *mddev)
-{
-	bioset_exit(&mddev->io_acct_set);
-}
-EXPORT_SYMBOL_GPL(acct_bioset_exit);
-
 static void md_end_io_acct(struct bio *bio)
 {
 	struct md_io_acct *md_io_acct = bio->bi_private;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 914e6ece9af2..4d771e5d3c71 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -774,8 +774,6 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
 extern void md_finish_reshape(struct mddev *mddev);
 void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
 			struct bio *bio, sector_t start, sector_t size);
-int acct_bioset_init(struct mddev *mddev);
-void acct_bioset_exit(struct mddev *mddev);
 void md_account_bio(struct mddev *mddev, struct bio **bio);
 
 extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index d1ac73fcd852..4106d943aae7 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -377,7 +377,6 @@ static void raid0_free(struct mddev *mddev, void *priv)
 	struct r0conf *conf = priv;
 
 	free_conf(mddev, conf);
-	acct_bioset_exit(mddev);
 }
 
 static int raid0_run(struct mddev *mddev)
@@ -392,16 +391,11 @@ static int raid0_run(struct mddev *mddev)
 	if (md_check_no_bitmap(mddev))
 		return -EINVAL;
 
-	if (acct_bioset_init(mddev)) {
-		pr_err("md/raid0:%s: alloc acct bioset failed.\n", mdname(mddev));
-		return -ENOMEM;
-	}
-
 	/* if private is not null, we are here after takeover */
 	if (mddev->private == NULL) {
 		ret = create_strip_zones(mddev, &conf);
 		if (ret < 0)
-			goto exit_acct_set;
+			return ret;
 		mddev->private = conf;
 	}
 	conf = mddev->private;
@@ -432,15 +426,9 @@ static int raid0_run(struct mddev *mddev)
 
 	ret = md_integrity_register(mddev);
 	if (ret)
-		goto free;
+		free_conf(mddev, conf);
 
 	return ret;
-
-free:
-	free_conf(mddev, conf);
-exit_acct_set:
-	acct_bioset_exit(mddev);
-	return ret;
 }
 
 /*
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 85b3004594e0..db3cec228237 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7787,19 +7787,12 @@ static int raid5_run(struct mddev *mddev)
 	struct md_rdev *rdev;
 	struct md_rdev *journal_dev = NULL;
 	sector_t reshape_offset = 0;
-	int i, ret = 0;
+	int i;
 	long long min_offset_diff = 0;
 	int first = 1;
 
-	if (acct_bioset_init(mddev)) {
-		pr_err("md/raid456:%s: alloc acct bioset failed.\n", mdname(mddev));
+	if (mddev_init_writes_pending(mddev) < 0)
 		return -ENOMEM;
-	}
-
-	if (mddev_init_writes_pending(mddev) < 0) {
-		ret = -ENOMEM;
-		goto exit_acct_set;
-	}
 
 	if (mddev->recovery_cp != MaxSector)
 		pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
@@ -7830,8 +7823,7 @@ static int raid5_run(struct mddev *mddev)
 	    (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
 		pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
 			  mdname(mddev));
-		ret = -EINVAL;
-		goto exit_acct_set;
+		return -EINVAL;
 	}
 
 	if (mddev->reshape_position != MaxSector) {
@@ -7856,15 +7848,13 @@ static int raid5_run(struct mddev *mddev)
 		if (journal_dev) {
 			pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
 				mdname(mddev));
-			ret = -EINVAL;
-			goto exit_acct_set;
+			return -EINVAL;
 		}
 
 		if (mddev->new_level != mddev->level) {
 			pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
 				mdname(mddev));
-			ret = -EINVAL;
-			goto exit_acct_set;
+			return -EINVAL;
 		}
 		old_disks = mddev->raid_disks - mddev->delta_disks;
 		/* reshape_position must be on a new-stripe boundary, and one
@@ -7880,8 +7870,7 @@ static int raid5_run(struct mddev *mddev)
 		if (sector_div(here_new, chunk_sectors * new_data_disks)) {
 			pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
 				mdname(mddev));
-			ret = -EINVAL;
-			goto exit_acct_set;
+			return -EINVAL;
 		}
 		reshape_offset = here_new * chunk_sectors;
 		/* here_new is the stripe we will write to */
@@ -7903,8 +7892,7 @@ static int raid5_run(struct mddev *mddev)
 			else if (mddev->ro == 0) {
 				pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
 					mdname(mddev));
-				ret = -EINVAL;
-				goto exit_acct_set;
+				return -EINVAL;
 			}
 		} else if (mddev->reshape_backwards
 		    ? (here_new * chunk_sectors + min_offset_diff <=
@@ -7914,8 +7902,7 @@ static int raid5_run(struct mddev *mddev)
 			/* Reading from the same stripe as writing to - bad */
 			pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
 				mdname(mddev));
-			ret = -EINVAL;
-			goto exit_acct_set;
+			return -EINVAL;
 		}
 		pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
 		/* OK, we should be able to continue; */
@@ -7939,10 +7926,8 @@ static int raid5_run(struct mddev *mddev)
 	else
 		conf = mddev->private;
 
-	if (IS_ERR(conf)) {
-		ret = PTR_ERR(conf);
-		goto exit_acct_set;
-	}
+	if (IS_ERR(conf))
+		return PTR_ERR(conf);
 
 	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
 		if (!journal_dev) {
@@ -8140,10 +8125,7 @@ static int raid5_run(struct mddev *mddev)
 	free_conf(conf);
 	mddev->private = NULL;
 	pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
-	ret = -EIO;
-exit_acct_set:
-	acct_bioset_exit(mddev);
-	return ret;
+	return -EIO;
 }
 
 static void raid5_free(struct mddev *mddev, void *priv)
@@ -8151,7 +8133,6 @@ static void raid5_free(struct mddev *mddev, void *priv)
 	struct r5conf *conf = priv;
 
 	free_conf(conf);
-	acct_bioset_exit(mddev);
 	mddev->to_remove = &raid5_attrs_group;
 }
 

From c687297b884507a4737b747957eda567063901df Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Thu, 22 Jun 2023 00:51:04 +0800
Subject: [PATCH 19/36] md: also clone new io if io accounting is disabled

Currently, 'active_io' is grabbed before make_reqeust() is called, and
it's dropped immediately make_reqeust() returns. Hence 'active_io'
actually means io is dispatching, not io is inflight.

For raid0 and raid456 that io accounting is enabled, 'active_io' will
also be grabbed when bio is cloned for io accounting, and this 'active_io'
is dropped until io is done.

Always clone new bio so that 'active_io' will mean that io is inflight,
raid1 and raid10 will switch to use this method in later patches.

Now that bio will be cloned even if io accounting is disabled, also
rename related structure from '*_acct_*' to '*_clone_*'.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230621165110.1498313-3-yukuai1@huaweicloud.com
---
 drivers/md/md.c    | 61 +++++++++++++++++++++++-----------------------
 drivers/md/md.h    |  4 +--
 drivers/md/raid5.c | 18 +++++++-------
 3 files changed, 41 insertions(+), 42 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1a0844250b9b..abb616720393 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2306,7 +2306,7 @@ int md_integrity_register(struct mddev *mddev)
 	pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
 	if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) ||
 	    (mddev->level != 1 && mddev->level != 10 &&
-	     bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE))) {
+	     bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) {
 		/*
 		 * No need to handle the failure of bioset_integrity_create,
 		 * because the function is called by md_run() -> pers->run(),
@@ -5876,9 +5876,9 @@ int md_run(struct mddev *mddev)
 			goto exit_bio_set;
 	}
 
-	if (!bioset_initialized(&mddev->io_acct_set)) {
-		err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
-				  offsetof(struct md_io_acct, bio_clone), 0);
+	if (!bioset_initialized(&mddev->io_clone_set)) {
+		err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
+				  offsetof(struct md_io_clone, bio_clone), 0);
 		if (err)
 			goto exit_sync_set;
 	}
@@ -6060,7 +6060,7 @@ int md_run(struct mddev *mddev)
 	module_put(pers->owner);
 	md_bitmap_destroy(mddev);
 abort:
-	bioset_exit(&mddev->io_acct_set);
+	bioset_exit(&mddev->io_clone_set);
 exit_sync_set:
 	bioset_exit(&mddev->sync_set);
 exit_bio_set:
@@ -6285,7 +6285,7 @@ static void __md_stop(struct mddev *mddev)
 	percpu_ref_exit(&mddev->active_io);
 	bioset_exit(&mddev->bio_set);
 	bioset_exit(&mddev->sync_set);
-	bioset_exit(&mddev->io_acct_set);
+	bioset_exit(&mddev->io_clone_set);
 }
 
 void md_stop(struct mddev *mddev)
@@ -8651,46 +8651,45 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
 }
 EXPORT_SYMBOL_GPL(md_submit_discard_bio);
 
-static void md_end_io_acct(struct bio *bio)
+static void md_end_clone_io(struct bio *bio)
 {
-	struct md_io_acct *md_io_acct = bio->bi_private;
-	struct bio *orig_bio = md_io_acct->orig_bio;
-	struct mddev *mddev = md_io_acct->mddev;
+	struct md_io_clone *md_io_clone = bio->bi_private;
+	struct bio *orig_bio = md_io_clone->orig_bio;
+	struct mddev *mddev = md_io_clone->mddev;
 
 	orig_bio->bi_status = bio->bi_status;
 
-	bio_end_io_acct(orig_bio, md_io_acct->start_time);
+	if (md_io_clone->start_time)
+		bio_end_io_acct(orig_bio, md_io_clone->start_time);
+
 	bio_put(bio);
 	bio_endio(orig_bio);
-
 	percpu_ref_put(&mddev->active_io);
 }
 
-/*
- * Used by personalities that don't already clone the bio and thus can't
- * easily add the timestamp to their extended bio structure.
- */
-void md_account_bio(struct mddev *mddev, struct bio **bio)
+static void md_clone_bio(struct mddev *mddev, struct bio **bio)
 {
 	struct block_device *bdev = (*bio)->bi_bdev;
-	struct md_io_acct *md_io_acct;
-	struct bio *clone;
+	struct md_io_clone *md_io_clone;
+	struct bio *clone =
+		bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set);
 
-	if (!blk_queue_io_stat(bdev->bd_disk->queue))
-		return;
+	md_io_clone = container_of(clone, struct md_io_clone, bio_clone);
+	md_io_clone->orig_bio = *bio;
+	md_io_clone->mddev = mddev;
+	if (blk_queue_io_stat(bdev->bd_disk->queue))
+		md_io_clone->start_time = bio_start_io_acct(*bio);
 
-	percpu_ref_get(&mddev->active_io);
-
-	clone = bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_acct_set);
-	md_io_acct = container_of(clone, struct md_io_acct, bio_clone);
-	md_io_acct->orig_bio = *bio;
-	md_io_acct->start_time = bio_start_io_acct(*bio);
-	md_io_acct->mddev = mddev;
-
-	clone->bi_end_io = md_end_io_acct;
-	clone->bi_private = md_io_acct;
+	clone->bi_end_io = md_end_clone_io;
+	clone->bi_private = md_io_clone;
 	*bio = clone;
 }
+
+void md_account_bio(struct mddev *mddev, struct bio **bio)
+{
+	percpu_ref_get(&mddev->active_io);
+	md_clone_bio(mddev, bio);
+}
 EXPORT_SYMBOL_GPL(md_account_bio);
 
 /* md_allow_write(mddev)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 4d771e5d3c71..8ae957480976 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -510,7 +510,7 @@ struct mddev {
 	struct bio_set			sync_set; /* for sync operations like
 						   * metadata and bitmap writes
 						   */
-	struct bio_set			io_acct_set; /* for raid0 and raid5 io accounting */
+	struct bio_set			io_clone_set;
 
 	/* Generic flush handling.
 	 * The last to finish preflush schedules a worker to submit
@@ -736,7 +736,7 @@ struct md_thread {
 	void			*private;
 };
 
-struct md_io_acct {
+struct md_io_clone {
 	struct mddev	*mddev;
 	struct bio	*orig_bio;
 	unsigned long	start_time;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index db3cec228237..1da9dd3e2f18 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5468,13 +5468,13 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf,
  */
 static void raid5_align_endio(struct bio *bi)
 {
-	struct md_io_acct *md_io_acct = bi->bi_private;
-	struct bio *raid_bi = md_io_acct->orig_bio;
+	struct md_io_clone *md_io_clone = bi->bi_private;
+	struct bio *raid_bi = md_io_clone->orig_bio;
 	struct mddev *mddev;
 	struct r5conf *conf;
 	struct md_rdev *rdev;
 	blk_status_t error = bi->bi_status;
-	unsigned long start_time = md_io_acct->start_time;
+	unsigned long start_time = md_io_clone->start_time;
 
 	bio_put(bi);
 
@@ -5506,7 +5506,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 	struct md_rdev *rdev;
 	sector_t sector, end_sector, first_bad;
 	int bad_sectors, dd_idx;
-	struct md_io_acct *md_io_acct;
+	struct md_io_clone *md_io_clone;
 	bool did_inc;
 
 	if (!in_chunk_boundary(mddev, raid_bio)) {
@@ -5544,15 +5544,15 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 	}
 
 	align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
-				    &mddev->io_acct_set);
-	md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone);
+				    &mddev->io_clone_set);
+	md_io_clone = container_of(align_bio, struct md_io_clone, bio_clone);
 	raid_bio->bi_next = (void *)rdev;
 	if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
-		md_io_acct->start_time = bio_start_io_acct(raid_bio);
-	md_io_acct->orig_bio = raid_bio;
+		md_io_clone->start_time = bio_start_io_acct(raid_bio);
+	md_io_clone->orig_bio = raid_bio;
 
 	align_bio->bi_end_io = raid5_align_endio;
-	align_bio->bi_private = md_io_acct;
+	align_bio->bi_private = md_io_clone;
 	align_bio->bi_iter.bi_sector = sector;
 
 	/* No reshape active, so we can trust rdev->data_offset */

From 05048cbccab79e9fb9030274170ccd710fe69474 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Thu, 22 Jun 2023 00:51:05 +0800
Subject: [PATCH 20/36] raid5: fix missing io accounting in raid5_align_endio()

Io will only be accounted as done from raid5_align_endio() if the io
succeeded, and io inflight counter will be leaked if such io failed.

Fix this problem by switching to use md_account_bio() for io accounting.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230621165110.1498313-4-yukuai1@huaweicloud.com
---
 drivers/md/raid5.c | 29 ++++++++---------------------
 1 file changed, 8 insertions(+), 21 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 1da9dd3e2f18..32a87193bad7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5468,26 +5468,17 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf,
  */
 static void raid5_align_endio(struct bio *bi)
 {
-	struct md_io_clone *md_io_clone = bi->bi_private;
-	struct bio *raid_bi = md_io_clone->orig_bio;
-	struct mddev *mddev;
-	struct r5conf *conf;
-	struct md_rdev *rdev;
+	struct bio *raid_bi = bi->bi_private;
+	struct md_rdev *rdev = (void *)raid_bi->bi_next;
+	struct mddev *mddev = rdev->mddev;
+	struct r5conf *conf = mddev->private;
 	blk_status_t error = bi->bi_status;
-	unsigned long start_time = md_io_clone->start_time;
 
 	bio_put(bi);
-
-	rdev = (void*)raid_bi->bi_next;
 	raid_bi->bi_next = NULL;
-	mddev = rdev->mddev;
-	conf = mddev->private;
-
 	rdev_dec_pending(rdev, conf->mddev);
 
 	if (!error) {
-		if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue))
-			bio_end_io_acct(raid_bi, start_time);
 		bio_endio(raid_bi);
 		if (atomic_dec_and_test(&conf->active_aligned_reads))
 			wake_up(&conf->wait_for_quiescent);
@@ -5506,7 +5497,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 	struct md_rdev *rdev;
 	sector_t sector, end_sector, first_bad;
 	int bad_sectors, dd_idx;
-	struct md_io_clone *md_io_clone;
 	bool did_inc;
 
 	if (!in_chunk_boundary(mddev, raid_bio)) {
@@ -5543,16 +5533,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 		return 0;
 	}
 
-	align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
-				    &mddev->io_clone_set);
-	md_io_clone = container_of(align_bio, struct md_io_clone, bio_clone);
+	md_account_bio(mddev, &raid_bio);
 	raid_bio->bi_next = (void *)rdev;
-	if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
-		md_io_clone->start_time = bio_start_io_acct(raid_bio);
-	md_io_clone->orig_bio = raid_bio;
 
+	align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
+				    &mddev->bio_set);
 	align_bio->bi_end_io = raid5_align_endio;
-	align_bio->bi_private = md_io_clone;
+	align_bio->bi_private = raid_bio;
 	align_bio->bi_iter.bi_sector = sector;
 
 	/* No reshape active, so we can trust rdev->data_offset */

From bb2a9acefaf9ce5bbc1e70f407e34599233d0243 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Thu, 22 Jun 2023 00:51:06 +0800
Subject: [PATCH 21/36] md/raid1: switch to use md_account_bio() for io
 accounting

Two problems can be fixed this way:

1) 'active_io' will represent inflight io instead of io that is
dispatching.

2) If io accounting is enabled or disabled while io is still inflight,
bio_start_io_acct() and bio_end_io_acct() is not balanced and io
inflight counter will be leaked.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230621165110.1498313-5-yukuai1@huaweicloud.com
---
 drivers/md/raid1.c | 14 ++++++--------
 drivers/md/raid1.h |  1 -
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index dd25832eb045..06fa1580501f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -304,8 +304,6 @@ static void call_bio_endio(struct r1bio *r1_bio)
 	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
 		bio->bi_status = BLK_STS_IOERR;
 
-	if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
-		bio_end_io_acct(bio, r1_bio->start_time);
 	bio_endio(bio);
 }
 
@@ -1303,10 +1301,10 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 	}
 
 	r1_bio->read_disk = rdisk;
-
-	if (!r1bio_existed && blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
-		r1_bio->start_time = bio_start_io_acct(bio);
-
+	if (!r1bio_existed) {
+		md_account_bio(mddev, &bio);
+		r1_bio->master_bio = bio;
+	}
 	read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp,
 				   &mddev->bio_set);
 
@@ -1500,8 +1498,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 		r1_bio->sectors = max_sectors;
 	}
 
-	if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
-		r1_bio->start_time = bio_start_io_acct(bio);
+	md_account_bio(mddev, &bio);
+	r1_bio->master_bio = bio;
 	atomic_set(&r1_bio->remaining, 1);
 	atomic_set(&r1_bio->behind_remaining, 0);
 
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 468f189da7a0..14d4211a123a 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -157,7 +157,6 @@ struct r1bio {
 	sector_t		sector;
 	int			sectors;
 	unsigned long		state;
-	unsigned long		start_time;
 	struct mddev		*mddev;
 	/*
 	 * original bio going to /dev/mdx

From 820455238366a78a44a85cc7d58a987e728464d9 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Thu, 22 Jun 2023 00:51:07 +0800
Subject: [PATCH 22/36] md/raid10: switch to use md_account_bio() for io
 accounting

Make sure that 'active_io' will represent inflight io instead of io that
is dispatching, and io accounting from all levels will be consistent.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230621165110.1498313-6-yukuai1@huaweicloud.com
---
 drivers/md/raid10.c | 20 +++++++++-----------
 drivers/md/raid10.h |  1 -
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 5051149e27bb..d42e9b7d2608 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -325,8 +325,6 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
 	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
 		bio->bi_status = BLK_STS_IOERR;
 
-	if (r10_bio->start_time)
-		bio_end_io_acct(bio, r10_bio->start_time);
 	bio_endio(bio);
 	/*
 	 * Wake up any possible resync thread that waits for the device
@@ -1172,7 +1170,7 @@ static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf,
 }
 
 static void raid10_read_request(struct mddev *mddev, struct bio *bio,
-				struct r10bio *r10_bio)
+				struct r10bio *r10_bio, bool io_accounting)
 {
 	struct r10conf *conf = mddev->private;
 	struct bio *read_bio;
@@ -1243,9 +1241,10 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 	}
 	slot = r10_bio->read_slot;
 
-	if (!r10_bio->start_time &&
-	    blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
-		r10_bio->start_time = bio_start_io_acct(bio);
+	if (io_accounting) {
+		md_account_bio(mddev, &bio);
+		r10_bio->master_bio = bio;
+	}
 	read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set);
 
 	r10_bio->devs[slot].bio = read_bio;
@@ -1543,8 +1542,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 		r10_bio->master_bio = bio;
 	}
 
-	if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
-		r10_bio->start_time = bio_start_io_acct(bio);
+	md_account_bio(mddev, &bio);
+	r10_bio->master_bio = bio;
 	atomic_set(&r10_bio->remaining, 1);
 	md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
 
@@ -1571,12 +1570,11 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
 	r10_bio->sector = bio->bi_iter.bi_sector;
 	r10_bio->state = 0;
 	r10_bio->read_slot = -1;
-	r10_bio->start_time = 0;
 	memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) *
 			conf->geo.raid_disks);
 
 	if (bio_data_dir(bio) == READ)
-		raid10_read_request(mddev, bio, r10_bio);
+		raid10_read_request(mddev, bio, r10_bio, true);
 	else
 		raid10_write_request(mddev, bio, r10_bio);
 }
@@ -2985,7 +2983,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 
 	rdev_dec_pending(rdev, mddev);
 	r10_bio->state = 0;
-	raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
+	raid10_read_request(mddev, r10_bio->master_bio, r10_bio, false);
 	/*
 	 * allow_barrier after re-submit to ensure no sync io
 	 * can be issued while regular io pending.
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 63e48b11b552..2e75e88d0802 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -123,7 +123,6 @@ struct r10bio {
 	sector_t		sector;	/* virtual sector number */
 	int			sectors;
 	unsigned long		state;
-	unsigned long		start_time;
 	struct mddev		*mddev;
 	/*
 	 * original bio going to /dev/mdx

From bdf2b52136dd19a55aaf5484cb254498c61383a5 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Thu, 22 Jun 2023 00:51:08 +0800
Subject: [PATCH 23/36] md/md-multipath: enable io accounting

use md_account_bio() to enable io accounting, also make sure
mddev_suspend() will wait for all io to be done.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230621165110.1498313-7-yukuai1@huaweicloud.com
---
 drivers/md/md-multipath.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
index 92c45be203d7..d22276870283 100644
--- a/drivers/md/md-multipath.c
+++ b/drivers/md/md-multipath.c
@@ -107,6 +107,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
 	    && md_flush_request(mddev, bio))
 		return true;
 
+	md_account_bio(mddev, &bio);
 	mp_bh = mempool_alloc(&conf->pool, GFP_NOIO);
 
 	mp_bh->master_bio = bio;

From 09f43cb530b03e4d58b35a39e54de658fc8d09b7 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Thu, 22 Jun 2023 00:51:09 +0800
Subject: [PATCH 24/36] md/md-linear: enable io accounting

use md_account_bio() to enable io accounting, also make sure
mddev_suspend() will wait for all io to be done.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230621165110.1498313-8-yukuai1@huaweicloud.com
---
 drivers/md/md-linear.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 4eb72b9dd933..71ac99646827 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -238,6 +238,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
 		bio = split;
 	}
 
+	md_account_bio(mddev, &bio);
 	bio_set_dev(bio, tmp_dev->rdev->bdev);
 	bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
 		start_sector + data_offset;

From dd9a68601409d905810a936a7c4e1241b604013f Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Thu, 22 Jun 2023 00:51:10 +0800
Subject: [PATCH 25/36] md/md-faulty: enable io accounting

use md_account_bio() to enable io accounting, also make sure
mddev_suspend() will wait for all io to be done.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230621165110.1498313-9-yukuai1@huaweicloud.com
---
 drivers/md/md-faulty.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c
index 50ad818978a4..a039e8e20f55 100644
--- a/drivers/md/md-faulty.c
+++ b/drivers/md/md-faulty.c
@@ -204,6 +204,8 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio)
 			failit = 1;
 		}
 	}
+
+	md_account_bio(mddev, &bio);
 	if (failit) {
 		struct bio *b = bio_alloc_clone(conf->rdev->bdev, bio, GFP_NOIO,
 						&mddev->bio_set);

From ffb1e7a03f966065323b18c96da23a2118a19529 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Tue, 27 Jun 2023 09:43:32 +0800
Subject: [PATCH 26/36] md/raid1: prioritize adding disk to 'removed' mirror

New disk should be added to "removed" position first instead of to be a
replacement. Commit 6090368abcb4 ("md/raid10: prioritize adding disk to
'removed' mirror") has fixed this issue for raid10. Fix it for raid1 now.

Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
Link: https://lore.kernel.org/r/20230627014332.3810102-1-linan666@huaweicloud.com
Signed-off-by: Song Liu <song@kernel.org>
---
 drivers/md/raid1.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 06fa1580501f..f834d99a36f6 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1764,7 +1764,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct r1conf *conf = mddev->private;
 	int err = -EEXIST;
-	int mirror = 0;
+	int mirror = 0, repl_slot = -1;
 	struct raid1_info *p;
 	int first = 0;
 	int last = conf->raid_disks - 1;
@@ -1807,17 +1807,21 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			break;
 		}
 		if (test_bit(WantReplacement, &p->rdev->flags) &&
-		    p[conf->raid_disks].rdev == NULL) {
-			/* Add this device as a replacement */
-			clear_bit(In_sync, &rdev->flags);
-			set_bit(Replacement, &rdev->flags);
-			rdev->raid_disk = mirror;
-			err = 0;
-			conf->fullsync = 1;
-			rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
-			break;
-		}
+		    p[conf->raid_disks].rdev == NULL && repl_slot < 0)
+			repl_slot = mirror;
 	}
+
+	if (err && repl_slot >= 0) {
+		/* Add this device as a replacement */
+		p = conf->mirrors + repl_slot;
+		clear_bit(In_sync, &rdev->flags);
+		set_bit(Replacement, &rdev->flags);
+		rdev->raid_disk = repl_slot;
+		err = 0;
+		conf->fullsync = 1;
+		rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
+	}
+
 	print_conf(conf);
 	return err;
 }

From 605eeda6e70f692311b36180f217208d367476f6 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 24 Jun 2023 01:32:34 +0800
Subject: [PATCH 27/36] md/raid10: optimize fix_read_error

We dereference r10_bio->read_slot too many times in fix_read_error().
Optimize it by using a variable to store read_slot.

Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
Link: https://lore.kernel.org/r/20230623173236.2513554-2-linan666@huaweicloud.com
Signed-off-by: Song Liu <song@kernel.org>
---
 drivers/md/raid10.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d42e9b7d2608..abea91a54db1 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2718,10 +2718,10 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
 static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
 {
 	int sect = 0; /* Offset from r10_bio->sector */
-	int sectors = r10_bio->sectors;
+	int sectors = r10_bio->sectors, slot = r10_bio->read_slot;
 	struct md_rdev *rdev;
 	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
-	int d = r10_bio->devs[r10_bio->read_slot].devnum;
+	int d = r10_bio->devs[slot].devnum;
 
 	/* still own a reference to this rdev, so it cannot
 	 * have been cleared recently.
@@ -2742,13 +2742,13 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 		pr_notice("md/raid10:%s: %pg: Failing raid device\n",
 			  mdname(mddev), rdev->bdev);
 		md_error(mddev, rdev);
-		r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
+		r10_bio->devs[slot].bio = IO_BLOCKED;
 		return;
 	}
 
 	while(sectors) {
 		int s = sectors;
-		int sl = r10_bio->read_slot;
+		int sl = slot;
 		int success = 0;
 		int start;
 
@@ -2783,7 +2783,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 			sl++;
 			if (sl == conf->copies)
 				sl = 0;
-		} while (!success && sl != r10_bio->read_slot);
+		} while (!success && sl != slot);
 		rcu_read_unlock();
 
 		if (!success) {
@@ -2791,16 +2791,16 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 			 * as bad on the first device to discourage future
 			 * reads.
 			 */
-			int dn = r10_bio->devs[r10_bio->read_slot].devnum;
+			int dn = r10_bio->devs[slot].devnum;
 			rdev = conf->mirrors[dn].rdev;
 
 			if (!rdev_set_badblocks(
 				    rdev,
-				    r10_bio->devs[r10_bio->read_slot].addr
+				    r10_bio->devs[slot].addr
 				    + sect,
 				    s, 0)) {
 				md_error(mddev, rdev);
-				r10_bio->devs[r10_bio->read_slot].bio
+				r10_bio->devs[slot].bio
 					= IO_BLOCKED;
 			}
 			break;
@@ -2809,7 +2809,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 		start = sl;
 		/* write it back and re-read */
 		rcu_read_lock();
-		while (sl != r10_bio->read_slot) {
+		while (sl != slot) {
 			if (sl==0)
 				sl = conf->copies;
 			sl--;
@@ -2843,7 +2843,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 			rcu_read_lock();
 		}
 		sl = start;
-		while (sl != r10_bio->read_slot) {
+		while (sl != slot) {
 			if (sl==0)
 				sl = conf->copies;
 			sl--;

From 02c67a3b72b13951c2ca134bd7065f03ec57946d Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 24 Jun 2023 01:32:35 +0800
Subject: [PATCH 28/36] md: remove redundant check in fix_read_error()

In fix_read_error(), 'success' will be checked immediately after assigning
it, if it is set to 1 then the loop will break. Checking it again in
condition of loop is redundant. Clean it up.

Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
Link: https://lore.kernel.org/r/20230623173236.2513554-3-linan666@huaweicloud.com
Signed-off-by: Song Liu <song@kernel.org>
---
 drivers/md/raid1.c  | 2 +-
 drivers/md/raid10.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f834d99a36f6..a68c9cccbf0d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2301,7 +2301,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
 			d++;
 			if (d == conf->raid_disks * 2)
 				d = 0;
-		} while (!success && d != read_disk);
+		} while (d != read_disk);
 
 		if (!success) {
 			/* Cannot read from anywhere - mark it bad */
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index abea91a54db1..757687fb90a7 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2783,7 +2783,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 			sl++;
 			if (sl == conf->copies)
 				sl = 0;
-		} while (!success && sl != slot);
+		} while (sl != slot);
 		rcu_read_unlock();
 
 		if (!success) {

From b39f35ebe86d88788d85f61e83c81c308cb76727 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Wed, 28 Jun 2023 09:29:30 +0800
Subject: [PATCH 29/36] md: don't quiesce in mddev_suspend()

Some levels doesn't implement "pers->quiesce", for example
raid0_quiesce() is empty, and now that all levels will drop 'active_io'
until io is done, wait for 'active_io' to be 0 is enough to make sure all
normal io is done, and percpu_ref_kill() for 'active_io' will make sure
no new normal io can be dispatched. There is no need to call
"pers->quiesce" anymore from mddev_suspend().

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Link: https://lore.kernel.org/r/20230628012931.88911-2-yukuai1@huaweicloud.com
Signed-off-by: Song Liu <song@kernel.org>
---
 drivers/md/md.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index abb616720393..962dacfd98cf 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -453,7 +453,6 @@ void mddev_suspend(struct mddev *mddev)
 		mddev->pers->prepare_suspend(mddev);
 
 	wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io));
-	mddev->pers->quiesce(mddev, 1);
 	clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
 	wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
 
@@ -472,7 +471,6 @@ void mddev_resume(struct mddev *mddev)
 		return;
 	percpu_ref_resurrect(&mddev->active_io);
 	wake_up(&mddev->sb_wait);
-	mddev->pers->quiesce(mddev, 0);
 
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);

From e24ed04389f9619e0aaef615a8948633c182a8b0 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Wed, 28 Jun 2023 09:29:31 +0800
Subject: [PATCH 30/36] md: restore 'noio_flag' for the last mddev_resume()

memalloc_noio_save() is called for the first mddev_suspend(), and
repeated mddev_suspend() only increase 'suspended'. However,
memalloc_noio_restore() is also called for the first mddev_resume(),
which means that memory reclaim will be enabled before the last
mddev_resume() is called, while the array is still suspended.

Fix this problem by restore 'noio_flag' for the last mddev_resume().

Fixes: 78f57ef9d50a ("md: use memalloc scope APIs in mddev_suspend()/mddev_resume()")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Link: https://lore.kernel.org/r/20230628012931.88911-3-yukuai1@huaweicloud.com
Signed-off-by: Song Liu <song@kernel.org>
---
 drivers/md/md.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 962dacfd98cf..a3d98273b295 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -464,11 +464,13 @@ EXPORT_SYMBOL_GPL(mddev_suspend);
 
 void mddev_resume(struct mddev *mddev)
 {
-	/* entred the memalloc scope from mddev_suspend() */
-	memalloc_noio_restore(mddev->noio_flag);
 	lockdep_assert_held(&mddev->reconfig_mutex);
 	if (--mddev->suspended)
 		return;
+
+	/* entred the memalloc scope from mddev_suspend() */
+	memalloc_noio_restore(mddev->noio_flag);
+
 	percpu_ref_resurrect(&mddev->active_io);
 	wake_up(&mddev->sb_wait);
 

From 21bd9a68fef47c4f0e951be9a6fac9745cee1bab Mon Sep 17 00:00:00 2001
From: Jack Wang <jinpu.wang@ionos.com>
Date: Wed, 5 Jul 2023 13:32:27 +0200
Subject: [PATCH 31/36] md/raid1: Avoid lock contention from wake_up()

wake_up is called unconditionally in a few paths such as make_request(),
which cause lock contention under high concurrency workload like below
    raid1_end_write_request
     wake_up
      __wake_up_common_lock
       spin_lock_irqsave

Improve performance by only call wake_up() if waitqueue is not empty

Fio test script:

[global]
name=random reads and writes
ioengine=libaio
direct=1
readwrite=randrw
rwmixread=70
iodepth=64
buffered=0
filename=/dev/md0
size=1G
runtime=30
time_based
randrepeat=0
norandommap
refill_buffers
ramp_time=10
bs=4k
numjobs=400
group_reporting=1
[job1]

Test result with 2 ramdisk in raid1 on a Intel Broadwell 56 cores server.

	Before this patch       With this patch
	READ	BW=4621MB/s	BW=7337MB/s
	WRITE	BW=1980MB/s	BW=3144MB/s

The patch is inspired by Yu Kuai's change for raid10:
https://lore.kernel.org/r/20230621105728.1268542-1-yukuai1@huaweicloud.com

Cc: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Jack Wang <jinpu.wang@ionos.com>
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
Link: https://lore.kernel.org/r/20230705113227.148494-1-jinpu.wang@ionos.com
Signed-off-by: Song Liu <song@kernel.org>
---
 drivers/md/raid1.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a68c9cccbf0d..23d211969565 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -789,11 +789,17 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 	return best_disk;
 }
 
+static void wake_up_barrier(struct r1conf *conf)
+{
+	if (wq_has_sleeper(&conf->wait_barrier))
+		wake_up(&conf->wait_barrier);
+}
+
 static void flush_bio_list(struct r1conf *conf, struct bio *bio)
 {
 	/* flush any pending bitmap writes to disk before proceeding w/ I/O */
 	raid1_prepare_flush_writes(conf->mddev->bitmap);
-	wake_up(&conf->wait_barrier);
+	wake_up_barrier(conf);
 
 	while (bio) { /* submit pending writes */
 		struct bio *next = bio->bi_next;
@@ -970,7 +976,7 @@ static bool _wait_barrier(struct r1conf *conf, int idx, bool nowait)
 	 * In case freeze_array() is waiting for
 	 * get_unqueued_pending() == extra
 	 */
-	wake_up(&conf->wait_barrier);
+	wake_up_barrier(conf);
 	/* Wait for the barrier in same barrier unit bucket to drop. */
 
 	/* Return false when nowait flag is set */
@@ -1013,7 +1019,7 @@ static bool wait_read_barrier(struct r1conf *conf, sector_t sector_nr, bool nowa
 	 * In case freeze_array() is waiting for
 	 * get_unqueued_pending() == extra
 	 */
-	wake_up(&conf->wait_barrier);
+	wake_up_barrier(conf);
 	/* Wait for array to be unfrozen */
 
 	/* Return false when nowait flag is set */
@@ -1042,7 +1048,7 @@ static bool wait_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
 static void _allow_barrier(struct r1conf *conf, int idx)
 {
 	atomic_dec(&conf->nr_pending[idx]);
-	wake_up(&conf->wait_barrier);
+	wake_up_barrier(conf);
 }
 
 static void allow_barrier(struct r1conf *conf, sector_t sector_nr)
@@ -1171,7 +1177,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
 		spin_lock_irq(&conf->device_lock);
 		bio_list_merge(&conf->pending_bio_list, &plug->pending);
 		spin_unlock_irq(&conf->device_lock);
-		wake_up(&conf->wait_barrier);
+		wake_up_barrier(conf);
 		md_wakeup_thread(mddev->thread);
 		kfree(plug);
 		return;
@@ -1574,7 +1580,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	r1_bio_write_done(r1_bio);
 
 	/* In case raid1d snuck in to freeze_array */
-	wake_up(&conf->wait_barrier);
+	wake_up_barrier(conf);
 }
 
 static bool raid1_make_request(struct mddev *mddev, struct bio *bio)

From 7e85c41b9e1df9192c225afd7cfec8dcad137feb Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 1 Jul 2023 16:05:27 +0800
Subject: [PATCH 32/36] md/raid10: check replacement and rdev to prevent submit
 the same io twice

After commit 4ca40c2ce099 ("md/raid10: Allow replacement device to be
replace old drive."), 'rdev' and 'replacement' could appear to be
identical. There are already checks for that in wait_blocked_dev() and
raid10_write_request(). Add check for raid10_handle_discard() now.

Signed-off-by: Li Nan <linan122@huawei.com>
Link: https://lore.kernel.org/r/20230701080529.2684932-2-linan666@huaweicloud.com
Signed-off-by: Song Liu <song@kernel.org>
---
 drivers/md/raid10.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 757687fb90a7..60963449d3f5 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1785,6 +1785,8 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 		r10_bio->devs[disk].bio = NULL;
 		r10_bio->devs[disk].repl_bio = NULL;
 
+		if (rdev == rrdev)
+			rrdev = NULL;
 		if (rdev && (test_bit(Faulty, &rdev->flags)))
 			rdev = NULL;
 		if (rrdev && (test_bit(Faulty, &rrdev->flags)))

From b99f8fd2d91eb734f13098aa1cf337edaca454b7 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 1 Jul 2023 16:05:28 +0800
Subject: [PATCH 33/36] md/raid10: factor out dereference_rdev_and_rrdev()

Factor out a helper to get 'rdev' and 'replacement' from config->mirrors.
Just to make code cleaner and prepare to fix the bug of io loss while
'replacement' replace 'rdev'.

There is no functional change.

Signed-off-by: Li Nan <linan122@huawei.com>
Link: https://lore.kernel.org/r/20230701080529.2684932-3-linan666@huaweicloud.com
Signed-off-by: Song Liu <song@kernel.org>
---
 drivers/md/raid10.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 60963449d3f5..d21aeb9546d9 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1321,6 +1321,25 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
 	}
 }
 
+static struct md_rdev *dereference_rdev_and_rrdev(struct raid10_info *mirror,
+						  struct md_rdev **prrdev)
+{
+	struct md_rdev *rdev, *rrdev;
+
+	rrdev = rcu_dereference(mirror->replacement);
+	/*
+	 * Read replacement first to prevent reading both rdev and
+	 * replacement as NULL during replacement replace rdev.
+	 */
+	smp_mb();
+	rdev = rcu_dereference(mirror->rdev);
+	if (rdev == rrdev)
+		rrdev = NULL;
+
+	*prrdev = rrdev;
+	return rdev;
+}
+
 static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
 {
 	int i;
@@ -1464,15 +1483,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 		int d = r10_bio->devs[i].devnum;
 		struct md_rdev *rdev, *rrdev;
 
-		rrdev = rcu_dereference(conf->mirrors[d].replacement);
-		/*
-		 * Read replacement first to prevent reading both rdev and
-		 * replacement as NULL during replacement replace rdev.
-		 */
-		smp_mb();
-		rdev = rcu_dereference(conf->mirrors[d].rdev);
-		if (rdev == rrdev)
-			rrdev = NULL;
+		rdev = dereference_rdev_and_rrdev(&conf->mirrors[d], &rrdev);
 		if (rdev && (test_bit(Faulty, &rdev->flags)))
 			rdev = NULL;
 		if (rrdev && (test_bit(Faulty, &rrdev->flags)))

From 673643490b9a0eb3b25633abe604f62b8f63dba1 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Sat, 1 Jul 2023 16:05:29 +0800
Subject: [PATCH 34/36] md/raid10: use dereference_rdev_and_rrdev() to get
 devices

Commit 2ae6aaf76912 ("md/raid10: fix io loss while replacement replace
rdev") reads replacement first to prevent io loss. However, there are same
issue in wait_blocked_dev() and raid10_handle_discard(), too. Fix it by
using dereference_rdev_and_rrdev() to get devices.

Fixes: d30588b2731f ("md/raid10: improve raid10 discard request")
Fixes: f2e7e269a752 ("md/raid10: pull the code that wait for blocked dev into one function")
Signed-off-by: Li Nan <linan122@huawei.com>
Link: https://lore.kernel.org/r/20230701080529.2684932-4-linan666@huaweicloud.com
Signed-off-by: Song Liu <song@kernel.org>
---
 drivers/md/raid10.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d21aeb9546d9..16aa9d735880 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1350,11 +1350,9 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
 	blocked_rdev = NULL;
 	rcu_read_lock();
 	for (i = 0; i < conf->copies; i++) {
-		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
-		struct md_rdev *rrdev = rcu_dereference(
-			conf->mirrors[i].replacement);
-		if (rdev == rrdev)
-			rrdev = NULL;
+		struct md_rdev *rdev, *rrdev;
+
+		rdev = dereference_rdev_and_rrdev(&conf->mirrors[i], &rrdev);
 		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
 			atomic_inc(&rdev->nr_pending);
 			blocked_rdev = rdev;
@@ -1789,15 +1787,12 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 	 */
 	rcu_read_lock();
 	for (disk = 0; disk < geo->raid_disks; disk++) {
-		struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
-		struct md_rdev *rrdev = rcu_dereference(
-			conf->mirrors[disk].replacement);
+		struct md_rdev *rdev, *rrdev;
 
+		rdev = dereference_rdev_and_rrdev(&conf->mirrors[disk], &rrdev);
 		r10_bio->devs[disk].bio = NULL;
 		r10_bio->devs[disk].repl_bio = NULL;
 
-		if (rdev == rrdev)
-			rrdev = NULL;
 		if (rdev && (test_bit(Faulty, &rdev->flags)))
 			rdev = NULL;
 		if (rrdev && (test_bit(Faulty, &rrdev->flags)))

From b4d129640f194ffc4cc64c3e97f98ae944c072e8 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Thu, 6 Jul 2023 16:37:26 +0800
Subject: [PATCH 35/36] md/md-bitmap: remove unnecessary local variable in
 backlog_store()

Local variable is definied first in the beginning of backlog_store(),
there is no need to define it again.

Fixes: 8c13ab115b57 ("md/bitmap: don't set max_write_behind if there is no write mostly device")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Link: https://lore.kernel.org/r/20230706083727.608914-2-yukuai1@huaweicloud.com
Signed-off-by: Song Liu <song@kernel.org>
---
 drivers/md/md-bitmap.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index a58a4c30265e..1f3339961159 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -2569,8 +2569,6 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
 			mddev_destroy_serial_pool(mddev, NULL, false);
 	} else if (backlog && !mddev->serial_info_pool) {
 		/* serial_info_pool is needed since backlog is not zero */
-		struct md_rdev *rdev;
-
 		rdev_for_each(rdev, mddev)
 			mddev_create_serial_pool(mddev, rdev, false);
 	}

From 44abfa6a95df425c0660d56043020b67e6d93ab8 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Thu, 6 Jul 2023 16:37:27 +0800
Subject: [PATCH 36/36] md/md-bitmap: hold 'reconfig_mutex' in backlog_store()

Several reasons why 'reconfig_mutex' should be held:

1) rdev_for_each() is not safe to be called without the lock, because
   rdev can be removed concurrently.
2) mddev_destroy_serial_pool() and mddev_create_serial_pool() should not
   be called concurrently.
3) mddev_suspend() from mddev_destroy/create_serial_pool() should be
   protected by the lock.

Fixes: 10c92fca636e ("md-bitmap: create and destroy wb_info_pool with the change of backlog")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Link: https://lore.kernel.org/r/20230706083727.608914-3-yukuai1@huaweicloud.com
Signed-off-by: Song Liu <song@kernel.org>
---
 drivers/md/md-bitmap.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 1f3339961159..6f9ff14971f9 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -2546,6 +2546,10 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
 	if (backlog > COUNTER_MAX)
 		return -EINVAL;
 
+	rv = mddev_lock(mddev);
+	if (rv)
+		return rv;
+
 	/*
 	 * Without write mostly device, it doesn't make sense to set
 	 * backlog for max_write_behind.
@@ -2559,6 +2563,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
 	if (!has_write_mostly) {
 		pr_warn_ratelimited("%s: can't set backlog, no write mostly device available\n",
 				    mdname(mddev));
+		mddev_unlock(mddev);
 		return -EINVAL;
 	}
 
@@ -2574,6 +2579,8 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
 	}
 	if (old_mwb != backlog)
 		md_bitmap_update_sb(mddev->bitmap);
+
+	mddev_unlock(mddev);
 	return len;
 }