From 85c8c3c1f8d9e31f626c93435dd91c2f85603e07 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Apr 2021 10:05:28 +0200 Subject: [PATCH 1/4] md: factor out a mddev_alloc_unit helper from mddev_find Split out a self-contained helper to find a free minor for the md "unit" number. Signed-off-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/md.c | 47 +++++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 3ce5f4e0f431..8ef06330fc66 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -745,6 +745,27 @@ static struct mddev *mddev_find_locked(dev_t unit) return NULL; } +/* find an unused unit number */ +static dev_t mddev_alloc_unit(void) +{ + static int next_minor = 512; + int start = next_minor; + bool is_free = 0; + dev_t dev = 0; + + while (!is_free) { + dev = MKDEV(MD_MAJOR, next_minor); + next_minor++; + if (next_minor > MINORMASK) + next_minor = 0; + if (next_minor == start) + return 0; /* Oh dear, all in use. */ + is_free = !mddev_find_locked(dev); + } + + return dev; +} + static struct mddev *mddev_find(dev_t unit) { struct mddev *mddev; @@ -787,27 +808,13 @@ static struct mddev *mddev_find_or_alloc(dev_t unit) return new; } } else if (new) { - /* find an unused unit number */ - static int next_minor = 512; - int start = next_minor; - int is_free = 0; - int dev = 0; - while (!is_free) { - dev = MKDEV(MD_MAJOR, next_minor); - next_minor++; - if (next_minor > MINORMASK) - next_minor = 0; - if (next_minor == start) { - /* Oh dear, all in use. 
*/ - spin_unlock(&all_mddevs_lock); - kfree(new); - return NULL; - } - - is_free = !mddev_find_locked(dev); + new->unit = mddev_alloc_unit(); + if (!new->unit) { + spin_unlock(&all_mddevs_lock); + kfree(new); + return NULL; } - new->unit = dev; - new->md_minor = MINOR(dev); + new->md_minor = MINOR(new->unit); new->hold_active = UNTIL_STOP; list_add(&new->all_mddevs, &all_mddevs); spin_unlock(&all_mddevs_lock); From d144fe6ff176d79efd411e520103a99e11874c36 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Apr 2021 10:05:29 +0200 Subject: [PATCH 2/4] md: refactor mddev_find_or_alloc Allocate the new mddev first speculatively, which greatly simplifies the code flow. Signed-off-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/md.c | 74 +++++++++++++++++++++---------------------------- 1 file changed, 31 insertions(+), 43 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 8ef06330fc66..de6f8e511c14 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -784,57 +784,45 @@ static struct mddev *mddev_find(dev_t unit) static struct mddev *mddev_find_or_alloc(dev_t unit) { - struct mddev *mddev, *new = NULL; + struct mddev *mddev = NULL, *new; if (unit && MAJOR(unit) != MD_MAJOR) - unit &= ~((1<<MdpMinorShift)-1); - - retry: - spin_lock(&all_mddevs_lock); - - if (unit) { - mddev = mddev_find_locked(unit); - if (mddev) { - mddev_get(mddev); - spin_unlock(&all_mddevs_lock); - kfree(new); - return mddev; - } - - if (new) { - list_add(&new->all_mddevs, &all_mddevs); - spin_unlock(&all_mddevs_lock); - new->hold_active = UNTIL_IOCTL; - return new; - } - } else if (new) { - new->unit = mddev_alloc_unit(); - if (!new->unit) { - spin_unlock(&all_mddevs_lock); - kfree(new); - return NULL; - } - new->md_minor = MINOR(new->unit); - new->hold_active = UNTIL_STOP; - list_add(&new->all_mddevs, &all_mddevs); - spin_unlock(&all_mddevs_lock); - return new; - } - spin_unlock(&all_mddevs_lock); + unit &= ~((1 << MdpMinorShift) - 1); new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return NULL; - - new->unit = unit; - if (MAJOR(unit) == MD_MAJOR) - new->md_minor = MINOR(unit); - else - new->md_minor = MINOR(unit) >> MdpMinorShift; - mddev_init(new); - goto retry; + spin_lock(&all_mddevs_lock); + 
if (unit) { + mddev = mddev_find_locked(unit); + if (mddev) { + mddev_get(mddev); + goto out_free_new; + } + + new->unit = unit; + if (MAJOR(unit) == MD_MAJOR) + new->md_minor = MINOR(unit); + else + new->md_minor = MINOR(unit) >> MdpMinorShift; + new->hold_active = UNTIL_IOCTL; + } else { + new->unit = mddev_alloc_unit(); + if (!new->unit) + goto out_free_new; + new->md_minor = MINOR(new->unit); + new->hold_active = UNTIL_STOP; + } + + list_add(&new->all_mddevs, &all_mddevs); + spin_unlock(&all_mddevs_lock); + return new; +out_free_new: + spin_unlock(&all_mddevs_lock); + kfree(new); + return mddev; } static struct attribute_group md_redundancy_group; From 0d809b3837a0bede8f58a67e303e339585777bf4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Apr 2021 10:05:30 +0200 Subject: [PATCH 3/4] md: do not return existing mddevs from mddev_find_or_alloc Instead of returning an existing mddev just for it to be discarded later, directly return -EEXIST. Rename the function to mddev_alloc now that it doesn't find an existing mddev. 
Signed-off-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/md.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index de6f8e511c14..af9bdb907b2b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -782,26 +782,24 @@ static struct mddev *mddev_find(dev_t unit) return mddev; } -static struct mddev *mddev_find_or_alloc(dev_t unit) +static struct mddev *mddev_alloc(dev_t unit) { - struct mddev *mddev = NULL, *new; + struct mddev *new; + int error; if (unit && MAJOR(unit) != MD_MAJOR) unit &= ~((1 << MdpMinorShift) - 1); new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) - return NULL; + return ERR_PTR(-ENOMEM); mddev_init(new); spin_lock(&all_mddevs_lock); if (unit) { - mddev = mddev_find_locked(unit); - if (mddev) { - mddev_get(mddev); + error = -EEXIST; + if (mddev_find_locked(unit)) goto out_free_new; - } - new->unit = unit; if (MAJOR(unit) == MD_MAJOR) new->md_minor = MINOR(unit); @@ -809,6 +807,7 @@ static struct mddev *mddev_find_or_alloc(dev_t unit) new->md_minor = MINOR(unit) >> MdpMinorShift; new->hold_active = UNTIL_IOCTL; } else { + error = -ENODEV; new->unit = mddev_alloc_unit(); if (!new->unit) goto out_free_new; @@ -822,7 +821,7 @@ static struct mddev *mddev_find_or_alloc(dev_t unit) out_free_new: spin_unlock(&all_mddevs_lock); kfree(new); - return mddev; + return ERR_PTR(error); } static struct attribute_group md_redundancy_group; @@ -5661,29 +5660,29 @@ static int md_alloc(dev_t dev, char *name) * writing to /sys/module/md_mod/parameters/new_array. */ static DEFINE_MUTEX(disks_mutex); - struct mddev *mddev = mddev_find_or_alloc(dev); + struct mddev *mddev; struct gendisk *disk; int partitioned; int shift; int unit; - int error; + int error ; - if (!mddev) - return -ENODEV; - - partitioned = (MAJOR(mddev->unit) != MD_MAJOR); - shift = partitioned ? 
MdpMinorShift : 0; - unit = MINOR(mddev->unit) >> shift; - - /* wait for any previous instance of this device to be - * completely removed (mddev_delayed_delete). + /* + * Wait for any previous instance of this device to be completely + * removed (mddev_delayed_delete). */ flush_workqueue(md_misc_wq); mutex_lock(&disks_mutex); - error = -EEXIST; - if (mddev->gendisk) - goto abort; + mddev = mddev_alloc(dev); + if (IS_ERR(mddev)) { + mutex_unlock(&disks_mutex); + return PTR_ERR(mddev); + } + + partitioned = (MAJOR(mddev->unit) != MD_MAJOR); + shift = partitioned ? MdpMinorShift : 0; + unit = MINOR(mddev->unit) >> shift; if (name && !dev) { /* Need to ensure that 'name' is not a duplicate. @@ -5695,6 +5694,7 @@ static int md_alloc(dev_t dev, char *name) if (mddev2->gendisk && strcmp(mddev2->gendisk->disk_name, name) == 0) { spin_unlock(&all_mddevs_lock); + error = -EEXIST; goto abort; } spin_unlock(&all_mddevs_lock); From 404a8ef512587b2460107d3272c17a89aef75edf Mon Sep 17 00:00:00 2001 From: Sudhakar Panneerselvam Date: Tue, 13 Apr 2021 04:08:29 +0000 Subject: [PATCH 4/4] md/bitmap: wait for external bitmap writes to complete during tear down NULL pointer dereference was observed in super_written() when it tries to access the mddev structure. [The below stack trace is from an older kernel, but the problem described in this patch applies to the mainline kernel.] 
[ 1194.474861] task: ffff8fdd20858000 task.stack: ffffb99d40790000 [ 1194.488000] RIP: 0010:super_written+0x29/0xe1 [ 1194.499688] RSP: 0018:ffff8ffb7fcc3c78 EFLAGS: 00010046 [ 1194.512477] RAX: 0000000000000000 RBX: ffff8ffb7bf4a000 RCX: ffff8ffb78991048 [ 1194.527325] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff8ffb56b8a200 [ 1194.542576] RBP: ffff8ffb7fcc3c90 R08: 000000000000000b R09: 0000000000000000 [ 1194.558001] R10: ffff8ffb56b8a298 R11: 0000000000000000 R12: ffff8ffb56b8a200 [ 1194.573070] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 [ 1194.588117] FS: 0000000000000000(0000) GS:ffff8ffb7fcc0000(0000) knlGS:0000000000000000 [ 1194.604264] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1194.617375] CR2: 00000000000002b8 CR3: 00000021e040a002 CR4: 00000000007606e0 [ 1194.632327] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1194.647865] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1194.663316] PKRU: 55555554 [ 1194.674090] Call Trace: [ 1194.683735] <IRQ> [ 1194.692948] bio_endio+0xae/0x135 [ 1194.703580] blk_update_request+0xad/0x2fa [ 1194.714990] blk_update_bidi_request+0x20/0x72 [ 1194.726578] __blk_end_bidi_request+0x2c/0x4d [ 1194.738373] __blk_end_request_all+0x31/0x49 [ 1194.749344] blk_flush_complete_seq+0x377/0x383 [ 1194.761550] flush_end_io+0x1dd/0x2a7 [ 1194.772910] blk_finish_request+0x9f/0x13c [ 1194.784544] scsi_end_request+0x180/0x25c [ 1194.796149] scsi_io_completion+0xc8/0x610 [ 1194.807503] scsi_finish_command+0xdc/0x125 [ 1194.818897] scsi_softirq_done+0x81/0xde [ 1194.830062] blk_done_softirq+0xa4/0xcc [ 1194.841008] __do_softirq+0xd9/0x29f [ 1194.851257] irq_exit+0xe6/0xeb [ 1194.861290] do_IRQ+0x59/0xe3 [ 1194.871060] common_interrupt+0x1c6/0x382 [ 1194.881988] </IRQ> [ 1194.890646] RIP: 0010:cpuidle_enter_state+0xdd/0x2a5 [ 1194.902532] RSP: 0018:ffffb99d40793e68 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff43 [ 1194.917317] RAX: ffff8ffb7fce27c0 RBX: ffff8ffb7fced800 RCX: 
000000000000001f [ 1194.932056] RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000000 [ 1194.946428] RBP: ffffb99d40793ea0 R08: 0000000000000004 R09: 0000000000002ed2 [ 1194.960508] R10: 0000000000002664 R11: 0000000000000018 R12: 0000000000000003 [ 1194.974454] R13: 000000000000000b R14: ffffffff925715a0 R15: 0000011610120d5a [ 1194.988607] ? cpuidle_enter_state+0xcc/0x2a5 [ 1194.999077] cpuidle_enter+0x17/0x19 [ 1195.008395] call_cpuidle+0x23/0x3a [ 1195.017718] do_idle+0x172/0x1d5 [ 1195.026358] cpu_startup_entry+0x73/0x75 [ 1195.035769] start_secondary+0x1b9/0x20b [ 1195.044894] secondary_startup_64+0xa5/0xa5 [ 1195.084921] RIP: super_written+0x29/0xe1 RSP: ffff8ffb7fcc3c78 [ 1195.096354] CR2: 00000000000002b8 The bio in the above stack is a bitmap write whose completion is invoked after the tear down sequence sets the mddev pointer in the rdev to NULL. During tear down, there is an attempt to flush the bitmap writes, but for external bitmaps, there is no explicit wait for all the bitmap writes to complete. For instance, md_bitmap_flush() is called to flush the bitmap writes, but the last call to md_bitmap_daemon_work() in md_bitmap_flush() could generate new bitmap writes whose completion is never explicitly waited for. The call to md_bitmap_update_sb() simply returns for external bitmaps, and the follow-up call to md_update_sb() is conditional and may not get called for external bitmaps. This results in a kernel panic when the completion routine, super_written(), is called and tries to dereference the mddev pointer in the rdev, which has been set to NULL (in unbind_rdev_from_array(), by the tear down sequence). The solution is to call md_super_wait() for external bitmaps after the last call to md_bitmap_daemon_work() in md_bitmap_flush() to ensure there are no pending bitmap writes before proceeding with the tear down. 
Cc: stable@vger.kernel.org Signed-off-by: Sudhakar Panneerselvam Reviewed-by: Zhao Heming Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 200c5d0f08bf..ea3130e11680 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1722,6 +1722,8 @@ void md_bitmap_flush(struct mddev *mddev) md_bitmap_daemon_work(mddev); bitmap->daemon_lastrun -= sleep; md_bitmap_daemon_work(mddev); + if (mddev->bitmap_info.external) + md_super_wait(mddev); md_bitmap_update_sb(bitmap); }