mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-05-16 09:02:21 -04:00
Merge tag 'md-7.1-20260407' of git://git.kernel.org/pub/scm/linux/kernel/git/mdraid/linux into for-7.1/block
Pull MD changes from Yu Kuai:

"Bug Fixes:
 - avoid a sysfs deadlock when clearing array state (Yu Kuai)
 - validate raid5 journal payloads before reading metadata (Junrui Luo)
 - fall back to the correct bitmap operations after version mismatches (Yu Kuai)
 - serialize overlapping writes on writemostly raid1 disks (Xiao Ni)
 - wake raid456 reshape waiters before suspend (Yu Kuai)
 - prevent retry_aligned_read() from triggering soft lockups (Chia-Ming Chang)

Improvements:
 - switch raid0 strip zone and devlist allocations to kvmalloc helpers (Gregory Price)
 - track clean unwritten stripes for proactive RAID5 parity building (Yu Kuai)
 - speed up initial llbitmap sync with write_zeroes_unmap support (Yu Kuai)

Cleanups:
 - remove the unused static md workqueue definition (Abd-Alrhman Masalkhi)"

* tag 'md-7.1-20260407' of git://git.kernel.org/pub/scm/linux/kernel/git/mdraid/linux:
  md/raid5: fix soft lockup in retry_aligned_read()
  md: wake raid456 reshape waiters before suspend
  md/raid1: serialize overlap io for writemostly disk
  md/md-llbitmap: optimize initial sync with write_zeroes_unmap support
  md/md-llbitmap: add CleanUnwritten state for RAID-5 proactive parity building
  md: add fallback to correct bitmap_ops on version mismatch
  md/raid5: validate payload size before accessing journal metadata
  md: remove unused static md_wq workqueue
  md/raid0: use kvzalloc/kvfree for strip_zone and devlist allocations
  md: fix array_state=clear sysfs deadlock
This commit is contained in:
@@ -208,6 +208,20 @@ enum llbitmap_state {
|
||||
BitNeedSync,
|
||||
/* data is synchronizing */
|
||||
BitSyncing,
|
||||
/*
|
||||
* Proactive sync requested for unwritten region (raid456 only).
|
||||
* Triggered via sysfs when user wants to pre-build XOR parity
|
||||
* for regions that have never been written.
|
||||
*/
|
||||
BitNeedSyncUnwritten,
|
||||
/* Proactive sync in progress for unwritten region */
|
||||
BitSyncingUnwritten,
|
||||
/*
|
||||
* XOR parity has been pre-built for a region that has never had
|
||||
* user data written. When user writes to this region, it transitions
|
||||
* to BitDirty.
|
||||
*/
|
||||
BitCleanUnwritten,
|
||||
BitStateCount,
|
||||
BitNone = 0xff,
|
||||
};
|
||||
@@ -232,6 +246,12 @@ enum llbitmap_action {
|
||||
* BitNeedSync.
|
||||
*/
|
||||
BitmapActionStale,
|
||||
/*
|
||||
* Proactive sync trigger for raid456 - builds XOR parity for
|
||||
* Unwritten regions without requiring user data write first.
|
||||
*/
|
||||
BitmapActionProactiveSync,
|
||||
BitmapActionClearUnwritten,
|
||||
BitmapActionCount,
|
||||
/* Init state is BitUnwritten */
|
||||
BitmapActionInit,
|
||||
@@ -304,6 +324,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
|
||||
[BitmapActionDaemon] = BitNone,
|
||||
[BitmapActionDiscard] = BitNone,
|
||||
[BitmapActionStale] = BitNone,
|
||||
[BitmapActionProactiveSync] = BitNeedSyncUnwritten,
|
||||
[BitmapActionClearUnwritten] = BitNone,
|
||||
},
|
||||
[BitClean] = {
|
||||
[BitmapActionStartwrite] = BitDirty,
|
||||
@@ -314,6 +336,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
|
||||
[BitmapActionDaemon] = BitNone,
|
||||
[BitmapActionDiscard] = BitUnwritten,
|
||||
[BitmapActionStale] = BitNeedSync,
|
||||
[BitmapActionProactiveSync] = BitNone,
|
||||
[BitmapActionClearUnwritten] = BitNone,
|
||||
},
|
||||
[BitDirty] = {
|
||||
[BitmapActionStartwrite] = BitNone,
|
||||
@@ -324,6 +348,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
|
||||
[BitmapActionDaemon] = BitClean,
|
||||
[BitmapActionDiscard] = BitUnwritten,
|
||||
[BitmapActionStale] = BitNeedSync,
|
||||
[BitmapActionProactiveSync] = BitNone,
|
||||
[BitmapActionClearUnwritten] = BitNone,
|
||||
},
|
||||
[BitNeedSync] = {
|
||||
[BitmapActionStartwrite] = BitNone,
|
||||
@@ -334,6 +360,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
|
||||
[BitmapActionDaemon] = BitNone,
|
||||
[BitmapActionDiscard] = BitUnwritten,
|
||||
[BitmapActionStale] = BitNone,
|
||||
[BitmapActionProactiveSync] = BitNone,
|
||||
[BitmapActionClearUnwritten] = BitNone,
|
||||
},
|
||||
[BitSyncing] = {
|
||||
[BitmapActionStartwrite] = BitNone,
|
||||
@@ -344,6 +372,44 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
|
||||
[BitmapActionDaemon] = BitNone,
|
||||
[BitmapActionDiscard] = BitUnwritten,
|
||||
[BitmapActionStale] = BitNeedSync,
|
||||
[BitmapActionProactiveSync] = BitNone,
|
||||
[BitmapActionClearUnwritten] = BitNone,
|
||||
},
|
||||
[BitNeedSyncUnwritten] = {
|
||||
[BitmapActionStartwrite] = BitNeedSync,
|
||||
[BitmapActionStartsync] = BitSyncingUnwritten,
|
||||
[BitmapActionEndsync] = BitNone,
|
||||
[BitmapActionAbortsync] = BitUnwritten,
|
||||
[BitmapActionReload] = BitUnwritten,
|
||||
[BitmapActionDaemon] = BitNone,
|
||||
[BitmapActionDiscard] = BitUnwritten,
|
||||
[BitmapActionStale] = BitUnwritten,
|
||||
[BitmapActionProactiveSync] = BitNone,
|
||||
[BitmapActionClearUnwritten] = BitUnwritten,
|
||||
},
|
||||
[BitSyncingUnwritten] = {
|
||||
[BitmapActionStartwrite] = BitSyncing,
|
||||
[BitmapActionStartsync] = BitSyncingUnwritten,
|
||||
[BitmapActionEndsync] = BitCleanUnwritten,
|
||||
[BitmapActionAbortsync] = BitUnwritten,
|
||||
[BitmapActionReload] = BitUnwritten,
|
||||
[BitmapActionDaemon] = BitNone,
|
||||
[BitmapActionDiscard] = BitUnwritten,
|
||||
[BitmapActionStale] = BitUnwritten,
|
||||
[BitmapActionProactiveSync] = BitNone,
|
||||
[BitmapActionClearUnwritten] = BitUnwritten,
|
||||
},
|
||||
[BitCleanUnwritten] = {
|
||||
[BitmapActionStartwrite] = BitDirty,
|
||||
[BitmapActionStartsync] = BitNone,
|
||||
[BitmapActionEndsync] = BitNone,
|
||||
[BitmapActionAbortsync] = BitNone,
|
||||
[BitmapActionReload] = BitNone,
|
||||
[BitmapActionDaemon] = BitNone,
|
||||
[BitmapActionDiscard] = BitUnwritten,
|
||||
[BitmapActionStale] = BitUnwritten,
|
||||
[BitmapActionProactiveSync] = BitNone,
|
||||
[BitmapActionClearUnwritten] = BitUnwritten,
|
||||
},
|
||||
};
|
||||
|
||||
@@ -376,6 +442,7 @@ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
|
||||
pctl->state[pos] = level_456 ? BitNeedSync : BitDirty;
|
||||
break;
|
||||
case BitClean:
|
||||
case BitCleanUnwritten:
|
||||
pctl->state[pos] = BitDirty;
|
||||
break;
|
||||
}
|
||||
@@ -383,7 +450,7 @@ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
|
||||
}
|
||||
|
||||
static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
|
||||
int offset)
|
||||
int offset, bool infect)
|
||||
{
|
||||
struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
|
||||
unsigned int io_size = llbitmap->io_size;
|
||||
@@ -398,7 +465,7 @@ static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
|
||||
* resync all the dirty bits, hence skip infect new dirty bits to
|
||||
* prevent resync unnecessary data.
|
||||
*/
|
||||
if (llbitmap->mddev->degraded) {
|
||||
if (llbitmap->mddev->degraded || !infect) {
|
||||
set_bit(block, pctl->dirty);
|
||||
return;
|
||||
}
|
||||
@@ -438,7 +505,9 @@ static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state,
|
||||
|
||||
llbitmap->pctl[idx]->state[bit] = state;
|
||||
if (state == BitDirty || state == BitNeedSync)
|
||||
llbitmap_set_page_dirty(llbitmap, idx, bit);
|
||||
llbitmap_set_page_dirty(llbitmap, idx, bit, true);
|
||||
else if (state == BitNeedSyncUnwritten)
|
||||
llbitmap_set_page_dirty(llbitmap, idx, bit, false);
|
||||
}
|
||||
|
||||
static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx)
|
||||
@@ -585,13 +654,73 @@ static int llbitmap_cache_pages(struct llbitmap *llbitmap)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if all underlying disks support write_zeroes with unmap.
|
||||
*
* Members with raid_disk < 0 or with the Faulty flag set are skipped.
* Returns false as soon as one remaining member reports 0 from
* bdev_write_zeroes_unmap_sectors(); returns true otherwise, which also
* covers the case where no member passed the filter.
*/
|
||||
static bool llbitmap_all_disks_support_wzeroes_unmap(struct llbitmap *llbitmap)
|
||||
{
|
||||
struct mddev *mddev = llbitmap->mddev;
|
||||
struct md_rdev *rdev;
|
||||
|
||||
rdev_for_each(rdev, mddev) {
|
||||
if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
|
||||
continue;
|
||||
|
||||
/* A limit of 0 sectors is treated as "not supported". */
|
||||
if (bdev_write_zeroes_unmap_sectors(rdev->bdev) == 0)
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Issue write_zeroes to all underlying disks to zero their data regions.
|
||||
* This ensures parity consistency for RAID-456 (0 XOR 0 = 0).
|
||||
* Returns true if all disks were successfully zeroed.
|
||||
*
* Members with raid_disk < 0 or with the Faulty flag set are skipped. Every
* other member gets one blkdev_issue_zeroout() call covering
* mddev->dev_sectors sectors starting at rdev->data_offset.
*
* The first failing member is logged and makes the function return false
* immediately; members that were already zeroed are left as they are and
* the remaining members are not touched.
*/
|
||||
static bool llbitmap_zero_all_disks(struct llbitmap *llbitmap)
|
||||
{
|
||||
struct mddev *mddev = llbitmap->mddev;
|
||||
struct md_rdev *rdev;
|
||||
sector_t dev_sectors = mddev->dev_sectors;
|
||||
int ret;
|
||||
|
||||
rdev_for_each(rdev, mddev) {
|
||||
if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
|
||||
continue;
|
||||
|
||||
/*
* NOTE(review): flags == 0 presumably leaves the choice between unmap
* and explicit zero writes to the block layer - confirm against
* blkdev_issue_zeroout().
*/
ret = blkdev_issue_zeroout(rdev->bdev,
|
||||
rdev->data_offset,
|
||||
dev_sectors,
|
||||
GFP_KERNEL, 0);
|
||||
if (ret) {
|
||||
pr_warn("md/llbitmap: failed to zero disk %pg: %d\n",
|
||||
rdev->bdev, ret);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void llbitmap_init_state(struct llbitmap *llbitmap)
|
||||
{
|
||||
struct mddev *mddev = llbitmap->mddev;
|
||||
enum llbitmap_state state = BitUnwritten;
|
||||
unsigned long i;
|
||||
|
||||
if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags))
|
||||
if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) {
|
||||
state = BitClean;
|
||||
} else if (raid_is_456(mddev) &&
|
||||
llbitmap_all_disks_support_wzeroes_unmap(llbitmap)) {
|
||||
/*
|
||||
* All disks support write_zeroes with unmap. Zero all disks
|
||||
* to ensure parity consistency, then set BitCleanUnwritten
|
||||
* to skip initial sync.
|
||||
*/
|
||||
if (llbitmap_zero_all_disks(llbitmap))
|
||||
state = BitCleanUnwritten;
|
||||
}
|
||||
|
||||
for (i = 0; i < llbitmap->chunks; i++)
|
||||
llbitmap_write(llbitmap, state, i);
|
||||
@@ -627,11 +756,10 @@ static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap,
|
||||
goto write_bitmap;
|
||||
}
|
||||
|
||||
if (c == BitNeedSync)
|
||||
if (c == BitNeedSync || c == BitNeedSyncUnwritten)
|
||||
need_resync = !mddev->degraded;
|
||||
|
||||
state = state_machine[c][action];
|
||||
|
||||
write_bitmap:
|
||||
if (unlikely(mddev->degraded)) {
|
||||
/* For degraded array, mark new data as need sync. */
|
||||
@@ -658,8 +786,7 @@ static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap,
|
||||
}
|
||||
|
||||
llbitmap_write(llbitmap, state, start);
|
||||
|
||||
if (state == BitNeedSync)
|
||||
if (state == BitNeedSync || state == BitNeedSyncUnwritten)
|
||||
need_resync = !mddev->degraded;
|
||||
else if (state == BitDirty &&
|
||||
!timer_pending(&llbitmap->pending_timer))
|
||||
@@ -1229,7 +1356,7 @@ static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset)
|
||||
unsigned long p = offset >> llbitmap->chunkshift;
|
||||
enum llbitmap_state c = llbitmap_read(llbitmap, p);
|
||||
|
||||
return c == BitClean || c == BitDirty;
|
||||
return c == BitClean || c == BitDirty || c == BitCleanUnwritten;
|
||||
}
|
||||
|
||||
static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset)
|
||||
@@ -1243,6 +1370,10 @@ static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset)
|
||||
if (c == BitUnwritten)
|
||||
return blocks;
|
||||
|
||||
/* Skip CleanUnwritten - no user data, will be reset after recovery */
|
||||
if (c == BitCleanUnwritten)
|
||||
return blocks;
|
||||
|
||||
/* For degraded array, don't skip */
|
||||
if (mddev->degraded)
|
||||
return 0;
|
||||
@@ -1261,14 +1392,25 @@ static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset,
|
||||
{
|
||||
struct llbitmap *llbitmap = mddev->bitmap;
|
||||
unsigned long p = offset >> llbitmap->chunkshift;
|
||||
enum llbitmap_state state;
|
||||
|
||||
/*
|
||||
* Before recovery starts, convert CleanUnwritten to Unwritten.
|
||||
* This ensures the new disk won't have stale parity data.
|
||||
*/
|
||||
if (offset == 0 && test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
|
||||
!test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery))
|
||||
llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
|
||||
BitmapActionClearUnwritten);
|
||||
|
||||
|
||||
/*
|
||||
* Handle one bit at a time, this is much simpler. And it doesn't matter
|
||||
* if md_do_sync() loop more times.
|
||||
*/
|
||||
*blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
|
||||
return llbitmap_state_machine(llbitmap, p, p,
|
||||
BitmapActionStartsync) == BitSyncing;
|
||||
state = llbitmap_state_machine(llbitmap, p, p, BitmapActionStartsync);
|
||||
return state == BitSyncing || state == BitSyncingUnwritten;
|
||||
}
|
||||
|
||||
/* Something is wrong, sync_thread stop at @offset */
|
||||
@@ -1474,9 +1616,15 @@ static ssize_t bits_show(struct mddev *mddev, char *page)
|
||||
}
|
||||
|
||||
mutex_unlock(&mddev->bitmap_info.mutex);
|
||||
return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n",
|
||||
return sprintf(page,
|
||||
"unwritten %d\nclean %d\ndirty %d\n"
|
||||
"need sync %d\nsyncing %d\n"
|
||||
"need sync unwritten %d\nsyncing unwritten %d\n"
|
||||
"clean unwritten %d\n",
|
||||
bits[BitUnwritten], bits[BitClean], bits[BitDirty],
|
||||
bits[BitNeedSync], bits[BitSyncing]);
|
||||
bits[BitNeedSync], bits[BitSyncing],
|
||||
bits[BitNeedSyncUnwritten], bits[BitSyncingUnwritten],
|
||||
bits[BitCleanUnwritten]);
|
||||
}
|
||||
|
||||
static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits);
|
||||
@@ -1549,11 +1697,39 @@ barrier_idle_store(struct mddev *mddev, const char *buf, size_t len)
|
||||
|
||||
static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle);
|
||||
|
||||
/*
* Store handler that requests a proactive sync.
*
* The data in @buf is never inspected: any write runs
* BitmapActionProactiveSync through llbitmap_state_machine() for every chunk
* (0 .. llbitmap->chunks - 1) while holding mddev->bitmap_info.mutex.
*
* Returns @len on success, -EINVAL if raid_is_456() is false for the array,
* or -ENODEV if mddev->bitmap or llbitmap->pctl is NULL.
*/
static ssize_t
|
||||
proactive_sync_store(struct mddev *mddev, const char *buf, size_t len)
|
||||
{
|
||||
struct llbitmap *llbitmap;
|
||||
|
||||
/* Only for RAID-456 */
|
||||
if (!raid_is_456(mddev))
|
||||
return -EINVAL;
|
||||
|
||||
/* The bitmap pointer is only read and used under bitmap_info.mutex. */
mutex_lock(&mddev->bitmap_info.mutex);
|
||||
llbitmap = mddev->bitmap;
|
||||
if (!llbitmap || !llbitmap->pctl) {
|
||||
mutex_unlock(&mddev->bitmap_info.mutex);
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
/* Trigger proactive sync on all Unwritten regions */
|
||||
/* The state returned by llbitmap_state_machine() is not checked here. */
llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
|
||||
BitmapActionProactiveSync);
|
||||
|
||||
mutex_unlock(&mddev->bitmap_info.mutex);
|
||||
return len;
|
||||
}
|
||||
|
||||
/*
* Write-only attribute "proactive_sync": mode 0200 with a NULL show handler,
* so only writes are served, by proactive_sync_store().
*/
static struct md_sysfs_entry llbitmap_proactive_sync =
|
||||
__ATTR(proactive_sync, 0200, NULL, proactive_sync_store);
|
||||
|
||||
/*
* NULL-terminated array of the llbitmap attributes.
* NOTE(review): presumably referenced by an attribute group outside this
* hunk - confirm where it is registered.
*/
static struct attribute *md_llbitmap_attrs[] = {
|
||||
&llbitmap_bits.attr,
|
||||
&llbitmap_metadata.attr,
|
||||
&llbitmap_daemon_sleep.attr,
|
||||
&llbitmap_barrier_idle.attr,
|
||||
&llbitmap_proactive_sync.attr,
|
||||
NULL
|
||||
};
|
||||
|
||||
|
||||
139
drivers/md/md.c
139
drivers/md/md.c
@@ -84,7 +84,6 @@ static DEFINE_XARRAY(md_submodule);
|
||||
static const struct kobj_type md_ktype;
|
||||
|
||||
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
|
||||
static struct workqueue_struct *md_wq;
|
||||
|
||||
/*
|
||||
* This workqueue is used for sync_work to register new sync_thread, and for
|
||||
@@ -188,7 +187,6 @@ static int rdev_init_serial(struct md_rdev *rdev)
|
||||
|
||||
spin_lock_init(&serial_tmp->serial_lock);
|
||||
serial_tmp->serial_rb = RB_ROOT_CACHED;
|
||||
init_waitqueue_head(&serial_tmp->serial_io_wait);
|
||||
}
|
||||
|
||||
rdev->serial = serial;
|
||||
@@ -489,6 +487,17 @@ int mddev_suspend(struct mddev *mddev, bool interruptible)
|
||||
}
|
||||
|
||||
percpu_ref_kill(&mddev->active_io);
|
||||
|
||||
/*
|
||||
* RAID456 IO can sleep in wait_for_reshape while still holding an
|
||||
* active_io reference. If reshape is already interrupted or frozen,
|
||||
* wake those waiters so they can abort and drop the reference instead
|
||||
* of deadlocking suspend.
|
||||
*/
|
||||
if (mddev->pers && mddev->pers->prepare_suspend &&
|
||||
reshape_interrupted(mddev))
|
||||
mddev->pers->prepare_suspend(mddev);
|
||||
|
||||
if (interruptible)
|
||||
err = wait_event_interruptible(mddev->sb_wait,
|
||||
percpu_ref_is_zero(&mddev->active_io));
|
||||
@@ -6130,10 +6139,16 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
|
||||
}
|
||||
spin_unlock(&all_mddevs_lock);
|
||||
rv = entry->store(mddev, page, length);
|
||||
mddev_put(mddev);
|
||||
|
||||
/*
|
||||
* For "array_state=clear", dropping the extra kobject reference from
|
||||
* sysfs_break_active_protection() can trigger md kobject deletion.
|
||||
* Restore active protection before mddev_put() so deletion happens
|
||||
* after the sysfs write path fully unwinds.
|
||||
*/
|
||||
if (kn)
|
||||
sysfs_unbreak_active_protection(kn);
|
||||
mddev_put(mddev);
|
||||
|
||||
return rv;
|
||||
}
|
||||
@@ -6449,15 +6464,124 @@ static void md_safemode_timeout(struct timer_list *t)
|
||||
|
||||
static int start_dirty_degraded;
|
||||
|
||||
/*
|
||||
* Read bitmap superblock and return the bitmap_id based on disk version.
|
||||
* This is used as fallback when default bitmap version and on-disk version
|
||||
* don't match, and mdadm is not the latest version to set bitmap_type.
|
||||
*
* Returns ID_BITMAP for BITMAP_MAJOR_LO/HI/CLUSTERED and ID_LLBITMAP for
* BITMAP_MAJOR_LOCKLESS. Returns ID_BITMAP_NONE when bitmap_info.offset is
* zero, the temporary page cannot be allocated, no member device can be
* read, the magic does not equal BITMAP_MAGIC, or the version is unknown.
* The temporary page is freed on every path that allocated it.
*/
|
||||
static enum md_submodule_id md_bitmap_get_id_from_sb(struct mddev *mddev)
|
||||
{
|
||||
struct md_rdev *rdev;
|
||||
struct page *sb_page;
|
||||
bitmap_super_t *sb;
|
||||
enum md_submodule_id id = ID_BITMAP_NONE;
|
||||
sector_t sector;
|
||||
u32 version;
|
||||
|
||||
/* A zero offset is handled as "nothing to read". */
if (!mddev->bitmap_info.offset)
|
||||
return ID_BITMAP_NONE;
|
||||
|
||||
sb_page = alloc_page(GFP_KERNEL);
|
||||
if (!sb_page) {
|
||||
pr_warn("md: %s: failed to allocate memory for bitmap\n",
|
||||
mdname(mddev));
|
||||
return ID_BITMAP_NONE;
|
||||
}
|
||||
|
||||
sector = mddev->bitmap_info.offset;
|
||||
|
||||
/* Try each usable member in turn and stop at the first successful read. */
rdev_for_each(rdev, mddev) {
|
||||
u32 iosize;
|
||||
|
||||
/* Only members that are In_sync, not Faulty and not Bitmap_sync. */
if (!test_bit(In_sync, &rdev->flags) ||
|
||||
test_bit(Faulty, &rdev->flags) ||
|
||||
test_bit(Bitmap_sync, &rdev->flags))
|
||||
continue;
|
||||
|
||||
/* Read size: the superblock struct rounded up to the logical block size. */
iosize = roundup(sizeof(bitmap_super_t),
|
||||
bdev_logical_block_size(rdev->bdev));
|
||||
/*
* NOTE(review): the final "true" presumably marks this as a metadata
* read so that @sector is taken relative to the member's superblock -
* confirm against sync_page_io().
*/
if (sync_page_io(rdev, sector, iosize, sb_page, REQ_OP_READ,
|
||||
true))
|
||||
goto read_ok;
|
||||
}
|
||||
/* Reached only when no member produced a successful read. */
pr_warn("md: %s: failed to read bitmap from any device\n",
|
||||
mdname(mddev));
|
||||
goto out;
|
||||
|
||||
read_ok:
|
||||
sb = kmap_local_page(sb_page);
|
||||
if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) {
|
||||
pr_warn("md: %s: invalid bitmap magic 0x%x\n",
|
||||
mdname(mddev), le32_to_cpu(sb->magic));
|
||||
goto out_unmap;
|
||||
}
|
||||
|
||||
/* Map the on-disk major version to the bitmap implementation id. */
version = le32_to_cpu(sb->version);
|
||||
switch (version) {
|
||||
case BITMAP_MAJOR_LO:
|
||||
case BITMAP_MAJOR_HI:
|
||||
case BITMAP_MAJOR_CLUSTERED:
|
||||
id = ID_BITMAP;
|
||||
break;
|
||||
case BITMAP_MAJOR_LOCKLESS:
|
||||
id = ID_LLBITMAP;
|
||||
break;
|
||||
default:
|
||||
/* id keeps its initial ID_BITMAP_NONE value. */
pr_warn("md: %s: unknown bitmap version %u\n",
|
||||
mdname(mddev), version);
|
||||
break;
|
||||
}
|
||||
|
||||
out_unmap:
|
||||
kunmap_local(sb);
|
||||
out:
|
||||
__free_page(sb_page);
|
||||
return id;
|
||||
}
|
||||
|
||||
static int md_bitmap_create(struct mddev *mddev)
|
||||
{
|
||||
enum md_submodule_id orig_id = mddev->bitmap_id;
|
||||
enum md_submodule_id sb_id;
|
||||
int err;
|
||||
|
||||
if (mddev->bitmap_id == ID_BITMAP_NONE)
|
||||
return -EINVAL;
|
||||
|
||||
if (!mddev_set_bitmap_ops(mddev))
|
||||
return -ENOENT;
|
||||
|
||||
return mddev->bitmap_ops->create(mddev);
|
||||
err = mddev->bitmap_ops->create(mddev);
|
||||
if (!err)
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Create failed, if default bitmap version and on-disk version
|
||||
* doesn't match, and mdadm is not the latest version to set
|
||||
* bitmap_type, set bitmap_ops based on the disk version.
|
||||
*/
|
||||
mddev_clear_bitmap_ops(mddev);
|
||||
|
||||
sb_id = md_bitmap_get_id_from_sb(mddev);
|
||||
if (sb_id == ID_BITMAP_NONE || sb_id == orig_id)
|
||||
return err;
|
||||
|
||||
pr_info("md: %s: bitmap version mismatch, switching from %d to %d\n",
|
||||
mdname(mddev), orig_id, sb_id);
|
||||
|
||||
mddev->bitmap_id = sb_id;
|
||||
if (!mddev_set_bitmap_ops(mddev)) {
|
||||
mddev->bitmap_id = orig_id;
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
err = mddev->bitmap_ops->create(mddev);
|
||||
if (err) {
|
||||
mddev_clear_bitmap_ops(mddev);
|
||||
mddev->bitmap_id = orig_id;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static void md_bitmap_destroy(struct mddev *mddev)
|
||||
@@ -10505,10 +10629,6 @@ static int __init md_init(void)
|
||||
goto err_bitmap;
|
||||
|
||||
ret = -ENOMEM;
|
||||
md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM | WQ_PERCPU, 0);
|
||||
if (!md_wq)
|
||||
goto err_wq;
|
||||
|
||||
md_misc_wq = alloc_workqueue("md_misc", WQ_PERCPU, 0);
|
||||
if (!md_misc_wq)
|
||||
goto err_misc_wq;
|
||||
@@ -10533,8 +10653,6 @@ static int __init md_init(void)
|
||||
err_md:
|
||||
destroy_workqueue(md_misc_wq);
|
||||
err_misc_wq:
|
||||
destroy_workqueue(md_wq);
|
||||
err_wq:
|
||||
md_llbitmap_exit();
|
||||
err_bitmap:
|
||||
md_bitmap_exit();
|
||||
@@ -10843,7 +10961,6 @@ static __exit void md_exit(void)
|
||||
spin_unlock(&all_mddevs_lock);
|
||||
|
||||
destroy_workqueue(md_misc_wq);
|
||||
destroy_workqueue(md_wq);
|
||||
md_bitmap_exit();
|
||||
}
|
||||
|
||||
|
||||
@@ -126,7 +126,6 @@ enum sync_action {
|
||||
struct serial_in_rdev {
|
||||
struct rb_root_cached serial_rb;
|
||||
spinlock_t serial_lock;
|
||||
wait_queue_head_t serial_io_wait;
|
||||
};
|
||||
|
||||
/*
|
||||
@@ -381,7 +380,11 @@ struct serial_info {
|
||||
struct rb_node node;
|
||||
sector_t start; /* start sector of rb node */
|
||||
sector_t last; /* end sector of rb node */
|
||||
sector_t wnode_start; /* address of waiting nodes on the same list */
|
||||
sector_t _subtree_last; /* highest sector in subtree of rb node */
|
||||
struct list_head list_node;
|
||||
struct list_head waiters;
|
||||
struct completion ready;
|
||||
};
|
||||
|
||||
/*
|
||||
|
||||
@@ -143,13 +143,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
|
||||
}
|
||||
|
||||
err = -ENOMEM;
|
||||
conf->strip_zone = kzalloc_objs(struct strip_zone, conf->nr_strip_zones);
|
||||
conf->strip_zone = kvzalloc_objs(struct strip_zone, conf->nr_strip_zones);
|
||||
if (!conf->strip_zone)
|
||||
goto abort;
|
||||
conf->devlist = kzalloc(array3_size(sizeof(struct md_rdev *),
|
||||
conf->nr_strip_zones,
|
||||
mddev->raid_disks),
|
||||
GFP_KERNEL);
|
||||
conf->devlist = kvzalloc(array3_size(sizeof(struct md_rdev *),
|
||||
conf->nr_strip_zones,
|
||||
mddev->raid_disks),
|
||||
GFP_KERNEL);
|
||||
if (!conf->devlist)
|
||||
goto abort;
|
||||
|
||||
@@ -291,8 +291,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
|
||||
|
||||
return 0;
|
||||
abort:
|
||||
kfree(conf->strip_zone);
|
||||
kfree(conf->devlist);
|
||||
kvfree(conf->strip_zone);
|
||||
kvfree(conf->devlist);
|
||||
kfree(conf);
|
||||
*private_conf = ERR_PTR(err);
|
||||
return err;
|
||||
@@ -373,8 +373,8 @@ static void raid0_free(struct mddev *mddev, void *priv)
|
||||
{
|
||||
struct r0conf *conf = priv;
|
||||
|
||||
kfree(conf->strip_zone);
|
||||
kfree(conf->devlist);
|
||||
kvfree(conf->strip_zone);
|
||||
kvfree(conf->devlist);
|
||||
kfree(conf);
|
||||
}
|
||||
|
||||
|
||||
@@ -57,21 +57,29 @@ INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last,
|
||||
START, LAST, static inline, raid1_rb);
|
||||
|
||||
static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio,
|
||||
struct serial_info *si, int idx)
|
||||
struct serial_info *si)
|
||||
{
|
||||
unsigned long flags;
|
||||
int ret = 0;
|
||||
sector_t lo = r1_bio->sector;
|
||||
sector_t hi = lo + r1_bio->sectors - 1;
|
||||
int idx = sector_to_idx(r1_bio->sector);
|
||||
struct serial_in_rdev *serial = &rdev->serial[idx];
|
||||
struct serial_info *head_si;
|
||||
|
||||
spin_lock_irqsave(&serial->serial_lock, flags);
|
||||
/* collision happened */
|
||||
if (raid1_rb_iter_first(&serial->serial_rb, lo, hi))
|
||||
ret = -EBUSY;
|
||||
else {
|
||||
head_si = raid1_rb_iter_first(&serial->serial_rb, lo, hi);
|
||||
if (head_si && head_si != si) {
|
||||
si->start = lo;
|
||||
si->last = hi;
|
||||
si->wnode_start = head_si->wnode_start;
|
||||
list_add_tail(&si->list_node, &head_si->waiters);
|
||||
ret = -EBUSY;
|
||||
} else if (!head_si) {
|
||||
si->start = lo;
|
||||
si->last = hi;
|
||||
si->wnode_start = si->start;
|
||||
raid1_rb_insert(si, &serial->serial_rb);
|
||||
}
|
||||
spin_unlock_irqrestore(&serial->serial_lock, flags);
|
||||
@@ -83,19 +91,22 @@ static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio)
|
||||
{
|
||||
struct mddev *mddev = rdev->mddev;
|
||||
struct serial_info *si;
|
||||
int idx = sector_to_idx(r1_bio->sector);
|
||||
struct serial_in_rdev *serial = &rdev->serial[idx];
|
||||
|
||||
if (WARN_ON(!mddev->serial_info_pool))
|
||||
return;
|
||||
si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO);
|
||||
wait_event(serial->serial_io_wait,
|
||||
check_and_add_serial(rdev, r1_bio, si, idx) == 0);
|
||||
INIT_LIST_HEAD(&si->waiters);
|
||||
INIT_LIST_HEAD(&si->list_node);
|
||||
init_completion(&si->ready);
|
||||
while (check_and_add_serial(rdev, r1_bio, si)) {
|
||||
wait_for_completion(&si->ready);
|
||||
reinit_completion(&si->ready);
|
||||
}
|
||||
}
|
||||
|
||||
static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
|
||||
{
|
||||
struct serial_info *si;
|
||||
struct serial_info *si, *iter_si;
|
||||
unsigned long flags;
|
||||
int found = 0;
|
||||
struct mddev *mddev = rdev->mddev;
|
||||
@@ -106,16 +117,28 @@ static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
|
||||
for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi);
|
||||
si; si = raid1_rb_iter_next(si, lo, hi)) {
|
||||
if (si->start == lo && si->last == hi) {
|
||||
raid1_rb_remove(si, &serial->serial_rb);
|
||||
mempool_free(si, mddev->serial_info_pool);
|
||||
found = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found)
|
||||
if (found) {
|
||||
raid1_rb_remove(si, &serial->serial_rb);
|
||||
if (!list_empty(&si->waiters)) {
|
||||
list_for_each_entry(iter_si, &si->waiters, list_node) {
|
||||
if (iter_si->wnode_start == si->wnode_start) {
|
||||
list_del_init(&iter_si->list_node);
|
||||
list_splice_init(&si->waiters, &iter_si->waiters);
|
||||
raid1_rb_insert(iter_si, &serial->serial_rb);
|
||||
complete(&iter_si->ready);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
mempool_free(si, mddev->serial_info_pool);
|
||||
} else {
|
||||
WARN(1, "The write IO is not recorded for serialization\n");
|
||||
}
|
||||
spin_unlock_irqrestore(&serial->serial_lock, flags);
|
||||
wake_up(&serial->serial_io_wait);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -2002,15 +2002,27 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
|
||||
return -ENOMEM;
|
||||
|
||||
while (mb_offset < le32_to_cpu(mb->meta_size)) {
|
||||
sector_t payload_len;
|
||||
|
||||
payload = (void *)mb + mb_offset;
|
||||
payload_flush = (void *)mb + mb_offset;
|
||||
|
||||
if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
|
||||
payload_len = sizeof(struct r5l_payload_data_parity) +
|
||||
(sector_t)sizeof(__le32) *
|
||||
(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
|
||||
if (mb_offset + payload_len > le32_to_cpu(mb->meta_size))
|
||||
goto mismatch;
|
||||
if (r5l_recovery_verify_data_checksum(
|
||||
log, ctx, page, log_offset,
|
||||
payload->checksum[0]) < 0)
|
||||
goto mismatch;
|
||||
} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) {
|
||||
payload_len = sizeof(struct r5l_payload_data_parity) +
|
||||
(sector_t)sizeof(__le32) *
|
||||
(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
|
||||
if (mb_offset + payload_len > le32_to_cpu(mb->meta_size))
|
||||
goto mismatch;
|
||||
if (r5l_recovery_verify_data_checksum(
|
||||
log, ctx, page, log_offset,
|
||||
payload->checksum[0]) < 0)
|
||||
@@ -2023,22 +2035,18 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
|
||||
payload->checksum[1]) < 0)
|
||||
goto mismatch;
|
||||
} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
|
||||
/* nothing to do for R5LOG_PAYLOAD_FLUSH here */
|
||||
payload_len = sizeof(struct r5l_payload_flush) +
|
||||
(sector_t)le32_to_cpu(payload_flush->size);
|
||||
if (mb_offset + payload_len > le32_to_cpu(mb->meta_size))
|
||||
goto mismatch;
|
||||
} else /* not R5LOG_PAYLOAD_DATA/PARITY/FLUSH */
|
||||
goto mismatch;
|
||||
|
||||
if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
|
||||
mb_offset += sizeof(struct r5l_payload_flush) +
|
||||
le32_to_cpu(payload_flush->size);
|
||||
} else {
|
||||
/* DATA or PARITY payload */
|
||||
if (le16_to_cpu(payload->header.type) != R5LOG_PAYLOAD_FLUSH) {
|
||||
log_offset = r5l_ring_add(log, log_offset,
|
||||
le32_to_cpu(payload->size));
|
||||
mb_offset += sizeof(struct r5l_payload_data_parity) +
|
||||
sizeof(__le32) *
|
||||
(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
|
||||
}
|
||||
|
||||
mb_offset += payload_len;
|
||||
}
|
||||
|
||||
put_page(page);
|
||||
@@ -2089,6 +2097,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
|
||||
log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
|
||||
|
||||
while (mb_offset < le32_to_cpu(mb->meta_size)) {
|
||||
sector_t payload_len;
|
||||
int dd;
|
||||
|
||||
payload = (void *)mb + mb_offset;
|
||||
@@ -2097,6 +2106,12 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
|
||||
if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
|
||||
int i, count;
|
||||
|
||||
payload_len = sizeof(struct r5l_payload_flush) +
|
||||
(sector_t)le32_to_cpu(payload_flush->size);
|
||||
if (mb_offset + payload_len >
|
||||
le32_to_cpu(mb->meta_size))
|
||||
return -EINVAL;
|
||||
|
||||
count = le32_to_cpu(payload_flush->size) / sizeof(__le64);
|
||||
for (i = 0; i < count; ++i) {
|
||||
stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]);
|
||||
@@ -2110,12 +2125,17 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
|
||||
}
|
||||
}
|
||||
|
||||
mb_offset += sizeof(struct r5l_payload_flush) +
|
||||
le32_to_cpu(payload_flush->size);
|
||||
mb_offset += payload_len;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* DATA or PARITY payload */
|
||||
payload_len = sizeof(struct r5l_payload_data_parity) +
|
||||
(sector_t)sizeof(__le32) *
|
||||
(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
|
||||
if (mb_offset + payload_len > le32_to_cpu(mb->meta_size))
|
||||
return -EINVAL;
|
||||
|
||||
stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ?
|
||||
raid5_compute_sector(
|
||||
conf, le64_to_cpu(payload->location), 0, &dd,
|
||||
@@ -2180,9 +2200,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
|
||||
log_offset = r5l_ring_add(log, log_offset,
|
||||
le32_to_cpu(payload->size));
|
||||
|
||||
mb_offset += sizeof(struct r5l_payload_data_parity) +
|
||||
sizeof(__le32) *
|
||||
(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
|
||||
mb_offset += payload_len;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -6641,7 +6641,13 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
|
||||
}
|
||||
|
||||
if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
|
||||
raid5_release_stripe(sh);
|
||||
int hash;
|
||||
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
hash = sh->hash_lock_index;
|
||||
__release_stripe(conf, sh,
|
||||
&conf->temp_inactive_list[hash]);
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
conf->retry_read_aligned = raid_bio;
|
||||
conf->retry_read_offset = scnt;
|
||||
return handled;
|
||||
|
||||
Reference in New Issue
Block a user