Merge tag 'md-7.1-20260407' of git://git.kernel.org/pub/scm/linux/kernel/git/mdraid/linux into for-7.1/block

Pull MD changes from Yu Kuai:

"Bug Fixes:
 - avoid a sysfs deadlock when clearing array state (Yu Kuai)
 - validate raid5 journal payloads before reading metadata (Junrui Luo)
 - fall back to the correct bitmap operations after version mismatches
   (Yu Kuai)
 - serialize overlapping writes on writemostly raid1 disks (Xiao Ni)
 - wake raid456 reshape waiters before suspend (Yu Kuai)
 - prevent retry_aligned_read() from triggering soft lockups
   (Chia-Ming Chang)

 Improvements:
 - switch raid0 strip zone and devlist allocations to kvmalloc helpers
   (Gregory Price)
 - track clean unwritten stripes for proactive RAID5 parity building
   (Yu Kuai)
 - speed up initial llbitmap sync with write_zeroes_unmap support
   (Yu Kuai)

 Cleanups:
 - remove the unused static md workqueue definition
   (Abd-Alrhman Masalkhi)"

* tag 'md-7.1-20260407' of git://git.kernel.org/pub/scm/linux/kernel/git/mdraid/linux:
  md/raid5: fix soft lockup in retry_aligned_read()
  md: wake raid456 reshape waiters before suspend
  md/raid1: serialize overlap io for writemostly disk
  md/md-llbitmap: optimize initial sync with write_zeroes_unmap support
  md/md-llbitmap: add CleanUnwritten state for RAID-5 proactive parity building
  md: add fallback to correct bitmap_ops on version mismatch
  md/raid5: validate payload size before accessing journal metadata
  md: remove unused static md_wq workqueue
  md/raid0: use kvzalloc/kvfree for strip_zone and devlist allocations
  md: fix array_state=clear sysfs deadlock
Merged by Jens Axboe on 2026-04-08 06:53:16 -06:00
7 changed files with 406 additions and 63 deletions

drivers/md/md-llbitmap.c

@@ -208,6 +208,20 @@ enum llbitmap_state {
BitNeedSync,
/* data is synchronizing */
BitSyncing,
+/*
+ * Proactive sync requested for unwritten region (raid456 only).
+ * Triggered via sysfs when user wants to pre-build XOR parity
+ * for regions that have never been written.
+ */
+BitNeedSyncUnwritten,
+/* Proactive sync in progress for unwritten region */
+BitSyncingUnwritten,
+/*
+ * XOR parity has been pre-built for a region that has never had
+ * user data written. When user writes to this region, it transitions
+ * to BitDirty.
+ */
+BitCleanUnwritten,
BitStateCount,
BitNone = 0xff,
};
@@ -232,6 +246,12 @@ enum llbitmap_action {
* BitNeedSync.
*/
BitmapActionStale,
+/*
+ * Proactive sync trigger for raid456 - builds XOR parity for
+ * Unwritten regions without requiring user data write first.
+ */
+BitmapActionProactiveSync,
+BitmapActionClearUnwritten,
BitmapActionCount,
/* Init state is BitUnwritten */
BitmapActionInit,
@@ -304,6 +324,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
[BitmapActionDaemon] = BitNone,
[BitmapActionDiscard] = BitNone,
[BitmapActionStale] = BitNone,
+[BitmapActionProactiveSync] = BitNeedSyncUnwritten,
+[BitmapActionClearUnwritten] = BitNone,
},
[BitClean] = {
[BitmapActionStartwrite] = BitDirty,
@@ -314,6 +336,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
[BitmapActionDaemon] = BitNone,
[BitmapActionDiscard] = BitUnwritten,
[BitmapActionStale] = BitNeedSync,
+[BitmapActionProactiveSync] = BitNone,
+[BitmapActionClearUnwritten] = BitNone,
},
[BitDirty] = {
[BitmapActionStartwrite] = BitNone,
@@ -324,6 +348,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
[BitmapActionDaemon] = BitClean,
[BitmapActionDiscard] = BitUnwritten,
[BitmapActionStale] = BitNeedSync,
+[BitmapActionProactiveSync] = BitNone,
+[BitmapActionClearUnwritten] = BitNone,
},
[BitNeedSync] = {
[BitmapActionStartwrite] = BitNone,
@@ -334,6 +360,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
[BitmapActionDaemon] = BitNone,
[BitmapActionDiscard] = BitUnwritten,
[BitmapActionStale] = BitNone,
+[BitmapActionProactiveSync] = BitNone,
+[BitmapActionClearUnwritten] = BitNone,
},
[BitSyncing] = {
[BitmapActionStartwrite] = BitNone,
@@ -344,6 +372,44 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
[BitmapActionDaemon] = BitNone,
[BitmapActionDiscard] = BitUnwritten,
[BitmapActionStale] = BitNeedSync,
+[BitmapActionProactiveSync] = BitNone,
+[BitmapActionClearUnwritten] = BitNone,
},
+[BitNeedSyncUnwritten] = {
+[BitmapActionStartwrite] = BitNeedSync,
+[BitmapActionStartsync] = BitSyncingUnwritten,
+[BitmapActionEndsync] = BitNone,
+[BitmapActionAbortsync] = BitUnwritten,
+[BitmapActionReload] = BitUnwritten,
+[BitmapActionDaemon] = BitNone,
+[BitmapActionDiscard] = BitUnwritten,
+[BitmapActionStale] = BitUnwritten,
+[BitmapActionProactiveSync] = BitNone,
+[BitmapActionClearUnwritten] = BitUnwritten,
+},
+[BitSyncingUnwritten] = {
+[BitmapActionStartwrite] = BitSyncing,
+[BitmapActionStartsync] = BitSyncingUnwritten,
+[BitmapActionEndsync] = BitCleanUnwritten,
+[BitmapActionAbortsync] = BitUnwritten,
+[BitmapActionReload] = BitUnwritten,
+[BitmapActionDaemon] = BitNone,
+[BitmapActionDiscard] = BitUnwritten,
+[BitmapActionStale] = BitUnwritten,
+[BitmapActionProactiveSync] = BitNone,
+[BitmapActionClearUnwritten] = BitUnwritten,
+},
+[BitCleanUnwritten] = {
+[BitmapActionStartwrite] = BitDirty,
+[BitmapActionStartsync] = BitNone,
+[BitmapActionEndsync] = BitNone,
+[BitmapActionAbortsync] = BitNone,
+[BitmapActionReload] = BitNone,
+[BitmapActionDaemon] = BitNone,
+[BitmapActionDiscard] = BitUnwritten,
+[BitmapActionStale] = BitUnwritten,
+[BitmapActionProactiveSync] = BitNone,
+[BitmapActionClearUnwritten] = BitUnwritten,
+},
};
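/*
 * Illustration (not part of the patch): each cell above maps a
 * (current state, action) pair to the next state, with BitNone (0xff)
 * meaning the action leaves the bit unchanged. Resolving a transition
 * reduces to a table lookup, roughly:
 *
 *	static enum llbitmap_state resolve(enum llbitmap_state cur,
 *					   enum llbitmap_action action)
 *	{
 *		enum llbitmap_state next = state_machine[cur][action];
 *
 *		return next == BitNone ? cur : next;
 *	}
 *
 * so e.g. state_machine[BitUnwritten][BitmapActionProactiveSync] yields
 * BitNeedSyncUnwritten, while every other state ignores the
 * proactive-sync action.
 */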
@@ -376,6 +442,7 @@ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
pctl->state[pos] = level_456 ? BitNeedSync : BitDirty;
break;
case BitClean:
+case BitCleanUnwritten:
pctl->state[pos] = BitDirty;
break;
}
@@ -383,7 +450,7 @@ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
}
static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
-int offset)
+int offset, bool infect)
{
struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
unsigned int io_size = llbitmap->io_size;
@@ -398,7 +465,7 @@ static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
* resync all the dirty bits, hence skip infecting new dirty bits to
* prevent resyncing unnecessary data.
*/
-if (llbitmap->mddev->degraded) {
+if (llbitmap->mddev->degraded || !infect) {
set_bit(block, pctl->dirty);
return;
}
@@ -438,7 +505,9 @@ static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state,
llbitmap->pctl[idx]->state[bit] = state;
if (state == BitDirty || state == BitNeedSync)
-llbitmap_set_page_dirty(llbitmap, idx, bit);
+llbitmap_set_page_dirty(llbitmap, idx, bit, true);
+else if (state == BitNeedSyncUnwritten)
+llbitmap_set_page_dirty(llbitmap, idx, bit, false);
}
static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx)
@@ -585,13 +654,73 @@ static int llbitmap_cache_pages(struct llbitmap *llbitmap)
return 0;
}
+/*
+ * Check if all underlying disks support write_zeroes with unmap.
+ */
+static bool llbitmap_all_disks_support_wzeroes_unmap(struct llbitmap *llbitmap)
+{
+struct mddev *mddev = llbitmap->mddev;
+struct md_rdev *rdev;
+rdev_for_each(rdev, mddev) {
+if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
+continue;
+if (bdev_write_zeroes_unmap_sectors(rdev->bdev) == 0)
+return false;
+}
+return true;
+}
+/*
+ * Issue write_zeroes to all underlying disks to zero their data regions.
+ * This ensures parity consistency for RAID-456 (0 XOR 0 = 0).
+ * Returns true if all disks were successfully zeroed.
+ */
+static bool llbitmap_zero_all_disks(struct llbitmap *llbitmap)
+{
+struct mddev *mddev = llbitmap->mddev;
+struct md_rdev *rdev;
+sector_t dev_sectors = mddev->dev_sectors;
+int ret;
+rdev_for_each(rdev, mddev) {
+if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
+continue;
+ret = blkdev_issue_zeroout(rdev->bdev,
+rdev->data_offset,
+dev_sectors,
+GFP_KERNEL, 0);
+if (ret) {
+pr_warn("md/llbitmap: failed to zero disk %pg: %d\n",
+rdev->bdev, ret);
+return false;
+}
+}
+return true;
+}
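/*
 * Illustration note (assumption, not from the patch): flags == 0 lets
 * blkdev_issue_zeroout() use an unmap-capable WRITE ZEROES command
 * (BLKDEV_ZERO_NOUNMAP would forbid that), which is presumably why
 * bdev_write_zeroes_unmap_sectors() is probed on every rdev first:
 * zeroing the whole device is only cheap when each disk can unmap
 * instead of physically writing zeroes.
 */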
static void llbitmap_init_state(struct llbitmap *llbitmap)
{
struct mddev *mddev = llbitmap->mddev;
enum llbitmap_state state = BitUnwritten;
unsigned long i;
-if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags))
+if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) {
state = BitClean;
+} else if (raid_is_456(mddev) &&
+llbitmap_all_disks_support_wzeroes_unmap(llbitmap)) {
+/*
+ * All disks support write_zeroes with unmap. Zero all disks
+ * to ensure parity consistency, then set BitCleanUnwritten
+ * to skip initial sync.
+ */
+if (llbitmap_zero_all_disks(llbitmap))
+state = BitCleanUnwritten;
+}
for (i = 0; i < llbitmap->chunks; i++)
llbitmap_write(llbitmap, state, i);
@@ -627,11 +756,10 @@ static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap,
goto write_bitmap;
}
-if (c == BitNeedSync)
+if (c == BitNeedSync || c == BitNeedSyncUnwritten)
need_resync = !mddev->degraded;
state = state_machine[c][action];
write_bitmap:
if (unlikely(mddev->degraded)) {
/* For degraded array, mark new data as need sync. */
@@ -658,8 +786,7 @@ static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap,
}
llbitmap_write(llbitmap, state, start);
-if (state == BitNeedSync)
+if (state == BitNeedSync || state == BitNeedSyncUnwritten)
need_resync = !mddev->degraded;
else if (state == BitDirty &&
!timer_pending(&llbitmap->pending_timer))
@@ -1229,7 +1356,7 @@ static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset)
unsigned long p = offset >> llbitmap->chunkshift;
enum llbitmap_state c = llbitmap_read(llbitmap, p);
-return c == BitClean || c == BitDirty;
+return c == BitClean || c == BitDirty || c == BitCleanUnwritten;
}
static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset)
@@ -1243,6 +1370,10 @@ static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset)
if (c == BitUnwritten)
return blocks;
+/* Skip CleanUnwritten - no user data, will be reset after recovery */
+if (c == BitCleanUnwritten)
+return blocks;
/* For degraded array, don't skip */
if (mddev->degraded)
return 0;
@@ -1261,14 +1392,25 @@ static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset,
{
struct llbitmap *llbitmap = mddev->bitmap;
unsigned long p = offset >> llbitmap->chunkshift;
+enum llbitmap_state state;
+/*
+ * Before recovery starts, convert CleanUnwritten to Unwritten.
+ * This ensures the new disk won't have stale parity data.
+ */
+if (offset == 0 && test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
+!test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery))
+llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
+BitmapActionClearUnwritten);
/*
* Handle one bit at a time; this is much simpler, and it doesn't matter
* if md_do_sync() loops a few more times.
*/
*blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
-return llbitmap_state_machine(llbitmap, p, p,
-BitmapActionStartsync) == BitSyncing;
+state = llbitmap_state_machine(llbitmap, p, p, BitmapActionStartsync);
+return state == BitSyncing || state == BitSyncingUnwritten;
}
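/*
 * Worked example (values assumed for illustration): with chunksize =
 * 1024 sectors and offset = 1500, offset & (chunksize - 1) = 476, so
 * *blocks = 1024 - 476 = 548 sectors remain in the current chunk and
 * the next md_do_sync() iteration starts chunk-aligned at sector 2048.
 */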
/* Something is wrong, sync_thread stop at @offset */
@@ -1474,9 +1616,15 @@ static ssize_t bits_show(struct mddev *mddev, char *page)
}
mutex_unlock(&mddev->bitmap_info.mutex);
-return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n",
+return sprintf(page,
+"unwritten %d\nclean %d\ndirty %d\n"
+"need sync %d\nsyncing %d\n"
+"need sync unwritten %d\nsyncing unwritten %d\n"
+"clean unwritten %d\n",
bits[BitUnwritten], bits[BitClean], bits[BitDirty],
-bits[BitNeedSync], bits[BitSyncing]);
+bits[BitNeedSync], bits[BitSyncing],
+bits[BitNeedSyncUnwritten], bits[BitSyncingUnwritten],
+bits[BitCleanUnwritten]);
}
static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits);
@@ -1549,11 +1697,39 @@ barrier_idle_store(struct mddev *mddev, const char *buf, size_t len)
static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle);
+static ssize_t
+proactive_sync_store(struct mddev *mddev, const char *buf, size_t len)
+{
+struct llbitmap *llbitmap;
+/* Only for RAID-456 */
+if (!raid_is_456(mddev))
+return -EINVAL;
+mutex_lock(&mddev->bitmap_info.mutex);
+llbitmap = mddev->bitmap;
+if (!llbitmap || !llbitmap->pctl) {
+mutex_unlock(&mddev->bitmap_info.mutex);
+return -ENODEV;
+}
+/* Trigger proactive sync on all Unwritten regions */
+llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
+BitmapActionProactiveSync);
+mutex_unlock(&mddev->bitmap_info.mutex);
+return len;
+}
+static struct md_sysfs_entry llbitmap_proactive_sync =
+__ATTR(proactive_sync, 0200, NULL, proactive_sync_store);
static struct attribute *md_llbitmap_attrs[] = {
&llbitmap_bits.attr,
&llbitmap_metadata.attr,
&llbitmap_daemon_sleep.attr,
&llbitmap_barrier_idle.attr,
+&llbitmap_proactive_sync.attr,
NULL
};
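/*
 * Usage sketch (the exact sysfs path is an assumption; it depends on
 * where the llbitmap attribute group is registered):
 *
 *	echo 1 > /sys/block/md0/md/llbitmap/proactive_sync
 *
 * Any write to this write-only knob walks every chunk through
 * BitmapActionProactiveSync; per the state machine above, only bits in
 * BitUnwritten move to BitNeedSyncUnwritten and get parity pre-built.
 */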

drivers/md/md.c

@@ -84,7 +84,6 @@ static DEFINE_XARRAY(md_submodule);
static const struct kobj_type md_ktype;
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
-static struct workqueue_struct *md_wq;
/*
* This workqueue is used for sync_work to register new sync_thread, and for
@@ -188,7 +187,6 @@ static int rdev_init_serial(struct md_rdev *rdev)
spin_lock_init(&serial_tmp->serial_lock);
serial_tmp->serial_rb = RB_ROOT_CACHED;
-init_waitqueue_head(&serial_tmp->serial_io_wait);
}
rdev->serial = serial;
@@ -489,6 +487,17 @@ int mddev_suspend(struct mddev *mddev, bool interruptible)
}
percpu_ref_kill(&mddev->active_io);
+/*
+ * RAID456 IO can sleep in wait_for_reshape while still holding an
+ * active_io reference. If reshape is already interrupted or frozen,
+ * wake those waiters so they can abort and drop the reference instead
+ * of deadlocking suspend.
+ */
+if (mddev->pers && mddev->pers->prepare_suspend &&
+reshape_interrupted(mddev))
+mddev->pers->prepare_suspend(mddev);
if (interruptible)
err = wait_event_interruptible(mddev->sb_wait,
percpu_ref_is_zero(&mddev->active_io));
@@ -6130,10 +6139,16 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
}
spin_unlock(&all_mddevs_lock);
rv = entry->store(mddev, page, length);
-mddev_put(mddev);
+/*
+ * For "array_state=clear", dropping the extra kobject reference from
+ * sysfs_break_active_protection() can trigger md kobject deletion.
+ * Restore active protection before mddev_put() so deletion happens
+ * after the sysfs write path fully unwinds.
+ */
if (kn)
sysfs_unbreak_active_protection(kn);
+mddev_put(mddev);
return rv;
}
@@ -6449,15 +6464,124 @@ static void md_safemode_timeout(struct timer_list *t)
static int start_dirty_degraded;
+/*
+ * Read the bitmap superblock and return the bitmap_id based on the
+ * on-disk version. This is used as a fallback when the default bitmap
+ * version and the on-disk version don't match and mdadm is too old to
+ * have set bitmap_type.
+ */
+static enum md_submodule_id md_bitmap_get_id_from_sb(struct mddev *mddev)
+{
+struct md_rdev *rdev;
+struct page *sb_page;
+bitmap_super_t *sb;
+enum md_submodule_id id = ID_BITMAP_NONE;
+sector_t sector;
+u32 version;
+if (!mddev->bitmap_info.offset)
+return ID_BITMAP_NONE;
+sb_page = alloc_page(GFP_KERNEL);
+if (!sb_page) {
+pr_warn("md: %s: failed to allocate memory for bitmap\n",
+mdname(mddev));
+return ID_BITMAP_NONE;
+}
+sector = mddev->bitmap_info.offset;
+rdev_for_each(rdev, mddev) {
+u32 iosize;
+if (!test_bit(In_sync, &rdev->flags) ||
+test_bit(Faulty, &rdev->flags) ||
+test_bit(Bitmap_sync, &rdev->flags))
+continue;
+iosize = roundup(sizeof(bitmap_super_t),
+bdev_logical_block_size(rdev->bdev));
+if (sync_page_io(rdev, sector, iosize, sb_page, REQ_OP_READ,
+true))
+goto read_ok;
+}
+pr_warn("md: %s: failed to read bitmap from any device\n",
+mdname(mddev));
+goto out;
+read_ok:
+sb = kmap_local_page(sb_page);
+if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) {
+pr_warn("md: %s: invalid bitmap magic 0x%x\n",
+mdname(mddev), le32_to_cpu(sb->magic));
+goto out_unmap;
+}
+version = le32_to_cpu(sb->version);
+switch (version) {
+case BITMAP_MAJOR_LO:
+case BITMAP_MAJOR_HI:
+case BITMAP_MAJOR_CLUSTERED:
+id = ID_BITMAP;
+break;
+case BITMAP_MAJOR_LOCKLESS:
+id = ID_LLBITMAP;
+break;
+default:
+pr_warn("md: %s: unknown bitmap version %u\n",
+mdname(mddev), version);
+break;
+}
+out_unmap:
+kunmap_local(sb);
+out:
+__free_page(sb_page);
+return id;
+}
static int md_bitmap_create(struct mddev *mddev)
{
+enum md_submodule_id orig_id = mddev->bitmap_id;
+enum md_submodule_id sb_id;
+int err;
if (mddev->bitmap_id == ID_BITMAP_NONE)
return -EINVAL;
if (!mddev_set_bitmap_ops(mddev))
return -ENOENT;
-return mddev->bitmap_ops->create(mddev);
+err = mddev->bitmap_ops->create(mddev);
+if (!err)
+return 0;
+/*
+ * Create failed. If the default bitmap version and the on-disk
+ * version don't match (and mdadm is too old to have set
+ * bitmap_type), retry with bitmap_ops chosen from the on-disk
+ * version.
+ */
+mddev_clear_bitmap_ops(mddev);
+sb_id = md_bitmap_get_id_from_sb(mddev);
+if (sb_id == ID_BITMAP_NONE || sb_id == orig_id)
+return err;
+pr_info("md: %s: bitmap version mismatch, switching from %d to %d\n",
+mdname(mddev), orig_id, sb_id);
+mddev->bitmap_id = sb_id;
+if (!mddev_set_bitmap_ops(mddev)) {
+mddev->bitmap_id = orig_id;
+return -ENOENT;
+}
+err = mddev->bitmap_ops->create(mddev);
+if (err) {
+mddev_clear_bitmap_ops(mddev);
+mddev->bitmap_id = orig_id;
+}
+return err;
}
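/*
 * Example fallback flow (assumed scenario, not from the patch): the
 * array was created with a lockless bitmap (on-disk version
 * BITMAP_MAJOR_LOCKLESS) but an older mdadm left bitmap_id at
 * ID_BITMAP. The first ->create() fails, md_bitmap_get_id_from_sb()
 * returns ID_LLBITMAP from the superblock, bitmap_ops are switched and
 * ->create() is retried; if that also fails, everything rolls back to
 * orig_id.
 */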
static void md_bitmap_destroy(struct mddev *mddev)
@@ -10505,10 +10629,6 @@ static int __init md_init(void)
goto err_bitmap;
ret = -ENOMEM;
-md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM | WQ_PERCPU, 0);
-if (!md_wq)
-goto err_wq;
md_misc_wq = alloc_workqueue("md_misc", WQ_PERCPU, 0);
if (!md_misc_wq)
goto err_misc_wq;
@@ -10533,8 +10653,6 @@ static int __init md_init(void)
err_md:
destroy_workqueue(md_misc_wq);
err_misc_wq:
-destroy_workqueue(md_wq);
-err_wq:
md_llbitmap_exit();
err_bitmap:
md_bitmap_exit();
@@ -10843,7 +10961,6 @@ static __exit void md_exit(void)
spin_unlock(&all_mddevs_lock);
destroy_workqueue(md_misc_wq);
-destroy_workqueue(md_wq);
md_bitmap_exit();
}

drivers/md/md.h

@@ -126,7 +126,6 @@ enum sync_action {
struct serial_in_rdev {
struct rb_root_cached serial_rb;
spinlock_t serial_lock;
-wait_queue_head_t serial_io_wait;
};
/*
@@ -381,7 +380,11 @@ struct serial_info {
struct rb_node node;
sector_t start; /* start sector of rb node */
sector_t last; /* end sector of rb node */
+sector_t wnode_start; /* start sector identifying the wait chain this node belongs to */
sector_t _subtree_last; /* highest sector in subtree of rb node */
+struct list_head list_node;
+struct list_head waiters;
+struct completion ready;
};
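/*
 * Note on the new fields (summarizing the raid1.c changes below):
 * together they replace the old serial_io_wait/wake_up scheme. A
 * colliding writer parks itself on the head node's waiters list,
 * tagged by wnode_start, and sleeps on its own completion;
 * remove_serial() then promotes exactly one waiter into the interval
 * tree and completes it, rather than waking every writer on the rdev
 * to re-scan the tree.
 */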
/*

drivers/md/raid0.c

@@ -143,13 +143,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
}
err = -ENOMEM;
-conf->strip_zone = kzalloc_objs(struct strip_zone, conf->nr_strip_zones);
+conf->strip_zone = kvzalloc_objs(struct strip_zone, conf->nr_strip_zones);
if (!conf->strip_zone)
goto abort;
-conf->devlist = kzalloc(array3_size(sizeof(struct md_rdev *),
-conf->nr_strip_zones,
-mddev->raid_disks),
-GFP_KERNEL);
+conf->devlist = kvzalloc(array3_size(sizeof(struct md_rdev *),
+conf->nr_strip_zones,
+mddev->raid_disks),
+GFP_KERNEL);
if (!conf->devlist)
goto abort;
@@ -291,8 +291,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
return 0;
abort:
-kfree(conf->strip_zone);
-kfree(conf->devlist);
+kvfree(conf->strip_zone);
+kvfree(conf->devlist);
kfree(conf);
*private_conf = ERR_PTR(err);
return err;
@@ -373,8 +373,8 @@ static void raid0_free(struct mddev *mddev, void *priv)
{
struct r0conf *conf = priv;
-kfree(conf->strip_zone);
-kfree(conf->devlist);
+kvfree(conf->strip_zone);
+kvfree(conf->devlist);
kfree(conf);
}
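/*
 * Rationale sketch (general kernel behavior, not spelled out in the
 * patch): devlist grows with nr_strip_zones * raid_disks, so on large
 * arrays the allocation can exceed what kmalloc() will serve from
 * physically contiguous pages. kvzalloc() transparently falls back to
 * vmalloc() for such sizes, and kvfree() releases memory from either
 * origin.
 */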

drivers/md/raid1.c

@@ -57,21 +57,29 @@ INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last,
START, LAST, static inline, raid1_rb);
static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio,
-struct serial_info *si, int idx)
+struct serial_info *si)
{
unsigned long flags;
int ret = 0;
sector_t lo = r1_bio->sector;
sector_t hi = lo + r1_bio->sectors - 1;
+int idx = sector_to_idx(r1_bio->sector);
struct serial_in_rdev *serial = &rdev->serial[idx];
+struct serial_info *head_si;
spin_lock_irqsave(&serial->serial_lock, flags);
/* collision happened */
-if (raid1_rb_iter_first(&serial->serial_rb, lo, hi))
-ret = -EBUSY;
-else {
+head_si = raid1_rb_iter_first(&serial->serial_rb, lo, hi);
+if (head_si && head_si != si) {
+si->start = lo;
+si->last = hi;
+si->wnode_start = head_si->wnode_start;
+list_add_tail(&si->list_node, &head_si->waiters);
+ret = -EBUSY;
+} else if (!head_si) {
si->start = lo;
si->last = hi;
+si->wnode_start = si->start;
raid1_rb_insert(si, &serial->serial_rb);
}
spin_unlock_irqrestore(&serial->serial_lock, flags);
@@ -83,19 +91,22 @@ static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio)
{
struct mddev *mddev = rdev->mddev;
struct serial_info *si;
-int idx = sector_to_idx(r1_bio->sector);
-struct serial_in_rdev *serial = &rdev->serial[idx];
if (WARN_ON(!mddev->serial_info_pool))
return;
si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO);
-wait_event(serial->serial_io_wait,
-check_and_add_serial(rdev, r1_bio, si, idx) == 0);
+INIT_LIST_HEAD(&si->waiters);
+INIT_LIST_HEAD(&si->list_node);
+init_completion(&si->ready);
+while (check_and_add_serial(rdev, r1_bio, si)) {
+wait_for_completion(&si->ready);
+reinit_completion(&si->ready);
+}
}
static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
{
-struct serial_info *si;
+struct serial_info *si, *iter_si;
unsigned long flags;
int found = 0;
struct mddev *mddev = rdev->mddev;
@@ -106,16 +117,28 @@ static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi);
si; si = raid1_rb_iter_next(si, lo, hi)) {
if (si->start == lo && si->last == hi) {
-raid1_rb_remove(si, &serial->serial_rb);
-mempool_free(si, mddev->serial_info_pool);
found = 1;
break;
}
}
-if (!found)
+if (found) {
+raid1_rb_remove(si, &serial->serial_rb);
+if (!list_empty(&si->waiters)) {
+list_for_each_entry(iter_si, &si->waiters, list_node) {
+if (iter_si->wnode_start == si->wnode_start) {
+list_del_init(&iter_si->list_node);
+list_splice_init(&si->waiters, &iter_si->waiters);
+raid1_rb_insert(iter_si, &serial->serial_rb);
+complete(&iter_si->ready);
+break;
+}
+}
+}
+mempool_free(si, mddev->serial_info_pool);
+} else {
WARN(1, "The write IO is not recorded for serialization\n");
+}
spin_unlock_irqrestore(&serial->serial_lock, flags);
-wake_up(&serial->serial_io_wait);
}
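/*
 * Worked example (sector ranges assumed for illustration): writer A
 * inserts [0, 7] into the tree with wnode_start = 0. Writer B targets
 * [4, 11], collides with A, records wnode_start = 0, queues on A's
 * waiters list and sleeps. When A's IO completes, remove_serial()
 * drops A from the tree, finds B via the matching wnode_start, splices
 * A's remaining waiters onto B, inserts B into the tree and completes
 * it. B then re-runs check_and_add_serial(), finds itself as head_si
 * (head_si == si), and proceeds with its write.
 */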
/*

drivers/md/raid5-cache.c

@@ -2002,15 +2002,27 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
return -ENOMEM;
while (mb_offset < le32_to_cpu(mb->meta_size)) {
+sector_t payload_len;
payload = (void *)mb + mb_offset;
payload_flush = (void *)mb + mb_offset;
if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
+payload_len = sizeof(struct r5l_payload_data_parity) +
+(sector_t)sizeof(__le32) *
+(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
+if (mb_offset + payload_len > le32_to_cpu(mb->meta_size))
+goto mismatch;
if (r5l_recovery_verify_data_checksum(
log, ctx, page, log_offset,
payload->checksum[0]) < 0)
goto mismatch;
} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) {
+payload_len = sizeof(struct r5l_payload_data_parity) +
+(sector_t)sizeof(__le32) *
+(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
+if (mb_offset + payload_len > le32_to_cpu(mb->meta_size))
+goto mismatch;
if (r5l_recovery_verify_data_checksum(
log, ctx, page, log_offset,
payload->checksum[0]) < 0)
@@ -2023,22 +2035,18 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
payload->checksum[1]) < 0)
goto mismatch;
} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
-/* nothing to do for R5LOG_PAYLOAD_FLUSH here */
+payload_len = sizeof(struct r5l_payload_flush) +
+(sector_t)le32_to_cpu(payload_flush->size);
+if (mb_offset + payload_len > le32_to_cpu(mb->meta_size))
+goto mismatch;
} else /* not R5LOG_PAYLOAD_DATA/PARITY/FLUSH */
goto mismatch;
-if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
-mb_offset += sizeof(struct r5l_payload_flush) +
-le32_to_cpu(payload_flush->size);
-} else {
-/* DATA or PARITY payload */
+if (le16_to_cpu(payload->header.type) != R5LOG_PAYLOAD_FLUSH) {
log_offset = r5l_ring_add(log, log_offset,
le32_to_cpu(payload->size));
-mb_offset += sizeof(struct r5l_payload_data_parity) +
-sizeof(__le32) *
-(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
}
+mb_offset += payload_len;
}
put_page(page);
@@ -2089,6 +2097,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
while (mb_offset < le32_to_cpu(mb->meta_size)) {
+sector_t payload_len;
int dd;
payload = (void *)mb + mb_offset;
@@ -2097,6 +2106,12 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
int i, count;
+payload_len = sizeof(struct r5l_payload_flush) +
+(sector_t)le32_to_cpu(payload_flush->size);
+if (mb_offset + payload_len >
+le32_to_cpu(mb->meta_size))
+return -EINVAL;
count = le32_to_cpu(payload_flush->size) / sizeof(__le64);
for (i = 0; i < count; ++i) {
stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]);
@@ -2110,12 +2125,17 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
}
}
-mb_offset += sizeof(struct r5l_payload_flush) +
-le32_to_cpu(payload_flush->size);
+mb_offset += payload_len;
continue;
}
+/* DATA or PARITY payload */
+payload_len = sizeof(struct r5l_payload_data_parity) +
+(sector_t)sizeof(__le32) *
+(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
+if (mb_offset + payload_len > le32_to_cpu(mb->meta_size))
+return -EINVAL;
stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ?
raid5_compute_sector(
conf, le64_to_cpu(payload->location), 0, &dd,
@@ -2180,9 +2200,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
log_offset = r5l_ring_add(log, log_offset,
le32_to_cpu(payload->size));
-mb_offset += sizeof(struct r5l_payload_data_parity) +
-sizeof(__le32) *
-(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
+mb_offset += payload_len;
}
return 0;
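/*
 * Worked example (4K pages assumed, so PAGE_SHIFT - 9 == 3): a DATA
 * payload with payload->size == 16 sectors spans two pages and thus
 * carries two __le32 checksums, giving payload_len =
 * sizeof(struct r5l_payload_data_parity) + 2 * sizeof(__le32). If
 * mb_offset + payload_len overruns mb->meta_size, the metadata block
 * is corrupt and is rejected before payload->checksum[] is read.
 */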

drivers/md/raid5.c

@@ -6641,7 +6641,13 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
}
if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
-raid5_release_stripe(sh);
+int hash;
+
+spin_lock_irq(&conf->device_lock);
+hash = sh->hash_lock_index;
+__release_stripe(conf, sh,
+&conf->temp_inactive_list[hash]);
+spin_unlock_irq(&conf->device_lock);
conf->retry_read_aligned = raid_bio;
conf->retry_read_offset = scnt;
return handled;
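/*
 * Interpretation (not stated in the patch): raid5_release_stripe()
 * does the full release work for each stripe, and this retry loop can
 * iterate over many stripes without rescheduling. Parking the stripe
 * on conf->temp_inactive_list under device_lock instead defers that
 * work to release_inactive_stripe_list(), bounding the per-stripe cost
 * inside the loop.
 */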