From 9e809bb1defe9be7fed2e21552c6b03b2694394d Mon Sep 17 00:00:00 2001 From: Matthew Sakai Date: Wed, 11 Feb 2026 10:05:54 -0500 Subject: [PATCH 01/62] dm vdo indexer: validate saved zone count Verify that the loaded zone count is in the valid range before using it as a loop iterator. Signed-off-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/indexer/index-layout.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/md/dm-vdo/indexer/index-layout.c b/drivers/md/dm-vdo/indexer/index-layout.c index 61edf2b72427..37144249f7ba 100644 --- a/drivers/md/dm-vdo/indexer/index-layout.c +++ b/drivers/md/dm-vdo/indexer/index-layout.c @@ -1445,6 +1445,9 @@ static int __must_check reconstruct_index_save(struct index_save_layout *isl, u64 last_block = next_block + isl->index_save.block_count; isl->zone_count = table->header.region_count - 3; + if (isl->zone_count > MAX_ZONES) + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + "invalid zone count"); last_region = &table->regions[table->header.region_count - 1]; if (last_region->kind == RL_KIND_EMPTY) { From b3929b2cc2a6003b8e301e6540c651e60d24dcb4 Mon Sep 17 00:00:00 2001 From: Matthew Sakai Date: Wed, 11 Feb 2026 10:05:55 -0500 Subject: [PATCH 02/62] dm vdo slab-depot: validate old zone count on load Verify the old zone count has a valid value before using it to compute slab summary entry offsets. 
Signed-off-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/slab-depot.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c index 034ecaa51f48..ad00afc2c168 100644 --- a/drivers/md/dm-vdo/slab-depot.c +++ b/drivers/md/dm-vdo/slab-depot.c @@ -4262,6 +4262,10 @@ int vdo_decode_slab_depot(struct slab_depot_state_2_0 state, struct vdo *vdo, } slab_size_shift = ilog2(slab_size); + if (state.zone_count > MAX_VDO_PHYSICAL_ZONES) + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + "invalid zone count"); + result = vdo_allocate_extended(struct slab_depot, vdo->thread_config.physical_zone_count, struct block_allocator, __func__, &depot); From 7d1f98d668ee34c1d15bdc0420fdd062f24a27c0 Mon Sep 17 00:00:00 2001 From: Ming-Hung Tsai Date: Mon, 9 Feb 2026 15:54:05 +0800 Subject: [PATCH 03/62] dm cache: fix null-deref with concurrent writes in passthrough mode In passthrough mode, when dm-cache starts to invalidate a cache entry and bio prison cell lock fails due to concurrent write to the same cached block, mg->cell remains NULL. The error path in invalidate_complete() attempts to unlock and free the cell unconditionally, causing a NULL pointer dereference: KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 0 UID: 0 PID: 134 Comm: fio Not tainted 6.19.0-rc7 #3 PREEMPT RIP: 0010:dm_cell_unlock_v2+0x3f/0x210 Call Trace: invalidate_complete+0xef/0x430 map_bio+0x130f/0x1a10 cache_map+0x320/0x6b0 __map_bio+0x458/0x510 dm_submit_bio+0x40e/0x16d0 __submit_bio+0x419/0x870 Reproduce steps: 1. 
Create a cache device dmsetup create cmeta --table "0 8192 linear /dev/sdc 0" dmsetup create cdata --table "0 131072 linear /dev/sdc 8192" dmsetup create corig --table "0 262144 linear /dev/sdc 262144" dd if=/dev/zero of=/dev/mapper/cmeta bs=4k count=1 oflag=direct dmsetup create cache --table "0 262144 cache /dev/mapper/cmeta \ /dev/mapper/cdata /dev/mapper/corig 128 2 metadata2 writethrough smq 0" 2. Promote the first data block into cache fio --filename=/dev/mapper/cache --name=populate --rw=write --bs=4k \ --direct=1 --size=64k 3. Reload the cache into passthrough mode dmsetup suspend cache dmsetup reload cache --table "0 262144 cache /dev/mapper/cmeta \ /dev/mapper/cdata /dev/mapper/corig 128 2 metadata2 passthrough smq 0" dmsetup resume cache 4. Write to the first cached block concurrently fio --filename=/dev/mapper/cache --name test --rw=randwrite --bs=4k \ --randrepeat=0 --direct=1 --numjobs=2 --size 64k Fix by checking if mg->cell is valid before attempting to unlock it. Fixes: b29d4986d0da ("dm cache: significant rework to leverage dm-bio-prison-v2") Signed-off-by: Ming-Hung Tsai Signed-off-by: Mikulas Patocka --- drivers/md/dm-cache-target.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 935ab79b1d0c..ae1edffd14ea 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -1462,8 +1462,10 @@ static void invalidate_complete(struct dm_cache_migration *mg, bool success) struct cache *cache = mg->cache; bio_list_init(&bios); - if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) - free_prison_cell(cache, mg->cell); + if (mg->cell) { + if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) + free_prison_cell(cache, mg->cell); + } if (!success && mg->overwrite_bio) bio_io_error(mg->overwrite_bio); From 0c5eef0aad508231d8e43ff8392692925e131b68 Mon Sep 17 00:00:00 2001 From: Ming-Hung Tsai Date: Mon, 9 Feb 2026 15:54:06 +0800 Subject: [PATCH 04/62] 
dm cache: fix write path cache coherency in passthrough mode In passthrough mode, dm-cache defers write bio submission until cache invalidation completes to maintain existing coherency, requiring the target map function to return DM_MAPIO_SUBMITTED. The current map_bio() returns DM_MAPIO_REMAPPED, violating the required ordering constraint. Reproduce steps: 1. Create a cache device dmsetup create cmeta --table "0 8192 linear /dev/sdc 0" dmsetup create cdata --table "0 131072 linear /dev/sdc 8192" dmsetup create corig --table "0 262144 linear /dev/sdc 262144" dd if=/dev/zero of=/dev/mapper/cmeta bs=4k count=1 oflag=direct dmsetup create cache --table "0 262144 cache /dev/mapper/cmeta \ /dev/mapper/cdata /dev/mapper/corig 128 2 metadata2 writethrough smq 0" 2. Promote the first data block into the cache fio --filename=/dev/mapper/cache --name=populate --rw=write --bs=4k \ --direct=1 --size=64k 3. Reload the cache into passthrough mode dmsetup suspend cache dmsetup reload cache --table "0 262144 cache /dev/mapper/cmeta \ /dev/mapper/cdata /dev/mapper/corig 128 2 metadata2 passthrough smq 0" dmsetup resume cache 4. Write to the first data block, and check io ordering using ftrace echo 1 > /sys/kernel/debug/tracing/events/block/block_bio_queue/enable echo 1 > /sys/kernel/debug/tracing/events/block/block_bio_complete/enable echo 1 > /sys/kernel/debug/tracing/events/block/block_rq_complete/enable fio --filename=/dev/mapper/cache --name=test --rw=write --bs=64k \ --direct=1 --size 64k 5. ftrace logs show that write operations to the cache origin (252:2) and metadata operations (252:0) are unsynchronized: the origin write occurs before metadata commit. fio-146 [000] ..... 420.139562: block_bio_queue: 252,3 WS 0 + 128 [fio] fio-146 [000] ..... 420.149395: block_bio_queue: 252,2 WS 0 + 128 [fio] fio-146 [000] ..... 420.149763: block_bio_queue: 8,32 WS 262144 + 128 [fio] fio-146 [000] dNh1. 420.151446: block_rq_complete: 8,32 WS () 262144 + 128 be,0,4 [0] fio-146 [000] dNh1. 
420.152731: block_bio_complete: 252,2 WS 0 + 128 [0] fio-146 [000] dNh1. 420.154229: block_bio_complete: 252,3 WS 0 + 128 [0] kworker/0:0-9 [000] ..... 420.160530: block_bio_queue: 252,0 W 408 + 8 [kworker/0:0] kworker/0:0-9 [000] ..... 420.161641: block_bio_queue: 8,32 W 408 + 8 [kworker/0:0] kworker/0:0-9 [000] ..... 420.162533: block_bio_queue: 252,0 W 416 + 8 [kworker/0:0] kworker/0:0-9 [000] ..... 420.162821: block_bio_queue: 8,32 W 416 + 8 [kworker/0:0] Fixes: b29d4986d0da ("dm cache: significant rework to leverage dm-bio-prison-v2") Signed-off-by: Ming-Hung Tsai Signed-off-by: Mikulas Patocka --- drivers/md/dm-cache-target.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index ae1edffd14ea..b608e88acd51 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -1703,6 +1703,7 @@ static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, bio_drop_shared_lock(cache, bio); atomic_inc(&cache->stats.demotion); invalidate_start(cache, cblock, block, bio); + return DM_MAPIO_SUBMITTED; } else remap_to_origin_clear_discard(cache, bio, block); } else { From 4ca8b8bd952df7c3ccdc68af9bd3419d0839a04b Mon Sep 17 00:00:00 2001 From: Ming-Hung Tsai Date: Mon, 9 Feb 2026 15:54:07 +0800 Subject: [PATCH 05/62] dm cache: fix write hang in passthrough mode The invalidate_remove() function has incomplete logic for handling write hit bios after cache invalidation. It sets up the remapping for the overwrite_bio but then drops it immediately without submission, causing write operations to hang. Fix by adding a new invalidate_committed() continuation that submits the remapped writes to the cache origin after metadata commit completes, while using the overwrite_endio hook to ensure proper completion sequencing. This maintains existing coherency. Also improve error handling in invalidate_complete() to preserve the original error status instead of using bio_io_error() unconditionally. 
Fixes: b29d4986d0da ("dm cache: significant rework to leverage dm-bio-prison-v2") Signed-off-by: Ming-Hung Tsai Signed-off-by: Mikulas Patocka --- drivers/md/dm-cache-target.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index b608e88acd51..d3ef88b859ab 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -1467,8 +1467,14 @@ static void invalidate_complete(struct dm_cache_migration *mg, bool success) free_prison_cell(cache, mg->cell); } - if (!success && mg->overwrite_bio) - bio_io_error(mg->overwrite_bio); + if (mg->overwrite_bio) { + // Set generic error if the bio hasn't been issued yet, + // e.g., invalidation or metadata commit failed before bio + // submission. Otherwise preserve the bio's own error status. + if (!success && !mg->overwrite_bio->bi_status) + mg->overwrite_bio->bi_status = BLK_STS_IOERR; + bio_endio(mg->overwrite_bio); + } free_migration(mg); defer_bios(cache, &bios); @@ -1508,6 +1514,22 @@ static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock) return r; } +static void invalidate_committed(struct work_struct *ws) +{ + struct dm_cache_migration *mg = ws_to_mg(ws); + struct cache *cache = mg->cache; + struct bio *bio = mg->overwrite_bio; + struct per_bio_data *pb = get_per_bio_data(bio); + + if (mg->k.input) + invalidate_complete(mg, false); + + init_continuation(&mg->k, invalidate_completed); + remap_to_origin_clear_discard(cache, bio, mg->invalidate_oblock); + dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); + dm_submit_bio_remap(bio, NULL); +} + static void invalidate_remove(struct work_struct *ws) { int r; @@ -1520,10 +1542,8 @@ static void invalidate_remove(struct work_struct *ws) return; } - init_continuation(&mg->k, invalidate_completed); + init_continuation(&mg->k, invalidate_committed); continue_after_commit(&cache->committer, &mg->k); - remap_to_origin_clear_discard(cache, 
mg->overwrite_bio, mg->invalidate_oblock); - mg->overwrite_bio = NULL; schedule_commit(&cache->committer); } From 2d1f7b65f5deedd2e6b09fdc6ea27f8375f24b45 Mon Sep 17 00:00:00 2001 From: Ming-Hung Tsai Date: Mon, 9 Feb 2026 15:54:08 +0800 Subject: [PATCH 06/62] dm cache policy smq: fix missing locks in invalidating cache blocks In passthrough mode, the policy invalidate_mapping operation is called simultaneously from multiple workers, thus it should be protected by a lock. Otherwise, we might end up with data races on the allocated blocks counter, or even use-after-free issues with internal data structures when doing concurrent writes. Note that the existing FIXME in smq_invalidate_mapping() doesn't affect passthrough mode since migration tasks don't exist there, but would need attention if supporting fast device shrinking via suspend/resume without target reloading. Reproduce steps: 1. Create a cache device consisting of 1024 cache entries dmsetup create cmeta --table "0 8192 linear /dev/sdc 0" dmsetup create cdata --table "0 131072 linear /dev/sdc 8192" dmsetup create corig --table "0 262144 linear /dev/sdc 262144" dd if=/dev/zero of=/dev/mapper/cmeta bs=4k count=1 oflag=direct dmsetup create cache --table "0 262144 cache /dev/mapper/cmeta \ /dev/mapper/cdata /dev/mapper/corig 128 2 metadata2 writethrough smq 0" 2. Populate the cache, and record the number of cached blocks fio --name=populate --filename=/dev/mapper/cache --rw=randwrite --bs=4k \ --size=64m --direct=1 nr_cached=$(dmsetup status cache | awk '{split($7, a, "/"); print a[1]}') 3. Reload the cache into passthrough mode dmsetup suspend cache dmsetup reload cache --table "0 262144 cache /dev/mapper/cmeta \ /dev/mapper/cdata /dev/mapper/corig 128 2 metadata2 passthrough smq 0" dmsetup resume cache 4. Write to the passthrough cache. By setting multiple jobs with I/O size equal to the cache block size, cache blocks are invalidated concurrently from different workers. 
fio --filename=/dev/mapper/cache --name=test --rw=randwrite --bs=64k \ --direct=1 --numjobs=2 --randrepeat=0 --size=64m 5. Check if demoted matches cached block count. These numbers should match but may differ due to the data race. nr_demoted=$(dmsetup status cache | awk '{print $12}') echo "$nr_cached, $nr_demoted" Fixes: b29d4986d0da ("dm cache: significant rework to leverage dm-bio-prison-v2") Signed-off-by: Ming-Hung Tsai Signed-off-by: Mikulas Patocka --- drivers/md/dm-cache-policy-smq.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index b328d9601046..dd77a93fd68d 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -1589,14 +1589,18 @@ static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock) { struct smq_policy *mq = to_smq_policy(p); struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); + unsigned long flags; if (!e->allocated) return -ENODATA; + spin_lock_irqsave(&mq->lock, flags); // FIXME: what if this block has pending background work? del_queue(mq, e); h_remove(&mq->table, e); free_entry(&mq->cache_alloc, e); + spin_unlock_irqrestore(&mq->lock, flags); + return 0; } From e4f66341779d0cf4c83c74793753a84094286d9e Mon Sep 17 00:00:00 2001 From: Ming-Hung Tsai Date: Mon, 9 Feb 2026 15:54:09 +0800 Subject: [PATCH 07/62] dm cache: fix concurrent write failure in passthrough mode When bio prison cell lock acquisition fails due to concurrent writes to the same block in passthrough mode, dm-cache incorrectly returns an I/O error instead of properly handling the concurrency. This can occur in both process and workqueue contexts when invalidate_lock() is called for exclusive access to a data block. Fix this by deferring the write bios to ensure proper block device behavior. Reproduce steps: 1. 
Create a cache device dmsetup create cmeta --table "0 8192 linear /dev/sdc 0" dmsetup create cdata --table "0 131072 linear /dev/sdc 8192" dmsetup create corig --table "0 262144 linear /dev/sdc 262144" dd if=/dev/zero of=/dev/mapper/cmeta bs=4k count=1 oflag=direct dmsetup create cache --table "0 262144 cache /dev/mapper/cmeta \ /dev/mapper/cdata /dev/mapper/corig 128 2 metadata2 writethrough smq 0" 2. Promote the first data block into cache fio --filename=/dev/mapper/cache --name=populate --rw=write --bs=4k \ --direct=1 --size=64k 3. Reload the cache into passthrough mode dmsetup suspend cache dmsetup reload cache --table "0 262144 cache /dev/mapper/cmeta \ /dev/mapper/cdata /dev/mapper/corig 128 2 metadata2 passthrough smq 0" dmsetup resume cache 4. Write to the first cached block concurrently. Sometimes one of the processes will receive I/O errors. fio --filename=/dev/mapper/cache --name test --rw=randwrite --bs=4k \ --randrepeat=0 --direct=1 --numjobs=2 --size 64k fio-3.41 fio: io_u error on file /dev/mapper/cache: Input/output error: write offset=4096, buflen=4096 fio: pid=106, err=5/file:io_u.c:2008, func=io_u error, error=Input/output error test: (groupid=0, jobs=1): err= 0: pid=105 test: (groupid=0, jobs=1): err= 5 (file:io_u.c:2008, func=io_u error, error=Input/output error): pid=106 Fixes: b29d4986d0da ("dm cache: significant rework to leverage dm-bio-prison-v2") Signed-off-by: Ming-Hung Tsai Signed-off-by: Mikulas Patocka --- drivers/md/dm-cache-target.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index d3ef88b859ab..32d22c7b9a07 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -1561,6 +1561,15 @@ static int invalidate_lock(struct dm_cache_migration *mg) READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell); if (r < 0) { free_prison_cell(cache, prealloc); + + /* Defer the bio for retrying the cell lock */ + if (mg->overwrite_bio) { + struct bio *bio = 
mg->overwrite_bio; + + mg->overwrite_bio = NULL; + defer_bio(cache, bio); + } + invalidate_complete(mg, false); return r; } From 322586745bd1a0e5f3559fd1635fdeb4dbd1d6b8 Mon Sep 17 00:00:00 2001 From: Ming-Hung Tsai Date: Mon, 9 Feb 2026 15:54:10 +0800 Subject: [PATCH 08/62] dm cache: fix dirty mapping checking in passthrough mode switching As mentioned in commit 9b1cc9f251af ("dm cache: share cache-metadata object across inactive and active DM tables"), dm-cache assumed table reload occurs after suspension, while LVM's table preload breaks this assumption. The dirty mapping check for passthrough mode was designed around this assumption and is performed during table creation, causing the check to fail with preload while metadata updates are ongoing. This risks loading dirty mappings into passthrough mode, resulting in data loss. Reproduce steps: 1. Create a writeback cache with zero migration_threshold to produce dirty mappings dmsetup create cmeta --table "0 8192 linear /dev/sdc 0" dmsetup create cdata --table "0 131072 linear /dev/sdc 8192" dmsetup create corig --table "0 262144 linear /dev/sdc 262144" dd if=/dev/zero of=/dev/mapper/cmeta bs=4k count=1 oflag=direct dmsetup create cache --table "0 262144 cache /dev/mapper/cmeta \ /dev/mapper/cdata /dev/mapper/corig 128 2 metadata2 writeback smq \ 2 migration_threshold 0" 2. Preload a table in passthrough mode dmsetup reload cache --table "0 262144 cache /dev/mapper/cmeta \ /dev/mapper/cdata /dev/mapper/corig 128 2 metadata2 passthrough smq 0" 3. Write to the first cache block to make it dirty fio --filename=/dev/mapper/cache --name=populate --rw=write --bs=4k \ --direct=1 --size=64k 4. Resume the inactive table. Now it's possible to load the dirty block into passthrough mode. dmsetup resume cache Fix by moving the checks to the preresume phase to support table preloading. Also remove the unused function dm_cache_metadata_all_clean. 
Fixes: 2ee57d587357 ("dm cache: add passthrough mode") Signed-off-by: Ming-Hung Tsai Signed-off-by: Mikulas Patocka --- drivers/md/dm-cache-metadata.c | 11 ----------- drivers/md/dm-cache-metadata.h | 5 ----- drivers/md/dm-cache-target.c | 25 ++++++++----------------- 3 files changed, 8 insertions(+), 33 deletions(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 57158c02d096..1b86e80c89cc 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1714,17 +1714,6 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy * return r; } -int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result) -{ - int r; - - READ_LOCK(cmd); - r = blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result); - READ_UNLOCK(cmd); - - return r; -} - void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd) { WRITE_LOCK_VOID(cmd); diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 5f77890207fe..2f107e7c67d0 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -135,11 +135,6 @@ int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd, */ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *p); -/* - * Query method. Are all the blocks in the cache clean? 
- */ -int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result); - int dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd, bool *result); int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd); void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 32d22c7b9a07..e479ac22b97c 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2499,23 +2499,8 @@ static int cache_create(struct cache_args *ca, struct cache **result) goto bad; } - if (passthrough_mode(cache)) { - bool all_clean; - - r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); - if (r) { - *error = "dm_cache_metadata_all_clean() failed"; - goto bad; - } - - if (!all_clean) { - *error = "Cannot enter passthrough mode unless all blocks are clean"; - r = -EINVAL; - goto bad; - } - + if (passthrough_mode(cache)) policy_allow_migrations(cache->policy, false); - } spin_lock_init(&cache->lock); bio_list_init(&cache->deferred_bios); @@ -2842,6 +2827,12 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, struct cache *cache = context; if (dirty) { + if (passthrough_mode(cache)) { + DMERR("%s: cannot enter passthrough mode unless all blocks are clean", + cache_device_name(cache)); + return -EBUSY; + } + set_bit(from_cblock(cblock), cache->dirty_bitset); atomic_inc(&cache->nr_dirty); } else @@ -3075,7 +3066,7 @@ static int cache_preresume(struct dm_target *ti) load_filtered_mapping, cache); if (r) { DMERR("%s: could not load cache mappings", cache_device_name(cache)); - if (r != -EFBIG) + if (r != -EFBIG && r != -EBUSY) metadata_operation_failed(cache, "dm_cache_load_mappings", r); return r; } From a373b3d5289e50ab26d4cf776bf5891436ff3658 Mon Sep 17 00:00:00 2001 From: Ming-Hung Tsai Date: Mon, 9 Feb 2026 15:54:11 +0800 Subject: [PATCH 09/62] dm cache: prevent entering passthrough mode after unclean shutdown dm-cache assumes 
all cache blocks are dirty when it recovers from an unclean shutdown. Given that the passthrough mode doesn't handle dirty blocks, we should not load a cache in passthrough mode if it was not cleanly shut down; or we'll risk data loss while updating an actually dirty block. Also bump the target version to 2.4.0 to mark completion of passthrough mode fixes. Reproduce steps: 1. Create a writeback cache with zero migration_threshold to produce dirty blocks. dmsetup create cmeta --table "0 8192 linear /dev/sdc 0" dmsetup create cdata --table "0 131072 linear /dev/sdc 8192" dmsetup create corig --table "0 262144 linear /dev/sdc 262144" dd if=/dev/zero of=/dev/mapper/cmeta bs=4k count=1 oflag=direct dmsetup create cache --table "0 262144 cache /dev/mapper/cmeta \ /dev/mapper/cdata /dev/mapper/corig 128 2 metadata2 writeback smq \ 2 migration_threshold 0" 2. Write the first cache block dirty fio --filename=/dev/mapper/cache --name=populate --rw=write --bs=4k \ --direct=1 --size=64k 3. Ensure the number of dirty blocks is 1. This status query triggers metadata commit without flushing the dirty bitset, setting up the unclean shutdown state. dmsetup status cache | awk '{print $14}' 4. Force reboot, leaving the cache uncleanly shutdown. echo b > /proc/sysrq-trigger 5. Activate the above cache components, and verify the first data block remains dirty. dmsetup create cmeta --table "0 8192 linear /dev/sdc 0" dmsetup create cdata --table "0 131072 linear /dev/sdc 8192" dmsetup create corig --table "0 262144 linear /dev/sdc 262144" dd if=/dev/mapper/cdata of=/tmp/cb0.bin bs=64k count=1 dd if=/dev/mapper/corig of=/tmp/ob0.bin bs=64k count=1 md5sum /tmp/cb0.bin /tmp/ob0.bin # expected to be different 6. Try bringing up the cache in passthrough mode. It succeeds, while the first cache block was loaded dirty due to unclean shutdown, violates the passthrough mode's constraints. 
dmsetup create cache --table "0 262144 cache /dev/mapper/cmeta \ /dev/mapper/cdata /dev/mapper/corig 128 2 metadata2 passthrough smq 0" dmsetup status cache | awk '{print $14}' 7. (Optional) Demonstrate the integrity issue: invalidating the dirty block in passthrough mode doesn't write back the dirty data, causing data loss. fio --filename=/dev/mapper/cache --name=invalidate --rw=write --bs=4k \ --direct=1 --size=4k # overwrite the first 4k to trigger invalidation dmsetup remove cache dd if=/dev/mapper/corig of=/tmp/ob0new.bin bs=64k count=1 cb0sum=$(dd if=/tmp/cb0.bin bs=4k count=15 skip=1 | md5sum | \ awk '{print $1}') ob0newsum=$(dd if=/tmp/ob0new.bin bs=4k count=15 skip=1 | md5sum | \ awk '{print $1}') echo "$cb0sum, $ob0newsum" # remaining 60k should differ (data loss) Signed-off-by: Ming-Hung Tsai Signed-off-by: Mikulas Patocka --- drivers/md/dm-cache-metadata.c | 9 +++++++++ drivers/md/dm-cache-metadata.h | 5 +++++ drivers/md/dm-cache-target.c | 19 ++++++++++++++++++- 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 1b86e80c89cc..25b8aebdca53 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1813,3 +1813,12 @@ int dm_cache_metadata_abort(struct dm_cache_metadata *cmd) return r; } + +int dm_cache_metadata_clean_when_opened(struct dm_cache_metadata *cmd, bool *result) +{ + READ_LOCK(cmd); + *result = cmd->clean_when_opened; + READ_UNLOCK(cmd); + + return 0; +} diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 2f107e7c67d0..91f8706b41fd 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -141,6 +141,11 @@ void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd); void dm_cache_metadata_set_read_write(struct dm_cache_metadata *cmd); int dm_cache_metadata_abort(struct dm_cache_metadata *cmd); +/* + * Query method. Was the metadata cleanly shut down when opened? 
+ */ +int dm_cache_metadata_clean_when_opened(struct dm_cache_metadata *cmd, bool *result); + /*----------------------------------------------------------------*/ #endif /* DM_CACHE_METADATA_H */ diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index e479ac22b97c..f8200c154805 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2952,6 +2952,9 @@ static dm_cblock_t get_cache_dev_size(struct cache *cache) static bool can_resume(struct cache *cache) { + bool clean_when_opened; + int r; + /* * Disallow retrying the resume operation for devices that failed the * first resume attempt, as the failure leaves the policy object partially @@ -2968,6 +2971,20 @@ static bool can_resume(struct cache *cache) return false; } + if (passthrough_mode(cache)) { + r = dm_cache_metadata_clean_when_opened(cache->cmd, &clean_when_opened); + if (r) { + DMERR("%s: failed to query metadata flags", cache_device_name(cache)); + return false; + } + + if (!clean_when_opened) { + DMERR("%s: unable to resume into passthrough mode after unclean shutdown", + cache_device_name(cache)); + return false; + } + } + return true; } @@ -3533,7 +3550,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {2, 3, 0}, + .version = {2, 4, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, From 51d81e14fe6788dc6463064c7517480f2acd2724 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Thu, 12 Feb 2026 13:05:41 -0500 Subject: [PATCH 10/62] dm-mpath: don't stop probing paths at presuspend Commit 5c977f102315 ("dm-mpath: Don't grab work_mutex while probing paths"), added code to make multipath quit probing paths early, if it was trying to suspend. This isn't necessary. It was just an optimization to try to keep path probing from delaying a suspend. However it causes problems with the intended user of this code, qemu. 
The path probing code was added because failed ioctls to multipath devices don't cause paths to fail in cases where a regular IO failure would. If an ioctl to a path failed because the path was down, and the multipath device had passed presuspend, the M_MPATH_PROBE_PATHS ioctl would exit early, without probing the path. The caller would then retry the original ioctl, hoping to use a different path. But if there was only one path in the pathgroup, it would pick the same non-working path again, even if there were working paths in other pathgroups. ioctls to a suspended dm device will return -EAGAIN, notifying the caller that the device is suspended, but ioctls to a device that is just preparing to suspend won't (and in general, shouldn't). This means that the caller (qemu in this case) would get into a tight loop where it would issue an ioctl that failed, skip probing the paths because the device had already passed presuspend, and start over issuing the ioctl again. This would continue until the multipath device finally fully suspended, or the caller gave up and failed the ioctl. multipath's path probing code could return -EAGAIN in this case, and the caller could delay a bit before retrying, but the whole purpose of skipping the probe after presuspend was to speed things up, and that would just slow them down. Instead, remove the is_suspending flag, and check dm_suspended() instead to decide whether to exit the probing code early. This means that when the probing code exits early, future ioctls will also be delayed, because the device is fully suspended. 
Fixes: 5c977f102315 ("dm-mpath: Don't grab work_mutex while probing paths") Signed-off-by: Benjamin Marzinski Reviewed-by: Martin Wilck Reviewed-by: Hanna Czenczek Signed-off-by: Mikulas Patocka --- drivers/md/dm-mpath.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 8f4ae2f51545..7cb7bb6233b6 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -102,7 +102,6 @@ struct multipath { struct bio_list queued_bios; struct timer_list nopath_timer; /* Timeout for queue_if_no_path */ - bool is_suspending; }; /* @@ -1749,9 +1748,6 @@ static void multipath_presuspend(struct dm_target *ti) { struct multipath *m = ti->private; - spin_lock_irq(&m->lock); - m->is_suspending = true; - spin_unlock_irq(&m->lock); /* FIXME: bio-based shouldn't need to always disable queue_if_no_path */ if (m->queue_mode == DM_TYPE_BIO_BASED || !dm_noflush_suspending(m->ti)) queue_if_no_path(m, false, true, __func__); @@ -1774,7 +1770,6 @@ static void multipath_resume(struct dm_target *ti) struct multipath *m = ti->private; spin_lock_irq(&m->lock); - m->is_suspending = false; if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) { set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); @@ -2098,7 +2093,7 @@ static int probe_active_paths(struct multipath *m) if (m->current_pg == m->last_probed_pg) goto skip_probe; } - if (!m->current_pg || m->is_suspending || + if (!m->current_pg || dm_suspended(m->ti) || test_bit(MPATHF_QUEUE_IO, &m->flags)) goto skip_probe; set_bit(MPATHF_DELAY_PG_SWITCH, &m->flags); @@ -2107,7 +2102,7 @@ static int probe_active_paths(struct multipath *m) list_for_each_entry(pgpath, &pg->pgpaths, list) { if (pg != READ_ONCE(m->current_pg) || - READ_ONCE(m->is_suspending)) + dm_suspended(m->ti)) goto out; if (!pgpath->is_active) continue; From 5282ac80183bd25e44c4f5f52a7f46e9a54289eb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 25 Feb 2026 
17:39:15 -0800 Subject: [PATCH 11/62] dm-ima: use SHA-256 library Make dm_ima_measure_on_table_load() use the SHA-256 library API instead of crypto_shash to calculate the SHA-256 hash value that it needs. This is simpler and more efficient. It also ensures that SHA-256 is actually available and doesn't fail due to the unreliable loading by name. While doing this, also use kasprintf() to simplify building the string version of the digest. Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/Kconfig | 1 + drivers/md/dm-ima.c | 54 +++++++++------------------------------------ drivers/md/dm-ima.h | 1 - 3 files changed, 11 insertions(+), 45 deletions(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index c58a9a8ea54e..53351048d3ec 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -226,6 +226,7 @@ config BLK_DEV_DM select BLOCK_HOLDER_DEPRECATED if SYSFS select BLK_DEV_DM_BUILTIN select BLK_MQ_STACKING + select CRYPTO_LIB_SHA256 if IMA depends on DAX || DAX=n help Device-mapper is a low level volume manager. 
It works by allowing diff --git a/drivers/md/dm-ima.c b/drivers/md/dm-ima.c index efb3cd4f9cd4..9495ca035056 100644 --- a/drivers/md/dm-ima.c +++ b/drivers/md/dm-ima.c @@ -12,9 +12,7 @@ #include #include -#include -#include -#include +#include #define DM_MSG_PREFIX "ima" @@ -178,19 +176,13 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl size_t device_data_buf_len, target_metadata_buf_len, target_data_buf_len, l = 0; char *target_metadata_buf = NULL, *target_data_buf = NULL, *digest_buf = NULL; char *ima_buf = NULL, *device_data_buf = NULL; - int digest_size, last_target_measured = -1, r; + int last_target_measured = -1; status_type_t type = STATUSTYPE_IMA; size_t cur_total_buf_len = 0; unsigned int num_targets, i; - SHASH_DESC_ON_STACK(shash, NULL); - struct crypto_shash *tfm = NULL; - u8 *digest = NULL; + struct sha256_ctx hash_ctx; + u8 digest[SHA256_DIGEST_SIZE]; bool noio = false; - /* - * In below hash_alg_prefix_len assignment +1 is for the additional char (':'), - * when prefixing the hash value with the hash algorithm name. e.g. sha256:. 
- */ - const size_t hash_alg_prefix_len = strlen(DM_IMA_TABLE_HASH_ALG) + 1; char table_load_event_name[] = "dm_table_load"; ima_buf = dm_ima_alloc(DM_IMA_MEASUREMENT_BUF_LEN, noio); @@ -210,19 +202,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl if (dm_ima_alloc_and_copy_device_data(table->md, &device_data_buf, num_targets, noio)) goto error; - tfm = crypto_alloc_shash(DM_IMA_TABLE_HASH_ALG, 0, 0); - if (IS_ERR(tfm)) - goto error; - - shash->tfm = tfm; - digest_size = crypto_shash_digestsize(tfm); - digest = dm_ima_alloc(digest_size, noio); - if (!digest) - goto error; - - r = crypto_shash_init(shash); - if (r) - goto error; + sha256_init(&hash_ctx); memcpy(ima_buf + l, DM_IMA_VERSION_STR, table->md->ima.dm_version_str_len); l += table->md->ima.dm_version_str_len; @@ -270,9 +250,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl */ if (unlikely(cur_total_buf_len >= DM_IMA_MEASUREMENT_BUF_LEN)) { dm_ima_measure_data(table_load_event_name, ima_buf, l, noio); - r = crypto_shash_update(shash, (const u8 *)ima_buf, l); - if (r < 0) - goto error; + sha256_update(&hash_ctx, (const u8 *)ima_buf, l); memset(ima_buf, 0, DM_IMA_MEASUREMENT_BUF_LEN); l = 0; @@ -311,9 +289,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl if (!last_target_measured) { dm_ima_measure_data(table_load_event_name, ima_buf, l, noio); - r = crypto_shash_update(shash, (const u8 *)ima_buf, l); - if (r < 0) - goto error; + sha256_update(&hash_ctx, (const u8 *)ima_buf, l); } /* @@ -321,20 +297,13 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl * so that the table data can be verified against the future device state change * events, e.g. resume, rename, remove, table-clear etc. 
*/ - r = crypto_shash_final(shash, digest); - if (r < 0) - goto error; - - digest_buf = dm_ima_alloc((digest_size*2) + hash_alg_prefix_len + 1, noio); + sha256_final(&hash_ctx, digest); + digest_buf = kasprintf(GFP_KERNEL, "sha256:%*phN", SHA256_DIGEST_SIZE, + digest); if (!digest_buf) goto error; - snprintf(digest_buf, hash_alg_prefix_len + 1, "%s:", DM_IMA_TABLE_HASH_ALG); - - for (i = 0; i < digest_size; i++) - snprintf((digest_buf + hash_alg_prefix_len + (i*2)), 3, "%02x", digest[i]); - if (table->md->ima.active_table.hash != table->md->ima.inactive_table.hash) kfree(table->md->ima.inactive_table.hash); @@ -354,9 +323,6 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl kfree(digest_buf); kfree(device_data_buf); exit: - kfree(digest); - if (tfm) - crypto_free_shash(tfm); kfree(ima_buf); kfree(target_metadata_buf); kfree(target_data_buf); diff --git a/drivers/md/dm-ima.h b/drivers/md/dm-ima.h index 568870a1a145..a403deca6093 100644 --- a/drivers/md/dm-ima.h +++ b/drivers/md/dm-ima.h @@ -15,7 +15,6 @@ #define DM_IMA_TARGET_METADATA_BUF_LEN 128 #define DM_IMA_TARGET_DATA_BUF_LEN 2048 #define DM_IMA_DEVICE_CAPACITY_BUF_LEN 128 -#define DM_IMA_TABLE_HASH_ALG "sha256" #define __dm_ima_stringify(s) #s #define __dm_ima_str(s) __dm_ima_stringify(s) From 6af58aa3b028e364c0a8f8b6be48fca17e571de3 Mon Sep 17 00:00:00 2001 From: Ken Raeburn Date: Thu, 26 Feb 2026 20:12:06 -0500 Subject: [PATCH 12/62] dm vdo: update vdo_allocate_extended to take a field name, no types All of VDO's "extended" allocations use a flexible array field at the end of the allocated structure. We can infer the struct type from the supplied pointer. Replacing the array field type with the field name lets us use struct_size from overflow.h to compute the size instead of the local __vdo_do_allocation version. One allocation of bio structures doesn't conform to this pattern, since the removal of bi_inline_vecs; directly compute the total size for that case. 
Signed-off-by: Ken Raeburn Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/block-map.c | 16 +++++--------- drivers/md/dm-vdo/data-vio.c | 3 +-- drivers/md/dm-vdo/dedupe.c | 3 +-- drivers/md/dm-vdo/indexer/index-layout.c | 11 ++++------ drivers/md/dm-vdo/indexer/index.c | 7 ++---- drivers/md/dm-vdo/indexer/open-chapter.c | 4 +--- drivers/md/dm-vdo/indexer/radix-sort.c | 3 +-- drivers/md/dm-vdo/io-submitter.c | 3 +-- drivers/md/dm-vdo/logical-zone.c | 3 +-- drivers/md/dm-vdo/memory-alloc.h | 27 ++++++++---------------- drivers/md/dm-vdo/packer.c | 7 +++--- drivers/md/dm-vdo/physical-zone.c | 6 ++---- drivers/md/dm-vdo/priority-table.c | 3 +-- drivers/md/dm-vdo/recovery-journal.c | 6 ++---- drivers/md/dm-vdo/repair.c | 4 +--- drivers/md/dm-vdo/slab-depot.c | 5 ++--- drivers/md/dm-vdo/vio.c | 7 +++--- 17 files changed, 40 insertions(+), 78 deletions(-) diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c index a7db5b41155e..25eda82a9635 100644 --- a/drivers/md/dm-vdo/block-map.c +++ b/drivers/md/dm-vdo/block-map.c @@ -2478,9 +2478,7 @@ static int make_forest(struct block_map *map, block_count_t entries) return VDO_SUCCESS; } - result = vdo_allocate_extended(struct forest, map->root_count, - struct block_map_tree, __func__, - &forest); + result = vdo_allocate_extended(map->root_count, trees, __func__, &forest); if (result != VDO_SUCCESS) return result; @@ -2707,8 +2705,7 @@ void vdo_traverse_forest(struct block_map *map, vdo_entry_callback_fn callback, struct cursors *cursors; int result; - result = vdo_allocate_extended(struct cursors, map->root_count, - struct cursor, __func__, &cursors); + result = vdo_allocate_extended(map->root_count, cursors, __func__, &cursors); if (result != VDO_SUCCESS) { vdo_fail_completion(completion, result); return; @@ -2758,9 +2755,7 @@ static int __must_check initialize_block_map_zone(struct block_map *map, zone->thread_id = vdo->thread_config.logical_threads[zone_number]; zone->block_map = map; - result = 
vdo_allocate_extended(struct dirty_lists, maximum_age, - dirty_era_t, __func__, - &zone->dirty_lists); + result = vdo_allocate_extended(maximum_age, eras, __func__, &zone->dirty_lists); if (result != VDO_SUCCESS) return result; @@ -2900,9 +2895,8 @@ int vdo_decode_block_map(struct block_map_state_2_0 state, block_count_t logical if (result != VDO_SUCCESS) return result; - result = vdo_allocate_extended(struct block_map, - vdo->thread_config.logical_zone_count, - struct block_map_zone, __func__, &map); + result = vdo_allocate_extended(vdo->thread_config.logical_zone_count, + zones, __func__, &map); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c index 3333e1e5b02e..370d4239ba31 100644 --- a/drivers/md/dm-vdo/data-vio.c +++ b/drivers/md/dm-vdo/data-vio.c @@ -842,8 +842,7 @@ int make_data_vio_pool(struct vdo *vdo, data_vio_count_t pool_size, struct data_vio_pool *pool; data_vio_count_t i; - result = vdo_allocate_extended(struct data_vio_pool, pool_size, struct data_vio, - __func__, &pool); + result = vdo_allocate_extended(pool_size, data_vios, __func__, &pool); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c index 75a26f3f4461..36e9f1236025 100644 --- a/drivers/md/dm-vdo/dedupe.c +++ b/drivers/md/dm-vdo/dedupe.c @@ -2418,8 +2418,7 @@ int vdo_make_hash_zones(struct vdo *vdo, struct hash_zones **zones_ptr) if (zone_count == 0) return VDO_SUCCESS; - result = vdo_allocate_extended(struct hash_zones, zone_count, struct hash_zone, - __func__, &zones); + result = vdo_allocate_extended(zone_count, zones, __func__, &zones); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/index-layout.c b/drivers/md/dm-vdo/indexer/index-layout.c index 37144249f7ba..76bcc0ae76cc 100644 --- a/drivers/md/dm-vdo/indexer/index-layout.c +++ b/drivers/md/dm-vdo/indexer/index-layout.c @@ -459,8 +459,7 @@ static int __must_check 
make_index_save_region_table(struct index_save_layout *i type = RH_TYPE_UNSAVED; } - result = vdo_allocate_extended(struct region_table, region_count, - struct layout_region, + result = vdo_allocate_extended(region_count, regions, "layout region table for ISL", &table); if (result != VDO_SUCCESS) return result; @@ -642,9 +641,8 @@ static int __must_check make_layout_region_table(struct index_layout *layout, struct region_table *table; struct layout_region *lr; - result = vdo_allocate_extended(struct region_table, region_count, - struct layout_region, "layout region table", - &table); + result = vdo_allocate_extended(region_count, regions, + "layout region table", &table); if (result != VDO_SUCCESS) return result; @@ -1138,8 +1136,7 @@ static int __must_check load_region_table(struct buffered_reader *reader, header.version); } - result = vdo_allocate_extended(struct region_table, header.region_count, - struct layout_region, + result = vdo_allocate_extended(header.region_count, regions, "single file layout region table", &table); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/index.c b/drivers/md/dm-vdo/indexer/index.c index df4934846244..d4724fe17bf1 100644 --- a/drivers/md/dm-vdo/indexer/index.c +++ b/drivers/md/dm-vdo/indexer/index.c @@ -764,9 +764,7 @@ static int make_chapter_writer(struct uds_index *index, size_t collated_records_size = (sizeof(struct uds_volume_record) * index->volume->geometry->records_per_chapter); - result = vdo_allocate_extended(struct chapter_writer, index->zone_count, - struct open_chapter_zone *, "Chapter Writer", - &writer); + result = vdo_allocate_extended(index->zone_count, chapters, "Chapter Writer", &writer); if (result != VDO_SUCCESS) return result; @@ -1160,8 +1158,7 @@ int uds_make_index(struct uds_configuration *config, enum uds_open_index_type op u64 nonce; unsigned int z; - result = vdo_allocate_extended(struct uds_index, config->zone_count, - struct uds_request_queue *, "index", &index); + 
result = vdo_allocate_extended(config->zone_count, zone_queues, "index", &index); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/open-chapter.c b/drivers/md/dm-vdo/indexer/open-chapter.c index 4a67bcadaae0..89b91c600bfd 100644 --- a/drivers/md/dm-vdo/indexer/open-chapter.c +++ b/drivers/md/dm-vdo/indexer/open-chapter.c @@ -68,9 +68,7 @@ int uds_make_open_chapter(const struct index_geometry *geometry, unsigned int zo size_t capacity = geometry->records_per_chapter / zone_count; size_t slot_count = (1 << bits_per(capacity * LOAD_RATIO)); - result = vdo_allocate_extended(struct open_chapter_zone, slot_count, - struct open_chapter_zone_slot, "open chapter", - &open_chapter); + result = vdo_allocate_extended(slot_count, slots, "open chapter", &open_chapter); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/radix-sort.c b/drivers/md/dm-vdo/indexer/radix-sort.c index 66b8c706a1ef..4b81e130d18a 100644 --- a/drivers/md/dm-vdo/indexer/radix-sort.c +++ b/drivers/md/dm-vdo/indexer/radix-sort.c @@ -211,8 +211,7 @@ int uds_make_radix_sorter(unsigned int count, struct radix_sorter **sorter) unsigned int stack_size = count / INSERTION_SORT_THRESHOLD; struct radix_sorter *radix_sorter; - result = vdo_allocate_extended(struct radix_sorter, stack_size, struct task, - __func__, &radix_sorter); + result = vdo_allocate_extended(stack_size, stack, __func__, &radix_sorter); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c index e26d75f8366d..0e9932929fee 100644 --- a/drivers/md/dm-vdo/io-submitter.c +++ b/drivers/md/dm-vdo/io-submitter.c @@ -383,8 +383,7 @@ int vdo_make_io_submitter(unsigned int thread_count, unsigned int rotation_inter struct io_submitter *io_submitter; int result; - result = vdo_allocate_extended(struct io_submitter, thread_count, - struct bio_queue_data, "bio submission data", + result = vdo_allocate_extended(thread_count, 
bio_queue_data, "bio submission data", &io_submitter); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/logical-zone.c b/drivers/md/dm-vdo/logical-zone.c index 0a27e60a9dfd..fa7c3eb7ee6b 100644 --- a/drivers/md/dm-vdo/logical-zone.c +++ b/drivers/md/dm-vdo/logical-zone.c @@ -94,8 +94,7 @@ int vdo_make_logical_zones(struct vdo *vdo, struct logical_zones **zones_ptr) if (zone_count == 0) return VDO_SUCCESS; - result = vdo_allocate_extended(struct logical_zones, zone_count, - struct logical_zone, __func__, &zones); + result = vdo_allocate_extended(zone_count, zones, __func__, &zones); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/memory-alloc.h b/drivers/md/dm-vdo/memory-alloc.h index 0093d9f940d9..ab2375d549f4 100644 --- a/drivers/md/dm-vdo/memory-alloc.h +++ b/drivers/md/dm-vdo/memory-alloc.h @@ -8,6 +8,7 @@ #include #include /* for PAGE_SIZE */ +#include #include "permassert.h" #include "thread-registry.h" @@ -71,31 +72,21 @@ static inline int __vdo_do_allocation(size_t count, size_t size, size_t extra, __vdo_do_allocation(COUNT, sizeof(TYPE), 0, __alignof__(TYPE), WHAT, PTR) /* - * Allocate one object of an indicated type, followed by one or more elements of a second type, - * logging an error if the allocation fails. The memory will be zeroed. + * Allocate a structure with a flexible array member, with a specified number of elements, logging + * an error if the allocation fails. The memory will be zeroed. * - * @TYPE1: The type of the primary object to allocate. This type determines the alignment of the - * allocated memory. 
* @COUNT: The number of objects to allocate - * @TYPE2: The type of array objects to allocate + * @FIELD: The flexible array field at the end of the structure * @WHAT: What is being allocated (for error logging) * @PTR: A pointer to hold the allocated memory * * Return: VDO_SUCCESS or an error code */ -#define vdo_allocate_extended(TYPE1, COUNT, TYPE2, WHAT, PTR) \ - __extension__({ \ - int _result; \ - TYPE1 **_ptr = (PTR); \ - BUILD_BUG_ON(__alignof__(TYPE1) < __alignof__(TYPE2)); \ - _result = __vdo_do_allocation(COUNT, \ - sizeof(TYPE2), \ - sizeof(TYPE1), \ - __alignof__(TYPE1), \ - WHAT, \ - _ptr); \ - _result; \ - }) +#define vdo_allocate_extended(COUNT, FIELD, WHAT, PTR) \ + vdo_allocate_memory(struct_size(*(PTR), FIELD, (COUNT)), \ + __alignof__(typeof(**(PTR))), \ + WHAT, \ + (PTR)) /* * Allocate memory starting on a cache line boundary, logging an error if the allocation fails. The diff --git a/drivers/md/dm-vdo/packer.c b/drivers/md/dm-vdo/packer.c index 666be6d557e1..e638694d896c 100644 --- a/drivers/md/dm-vdo/packer.c +++ b/drivers/md/dm-vdo/packer.c @@ -120,8 +120,7 @@ static int __must_check make_bin(struct packer *packer) struct packer_bin *bin; int result; - result = vdo_allocate_extended(struct packer_bin, VDO_MAX_COMPRESSION_SLOTS, - struct vio *, __func__, &bin); + result = vdo_allocate_extended(VDO_MAX_COMPRESSION_SLOTS, incoming, __func__, &bin); if (result != VDO_SUCCESS) return result; @@ -168,8 +167,8 @@ int vdo_make_packer(struct vdo *vdo, block_count_t bin_count, struct packer **pa * bin must have a canceler for which it is waiting, and any canceler will only have * canceled one lock holder at a time. 
*/ - result = vdo_allocate_extended(struct packer_bin, MAXIMUM_VDO_USER_VIOS / 2, - struct vio *, __func__, &packer->canceled_bin); + result = vdo_allocate_extended(MAXIMUM_VDO_USER_VIOS / 2, incoming, __func__, + &packer->canceled_bin); if (result != VDO_SUCCESS) { vdo_free_packer(packer); return result; diff --git a/drivers/md/dm-vdo/physical-zone.c b/drivers/md/dm-vdo/physical-zone.c index 686eb7d714e6..a8c7a57516eb 100644 --- a/drivers/md/dm-vdo/physical-zone.c +++ b/drivers/md/dm-vdo/physical-zone.c @@ -240,8 +240,7 @@ static int make_pbn_lock_pool(size_t capacity, struct pbn_lock_pool **pool_ptr) struct pbn_lock_pool *pool; int result; - result = vdo_allocate_extended(struct pbn_lock_pool, capacity, idle_pbn_lock, - __func__, &pool); + result = vdo_allocate_extended(capacity, locks, __func__, &pool); if (result != VDO_SUCCESS) return result; @@ -368,8 +367,7 @@ int vdo_make_physical_zones(struct vdo *vdo, struct physical_zones **zones_ptr) if (zone_count == 0) return VDO_SUCCESS; - result = vdo_allocate_extended(struct physical_zones, zone_count, - struct physical_zone, __func__, &zones); + result = vdo_allocate_extended(zone_count, zones, __func__, &zones); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/priority-table.c b/drivers/md/dm-vdo/priority-table.c index 9bae8256ba4e..bb8a878ce4e5 100644 --- a/drivers/md/dm-vdo/priority-table.c +++ b/drivers/md/dm-vdo/priority-table.c @@ -60,8 +60,7 @@ int vdo_make_priority_table(unsigned int max_priority, struct priority_table **t if (max_priority > MAX_PRIORITY) return UDS_INVALID_ARGUMENT; - result = vdo_allocate_extended(struct priority_table, max_priority + 1, - struct bucket, __func__, &table); + result = vdo_allocate_extended(max_priority + 1, buckets, __func__, &table); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/recovery-journal.c b/drivers/md/dm-vdo/recovery-journal.c index 9cc0f0ff1664..6da303961376 100644 --- a/drivers/md/dm-vdo/recovery-journal.c +++ 
b/drivers/md/dm-vdo/recovery-journal.c @@ -711,10 +711,8 @@ int vdo_decode_recovery_journal(struct recovery_journal_state_7_0 state, nonce_t struct recovery_journal *journal; int result; - result = vdo_allocate_extended(struct recovery_journal, - RECOVERY_JOURNAL_RESERVED_BLOCKS, - struct recovery_journal_block, __func__, - &journal); + result = vdo_allocate_extended(RECOVERY_JOURNAL_RESERVED_BLOCKS, blocks, + __func__, &journal); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/repair.c b/drivers/md/dm-vdo/repair.c index 8c006fb3afcf..e479d3582040 100644 --- a/drivers/md/dm-vdo/repair.c +++ b/drivers/md/dm-vdo/repair.c @@ -1715,9 +1715,7 @@ void vdo_repair(struct vdo_completion *parent) vdo_log_warning("Device was dirty, rebuilding reference counts"); } - result = vdo_allocate_extended(struct repair_completion, page_count, - struct vdo_page_completion, __func__, - &repair); + result = vdo_allocate_extended(page_count, page_completions, __func__, &repair); if (result != VDO_SUCCESS) { vdo_fail_completion(parent, result); return; diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c index ad00afc2c168..286fc4465a92 100644 --- a/drivers/md/dm-vdo/slab-depot.c +++ b/drivers/md/dm-vdo/slab-depot.c @@ -4266,9 +4266,8 @@ int vdo_decode_slab_depot(struct slab_depot_state_2_0 state, struct vdo *vdo, return vdo_log_error_strerror(UDS_CORRUPT_DATA, "invalid zone count"); - result = vdo_allocate_extended(struct slab_depot, - vdo->thread_config.physical_zone_count, - struct block_allocator, __func__, &depot); + result = vdo_allocate_extended(vdo->thread_config.physical_zone_count, + allocators, __func__, &depot); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c index 5ffc867d9c5e..cc739d52a70c 100644 --- a/drivers/md/dm-vdo/vio.c +++ b/drivers/md/dm-vdo/vio.c @@ -52,8 +52,8 @@ static int create_multi_block_bio(block_count_t size, struct bio **bio_ptr) struct bio *bio = NULL; 
int result; - result = vdo_allocate_extended(struct bio, size + 1, struct bio_vec, - "bio", &bio); + result = vdo_allocate_memory(sizeof(struct bio) + sizeof(struct bio_vec) * (size + 1), + __alignof__(struct bio), "bio", &bio); if (result != VDO_SUCCESS) return result; @@ -327,8 +327,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count, thread_ int result; size_t per_vio_size = VDO_BLOCK_SIZE * block_count; - result = vdo_allocate_extended(struct vio_pool, pool_size, struct pooled_vio, - __func__, &pool); + result = vdo_allocate_extended(pool_size, vios, __func__, &pool); if (result != VDO_SUCCESS) return result; From 9bb388b1a95751c4a4a99a4dab1b21136a4eeb96 Mon Sep 17 00:00:00 2001 From: Ken Raeburn Date: Thu, 26 Feb 2026 20:12:07 -0500 Subject: [PATCH 13/62] dm vdo: remove redundant TYPE argument from vdo_allocate macro We can infer the type needed from the supplied pointer argument. A couple invocation sites needed fixing to supply the proper type of pointer. Use overflow.h's size_mul, and we can remove the __vdo_do_allocation wrapper which did the same overflow check. 
Signed-off-by: Ken Raeburn Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/action-manager.c | 2 +- drivers/md/dm-vdo/block-map.c | 16 ++----- drivers/md/dm-vdo/dedupe.c | 3 +- drivers/md/dm-vdo/dm-vdo-target.c | 12 ++--- drivers/md/dm-vdo/encodings.c | 2 +- drivers/md/dm-vdo/flush.c | 4 +- drivers/md/dm-vdo/funnel-queue.c | 2 +- drivers/md/dm-vdo/funnel-workqueue.c | 8 ++-- drivers/md/dm-vdo/indexer/chapter-index.c | 2 +- drivers/md/dm-vdo/indexer/config.c | 2 +- drivers/md/dm-vdo/indexer/delta-index.c | 13 ++--- .../md/dm-vdo/indexer/funnel-requestqueue.c | 2 +- drivers/md/dm-vdo/indexer/geometry.c | 2 +- drivers/md/dm-vdo/indexer/index-layout.c | 14 +++--- drivers/md/dm-vdo/indexer/index-page-map.c | 8 ++-- drivers/md/dm-vdo/indexer/index-session.c | 2 +- drivers/md/dm-vdo/indexer/index.c | 7 ++- drivers/md/dm-vdo/indexer/io-factory.c | 6 +-- drivers/md/dm-vdo/indexer/sparse-cache.c | 10 ++-- drivers/md/dm-vdo/indexer/volume-index.c | 10 ++-- drivers/md/dm-vdo/indexer/volume.c | 22 ++++----- drivers/md/dm-vdo/int-map.c | 5 +- drivers/md/dm-vdo/memory-alloc.c | 8 ++-- drivers/md/dm-vdo/memory-alloc.h | 47 ++----------------- drivers/md/dm-vdo/message-stats.c | 2 +- drivers/md/dm-vdo/packer.c | 2 +- drivers/md/dm-vdo/recovery-journal.c | 17 +++---- drivers/md/dm-vdo/repair.c | 11 ++--- drivers/md/dm-vdo/slab-depot.c | 32 +++++-------- drivers/md/dm-vdo/thread-utils.c | 2 +- drivers/md/dm-vdo/vdo.c | 32 ++++++------- drivers/md/dm-vdo/vio.c | 5 +- 32 files changed, 114 insertions(+), 198 deletions(-) diff --git a/drivers/md/dm-vdo/action-manager.c b/drivers/md/dm-vdo/action-manager.c index e3bba0b28aad..b8a3977b815d 100644 --- a/drivers/md/dm-vdo/action-manager.c +++ b/drivers/md/dm-vdo/action-manager.c @@ -107,7 +107,7 @@ int vdo_make_action_manager(zone_count_t zones, struct action_manager **manager_ptr) { struct action_manager *manager; - int result = vdo_allocate(1, struct action_manager, __func__, &manager); + int result = vdo_allocate(1, __func__, 
&manager); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c index 25eda82a9635..5ffc360540ed 100644 --- a/drivers/md/dm-vdo/block-map.c +++ b/drivers/md/dm-vdo/block-map.c @@ -221,8 +221,7 @@ static int __must_check allocate_cache_components(struct vdo_page_cache *cache) u64 size = cache->page_count * (u64) VDO_BLOCK_SIZE; int result; - result = vdo_allocate(cache->page_count, struct page_info, "page infos", - &cache->infos); + result = vdo_allocate(cache->page_count, "page infos", &cache->infos); if (result != VDO_SUCCESS) return result; @@ -2364,18 +2363,15 @@ static int make_segment(struct forest *old_forest, block_count_t new_pages, forest->segments = index + 1; - result = vdo_allocate(forest->segments, struct boundary, - "forest boundary array", &forest->boundaries); + result = vdo_allocate(forest->segments, "forest boundary array", &forest->boundaries); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(forest->segments, struct tree_page *, - "forest page pointers", &forest->pages); + result = vdo_allocate(forest->segments, "forest page pointers", &forest->pages); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(new_pages, struct tree_page, - "new forest pages", &forest->pages[index]); + result = vdo_allocate(new_pages, "new forest pages", &forest->pages[index]); if (result != VDO_SUCCESS) return result; @@ -2400,9 +2396,7 @@ static int make_segment(struct forest *old_forest, block_count_t new_pages, struct block_map_tree *tree = &(forest->trees[root]); height_t height; - int result = vdo_allocate(forest->segments, - struct block_map_tree_segment, - "tree root segments", &tree->segments); + result = vdo_allocate(forest->segments, "tree root segments", &tree->segments); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c index 36e9f1236025..4e0fefd077d0 100644 --- a/drivers/md/dm-vdo/dedupe.c +++ 
b/drivers/md/dm-vdo/dedupe.c @@ -2364,8 +2364,7 @@ static int __must_check initialize_zone(struct vdo *vdo, struct hash_zones *zone vdo_set_completion_callback(&zone->completion, timeout_index_operations_callback, zone->thread_id); INIT_LIST_HEAD(&zone->lock_pool); - result = vdo_allocate(LOCK_POOL_CAPACITY, struct hash_lock, "hash_lock array", - &zone->lock_array); + result = vdo_allocate(LOCK_POOL_CAPACITY, "hash_lock array", &zone->lock_array); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/dm-vdo-target.c b/drivers/md/dm-vdo/dm-vdo-target.c index 6af40d40f255..7eb676e58ed5 100644 --- a/drivers/md/dm-vdo/dm-vdo-target.c +++ b/drivers/md/dm-vdo/dm-vdo-target.c @@ -273,8 +273,7 @@ static int split_string(const char *string, char separator, char ***substring_ar substring_count++; } - result = vdo_allocate(substring_count + 1, char *, "string-splitting array", - &substrings); + result = vdo_allocate(substring_count + 1, "string-splitting array", &substrings); if (result != VDO_SUCCESS) return result; @@ -282,7 +281,7 @@ static int split_string(const char *string, char separator, char ***substring_ar if (*s == separator) { ptrdiff_t length = s - string; - result = vdo_allocate(length + 1, char, "split string", + result = vdo_allocate(length + 1, "split string", &substrings[current_substring]); if (result != VDO_SUCCESS) { free_string_array(substrings); @@ -303,8 +302,7 @@ static int split_string(const char *string, char separator, char ***substring_ar BUG_ON(current_substring != (substring_count - 1)); length = strlen(string); - result = vdo_allocate(length + 1, char, "split string", - &substrings[current_substring]); + result = vdo_allocate(length + 1, "split string", &substrings[current_substring]); if (result != VDO_SUCCESS) { free_string_array(substrings); return result; @@ -332,7 +330,7 @@ static int join_strings(char **substring_array, size_t array_length, char separa for (i = 0; (i < array_length) && (substring_array[i] != NULL); i++) 
string_length += strlen(substring_array[i]) + 1; - result = vdo_allocate(string_length, char, __func__, &output); + result = vdo_allocate(string_length, __func__, &output); if (result != VDO_SUCCESS) return result; @@ -726,7 +724,7 @@ static int parse_device_config(int argc, char **argv, struct dm_target *ti, return VDO_BAD_CONFIGURATION; } - result = vdo_allocate(1, struct device_config, "device_config", &config); + result = vdo_allocate(1, "device_config", &config); if (result != VDO_SUCCESS) { handle_parse_error(config, error_ptr, "Could not allocate config structure"); diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c index bd60f4b3a0d0..ec98c539701e 100644 --- a/drivers/md/dm-vdo/encodings.c +++ b/drivers/md/dm-vdo/encodings.c @@ -798,7 +798,7 @@ static int allocate_partition(struct layout *layout, u8 id, struct partition *partition; int result; - result = vdo_allocate(1, struct partition, __func__, &partition); + result = vdo_allocate(1, __func__, &partition); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/flush.c b/drivers/md/dm-vdo/flush.c index 82a259ef1601..6c1610ba91b6 100644 --- a/drivers/md/dm-vdo/flush.c +++ b/drivers/md/dm-vdo/flush.c @@ -105,7 +105,7 @@ static void *allocate_flush(gfp_t gfp_mask, void *pool_data) if ((gfp_mask & GFP_NOWAIT) == GFP_NOWAIT) { flush = vdo_allocate_memory_nowait(sizeof(struct vdo_flush), __func__); } else { - int result = vdo_allocate(1, struct vdo_flush, __func__, &flush); + int result = vdo_allocate(1, __func__, &flush); if (result != VDO_SUCCESS) vdo_log_error_strerror(result, "failed to allocate spare flush"); @@ -134,7 +134,7 @@ static void free_flush(void *element, void *pool_data __always_unused) */ int vdo_make_flusher(struct vdo *vdo) { - int result = vdo_allocate(1, struct flusher, __func__, &vdo->flusher); + int result = vdo_allocate(1, __func__, &vdo->flusher); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/funnel-queue.c 
b/drivers/md/dm-vdo/funnel-queue.c index a63b2f2bfd7d..7011963c9073 100644 --- a/drivers/md/dm-vdo/funnel-queue.c +++ b/drivers/md/dm-vdo/funnel-queue.c @@ -14,7 +14,7 @@ int vdo_make_funnel_queue(struct funnel_queue **queue_ptr) int result; struct funnel_queue *queue; - result = vdo_allocate(1, struct funnel_queue, "funnel queue", &queue); + result = vdo_allocate(1, "funnel queue", &queue); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/funnel-workqueue.c b/drivers/md/dm-vdo/funnel-workqueue.c index 8a79b33b8b09..62d300f70de9 100644 --- a/drivers/md/dm-vdo/funnel-workqueue.c +++ b/drivers/md/dm-vdo/funnel-workqueue.c @@ -322,7 +322,7 @@ static int make_simple_work_queue(const char *thread_name_prefix, const char *na "queue priority count %u within limit %u", type->max_priority, VDO_WORK_Q_MAX_PRIORITY); - result = vdo_allocate(1, struct simple_work_queue, "simple work queue", &queue); + result = vdo_allocate(1, "simple work queue", &queue); if (result != VDO_SUCCESS) return result; @@ -405,13 +405,11 @@ int vdo_make_work_queue(const char *thread_name_prefix, const char *name, return result; } - result = vdo_allocate(1, struct round_robin_work_queue, "round-robin work queue", - &queue); + result = vdo_allocate(1, "round-robin work queue", &queue); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(thread_count, struct simple_work_queue *, - "subordinate work queues", &queue->service_queues); + result = vdo_allocate(thread_count, "subordinate work queues", &queue->service_queues); if (result != VDO_SUCCESS) { vdo_free(queue); return result; diff --git a/drivers/md/dm-vdo/indexer/chapter-index.c b/drivers/md/dm-vdo/indexer/chapter-index.c index fb1db41c794b..bb3b0ab5d50d 100644 --- a/drivers/md/dm-vdo/indexer/chapter-index.c +++ b/drivers/md/dm-vdo/indexer/chapter-index.c @@ -20,7 +20,7 @@ int uds_make_open_chapter_index(struct open_chapter_index **chapter_index, size_t memory_size; struct open_chapter_index *index; - result = 
vdo_allocate(1, struct open_chapter_index, "open chapter index", &index); + result = vdo_allocate(1, "open chapter index", &index); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/config.c b/drivers/md/dm-vdo/indexer/config.c index 5532371b952f..4a2cc66cfd60 100644 --- a/drivers/md/dm-vdo/indexer/config.c +++ b/drivers/md/dm-vdo/indexer/config.c @@ -325,7 +325,7 @@ int uds_make_configuration(const struct uds_parameters *params, if (result != UDS_SUCCESS) return result; - result = vdo_allocate(1, struct uds_configuration, __func__, &config); + result = vdo_allocate(1, __func__, &config); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/delta-index.c b/drivers/md/dm-vdo/indexer/delta-index.c index 0ac2443f0df3..b288749067de 100644 --- a/drivers/md/dm-vdo/indexer/delta-index.c +++ b/drivers/md/dm-vdo/indexer/delta-index.c @@ -311,18 +311,16 @@ static int initialize_delta_zone(struct delta_zone *delta_zone, size_t size, { int result; - result = vdo_allocate(size, u8, "delta list", &delta_zone->memory); + result = vdo_allocate(size, "delta list", &delta_zone->memory); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(list_count + 2, u64, "delta list temp", - &delta_zone->new_offsets); + result = vdo_allocate(list_count + 2, "delta list temp", &delta_zone->new_offsets); if (result != VDO_SUCCESS) return result; /* Allocate the delta lists. 
*/ - result = vdo_allocate(list_count + 2, struct delta_list, "delta lists", - &delta_zone->delta_lists); + result = vdo_allocate(list_count + 2, "delta lists", &delta_zone->delta_lists); if (result != VDO_SUCCESS) return result; @@ -352,8 +350,7 @@ int uds_initialize_delta_index(struct delta_index *delta_index, unsigned int zon unsigned int z; size_t zone_memory; - result = vdo_allocate(zone_count, struct delta_zone, "Delta Index Zones", - &delta_index->delta_zones); + result = vdo_allocate(zone_count, "Delta Index Zones", &delta_index->delta_zones); if (result != VDO_SUCCESS) return result; @@ -1047,7 +1044,7 @@ int uds_finish_restoring_delta_index(struct delta_index *delta_index, unsigned int z; u8 *data; - result = vdo_allocate(DELTA_LIST_MAX_BYTE_COUNT, u8, __func__, &data); + result = vdo_allocate(DELTA_LIST_MAX_BYTE_COUNT, __func__, &data); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/funnel-requestqueue.c b/drivers/md/dm-vdo/indexer/funnel-requestqueue.c index 1a5735375ddc..03797cf87b91 100644 --- a/drivers/md/dm-vdo/indexer/funnel-requestqueue.c +++ b/drivers/md/dm-vdo/indexer/funnel-requestqueue.c @@ -198,7 +198,7 @@ int uds_make_request_queue(const char *queue_name, int result; struct uds_request_queue *queue; - result = vdo_allocate(1, struct uds_request_queue, __func__, &queue); + result = vdo_allocate(1, __func__, &queue); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/geometry.c b/drivers/md/dm-vdo/indexer/geometry.c index c0575612e820..49f122a223d5 100644 --- a/drivers/md/dm-vdo/indexer/geometry.c +++ b/drivers/md/dm-vdo/indexer/geometry.c @@ -61,7 +61,7 @@ int uds_make_index_geometry(size_t bytes_per_page, u32 record_pages_per_chapter, int result; struct index_geometry *geometry; - result = vdo_allocate(1, struct index_geometry, "geometry", &geometry); + result = vdo_allocate(1, "geometry", &geometry); if (result != VDO_SUCCESS) return result; diff --git 
a/drivers/md/dm-vdo/indexer/index-layout.c b/drivers/md/dm-vdo/indexer/index-layout.c index 76bcc0ae76cc..7a1209b21c03 100644 --- a/drivers/md/dm-vdo/indexer/index-layout.c +++ b/drivers/md/dm-vdo/indexer/index-layout.c @@ -519,7 +519,7 @@ static int __must_check write_index_save_header(struct index_save_layout *isl, u8 *buffer; size_t offset = 0; - result = vdo_allocate(table->encoded_size, u8, "index save data", &buffer); + result = vdo_allocate(table->encoded_size, "index save data", &buffer); if (result != VDO_SUCCESS) return result; @@ -688,7 +688,7 @@ static int __must_check write_layout_header(struct index_layout *layout, u8 *buffer; size_t offset = 0; - result = vdo_allocate(table->encoded_size, u8, "layout data", &buffer); + result = vdo_allocate(table->encoded_size, "layout data", &buffer); if (result != VDO_SUCCESS) return result; @@ -778,8 +778,7 @@ static int create_index_layout(struct index_layout *layout, struct uds_configura if (result != UDS_SUCCESS) return result; - result = vdo_allocate(sizes.save_count, struct index_save_layout, __func__, - &layout->index.saves); + result = vdo_allocate(sizes.save_count, __func__, &layout->index.saves); if (result != VDO_SUCCESS) return result; @@ -1174,7 +1173,7 @@ static int __must_check read_super_block_data(struct buffered_reader *reader, u8 *buffer; size_t offset = 0; - result = vdo_allocate(saved_size, u8, "super block data", &buffer); + result = vdo_allocate(saved_size, "super block data", &buffer); if (result != VDO_SUCCESS) return result; @@ -1308,8 +1307,7 @@ static int __must_check reconstitute_layout(struct index_layout *layout, int result; u64 next_block = first_block; - result = vdo_allocate(layout->super.max_saves, struct index_save_layout, - __func__, &layout->index.saves); + result = vdo_allocate(layout->super.max_saves, __func__, &layout->index.saves); if (result != VDO_SUCCESS) return result; @@ -1672,7 +1670,7 @@ int uds_make_index_layout(struct uds_configuration *config, bool new_layout, if 
(result != UDS_SUCCESS) return result; - result = vdo_allocate(1, struct index_layout, __func__, &layout); + result = vdo_allocate(1, __func__, &layout); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/index-page-map.c b/drivers/md/dm-vdo/indexer/index-page-map.c index 00b44e07d0c1..1d45d466d07f 100644 --- a/drivers/md/dm-vdo/indexer/index-page-map.c +++ b/drivers/md/dm-vdo/indexer/index-page-map.c @@ -38,13 +38,13 @@ int uds_make_index_page_map(const struct index_geometry *geometry, int result; struct index_page_map *map; - result = vdo_allocate(1, struct index_page_map, "page map", &map); + result = vdo_allocate(1, "page map", &map); if (result != VDO_SUCCESS) return result; map->geometry = geometry; map->entries_per_chapter = geometry->index_pages_per_chapter - 1; - result = vdo_allocate(get_entry_count(geometry), u16, "Index Page Map Entries", + result = vdo_allocate(get_entry_count(geometry), "Index Page Map Entries", &map->entries); if (result != VDO_SUCCESS) { uds_free_index_page_map(map); @@ -118,7 +118,7 @@ int uds_write_index_page_map(struct index_page_map *map, struct buffered_writer u64 saved_size = uds_compute_index_page_map_save_size(map->geometry); u32 i; - result = vdo_allocate(saved_size, u8, "page map data", &buffer); + result = vdo_allocate(saved_size, "page map data", &buffer); if (result != VDO_SUCCESS) return result; @@ -145,7 +145,7 @@ int uds_read_index_page_map(struct index_page_map *map, struct buffered_reader * u64 saved_size = uds_compute_index_page_map_save_size(map->geometry); u32 i; - result = vdo_allocate(saved_size, u8, "page map data", &buffer); + result = vdo_allocate(saved_size, "page map data", &buffer); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/index-session.c b/drivers/md/dm-vdo/indexer/index-session.c index aa575a24e0b2..6c78070e1a05 100644 --- a/drivers/md/dm-vdo/indexer/index-session.c +++ b/drivers/md/dm-vdo/indexer/index-session.c @@ -217,7 +217,7 @@ 
static int __must_check make_empty_index_session(struct uds_index_session **inde int result; struct uds_index_session *session; - result = vdo_allocate(1, struct uds_index_session, __func__, &session); + result = vdo_allocate(1, __func__, &session); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/index.c b/drivers/md/dm-vdo/indexer/index.c index d4724fe17bf1..793bd32c1179 100644 --- a/drivers/md/dm-vdo/indexer/index.c +++ b/drivers/md/dm-vdo/indexer/index.c @@ -88,7 +88,7 @@ static int launch_zone_message(struct uds_zone_message message, unsigned int zon int result; struct uds_request *request; - result = vdo_allocate(1, struct uds_request, __func__, &request); + result = vdo_allocate(1, __func__, &request); if (result != VDO_SUCCESS) return result; @@ -1121,7 +1121,7 @@ static int make_index_zone(struct uds_index *index, unsigned int zone_number) int result; struct index_zone *zone; - result = vdo_allocate(1, struct index_zone, "index zone", &zone); + result = vdo_allocate(1, "index zone", &zone); if (result != VDO_SUCCESS) return result; @@ -1170,8 +1170,7 @@ int uds_make_index(struct uds_configuration *config, enum uds_open_index_type op return result; } - result = vdo_allocate(index->zone_count, struct index_zone *, "zones", - &index->zones); + result = vdo_allocate(index->zone_count, "zones", &index->zones); if (result != VDO_SUCCESS) { uds_free_index(index); return result; diff --git a/drivers/md/dm-vdo/indexer/io-factory.c b/drivers/md/dm-vdo/indexer/io-factory.c index 1bee9d63dc0a..f42861372030 100644 --- a/drivers/md/dm-vdo/indexer/io-factory.c +++ b/drivers/md/dm-vdo/indexer/io-factory.c @@ -64,7 +64,7 @@ int uds_make_io_factory(struct block_device *bdev, struct io_factory **factory_p int result; struct io_factory *factory; - result = vdo_allocate(1, struct io_factory, __func__, &factory); + result = vdo_allocate(1, __func__, &factory); if (result != VDO_SUCCESS) return result; @@ -144,7 +144,7 @@ int 
uds_make_buffered_reader(struct io_factory *factory, off_t offset, u64 block if (result != UDS_SUCCESS) return result; - result = vdo_allocate(1, struct buffered_reader, "buffered reader", &reader); + result = vdo_allocate(1, "buffered reader", &reader); if (result != VDO_SUCCESS) { dm_bufio_client_destroy(client); return result; @@ -282,7 +282,7 @@ int uds_make_buffered_writer(struct io_factory *factory, off_t offset, u64 block if (result != UDS_SUCCESS) return result; - result = vdo_allocate(1, struct buffered_writer, "buffered writer", &writer); + result = vdo_allocate(1, "buffered writer", &writer); if (result != VDO_SUCCESS) { dm_bufio_client_destroy(client); return result; diff --git a/drivers/md/dm-vdo/indexer/sparse-cache.c b/drivers/md/dm-vdo/indexer/sparse-cache.c index 28920167827c..eb62d3f01834 100644 --- a/drivers/md/dm-vdo/indexer/sparse-cache.c +++ b/drivers/md/dm-vdo/indexer/sparse-cache.c @@ -222,13 +222,12 @@ static int __must_check initialize_cached_chapter_index(struct cached_chapter_in chapter->virtual_chapter = NO_CHAPTER; chapter->index_pages_count = geometry->index_pages_per_chapter; - result = vdo_allocate(chapter->index_pages_count, struct delta_index_page, - __func__, &chapter->index_pages); + result = vdo_allocate(chapter->index_pages_count, __func__, &chapter->index_pages); if (result != VDO_SUCCESS) return result; - return vdo_allocate(chapter->index_pages_count, struct dm_buffer *, - "sparse index volume pages", &chapter->page_buffers); + return vdo_allocate(chapter->index_pages_count, "sparse index volume pages", + &chapter->page_buffers); } static int __must_check make_search_list(struct sparse_cache *cache, @@ -294,8 +293,7 @@ int uds_make_sparse_cache(const struct index_geometry *geometry, unsigned int ca } /* purge_search_list() needs some temporary lists for sorting. 
*/ - result = vdo_allocate(capacity * 2, struct cached_chapter_index *, - "scratch entries", &cache->scratch_entries); + result = vdo_allocate(capacity * 2, "scratch entries", &cache->scratch_entries); if (result != VDO_SUCCESS) goto out; diff --git a/drivers/md/dm-vdo/indexer/volume-index.c b/drivers/md/dm-vdo/indexer/volume-index.c index afb062e1f1fb..e78d2725ce8b 100644 --- a/drivers/md/dm-vdo/indexer/volume-index.c +++ b/drivers/md/dm-vdo/indexer/volume-index.c @@ -1211,13 +1211,12 @@ static int initialize_volume_sub_index(const struct uds_configuration *config, (zone_count * sizeof(struct volume_sub_index_zone))); /* The following arrays are initialized to all zeros. */ - result = vdo_allocate(params.list_count, u64, "first chapter to flush", + result = vdo_allocate(params.list_count, "first chapter to flush", &sub_index->flush_chapters); if (result != VDO_SUCCESS) return result; - return vdo_allocate(zone_count, struct volume_sub_index_zone, - "volume index zones", &sub_index->zones); + return vdo_allocate(zone_count, "volume index zones", &sub_index->zones); } int uds_make_volume_index(const struct uds_configuration *config, u64 volume_nonce, @@ -1228,7 +1227,7 @@ int uds_make_volume_index(const struct uds_configuration *config, u64 volume_non struct volume_index *volume_index; int result; - result = vdo_allocate(1, struct volume_index, "volume index", &volume_index); + result = vdo_allocate(1, "volume index", &volume_index); if (result != VDO_SUCCESS) return result; @@ -1249,8 +1248,7 @@ int uds_make_volume_index(const struct uds_configuration *config, u64 volume_non volume_index->sparse_sample_rate = config->sparse_sample_rate; - result = vdo_allocate(config->zone_count, struct volume_index_zone, - "volume index zones", &volume_index->zones); + result = vdo_allocate(config->zone_count, "volume index zones", &volume_index->zones); if (result != VDO_SUCCESS) { uds_free_volume_index(volume_index); return result; diff --git a/drivers/md/dm-vdo/indexer/volume.c 
b/drivers/md/dm-vdo/indexer/volume.c index 425b3a74f4db..af97c0cbeede 100644 --- a/drivers/md/dm-vdo/indexer/volume.c +++ b/drivers/md/dm-vdo/indexer/volume.c @@ -1509,23 +1509,21 @@ static int __must_check initialize_page_cache(struct page_cache *cache, if (result != VDO_SUCCESS) return result; - result = vdo_allocate(VOLUME_CACHE_MAX_QUEUED_READS, struct queued_read, - "volume read queue", &cache->read_queue); + result = vdo_allocate(VOLUME_CACHE_MAX_QUEUED_READS, "volume read queue", + &cache->read_queue); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(cache->zone_count, struct search_pending_counter, - "Volume Cache Zones", &cache->search_pending_counters); + result = vdo_allocate(cache->zone_count, "Volume Cache Zones", + &cache->search_pending_counters); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(cache->indexable_pages, u16, "page cache index", - &cache->index); + result = vdo_allocate(cache->indexable_pages, "page cache index", &cache->index); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(cache->cache_slots, struct cached_page, "page cache cache", - &cache->cache); + result = vdo_allocate(cache->cache_slots, "page cache cache", &cache->cache); if (result != VDO_SUCCESS) return result; @@ -1548,7 +1546,7 @@ int uds_make_volume(const struct uds_configuration *config, struct index_layout unsigned int reserved_buffers; int result; - result = vdo_allocate(1, struct volume, "volume", &volume); + result = vdo_allocate(1, "volume", &volume); if (result != VDO_SUCCESS) return result; @@ -1585,8 +1583,7 @@ int uds_make_volume(const struct uds_configuration *config, struct index_layout return result; } - result = vdo_allocate(geometry->records_per_page, - const struct uds_volume_record *, "record pointers", + result = vdo_allocate(geometry->records_per_page, "record pointers", &volume->record_pointers); if (result != VDO_SUCCESS) { uds_free_volume(volume); @@ -1626,8 +1623,7 @@ int uds_make_volume(const 
struct uds_configuration *config, struct index_layout uds_init_cond(&volume->read_threads_read_done_cond); uds_init_cond(&volume->read_threads_cond); - result = vdo_allocate(config->read_threads, struct thread *, "reader threads", - &volume->reader_threads); + result = vdo_allocate(config->read_threads, "reader threads", &volume->reader_threads); if (result != VDO_SUCCESS) { uds_free_volume(volume); return result; diff --git a/drivers/md/dm-vdo/int-map.c b/drivers/md/dm-vdo/int-map.c index aeb690415dbd..28d8af1f9be2 100644 --- a/drivers/md/dm-vdo/int-map.c +++ b/drivers/md/dm-vdo/int-map.c @@ -164,8 +164,7 @@ static int allocate_buckets(struct int_map *map, size_t capacity) * without have to wrap back around to element zero. */ map->bucket_count = capacity + (NEIGHBORHOOD - 1); - return vdo_allocate(map->bucket_count, struct bucket, - "struct int_map buckets", &map->buckets); + return vdo_allocate(map->bucket_count, "struct int_map buckets", &map->buckets); } /** @@ -182,7 +181,7 @@ int vdo_int_map_create(size_t initial_capacity, struct int_map **map_ptr) int result; size_t capacity; - result = vdo_allocate(1, struct int_map, "struct int_map", &map); + result = vdo_allocate(1, "struct int_map", &map); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/memory-alloc.c b/drivers/md/dm-vdo/memory-alloc.c index 185f259c7245..a7f07522110d 100644 --- a/drivers/md/dm-vdo/memory-alloc.c +++ b/drivers/md/dm-vdo/memory-alloc.c @@ -245,7 +245,7 @@ int vdo_allocate_memory(size_t size, size_t align, const char *what, void *ptr) } else { struct vmalloc_block_info *block; - if (vdo_allocate(1, struct vmalloc_block_info, __func__, &block) == VDO_SUCCESS) { + if (vdo_allocate(1, __func__, &block) == VDO_SUCCESS) { /* * It is possible for __vmalloc to fail to allocate memory because there * are no pages available. 
A short sleep may allow the page reclaimer @@ -341,6 +341,7 @@ int vdo_reallocate_memory(void *ptr, size_t old_size, size_t size, const char *w void *new_ptr) { int result; + char *temp_ptr; if (size == 0) { vdo_free(ptr); @@ -348,9 +349,10 @@ int vdo_reallocate_memory(void *ptr, size_t old_size, size_t size, const char *w return VDO_SUCCESS; } - result = vdo_allocate(size, char, what, new_ptr); + result = vdo_allocate(size, what, &temp_ptr); if (result != VDO_SUCCESS) return result; + *(void **) new_ptr = temp_ptr; if (ptr != NULL) { if (old_size < size) @@ -368,7 +370,7 @@ int vdo_duplicate_string(const char *string, const char *what, char **new_string int result; u8 *dup; - result = vdo_allocate(strlen(string) + 1, u8, what, &dup); + result = vdo_allocate(strlen(string) + 1, what, &dup); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/memory-alloc.h b/drivers/md/dm-vdo/memory-alloc.h index ab2375d549f4..bc5527327ed8 100644 --- a/drivers/md/dm-vdo/memory-alloc.h +++ b/drivers/md/dm-vdo/memory-alloc.h @@ -16,60 +16,19 @@ /* Custom memory allocation function that tracks memory usage */ int __must_check vdo_allocate_memory(size_t size, size_t align, const char *what, void *ptr); -/* - * Allocate storage based on element counts, sizes, and alignment. - * - * This is a generalized form of our allocation use case: It allocates an array of objects, - * optionally preceded by one object of another type (i.e., a struct with trailing variable-length - * array), with the alignment indicated. - * - * Why is this inline? The sizes and alignment will always be constant, when invoked through the - * macros below, and often the count will be a compile-time constant 1 or the number of extra bytes - * will be a compile-time constant 0. So at least some of the arithmetic can usually be optimized - * away, and the run-time selection between allocation functions always can. In many cases, it'll - * boil down to just a function call with a constant size. 
- * - * @count: The number of objects to allocate - * @size: The size of an object - * @extra: The number of additional bytes to allocate - * @align: The required alignment - * @what: What is being allocated (for error logging) - * @ptr: A pointer to hold the allocated memory - * - * Return: VDO_SUCCESS or an error code - */ -static inline int __vdo_do_allocation(size_t count, size_t size, size_t extra, - size_t align, const char *what, void *ptr) -{ - size_t total_size = count * size + extra; - - /* Overflow check: */ - if ((size > 0) && (count > ((SIZE_MAX - extra) / size))) { - /* - * This is kind of a hack: We rely on the fact that SIZE_MAX would cover the entire - * address space (minus one byte) and thus the system can never allocate that much - * and the call will always fail. So we can report an overflow as "out of memory" - * by asking for "merely" SIZE_MAX bytes. - */ - total_size = SIZE_MAX; - } - - return vdo_allocate_memory(total_size, align, what, ptr); -} - /* * Allocate one or more elements of the indicated type, logging an error if the allocation fails. * The memory will be zeroed. * * @COUNT: The number of objects to allocate - * @TYPE: The type of objects to allocate. This type determines the alignment of the allocation. 
* @WHAT: What is being allocated (for error logging) * @PTR: A pointer to hold the allocated memory * * Return: VDO_SUCCESS or an error code */ -#define vdo_allocate(COUNT, TYPE, WHAT, PTR) \ - __vdo_do_allocation(COUNT, sizeof(TYPE), 0, __alignof__(TYPE), WHAT, PTR) +#define vdo_allocate(COUNT, WHAT, PTR) \ + vdo_allocate_memory(size_mul((COUNT), sizeof(typeof(**(PTR)))), \ + __alignof__(typeof(**(PTR))), WHAT, PTR) /* * Allocate a structure with a flexible array member, with a specified number of elements, logging diff --git a/drivers/md/dm-vdo/message-stats.c b/drivers/md/dm-vdo/message-stats.c index 75dfcd7c5f63..b4c919780c22 100644 --- a/drivers/md/dm-vdo/message-stats.c +++ b/drivers/md/dm-vdo/message-stats.c @@ -420,7 +420,7 @@ int vdo_write_stats(struct vdo *vdo, char *buf, unsigned int maxlen) struct vdo_statistics *stats; int result; - result = vdo_allocate(1, struct vdo_statistics, __func__, &stats); + result = vdo_allocate(1, __func__, &stats); if (result != VDO_SUCCESS) { vdo_log_error("Cannot allocate memory to write VDO statistics"); return result; diff --git a/drivers/md/dm-vdo/packer.c b/drivers/md/dm-vdo/packer.c index e638694d896c..ea2d8d14495c 100644 --- a/drivers/md/dm-vdo/packer.c +++ b/drivers/md/dm-vdo/packer.c @@ -145,7 +145,7 @@ int vdo_make_packer(struct vdo *vdo, block_count_t bin_count, struct packer **pa block_count_t i; int result; - result = vdo_allocate(1, struct packer, __func__, &packer); + result = vdo_allocate(1, __func__, &packer); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/recovery-journal.c b/drivers/md/dm-vdo/recovery-journal.c index 6da303961376..f03939cc89e3 100644 --- a/drivers/md/dm-vdo/recovery-journal.c +++ b/drivers/md/dm-vdo/recovery-journal.c @@ -593,32 +593,29 @@ static int __must_check initialize_lock_counter(struct recovery_journal *journal struct thread_config *config = &vdo->thread_config; struct lock_counter *counter = &journal->lock_counter; - result = vdo_allocate(journal->size, 
u16, __func__, &counter->journal_counters); + result = vdo_allocate(journal->size, __func__, &counter->journal_counters); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(journal->size, atomic_t, __func__, - &counter->journal_decrement_counts); + result = vdo_allocate(journal->size, __func__, &counter->journal_decrement_counts); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(journal->size * config->logical_zone_count, u16, __func__, + result = vdo_allocate(journal->size * config->logical_zone_count, __func__, &counter->logical_counters); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(journal->size, atomic_t, __func__, - &counter->logical_zone_counts); + result = vdo_allocate(journal->size, __func__, &counter->logical_zone_counts); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(journal->size * config->physical_zone_count, u16, __func__, + result = vdo_allocate(journal->size * config->physical_zone_count, __func__, &counter->physical_counters); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(journal->size, atomic_t, __func__, - &counter->physical_zone_counts); + result = vdo_allocate(journal->size, __func__, &counter->physical_zone_counts); if (result != VDO_SUCCESS) return result; @@ -672,7 +669,7 @@ static int initialize_recovery_block(struct vdo *vdo, struct recovery_journal *j * Allocate a full block for the journal block even though not all of the space is used * since the VIO needs to write a full disk block. 
*/ - result = vdo_allocate(VDO_BLOCK_SIZE, char, __func__, &data); + result = vdo_allocate(VDO_BLOCK_SIZE, __func__, &data); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/repair.c b/drivers/md/dm-vdo/repair.c index e479d3582040..43ce65a69e61 100644 --- a/drivers/md/dm-vdo/repair.c +++ b/drivers/md/dm-vdo/repair.c @@ -1417,8 +1417,7 @@ static int parse_journal_for_rebuild(struct repair_completion *repair) * packed_recovery_journal_entry from every valid journal block. */ count = ((repair->highest_tail - repair->block_map_head + 1) * entries_per_block); - result = vdo_allocate(count, struct numbered_block_mapping, __func__, - &repair->entries); + result = vdo_allocate(count, __func__, &repair->entries); if (result != VDO_SUCCESS) return result; @@ -1464,8 +1463,7 @@ static int extract_new_mappings(struct repair_completion *repair) * Allocate an array of numbered_block_mapping structs just large enough to transcribe * every packed_recovery_journal_entry from every valid journal block. 
*/ - result = vdo_allocate(repair->entry_count, struct numbered_block_mapping, - __func__, &repair->entries); + result = vdo_allocate(repair->entry_count, __func__, &repair->entries); if (result != VDO_SUCCESS) return result; @@ -1727,12 +1725,11 @@ void vdo_repair(struct vdo_completion *parent) prepare_repair_completion(repair, finish_repair, VDO_ZONE_TYPE_ADMIN); repair->page_count = page_count; - result = vdo_allocate(remaining * VDO_BLOCK_SIZE, char, __func__, - &repair->journal_data); + result = vdo_allocate(remaining * VDO_BLOCK_SIZE, __func__, &repair->journal_data); if (abort_on_error(result, repair)) return; - result = vdo_allocate(vio_count, struct vio, __func__, &repair->vios); + result = vdo_allocate(vio_count, __func__, &repair->vios); if (abort_on_error(result, repair)) return; diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c index 286fc4465a92..7fcbb361b38d 100644 --- a/drivers/md/dm-vdo/slab-depot.c +++ b/drivers/md/dm-vdo/slab-depot.c @@ -2453,8 +2453,7 @@ static int allocate_slab_counters(struct vdo_slab *slab) if (result != VDO_SUCCESS) return result; - result = vdo_allocate(slab->reference_block_count, struct reference_block, - __func__, &slab->reference_blocks); + result = vdo_allocate(slab->reference_block_count, __func__, &slab->reference_blocks); if (result != VDO_SUCCESS) return result; @@ -2463,8 +2462,7 @@ static int allocate_slab_counters(struct vdo_slab *slab) * so we can word-search even at the very end. 
*/ bytes = (slab->reference_block_count * COUNTS_PER_BLOCK) + (2 * BYTES_PER_WORD); - result = vdo_allocate(bytes, vdo_refcount_t, "ref counts array", - &slab->counters); + result = vdo_allocate(bytes, "ref counts array", &slab->counters); if (result != VDO_SUCCESS) { vdo_free(vdo_forget(slab->reference_blocks)); return result; @@ -3563,8 +3561,7 @@ static int get_slab_statuses(struct block_allocator *allocator, struct slab_status *statuses; struct slab_iterator iterator = get_slab_iterator(allocator); - result = vdo_allocate(allocator->slab_count, struct slab_status, __func__, - &statuses); + result = vdo_allocate(allocator->slab_count, __func__, &statuses); if (result != VDO_SUCCESS) return result; @@ -3739,13 +3736,12 @@ static int initialize_slab_journal(struct vdo_slab *slab) const struct slab_config *slab_config = &slab->allocator->depot->slab_config; int result; - result = vdo_allocate(slab_config->slab_journal_blocks, struct journal_lock, - __func__, &journal->locks); + result = vdo_allocate(slab_config->slab_journal_blocks, __func__, &journal->locks); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(VDO_BLOCK_SIZE, char, "struct packed_slab_journal_block", - (char **) &journal->block); + BUILD_BUG_ON(sizeof(*journal->block) != VDO_BLOCK_SIZE); + result = vdo_allocate(1, "struct packed_slab_journal_block", &journal->block); if (result != VDO_SUCCESS) return result; @@ -3800,7 +3796,7 @@ static int __must_check make_slab(physical_block_number_t slab_origin, struct vdo_slab *slab; int result; - result = vdo_allocate(1, struct vdo_slab, __func__, &slab); + result = vdo_allocate(1, __func__, &slab); if (result != VDO_SUCCESS) return result; @@ -3857,8 +3853,7 @@ static int allocate_slabs(struct slab_depot *depot, slab_count_t slab_count) physical_block_number_t slab_origin; int result; - result = vdo_allocate(slab_count, struct vdo_slab *, - "slab pointer array", &depot->new_slabs); + result = vdo_allocate(slab_count, "slab pointer array", 
&depot->new_slabs); if (result != VDO_SUCCESS) return result; @@ -4011,8 +4006,7 @@ static int initialize_slab_scrubber(struct block_allocator *allocator) char *journal_data; int result; - result = vdo_allocate(VDO_BLOCK_SIZE * slab_journal_size, - char, __func__, &journal_data); + result = vdo_allocate(VDO_BLOCK_SIZE * slab_journal_size, __func__, &journal_data); if (result != VDO_SUCCESS) return result; @@ -4045,7 +4039,7 @@ static int __must_check initialize_slab_summary_block(struct block_allocator *al struct slab_summary_block *block = &allocator->summary_blocks[index]; int result; - result = vdo_allocate(VDO_BLOCK_SIZE, char, __func__, &block->outgoing_entries); + result = vdo_allocate(VDO_BLOCK_SIZE, __func__, &block->outgoing_entries); if (result != VDO_SUCCESS) return result; @@ -4114,8 +4108,7 @@ static int __must_check initialize_block_allocator(struct slab_depot *depot, if (result != VDO_SUCCESS) return result; - result = vdo_allocate(VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE, - struct slab_summary_block, __func__, + result = vdo_allocate(VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE, __func__, &allocator->summary_blocks); if (result != VDO_SUCCESS) return result; @@ -4174,8 +4167,7 @@ static int allocate_components(struct slab_depot *depot, depot->summary_origin = summary_partition->offset; depot->hint_shift = vdo_get_slab_summary_hint_shift(depot->slab_size_shift); - result = vdo_allocate(MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES, - struct slab_summary_entry, __func__, + result = vdo_allocate(MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES, __func__, &depot->summary_entries); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/thread-utils.c b/drivers/md/dm-vdo/thread-utils.c index ec08478dd013..826afc952b56 100644 --- a/drivers/md/dm-vdo/thread-utils.c +++ b/drivers/md/dm-vdo/thread-utils.c @@ -56,7 +56,7 @@ int vdo_create_thread(void (*thread_function)(void *), void *thread_data, struct thread *thread; int result; - result = vdo_allocate(1, struct thread, __func__, 
&thread); + result = vdo_allocate(1, __func__, &thread); if (result != VDO_SUCCESS) { vdo_log_warning("Error allocating memory for %s", name); return result; diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c index 09fd0628d18c..167cf93a284a 100644 --- a/drivers/md/dm-vdo/vdo.c +++ b/drivers/md/dm-vdo/vdo.c @@ -207,29 +207,28 @@ static int __must_check initialize_thread_config(struct thread_count_config coun config->hash_zone_count = counts.hash_zones; } - result = vdo_allocate(config->logical_zone_count, thread_id_t, - "logical thread array", &config->logical_threads); + result = vdo_allocate(config->logical_zone_count, "logical thread array", + &config->logical_threads); if (result != VDO_SUCCESS) { uninitialize_thread_config(config); return result; } - result = vdo_allocate(config->physical_zone_count, thread_id_t, - "physical thread array", &config->physical_threads); + result = vdo_allocate(config->physical_zone_count, "physical thread array", + &config->physical_threads); if (result != VDO_SUCCESS) { uninitialize_thread_config(config); return result; } - result = vdo_allocate(config->hash_zone_count, thread_id_t, - "hash thread array", &config->hash_zone_threads); + result = vdo_allocate(config->hash_zone_count, "hash thread array", + &config->hash_zone_threads); if (result != VDO_SUCCESS) { uninitialize_thread_config(config); return result; } - result = vdo_allocate(config->bio_thread_count, thread_id_t, - "bio thread array", &config->bio_threads); + result = vdo_allocate(config->bio_thread_count, "bio thread array", &config->bio_threads); if (result != VDO_SUCCESS) { uninitialize_thread_config(config); return result; @@ -269,7 +268,7 @@ static int __must_check read_geometry_block(struct vdo *vdo) char *block; int result; - result = vdo_allocate(VDO_BLOCK_SIZE, u8, __func__, &block); + result = vdo_allocate(VDO_BLOCK_SIZE, __func__, &block); if (result != VDO_SUCCESS) return result; @@ -493,7 +492,7 @@ static int initialize_vdo(struct vdo *vdo, 
struct device_config *config, config->thread_counts.hash_zones, vdo->thread_config.thread_count); /* Compression context storage */ - result = vdo_allocate(config->thread_counts.cpu_threads, char *, "LZ4 context", + result = vdo_allocate(config->thread_counts.cpu_threads, "LZ4 context", &vdo->compression_context); if (result != VDO_SUCCESS) { *reason = "cannot allocate LZ4 context"; @@ -501,7 +500,7 @@ static int initialize_vdo(struct vdo *vdo, struct device_config *config, } for (i = 0; i < config->thread_counts.cpu_threads; i++) { - result = vdo_allocate(LZ4_MEM_COMPRESS, char, "LZ4 context", + result = vdo_allocate(LZ4_MEM_COMPRESS, "LZ4 context", &vdo->compression_context[i]); if (result != VDO_SUCCESS) { *reason = "cannot allocate LZ4 context"; @@ -537,7 +536,7 @@ int vdo_make(unsigned int instance, struct device_config *config, char **reason, /* Initialize with a generic failure reason to prevent returning garbage. */ *reason = "Unspecified error"; - result = vdo_allocate(1, struct vdo, __func__, &vdo); + result = vdo_allocate(1, __func__, &vdo); if (result != VDO_SUCCESS) { *reason = "Cannot allocate VDO"; return result; @@ -554,8 +553,7 @@ int vdo_make(unsigned int instance, struct device_config *config, char **reason, snprintf(vdo->thread_name_prefix, sizeof(vdo->thread_name_prefix), "vdo%u", instance); - result = vdo_allocate(vdo->thread_config.thread_count, - struct vdo_thread, __func__, &vdo->threads); + result = vdo_allocate(vdo->thread_config.thread_count, __func__, &vdo->threads); if (result != VDO_SUCCESS) { *reason = "Cannot allocate thread structures"; return result; @@ -724,8 +722,7 @@ static int initialize_super_block(struct vdo *vdo, struct vdo_super_block *super { int result; - result = vdo_allocate(VDO_BLOCK_SIZE, char, "encoded super block", - (char **) &vdo->super_block.buffer); + result = vdo_allocate(VDO_BLOCK_SIZE, "encoded super block", &vdo->super_block.buffer); if (result != VDO_SUCCESS) return result; @@ -997,8 +994,7 @@ int 
vdo_register_read_only_listener(struct vdo *vdo, void *listener, if (result != VDO_SUCCESS) return result; - result = vdo_allocate(1, struct read_only_listener, __func__, - &read_only_listener); + result = vdo_allocate(1, __func__, &read_only_listener); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c index cc739d52a70c..ea8ac619ff1b 100644 --- a/drivers/md/dm-vdo/vio.c +++ b/drivers/md/dm-vdo/vio.c @@ -129,7 +129,7 @@ int create_multi_block_metadata_vio(struct vdo *vdo, enum vio_type vio_type, * Metadata vios should use direct allocation and not use the buffer pool, which is * reserved for submissions from the linux block layer. */ - result = vdo_allocate(1, struct vio, __func__, &vio); + result = vdo_allocate(1, __func__, &vio); if (result != VDO_SUCCESS) { vdo_log_error("metadata vio allocation failure %d", result); return result; @@ -335,8 +335,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count, thread_ INIT_LIST_HEAD(&pool->available); INIT_LIST_HEAD(&pool->busy); - result = vdo_allocate(pool_size * per_vio_size, char, - "VIO pool buffer", &pool->buffer); + result = vdo_allocate(pool_size * per_vio_size, "VIO pool buffer", &pool->buffer); if (result != VDO_SUCCESS) { free_vio_pool(pool); return result; From db139186beac761c1172a7b20687fb2078c6a70c Mon Sep 17 00:00:00 2001 From: Ken Raeburn Date: Thu, 26 Feb 2026 20:12:08 -0500 Subject: [PATCH 14/62] dm vdo: add __counted_by attribute to a number of structures This attribute allows the compiler to refine compile-time diagnostics and run-time sanitizer features with information about the size of the flexible arrays. 
Signed-off-by: Ken Raeburn Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/block-map.h | 2 +- drivers/md/dm-vdo/dedupe.c | 2 +- drivers/md/dm-vdo/indexer/index.h | 2 +- drivers/md/dm-vdo/indexer/open-chapter.h | 2 +- drivers/md/dm-vdo/logical-zone.h | 2 +- drivers/md/dm-vdo/physical-zone.c | 2 +- drivers/md/dm-vdo/repair.c | 2 +- drivers/md/dm-vdo/slab-depot.h | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-vdo/block-map.h b/drivers/md/dm-vdo/block-map.h index 39a13039e4a3..4fd24043b0d7 100644 --- a/drivers/md/dm-vdo/block-map.h +++ b/drivers/md/dm-vdo/block-map.h @@ -276,7 +276,7 @@ struct block_map { block_count_t next_entry_count; zone_count_t zone_count; - struct block_map_zone zones[]; + struct block_map_zone zones[] __counted_by(zone_count); }; /** diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c index 4e0fefd077d0..5f5639d89bc6 100644 --- a/drivers/md/dm-vdo/dedupe.c +++ b/drivers/md/dm-vdo/dedupe.c @@ -296,7 +296,7 @@ struct hash_zones { /* The number of zones */ zone_count_t zone_count; /* The hash zones themselves */ - struct hash_zone zones[]; + struct hash_zone zones[] __counted_by(zone_count); }; /* These are in milliseconds. 
*/ diff --git a/drivers/md/dm-vdo/indexer/index.h b/drivers/md/dm-vdo/indexer/index.h index edabb239548e..1891f2de508e 100644 --- a/drivers/md/dm-vdo/indexer/index.h +++ b/drivers/md/dm-vdo/indexer/index.h @@ -53,7 +53,7 @@ struct uds_index { index_callback_fn callback; struct uds_request_queue *triage_queue; - struct uds_request_queue *zone_queues[]; + struct uds_request_queue *zone_queues[] __counted_by(zone_count); }; enum request_stage { diff --git a/drivers/md/dm-vdo/indexer/open-chapter.h b/drivers/md/dm-vdo/indexer/open-chapter.h index a4250bb19525..ea6d7336aea0 100644 --- a/drivers/md/dm-vdo/indexer/open-chapter.h +++ b/drivers/md/dm-vdo/indexer/open-chapter.h @@ -40,7 +40,7 @@ struct open_chapter_zone { /* The number of slots in the hash table */ unsigned int slot_count; /* The hash table slots, referencing virtual record numbers */ - struct open_chapter_zone_slot slots[]; + struct open_chapter_zone_slot slots[] __counted_by(slot_count); }; int __must_check uds_make_open_chapter(const struct index_geometry *geometry, diff --git a/drivers/md/dm-vdo/logical-zone.h b/drivers/md/dm-vdo/logical-zone.h index 1b666c84a193..a36a864c6836 100644 --- a/drivers/md/dm-vdo/logical-zone.h +++ b/drivers/md/dm-vdo/logical-zone.h @@ -60,7 +60,7 @@ struct logical_zones { /* The number of zones */ zone_count_t zone_count; /* The logical zones themselves */ - struct logical_zone zones[]; + struct logical_zone zones[] __counted_by(zone_count); }; int __must_check vdo_make_logical_zones(struct vdo *vdo, diff --git a/drivers/md/dm-vdo/physical-zone.c b/drivers/md/dm-vdo/physical-zone.c index a8c7a57516eb..d6ad8f1a33bb 100644 --- a/drivers/md/dm-vdo/physical-zone.c +++ b/drivers/md/dm-vdo/physical-zone.c @@ -200,7 +200,7 @@ struct pbn_lock_pool { /** @idle_list: A list containing all idle PBN lock instances. */ struct list_head idle_list; /** @locks: The memory for all the locks allocated by this pool. 
*/ - idle_pbn_lock locks[]; + idle_pbn_lock locks[] __counted_by(capacity); }; /** diff --git a/drivers/md/dm-vdo/repair.c b/drivers/md/dm-vdo/repair.c index 43ce65a69e61..bfed62260280 100644 --- a/drivers/md/dm-vdo/repair.c +++ b/drivers/md/dm-vdo/repair.c @@ -127,7 +127,7 @@ struct repair_completion { * The page completions used for playing the journal into the block map, and, during * read-only rebuild, for rebuilding the reference counts from the block map. */ - struct vdo_page_completion page_completions[]; + struct vdo_page_completion page_completions[] __counted_by(page_count); }; /* diff --git a/drivers/md/dm-vdo/slab-depot.h b/drivers/md/dm-vdo/slab-depot.h index fadc0c9d4dc4..6bfd61c937b6 100644 --- a/drivers/md/dm-vdo/slab-depot.h +++ b/drivers/md/dm-vdo/slab-depot.h @@ -509,7 +509,7 @@ struct slab_depot { struct slab_summary_entry *summary_entries; /* The block allocators for this depot */ - struct block_allocator allocators[]; + struct block_allocator allocators[] __counted_by(zone_count); }; struct reference_updater; From 4c788c6f921b22f9b6c3f316c4a071c05683e7de Mon Sep 17 00:00:00 2001 From: Junrui Luo Date: Sun, 1 Mar 2026 21:10:58 +0800 Subject: [PATCH 15/62] dm mirror: fix integer overflow in create_dirty_log() The argument count calculation in create_dirty_log() performs `*args_used = 2 + param_count` before validating against argc. When a user provides a param_count close to UINT_MAX via the device mapper table string, this unsigned addition wraps around to a small value, causing the subsequent `argc < *args_used` check to be bypassed. The overflowed param_count is then passed as argc to dm_dirty_log_create(), where it can cause out-of-bounds reads on the argv array. Fix by comparing param_count against argc - 2 before performing the addition, following the same pattern used by parse_features() in the same file. Since argc >= 2 is already guaranteed, the subtraction is safe. 
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Reported-by: Yuhao Jiang Signed-off-by: Junrui Luo Reviewed-by: Benjamin Marzinski Signed-off-by: Mikulas Patocka --- drivers/md/dm-raid1.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 80a5c4127707..de5c00704e69 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -993,13 +993,13 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, return NULL; } - *args_used = 2 + param_count; - - if (argc < *args_used) { + if (param_count > argc - 2) { ti->error = "Insufficient mirror log arguments"; return NULL; } + *args_used = 2 + param_count; + dl = dm_dirty_log_create(argv[0], ti, mirror_flush, param_count, argv + 2); if (!dl) { From 044ca491d4086dc5bf233e9fcb71db52df32f633 Mon Sep 17 00:00:00 2001 From: Ming-Hung Tsai Date: Wed, 4 Mar 2026 19:56:28 +0800 Subject: [PATCH 16/62] dm cache metadata: fix memory leak on metadata abort retry When failing to acquire the root_lock in dm_cache_metadata_abort because the block_manager is read-only, the temporary block_manager created outside the root_lock is not properly released, causing a memory leak. Reproduce steps: This can be reproduced by reloading a new table while the metadata is read-only. While the second call to dm_cache_metadata_abort is caused by lack of support for table preload in dm-cache, mentioned in commit 9b1cc9f251af ("dm cache: share cache-metadata object across inactive and active DM tables"), it exposes the memory leak in dm_cache_metadata_abort when the function is called multiple times. Specifically, dm-cache fails to sync the new cache object's mode during preresume, creating the reproducer condition. This issue could also occur through concurrent metadata_operation_failed calls due to races in cache mode updates, but the table preload scenario below provides a reliable reproducer. 1. 
Create a cache device with some faulty trailing metadata blocks dmsetup create cmeta < unreferenced object 0xffff8880080c2010 (size 16): comm "dmsetup", pid 132, jiffies 4294982580 hex dump (first 16 bytes): 00 38 b9 07 80 88 ff ff 6a 6b 6b 6b 6b 6b 6b a5 ... backtrace (crc 3118f31c): kmemleak_alloc+0x28/0x40 __kmalloc_cache_noprof+0x3d9/0x510 dm_block_manager_create+0x51/0x140 dm_cache_metadata_abort+0x85/0x320 metadata_operation_failed+0x103/0x1e0 cache_preresume+0xacd/0xe70 dm_table_resume_targets+0xd3/0x320 __dm_resume+0x1b/0xf0 dm_resume+0x127/0x170 Fixes: 352b837a5541 ("dm cache: Fix ABBA deadlock between shrink_slab and dm_cache_metadata_abort") Signed-off-by: Ming-Hung Tsai Signed-off-by: Mikulas Patocka --- drivers/md/dm-cache-metadata.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 25b8aebdca53..acd9b179fcb3 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1023,6 +1023,12 @@ static bool cmd_write_lock(struct dm_cache_metadata *cmd) return; \ } while (0) +#define WRITE_LOCK_OR_GOTO(cmd, label) \ + do { \ + if (!cmd_write_lock((cmd))) \ + goto label; \ + } while (0) + #define WRITE_UNLOCK(cmd) \ up_write(&(cmd)->root_lock) @@ -1780,11 +1786,8 @@ int dm_cache_metadata_abort(struct dm_cache_metadata *cmd) new_bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, CACHE_MAX_CONCURRENT_LOCKS); - WRITE_LOCK(cmd); - if (cmd->fail_io) { - WRITE_UNLOCK(cmd); - goto out; - } + /* cmd_write_lock() already checks fail_io with cmd->root_lock held */ + WRITE_LOCK_OR_GOTO(cmd, out); __destroy_persistent_data_objects(cmd, false); old_bm = cmd->bm; From c20e36b7631d83e7535877f08af8b0af72c44b1a Mon Sep 17 00:00:00 2001 From: Junrui Luo Date: Thu, 5 Mar 2026 20:05:48 +0800 Subject: [PATCH 17/62] dm log: fix out-of-bounds write due to region_count overflow The local variable region_count in 
create_log_context() is declared as unsigned int (32-bit), but dm_sector_div_up() returns sector_t (64-bit). When a device-mapper target has a sufficiently large ti->len with a small region_size, the division result can exceed UINT_MAX. The truncated value is then used to calculate bitset_size, causing clean_bits, sync_bits, and recovering_bits to be allocated far smaller than needed for the actual number of regions. Subsequent log operations (log_set_bit, log_clear_bit, log_test_bit) use region indices derived from the full untruncated region space, causing out-of-bounds writes to kernel heap memory allocated by vmalloc. This can be reproduced by creating a mirror target whose region_count overflows 32 bits: dmsetup create bigzero --table '0 8589934594 zero' dmsetup create mymirror --table '0 8589934594 mirror \ core 2 2 nosync 2 /dev/mapper/bigzero 0 \ /dev/mapper/bigzero 0' The status output confirms the truncation (sync_count=1 instead of 4294967297, because 0x100000001 was truncated to 1): $ dmsetup status mymirror 0 8589934594 mirror 2 254:1 254:1 1/4294967297 ... This leads to a kernel crash in core_in_sync: BUG: scheduling while atomic: (udev-worker)/9150/0x00000000 RIP: 0010:core_in_sync+0x14/0x30 [dm_log] CR2: 0000000000000008 Fixing recursive fault but reboot is needed! Fix by widening the local region_count to sector_t and adding an explicit overflow check before the value is assigned to lc->region_count. 
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Yuhao Jiang Signed-off-by: Junrui Luo Signed-off-by: Mikulas Patocka --- drivers/md/dm-log.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 1aa6a4a7d232..d316757a328b 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -373,7 +373,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, struct log_c *lc; uint32_t region_size; - unsigned int region_count; + sector_t region_count; size_t bitset_size, buf_size; int r; char dummy; @@ -401,6 +401,10 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, } region_count = dm_sector_div_up(ti->len, region_size); + if (region_count > UINT_MAX) { + DMWARN("region count exceeds limit of %u", UINT_MAX); + return -EINVAL; + } lc = kmalloc_obj(*lc); if (!lc) { From 2b14e0bb63cc671120e7791658f5c494fc66d072 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Feb 2026 20:59:20 -0800 Subject: [PATCH 18/62] dm-verity-fec: correctly reject too-small FEC devices Fix verity_fec_ctr() to reject too-small FEC devices by correctly computing the number of parity blocks as 'f->rounds * f->roots'. Previously it incorrectly used 'div64_u64(f->rounds * f->roots, v->fec->roots << SECTOR_SHIFT)' which is a much smaller value. Note that the units of 'rounds' are blocks, not bytes. This matches the units of the value returned by dm_bufio_get_device_size(), which are also blocks. A later commit will give 'rounds' a clearer name. 
Fixes: a739ff3f543a ("dm verity: add support for forward error correction") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 14be4d888af3..9d5dace7d420 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -625,7 +625,7 @@ int verity_fec_ctr(struct dm_verity *v) { struct dm_verity_fec *f = v->fec; struct dm_target *ti = v->ti; - u64 hash_blocks, fec_blocks; + u64 hash_blocks; int ret; if (!verity_fec_is_enabled(v)) { @@ -706,8 +706,7 @@ int verity_fec_ctr(struct dm_verity *v) dm_bufio_set_sector_offset(f->bufio, f->start << (v->data_dev_block_bits - SECTOR_SHIFT)); - fec_blocks = div64_u64(f->rounds * f->roots, v->fec->roots << SECTOR_SHIFT); - if (dm_bufio_get_device_size(f->bufio) < fec_blocks) { + if (dm_bufio_get_device_size(f->bufio) < f->rounds * f->roots) { ti->error = "FEC device is too small"; return -E2BIG; } From 4355142245f7e55336dcc005ec03592df4d546f8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Feb 2026 20:59:21 -0800 Subject: [PATCH 19/62] dm-verity-fec: correctly reject too-small hash devices Fix verity_fec_ctr() to reject too-small hash devices by correctly taking hash_start into account. Note that this is necessary because dm-verity doesn't call dm_bufio_set_sector_offset() on the hash device's bufio client (v->bufio). Thus, dm_bufio_get_device_size(v->bufio) returns a size relative to 0 rather than hash_start. An alternative fix would be to call dm_bufio_set_sector_offset() on v->bufio, but then all the code that reads from the hash device would have to be adjusted accordingly. 
Fixes: a739ff3f543a ("dm verity: add support for forward error correction") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 9d5dace7d420..721b7c377386 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -688,7 +688,8 @@ int verity_fec_ctr(struct dm_verity *v) * it to be large enough. */ f->hash_blocks = f->blocks - v->data_blocks; - if (dm_bufio_get_device_size(v->bufio) < f->hash_blocks) { + if (dm_bufio_get_device_size(v->bufio) < + v->hash_start + f->hash_blocks) { ti->error = "Hash device is too small for " DM_VERITY_OPT_FEC_BLOCKS; return -E2BIG; From 48640c88a8ddd482b6456fcbc084b08dd2bac083 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Feb 2026 20:59:22 -0800 Subject: [PATCH 20/62] dm-verity-fec: fix corrected block count stat dm_verity_fec::corrected seems to have been intended to count the number of corrected blocks. However, it actually counted the number of calls to fec_decode_bufs() that corrected at least one error. That's not the same thing. For example, in low-memory situations correcting a single block can require many calls to fec_decode_bufs(). Fix it to count corrected blocks instead. 
Fixes: ae97648e14f7 ("dm verity fec: Expose corrected block count via status") Cc: Shubhankar Mishra Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 721b7c377386..1e776e0d6be5 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -163,11 +163,9 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, if (r < 0 && neras) DMERR_LIMIT("%s: FEC %llu: failed to correct: %d", v->data_dev->name, (unsigned long long)rsb, r); - else if (r > 0) { + else if (r > 0) DMWARN_LIMIT("%s: FEC %llu: corrected %d errors", v->data_dev->name, (unsigned long long)rsb, r); - atomic64_inc(&v->fec->corrected); - } return r; } @@ -439,6 +437,7 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, } memcpy(dest, fio->output, 1 << v->data_dev_block_bits); + atomic64_inc(&v->fec->corrected); done: fio->level--; From a7fca324d7d90f7b139d4d32747c83a629fdb446 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Feb 2026 20:59:23 -0800 Subject: [PATCH 21/62] dm-verity-fec: fix the size of dm_verity_fec_io::erasures At most 25 entries in dm_verity_fec_io::erasures are used: the maximum number of FEC roots plus one. Therefore, set the array size accordingly. This reduces the size of dm_verity_fec_io by 912 bytes. Note: a later commit introduces a constant DM_VERITY_FEC_MAX_ROOTS, which allows the size to be more clearly expressed as DM_VERITY_FEC_MAX_ROOTS + 1. This commit just fixes the size first. 
Fixes: a739ff3f543a ("dm verity: add support for forward error correction") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 35d28d9f8a9b..32ca2bfee1db 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -47,7 +47,8 @@ struct dm_verity_fec { /* per-bio data */ struct dm_verity_fec_io { struct rs_control *rs; /* Reed-Solomon state */ - int erasures[DM_VERITY_FEC_MAX_RSN]; /* erasures for decode_rs8 */ + /* erasures for decode_rs8 */ + int erasures[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN + 1]; u8 *output; /* buffer for corrected output */ unsigned int level; /* recursion level */ unsigned int nbufs; /* number of buffers allocated */ From 430a05cb926f6bdf53e81460a2c3a553257f3f61 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Feb 2026 20:59:24 -0800 Subject: [PATCH 22/62] dm-verity-fec: fix reading parity bytes split across blocks (take 3) fec_decode_bufs() assumes that the parity bytes of the first RS codeword it decodes are never split across parity blocks. This assumption is false. Consider v->fec->block_size == 4096 && v->fec->roots == 17 && fio->nbufs == 1, for example. In that case, each call to fec_decode_bufs() consumes v->fec->roots * (fio->nbufs << DM_VERITY_FEC_BUF_RS_BITS) = 272 parity bytes. Considering that the parity data for each message block starts on a block boundary, the byte alignment in the parity data will iterate through 272*i mod 4096 until the 3 parity blocks have been consumed. On the 16th call (i=15), the alignment will be 4080 bytes into the first block. Only 16 bytes remain in that block, but 17 parity bytes will be needed. The code reads out-of-bounds from the parity block buffer. 
Fortunately this doesn't normally happen, since it can occur only for certain non-default values of fec_roots *and* when the maximum number of buffers couldn't be allocated due to low memory. For example with block_size=4096 only the following cases are affected: fec_roots=17: nbufs in [1, 3, 5, 15] fec_roots=19: nbufs in [1, 229] fec_roots=21: nbufs in [1, 3, 5, 13, 15, 39, 65, 195] fec_roots=23: nbufs in [1, 89] Regardless, fix it by refactoring how the parity blocks are read. Fixes: 6df90c02bae4 ("dm-verity FEC: Fix RS FEC repair for roots unaligned to block size (take 2)") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 100 ++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 56 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 1e776e0d6be5..e5d38bb3f16f 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -33,36 +33,6 @@ static inline u64 fec_interleave(struct dm_verity *v, u64 offset) return offset + mod * (v->fec->rounds << v->data_dev_block_bits); } -/* - * Read error-correcting codes for the requested RS block. Returns a pointer - * to the data block. Caller is responsible for releasing buf. - */ -static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index, - unsigned int *offset, unsigned int par_buf_offset, - struct dm_buffer **buf, unsigned short ioprio) -{ - u64 position, block, rem; - u8 *res; - - /* We have already part of parity bytes read, skip to the next block */ - if (par_buf_offset) - index++; - - position = (index + rsb) * v->fec->roots; - block = div64_u64_rem(position, v->fec->io_size, &rem); - *offset = par_buf_offset ? 
0 : (unsigned int)rem; - - res = dm_bufio_read_with_ioprio(v->fec->bufio, block, buf, ioprio); - if (IS_ERR(res)) { - DMERR("%s: FEC %llu: parity read failed (block %llu): %ld", - v->data_dev->name, (unsigned long long)rsb, - (unsigned long long)block, PTR_ERR(res)); - *buf = NULL; - } - - return res; -} - /* Loop over each allocated buffer. */ #define fec_for_each_buffer(io, __i) \ for (__i = 0; __i < (io)->nbufs; __i++) @@ -102,15 +72,29 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, { int r, corrected = 0, res; struct dm_buffer *buf; - unsigned int n, i, j, offset, par_buf_offset = 0; + unsigned int n, i, j, parity_pos, to_copy; uint16_t par_buf[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN]; u8 *par, *block; + u64 parity_block; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); - par = fec_read_parity(v, rsb, block_offset, &offset, - par_buf_offset, &buf, bio->bi_ioprio); - if (IS_ERR(par)) + /* + * Compute the index of the first parity block that will be needed and + * the starting position in that block. Then read that block. + * + * io_size is always a power of 2, but roots might not be. Note that + * when it's not, a codeword's parity bytes can span a block boundary. + */ + parity_block = (rsb + block_offset) * v->fec->roots; + parity_pos = parity_block & (v->fec->io_size - 1); + parity_block >>= v->data_dev_block_bits; + par = dm_bufio_read_with_ioprio(v->fec->bufio, parity_block, &buf, + bio->bi_ioprio); + if (IS_ERR(par)) { + DMERR("%s: FEC %llu: parity read failed (block %llu): %ld", + v->data_dev->name, rsb, parity_block, PTR_ERR(par)); return PTR_ERR(par); + } /* * Decode the RS blocks we have in bufs. 
Each RS block results in @@ -118,8 +102,32 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, */ fec_for_each_buffer_rs_block(fio, n, i) { block = fec_buffer_rs_block(v, fio, n, i); - for (j = 0; j < v->fec->roots - par_buf_offset; j++) - par_buf[par_buf_offset + j] = par[offset + j]; + + /* + * Copy the next 'roots' parity bytes to 'par_buf', reading + * another parity block if needed. + */ + to_copy = min(v->fec->io_size - parity_pos, v->fec->roots); + for (j = 0; j < to_copy; j++) + par_buf[j] = par[parity_pos++]; + if (to_copy < v->fec->roots) { + parity_block++; + parity_pos = 0; + + dm_bufio_release(buf); + par = dm_bufio_read_with_ioprio(v->fec->bufio, + parity_block, &buf, + bio->bi_ioprio); + if (IS_ERR(par)) { + DMERR("%s: FEC %llu: parity read failed (block %llu): %ld", + v->data_dev->name, rsb, parity_block, + PTR_ERR(par)); + return PTR_ERR(par); + } + for (; j < v->fec->roots; j++) + par_buf[j] = par[parity_pos++]; + } + /* Decode an RS block using Reed-Solomon */ res = decode_rs8(fio->rs, block, par_buf, v->fec->rsn, NULL, neras, fio->erasures, 0, NULL); @@ -134,26 +142,6 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, block_offset++; if (block_offset >= 1 << v->data_dev_block_bits) goto done; - - /* Read the next block when we run out of parity bytes */ - offset += (v->fec->roots - par_buf_offset); - /* Check if parity bytes are split between blocks */ - if (offset < v->fec->io_size && (offset + v->fec->roots) > v->fec->io_size) { - par_buf_offset = v->fec->io_size - offset; - for (j = 0; j < par_buf_offset; j++) - par_buf[j] = par[offset + j]; - offset += par_buf_offset; - } else - par_buf_offset = 0; - - if (offset >= v->fec->io_size) { - dm_bufio_release(buf); - - par = fec_read_parity(v, rsb, block_offset, &offset, - par_buf_offset, &buf, bio->bi_ioprio); - if (IS_ERR(par)) - return PTR_ERR(par); - } } done: r = corrected; From a9fbb31af763344072dbd7afcdff79438c70d480 Mon Sep 17 00:00:00 2001 
From: Eric Biggers Date: Thu, 5 Feb 2026 20:59:25 -0800 Subject: [PATCH 23/62] dm-verity: rename dm_verity::hash_blocks to dm_verity::hash_end Rename hash_blocks to hash_end to reflect what it actually is. Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 2 +- drivers/md/dm-verity-target.c | 8 ++++---- drivers/md/dm-verity.h | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index e5d38bb3f16f..7a5ca1186af0 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -635,7 +635,7 @@ int verity_fec_ctr(struct dm_verity *v) * hash device after the hash blocks. */ - hash_blocks = v->hash_blocks - v->hash_start; + hash_blocks = v->hash_end - v->hash_start; /* * Require matching block sizes for data and hash devices for diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 61073cd01d13..e1d435c79e96 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -733,8 +733,8 @@ static void verity_prefetch_io(struct work_struct *work) hash_block_start &= ~(sector_t)(cluster - 1); hash_block_end |= cluster - 1; - if (unlikely(hash_block_end >= v->hash_blocks)) - hash_block_end = v->hash_blocks - 1; + if (unlikely(hash_block_end >= v->hash_end)) + hash_block_end = v->hash_end - 1; } no_prefetch_cluster: dm_bufio_prefetch_with_ioprio(v->bufio, hash_block_start, @@ -1607,7 +1607,7 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) } hash_position += s; } - v->hash_blocks = hash_position; + v->hash_end = hash_position; r = mempool_init_page_pool(&v->recheck_pool, 1, 0); if (unlikely(r)) { @@ -1634,7 +1634,7 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - if (dm_bufio_get_device_size(v->bufio) < v->hash_blocks) { + if (dm_bufio_get_device_size(v->bufio) < v->hash_end) { ti->error = "Hash device is too small"; r = -E2BIG; goto 
bad; diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index d6bfabb27113..2922263501f6 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -53,9 +53,9 @@ struct dm_verity { unsigned int sig_size; /* root digest signature size */ #endif /* CONFIG_SECURITY */ unsigned int salt_size; - sector_t hash_start; /* hash start in blocks */ + sector_t hash_start; /* index of first hash block on hash_dev */ + sector_t hash_end; /* 1 + index of last hash block on hash dev */ sector_t data_blocks; /* the number of data blocks */ - sector_t hash_blocks; /* the number of hash blocks */ unsigned char data_dev_block_bits; /* log2(data blocksize) */ unsigned char hash_dev_block_bits; /* log2(hash blocksize) */ unsigned char hash_per_block_bits; /* log2(hashes in hash block) */ From 05777b2800b060585d601705a8d1bb5afadc2f11 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Feb 2026 20:59:26 -0800 Subject: [PATCH 24/62] dm-verity-fec: improve documentation for Forward Error Correction Update verity.rst to add a dedicated section about FEC and improve the documentation for the FEC-related parameters. Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- .../admin-guide/device-mapper/verity.rst | 122 +++++++++++++++--- 1 file changed, 102 insertions(+), 20 deletions(-) diff --git a/Documentation/admin-guide/device-mapper/verity.rst b/Documentation/admin-guide/device-mapper/verity.rst index 3ecab1cff9c6..eb9475d7e196 100644 --- a/Documentation/admin-guide/device-mapper/verity.rst +++ b/Documentation/admin-guide/device-mapper/verity.rst @@ -102,29 +102,42 @@ ignore_zero_blocks that are not guaranteed to contain zeroes. use_fec_from_device - Use forward error correction (FEC) to recover from corruption if hash - verification fails. Use encoding data from the specified device. This - may be the same device where data and hash blocks reside, in which case - fec_start must be outside data and hash areas. 
+ Use forward error correction (FEC) parity data from the specified device to + try to automatically recover from corruption and I/O errors. - If the encoding data covers additional metadata, it must be accessible - on the hash device after the hash blocks. + If this option is given, then and must also be + given. must also be equal to . - Note: block sizes for data and hash devices must match. Also, if the - verity is encrypted the should be too. + can be the same as , in which case must be + outside the data area. It can also be the same as , in which case + must be outside the hash and optional additional metadata areas. + + If the data is encrypted, the should be too. + + For more information, see `Forward error correction`_. fec_roots - Number of generator roots. This equals to the number of parity bytes in - the encoding data. For example, in RS(M, N) encoding, the number of roots - is M-N. + The number of parity bytes in each 255-byte Reed-Solomon codeword. The + Reed-Solomon code used will be an RS(255, k) code where k = 255 - fec_roots. + + The supported values are 2 through 24 inclusive. Higher values provide + stronger error correction. However, the minimum value of 2 already provides + strong error correction due to the use of interleaving, so 2 is the + recommended value for most users. fec_roots=2 corresponds to an + RS(255, 253) code, which has a space overhead of about 0.8%. fec_blocks - The number of encoding data blocks on the FEC device. The block size for - the FEC device is . + The total number of blocks that are error-checked using + FEC. This must be at least the sum of and the number of + blocks needed by the hash tree. It can include additional metadata blocks, + which are assumed to be accessible on following the hash blocks. + + Note that this is *not* the number of parity blocks. The number of parity + blocks is inferred from , , and . 
fec_start - This is the offset, in blocks, from the start of the - FEC device to the beginning of the encoding data. + This is the offset, in blocks, from the start of <fec_dev> + to the beginning of the parity data. check_at_most_once Verify data blocks only the first time they are read from the data device, @@ -180,11 +193,6 @@ per-block basis. This allows for a lightweight hash computation on first read into the page cache. Block hashes are stored linearly, aligned to the nearest block size. -If forward error correction (FEC) support is enabled any recovery of -corrupted data will be verified using the cryptographic hash of the -corresponding data. This is why combining error correction with -integrity checking is essential. - Hash Tree --------- @@ -212,6 +220,80 @@ The tree looks something like: / ... \ / . . . \ / \ blk_0 ... blk_127 blk_16256 blk_16383 blk_32640 . . . blk_32767 +Forward error correction +------------------------ + +dm-verity's optional forward error correction (FEC) support adds strong error +correction capabilities to dm-verity. It allows systems that would be rendered +inoperable by errors to continue operating, albeit with reduced performance. + +FEC uses Reed-Solomon (RS) codes that are interleaved across the entire +device(s), allowing long bursts of corrupt or unreadable blocks to be recovered. + +dm-verity validates any FEC-corrected block against the wanted hash before using +it. Therefore, FEC doesn't affect the security properties of dm-verity. + +The integration of FEC with dm-verity provides significant benefits over a +separate error correction layer: + +- dm-verity invokes FEC only when a block's hash doesn't match the wanted hash + or the block cannot be read at all. As a result, FEC doesn't add overhead to + the common case where no error occurs. + +- dm-verity hashes are also used to identify erasure locations for RS decoding. + This allows correcting twice as many errors. + +FEC uses an RS(255, k) code where k = 255 - fec_roots. 
fec_roots is usually 2. +This means that each k (usually 253) message bytes have fec_roots (usually 2) +bytes of parity data added to get a 255-byte codeword. (Many external sources +call RS codewords "blocks". Since dm-verity already uses the term "block" to +mean something else, we'll use the clearer term "RS codeword".) + +FEC checks fec_blocks blocks of message data in total, consisting of: + +1. The data blocks from the data device +2. The hash blocks from the hash device +3. Optional additional metadata that follows the hash blocks on the hash device + +dm-verity assumes that the FEC parity data was computed as if the following +procedure were followed: + +1. Concatenate the message data from the above sources. +2. Zero-pad to the next multiple of k blocks. Let msg be the resulting byte + array, and msglen its length in bytes. +3. For 0 <= i < msglen / k (for each RS codeword): + a. Select msg[i + j * msglen / k] for 0 <= j < k. + Consider these to be the 'k' message bytes of an RS codeword. + b. Compute the corresponding 'fec_roots' parity bytes of the RS codeword, + and concatenate them to the FEC parity data. + +Step 3a interleaves the RS codewords across the entire device using an +interleaving degree of data_block_size * ceil(fec_blocks / k). This is the +maximal interleaving, such that the message data consists of a region containing +byte 0 of all the RS codewords, then a region containing byte 1 of all the RS +codewords, and so on up to the region for byte 'k - 1'. Note that the number of +codewords is set to a multiple of data_block_size; thus, the regions are +block-aligned, and there is an implicit zero padding of up to 'k - 1' blocks. + +This interleaving allows long bursts of errors to be corrected. It provides +much stronger error correction than storage devices typically provide, while +keeping the space overhead low. 
+ +The cost is slow decoding: correcting a single block usually requires reading +254 extra blocks spread evenly across the device(s). However, that is +acceptable because dm-verity uses FEC only when there is actually an error. + +The list below contains additional details about the RS codes used by +dm-verity's FEC. Userspace programs that generate the parity data need to use +these parameters for the parity data to match exactly: + +- Field used is GF(256) +- Bytes are mapped to/from GF(256) elements in the natural way, where bits 0 + through 7 (low-order to high-order) map to the coefficients of x^0 through x^7 +- Field generator polynomial is x^8 + x^4 + x^3 + x^2 + 1 +- The codes used are systematic, BCH-view codes +- Primitive element alpha is 'x' +- First consecutive root of code generator polynomial is 'x^0' On-disk format ============== From 82fbd6a3e29a329d439690cd7ccc4162c9cd8db6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Feb 2026 20:59:27 -0800 Subject: [PATCH 25/62] dm-verity-fec: replace {MAX,MIN}_RSN with {MIN,MAX}_ROOTS Every time DM_VERITY_FEC_{MAX,MIN}_RSN are used, they are subtracted from DM_VERITY_FEC_RSM to get the bounds on the number of roots. Therefore, replace these with {MIN,MAX}_ROOTS constants which are more directly useful. (Note the inversion, where MAX_RSN maps to MIN_ROOTS and MIN_RSN maps to MAX_ROOTS.) No functional change. 
Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 6 +++--- drivers/md/dm-verity-fec.h | 7 +++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 7a5ca1186af0..0622b7496b8d 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -73,7 +73,7 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, int r, corrected = 0, res; struct dm_buffer *buf; unsigned int n, i, j, parity_pos, to_copy; - uint16_t par_buf[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN]; + uint16_t par_buf[DM_VERITY_FEC_MAX_ROOTS]; u8 *par, *block; u64 parity_block; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); @@ -572,8 +572,8 @@ int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, } else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_ROOTS)) { if (sscanf(arg_value, "%hhu%c", &num_c, &dummy) != 1 || !num_c || - num_c < (DM_VERITY_FEC_RSM - DM_VERITY_FEC_MAX_RSN) || - num_c > (DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN)) { + num_c < DM_VERITY_FEC_MIN_ROOTS || + num_c > DM_VERITY_FEC_MAX_ROOTS) { ti->error = "Invalid " DM_VERITY_OPT_FEC_ROOTS; return -EINVAL; } diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 32ca2bfee1db..d8d0e81da270 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -13,8 +13,8 @@ /* Reed-Solomon(M, N) parameters */ #define DM_VERITY_FEC_RSM 255 -#define DM_VERITY_FEC_MAX_RSN 253 -#define DM_VERITY_FEC_MIN_RSN 231 /* ~10% space overhead */ +#define DM_VERITY_FEC_MIN_ROOTS 2 /* RS(255, 253): ~0.8% space overhead */ +#define DM_VERITY_FEC_MAX_ROOTS 24 /* RS(255, 231): ~10% space overhead */ /* buffers for deinterleaving and decoding */ #define DM_VERITY_FEC_BUF_RS_BITS 4 /* 1 << RS blocks per buffer */ @@ -47,8 +47,7 @@ struct dm_verity_fec { /* per-bio data */ struct dm_verity_fec_io { struct rs_control *rs; /* Reed-Solomon state */ - /* 
erasures for decode_rs8 */ - int erasures[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN + 1]; + int erasures[DM_VERITY_FEC_MAX_ROOTS + 1]; /* erasures for decode_rs8 */ u8 *output; /* buffer for corrected output */ unsigned int level; /* recursion level */ unsigned int nbufs; /* number of buffers allocated */ From f34ebde14c7c23fa9844cc5c03209048510fd686 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Feb 2026 20:59:28 -0800 Subject: [PATCH 26/62] dm-verity-fec: use standard names for Reed-Solomon parameters "RS(n, k)" is by far the most common and standard notation for describing Reed-Solomon codes. Each RS codeword consists of 'n' symbols, divided into 'k' message symbols and 'n - k' parity symbols. 'n - k' is also the number of roots of the generator polynomial. dm-verity uses "RS(M, N)" instead. I haven't been able to find any other source that uses this convention. This quirk makes the code harder to understand than necessary, especially due to dm-verity's 'N' meaning something different from the standard 'n'. Therefore, update dm-verity-fec.c and dm-verity-fec.h to use the standard parameter names. No functional changes. 
Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 30 +++++++++++++++--------------- drivers/md/dm-verity-fec.h | 8 ++++---- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 0622b7496b8d..e45880dd5eee 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -29,7 +29,7 @@ static inline u64 fec_interleave(struct dm_verity *v, u64 offset) { u32 mod; - mod = do_div(offset, v->fec->rsn); + mod = do_div(offset, v->fec->rs_k); return offset + mod * (v->fec->rounds << v->data_dev_block_bits); } @@ -50,7 +50,7 @@ static inline u8 *fec_buffer_rs_block(struct dm_verity *v, struct dm_verity_fec_io *fio, unsigned int i, unsigned int j) { - return &fio->bufs[i][j * v->fec->rsn]; + return &fio->bufs[i][j * v->fec->rs_k]; } /* @@ -129,7 +129,7 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, } /* Decode an RS block using Reed-Solomon */ - res = decode_rs8(fio->rs, block, par_buf, v->fec->rsn, + res = decode_rs8(fio->rs, block, par_buf, v->fec->rs_k, NULL, neras, fio->erasures, 0, NULL); if (res < 0) { r = res; @@ -197,15 +197,15 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, return -EINVAL; /* - * read each of the rsn data blocks that are part of the RS block, and + * read each of the rs_k data blocks that are part of the RS block, and * interleave contents to available bufs */ - for (i = 0; i < v->fec->rsn; i++) { - ileaved = fec_interleave(v, rsb * v->fec->rsn + i); + for (i = 0; i < v->fec->rs_k; i++) { + ileaved = fec_interleave(v, rsb * v->fec->rs_k + i); /* * target is the data block we want to correct, target_index is - * the index of this block within the rsn RS blocks + * the index of this block within the rs_k RS blocks */ if (ileaved == target) target_index = i; @@ -322,7 +322,7 @@ static void fec_init_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) unsigned int n; 
fec_for_each_buffer(fio, n) - memset(fio->bufs[n], 0, v->fec->rsn << DM_VERITY_FEC_BUF_RS_BITS); + memset(fio->bufs[n], 0, v->fec->rs_k << DM_VERITY_FEC_BUF_RS_BITS); memset(fio->erasures, 0, sizeof(fio->erasures)); } @@ -394,12 +394,12 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, block = block - v->hash_start + v->data_blocks; /* - * For RS(M, N), the continuous FEC data is divided into blocks of N - * bytes. Since block size may not be divisible by N, the last block + * For RS(n, k), the continuous FEC data is divided into blocks of k + * bytes. Since block size may not be divisible by k, the last block * is zero padded when decoding. * - * Each byte of the block is covered by a different RS(M, N) code, - * and each code is interleaved over N blocks to make it less likely + * Each byte of the block is covered by a different RS(n, k) code, + * and each code is interleaved over k blocks to make it less likely * that bursty corruption will leave us in unrecoverable state. 
*/ @@ -650,7 +650,7 @@ int verity_fec_ctr(struct dm_verity *v) ti->error = "Missing " DM_VERITY_OPT_FEC_ROOTS; return -EINVAL; } - f->rsn = DM_VERITY_FEC_RSM - f->roots; + f->rs_k = DM_VERITY_FEC_RS_N - f->roots; if (!f->blocks) { ti->error = "Missing " DM_VERITY_OPT_FEC_BLOCKS; @@ -658,7 +658,7 @@ int verity_fec_ctr(struct dm_verity *v) } f->rounds = f->blocks; - if (sector_div(f->rounds, f->rsn)) + if (sector_div(f->rounds, f->rs_k)) f->rounds++; /* @@ -730,7 +730,7 @@ int verity_fec_ctr(struct dm_verity *v) } f->cache = kmem_cache_create("dm_verity_fec_buffers", - f->rsn << DM_VERITY_FEC_BUF_RS_BITS, + f->rs_k << DM_VERITY_FEC_BUF_RS_BITS, 0, 0, NULL); if (!f->cache) { ti->error = "Cannot create FEC buffer cache"; diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index d8d0e81da270..5afa93f2f1fc 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -11,8 +11,8 @@ #include "dm-verity.h" #include -/* Reed-Solomon(M, N) parameters */ -#define DM_VERITY_FEC_RSM 255 +/* Reed-Solomon(n, k) parameters */ +#define DM_VERITY_FEC_RS_N 255 #define DM_VERITY_FEC_MIN_ROOTS 2 /* RS(255, 253): ~0.8% space overhead */ #define DM_VERITY_FEC_MAX_ROOTS 24 /* RS(255, 231): ~10% space overhead */ @@ -34,8 +34,8 @@ struct dm_verity_fec { sector_t blocks; /* number of blocks covered */ sector_t rounds; /* number of interleaving rounds */ sector_t hash_blocks; /* blocks covered after v->hash_start */ - unsigned char roots; /* number of parity bytes, M-N of RS(M, N) */ - unsigned char rsn; /* N of RS(M, N) */ + unsigned char roots; /* parity bytes per RS codeword, n-k of RS(n, k) */ + unsigned char rs_k; /* message bytes per RS codeword, k of RS(n, k) */ mempool_t fio_pool; /* mempool for dm_verity_fec_io */ mempool_t rs_pool; /* mempool for fio->rs */ mempool_t prealloc_pool; /* mempool for preallocated buffers */ From 9b6098ad5b19261fd319b2637a28c69693585908 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Feb 2026 20:59:29 -0800 
Subject: [PATCH 27/62] dm-verity-fec: rename "RS block" to "RS codeword" The literature refers to the unit of a Reed-Solomon (RS) code as either a "block" or a "codeword". dm-verity's source code uses "RS block". Unfortunately, that's really confusing because "block" already means something else in dm-verity. Especially problematic is the fact that dm-verity sometimes uses "RS block" to mean an RS codeword and sometimes to mean some dm-verity block that's related to the RS decoding process, for example one of the blocks that shares its RS codewords with the target block. Let's use "RS codeword" instead, or "RS message" when referring to just the message part of the codeword. Update some comments, function names, macro names, and variable names accordingly. No functional change. There are still some remaining comments where "RS block" refers to a dm-verity block. Later commits will handle these cases. Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 58 ++++++++++++++++++++------------------ drivers/md/dm-verity-fec.h | 10 +++---- 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index e45880dd5eee..148c85174c56 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -11,11 +11,12 @@ #define DM_MSG_PREFIX "verity-fec" /* - * When correcting a data block, the FEC code performs optimally when it can - * collect all the associated RS blocks at the same time. As each byte is part - * of a different RS block, there are '1 << data_dev_block_bits' RS blocks. - * There are '1 << DM_VERITY_FEC_BUF_RS_BITS' RS blocks per buffer, so that - * gives '1 << (data_dev_block_bits - DM_VERITY_FEC_BUF_RS_BITS)' buffers. + * When correcting a block, the FEC implementation performs optimally when it + * can collect all the associated RS codewords at the same time. 
As each byte + * is part of a different codeword, there are '1 << data_dev_block_bits' + * codewords. Each buffer has space for the message bytes for + * '1 << DM_VERITY_FEC_BUF_RS_BITS' codewords, so that gives + * '1 << (data_dev_block_bits - DM_VERITY_FEC_BUF_RS_BITS)' buffers. */ static inline unsigned int fec_max_nbufs(struct dm_verity *v) { @@ -37,25 +38,26 @@ static inline u64 fec_interleave(struct dm_verity *v, u64 offset) #define fec_for_each_buffer(io, __i) \ for (__i = 0; __i < (io)->nbufs; __i++) -/* Loop over each RS block in each allocated buffer. */ -#define fec_for_each_buffer_rs_block(io, __i, __j) \ +/* Loop over each RS message in each allocated buffer. */ +/* To stop early, use 'goto', not 'break' (since this uses nested loops). */ +#define fec_for_each_buffer_rs_message(io, __i, __j) \ fec_for_each_buffer(io, __i) \ for (__j = 0; __j < 1 << DM_VERITY_FEC_BUF_RS_BITS; __j++) /* - * Return a pointer to the current RS block when called inside - * fec_for_each_buffer_rs_block. + * Return a pointer to the current RS message when called inside + * fec_for_each_buffer_rs_message. */ -static inline u8 *fec_buffer_rs_block(struct dm_verity *v, - struct dm_verity_fec_io *fio, - unsigned int i, unsigned int j) +static inline u8 *fec_buffer_rs_message(struct dm_verity *v, + struct dm_verity_fec_io *fio, + unsigned int i, unsigned int j) { return &fio->bufs[i][j * v->fec->rs_k]; } /* - * Return an index to the current RS block when called inside - * fec_for_each_buffer_rs_block. + * Return the index of the current RS message when called inside + * fec_for_each_buffer_rs_message. */ static inline unsigned int fec_buffer_rs_index(unsigned int i, unsigned int j) { @@ -63,8 +65,8 @@ static inline unsigned int fec_buffer_rs_index(unsigned int i, unsigned int j) } /* - * Decode all RS blocks from buffers and copy corrected bytes into fio->output - * starting from block_offset. + * Decode all RS codewords whose message bytes were loaded into fio->bufs. 
Copy + * the corrected bytes into fio->output starting from block_offset. */ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, struct dm_verity_fec_io *fio, u64 rsb, int byte_index, @@ -74,7 +76,7 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, struct dm_buffer *buf; unsigned int n, i, j, parity_pos, to_copy; uint16_t par_buf[DM_VERITY_FEC_MAX_ROOTS]; - u8 *par, *block; + u8 *par, *msg_buf; u64 parity_block; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); @@ -97,11 +99,12 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, } /* - * Decode the RS blocks we have in bufs. Each RS block results in - * one corrected target byte and consumes fec->roots parity bytes. + * Decode the RS codewords whose message bytes are in bufs. Each RS + * codeword results in one corrected target byte and consumes fec->roots + * parity bytes. */ - fec_for_each_buffer_rs_block(fio, n, i) { - block = fec_buffer_rs_block(v, fio, n, i); + fec_for_each_buffer_rs_message(fio, n, i) { + msg_buf = fec_buffer_rs_message(v, fio, n, i); /* * Copy the next 'roots' parity bytes to 'par_buf', reading @@ -128,8 +131,8 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, par_buf[j] = par[parity_pos++]; } - /* Decode an RS block using Reed-Solomon */ - res = decode_rs8(fio->rs, block, par_buf, v->fec->rs_k, + /* Decode an RS codeword using the Reed-Solomon library. 
*/ + res = decode_rs8(fio->rs, msg_buf, par_buf, v->fec->rs_k, NULL, neras, fio->erasures, 0, NULL); if (res < 0) { r = res; @@ -137,7 +140,7 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, } corrected += res; - fio->output[block_offset] = block[byte_index]; + fio->output[block_offset] = msg_buf[byte_index]; block_offset++; if (block_offset >= 1 << v->data_dev_block_bits) @@ -185,7 +188,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, struct dm_bufio_client *bufio; struct dm_verity_fec_io *fio = io->fec_io; u64 block, ileaved; - u8 *bbuf, *rs_block; + u8 *bbuf; u8 want_digest[HASH_MAX_DIGESTSIZE]; unsigned int n, k; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); @@ -262,14 +265,13 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, * deinterleave and copy the bytes that fit into bufs, * starting from block_offset */ - fec_for_each_buffer_rs_block(fio, n, j) { + fec_for_each_buffer_rs_message(fio, n, j) { k = fec_buffer_rs_index(n, j) + block_offset; if (k >= 1 << v->data_dev_block_bits) goto done; - rs_block = fec_buffer_rs_block(v, fio, n, j); - rs_block[i] = bbuf[k]; + fec_buffer_rs_message(v, fio, n, j)[i] = bbuf[k]; } done: dm_bufio_release(buf); diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 5afa93f2f1fc..257a609274c7 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -17,7 +17,7 @@ #define DM_VERITY_FEC_MAX_ROOTS 24 /* RS(255, 231): ~10% space overhead */ /* buffers for deinterleaving and decoding */ -#define DM_VERITY_FEC_BUF_RS_BITS 4 /* 1 << RS blocks per buffer */ +#define DM_VERITY_FEC_BUF_RS_BITS 4 /* log2(RS messages per buffer) */ #define DM_VERITY_OPT_FEC_DEV "use_fec_from_device" #define DM_VERITY_OPT_FEC_BLOCKS "fec_blocks" @@ -52,10 +52,10 @@ struct dm_verity_fec_io { unsigned int level; /* recursion level */ unsigned int nbufs; /* number of buffers allocated */ /* - * Buffers for 
deinterleaving RS blocks. Each buffer has space for - * the data bytes of (1 << DM_VERITY_FEC_BUF_RS_BITS) RS blocks. The - * array length is fec_max_nbufs(v), and we try to allocate that many - * buffers. However, in low-memory situations we may be unable to + * Buffers for deinterleaving RS codewords. Each buffer has space for + * the message bytes of (1 << DM_VERITY_FEC_BUF_RS_BITS) RS codewords. + * The array length is fec_max_nbufs(v), and we try to allocate that + * many buffers. However, in low-memory situations we may be unable to * allocate all buffers. 'nbufs' holds the number actually allocated. */ u8 *bufs[]; From e75d55461871bbf5debe33f528f267e23c84a370 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Feb 2026 20:59:30 -0800 Subject: [PATCH 28/62] dm-verity-fec: replace io_size with block_size dm-verity's FEC implementation assumes that data_block_size == hash_block_size, and it accesses the FEC device in units of the same size. Many places in the code want that size and compute it on-demand as '1 << v->data_dev_block_bits'. However, it's actually already available in v->fec->io_size. Rename that field to block_size, initialize it a bit earlier, and use it in the appropriate places. Note that while these sizes could in principle be different, that case is not supported. So there's no need to complicate the code for it. Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 30 +++++++++++++----------------- drivers/md/dm-verity-fec.h | 2 +- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 148c85174c56..1251d45f6f85 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -84,11 +84,11 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, * Compute the index of the first parity block that will be needed and * the starting position in that block. Then read that block. 
* - * io_size is always a power of 2, but roots might not be. Note that + * block_size is always a power of 2, but roots might not be. Note that * when it's not, a codeword's parity bytes can span a block boundary. */ parity_block = (rsb + block_offset) * v->fec->roots; - parity_pos = parity_block & (v->fec->io_size - 1); + parity_pos = parity_block & (v->fec->block_size - 1); parity_block >>= v->data_dev_block_bits; par = dm_bufio_read_with_ioprio(v->fec->bufio, parity_block, &buf, bio->bi_ioprio); @@ -110,7 +110,7 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, * Copy the next 'roots' parity bytes to 'par_buf', reading * another parity block if needed. */ - to_copy = min(v->fec->io_size - parity_pos, v->fec->roots); + to_copy = min(v->fec->block_size - parity_pos, v->fec->roots); for (j = 0; j < to_copy; j++) par_buf[j] = par[parity_pos++]; if (to_copy < v->fec->roots) { @@ -143,7 +143,7 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, fio->output[block_offset] = msg_buf[byte_index]; block_offset++; - if (block_offset >= 1 << v->data_dev_block_bits) + if (block_offset >= v->fec->block_size) goto done; } done: @@ -167,7 +167,7 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, const u8 *want_digest, const u8 *data) { - if (unlikely(verity_hash(v, io, data, 1 << v->data_dev_block_bits, + if (unlikely(verity_hash(v, io, data, v->fec->block_size, io->tmp_digest))) return 0; @@ -268,7 +268,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, fec_for_each_buffer_rs_message(fio, n, j) { k = fec_buffer_rs_index(n, j) + block_offset; - if (k >= 1 << v->data_dev_block_bits) + if (k >= v->fec->block_size) goto done; fec_buffer_rs_message(v, fio, n, j)[i] = bbuf[k]; @@ -341,7 +341,7 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, int r, neras = 0; unsigned int pos; - for (pos 
= 0; pos < 1 << v->data_dev_block_bits; ) { + for (pos = 0; pos < v->fec->block_size;) { fec_init_bufs(v, fio); r = fec_read_bufs(v, io, rsb, offset, pos, @@ -357,8 +357,7 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, } /* Always re-validate the corrected block against the expected hash */ - r = verity_hash(v, io, fio->output, 1 << v->data_dev_block_bits, - io->tmp_digest); + r = verity_hash(v, io, fio->output, v->fec->block_size, io->tmp_digest); if (unlikely(r < 0)) return r; @@ -426,7 +425,7 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, goto done; } - memcpy(dest, fio->output, 1 << v->data_dev_block_bits); + memcpy(dest, fio->output, v->fec->block_size); atomic64_inc(&v->fec->corrected); done: @@ -647,6 +646,7 @@ int verity_fec_ctr(struct dm_verity *v) ti->error = "Block sizes must match to use FEC"; return -EINVAL; } + f->block_size = 1 << v->data_dev_block_bits; if (!f->roots) { ti->error = "Missing " DM_VERITY_OPT_FEC_ROOTS; @@ -684,10 +684,7 @@ int verity_fec_ctr(struct dm_verity *v) return -E2BIG; } - f->io_size = 1 << v->data_dev_block_bits; - - f->bufio = dm_bufio_client_create(f->dev->bdev, - f->io_size, + f->bufio = dm_bufio_client_create(f->dev->bdev, f->block_size, 1, 0, NULL, NULL, 0); if (IS_ERR(f->bufio)) { ti->error = "Cannot initialize FEC bufio client"; @@ -701,8 +698,7 @@ int verity_fec_ctr(struct dm_verity *v) return -E2BIG; } - f->data_bufio = dm_bufio_client_create(v->data_dev->bdev, - 1 << v->data_dev_block_bits, + f->data_bufio = dm_bufio_client_create(v->data_dev->bdev, f->block_size, 1, 0, NULL, NULL, 0); if (IS_ERR(f->data_bufio)) { ti->error = "Cannot initialize FEC data bufio client"; @@ -749,7 +745,7 @@ int verity_fec_ctr(struct dm_verity *v) /* Preallocate an output buffer for each thread */ ret = mempool_init_kmalloc_pool(&f->output_pool, num_online_cpus(), - 1 << v->data_dev_block_bits); + f->block_size); if (ret) { ti->error = "Cannot allocate FEC output pool"; return ret; 
diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 257a609274c7..49d43894ea74 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -29,7 +29,7 @@ struct dm_verity_fec { struct dm_dev *dev; /* parity data device */ struct dm_bufio_client *data_bufio; /* for data dev access */ struct dm_bufio_client *bufio; /* for parity data access */ - size_t io_size; /* IO size for roots */ + size_t block_size; /* size of data, hash, and parity blocks in bytes */ sector_t start; /* parity data start in blocks */ sector_t blocks; /* number of blocks covered */ sector_t rounds; /* number of interleaving rounds */ From 41208f3707e97976f3b1b7c36c4c094e05e5cf1d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Feb 2026 20:59:31 -0800 Subject: [PATCH 29/62] dm-verity-fec: rename rounds to region_blocks It's hard to reconcile the value stored in dm_verity_fec::rounds with its name and documentation. Most likely "rounds" is being used as an alias for what is more commonly called the interleaving degree or "number of ways". But the interleaving is done at the byte level, whereas the units of "rounds" are blocks. So it's not really that. In practice, the reason the code needs this value is that it expresses the number of blocks in each "region" of the message data, where each region contains the bytes from a particular index in the RS codewords. Rename it to region_blocks to make the code a bit more understandable. 
Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 16 ++++++++-------- drivers/md/dm-verity-fec.h | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 1251d45f6f85..d132fd5dc7b4 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -31,7 +31,7 @@ static inline u64 fec_interleave(struct dm_verity *v, u64 offset) u32 mod; mod = do_div(offset, v->fec->rs_k); - return offset + mod * (v->fec->rounds << v->data_dev_block_bits); + return offset + mod * (v->fec->region_blocks << v->data_dev_block_bits); } /* Loop over each allocated buffer. */ @@ -405,13 +405,13 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, */ offset = block << v->data_dev_block_bits; - res = div64_u64(offset, v->fec->rounds << v->data_dev_block_bits); + res = div64_u64(offset, v->fec->region_blocks << v->data_dev_block_bits); /* * The base RS block we can feed to the interleaver to find out all * blocks required for decoding. */ - rsb = offset - res * (v->fec->rounds << v->data_dev_block_bits); + rsb = offset - res * (v->fec->region_blocks << v->data_dev_block_bits); /* * Locating erasures is slow, so attempt to recover the block without @@ -659,15 +659,15 @@ int verity_fec_ctr(struct dm_verity *v) return -EINVAL; } - f->rounds = f->blocks; - if (sector_div(f->rounds, f->rs_k)) - f->rounds++; + f->region_blocks = f->blocks; + if (sector_div(f->region_blocks, f->rs_k)) + f->region_blocks++; /* * Due to optional metadata, f->blocks can be larger than * data_blocks and hash_blocks combined. 
*/ - if (f->blocks < v->data_blocks + hash_blocks || !f->rounds) { + if (f->blocks < v->data_blocks + hash_blocks || !f->region_blocks) { ti->error = "Invalid " DM_VERITY_OPT_FEC_BLOCKS; return -EINVAL; } @@ -693,7 +693,7 @@ int verity_fec_ctr(struct dm_verity *v) dm_bufio_set_sector_offset(f->bufio, f->start << (v->data_dev_block_bits - SECTOR_SHIFT)); - if (dm_bufio_get_device_size(f->bufio) < f->rounds * f->roots) { + if (dm_bufio_get_device_size(f->bufio) < f->region_blocks * f->roots) { ti->error = "FEC device is too small"; return -E2BIG; } diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 49d43894ea74..50b5e187d5cc 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -32,7 +32,7 @@ struct dm_verity_fec { size_t block_size; /* size of data, hash, and parity blocks in bytes */ sector_t start; /* parity data start in blocks */ sector_t blocks; /* number of blocks covered */ - sector_t rounds; /* number of interleaving rounds */ + sector_t region_blocks; /* blocks per region: ceil(blocks / rs_k) */ sector_t hash_blocks; /* blocks covered after v->hash_start */ unsigned char roots; /* parity bytes per RS codeword, n-k of RS(n, k) */ unsigned char rs_k; /* message bytes per RS codeword, k of RS(n, k) */ From 8ef45923fdcb7ec44e3a965bcbf41723e20814e4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Feb 2026 20:59:32 -0800 Subject: [PATCH 30/62] dm-verity-fec: simplify computation of rsb To compute 'rsb', verity_fec_decode() divides 'offset' by 'v->fec->region_blocks << v->data_dev_block_bits', then subtracts the quotient times that divisor. That's simply the long way to do a modulo operation, i.e. a - b * floor(a / b) instead of just a % b. Use div64_u64_rem() to get the remainder more concisely. 
Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index d132fd5dc7b4..d2c55896e6f7 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -377,7 +377,7 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, { int r; struct dm_verity_fec_io *fio; - u64 offset, res, rsb; + u64 offset, rsb; if (!verity_fec_is_enabled(v)) return -EOPNOTSUPP; @@ -405,13 +405,13 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, */ offset = block << v->data_dev_block_bits; - res = div64_u64(offset, v->fec->region_blocks << v->data_dev_block_bits); /* * The base RS block we can feed to the interleaver to find out all * blocks required for decoding. */ - rsb = offset - res * (v->fec->region_blocks << v->data_dev_block_bits); + div64_u64_rem(offset, v->fec->region_blocks << v->data_dev_block_bits, + &rsb); /* * Locating erasures is slow, so attempt to recover the block without From 5ef22361fa98a44409f11a10cb7d08b5cbf6d57c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Feb 2026 20:59:33 -0800 Subject: [PATCH 31/62] dm-verity-fec: simplify computation of ileaved fec_read_bufs() just iterates over a sequence of message blocks with step size region_blocks. At each step, 'ileaved' is just the offset (in bytes) to one of these blocks. Compute it in the straightforward way, eliminating fec_interleave(). In more detail, previously the code computed 'ileaved = (n / k) + (n % k) * (region_blocks * block_size)' where n = rsb * k + i and 0 <= i < k. Substituting 'n' gives: ileaved = ((rsb * k + i) / k) + ((rsb * k + i) % k) * region_blocks * block_size = rsb + (i * region_blocks * block_size) The result is more efficient and easier to understand. 
Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index d2c55896e6f7..baf988c29761 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -23,17 +23,6 @@ static inline unsigned int fec_max_nbufs(struct dm_verity *v) return 1 << (v->data_dev_block_bits - DM_VERITY_FEC_BUF_RS_BITS); } -/* - * Return an interleaved offset for a byte in RS block. - */ -static inline u64 fec_interleave(struct dm_verity *v, u64 offset) -{ - u32 mod; - - mod = do_div(offset, v->fec->rs_k); - return offset + mod * (v->fec->region_blocks << v->data_dev_block_bits); -} - /* Loop over each allocated buffer. */ #define fec_for_each_buffer(io, __i) \ for (__i = 0; __i < (io)->nbufs; __i++) @@ -204,7 +193,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, * interleave contents to available bufs */ for (i = 0; i < v->fec->rs_k; i++) { - ileaved = fec_interleave(v, rsb * v->fec->rs_k + i); + ileaved = rsb + i * (v->fec->region_blocks << v->data_dev_block_bits); /* * target is the data block we want to correct, target_index is From 96dfabe7382bb984a702b689b69b784d2a29ca0b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Feb 2026 20:59:34 -0800 Subject: [PATCH 32/62] dm-verity-fec: simplify deinterleaving Since fec_read_bufs() deinterleaves the bytes from 'bbuf' sequentially starting from 'block_offset', it can just do simple increments instead of the more complex fec_buffer_rs_index() computation. 
Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index baf988c29761..4aee948dde5d 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -44,15 +44,6 @@ static inline u8 *fec_buffer_rs_message(struct dm_verity *v, return &fio->bufs[i][j * v->fec->rs_k]; } -/* - * Return the index of the current RS message when called inside - * fec_for_each_buffer_rs_message. - */ -static inline unsigned int fec_buffer_rs_index(unsigned int i, unsigned int j) -{ - return (i << DM_VERITY_FEC_BUF_RS_BITS) + j; -} - /* * Decode all RS codewords whose message bytes were loaded into fio->bufs. Copy * the corrected bytes into fio->output starting from block_offset. @@ -179,7 +170,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, u64 block, ileaved; u8 *bbuf; u8 want_digest[HASH_MAX_DIGESTSIZE]; - unsigned int n, k; + unsigned int n, src_pos; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); if (neras) @@ -254,13 +245,11 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, * deinterleave and copy the bytes that fit into bufs, * starting from block_offset */ + src_pos = block_offset; fec_for_each_buffer_rs_message(fio, n, j) { - k = fec_buffer_rs_index(n, j) + block_offset; - - if (k >= v->fec->block_size) + if (src_pos >= v->fec->block_size) goto done; - - fec_buffer_rs_message(v, fio, n, j)[i] = bbuf[k]; + fec_buffer_rs_message(v, fio, n, j)[i] = bbuf[src_pos++]; } done: dm_bufio_release(buf); From 3ad2b952a3ea26c05ed6fdd6484051604b1dee66 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Feb 2026 20:59:35 -0800 Subject: [PATCH 33/62] dm-verity-fec: rename block_offset to out_pos The current position in the output block buffer is called 'pos' in fec_decode_rsb(), and 'block_offset' in fec_read_bufs() and 
fec_decode_bufs(). These names aren't very clear, especially 'block_offset' which is easily confused with the offset of a message or parity block or the position in the current parity block. Rename it to 'out_pos'. Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 4aee948dde5d..6dee48858ed6 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -46,11 +46,11 @@ static inline u8 *fec_buffer_rs_message(struct dm_verity *v, /* * Decode all RS codewords whose message bytes were loaded into fio->bufs. Copy - * the corrected bytes into fio->output starting from block_offset. + * the corrected bytes into fio->output starting from out_pos. */ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, struct dm_verity_fec_io *fio, u64 rsb, int byte_index, - unsigned int block_offset, int neras) + unsigned int out_pos, int neras) { int r, corrected = 0, res; struct dm_buffer *buf; @@ -67,7 +67,7 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, * block_size is always a power of 2, but roots might not be. Note that * when it's not, a codeword's parity bytes can span a block boundary. 
*/ - parity_block = (rsb + block_offset) * v->fec->roots; + parity_block = (rsb + out_pos) * v->fec->roots; parity_pos = parity_block & (v->fec->block_size - 1); parity_block >>= v->data_dev_block_bits; par = dm_bufio_read_with_ioprio(v->fec->bufio, parity_block, &buf, @@ -120,10 +120,9 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, } corrected += res; - fio->output[block_offset] = msg_buf[byte_index]; + fio->output[out_pos++] = msg_buf[byte_index]; - block_offset++; - if (block_offset >= v->fec->block_size) + if (out_pos >= v->fec->block_size) goto done; } done: @@ -159,8 +158,7 @@ static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, * fits into buffers. Check for erasure locations if @neras is non-NULL. */ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, - u64 rsb, u64 target, unsigned int block_offset, - int *neras) + u64 rsb, u64 target, unsigned int out_pos, int *neras) { bool is_zero; int i, j, target_index = -1; @@ -243,9 +241,9 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, /* * deinterleave and copy the bytes that fit into bufs, - * starting from block_offset + * starting from out_pos */ - src_pos = block_offset; + src_pos = out_pos; fec_for_each_buffer_rs_message(fio, n, j) { if (src_pos >= v->fec->block_size) goto done; @@ -317,21 +315,21 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, const u8 *want_digest, bool use_erasures) { int r, neras = 0; - unsigned int pos; + unsigned int out_pos; - for (pos = 0; pos < v->fec->block_size;) { + for (out_pos = 0; out_pos < v->fec->block_size;) { fec_init_bufs(v, fio); - r = fec_read_bufs(v, io, rsb, offset, pos, + r = fec_read_bufs(v, io, rsb, offset, out_pos, use_erasures ? 
&neras : NULL); if (unlikely(r < 0)) return r; - r = fec_decode_bufs(v, io, fio, rsb, r, pos, neras); + r = fec_decode_bufs(v, io, fio, rsb, r, out_pos, neras); if (r < 0) return r; - pos += fio->nbufs << DM_VERITY_FEC_BUF_RS_BITS; + out_pos += fio->nbufs << DM_VERITY_FEC_BUF_RS_BITS; } /* Always re-validate the corrected block against the expected hash */ From ca0da6cc096870e9e138f4c2bb78bd2560e29590 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Feb 2026 20:59:36 -0800 Subject: [PATCH 34/62] dm-verity-fec: move computation of offset and rsb down a level verity_fec_decode() computes (offset, rsb) from the target block index and calls fec_decode_rsb() with these parameters. Move this computation into fec_decode_rsb(), and rename fec_decode_rsb() to fec_decode(). This ends up being simpler and enables further refactoring, specifically making use of the quotient from the division more easily. The function renaming also eliminates a reference to the ambiguous term "rsb". This change does mean the same div64_u64_rem() can now be executed twice per block, since verity_fec_decode() calls fec_decode() up to twice per block. However, this cost is negligible compared to the rest of FEC. Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 46 +++++++++++++++----------------------- 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 6dee48858ed6..bd0c3faf2743 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -306,16 +306,26 @@ static void fec_init_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) } /* - * Decode all RS blocks in a single data block and return the target block - * (indicated by @offset) in fio->output. If @use_erasures is non-zero, uses - * hashes to locate erasures. + * Try to correct the message (data or hash) block at index @target_block. 
+ * + * If @use_erasures is true, use verity hashes to locate erasures. This makes + * the error correction slower but up to twice as capable. + * + * On success, return 0 and write the corrected block to @fio->output. 0 is + * returned only if the digest of the corrected block matches @want_digest; this + * is critical to ensure that FEC can't cause dm-verity to return bad data. */ -static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, - struct dm_verity_fec_io *fio, u64 rsb, u64 offset, - const u8 *want_digest, bool use_erasures) +static int fec_decode(struct dm_verity *v, struct dm_verity_io *io, + struct dm_verity_fec_io *fio, u64 target_block, + const u8 *want_digest, bool use_erasures) { int r, neras = 0; unsigned int out_pos; + u64 offset = target_block << v->data_dev_block_bits; + u64 rsb; + + div64_u64_rem(offset, v->fec->region_blocks << v->data_dev_block_bits, + &rsb); for (out_pos = 0; out_pos < v->fec->block_size;) { fec_init_bufs(v, fio); @@ -353,7 +363,6 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, { int r; struct dm_verity_fec_io *fio; - u64 offset, rsb; if (!verity_fec_is_enabled(v)) return -EOPNOTSUPP; @@ -370,33 +379,14 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, if (type == DM_VERITY_BLOCK_TYPE_METADATA) block = block - v->hash_start + v->data_blocks; - /* - * For RS(n, k), the continuous FEC data is divided into blocks of k - * bytes. Since block size may not be divisible by k, the last block - * is zero padded when decoding. - * - * Each byte of the block is covered by a different RS(n, k) code, - * and each code is interleaved over k blocks to make it less likely - * that bursty corruption will leave us in unrecoverable state. - */ - - offset = block << v->data_dev_block_bits; - - /* - * The base RS block we can feed to the interleaver to find out all - * blocks required for decoding. 
- */ - div64_u64_rem(offset, v->fec->region_blocks << v->data_dev_block_bits, - &rsb); - /* * Locating erasures is slow, so attempt to recover the block without * them first. Do a second attempt with erasures if the corruption is * bad enough. */ - r = fec_decode_rsb(v, io, fio, rsb, offset, want_digest, false); + r = fec_decode(v, io, fio, block, want_digest, false); if (r < 0) { - r = fec_decode_rsb(v, io, fio, rsb, offset, want_digest, true); + r = fec_decode(v, io, fio, block, want_digest, true); if (r < 0) goto done; } From ca21ed4089200ff32ef0a17f58b6153499121cef Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Feb 2026 20:59:37 -0800 Subject: [PATCH 35/62] dm-verity-fec: compute target region directly Instead of determining the target block's region by checking which block of the k blocks being iterated over in fec_read_bufs() is equal to the target block, instead just directly use the quotient of the division of target_block by region_blocks. This is the same value, just derived in a more straightforward way. Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index bd0c3faf2743..a70451d95e16 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -49,8 +49,8 @@ static inline u8 *fec_buffer_rs_message(struct dm_verity *v, * the corrected bytes into fio->output starting from out_pos. 
*/ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, - struct dm_verity_fec_io *fio, u64 rsb, int byte_index, - unsigned int out_pos, int neras) + struct dm_verity_fec_io *fio, u64 rsb, + int target_region, unsigned int out_pos, int neras) { int r, corrected = 0, res; struct dm_buffer *buf; @@ -120,7 +120,7 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, } corrected += res; - fio->output[out_pos++] = msg_buf[byte_index]; + fio->output[out_pos++] = msg_buf[target_region]; if (out_pos >= v->fec->block_size) goto done; @@ -158,10 +158,10 @@ static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, * fits into buffers. Check for erasure locations if @neras is non-NULL. */ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, - u64 rsb, u64 target, unsigned int out_pos, int *neras) + u64 rsb, unsigned int out_pos, int *neras) { bool is_zero; - int i, j, target_index = -1; + int i, j; struct dm_buffer *buf; struct dm_bufio_client *bufio; struct dm_verity_fec_io *fio = io->fec_io; @@ -183,14 +183,6 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, */ for (i = 0; i < v->fec->rs_k; i++) { ileaved = rsb + i * (v->fec->region_blocks << v->data_dev_block_bits); - - /* - * target is the data block we want to correct, target_index is - * the index of this block within the rs_k RS blocks - */ - if (ileaved == target) - target_index = i; - block = ileaved >> v->data_dev_block_bits; bufio = v->fec->data_bufio; @@ -252,8 +244,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, done: dm_bufio_release(buf); } - - return target_index; + return 0; } /* @@ -320,22 +311,26 @@ static int fec_decode(struct dm_verity *v, struct dm_verity_io *io, const u8 *want_digest, bool use_erasures) { int r, neras = 0; - unsigned int out_pos; - u64 offset = target_block << v->data_dev_block_bits; + unsigned int target_region, out_pos; u64 rsb; - div64_u64_rem(offset, 
v->fec->region_blocks << v->data_dev_block_bits, - &rsb); + target_region = div64_u64_rem( + target_block << v->data_dev_block_bits, + v->fec->region_blocks << v->data_dev_block_bits, &rsb); + if (WARN_ON_ONCE(target_region >= v->fec->rs_k)) + /* target_block is out-of-bounds. Should never happen. */ + return -EIO; for (out_pos = 0; out_pos < v->fec->block_size;) { fec_init_bufs(v, fio); - r = fec_read_bufs(v, io, rsb, offset, out_pos, + r = fec_read_bufs(v, io, rsb, out_pos, use_erasures ? &neras : NULL); if (unlikely(r < 0)) return r; - r = fec_decode_bufs(v, io, fio, rsb, r, out_pos, neras); + r = fec_decode_bufs(v, io, fio, rsb, target_region, + out_pos, neras); if (r < 0) return r; From b39b3c812eaf7956cc6f9ba570b75b5d52c8da62 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Feb 2026 20:59:38 -0800 Subject: [PATCH 36/62] dm-verity-fec: pass down index_in_region instead of rsb Replace 'rsb', which is a byte index, with 'index_in_region' which is a block index. The block index is slightly easier to compute, it matches what fec_read_bufs() wants, and it avoids the mismatch between the name and the units of the variable. ('rsb' stood for "Reed-Solomon block", but its units were bytes, not blocks.) fec_decode_bufs() does want it as a byte index when computing parity_block, but that's easily handled locally. As long as the parameters to the log messages are being adjusted, also eliminate the unnecessary casts to 'unsigned long long'. %llu is the correct way to print a u64 in the Linux kernel, as documented in printk-formats.rst. There's no PRIu64 macro like there is in userspace. 
Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 47 +++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index a70451d95e16..956e03210943 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -49,7 +49,7 @@ static inline u8 *fec_buffer_rs_message(struct dm_verity *v, * the corrected bytes into fio->output starting from out_pos. */ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, - struct dm_verity_fec_io *fio, u64 rsb, + struct dm_verity_fec_io *fio, u64 index_in_region, int target_region, unsigned int out_pos, int neras) { int r, corrected = 0, res; @@ -67,14 +67,16 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, * block_size is always a power of 2, but roots might not be. Note that * when it's not, a codeword's parity bytes can span a block boundary. */ - parity_block = (rsb + out_pos) * v->fec->roots; + parity_block = ((index_in_region << v->data_dev_block_bits) + out_pos) * + v->fec->roots; parity_pos = parity_block & (v->fec->block_size - 1); parity_block >>= v->data_dev_block_bits; par = dm_bufio_read_with_ioprio(v->fec->bufio, parity_block, &buf, bio->bi_ioprio); if (IS_ERR(par)) { DMERR("%s: FEC %llu: parity read failed (block %llu): %ld", - v->data_dev->name, rsb, parity_block, PTR_ERR(par)); + v->data_dev->name, index_in_region, parity_block, + PTR_ERR(par)); return PTR_ERR(par); } @@ -103,8 +105,8 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, bio->bi_ioprio); if (IS_ERR(par)) { DMERR("%s: FEC %llu: parity read failed (block %llu): %ld", - v->data_dev->name, rsb, parity_block, - PTR_ERR(par)); + v->data_dev->name, index_in_region, + parity_block, PTR_ERR(par)); return PTR_ERR(par); } for (; j < v->fec->roots; j++) @@ -132,10 +134,10 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, 
if (r < 0 && neras) DMERR_LIMIT("%s: FEC %llu: failed to correct: %d", - v->data_dev->name, (unsigned long long)rsb, r); + v->data_dev->name, index_in_region, r); else if (r > 0) DMWARN_LIMIT("%s: FEC %llu: corrected %d errors", - v->data_dev->name, (unsigned long long)rsb, r); + v->data_dev->name, index_in_region, r); return r; } @@ -158,14 +160,14 @@ static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, * fits into buffers. Check for erasure locations if @neras is non-NULL. */ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, - u64 rsb, unsigned int out_pos, int *neras) + u64 index_in_region, unsigned int out_pos, int *neras) { bool is_zero; int i, j; struct dm_buffer *buf; struct dm_bufio_client *bufio; struct dm_verity_fec_io *fio = io->fec_io; - u64 block, ileaved; + u64 block; u8 *bbuf; u8 want_digest[HASH_MAX_DIGESTSIZE]; unsigned int n, src_pos; @@ -182,8 +184,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, * interleave contents to available bufs */ for (i = 0; i < v->fec->rs_k; i++) { - ileaved = rsb + i * (v->fec->region_blocks << v->data_dev_block_bits); - block = ileaved >> v->data_dev_block_bits; + block = i * v->fec->region_blocks + index_in_region; bufio = v->fec->data_bufio; if (block >= v->data_blocks) { @@ -203,9 +204,8 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, bbuf = dm_bufio_read_with_ioprio(bufio, block, &buf, bio->bi_ioprio); if (IS_ERR(bbuf)) { DMWARN_LIMIT("%s: FEC %llu: read failed (%llu): %ld", - v->data_dev->name, - (unsigned long long)rsb, - (unsigned long long)block, PTR_ERR(bbuf)); + v->data_dev->name, index_in_region, block, + PTR_ERR(bbuf)); /* assume the block is corrupted */ if (neras && *neras <= v->fec->roots) @@ -312,11 +312,16 @@ static int fec_decode(struct dm_verity *v, struct dm_verity_io *io, { int r, neras = 0; unsigned int target_region, out_pos; - u64 rsb; + u64 index_in_region; - target_region = div64_u64_rem( - 
target_block << v->data_dev_block_bits, - v->fec->region_blocks << v->data_dev_block_bits, &rsb); + /* + * Compute 'target_region', the index of the region the target block is + * in; and 'index_in_region', the index of the target block within its + * region. The latter value is also the index within its region of each + * message block that shares its RS codewords with the target block. + */ + target_region = div64_u64_rem(target_block, v->fec->region_blocks, + &index_in_region); if (WARN_ON_ONCE(target_region >= v->fec->rs_k)) /* target_block is out-of-bounds. Should never happen. */ return -EIO; @@ -324,12 +329,12 @@ static int fec_decode(struct dm_verity *v, struct dm_verity_io *io, for (out_pos = 0; out_pos < v->fec->block_size;) { fec_init_bufs(v, fio); - r = fec_read_bufs(v, io, rsb, out_pos, + r = fec_read_bufs(v, io, index_in_region, out_pos, use_erasures ? &neras : NULL); if (unlikely(r < 0)) return r; - r = fec_decode_bufs(v, io, fio, rsb, target_region, + r = fec_decode_bufs(v, io, fio, index_in_region, target_region, out_pos, neras); if (r < 0) return r; @@ -344,7 +349,7 @@ static int fec_decode(struct dm_verity *v, struct dm_verity_io *io, if (memcmp(io->tmp_digest, want_digest, v->digest_size)) { DMERR_LIMIT("%s: FEC %llu: failed to correct (%d erasures)", - v->data_dev->name, (unsigned long long)rsb, neras); + v->data_dev->name, index_in_region, neras); return -EILSEQ; } From 71dab3b90f177462a23af81658c9cea327016137 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Feb 2026 20:59:39 -0800 Subject: [PATCH 37/62] dm-verity-fec: make fec_decode_bufs() just return 0 or error fec_decode_bufs() returns the number of errors corrected or a negative errno value. However, the caller just checks for an errno value and doesn't do anything with the number of errors corrected. Simplify the code by just returning 0 instead of the number of errors corrected. 
Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 956e03210943..96728e35e8df 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -52,7 +52,7 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, struct dm_verity_fec_io *fio, u64 index_in_region, int target_region, unsigned int out_pos, int neras) { - int r, corrected = 0, res; + int r = 0, corrected = 0, res; struct dm_buffer *buf; unsigned int n, i, j, parity_pos, to_copy; uint16_t par_buf[DM_VERITY_FEC_MAX_ROOTS]; @@ -118,9 +118,8 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, NULL, neras, fio->erasures, 0, NULL); if (res < 0) { r = res; - goto error; + goto done; } - corrected += res; fio->output[out_pos++] = msg_buf[target_region]; @@ -128,16 +127,14 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, goto done; } done: - r = corrected; -error: dm_bufio_release(buf); if (r < 0 && neras) DMERR_LIMIT("%s: FEC %llu: failed to correct: %d", v->data_dev->name, index_in_region, r); - else if (r > 0) + else if (r == 0 && corrected > 0) DMWARN_LIMIT("%s: FEC %llu: corrected %d errors", - v->data_dev->name, index_in_region, r); + v->data_dev->name, index_in_region, corrected); return r; } From d0829329de71634129420ff53557bb0ade6a9145 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Feb 2026 20:59:40 -0800 Subject: [PATCH 38/62] dm-verity-fec: log target_block instead of index_in_region The log message for a FEC error or correction includes the data device name and index_in_region as the context. Although the result of FEC (for a particular dm-verity instance) is expected to be the same for a given index_in_region, index_in_region does not uniquely identify the actual target block that is being corrected. 
Since that value (target_block) is likely more useful, log it instead. Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 96728e35e8df..64fe7ed40d9e 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -49,8 +49,9 @@ static inline u8 *fec_buffer_rs_message(struct dm_verity *v, * the corrected bytes into fio->output starting from out_pos. */ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, - struct dm_verity_fec_io *fio, u64 index_in_region, - int target_region, unsigned int out_pos, int neras) + struct dm_verity_fec_io *fio, u64 target_block, + unsigned int target_region, u64 index_in_region, + unsigned int out_pos, int neras) { int r = 0, corrected = 0, res; struct dm_buffer *buf; @@ -75,7 +76,7 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, bio->bi_ioprio); if (IS_ERR(par)) { DMERR("%s: FEC %llu: parity read failed (block %llu): %ld", - v->data_dev->name, index_in_region, parity_block, + v->data_dev->name, target_block, parity_block, PTR_ERR(par)); return PTR_ERR(par); } @@ -105,7 +106,7 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, bio->bi_ioprio); if (IS_ERR(par)) { DMERR("%s: FEC %llu: parity read failed (block %llu): %ld", - v->data_dev->name, index_in_region, + v->data_dev->name, target_block, parity_block, PTR_ERR(par)); return PTR_ERR(par); } @@ -131,10 +132,10 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, if (r < 0 && neras) DMERR_LIMIT("%s: FEC %llu: failed to correct: %d", - v->data_dev->name, index_in_region, r); + v->data_dev->name, target_block, r); else if (r == 0 && corrected > 0) DMWARN_LIMIT("%s: FEC %llu: corrected %d errors", - v->data_dev->name, index_in_region, corrected); + v->data_dev->name, target_block, 
corrected); return r; } @@ -157,7 +158,8 @@ static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, * fits into buffers. Check for erasure locations if @neras is non-NULL. */ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, - u64 index_in_region, unsigned int out_pos, int *neras) + u64 target_block, u64 index_in_region, + unsigned int out_pos, int *neras) { bool is_zero; int i, j; @@ -201,7 +203,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, bbuf = dm_bufio_read_with_ioprio(bufio, block, &buf, bio->bi_ioprio); if (IS_ERR(bbuf)) { DMWARN_LIMIT("%s: FEC %llu: read failed (%llu): %ld", - v->data_dev->name, index_in_region, block, + v->data_dev->name, target_block, block, PTR_ERR(bbuf)); /* assume the block is corrupted */ @@ -326,13 +328,13 @@ static int fec_decode(struct dm_verity *v, struct dm_verity_io *io, for (out_pos = 0; out_pos < v->fec->block_size;) { fec_init_bufs(v, fio); - r = fec_read_bufs(v, io, index_in_region, out_pos, + r = fec_read_bufs(v, io, target_block, index_in_region, out_pos, use_erasures ? &neras : NULL); if (unlikely(r < 0)) return r; - r = fec_decode_bufs(v, io, fio, index_in_region, target_region, - out_pos, neras); + r = fec_decode_bufs(v, io, fio, target_block, target_region, + index_in_region, out_pos, neras); if (r < 0) return r; @@ -346,7 +348,7 @@ static int fec_decode(struct dm_verity *v, struct dm_verity_io *io, if (memcmp(io->tmp_digest, want_digest, v->digest_size)) { DMERR_LIMIT("%s: FEC %llu: failed to correct (%d erasures)", - v->data_dev->name, index_in_region, neras); + v->data_dev->name, target_block, neras); return -EILSEQ; } From 4f6d6fb3a6c53241c6059947d54b9f68fa0719e1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Feb 2026 20:59:41 -0800 Subject: [PATCH 39/62] dm-verity-fec: improve comments for fec_read_bufs() Update the comments in and above fec_read_bufs() to more clearly describe what it does. 
Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 64fe7ed40d9e..fb71e83e6404 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -154,8 +154,21 @@ static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, } /* - * Read data blocks that are part of the RS block and deinterleave as much as - * fits into buffers. Check for erasure locations if @neras is non-NULL. + * Read the message block at index @index_in_region within each of the + * @v->fec->rs_k regions and deinterleave their contents into @io->fec_io->bufs. + * + * @target_block gives the index of specific block within this sequence that is + * being corrected, relative to the start of all the FEC message blocks. + * + * @out_pos gives the current output position, i.e. the position in (each) block + * from which to start the deinterleaving. Deinterleaving continues until + * either end-of-block is reached or there's no more buffer space. + * + * If @neras is non-NULL, then also use verity hashes and the presence/absence + * of I/O errors to determine which of the message blocks in the sequence are + * likely to be incorrect. Write the number of such blocks to *@neras and the + * indices of the corresponding RS message bytes in [0, k - 1] to + * @io->fec_io->erasures, up to a limit of @v->fec->roots + 1 such blocks. 
*/ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, u64 target_block, u64 index_in_region, @@ -178,11 +191,11 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, if (WARN_ON(v->digest_size > sizeof(want_digest))) return -EINVAL; - /* - * read each of the rs_k data blocks that are part of the RS block, and - * interleave contents to available bufs - */ for (i = 0; i < v->fec->rs_k; i++) { + /* + * Read the block from region i. It contains the i'th message + * byte of the target block's RS codewords. + */ block = i * v->fec->region_blocks + index_in_region; bufio = v->fec->data_bufio; @@ -231,8 +244,9 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, } /* - * deinterleave and copy the bytes that fit into bufs, - * starting from out_pos + * Deinterleave the bytes of the block, starting from 'out_pos', + * into the i'th byte of the RS message buffers. Stop when + * end-of-block is reached or there are no more buffers. */ src_pos = out_pos; fec_for_each_buffer_rs_message(fio, n, j) { From 99a2312f69805f4ba92d98a757625e0300a747ab Mon Sep 17 00:00:00 2001 From: Guillaume Gonnet Date: Tue, 17 Mar 2026 22:32:28 +0100 Subject: [PATCH 40/62] dm init: ensure device probing has finished in dm-mod.waitfor= The early_lookup_bdev() function returns successfully when the disk device is present but not necessarily its partitions. In this situation, dm_early_create() fails as the partition block device does not exist yet. In my case, this phenomenon occurs quite often because the device is an SD card with slow reading times, on which kernel takes time to enumerate available partitions. Fortunately, the underlying device is back to "probing" state while enumerating partitions. Waiting for all probing to end is enough to fix this issue. 
That's also the reason why this problem never occurs with rootwait= parameter: the while loop inside wait_for_root() explicitly waits for probing to be done and then the function calls async_synchronize_full(). These lines were omitted in 035641b, even though the commit says it's based on the rootwait logic... Anyway, calling wait_for_device_probe() after our while loop does the job (it both waits for probing and calls async_synchronize_full). Fixes: 035641b01e72 ("dm init: add dm-mod.waitfor to wait for asynchronously probed block devices") Signed-off-by: Guillaume Gonnet Signed-off-by: Mikulas Patocka --- drivers/md/dm-init.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c index 7403823384c5..c1bacba92c65 100644 --- a/drivers/md/dm-init.c +++ b/drivers/md/dm-init.c @@ -303,8 +303,10 @@ static int __init dm_init_init(void) } } - if (waitfor[0]) + if (waitfor[0]) { + wait_for_device_probe(); DMINFO("all devices available"); + } list_for_each_entry(dev, &devices, list) { if (dm_early_create(&dev->dmi, dev->table, From b7cce3e2cca9cd78418f3c3784474b778e7996fe Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 16 Mar 2026 15:04:15 +0100 Subject: [PATCH 41/62] dm: don't report warning when doing deferred remove If dm_hash_remove_all was called from dm_deferred_remove, it would write a warning "remove_all left %d open device(s)" if there are some other devices active. The warning is bogus, so let's disable it in this case. 
Signed-off-by: Mikulas Patocka Reported-by: Zdenek Kabelac Cc: stable@vger.kernel.org Fixes: 2c140a246dc0 ("dm: allow remove to be deferred") --- drivers/md/dm-ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 3ab8b4beff86..4de734d82444 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -384,7 +384,7 @@ static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool up_write(&_hash_lock); - if (dev_skipped) + if (dev_skipped && !only_deferred) DMWARN("remove_all left %d open device(s)", dev_skipped); } From 23e6e57a93bcabe86d5f0eab1df0c44706ab18f3 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 18 Mar 2026 15:31:55 +0100 Subject: [PATCH 42/62] dm: make "dmsetup remove_all" interruptible The command "dmsetup remove_all" may take a long time (a minute for removing 1000 devices), so make it interruptible with fatal signals. For better readability, the bool arguments were changed to flags. Signed-off-by: Mikulas Patocka Reviewed-by: Bart Van Assche --- drivers/md/dm-ioctl.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 4de734d82444..405acc14d718 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -64,7 +64,11 @@ struct vers_iter { static struct rb_root name_rb_tree = RB_ROOT; static struct rb_root uuid_rb_tree = RB_ROOT; -static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred); +#define DM_REMOVE_KEEP_OPEN_DEVICES 1 +#define DM_REMOVE_MARK_DEFERRED 2 +#define DM_REMOVE_ONLY_DEFERRED 4 +#define DM_REMOVE_INTERRUPTIBLE 8 +static int dm_hash_remove_all(unsigned flags); /* * Guards access to both hash tables. 
@@ -78,7 +82,7 @@ static DEFINE_MUTEX(dm_hash_cells_mutex); static void dm_hash_exit(void) { - dm_hash_remove_all(false, false, false); + dm_hash_remove_all(0); } /* @@ -333,7 +337,7 @@ static struct dm_table *__hash_remove(struct hash_cell *hc) return table; } -static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred) +static int dm_hash_remove_all(unsigned flags) { int dev_skipped; struct rb_node *n; @@ -347,12 +351,17 @@ static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool down_write(&_hash_lock); for (n = rb_first(&name_rb_tree); n; n = rb_next(n)) { + if (flags & DM_REMOVE_INTERRUPTIBLE && fatal_signal_pending(current)) { + up_write(&_hash_lock); + return -EINTR; + } + hc = container_of(n, struct hash_cell, name_node); md = hc->md; dm_get(md); - if (keep_open_devices && - dm_lock_for_deletion(md, mark_deferred, only_deferred)) { + if (flags & DM_REMOVE_KEEP_OPEN_DEVICES && + dm_lock_for_deletion(md, !!(flags & DM_REMOVE_MARK_DEFERRED), !!(flags & DM_REMOVE_ONLY_DEFERRED))) { dm_put(md); dev_skipped++; continue; @@ -368,7 +377,7 @@ static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool } dm_ima_measure_on_device_remove(md, true); dm_put(md); - if (likely(keep_open_devices)) + if (likely(flags & DM_REMOVE_KEEP_OPEN_DEVICES)) dm_destroy(md); else dm_destroy_immediate(md); @@ -384,8 +393,10 @@ static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool up_write(&_hash_lock); - if (dev_skipped && !only_deferred) + if (dev_skipped && !(flags & DM_REMOVE_ONLY_DEFERRED)) DMWARN("remove_all left %d open device(s)", dev_skipped); + + return 0; } /* @@ -513,7 +524,7 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, void dm_deferred_remove(void) { - dm_hash_remove_all(true, false, true); + dm_hash_remove_all(DM_REMOVE_KEEP_OPEN_DEVICES | DM_REMOVE_ONLY_DEFERRED); } /* @@ -529,9 +540,13 @@ typedef int (*ioctl_fn)(struct file *filp, struct 
dm_ioctl *param, size_t param_ static int remove_all(struct file *filp, struct dm_ioctl *param, size_t param_size) { - dm_hash_remove_all(true, !!(param->flags & DM_DEFERRED_REMOVE), false); + int r; + int flags = DM_REMOVE_KEEP_OPEN_DEVICES | DM_REMOVE_INTERRUPTIBLE; + if (param->flags & DM_DEFERRED_REMOVE) + flags |= DM_REMOVE_MARK_DEFERRED; + r = dm_hash_remove_all(flags); param->data_size = 0; - return 0; + return r; } /* From 0e4c1eb59909ddaef19cd997e646d5d1ce251a6c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 20 Mar 2026 14:15:08 -0700 Subject: [PATCH 43/62] dm-verity-fec: warn even when there were no errors Currently FEC logs a warning message if at least one error was corrected, or an error message if there were uncorrectable errors. However, it doesn't log anything if there were no errors. "No errors" is actually unexpected, though, considering that dm-verity calls verity_fec_decode() only when a block's digest doesn't match. If there were to ever be a bug where verity_fec_decode() is called on blocks with the correct digest, then there would be no indication in the log that FEC is running and degrading performance. Therefore, let's log the warning message even when there were no errors. 
Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index fb71e83e6404..85ad9dc210ff 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -133,7 +133,7 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, if (r < 0 && neras) DMERR_LIMIT("%s: FEC %llu: failed to correct: %d", v->data_dev->name, target_block, r); - else if (r == 0 && corrected > 0) + else if (r == 0) DMWARN_LIMIT("%s: FEC %llu: corrected %d errors", v->data_dev->name, target_block, corrected); From d1c3b6b8e74393a5b34b91a056bd7ea6ae33938a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 21 Mar 2026 16:06:50 -0700 Subject: [PATCH 44/62] dm-crypt: Reimplement elephant diffuser using AES library Simplify and optimize dm-crypt's implementation of Bitlocker's "elephant diffuser" to use the AES library instead of an "ecb(aes)" crypto_skcipher. Note: struct aes_enckey is fixed-size, so it could be embedded directly in struct iv_elephant_private. But I kept it as a separate allocation so that the size of struct crypt_config doesn't increase. The elephant diffuser is rarely used in dm-crypt. 
Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/Kconfig | 1 + drivers/md/dm-crypt.c | 85 +++++++++++++++---------------------------- 2 files changed, 31 insertions(+), 55 deletions(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 53351048d3ec..a3fcdca7e6db 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -300,6 +300,7 @@ config DM_CRYPT select CRYPTO select CRYPTO_CBC select CRYPTO_ESSIV + select CRYPTO_LIB_AES select CRYPTO_LIB_MD5 # needed by lmk IV mode help This device-mapper target allows you to create a device that diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 54823341c9fd..76b0c6bfd45c 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -133,7 +134,7 @@ struct iv_tcw_private { #define ELEPHANT_MAX_KEY_SIZE 32 struct iv_elephant_private { - struct crypto_skcipher *tfm; + struct aes_enckey *key; }; /* @@ -767,8 +768,8 @@ static void crypt_iv_elephant_dtr(struct crypt_config *cc) { struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; - crypto_free_skcipher(elephant->tfm); - elephant->tfm = NULL; + kfree_sensitive(elephant->key); + elephant->key = NULL; } static int crypt_iv_elephant_ctr(struct crypt_config *cc, struct dm_target *ti, @@ -777,13 +778,9 @@ static int crypt_iv_elephant_ctr(struct crypt_config *cc, struct dm_target *ti, struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; int r; - elephant->tfm = crypto_alloc_skcipher("ecb(aes)", 0, - CRYPTO_ALG_ALLOCATES_MEMORY); - if (IS_ERR(elephant->tfm)) { - r = PTR_ERR(elephant->tfm); - elephant->tfm = NULL; - return r; - } + elephant->key = kmalloc_obj(*elephant->key); + if (!elephant->key) + return -ENOMEM; r = crypt_iv_eboiv_ctr(cc, ti, NULL); if (r) @@ -935,41 +932,28 @@ static void diffuser_b_encrypt(u32 *d, size_t n) } } -static int crypt_iv_elephant(struct crypt_config *cc, struct dm_crypt_request *dmreq) 
+static void crypt_iv_elephant(struct crypt_config *cc, + struct dm_crypt_request *dmreq) { struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; - u8 *es, *ks, *data, *data2, *data_offset; - struct skcipher_request *req; - struct scatterlist *sg, *sg2, src, dst; - DECLARE_CRYPTO_WAIT(wait); - int i, r; + u8 *data, *data2, *data_offset; + struct scatterlist *sg, *sg2; + union { + __le64 w[2]; + u8 b[16]; + } es; + u8 ks[32] __aligned(__alignof(long)); /* Elephant sector key */ + int i; - req = skcipher_request_alloc(elephant->tfm, GFP_NOIO); - es = kzalloc(16, GFP_NOIO); /* Key for AES */ - ks = kzalloc(32, GFP_NOIO); /* Elephant sector key */ - - if (!req || !es || !ks) { - r = -ENOMEM; - goto out; - } - - *(__le64 *)es = cpu_to_le64(dmreq->iv_sector * cc->sector_size); + es.w[0] = cpu_to_le64(dmreq->iv_sector * cc->sector_size); + es.w[1] = 0; /* E(Ks, e(s)) */ - sg_init_one(&src, es, 16); - sg_init_one(&dst, ks, 16); - skcipher_request_set_crypt(req, &src, &dst, 16, NULL); - skcipher_request_set_callback(req, 0, crypto_req_done, &wait); - r = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); - if (r) - goto out; + aes_encrypt(elephant->key, &ks[0], es.b); /* E(Ks, e'(s)) */ - es[15] = 0x80; - sg_init_one(&dst, &ks[16], 16); - r = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); - if (r) - goto out; + es.b[15] = 0x80; + aes_encrypt(elephant->key, &ks[16], es.b); sg = crypt_get_sg_data(cc, dmreq->sg_out); data = kmap_local_page(sg_page(sg)); @@ -1001,23 +985,15 @@ static int crypt_iv_elephant(struct crypt_config *cc, struct dm_crypt_request *d } kunmap_local(data); -out: - kfree_sensitive(ks); - kfree_sensitive(es); - skcipher_request_free(req); - return r; + memzero_explicit(ks, sizeof(ks)); + memzero_explicit(&es, sizeof(es)); } static int crypt_iv_elephant_gen(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq) { - int r; - - if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { - r = crypt_iv_elephant(cc, dmreq); - if (r) - 
return r; - } + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) + crypt_iv_elephant(cc, dmreq); return crypt_iv_eboiv_gen(cc, iv, dmreq); } @@ -1026,7 +1002,7 @@ static int crypt_iv_elephant_post(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq) { if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) - return crypt_iv_elephant(cc, dmreq); + crypt_iv_elephant(cc, dmreq); return 0; } @@ -1036,16 +1012,15 @@ static int crypt_iv_elephant_init(struct crypt_config *cc) struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; int key_offset = cc->key_size - cc->key_extra_size; - return crypto_skcipher_setkey(elephant->tfm, &cc->key[key_offset], cc->key_extra_size); + return aes_prepareenckey(elephant->key, &cc->key[key_offset], cc->key_extra_size); } static int crypt_iv_elephant_wipe(struct crypt_config *cc) { struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; - u8 key[ELEPHANT_MAX_KEY_SIZE]; - memset(key, 0, cc->key_extra_size); - return crypto_skcipher_setkey(elephant->tfm, key, cc->key_extra_size); + memzero_explicit(elephant->key, sizeof(*elephant->key)); + return 0; } static const struct crypt_iv_operations crypt_iv_plain_ops = { From 0be6c2b1c18f1586f0ec68463b85a8d56e4623f5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 21 Mar 2026 16:06:51 -0700 Subject: [PATCH 45/62] dm-crypt: Make crypt_iv_operations::wipe return void Since all implementations of crypt_iv_operations::wipe now return 0, change the return type to void. 
Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-crypt.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 76b0c6bfd45c..885208a82c55 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -110,7 +110,7 @@ struct crypt_iv_operations { const char *opts); void (*dtr)(struct crypt_config *cc); int (*init)(struct crypt_config *cc); - int (*wipe)(struct crypt_config *cc); + void (*wipe)(struct crypt_config *cc); int (*generator)(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq); int (*post)(struct crypt_config *cc, u8 *iv, @@ -508,14 +508,12 @@ static int crypt_iv_lmk_init(struct crypt_config *cc) return 0; } -static int crypt_iv_lmk_wipe(struct crypt_config *cc) +static void crypt_iv_lmk_wipe(struct crypt_config *cc) { struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; if (lmk->seed) memset(lmk->seed, 0, LMK_SEED_SIZE); - - return 0; } static void crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, @@ -629,14 +627,12 @@ static int crypt_iv_tcw_init(struct crypt_config *cc) return 0; } -static int crypt_iv_tcw_wipe(struct crypt_config *cc) +static void crypt_iv_tcw_wipe(struct crypt_config *cc) { struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; memset(tcw->iv_seed, 0, cc->iv_size); memset(tcw->whitening, 0, TCW_WHITENING_SIZE); - - return 0; } static void crypt_iv_tcw_whitening(struct crypt_config *cc, @@ -1015,12 +1011,11 @@ static int crypt_iv_elephant_init(struct crypt_config *cc) return aes_prepareenckey(elephant->key, &cc->key[key_offset], cc->key_extra_size); } -static int crypt_iv_elephant_wipe(struct crypt_config *cc) +static void crypt_iv_elephant_wipe(struct crypt_config *cc) { struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; memzero_explicit(elephant->key, sizeof(*elephant->key)); - return 0; } static const struct crypt_iv_operations crypt_iv_plain_ops = { @@ -2648,11 +2643,8 @@ static int 
crypt_wipe_key(struct crypt_config *cc) get_random_bytes(&cc->key, cc->key_size); /* Wipe IV private keys */ - if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) { - r = cc->iv_gen_ops->wipe(cc); - if (r) - return r; - } + if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) + cc->iv_gen_ops->wipe(cc); kfree_sensitive(cc->key_string); cc->key_string = NULL; From 4b4a8d9560d987f4df17b7248ab1c8146138d0f5 Mon Sep 17 00:00:00 2001 From: Bruce Johnston Date: Tue, 24 Mar 2026 14:06:44 -0400 Subject: [PATCH 46/62] dm vdo: add geometry block initialization to encodings.c Add vdo_initialize_volume_geometry() to populate the geometry block, computing the space required for the two main regions on disk. Add uds_compute_index_size() to calculate the space required for the UDS indexer from the UDS configuration. Signed-off-by: Bruce Johnston Reviewed-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/encodings.c | 69 ++++++++++++++++++++++++ drivers/md/dm-vdo/encodings.h | 4 ++ drivers/md/dm-vdo/indexer/index-layout.c | 26 +++++++++ drivers/md/dm-vdo/indexer/indexer.h | 4 ++ 4 files changed, 103 insertions(+) diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c index ec98c539701e..9961cb40f890 100644 --- a/drivers/md/dm-vdo/encodings.c +++ b/drivers/md/dm-vdo/encodings.c @@ -12,6 +12,7 @@ #include "permassert.h" #include "constants.h" +#include "indexer.h" #include "status-codes.h" #include "types.h" @@ -1486,3 +1487,71 @@ int vdo_decode_super_block(u8 *buffer) return ((checksum != saved_checksum) ? VDO_CHECKSUM_MISMATCH : VDO_SUCCESS); } + +/** + * vdo_compute_index_blocks() - Compute the number of blocks that the indexer will use. + * @config: The index config from which the blocks are calculated. + * @index_blocks_ptr: The number of blocks the index will use. + * + * Return: VDO_SUCCESS or an error code. 
+ */ +static int vdo_compute_index_blocks(const struct index_config *config, + block_count_t *index_blocks_ptr) +{ + int result; + u64 index_bytes; + struct uds_parameters uds_parameters = { + .memory_size = config->mem, + .sparse = config->sparse, + }; + + result = uds_compute_index_size(&uds_parameters, &index_bytes); + if (result != UDS_SUCCESS) + return vdo_log_error_strerror(result, "error computing index size"); + + *index_blocks_ptr = index_bytes / VDO_BLOCK_SIZE; + return VDO_SUCCESS; +} + +/** + * vdo_initialize_volume_geometry() - Initialize the volume geometry so it can be written out. + * @nonce: The nonce to use to identify the vdo. + * @uuid: The uuid to use to identify the vdo. + * @index_config: The config used for structure initialization. + * @geometry: The volume geometry to initialize. + * + * Return: VDO_SUCCESS or an error code. + */ +int vdo_initialize_volume_geometry(nonce_t nonce, uuid_t *uuid, + const struct index_config *index_config, + struct volume_geometry *geometry) +{ + int result; + block_count_t index_blocks = 0; + + result = vdo_compute_index_blocks(index_config, &index_blocks); + if (result != VDO_SUCCESS) + return result; + + *geometry = (struct volume_geometry) { + /* This is for backwards compatibility. 
*/ + .unused = 0, + .nonce = nonce, + .bio_offset = 0, + .regions = { + [VDO_INDEX_REGION] = { + .id = VDO_INDEX_REGION, + .start_block = 1, + }, + [VDO_DATA_REGION] = { + .id = VDO_DATA_REGION, + .start_block = 1 + index_blocks, + } + } + }; + + memcpy(&(geometry->uuid), uuid, sizeof(uuid_t)); + memcpy(&geometry->index_config, index_config, sizeof(struct index_config)); + + return VDO_SUCCESS; +} diff --git a/drivers/md/dm-vdo/encodings.h b/drivers/md/dm-vdo/encodings.h index 87b7d2f3b545..0bc5ae696a6a 100644 --- a/drivers/md/dm-vdo/encodings.h +++ b/drivers/md/dm-vdo/encodings.h @@ -803,6 +803,10 @@ vdo_get_index_region_size(struct volume_geometry geometry) vdo_get_index_region_start(geometry); } +int vdo_initialize_volume_geometry(nonce_t nonce, uuid_t *uuid, + const struct index_config *index_config, + struct volume_geometry *geometry); + int __must_check vdo_parse_geometry_block(unsigned char *block, struct volume_geometry *geometry); diff --git a/drivers/md/dm-vdo/indexer/index-layout.c b/drivers/md/dm-vdo/indexer/index-layout.c index 7a1209b21c03..5f4ce4ab1b1e 100644 --- a/drivers/md/dm-vdo/indexer/index-layout.c +++ b/drivers/md/dm-vdo/indexer/index-layout.c @@ -249,6 +249,32 @@ static int __must_check compute_sizes(const struct uds_configuration *config, return UDS_SUCCESS; } +int uds_compute_index_size(const struct uds_parameters *parameters, u64 *index_size) +{ + int result; + struct uds_configuration *index_config; + struct save_layout_sizes sizes; + + if (index_size == NULL) { + vdo_log_error("Missing output size pointer"); + return -EINVAL; + } + + result = uds_make_configuration(parameters, &index_config); + if (result != UDS_SUCCESS) { + vdo_log_error_strerror(result, "cannot compute index size"); + return result; + } + + result = compute_sizes(index_config, &sizes); + uds_free_configuration(index_config); + if (result != UDS_SUCCESS) + return result; + + *index_size = sizes.total_size; + return UDS_SUCCESS; +} + /* Create unique data using the 
current time and a pseudorandom number. */ static void create_unique_nonce_data(u8 *buffer) { diff --git a/drivers/md/dm-vdo/indexer/indexer.h b/drivers/md/dm-vdo/indexer/indexer.h index 7c1fc4577f5b..d765f24328eb 100644 --- a/drivers/md/dm-vdo/indexer/indexer.h +++ b/drivers/md/dm-vdo/indexer/indexer.h @@ -282,6 +282,10 @@ struct uds_request { ); }; +/* Compute the number of bytes needed to store an index. */ +int __must_check uds_compute_index_size(const struct uds_parameters *parameters, + u64 *index_size); + /* A session is required for most index operations. */ int __must_check uds_create_index_session(struct uds_index_session **session); From e073bb098ae28a909ece08ec05a6c59dbd282b12 Mon Sep 17 00:00:00 2001 From: Bruce Johnston Date: Tue, 24 Mar 2026 14:06:45 -0400 Subject: [PATCH 47/62] dm vdo: add super block initialization to encodings.c Add vdo_initialize_component_states() to populate the super block, computing the space required for the main VDO components on disk. Those include the slab depot, block map, and recovery journal. Signed-off-by: Bruce Johnston Reviewed-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/constants.h | 3 ++ drivers/md/dm-vdo/encodings.c | 82 +++++++++++++++++++++++++++++++++++ drivers/md/dm-vdo/encodings.h | 5 +++ 3 files changed, 90 insertions(+) diff --git a/drivers/md/dm-vdo/constants.h b/drivers/md/dm-vdo/constants.h index 2a8b03779f87..cc7bc3571ba1 100644 --- a/drivers/md/dm-vdo/constants.h +++ b/drivers/md/dm-vdo/constants.h @@ -44,6 +44,9 @@ enum { /* The default size of each slab journal, in blocks */ DEFAULT_VDO_SLAB_JOURNAL_SIZE = 224, + /* The recovery journal starting sequence number set at format time */ + RECOVERY_JOURNAL_STARTING_SEQUENCE_NUMBER = 1, + /* * The initial size of lbn_operations and pbn_operations, which is based upon the expected * maximum number of outstanding VIOs. 
This value was chosen to make it highly unlikely diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c index 9961cb40f890..441c9aee1749 100644 --- a/drivers/md/dm-vdo/encodings.c +++ b/drivers/md/dm-vdo/encodings.c @@ -1488,6 +1488,88 @@ int vdo_decode_super_block(u8 *buffer) return ((checksum != saved_checksum) ? VDO_CHECKSUM_MISMATCH : VDO_SUCCESS); } +/** + * vdo_initialize_component_states() - Initialize the components so they can be written out. + * @vdo_config: The config used for component state initialization. + * @geometry: The volume geometry used to calculate the data region offset. + * @nonce: The nonce to use to identify the vdo. + * @states: The component states to initialize. + * + * Return: VDO_SUCCESS or an error code. + */ +int vdo_initialize_component_states(const struct vdo_config *vdo_config, + const struct volume_geometry *geometry, + nonce_t nonce, + struct vdo_component_states *states) +{ + int result; + struct slab_config slab_config; + struct partition *partition; + + states->vdo.config = *vdo_config; + states->vdo.nonce = nonce; + states->volume_version = VDO_VOLUME_VERSION_67_0; + + states->recovery_journal = (struct recovery_journal_state_7_0) { + .journal_start = RECOVERY_JOURNAL_STARTING_SEQUENCE_NUMBER, + .logical_blocks_used = 0, + .block_map_data_blocks = 0, + }; + + /* + * The layout starts 1 block past the beginning of the data region, as the + * data region contains the super block but the layout does not. 
+ */ + result = vdo_initialize_layout(vdo_config->physical_blocks, + vdo_get_data_region_start(*geometry) + 1, + DEFAULT_VDO_BLOCK_MAP_TREE_ROOT_COUNT, + vdo_config->recovery_journal_size, + VDO_SLAB_SUMMARY_BLOCKS, + &states->layout); + if (result != VDO_SUCCESS) + return result; + + result = vdo_configure_slab(vdo_config->slab_size, + vdo_config->slab_journal_blocks, + &slab_config); + if (result != VDO_SUCCESS) { + vdo_uninitialize_layout(&states->layout); + return result; + } + + result = vdo_get_partition(&states->layout, VDO_SLAB_DEPOT_PARTITION, + &partition); + if (result != VDO_SUCCESS) { + vdo_uninitialize_layout(&states->layout); + return result; + } + + result = vdo_configure_slab_depot(partition, slab_config, 0, + &states->slab_depot); + if (result != VDO_SUCCESS) { + vdo_uninitialize_layout(&states->layout); + return result; + } + + result = vdo_get_partition(&states->layout, VDO_BLOCK_MAP_PARTITION, + &partition); + if (result != VDO_SUCCESS) { + vdo_uninitialize_layout(&states->layout); + return result; + } + + states->block_map = (struct block_map_state_2_0) { + .flat_page_origin = VDO_BLOCK_MAP_FLAT_PAGE_ORIGIN, + .flat_page_count = 0, + .root_origin = partition->offset, + .root_count = DEFAULT_VDO_BLOCK_MAP_TREE_ROOT_COUNT, + }; + + states->vdo.state = VDO_NEW; + + return VDO_SUCCESS; +} + /** * vdo_compute_index_blocks() - Compute the number of blocks that the indexer will use. * @config: The index config from which the blocks are calculated. 
diff --git a/drivers/md/dm-vdo/encodings.h b/drivers/md/dm-vdo/encodings.h index 0bc5ae696a6a..3cfbe4771a1c 100644 --- a/drivers/md/dm-vdo/encodings.h +++ b/drivers/md/dm-vdo/encodings.h @@ -1268,6 +1268,11 @@ int __must_check vdo_validate_component_states(struct vdo_component_states *stat void vdo_encode_super_block(u8 *buffer, struct vdo_component_states *states); int __must_check vdo_decode_super_block(u8 *buffer); +int vdo_initialize_component_states(const struct vdo_config *vdo_config, + const struct volume_geometry *geometry, + nonce_t nonce, + struct vdo_component_states *states); + /* We start with 0L and postcondition with ~0L to match our historical usage in userspace. */ static inline u32 vdo_crc32(const void *buf, unsigned long len) { From 2fb98e4170c4a0d9ebe091ca2421121daa352de0 Mon Sep 17 00:00:00 2001 From: Bruce Johnston Date: Tue, 24 Mar 2026 14:06:46 -0400 Subject: [PATCH 48/62] dm vdo: add formatting parameters to table line Extend the dm table line with three new optional parameters: indexMemory (UDS index memory size), indexSparse (dense vs sparse index), and slabSize (blocks per allocation slab). These values are parsed, validated, and stored in the device configuration for use during formatting. Rework the slab size constants from the single MAX_VDO_SLAB_BITS into explicit MIN_VDO_SLAB_BLOCKS, MAX_VDO_SLAB_BLOCKS, and DEFAULT_VDO_SLAB_BLOCKS values. Bump the target version from 9.1.0 to 9.2.0 to reflect this table line change. 
Signed-off-by: Bruce Johnston Reviewed-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/constants.h | 10 +++- drivers/md/dm-vdo/dm-vdo-target.c | 97 +++++++++++++++++++++++++++++-- drivers/md/dm-vdo/encodings.c | 12 +--- drivers/md/dm-vdo/encodings.h | 6 ++ drivers/md/dm-vdo/types.h | 3 + 5 files changed, 111 insertions(+), 17 deletions(-) diff --git a/drivers/md/dm-vdo/constants.h b/drivers/md/dm-vdo/constants.h index cc7bc3571ba1..b84e7edeb22e 100644 --- a/drivers/md/dm-vdo/constants.h +++ b/drivers/md/dm-vdo/constants.h @@ -60,8 +60,14 @@ enum { /* The maximum number of physical zones */ MAX_VDO_PHYSICAL_ZONES = 16, - /* The base-2 logarithm of the maximum blocks in one slab */ - MAX_VDO_SLAB_BITS = 23, + /* The default blocks in one slab */ + DEFAULT_VDO_SLAB_BLOCKS = 1U << 19, + + /* The minimum blocks in one slab */ + MIN_VDO_SLAB_BLOCKS = 1U << 13, + + /* The maximum blocks in one slab */ + MAX_VDO_SLAB_BLOCKS = 1U << 23, /* The maximum number of slabs the slab depot supports */ MAX_VDO_SLABS = 8192, diff --git a/drivers/md/dm-vdo/dm-vdo-target.c b/drivers/md/dm-vdo/dm-vdo-target.c index 7eb676e58ed5..1065c88a761c 100644 --- a/drivers/md/dm-vdo/dm-vdo-target.c +++ b/drivers/md/dm-vdo/dm-vdo-target.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -377,6 +378,75 @@ static inline int __must_check parse_bool(const char *bool_str, const char *true return VDO_SUCCESS; } +/** + * parse_memory() - Parse a string into an index memory value. + * @memory_str: The string value to convert to a memory value. + * @memory_ptr: A pointer to return the memory value in. 
+ * + * Return: VDO_SUCCESS or an error + */ +static int __must_check parse_memory(const char *memory_str, + uds_memory_config_size_t *memory_ptr) +{ + uds_memory_config_size_t memory; + + if (strcmp(memory_str, "0.25") == 0) { + memory = UDS_MEMORY_CONFIG_256MB; + } else if ((strcmp(memory_str, "0.5") == 0) || (strcmp(memory_str, "0.50") == 0)) { + memory = UDS_MEMORY_CONFIG_512MB; + } else if (strcmp(memory_str, "0.75") == 0) { + memory = UDS_MEMORY_CONFIG_768MB; + } else { + unsigned int value; + int result; + + result = kstrtouint(memory_str, 10, &value); + if (result) { + vdo_log_error("optional parameter error: invalid memory size, must be a positive integer"); + return -EINVAL; + } + + if (value > UDS_MEMORY_CONFIG_MAX) { + vdo_log_error("optional parameter error: invalid memory size, must not be greater than %d", + UDS_MEMORY_CONFIG_MAX); + return -EINVAL; + } + + memory = value; + } + + *memory_ptr = memory; + return VDO_SUCCESS; +} + +/** + * parse_slab_size() - Parse a string option into a slab size value. + * @slab_str: The string value representing slab size. + * @slab_size_ptr: A pointer to return the slab size in. + * + * Return: VDO_SUCCESS or an error + */ +static int __must_check parse_slab_size(const char *slab_str, block_count_t *slab_size_ptr) +{ + block_count_t value; + int result; + + result = kstrtoull(slab_str, 10, &value); + if (result) { + vdo_log_error("optional parameter error: invalid slab size, must be a positive integer"); + return -EINVAL; + } + + if (value < MIN_VDO_SLAB_BLOCKS || value > MAX_VDO_SLAB_BLOCKS || (!is_power_of_2(value))) { + vdo_log_error("optional parameter error: invalid slab size, must be a power of two between %u and %u", + MIN_VDO_SLAB_BLOCKS, MAX_VDO_SLAB_BLOCKS); + return -EINVAL; + } + + *slab_size_ptr = value; + return VDO_SUCCESS; +} + /** * process_one_thread_config_spec() - Process one component of a thread parameter configuration * string and update the configuration data structure. 
@@ -566,7 +636,7 @@ static int process_one_key_value_pair(const char *key, unsigned int value, } /* Max discard sectors in blkdev_issue_discard is UINT_MAX >> 9 */ if (value > (UINT_MAX / VDO_BLOCK_SIZE)) { - vdo_log_error("optional parameter error: at most %d max discard blocks are allowed", + vdo_log_error("optional parameter error: at most %d max discard blocks are allowed", UINT_MAX / VDO_BLOCK_SIZE); return -EINVAL; } @@ -598,7 +668,16 @@ static int parse_one_key_value_pair(const char *key, const char *value, if (strcmp(key, "compression") == 0) return parse_bool(value, "on", "off", &config->compression); - /* The remaining arguments must have integral values. */ + if (strcmp(key, "indexSparse") == 0) + return parse_bool(value, "on", "off", &config->index_sparse); + + if (strcmp(key, "indexMemory") == 0) + return parse_memory(value, &config->index_memory); + + if (strcmp(key, "slabSize") == 0) + return parse_slab_size(value, &config->slab_blocks); + + /* The remaining arguments must have non-negative integral values. */ result = kstrtouint(value, 10, &count); if (result) { vdo_log_error("optional config string error: integer value needed, found \"%s\"", @@ -756,6 +835,9 @@ static int parse_device_config(int argc, char **argv, struct dm_target *ti, config->max_discard_blocks = 1; config->deduplication = true; config->compression = false; + config->index_memory = UDS_MEMORY_CONFIG_256MB; + config->index_sparse = false; + config->slab_blocks = DEFAULT_VDO_SLAB_BLOCKS; arg_set.argc = argc; arg_set.argv = argv; @@ -781,7 +863,7 @@ static int parse_device_config(int argc, char **argv, struct dm_target *ti, /* Get the physical blocks, if known. 
*/ if (config->version >= 1) { result = kstrtoull(dm_shift_arg(&arg_set), 10, &config->physical_blocks); - if (result != VDO_SUCCESS) { + if (result) { handle_parse_error(config, error_ptr, "Invalid physical block count"); return VDO_BAD_CONFIGURATION; @@ -802,7 +884,7 @@ static int parse_device_config(int argc, char **argv, struct dm_target *ti, /* Get the page cache size. */ result = kstrtouint(dm_shift_arg(&arg_set), 10, &config->cache_size); - if (result != VDO_SUCCESS) { + if (result) { handle_parse_error(config, error_ptr, "Invalid block map page cache size"); return VDO_BAD_CONFIGURATION; @@ -810,7 +892,7 @@ static int parse_device_config(int argc, char **argv, struct dm_target *ti, /* Get the block map era length. */ result = kstrtouint(dm_shift_arg(&arg_set), 10, &config->block_map_maximum_age); - if (result != VDO_SUCCESS) { + if (result) { handle_parse_error(config, error_ptr, "Invalid block map maximum age"); return VDO_BAD_CONFIGURATION; } @@ -1457,10 +1539,13 @@ static int vdo_initialize(struct dm_target *ti, unsigned int instance, vdo_log_debug("Logical blocks = %llu", logical_blocks); vdo_log_debug("Physical block size = %llu", (u64) block_size); vdo_log_debug("Physical blocks = %llu", config->physical_blocks); + vdo_log_debug("Slab size = %llu", config->slab_blocks); vdo_log_debug("Block map cache blocks = %u", config->cache_size); vdo_log_debug("Block map maximum age = %u", config->block_map_maximum_age); vdo_log_debug("Deduplication = %s", (config->deduplication ? "on" : "off")); vdo_log_debug("Compression = %s", (config->compression ? "on" : "off")); + vdo_log_debug("Index memory = %u", config->index_memory); + vdo_log_debug("Index sparse = %s", (config->index_sparse ? 
"on" : "off")); vdo = vdo_find_matching(vdo_uses_device, config); if (vdo != NULL) { @@ -2856,7 +2941,7 @@ static void vdo_resume(struct dm_target *ti) static struct target_type vdo_target_bio = { .features = DM_TARGET_SINGLETON, .name = "vdo", - .version = { 9, 1, 0 }, + .version = { 9, 2, 0 }, .module = THIS_MODULE, .ctr = vdo_ctr, .dtr = vdo_dtr, diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c index 441c9aee1749..3ceba010f4e6 100644 --- a/drivers/md/dm-vdo/encodings.c +++ b/drivers/md/dm-vdo/encodings.c @@ -16,12 +16,6 @@ #include "status-codes.h" #include "types.h" -/** The maximum logical space is 4 petabytes, which is 1 terablock. */ -static const block_count_t MAXIMUM_VDO_LOGICAL_BLOCKS = 1024ULL * 1024 * 1024 * 1024; - -/** The maximum physical space is 256 terabytes, which is 64 gigablocks. */ -static const block_count_t MAXIMUM_VDO_PHYSICAL_BLOCKS = 1024ULL * 1024 * 1024 * 64; - struct geometry_block { char magic_number[VDO_GEOMETRY_MAGIC_NUMBER_SIZE]; struct packed_header header; @@ -1220,9 +1214,9 @@ int vdo_validate_config(const struct vdo_config *config, if (result != VDO_SUCCESS) return result; - result = VDO_ASSERT(config->slab_size <= (1 << MAX_VDO_SLAB_BITS), - "slab size must be less than or equal to 2^%d", - MAX_VDO_SLAB_BITS); + result = VDO_ASSERT(config->slab_size <= MAX_VDO_SLAB_BLOCKS, + "slab size must be a power of two less than or equal to %d", + MAX_VDO_SLAB_BLOCKS); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/encodings.h b/drivers/md/dm-vdo/encodings.h index 3cfbe4771a1c..0393936c8aeb 100644 --- a/drivers/md/dm-vdo/encodings.h +++ b/drivers/md/dm-vdo/encodings.h @@ -608,6 +608,12 @@ struct vdo_config { block_count_t slab_journal_blocks; /* number of slab journal blocks */ }; +/** The maximum logical space is 4 petabytes, which is 1 terablock. 
*/ +#define MAXIMUM_VDO_LOGICAL_BLOCKS ((block_count_t)(1024ULL * 1024 * 1024 * 1024)) + +/** The maximum physical space is 256 terabytes, which is 64 gigablocks. */ +#define MAXIMUM_VDO_PHYSICAL_BLOCKS ((block_count_t)(1024ULL * 1024 * 1024 * 64)) + /* This is the structure that captures the vdo fields saved as a super block component. */ struct vdo_component { enum vdo_state state; diff --git a/drivers/md/dm-vdo/types.h b/drivers/md/dm-vdo/types.h index cdf36e7d7702..0d60a88aa086 100644 --- a/drivers/md/dm-vdo/types.h +++ b/drivers/md/dm-vdo/types.h @@ -227,6 +227,9 @@ struct device_config { bool compression; struct thread_count_config thread_counts; block_count_t max_discard_blocks; + block_count_t slab_blocks; + int index_memory; + bool index_sparse; }; enum vdo_completion_type { From beced130a367e0b99fa9424505ee7f07ddea86de Mon Sep 17 00:00:00 2001 From: Bruce Johnston Date: Tue, 24 Mar 2026 14:06:47 -0400 Subject: [PATCH 49/62] dm vdo: add upfront validation for logical size Add a validation check that the logical size passed via the table line does not exceed MAXIMUM_VDO_LOGICAL_BLOCKS. 
Signed-off-by: Bruce Johnston Reviewed-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/dm-vdo-target.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/md/dm-vdo/dm-vdo-target.c b/drivers/md/dm-vdo/dm-vdo-target.c index 1065c88a761c..cee6e4edf768 100644 --- a/drivers/md/dm-vdo/dm-vdo-target.c +++ b/drivers/md/dm-vdo/dm-vdo-target.c @@ -792,6 +792,12 @@ static int parse_device_config(int argc, char **argv, struct dm_target *ti, struct device_config *config = NULL; int result; + if (logical_bytes > (MAXIMUM_VDO_LOGICAL_BLOCKS * VDO_BLOCK_SIZE)) { + handle_parse_error(config, error_ptr, + "Logical size exceeds the maximum"); + return VDO_BAD_CONFIGURATION; + } + if ((logical_bytes % VDO_BLOCK_SIZE) != 0) { handle_parse_error(config, error_ptr, "Logical size must be a multiple of 4096"); From 427bf2c1f77435b36749a03e4f9d4549e3807e2d Mon Sep 17 00:00:00 2001 From: Bruce Johnston Date: Tue, 24 Mar 2026 14:06:48 -0400 Subject: [PATCH 50/62] dm vdo: add geometry block encoding Add vdo_encode_volume_geometry() to write the geometry block into a buffer so that it can be written to disk. The corresponding decode path already exists. Signed-off-by: Bruce Johnston Reviewed-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/encodings.c | 56 +++++++++++++++++++++++++++++++++++ drivers/md/dm-vdo/encodings.h | 2 ++ 2 files changed, 58 insertions(+) diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c index 3ceba010f4e6..d75e023df637 100644 --- a/drivers/md/dm-vdo/encodings.c +++ b/drivers/md/dm-vdo/encodings.c @@ -287,6 +287,62 @@ static void decode_volume_geometry(u8 *buffer, size_t *offset, }; } +/** + * vdo_encode_volume_geometry() - Encode the on-disk representation of a volume geometry into a buffer. + * @buffer: A buffer to store the encoding. + * @geometry: The geometry to encode. + * @version: The geometry block version to encode. + * + * Return: VDO_SUCCESS or an error. 
+ */ +int vdo_encode_volume_geometry(u8 *buffer, const struct volume_geometry *geometry, + u32 version) +{ + int result; + enum volume_region_id id; + u32 checksum; + size_t offset = 0; + const struct header *header; + + memcpy(buffer, VDO_GEOMETRY_MAGIC_NUMBER, VDO_GEOMETRY_MAGIC_NUMBER_SIZE); + offset += VDO_GEOMETRY_MAGIC_NUMBER_SIZE; + + header = (version > 4) ? &GEOMETRY_BLOCK_HEADER_5_0 : &GEOMETRY_BLOCK_HEADER_4_0; + vdo_encode_header(buffer, &offset, header); + + /* This is for backwards compatibility */ + encode_u32_le(buffer, &offset, geometry->unused); + encode_u64_le(buffer, &offset, geometry->nonce); + memcpy(buffer + offset, (unsigned char *) &geometry->uuid, sizeof(uuid_t)); + offset += sizeof(uuid_t); + + if (version > 4) + encode_u64_le(buffer, &offset, geometry->bio_offset); + + for (id = 0; id < VDO_VOLUME_REGION_COUNT; id++) { + encode_u32_le(buffer, &offset, geometry->regions[id].id); + encode_u64_le(buffer, &offset, geometry->regions[id].start_block); + } + + encode_u32_le(buffer, &offset, geometry->index_config.mem); + encode_u32_le(buffer, &offset, 0); + + if (geometry->index_config.sparse) + buffer[offset++] = 1; + else + buffer[offset++] = 0; + + result = VDO_ASSERT(header->size == offset + sizeof(u32), + "should have encoded up to the geometry checksum"); + if (result != VDO_SUCCESS) + return result; + + checksum = vdo_crc32(buffer, offset); + encode_u32_le(buffer, &offset, checksum); + + return VDO_SUCCESS; +} + /** * vdo_parse_geometry_block() - Decode and validate an encoded geometry block. * @block: The encoded geometry block. 
diff --git a/drivers/md/dm-vdo/encodings.h b/drivers/md/dm-vdo/encodings.h index 0393936c8aeb..67ff0ff2ffda 100644 --- a/drivers/md/dm-vdo/encodings.h +++ b/drivers/md/dm-vdo/encodings.h @@ -813,6 +813,8 @@ int vdo_initialize_volume_geometry(nonce_t nonce, uuid_t *uuid, const struct index_config *index_config, struct volume_geometry *geometry); +int vdo_encode_volume_geometry(u8 *buffer, const struct volume_geometry *geometry, + u32 version); int __must_check vdo_parse_geometry_block(unsigned char *block, struct volume_geometry *geometry); From b5d1f45c5fea9fa112f0dddebd95fadad06d3bd8 Mon Sep 17 00:00:00 2001 From: Bruce Johnston Date: Tue, 24 Mar 2026 14:06:49 -0400 Subject: [PATCH 51/62] dm vdo: add geometry block structure Introduce a vdo_geometry_block structure, containing a vio and buffer, mirroring the existing vdo_super_block structure. Both are now initialized at VDO startup and freed at shutdown, establishing the infrastructure needed to read and write the geometry block using the same mechanisms as the super block. Refactor read_geometry_block() to use the new structure. 
Signed-off-by: Bruce Johnston Reviewed-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/vdo.c | 101 ++++++++++++++++++++++------------------ drivers/md/dm-vdo/vdo.h | 10 ++++ 2 files changed, 66 insertions(+), 45 deletions(-) diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c index 167cf93a284a..09a1a97b5c31 100644 --- a/drivers/md/dm-vdo/vdo.c +++ b/drivers/md/dm-vdo/vdo.c @@ -255,6 +255,37 @@ static int __must_check initialize_thread_config(struct thread_count_config coun return VDO_SUCCESS; } +static int initialize_geometry_block(struct vdo *vdo, + struct vdo_geometry_block *geometry_block) +{ + int result; + + result = vdo_allocate(VDO_BLOCK_SIZE, "encoded geometry block", + (char **) &vdo->geometry_block.buffer); + if (result != VDO_SUCCESS) + return result; + + return allocate_vio_components(vdo, VIO_TYPE_GEOMETRY, + VIO_PRIORITY_METADATA, NULL, 1, + (char *) geometry_block->buffer, + &vdo->geometry_block.vio); +} + +static int initialize_super_block(struct vdo *vdo, struct vdo_super_block *super_block) +{ + int result; + + result = vdo_allocate(VDO_BLOCK_SIZE, "encoded super block", + (char **) &vdo->super_block.buffer); + if (result != VDO_SUCCESS) + return result; + + return allocate_vio_components(vdo, VIO_TYPE_SUPER_BLOCK, + VIO_PRIORITY_METADATA, NULL, 1, + (char *) super_block->buffer, + &vdo->super_block.vio); +} + /** * read_geometry_block() - Synchronously read the geometry block from a vdo's underlying block * device. 
@@ -264,47 +295,29 @@ static int __must_check initialize_thread_config(struct thread_count_config coun */ static int __must_check read_geometry_block(struct vdo *vdo) { - struct vio *vio; - char *block; + struct vio *vio = &vdo->geometry_block.vio; + u8 *block = vdo->geometry_block.buffer; int result; - result = vdo_allocate(VDO_BLOCK_SIZE, __func__, &block); - if (result != VDO_SUCCESS) - return result; - - result = create_metadata_vio(vdo, VIO_TYPE_GEOMETRY, VIO_PRIORITY_HIGH, NULL, - block, &vio); - if (result != VDO_SUCCESS) { - vdo_free(block); - return result; - } - /* * This is only safe because, having not already loaded the geometry, the vdo's geometry's * bio_offset field is 0, so the fact that vio_reset_bio() will subtract that offset from * the supplied pbn is not a problem. */ - result = vio_reset_bio(vio, block, NULL, REQ_OP_READ, + result = vio_reset_bio(vio, (char *)block, NULL, REQ_OP_READ, VDO_GEOMETRY_BLOCK_LOCATION); - if (result != VDO_SUCCESS) { - free_vio(vdo_forget(vio)); - vdo_free(block); + if (result != VDO_SUCCESS) return result; - } bio_set_dev(vio->bio, vdo_get_backing_device(vdo)); submit_bio_wait(vio->bio); result = blk_status_to_errno(vio->bio->bi_status); - free_vio(vdo_forget(vio)); if (result != 0) { vdo_log_error_strerror(result, "synchronous read failed"); - vdo_free(block); return -EIO; } - result = vdo_parse_geometry_block((u8 *) block, &vdo->geometry); - vdo_free(block); - return result; + return vdo_parse_geometry_block(block, &vdo->geometry); } static bool get_zone_thread_name(const thread_id_t thread_ids[], zone_count_t count, @@ -474,6 +487,19 @@ static int initialize_vdo(struct vdo *vdo, struct device_config *config, vdo_initialize_completion(&vdo->admin.completion, vdo, VDO_ADMIN_COMPLETION); init_completion(&vdo->admin.callback_sync); mutex_init(&vdo->stats_mutex); + + result = initialize_geometry_block(vdo, &vdo->geometry_block); + if (result != VDO_SUCCESS) { + *reason = "Could not initialize geometry block"; + 
return result; + } + + result = initialize_super_block(vdo, &vdo->super_block); + if (result != VDO_SUCCESS) { + *reason = "Could not initialize super block"; + return result; + } + result = read_geometry_block(vdo); if (result != VDO_SUCCESS) { *reason = "Could not load geometry block"; @@ -646,6 +672,12 @@ static void free_listeners(struct vdo_thread *thread) } } +static void uninitialize_geometry_block(struct vdo_geometry_block *geometry_block) +{ + free_vio_components(&geometry_block->vio); + vdo_free(geometry_block->buffer); +} + static void uninitialize_super_block(struct vdo_super_block *super_block) { free_vio_components(&super_block->vio); @@ -693,6 +725,7 @@ void vdo_destroy(struct vdo *vdo) vdo_uninitialize_layout(&vdo->next_layout); if (vdo->partition_copier) dm_kcopyd_client_destroy(vdo_forget(vdo->partition_copier)); + uninitialize_geometry_block(&vdo->geometry_block); uninitialize_super_block(&vdo->super_block); vdo_free_block_map(vdo_forget(vdo->block_map)); vdo_free_hash_zones(vdo_forget(vdo->hash_zones)); @@ -718,20 +751,6 @@ void vdo_destroy(struct vdo *vdo) vdo_free(vdo); } -static int initialize_super_block(struct vdo *vdo, struct vdo_super_block *super_block) -{ - int result; - - result = vdo_allocate(VDO_BLOCK_SIZE, "encoded super block", &vdo->super_block.buffer); - if (result != VDO_SUCCESS) - return result; - - return allocate_vio_components(vdo, VIO_TYPE_SUPER_BLOCK, - VIO_PRIORITY_METADATA, NULL, 1, - (char *) super_block->buffer, - &vdo->super_block.vio); -} - /** * finish_reading_super_block() - Continue after loading the super block. * @completion: The super block vio. 
@@ -775,14 +794,6 @@ static void read_super_block_endio(struct bio *bio) */ void vdo_load_super_block(struct vdo *vdo, struct vdo_completion *parent) { - int result; - - result = initialize_super_block(vdo, &vdo->super_block); - if (result != VDO_SUCCESS) { - vdo_continue_completion(parent, result); - return; - } - vdo->super_block.vio.completion.parent = parent; vdo_submit_metadata_vio(&vdo->super_block.vio, vdo_get_data_region_start(vdo->geometry), diff --git a/drivers/md/dm-vdo/vdo.h b/drivers/md/dm-vdo/vdo.h index 1aaba73997b7..21f6ac999e9d 100644 --- a/drivers/md/dm-vdo/vdo.h +++ b/drivers/md/dm-vdo/vdo.h @@ -144,6 +144,13 @@ struct thread_config { struct thread_count_config; +struct vdo_geometry_block { + /* The vio for reading and writing the geometry block to disk */ + struct vio vio; + /* A buffer to hold the geometry block */ + u8 *buffer; +}; + struct vdo_super_block { /* The vio for reading and writing the super block to disk */ struct vio vio; @@ -186,6 +193,9 @@ struct vdo { /* The thread mapping */ struct thread_config thread_config; + /* The geometry block */ + struct vdo_geometry_block geometry_block; + /* The super block */ struct vdo_super_block super_block; From 9d9c28aa867ae3ffdd967a8caeb4e3fb6d4003cd Mon Sep 17 00:00:00 2001 From: Bruce Johnston Date: Tue, 24 Mar 2026 14:06:50 -0400 Subject: [PATCH 52/62] dm vdo: add synchronous metadata I/O submission helper Add vdo_submit_metadata_vio_wait(), a synchronous I/O submission helper that blocks until completion. This is needed for I/O during early initialization before work queues are available. Refactor read_geometry_block() to use it. 
Signed-off-by: Bruce Johnston Reviewed-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/io-submitter.c | 27 +++++++++++++++++++++++++++ drivers/md/dm-vdo/io-submitter.h | 4 ++++ drivers/md/dm-vdo/vdo.c | 16 +++------------- 3 files changed, 34 insertions(+), 13 deletions(-) diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c index 0e9932929fee..0916c8609543 100644 --- a/drivers/md/dm-vdo/io-submitter.c +++ b/drivers/md/dm-vdo/io-submitter.c @@ -364,6 +364,33 @@ void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, vdo_launch_completion_with_priority(completion, get_metadata_priority(vio)); } +/** + * vdo_submit_metadata_vio_wait() - Submit I/O for a metadata vio and wait for completion. + * @vio: the vio for which to issue I/O + * @physical: the physical block number to read or write + * @operation: the type of I/O to perform + * + * The function operates similarly to __submit_metadata_vio except that it will + * block until the work is done. It can be used to do i/o before work queues + * and thread completions are set up. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_submit_metadata_vio_wait(struct vio *vio, + physical_block_number_t physical, + blk_opf_t operation) +{ + int result; + + result = vio_reset_bio(vio, vio->data, NULL, operation | REQ_META, physical); + if (result != VDO_SUCCESS) + return result; + + bio_set_dev(vio->bio, vdo_get_backing_device(vio->completion.vdo)); + submit_bio_wait(vio->bio); + return blk_status_to_errno(vio->bio->bi_status); +} + /** * vdo_make_io_submitter() - Create an io_submitter structure. * @thread_count: Number of bio-submission threads to set up. 
diff --git a/drivers/md/dm-vdo/io-submitter.h b/drivers/md/dm-vdo/io-submitter.h index 3088f11055fd..0f320a60e9e8 100644 --- a/drivers/md/dm-vdo/io-submitter.h +++ b/drivers/md/dm-vdo/io-submitter.h @@ -56,4 +56,8 @@ static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback, REQ_OP_WRITE | REQ_PREFLUSH, NULL, 0); } +int vdo_submit_metadata_vio_wait(struct vio *vio, + physical_block_number_t physical, + blk_opf_t operation); + #endif /* VDO_IO_SUBMITTER_H */ diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c index 09a1a97b5c31..bc7afbca035d 100644 --- a/drivers/md/dm-vdo/vdo.c +++ b/drivers/md/dm-vdo/vdo.c @@ -295,8 +295,6 @@ static int initialize_super_block(struct vdo *vdo, struct vdo_super_block *super */ static int __must_check read_geometry_block(struct vdo *vdo) { - struct vio *vio = &vdo->geometry_block.vio; - u8 *block = vdo->geometry_block.buffer; int result; /* @@ -304,20 +302,12 @@ static int __must_check read_geometry_block(struct vdo *vdo) * bio_offset field is 0, so the fact that vio_reset_bio() will subtract that offset from * the supplied pbn is not a problem. 
 */ - result = vio_reset_bio(vio, (char *)block, NULL, REQ_OP_READ, - VDO_GEOMETRY_BLOCK_LOCATION); + result = vdo_submit_metadata_vio_wait(&vdo->geometry_block.vio, + VDO_GEOMETRY_BLOCK_LOCATION, REQ_OP_READ); if (result != VDO_SUCCESS) return result; - bio_set_dev(vio->bio, vdo_get_backing_device(vdo)); - submit_bio_wait(vio->bio); - result = blk_status_to_errno(vio->bio->bi_status); - if (result != 0) { - vdo_log_error_strerror(result, "synchronous read failed"); - return -EIO; - } - - return vdo_parse_geometry_block(block, &vdo->geometry); + return vdo_parse_geometry_block(vdo->geometry_block.buffer, &vdo->geometry); } static bool get_zone_thread_name(const thread_id_t thread_ids[], zone_count_t count, From defce4e039bc0100164964f79e896d05cb45dc76 Mon Sep 17 00:00:00 2001 From: Bruce Johnston Date: Tue, 24 Mar 2026 14:06:51 -0400 Subject: [PATCH 53/62] dm vdo: add formatting logic and initialization Add the core formatting logic. The initialization path is updated to read the geometry block (block 0 on the storage device). If the block is entirely zeroed, the device is treated as unformatted and vdo_format() is called. Otherwise, the existing geometry is parsed and the VDO is loaded as before. The vdo_format() function initializes the volume geometry and super block, and marks the VDO as needing its layout saved to disk. 
Signed-off-by: Bruce Johnston Reviewed-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/status-codes.c | 2 + drivers/md/dm-vdo/vdo.c | 104 +++++++++++++++++++++++-------- 2 files changed, 81 insertions(+), 25 deletions(-) diff --git a/drivers/md/dm-vdo/status-codes.c b/drivers/md/dm-vdo/status-codes.c index dd252d660b6d..9df5e4d7f884 100644 --- a/drivers/md/dm-vdo/status-codes.c +++ b/drivers/md/dm-vdo/status-codes.c @@ -80,6 +80,8 @@ int vdo_status_to_errno(int error) /* VDO or UDS error */ switch (error) { + case VDO_BAD_CONFIGURATION: + return -EINVAL; case VDO_NO_SPACE: return -ENOSPC; case VDO_READ_ONLY: diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c index bc7afbca035d..b5e64af13437 100644 --- a/drivers/md/dm-vdo/vdo.c +++ b/drivers/md/dm-vdo/vdo.c @@ -34,7 +34,9 @@ #include #include #include +#include #include +#include #include "logger.h" #include "memory-alloc.h" @@ -55,6 +57,7 @@ #include "slab-depot.h" #include "statistics.h" #include "status-codes.h" +#include "time-utils.h" #include "vio.h" #define PARANOID_THREAD_CONSISTENCY_CHECKS 0 @@ -286,30 +289,6 @@ static int initialize_super_block(struct vdo *vdo, struct vdo_super_block *super &vdo->super_block.vio); } -/** - * read_geometry_block() - Synchronously read the geometry block from a vdo's underlying block - * device. - * @vdo: The vdo whose geometry is to be read. - * - * Return: VDO_SUCCESS or an error code. - */ -static int __must_check read_geometry_block(struct vdo *vdo) -{ - int result; - - /* - * This is only safe because, having not already loaded the geometry, the vdo's geometry's - * bio_offset field is 0, so the fact that vio_reset_bio() will subtract that offset from - * the supplied pbn is not a problem. 
- */ - result = vdo_submit_metadata_vio_wait(&vdo->geometry_block.vio, - VDO_GEOMETRY_BLOCK_LOCATION, REQ_OP_READ); - if (result != VDO_SUCCESS) - return result; - - return vdo_parse_geometry_block(vdo->geometry_block.buffer, &vdo->geometry); -} - static bool get_zone_thread_name(const thread_id_t thread_ids[], zone_count_t count, thread_id_t id, const char *prefix, char *buffer, size_t buffer_length) @@ -454,6 +433,67 @@ static int register_vdo(struct vdo *vdo) return result; } +/** + * vdo_format() - Format a block device to function as a new VDO. + * @vdo: The vdo to format. + * @error_ptr: The reason for any failure during this call. + * + * This function must be called on a device before a VDO can be loaded for the first time. + * Once a device has been formatted, the VDO can be loaded and shut down repeatedly. + * If a new VDO is desired, this function should be called again. + * + * Return: VDO_SUCCESS or an error + **/ +static int __must_check vdo_format(struct vdo *vdo, char **error_ptr) +{ + int result; + uuid_t uuid; + nonce_t nonce = current_time_us(); + struct device_config *config = vdo->device_config; + + struct index_config index_config = { + .mem = config->index_memory, + .sparse = config->index_sparse, + }; + + struct vdo_config vdo_config = { + .logical_blocks = config->logical_blocks, + .physical_blocks = config->physical_blocks, + .slab_size = config->slab_blocks, + .slab_journal_blocks = DEFAULT_VDO_SLAB_JOURNAL_SIZE, + .recovery_journal_size = DEFAULT_VDO_RECOVERY_JOURNAL_SIZE, + }; + + uuid_gen(&uuid); + result = vdo_initialize_volume_geometry(nonce, &uuid, &index_config, &vdo->geometry); + if (result != VDO_SUCCESS) { + *error_ptr = "Could not initialize volume geometry during format"; + return result; + } + + result = vdo_initialize_component_states(&vdo_config, &vdo->geometry, nonce, &vdo->states); + if (result == VDO_NO_SPACE) { + block_count_t slab_blocks = config->slab_blocks; + /* 1 is counting geometry block */ + block_count_t 
fixed_layout_size = 1 + + vdo->geometry.regions[VDO_DATA_REGION].start_block + + DEFAULT_VDO_BLOCK_MAP_TREE_ROOT_COUNT + + DEFAULT_VDO_RECOVERY_JOURNAL_SIZE + VDO_SLAB_SUMMARY_BLOCKS; + block_count_t necessary_size = fixed_layout_size + slab_blocks; + + vdo_log_error("Minimum required size for VDO volume: %llu bytes", + (unsigned long long) necessary_size * VDO_BLOCK_SIZE); + *error_ptr = "Could not allocate enough space for VDO during format"; + return result; + } + if (result != VDO_SUCCESS) { + *error_ptr = "Could not initialize data layout during format"; + return result; + } + + return VDO_SUCCESS; +} + /** * initialize_vdo() - Do the portion of initializing a vdo which will clean up after itself on * error. @@ -490,12 +530,26 @@ static int initialize_vdo(struct vdo *vdo, struct device_config *config, return result; } - result = read_geometry_block(vdo); + result = vdo_submit_metadata_vio_wait(&vdo->geometry_block.vio, + VDO_GEOMETRY_BLOCK_LOCATION, REQ_OP_READ); if (result != VDO_SUCCESS) { *reason = "Could not load geometry block"; return result; } + if (mem_is_zero(vdo->geometry_block.vio.data, VDO_BLOCK_SIZE)) { + result = vdo_format(vdo, reason); + if (result != VDO_SUCCESS) + return result; + } else { + result = vdo_parse_geometry_block(vdo->geometry_block.buffer, + &vdo->geometry); + if (result != VDO_SUCCESS) { + *reason = "Could not parse geometry block"; + return result; + } + } + result = initialize_thread_config(config->thread_counts, &vdo->thread_config); if (result != VDO_SUCCESS) { *reason = "Cannot create thread configuration"; From fc1d43826702d8c14845c187d3ea0743fdf8f223 Mon Sep 17 00:00:00 2001 From: Bruce Johnston Date: Tue, 24 Mar 2026 14:06:52 -0400 Subject: [PATCH 54/62] dm vdo: save the formatted metadata to disk Add vdo_save_super_block() and vdo_save_geometry_block() to perform asynchronous writes of the super block and geometry block respectively. 
Add vdo_clear_layout() to zero the UDS index's first block, the block map partition, and the recovery journal partition. These operations are driven by new phases in the pre-load state machine (PRE_LOAD_PHASE_FORMAT_*), ensuring that disk writes happen during pre-resume rather than during dmsetup create. Signed-off-by: Bruce Johnston Reviewed-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/dm-vdo-target.c | 36 +++++++++ drivers/md/dm-vdo/vdo.c | 122 +++++++++++++++++++++++++----- drivers/md/dm-vdo/vdo.h | 9 +++ 3 files changed, 147 insertions(+), 20 deletions(-) diff --git a/drivers/md/dm-vdo/dm-vdo-target.c b/drivers/md/dm-vdo/dm-vdo-target.c index cee6e4edf768..0135a6f941fd 100644 --- a/drivers/md/dm-vdo/dm-vdo-target.c +++ b/drivers/md/dm-vdo/dm-vdo-target.c @@ -61,6 +61,11 @@ enum admin_phases { LOAD_PHASE_DRAIN_JOURNAL, LOAD_PHASE_WAIT_FOR_READ_ONLY, PRE_LOAD_PHASE_START, + PRE_LOAD_PHASE_FORMAT_START, + PRE_LOAD_PHASE_FORMAT_SUPER, + PRE_LOAD_PHASE_FORMAT_GEOMETRY, + PRE_LOAD_PHASE_FORMAT_END, + PRE_LOAD_PHASE_LOAD_SUPER, PRE_LOAD_PHASE_LOAD_COMPONENTS, PRE_LOAD_PHASE_END, PREPARE_GROW_PHYSICAL_PHASE_START, @@ -110,6 +115,11 @@ static const char * const ADMIN_PHASE_NAMES[] = { "LOAD_PHASE_DRAIN_JOURNAL", "LOAD_PHASE_WAIT_FOR_READ_ONLY", "PRE_LOAD_PHASE_START", + "PRE_LOAD_PHASE_FORMAT_START", + "PRE_LOAD_PHASE_FORMAT_SUPER", + "PRE_LOAD_PHASE_FORMAT_GEOMETRY", + "PRE_LOAD_PHASE_FORMAT_END", + "PRE_LOAD_PHASE_LOAD_SUPER", "PRE_LOAD_PHASE_LOAD_COMPONENTS", "PRE_LOAD_PHASE_END", "PREPARE_GROW_PHYSICAL_PHASE_START", @@ -1487,7 +1497,33 @@ static void pre_load_callback(struct vdo_completion *completion) vdo_continue_completion(completion, result); return; } + if (vdo->needs_formatting) + vdo->admin.phase = PRE_LOAD_PHASE_FORMAT_START; + else + vdo->admin.phase = PRE_LOAD_PHASE_LOAD_SUPER; + vdo_continue_completion(completion, VDO_SUCCESS); + return; + + case PRE_LOAD_PHASE_FORMAT_START: + vdo_continue_completion(completion, 
vdo_clear_layout(vdo)); + return; + + case PRE_LOAD_PHASE_FORMAT_SUPER: + vdo_save_super_block(vdo, completion); + return; + + case PRE_LOAD_PHASE_FORMAT_GEOMETRY: + vdo_save_geometry_block(vdo, completion); + return; + + case PRE_LOAD_PHASE_FORMAT_END: + /* cleanup layout before load adds to it */ + vdo_uninitialize_layout(&vdo->states.layout); + vdo_continue_completion(completion, VDO_SUCCESS); + return; + + case PRE_LOAD_PHASE_LOAD_SUPER: vdo_load_super_block(vdo, completion); return; diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c index b5e64af13437..7bec2418c121 100644 --- a/drivers/md/dm-vdo/vdo.c +++ b/drivers/md/dm-vdo/vdo.c @@ -491,6 +491,8 @@ static int __must_check vdo_format(struct vdo *vdo, char **error_ptr) return result; } + vdo->needs_formatting = true; + return VDO_SUCCESS; } @@ -951,24 +953,101 @@ static void record_vdo(struct vdo *vdo) vdo->states.layout = vdo->layout; } +static int __must_check clear_partition(struct vdo *vdo, enum partition_id id) +{ + struct partition *partition; + int result; + + result = vdo_get_partition(&vdo->states.layout, id, &partition); + if (result != VDO_SUCCESS) + return result; + + return blkdev_issue_zeroout(vdo_get_backing_device(vdo), + partition->offset * VDO_SECTORS_PER_BLOCK, + partition->count * VDO_SECTORS_PER_BLOCK, + GFP_NOWAIT, 0); +} + +int vdo_clear_layout(struct vdo *vdo) +{ + int result; + + /* Zero out the uds index's first block. */ + result = blkdev_issue_zeroout(vdo_get_backing_device(vdo), + VDO_SECTORS_PER_BLOCK, + VDO_SECTORS_PER_BLOCK, + GFP_NOWAIT, 0); + if (result != VDO_SUCCESS) + return result; + + result = clear_partition(vdo, VDO_BLOCK_MAP_PARTITION); + if (result != VDO_SUCCESS) + return result; + + return clear_partition(vdo, VDO_RECOVERY_JOURNAL_PARTITION); +} + /** - * continue_super_block_parent() - Continue the parent of a super block save operation. - * @completion: The super block vio. + * continue_parent() - Continue the parent of a save operation. 
+ * @completion: The completion to continue. * - * This callback is registered in vdo_save_components(). */ -static void continue_super_block_parent(struct vdo_completion *completion) +static void continue_parent(struct vdo_completion *completion) { vdo_continue_completion(vdo_forget(completion->parent), completion->result); } +static void handle_write_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct vdo_completion *parent = vio->completion.parent; + + continue_vio_after_io(vio, continue_parent, + parent->callback_thread_id); +} + /** - * handle_save_error() - Log a super block save error. + * handle_geometry_block_save_error() - Log a geometry block save error. + * @completion: The geometry block vio. + * + * This error handler is registered in vdo_save_geometry_block(). + */ +static void handle_geometry_block_save_error(struct vdo_completion *completion) +{ + struct vdo_geometry_block *geometry_block = + container_of(as_vio(completion), struct vdo_geometry_block, vio); + + vio_record_metadata_io_error(&geometry_block->vio); + vdo_log_error_strerror(completion->result, "geometry block save failed"); + completion->callback(completion); +} + +/** + * vdo_save_geometry_block() - Encode the vdo and save the geometry block asynchronously. + * @vdo: The vdo whose state is being saved. + * @parent: The completion to notify when the save is complete. 
+ */ +void vdo_save_geometry_block(struct vdo *vdo, struct vdo_completion *parent) +{ + struct vdo_geometry_block *geometry_block = &vdo->geometry_block; + + vdo_encode_volume_geometry(geometry_block->buffer, &vdo->geometry, + VDO_DEFAULT_GEOMETRY_BLOCK_VERSION); + geometry_block->vio.completion.parent = parent; + geometry_block->vio.completion.callback_thread_id = parent->callback_thread_id; + vdo_submit_metadata_vio(&geometry_block->vio, + VDO_GEOMETRY_BLOCK_LOCATION, + handle_write_endio, handle_geometry_block_save_error, + REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA); +} + +/** + * handle_super_block_save_error() - Log a super block save error. * @completion: The super block vio. * * This error handler is registered in vdo_save_components(). */ -static void handle_save_error(struct vdo_completion *completion) +static void handle_super_block_save_error(struct vdo_completion *completion) { struct vdo_super_block *super_block = container_of(as_vio(completion), struct vdo_super_block, vio); @@ -987,17 +1066,27 @@ static void handle_save_error(struct vdo_completion *completion) completion->callback(completion); } -static void super_block_write_endio(struct bio *bio) +/** + * vdo_save_super_block() - Save the component states to the super block asynchronously. + * @vdo: The vdo whose state is being saved. + * @parent: The completion to notify when the save is complete. 
+ */ +void vdo_save_super_block(struct vdo *vdo, struct vdo_completion *parent) { - struct vio *vio = bio->bi_private; - struct vdo_completion *parent = vio->completion.parent; + struct vdo_super_block *super_block = &vdo->super_block; - continue_vio_after_io(vio, continue_super_block_parent, - parent->callback_thread_id); + vdo_encode_super_block(super_block->buffer, &vdo->states); + super_block->vio.completion.parent = parent; + super_block->vio.completion.callback_thread_id = parent->callback_thread_id; + vdo_submit_metadata_vio(&super_block->vio, + vdo_get_data_region_start(vdo->geometry), + handle_write_endio, handle_super_block_save_error, + REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA); } /** - * vdo_save_components() - Encode the vdo and save the super block asynchronously. + * vdo_save_components() - Copy the current state of the VDO to the states struct and save + * it to the super block asynchronously. * @vdo: The vdo whose state is being saved. * @parent: The completion to notify when the save is complete. 
*/ @@ -1016,14 +1105,7 @@ void vdo_save_components(struct vdo *vdo, struct vdo_completion *parent) } record_vdo(vdo); - - vdo_encode_super_block(super_block->buffer, &vdo->states); - super_block->vio.completion.parent = parent; - super_block->vio.completion.callback_thread_id = parent->callback_thread_id; - vdo_submit_metadata_vio(&super_block->vio, - vdo_get_data_region_start(vdo->geometry), - super_block_write_endio, handle_save_error, - REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA); + vdo_save_super_block(vdo, parent); } /** diff --git a/drivers/md/dm-vdo/vdo.h b/drivers/md/dm-vdo/vdo.h index 21f6ac999e9d..9a63f5d45ce3 100644 --- a/drivers/md/dm-vdo/vdo.h +++ b/drivers/md/dm-vdo/vdo.h @@ -246,6 +246,7 @@ struct vdo { const struct admin_state_code *suspend_type; bool allocations_allowed; bool dump_on_shutdown; + bool needs_formatting; atomic_t processing_message; /* @@ -314,6 +315,10 @@ int __must_check vdo_make(unsigned int instance, struct device_config *config, void vdo_destroy(struct vdo *vdo); +int __must_check vdo_format_components(struct vdo *vdo); + +void vdo_format_super_block(struct vdo *vdo, struct vdo_completion *parent); + void vdo_load_super_block(struct vdo *vdo, struct vdo_completion *parent); struct block_device * __must_check vdo_get_backing_device(const struct vdo *vdo); @@ -336,6 +341,10 @@ enum vdo_state __must_check vdo_get_state(const struct vdo *vdo); void vdo_set_state(struct vdo *vdo, enum vdo_state state); +int vdo_clear_layout(struct vdo *vdo); +void vdo_save_geometry_block(struct vdo *vdo, struct vdo_completion *parent); +void vdo_save_super_block(struct vdo *vdo, struct vdo_completion *parent); + void vdo_save_components(struct vdo *vdo, struct vdo_completion *parent); int vdo_register_read_only_listener(struct vdo *vdo, void *listener, From 5387815aa821a8d6dd87ba3aa6869e3c9c709b2a Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Wed, 25 Mar 2026 21:13:54 -0700 Subject: [PATCH 55/62] dm-bufio: use kzalloc_flex Avoid manual size calculations 
and use the proper helper. Add __counted_by for extra runtime analysis. Signed-off-by: Rosen Penev Signed-off-by: Mikulas Patocka --- drivers/md/dm-bufio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 60f7badec91f..26fedf5883ef 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -391,7 +391,7 @@ struct dm_buffer_cache { */ unsigned int num_locks; bool no_sleep; - struct buffer_tree trees[]; + struct buffer_tree trees[] __counted_by(num_locks); }; static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled); @@ -2511,7 +2511,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign } num_locks = dm_num_hash_locks(); - c = kzalloc(sizeof(*c) + (num_locks * sizeof(struct buffer_tree)), GFP_KERNEL); + c = kzalloc_flex(*c, cache.trees, num_locks); if (!c) { r = -ENOMEM; goto bad_client; From 6ebf3b6c6f16fda0568aa4207c6cd398f983c354 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 25 Mar 2026 12:36:06 -0700 Subject: [PATCH 56/62] dm-integrity: fix mismatched queue limits A user can integritysetup a device with a backing device using a 4k logical block size, but request the dm device use 1k or 2k. This mismatch creates an inconsistency such that the dm device would report limits for IO that it can't actually execute. Fix this by using the backing device's limits if they are larger. 
Signed-off-by: Keith Busch Signed-off-by: Mikulas Patocka --- drivers/md/dm-integrity.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 06e805902151..8dfd498ed1ff 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -4047,9 +4047,15 @@ static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *lim struct dm_integrity_c *ic = ti->private; if (ic->sectors_per_block > 1) { - limits->logical_block_size = ic->sectors_per_block << SECTOR_SHIFT; - limits->physical_block_size = ic->sectors_per_block << SECTOR_SHIFT; - limits->io_min = ic->sectors_per_block << SECTOR_SHIFT; + limits->logical_block_size = + max(limits->logical_block_size, + ic->sectors_per_block << SECTOR_SHIFT); + limits->physical_block_size = + max(limits->physical_block_size, + ic->sectors_per_block << SECTOR_SHIFT); + limits->io_min = + max(limits->io_min, + ic->sectors_per_block << SECTOR_SHIFT); limits->dma_alignment = limits->logical_block_size - 1; limits->discard_granularity = ic->sectors_per_block << SECTOR_SHIFT; } From cbc1532d2b0ec2a842bd459f01b590bbf16b7443 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 25 Mar 2026 12:36:07 -0700 Subject: [PATCH 57/62] dm-integrity: always set the io hints Don't depend on the defaults to be what is desired if the integrity device was set up with 512b sector size. Always set the queue limits to be at least what the device mapper wants. 
Signed-off-by: Keith Busch Signed-off-by: Mikulas Patocka --- drivers/md/dm-integrity.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 8dfd498ed1ff..d64c15c761d0 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -4046,19 +4046,14 @@ static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *lim { struct dm_integrity_c *ic = ti->private; - if (ic->sectors_per_block > 1) { - limits->logical_block_size = - max(limits->logical_block_size, - ic->sectors_per_block << SECTOR_SHIFT); - limits->physical_block_size = - max(limits->physical_block_size, - ic->sectors_per_block << SECTOR_SHIFT); - limits->io_min = - max(limits->io_min, - ic->sectors_per_block << SECTOR_SHIFT); - limits->dma_alignment = limits->logical_block_size - 1; - limits->discard_granularity = ic->sectors_per_block << SECTOR_SHIFT; - } + limits->logical_block_size = max(limits->logical_block_size, + ic->sectors_per_block << SECTOR_SHIFT); + limits->physical_block_size = max(limits->physical_block_size, + ic->sectors_per_block << SECTOR_SHIFT); + limits->io_min = max(limits->io_min, + ic->sectors_per_block << SECTOR_SHIFT); + limits->dma_alignment = limits->logical_block_size - 1; + limits->discard_granularity = ic->sectors_per_block << SECTOR_SHIFT; if (!ic->internal_hash) { struct blk_integrity *bi = &limits->integrity; From 33eded29319d41fcba5d0257b126a48b449aad47 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 25 Mar 2026 12:36:08 -0700 Subject: [PATCH 58/62] dm: provide helper to set stacked limits There are multiple device mappers that set up their stacking limits exactly the same for the logical, physical and minimum IO queue limits. Provide a helper for it. 
Signed-off-by: Keith Busch Signed-off-by: Mikulas Patocka --- drivers/md/dm-crypt.c | 6 +----- drivers/md/dm-integrity.c | 7 +------ drivers/md/dm-verity-target.c | 8 +------- drivers/md/dm-writecache.c | 10 +--------- include/linux/device-mapper.h | 7 +++++++ 5 files changed, 11 insertions(+), 27 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 885208a82c55..60642cee8609 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -3684,11 +3684,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct crypt_config *cc = ti->private; - limits->logical_block_size = - max_t(unsigned int, limits->logical_block_size, cc->sector_size); - limits->physical_block_size = - max_t(unsigned int, limits->physical_block_size, cc->sector_size); - limits->io_min = max_t(unsigned int, limits->io_min, cc->sector_size); + dm_stack_bs_limits(limits, cc->sector_size); limits->dma_alignment = limits->logical_block_size - 1; /* diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index d64c15c761d0..65c30dec8222 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -4046,12 +4046,7 @@ static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *lim { struct dm_integrity_c *ic = ti->private; - limits->logical_block_size = max(limits->logical_block_size, - ic->sectors_per_block << SECTOR_SHIFT); - limits->physical_block_size = max(limits->physical_block_size, - ic->sectors_per_block << SECTOR_SHIFT); - limits->io_min = max(limits->io_min, - ic->sectors_per_block << SECTOR_SHIFT); + dm_stack_bs_limits(limits, ic->sectors_per_block << SECTOR_SHIFT); limits->dma_alignment = limits->logical_block_size - 1; limits->discard_granularity = ic->sectors_per_block << SECTOR_SHIFT; diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index e1d435c79e96..9a9847f94c46 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -1011,13 +1011,7 
@@ static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct dm_verity *v = ti->private; - if (limits->logical_block_size < 1 << v->data_dev_block_bits) - limits->logical_block_size = 1 << v->data_dev_block_bits; - - if (limits->physical_block_size < 1 << v->data_dev_block_bits) - limits->physical_block_size = 1 << v->data_dev_block_bits; - - limits->io_min = limits->logical_block_size; + dm_stack_bs_limits(limits, 1 << v->data_dev_block_bits); /* * Similar to what dm-crypt does, opt dm-verity out of support for diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 98bd945f6da7..493f5202ad04 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -1640,17 +1640,9 @@ static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limit { struct dm_writecache *wc = ti->private; - if (limits->logical_block_size < wc->block_size) - limits->logical_block_size = wc->block_size; - - if (limits->physical_block_size < wc->block_size) - limits->physical_block_size = wc->block_size; - - if (limits->io_min < wc->block_size) - limits->io_min = wc->block_size; + dm_stack_bs_limits(limits, wc->block_size); } - static void writecache_writeback_endio(struct bio *bio) { struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 38f625af6ab4..cd4faaf5d427 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -755,4 +755,11 @@ static inline unsigned long to_bytes(sector_t n) return (n << SECTOR_SHIFT); } +static inline void dm_stack_bs_limits(struct queue_limits *limits, unsigned int bs) +{ + limits->logical_block_size = max(limits->logical_block_size, bs); + limits->physical_block_size = max(limits->physical_block_size, bs); + limits->io_min = max(limits->io_min, bs); +} + #endif /* _LINUX_DEVICE_MAPPER_H */ From a1cf2bd5b6424ead3d75d09c822f665907094a80 Mon Sep 17 00:00:00 
2001 From: Colin Ian King Date: Sat, 28 Mar 2026 12:33:20 +0000 Subject: [PATCH 59/62] dm vdo: Fix spelling mistake "postive" -> "positive" There is a spelling mistake in a vdo_log_error message. Fix it. Signed-off-by: Colin Ian King Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/dm-vdo-target.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-vdo/dm-vdo-target.c b/drivers/md/dm-vdo/dm-vdo-target.c index 0135a6f941fd..1d8375cc3c3e 100644 --- a/drivers/md/dm-vdo/dm-vdo-target.c +++ b/drivers/md/dm-vdo/dm-vdo-target.c @@ -412,7 +412,7 @@ static int __must_check parse_memory(const char *memory_str, result = kstrtouint(memory_str, 10, &value); if (result) { - vdo_log_error("optional parameter error: invalid memory size, must be a postive integer"); + vdo_log_error("optional parameter error: invalid memory size, must be a positive integer"); return -EINVAL; } From 43fd83c0b1dc127cf13b4c05303665924e63ef94 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 29 Mar 2026 13:49:52 -0700 Subject: [PATCH 60/62] dm-crypt: Make crypt_iv_operations::post return void Since all implementations of crypt_iv_operations::post now return 0, change the return type to void. 
Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-crypt.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 60642cee8609..608b617fb817 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -113,8 +113,8 @@ struct crypt_iv_operations { void (*wipe)(struct crypt_config *cc); int (*generator)(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq); - int (*post)(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq); + void (*post)(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq); }; struct iv_benbi_private { @@ -559,14 +559,14 @@ static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, return 0; } -static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq) +static void crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { struct scatterlist *sg; u8 *dst; if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) - return 0; + return; sg = crypt_get_sg_data(cc, dmreq->sg_out); dst = kmap_local_page(sg_page(sg)); @@ -576,7 +576,6 @@ static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, crypto_xor(dst + sg->offset, iv, cc->iv_size); kunmap_local(dst); - return 0; } static void crypt_iv_tcw_dtr(struct crypt_config *cc) @@ -684,22 +683,20 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, return 0; } -static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq) +static void crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { struct scatterlist *sg; u8 *dst; if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) - return 0; + return; /* Apply whitening on ciphertext */ sg = crypt_get_sg_data(cc, dmreq->sg_out); dst = kmap_local_page(sg_page(sg)); crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset); kunmap_local(dst); - - return 0; } static int 
crypt_iv_random_gen(struct crypt_config *cc, u8 *iv, @@ -994,13 +991,11 @@ static int crypt_iv_elephant_gen(struct crypt_config *cc, u8 *iv, return crypt_iv_eboiv_gen(cc, iv, dmreq); } -static int crypt_iv_elephant_post(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq) +static void crypt_iv_elephant_post(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) crypt_iv_elephant(cc, dmreq); - - return 0; } static int crypt_iv_elephant_init(struct crypt_config *cc) @@ -1346,7 +1341,7 @@ static int crypt_convert_block_aead(struct crypt_config *cc, } if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) - r = cc->iv_gen_ops->post(cc, org_iv, dmreq); + cc->iv_gen_ops->post(cc, org_iv, dmreq); bio_advance_iter(ctx->bio_in, &ctx->iter_in, cc->sector_size); bio_advance_iter(ctx->bio_out, &ctx->iter_out, cc->sector_size); @@ -1423,7 +1418,7 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc, r = crypto_skcipher_decrypt(req); if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) - r = cc->iv_gen_ops->post(cc, org_iv, dmreq); + cc->iv_gen_ops->post(cc, org_iv, dmreq); bio_advance_iter(ctx->bio_in, &ctx->iter_in, cc->sector_size); bio_advance_iter(ctx->bio_out, &ctx->iter_out, cc->sector_size); @@ -2187,7 +2182,7 @@ static void kcryptd_async_done(void *data, int error) } if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) - error = cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq); + cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq); if (error == -EBADMSG) { sector_t s = le64_to_cpu(*org_sector_of_dmreq(cc, dmreq)); From 2fa49cc884f6496a915c35621ba4da35649bf159 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 9 Apr 2026 17:49:58 +0200 Subject: [PATCH 61/62] dm: fix a buffer overflow in ioctl processing Tony Asleson (using Claude) found a buffer overflow in dm-ioctl in the function retrieve_status: 1. 
The code in retrieve_status checks that the output string fits into the output buffer and writes the output string there 2. Then, the code aligns the "outptr" variable to the next 8-byte boundary: outptr = align_ptr(outptr); 3. The alignment doesn't check overflow, so outptr could point past the buffer end 4. The "for" loop is iterated again, it executes: remaining = len - (outptr - outbuf); 5. If "outptr" points past "outbuf + len", the arithmetics wraps around and the variable "remaining" contains unusually high number 6. With "remaining" being high, the code writes more data past the end of the buffer Luckily, this bug has no security implications because: 1. Only root can issue device mapper ioctls 2. The commonly used libraries that communicate with device mapper (libdevmapper and devicemapper-rs) use buffer size that is aligned to 8 bytes - thus, "outptr = align_ptr(outptr)" can't overshoot the input buffer and the bug can't happen accidentally Reported-by: Tony Asleson Signed-off-by: Mikulas Patocka Reviewed-by: Bryn M. Reeves Cc: stable@vger.kernel.org --- drivers/md/dm-ioctl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 405acc14d718..a529174c94cf 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1356,6 +1356,10 @@ static void retrieve_status(struct dm_table *table, used = param->data_start + (outptr - outbuf); outptr = align_ptr(outptr); + if (!outptr || outptr > outbuf + len) { + param->flags |= DM_BUFFER_FULL_FLAG; + break; + } spec->next = outptr - outbuf; } From 8c0ee19db81f0fa1ff25fd75b22b17c0cc2acde3 Mon Sep 17 00:00:00 2001 From: Ming-Hung Tsai Date: Fri, 10 Apr 2026 21:08:01 +0800 Subject: [PATCH 62/62] dm cache: fix missing return in invalidate_committed's error path In passthrough mode, dm-cache defers write submission until after metadata commit completes via the invalidate_committed() continuation. 
On commit error, invalidate_committed() calls invalidate_complete() to end the bio and free the migration struct, after which it should return immediately. The patch 4ca8b8bd952d ("dm cache: fix write hang in passthrough mode") omitted this early return, causing execution to fall through into the success path on error. This results in use-after-free on the migration struct in the subsequent calls. Fix by adding the missing return after the invalidate_complete() call. Fixes: 4ca8b8bd952d ("dm cache: fix write hang in passthrough mode") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/dm-devel/adjMq6T5RRjv_uxM@stanley.mountain/ Signed-off-by: Ming-Hung Tsai Signed-off-by: Mikulas Patocka --- drivers/md/dm-cache-target.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index f8200c154805..097315a9bf0f 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -1521,8 +1521,10 @@ static void invalidate_committed(struct work_struct *ws) struct bio *bio = mg->overwrite_bio; struct per_bio_data *pb = get_per_bio_data(bio); - if (mg->k.input) + if (mg->k.input) { invalidate_complete(mg, false); + return; + } init_continuation(&mg->k, invalidate_completed); remap_to_origin_clear_discard(cache, bio, mg->invalidate_oblock);